## In this notebook we will calculate residuals from our random forest models

In [17]:
import math

#### Setting our working directory

In [18]:
# Set the working directory to the repository root so that the relative paths
# used in later cells (data/, RF_models/, figures/, python/) resolve.
import os

# The checkout location is machine-specific: set the SEAFLOW_ML_DIR environment
# variable to override the default path instead of editing this cell.
directory_path = os.environ.get(
    'SEAFLOW_ML_DIR',
    '/Users/cristianswift/Desktop/armbrust-lab/Seaflow-Machine-Learning/',
)
os.chdir(directory_path)


In [19]:
# Execute the model-preparation notebook before continuing.
# NOTE(review): pro_df/syn_df/nano_df/pico_df and the matching features_* objects
# used further down are not defined anywhere in this notebook -- presumably this
# %run puts them into the kernel namespace; confirm against that notebook.
%run python/04_Populations-model-fitting/01_model-preparation.ipynb


In [20]:
import pandas as pd

covari_path = 'data/modified/RF_ready_covari.csv'
# Read the covariate table into a DataFrame. The timestamps are in the 'time'
# column; column 0 ('Unnamed: 0') only holds row numbers, so the date column is
# selected by name rather than by position.
covari = pd.read_csv(covari_path, parse_dates=['time'])
# Take a peek at the first rows
covari.head(3)


Unnamed: 0,time,population,lat,lon,biomass,salin,temp,cruisename,SiO2,POSi,...,NH4,FeT,DOP,DON,DOFe,DOC,DIC,CDOM,ALK,par
0,2016-04-20 07:00:00,Prochlorococcus,21.520326,-158.326984,10.520443,34.893785,24.351745,KOK1606,-0.022845,-0.000127,...,1.282981,1.5e-05,0.013734,0.248717,1.7e-05,1.648093,1697.874775,3.4e-05,1954.87665,0.0193
1,2016-04-20 07:00:00,Synechococcus,21.520326,-158.326984,0.341429,34.893785,24.351745,KOK1606,-0.022845,-0.000127,...,1.282981,1.5e-05,0.013734,0.248717,1.7e-05,1.648093,1697.874775,3.4e-05,1954.87665,0.0193
2,2016-04-20 07:00:00,nanoeukaryotes (2-5µm),21.520326,-158.326984,3.338212,34.893785,24.351745,KOK1606,-0.022845,-0.000127,...,1.282981,1.5e-05,0.013734,0.248717,1.7e-05,1.648093,1697.874775,3.4e-05,1954.87665,0.0193


### First we will load each random forest model into our notebook

In [21]:
import joblib


def _load_forest(tag):
    """Load the fitted random forest stored under RF_models/ for one population tag."""
    return joblib.load(f"RF_models/{tag}_random_forest.joblib")


rf_pro = _load_forest("pro")    # Prochlorococcus
rf_syn = _load_forest("syn")    # Synechococcus
rf_nano = _load_forest("nano")  # Nanoeukaryotes
rf_pico = _load_forest("pico")  # Picoeukaryotes

## To compute residuals we will predict on all available data for each picophytoplankton population

In [24]:
def predictions_and_residuals(rf, features, pop_df):
    """Attach model predictions and residuals to a population dataframe.

    Parameters
    ----------
    rf : object
        Fitted model exposing ``predict(features)``.
    features : array-like
        Model inputs, one row per row of ``pop_df`` and in the same order.
    pop_df : pandas.DataFrame
        Observations for one population. The observed values are read from the
        'biomass' column (or from 'actual' if the frame was already processed).

    Returns
    -------
    pandas.DataFrame
        A new frame in which 'biomass' is renamed to 'actual' and two columns
        are added: 'prediction' and 'residuals' (actual - prediction).
        ``pop_df`` itself is left untouched.

    Raises
    ------
    ValueError
        If the number of predictions does not match the number of rows.
    """
    predictions = rf.predict(features)

    if len(predictions) != len(pop_df):
        raise ValueError(
            f"Got {len(predictions)} predictions for {len(pop_df)} rows; "
            "features and pop_df must describe the same observations."
        )

    # rename() without inplace returns a new frame, so the caller's dataframe
    # is not modified and re-running the calling cell stays safe.
    result = pop_df.rename(columns={'biomass': 'actual'})
    result['prediction'] = predictions
    result['residuals'] = result['actual'] - result['prediction']

    return result


# Score every population with its own forest; each frame is replaced by the
# version that carries the 'prediction' and 'residuals' columns.
pro_df = predictions_and_residuals(rf_pro, features_pro, pro_df)
syn_df = predictions_and_residuals(rf_syn, features_syn, syn_df)
nano_df = predictions_and_residuals(rf_nano, features_nano, nano_df)
pico_df = predictions_and_residuals(rf_pico, features_pico, pico_df)


In [26]:
import plotly.graph_objects as go

def create_globe_scatter_plot(df, title_prefix="Prochlorococcus"):
    """Plot residuals on an orthographic globe, display it, and save it as HTML.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'lat', 'lon' and 'residuals' columns.
    title_prefix : str
        Population label used in the figure title and in the output file name.

    Returns
    -------
    None
        The figure is shown and written to
        ``figures/globe_heatmap-residuals_<title_prefix>.html``.
    """
    residuals = df['residuals']

    # Geographic scatter: one marker per observation, coloured by residual.
    fig = go.Figure(data=go.Scattergeo(
        lat=df['lat'],
        lon=df['lon'],
        mode='markers',
        marker=dict(
            size=4,
            color=residuals,
            colorscale='RdBu',
            # Series.min()/max() skip NaN; the builtin min()/max() compare
            # element by element and are unreliable when NaN is present.
            # NOTE(review): the range is the data range, so the midpoint of the
            # diverging scale is not necessarily a zero residual.
            cmin=residuals.min(),
            cmax=residuals.max(),
            colorbar=dict(title='Residuals (pgC/L)')
        )
    ))

    # Set the projection type to 'orthographic' for a globe
    fig.update_geos(projection_type='orthographic')

    # Set the title
    title = f"{title_prefix} Biomass Residuals"
    fig.update_layout(title=title)

    # Show the figure
    fig.show()

    # Make sure the output folder exists before saving the interactive figure.
    os.makedirs("figures", exist_ok=True)
    fig.write_html(f"figures/globe_heatmap-residuals_{title_prefix}.html")


In [27]:
# One globe per population, drawn in the same order as before.
for population_df, population_label in (
    (pro_df, 'Prochlorococcus'),
    (syn_df, 'Synechococcus'),
    (pico_df, 'Picoeukaryote'),
    (nano_df, 'Nanoeukaryotes'),
):
    create_globe_scatter_plot(df=population_df, title_prefix=population_label)

In [28]:
# !pip install contextily

In [29]:
#!pip install plotly==5.15.0
