## In this notebook we will calculate residuals from our Random Forest models

In [32]:
import math

#### Setting our working directory

In [33]:
# Set a working directory
import os

# All relative paths used later in the notebook are resolved against this
# directory. It defaults to the original project checkout; set the
# SEAFLOW_ML_DIR environment variable to run the notebook from another machine
# or location without editing this cell.
directory_path = os.environ.get(
    'SEAFLOW_ML_DIR',
    '/Users/cristianswift/Desktop/armbrust-lab/Seaflow-Machine-Learning/',
)
os.chdir(directory_path)


In [34]:
# Execute the model-preparation notebook before continuing. The path is
# relative, so it resolves against the current working directory.
# NOTE(review): presumably this run is what provides the features_* and *_df
# variables (e.g. features_pro, pro_df) used further down, since they are not
# defined anywhere else in this notebook — confirm.
%run python/04_Populations-model-fitting/01_model-preparation.ipynb


In [35]:
import pandas as pd

covari_path = 'data/modified/RF_ready_covari.csv'
# Read the covariate table into a DataFrame. Column 0 of this CSV is the
# unnamed saved index ("Unnamed: 0"), so the timestamp column is selected by
# name; parsing by position 0 would leave `time` as plain strings.
covari = pd.read_csv(covari_path, parse_dates=['time'])
# Taking a peek at the data
covari.head(4)


Unnamed: 0,time,population,lat,lon,biomass,salin,temp,cruisename,SiO2,POSi,...,NH4,FeT,DOP,DON,DOFe,DOC,DIC,CDOM,ALK,par
0,2016-04-20 07:00:00,Prochlorococcus,21.520326,-158.326984,10.520443,34.893785,24.351745,KOK1606,-0.022845,-0.000127,...,1.282981,1.5e-05,0.013734,0.248717,1.7e-05,1.648093,1697.874775,3.4e-05,1954.87665,0.0193
1,2016-04-20 07:00:00,Synechococcus,21.520326,-158.326984,0.341429,34.893785,24.351745,KOK1606,-0.022845,-0.000127,...,1.282981,1.5e-05,0.013734,0.248717,1.7e-05,1.648093,1697.874775,3.4e-05,1954.87665,0.0193
2,2016-04-20 07:00:00,nanoeukaryotes (2-5µm),21.520326,-158.326984,3.338212,34.893785,24.351745,KOK1606,-0.022845,-0.000127,...,1.282981,1.5e-05,0.013734,0.248717,1.7e-05,1.648093,1697.874775,3.4e-05,1954.87665,0.0193
3,2016-04-20 07:00:00,picoeukaryotes (< 2µm),21.520326,-158.326984,0.701902,34.893785,24.351745,KOK1606,-0.022845,-0.000127,...,1.282981,1.5e-05,0.013734,0.248717,1.7e-05,1.648093,1697.874775,3.4e-05,1954.87665,0.0193


### First we will load each random forest model into our notebook

In [36]:
import joblib

# Serialized random forests, one per population, listed in the order:
# Prochlorococcus, Synechococcus, nanoeukaryotes, picoeukaryotes.
model_paths = (
    "RF_models/pro_random_forest.joblib",
    "RF_models/syn_random_forest.joblib",
    "RF_models/nano_random_forest.joblib",
    "RF_models/pico_random_forest.joblib",
)

# Load each model in turn and bind it to its population-specific name
rf_pro, rf_syn, rf_nano, rf_pico = map(joblib.load, model_paths)

## To make residuals we will predict on all available data for each picophytoplankton population

In [37]:
def predictions_and_residuals(rf, features, pop_df):
    """Attach model predictions and percentage residuals to a population table.

    Parameters
    ----------
    rf : object
        Fitted model exposing ``predict(features)``.
    features : array-like or DataFrame
        Feature matrix passed straight to ``rf.predict``; it must yield one
        prediction per row of ``pop_df``.
    pop_df : pandas.DataFrame
        Observations for one population. Its ``biomass`` column (if present)
        is exposed as ``actual`` in the result.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``biomass`` renamed to ``actual`` plus two added
        columns: ``prediction`` and ``residuals``, where ``residuals`` is
        ``(actual - prediction) / actual * 100``. Rows whose ``actual`` is 0
        get inf/NaN residuals from the division.

    Notes
    -----
    ``pop_df`` itself is left untouched, so re-running the calling cell always
    starts from the same input instead of a frame altered by a previous run.
    """
    # Use the forest's predict method on the supplied features
    predictions = rf.predict(features)

    # rename() without inplace returns a new frame, so the caller's data is
    # never modified by the column assignments below
    result = pop_df.rename(columns={'biomass': 'actual'})
    result['prediction'] = predictions
    result['residuals'] = ((result['actual'] - result['prediction']) / result['actual']) * 100

    return result



# Score every population with its own forest and feature matrix; each *_df is
# rebound to the table returned by predictions_and_residuals.
pro_df = predictions_and_residuals(rf_pro, features_pro, pro_df)
syn_df = predictions_and_residuals(rf_syn, features_syn, syn_df)
nano_df = predictions_and_residuals(rf_nano, features_nano, nano_df)
pico_df = predictions_and_residuals(rf_pico, features_pico, pico_df)


In [46]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def create_actual_prediction_plots(pop_df, title_prefix='Prochlorococcus'):
    """Plot actual vs. predicted values over time, one subplot per cruise.

    The figure is shown and then written to
    ``figures/<title_prefix>/Actual_vs_Prediction-<title_prefix>.html``
    (the directory is created if it does not exist yet).

    Parameters
    ----------
    pop_df : pandas.DataFrame
        Must contain the columns ``cruisename``, ``time``, ``actual`` and
        ``prediction``.
    title_prefix : str
        Population label used in the figure title and in the output path.

    Raises
    ------
    ValueError
        If ``pop_df`` contains no cruises, since there is nothing to plot.
    """
    unique_cruises = pop_df['cruisename'].unique()
    num_cruises = len(unique_cruises)
    if num_cruises == 0:
        raise ValueError("pop_df contains no cruises to plot")

    # Two subplots per row; round up so an odd cruise count gets a final row
    rows = (num_cruises + 1) // 2
    fig = make_subplots(rows=rows, cols=2, subplot_titles=list(unique_cruises))

    # Define colors for 'actual' and 'prediction' traces
    actual_color = 'blue'
    prediction_color = 'red'

    # Iterate over each unique cruise and add a subplot
    for i, cruise in enumerate(unique_cruises):
        # Filter dataframe for the current cruise
        cruise_df = pop_df[pop_df['cruisename'] == cruise]

        # Grid position is 1-based: fill left to right, then top to bottom
        row_idx, col_idx = divmod(i, 2)
        row = row_idx + 1
        col = col_idx + 1

        # Only the first subplot contributes legend entries; the shared
        # legendgroup keeps one 'Actual'/'Prediction' pair for all cruises
        # instead of repeating the pair once per cruise.
        show_in_legend = i == 0
        fig.add_trace(go.Scatter(x=cruise_df['time'], y=cruise_df['actual'], mode='markers', name='Actual',
                                 legendgroup='Actual', showlegend=show_in_legend,
                                 marker=dict(color=actual_color)),
                      row=row, col=col)
        fig.add_trace(go.Scatter(x=cruise_df['time'], y=cruise_df['prediction'], mode='lines', name='Prediction',
                                 legendgroup='Prediction', showlegend=show_in_legend,
                                 line=dict(color=prediction_color)),
                      row=row, col=col)
        fig.update_xaxes(title_text='Time', row=row, col=col)
        fig.update_yaxes(title_text='Value', row=row, col=col)

    # Update the layout and display the figure; the population name is part of
    # the title so figures for different populations can be told apart
    fig.update_layout(height=600 * rows, width=800,
                      title_text=f'{title_prefix}: Actual and Prediction for Each Cruise')
    fig.show()

    # write_html does not create missing folders, so make sure the target exists
    output_dir = f"figures/{title_prefix}"
    os.makedirs(output_dir, exist_ok=True)
    fig.write_html(f"{output_dir}/Actual_vs_Prediction-{title_prefix}.html")


In [47]:
# Draw and save the actual-vs-prediction figure for every population
for population_df, population_name in (
    (pro_df, 'Prochlorococcus'),
    (syn_df, 'Synechococcus'),
    (nano_df, 'Nanoeukaryotes'),
    (pico_df, 'Picoeukaryotes'),
):
    create_actual_prediction_plots(pop_df=population_df, title_prefix=population_name)

In [40]:
import plotly.graph_objects as go

def create_globe_scatter_plot(pop_df, title_prefix="Prochlorococcus", start_lat=0, start_lon=0):
    """Map percentage residuals as colored markers on an orthographic globe.

    The figure is shown and then written to
    ``figures/<title_prefix>/globe_heatmap-residuals_<title_prefix>.html``
    (the directory is created if it does not exist yet).

    Parameters
    ----------
    pop_df : pandas.DataFrame
        Must contain the columns ``lat``, ``lon`` and ``residuals``.
    title_prefix : str
        Population label used in the figure title and in the output path.
    start_lat, start_lon : number
        Currently unused: the view is fitted to the plotted locations. They
        are kept so existing calls that pass them keep working.
    """
    # Geographic scatter of the observations, colored by residual. The color
    # range is clamped to [-50, 50] on the 'Spectral' scale.
    fig = go.Figure(data=go.Scattergeo(
        lat=pop_df['lat'],
        lon=pop_df['lon'],
        mode='markers',
        marker=dict(
            size=4,
            color=pop_df['residuals'],
            colorscale='Spectral',
            cmin=-50,
            cmax=50,
            colorbar=dict(title='Residuals (%)')
        )
    ))

    # Set the projection type to 'orthographic' for a globe, fitbounds centers the plot on data
    fig.update_geos(projection_type='orthographic', fitbounds="locations")
    # Set the title
    title = f"{title_prefix} Biomass Residuals"
    fig.update_layout(title=title)

    # Show the figure
    fig.show()

    # write_html does not create missing folders, so make sure the target exists
    output_dir = f"figures/{title_prefix}"
    os.makedirs(output_dir, exist_ok=True)
    fig.write_html(f"{output_dir}/globe_heatmap-residuals_{title_prefix}.html")


In [41]:
# Draw and save the residual globe for every population
for population_df, population_name in (
    (pro_df, 'Prochlorococcus'),
    (syn_df, 'Synechococcus'),
    (pico_df, 'Picoeukaryotes'),
    (nano_df, 'Nanoeukaryotes'),
):
    create_globe_scatter_plot(pop_df=population_df, title_prefix=population_name)

In [42]:
# !pip install contextily

In [43]:
#!pip install plotly==5.15.0
