## In This Notebook we will calculate residuals from our Random Forest Models

In [109]:
import math

#### Making our wokring directory

In [110]:
# Set a working directory
import os

directory_path = '/Users/cristianswift/Desktop/armbrust-lab/Seaflow-Machine-Learning/'
os.chdir(directory_path)


In [111]:
%run python/04_Populations-model-fitting/01_model-preparation.ipynb


In [112]:
import pandas as pd

covari_path = 'data/modified/RF_ready_covari.csv'
#using pandas to read in as a df
covari = (pd.read_csv(covari_path,parse_dates=[0]))
#taking a peak at the data
covari.head(3)


Unnamed: 0,time,population,lat,lon,biomass,salin,temp,cruisename,SiO2,POSi,...,NO2,NH4,FeT,DOP,DON,DOFe,DOC,DIC,CDOM,ALK
0,2016-04-20 07:00:00,Prochlorococcus,21.520326,-158.326984,10.520443,34.893785,24.351745,KOK1606,-0.022845,-0.000127,...,0.295276,1.282981,1.5e-05,0.013734,0.248717,1.7e-05,1.648093,1697.874775,3.4e-05,1954.87665
1,2016-04-20 07:00:00,Synechococcus,21.520326,-158.326984,0.341429,34.893785,24.351745,KOK1606,-0.022845,-0.000127,...,0.295276,1.282981,1.5e-05,0.013734,0.248717,1.7e-05,1.648093,1697.874775,3.4e-05,1954.87665
2,2016-04-20 07:00:00,nanoeukaryotes (2-5µm),21.520326,-158.326984,3.338212,34.893785,24.351745,KOK1606,-0.022845,-0.000127,...,0.295276,1.282981,1.5e-05,0.013734,0.248717,1.7e-05,1.648093,1697.874775,3.4e-05,1954.87665


### First we will load each random forest model into our notebook

In [113]:
import joblib
# Prochlorooccus
rf_pro = joblib.load("RF_models/pro_random_forest.joblib")

# Synechococus
rf_syn = joblib.load("RF_models/syn_random_forest.joblib")

# Nanoeukaryotes
rf_nano = joblib.load("RF_models/nano_random_forest.joblib")

# Picoeukaryotes
rf_pico = joblib.load("RF_models/pico_random_forest.joblib")

## To make residuals we will predict on all available data for each picophytoplankton's population

In [114]:
def predictions_and_residuals(rf, features, pop_df):
    # Filter dataframes based on PopulationName
    
    # Use the forest's predict method on the test data
    predictions = rf.predict(features)
    
    # Add 'Prediction' and 'residuals' columns to the dataframe

    pop_df.rename(columns={'biomass': 'actual'}, inplace=True)
    pop_df['Prediction'] = predictions
    pop_df['residuals'] = pop_df['actual'] - pop_df['prediction']
        
    
    return df


pro_df = predictions_and_residuals(rf=rf_pro, features=features_pro, pop_df=pro_df)
syn_df = predictions_and_residuals(rf=rf_syn, features=features_syn, pop_df=syn_df)
nano_df = predictions_and_residuals(rf=rf_nano, features=features_nano, pop_df=nano_df)
pico_df = predictions_and_residuals(rf=rf_pico, features=features_pico, pop_df=pico_df)

KeyError: 'prediction'

In [105]:
import plotly.graph_objects as go

def create_globe_scatter_plot(df, title_prefix="Prochlorococcus"):
    # Create a 3D scatter plot on a globe
    fig = go.Figure(data=go.Scattergeo(
        lat=df['lat'],
        lon=df['lon'],
        mode='markers',
        marker=dict(
            size=4,
            color=df['residuals'],
            colorscale='RdBu',
            cmin=min(df['residuals']),
            cmax=max(df['residuals']),
            colorbar=dict(title='Residuals (pgC/L)')
        )
    ))

    # Set the projection type to 'orthographic' for a globe
    fig.update_geos(projection_type='orthographic')

    # Set the title
    title = f"{title_prefix} Biomass Residuals"
    fig.update_layout(title=title)

    # Show the figure
    fig.show()
    f"{title_prefix} Biomass Residuals"
    fig.write_html(f"figures/globe_heatmap-residuals_{title_prefix}.html")


In [106]:
pro_df

Unnamed: 0,time,population,lat,lon,Actual,salin,temp,cruisename,SiO2,POSi,...,FeT,DOP,DON,DOFe,DOC,DIC,CDOM,ALK,Prediction,Errors
1,2016-04-20 07:00:00,Synechococcus,21.520326,-158.326984,0.341429,34.893785,24.351745,KOK1606,-0.022845,-0.000127,...,0.000015,0.013734,0.248717,0.000017,1.648093,1697.874775,0.000034,1954.876650,0.382262,-0.040833
5,2016-04-20 08:00:00,Synechococcus,21.662710,-158.323430,0.374117,34.902376,24.339265,KOK1606,-0.022845,-0.000127,...,0.000015,0.013734,0.248717,0.000017,1.648093,1697.874775,0.000034,1954.876650,0.382481,-0.008364
9,2016-04-20 09:00:00,Synechococcus,21.802385,-158.305650,0.400944,34.880590,24.320725,KOK1606,-0.021982,-0.000120,...,0.000017,0.012739,0.235864,0.000016,1.528731,1699.677975,0.000031,1956.261750,0.398275,0.002669
13,2016-04-20 10:00:00,Synechococcus,21.943210,-158.289675,0.391992,34.884053,24.310826,KOK1606,-0.021982,-0.000120,...,0.000017,0.012739,0.235864,0.000016,1.528731,1699.677975,0.000031,1956.261750,0.398275,-0.006284
17,2016-04-20 11:00:00,Synechococcus,22.081630,-158.284815,0.404445,34.882005,24.312820,KOK1606,-0.021982,-0.000120,...,0.000017,0.012739,0.235864,0.000016,1.528731,1699.677975,0.000031,1956.261750,0.398275,0.006170
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10892,2021-12-29 21:00:00,Synechococcus,32.640114,-117.639691,13.311190,33.460921,15.293194,TN398,0.363296,0.099231,...,0.000497,0.164132,2.736920,0.000178,19.695796,1819.587625,0.000756,2008.417775,7.786000,5.525190
10896,2021-12-29 22:00:00,Synechococcus,32.640186,-117.520431,10.638536,33.461781,15.104845,TN398,0.363296,0.099231,...,0.000497,0.164132,2.736920,0.000178,19.695796,1819.587625,0.000756,2008.417775,7.851386,2.787150
10900,2021-12-29 23:00:00,Synechococcus,32.643083,-117.531917,8.042523,33.471363,15.314338,TN398,0.363296,0.099231,...,0.000497,0.164132,2.736920,0.000178,19.695796,1819.587625,0.000756,2008.417775,7.786000,0.256523
10904,2021-12-30 00:00:00,Synechococcus,32.673493,-117.545342,7.710090,33.468151,15.189021,TN398,0.363296,0.099231,...,0.000497,0.164132,2.736920,0.000178,19.695796,1819.587625,0.000756,2008.417775,7.851386,-0.141296


In [107]:
create_globe_scatter_plot(df=pro_df, title_prefix='Prochlorococcus')
create_globe_scatter_plot(df=syn_df, title_prefix='Synechococcus')
create_globe_scatter_plot(df=nano_df, title_prefix='Picoeukaryote')

KeyError: 'residuals'

In [None]:
import matplotlib.pyplot as plt

# Extract the 'lat' and 'residuals' columns from the rf_pro dataframe
latitudes = rf_pro['lat']
residuals = rf_pro['residuals']

# Create a bar chart
plt.bar(latitudes, residuals)

# Set the x-axis label
plt.xlabel('Latitude')

# Set the y-axis label
plt.ylabel('Residuals')

# Set the title of the chart
plt.title('Bar Chart: Latitude vs. Residuals prochlorococcus')

# Display the chart
plt.show()


In [None]:
# !pip install contextily

In [None]:
#!pip install plotly==5.15.0
