# This notebook is the last step in preparing the data for the Random Forest Regressors that predict biomass for each population

Here we will separate the dataframes by population and split each one into features (variables used for prediction) and the label (what we are trying to predict, biomass).

This notebook also includes a few functions we will use within the notebook and within each population's random forest model building notebooks.

#### Set a working directory

In [1]:
# Set the working directory to the root of the enclosing git repository so the
# relative paths used later in this notebook (e.g. 'data/...', 'figures/...')
# resolve the same way no matter where the notebook was launched from.
# Requires the GitPython package; uncomment the next line to install it.
#!pip install GitPython
import git
import os

# Search the current directory and its parents for a git repository.
repo = git.Repo('.', search_parent_directories=True)


# Change into the repository's working tree root.
os.chdir(repo.working_tree_dir)


In [2]:
import pandas as pd

# Path of the covariate table, relative to the working directory set above
covari_path = 'data/modified/RF_ready_covari.csv'
# Load the table into a DataFrame, parsing the first column as datetimes
covari = pd.read_csv(filepath_or_buffer=covari_path, parse_dates=[0])
# Peek at the first three rows
covari.head(n=3)



Unnamed: 0,time,cruise,lat,lon,abundance_pro,abundance_syn,abundance_pico,abundance_croco,biomass_pro,biomass_syn,...,sss,sst,ugos,vgos,Fe,O2,NO3,PO4,Si,hours_since_sunrise
0,2015-05-22 22:00:00,KM1508,21.3434,-158.2737,135.216812,2.021318,1.456863,0.006307,4.024661,0.337763,...,34.571716,25.653118,0.005764,-0.132531,8.8e-05,216.794167,4.269278e-07,0.345151,9.464704,6.129444
1,2015-05-22 23:00:00,KM1508,21.343533,-158.273744,136.856649,2.437622,1.774607,0.007009,4.167834,0.413687,...,34.571716,25.653118,0.005764,-0.132531,8.8e-05,216.794167,4.269278e-07,0.345151,9.464704,7.129444
2,2015-05-23 00:00:00,KM1508,21.346175,-158.27415,130.873523,3.810792,2.01813,0.006307,4.65436,0.654208,...,34.609317,25.646243,-0.002256,-0.132022,8.8e-05,216.794167,4.269278e-07,0.345151,9.464704,8.129722


### We have data for 4 types of phytoplankton, here we will split the df into one df per population

In [3]:
def _drop_other_populations(frame, keep):
    """Return a copy of `frame` without the biomass/abundance columns of every population except `keep`."""
    others = [pop for pop in ('pro', 'syn', 'pico', 'croco') if pop != keep]
    unwanted = [f'{measure}_{pop}' for measure in ('biomass', 'abundance') for pop in others]
    return frame.drop(columns=unwanted)


# One dataframe per population, each keeping only its own biomass/abundance columns
covari_pro = _drop_other_populations(covari, 'pro')
covari_syn = _drop_other_populations(covari, 'syn')
covari_pico = _drop_other_populations(covari, 'pico')
covari_croco = _drop_other_populations(covari, 'croco')

In [4]:
def population_dfer(covari_pop, pop_name):
    """
    Strip the '_<pop_name>' suffix from the column names of a population dataframe.

    For example, with pop_name='pro' the column 'biomass_pro' becomes 'biomass'.
    Only names that END with the suffix are changed, so an unrelated column that
    merely contains the same characters (e.g. 'x_profile') is left untouched.

    Parameters
    ----------
    covari_pop : pandas.DataFrame
        Dataframe whose columns are renamed. It is modified in place.
    pop_name : str
        Population tag used in the column names (e.g. 'pro', 'syn').

    Returns
    -------
    None
        The rename happens in place on `covari_pop`.
    """
    suffix = '_' + pop_name

    def _strip_suffix(column):
        # Remove the suffix only when it terminates the column name.
        if column.endswith(suffix):
            return column[:-len(suffix)]
        return column

    covari_pop.rename(columns=_strip_suffix, inplace=True)

In [5]:
# Strip the population tag from the column names so all four dataframes share one schema
for _frame, _pop in (
    (covari_syn, 'syn'),
    (covari_pro, 'pro'),
    (covari_pico, 'pico'),
    (covari_croco, 'croco'),
):
    population_dfer(_frame, _pop)


In [6]:
# Drop the abundance column from every population dataframe, since only biomass
# is predicted for now.
# errors='ignore' keeps this cell idempotent: the drop is in place, so re-running
# the cell after the column is already gone would otherwise raise a KeyError.
for _frame in (covari_pro, covari_syn, covari_pico, covari_croco):
    _frame.drop(columns=['abundance'], inplace=True, errors='ignore')

In [18]:
# Show the column names left in the Prochlorococcus dataframe after the renames and drops above
print(covari_pro.columns)

Index(['time', 'cruise', 'lat', 'lon', 'biomass', 'ALK', 'sss', 'sst', 'ugos',
       'vgos', 'Fe', 'O2', 'NO3', 'PO4', 'Si', 'hours_since_sunrise'],
      dtype='object')


In [7]:
import numpy as np

def preprocess_single_population(covari_population):
    """
    Split one population's dataframe into the pieces used for model building.

    Parameters
    ----------
    covari_population : pandas.DataFrame
        Dataframe for a single population. It must contain the columns
        'time', 'biomass', 'lat', 'lon' and 'cruise'; every other column is
        treated as a feature.

    Returns
    -------
    tuple
        (pop_df, labels, features, feature_list) where
        - pop_df is the input dataframe itself, unchanged (all rows kept),
        - labels is a NumPy array of the 'biomass' values with the first row removed,
        - features is a NumPy array of the feature columns with the first row removed,
        - feature_list holds the feature column names in the column order of `features`.
    """
    # The label is the biomass column; the first observation is discarded.
    labels = np.delete(np.array(covari_population.biomass), 0, axis=0)

    # Everything that is not metadata or the label is a predictor.
    non_feature_columns = ['time', 'biomass', 'lat', 'lon', 'cruise']
    predictors = covari_population.drop(columns=non_feature_columns)

    # Keep the feature names for later use.
    feature_list = predictors.columns.tolist()

    # Drop the first observation here too, so features stay aligned with labels.
    features = np.delete(predictors.to_numpy(), 0, axis=0)

    return covari_population, labels, features, feature_list




### Using the preprocess_single_population function for all of the populations

In [17]:
# Build the dataframe, labels and features for the observed Prochlorococcus population


pro_df, labels_pro, features_pro, feature_list_pro = preprocess_single_population(covari_pro)
# Check the shapes and the type to make sure the model can process them
print(features_pro.shape)
print(type(features_pro))
print(labels_pro.shape)

(4679, 11)
<class 'numpy.ndarray'>
(4679,)


In [9]:
# Build the dataframe, labels and features for the observed Synechococcus population


syn_df, labels_syn, features_syn, feature_list_syn = preprocess_single_population(covari_syn)

In [10]:
# Build the dataframe, labels and features for the observed Picoeukaryotes population


pico_df, labels_pico, features_pico, feature_list_pico = preprocess_single_population(covari_pico)

In [11]:
# Build the dataframe, labels and features for the observed 'croco' population
# NOTE(review): the original comment called this population "Nanoeukaryotes", but the
# 'croco' prefix suggests Crocosphaera - confirm which name is correct.


croco_df, labels_croco, features_croco, feature_list_croco = preprocess_single_population(covari_croco)

## Defining a function for finding the optimal testing to training ratio

Used in the population-specific random forest model notebooks. This function graphs the Root Mean Square Error (RMSE) against the testing-to-training ratio of the data used in the model. The aim is to choose the highest testing-to-training ratio at which the RMSE starts to fall most dramatically. Keeping a higher testing-to-training ratio keeps our model more generalizable and helps prevent overfitting. This function is called in each population-specific notebook to find the optimal testing-to-training ratio for that population's model.

In [12]:
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import numpy as np

def testing_training_ratio(features, labels, feature_list, title_prefix):
    """
    Plot K-fold cross-validated RMSE against the testing fraction implied by the fold count.

    For each number of folds in [2, 4, 6, 8, 12, 16] the data are split with an
    unshuffled KFold. A random forest is fitted on the training part of every fold
    and scored on the held-out part, and one (test ratio, RMSE) pair is recorded
    per fold, where the test ratio is 1 / n_splits.

    The resulting plot is saved to 'figures/<title_prefix>/RMSEsByFolds.png'
    (the folder is created if needed) and then shown.

    Parameters
    ----------
    features : array supporting integer-array indexing (e.g. numpy.ndarray)
        Feature matrix, one row per observation.
    labels : array supporting integer-array indexing (e.g. numpy.ndarray)
        Target values aligned with the rows of `features`.
    feature_list : list
        Feature names. Not used here; kept so the signature matches the other
        ratio helper in this notebook.
    title_prefix : str
        Text placed at the start of the plot title; also the name of the
        sub-folder of 'figures/' that the plot is written to.

    Returns
    -------
    dict
        {'Test_Ratio': [...], 'RMSE': [...]} with one entry per evaluated fold.
    """
    import os  # local import: only needed to create the output folder

    RMSEs = {'Test_Ratio': [], 'RMSE': []}
    # Numbers of folds to try
    splits = [2, 4, 6, 8, 12, 16]
    for n_splits in splits:
        kf = KFold(n_splits=n_splits, shuffle=False)
        # One model and one RMSE per fold
        for train_index, test_index in kf.split(features):
            train_features, test_features = features[train_index], features[test_index]
            train_labels, test_labels = labels[train_index], labels[test_index]

            rf = RandomForestRegressor(n_estimators=80, max_depth=20, max_features='sqrt', random_state=42)
            rf.fit(train_features, train_labels)

            # Use the forest's predict method on the test data
            predictions = rf.predict(test_features)

            # Take the square root of the MSE ourselves: the `squared=False`
            # keyword of mean_squared_error is deprecated and absent from newer
            # scikit-learn releases, while this form works on every version.
            RMSE = np.sqrt(mean_squared_error(test_labels, predictions))
            RMSEs['RMSE'].append(RMSE)
            # The test ratio for n-fold cross-validation is 1/n_splits
            RMSEs['Test_Ratio'].append(1/n_splits)

    # Extract Test Ratios and RMSEs from the dictionary
    test_ratios = RMSEs['Test_Ratio']
    rmse_values = RMSEs['RMSE']

    # Create a line plot
    plt.figure(figsize=(10, 6))
    plt.plot(test_ratios, rmse_values, marker='o')

    # Fill the area under the curve
    plt.fill_between(test_ratios, rmse_values, alpha=0.3)

    plt.xlabel('Testing:Training Ratio', fontsize=15)
    plt.ylabel('RMSE of Biomass (pgC/L)', fontsize=15)
    plt.title(f"{title_prefix} - RMSE of Biomass vs. Testing: Training Ratio", fontsize=22)

    plt.xlim(0, 1)

    plt.xticks([i/10 for i in range(11)])  # Set the x-axis tick locations at 0.1 increments
    plt.gca().invert_xaxis()

    plt.grid(True)

    plt.tight_layout()  # Improves spacing between the plot elements

    # savefig does not create missing folders, so make sure the target exists first
    os.makedirs(f"figures/{title_prefix}", exist_ok=True)
    plt.savefig(f"figures/{title_prefix}/RMSEsByFolds.png")
    plt.show()

    return RMSEs



In [13]:
def testing_training_ratio_random(features, labels, feature_list, title_prefix):
    """
    Plot RMSE against the testing fraction using random train/test splits.

    Used in the population-specific random forest notebooks. For every test
    fraction from 0.05 to 0.90 in steps of 0.05 the data are split with
    train_test_split (random_state=42), a random forest is fitted on the
    training part and its RMSE on the test part is recorded. The aim is to
    choose the highest testing-to-training ratio at which the RMSE starts to
    fall most dramatically.

    Parameters
    ----------
    features : array-like
        Feature matrix, one row per observation.
    labels : array-like
        Target values aligned with the rows of `features`.
    feature_list : list
        Feature names. Not used in the computation; kept for interface
        compatibility with existing callers.
    title_prefix : str
        Text placed at the start of the plot title.

    Returns
    -------
    dict
        {'Test_Ratio': [...], 'RMSE': [...]} with one entry per test fraction.
    """
    import matplotlib.pyplot as plt
    import numpy as np
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.metrics import mean_squared_error
    # train_test_split is not imported anywhere else in this notebook
    from sklearn.model_selection import train_test_split

    RMSEs = {'Test_Ratio': [], 'RMSE': []}

    # Test fractions 0.05, 0.10, ..., 0.90 (rounded to avoid float noise)
    test_fractions = [round(i / 20.0, ndigits=2) for i in range(1, 19)]

    for test_fraction in test_fractions:
        RMSEs['Test_Ratio'].append(test_fraction)
        # train_test_split controls the training to testing ratio
        train_features, test_features, train_labels, test_labels = train_test_split(
            features, labels, test_size=test_fraction, random_state=42
        )
        rf = RandomForestRegressor(n_estimators=80, max_depth=10, max_features='sqrt', random_state=42)
        rf.fit(train_features, train_labels)

        # Use the forest's predict method on the test data
        predictions = rf.predict(test_features)

        # RMSE gives relatively high weight to large errors. The square root is
        # taken explicitly because the `squared=False` keyword of
        # mean_squared_error is deprecated and absent from newer scikit-learn.
        RMSE = np.sqrt(mean_squared_error(test_labels, predictions))
        RMSEs['RMSE'].append(RMSE)

    # Extract Test Ratios and RMSEs from the dictionary
    test_ratios = RMSEs['Test_Ratio']
    rmse_values = RMSEs['RMSE']

    # Create a line plot
    plt.figure(figsize=(10, 6))
    plt.plot(test_ratios, rmse_values, marker='o')

    # Fill the area under the curve
    plt.fill_between(test_ratios, rmse_values, alpha=0.3)

    plt.xlabel('Testing:Training Ratio', fontsize=15)
    plt.ylabel('RMSE of Biomass (pgC/L)', fontsize=15)
    plt.title(f"{title_prefix} - RMSE of Biomass vs. Testing: Training Ratio", fontsize=22)

    plt.xlim(0, 1)  # Set the x-axis limits from 0 to 1

    plt.xticks([i/10 for i in range(11)])  # Set the x-axis tick locations at 0.1 increments
    # Invert the x axis so the testing fraction decreases from left to right
    plt.gca().invert_xaxis()

    plt.grid(True)  # Add a grid to the plot

    plt.tight_layout()  # Improves spacing between the plot elements
    plt.show()

    return RMSEs

## Defining a function to plot out-of-bag (OOB) error against the number of trees in the random forest model

This function tests different numbers of trees used in the random forest model and finds the OOB for each number of trees.  This is then called in each population-specific notebook to find the optimal number of trees for that population's model.  

In [14]:
from collections import OrderedDict
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor

def plot_oob_error_vs_num_trees(train_features, train_labels, title_prefix):
    """
    Plot out-of-bag (OOB) error against the number of trees for three forest settings.

    Three RandomForestRegressor configurations are compared, differing only in
    max_features: 'sqrt', 1/3 and None (all features). Each is created with
    warm_start=True, oob_score=True and a fixed random_state. For every tree
    count from 15 up to 128 in steps of 5, the regressor is fitted on each
    (features, labels) pair obtained by zipping the two inputs, the OOB error
    is taken as 1 - oob_score_, and the mean over those pairs is plotted as
    one line per configuration.

    Parameters
    ----------
    train_features, train_labels
        Iterated in parallel with zip(); each resulting pair is passed to
        RandomForestRegressor.fit.
        NOTE(review): this implies both are sequences of per-fold arrays rather
        than a single feature matrix / label vector - confirm against callers.
    title_prefix : str
        Text placed at the start of the figure title.

    Returns
    -------
    None
        The figure is shown with plt.show(); nothing is returned.

    Notes
    -----
    Relies on `np` and `plt` being available at notebook level.
    NOTE(review): with warm_start=True the same estimator object is reused for
    every fold and every tree count. Presumably a refit with an unchanged
    n_estimators adds no new trees, in which case later folds would not
    contribute an independent OOB score - verify against the scikit-learn
    warm_start documentation.
    """
    # Fixed seed shared by all three configurations
    RANDOM_STATE = 42

    # (legend label, estimator) pairs; only max_features differs between them
    ensemble_clfs = [
        (
            "max_features='sqrt(n)'",
            RandomForestRegressor(
                warm_start=True,
                max_features="sqrt",
                oob_score=True,
                random_state=RANDOM_STATE,
            ),
        ),
        (
            "max_features='1/3 n'",
            RandomForestRegressor(
                warm_start=True,
                max_features=1/3,
                oob_score=True,
                random_state=RANDOM_STATE,
            ),
        ),
        (
            "max_features= n",
            RandomForestRegressor(
                warm_start=True,
                max_features=None,
                oob_score=True,
                random_state=RANDOM_STATE,
            ),
        ),
    ]

    # Maps each legend label to a list of (n_estimators, mean OOB error) pairs
    error_rate = OrderedDict((label, []) for label, _ in ensemble_clfs)

    # Range of tree counts explored (stepped by 5 in the loop below)
    min_estimators = 15
    max_estimators = 128

    for label, clf in ensemble_clfs:
        for i in range(min_estimators, max_estimators + 1, 5):
            # OOB errors collected across the zipped (features, labels) pairs
            oob_errors = []
            for fold_features, fold_labels in zip(train_features, train_labels):
                clf.set_params(n_estimators=i)
                clf.fit(fold_features, fold_labels)
                # oob_score_ is a score, so the error is its complement
                oob_error = 1 - clf.oob_score_
                oob_errors.append(oob_error)
            avg_oob_error = np.mean(oob_errors)
            error_rate[label].append((i, avg_oob_error))
    # One line per configuration
    for label, clf_err in error_rate.items():
        xs, ys = zip(*clf_err)
        plt.plot(xs, ys, label=label)

    plt.xlim(min_estimators, max_estimators)
    plt.xlabel("# of Trees")
    plt.ylabel("OOB error rate (1 - R^2)")
    plt.legend(loc="upper right")
    plt.suptitle(f"{title_prefix} - Out-of-Bag Error Rate vs. Number of Trees in Random Forest Regression")
    plt.show()


## Defining functions to compare predicted biomass with actual data

These functions give us a preliminary look at how well the model is predicting biomass.  These are called in each of the population specific notebooks.  

In [15]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import mean_absolute_error
import seaborn as sns


def plot_model_predictions():
    """
    Draw a two-panel comparison of predicted and actual biomass.

    Left panel: overlaid histograms of the actual values and the predictions,
    annotated with the Root Mean Square Percentage Error (RMSPE).
    Right panel: a filled 2-D density of predicted vs. actual values with a
    fitted linear regression line.

    The function takes no arguments. It reads the following names from the
    notebook's global namespace, which must be defined before it is called:
    `test_labels`, `ftu` (used to index `test_labels`), `predictions` and
    `percentage_RMSE`.

    Returns
    -------
    tuple
        (fig, axes) - the matplotlib figure and the array of its two axes.
        The figure is not shown here; the caller decides when to display it.
    """
    import matplotlib.pyplot as plt
    import numpy as np
    import seaborn as sns

    fig, axes = plt.subplots(1, 2, figsize=(14, 7))

    # Actual values matching the predictions, and the shared axis limit
    actual = test_labels[ftu]
    axis_limit = max(test_labels.max(), predictions.max()) - 5

    # First subplot: distributions of actual and predicted values
    ax1 = axes[0]
    sns.histplot(x=actual, fill=True, color="blue", label="True Values", ax=ax1)
    sns.histplot(x=predictions, fill=True, color="red", label="Predictions", ax=ax1)
    ax1.text(0.05, 0.9, 'RMSPE = {:.2f}%'.format(percentage_RMSE), transform=ax1.transAxes, fontsize=20)
    ax1.set_xlabel('Biomass (pgC per L)')
    ax1.set_ylabel('Density')
    ax1.grid(True)
    ax1.legend()

    # Second subplot: predicted vs. actual density with a regression line
    ax2 = axes[1]
    ax2.set_xlabel('Actual Biomass (pgC per L)')
    ax2.set_ylabel('Predicted Biomass (pgC per L)')
    ax2.set_ylim(0, axis_limit)
    ax2.set_xlim(0, axis_limit)
    ax2.set_aspect('equal')
    ax2.grid(True)

    # `fill=True` replaces the deprecated `shade=True` keyword of kdeplot
    sns.kdeplot(x=actual, y=predictions, fill=True, cmap="Reds", ax=ax2)

    # Add linear regression line
    slope, intercept = np.polyfit(actual, predictions, 1)
    ax2.plot(actual, slope*actual + intercept, color='blue', label='Linear Regression Line')
    ax2.legend(loc='upper left')  # Add legend to the upper left corner of the plot

    # Adjust spacing between subplots
    plt.tight_layout()

    return fig, axes



In [16]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import mean_absolute_error
import seaborn as sns

def plot_model_predictions_density():
    """
    Draw a filled 2-D density of predicted vs. actual biomass.

    The plot is annotated with the Root Mean Square Percentage Error (RMSPE)
    and carries two reference lines: a fitted linear regression line and a
    dashed one-to-one line showing where predictions equal the actual values.

    The function takes no arguments. It reads the following names from the
    notebook's global namespace, which must be defined before it is called:
    `test_labels`, `ftu` (used to index `test_labels`), `predictions` and
    `percentage_RMSE`.

    Returns
    -------
    tuple
        (fig, ax) - the matplotlib figure and its single axes.
        The figure is not shown here; the caller decides when to display it.
    """
    import matplotlib.pyplot as plt
    import numpy as np
    import seaborn as sns

    fig, ax = plt.subplots(1, 1, figsize=(7, 7))

    # Actual values matching the predictions, and the largest value on either axis
    actual = test_labels[ftu]
    max_val = max(test_labels.max(), predictions.max())

    # Axis labels, limits and annotation
    ax.set_xlabel('Actual Biomass (pgC per L)', fontsize=24)
    ax.set_ylabel('Predicted Biomass (pgC per L)', fontsize=24)
    ax.set_ylim(0, max_val - 5)
    ax.set_xlim(0, max_val - 5)
    ax.set_aspect('equal')
    ax.grid(True)
    ax.text(0.05, 0.9, 'RMSPE = {:.2f} %'.format(percentage_RMSE), transform=ax.transAxes, fontsize=20)

    # `fill=True` replaces the deprecated `shade=True` keyword of kdeplot
    sns.kdeplot(x=actual, y=predictions, fill=True, cmap="Reds", ax=ax)

    # Add linear regression line
    slope, intercept = np.polyfit(actual, predictions, 1)
    ax.plot(actual, slope*actual + intercept, color='blue', label='Linear Regression')

    # Add black dashed one-to-one line
    ax.plot([0, max_val], [0, max_val], linestyle='--', color='black', label='One-to-One')

    # Show the line labels; placed bottom right so the legend does not cover
    # the RMSPE text in the upper left corner
    ax.legend(loc='lower right')

    # Adjust spacing between subplots
    plt.tight_layout()

    return fig, ax

