In [1]:
# Set a working directory
#!pip install GitPython
import git
import os

# Locate the git repository that contains the current directory, searching
# parent directories if the notebook is not started from the repository root.
repo = git.Repo('.', search_parent_directories=True)


# Switch to the repository's working tree directory so that the relative paths
# used by later cells are resolved against it.
os.chdir(repo.working_tree_dir)

In [2]:
import numpy as np

def preprocess_single_population(covari_population):
    """
    Split a single-population covariate dataframe into labels and features.

    ``covari_population`` must contain a ``biomass`` column (used as the
    labels) together with ``time``, ``lat``, ``lon`` and ``cruise`` columns;
    every remaining column is treated as a feature.  The first row is removed
    from both the labels and the features.

    Returns a tuple ``(pop_df, labels, features, feature_list)`` where
    ``pop_df`` is the input dataframe itself (not a copy), ``labels`` is a
    1-D array of biomass values, ``features`` is a 2-D array with one column
    per feature, and ``feature_list`` holds the feature column names in the
    same order as the columns of ``features``.
    """
    # Labels: an independent copy of the biomass column, minus its first entry.
    biomass_values = np.array(covari_population.biomass, copy=True)
    labels = np.delete(biomass_values, 0, axis=0)

    # Features: everything that is neither the target nor sample metadata.
    non_feature_columns = ['time', 'biomass', 'lat', 'lon', 'cruise']
    feature_frame = covari_population.drop(columns=non_feature_columns)

    # Keep the column names so array columns can be mapped back to features.
    feature_list = feature_frame.columns.tolist()
    features = np.delete(feature_frame.to_numpy(), 0, axis=0)

    return covari_population, labels, features, feature_list


In [3]:
def preprocess_global(covari_population):
    """
    Prepare a dataframe without measured biomass for prediction.

    Works on a copy of ``covari_population``.  A ``biomass`` column filled
    with 0.0 is set on the copy, and a ``time`` column filled with 0.0 is
    added when the input has none.  The ``time``, ``lat``, ``lon`` and
    ``biomass`` columns are then excluded from the features, and the first
    row is removed from both the labels and the features.

    Returns a tuple ``(pop_df, labels, features, feature_list)`` where
    ``pop_df`` is the augmented copy, ``labels`` is the array of placeholder
    biomass values, ``features`` is a 2-D array with one column per feature,
    and ``feature_list`` holds the feature column names in column order.
    """
    pop_df = covari_population.copy()

    # Placeholder columns: biomass is always reset, time only added if absent.
    placeholder_columns = ['biomass'] if 'time' in pop_df.columns else ['biomass', 'time']
    for column_name in placeholder_columns:
        pop_df[column_name] = 0.0

    # Labels are the placeholder biomass values, minus the first entry.
    labels = np.delete(np.array(pop_df.biomass, copy=True), 0, axis=0)

    # Everything except the placeholders and the coordinates is a feature.
    feature_frame = pop_df.drop(columns=['time', 'lat', 'lon', 'biomass'])
    feature_list = feature_frame.columns.tolist()
    features = np.delete(feature_frame.to_numpy(), 0, axis=0)

    return pop_df, labels, features, feature_list

In [4]:
import pandas as pd

# Path to the covariate table, resolved against the current working directory.
covari_path = 'data_ingest/data/modified/RF_ready_covari.csv'
# Read the CSV into a dataframe (column 0 is parsed as dates) and leave out the
# 'ugos' and 'vgos' columns.
covari = pd.read_csv(covari_path, parse_dates=[0]).drop(columns=['ugos', 'vgos'])
# Take a peek at the first rows.
covari.head(3)

Unnamed: 0,time,cruise,lat,lon,biomass_pro,biomass_syn,biomass_pico,biomass_croco,sss,sst,Fe,O2,NO3,PO4,Si,ALK,hours_since_sunrise
0,2015-05-22 22:00:00,KM1508,21.3434,-158.2737,4.024661,0.337763,0.555395,0.009181,34.571716,25.653118,8.8e-05,216.794167,4.269278e-07,0.345151,9.464704,1952.6418,6.129444
1,2015-05-22 23:00:00,KM1508,21.343533,-158.273744,4.167834,0.413687,0.720884,0.013144,34.571716,25.653118,8.8e-05,216.794167,4.269278e-07,0.345151,9.464704,1952.6418,7.129444
2,2015-05-23 00:00:00,KM1508,21.346175,-158.27415,4.65436,0.654208,0.635654,0.008443,34.609317,25.646243,8.8e-05,216.794167,4.269278e-07,0.345151,9.464704,1952.6418,8.129722


In [5]:
# One dataframe per population: each keeps its own biomass_<population> column
# and drops the biomass columns of the other three populations.
covari_pro, covari_syn, covari_pico, covari_croco = (
    covari.drop(columns=[
        f'biomass_{other}'
        for other in ('pro', 'syn', 'pico', 'croco')
        if other != kept
    ])
    for kept in ('pro', 'syn', 'pico', 'croco')
)

In [6]:
def population_dfer(covari_pop, pop_name):
    """
    Strip the ``_<pop_name>`` suffix from the column names of ``covari_pop``.

    For example, with ``pop_name='pro'`` the column ``biomass_pro`` becomes
    ``biomass``.  Only a trailing suffix is removed; a column whose name
    merely contains the same text elsewhere (e.g. ``net_production``) is left
    unchanged.

    The dataframe is renamed in place and the function returns ``None``.
    """
    suffix = '_' + pop_name

    def strip_suffix(column):
        # Anchor the match to the end of the name so unrelated columns that
        # happen to contain the same characters are not mangled.
        if column.endswith(suffix):
            return column[:-len(suffix)]
        return column

    covari_pop.rename(columns=strip_suffix, inplace=True)

In [7]:
# Rename the columns of each per-population dataframe in place, removing the
# '_<population>' text so that e.g. 'biomass_syn' becomes 'biomass' and all four
# frames share the same column names.
population_dfer(covari_syn, 'syn')
population_dfer(covari_pro, 'pro')
population_dfer(covari_pico, 'pico')
population_dfer(covari_croco, 'croco')

In [8]:
# Build the labels (biomass) and the feature matrix for the 'pro' dataframe.
# Note: preprocess_single_population drops the first row from both arrays.
pro_df, labels_pro, features_pro, feature_list_pro = preprocess_single_population(covari_pro)

In [9]:
from sklearn.model_selection import KFold
import numpy as np

# Define the number of splits for k-fold cross-validation
n_splits = 8
# shuffle=False: rows are not shuffled before splitting, so the folds follow
# the row order of features_pro.
kf = KFold(n_splits=n_splits, shuffle=False)

# Initialize lists to hold training and testing data
# (each list ends up with one entry per fold)
train_features = []
test_features = []
train_labels = []
test_labels = []

# Split the data into training and testing sets for each fold
# kf.split yields positional row indices, which are applied to both the
# feature matrix and the label array so the two stay aligned.
for train_index, test_index in kf.split(features_pro):
    train_feat, test_feat = features_pro[train_index], features_pro[test_index]
    train_lab, test_lab = labels_pro[train_index], labels_pro[test_index]
    
    # Append the training and testing data for this fold to the lists
    train_features.append(train_feat)
    test_features.append(test_feat)
    train_labels.append(train_lab)
    test_labels.append(test_lab)


In [10]:
# Fold identifiers 0..7; predict_global iterates over this list to load one
# saved model per fold.
fold = list(range(8))

In [11]:
def equalize(data):
    """
    Trim every sequence in ``data`` to the length of the shortest one.

    Elements beyond that length are dropped from the end of each sequence.
    Returns a new list holding the trimmed sequences; the inputs themselves
    are not modified.
    """
    shortest = min(map(len, data))
    return [chunk[:shortest] for chunk in data]

In [12]:
# Trim the folds inside each list to a common length (see equalize).
train_features, test_features, train_labels, test_labels = map(
    equalize,
    (train_features, test_features, train_labels, test_labels),
)

In [13]:
import numpy
# Number of training rows in each fold.
lengths = list(map(len, train_features))

# Convert each list of per-fold arrays into a single numpy array whose first
# axis is the fold.
train_features, train_labels, test_features, test_labels = (
    numpy.array(per_fold)
    for per_fold in (train_features, train_labels, test_features, test_labels)
)


In [14]:
# We use a random forest regressor because we are predicting continuous values
import numpy as np
from sklearn.ensemble import RandomForestRegressor
import joblib

# Initialize a list to hold the models for each fold
models = []



# Loop over the folds
# (train_features is an array whose first axis indexes the folds)
for i in range(train_features.shape[0]):
    
    # Same hyperparameters and seed for every fold; only the training rows differ.
    rf = RandomForestRegressor(n_estimators = 120, max_features='sqrt', max_depth = 12, random_state = 42)
    
   
    rf.fit(train_features[i], train_labels[i])
    
    
    models.append(rf)

# Save the models
# One file per fold, named by fold number; predict_global(..., new=False)
# loads the models from these same paths.
for i, model in enumerate(models):
    joblib.dump(model, f"RF_models/predictions/pro_random_forest_fold_{i}.joblib")


In [15]:
def k_fold(features_pro, labels_pro, splits):
    """
    Split features and labels into ``splits`` train/test folds without shuffling.

    ``features_pro`` and ``labels_pro`` are indexed with the same positional
    row indices, so they must have the same number of rows.

    Returns ``(train_features, test_features, train_labels, test_labels)``;
    each element is a list with one array per fold.
    """
    splitter = KFold(n_splits=splits, shuffle=False)

    train_features, test_features = [], []
    train_labels, test_labels = [], []

    for train_rows, test_rows in splitter.split(features_pro):
        # Apply the same row selection to features and labels.
        train_features.append(features_pro[train_rows])
        test_features.append(features_pro[test_rows])
        train_labels.append(labels_pro[train_rows])
        test_labels.append(labels_pro[test_rows])

    return train_features, test_features, train_labels, test_labels

In [16]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
import joblib
def model_training(train_features, train_labels, test_features, test_labels, hyperparameters):
    '''
    Train one random forest regressor per fold and save every fitted model.

    Parameters
    ----------
    train_features, train_labels : sequences with one entry per fold
        Per-fold training features and labels.  Folds are trimmed to the
        length of the shortest fold (see ``equalize``) before training.
    test_features, test_labels
        Accepted so existing callers keep working; they are not used here.
    hyperparameters : dict
        Must provide 'n_estimators', 'max_depth' and 'max_features'.  An
        optional 'random_state' seeds the forests; 33 is used when absent.

    Returns
    -------
    None.  The models are written to
    ``RF_models/predictions/new_features/pro_random_forest_fold_<i>.joblib``.
    '''
    # Folds can differ in length; trimming them to the shortest one lets them
    # be stacked into a single array with the fold as the first axis.
    train_features = np.array(equalize(train_features))
    train_labels = np.array(equalize(train_labels))

    n_estimators = hyperparameters['n_estimators']
    max_depth = hyperparameters['max_depth']
    max_features = hyperparameters['max_features']
    # Honour the caller's seed instead of silently ignoring it; the fallback
    # keeps the previously hard-coded value.
    random_state = hyperparameters.get('random_state', 33)

    # Fit every fold first so nothing is written if any fit fails.
    models = []
    for i in range(train_features.shape[0]):
        rf = RandomForestRegressor(
            n_estimators=n_estimators,
            max_features=max_features,
            max_depth=max_depth,
            random_state=random_state,
        )
        rf.fit(train_features[i], train_labels[i])
        models.append(rf)

    # Make sure the target directory exists before saving.
    os.makedirs("RF_models/predictions/new_features", exist_ok=True)
    for i, model in enumerate(models):
        joblib.dump(model, f"RF_models/predictions/new_features/pro_random_forest_fold_{i}.joblib")

In [17]:
def new_features(covari_pro):
    """
    Train and save the per-fold models on a reduced feature set.

    The 'O2', 'NO3', 'PO4' and 'Si' columns are removed from a copy of
    ``covari_pro`` (the input is left untouched), the remaining data is split
    into 8 folds with ``k_fold``, and the folds are handed to
    ``model_training`` together with the hyperparameters defined below.
    Returns ``None``.
    """
    hyperparameters = {
        'n_estimators': 120,
        'max_features': 'sqrt',
        'max_depth': 10,
        'random_state': 33,
    }

    reduced = covari_pro.drop(columns=['O2', 'NO3', 'PO4', 'Si'])
    _, labels, features, _ = preprocess_single_population(reduced)

    fold_data = k_fold(features, labels, 8)
    train_features, test_features, train_labels, test_labels = fold_data

    model_training(train_features, train_labels, test_features, test_labels, hyperparameters)
    

In [18]:
import joblib
from sklearn.ensemble import RandomForestRegressor

def predict_global(features, new):
    '''
    Predict on ``features`` with each saved per-fold model.

    One model is loaded for every entry of the module-level ``fold`` list.
    When ``new`` is truthy the models are read from
    ``RF_models/predictions/new_features/``, otherwise from
    ``RF_models/predictions/``.

    Returns a list with one prediction array per fold, in ``fold`` order.
    '''
    def model_path(fold_id):
        # Pick the model directory according to the ``new`` flag.
        if new:
            return f"RF_models/predictions/new_features/pro_random_forest_fold_{fold_id}.joblib"
        return f"RF_models/predictions/pro_random_forest_fold_{fold_id}.joblib"

    # Load each model in turn and predict on the same feature matrix.
    return [joblib.load(model_path(f)).predict(features) for f in fold]



In [19]:
def predictions_preparation(global_df, predictions):
    '''
    Attach the per-row mean and spread of the fold predictions to ``global_df``.

    Parameters
    ----------
    global_df : DataFrame
        Receives the result columns; it is modified in place and returned.
    predictions : DataFrame
        One column per fold and one row per predicted sample, with an integer
        index.  Any number of folds is accepted.  This frame is left
        unchanged.

    Returns
    -------
    ``global_df`` with three columns set: ``biomass`` (mean across folds),
    ``std`` (sample standard deviation across folds) and ``relative_std``
    (``std / biomass * 100``).

    A row of zeros is placed in front of the predictions and the index is
    shifted by one, so prediction row ``k`` is matched with index ``k + 1`` of
    ``global_df``.  Presumably this compensates for the preprocessing step
    discarding the first row of the features -- confirm against the caller.
    For that padded first row ``relative_std`` is NaN (0 / 0).
    '''
    # Work on a copy so the caller's predictions are not altered.
    fold_preds = predictions.copy()

    # Prepend the all-zero row: add it under index -1, shift every index up by
    # one, then sort so it becomes the first row.
    fold_preds.loc[-1] = [0] * fold_preds.shape[1]
    fold_preds.index = fold_preds.index + 1
    fold_preds = fold_preds.sort_index()

    # Row-wise statistics across the fold columns only.
    mean_prediction = fold_preds.mean(axis=1)
    prediction_std = fold_preds.std(axis=1)

    # Assignment aligns on the index of global_df.
    global_df['biomass'] = mean_prediction
    global_df['std'] = prediction_std
    global_df['relative_std'] = (prediction_std / mean_prediction * 100)
    return global_df

In [None]:
def predictions_monthly(month_df, predictions):
    '''
    Attach the per-row mean and spread of the fold predictions to ``month_df``.

    Parameters
    ----------
    month_df : DataFrame
        Receives the result columns.  It is modified in place and returned:
        its index is reset (the previous index is kept as a column, as done
        by ``reset_index``) so that rows are matched to predictions by
        position.
    predictions : DataFrame
        One column per fold and one row per predicted sample, with an integer
        index.  Any number of folds is accepted.  This frame is left
        unchanged.

    Returns
    -------
    ``month_df`` with three columns set: ``biomass`` (mean across folds),
    ``std`` (sample standard deviation across folds) and ``relative_std``
    (``std / biomass * 100``).

    A row of zeros is placed in front of the predictions, so prediction row
    ``k`` is matched with row ``k + 1`` of ``month_df``.  Presumably this
    compensates for the preprocessing step discarding the first row of the
    features -- confirm against the caller.  For that padded first row
    ``relative_std`` is NaN (0 / 0).
    '''
    # Work on a copy so the caller's predictions are not altered.
    fold_preds = predictions.copy()

    # Prepend the all-zero row: add it under index -1, shift every index up by
    # one, sort so it becomes the first row, then renumber from 0.
    fold_preds.loc[-1] = [0] * fold_preds.shape[1]
    fold_preds.index = fold_preds.index + 1
    fold_preds = fold_preds.sort_index().reset_index(drop=True)

    # Row-wise statistics across the fold columns only.
    mean_prediction = fold_preds.mean(axis=1)
    prediction_std = fold_preds.std(axis=1)

    # Give month_df a 0..n-1 index so the assignments below line up by position.
    month_df.reset_index(inplace=True)
    month_df['biomass'] = mean_prediction
    month_df['std'] = prediction_std
    month_df['relative_std'] = (prediction_std / mean_prediction * 100)
    return month_df

In [None]:
def predict_yearly(month, m):
    '''
    Predict biomass for the rows of a single month.

    ``month`` is modified in place: its 'month' column is dropped, features
    are built with ``preprocess_global``, predictions are made with the saved
    per-fold models (``predict_global`` with ``new=False``), the prediction
    columns are attached by ``predictions_monthly``, and finally the 'month'
    column is set to ``m``.

    Returns the dataframe produced by ``predictions_monthly``.
    '''
    month.drop(columns=['month'], inplace=True)

    # Only the feature matrix is needed from the preprocessing output.
    _, _, month_features, _ = preprocess_global(month)

    # predict_global returns one array per fold; transpose so that each fold
    # becomes a column and each row a sample.
    fold_predictions = pd.DataFrame(predict_global(month_features, False)).T

    result = predictions_monthly(month, fold_predictions)
    result['month'] = m
    return result

    

In [None]:
# Turn off pandas' chained-assignment (SettingWithCopy) warnings for this
# session; several functions in this notebook modify dataframes in place.
pd.options.mode.chained_assignment = None

In [None]:
def process_yearly(yearly):
    """
    Predict biomass for every calendar month of ``yearly`` and stack the results.

    ``yearly`` must contain a 'month' column with values 1-12.  The rows of
    each month are passed to ``predict_yearly`` and the twelve resulting
    dataframes are concatenated row-wise in month order.  ``yearly`` itself is
    not modified.

    Returns the concatenated dataframe.
    """
    monthly_predictions = []
    for month_number in range(1, 13):
        # Copy the selection so predict_yearly's in-place edits act on an
        # independent dataframe.
        month_df = yearly[yearly['month'] == month_number].copy()
        # Keep the returned frame explicitly rather than relying on
        # predict_yearly having mutated its argument.
        monthly_predictions.append(predict_yearly(month_df, month_number))

    return pd.concat(monthly_predictions, axis=0)