In [2]:
# Set a working directory
#!pip install GitPython
import git
import os

# Locate the git repository that contains the current directory;
# search_parent_directories=True lets GitPython look in parent directories too.
repo = git.Repo('.', search_parent_directories=True)


# Switch to the repository's working-tree root so that relative paths used later
# in the notebook do not depend on where the notebook was launched from.
os.chdir(repo.working_tree_dir)

In [3]:
import numpy as np

def preprocess_single_population(covari_population):
    """
    Split a single-population covariate dataframe into model inputs.

    Parameters
    ----------
    covari_population : pandas.DataFrame
        Dataframe for one population. It must contain the columns 'time',
        'biomass', 'lat', 'lon' and 'cruise'; every remaining column is
        treated as a feature.

    Returns
    -------
    pop_df : pandas.DataFrame
        The input dataframe itself (same object, all rows kept).
    labels : numpy.ndarray
        Copy of the 'biomass' column with the first row removed.
    features : numpy.ndarray
        Feature columns as an array, with the first row removed.
    feature_list : list
        Feature column names, in column order.

    Note that the first row is removed from ``labels`` and ``features`` only;
    ``pop_df`` still contains it.
    """
    non_feature_columns = ['time', 'biomass', 'lat', 'lon', 'cruise']
    pop_df = covari_population

    # Labels: an independent copy of the biomass values, minus the first row.
    biomass_values = np.array(pop_df.biomass, copy=True)
    labels = np.delete(biomass_values, 0, axis=0)

    # Features: everything that is neither metadata nor the label column.
    feature_frame = pop_df.drop(columns=non_feature_columns)
    # Keep the column names so array columns can be mapped back to features later.
    feature_list = feature_frame.columns.tolist()
    features = np.delete(feature_frame.to_numpy(), 0, axis=0)

    return pop_df, labels, features, feature_list


In [4]:
import pandas as pd

# Location of the prepared covariate table, relative to the current working directory
covari_path = 'data_ingest/data/modified/RF_ready_covari.csv'
# Read the CSV into a dataframe, parsing the first column as datetimes
covari = pd.read_csv(covari_path, parse_dates=[0])
# Take a peek at the first three rows
covari.head(3)

Unnamed: 0,time,cruise,lat,lon,biomass_pro,biomass_syn,biomass_pico,biomass_croco,sss,sst,ugos,vgos,Fe,O2,NO3,PO4,Si,ALK,hours_since_sunrise
0,2015-05-22 22:00:00,KM1508,21.3434,-158.2737,4.024661,0.337763,0.555395,0.009181,34.571716,25.653118,0.005764,-0.132531,8.8e-05,216.794167,4.269278e-07,0.345151,9.464704,1952.6418,6.129444
1,2015-05-22 23:00:00,KM1508,21.343533,-158.273744,4.167834,0.413687,0.720884,0.013144,34.571716,25.653118,0.005764,-0.132531,8.8e-05,216.794167,4.269278e-07,0.345151,9.464704,1952.6418,7.129444
2,2015-05-23 00:00:00,KM1508,21.346175,-158.27415,4.65436,0.654208,0.635654,0.008443,34.609317,25.646243,-0.002256,-0.132022,8.8e-05,216.794167,4.269278e-07,0.345151,9.464704,1952.6418,8.129722


In [5]:
# One dataframe per population: each keeps its own biomass column and drops the other three.
biomass_columns = ['biomass_pro', 'biomass_syn', 'biomass_pico', 'biomass_croco']

covari_pro = covari.drop(columns=[col for col in biomass_columns if col != 'biomass_pro'])
covari_syn = covari.drop(columns=[col for col in biomass_columns if col != 'biomass_syn'])
covari_pico = covari.drop(columns=[col for col in biomass_columns if col != 'biomass_pico'])
covari_croco = covari.drop(columns=[col for col in biomass_columns if col != 'biomass_croco'])

In [6]:
def population_dfer(covari_pop, pop_name):
    """
    Strip the '_<pop_name>' marker from the column names of a population dataframe.

    For example, with ``pop_name='pro'`` the column 'biomass_pro' becomes
    'biomass'. Columns that do not contain the marker are left unchanged.

    The dataframe is modified in place and nothing is returned.
    """
    renamed = {
        column: column.replace('_' + pop_name, '')
        for column in covari_pop.columns
    }
    covari_pop.rename(columns=renamed, inplace=True)

In [7]:
# Strip the population suffix from the columns of each per-population dataframe (in place).
for population_frame, population_name in (
    (covari_syn, 'syn'),
    (covari_pro, 'pro'),
    (covari_pico, 'pico'),
    (covari_croco, 'croco'),
):
    population_dfer(population_frame, population_name)

In [8]:
# Split the 'pro' population dataframe into the frame itself, the label array,
# the feature array and the list of feature names.
pro_df, labels_pro, features_pro, feature_list_pro = preprocess_single_population(covari_pro)

In [9]:
from sklearn.model_selection import KFold
import numpy as np

# Define the number of splits for k-fold cross-validation
n_splits = 8
# shuffle=False: rows are not shuffled before splitting, so the folds follow
# the row order of features_pro.
kf = KFold(n_splits=n_splits, shuffle=False)

# Initialize lists to hold training and testing data (one entry per fold)
train_features = []
test_features = []
train_labels = []
test_labels = []

# Split the data into training and testing sets for each fold
for train_index, test_index in kf.split(features_pro):
    # The same index arrays select rows from both arrays, which keeps
    # features and labels aligned within a fold.
    train_feat, test_feat = features_pro[train_index], features_pro[test_index]
    train_lab, test_lab = labels_pro[train_index], labels_pro[test_index]
    
    # Append the training and testing data for this fold to the lists
    train_features.append(train_feat)
    test_features.append(test_feat)
    train_labels.append(train_lab)
    test_labels.append(test_lab)


In [10]:
# Fold indices 0-7 (eight entries).
fold = list(range(8))

In [11]:
def equalize(data):
    """
    Truncate every sub-sequence in ``data`` to the length of the shortest one.

    Parameters
    ----------
    data : iterable of sliceable sequences
        For example a list of lists or a list of numpy arrays. One-shot
        iterables (generators, iterators) are accepted as well.

    Returns
    -------
    list
        A new list with one entry per input sub-sequence, each sliced to the
        minimum length found in ``data``. Elements past that length are
        discarded. An empty ``data`` yields an empty list.
    """
    # Materialise the input first: it is traversed twice below, and a one-shot
    # iterator would already be exhausted on the second pass.
    sublists = list(data)

    # Nothing to equalize; min() would raise ValueError on an empty sequence.
    if not sublists:
        return []

    # Find the length of the shortest sublist
    min_len = min(len(sublist) for sublist in sublists)

    # Cut every sublist down to that common length
    return [sublist[:min_len] for sublist in sublists]

In [12]:
# Trim every fold to the length of the shortest fold in its collection so that
# each collection can later be stacked into a regular array.
train_features, test_features, train_labels, test_labels = (
    equalize(fold_data)
    for fold_data in (train_features, test_features, train_labels, test_labels)
)

In [13]:
import numpy
# Sanity check: after equalize() every training fold should have the same
# length, so this set should contain a single value.
lengths = [len(sublist) for sublist in train_features]
print(set(lengths))
print(type(train_features))
print(type(train_labels))
print(features_pro)
# Stack the per-fold lists into arrays whose first axis is the fold.
train_features = numpy.array(train_features)
train_labels = numpy.array(train_labels)
test_features = numpy.array(test_features)
test_labels = numpy.array(test_labels)
# Report the resulting array shapes.
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

{4094}
<class 'list'>
<class 'list'>
[[ 3.45717163e+01  2.56531183e+01  5.76444444e-03 ...  9.46470396e+00
   1.95264180e+03  7.12944444e+00]
 [ 3.46093165e+01  2.56462433e+01 -2.25555556e-03 ...  9.46470396e+00
   1.95264180e+03  8.12972222e+00]
 [ 3.46269515e+01  2.52466600e+01 -1.54770833e-01 ...  9.54816445e+00
   1.95380275e+03  1.11466667e+01]
 ...
 [ 3.49623977e+01  2.48945779e+01 -4.86312500e-02 ...  5.54336847e-04
   1.95756288e+03  1.05333333e+01]
 [ 3.49623977e+01  2.48945779e+01 -4.86312500e-02 ...  6.59941725e-04
   1.95833642e+03  1.15255556e+01]
 [ 3.49402743e+01  2.50197858e+01 -5.21711111e-02 ...  6.59941725e-04
   1.95833642e+03  1.25227778e+01]]
Training Features Shape: (8, 4094, 11)
Training Labels Shape: (8, 4094)
Testing Features Shape: (8, 584, 11)
Testing Labels Shape: (8, 584)


In [14]:
# We use a random forest regressor because we are predicting continuous values
import numpy as np
from sklearn.ensemble import RandomForestRegressor
import joblib

# Initialize a list to hold the models for each fold
models = []

# Train one random forest per fold; the first axis of train_features /
# train_labels indexes the fold.
for i in range(train_features.shape[0]):
    rf = RandomForestRegressor(n_estimators=240, max_features='sqrt', max_depth=10, random_state=33)
    rf.fit(train_features[i], train_labels[i])
    models.append(rf)

# joblib.dump does not create missing directories, so make sure the output
# folder exists before saving (os is imported in the first cell).
os.makedirs("RF_models", exist_ok=True)

# Save the models, one file per fold
for i, model in enumerate(models):
    joblib.dump(model, f"RF_models/pro_random_forest_fold_{i}.joblib")


In [None]:
def k_fold(features_pro, labels_pro, splits):
    """
    Partition features and labels into ``splits`` unshuffled cross-validation folds.

    Relies on ``KFold`` (sklearn.model_selection) being imported elsewhere in
    the notebook.

    Parameters
    ----------
    features_pro : array
        Feature rows; must support indexing with the index arrays produced
        by ``KFold.split``.
    labels_pro : array
        Labels, indexed with the same index arrays as ``features_pro``.
    splits : int
        Number of folds.

    Returns
    -------
    train_features, test_features, train_labels, test_labels : list
        Four lists with one entry per fold, in fold order.
    """
    splitter = KFold(n_splits=splits, shuffle=False)
    # Materialise the (train indices, test indices) pairs once so the same
    # split is reused for features and labels.
    fold_indices = list(splitter.split(features_pro))

    train_features = [features_pro[train_idx] for train_idx, _ in fold_indices]
    test_features = [features_pro[test_idx] for _, test_idx in fold_indices]
    train_labels = [labels_pro[train_idx] for train_idx, _ in fold_indices]
    test_labels = [labels_pro[test_idx] for _, test_idx in fold_indices]
    return train_features, test_features, train_labels, test_labels

In [None]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
import joblib
def model_training(train_features, train_labels, test_features, test_labels, hyperparameters,
                   pop_name="pro", model_dir="RF_models"):
    '''
    Train one random forest regressor per fold and save each fitted model.

    Parameters
    ----------
    train_features, train_labels : sequence
        One entry per fold. Folds are truncated to the length of the shortest
        fold (via ``equalize``) so they can be stacked into a single array.
    test_features, test_labels : sequence
        Accepted so the call signature stays unchanged; not used for training.
    hyperparameters : dict
        Must contain 'n_estimators', 'max_depth' and 'max_features'.
        'random_state' is optional and defaults to 33.
    pop_name : str, optional
        Prefix used in the saved file names (default "pro").
    model_dir : str, optional
        Directory the models are written to (default "RF_models"); created
        if it does not exist.

    Returns
    -------
    list
        The fitted models, in fold order. Each one is also written to
        ``{model_dir}/{pop_name}_random_forest_fold_{i}.joblib``.

    Raises
    ------
    KeyError
        If a required hyperparameter is missing.
    '''
    # Folds may differ in length; trim them to a common length so they stack
    # into a regular array whose first axis is the fold.
    train_features = np.array(equalize(train_features))
    train_labels = np.array(equalize(train_labels))

    n_estimators = hyperparameters['n_estimators']
    max_depth = hyperparameters['max_depth']
    max_features = hyperparameters['max_features']
    # Honour a caller-supplied seed; fall back to the previously hard-coded value.
    random_state = hyperparameters.get('random_state', 33)

    models = []
    # Loop over the folds
    for i in range(train_features.shape[0]):
        rf = RandomForestRegressor(
            n_estimators=n_estimators,
            max_features=max_features,
            max_depth=max_depth,
            random_state=random_state,
        )
        rf.fit(train_features[i], train_labels[i])
        models.append(rf)

    # joblib.dump does not create missing directories (os is imported in the
    # notebook's first cell).
    os.makedirs(model_dir, exist_ok=True)

    # Save the models
    for i, model in enumerate(models):
        joblib.dump(model, f"{model_dir}/{pop_name}_random_forest_fold_{i}.joblib")

    return models

In [None]:
def new_features(covari_pro):
    """
    Retrain the per-fold random forests on a reduced feature set.

    Drops the 'ugos', 'vgos', 'O2', 'NO3', 'PO4' and 'Si' columns from
    ``covari_pro`` (the caller's dataframe is not modified), extracts labels
    and features with ``preprocess_single_population``, splits them into 8
    folds with ``k_fold`` and hands the folds to ``model_training`` together
    with a fixed set of hyperparameters.

    Returns nothing.
    """
    dropped_columns = ['ugos', 'vgos', 'O2', 'NO3', 'PO4', 'Si']
    n_folds = 8

    reduced = covari_pro.drop(columns=dropped_columns)
    # Only the label and feature arrays are needed here.
    _, labels, features, _ = preprocess_single_population(reduced)

    fold_data = k_fold(features, labels, n_folds)
    train_features, test_features, train_labels, test_labels = fold_data

    hyperparameters = dict(
        n_estimators=240,
        max_features='sqrt',
        max_depth=10,
        random_state=33,
    )
    model_training(train_features, train_labels, test_features, test_labels, hyperparameters)
    