In [None]:
# Set the working directory to the root of the enclosing git repository, so
# the relative paths used in later cells are resolved from that root.
# Requires the GitPython package; uncomment the next line to install it.
#!pip install GitPython
import git
import os

# Look for the repository starting at the current directory and walking up
# through its parents.
repo = git.Repo('.', search_parent_directories=True)


# Switch the process working directory to the repository's working tree.
os.chdir(repo.working_tree_dir)

In [None]:
import numpy as np

def preprocess_single_population(covari_population):
    """
    Split a single-population covariate dataframe into model inputs.

    Parameters
    ----------
    covari_population : pandas.DataFrame
        Table for one population. It must contain the columns ``time``,
        ``biomass``, ``lat``, ``lon`` and ``cruise``; every remaining column
        is treated as a feature.

    Returns
    -------
    pop_df : pandas.DataFrame
        The very dataframe that was passed in (same object, not modified).
    labels : numpy.ndarray
        Copy of the ``biomass`` column with its first entry removed.
    features : numpy.ndarray
        The feature columns as an array, with the first row removed so the
        rows stay aligned with ``labels``.
    feature_list : list
        Names of the feature columns, in column order.

    Notes
    -----
    The first row is discarded from both ``labels`` and ``features``; the
    reason is not recorded in this notebook -- TODO confirm before changing.
    """
    pop_df = covari_population
    non_feature_columns = ['time', 'biomass', 'lat', 'lon', 'cruise']

    # Target values: an independent copy of the biomass column, minus row 0.
    labels = np.delete(np.array(pop_df.biomass, copy=True), 0, axis=0)

    # Feature table: everything that is neither metadata nor the target.
    feature_frame = pop_df.drop(columns=non_feature_columns)
    # Remember the column names before the frame is turned into a bare array.
    feature_list = feature_frame.columns.tolist()
    features = np.delete(feature_frame.to_numpy(), 0, axis=0)

    return pop_df, labels, features, feature_list


In [None]:
import pandas as pd

# Location of the model-ready covariate table, relative to the working directory
covari_path = 'data_ingest/data/modified/RF_ready_covari.csv'
# Read the CSV into a DataFrame, parsing the first column (index 0) as dates
covari = pd.read_csv(covari_path, parse_dates=[0])
# Take a peek at the first three rows
covari.head(3)

In [None]:
# Each population has its own biomass column. Build one table per population
# that keeps only that population's biomass column and drops the other three.
biomass_columns = ['biomass_pico', 'biomass_croco', 'biomass_syn', 'biomass_pro']
covari_pro, covari_syn, covari_pico, covari_croco = [
    covari.drop(columns=[col for col in biomass_columns if col != f'biomass_{pop}'])
    for pop in ('pro', 'syn', 'pico', 'croco')
]

In [None]:
def population_dfer(covari_pop, pop_name):
    """
    Strip the population tag from the column names of ``covari_pop``.

    Every occurrence of ``'_' + pop_name`` is removed from each column label
    (anywhere in the label, not only at the end). The dataframe is renamed
    in place and nothing is returned.
    """
    tag = '_' + pop_name

    def strip_tag(column):
        # str.replace removes all occurrences of the tag from the label.
        return column.replace(tag, '')

    covari_pop.rename(columns=strip_tag, inplace=True)

In [None]:
# Remove the population tag from the column names of every per-population
# table; population_dfer renames the columns in place.
for table, tag in (
    (covari_syn, 'syn'),
    (covari_pro, 'pro'),
    (covari_pico, 'pico'),
    (covari_croco, 'croco'),
):
    population_dfer(table, tag)

In [None]:
# Build the model inputs for the 'pro' population: the dataframe itself, the
# biomass labels, the feature matrix and the list of feature names.
pro_df, labels_pro, features_pro, feature_list_pro = preprocess_single_population(covari_pro)

In [None]:
from sklearn.model_selection import KFold
import numpy as np

# Number of folds for k-fold cross-validation; shuffle=False keeps the rows
# of each fold in their original order.
n_splits = 8
kf = KFold(n_splits=n_splits, shuffle=False)

# Per-fold containers: entry k holds the data belonging to fold k
train_features, test_features = [], []
train_labels, test_labels = [], []

# Slice features and labels with the index arrays that KFold yields per fold
for train_index, test_index in kf.split(features_pro):
    train_features.append(features_pro[train_index])
    test_features.append(features_pro[test_index])
    train_labels.append(labels_pro[train_index])
    test_labels.append(labels_pro[test_index])


In [None]:
# Fold indices 0 through 7 (presumably one per cross-validation split -- keep
# in sync with n_splits; TODO confirm how later cells use this).
fold = list(range(8))

In [None]:
def equalize(data):
    """
    Truncate every sub-sequence in ``data`` to the length of the shortest one.

    Parameters
    ----------
    data : sequence of sliceable sequences
        For example a list of lists or a list of numpy arrays.

    Returns
    -------
    list
        A new list in which each element is ``element[:min_len]``, where
        ``min_len`` is the length of the shortest element. Items beyond that
        length are dropped from the end of each element. An empty ``data``
        yields an empty list.
    """
    # default=0 stops min() from raising ValueError when data has no elements.
    min_len = min((len(sublist) for sublist in data), default=0)

    # Slicing builds new objects, so the inputs themselves are left untouched.
    return [sublist[:min_len] for sublist in data]

In [None]:
# Trim the folds inside each collection to a common length, so that each
# collection can be stacked into a regular array afterwards.
train_features, test_features, train_labels, test_labels = [
    equalize(folds)
    for folds in (train_features, test_features, train_labels, test_labels)
]

In [None]:
import numpy
# Sanity check: gather the number of rows in each training fold. After the
# folds were equalized the printed set should contain a single value.
lengths = [len(sublist) for sublist in train_features]
print(set(lengths))
# Printed before the conversion below, so on a fresh run these still show the
# per-fold Python lists.
print(type(train_features))
print(type(train_labels))
print(features_pro)
# Turn each per-fold list into one array; the first axis then indexes the fold.
train_features = numpy.array(train_features)
train_labels = numpy.array(train_labels)
test_features = numpy.array(test_features)
test_labels = numpy.array(test_labels)
# Report the resulting array shapes.
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

In [None]:
# We are using a regressor RF model because we are predicting continuous values
import os

import numpy as np
from sklearn.ensemble import RandomForestRegressor
import joblib

# Create the output directory up front. Without it joblib.dump raises
# FileNotFoundError, and that would only happen after every forest has
# already been trained.
os.makedirs("RF_models", exist_ok=True)

# Initialize a list to hold the fitted model of each fold
models = []

# Fit one forest per fold; the first axis of train_features indexes the fold
for i in range(train_features.shape[0]):
    # A fixed random_state keeps the fitted forests reproducible between runs
    rf = RandomForestRegressor(n_estimators = 200, max_features='sqrt', max_depth = 10, random_state = 33)
    rf.fit(train_features[i], train_labels[i])
    models.append(rf)

# Save the models, one file per fold
for i, model in enumerate(models):
    joblib.dump(model, f"RF_models/pro_random_forest_fold_{i}.joblib")
