# Here we make a Random Forest Model For Prochlorococcus


In [None]:
# Make the repository root the working directory so that the relative paths used
# in the rest of the notebook resolve no matter where the notebook was launched.
# Requires GitPython:
#!pip install GitPython
import os

import git

# Walk up from the current directory until a git repository is found
repo = git.Repo('.', search_parent_directories=True)
repo_root = repo.working_tree_dir
os.chdir(repo_root)

###  Running the model preparation notebook

In [None]:
# Run the model preparation notebook in this kernel; it defines helper functions
# that the cells below call (and presumably the features_pro / labels_pro /
# feature_list_pro data as well, since they are not defined in this notebook).
# The path is relative, so it depends on the working directory set in the first cell.
# NOTE(review): the commented-out fallback below spells the folder `python/` (lowercase)
# while the active path uses `Python/` -- confirm the case on case-sensitive filesystems.
%run 'Python/04_Populations-model-fitting/01_model-preparation.ipynb'
#%run '/Users/cristianswift/Desktop/armbrust-lab/Seaflow-Machine-Learning/python/04_Populations-model-fitting/01_model-preparation.ipynb'


In [None]:
#loading packages for random forest modeling

from sklearn.model_selection import train_test_split
import joblib

### First we are making a graph to determine the best testing to training ratio 

This is using a function that we defined in notebook 01_model-preparation

In [None]:
# Explore testing-to-training ratios for the Prochlorococcus data with the helper
# defined in the model preparation notebook, keeping whatever it returns.
RMSEs = testing_training_ratio(
    features=features_pro,
    labels=labels_pro,
    feature_list=feature_list_pro,
    title_prefix="Prochlorococcus",
)


In [None]:
# Same inputs as the previous cell, passed to the `_random` variant of the helper
# from the model preparation notebook.
# NOTE(review): how the two variants differ is not visible from here.
R_RMSEs = testing_training_ratio_random(
    features=features_pro,
    labels=labels_pro,
    feature_list=feature_list_pro,
    title_prefix="Prochlorococcus",
)

## RF Regressor for Prochlorococcus

Here we first call a function defined in the model preparation notebook to find the optimal number of decision trees for our model, and then we assemble a random forest regressor model for the Prochlorococcus population.  This is then saved in a joblib file for future use.

In [None]:
from sklearn.model_selection import KFold
import numpy as np

# 8-fold cross-validation without shuffling, so every fold is a contiguous
# slice of the data in its original order.
n_splits = 8
kf = KFold(n_splits=n_splits, shuffle=False)

# Per-fold containers: entry i holds the arrays belonging to fold i
train_features, test_features = [], []
train_labels, test_labels = [], []

# kf.split yields one (train indices, test indices) pair per fold; use them to
# slice the features and the labels and store each slice under its fold.
for train_index, test_index in kf.split(features_pro):
    train_features.append(features_pro[train_index])
    test_features.append(features_pro[test_index])
    train_labels.append(labels_pro[train_index])
    test_labels.append(labels_pro[test_index])


In [None]:
#plot_oob_error_vs_num_trees(train_features, train_labels, title_prefix="Prochlorococcus")

In [None]:
# Looking at the shape of the features and labels to see if they match up,
# then converting the per-fold lists into arrays indexed by fold number.
import numpy


def _stack_folds(folds):
    """Return the folds as a single array whose first axis is the fold number.

    Folds that all share one shape are stacked into a regular array. KFold yields
    folds of different sizes when the sample count is not divisible by the number
    of splits; NumPy (>= 1.24) refuses to build a regular array from such ragged
    input, so in that case a 1-D object array with one entry per fold is returned.
    Either way `result[i]` is fold i and `result.shape[0]` is the number of folds.
    """
    folds = [numpy.asarray(fold) for fold in folds]
    if len({fold.shape for fold in folds}) <= 1:
        return numpy.array(folds)
    ragged = numpy.empty(len(folds), dtype=object)
    for position, fold in enumerate(folds):
        ragged[position] = fold
    return ragged


# More than one value here means the folds are not all the same size
lengths = [len(sublist) for sublist in train_features]
print(set(lengths))
print(type(train_features))
print(type(train_labels))
# Show the shape rather than dumping the whole feature matrix into the output
print('All Features Shape:', numpy.shape(features_pro))

train_features = _stack_folds(train_features)
train_labels = _stack_folds(train_labels)
test_features = _stack_folds(test_features)
test_labels = _stack_folds(test_labels)
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

In [None]:
# We are using a regressor RF model because we are predicting continuous values
import os

import numpy as np
from sklearn.ensemble import RandomForestRegressor

# One fitted model per cross-validation fold, in fold order
models = []

# len() works whether the folds are held in a list or in an array
for i in range(len(train_features)):
    # 100 trees, sqrt(n_features) candidate features per split, fixed seed for reproducibility
    rf = RandomForestRegressor(n_estimators = 100, max_features='sqrt', random_state = 42)

    # Fit on this fold's training split only
    rf.fit(train_features[i], train_labels[i])

    models.append(rf)

# joblib.dump does not create missing directories, so make sure the folder exists
os.makedirs("RF_models", exist_ok=True)

# Save every fold's model; later cells load them back by fold number.
# Note that `rf` is left bound to the LAST fold's model after this cell.
for i, model in enumerate(models):
    joblib.dump(model, f"RF_models/pro_random_forest_fold_{i}.joblib")


In [None]:
# from sklearn.model_selection import GridSearchCV
# from sklearn.ensemble import RandomForestRegressor

# # Define the hyperparameters grid
# param_grid = {
#     'n_estimators': [4, 6, 8, 10, 50, 100, 200],
#     'max_depth': [None, 2, 4, 6, 8, 10, 20, 30]
# }

# # Initialize the model
# rf = RandomForestRegressor()

# # Initialize the grid search
# grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='neg_root_mean_squared_error')

# # Fit the grid search to the data
# grid_search.fit(features_pro, labels_pro)

# # Get the best hyperparameters
# best_params = grid_search.best_params_

# print(f"Best hyperparameters: {best_params}")

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import joblib
import pandas as pd

# Per-fold predictions and error metrics
predictions = []
maes = []
rmses = []

# Evaluate every fold's saved model on that fold's held-out data
for i in range(len(test_features)):
    # Load the model for this fold
    rf = joblib.load(f"RF_models/pro_random_forest_fold_{i}.joblib")

    # Use the model to predict on the test data for this fold
    preds = rf.predict(test_features[i])

    # Calculate the errors.
    # The `squared=False` argument of mean_squared_error was deprecated in
    # scikit-learn 1.4 and removed in 1.6; taking the square root of the MSE
    # gives the same RMSE for a single target and works on every version.
    mae = mean_absolute_error(test_labels[i], preds)
    RMSE = np.sqrt(mean_squared_error(test_labels[i], preds))

    # Append the predictions and errors to the lists
    predictions.append(preds)
    maes.append(mae)
    rmses.append(RMSE)

    # Save predicted vs. real values for this fold; the plotting cell reads these files
    data = {'predictions': preds,
        'reals' : test_labels[i]}

    # Both columns must have the same number of elements to build the DataFrame
    for key, value in data.items():
        print(f"Number of elements in {key}: {np.size(value)}")

    actual = pd.DataFrame(data)
    actual.to_csv(f'actual_pro{i}.csv', index=False)

# Join the per-fold predictions end to end into one 1-D array (all folds, in fold
# order) and turn the metric lists into arrays.
predictions = np.concatenate(predictions)
maes = np.array(maes)
rmses = np.array(rmses)
# Root of the mean of the squared per-fold RMSEs; this equals the RMSE pooled over
# all test samples when the folds have equal size. It is not the arithmetic mean
# of the per-fold RMSEs, despite the label printed below.
rmse = np.sqrt(np.mean(rmses**2))

# Print the mean absolute errors and root mean square errors
print('Mean Absolute Errors:', maes)
print('Root Mean Squared Errors:', rmses)
print('Mean RMSE:', rmse)

In [None]:
# Import necessary libraries
import matplotlib.pyplot as plt 
import seaborn as sns
import pandas as pd

# One entry per cross-validation fold (kept in sync with the KFold configuration)
fold = list(range(n_splits))

# One predicted-vs-actual scatter plot per fold
for f in fold:
    # Read the predictions/real values written by the evaluation cell
    actual = pd.read_csv(f'actual_pro{f}.csv')

    fig, ax = plt.subplots()

    # 'reals' on the x-axis and 'predictions' on the y-axis
    sns.scatterplot(x='reals', y='predictions', data=actual, ax=ax)

    # 1:1 reference line drawn in DATA coordinates. Drawing it in axes coordinates
    # would give the diagonal of the plot box, which only coincides with y = x
    # when the x and y limits happen to be identical.
    lo = min(actual['reals'].min(), actual['predictions'].min())
    hi = max(actual['reals'].max(), actual['predictions'].max())
    ax.plot([lo, hi], [lo, hi], ls='--', c='black')

    ax.set_title(f'Prochlorococcus Fold {f+1}')

    fig.savefig(f'pro_fold{f+1}.png')

    plt.show()
    # Release the figure so the loop does not accumulate open figures
    plt.close(fig)

In [None]:
# Choosing one model to use for the rest of the notebook.
# `ftu` is the index of the fold to use. The model is loaded here explicitly,
# because otherwise `rf` would still be whichever fold's model the last loop left
# behind (the final fold), not the fold selected by `ftu`.
ftu = 4
rf = joblib.load(f"RF_models/pro_random_forest_fold_{ftu}.joblib")

## Predicting and Testing for Prochlorococcus

Now that we have a model, it's time to test it.  The following functions compare the predictions from our random forest model to actual data, and then use this comparison to give us feature importance.  

In [None]:
# for f in fold:
#     # Convert test_features to a DataFrame
#     test_features_df = pd.DataFrame(test_features[f], columns=feature_list_pro)

#     # Use the forest's predict method on the test data
#     predictions = rf.predict(test_features[f])

#     # Create a new Series with predicted values and index from test_features_df
#     predic_biomass = pd.Series(predictions, index=test_features_df.index)

#     # Assign the new Series to the DataFrame using .loc
#     test_features_df.loc[:, 'Prediction'] = predic_biomass

#     # Calculate the absolute errors
#     errors = abs(predictions - test_labels)

#     # Print out the mean absolute error (mae)
#     from sklearn.metrics import mean_absolute_error
#     mae = mean_absolute_error(test_labels[f], predictions)
#     print('Mean Absolute Error:', round(mae, 2), 'pgC per L.')

#     # Finding the root mean square error (RMSE)
#     from sklearn.metrics import mean_squared_error
#     # RMSE gives relatively high weight to large errors 
#     RMSE = mean_squared_error(test_labels[f], predictions, squared=False) #setting squared=False gives us RMSE not MSE

#     # Calculate the percentage of RMSE
#     range_target = test_labels.max() - test_labels.min()
#     percentage_RMSE = (RMSE / range_target) * 100

#     print('Root Mean Squared Error:', round(RMSE, 2), 'pgC per L.')
#     print('Percentage of RMSE:', round(percentage_RMSE, 2), '%')


In [None]:
# Impurity-based importances of the current `rf` model, one value per feature
importances = list(rf.feature_importances_)
# Pair each feature name with its importance rounded to two decimals and rank
# the pairs from most to least important (ties keep their original order).
feature_importances = sorted(
    ((name, round(score, 2)) for name, score in zip(feature_list_pro, importances)),
    key=lambda pair: pair[1],
    reverse=True,
)

In [None]:
feature_importance = pd.DataFrame(rf.feature_importances_, index=feature_list_pro).sort_values(by=0, ascending=False)

# Import matplotlib for plotting and use magic command for Jupyter Notebooks
import matplotlib.pyplot as plt
%matplotlib inline
# Set the style
plt.style.use('fivethirtyeight')
# Make a bar chart
plt.bar(x=feature_importance.index,height=feature_importance[0], orientation = 'vertical')
# Tick labels for x axis
plt.xticks(feature_importance.index, rotation=45, ha='right', rotation_mode='anchor')

# Axis labels and title
plt.ylabel('Importance'); plt.title('Variable Importances for Pro RF');

### Permutation importance as a method of assessing feature importance

Permutation importance measures how much the model's score drops when the values of a single feature are randomly shuffled, which breaks that feature's relationship with the target. A large drop means the model relies heavily on that feature.  This is an additional way to validate our feature importance.

In [None]:
from sklearn.inspection import permutation_importance

# Permutation importance needs ONE fitted model and ONE 2-D test set. The stacked
# `test_features` / `test_labels` hold every fold, so select fold `ftu` and load the
# model that was trained for that same fold (any other fold's model has seen this
# fold's test samples during training).
selected_model = joblib.load(f"RF_models/pro_random_forest_fold_{ftu}.joblib")

result = permutation_importance(
    selected_model, test_features[ftu], test_labels[ftu], n_repeats=10, random_state=42, n_jobs=2
)

# Mean importance per feature, indexed by feature name and sorted most important first
forest_importances = pd.DataFrame(result.importances_mean, index=feature_list_pro).sort_values(by=0, ascending=False)
# NOTE(review): index=False drops the feature names, so the file only contains the
# sorted values -- confirm whether downstream readers expect the names as well.
forest_importances.to_csv('data/modified/pro_permutation_importance.csv', index=False)


In [None]:
# `forest_importances` was sorted by importance, but `result.importances_std` is
# still in the original feature order. Re-order the standard deviations by feature
# name so that each error bar is drawn on its own feature's bar.
importances_std = (
    pd.Series(result.importances_std, index=feature_list_pro)
    .reindex(forest_importances.index)
    .to_numpy()
)

fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=importances_std, ax=ax)
ax.set_title("Feature importances using permutation on full model")
# For a regressor scored with its default scorer this is the mean drop in R^2
ax.set_ylabel("Mean accuracy decrease")
fig.tight_layout()
plt.show()

### Comparing predicted biomass vs actual

In [None]:
# Draw the predicted-vs-actual comparison with the helper from the model
# preparation notebook. It is called without arguments, so it presumably reads
# the model and data from notebook globals -- TODO confirm which ones.
fig, axs = plot_model_predictions()

# Title the second axes (index 1) of the returned collection
axs[1].set_title('Prochlorococcus')

In [None]:
# Density version of the predicted-vs-actual plot, drawn by the helper from the
# model preparation notebook (assumed to return a figure and a single axes object).
fig, ax = plot_model_predictions_density()

# Show the same 0-15 window on both axes
ax.set(xlim=(0, 15), ylim=(0, 15))

# Title for the single subplot
ax.set_title('Prochlorococcus', fontsize=20)

# Legend in the lower-right corner
ax.legend(loc="lower right")

# Larger tick labels on the current axes
plt.tick_params(axis='both', which='major', labelsize=16)

# Display the finished figure
plt.show()


In [None]:
from sklearn.metrics import r2_score

# `predictions` holds the predictions of ALL folds joined end to end, so it cannot
# be compared with a single fold's labels. Score fold `ftu` with the predictions of
# the model that was trained for that fold.
ftu_model = joblib.load(f"RF_models/pro_random_forest_fold_{ftu}.joblib")
r2_score(test_labels[ftu], ftu_model.predict(test_features[ftu]))