In [1]:
######################################################################
# Code produced by Ayden McCarthy
# Manuscript Title: "Machine Learning Models Predict Assessment Outcomes 
#                    From Military Physical Employment Standards Via a 
#                    Physical Test Battery"
# Program of Study: PhD Candidacy
# Institution: Macquarie University
# Year: 2024
######################################################################

######################################################################
# Note for Users:
# This code is intended for use within Python JupyterLab.
# It requires data to be set up according to the instructions 
# outlined in the manuscript. Users can follow the code comments to 
# understand each step of the analysis.
# Please ensure that you replace the placeholder CSV file names in 
# the code with the names of your specific data files to run the code 
# successfully.
######################################################################


In [2]:
# Import necessary libraries and modules for all machine learning models produced in the manuscript
import time
import warnings
from itertools import combinations

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import shap
import xgboost as xgb
from IPython.display import display, HTML
from matplotlib.ticker import MaxNLocator
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.exceptions import ConvergenceWarning
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LassoCV, RidgeCV, ElasticNetCV, ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_predict, cross_val_score, LeaveOneOut, KFold, RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor


KeyboardInterrupt


KeyboardInterrupt



In [None]:
# Section banner: a large, bold, centred, black heading rendered as HTML.
# The markup is kept in `html_text`, which later banner cells reassign.
html_text = """
<div style='font-size:70px; font-weight:bold; text-align:center; color: black;'>
    Random Forest Model
</div>
"""

# Leaving the HTML object as the cell's last expression makes Jupyter render it
# in the output area.
HTML(html_text)

In [None]:
###### Note: The cell below can take considerable time and compute resources to run.

In [None]:
# ---------------------------------------------------------------------------
# Random Forest: hyperparameter search -> recursive feature elimination ->
# SHAP-based feature ranking. Every cross-validation step in this cell uses
# Leave-One-Out splitting.
# ---------------------------------------------------------------------------

# Read the training data from disk
df = pd.read_csv('Training_Set_Reduced_with_Important_Features.csv') ###Please change to your own dataset.

# Outcome variable, then every remaining column as a predictor
y = df['Weight Lifted (Kg)'] ######## Your outcome variable is to be placed where 'Weight Lifted (Kg)' is. This was the lift-to-place results.
X = df.drop(columns=['Weight Lifted (Kg)']) ######## Drop the same outcome column from the predictors.

# Candidate hyperparameter values explored exhaustively by the grid search
param_grid_rf = {
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [None, 10, 20, 30, 40],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 4, 6]
}

# Exhaustive search over the grid, scored by negative MSE on every available core
grid_search_rf = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid_rf,
    scoring='neg_mean_squared_error',
    cv=LeaveOneOut(),
    n_jobs=-1,
)
grid_search_rf.fit(X, y)
best_params_rf = grid_search_rf.best_params_

# Unfitted forest configured with the winning hyperparameters
best_rf = RandomForestRegressor(random_state=42, **best_params_rf)

# Recursive feature elimination (one feature removed per step) around that forest
rfecv = RFECV(
    estimator=best_rf,
    step=1,
    scoring='neg_mean_squared_error',
    cv=LeaveOneOut(),
)
rfecv.fit(X, y)

# Keep only the columns that RFECV flagged as supported
selected_features = X.columns[rfecv.support_]
n_features = rfecv.n_features_
X_selected = X[selected_features]

print(f"Number of features selected by Random Forest: {n_features}")

# Refit the tuned forest on the reduced feature set
best_rf.fit(X_selected, y)

# The best mean CV score is a negative MSE; negate it and take the root for RMSE
mse_scores = rfecv.cv_results_['mean_test_score']
rmse_optimal = np.sqrt(-np.max(mse_scores))
print(f"RMSE using optimal {n_features} features in Random Forest: {rmse_optimal}")

# SHAP explainer built on the model's predict function, with up to 100 rows
# of the selected features as background data
explainer = shap.Explainer(
    best_rf.predict,
    shap.sample(X_selected, 100),
)

# SHAP values for every training row, followed by the summary plot
shap_values = explainer(X_selected)
shap.summary_plot(shap_values, X_selected, feature_names=selected_features)

# Mean absolute SHAP value per feature = global importance score
shap_sum = np.mean(np.abs(shap_values.values), axis=0)

# Importance table, highest mean |SHAP| first
feature_importance = pd.DataFrame(
    list(zip(selected_features, shap_sum)),
    columns=['Feature', 'SHAP Value'],
).sort_values(by='SHAP Value', ascending=False)

print("Feature Importance based on SHAP values:")
print(feature_importance)

# Bundle everything the following cells need, keyed by model name
LOO_RF_model_results = {
    'Random Forest': {
        'Best Parameters': best_rf.get_params(),
        'RMSE Optimal': rmse_optimal,
        'Optimal Features': selected_features,
        'RFECV Support': rfecv.support_,
        'Model': best_rf,
        'Feature Importance': feature_importance
    }
}

print("Analysis Complete.")

In [None]:
# Write two CSV files per model held in LOO_RF_model_results:
#   1. the training data restricted to the selected features plus the outcome
#   2. the model's SHAP feature-importance table
for model_name, model_info in LOO_RF_model_results.items():
    # Guard: skip entries that carry no feature selection
    if 'Optimal Features' not in model_info:
        print(f"No 'Optimal Features' found for {model_name}.")
        continue
    selected_features = model_info['Optimal Features']

    # Start from the selected features and append the outcome column only
    # when it is not already among them (avoids a duplicated column)
    selected_features_with_target = list(selected_features)
    if 'Weight Lifted (Kg)' not in selected_features: ######## Your outcome variable is to be placed where 'Weight Lifted (Kg)' is. This was the lift-to-place results.
        selected_features_with_target.append('Weight Lifted (Kg)') ######## Your outcome variable is to be placed where 'Weight Lifted (Kg)' is. This was the lift-to-place results.

    # Guard: every requested column must exist in the training DataFrame
    missing_features = [col for col in selected_features_with_target if col not in df.columns]
    if missing_features:
        print(f"The following features are missing in the DataFrame for {model_name}: {missing_features}")
        continue
    df_filtered = df[selected_features_with_target]

    # File-system friendly model name: spaces and slashes become underscores
    safe_model_name = model_name.translate(str.maketrans(' /', '__'))

    # 1) Reduced training data
    filename_filtered = f'{safe_model_name}_optimal_features_data_LOO.csv'
    df_filtered.to_csv(filename_filtered, index=False)

    # 2) Feature-importance table (the only part of the results written here)
    filename_results = f'{safe_model_name}_model_results_LOO.csv'
    model_info['Feature Importance'].to_csv(filename_results, index=False)

    print(f"Saved data for {model_name} with optimal features to {filename_filtered}")
    print(f"Saved model results for {model_name} to {filename_results}")

print("Data saving process complete.")


In [None]:
# Section banner: a large, bold, centred, black heading rendered as HTML.
# (Heading text corrected: "Paramters" -> "Parameters".)
html_text = """
<div style='font-size:70px; font-weight:bold; text-align:center; color: black;'>
    Optimised Models Parameters
</div>
"""

# Leaving the HTML object as the cell's last expression makes Jupyter render it
# in the output area.
HTML(html_text)

In [None]:
#### Note: This script takes significant time and compute resources. Adjust the parameters as needed.

In [None]:
# ---------------------------------------------------------------------------
# Re-tune the Random Forest on the reduced feature set, then report its
# Leave-One-Out (out-of-sample) error and residual distribution.
# ---------------------------------------------------------------------------

# Load the reduced training data written by the feature-selection step
df_rf = pd.read_csv('Random_Forest_optimal_features_data_LOO.csv')

# Separate features and target variable
X = df_rf.drop(['Weight Lifted (Kg)'], axis=1) ######## Your outcome variable is to be placed where 'Weight Lifted (Kg)' is. This was the lift-to-place results.
y = df_rf['Weight Lifted (Kg)'] ######## Your outcome variable is to be placed where 'Weight Lifted (Kg)' is. This was the lift-to-place results.

# Define a more extensive parameter grid for Random Forest
param_grid_rf = {
    'n_estimators': [100, 200, 300, 400, 500],  # Number of trees
    'max_depth': [None, 10, 20, 30, 40, 50],  # Maximum depth of each tree
    'min_samples_split': [2, 4, 6, 8, 10],  # Minimum samples required to split an internal node
    'min_samples_leaf': [1, 2, 3, 4, 5],  # Minimum samples required at a leaf node
    # 'auto' was removed in scikit-learn 1.3 and now raises an error. For a
    # regressor it meant "use all features", which 1.0 expresses on every version.
    'max_features': [1.0, 'sqrt', 'log2'],  # Features considered when looking for the best split
    'bootstrap': [True, False]  # Whether each tree is trained on a bootstrap sample
}

# Initialise models
rf = RandomForestRegressor(random_state=42)

# Initialise Leave-One-Out cross-validator
loo = LeaveOneOut()

# Perform Grid Search CV for hyperparameter tuning
grid_search_rf = GridSearchCV(rf, param_grid_rf, cv=loo, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)
print("*** Tuning hyperparameters for Random Forest... ***")
grid_search_rf.fit(X, y)

# Best hyperparameters
best_params_rf = grid_search_rf.best_params_
print("Best parameters found:", best_params_rf)

# Train the final Random Forest on ALL training rows; this fitted model is the
# one used later to predict the unseen testing set
rf_best = RandomForestRegressor(**best_params_rf, random_state=42)
rf_best.fit(X, y)

# Out-of-fold predictions: each row is predicted by a model that never saw it.
# cross_val_predict fits clones of rf_best, so the full-data fit above is kept.
# (Predicting X with rf_best itself would give the much more optimistic
# training error, not the Leave-One-Out error.)
y_pred_loo_rf = cross_val_predict(rf_best, X, y, cv=loo, n_jobs=-1)

# Calculate the Leave-One-Out RMSE
rmse_loo_rf = np.sqrt(mean_squared_error(y, y_pred_loo_rf))
print("Root Mean Squared Error (LOO):", rmse_loo_rf)

# Calculate out-of-sample residuals
residuals_loo_rf = y - y_pred_loo_rf

# Plot a histogram of the residuals
plt.hist(residuals_loo_rf, bins=30, edgecolor='black')
plt.xlabel('Residuals')
plt.ylabel('Frequency')
plt.title('Histogram of Residuals Random Forest LOO')
plt.show()


In [None]:
# Section banner: a large, bold, centred, black heading rendered as HTML.
# Reassigns `html_text`, which the earlier banner cells also use.
html_text = """
<div style='font-size:70px; font-weight:bold; text-align:center; color: black;'>
    Testing Phase On Unseen Data
</div>
"""

# Leaving the HTML object as the cell's last expression makes Jupyter render it
# in the output area.
HTML(html_text)

In [None]:
# ---------------------------------------------------------------------------
# Evaluate the tuned Random Forest (`rf_best`, fitted in the tuning cell) on
# the held-out testing set, then save the predictions and the
# true-vs-predicted plot.
# ---------------------------------------------------------------------------

# Load the testing dataset
df_testing = pd.read_csv('Testing_Set.csv') ###Please change to your own dataset.

# Encoding 'Sex' column (if applicable)
#df_testing['Sex'] = df_testing['Sex'].map({'M': 0, 'F': 1})

# Fail early with a readable message when the testing file lacks any feature
# the model was trained on (`X` comes from the tuning cell)
missing_columns = [col for col in X.columns if col not in df_testing.columns]
if missing_columns:
    raise KeyError(f"Testing set is missing feature columns required by the model: {missing_columns}")

# Select the same columns, in the same order, as in the training dataset
X_testing = df_testing[X.columns]  # Use the features selected from LOO RF model

# Use the trained Random Forest model to make predictions on the testing dataset
y_pred_testing_rf = rf_best.predict(X_testing)

# Add the predicted values to the testing dataset
df_testing['Predicted_Weight_Lifted_RF'] = y_pred_testing_rf

# Calculate RMSE for testing data (requires the true outcome column in the testing file)
true_y_testing = df_testing['Weight Lifted (Kg)'] ######## Your outcome variable is to be placed where 'Weight Lifted (Kg)' is. This was the lift-to-place results.
rmse_testing_rf = np.sqrt(mean_squared_error(true_y_testing, y_pred_testing_rf))
print("Root Mean Squared Error on Testing Data (Random Forest LOO):", rmse_testing_rf)

# Plot the true vs. predicted values; keep the figure handle so the file can
# be written from it explicitly
fig = plt.figure(figsize=(10, 8))

# Scatter plot for the predicted values
plt.scatter(true_y_testing, y_pred_testing_rf, alpha=0.7, label='Predicted RF', color='blue', edgecolors='w')

# Scatter plot for the true values (these lie on the diagonal by construction)
plt.scatter(true_y_testing, true_y_testing, alpha=0.7, label='True', color='red', edgecolors='w')

# Diagonal line indicating perfect predictions
plt.plot([true_y_testing.min(), true_y_testing.max()], [true_y_testing.min(), true_y_testing.max()], 'k--', lw=2, label='Perfect Prediction')

# Styling the plot
plt.style.use('ggplot')
plt.xlabel('True Weight Lifted (Kg)', fontsize=14)
plt.ylabel('Predicted Weight Lifted (Kg)', fontsize=14)
plt.title('Random Forest LOO True vs. Predicted Weight Lifted (Testing Data)', fontsize=16, fontweight='bold')
plt.grid(True, linestyle='--', alpha=0.7)
plt.legend(fontsize=12)

# Save the predictions and the plot to files.
# The figure must be saved BEFORE plt.show(): once shown, the inline backend
# closes it, and a later plt.savefig() would write an empty image.
df_testing.to_csv('Testing_Set_Predictions_RF_LOO.csv', index=False)
fig.savefig('True_vs_Predicted_Plot_RF_LOO.png', format='png')

# Display the plot
plt.show()
