In [2]:
######################################################################
# Code produced by Ayden McCarthy
# Manuscript Title: "Machine Learning Models Predict Assessment Outcomes 
#                    From Military Physical Employment Standards Via a 
#                    Physical Test Battery"
# Program of Study: PhD Candidacy
# Institution: Macquarie University
# Year: 2024
######################################################################

######################################################################
# Note for Users:
# This code is intended for use within Python JupyterLab.
# It requires data to be set up according to the instructions 
# outlined in the manuscript. Users can follow the code comments to 
# understand each step of the analysis.
# Please ensure that you replace the placeholder CSV file names in 
# the code with the names of your specific data files to run the code 
# successfully.
######################################################################


In [None]:
# `HTML` comes from IPython.display and is not imported in any other cell of
# this notebook, so import it here to keep the cell runnable on a fresh kernel.
from IPython.display import HTML

# Large, centred, bold section heading rendered in black text
html_text = """
<div style='font-size:70px; font-weight:bold; text-align:center; color: black;'>
    MLP Model
</div>
"""

# The last expression of the cell is rendered as rich HTML output
HTML(html_text)

In [None]:
##### This can take a fair amount of time and compute power. Please adjust the number of iterations (n_iter) in the randomised grid search to suit your hardware.

In [None]:
import warnings
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV, LeaveOneOut
from sklearn.metrics import mean_squared_error
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.feature_selection import RFECV
import os
import sys

# Silence warning categories that would otherwise flood the cell output.
import warnings
# ConvergenceWarning was referenced without ever being imported, which raised
# a NameError and stopped the cell before any model was trained.
from sklearn.exceptions import ConvergenceWarning

warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Point OS-level file descriptor 2 (stderr) at the null device so anything
# written directly to that descriptor is discarded (presumably to hide output
# from parallel worker processes - confirm).
# `old_stderr` holds a duplicate of the original descriptor; it can be put back
# with `os.dup2(old_stderr, 2)` once the noisy work has finished.
devnull = open(os.devnull, 'w')
old_stderr = os.dup(2)
sys.stderr.flush()
os.dup2(devnull.fileno(), 2)

class MLPRegressorWrapper(BaseEstimator, RegressorMixin):
    """Thin wrapper around ``MLPRegressor`` that exposes a ``coef_`` attribute.

    RFE/RFECV rank features through ``coef_`` or ``feature_importances_``,
    neither of which ``MLPRegressor`` provides. This wrapper derives a
    per-feature score from the first-layer weights so the network can be used
    for recursive feature elimination.

    Parameters
    ----------
    **params
        Keyword arguments forwarded unchanged to ``MLPRegressor``.
    """

    def __init__(self, **params):
        # Keep the raw keyword arguments: BaseEstimator discovers parameters by
        # inspecting the __init__ signature and ignores **kwargs, so without
        # the explicit get_params/set_params below, sklearn.base.clone() would
        # rebuild this wrapper with *default* MLPRegressor settings.
        self.params = dict(params)
        self.model = MLPRegressor(**self.params)

    def get_params(self, deep=True):
        """Return the MLPRegressor keyword arguments this wrapper was built with.

        ``deep`` is accepted for scikit-learn API compatibility and is unused.
        The original value objects are returned (only the dict is copied) so
        the identity check performed by ``sklearn.base.clone`` succeeds.
        """
        return dict(self.params)

    def set_params(self, **params):
        """Update hyperparameters and rebuild the underlying MLPRegressor.

        Returns
        -------
        self
        """
        if params:
            self.params.update(params)
            self.model = MLPRegressor(**self.params)
        return self

    def fit(self, X, y):
        """Fit the wrapped network on ``X``/``y`` and return ``self``."""
        self.model.fit(X, y)
        return self

    def predict(self, X):
        """Return the wrapped network's predictions for ``X``."""
        return self.model.predict(X)

    @property
    def coef_(self):
        """Heuristic feature importances: one value per input feature.

        Computed as the sum of absolute first-layer weights
        (``coefs_[0]``) over axis 1. Only available after ``fit``.
        """
        # Heuristic: sum of absolute weights from the input layer
        importances = np.abs(self.model.coefs_[0]).sum(axis=1)
        return importances

# Load dataset
df = pd.read_csv('Training_Set_Reduced_with_Important_Features.csv') ### Please change this file to your own training dataset

# Separate features (predictors) and target variable
X = df.drop(columns=['Weight Lifted (Kg)']) ######## Your outcome variable is to be placed where 'Weight Lifted (Kg)' is. This was the lift-to-place results.
y = df['Weight Lifted (Kg)'] ######## Your outcome variable is to be placed where 'Weight Lifted (Kg)' is. This was the lift-to-place results.

# Standardise features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Seed shared by every stochastic step so a re-run reproduces the same
# weights, the same selected features and the same RMSE.
RANDOM_STATE = 42

# Focused Randomized Grid Search Parameter Grid
param_grid_focused = {
    'hidden_layer_sizes': [(100, 100)], # Specific structure
    'activation': ['relu'], # Specific activation function
    'solver': ['adam'], # Specific solver
    'alpha': [0.005, 0.01, 0.015], # Exploring values around 0.01
    'learning_rate': ['adaptive'], # Specific learning rate
    'max_iter': [400, 800, 1200, 10000] # Varied iterations
}

# Initialise Leave-One-Out cross-validator
loo = LeaveOneOut()

# Everything below runs while stderr (fd 2) is redirected to the null device
# by the set-up at the top of this cell. The try/finally guarantees the
# descriptor is restored even if a fit fails or the kernel is interrupted.
try:
    # Randomised Search with Focused Grid.
    # 'max_iter' in the grid overrides the max_iter given to the base model.
    model = MLPRegressor(max_iter=40000, random_state=RANDOM_STATE)
    random_search_focused = RandomizedSearchCV(model, param_grid_focused, n_iter=1000, cv=loo, scoring='neg_mean_squared_error', n_jobs=-1, verbose=1, random_state=RANDOM_STATE)
    random_search_focused.fit(X_scaled, y)

    # Best hyperparameters from focused search
    best_params_focused = random_search_focused.best_params_
    print(f"Best Focused Parameters: {best_params_focused}")

    # Initialize the wrapped model with the best parameters
    wrapped_nn = MLPRegressorWrapper(**best_params_focused, random_state=RANDOM_STATE)

    # Initialize RFECV with Leave-One-Out cross-validation
    selector = RFECV(estimator=wrapped_nn, step=1, cv=loo, scoring='neg_mean_squared_error')

    # Fit RFECV
    selector.fit(X_scaled, y)

    # Print the optimal number of features
    print(f"Optimal number of features: {selector.n_features_}")

    # Selected features based on RFECV
    selected_features = X.columns[selector.support_]

    # Train final model with selected features
    X_selected = selector.transform(X_scaled)
    final_model = MLPRegressorWrapper(**best_params_focused, random_state=RANDOM_STATE)
    final_model.fit(X_selected, y)

    # Print RMSE for the final model.
    # Note: this is computed on the same rows the model was fitted on, so it
    # is a training-set (in-sample) RMSE, not a held-out estimate.
    y_pred_final = final_model.predict(X_selected)
    final_rmse = np.sqrt(mean_squared_error(y, y_pred_final))
    print(f"RMSE for the final model: {final_rmse}")

    # Print final selected features
    print(f"Final selected features: {selected_features}")
finally:
    # Undo the stderr redirection and release the handles it opened; without
    # this, fd 2 stays pointed at the null device for the rest of the kernel
    # session and the duplicated descriptor / devnull file are leaked.
    sys.stderr.flush()
    os.dup2(old_stderr, 2)
    os.close(old_stderr)
    devnull.close()

In [None]:
# Relies on `df` and `selected_features` produced by the RFECV cell above.

# Column list: every RFECV-selected predictor followed by the outcome column.
######## Your outcome variable is to be placed where 'Weight Lifted (Kg)' is. This was the lift-to-place results.
columns_to_keep = [*selected_features, 'Weight Lifted (Kg)']
df_selected = df[columns_to_keep]

# Persist the reduced training set (without the index) for the tuning cell.
df_selected.to_csv(
    'NN_RFE_Selected_Features_With_Weight_Lifted_RFE.csv',
    index=False,
)

print("CSV file saved with selected features and 'Weight Lifted (Kg)")

      

In [None]:
# `HTML` comes from IPython.display and is not imported in any other cell of
# this notebook, so import it here to keep the cell runnable on its own.
from IPython.display import HTML

# Large, centred, bold section heading rendered in black text
html_text = """
<div style='font-size:70px; font-weight:bold; text-align:center; color: black;'>
    MLP Model Hyper-Paramter Optimisation
</div>
"""

# The last expression of the cell is rendered as rich HTML output
HTML(html_text)

In [None]:
from sklearn.model_selection import GridSearchCV, LeaveOneOut
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import os
import sys

# Suppress warnings raised in this process
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# Load dataset (written by the feature-selection step)
df = pd.read_csv('NN_RFE_Selected_Features_With_Weight_Lifted_RFE.csv')

# Separate features (predictors) and target variable
X = df.drop(columns=['Weight Lifted (Kg)'])
y = df['Weight Lifted (Kg)']

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Define parameter grid for Neural Network
param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50, 100)],
    'activation': ['identity', 'logistic', 'tanh', 'relu'],
    'solver': ['lbfgs', 'sgd', 'adam'],
    'alpha': [0.0001, 0.001, 0.01, 0.1],
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
}

# Initialize the model. A fixed random_state makes weight initialisation (and
# therefore the selected parameters and reported RMSE) reproducible.
model = MLPRegressor(max_iter=1000, random_state=42)

# Set up leave-one-out cross-validation
loo = LeaveOneOut()

# Use GridSearchCV with LOOCV
grid_search = GridSearchCV(
    model, param_grid, cv=loo, scoring='neg_mean_squared_error', n_jobs=-1, verbose=1
)

# Redirect OS-level stderr (fd 2) to the null device only for the duration of
# the search, and always restore it afterwards. Previously the descriptor was
# never restored and the devnull handle / duplicated descriptor were leaked.
devnull = open(os.devnull, 'w')
old_stderr = os.dup(2)
sys.stderr.flush()
os.dup2(devnull.fileno(), 2)
try:
    grid_search.fit(X_scaled, y)
finally:
    sys.stderr.flush()
    os.dup2(old_stderr, 2)
    os.close(old_stderr)
    devnull.close()

# Extract the best model and parameters.
# best_score_ is the mean negative MSE over the LOO folds, hence the sign flip.
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
best_rmse = np.sqrt(-grid_search.best_score_)

# Output the best parameters and RMSE
print("Best Parameters:", best_params)
print("Best RMSE:", best_rmse)

# Plot residuals for the best model.
# These are residuals on the training rows the refitted best model has seen.
y_pred = best_model.predict(X_scaled)
residuals = y - y_pred
plt.hist(residuals, bins=30, edgecolor='black')
plt.xlabel('Residuals')
plt.ylabel('Frequency')
plt.title('Histogram of Residuals - Best Model')
plt.show()

print("Optimization Complete.")


In [None]:
# `HTML` comes from IPython.display and is not imported in any other cell of
# this notebook, so import it here to keep the cell runnable on its own.
from IPython.display import HTML

# Large, centred, bold section heading rendered in black text
html_text = """
<div style='font-size:70px; font-weight:bold; text-align:center; color: black;'>
    MLP Model vs Test Data Set
</div>
"""

# The last expression of the cell is rendered as rich HTML output
HTML(html_text)

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import numpy as np

# Requires `X`, `scaler` and `best_model` from the hyper-parameter optimisation
# cell (training feature frame, fitted StandardScaler, refitted best estimator).

# Load the testing dataset
df_testing = pd.read_csv('Testing_Set.csv') ### Please change to your own datset file

# Apply the same preprocessing steps as were applied to the training set.
# 'Sex' is encoded only when the column exists and still holds labels:
# mapping an absent column raises KeyError, and mapping an already-numeric
# column would replace every value with NaN.
# NOTE(review): assumes the training data used M -> 0, F -> 1 - confirm.
if 'Sex' in df_testing.columns and not pd.api.types.is_numeric_dtype(df_testing['Sex']):
    df_testing['Sex'] = df_testing['Sex'].map({'M': 0, 'F': 1})

# Prepare the testing feature set, ensuring it matches the training feature set
# (same columns, same order as the frame the scaler was fitted on)
X_testing = df_testing[X.columns]

# Scale the testing set features using the same scaler from the training phase
X_testing_scaled = scaler.transform(X_testing)

# True target values in the testing dataset for evaluation
true_y_testing = df_testing['Weight Lifted (Kg)'] ######## Your outcome variable is to be placed where 'Weight Lifted (Kg)' is. This was the lift-to-place results.

# Use the best model to make predictions on the testing dataset
y_pred_testing = best_model.predict(X_testing_scaled)

# Calculate RMSE for testing data
rmse_testing = np.sqrt(mean_squared_error(true_y_testing, y_pred_testing))

print("Best Model Parameters:", best_model.get_params())
print(f"RMSE on Testing Data: {rmse_testing:.2f}\n")

# Plotting the predictions vs. true values; the dashed diagonal marks
# perfect agreement between prediction and truth.
plt.figure(figsize=(10, 6))
plt.scatter(true_y_testing, y_pred_testing, alpha=0.7, label='Predicted', color='blue', edgecolors='w')
plt.plot([true_y_testing.min(), true_y_testing.max()], [true_y_testing.min(), true_y_testing.max()], 'k--', lw=2)
plt.xlabel('True Weight Lifted (Kg)')
plt.ylabel('Predicted Weight Lifted (Kg)')
plt.title(f'Best Model on Testing Set\nRMSE: {rmse_testing:.2f}')
plt.legend()
plt.show()
