In [1]:
import pandas as pd
import numpy as np

In [10]:
vald_data = pd.read_csv('data/vald_data_for_modelling.csv')

In [11]:
vald_data

Unnamed: 0,sbuid,testDateUtc,leftAvgForce,leftImpulse,leftMaxForce,leftTorque,rightAvgForce,rightImpulse,rightMaxForce,rightTorque,ForceSymmetry,ImpulseSymmetry,MaxForceSymmetry,TorqueSymmetry,ForceSymmetryRisk,ImpulseSymmetryRisk,MaxForceSymmetryRisk,TorqueSymmetryRisk,RiskCategory
0,14568521.0,2022-01-28,364.812500,6375.605,450.75,184.582125,347.937500,6104.060,451.50,184.889250,1.048500,1.044486,0.998339,0.998339,Low Risk,Low Risk,Low Risk,Low Risk,Low Risk
1,14568521.0,2022-03-24,460.750000,8661.625,473.00,193.693500,511.833333,8758.945,529.25,216.727875,0.900195,0.988889,0.893718,0.893718,Low Risk,Low Risk,Low Risk,Low Risk,Low Risk
2,14568521.0,2022-05-27,465.750000,8750.730,481.25,197.071875,491.583333,8295.600,521.00,213.349500,0.947449,1.054864,0.923704,0.923704,Low Risk,Low Risk,Low Risk,Low Risk,Low Risk
3,14568521.0,2022-06-17,448.000000,9147.955,462.50,189.393750,418.916667,8099.180,443.25,181.510875,1.069425,1.129492,1.043429,1.043429,Low Risk,Low Risk,Low Risk,Low Risk,Low Risk
4,14568521.0,2022-07-01,496.583333,9311.730,508.25,208.128375,478.000000,7821.835,504.00,206.388000,1.038877,1.190479,1.008433,1.008433,Low Risk,Low Risk,Low Risk,Low Risk,Low Risk
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
967,116196314.0,2024-03-19,426.000000,4867.330,426.00,193.617000,515.500000,5686.100,515.50,234.294750,0.826382,0.856005,0.826382,0.826382,Low Risk,Low Risk,Low Risk,Low Risk,Low Risk
968,116196314.0,2024-06-06,414.750000,4018.515,414.75,188.503875,515.750000,4267.670,515.75,234.408375,0.804169,0.941618,0.804169,0.804169,Medium Risk,Low Risk,Medium Risk,Medium Risk,Medium Risk
969,116196314.0,2024-07-03,396.250000,4591.660,396.25,183.661875,512.750000,5257.880,512.75,237.659625,0.772794,0.873291,0.772794,0.772794,Medium Risk,Low Risk,Medium Risk,Medium Risk,Medium Risk
970,116196314.0,2024-07-12,387.250000,5937.870,387.25,179.490375,509.500000,6422.515,509.50,236.153250,0.760059,0.924540,0.760059,0.760059,Medium Risk,Low Risk,Medium Risk,Medium Risk,Medium Risk


In [12]:
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Prepare results storage
models_results = {}

# Step 1: Prepare the data (X and y from vald_data)
risk_mapping = {'Low Risk': 0, 'Medium Risk': 1, 'High Risk': 2}
X = vald_data[['ForceSymmetry', 'MaxForceSymmetry', 'TorqueSymmetry']]
y = vald_data['RiskCategory'].map(risk_mapping)

# MODEL 1: SMOTE Oversampling
smote = SMOTE(random_state=42)
X_resampled_1, y_resampled_1 = smote.fit_resample(X, y)
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X_resampled_1, y_resampled_1, test_size=0.2, stratify=y_resampled_1, random_state=42)
rf_model_1 = RandomForestClassifier(random_state=42)
rf_model_1.fit(X_train_1, y_train_1)
y_pred_1 = rf_model_1.predict(X_test_1)
models_results['Model 1 (SMOTE)'] = {
    'accuracy': accuracy_score(y_test_1, y_pred_1),
    'classification_report': classification_report(y_test_1, y_pred_1, target_names=risk_mapping.keys()),
    'confusion_matrix': confusion_matrix(y_test_1, y_pred_1),
}

# MODEL 2: No Balancing
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
rf_model_2 = RandomForestClassifier(random_state=42)
rf_model_2.fit(X_train_2, y_train_2)
y_pred_2 = rf_model_2.predict(X_test_2)
models_results['Model 2 (No Balancing)'] = {
    'accuracy': accuracy_score(y_test_2, y_pred_2),
    'classification_report': classification_report(y_test_2, y_pred_2, target_names=risk_mapping.keys()),
    'confusion_matrix': confusion_matrix(y_test_2, y_pred_2),
}

# MODEL 3: SMOTEENN
smoteenn = SMOTEENN(random_state=42)
X_resampled_3, y_resampled_3 = smoteenn.fit_resample(X, y)
X_train_3, X_test_3, y_train_3, y_test_3 = train_test_split(X_resampled_3, y_resampled_3, test_size=0.2, stratify=y_resampled_3, random_state=42)
rf_model_3 = RandomForestClassifier(random_state=42)
rf_model_3.fit(X_train_3, y_train_3)
y_pred_3 = rf_model_3.predict(X_test_3)
models_results['Model 3 (SMOTEENN)'] = {
    'accuracy': accuracy_score(y_test_3, y_pred_3),
    'classification_report': classification_report(y_test_3, y_pred_3, target_names=risk_mapping.keys()),
    'confusion_matrix': confusion_matrix(y_test_3, y_pred_3),
}

# MODEL 4: Class Weights
rf_model_4 = RandomForestClassifier(random_state=42, class_weight={0: 1, 1: 2, 2: 3})
rf_model_4.fit(X_train_2, y_train_2)  # Use unbalanced data
y_pred_4 = rf_model_4.predict(X_test_2)
models_results['Model 4 (Class Weights)'] = {
    'accuracy': accuracy_score(y_test_2, y_pred_4),
    'classification_report': classification_report(y_test_2, y_pred_4, target_names=risk_mapping.keys()),
    'confusion_matrix': confusion_matrix(y_test_2, y_pred_4),
}


In [15]:
# Print model comparisons
for model_name, results in models_results.items():
    print(f"\n{model_name}")
    print(f"Accuracy: {results['accuracy']}")
    print("Classification Report:")
    print(results['classification_report'])
    print("Confusion Matrix:")
    print(results['confusion_matrix'])



Model 1 (SMOTE)
Accuracy: 0.9838709677419355
Classification Report:
              precision    recall  f1-score   support

    Low Risk       1.00      0.97      0.99       144
 Medium Risk       0.95      1.00      0.98       145
   High Risk       1.00      0.98      0.99       145

    accuracy                           0.98       434
   macro avg       0.98      0.98      0.98       434
weighted avg       0.98      0.98      0.98       434

Confusion Matrix:
[[140   4   0]
 [  0 145   0]
 [  0   3 142]]

Model 2 (No Balancing)
Accuracy: 0.9897435897435898
Classification Report:
              precision    recall  f1-score   support

    Low Risk       1.00      0.99      1.00       145
 Medium Risk       0.95      1.00      0.97        37
   High Risk       1.00      0.92      0.96        13

    accuracy                           0.99       195
   macro avg       0.98      0.97      0.98       195
weighted avg       0.99      0.99      0.99       195

Confusion Matrix:
[[144   1  

In [21]:
pip install mlflow

Note: you may need to restart the kernel to use updated packages.


In [2]:
import mlflow

In [7]:
!mlflow --version


mlflow, version 2.20.1


In [None]:
!mlflow ui

In [8]:
import mlflow
import mlflow.sklearn
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd

# Load dataset
vald_data = pd.read_csv("data/vald_data_for_modelling.csv")

# Define features and target
risk_mapping = {'Low Risk': 0, 'Medium Risk': 1, 'High Risk': 2}
X = vald_data[['ForceSymmetry', 'MaxForceSymmetry', 'TorqueSymmetry']]
y = vald_data['RiskCategory'].map(risk_mapping)

# Set MLflow experiment
mlflow.set_experiment("Athlete_Injury_Risk")

# Function to train and log model with MLflow
def train_and_log_model(model_name, model, X_train, X_test, y_train, y_test):
    with mlflow.start_run(run_name=model_name):
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)

        # Log parameters, metrics, and model
        mlflow.log_param("model_name", model_name)
        mlflow.log_param("n_estimators", 100)
        mlflow.log_param("random_state", 42)
        mlflow.log_metric("accuracy", accuracy)
        
        # Log classification report as an artifact
        report = classification_report(y_test, y_pred, target_names=risk_mapping.keys(), output_dict=True)
        mlflow.log_metric("precision_low", report["Low Risk"]["precision"])
        mlflow.log_metric("recall_low", report["Low Risk"]["recall"])
        mlflow.log_metric("f1_low", report["Low Risk"]["f1-score"])
        
        mlflow.log_metric("precision_medium", report["Medium Risk"]["precision"])
        mlflow.log_metric("recall_medium", report["Medium Risk"]["recall"])
        mlflow.log_metric("f1_medium", report["Medium Risk"]["f1-score"])
        
        mlflow.log_metric("precision_high", report["High Risk"]["precision"])
        mlflow.log_metric("recall_high", report["High Risk"]["recall"])
        mlflow.log_metric("f1_high", report["High Risk"]["f1-score"])

        mlflow.sklearn.log_model(model, model_name)
        
        print(f"{model_name} logged with Accuracy: {accuracy:.4f}")

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Model 1: SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(X_resampled, y_resampled, test_size=0.2, stratify=y_resampled, random_state=42)
rf_model_1 = RandomForestClassifier(n_estimators=100, random_state=42)
train_and_log_model("RF_SMOTE", rf_model_1, X_train_s, X_test_s, y_train_s, y_test_s)

# Model 2: No Balancing
rf_model_2 = RandomForestClassifier(n_estimators=100, random_state=42)
train_and_log_model("RF_No_Balancing", rf_model_2, X_train, X_test, y_train, y_test)

# Model 3: SMOTEENN
smoteenn = SMOTEENN(random_state=42)
X_resampled_se, y_resampled_se = smoteenn.fit_resample(X, y)
X_train_se, X_test_se, y_train_se, y_test_se = train_test_split(X_resampled_se, y_resampled_se, test_size=0.2, stratify=y_resampled_se, random_state=42)
rf_model_3 = RandomForestClassifier(n_estimators=100, random_state=42)
train_and_log_model("RF_SMOTEENN", rf_model_3, X_train_se, X_test_se, y_train_se, y_test_se)

# Model 4: Class Weights
rf_model_4 = RandomForestClassifier(n_estimators=100, random_state=42, class_weight={0: 1, 1: 2, 2: 3})
train_and_log_model("RF_Class_Weights", rf_model_4, X_train, X_test, y_train, y_test)


2025/01/30 09:13:02 INFO mlflow.tracking.fluent: Experiment with name 'Athlete_Injury_Risk' does not exist. Creating a new experiment.


RF_SMOTE logged with Accuracy: 0.9839




RF_No_Balancing logged with Accuracy: 0.9897




RF_SMOTEENN logged with Accuracy: 1.0000




RF_Class_Weights logged with Accuracy: 0.9795


In [12]:
# # Define the model URI from MLflow
# model_uri = 'runs://RF_SMOTEENN'

In [30]:
import os

from dotenv import load_dotenv
load_dotenv()

True

In [32]:
model_uri=os.getenv("MODEL_URI")

In [33]:
# Load the trained model
best_model = mlflow.pyfunc.load_model(model_uri)

In [14]:
unseen_data=pd.read_csv("data/unseen_data.csv")

In [16]:
X_unseen = unseen_data[['ForceSymmetry', 'MaxForceSymmetry', 'TorqueSymmetry']]

In [34]:
# Make predictions
predictions = best_model.predict(X_unseen)

In [35]:
# Mapping predictions back to Risk Categories
risk_mapping_inverse = {0: 'Low Risk', 1: 'Medium Risk', 2: 'High Risk'}

unseen_data['Predicted Risk']=pd.Series(predictions).map(risk_mapping_inverse)

In [36]:
unseen_data

Unnamed: 0,sbuid,testDateUtc,leftAvgForce,leftImpulse,leftMaxForce,leftTorque,rightAvgForce,rightImpulse,rightMaxForce,rightTorque,ForceSymmetry,ImpulseSymmetry,MaxForceSymmetry,TorqueSymmetry,Predicted Risk
0,14568521.0,2025-01-08,483.375,5730.71,494.75,211.505625,541.25,5814.26,560.5,239.61375,0.893072,0.98563,0.882694,0.882694,Low Risk
1,112900560.0,2025-01-22,588.625,2786.57,607.0,275.8815,481.875,2289.36,494.5,224.75025,1.22153,1.217183,1.227503,1.227503,Medium Risk
2,113328523.0,2025-01-08,442.625,3319.4,451.5,209.27025,379.5,2921.005,387.5,179.60625,1.166337,1.13639,1.165161,1.165161,Medium Risk
3,113328523.0,2025-01-22,592.875,5623.18,614.25,284.704875,504.5,5573.355,519.5,240.78825,1.175173,1.00894,1.182387,1.182387,Medium Risk
4,114215402.0,2025-01-22,540.375,4927.61,563.5,261.18225,599.5,5116.78,602.25,279.142875,0.901376,0.963029,0.935658,0.935658,Low Risk
5,114737056.0,2025-01-22,468.75,2890.84,485.75,203.286375,484.125,3314.205,492.25,206.006625,0.968242,0.872257,0.986795,0.986795,Low Risk
6,114755618.0,2025-01-22,494.125,4411.01,510.5,236.61675,458.0,3922.86,468.25,217.033875,1.078876,1.124437,1.09023,1.09023,Low Risk
7,115197404.0,2025-01-22,594.75,6660.04,606.0,280.881,608.5,6313.285,620.25,287.485875,0.977403,1.054925,0.977025,0.977025,Low Risk
8,115624874.0,2025-01-22,503.5,6560.345,503.5,224.30925,555.0,7262.795,555.0,247.2525,0.907207,0.903281,0.907207,0.907207,Low Risk
9,115667134.0,2025-01-22,499.5,4261.785,504.75,211.237875,456.25,3628.1,462.5,193.55625,1.094795,1.17466,1.091351,1.091351,Low Risk
