In [2]:
import mlflow
import pandas as pd
import numpy as np


In [3]:
vald_data = pd.read_csv('data/vald_data_for_modelling.csv')

In [4]:
import mlflow
import mlflow.sklearn
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd

# Load dataset
vald_data = pd.read_csv("data/vald_data_for_modelling.csv")

# Define features and target
risk_mapping = {'Low Risk': 0, 'Medium Risk': 1, 'High Risk': 2}
X = vald_data[['ForceSymmetry', 'MaxForceSymmetry', 'TorqueSymmetry']]
y = vald_data['RiskCategory'].map(risk_mapping)

# Set MLflow experiment
mlflow.set_experiment("Athlete_Injury_Risk")

# Function to train and log model with MLflow
def train_and_log_model(model_name, model, X_train, X_test, y_train, y_test):
    with mlflow.start_run(run_name=model_name):
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)

        # Log parameters, metrics, and model
        mlflow.log_param("model_name", model_name)
        mlflow.log_param("n_estimators", 100)
        mlflow.log_param("random_state", 42)
        mlflow.log_metric("accuracy", accuracy)
        
        # Log classification report as an artifact
        report = classification_report(y_test, y_pred, target_names=risk_mapping.keys(), output_dict=True)
        mlflow.log_metric("precision_low", report["Low Risk"]["precision"])
        mlflow.log_metric("recall_low", report["Low Risk"]["recall"])
        mlflow.log_metric("f1_low", report["Low Risk"]["f1-score"])
        
        mlflow.log_metric("precision_medium", report["Medium Risk"]["precision"])
        mlflow.log_metric("recall_medium", report["Medium Risk"]["recall"])
        mlflow.log_metric("f1_medium", report["Medium Risk"]["f1-score"])
        
        mlflow.log_metric("precision_high", report["High Risk"]["precision"])
        mlflow.log_metric("recall_high", report["High Risk"]["recall"])
        mlflow.log_metric("f1_high", report["High Risk"]["f1-score"])

        mlflow.sklearn.log_model(model, model_name)
        
        print(f"{model_name} logged with Accuracy: {accuracy:.4f}")

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Model 1: SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(X_resampled, y_resampled, test_size=0.2, stratify=y_resampled, random_state=42)
rf_model_1 = RandomForestClassifier(n_estimators=100, random_state=42)
train_and_log_model("RF_SMOTE", rf_model_1, X_train_s, X_test_s, y_train_s, y_test_s)

# Model 2: No Balancing
rf_model_2 = RandomForestClassifier(n_estimators=100, random_state=42)
train_and_log_model("RF_No_Balancing", rf_model_2, X_train, X_test, y_train, y_test)

# Model 3: SMOTEENN
smoteenn = SMOTEENN(random_state=42)
X_resampled_se, y_resampled_se = smoteenn.fit_resample(X, y)
X_train_se, X_test_se, y_train_se, y_test_se = train_test_split(X_resampled_se, y_resampled_se, test_size=0.2, stratify=y_resampled_se, random_state=42)
rf_model_3 = RandomForestClassifier(n_estimators=100, random_state=42)
train_and_log_model("RF_SMOTEENN", rf_model_3, X_train_se, X_test_se, y_train_se, y_test_se)

# Model 4: Class Weights
rf_model_4 = RandomForestClassifier(n_estimators=100, random_state=42, class_weight={0: 1, 1: 2, 2: 3})
train_and_log_model("RF_Class_Weights", rf_model_4, X_train, X_test, y_train, y_test)




RF_SMOTE logged with Accuracy: 0.9839




RF_No_Balancing logged with Accuracy: 0.9897




RF_SMOTEENN logged with Accuracy: 1.0000




RF_Class_Weights logged with Accuracy: 0.9795


In [5]:
import os

from dotenv import load_dotenv
load_dotenv()

True

In [6]:
model_uri=os.getenv("MODEL_URI")
# Load the trained model
best_model = mlflow.pyfunc.load_model(model_uri)
unseen_data=pd.read_csv("data/unseen_data.csv")


In [7]:
X_unseen = unseen_data[['ForceSymmetry', 'MaxForceSymmetry', 'TorqueSymmetry']]
# Make predictions
predictions = best_model.predict(X_unseen)
# Mapping predictions back to Risk Categories
risk_mapping_inverse = {0: 'Low Risk', 1: 'Medium Risk', 2: 'High Risk'}

unseen_data['Predicted Risk']=pd.Series(predictions).map(risk_mapping_inverse)

In [8]:
unseen_data

Unnamed: 0,sbuid,testDateUtc,leftAvgForce,leftImpulse,leftMaxForce,leftTorque,rightAvgForce,rightImpulse,rightMaxForce,rightTorque,ForceSymmetry,ImpulseSymmetry,MaxForceSymmetry,TorqueSymmetry,Predicted Risk
0,14568521.0,2025-01-08,483.375,5730.71,494.75,211.505625,541.25,5814.26,560.5,239.61375,0.893072,0.98563,0.882694,0.882694,Low Risk
1,112900560.0,2025-01-22,588.625,2786.57,607.0,275.8815,481.875,2289.36,494.5,224.75025,1.22153,1.217183,1.227503,1.227503,Medium Risk
2,113328523.0,2025-01-08,442.625,3319.4,451.5,209.27025,379.5,2921.005,387.5,179.60625,1.166337,1.13639,1.165161,1.165161,Medium Risk
3,113328523.0,2025-01-22,592.875,5623.18,614.25,284.704875,504.5,5573.355,519.5,240.78825,1.175173,1.00894,1.182387,1.182387,Medium Risk
4,114215402.0,2025-01-22,540.375,4927.61,563.5,261.18225,599.5,5116.78,602.25,279.142875,0.901376,0.963029,0.935658,0.935658,Low Risk
5,114737056.0,2025-01-22,468.75,2890.84,485.75,203.286375,484.125,3314.205,492.25,206.006625,0.968242,0.872257,0.986795,0.986795,Low Risk
6,114755618.0,2025-01-22,494.125,4411.01,510.5,236.61675,458.0,3922.86,468.25,217.033875,1.078876,1.124437,1.09023,1.09023,Low Risk
7,115197404.0,2025-01-22,594.75,6660.04,606.0,280.881,608.5,6313.285,620.25,287.485875,0.977403,1.054925,0.977025,0.977025,Low Risk
8,115624874.0,2025-01-22,503.5,6560.345,503.5,224.30925,555.0,7262.795,555.0,247.2525,0.907207,0.903281,0.907207,0.907207,Low Risk
9,115667134.0,2025-01-22,499.5,4261.785,504.75,211.237875,456.25,3628.1,462.5,193.55625,1.094795,1.17466,1.091351,1.091351,Low Risk
