In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score
import xgboost as xgb
import matplotlib.pyplot as plt
from openpyxl.styles import PatternFill
from openpyxl.drawing.image import Image
from openpyxl import Workbook
import os
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import GridSearchCV

# -----------------------------------------------------------------------------------------------------------------------------------------
## Step 1: Model Initializing

# load training and testing data
train_df = pd.read_excel("cleaned_training_data_v3.xlsx")
test_df = pd.read_excel("cleaned_test_data_v3.xlsx")
real_df = pd.read_excel("cleaned_realuse_data_v3.xlsx")

# features for prediction
features = [
    'sleep_score', 
    'weekly_training_load', 'weekly_training_duration', 'RTT', 'Mood state', 'Muscle readiness', 'Energy levels', 'Academic Pressure'
]

# targets
targets = ['illed', 'injured']

# keep User ID, Name, and Date for reference
user_info_columns = ['User ID', 'Date']

# drop missing values in features or targets
train_df = train_df.dropna(subset=features + targets)
test_df = test_df.dropna(subset=features + targets)

# define X (features) and Y (targets)
X_train = train_df[features]
y_train_ill = train_df['illed']
y_train_injured = train_df['injured']

X_test = test_df[features]
y_test_ill = test_df['illed']
y_test_injured = test_df['injured']

# standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# -----------------------------------------------------------------------------------------------------------------------------------------
## Step 2: Handle Class Imbalance

# check class distribution
print("Class distribution for 'illed':")
print(y_train_ill.value_counts())

print("Class distribution for 'injured':")
print(y_train_injured.value_counts())

# method 1: resampling (SMOTE for oversampling)
smote = SMOTE(random_state=42)
X_train_resampled_injured, y_train_resampled_injured = smote.fit_resample(X_train_scaled, y_train_injured)
X_train_resampled_ill, y_train_resampled_ill = smote.fit_resample(X_train_scaled, y_train_ill)

# method 2: Adjust class weights
class_weights_injured = {
    0: len(y_train_injured) / (2 * len(y_train_injured[y_train_injured == 0])),
    1: len(y_train_injured) / (2 * len(y_train_injured[y_train_injured == 1]))
}

class_weights_ill = {
    0: len(y_train_ill) / (2 * len(y_train_ill[y_train_ill == 0])),
    1: len(y_train_ill) / (2 * len(y_train_ill[y_train_ill == 1]))
}


print("Training Random Forest - Injured...")
rf_model_injured = RandomForestClassifier(
    n_estimators=300,
    max_depth=20,
    min_samples_split=5,
    class_weight=class_weights_injured,  # Adjust class weights
    random_state=42
)
rf_model_injured.fit(X_train_resampled_injured, y_train_resampled_injured)  # Use resampled data

print("Training Logistic Regression - Injured...")
lr_model_injured = LogisticRegression(
    C=10,  
    max_iter=100,  
    solver='liblinear',  
    class_weight='balanced',
    random_state=42
)
lr_model_injured.fit(X_train_resampled_injured, y_train_resampled_injured)  # Use resampled data

print("Training XGBoost - Injured...")
xgb_model_injured = xgb.XGBClassifier(
    learning_rate=0.01,  
    n_estimators=3000, 
    max_depth=6,  
    subsample=0.9,
    colsample_bytree=0.8,
    scale_pos_weight=len(y_train_injured[y_train_injured == 0]) / len(y_train_injured[y_train_injured == 1]),
    gamma=0.1,  # regularization parameter to avoid overfitting
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)
xgb_model_injured.fit(X_train_resampled_injured, y_train_resampled_injured)  # Use resampled data

print("Training Random Forest - Ill...")
rf_model_ill = RandomForestClassifier(
    n_estimators=300,
    max_depth=20,
    min_samples_split=5,
    class_weight=class_weights_ill,  # Adjust class weights
    random_state=42
)
rf_model_ill.fit(X_train_resampled_ill, y_train_resampled_ill)  # Use resampled data

print("Training Logistic Regression - Ill...")
lr_model_ill = LogisticRegression(
    C=10, 
    max_iter=100, 
    solver='liblinear', 
    class_weight='balanced', 
    random_state=42
)
lr_model_ill.fit(X_train_resampled_ill, y_train_resampled_ill)  # Use resampled data

print("Training XGBoost - Ill...")
xgb_model_ill = xgb.XGBClassifier(
    learning_rate=0.01,  
    n_estimators=3000,  
    max_depth=6,  
    subsample=0.9, 
    colsample_bytree=0.8, 
    scale_pos_weight=len(y_train_ill[y_train_ill == 0]) / len(y_train_ill[y_train_ill == 1]),
    gamma=0.1,  # regularization parameter to avoid overfitting
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)
xgb_model_ill.fit(X_train_resampled_ill, y_train_resampled_ill)  # Use resampled data

# -----------------------------------------------------------------------------------------------------------------------------------------
## Step 4: Model Evaluating

# predictions
print("Making Predictions for Injured...")
y_pred_rf_injured = rf_model_injured.predict(X_test_scaled)
y_pred_lr_injured = lr_model_injured.predict(X_test_scaled)
y_pred_xgb_injured = xgb_model_injured.predict(X_test_scaled)

print("Making Predictions for Ill...")
y_pred_rf_ill = rf_model_ill.predict(X_test_scaled)
y_pred_lr_ill = lr_model_ill.predict(X_test_scaled)
y_pred_xgb_ill = xgb_model_ill.predict(X_test_scaled)

# calculate accuracy for each model
print("Calculating Accuracy for Models...")
accuracy_rf_injured = accuracy_score(y_test_injured, y_pred_rf_injured)
accuracy_lr_injured = accuracy_score(y_test_injured, y_pred_lr_injured)
accuracy_xgb_injured = accuracy_score(y_test_injured, y_pred_xgb_injured)

accuracy_rf_ill = accuracy_score(y_test_ill, y_pred_rf_ill)
accuracy_lr_ill = accuracy_score(y_test_ill, y_pred_lr_ill)
accuracy_xgb_ill = accuracy_score(y_test_ill, y_pred_xgb_ill)

# save test set evaluation results to Excel
test_evaluation_results = {
    'Model': ['Random Forest', 'Logistic Regression', 'XGBoost'],
    'Accuracy_Injured': [accuracy_rf_injured, accuracy_lr_injured, accuracy_xgb_injured],
    'Accuracy_Ill': [accuracy_rf_ill, accuracy_lr_ill, accuracy_xgb_ill]
}
test_evaluation_df = pd.DataFrame(test_evaluation_results)


# drop rows with missing values for the specific features
real_df = real_df.dropna(subset=features)
X_real = real_df[features]
X_real_scaled = scaler.transform(X_real)

# predict using the best model (we can choose the model we prefer from Random Forest, Logistic Regression and XGBoost)
print("Making Predictions for Real-Use Data...")
real_df['Prediction_Injury_RandomForest'] = rf_model_injured.predict(X_real_scaled)
real_df['Prediction_Injury_LogisticRegression'] = lr_model_injured.predict(X_real_scaled)
real_df['Prediction_Injury_XGBoost'] = xgb_model_injured.predict(X_real_scaled)

real_df['Prediction_Illness_RandomForest'] = rf_model_ill.predict(X_real_scaled)
real_df['Prediction_Illness_LogisticRegression'] = lr_model_ill.predict(X_real_scaled)
real_df['Prediction_Illness_XGBoost'] = xgb_model_ill.predict(X_real_scaled)

# calculate correctness for each row in real-use data
real_df['Correct_check_Injury_RandomForest'] = (real_df['Prediction_Injury_RandomForest'] == real_df['injured']).astype(int)
real_df['Correct_check_Injury_LogisticRegression'] = (real_df['Prediction_Injury_LogisticRegression'] == real_df['injured']).astype(int)
real_df['Correct_check_Injury_XGBoost'] = (real_df['Prediction_Injury_XGBoost'] == real_df['injured']).astype(int)

real_df['Correct_check_Illness_RandomForest'] = (real_df['Prediction_Illness_RandomForest'] == real_df['illed']).astype(int)
real_df['Correct_check_Illness_LogisticRegression'] = (real_df['Prediction_Illness_LogisticRegression'] == real_df['illed']).astype(int)
real_df['Correct_check_Illness_XGBoost'] = (real_df['Prediction_Illness_XGBoost'] == real_df['illed']).astype(int)

# calculate global accuracy for real-use data
real_rf_injured_acc = real_df['Correct_check_Injury_RandomForest'].mean()
real_lr_injured_acc = real_df['Correct_check_Injury_LogisticRegression'].mean()
real_xgb_injured_acc = real_df['Correct_check_Injury_XGBoost'].mean()

real_rf_ill_acc = real_df['Correct_check_Illness_RandomForest'].mean()
real_lr_ill_acc = real_df['Correct_check_Illness_LogisticRegression'].mean()
real_xgb_ill_acc = real_df['Correct_check_Illness_XGBoost'].mean()

# add global accuracy columns to real-use data
real_df['Total_accuracy_Injury_RandomForest'] = real_rf_injured_acc
real_df['Total_accuracy_Injury_LogisticRegression'] = real_lr_injured_acc
real_df['Total_accuracy_Injury_XGBoost'] = real_xgb_injured_acc

real_df['Total_accuracy_Illness_RandomForest'] = real_rf_ill_acc
real_df['Total_accuracy_Illness_LogisticRegression'] = real_lr_ill_acc
real_df['Total_accuracy_Illness_XGBoost'] = real_xgb_ill_acc

# highlight incorrect predictions in Excel
def highlight_incorrect_predictions(df, writer, sheet_name):
    wb = writer.book
    ws = wb[sheet_name]
    red_fill = PatternFill(start_color='FF0000', end_color='FF0000', fill_type='solid')
    
    # iterate over each row in the DataFrame
    for row_idx in range(len(df)):  # use range(len(df)) to iterate over valid row indices
        for col in ['Correct_check_Injury_RandomForest', 'Correct_check_Injury_LogisticRegression', 'Correct_check_Injury_XGBoost',
                    'Correct_check_Illness_RandomForest', 'Correct_check_Illness_LogisticRegression', 'Correct_check_Illness_XGBoost']:
            if df.iloc[row_idx][col] == 0:  # use iloc to access rows by position
                # excel rows start from 1, and we need to skip the header row
                ws.cell(row=row_idx + 2, column=df.columns.get_loc(col) + 1).fill = red_fill


# generate bar chart for accuracy comparison
models = ['Random Forest', 'Logistic Regression', 'XGBoost']

# test data accuracy
train_injury_accuracies = [accuracy_rf_injured, accuracy_lr_injured, accuracy_xgb_injured]
train_illness_accuracies = [accuracy_rf_ill, accuracy_lr_ill, accuracy_xgb_ill]

# realuse data accuracy
test_injury_accuracies = [real_rf_injured_acc, real_lr_injured_acc, real_xgb_injured_acc]
test_illness_accuracies = [real_rf_ill_acc, real_lr_ill_acc, real_xgb_ill_acc]

# save results to Excel
with pd.ExcelWriter('prediction_v5_no_diet_yesterday.xlsx', engine='openpyxl') as writer:
    # Test set evaluation results
    test_evaluation_df.to_excel(writer, sheet_name='Test_Evaluation_Accuracy', index=False)
    
    # real-use data
    real_predictions = real_df[user_info_columns + [
        'Prediction_Injury_RandomForest', 'Correct_check_Injury_RandomForest','Total_accuracy_Injury_RandomForest',
        'Prediction_Injury_LogisticRegression', 'Correct_check_Injury_LogisticRegression','Total_accuracy_Injury_LogisticRegression',
        'Prediction_Injury_XGBoost', 'Correct_check_Injury_XGBoost','Total_accuracy_Injury_XGBoost',
        'Prediction_Illness_RandomForest', 'Correct_check_Illness_RandomForest','Total_accuracy_Illness_RandomForest',
        'Prediction_Illness_LogisticRegression', 'Correct_check_Illness_LogisticRegression','Total_accuracy_Illness_LogisticRegression',
        'Prediction_Illness_XGBoost', 'Correct_check_Illness_XGBoost','Total_accuracy_Illness_XGBoost'
    ]]
    real_predictions.to_excel(writer, sheet_name='Real-Use_Predictions', index=False)
    highlight_incorrect_predictions(real_predictions, writer, 'Real-Use_Predictions')

print("Predictions, evaluation results, and confusion matrices saved successfully!")

Class distribution for 'illed':
illed
0    57684
1       22
Name: count, dtype: int64
Class distribution for 'injured':
injured
0    57529
1      177
Name: count, dtype: int64
Training Random Forest - Injured...
Training Logistic Regression - Injured...
Training XGBoost - Injured...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Training Random Forest - Ill...
Training Logistic Regression - Ill...
Training XGBoost - Ill...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Making Predictions for Injured...
Making Predictions for Ill...
Calculating Accuracy for Models...
Making Predictions for Real-Use Data...
Predictions, evaluation results, and confusion matrices saved successfully!
