In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import torch
import torch.nn as nn
import torch.optim as optim


# -----------------------------------------------------------------------------------------------------------------------------------------
## Step 1-1: Data Collection and Preprocessing

# list of all years from 2014 to 2025
years = list(range(2014, 2026))
file_paths = [f"{year}.xlsx" for year in years]  # files are named in format of "2014.xlsx"

# read and merge all files
dfs = [pd.read_excel(file) for file in file_paths]  # read each file into a DataFrame
df = pd.concat(dfs, ignore_index=True)  # merge all DataFrames

# filter to select both "User ID" and "Name" columns and remove duplicate combinations
df_unique = df[['User ID', 'Name']].drop_duplicates()

# save the filtered DataFrame with unique "User ID" and "Name" pairs to an Excel file
df_unique.to_excel('unique_user_id_name.xlsx', index=False)  # Saves the unique User ID and Name pairs to an Excel file
print('generated successfully')
# df.to_excel('merged_data.xlsx', index=False)  # Saves the DataFrame to an Excel file without the index column


generated successfully


In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import torch
import torch.nn as nn
import torch.optim as optim


# -----------------------------------------------------------------------------------------------------------------------------------------
## Step 1-2: Data Collection and Preprocessing(training data)

# list of all years from 2014 to 2017
years = list(range(2014, 2018))
file_paths = [f"{year}.xlsx" for year in years]  # files are named in format of "2014.xlsx"

# read and merge all files
dfs = [pd.read_excel(file) for file in file_paths]  # read each file into a DataFrame
df = pd.concat(dfs, ignore_index=True)  # merge all DataFrames
# save the merged DataFrame as an Excel file
df.to_excel("merged_training_data.xlsx", index=False)  # set index=False to avoid saving the index column
print("saved successfully!")


saved successfully!


In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import torch
import torch.nn as nn
import torch.optim as optim


# -----------------------------------------------------------------------------------------------------------------------------------------
## Step 1-3: Data Collection and Preprocessing(test data)

# list of all years from 2018 to 2021
years = list(range(2018, 2022))
file_paths = [f"{year}.xlsx" for year in years]  # files are named in format of "2014.xlsx"

# read and merge all files
dfs = [pd.read_excel(file) for file in file_paths]  # read each file into a DataFrame
df = pd.concat(dfs, ignore_index=True)  # merge all DataFrames
# save the merged DataFrame as an Excel file
df.to_excel("merged_test_data.xlsx", index=False)  # set index=False to avoid saving the index column
print("saved successfully!")


saved successfully!


In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import torch
import torch.nn as nn
import torch.optim as optim


# -----------------------------------------------------------------------------------------------------------------------------------------
## Step 1-4: Data Collection and Preprocessing(real-use data)

# list of all years from 2022 to 2025
years = list(range(2022, 2026))
file_paths = [f"{year}.xlsx" for year in years]  # files are named in format of "2014.xlsx"

# read and merge all files
dfs = [pd.read_excel(file) for file in file_paths]  # read each file into a DataFrame
df = pd.concat(dfs, ignore_index=True)  # merge all DataFrames
# save the merged DataFrame as an Excel file
df.to_excel("merged_realuse_data.xlsx", index=False)  # set index=False to avoid saving the index column
print("saved successfully!")


saved successfully!


In [18]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from datetime import datetime  
import time  

# -----------------------------------------------------------------------------------------------------------------------------------------
## Step 2-1: Data Cleaning and Transformation(training data)

# record the start time of executing this code block
start_time = time.time()

# define a function to print time information
def log_with_timestamp(message):
    current_time = time.time() - start_time  # calculating executing time so far
    print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] [Elapsed: {current_time:.2f}s] {message}")

log_with_timestamp("Loading data...")
# load the already merged file
df = pd.read_excel("merged_training_data.xlsx")

log_with_timestamp("Converting 'Date' column to datetime format...")
# convert 'Date' column to datetime format
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y', errors='coerce')

log_with_timestamp("Extracting year and week number...")
# extract year for filtering
df['Year'] = df['Date'].dt.year
# extract week number of the year
df['Week'] = df['Date'].dt.isocalendar().week  # ISO week number (1-53)

log_with_timestamp("Transforming 'Gender' column...")
# convert 'Gender' column: 'm' → 2, 'f' → 1
df['Gender_m2f1'] = df['Gender'].map({'m': 2, 'f': 1})
# drop the original 'Gender' column
df.drop(columns=['Gender'], inplace=True)

log_with_timestamp("Transforming 'Menstruation' column...")
# convert 'Menstruation' column: 'Y' → 1, blank → 0
df['Menstruation_y1n0'] = df['Menstruation'].map({'Y': 1, '': 0}).fillna(0).astype(int)
# drop the original 'Menstruation' column
df.drop(columns=['Menstruation'], inplace=True)

log_with_timestamp("Identifying training-related columns...")
# identify all columns related to training load and training duration
training_load_columns = df.filter(like="Training Load").columns  # find all "Training Load" columns
training_duration_columns = df.filter(like="Training Duration").columns  # find all "Training Duration" columns
training_type_columns = df.filter(like="Training Type").columns  # find all "Training Type" columns
training_RPE_columns = df.filter(like="Training RPE").columns  # find all "Training RPE" columns
training_HowDidIDo_columns = df.filter(like="Training HowDidIDo").columns  # find all "Training HowDidIDo" columns

log_with_timestamp("Cleaning 'Training Load' columns...")
# ensure all values in 'Training Load' columns are greater than 0
for column in training_load_columns:
    df[column] = np.where(df[column] > 0, df[column], 0)  # set to 0 if <= 0

log_with_timestamp("Cleaning 'Training RPE' columns...")
# ensure that all 'Training RPE' columns are valid (within the range of 1-10)
for column in training_RPE_columns:
    # apply the transformation to each column
    df[column] = np.where((df[column] >= 1) & (df[column] <= 10), df[column], 0)

log_with_timestamp("Cleaning 'Training HowDidIDo' columns...")
# ensure that all 'Training HowDidIDo' columns are valid (within the range of 1-5)
for column in training_HowDidIDo_columns:
    # apply the transformation to each column
    df[column] = np.where((df[column] >= 1) & (df[column] <= 5), df[column], 0)

log_with_timestamp("Calculating total training load and duration...")
# sum training load and duration, default to 0 if no columns exist
df["total_training_load"] = df[training_load_columns].sum(axis=1, skipna=True).fillna(0).astype(int)
df["total_training_duration"] = df[training_duration_columns].sum(axis=1, skipna=True).fillna(0).astype(int)

# ensure columns exist, then replace NaN with 0
df["total_training_load"] = df["total_training_load"].fillna(0).astype(int)
df["total_training_duration"] = df["total_training_duration"].fillna(0).astype(int)

log_with_timestamp("Grouping by Year, Week, and User ID...")
# group by Year and Week and user id and sum the daily values for each metric
weekly_training_load = df.groupby(['Year', 'Week', 'User ID'])['total_training_load'].sum().reset_index(name='weekly_training_load')
weekly_training_duration = df.groupby(['Year', 'Week', 'User ID'])['total_training_duration'].sum().reset_index(name='weekly_training_duration')

log_with_timestamp("Merging weekly totals per athlete...")
# merge the weekly totals back into the original dataframe
df = df.merge(weekly_training_load, on=['Year', 'Week', 'User ID'], how='left')
df = df.merge(weekly_training_duration, on=['Year', 'Week', 'User ID'], how='left')

# remove the original training load and training duration columns
df.drop(columns=list(training_load_columns) + list(training_duration_columns) + list(training_type_columns) + 
        list(training_RPE_columns) + list(training_HowDidIDo_columns), inplace=True)

# initialize columns for Acute and Chronic EWMA
acute_weight = 2 / (7 + 1)
chronic_weight = 2 / (28 + 1)

# initialize the ACWR column (this will store the final result)
df['ACWR'] = None  # initially set as None

# define a helper function to calculate ACWR for each group
def calculate_acwr(group):
    # sort by date
    group = group.sort_values(by='Date')
    
    # find the first day with training load > 0
    nonzero_load = group[group['total_training_load'] > 0]
    if nonzero_load.empty:
        group['ACWR'] = 0 # no valid training data in the whole year, set ACWR = 0
        return group  # no valid training data, return group as is
    
    # ensure year column in proper format
    if 'Year' not in group.columns:
        group['Year'] = group['Date'].dt.year 
    group['Year'] = group['Year'].astype('Int64')
    
    first_day = nonzero_load.iloc[0]['Date']
    end_date = pd.Timestamp(year=group['Year'].iloc[0], month=12, day=31)

    # ensure first_day and end_date are valid
    if pd.isna(first_day) or pd.isna(end_date):
        group['ACWR'] = 0
        return group
        
    # generate a complete date range for the year
    full_date_range = pd.date_range(start=first_day, end=end_date, freq='D')

    # reindex the group to the full date range, filling missing dates
    group = group.set_index('Date').reindex(full_date_range).reset_index().copy()
    group = group.rename(columns={'index': 'Date'})

    # fill missing values for 'total_training_load' with 0
    group['total_training_load'] = group['total_training_load'].fillna(0)

    # forward fill only 'User ID', 'Year', 'Week', 'Gender_m2f1'
    group[['User ID', 'Year', 'Week', 'Gender_m2f1']] = group[['User ID', 'Year', 'Week', 'Gender_m2f1']].ffill()
    
    # set the specified columns to 0
    columns_to_zero = ['total_training_load', 'total_training_duration', 'Menstruation_y1n0', 
                   'Mood state', 'Sleep quality', 'Energy levels', 'Muscle readiness', 
                   'Diet Yesterday', 'Academic Pressure']
    group[columns_to_zero] = group[columns_to_zero].fillna(0).astype(int)

    # initialize acute and chronic workload
    acute = chronic = 0

    # loop through each row and calculate ACWR
    for i in tqdm(range(len(group)), desc="Calculating ACWR", mininterval=1):
        current_load = group.iloc[i]['total_training_load']

        if i == 0:
            acute = chronic = current_load  # initialize with first value in the date range
        else:
            # apply Exponentially Weighted Moving Average (EWMA)
            acute = acute_weight * current_load + (1 - acute_weight) * acute
            chronic = chronic_weight * current_load + (1 - chronic_weight) * chronic

        # compute ACWR
        group.loc[i, 'ACWR'] = acute / chronic if chronic != 0 else 0
    
    # fill missing weight values
    group['Weight'] = group['Weight'].astype(float)
    group['Weight'] = group.groupby(['User ID', 'Name'])['Weight'].fillna(method='bfill').fillna(method='ffill')

    return group

log_with_timestamp("Calculating ACWR for each user and year...")
# apply ACWR calculation function for each user per year
df = df.groupby(['User ID', 'Year'], group_keys=False).apply(calculate_acwr).reset_index(drop=True)

# define the value mappings
value_mapping = {
    1: 0,
    2: 1,
    3: 10,
    4: 20,
    5: 25
}

# define the weightings for each item
weightings = {
    'Mood state': 0.15,
    'Sleep quality': 0.20,
    'Energy levels': 0.30,
    'Muscle readiness': 0.15,
    'Diet Yesterday': 0.10,
    'Academic Pressure': 0.10
}

# check and clean Likert-scale questions (1-5 depending on the column)
likert_columns = ['Mood state', 'Sleep quality', 'Energy levels', 'Muscle readiness', 
                  'Diet Yesterday', 'Academic Pressure']

log_with_timestamp("Cleaning Likert-scale columns...")
# clean Likert-style columns: ensure values are within the valid range
for column in likert_columns:
    df[column] = np.where(df[column].isin([1, 2, 3, 4, 5]), df[column], np.nan)

log_with_timestamp("Cleaning 'Sleep duration' column...")
# assume values > 24 hours are outliers and should be removed or fixed
df['Sleep duration'] = np.where(df['Sleep duration'] > 24, df['Sleep duration'] / 60, df['Sleep duration']) # convert minutes to hours if over 24
# set values greater than 24 to NaN
df['Sleep duration'] = np.where(df['Sleep duration'] > 24, np.nan, df['Sleep duration'])

# function to calculate the weighted RTT score for each row
def calculate_rtt(row):
    total_score = 0
    total_weight = 0

    # for each item and its corresponding weighting
    for column, weight in weightings.items():
        if column in row:
            score = row[column]
            value = value_mapping.get(score, 0) / 25  # use the value mapping, default to 0 if not found
            total_score += value * weight  # add the weighted score
            total_weight += weight  # track the total weight
    
    return total_score

log_with_timestamp("Calculating RTT scores...")
tqdm.pandas(desc="Calculating RTT") 
# apply the function to each row to create the 'RTT' column
df['RTT'] = df.progress_apply(calculate_rtt, axis=1)

log_with_timestamp("Filling missing weight values...")
df['Weight'] = df['Weight'].astype(float)
# fill missing weights by carrying forward the last valid weight for the same user
df['weight'] = df.groupby(['User ID', 'Name'])['Weight'].fillna(method='bfill').fillna(method='ffill')
# (method='ffill'): fills the missing weight values by carrying forward the last known weight within each user group. 
# if a new weight is found for a user, it will be used for the subsequent missing values until another weight is encountered.

log_with_timestamp("Creating 'injured' column...")
# create 'injured' column: 1 if 'Injury location' is not empty, 0 otherwise
df['injured'] = df['Injury location'].notna().astype(int)

# create a mapping dictionary
severity_mapping = {
    'mild': 1,
    'moderate': 2,
    'severe': 3
}

log_with_timestamp("Mapping 'Injury severity' to numeric values...")
# map the 'Injury severity' column to numeric values
df['Injury_severity_numeric_mi1mo2se3'] = df['Injury severity'].map(severity_mapping)

log_with_timestamp("Performing one-hot encoding...")
# perform one-hot encoding for categorical columns using get_dummies
df_encoded = pd.get_dummies(df, columns=['Injury type', 'Injury surface', 'Surface condition', 'Injury tissue type'])

# final check for weekly_training_load and weekly_training_duration
log_with_timestamp("check weekly_training_load and weekly_training_duration value(if none)...")

def fill_weekly_values(group):
    if group.notna().any():
        return group.fillna(method='ffill').fillna(method='bfill')
    else:
        return group.fillna(0)

df['weekly_training_load'] = df.groupby(['Year', 'Week', 'User ID'])['weekly_training_load'].transform(fill_weekly_values)
df['weekly_training_duration'] = df.groupby(['Year', 'Week', 'User ID'])['weekly_training_duration'].transform(fill_weekly_values)

log_with_timestamp("Saving cleaned data...")
# save the final cleaned DataFrame
df.to_excel("cleaned_training_data.xlsx", index=True)
log_with_timestamp("Cleaned training data saved successfully!")


[2025-03-12 14:34:25] [Elapsed: 0.00s] Loading data...
[2025-03-12 14:35:57] [Elapsed: 91.25s] Converting 'Date' column to datetime format...
[2025-03-12 14:35:57] [Elapsed: 91.31s] Extracting year and week number...
[2025-03-12 14:35:57] [Elapsed: 91.31s] Transforming 'Gender' column...
[2025-03-12 14:35:57] [Elapsed: 91.35s] Transforming 'Menstruation' column...
[2025-03-12 14:35:57] [Elapsed: 91.39s] Identifying training-related columns...
[2025-03-12 14:35:57] [Elapsed: 91.41s] Cleaning 'Training Load' columns...
[2025-03-12 14:35:57] [Elapsed: 91.42s] Cleaning 'Training RPE' columns...
[2025-03-12 14:35:57] [Elapsed: 91.44s] Cleaning 'Training HowDidIDo' columns...
[2025-03-12 14:35:57] [Elapsed: 91.45s] Calculating total training load and duration...
[2025-03-12 14:35:57] [Elapsed: 91.50s] Grouping by Year, Week, and User ID...
[2025-03-12 14:35:57] [Elapsed: 91.51s] Merging weekly totals per athlete...


  df["total_training_load"] = df[training_load_columns].sum(axis=1, skipna=True).fillna(0).astype(int)
  df["total_training_duration"] = df[training_duration_columns].sum(axis=1, skipna=True).fillna(0).astype(int)


[2025-03-12 14:35:57] [Elapsed: 91.67s] Calculating ACWR for each user and year...


Calculating ACWR: 100%|████████████████████| 296/296 [00:00<00:00, 10008.17it/s]
  group['Weight'] = group.groupby(['User ID', 'Name'])['Weight'].fillna(method='bfill').fillna(method='ffill')
  group['Weight'] = group.groupby(['User ID', 'Name'])['Weight'].fillna(method='bfill').fillna(method='ffill')
Calculating ACWR: 100%|████████████████████| 294/294 [00:00<00:00, 10480.15it/s]
  group['Weight'] = group.groupby(['User ID', 'Name'])['Weight'].fillna(method='bfill').fillna(method='ffill')
  group['Weight'] = group.groupby(['User ID', 'Name'])['Weight'].fillna(method='bfill').fillna(method='ffill')
Calculating ACWR: 100%|████████████████████| 300/300 [00:00<00:00, 10360.91it/s]
  group['Weight'] = group.groupby(['User ID', 'Name'])['Weight'].fillna(method='bfill').fillna(method='ffill')
  group['Weight'] = group.groupby(['User ID', 'Name'])['Weight'].fillna(method='bfill').fillna(method='ffill')
Calculating ACWR: 100%|████████████████████| 303/303 [00:00<00:00, 10174.89it/s]
  group['W

[2025-03-12 14:36:14] [Elapsed: 108.94s] Cleaning Likert-scale columns...
[2025-03-12 14:36:14] [Elapsed: 108.96s] Cleaning 'Sleep duration' column...
[2025-03-12 14:36:14] [Elapsed: 108.97s] Calculating RTT scores...


Calculating RTT: 100%|██████████████| 138640/138640 [00:01<00:00, 108220.68it/s]
  df['weight'] = df.groupby(['User ID', 'Name'])['Weight'].fillna(method='bfill').fillna(method='ffill')
  df['weight'] = df.groupby(['User ID', 'Name'])['Weight'].fillna(method='bfill').fillna(method='ffill')
  return group.fillna(method='ffill').fillna(method='bfill')


[2025-03-12 14:36:16] [Elapsed: 110.25s] Filling missing weight values...
[2025-03-12 14:36:16] [Elapsed: 110.27s] Creating 'injured' column...
[2025-03-12 14:36:16] [Elapsed: 110.27s] Mapping 'Injury severity' to numeric values...
[2025-03-12 14:36:16] [Elapsed: 110.28s] Performing one-hot encoding...
[2025-03-12 14:36:16] [Elapsed: 110.37s] check weekly_training_load and weekly_training_duration value(if none)...
[2025-03-12 14:36:18] [Elapsed: 112.43s] Saving cleaned data...
[2025-03-12 14:37:13] [Elapsed: 167.53s] Cleaned training data saved successfully!


In [19]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from datetime import datetime  
import time  

# -----------------------------------------------------------------------------------------------------------------------------------------
## Step 2-2: Data Cleaning and Transformation(test data)

# record the start time of executing this code block
start_time = time.time()

# define a function to print time information
def log_with_timestamp(message):
    current_time = time.time() - start_time  # calculating executing time so far
    print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] [Elapsed: {current_time:.2f}s] {message}")

log_with_timestamp("Loading data...")
# load the already merged file
df = pd.read_excel("merged_test_data.xlsx")

log_with_timestamp("Converting 'Date' column to datetime format...")
# convert 'Date' column to datetime format
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y', errors='coerce')

log_with_timestamp("Extracting year and week number...")
# extract year for filtering
df['Year'] = df['Date'].dt.year
# extract week number of the year
df['Week'] = df['Date'].dt.isocalendar().week  # ISO week number (1-53)

log_with_timestamp("Transforming 'Gender' column...")
# convert 'Gender' column: 'm' → 2, 'f' → 1
df['Gender_m2f1'] = df['Gender'].map({'m': 2, 'f': 1})
# drop the original 'Gender' column
df.drop(columns=['Gender'], inplace=True)

log_with_timestamp("Transforming 'Menstruation' column...")
# convert 'Menstruation' column: 'Y' → 1, blank → 0
df['Menstruation_y1n0'] = df['Menstruation'].map({'Y': 1, '': 0}).fillna(0).astype(int)
# drop the original 'Menstruation' column
df.drop(columns=['Menstruation'], inplace=True)

log_with_timestamp("Identifying training-related columns...")
# identify all columns related to training load and training duration
training_load_columns = df.filter(like="Training Load").columns  # find all "Training Load" columns
training_duration_columns = df.filter(like="Training Duration").columns  # find all "Training Duration" columns
training_type_columns = df.filter(like="Training Type").columns  # find all "Training Type" columns
training_RPE_columns = df.filter(like="Training RPE").columns  # find all "Training RPE" columns
training_HowDidIDo_columns = df.filter(like="Training HowDidIDo").columns  # find all "Training HowDidIDo" columns

log_with_timestamp("Cleaning 'Training Load' columns...")
# ensure all values in 'Training Load' columns are greater than 0
for column in training_load_columns:
    df[column] = np.where(df[column] > 0, df[column], 0)  # set to 0 if <= 0

log_with_timestamp("Cleaning 'Training RPE' columns...")
# ensure that all 'Training RPE' columns are valid (within the range of 1-10)
for column in training_RPE_columns:
    # apply the transformation to each column
    df[column] = np.where((df[column] >= 1) & (df[column] <= 10), df[column], 0)

log_with_timestamp("Cleaning 'Training HowDidIDo' columns...")
# ensure that all 'Training HowDidIDo' columns are valid (within the range of 1-5)
for column in training_HowDidIDo_columns:
    # apply the transformation to each column
    df[column] = np.where((df[column] >= 1) & (df[column] <= 5), df[column], 0)

log_with_timestamp("Calculating total training load and duration...")
# sum training load and duration, default to 0 if no columns exist
df["total_training_load"] = df[training_load_columns].sum(axis=1, skipna=True).fillna(0).astype(int)
df["total_training_duration"] = df[training_duration_columns].sum(axis=1, skipna=True).fillna(0).astype(int)

# ensure columns exist, then replace NaN with 0
df["total_training_load"] = df["total_training_load"].fillna(0).astype(int)
df["total_training_duration"] = df["total_training_duration"].fillna(0).astype(int)

log_with_timestamp("Grouping by Year, Week, and User ID...")
# group by Year and Week and user id and sum the daily values for each metric
weekly_training_load = df.groupby(['Year', 'Week', 'User ID'])['total_training_load'].sum().reset_index(name='weekly_training_load')
weekly_training_duration = df.groupby(['Year', 'Week', 'User ID'])['total_training_duration'].sum().reset_index(name='weekly_training_duration')

log_with_timestamp("Merging weekly totals per athlete...")
# merge the weekly totals back into the original dataframe
df = df.merge(weekly_training_load, on=['Year', 'Week', 'User ID'], how='left')
df = df.merge(weekly_training_duration, on=['Year', 'Week', 'User ID'], how='left')

# remove the original training load and training duration columns
df.drop(columns=list(training_load_columns) + list(training_duration_columns) + list(training_type_columns) + 
        list(training_RPE_columns) + list(training_HowDidIDo_columns), inplace=True)

# initialize columns for Acute and Chronic EWMA
acute_weight = 2 / (7 + 1)
chronic_weight = 2 / (28 + 1)

# initialize the ACWR column (this will store the final result)
df['ACWR'] = None  # initially set as None

# define a helper function to calculate ACWR for each group
def calculate_acwr(group):
    # sort by date
    group = group.sort_values(by='Date')
    
    # find the first day with training load > 0
    nonzero_load = group[group['total_training_load'] > 0]
    if nonzero_load.empty:
        group['ACWR'] = 0 # no valid training data in the whole year, set ACWR = 0
        return group  # no valid training data, return group as is
    
    # ensure year column in proper format
    if 'Year' not in group.columns:
        group['Year'] = group['Date'].dt.year 
    group['Year'] = group['Year'].astype('Int64')
    
    first_day = nonzero_load.iloc[0]['Date']
    end_date = pd.Timestamp(year=group['Year'].iloc[0], month=12, day=31)

    # ensure first_day and end_date are valid
    if pd.isna(first_day) or pd.isna(end_date):
        group['ACWR'] = 0
        return group
        
    # generate a complete date range for the year
    full_date_range = pd.date_range(start=first_day, end=end_date, freq='D')

    # reindex the group to the full date range, filling missing dates
    group = group.set_index('Date').reindex(full_date_range).reset_index().copy()
    group = group.rename(columns={'index': 'Date'})

    # fill missing values for 'total_training_load' with 0
    group['total_training_load'] = group['total_training_load'].fillna(0)

    # forward fill only 'User ID', 'Year', 'Week', 'Gender_m2f1'
    group[['User ID', 'Year', 'Week', 'Gender_m2f1']] = group[['User ID', 'Year', 'Week', 'Gender_m2f1']].ffill()
    
    # set the specified columns to 0
    columns_to_zero = ['total_training_load', 'total_training_duration', 'Menstruation_y1n0', 
                   'Mood state', 'Sleep quality', 'Energy levels', 'Muscle readiness', 
                   'Diet Yesterday', 'Academic Pressure']
    group[columns_to_zero] = group[columns_to_zero].fillna(0).astype(int)

    # initialize acute and chronic workload
    acute = chronic = 0

    # loop through each row and calculate ACWR
    for i in tqdm(range(len(group)), desc="Calculating ACWR", mininterval=1):
        current_load = group.iloc[i]['total_training_load']

        if i == 0:
            acute = chronic = current_load  # initialize with first value in the date range
        else:
            # apply Exponentially Weighted Moving Average (EWMA)
            acute = acute_weight * current_load + (1 - acute_weight) * acute
            chronic = chronic_weight * current_load + (1 - chronic_weight) * chronic

        # compute ACWR
        group.loc[i, 'ACWR'] = acute / chronic if chronic != 0 else 0
    
    # fill missing weight values
    group['Weight'] = group['Weight'].astype(float)
    group['Weight'] = group.groupby(['User ID', 'Name'])['Weight'].fillna(method='bfill').fillna(method='ffill')

    return group

log_with_timestamp("Calculating ACWR for each user and year...")
# apply ACWR calculation function for each user per year
df = df.groupby(['User ID', 'Year'], group_keys=False).apply(calculate_acwr).reset_index(drop=True)

# define the value mappings
value_mapping = {
    1: 0,
    2: 1,
    3: 10,
    4: 20,
    5: 25
}

# define the weightings for each item
weightings = {
    'Mood state': 0.15,
    'Sleep quality': 0.20,
    'Energy levels': 0.30,
    'Muscle readiness': 0.15,
    'Diet Yesterday': 0.10,
    'Academic Pressure': 0.10
}

# check and clean Likert-scale questions (1-5 depending on the column)
likert_columns = ['Mood state', 'Sleep quality', 'Energy levels', 'Muscle readiness', 
                  'Diet Yesterday', 'Academic Pressure']

log_with_timestamp("Cleaning Likert-scale columns...")
# clean Likert-style columns: ensure values are within the valid range
for column in likert_columns:
    df[column] = np.where(df[column].isin([1, 2, 3, 4, 5]), df[column], np.nan)

log_with_timestamp("Cleaning 'Sleep duration' column...")
# assume values > 24 hours are outliers and should be removed or fixed
df['Sleep duration'] = np.where(df['Sleep duration'] > 24, df['Sleep duration'] / 60, df['Sleep duration']) # convert minutes to hours if over 24
# set values greater than 24 to NaN
df['Sleep duration'] = np.where(df['Sleep duration'] > 24, np.nan, df['Sleep duration'])

# function to calculate the weighted RTT score for each row
def calculate_rtt(row):
    total_score = 0
    total_weight = 0

    # for each item and its corresponding weighting
    for column, weight in weightings.items():
        if column in row:
            score = row[column]
            value = value_mapping.get(score, 0) / 25  # use the value mapping, default to 0 if not found
            total_score += value * weight  # add the weighted score
            total_weight += weight  # track the total weight
    
    return total_score

log_with_timestamp("Calculating RTT scores...")
tqdm.pandas(desc="Calculating RTT") 
# apply the function to each row to create the 'RTT' column
df['RTT'] = df.progress_apply(calculate_rtt, axis=1)

log_with_timestamp("Filling missing weight values...")
df['Weight'] = df['Weight'].astype(float)
# fill missing weights by carrying forward the last valid weight for the same user
df['weight'] = df.groupby(['User ID', 'Name'])['Weight'].fillna(method='bfill').fillna(method='ffill')
# (method='ffill'): fills the missing weight values by carrying forward the last known weight within each user group. 
# if a new weight is found for a user, it will be used for the subsequent missing values until another weight is encountered.

log_with_timestamp("Creating 'injured' column...")
# create 'injured' column: 1 if 'Injury location' is not empty, 0 otherwise
df['injured'] = df['Injury location'].notna().astype(int)

# create a mapping dictionary
severity_mapping = {
    'mild': 1,
    'moderate': 2,
    'severe': 3
}

log_with_timestamp("Mapping 'Injury severity' to numeric values...")
# map the 'Injury severity' column to numeric values
df['Injury_severity_numeric_mi1mo2se3'] = df['Injury severity'].map(severity_mapping)

log_with_timestamp("Performing one-hot encoding...")
# perform one-hot encoding for categorical columns using get_dummies
df_encoded = pd.get_dummies(df, columns=['Injury type', 'Injury surface', 'Surface condition', 'Injury tissue type'])

# final check for weekly_training_load and weekly_training_duration
log_with_timestamp("check weekly_training_load and weekly_training_duration value(if none)...")

def fill_weekly_values(group):
    if group.notna().any():
        return group.fillna(method='ffill').fillna(method='bfill')
    else:
        return group.fillna(0)

df['weekly_training_load'] = df.groupby(['Year', 'Week', 'User ID'])['weekly_training_load'].transform(fill_weekly_values)
df['weekly_training_duration'] = df.groupby(['Year', 'Week', 'User ID'])['weekly_training_duration'].transform(fill_weekly_values)

log_with_timestamp("Saving cleaned data...")
# save the final cleaned DataFrame
df.to_excel("cleaned_test_data.xlsx", index=True)
log_with_timestamp("Cleaned test data saved successfully!")


[2025-03-12 14:41:19] [Elapsed: 0.00s] Loading data...
[2025-03-12 14:42:17] [Elapsed: 57.20s] Converting 'Date' column to datetime format...
[2025-03-12 14:42:17] [Elapsed: 57.24s] Extracting year and week number...
[2025-03-12 14:42:17] [Elapsed: 57.24s] Transforming 'Gender' column...
[2025-03-12 14:42:17] [Elapsed: 57.27s] Transforming 'Menstruation' column...
[2025-03-12 14:42:17] [Elapsed: 57.29s] Identifying training-related columns...
[2025-03-12 14:42:17] [Elapsed: 57.31s] Cleaning 'Training Load' columns...
[2025-03-12 14:42:17] [Elapsed: 57.31s] Cleaning 'Training RPE' columns...
[2025-03-12 14:42:17] [Elapsed: 57.32s] Cleaning 'Training HowDidIDo' columns...
[2025-03-12 14:42:17] [Elapsed: 57.33s] Calculating total training load and duration...
[2025-03-12 14:42:17] [Elapsed: 57.36s] Grouping by Year, Week, and User ID...
[2025-03-12 14:42:17] [Elapsed: 57.36s] Merging weekly totals per athlete...


  df["total_training_load"] = df[training_load_columns].sum(axis=1, skipna=True).fillna(0).astype(int)
  df["total_training_duration"] = df[training_duration_columns].sum(axis=1, skipna=True).fillna(0).astype(int)


[2025-03-12 14:42:17] [Elapsed: 57.46s] Calculating ACWR for each user and year...


Calculating ACWR: 100%|█████████████████████| 309/309 [00:00<00:00, 9996.45it/s]
  group['Weight'] = group.groupby(['User ID', 'Name'])['Weight'].fillna(method='bfill').fillna(method='ffill')
  group['Weight'] = group.groupby(['User ID', 'Name'])['Weight'].fillna(method='bfill').fillna(method='ffill')
Calculating ACWR: 100%|████████████████████| 291/291 [00:00<00:00, 10335.70it/s]
  group['Weight'] = group.groupby(['User ID', 'Name'])['Weight'].fillna(method='bfill').fillna(method='ffill')
  group['Weight'] = group.groupby(['User ID', 'Name'])['Weight'].fillna(method='bfill').fillna(method='ffill')
Calculating ACWR: 100%|████████████████████| 290/290 [00:00<00:00, 10364.60it/s]
  group['Weight'] = group.groupby(['User ID', 'Name'])['Weight'].fillna(method='bfill').fillna(method='ffill')
  group['Weight'] = group.groupby(['User ID', 'Name'])['Weight'].fillna(method='bfill').fillna(method='ffill')
Calculating ACWR: 100%|████████████████████| 316/316 [00:00<00:00, 10569.29it/s]
  group['W

[2025-03-12 14:42:29] [Elapsed: 69.98s] Cleaning Likert-scale columns...
[2025-03-12 14:42:29] [Elapsed: 70.00s] Cleaning 'Sleep duration' column...
[2025-03-12 14:42:29] [Elapsed: 70.00s] Calculating RTT scores...


Calculating RTT: 100%|██████████████| 100107/100107 [00:00<00:00, 123581.88it/s]
  df['weight'] = df.groupby(['User ID', 'Name'])['Weight'].fillna(method='bfill').fillna(method='ffill')
  df['weight'] = df.groupby(['User ID', 'Name'])['Weight'].fillna(method='bfill').fillna(method='ffill')
  return group.fillna(method='ffill').fillna(method='bfill')


[2025-03-12 14:42:30] [Elapsed: 70.81s] Filling missing weight values...
[2025-03-12 14:42:30] [Elapsed: 70.83s] Creating 'injured' column...
[2025-03-12 14:42:30] [Elapsed: 70.83s] Mapping 'Injury severity' to numeric values...
[2025-03-12 14:42:30] [Elapsed: 70.83s] Performing one-hot encoding...
[2025-03-12 14:42:30] [Elapsed: 70.91s] check weekly_training_load and weekly_training_duration value(if none)...
[2025-03-12 14:42:32] [Elapsed: 72.40s] Saving cleaned data...
[2025-03-12 14:43:11] [Elapsed: 112.02s] Cleaned test data saved successfully!


In [20]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from datetime import datetime  
import time  

# -----------------------------------------------------------------------------------------------------------------------------------------
## Step 2-3: Data Cleaning and Transformation(real-use data)

# record the start time of executing this code block
start_time = time.time()

# define a function to print time information
def log_with_timestamp(message):
    current_time = time.time() - start_time  # calculating executing time so far
    print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] [Elapsed: {current_time:.2f}s] {message}")

log_with_timestamp("Loading data...")
# load the already merged file
df = pd.read_excel("merged_realuse_data.xlsx")

log_with_timestamp("Converting 'Date' column to datetime format...")
# convert 'Date' column to datetime format
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y', errors='coerce')

log_with_timestamp("Extracting year and week number...")
# extract year for filtering
df['Year'] = df['Date'].dt.year
# extract week number of the year
df['Week'] = df['Date'].dt.isocalendar().week  # ISO week number (1-53)

log_with_timestamp("Transforming 'Gender' column...")
# convert 'Gender' column: 'm' → 2, 'f' → 1
df['Gender_m2f1'] = df['Gender'].map({'m': 2, 'f': 1})
# drop the original 'Gender' column
df.drop(columns=['Gender'], inplace=True)

log_with_timestamp("Transforming 'Menstruation' column...")
# convert 'Menstruation' column: 'Y' → 1, blank → 0
df['Menstruation_y1n0'] = df['Menstruation'].map({'Y': 1, '': 0}).fillna(0).astype(int)
# drop the original 'Menstruation' column
df.drop(columns=['Menstruation'], inplace=True)

log_with_timestamp("Identifying training-related columns...")
# identify all columns related to training load and training duration
training_load_columns = df.filter(like="Training Load").columns  # find all "Training Load" columns
training_duration_columns = df.filter(like="Training Duration").columns  # find all "Training Duration" columns
training_type_columns = df.filter(like="Training Type").columns  # find all "Training Type" columns
training_RPE_columns = df.filter(like="Training RPE").columns  # find all "Training RPE" columns
training_HowDidIDo_columns = df.filter(like="Training HowDidIDo").columns  # find all "Training HowDidIDo" columns

log_with_timestamp("Cleaning 'Training Load' columns...")
# ensure all values in 'Training Load' columns are greater than 0
for column in training_load_columns:
    df[column] = np.where(df[column] > 0, df[column], 0)  # set to 0 if <= 0

log_with_timestamp("Cleaning 'Training RPE' columns...")
# ensure that all 'Training RPE' columns are valid (within the range of 1-10)
for column in training_RPE_columns:
    # apply the transformation to each column
    df[column] = np.where((df[column] >= 1) & (df[column] <= 10), df[column], 0)

log_with_timestamp("Cleaning 'Training HowDidIDo' columns...")
# ensure that all 'Training HowDidIDo' columns are valid (within the range of 1-5)
for column in training_HowDidIDo_columns:
    # apply the transformation to each column
    df[column] = np.where((df[column] >= 1) & (df[column] <= 5), df[column], 0)

log_with_timestamp("Calculating total training load and duration...")
# sum training load and duration, default to 0 if no columns exist
df["total_training_load"] = df[training_load_columns].sum(axis=1, skipna=True).fillna(0).astype(int)
df["total_training_duration"] = df[training_duration_columns].sum(axis=1, skipna=True).fillna(0).astype(int)

# ensure columns exist, then replace NaN with 0
df["total_training_load"] = df["total_training_load"].fillna(0).astype(int)
df["total_training_duration"] = df["total_training_duration"].fillna(0).astype(int)

log_with_timestamp("Grouping by Year, Week, and User ID...")
# group by Year and Week and user id and sum the daily values for each metric
weekly_training_load = df.groupby(['Year', 'Week', 'User ID'])['total_training_load'].sum().reset_index(name='weekly_training_load')
weekly_training_duration = df.groupby(['Year', 'Week', 'User ID'])['total_training_duration'].sum().reset_index(name='weekly_training_duration')

log_with_timestamp("Merging weekly totals per athlete...")
# merge the weekly totals back into the original dataframe
df = df.merge(weekly_training_load, on=['Year', 'Week', 'User ID'], how='left')
df = df.merge(weekly_training_duration, on=['Year', 'Week', 'User ID'], how='left')

# remove the original training load and training duration columns
df.drop(columns=list(training_load_columns) + list(training_duration_columns) + list(training_type_columns) + 
        list(training_RPE_columns) + list(training_HowDidIDo_columns), inplace=True)

# initialize columns for Acute and Chronic EWMA
acute_weight = 2 / (7 + 1)
chronic_weight = 2 / (28 + 1)

# initialize the ACWR column (this will store the final result)
df['ACWR'] = None  # initially set as None

# define a helper function to calculate ACWR for each group
def calculate_acwr(group):
    # sort by date
    group = group.sort_values(by='Date')
    
    # find the first day with training load > 0
    nonzero_load = group[group['total_training_load'] > 0]
    if nonzero_load.empty:
        group['ACWR'] = 0 # no valid training data in the whole year, set ACWR = 0
        return group  # no valid training data, return group as is
    
    # ensure year column in proper format
    if 'Year' not in group.columns:
        group['Year'] = group['Date'].dt.year 
    group['Year'] = group['Year'].astype('Int64')
    
    first_day = nonzero_load.iloc[0]['Date']
    end_date = pd.Timestamp(year=group['Year'].iloc[0], month=12, day=31)

    # ensure first_day and end_date are valid
    if pd.isna(first_day) or pd.isna(end_date):
        group['ACWR'] = 0
        return group
        
    # generate a complete date range for the year
    full_date_range = pd.date_range(start=first_day, end=end_date, freq='D')

    # reindex the group to the full date range, filling missing dates
    group = group.set_index('Date').reindex(full_date_range).reset_index().copy()
    group = group.rename(columns={'index': 'Date'})

    # fill missing values for 'total_training_load' with 0
    group['total_training_load'] = group['total_training_load'].fillna(0)

    # forward fill only 'User ID', 'Year', 'Week', 'Gender_m2f1'
    group[['User ID', 'Year', 'Week', 'Gender_m2f1']] = group[['User ID', 'Year', 'Week', 'Gender_m2f1']].ffill()
    
    # set the specified columns to 0
    columns_to_zero = ['total_training_load', 'total_training_duration', 'Menstruation_y1n0', 
                   'Mood state', 'Sleep quality', 'Energy levels', 'Muscle readiness', 
                   'Diet Yesterday', 'Academic Pressure']
    group[columns_to_zero] = group[columns_to_zero].fillna(0).astype(int)

    # initialize acute and chronic workload
    acute = chronic = 0

    # loop through each row and calculate ACWR
    for i in tqdm(range(len(group)), desc="Calculating ACWR", mininterval=1):
        current_load = group.iloc[i]['total_training_load']

        if i == 0:
            acute = chronic = current_load  # initialize with first value in the date range
        else:
            # apply Exponentially Weighted Moving Average (EWMA)
            acute = acute_weight * current_load + (1 - acute_weight) * acute
            chronic = chronic_weight * current_load + (1 - chronic_weight) * chronic

        # compute ACWR
        group.loc[i, 'ACWR'] = acute / chronic if chronic != 0 else 0
    
    # fill missing weight values
    group['Weight'] = group['Weight'].astype(float)
    group['Weight'] = group.groupby(['User ID', 'Name'])['Weight'].fillna(method='bfill').fillna(method='ffill')

    return group

log_with_timestamp("Calculating ACWR for each user and year...")
# apply ACWR calculation function for each user per year
df = df.groupby(['User ID', 'Year'], group_keys=False).apply(calculate_acwr).reset_index(drop=True)

# define the value mappings
value_mapping = {
    1: 0,
    2: 1,
    3: 10,
    4: 20,
    5: 25
}

# define the weightings for each item
weightings = {
    'Mood state': 0.15,
    'Sleep quality': 0.20,
    'Energy levels': 0.30,
    'Muscle readiness': 0.15,
    'Diet Yesterday': 0.10,
    'Academic Pressure': 0.10
}

# check and clean Likert-scale questions (1-5 depending on the column)
likert_columns = ['Mood state', 'Sleep quality', 'Energy levels', 'Muscle readiness', 
                  'Diet Yesterday', 'Academic Pressure']

log_with_timestamp("Cleaning Likert-scale columns...")
# clean Likert-style columns: ensure values are within the valid range
for column in likert_columns:
    df[column] = np.where(df[column].isin([1, 2, 3, 4, 5]), df[column], np.nan)

log_with_timestamp("Cleaning 'Sleep duration' column...")
# assume values > 24 hours are outliers and should be removed or fixed
df['Sleep duration'] = np.where(df['Sleep duration'] > 24, df['Sleep duration'] / 60, df['Sleep duration']) # convert minutes to hours if over 24
# set values greater than 24 to NaN
df['Sleep duration'] = np.where(df['Sleep duration'] > 24, np.nan, df['Sleep duration'])

# function to calculate the weighted RTT score for each row
def calculate_rtt(row):
    total_score = 0
    total_weight = 0

    # for each item and its corresponding weighting
    for column, weight in weightings.items():
        if column in row:
            score = row[column]
            value = value_mapping.get(score, 0) / 25  # use the value mapping, default to 0 if not found
            total_score += value * weight  # add the weighted score
            total_weight += weight  # track the total weight
    
    return total_score

log_with_timestamp("Calculating RTT scores...")
tqdm.pandas(desc="Calculating RTT") 
# apply the function to each row to create the 'RTT' column
df['RTT'] = df.progress_apply(calculate_rtt, axis=1)

log_with_timestamp("Filling missing weight values...")
df['Weight'] = df['Weight'].astype(float)
# fill missing weights by carrying forward the last valid weight for the same user
df['weight'] = df.groupby(['User ID', 'Name'])['Weight'].fillna(method='bfill').fillna(method='ffill')
# (method='ffill'): fills the missing weight values by carrying forward the last known weight within each user group. 
# if a new weight is found for a user, it will be used for the subsequent missing values until another weight is encountered.

log_with_timestamp("Creating 'injured' column...")
# create 'injured' column: 1 if 'Injury location' is not empty, 0 otherwise
df['injured'] = df['Injury location'].notna().astype(int)

# create a mapping dictionary
severity_mapping = {
    'mild': 1,
    'moderate': 2,
    'severe': 3
}

log_with_timestamp("Mapping 'Injury severity' to numeric values...")
# map the 'Injury severity' column to numeric values
df['Injury_severity_numeric_mi1mo2se3'] = df['Injury severity'].map(severity_mapping)

log_with_timestamp("Performing one-hot encoding...")
# perform one-hot encoding for categorical columns using get_dummies
df_encoded = pd.get_dummies(df, columns=['Injury type', 'Injury surface', 'Surface condition', 'Injury tissue type'])

# final check for weekly_training_load and weekly_training_duration
log_with_timestamp("check weekly_training_load and weekly_training_duration value(if none)...")

def fill_weekly_values(group):
    if group.notna().any():
        return group.fillna(method='ffill').fillna(method='bfill')
    else:
        return group.fillna(0)

df['weekly_training_load'] = df.groupby(['Year', 'Week', 'User ID'])['weekly_training_load'].transform(fill_weekly_values)
df['weekly_training_duration'] = df.groupby(['Year', 'Week', 'User ID'])['weekly_training_duration'].transform(fill_weekly_values)

log_with_timestamp("Saving cleaned data...")
# save the final cleaned DataFrame
df.to_excel("cleaned_realuse_data.xlsx", index=True)
log_with_timestamp("Cleaned realuse data saved successfully!")


[2025-03-12 14:47:26] [Elapsed: 0.00s] Loading data...


  df["total_training_load"] = df[training_load_columns].sum(axis=1, skipna=True).fillna(0).astype(int)
  df["total_training_duration"] = df[training_duration_columns].sum(axis=1, skipna=True).fillna(0).astype(int)


[2025-03-12 14:47:56] [Elapsed: 30.07s] Converting 'Date' column to datetime format...
[2025-03-12 14:47:56] [Elapsed: 30.10s] Extracting year and week number...
[2025-03-12 14:47:56] [Elapsed: 30.10s] Transforming 'Gender' column...
[2025-03-12 14:47:56] [Elapsed: 30.11s] Transforming 'Menstruation' column...
[2025-03-12 14:47:56] [Elapsed: 30.13s] Identifying training-related columns...
[2025-03-12 14:47:56] [Elapsed: 30.13s] Cleaning 'Training Load' columns...
[2025-03-12 14:47:56] [Elapsed: 30.14s] Cleaning 'Training RPE' columns...
[2025-03-12 14:47:56] [Elapsed: 30.15s] Cleaning 'Training HowDidIDo' columns...
[2025-03-12 14:47:56] [Elapsed: 30.15s] Calculating total training load and duration...
[2025-03-12 14:47:56] [Elapsed: 30.17s] Grouping by Year, Week, and User ID...
[2025-03-12 14:47:56] [Elapsed: 30.17s] Merging weekly totals per athlete...
[2025-03-12 14:47:56] [Elapsed: 30.23s] Calculating ACWR for each user and year...


Calculating ACWR: 100%|████████████████████| 299/299 [00:00<00:00, 10130.43it/s]
  group['Weight'] = group.groupby(['User ID', 'Name'])['Weight'].fillna(method='bfill').fillna(method='ffill')
  group['Weight'] = group.groupby(['User ID', 'Name'])['Weight'].fillna(method='bfill').fillna(method='ffill')
Calculating ACWR: 100%|████████████████████| 292/292 [00:00<00:00, 10704.99it/s]
  group['Weight'] = group.groupby(['User ID', 'Name'])['Weight'].fillna(method='bfill').fillna(method='ffill')
  group['Weight'] = group.groupby(['User ID', 'Name'])['Weight'].fillna(method='bfill').fillna(method='ffill')
Calculating ACWR: 100%|████████████████████| 307/307 [00:00<00:00, 10561.01it/s]
  group['Weight'] = group.groupby(['User ID', 'Name'])['Weight'].fillna(method='bfill').fillna(method='ffill')
  group['Weight'] = group.groupby(['User ID', 'Name'])['Weight'].fillna(method='bfill').fillna(method='ffill')
Calculating ACWR: 100%|████████████████████| 300/300 [00:00<00:00, 10602.21it/s]
  group['W

[2025-03-12 14:48:09] [Elapsed: 43.05s] Cleaning Likert-scale columns...
[2025-03-12 14:48:09] [Elapsed: 43.07s] Cleaning 'Sleep duration' column...
[2025-03-12 14:48:09] [Elapsed: 43.07s] Calculating RTT scores...


Calculating RTT: 100%|██████████████| 104340/104340 [00:00<00:00, 116049.10it/s]
  df['weight'] = df.groupby(['User ID', 'Name'])['Weight'].fillna(method='bfill').fillna(method='ffill')
  df['weight'] = df.groupby(['User ID', 'Name'])['Weight'].fillna(method='bfill').fillna(method='ffill')
  return group.fillna(method='ffill').fillna(method='bfill')


[2025-03-12 14:48:10] [Elapsed: 43.97s] Filling missing weight values...
[2025-03-12 14:48:10] [Elapsed: 43.98s] Creating 'injured' column...
[2025-03-12 14:48:10] [Elapsed: 43.98s] Mapping 'Injury severity' to numeric values...
[2025-03-12 14:48:10] [Elapsed: 43.99s] Performing one-hot encoding...
[2025-03-12 14:48:10] [Elapsed: 44.05s] check weekly_training_load and weekly_training_duration value(if none)...
[2025-03-12 14:48:11] [Elapsed: 45.02s] Saving cleaned data...
[2025-03-12 14:48:52] [Elapsed: 86.20s] Cleaned realuse data saved successfully!


In [5]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from datetime import datetime  
import time  

# -----------------------------------------------------------------------------------------------------------------------------------------
## Step 2-4: Data Cleaning and Transformation(illness and injury part for training data)

# load the already cleaned file
df = pd.read_excel("cleaned_training_data.xlsx")

# set a column illed
df['illed'] = df['Type of illness'].notna().astype(int)

# create the 'injury_information' column by concatenating relevant injury-related columns
df['injury_information'] = df[['Injury location', 'Injury type', 'Injury surface', 
                               'Surface condition', 'Injury tissue type', 'Injury severity']].apply(
    lambda x: ', '.join(x.dropna().astype(str)), axis=1)

# create the 'illness_information' column by concatenating relevant illness-related columns
df['illness_information'] = df[['Type of illness', 'Illness severity']].apply(
    lambda x: ', '.join(x.dropna().astype(str)), axis=1)

# replace empty strings with NaN to keep the dataset clean
df['injury_information'].replace('', np.nan, inplace=True)
df['illness_information'].replace('', np.nan, inplace=True)

# convert 'Date' column to datetime format
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y', errors='coerce')
df['Recovery date (expected)'] = pd.to_datetime(df['Recovery date (expected)'], errors='coerce')
df['Injury/Illness duration (days)'] = df.apply(
    lambda row: (row['Recovery date (expected)'] - row['Date']).days + 1 if pd.notnull(row['Date']) and pd.notnull(row['Recovery date (expected)']) else 0,
    axis=1
)
# replace negative or NaN values with 0 (optional)
df['Injury/Illness duration (days)'] = df['Injury/Illness duration (days)'].apply(lambda x: max(0, x) if pd.notnull(x) else 0)

# rearrange the columns sequence in the table
new_column_order = (
    ['Year'] +  # move 'Year' to the first column
    ['Date', 'Week'] +  # move 'Week' right after 'Date'
    ['User ID','Name','Sport', 'Height','Weight', 'Current Age'] +
    ['Gender_m2f1', 'Menstruation_y1n0','Mood state','Energy levels','Muscle readiness',
     'Muscle Readiness - Body Locations','Muscle Readiness Comments','Academic Pressure','Diet Yesterday',
     'Sleep quality','Sleep duration','Health','Health - Ailments','Ailment Comments','Resting HR','Comments',
     'Rest day','Holiday','Tournament','total_training_load', 'total_training_duration', 
     'weekly_training_load', 'weekly_training_duration', 'ACWR', 'RTT'] +
    ['injured', 'injury_information','Injury location',
     'Injury type', 'Injury surface', 'Surface condition', 'Injury tissue type','Injury severity'] +   
    ['illed','illness_information','Type of illness', 'Illness severity', 
     'Injury/Illness duration (days)', 'Reported to', 'Recovery date (expected)', 
     'Start date of rehab', 'Response to Injury', 'How did injury occur', 'Injury Mechanism', 
     'Doctor visit required', 'Stage in activity', 'Hospital visit required', 'Mode of onset', 
     'Medication required', 'Illness Classification Code', 'Additional information', 'Injury Grade', 
     'Diagnosis', 'Diagnosed by', 'Training status', 'Treatment recommendations', 
     'Return to training guideline/timeline', 'Rehab programs', 'Files description']
)

# reorder the DataFrame
df = df[new_column_order]

# save the cleaned and transformed DataFrame to a new Excel 
print("saving...")
df.to_excel("cleaned_training_data_v1.xlsx", index=False)
print("saved!")



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['injury_information'].replace('', np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['illness_information'].replace('', np.nan, inplace=True)


saving...
saved!


In [6]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from datetime import datetime  
import time  

# -----------------------------------------------------------------------------------------------------------------------------------------
## Step 2-5: Data Cleaning and Transformation(illness and injury part for test data)

# load the already cleaned file
df = pd.read_excel("cleaned_test_data.xlsx")

# set a column illed
df['illed'] = df['Type of illness'].notna().astype(int)

# create the 'injury_information' column by concatenating relevant injury-related columns
df['injury_information'] = df[['Injury location', 'Injury type', 'Injury surface', 
                               'Surface condition', 'Injury tissue type', 'Injury severity']].apply(
    lambda x: ', '.join(x.dropna().astype(str)), axis=1)

# create the 'illness_information' column by concatenating relevant illness-related columns
df['illness_information'] = df[['Type of illness', 'Illness severity']].apply(
    lambda x: ', '.join(x.dropna().astype(str)), axis=1)

# replace empty strings with NaN to keep the dataset clean
df['injury_information'].replace('', np.nan, inplace=True)
df['illness_information'].replace('', np.nan, inplace=True)

# convert 'Date' column to datetime format
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y', errors='coerce')
df['Recovery date (expected)'] = pd.to_datetime(df['Recovery date (expected)'], errors='coerce')
df['Injury duration (days)'] = df.apply(
    lambda row: (row['Recovery date (expected)'] - row['Date']).days + 1 if pd.notnull(row['Date']) and pd.notnull(row['Recovery date (expected)']) else 0,
    axis=1
)
# replace negative or NaN values with 0 (optional)
df['Injury duration (days)'] = df['Injury duration (days)'].apply(lambda x: max(0, x) if pd.notnull(x) else 0)

# rearrange the columns sequence in the table
new_column_order = (
    ['Year'] +  # move 'Year' to the first column
    ['Date', 'Week'] +  # move 'Week' right after 'Date'
    ['User ID','Name','Sport', 'Height','Weight', 'Current Age'] +
    ['Gender_m2f1', 'Menstruation_y1n0','Mood state','Energy levels','Muscle readiness',
     'Muscle Readiness - Body Locations','Muscle Readiness Comments','Academic Pressure','Diet Yesterday',
     'Sleep quality','Sleep duration','Health','Health - Ailments','Ailment Comments','Resting HR','Comments',
     'Rest day','Holiday','Tournament','total_training_load', 'total_training_duration', 
     'weekly_training_load', 'weekly_training_duration', 'ACWR', 'RTT', 'injured', 'injury_information','illed','illness_information',
     'Injury_severity_numeric_mi1mo2se3', 'Injury duration (days)'] +   
    ['Injury location',  
     'Type of illness', 'Injury type', 'Injury surface', 'Surface condition', 'Injury tissue type', 
     'Illness severity', 'Injury severity', 'Reported to', 'Recovery date (expected)', 
     'Start date of rehab', 'Response to Injury', 'How did injury occur', 'Injury Mechanism', 
     'Doctor visit required', 'Stage in activity', 'Hospital visit required', 'Mode of onset', 
     'Medication required', 'Illness Classification Code', 'Additional information', 'Injury Grade', 
     'Diagnosis', 'Diagnosed by', 'Training status', 'Treatment recommendations', 
     'Return to training guideline/timeline', 'Rehab programs', 'Files description']
)

# reorder the DataFrame
df = df[new_column_order]

# save the cleaned and transformed DataFrame to a new Excel 
print("saving...")
df.to_excel("cleaned_test_data_v1.xlsx", index=False)
print("saved!")



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['injury_information'].replace('', np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['illness_information'].replace('', np.nan, inplace=True)


saving...
saved!


In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from datetime import datetime  
import time  

# -----------------------------------------------------------------------------------------------------------------------------------------
## Step 2-6: Data Cleaning and Transformation(illness and injury part for realuse data)

# load the already cleaned file
df = pd.read_excel("cleaned_realuse_data.xlsx")

# set a column illed
df['illed'] = df['Type of illness'].notna().astype(int)

# create the 'injury_information' column by concatenating relevant injury-related columns
df['injury_information'] = df[['Injury location', 'Injury type', 'Injury surface', 
                               'Surface condition', 'Injury tissue type', 'Injury severity']].apply(
    lambda x: ', '.join(x.dropna().astype(str)), axis=1)

# create the 'illness_information' column by concatenating relevant illness-related columns
df['illness_information'] = df[['Type of illness', 'Illness severity']].apply(
    lambda x: ', '.join(x.dropna().astype(str)), axis=1)

# replace empty strings with NaN to keep the dataset clean
df['injury_information'].replace('', np.nan, inplace=True)
df['illness_information'].replace('', np.nan, inplace=True)

# convert 'Date' column to datetime format
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y', errors='coerce')
df['Recovery date (expected)'] = pd.to_datetime(df['Recovery date (expected)'], errors='coerce')
df['Injury duration (days)'] = df.apply(
    lambda row: (row['Recovery date (expected)'] - row['Date']).days + 1 if pd.notnull(row['Date']) and pd.notnull(row['Recovery date (expected)']) else 0,
    axis=1
)
# replace negative or NaN values with 0 (optional)
df['Injury duration (days)'] = df['Injury duration (days)'].apply(lambda x: max(0, x) if pd.notnull(x) else 0)

# rearrange the columns sequence in the table
new_column_order = (
    ['Year'] +  # move 'Year' to the first column
    ['Date', 'Week'] +  # move 'Week' right after 'Date'
    ['User ID','Name','Sport', 'Height','Weight', 'Current Age'] +
    ['Gender_m2f1', 'Menstruation_y1n0','Mood state','Energy levels','Muscle readiness',
     'Muscle Readiness - Body Locations','Muscle Readiness Comments','Academic Pressure','Diet Yesterday',
     'Sleep quality','Sleep duration','Health','Health - Ailments','Ailment Comments','Resting HR','Comments',
     'Rest day','Holiday','Tournament','total_training_load', 'total_training_duration', 
     'weekly_training_load', 'weekly_training_duration', 'ACWR', 'RTT', 'injured', 'injury_information','illed','illness_information',
     'Injury_severity_numeric_mi1mo2se3', 'Injury duration (days)'] +   
    ['Injury location',  
     'Type of illness', 'Injury type', 'Injury surface', 'Surface condition', 'Injury tissue type', 
     'Illness severity', 'Injury severity', 'Reported to', 'Recovery date (expected)', 
     'Start date of rehab', 'Response to Injury', 'How did injury occur', 'Injury Mechanism', 
     'Doctor visit required', 'Stage in activity', 'Hospital visit required', 'Mode of onset', 
     'Medication required', 'Illness Classification Code', 'Additional information', 'Injury Grade', 
     'Diagnosis', 'Diagnosed by', 'Training status', 'Treatment recommendations', 
     'Return to training guideline/timeline', 'Rehab programs', 'Files description']
)

# reorder the DataFrame
df = df[new_column_order]

# save the cleaned and transformed DataFrame to a new Excel 
print("saving...")
df.to_excel("cleaned_realuse_data_v1.xlsx", index=False)
print("saved!")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['injury_information'].replace('', np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['illness_information'].replace('', np.nan, inplace=True)


saving...
saved!


In [3]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from datetime import datetime  
import time  

# -----------------------------------------------------------------------------------------------------------------------------------------
## Step 2-7: Data Cleaning and Transformation(column sequence chaging and sleep score calculating for training data)

# load the already cleaned file
df = pd.read_excel("cleaned_training_data_v1.xlsx")

# convert 'Date' column to datetime format
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y', errors='coerce')
df['Recovery date (expected)'] = pd.to_datetime(df['Recovery date (expected)'], errors='coerce')
df['Injury/Illness duration (days)'] = df.apply(
    lambda row: (row['Recovery date (expected)'] - row['Date']).days + 1 if pd.notnull(row['Date']) and pd.notnull(row['Recovery date (expected)']) else 0,
    axis=1
)
# rename the column of injury/illness duration days
df.rename(columns={'Injury duration (days)': 'injury/illness_duration_days'}, inplace=True)

# create sleep score column, if sleep duration or sleep quality value is missing, set it as 0
df['Sleep duration'] = pd.to_numeric(df['Sleep duration'], errors='coerce')
df['Sleep quality'] = pd.to_numeric(df['Sleep quality'], errors='coerce')
df['sleep_score'] = df['Sleep duration'].fillna(0) * df['Sleep quality'].fillna(0)

# rearrange the columns sequence in the table
new_column_order = (
    ['Year'] +  # move 'Year' to the first column
    ['Date', 'Week'] +  # move 'Week' right after 'Date'
    ['User ID','Name','Sport', 'Height','Weight', 'Current Age'] +
    ['Gender_m2f1', 'Menstruation_y1n0','Mood state','Energy levels','Muscle readiness',
     'Muscle Readiness - Body Locations','Muscle Readiness Comments','Academic Pressure','Diet Yesterday',
     'Sleep quality','Sleep duration','sleep_score','Health','Health - Ailments','Ailment Comments','Resting HR','Comments',
     'Rest day','Holiday','Tournament','total_training_load', 'total_training_duration', 
     'weekly_training_load', 'weekly_training_duration', 'ACWR', 'RTT'] +
    ['illed','illness_information','Type of illness', 'Illness severity'] +
    ['injured', 'injury_information','Injury location',
     'Injury type', 'Injury surface', 'Surface condition', 'Injury tissue type','Injury severity'] +   
    ['injury/illness_duration_days', 'Reported to', 'Recovery date (expected)', 
     'Start date of rehab', 'Response to Injury', 'How did injury occur', 'Injury Mechanism', 
     'Doctor visit required', 'Stage in activity', 'Hospital visit required', 'Mode of onset', 
     'Medication required', 'Illness Classification Code', 'Additional information', 'Injury Grade', 
     'Diagnosis', 'Diagnosed by', 'Training status', 'Treatment recommendations', 
     'Return to training guideline/timeline', 'Rehab programs', 'Files description']
)

# reorder the DataFrame
df = df[new_column_order]

# save the cleaned and transformed DataFrame to a new Excel 
print("saving...")
df.to_excel("cleaned_training_data_v2.xlsx", index=False)
print("saved!")

saving...
saved!


In [4]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from datetime import datetime  
import time  

# -----------------------------------------------------------------------------------------------------------------------------------------
## Step 2-8: Data Cleaning and Transformation(column sequence chaging and sleep score calculating for test data)

# load the already cleaned file
df = pd.read_excel("cleaned_test_data_v1.xlsx")

# convert 'Date' column to datetime format
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y', errors='coerce')
df['Recovery date (expected)'] = pd.to_datetime(df['Recovery date (expected)'], errors='coerce')
df['Injury/Illness duration (days)'] = df.apply(
    lambda row: (row['Recovery date (expected)'] - row['Date']).days + 1 if pd.notnull(row['Date']) and pd.notnull(row['Recovery date (expected)']) else 0,
    axis=1
)
# rename the column of injury/illness duration days
df.rename(columns={'Injury duration (days)': 'injury/illness_duration_days'}, inplace=True)

# create sleep score column, if sleep duration or sleep quality value is missing, set it as 0
df['Sleep duration'] = pd.to_numeric(df['Sleep duration'], errors='coerce')
df['Sleep quality'] = pd.to_numeric(df['Sleep quality'], errors='coerce')
df['sleep_score'] = df['Sleep duration'].fillna(0) * df['Sleep quality'].fillna(0)

# rearrange the columns sequence in the table
new_column_order = (
    ['Year'] +  # move 'Year' to the first column
    ['Date', 'Week'] +  # move 'Week' right after 'Date'
    ['User ID','Name','Sport', 'Height','Weight', 'Current Age'] +
    ['Gender_m2f1', 'Menstruation_y1n0','Mood state','Energy levels','Muscle readiness',
     'Muscle Readiness - Body Locations','Muscle Readiness Comments','Academic Pressure','Diet Yesterday',
     'Sleep quality','Sleep duration','sleep_score','Health','Health - Ailments','Ailment Comments','Resting HR','Comments',
     'Rest day','Holiday','Tournament','total_training_load', 'total_training_duration', 
     'weekly_training_load', 'weekly_training_duration', 'ACWR', 'RTT'] +
    ['illed','illness_information','Type of illness', 'Illness severity'] +
    ['injured', 'injury_information','Injury location',
     'Injury type', 'Injury surface', 'Surface condition', 'Injury tissue type','Injury severity'] +   
    ['injury/illness_duration_days', 'Reported to', 'Recovery date (expected)', 
     'Start date of rehab', 'Response to Injury', 'How did injury occur', 'Injury Mechanism', 
     'Doctor visit required', 'Stage in activity', 'Hospital visit required', 'Mode of onset', 
     'Medication required', 'Illness Classification Code', 'Additional information', 'Injury Grade', 
     'Diagnosis', 'Diagnosed by', 'Training status', 'Treatment recommendations', 
     'Return to training guideline/timeline', 'Rehab programs', 'Files description']
)

# reorder the DataFrame
df = df[new_column_order]

# save the cleaned and transformed DataFrame to a new Excel 
print("saving...")
df.to_excel("cleaned_test_data_v2.xlsx", index=False)
print("saved!")

saving...
saved!


In [5]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from datetime import datetime  
import time  

# -----------------------------------------------------------------------------------------------------------------------------------------
## Step 2-9: Data Cleaning and Transformation(column sequence chaging and sleep score calculating for realuse data)

# load the already cleaned file
df = pd.read_excel("cleaned_realuse_data_v1.xlsx")

# convert 'Date' column to datetime format
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y', errors='coerce')
df['Recovery date (expected)'] = pd.to_datetime(df['Recovery date (expected)'], errors='coerce')
df['Injury/Illness duration (days)'] = df.apply(
    lambda row: (row['Recovery date (expected)'] - row['Date']).days + 1 if pd.notnull(row['Date']) and pd.notnull(row['Recovery date (expected)']) else 0,
    axis=1
)
# rename the column of injury/illness duration days
df.rename(columns={'Injury duration (days)': 'injury/illness_duration_days'}, inplace=True)

# create sleep score column, if sleep duration or sleep quality value is missing, set it as 0
df['Sleep duration'] = pd.to_numeric(df['Sleep duration'], errors='coerce')
df['Sleep quality'] = pd.to_numeric(df['Sleep quality'], errors='coerce')
df['sleep_score'] = df['Sleep duration'].fillna(0) * df['Sleep quality'].fillna(0)

# rearrange the columns sequence in the table
new_column_order = (
    ['Year'] +  # move 'Year' to the first column
    ['Date', 'Week'] +  # move 'Week' right after 'Date'
    ['User ID','Name','Sport', 'Height','Weight', 'Current Age'] +
    ['Gender_m2f1', 'Menstruation_y1n0','Mood state','Energy levels','Muscle readiness',
     'Muscle Readiness - Body Locations','Muscle Readiness Comments','Academic Pressure','Diet Yesterday',
     'Sleep quality','Sleep duration','sleep_score','Health','Health - Ailments','Ailment Comments','Resting HR','Comments',
     'Rest day','Holiday','Tournament','total_training_load', 'total_training_duration', 
     'weekly_training_load', 'weekly_training_duration', 'ACWR', 'RTT'] +
    ['illed','illness_information','Type of illness', 'Illness severity'] +
    ['injured', 'injury_information','Injury location',
     'Injury type', 'Injury surface', 'Surface condition', 'Injury tissue type','Injury severity'] +   
    ['injury/illness_duration_days', 'Reported to', 'Recovery date (expected)', 
     'Start date of rehab', 'Response to Injury', 'How did injury occur', 'Injury Mechanism', 
     'Doctor visit required', 'Stage in activity', 'Hospital visit required', 'Mode of onset', 
     'Medication required', 'Illness Classification Code', 'Additional information', 'Injury Grade', 
     'Diagnosis', 'Diagnosed by', 'Training status', 'Treatment recommendations', 
     'Return to training guideline/timeline', 'Rehab programs', 'Files description']
)

# reorder the DataFrame
df = df[new_column_order]

# save the cleaned and transformed DataFrame to a new Excel 
print("saving...")
df.to_excel("cleaned_realuse_data_v2.xlsx", index=False)
print("saved!")

saving...
saved!


In [4]:
import pandas as pd
import numpy as np
from datetime import datetime  
import time  

# -----------------------------------------------------------------------------------------------------------------------------------------
## Step 2-10: Data Cleaning and Transformation(birthday and age updating for training data)

print("loading v2 data...")
# load the already cleaned file
df = pd.read_excel("cleaned_training_data_v2.xlsx")

print("loading birthday data...")
# load the birthday data
birthday_df = pd.read_excel("birthday_data.xlsx")

print("merging...")
# ensure 'User ID' is the common key and merge the data
merged_df = pd.merge(df, birthday_df[['User ID', 'Date of Birth']], on='User ID', how='left')

print("converting date of birth...")
# convert 'Date of Birth' to datetime format
name_index = merged_df.columns.get_loc('Name')
sport_index = merged_df.columns.get_loc('Sport')
merged_df.insert(name_index + 1, 'Date of Birth', merged_df.pop('Date of Birth'))
merged_df['Date of Birth'] = pd.to_datetime(merged_df['Date of Birth'], errors='coerce')

print("calculating...")
# calculate age only for non-null birth years
merged_df['Current Age'] = merged_df.apply(
    lambda row: row['Year'] - row['Date of Birth'].year if pd.notnull(row['Date of Birth']) else None, axis=1
)

print("saving...")
# save the updated DataFrame to a new Excel file
merged_df.to_excel("cleaned_training_data_v3.xlsx", index=False)
print("Data merged and saved successfully!")

loading v2 data...
loading birthday data...
merging...
converting date of birth...
calculating...
saving...
Data merged and saved successfully!


In [5]:
import pandas as pd
import numpy as np
from datetime import datetime  
import time  

# -----------------------------------------------------------------------------------------------------------------------------------------
## Step 2-11: Data Cleaning and Transformation(birthday and age updating for test data)

print("loading v2 data...")
# load the already cleaned file
df = pd.read_excel("cleaned_test_data_v2.xlsx")

print("loading birthday data...")
# load the birthday data
birthday_df = pd.read_excel("birthday_data.xlsx")

print("merging...")
# ensure 'User ID' is the common key and merge the data
merged_df = pd.merge(df, birthday_df[['User ID', 'Date of Birth']], on='User ID', how='left')

print("converting date of birth...")
# convert 'Date of Birth' to datetime format
name_index = merged_df.columns.get_loc('Name')
sport_index = merged_df.columns.get_loc('Sport')
merged_df.insert(name_index + 1, 'Date of Birth', merged_df.pop('Date of Birth'))
merged_df['Date of Birth'] = pd.to_datetime(merged_df['Date of Birth'], errors='coerce')

print("calculating...")
# calculate age only for non-null birth years
merged_df['Current Age'] = merged_df.apply(
    lambda row: row['Year'] - row['Date of Birth'].year if pd.notnull(row['Date of Birth']) else None, axis=1
)

print("saving...")
# save the updated DataFrame to a new Excel file
merged_df.to_excel("cleaned_test_data_v3.xlsx", index=False)
print("Data merged and saved successfully!")

loading v2 data...
loading birthday data...
merging...
converting date of birth...
calculating...
saving...
Data merged and saved successfully!


In [6]:
import pandas as pd
import numpy as np
from datetime import datetime  
import time  

# -----------------------------------------------------------------------------------------------------------------------------------------
## Step 2-12: Data Cleaning and Transformation(birthday and age updating for realuse data)

print("loading v2 data...")
# load the already cleaned file
df = pd.read_excel("cleaned_realuse_data_v2.xlsx")

print("loading birthday data...")
# load the birthday data
birthday_df = pd.read_excel("birthday_data.xlsx")

print("merging...")
# ensure 'User ID' is the common key and merge the data
merged_df = pd.merge(df, birthday_df[['User ID', 'Date of Birth']], on='User ID', how='left')

print("converting date of birth...")
# convert 'Date of Birth' to datetime format
name_index = merged_df.columns.get_loc('Name')
sport_index = merged_df.columns.get_loc('Sport')
merged_df.insert(name_index + 1, 'Date of Birth', merged_df.pop('Date of Birth'))
merged_df['Date of Birth'] = pd.to_datetime(merged_df['Date of Birth'], errors='coerce')

print("calculating...")
# calculate age only for non-null birth years
merged_df['Current Age'] = merged_df.apply(
    lambda row: row['Year'] - row['Date of Birth'].year if pd.notnull(row['Date of Birth']) else None, axis=1
)

print("saving...")
# save the updated DataFrame to a new Excel file
merged_df.to_excel("cleaned_realuse_data_v3.xlsx", index=False)
print("Data merged and saved successfully!")

loading v2 data...
loading birthday data...
merging...
converting date of birth...
calculating...
saving...
Data merged and saved successfully!


In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
import xgboost as xgb
import matplotlib.pyplot as plt
from openpyxl.styles import PatternFill
from openpyxl.drawing.image import Image
from openpyxl import Workbook
import os

# -----------------------------------------------------------------------------------------------------------------------------------------
## Step 3-1: Model Initializing

# load training and testing data
train_df = pd.read_excel("cleaned_training_data_v3.xlsx")
test_df = pd.read_excel("cleaned_test_data_v3.xlsx")
real_df = pd.read_excel("cleaned_realuse_data_v3.xlsx")

# features for prediction
features = [
    'Gender_m2f1', 'Menstruation_y1n0', 'Mood state', 'Energy levels',
    'Muscle readiness', 'Academic Pressure', 'Diet Yesterday', 'Sleep quality',
    'Sleep duration', 'sleep_score', 'total_training_load', 'total_training_duration',
    'weekly_training_load', 'weekly_training_duration', 'ACWR', 'RTT'
]

# targets
targets = ['illed', 'injured']

# keep User ID and Name for reference
user_info_columns = ['User ID', 'Name','Date']

# drop missing values in features or targets
train_df = train_df.dropna(subset=features + targets)
test_df = test_df.dropna(subset=features + targets)

# define X (features) and Y (targets)
X_train = train_df[features]
y_train_ill = train_df['illed']
y_train_injured = train_df['injured']

X_test = test_df[features]
y_test_ill = test_df['illed']
y_test_injured = test_df['injured']

# standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# -----------------------------------------------------------------------------------------------------------------------------------------
## Step 3-2: Model Training

# train models (try Random Forest, Logistic Regression, and XGBoost)

print("Training Random Forest - Injured...")
# train models on "injured" target
rf_model_injured = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model_injured.fit(X_train_scaled, y_train_injured)

print("Training Logistic Regression - Injured...")
lr_model_injured = LogisticRegression(random_state=42)
lr_model_injured.fit(X_train_scaled, y_train_injured)

print("Training XGBoost - Injured...")
xgb_model_injured = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_model_injured.fit(X_train_scaled, y_train_injured)

print("Training Random Forest - Ill...")
# train models on "illed" target
rf_model_ill = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model_ill.fit(X_train_scaled, y_train_ill)

print("Training Logistic Regression - Ill...")
lr_model_ill = LogisticRegression(random_state=42)
lr_model_ill.fit(X_train_scaled, y_train_ill)

print("Training XGBoost - Ill...")
xgb_model_ill = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss',random_state=42)
xgb_model_ill.fit(X_train_scaled, y_train_ill)

# -----------------------------------------------------------------------------------------------------------------------------------------
## Step 3-3: Model Evaluating

# predictions
print("Making Predictions for Injured...")
y_pred_rf_injured = rf_model_injured.predict(X_test_scaled)
y_pred_lr_injured = lr_model_injured.predict(X_test_scaled)
y_pred_xgb_injured = xgb_model_injured.predict(X_test_scaled)

print("Making Predictions for Ill...")
y_pred_rf_ill = rf_model_ill.predict(X_test_scaled)
y_pred_lr_ill = lr_model_ill.predict(X_test_scaled)
y_pred_xgb_ill = xgb_model_ill.predict(X_test_scaled)

# calculate accuracy for each model
print("Calculating Accuracy for Models...")
accuracy_rf_injured = accuracy_score(y_test_injured, y_pred_rf_injured)
accuracy_lr_injured = accuracy_score(y_test_injured, y_pred_lr_injured)
accuracy_xgb_injured = accuracy_score(y_test_injured, y_pred_xgb_injured)

accuracy_rf_ill = accuracy_score(y_test_ill, y_pred_rf_ill)
accuracy_lr_ill = accuracy_score(y_test_ill, y_pred_lr_ill)
accuracy_xgb_ill = accuracy_score(y_test_ill, y_pred_xgb_ill)

# save test set evaluation results to Excel
test_evaluation_results = {
    'Model': ['Random Forest', 'Logistic Regression', 'XGBoost'],
    'Accuracy_Injured': [accuracy_rf_injured, accuracy_lr_injured, accuracy_xgb_injured],
    'Accuracy_Ill': [accuracy_rf_ill, accuracy_lr_ill, accuracy_xgb_ill]
}
test_evaluation_df = pd.DataFrame(test_evaluation_results)

# plot confusion matrix for test set and save as image
def plot_and_save_confusion_matrix(y_true, y_pred, title, filename):
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(6, 6))
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title(title)
    plt.colorbar()
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.savefig(filename)
    plt.close()

# create a directory to save confusion matrix images
os.makedirs('confusion_matrices', exist_ok=True)

# plot and save confusion matrices for all models
confusion_matrix_files = []

print("Plotting and Saving Confusion Matrices for Injury Predictions...")
plot_and_save_confusion_matrix(y_test_injured, y_pred_rf_injured, 'Confusion Matrix for Injury (Random Forest)', 'confusion_matrices/cm_rf_injury.png')
plot_and_save_confusion_matrix(y_test_injured, y_pred_lr_injured, 'Confusion Matrix for Injury (Logistic Regression)', 'confusion_matrices/cm_lr_injury.png')
plot_and_save_confusion_matrix(y_test_injured, y_pred_xgb_injured, 'Confusion Matrix for Injury (XGBoost)', 'confusion_matrices/cm_xgb_injury.png')

print("Plotting and Saving Confusion Matrices for Illness Predictions...")
plot_and_save_confusion_matrix(y_test_ill, y_pred_rf_ill, 'Confusion Matrix for Illness (Random Forest)', 'confusion_matrices/cm_rf_illness.png')
plot_and_save_confusion_matrix(y_test_ill, y_pred_lr_ill, 'Confusion Matrix for Illness (Logistic Regression)', 'confusion_matrices/cm_lr_illness.png')
plot_and_save_confusion_matrix(y_test_ill, y_pred_xgb_ill, 'Confusion Matrix for Illness (XGBoost)', 'confusion_matrices/cm_xgb_illness.png')

# -----------------------------------------------------------------------------------------------------------------------------------------
## Step 3-4: Implmenting model on real-use data

# drop rows with missing values for the specific features
real_df = real_df.dropna(subset=features)
X_real = real_df[features]
X_real_scaled = scaler.transform(X_real)

# predict using the best model (we can choose the model we prefer from Random Forest, Logistic Regression and XGBoost)
print("Making Predictions for Real-Use Data...")
real_df['Prediction_Injury_RandomForest'] = rf_model_injured.predict(X_real_scaled)
real_df['Prediction_Injury_LogisticRegression'] = lr_model_injured.predict(X_real_scaled)
real_df['Prediction_Injury_XGBoost'] = xgb_model_injured.predict(X_real_scaled)

real_df['Prediction_Illness_RandomForest'] = rf_model_ill.predict(X_real_scaled)
real_df['Prediction_Illness_LogisticRegression'] = lr_model_ill.predict(X_real_scaled)
real_df['Prediction_Illness_XGBoost'] = xgb_model_ill.predict(X_real_scaled)

# Calculate correctness for each row in real-use data
real_df['Correct_check_Injury_RandomForest'] = (real_df['Prediction_Injury_RandomForest'] == real_df['injured']).astype(int)
real_df['Correct_check_Injury_LogisticRegression'] = (real_df['Prediction_Injury_LogisticRegression'] == real_df['injured']).astype(int)
real_df['Correct_check_Injury_XGBoost'] = (real_df['Prediction_Injury_XGBoost'] == real_df['injured']).astype(int)

real_df['Correct_check_Illness_RandomForest'] = (real_df['Prediction_Illness_RandomForest'] == real_df['illed']).astype(int)
real_df['Correct_check_Illness_LogisticRegression'] = (real_df['Prediction_Illness_LogisticRegression'] == real_df['illed']).astype(int)
real_df['Correct_check_Illness_XGBoost'] = (real_df['Prediction_Illness_XGBoost'] == real_df['illed']).astype(int)

# calculate global accuracy for real-use data
real_rf_injured_acc = real_df['Correct_check_Injury_RandomForest'].mean()
real_lr_injured_acc = real_df['Correct_check_Injury_LogisticRegression'].mean()
real_xgb_injured_acc = real_df['Correct_check_Injury_XGBoost'].mean()

real_rf_ill_acc = real_df['Correct_check_Illness_RandomForest'].mean()
real_lr_ill_acc = real_df['Correct_check_Illness_LogisticRegression'].mean()
real_xgb_ill_acc = real_df['Correct_check_Illness_XGBoost'].mean()

# add global accuracy columns to real-use data
real_df['Total_accuracy_Injury_RandomForest'] = real_rf_injured_acc
real_df['Total_accuracy_Injury_LogisticRegression'] = real_lr_injured_acc
real_df['Total_accuracy_Injury_XGBoost'] = real_xgb_injured_acc

real_df['Total_accuracy_Illness_RandomForest'] = real_rf_ill_acc
real_df['Total_accuracy_Illness_LogisticRegression'] = real_lr_ill_acc
real_df['Total_accuracy_Illness_XGBoost'] = real_xgb_ill_acc

# highlight incorrect predictions in Excel
def highlight_incorrect_predictions(df, writer, sheet_name):
    wb = writer.book
    ws = wb[sheet_name]
    red_fill = PatternFill(start_color='FF0000', end_color='FF0000', fill_type='solid')
    
    # iterate over each row in the DataFrame
    for row_idx in range(len(df)):  # use range(len(df)) to iterate over valid row indices
        for col in ['Correct_check_Injury_RandomForest', 'Correct_check_Injury_LogisticRegression', 'Correct_check_Injury_XGBoost',
                    'Correct_check_Illness_RandomForest', 'Correct_check_Illness_LogisticRegression', 'Correct_check_Illness_XGBoost']:
            if df.iloc[row_idx][col] == 0:  # use iloc to access rows by position
                # excel rows start from 1, and we need to skip the header row
                ws.cell(row=row_idx + 2, column=df.columns.get_loc(col) + 1).fill = red_fill

# save results to Excel
with pd.ExcelWriter('prediction_v1.xlsx', engine='openpyxl') as writer:
    # test set evaluation results
    test_evaluation_df.to_excel(writer, sheet_name='Test_Evaluation', index=False)
    
    # real-use data
    real_predictions = real_df[user_info_columns + [
        'Prediction_Injury_RandomForest', 'Correct_check_Injury_RandomForest','Total_accuracy_Injury_RandomForest',
        'Prediction_Injury_LogisticRegression', 'Correct_check_Injury_LogisticRegression','Total_accuracy_Injury_LogisticRegression',
        'Prediction_Injury_XGBoost', 'Correct_check_Injury_XGBoost','Total_accuracy_Injury_XGBoost',
        'Prediction_Illness_RandomForest', 'Correct_check_Illness_RandomForest','Total_accuracy_Illness_RandomForest',
        'Prediction_Illness_LogisticRegression', 'Correct_check_Illness_LogisticRegression','Total_accuracy_Illness_LogisticRegression',
        'Prediction_Illness_XGBoost', 'Correct_check_Illness_XGBoost','Total_accuracy_Illness_XGBoost'
    ]]
    real_predictions.to_excel(writer, sheet_name='Real-Use_Predictions', index=False)
    highlight_incorrect_predictions(real_predictions, writer, 'Real-Use_Predictions')
    # Add confusion matrices to Excel
    wb = writer.book
    ws = wb.create_sheet('Confusion_Matrices')

    # insert confusion matrix images into Excel
    row_offset = 1
    for filename in [
        'confusion_matrices/cm_rf_injury.png',
        'confusion_matrices/cm_lr_injury.png',
        'confusion_matrices/cm_xgb_injury.png',
        'confusion_matrices/cm_rf_illness.png',
        'confusion_matrices/cm_lr_illness.png',
        'confusion_matrices/cm_xgb_illness.png'
    ]:
        img = Image(filename)
        ws.add_image(img, f'A{row_offset}')
        row_offset += 20  # adjust row offset to avoid overlapping images

print("Predictions and evaluation results saved successfully!")


Training Random Forest - Injured...
Training Logistic Regression - Injured...
Training XGBoost - Injured...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Training Random Forest - Ill...
Training Logistic Regression - Ill...
Training XGBoost - Ill...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Making Predictions for Injured...
Making Predictions for Ill...
Calculating Accuracy for Models...
Plotting and Saving Confusion Matrices for Injury Predictions...
Plotting and Saving Confusion Matrices for Illness Predictions...
Making Predictions for Real-Use Data...
Predictions and evaluation results saved successfully!


In [33]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score
import xgboost as xgb
import matplotlib.pyplot as plt
from openpyxl.styles import PatternFill
from openpyxl.drawing.image import Image
from openpyxl import Workbook
import os
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import GridSearchCV

# -----------------------------------------------------------------------------------------------------------------------------------------
## Step 3-1: Model Initializing

# Load training and testing data
train_df = pd.read_excel("cleaned_training_data_v3.xlsx")
test_df = pd.read_excel("cleaned_test_data_v3.xlsx")
real_df = pd.read_excel("cleaned_realuse_data_v3.xlsx")

# Features for prediction
features = [
    'Gender_m2f1', 'Menstruation_y1n0', 'Mood state', 'Energy levels',
    'Muscle readiness', 'Academic Pressure', 'Diet Yesterday', 'Sleep quality',
    'Sleep duration', 'sleep_score', 'total_training_load', 'total_training_duration',
    'weekly_training_load', 'weekly_training_duration', 'ACWR', 'RTT'
]

# Targets
targets = ['illed', 'injured']

# Keep User ID, Name, and Date for reference
user_info_columns = ['User ID', 'Name', 'Date']

# Drop missing values in features or targets
train_df = train_df.dropna(subset=features + targets)
test_df = test_df.dropna(subset=features + targets)

# Define X (features) and Y (targets)
X_train = train_df[features]
y_train_ill = train_df['illed']
y_train_injured = train_df['injured']

X_test = test_df[features]
y_test_ill = test_df['illed']
y_test_injured = test_df['injured']

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# -----------------------------------------------------------------------------------------------------------------------------------------
## Step 3-2: Handle Class Imbalance

# Check class distribution
print("Class distribution for 'illed':")
print(y_train_ill.value_counts())

print("Class distribution for 'injured':")
print(y_train_injured.value_counts())

# Method 1: Resampling (SMOTE for oversampling)
smote = SMOTE(random_state=42)
X_train_resampled_injured, y_train_resampled_injured = smote.fit_resample(X_train_scaled, y_train_injured)
X_train_resampled_ill, y_train_resampled_ill = smote.fit_resample(X_train_scaled, y_train_ill)

# Method 2: Adjust class weights
class_weights_injured = {
    0: len(y_train_injured) / (2 * len(y_train_injured[y_train_injured == 0])),
    1: len(y_train_injured) / (2 * len(y_train_injured[y_train_injured == 1]))
}

class_weights_ill = {
    0: len(y_train_ill) / (2 * len(y_train_ill[y_train_ill == 0])),
    1: len(y_train_ill) / (2 * len(y_train_ill[y_train_ill == 1]))
}

# -----------------------------------------------------------------------------------------------------------------------------------------
## Step 3-3: Model Training

# Train models (try Random Forest, Logistic Regression, and XGBoost)

# # fine tuning hyper-parameters---------------------------------------------------
# # For Random Forest
# param_grid_rf = {
#     'n_estimators': [100, 200, 300],
#     'max_depth': [10, 15, 20],
#     'min_samples_split': [2, 5, 10],
# }

# rf_grid_search = GridSearchCV(RandomForestClassifier(class_weight='balanced', random_state=42), param_grid_rf, cv=3)
# rf_grid_search.fit(X_train_resampled_injured, y_train_resampled_injured)

# # For Logistic Regression
# param_grid_lr = {
#     'C': [0.01, 0.1, 1, 10],
#     'solver': ['liblinear', 'saga'],
#     'max_iter': [100, 200, 300]
# }

# lr_grid_search = GridSearchCV(LogisticRegression(class_weight='balanced', random_state=42), param_grid_lr, cv=3)
# lr_grid_search.fit(X_train_resampled_injured, y_train_resampled_injured)

# # For XGBoost
# param_grid_xgb = {
#     'learning_rate': [0.01, 0.05, 0.1],
#     'n_estimators': [100, 200, 500],
#     'max_depth': [3, 6, 10],
#     'subsample': [0.7, 0.8, 0.9],
#     'colsample_bytree': [0.7, 0.8, 0.9],
# }

# xgb_grid_search = GridSearchCV(xgb.XGBClassifier(scale_pos_weight=len(y_train_injured[y_train_injured == 0]) / len(y_train_injured[y_train_injured == 1]), use_label_encoder=False, eval_metric='logloss', random_state=42), param_grid_xgb, cv=3)
# xgb_grid_search.fit(X_train_resampled_injured, y_train_resampled_injured)

# # Print best parameters and scores
# print("Best Random Forest Params:", rf_grid_search.best_params_)
# print("Best Logistic Regression Params:", lr_grid_search.best_params_)
# print("Best XGBoost Params:", xgb_grid_search.best_params_)
# # ---------------------------------------------------------------------------------

print("Training Random Forest - Injured...")
rf_model_injured = RandomForestClassifier(
    n_estimators=300,
    max_depth=20,
    min_samples_split=5,
    class_weight=class_weights_injured,  # Adjust class weights
    random_state=42
)
rf_model_injured.fit(X_train_resampled_injured, y_train_resampled_injured)  # Use resampled data

print("Training Logistic Regression - Injured...")
lr_model_injured = LogisticRegression(
    C=10,  
    max_iter=100,  
    solver='liblinear',  
    class_weight='balanced',
    random_state=42
)
lr_model_injured.fit(X_train_resampled_injured, y_train_resampled_injured)  # Use resampled data

print("Training XGBoost - Injured...")
xgb_model_injured = xgb.XGBClassifier(
    learning_rate=0.1,  
    n_estimators=500, 
    max_depth=10,  
    subsample=0.9,
    colsample_bytree=0.8,
    scale_pos_weight=len(y_train_injured[y_train_injured == 0]) / len(y_train_injured[y_train_injured == 1]),
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)
xgb_model_injured.fit(X_train_resampled_injured, y_train_resampled_injured)  # Use resampled data

print("Training Random Forest - Ill...")
rf_model_ill = RandomForestClassifier(
    n_estimators=300,
    max_depth=20,
    min_samples_split=5,
    class_weight=class_weights_ill,  # Adjust class weights
    random_state=42
)
rf_model_ill.fit(X_train_resampled_ill, y_train_resampled_ill)  # Use resampled data

print("Training Logistic Regression - Ill...")
lr_model_ill = LogisticRegression(
    C=10, 
    max_iter=100, 
    solver='liblinear', 
    class_weight='balanced', 
    random_state=42
)
lr_model_ill.fit(X_train_resampled_ill, y_train_resampled_ill)  # Use resampled data

print("Training XGBoost - Ill...")
xgb_model_ill = xgb.XGBClassifier(
    learning_rate=0.1,  
    n_estimators=500,  
    max_depth=10,  
    subsample=0.9, 
    colsample_bytree=0.8, 
    scale_pos_weight=len(y_train_ill[y_train_ill == 0]) / len(y_train_ill[y_train_ill == 1]), 
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)
xgb_model_ill.fit(X_train_resampled_ill, y_train_resampled_ill)  # Use resampled data

# -----------------------------------------------------------------------------------------------------------------------------------------
## Step 3-4: Model Evaluating

# Predictions
print("Making Predictions for Injured...")
y_pred_rf_injured = rf_model_injured.predict(X_test_scaled)
y_pred_lr_injured = lr_model_injured.predict(X_test_scaled)
y_pred_xgb_injured = xgb_model_injured.predict(X_test_scaled)

print("Making Predictions for Ill...")
y_pred_rf_ill = rf_model_ill.predict(X_test_scaled)
y_pred_lr_ill = lr_model_ill.predict(X_test_scaled)
y_pred_xgb_ill = xgb_model_ill.predict(X_test_scaled)

# Calculate accuracy for each model
print("Calculating Accuracy for Models...")
accuracy_rf_injured = accuracy_score(y_test_injured, y_pred_rf_injured)
accuracy_lr_injured = accuracy_score(y_test_injured, y_pred_lr_injured)
accuracy_xgb_injured = accuracy_score(y_test_injured, y_pred_xgb_injured)

accuracy_rf_ill = accuracy_score(y_test_ill, y_pred_rf_ill)
accuracy_lr_ill = accuracy_score(y_test_ill, y_pred_lr_ill)
accuracy_xgb_ill = accuracy_score(y_test_ill, y_pred_xgb_ill)

# Save test set evaluation results to Excel
test_evaluation_results = {
    'Model': ['Random Forest', 'Logistic Regression', 'XGBoost'],
    'Accuracy_Injured': [accuracy_rf_injured, accuracy_lr_injured, accuracy_xgb_injured],
    'Accuracy_Ill': [accuracy_rf_ill, accuracy_lr_ill, accuracy_xgb_ill]
}
test_evaluation_df = pd.DataFrame(test_evaluation_results)

# Plot confusion matrix for test set and save as image
def plot_and_save_confusion_matrix(y_true, y_pred, title, filename):
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(6, 6))
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title(title)
    plt.colorbar()
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.savefig(filename)
    plt.close()

def plot_and_save_precision_recall_curve(y_true, y_pred_proba, title, filename):
    precision, recall, _ = precision_recall_curve(y_true, y_pred_proba)
    plt.figure(figsize=(6, 6))
    plt.plot(recall, precision, color='green', label='Precision-Recall Curve')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title(title)
    plt.legend(loc='lower left')
    plt.savefig(filename)
    plt.close()

def plot_and_save_accuracy_bar_chart(models, accuracies, title, filename):
    plt.figure(figsize=(8, 5))
    sns.barplot(x=models, y=accuracies, palette="viridis")
    plt.xlabel('Models')
    plt.ylabel('Accuracy')
    plt.title(title)
    plt.ylim(0, 1)  # Accuracy ranges from 0 to 1
    plt.savefig(filename)
    plt.close()

def plot_and_save_confusion_matrix_heatmap(y_true, y_pred, title, filename):
    cm = confusion_matrix(y_true, y_pred)
    
    plt.figure(figsize=(6, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=["Negative", "Positive"], yticklabels=["Negative", "Positive"])
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(title)
    plt.savefig(filename)
    plt.close()

# Create a directory to save confusion matrix images
os.makedirs('confusion_matrices', exist_ok=True)

# Plot and save confusion matrices for all models
print("Plotting and Saving Confusion Matrices for Injury Predictions...")
plot_and_save_confusion_matrix(y_test_injured, y_pred_rf_injured, 'Confusion Matrix for Injury (Random Forest)', 'confusion_matrices/cm_rf_injury.png')
plot_and_save_confusion_matrix(y_test_injured, y_pred_lr_injured, 'Confusion Matrix for Injury (Logistic Regression)', 'confusion_matrices/cm_lr_injury.png')
plot_and_save_confusion_matrix(y_test_injured, y_pred_xgb_injured, 'Confusion Matrix for Injury (XGBoost)', 'confusion_matrices/cm_xgb_injury.png')

print("Plotting and Saving Confusion Matrices for Illness Predictions...")
plot_and_save_confusion_matrix(y_test_ill, y_pred_rf_ill, 'Confusion Matrix for Illness (Random Forest)', 'confusion_matrices/cm_rf_illness.png')
plot_and_save_confusion_matrix(y_test_ill, y_pred_lr_ill, 'Confusion Matrix for Illness (Logistic Regression)', 'confusion_matrices/cm_lr_illness.png')
plot_and_save_confusion_matrix(y_test_ill, y_pred_xgb_ill, 'Confusion Matrix for Illness (XGBoost)', 'confusion_matrices/cm_xgb_illness.png')

# Generate and Save Precision-Recall Curves
print("Saving Precision-Recall Curves...")
plot_and_save_precision_recall_curve(y_test_injured, rf_model_injured.predict_proba(X_test_scaled)[:, 1], 'Precision-Recall for Injury (RF)', 'confusion_matrices/pr_rf_injury.png')
plot_and_save_precision_recall_curve(y_test_injured, lr_model_injured.predict_proba(X_test_scaled)[:, 1], 'Precision-Recall for Injury (LR)', 'confusion_matrices/pr_lr_injury.png')
plot_and_save_precision_recall_curve(y_test_injured, xgb_model_injured.predict_proba(X_test_scaled)[:, 1], 'Precision-Recall for Injury (XGB)', 'confusion_matrices/pr_xgb_injury.png')

plot_and_save_precision_recall_curve(y_test_ill, rf_model_ill.predict_proba(X_test_scaled)[:, 1], 'Precision-Recall for Illness (RF)', 'confusion_matrices/pr_rf_illness.png')
plot_and_save_precision_recall_curve(y_test_ill, lr_model_ill.predict_proba(X_test_scaled)[:, 1], 'Precision-Recall for Illness (LR)', 'confusion_matrices/pr_lr_illness.png')
plot_and_save_precision_recall_curve(y_test_ill, xgb_model_ill.predict_proba(X_test_scaled)[:, 1], 'Precision-Recall for Illness (XGB)', 'confusion_matrices/pr_xgb_illness.png')

# Generate and Save Accuracy Bar Chart
models = ['Random Forest', 'Logistic Regression', 'XGBoost']
injury_accuracies = [accuracy_rf_injured, accuracy_lr_injured, accuracy_xgb_injured]
illness_accuracies = [accuracy_rf_ill, accuracy_lr_ill, accuracy_xgb_ill]

print("Saving Accuracy Bar Charts...")
plot_and_save_accuracy_bar_chart(models, injury_accuracies, 'Model Accuracy for Injury Prediction', 'confusion_matrices/accuracy_injury.png')
plot_and_save_accuracy_bar_chart(models, illness_accuracies, 'Model Accuracy for Illness Prediction', 'confusion_matrices/accuracy_illness.png')

# drop rows with missing values for the specific features
real_df = real_df.dropna(subset=features)
X_real = real_df[features]
X_real_scaled = scaler.transform(X_real)

# predict using the best model (we can choose the model we prefer from Random Forest, Logistic Regression and XGBoost)
print("Making Predictions for Real-Use Data...")
real_df['Prediction_Injury_RandomForest'] = rf_model_injured.predict(X_real_scaled)
real_df['Prediction_Injury_LogisticRegression'] = lr_model_injured.predict(X_real_scaled)
real_df['Prediction_Injury_XGBoost'] = xgb_model_injured.predict(X_real_scaled)

real_df['Prediction_Illness_RandomForest'] = rf_model_ill.predict(X_real_scaled)
real_df['Prediction_Illness_LogisticRegression'] = lr_model_ill.predict(X_real_scaled)
real_df['Prediction_Illness_XGBoost'] = xgb_model_ill.predict(X_real_scaled)

# Calculate correctness for each row in real-use data
real_df['Correct_check_Injury_RandomForest'] = (real_df['Prediction_Injury_RandomForest'] == real_df['injured']).astype(int)
real_df['Correct_check_Injury_LogisticRegression'] = (real_df['Prediction_Injury_LogisticRegression'] == real_df['injured']).astype(int)
real_df['Correct_check_Injury_XGBoost'] = (real_df['Prediction_Injury_XGBoost'] == real_df['injured']).astype(int)

real_df['Correct_check_Illness_RandomForest'] = (real_df['Prediction_Illness_RandomForest'] == real_df['illed']).astype(int)
real_df['Correct_check_Illness_LogisticRegression'] = (real_df['Prediction_Illness_LogisticRegression'] == real_df['illed']).astype(int)
real_df['Correct_check_Illness_XGBoost'] = (real_df['Prediction_Illness_XGBoost'] == real_df['illed']).astype(int)

# calculate global accuracy for real-use data
real_rf_injured_acc = real_df['Correct_check_Injury_RandomForest'].mean()
real_lr_injured_acc = real_df['Correct_check_Injury_LogisticRegression'].mean()
real_xgb_injured_acc = real_df['Correct_check_Injury_XGBoost'].mean()

real_rf_ill_acc = real_df['Correct_check_Illness_RandomForest'].mean()
real_lr_ill_acc = real_df['Correct_check_Illness_LogisticRegression'].mean()
real_xgb_ill_acc = real_df['Correct_check_Illness_XGBoost'].mean()

# add global accuracy columns to real-use data
real_df['Total_accuracy_Injury_RandomForest'] = real_rf_injured_acc
real_df['Total_accuracy_Injury_LogisticRegression'] = real_lr_injured_acc
real_df['Total_accuracy_Injury_XGBoost'] = real_xgb_injured_acc

real_df['Total_accuracy_Illness_RandomForest'] = real_rf_ill_acc
real_df['Total_accuracy_Illness_LogisticRegression'] = real_lr_ill_acc
real_df['Total_accuracy_Illness_XGBoost'] = real_xgb_ill_acc

# highlight incorrect predictions in Excel
def highlight_incorrect_predictions(df, writer, sheet_name):
    wb = writer.book
    ws = wb[sheet_name]
    red_fill = PatternFill(start_color='FF0000', end_color='FF0000', fill_type='solid')
    
    # iterate over each row in the DataFrame
    for row_idx in range(len(df)):  # use range(len(df)) to iterate over valid row indices
        for col in ['Correct_check_Injury_RandomForest', 'Correct_check_Injury_LogisticRegression', 'Correct_check_Injury_XGBoost',
                    'Correct_check_Illness_RandomForest', 'Correct_check_Illness_LogisticRegression', 'Correct_check_Illness_XGBoost']:
            if df.iloc[row_idx][col] == 0:  # use iloc to access rows by position
                # excel rows start from 1, and we need to skip the header row
                ws.cell(row=row_idx + 2, column=df.columns.get_loc(col) + 1).fill = red_fill

# Save results to Excel
with pd.ExcelWriter('prediction_v1.xlsx', engine='openpyxl') as writer:
    # Test set evaluation results
    test_evaluation_df.to_excel(writer, sheet_name='Test_Evaluation', index=False)
    
    # Real-use data
    real_predictions = real_df[user_info_columns + [
        'Prediction_Injury_RandomForest', 'Correct_check_Injury_RandomForest',
        'Prediction_Injury_LogisticRegression', 'Correct_check_Injury_LogisticRegression',
        'Prediction_Injury_XGBoost', 'Correct_check_Injury_XGBoost',
        'Prediction_Illness_RandomForest', 'Correct_check_Illness_RandomForest',
        'Prediction_Illness_LogisticRegression', 'Correct_check_Illness_LogisticRegression',
        'Prediction_Illness_XGBoost', 'Correct_check_Illness_XGBoost'
    ]]
    real_predictions.to_excel(writer, sheet_name='Real-Use_Predictions', index=False)
    highlight_incorrect_predictions(real_predictions, writer, 'Real-Use_Predictions')

    # Add confusion matrices to Excel
    wb = writer.book
    ws = wb.create_sheet('Model_Visuals')

    # Insert confusion matrix images into Excel
    row_offset = 1
    images = [
        ('confusion_matrices/cm_rf_injury.png', 'Confusion Matrix RF Injury'),
        ('confusion_matrices/cm_lr_injury.png', 'Confusion Matrix LR Injury'),
        ('confusion_matrices/cm_xgb_injury.png', 'Confusion Matrix XGB Injury'),
        ('confusion_matrices/cm_rf_illness.png', 'Confusion Matrix RF Illness'),
        ('confusion_matrices/cm_lr_illness.png', 'Confusion Matrix LR Illness'),
        ('confusion_matrices/cm_xgb_illness.png', 'Confusion Matrix XGB Illness'),
        ('confusion_matrices/pr_rf_injury.png', 'Precision-Recall RF Injury'),
        ('confusion_matrices/pr_lr_injury.png', 'Precision-Recall LR Injury'),
        ('confusion_matrices/pr_xgb_injury.png', 'Precision-Recall XGB Injury'),
        ('confusion_matrices/pr_rf_illness.png', 'Precision-Recall RF Illness'),
        ('confusion_matrices/pr_lr_illness.png', 'Precision-Recall LR Illness'),
        ('confusion_matrices/pr_xgb_illness.png', 'Precision-Recall XGB Illness'),
        ('confusion_matrices/accuracy_injury.png', 'Accuracy Bar Chart Injury'),
        ('confusion_matrices/accuracy_illness.png', 'Accuracy Bar Chart Illness'),
    ]

    ws.column_dimensions['A'].width = 50  # Adjust column width for labels

    for img_path, label in images:
        try:
            img = Image(img_path)
            ws.append([label])  # Insert label as a row before the image
            ws.add_image(img, f'A{row_offset}')
            row_offset += 30  # Adjust row offset to avoid overlapping images
        except Exception as e:
            print(f"Error inserting {img_path}: {e}")

print("Predictions, evaluation results, and confusion matrices saved successfully!")

Class distribution for 'illed':
illed
0    57684
1       22
Name: count, dtype: int64
Class distribution for 'injured':
injured
0    57529
1      177
Name: count, dtype: int64


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Best Random Forest Params: {'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 300}
Best Logistic Regression Params: {'C': 10, 'max_iter': 100, 'solver': 'liblinear'}
Best XGBoost Params: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 500, 'subsample': 0.9}
Training Random Forest - Injured...
Training Logistic Regression - Injured...
Training XGBoost - Injured...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Training Random Forest - Ill...
Training Logistic Regression - Ill...
Training XGBoost - Ill...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Making Predictions for Injured...
Making Predictions for Ill...
Calculating Accuracy for Models...
Plotting and Saving Confusion Matrices for Injury Predictions...
Plotting and Saving Confusion Matrices for Illness Predictions...
Saving Precision-Recall Curves...
Saving Accuracy Bar Charts...
Making Predictions for Real-Use Data...



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=models, y=accuracies, palette="viridis")

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=models, y=accuracies, palette="viridis")


Predictions, evaluation results, and confusion matrices saved successfully!


In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score
import xgboost as xgb
import matplotlib.pyplot as plt
from openpyxl.styles import PatternFill
from openpyxl.drawing.image import Image
from openpyxl import Workbook
import os
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import GridSearchCV

# -----------------------------------------------------------------------------------------------------------------------------------------
## Step 3-1: Model Initializing

# load training and testing data
train_df = pd.read_excel("cleaned_training_data_v3.xlsx")
test_df = pd.read_excel("cleaned_test_data_v3.xlsx")
real_df = pd.read_excel("cleaned_realuse_data_v3.xlsx")

# features for prediction
features = [
    'Gender_m2f1', 'Menstruation_y1n0', 'Mood state', 'Energy levels',
    'Muscle readiness', 'Academic Pressure', 'Diet Yesterday', 'Sleep quality',
    'Sleep duration', 'sleep_score', 'total_training_load', 'total_training_duration',
    'weekly_training_load', 'weekly_training_duration', 'ACWR', 'RTT'
]

# targets
targets = ['illed', 'injured']

# keep User ID, Name, and Date for reference
user_info_columns = ['User ID', 'Name', 'Date']

# drop missing values in features or targets
train_df = train_df.dropna(subset=features + targets)
test_df = test_df.dropna(subset=features + targets)

# define X (features) and Y (targets)
X_train = train_df[features]
y_train_ill = train_df['illed']
y_train_injured = train_df['injured']

X_test = test_df[features]
y_test_ill = test_df['illed']
y_test_injured = test_df['injured']

# standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# -----------------------------------------------------------------------------------------------------------------------------------------
## Step 3-2: Handle Class Imbalance

# check class distribution
print("Class distribution for 'illed':")
print(y_train_ill.value_counts())

print("Class distribution for 'injured':")
print(y_train_injured.value_counts())

# method 1: resampling (SMOTE for oversampling)
smote = SMOTE(random_state=42)
X_train_resampled_injured, y_train_resampled_injured = smote.fit_resample(X_train_scaled, y_train_injured)
X_train_resampled_ill, y_train_resampled_ill = smote.fit_resample(X_train_scaled, y_train_ill)

# method 2: Adjust class weights
class_weights_injured = {
    0: len(y_train_injured) / (2 * len(y_train_injured[y_train_injured == 0])),
    1: len(y_train_injured) / (2 * len(y_train_injured[y_train_injured == 1]))
}

class_weights_ill = {
    0: len(y_train_ill) / (2 * len(y_train_ill[y_train_ill == 0])),
    1: len(y_train_ill) / (2 * len(y_train_ill[y_train_ill == 1]))
}

# -----------------------------------------------------------------------------------------------------------------------------------------
## Step 3-3: Model Training

# train models (try Random Forest, Logistic Regression, and XGBoost)

# fine tuning hyper-parameters---------------------------------------------------
# For Random Forest
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 15, 20],
    'min_samples_split': [2, 5, 10],
}

rf_grid_search = GridSearchCV(RandomForestClassifier(class_weight='balanced', random_state=42), param_grid_rf, cv=3)
rf_grid_search.fit(X_train_resampled_injured, y_train_resampled_injured)

# for Logistic Regression
param_grid_lr = {
    'C': [0.01, 0.1, 1, 10],
    'solver': ['liblinear', 'saga'],
    'max_iter': [100, 200, 300]
}

lr_grid_search = GridSearchCV(LogisticRegression(class_weight='balanced', random_state=42), param_grid_lr, cv=3)
lr_grid_search.fit(X_train_resampled_injured, y_train_resampled_injured)

# for XGBoost
param_grid_xgb = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 500],
    'max_depth': [3, 6, 10],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9],
}

xgb_grid_search = GridSearchCV(xgb.XGBClassifier(scale_pos_weight=len(y_train_injured[y_train_injured == 0]) / len(y_train_injured[y_train_injured == 1]), use_label_encoder=False, eval_metric='logloss', random_state=42), param_grid_xgb, cv=3)
xgb_grid_search.fit(X_train_resampled_injured, y_train_resampled_injured)

# print best parameters and scores
print("Best Random Forest Params:", rf_grid_search.best_params_)
print("Best Logistic Regression Params:", lr_grid_search.best_params_)
print("Best XGBoost Params:", xgb_grid_search.best_params_)
# # ---------------------------------------------------------------------------------

print("Training Random Forest - Injured...")
rf_model_injured = RandomForestClassifier(
    n_estimators=300,
    max_depth=20,
    min_samples_split=5,
    class_weight=class_weights_injured,  # Adjust class weights
    random_state=42
)
rf_model_injured.fit(X_train_resampled_injured, y_train_resampled_injured)  # Use resampled data

print("Training Logistic Regression - Injured...")
lr_model_injured = LogisticRegression(
    C=10,  
    max_iter=100,  
    solver='liblinear',  
    class_weight='balanced',
    random_state=42
)
lr_model_injured.fit(X_train_resampled_injured, y_train_resampled_injured)  # Use resampled data

print("Training XGBoost - Injured...")
xgb_model_injured = xgb.XGBClassifier(
    learning_rate=0.01,  
    n_estimators=3000, 
    max_depth=6,  
    subsample=0.9,
    colsample_bytree=0.8,
    scale_pos_weight=len(y_train_injured[y_train_injured == 0]) / len(y_train_injured[y_train_injured == 1]),
    gamma=0.1,  # regularization parameter to avoid overfitting
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)
xgb_model_injured.fit(X_train_resampled_injured, y_train_resampled_injured)  # Use resampled data

print("Training Random Forest - Ill...")
rf_model_ill = RandomForestClassifier(
    n_estimators=300,
    max_depth=20,
    min_samples_split=5,
    class_weight=class_weights_ill,  # Adjust class weights
    random_state=42
)
rf_model_ill.fit(X_train_resampled_ill, y_train_resampled_ill)  # Use resampled data

print("Training Logistic Regression - Ill...")
lr_model_ill = LogisticRegression(
    C=10, 
    max_iter=100, 
    solver='liblinear', 
    class_weight='balanced', 
    random_state=42
)
lr_model_ill.fit(X_train_resampled_ill, y_train_resampled_ill)  # Use resampled data

print("Training XGBoost - Ill...")
xgb_model_ill = xgb.XGBClassifier(
    learning_rate=0.01,  
    n_estimators=3000,  
    max_depth=6,  
    subsample=0.9, 
    colsample_bytree=0.8, 
    scale_pos_weight=len(y_train_ill[y_train_ill == 0]) / len(y_train_ill[y_train_ill == 1]),
    gamma=0.1,  # regularization parameter to avoid overfitting
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)
xgb_model_ill.fit(X_train_resampled_ill, y_train_resampled_ill)  # Use resampled data

# -----------------------------------------------------------------------------------------------------------------------------------------
## Step 3-4: Model Evaluating

# predictions
print("Making Predictions for Injured...")
y_pred_rf_injured = rf_model_injured.predict(X_test_scaled)
y_pred_lr_injured = lr_model_injured.predict(X_test_scaled)
y_pred_xgb_injured = xgb_model_injured.predict(X_test_scaled)

print("Making Predictions for Ill...")
y_pred_rf_ill = rf_model_ill.predict(X_test_scaled)
y_pred_lr_ill = lr_model_ill.predict(X_test_scaled)
y_pred_xgb_ill = xgb_model_ill.predict(X_test_scaled)

# calculate accuracy for each model
print("Calculating Accuracy for Models...")
accuracy_rf_injured = accuracy_score(y_test_injured, y_pred_rf_injured)
accuracy_lr_injured = accuracy_score(y_test_injured, y_pred_lr_injured)
accuracy_xgb_injured = accuracy_score(y_test_injured, y_pred_xgb_injured)

accuracy_rf_ill = accuracy_score(y_test_ill, y_pred_rf_ill)
accuracy_lr_ill = accuracy_score(y_test_ill, y_pred_lr_ill)
accuracy_xgb_ill = accuracy_score(y_test_ill, y_pred_xgb_ill)

# save test set evaluation results to Excel
test_evaluation_results = {
    'Model': ['Random Forest', 'Logistic Regression', 'XGBoost'],
    'Accuracy_Injured': [accuracy_rf_injured, accuracy_lr_injured, accuracy_xgb_injured],
    'Accuracy_Ill': [accuracy_rf_ill, accuracy_lr_ill, accuracy_xgb_ill]
}
test_evaluation_df = pd.DataFrame(test_evaluation_results)

# plot confusion matrix for test set and save as image
def plot_and_save_confusion_matrix(y_true, y_pred, title, filename):
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(6, 6))
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title(title)
    plt.colorbar()
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.savefig(filename)
    plt.close()

def plot_and_save_precision_recall_curve(y_true, y_pred_proba, title, filename):
    precision, recall, _ = precision_recall_curve(y_true, y_pred_proba)
    plt.figure(figsize=(6, 6))
    plt.plot(recall, precision, color='green', label='Precision-Recall Curve')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title(title)
    plt.legend(loc='lower left')
    plt.savefig(filename)
    plt.close()

def plot_and_save_accuracy_bar_chart(models, accuracies, title, filename):
    plt.figure(figsize=(8, 5))
    sns.barplot(x=models, y=accuracies, palette="viridis")
    plt.xlabel('Models')
    plt.ylabel('Accuracy')
    plt.title(title)
    plt.ylim(0, 1)  # Accuracy ranges from 0 to 1
    plt.savefig(filename)
    plt.close()

def plot_and_save_confusion_matrix_heatmap(y_true, y_pred, title, filename):
    cm = confusion_matrix(y_true, y_pred)
    
    plt.figure(figsize=(6, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=["Negative", "Positive"], yticklabels=["Negative", "Positive"])
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(title)
    plt.savefig(filename)
    plt.close()

# create a directory to save confusion matrix images
os.makedirs('confusion_matrices', exist_ok=True)

# plot and save confusion matrices for all models
print("Plotting and Saving Confusion Matrices for Injury Predictions...")
plot_and_save_confusion_matrix(y_test_injured, y_pred_rf_injured, 'Confusion Matrix for Injury (Random Forest)', 'confusion_matrices/cm_rf_injury.png')
plot_and_save_confusion_matrix(y_test_injured, y_pred_lr_injured, 'Confusion Matrix for Injury (Logistic Regression)', 'confusion_matrices/cm_lr_injury.png')
plot_and_save_confusion_matrix(y_test_injured, y_pred_xgb_injured, 'Confusion Matrix for Injury (XGBoost)', 'confusion_matrices/cm_xgb_injury.png')

print("Plotting and Saving Confusion Matrices for Illness Predictions...")
plot_and_save_confusion_matrix(y_test_ill, y_pred_rf_ill, 'Confusion Matrix for Illness (Random Forest)', 'confusion_matrices/cm_rf_illness.png')
plot_and_save_confusion_matrix(y_test_ill, y_pred_lr_ill, 'Confusion Matrix for Illness (Logistic Regression)', 'confusion_matrices/cm_lr_illness.png')
plot_and_save_confusion_matrix(y_test_ill, y_pred_xgb_ill, 'Confusion Matrix for Illness (XGBoost)', 'confusion_matrices/cm_xgb_illness.png')

# generate and save precision-recall curves
print("Saving Precision-Recall Curves...")
plot_and_save_precision_recall_curve(y_test_injured, rf_model_injured.predict_proba(X_test_scaled)[:, 1], 'Precision-Recall for Injury (RF)', 'confusion_matrices/pr_rf_injury.png')
plot_and_save_precision_recall_curve(y_test_injured, lr_model_injured.predict_proba(X_test_scaled)[:, 1], 'Precision-Recall for Injury (LR)', 'confusion_matrices/pr_lr_injury.png')
plot_and_save_precision_recall_curve(y_test_injured, xgb_model_injured.predict_proba(X_test_scaled)[:, 1], 'Precision-Recall for Injury (XGB)', 'confusion_matrices/pr_xgb_injury.png')

plot_and_save_precision_recall_curve(y_test_ill, rf_model_ill.predict_proba(X_test_scaled)[:, 1], 'Precision-Recall for Illness (RF)', 'confusion_matrices/pr_rf_illness.png')
plot_and_save_precision_recall_curve(y_test_ill, lr_model_ill.predict_proba(X_test_scaled)[:, 1], 'Precision-Recall for Illness (LR)', 'confusion_matrices/pr_lr_illness.png')
plot_and_save_precision_recall_curve(y_test_ill, xgb_model_ill.predict_proba(X_test_scaled)[:, 1], 'Precision-Recall for Illness (XGB)', 'confusion_matrices/pr_xgb_illness.png')

# # generate and save accuracy bar chart
# models = ['Random Forest', 'Logistic Regression', 'XGBoost']
# injury_accuracies = [accuracy_rf_injured, accuracy_lr_injured, accuracy_xgb_injured]
# illness_accuracies = [accuracy_rf_ill, accuracy_lr_ill, accuracy_xgb_ill]

# print("Saving Accuracy Bar Charts...")
# plot_and_save_accuracy_bar_chart(models, injury_accuracies, 'Model Accuracy for Injury Prediction', 'confusion_matrices/accuracy_injury.png')
# plot_and_save_accuracy_bar_chart(models, illness_accuracies, 'Model Accuracy for Illness Prediction', 'confusion_matrices/accuracy_illness.png')

# drop rows with missing values for the specific features
real_df = real_df.dropna(subset=features)
X_real = real_df[features]
X_real_scaled = scaler.transform(X_real)

# predict using the best model (we can choose the model we prefer from Random Forest, Logistic Regression and XGBoost)
print("Making Predictions for Real-Use Data...")
real_df['Prediction_Injury_RandomForest'] = rf_model_injured.predict(X_real_scaled)
real_df['Prediction_Injury_LogisticRegression'] = lr_model_injured.predict(X_real_scaled)
real_df['Prediction_Injury_XGBoost'] = xgb_model_injured.predict(X_real_scaled)

real_df['Prediction_Illness_RandomForest'] = rf_model_ill.predict(X_real_scaled)
real_df['Prediction_Illness_LogisticRegression'] = lr_model_ill.predict(X_real_scaled)
real_df['Prediction_Illness_XGBoost'] = xgb_model_ill.predict(X_real_scaled)

# calculate correctness for each row in real-use data
real_df['Correct_check_Injury_RandomForest'] = (real_df['Prediction_Injury_RandomForest'] == real_df['injured']).astype(int)
real_df['Correct_check_Injury_LogisticRegression'] = (real_df['Prediction_Injury_LogisticRegression'] == real_df['injured']).astype(int)
real_df['Correct_check_Injury_XGBoost'] = (real_df['Prediction_Injury_XGBoost'] == real_df['injured']).astype(int)

real_df['Correct_check_Illness_RandomForest'] = (real_df['Prediction_Illness_RandomForest'] == real_df['illed']).astype(int)
real_df['Correct_check_Illness_LogisticRegression'] = (real_df['Prediction_Illness_LogisticRegression'] == real_df['illed']).astype(int)
real_df['Correct_check_Illness_XGBoost'] = (real_df['Prediction_Illness_XGBoost'] == real_df['illed']).astype(int)

# calculate global accuracy for real-use data
real_rf_injured_acc = real_df['Correct_check_Injury_RandomForest'].mean()
real_lr_injured_acc = real_df['Correct_check_Injury_LogisticRegression'].mean()
real_xgb_injured_acc = real_df['Correct_check_Injury_XGBoost'].mean()

real_rf_ill_acc = real_df['Correct_check_Illness_RandomForest'].mean()
real_lr_ill_acc = real_df['Correct_check_Illness_LogisticRegression'].mean()
real_xgb_ill_acc = real_df['Correct_check_Illness_XGBoost'].mean()

# add global accuracy columns to real-use data
real_df['Total_accuracy_Injury_RandomForest'] = real_rf_injured_acc
real_df['Total_accuracy_Injury_LogisticRegression'] = real_lr_injured_acc
real_df['Total_accuracy_Injury_XGBoost'] = real_xgb_injured_acc

real_df['Total_accuracy_Illness_RandomForest'] = real_rf_ill_acc
real_df['Total_accuracy_Illness_LogisticRegression'] = real_lr_ill_acc
real_df['Total_accuracy_Illness_XGBoost'] = real_xgb_ill_acc

# highlight incorrect predictions in Excel
def highlight_incorrect_predictions(df, writer, sheet_name):
    wb = writer.book
    ws = wb[sheet_name]
    red_fill = PatternFill(start_color='FF0000', end_color='FF0000', fill_type='solid')
    
    # iterate over each row in the DataFrame
    for row_idx in range(len(df)):  # use range(len(df)) to iterate over valid row indices
        for col in ['Correct_check_Injury_RandomForest', 'Correct_check_Injury_LogisticRegression', 'Correct_check_Injury_XGBoost',
                    'Correct_check_Illness_RandomForest', 'Correct_check_Illness_LogisticRegression', 'Correct_check_Illness_XGBoost']:
            if df.iloc[row_idx][col] == 0:  # use iloc to access rows by position
                # excel rows start from 1, and we need to skip the header row
                ws.cell(row=row_idx + 2, column=df.columns.get_loc(col) + 1).fill = red_fill

# # plot bar chart for accuracy comparison
# models = ['Random Forest', 'Logistic Regression', 'XGBoost']
# injury_accuracies = [real_rf_injured_acc, real_lr_injured_acc, real_xgb_injured_acc]
# illness_accuracies = [real_rf_ill_acc, real_lr_ill_acc, real_xgb_ill_acc]
# plot_and_save_accuracy_bar_chart(
#     models, injury_accuracies, 
#     'Model Accuracy for Injury Prediction', 
#     'confusion_matrices/realuse_accuracy_injury.png'
# )
# plot_and_save_accuracy_bar_chart(
#     models, illness_accuracies, 
#     'Model Accuracy for Illness Prediction', 
#     'confusion_matrices/realuse_accuracy_illness.png'
# )

# define bar chart genarating function
def plot_and_save_accuracy_bar_chart(models, train_accuracies, test_accuracies, title, filename):
    x = np.arange(len(models)) 
    width = 0.35 
    plt.figure(figsize=(8, 5))
    plt.bar(x - width/2, train_accuracies, width, label='Test Accuracy', color='royalblue')
    plt.bar(x + width/2, test_accuracies, width, label='Real-use Accuracy', color='orange')
    plt.xlabel('Models')
    plt.ylabel('Accuracy')
    plt.title(title)
    plt.xticks(x, models) 
    plt.ylim(0, 1) 
    plt.legend()
    plt.savefig(filename)
    plt.close()

# generate bar chart for accuracy comparison
models = ['Random Forest', 'Logistic Regression', 'XGBoost']

# test data accuracy
train_injury_accuracies = [accuracy_rf_injured, accuracy_lr_injured, accuracy_xgb_injured]
train_illness_accuracies = [accuracy_rf_ill, accuracy_lr_ill, accuracy_xgb_ill]

# realuse data accuracy
test_injury_accuracies = [real_rf_injured_acc, real_lr_injured_acc, real_xgb_injured_acc]
test_illness_accuracies = [real_rf_ill_acc, real_lr_ill_acc, real_xgb_ill_acc]

print("Saving Accuracy Comparison Bar Charts...")

plot_and_save_accuracy_bar_chart(
    models, train_injury_accuracies, test_injury_accuracies,
    'Test vs Real-use Accuracy for Injury Prediction',
    'confusion_matrices/comparison_accuracy_injury.png'
)

plot_and_save_accuracy_bar_chart(
    models, train_illness_accuracies, test_illness_accuracies,
    'Test vs Real-use Accuracy for Illness Prediction',
    'confusion_matrices/comparison_accuracy_illness.png'
)

# save results to Excel
with pd.ExcelWriter('prediction_v2.xlsx', engine='openpyxl') as writer:
    # Test set evaluation results
    test_evaluation_df.to_excel(writer, sheet_name='Test_Evaluation_Accuracy', index=False)
    
    # real-use data
    real_predictions = real_df[user_info_columns + [
        'Prediction_Injury_RandomForest', 'Correct_check_Injury_RandomForest','Total_accuracy_Injury_RandomForest',
        'Prediction_Injury_LogisticRegression', 'Correct_check_Injury_LogisticRegression','Total_accuracy_Injury_LogisticRegression',
        'Prediction_Injury_XGBoost', 'Correct_check_Injury_XGBoost','Total_accuracy_Injury_XGBoost',
        'Prediction_Illness_RandomForest', 'Correct_check_Illness_RandomForest','Total_accuracy_Illness_RandomForest',
        'Prediction_Illness_LogisticRegression', 'Correct_check_Illness_LogisticRegression','Total_accuracy_Illness_LogisticRegression',
        'Prediction_Illness_XGBoost', 'Correct_check_Illness_XGBoost','Total_accuracy_Illness_XGBoost'
    ]]
    real_predictions.to_excel(writer, sheet_name='Real-Use_Predictions', index=False)
    highlight_incorrect_predictions(real_predictions, writer, 'Real-Use_Predictions')

    # add confusion matrices to Excel
    wb = writer.book
    ws = wb.create_sheet('Model_Visuals')

    # insert confusion matrix images into Excel
    row_offset = 1
    images = [
        ('confusion_matrices/comparison_accuracy_injury.png', 'Accuracy Bar Chart Injury'),
        ('confusion_matrices/comparison_accuracy_illness.png', 'Accuracy Bar Chart Illness'),
    ]

    ws.column_dimensions['A'].width = 50  # adjust column width for labels

    for img_path, label in images:
        try:
            img = Image(img_path)
            ws.append([label])  # insert label as a row before the image
            ws.add_image(img, f'A{row_offset}')
            row_offset += 30  # adjust row offset to avoid overlapping images
        except Exception as e:
            print(f"Error inserting {img_path}: {e}")

print("Predictions, evaluation results, and confusion matrices saved successfully!")

Class distribution for 'illed':
illed
0    57684
1       22
Name: count, dtype: int64
Class distribution for 'injured':
injured
0    57529
1      177
Name: count, dtype: int64
Training Random Forest - Injured...
Training Logistic Regression - Injured...
Training XGBoost - Injured...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Training Random Forest - Ill...
Training Logistic Regression - Ill...
Training XGBoost - Ill...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Making Predictions for Injured...
Making Predictions for Ill...
Calculating Accuracy for Models...
Plotting and Saving Confusion Matrices for Injury Predictions...
Plotting and Saving Confusion Matrices for Illness Predictions...
Saving Precision-Recall Curves...
Making Predictions for Real-Use Data...
Saving Accuracy Comparison Bar Charts...
Predictions, evaluation results, and confusion matrices saved successfully!
