In [None]:
import os
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler
from sklearn.feature_extraction.text import TfidfVectorizer
import xgboost as xgb

# Directory that the persisted artifacts in this script are written to / read from.
model_dir = 'saved_models'

# Pickled label encoders for the illness and injury targets.
xgb_illness_label_encoder_path = os.path.join(model_dir, 'xgb_illness_label_encoder.pkl')
xgb_injury_label_encoder_path = os.path.join(model_dir, 'xgb_injury_label_encoder.pkl')

# Where the injury model is saved after the continued-training run.
new_xgb_injury_model_path = os.path.join(model_dir, 'enhanced_xgb_model_injury.pkl')


# Function to load the existing model if it exists
def load_existing_model(model_path):
    """Return the object pickled at ``model_path``, or ``None`` if the file is absent.

    A one-line status message is printed in either case.
    """
    if not os.path.exists(model_path):
        print(f"No existing model found at: {model_path}. Starting fresh.")
        return None

    print(f"Loading saved model from: {model_path}")
    return joblib.load(model_path)

def save_checkpoint(model, checkpoint_path, iteration):
    """Dump ``model`` to ``checkpoint_path`` with joblib and print which iteration was saved."""
    joblib.dump(model, checkpoint_path)
    status = f"Checkpoint saved at iteration {iteration} to {checkpoint_path}"
    print(status)

# Function to continue training an existing model
def train_model_from_existing_model(model, X_train, y_train, model_path, checkpoint_interval=5, max_rounds=55):
    """Continue fitting an XGBoost sklearn-style model, checkpointing periodically.

    Resume order: a model already saved at ``model_path`` takes precedence and
    replaces the ``model`` argument; otherwise an unfinished checkpoint in
    ``model_dir`` is loaded; otherwise the ``model`` passed in is used as-is.

    Args:
        model: Estimator exposing ``fit`` and ``get_booster``; used when nothing
            is found on disk.
        X_train: Training features; also used as the only evaluation set.
        y_train: Training labels.
        model_path: Path checked for a previously saved model to resume from.
        checkpoint_interval: A checkpoint is written every this many loop
            iterations.
        max_rounds: Upper bound of the loop counter; every iteration performs
            one ``fit`` call.

    Returns:
        The trained model. It is also dumped to ``new_xgb_injury_model_path``.
    """
    # Local import keeps this block self-contained; sklearn is already a
    # dependency of this file.
    from sklearn.exceptions import NotFittedError

    checkpoint_path = os.path.join(model_dir, 'xgb_injury_model_checkpoint.pkl')
    checkpoint_file = os.path.join(model_dir, 'xgb_injury_model_checkpoint_rounds.txt')
    start_iteration = 0

    if os.path.exists(model_path):
        print(f"Loading saved model from: {model_path}")
        model = joblib.load(model_path)
        # NOTE(review): best_iteration is a boosting-round index, whereas the
        # loop below counts fit() calls -- confirm this is the intended resume point.
        if hasattr(model, 'best_iteration'):
            start_iteration = model.best_iteration
    elif os.path.exists(checkpoint_path):
        print(f"Resuming training from checkpoint: {checkpoint_path}")
        model = joblib.load(checkpoint_path)
        with open(checkpoint_file, 'r') as f:
            checkpoint_rounds = int(f.read().strip())
        print(f"Resuming from saved round: {checkpoint_rounds}")
        start_iteration = checkpoint_rounds

    eval_set = [(X_train, y_train)]

    for i in range(start_iteration, max_rounds):
        # Continue from whatever booster the model already holds. Keying this on
        # the loop counter (i > 0) silently discarded an already-fitted model
        # whenever training started at iteration 0.
        try:
            previous_booster = model.get_booster()
        except NotFittedError:
            previous_booster = None

        model.fit(X_train, y_train, eval_set=eval_set, verbose=True, xgb_model=previous_booster)

        # Persist the round counter only together with the model checkpoint so
        # the two can never disagree when a run is resumed.
        completed = i + 1
        if completed % checkpoint_interval == 0:
            save_checkpoint(model, checkpoint_path, completed)
            with open(checkpoint_file, 'w') as f:
                f.write(str(completed))

    # Save the final model after training.
    # NOTE(review): this writes to the module-level new_xgb_injury_model_path,
    # not to model_path; the visible caller passes the same path for both.
    joblib.dump(model, new_xgb_injury_model_path)

    # Remove both checkpoint artifacts so a later run does not resume from stale state.
    for leftover in (checkpoint_path, checkpoint_file):
        if os.path.exists(leftover):
            os.remove(leftover)

    return model

# Load the cleaned training and testing spreadsheets
train_df = pd.read_excel("cleaned_training_data_v3.xlsx")
test_df = pd.read_excel("cleaned_test_data_v3.xlsx")

# Numeric feature columns
numeric_features = ['Gender_m2f1', 'Menstruation_y1n0', 'Mood state', 'Energy levels',
                    'Muscle readiness', 'Academic Pressure', 'Diet Yesterday', 'Sleep quality',
                    'Sleep duration', 'sleep_score', 'total_training_load', 'total_training_duration',
                    'weekly_training_load', 'weekly_training_duration', 'ACWR', 'RTT', 'illed', 'injured']

# Numeric-only views of both data sets
X_train = train_df[numeric_features]
X_test = test_df[numeric_features]

# Free-text columns describing illness and injury
illness_columns = ['Type of illness', 'Illness severity']
injury_columns = ['Injury location', 'Injury type', 'Injury surface', 'Surface condition', 'Injury tissue type', 'Injury severity']

# Apply the same text preparation to both frames: join the descriptive columns
# into one string per row and replace missing *_information values with an
# explicit "no event" label.
for frame in (train_df, test_df):
    frame['illness_text'] = frame[illness_columns].fillna('').agg(' '.join, axis=1)
    frame['injury_text'] = frame[injury_columns].fillna('').agg(' '.join, axis=1)
    frame['injury_information'] = frame['injury_information'].fillna("No Injury")
    frame['illness_information'] = frame['illness_information'].fillna("No Illness")

# One TF-IDF vectorizer per target, capped at 500 terms each
tfidf_vectorizer_illness = TfidfVectorizer(max_features=500)
tfidf_vectorizer_injury = TfidfVectorizer(max_features=500)

# Fit on the training text only, then apply the fitted vocabulary to the test text.
# NOTE(review): the vectorizers are fit on the *_information columns, which are
# also the columns label-encoded as targets further down in this file, while
# illness_text / injury_text are not used in the visible code -- confirm intended.
X_illness_train = tfidf_vectorizer_illness.fit_transform(train_df['illness_information'])
X_injury_train = tfidf_vectorizer_injury.fit_transform(train_df['injury_information'])
X_illness_test = tfidf_vectorizer_illness.transform(test_df['illness_information'])
X_injury_test = tfidf_vectorizer_injury.transform(test_df['injury_information'])

# Make sure the artifact directory exists before the first write; on a fresh
# start nothing else creates it and joblib.dump would fail.
os.makedirs(model_dir, exist_ok=True)

# Save the vectorizers for later use
joblib.dump(tfidf_vectorizer_illness, os.path.join(model_dir, 'tfidf_vectorizer_illness.pkl'))
joblib.dump(tfidf_vectorizer_injury, os.path.join(model_dir, 'tfidf_vectorizer_injury.pkl'))

# function to encode the target variable
def handle_label_encoder(y_train, encoder_path):
    """Encode ``y_train`` with a persisted LabelEncoder, creating it on first use.

    When ``encoder_path`` exists, the pickled encoder is loaded and only
    ``transform`` is applied. Otherwise a new encoder is fitted on ``y_train``
    and dumped to ``encoder_path``.

    Returns:
        A ``(y_encoded, encoder)`` tuple.
    """
    if not os.path.exists(encoder_path):
        print(f"creating new label encoder for xgb: {encoder_path}")
        fresh_encoder = LabelEncoder()
        labels = fresh_encoder.fit_transform(y_train)
        joblib.dump(fresh_encoder, encoder_path)
        return labels, fresh_encoder

    print(f"loading label encoder for xgb: {encoder_path}")
    stored_encoder = joblib.load(encoder_path)
    return stored_encoder.transform(y_train), stored_encoder

# Encode both targets, reusing any label encoder already saved on disk
print("\n" + "="*50 + " handling xgb illness label encoder " + "="*50)
y_illness_encoded, illness_encoder = handle_label_encoder(train_df['illness_information'], xgb_illness_label_encoder_path)

print("\n" + "="*50 + " handling xgb injury label encoder " + "="*50)
y_injury_encoded, injury_encoder = handle_label_encoder(train_df['injury_information'], xgb_injury_label_encoder_path)

# Resample the training data using RandomOverSampler
ros = RandomOverSampler(random_state=42)

X_resampled_illness, y_resampled_illness = ros.fit_resample(X_illness_train, y_illness_encoded)
X_resampled_injury, y_resampled_injury = ros.fit_resample(X_injury_train, y_injury_encoded)

# Split the resampled data into train and test sets
X_train_illness, X_test_illness, y_train_illness, y_test_illness = train_test_split(X_resampled_illness, y_resampled_illness, test_size=0.2, random_state=42)
X_train_injury, X_test_injury, y_train_injury, y_test_injury = train_test_split(X_resampled_injury, y_resampled_injury, test_size=0.2, random_state=42)

# Prepare model paths
xgb_injury_model_path = os.path.join(model_dir, 'xgb_model_injury.pkl')

# Load the previously trained injury model, if there is one
best_xgb_model_injury = load_existing_model(xgb_injury_model_path)

# Ratio of class-0 to class-1 samples, counted with the array's own sum
# instead of a Python-level loop over every element.
positive_class_count_injury = int((y_train_injury == 1).sum())
negative_class_count_injury = int((y_train_injury == 0).sum())
scale_pos_weight_injury = negative_class_count_injury / positive_class_count_injury if positive_class_count_injury > 0 else 1

# NOTE(review): scale_pos_weight is a binary-classification weight, while
# eval_metric 'mlogloss' is a multiclass metric -- confirm the combination is intended.
xgb_param_injury = {
    'n_estimators': 55,
    'learning_rate': 0.05,  # Reduced learning rate
    'max_depth': 5,
    'subsample': 0.9,
    'colsample_bytree': 0.8,
    'enable_categorical': True,
    'tree_method': 'hist',
    'device': 'cpu',
    'eval_metric': 'mlogloss',
    'scale_pos_weight': scale_pos_weight_injury
}

# Create the XGBoost model if it doesn't exist.
# classes_ is deliberately not assigned here: fit() derives the classes itself,
# and on xgboost releases where classes_ is a read-only property the assignment
# raises AttributeError before any training happens.
if best_xgb_model_injury is None:
    best_xgb_model_injury = xgb.XGBClassifier(random_state=42, **xgb_param_injury)


# Continue training the model for additional rounds
best_xgb_model_injury = train_model_from_existing_model(
    model=best_xgb_model_injury,
    X_train=X_train_injury,
    y_train=y_train_injury,  # Integer labels produced by the injury label encoder
    model_path=new_xgb_injury_model_path,
    checkpoint_interval=5,
    max_rounds=55
)

# train_model_from_existing_model already dumps to this path; the repeat save is
# kept so the on-disk result is unchanged.
joblib.dump(best_xgb_model_injury, new_xgb_injury_model_path)



loading label encoder for xgb: saved_models/xgb_illness_label_encoder.pkl

loading label encoder for xgb: saved_models/xgb_injury_label_encoder.pkl
Loading saved model from: saved_models/xgb_model_injury.pkl
Loading saved model from: saved_models/enhanced_xgb_model_injury.pkl
