# Imports

In [1]:
import time
start_time = time.time()

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_columns', None)
sns.set_style('whitegrid')
sns.set_palette('muted')

from datetime import datetime

from tqdm import tqdm

from sklearn.base import clone

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

from sklearn.metrics import mean_absolute_error

import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
DATA_PATH = '../data/equity-post-HCT-survival-predictions/'
RANDOM_STATE = 54321

# Data

In [4]:
sample_df = pd.read_csv(DATA_PATH + 'sample_submission.csv')
test_df = pd.read_csv(DATA_PATH + 'test.csv')

In [5]:
X = pd.read_pickle(DATA_PATH + 'train_test_split/X_25-12-2024.pkl')
y = pd.read_pickle(DATA_PATH + 'train_test_split/y_25-12-2024.pkl')
efs_time = pd.read_pickle(DATA_PATH + 'train_test_split/efs_time_25-12-2024.pkl')
race_group = pd.read_pickle(DATA_PATH + 'train_test_split/race_group_25-12-2024.pkl')

In [6]:
print(X.shape, y.shape, efs_time.shape, race_group.shape)

(28800, 81) (28800,) (28800,) (28800,)


# Modeling

In [7]:
def cross_validate_efs_time(model, X, efs_time, cv=10, scale=False):
    cv_scores = []

    for i in range(cv):
        test_idxs = list(range(int((len(X)*(i)/cv)), int((len(X)*(i+1)/cv))))
        
        X_train = X.drop(index=test_idxs)
        y_train = efs_time.drop(index=test_idxs)
        
        X_test = X.iloc[test_idxs]
        y_test = efs_time.iloc[test_idxs]
        
        if scale:
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)
        
        model_copy = clone(model)
        
        model_copy.fit(X_train, y_train)
        y_pred = model_copy.predict(X_test)
        
        cv_scores.append(mean_absolute_error(y_test, y_pred))
    
    return np.mean(cv_scores)

In [8]:
model = LGBMRegressor(random_state=RANDOM_STATE, verbose=-1)
cv_score = cross_validate_efs_time(model, X, efs_time, cv=5)

print(f'CV Score: {cv_score:.4f}')

CV Score: 16.8865


## Hiper parameters

In [9]:
def objective(trial):
    num_leaves = trial.suggest_int('num_leaves', 20, 150)
    max_depth = trial.suggest_int('max_depth', 2, 50)
    learning_rate = trial.suggest_float('learning_rate', 1e-4, 0.5, log=True)
    n_estimators = trial.suggest_int('n_estimators', 100, 2000)
    min_child_samples = trial.suggest_int('min_child_samples', 5, 100)
    subsample = trial.suggest_float('subsample', 0.5, 1.0)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.5, 1.0)
    reg_alpha = trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True)
    reg_lambda = trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True)

    model = LGBMRegressor(
        num_leaves=num_leaves,
        max_depth=max_depth,
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        min_child_samples=min_child_samples,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        reg_alpha=reg_alpha,
        reg_lambda=reg_lambda,
        random_state=RANDOM_STATE,
        verbose=-1,
    )

    cv_score = cross_validate_efs_time(model, X, efs_time, cv=5)

    return cv_score

In [10]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

[I 2025-01-12 10:11:51,394] A new study created in memory with name: no-name-e5dc54fd-3f1d-4445-ac0e-3ab93ca4af17
[I 2025-01-12 10:12:07,122] Trial 0 finished with value: 16.7084047098786 and parameters: {'num_leaves': 50, 'max_depth': 16, 'learning_rate': 0.05129167200639864, 'n_estimators': 442, 'min_child_samples': 44, 'subsample': 0.7335527845064131, 'colsample_bytree': 0.5630653716876615, 'reg_alpha': 2.187340888115115, 'reg_lambda': 0.0015006112611769082}. Best is trial 0 with value: 16.7084047098786.
[I 2025-01-12 10:12:36,704] Trial 1 finished with value: 17.236936453389088 and parameters: {'num_leaves': 143, 'max_depth': 13, 'learning_rate': 0.11289912370842253, 'n_estimators': 551, 'min_child_samples': 82, 'subsample': 0.7091357291807747, 'colsample_bytree': 0.9352471946269185, 'reg_alpha': 0.003978384044123794, 'reg_lambda': 0.37481816249401156}. Best is trial 0 with value: 16.7084047098786.
[I 2025-01-12 10:12:59,754] Trial 2 finished with value: 17.342467847511706 and para

KeyboardInterrupt: 

In [None]:
study.best_params