# Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_columns', None)
sns.set_style('whitegrid')
sns.set_palette('muted')

from datetime import datetime

from tqdm import tqdm

from sklearn.base import clone

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

import optuna

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
DATA_PATH = '../data/equity-post-HCT-survival-predictions/'
RANDOM_STATE = 54321

# Data

In [3]:
sample_df = pd.read_csv(DATA_PATH + 'sample_submission.csv')
test_df = pd.read_csv(DATA_PATH + 'test.csv')

In [4]:
X = pd.read_pickle(DATA_PATH + 'train_test_split/X_25-12-2024.pkl')
y = pd.read_pickle(DATA_PATH + 'train_test_split/y_25-12-2024.pkl')
efs_time = pd.read_pickle(DATA_PATH + 'train_test_split/efs_time_25-12-2024.pkl')
race_group = pd.read_pickle(DATA_PATH + 'train_test_split/race_group_25-12-2024.pkl')

In [5]:
print(X.shape, y.shape, efs_time.shape, race_group.shape)

(28800, 81) (28800,) (28800,) (28800,)


## Métric

In [6]:
import pandas as pd
import pandas.api.types
import numpy as np
from lifelines.utils import concordance_index

class ParticipantVisibleError(Exception):
    pass


def score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str) -> float:
    """
    >>> import pandas as pd
    >>> row_id_column_name = "id"
    >>> y_pred = {'prediction': {0: 1.0, 1: 0.0, 2: 1.0}}
    >>> y_pred = pd.DataFrame(y_pred)
    >>> y_pred.insert(0, row_id_column_name, range(len(y_pred)))
    >>> y_true = { 'efs': {0: 1.0, 1: 0.0, 2: 0.0}, 'efs_time': {0: 25.1234,1: 250.1234,2: 2500.1234}, 'race_group': {0: 'race_group_1', 1: 'race_group_1', 2: 'race_group_1'}}
    >>> y_true = pd.DataFrame(y_true)
    >>> y_true.insert(0, row_id_column_name, range(len(y_true)))
    >>> score(y_true.copy(), y_pred.copy(), row_id_column_name)
    0.75
    """
    
    del solution[row_id_column_name]
    del submission[row_id_column_name]
    
    event_label = 'efs'
    interval_label = 'efs_time'
    prediction_label = 'prediction'
    for col in submission.columns:
        if not pandas.api.types.is_numeric_dtype(submission[col]):
            raise ParticipantVisibleError(f'Submission column {col} must be a number')
    # Merging solution and submission dfs on ID
    merged_df = pd.concat([solution, submission], axis=1)
    merged_df.reset_index(inplace=True)
    merged_df_race_dict = dict(merged_df.groupby(['race_group']).groups)
    metric_list = []
    for race in merged_df_race_dict.keys():
        # Retrieving values from y_test based on index
        indices = sorted(merged_df_race_dict[race])
        merged_df_race = merged_df.iloc[indices]
        # Calculate the concordance index
        c_index_race = concordance_index(
                        merged_df_race[interval_label],
                        -merged_df_race[prediction_label],
                        merged_df_race[event_label])
        metric_list.append(c_index_race)
    return float(np.mean(metric_list)-np.sqrt(np.var(metric_list)))

In [7]:
def score_wrapper(y_true, y_pred, efs_time, race_group):
    y_pred_dict = {
        'prediction': {i: pred for i, pred in enumerate(y_pred)}
    }        
    y_pred_dict = pd.DataFrame(y_pred_dict)
    y_pred_dict.insert(0, 'id', range(len(y_pred_dict)))
    
    y_true_dict = {
        'efs': {i: y for i, y in enumerate(y_true.values)},
        'efs_time': {i: t for i, t in enumerate(efs_time.values)},
        'race_group': {i: r for i, r in enumerate(race_group.values)},
    }
    y_true_dict = pd.DataFrame(y_true_dict)
    y_true_dict.insert(0, 'id', range(len(y_true_dict)))
    
    return score(y_true_dict.copy(), y_pred_dict.copy(), 'id')

def cross_validate(model, X, y, cv=10, scale=False):
    cv_scores = []
    
    for i in range(cv):
        test_idxs = list(range(int((len(X)*(i)/cv)), int((len(X)*(i+1)/cv))))
        
        X_train = X.drop(index=test_idxs)
        y_train = y.drop(index=test_idxs)
        
        X_test = X.iloc[test_idxs]
        y_test = y.iloc[test_idxs]
        
        if scale:
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)
        
        model_copy = clone(model)
        
        model_copy.fit(X_train, y_train)
        y_pred = model_copy.predict(X_test)
        
        cv_scores.append(score_wrapper(
            y_test, 
            y_pred, 
            efs_time.iloc[test_idxs], 
            race_group.iloc[test_idxs]
        ))
    
    return np.mean(cv_scores)

In [8]:
X = X.dropna()
y = y[X.index].copy()

X.reset_index(drop=True, inplace=True)
y.reset_index(drop=True, inplace=True)

In [9]:
print(X.shape, y.shape)

(19280, 81) (19280,)


# Linear Regression

In [10]:
def objective(trial):
    fit_intercept = trial.suggest_categorical('fit_intercept', [True, False])
    use_scaler = trial.suggest_categorical('use_scaler', [True, False])
    
    steps = []
    
    if use_scaler:
        steps.append(('scaler', StandardScaler()))
    
    steps.append(('regressor', LinearRegression(fit_intercept=fit_intercept)))
    
    model = Pipeline(steps)
    
    cv_score = cross_validate(model, X, y, cv=5, scale=True)
    
    return cv_score

In [11]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

[I 2024-12-28 17:28:28,942] A new study created in memory with name: no-name-6cd7bdcd-f542-4911-9cba-073eef87e3a6
[I 2024-12-28 17:28:29,143] Trial 0 finished with value: 0.5776564314806606 and parameters: {'fit_intercept': True, 'use_scaler': False}. Best is trial 0 with value: 0.5776564314806606.
[I 2024-12-28 17:28:29,338] Trial 1 finished with value: 0.5776564314806606 and parameters: {'fit_intercept': False, 'use_scaler': False}. Best is trial 0 with value: 0.5776564314806606.


In [12]:
study.best_params

{'fit_intercept': True, 'use_scaler': False}

# Random Forest

In [13]:
def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 10, 2_000)
    max_depth = trial.suggest_int('max_depth', 2, 50)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 200)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 100)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2'])
    bootstrap = trial.suggest_categorical('bootstrap', [True, False])
    
    model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        bootstrap=bootstrap,
        random_state=RANDOM_STATE,
    )
    
    cv_score = cross_validate(model, X, y, cv=5, scale=True)
    
    return cv_score

In [14]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

[I 2024-12-28 17:28:29,351] A new study created in memory with name: no-name-bed6fbb8-5f7a-4124-bd84-e0622ee34254
[I 2024-12-28 17:29:09,728] Trial 0 finished with value: 0.5826598732021501 and parameters: {'n_estimators': 974, 'max_depth': 13, 'min_samples_split': 22, 'min_samples_leaf': 51, 'max_features': 'sqrt', 'bootstrap': False}. Best is trial 0 with value: 0.5826598732021501.
[I 2024-12-28 17:29:50,750] Trial 1 finished with value: 0.5821276392491364 and parameters: {'n_estimators': 1427, 'max_depth': 30, 'min_samples_split': 131, 'min_samples_leaf': 40, 'max_features': 'sqrt', 'bootstrap': True}. Best is trial 0 with value: 0.5826598732021501.


In [15]:
study.best_params

{'n_estimators': 974,
 'max_depth': 13,
 'min_samples_split': 22,
 'min_samples_leaf': 51,
 'max_features': 'sqrt',
 'bootstrap': False}

# XGBoost

In [16]:
def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 10, 2_000)
    max_depth = trial.suggest_int('max_depth', 2, 50)
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True)
    min_child_weight = trial.suggest_int('min_child_weight', 1, 10)
    subsample = trial.suggest_float('subsample', 0.5, 1.0)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.5, 1.0)
    gamma = trial.suggest_float('gamma', 0, 10)
    reg_alpha = trial.suggest_float('reg_alpha', 1e-5, 1e1, log=True)
    reg_lambda = trial.suggest_float('reg_lambda', 1e-5, 1e1, log=True)

    model = XGBRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        min_child_weight=min_child_weight,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        gamma=gamma,
        reg_alpha=reg_alpha,
        reg_lambda=reg_lambda,
        random_state=RANDOM_STATE,
    )
    
    cv_score = cross_validate(model, X, y, cv=5, scale=True)
    
    return cv_score

In [17]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=200)

[I 2024-12-28 17:29:50,762] A new study created in memory with name: no-name-c623a406-4a74-4cef-a20a-dd61431c34f6
[I 2024-12-28 17:29:56,029] Trial 0 finished with value: 0.5752432221139643 and parameters: {'n_estimators': 805, 'max_depth': 24, 'learning_rate': 7.753605377641606e-05, 'min_child_weight': 9, 'subsample': 0.5793133301050022, 'colsample_bytree': 0.8181374083038078, 'gamma': 3.4805308536162527, 'reg_alpha': 0.3409014620447248, 'reg_lambda': 0.4771370774554879}. Best is trial 0 with value: 0.5752432221139643.
[I 2024-12-28 17:29:58,804] Trial 1 finished with value: 0.5772937036241247 and parameters: {'n_estimators': 878, 'max_depth': 23, 'learning_rate': 0.00485688458147218, 'min_child_weight': 3, 'subsample': 0.9791245218003174, 'colsample_bytree': 0.6263345573622661, 'gamma': 8.421442000091034, 'reg_alpha': 7.011450556354446, 'reg_lambda': 0.3225828228593451}. Best is trial 1 with value: 0.5772937036241247.


In [18]:
study.best_params

{'n_estimators': 878,
 'max_depth': 23,
 'learning_rate': 0.00485688458147218,
 'min_child_weight': 3,
 'subsample': 0.9791245218003174,
 'colsample_bytree': 0.6263345573622661,
 'gamma': 8.421442000091034,
 'reg_alpha': 7.011450556354446,
 'reg_lambda': 0.3225828228593451}

# LGBMReg

In [19]:
def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 10, 2_000)
    max_depth = trial.suggest_int('max_depth', 2, 50)
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True)
    num_leaves = trial.suggest_int('num_leaves', 20, 200)
    min_child_samples = trial.suggest_int('min_child_samples', 5, 100)
    subsample = trial.suggest_float('subsample', 0.5, 1.0)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.5, 1.0)
    reg_alpha = trial.suggest_float('reg_alpha', 1e-5, 1e1, log=True)
    reg_lambda = trial.suggest_float('reg_lambda', 1e-5, 1e1, log=True)

    model = LGBMRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        num_leaves=num_leaves,
        min_child_samples=min_child_samples,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        reg_alpha=reg_alpha,
        reg_lambda=reg_lambda,
        random_state=RANDOM_STATE,
        verbose=-1,
    )
    
    cv_score = cross_validate(model, X, y, cv=5, scale=True)
    
    return cv_score

In [20]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

[I 2024-12-28 17:29:58,817] A new study created in memory with name: no-name-84d91ac9-542b-499a-9427-2ea4e46b1ae5
[I 2024-12-28 17:30:05,333] Trial 0 finished with value: 0.5733731746261619 and parameters: {'n_estimators': 1035, 'max_depth': 4, 'learning_rate': 0.0003768724256125172, 'num_leaves': 177, 'min_child_samples': 34, 'subsample': 0.8655383886726912, 'colsample_bytree': 0.7202293110869937, 'reg_alpha': 0.00014608611264611463, 'reg_lambda': 0.02469946132207104}. Best is trial 0 with value: 0.5733731746261619.
[I 2024-12-28 17:30:26,824] Trial 1 finished with value: 0.5829969046526566 and parameters: {'n_estimators': 492, 'max_depth': 21, 'learning_rate': 0.000505402544611678, 'num_leaves': 99, 'min_child_samples': 10, 'subsample': 0.6099702180241707, 'colsample_bytree': 0.7044962249004738, 'reg_alpha': 0.029051793632351084, 'reg_lambda': 0.07869082942752609}. Best is trial 1 with value: 0.5829969046526566.


In [21]:
study.best_params

{'n_estimators': 492,
 'max_depth': 21,
 'learning_rate': 0.000505402544611678,
 'num_leaves': 99,
 'min_child_samples': 10,
 'subsample': 0.6099702180241707,
 'colsample_bytree': 0.7044962249004738,
 'reg_alpha': 0.029051793632351084,
 'reg_lambda': 0.07869082942752609}