In [None]:
from src.utilities import split_data
from src.preprocess import process_missing_values, main_preprocess, create_entity
from sklearn.model_selection import train_test_split
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.ensemble import GradientBoostingSurvivalAnalysis, RandomSurvivalForest
import lightgbm as lgb
from sksurv.metrics import concordance_index_ipcw
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import make_scorer

import warnings
import logging

# Régler le logger de Featuretools au niveau ERROR
logging.getLogger('featuretools.entityset').setLevel(logging.ERROR)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore",message=".*Ill-conditioned matrix.*")

# Per-model switches: every entry carries the same three boolean flags
# ("run", "save", "shap"). Only the "xgb" entry has any flag enabled.
GLOBAL = {
    "cox": dict(run=False, save=False, shap=False),
    "xgb": dict(run=True, save=False, shap=True),
    "lgbm": dict(run=False, save=False, shap=False),
    "rsf": dict(run=False, save=False, shap=False),
}

# Experiment configuration. The whole dict is handed to create_entity and
# main_preprocess; "size" and "impute" are also read directly by the
# train/test split and the missing-value step further down.
PARAMS = {
    "EDA": False,
    "size": 0.7,  # the hold-out split uses test_size = 1 - size
    "impute": dict(strategy="median", sex=False),
    # Disabled option: "outliers": {"threshold": 0.01, "multiplier": 1.5}

    # Clinical features. Other possible values:
    #   "CYTOGENETICS", "HB/PLT", "logMONOCYTES", "logWBC", "logANC",
    #   "BM_BLAST+WBC", "BM_BLAST/HB", "HB*PLT", "HB/num_trisomies"
    "clinical": ["CYTOGENETICS"],

    # Molecular features. Possible values:
    #   "GENE", "EFFECT", "ALT", "REF", "END-START"
    "molecular": ["GENE"],

    # Merge strategies. Possible values: "featuretools", "gpt"
    "merge": ["featuretools", "gpt"],

    # Extra annotation paths, all currently disabled. Candidates:
    #   ['cadd', 'phred'], ['cadd', 'rawscore'], ['cadd', 'consequence'],
    #   ['cadd', 'bstatistic'], ['cadd', 'gerp', 'n'],
    #   ['cadd', 'phast_cons', 'mammalian'], ['cadd', 'phylop', 'mammalian'],
    #   ['snpeff', 'putative_impact'], ['snpeff', 'rank'], ['snpeff', 'total'],
    #   ['cadd', 'exon'], ['cadd', 'cds', 'rel_cds_pos']
    "additional": [],

    # NOTE(review): despite the "xgb" key, these names look like
    # GradientBoostingSurvivalAnalysis keyword arguments - confirm at the call site.
    "xgb": dict(
        loss='coxph',
        max_depth=2,
        learning_rate=0.05,
        n_estimators=335,
        subsample=0.55,
        max_features="sqrt",
        min_samples_split=3,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0,
        min_impurity_decrease=0,
        dropout_rate=0,
        warm_start=False,
        ccp_alpha=0,
        random_state=126,
    ),
    "lgbm": dict(
        max_depth=2,
        learning_rate=0.05,
        verbose=0,
    ),
    "rsf": dict(
        n_estimators=200,      # number of trees in the forest
        max_depth=None,
        min_samples_split=50,  # minimum number of samples required to split a node
        min_samples_leaf=20,   # minimum number of samples per leaf
        max_features='sqrt',   # random feature selection
        n_jobs=-1,             # use every available core
    ),
}


# Build the entity from PARAMS and run the main preprocessing on it.
data = main_preprocess(create_entity(PARAMS), PARAMS)

# split_data yields the feature frame, a second frame kept as X_eval and the target.
# NOTE(review): X_eval is presumably the unlabeled evaluation set - confirm in src.utilities.
X, X_eval, y = split_data(data)

print(X.columns)

# Hold-out split: PARAMS['size'] is the training share, so the test share is its complement.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=(1 - PARAMS['size']),
    random_state=42,
)

# Missing values are handled after the split, using the settings in PARAMS["impute"].
X_train, X_test, X_eval = process_missing_values(
    X_train,
    X_test,
    X_eval,
    X.columns,
    **PARAMS["impute"],
)

# Hyperparameter grid: every list holds a single value, so the search
# evaluates exactly one candidate.
param_grid = dict(
    max_depth=[2],
    learning_rate=[0.05],
    n_estimators=[450],
    subsample=[0.55],
    max_features=['sqrt'],
)

# Keyword arguments for the cross-validation splitter.
kfold_params = dict(
    n_splits=5,
    shuffle=True,
    random_state=26,
)

# Base estimator whose hyperparameters the grid search varies.
model = GradientBoostingSurvivalAnalysis(random_state=26)

# Cross-validation splitter built from kfold_params (5 shuffled folds).
cv = KFold(**kfold_params)



# Scoring function based on the IPCW concordance index.
def cindex_scorer(y_true, y_pred, tau=7, survival_train=None):
    """Return the IPCW concordance index of ``y_pred`` against ``y_true``.

    Parameters
    ----------
    y_true : survival outcome of the samples being scored, in the format
        accepted by ``concordance_index_ipcw``.
    y_pred : predicted risk scores for the same samples.
    tau : truncation time forwarded to ``concordance_index_ipcw``. Defaults
        to 7, the value that was previously hard-coded.
    survival_train : optional survival outcome used as the censoring
        reference. When omitted, ``y_true`` is used, as before.

    Returns
    -------
    The first element of the tuple returned by ``concordance_index_ipcw``,
    i.e. the concordance index itself.
    """
    # NOTE(review): scikit-learn calls a make_scorer score function with only
    # (y_true, y_pred), so the scored fold doubles as the censoring reference
    # unless survival_train is supplied explicitly.
    if survival_train is None:
        survival_train = y_true
    return concordance_index_ipcw(survival_train, y_true, y_pred, tau=tau)[0]

# Wrap cindex_scorer for scikit-learn: the scorer calls it as score_func(y_true, y_pred) on the estimator's predictions, and higher scores are better.
scorer = make_scorer(score_func=cindex_scorer, greater_is_better=True)


# Cross-validated search over param_grid, scored with the custom scorer
# on the KFold splitter, using all cores.
grid_search = GridSearchCV(
    model,
    param_grid,
    scoring=scorer,
    cv=cv,
    n_jobs=-1,
    verbose=1,
)

# Run the search on the training split only.
grid_search.fit(X_train, y_train)

print("Meilleurs paramètres :", grid_search.best_params_)
print("Meilleur C-index :", grid_search.best_score_)

Index(['gene_ASXL1', 'gene_BCOR', 'gene_BCORL1', 'gene_BRCC3', 'gene_CBL',
       'gene_CEBPA', 'gene_CSF3R', 'gene_CTCF', 'gene_CUX1', 'gene_DDX41',
       ...
       'SKEW(molecular.START)', 'SKEW(molecular.VAF)', 'STD(molecular.DEPTH)',
       'STD(molecular.END)', 'STD(molecular.START)', 'STD(molecular.VAF)',
       'SUM(molecular.DEPTH)', 'SUM(molecular.END)', 'SUM(molecular.START)',
       'SUM(molecular.VAF)'],
      dtype='object', length=102)
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Meilleurs paramètres : {'learning_rate': 0.05, 'max_depth': 2, 'max_features': 'sqrt', 'n_estimators': 450, 'subsample': 0.55}
Meilleur C-index : 0.7153507201792274


In [30]:
# Print the mean and standard deviation of the test score for every candidate.
# NOTE(review): the output saved under this cell lists several candidates
# (learning_rate 0.049, n_estimators 100) while the grid defined earlier holds
# a single candidate - presumably a stale run; re-execute from a fresh kernel.
for params, mean_score, std_score in zip(
    grid_search.cv_results_['params'],
    grid_search.cv_results_['mean_test_score'],
    grid_search.cv_results_['std_test_score'],
):
    print(f"Paramètres: {params} => Score moyen: {mean_score:.4f} ± {std_score:.4f}")

Paramètres: {'learning_rate': 0.049, 'max_depth': 2, 'max_features': 'sqrt', 'n_estimators': 100, 'subsample': 0.55} => Score moyen: 0.7058 ± 0.0319
Paramètres: {'learning_rate': 0.049, 'max_depth': 2, 'max_features': 'sqrt', 'n_estimators': 100, 'subsample': 0.54} => Score moyen: 0.7063 ± 0.0321
Paramètres: {'learning_rate': 0.049, 'max_depth': 2, 'max_features': 'sqrt', 'n_estimators': 100, 'subsample': 0.56} => Score moyen: 0.7065 ± 0.0328
Paramètres: {'learning_rate': 0.049, 'max_depth': 2, 'max_features': 'log2', 'n_estimators': 100, 'subsample': 0.55} => Score moyen: 0.7023 ± 0.0313
Paramètres: {'learning_rate': 0.049, 'max_depth': 2, 'max_features': 'log2', 'n_estimators': 100, 'subsample': 0.54} => Score moyen: 0.7023 ± 0.0316
Paramètres: {'learning_rate': 0.049, 'max_depth': 2, 'max_features': 'log2', 'n_estimators': 100, 'subsample': 0.56} => Score moyen: 0.7019 ± 0.0309
Paramètres: {'learning_rate': 0.049, 'max_depth': 2, 'max_features': None, 'n_estimators': 100, 'subsample