In [17]:
from src.utilities import create_entity, predict_and_save, split_data, get_method_name
from src.preprocess import process_missing_values, main_preprocess
from sklearn.model_selection import train_test_split
from sksurv.metrics import concordance_index_ipcw
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.ensemble import GradientBoostingSurvivalAnalysis, RandomSurvivalForest
import lightgbm as lgb

import warnings
import logging

# Silence noisy third-party output: cap the Featuretools entityset logger at
# ERROR and ignore several warning categories for the rest of the session.
logging.getLogger('featuretools.entityset').setLevel(logging.ERROR)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", message=".*Ill-conditioned matrix.*")

# Build the raw dataset through the project helper.
data = create_entity()

# Per-model flags; presumably they decide which models' predictions get saved.
# They are not read anywhere in this cell — TODO confirm against later cells.
GLOBAL = dict(
    save_cox=False,
    save_xgb=True,
    save_lgbm=False,
    save_rsf=True,
)

PARAMS = dict(
    # Fraction of the labelled rows kept for training (test_size = 1 - size).
    size=0.7,
    # Possible: ["CYTOGENETICS", "HB/PLT", "logMONOCYTES", "logWBC", "logANC"]
    clinical=["CYTOGENETICS"],
    # Possible: ["GENE", "EFFECT", "ALT", "REF", "END-START"]
    molecular=["GENE"],
    # Possible: ["featuretools", "gpt"]
    merge=["featuretools", "gpt"],
    rsf=dict(
        n_estimators=150,       # number of trees in the forest
        min_samples_split=50,   # minimum samples required to split a node
        min_samples_leaf=20,    # minimum samples per leaf
        max_features="sqrt",    # random feature subset considered per split
        n_jobs=-1,              # use every available core
    ),
)

# Run the project preprocessing with the selected clinical / molecular / merge options.
data = main_preprocess(
    data,
    PARAMS['clinical'],
    PARAMS['molecular'],
    PARAMS['merge'],
)
X, X_eval, y = split_data(data)

# Show the resulting feature names (a quick visual check of what went in).
print(X.columns)

# Hold out (1 - size) of the labelled rows with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=(1 - PARAMS['size']),
    random_state=42,
)

# Handle missing values on all three splits via the project helper (median imputation).
X_train, X_test, X_eval = process_missing_values(
    X_train,
    X_test,
    X_eval,
    method="impute",
    strategy="median",
)

(14024, 12)
(14024, 15)
(14024, 17)
Index(['gene_ASXL1', 'gene_BCOR', 'gene_BCORL1', 'gene_BRCC3', 'gene_CBL',
       'gene_CEBPA', 'gene_CSF3R', 'gene_CTCF', 'gene_CUX1', 'gene_DDX41',
       ...
       'SKEW(molecular.START)', 'SKEW(molecular.VAF)', 'STD(molecular.DEPTH)',
       'STD(molecular.END)', 'STD(molecular.START)', 'STD(molecular.VAF)',
       'SUM(molecular.DEPTH)', 'SUM(molecular.END)', 'SUM(molecular.START)',
       'SUM(molecular.VAF)'],
      dtype='object', length=102)


In [None]:
# Tag that spells out every RSF hyperparameter as key=value, joined by "_".
rsf_params_method = "_".join(
    f"{name}={value}" for name, value in PARAMS['rsf'].items()
)

# Random survival forest configured from PARAMS['rsf'], with a fixed seed.
rsf = RandomSurvivalForest(
    n_estimators=PARAMS['rsf']['n_estimators'],
    min_samples_split=PARAMS['rsf']['min_samples_split'],
    min_samples_leaf=PARAMS['rsf']['min_samples_leaf'],
    max_features=PARAMS['rsf']['max_features'],
    n_jobs=PARAMS['rsf']['n_jobs'],
    random_state=42,
)
rsf.fit(X_train, y_train)

# IPCW concordance index truncated at tau=7. y_train is passed as the first
# argument in both calls; element [0] of the returned tuple is the index itself.
rsf_cindex_train = concordance_index_ipcw(
    y_train, y_train, rsf.predict(X_train), tau=7
)[0]
rsf_cindex_test = concordance_index_ipcw(
    y_train, y_test, rsf.predict(X_test), tau=7
)[0]

print(f"Random Survival Forest Model Concordance Index IPCW on train: {rsf_cindex_train:.3f}")
print(f"Random Survival Forest Model Concordance Index IPCW on test: {rsf_cindex_test:.3f}")

# Tag carrying both scores rounded to three decimals.
rsf_score_method = f"score_{rsf_cindex_train:.3f}_{rsf_cindex_test:.3f}"

In [15]:
import numpy as np
import pandas as pd
from itertools import product
from sksurv.ensemble import RandomSurvivalForest
from sksurv.metrics import concordance_index_ipcw
from sksurv.util import Surv

# Exhaustive grid search over RandomSurvivalForest hyperparameters.
#
# Every combination in `param_grid` is fitted on (X_train, y_train) and scored
# with the IPCW concordance index (tau=7) on both the training and test splits.
# One row per combination is collected in `results` and turned into
# `df_results`, sorted by test score (best first).
#
# NOTE(review): combinations are ranked on the same held-out split that is
# reported as the test score, so the best `cindex_test` is optimistically
# biased. Consider a separate validation split or cross-validation.

# Step 1: hyperparameter grid to explore.
param_grid = {
    "n_estimators": [200, 300],  # number of trees
    "max_depth": [2, 3],  # maximum depth of each tree
    "min_samples_split": [2, 5, 10, 30, 50, 70],  # minimum samples to split a node
    "min_samples_leaf": [2, 5, 10, 30, 50, 70],  # minimum samples per leaf
    "max_features": [None],  # number of features considered at each split
}

# Step 2: fit and score every combination.
results = []

for params in product(*param_grid.values()):
    param_dict = dict(zip(param_grid.keys(), params))
    print(f"🔄 Entraînement avec {param_dict}...")

    # The grid keys are exactly the estimator's keyword names, so the
    # combination can be forwarded as-is.
    rsf = RandomSurvivalForest(
        **param_dict,
        n_jobs=-1,  # use every CPU core
        random_state=42,
    )
    rsf.fit(X_train, y_train)

    # y_train is the first argument in both calls; [0] is the index itself.
    rsf_cindex_train = concordance_index_ipcw(y_train, y_train, rsf.predict(X_train), tau=7)[0]
    rsf_cindex_test = concordance_index_ipcw(y_train, y_test, rsf.predict(X_test), tau=7)[0]

    print(f"✅ C-index (train) = {rsf_cindex_train:.3f}, C-index (test) = {rsf_cindex_test:.3f}")

    # Record the *whole* combination. Previously `max_depth` was left out, so
    # rows that differed only by depth were indistinguishable in the results.
    results.append({
        **param_dict,
        "cindex_train": rsf_cindex_train,
        "cindex_test": rsf_cindex_test,
    })

# Step 3: tabulate, best test score first.
df_results = pd.DataFrame(results)
df_results = df_results.sort_values(by="cindex_test", ascending=False)

🔄 Entraînement avec {'n_estimators': 200, 'max_depth': 2, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': None}...
✅ C-index (train) = 0.697, C-index (test) = 0.675
🔄 Entraînement avec {'n_estimators': 200, 'max_depth': 2, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': None}...
✅ C-index (train) = 0.698, C-index (test) = 0.677
🔄 Entraînement avec {'n_estimators': 200, 'max_depth': 2, 'min_samples_split': 2, 'min_samples_leaf': 10, 'max_features': None}...
✅ C-index (train) = 0.698, C-index (test) = 0.678
🔄 Entraînement avec {'n_estimators': 200, 'max_depth': 2, 'min_samples_split': 2, 'min_samples_leaf': 30, 'max_features': None}...
✅ C-index (train) = 0.700, C-index (test) = 0.678
🔄 Entraînement avec {'n_estimators': 200, 'max_depth': 2, 'min_samples_split': 2, 'min_samples_leaf': 50, 'max_features': None}...
✅ C-index (train) = 0.700, C-index (test) = 0.681
🔄 Entraînement avec {'n_estimators': 200, 'max_depth': 2, 'min_samples_split': 2, 'min_samples_le

In [16]:
# Persist the grid-search table to a CSV in the current working directory,
# without writing the DataFrame index as a column.
df_results.to_csv('rsf_hyperparameter_tuning_results3.csv', index=False)

In [8]:
# Display the grid-search results table.
# NOTE(review): the stored output below lists n_estimators of 50/100 and
# max_features of sqrt/log2, none of which are in the grid defined in this
# notebook, so it appears to come from an earlier run — re-execute to refresh.
df_results

Unnamed: 0,n_estimators,min_samples_split,min_samples_leaf,max_features,cindex_train,cindex_test
20,50,20,2,,0.857146,0.711642
11,50,10,2,,0.880996,0.711488
38,100,10,2,,0.883259,0.711273
47,100,20,2,,0.859483,0.711166
2,50,5,2,,0.898407,0.710305
...,...,...,...,...,...,...
3,50,5,5,sqrt,0.856244,0.696440
22,50,20,5,log2,0.819768,0.695889
19,50,20,2,log2,0.832458,0.695834
13,50,10,5,log2,0.848412,0.694599


In [57]:
import pandas as pd
import numpy as np
import shap
import itertools
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sksurv.ensemble import GradientBoostingSurvivalAnalysis
from sksurv.util import Surv

# Step 1: generate pairwise interaction features.
def generate_interactions(df, max_combinations=2):
    """
    Build interaction features from the columns of ``df``.

    For every unordered pair of columns ``(f1, f2)``, in column order, three
    features are created: ``{f1}_mul_{f2}`` (product), ``{f1}_add_{f2}`` (sum)
    and ``{f1}_div_{f2}`` (``f1 / (f2 + 1e-6)``; the epsilon avoids dividing by
    an exact zero). In addition, one ``log_{col}`` feature (``log(1 + col)``)
    is created for *every* column.

    Compared with the previous implementation, the log feature is computed
    once per column instead of once per pair, and the last column is no longer
    skipped. The log columns are appended after all pairwise columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input features. Columns are expected to be numeric. ``df`` is not
        modified.
    max_combinations : int, default 2
        Number of features combined per interaction. Only pairwise
        interactions are defined, so the value must be 2.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original columns followed by the generated
        ones, on the same index as ``df``.

    Raises
    ------
    ValueError
        If ``max_combinations`` is not 2.
    """
    if max_combinations != 2:
        raise ValueError(
            "generate_interactions only supports pairwise interactions "
            f"(max_combinations=2), got {max_combinations!r}"
        )

    # Collect everything in a dict and concatenate once: adding columns one by
    # one to a DataFrame is slow and fragments it.
    new_features = {}

    for f1, f2 in itertools.combinations(df.columns, 2):
        new_features[f"{f1}_mul_{f2}"] = df[f1] * df[f2]  # product
        new_features[f"{f1}_add_{f2}"] = df[f1] + df[f2]  # sum
        new_features[f"{f1}_div_{f2}"] = df[f1] / (df[f2] + 1e-6)  # guarded ratio

    # Unary transform: exactly one per column, including the last one.
    for col in df.columns:
        new_features[f"log_{col}"] = np.log1p(df[col])

    if not new_features:
        # No columns at all: nothing to add.
        return df.copy()

    return pd.concat([df, pd.DataFrame(new_features, index=df.index)], axis=1)

# Step 2: rank candidate features with SHAP and keep the strongest ones.
def select_best_interactions(X_train, y_train, model_params, top_k=10):
    """
    Keep the ``top_k`` features with the highest mean absolute SHAP value.

    The candidates are first narrowed to at most 100 columns, ranked by the
    absolute value of ``X_train.corrwith(y_train["time"])``. A
    ``GradientBoostingSurvivalAnalysis`` model built from ``model_params`` is
    fitted on that subset, and SHAP values of its predictions are computed on
    the same rows.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Candidate features.
    y_train : pandas.DataFrame
        Must expose an ``"event"`` and a ``"time"`` column.
    model_params : dict
        Keyword arguments forwarded to ``GradientBoostingSurvivalAnalysis``.
    top_k : int, default 10
        Number of features to retain.

    Returns
    -------
    tuple
        ``(selected_frame, feature_importance, shap_values)``: the retained
        columns of the narrowed frame, the mean |SHAP| per narrowed feature
        sorted in descending order, and the raw SHAP explanation object.
    """
    survival_target = Surv.from_dataframe(event="event", time="time", data=y_train)

    # Cheap pre-filter so the SHAP computation stays tractable.
    abs_corr = X_train.corrwith(y_train["time"]).abs()
    kept_columns = abs_corr.nlargest(100).index
    X_reduced = X_train[kept_columns]

    gbsa = GradientBoostingSurvivalAnalysis(**model_params)
    gbsa.fit(X_reduced, survival_target)

    def _risk_scores(batch):
        # SHAP may hand over bare ndarrays; restore the column labels first.
        if isinstance(batch, np.ndarray):
            batch = pd.DataFrame(batch, columns=X_reduced.columns)
        return gbsa.predict(batch)

    # A plain callable is given to shap.Explainer rather than the fitted model
    # (this notebook's run log reports "PermutationExplainer").
    explainer = shap.Explainer(_risk_scores, X_reduced)
    shap_values = explainer(X_reduced)

    # Global importance: mean absolute SHAP value per feature, descending.
    mean_abs_shap = np.abs(shap_values.values).mean(axis=0)
    feature_importance = pd.Series(mean_abs_shap, index=X_reduced.columns)
    feature_importance = feature_importance.sort_values(ascending=False)

    winners = feature_importance.index[:top_k].tolist()
    return X_reduced[winners], feature_importance, shap_values

# Step 3: full pipeline (generate candidates, then select with SHAP).
def process_features_with_interactions(X_train, y_train, model_params, max_combinations=2, top_k=10):
    """
    Generate interaction features and keep the best ones according to SHAP.

    Chains ``generate_interactions(X_train, max_combinations)`` with
    ``select_best_interactions(..., y_train, model_params, top_k)`` and prints
    a progress message before each stage.

    Returns
    -------
    tuple
        Exactly what ``select_best_interactions`` returns:
        ``(selected_frame, feature_importance, shap_values)``.
    """
    print("🛠️ Génération des interactions...")
    candidate_features = generate_interactions(X_train, max_combinations)

    print(f"🔍 Sélection des {top_k} meilleures interactions avec SHAP...")
    return select_best_interactions(candidate_features, y_train, model_params, top_k)



In [62]:
from src.utilities import create_entity, predict_and_save, split_data, get_method_name
from src.preprocess import process_missing_values, main_preprocess
from sklearn.model_selection import train_test_split
from sksurv.metrics import concordance_index_ipcw
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.ensemble import GradientBoostingSurvivalAnalysis, RandomSurvivalForest
import lightgbm as lgb

import warnings
import logging

# Silence noisy third-party output: cap the Featuretools entityset logger at
# ERROR and ignore several warning categories for the rest of the session.
logging.getLogger('featuretools.entityset').setLevel(logging.ERROR)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", message=".*Ill-conditioned matrix.*")

# Build the raw dataset through the project helper.
data = create_entity()

# Per-model flags; presumably they decide which models' predictions get saved.
# They are not read anywhere in this cell — TODO confirm against later cells.
GLOBAL = dict(
    save_cox=False,
    save_xgb=True,
    save_lgbm=False,
    save_rsf=True,
)

PARAMS = dict(
    # Fraction of the labelled rows kept for training (test_size = 1 - size).
    size=0.7,
    # Possible: ["CYTOGENETICS", "HB/PLT", "logMONOCYTES", "logWBC", "logANC"]
    clinical=["CYTOGENETICS"],
    # Possible: ["GENE", "EFFECT", "ALT", "REF", "END-START"] (none selected here)
    molecular=[],
    # Possible: ["featuretools", "gpt"] (none selected here)
    merge=[],
    # Stored under the "xgb" key; a later cell forwards it as `model_params`.
    xgb=dict(
        max_depth=2,
        learning_rate=0.05,
        n_estimators=260,
        subsample=1,
        max_features=None,
        random_state=42,
    ),
)

# Run the project preprocessing with the selected clinical / molecular / merge options.
data = main_preprocess(
    data,
    PARAMS['clinical'],
    PARAMS['molecular'],
    PARAMS['merge'],
)
X, X_eval, y = split_data(data)

# Show the resulting feature names (a quick visual check of what went in).
print(X.columns)

# Hold out (1 - size) of the labelled rows with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=(1 - PARAMS['size']),
    random_state=42,
)

# Handle missing values on all three splits via the project helper (median imputation).
X_train, X_test, X_eval = process_missing_values(
    X_train,
    X_test,
    X_eval,
    method="impute",
    strategy="median",
)

Index(['BM_BLAST', 'WBC', 'ANC', 'MONOCYTES', 'HB', 'PLT', 'num_subclones',
       'sex', 'avg_chromosomes', 'total_mitoses', 'num_translocations',
       'num_deletions', 'num_inversions', 'num_duplications', 'num_additions',
       'num_monosomies', 'num_trisomies', 'complexity_score'],
      dtype='object')


In [64]:
# Settings for the selection model, taken from the "xgb" entry of PARAMS.
model_params = PARAMS['xgb']

# Generate pairwise interactions on the training split and keep the 10 best.
# Both inputs are wrapped in DataFrames; the features get X's column labels.
X_train_new, feature_importance, shap_values = process_features_with_interactions(
    pd.DataFrame(X_train, columns=X.columns),
    pd.DataFrame(y_train),
    model_params,
    max_combinations=2,
    top_k=10,
)

print("✅ Nouvelles variables sélectionnées :")
print(X_train_new.head())

🛠️ Génération des interactions...
🔍 Sélection des 10 meilleures interactions avec SHAP...


PermutationExplainer explainer: 2222it [01:52, 17.90it/s]                          


✅ Nouvelles variables sélectionnées :
   BM_BLAST_add_WBC  BM_BLAST_div_HB  HB_mul_PLT  HB_div_num_trisomies  \
0               6.5         0.329670      1365.0          9.100000e+06   
1              19.0         1.363636       495.0          1.100000e+07   
2               8.7         0.869565       910.8          6.900000e+06   
3               4.0         0.200000      1780.0          1.000000e+07   
4              12.0         1.000000       530.0          4.999998e+00   

   PLT_add_total_mitoses  PLT_div_num_monosomies  HB_div_num_monosomies  \
0                  150.0            1.500000e+08           9.100000e+06   
1                   65.0            4.500000e+07           1.100000e+07   
2                  162.0            1.320000e+08           6.900000e+06   
3                  197.0            1.779998e+02           9.999990e+00   
4                   78.0            2.649999e+01           4.999998e+00   

   BM_BLAST_add_MONOCYTES  PLT_div_complexity_score  sex_add_num_t

In [67]:
# Mean absolute SHAP value per candidate feature, sorted in descending order
# (second element returned by process_features_with_interactions).
feature_importance

BM_BLAST_add_WBC                      0.168483
BM_BLAST_div_HB                       0.117520
HB_mul_PLT                            0.104916
HB_div_num_trisomies                  0.103098
PLT_add_total_mitoses                 0.060907
                                        ...   
num_inversions_add_num_monosomies     0.000000
BM_BLAST_add_num_additions            0.000000
BM_BLAST_add_num_trisomies            0.000000
num_additions_add_num_trisomies       0.000000
num_deletions_add_complexity_score    0.000000
Length: 100, dtype: float64

In [48]:
# Display the frame of selected interaction features (first element returned
# by process_features_with_interactions).
X_train_new

Unnamed: 0,BM_BLAST_add_WBC,BM_BLAST_div_HB,HB_mul_PLT,HB_div_num_trisomies,PLT_div_num_monosomies,HB_div_num_monosomies,PLT_add_total_mitoses,PLT_div_complexity_score,BM_BLAST_add_MONOCYTES,BM_BLAST_add_ANC
0,6.50,0.329670,1365.0,9.100000e+06,1.500000e+08,9.100000e+06,150.0,1.500000e+08,4.050,3.840
1,19.00,1.363636,495.0,1.100000e+07,4.500000e+07,1.100000e+07,65.0,4.500000e+07,15.000,17.000
2,8.70,0.869565,910.8,6.900000e+06,1.320000e+08,6.900000e+06,162.0,1.319999e+02,6.170,6.945
3,4.00,0.200000,1780.0,1.000000e+07,1.779998e+02,9.999990e+00,197.0,1.780000e+08,2.000,3.000
4,12.00,1.000000,530.0,4.999998e+00,2.649999e+01,4.999998e+00,78.0,1.766666e+01,10.000,11.000
...,...,...,...,...,...,...,...,...,...,...
2216,5.80,0.405405,606.8,7.400000e+06,8.200000e+07,7.400000e+06,102.0,8.199992e+01,3.220,4.400
2217,11.90,0.707071,2316.6,9.900000e+06,2.340000e+08,9.900000e+06,254.0,2.339998e+02,7.340,9.890
2218,3.30,0.000000,1896.6,1.090000e+07,1.740000e+08,1.090000e+07,194.0,1.739998e+02,0.200,1.730
2219,8.00,0.000000,2000.9,1.070000e+07,1.870000e+08,1.070000e+07,207.0,1.870000e+08,2.000,4.080


In [39]:
# Number of unordered column pairs in X.
# NOTE(review): this is not the cell's last expression, so the value is
# computed and then discarded — nothing is displayed.
len(list(itertools.combinations(X.columns, 2)))

# Rebind the training split as DataFrames (features labelled with X's columns).
# NOTE(review): this overwrites X_train / y_train in the kernel, so later cells
# see these DataFrames rather than the objects produced by the split cell.
X_train = pd.DataFrame(X_train, columns=X.columns)
y_train = pd.DataFrame(y_train)

# Rank features by the absolute value of corrwith against the "time" column
# and keep at most the 100 highest.
top_features = X_train.corrwith(y_train["time"]).abs().nlargest(100).index

In [66]:
# Feature names ordered by decreasing absolute correlation with "time"
# (at most 100; fewer when X has fewer columns).
top_features

Index(['PLT', 'BM_BLAST', 'HB', 'num_monosomies', 'complexity_score',
       'num_trisomies', 'num_additions', 'num_deletions', 'num_translocations',
       'num_subclones', 'MONOCYTES', 'WBC', 'sex', 'num_duplications', 'ANC',
       'avg_chromosomes', 'num_inversions', 'total_mitoses'],
      dtype='object')