In [8]:
import pandas as pd

# Clinical Data
df = pd.read_csv("data/X_train/clinical_train.csv")
df_eval = pd.read_csv("data/X_test/clinical_test.csv")

# Molecular Data
maf_df = pd.read_csv("data/X_train/molecular_train.csv")
maf_eval = pd.read_csv("data/X_test/molecular_test.csv")

target_df = pd.read_csv("data/target_train.csv")
target_df_test = pd.read_csv("data/random_submission_FRacdcw_v9kP4pP.csv")


In [9]:
df = df.merge(target_df, on="ID")

df_maf = maf_df.merge(target_df, on="ID")

In [13]:
import pandas as pd
from lifelines import KaplanMeierFitter
import matplotlib.pyplot as plt

def plot_kaplan_meier(df, group_columns, time_column="OS_YEARS", event_column="OS_STATUS"):
    """
    Trace des courbes de survie Kaplan-Meier pour une ou plusieurs colonnes catégoriques.

    Parameters:
    -----------
    df : pd.DataFrame
        Le DataFrame contenant les données.
    group_columns : list
        Liste des colonnes catégoriques pour lesquelles tracer les courbes Kaplan-Meier.
    time_column : str
        Colonne représentant le temps de survie.
    event_column : str
        Colonne indiquant si l'événement s'est produit (1) ou est censuré (0).
    """
    # Initialisation de Kaplan-Meier Fitter
    kmf = KaplanMeierFitter()

    if not isinstance(group_columns, list):
        group_columns = [group_columns]

    # Pour chaque colonne catégorique spécifiée
    for group_column in group_columns:
        # Liste des catégories uniques dans la colonne courante
        categories = df[group_column].dropna().unique()

        # Initialisation du graphique
        plt.figure(figsize=(10, 6))
        
        # Calcul et tracé de la courbe pour chaque catégorie dans la colonne courante
        for category in categories:
            # Filtrer les données pour la catégorie actuelle
            category_data = df[df[group_column] == category]
            
            # Supprimer les lignes avec des valeurs NaN dans les colonnes de temps et d'événement
            category_data = category_data.dropna(subset=[time_column, event_column])
            
            # Ajuster Kaplan-Meier pour les données de la catégorie
            kmf.fit(category_data[time_column], event_observed=category_data[event_column], label=f"{group_column}: {category}")
            
            # Tracer la courbe de survie
            kmf.plot_survival_function()

        # Ajouter des détails au graphique
        plt.title(f"Kaplan-Meier Survival Curve - {group_column}")
        plt.xlabel("Time (Tenure)")
        plt.ylabel("Survival Probability")
        plt.legend(title=group_column)
        plt.grid(True)
        plt.show()

In [None]:
plot_kaplan_meier(df_maf, "EFFECT")

In [1]:
from autoviz import AutoViz_Class

# Initialiser AutoViz
AV = AutoViz_Class()

# Définir les variables
filename = ""  # Aucun fichier car les données sont fournies directement via `df`
custom_plot_dir = "report/eda/autoviz"  # Répertoire principal pour sauvegarder les graphiques
target_variables = ['OS_YEARS', 'OS_STATUS']  # Liste des variables cibles
dfs_to_analyze = {'clinical': df, 'molecular': df_maf}  # Dictionnaire des DataFrames à analyser

try:
    # Effectuer l'analyse pour chaque DataFrame et chaque variable cible
    for df_name, df_data in dfs_to_analyze.items():  # Parcourir chaque DataFrame
        for target_variable in target_variables:  # Parcourir chaque variable cible
            print(f"Analyse du DataFrame '{df_name}' pour la variable cible '{target_variable}'")
            
            # Exécuter AutoViz pour la variable cible actuelle
            dft = AV.AutoViz(
                filename=filename,
                sep=",",  # Délimiteur utilisé dans les données
                depVar=target_variable,  # Variable cible
                dfte=df_data,  # DataFrame à analyser
                header=0,  # Indique que la première ligne contient les noms des colonnes
                verbose=1,  # Activer les messages détaillés
                lowess=False,  # Désactiver le lissage avec Lowess
                chart_format="html",  # Format de génération des graphiques
                max_rows_analyzed=min([df_data.shape[0], 10**5]),  # Limiter les lignes analysées
                max_cols_analyzed=min([df_data.shape[1], 50]),  # Limiter les colonnes analysées
                save_plot_dir=f"{custom_plot_dir}/{df_name}"  # Répertoire spécifique
            )

            # Importer les bibliothèques nécessaires pour afficher le contenu HTML
            from IPython.core.display import display, HTML
            from pathlib import Path

            # Liste pour stocker les noms des fichiers HTML générés
            file_names = []

            # Parcourir les fichiers HTML générés dans le répertoire spécifique
            for file in Path(f'{custom_plot_dir}/{df_name}/{target_variable}/').glob('*.html'):
                # Extraire le nom de fichier et l'ajouter à la liste
                filename = str(file).split('/')[-1]
                file_names.append(filename)

            # Afficher chaque fichier HTML
            for file_name in file_names:
                # Construire le chemin complet du fichier HTML
                file_path = f'{custom_plot_dir}/{df_name}/{target_variable}/{file_name}'

                # Lire et afficher le contenu HTML
                with open(file_path, 'r') as file:
                    html_content = file.read()
                    display(HTML(html_content))

except Exception as e:
    print(f"Exception: {e}")

KeyboardInterrupt: 

In [None]:
import sweetviz as sv

# Drop les lignes avec nan dans la colonne OS_STATUS

# Générer le rapport Sweetviz
report = sv.analyze(df.dropna(subset=['OS_STATUS']), target_feat='OS_STATUS')

# Sauvegarder le rapport sous forme de fichier HTML
report.show_html('report/eda/sweetviz_report.html')

print("Le rapport Sweetviz a été généré et sauvegardé sous le nom 'sweetviz_report.html'.")

Done! Use 'show' commands to display/save.   |██████████| [100%]   00:00 -> (00:00 left)


Report sweetviz_report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.
Le rapport Sweetviz a été généré et sauvegardé sous le nom 'sweetviz_report.html'.


In [8]:
from src.utilities import create_entity, predict_and_save, split_data, get_method_name
from src.preprocess import process_missing_values, main_preprocess
from sklearn.model_selection import train_test_split
from sksurv.metrics import concordance_index_ipcw
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.ensemble import GradientBoostingSurvivalAnalysis, RandomSurvivalForest
import lightgbm as lgb

import warnings
import logging

# Régler le logger de Featuretools au niveau ERROR
logging.getLogger('featuretools.entityset').setLevel(logging.ERROR)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore",message=".*Ill-conditioned matrix.*")

data = create_entity()

GLOBAL = {
    "save_cox": False,
    "save_xgb": True,
    "save_lgbm": False,
    "save_rsf": True
}

PARAMS = {
    "size": 0.7,
    "clinical": ["CYTOGENETICS"],#["CYTOGENETICS"], # Possible: ["CYTOGENETICS", "HB/PLT", "logMONOCYTES", "logWBC", "logANC"]
    "molecular": ["GENE"],#["END-START"], # Possible: ["GENE", "EFFECT", "ALT", "REF", "END-START"]
    "merge": ["featuretools", "gpt"], # Possible: ["featuretools", "gpt"]
    "xgb": {
        'max_depth': 2,
        'learning_rate': 0.05,
        'n_estimators': 260,
        'subsample': 0.5,
        'max_features': None
    },
    "lgbm": {
        'max_depth': 2,
        'learning_rate': 0.05,
        'verbose': 0
    },
    "rsf": {
    'n_estimators':260,  # Nombre d'arbres dans la forêt
    'max_depth':2,
    'min_samples_split':60,  # Nombre minimum d'échantillons requis pour splitter un nœud
    'min_samples_leaf':40,  # Nombre minimum d'échantillons par feuille
    'max_features':None,  # Sélection aléatoire des features
    'n_jobs':-1,  # Utilisation de tous les cœurs disponibles
    }
}

data = main_preprocess(data, PARAMS['clinical'], PARAMS['molecular'], PARAMS['merge'])
X, X_eval, y = split_data(data)
# Check if there are any columns that are not float or int in X
print(X.columns)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=(1 - PARAMS['size']), random_state=42)

X_train, X_test, X_eval = process_missing_values(X_train, X_test, X_eval, method="impute", strategy="median")
# Ensure all columns in X_train are either float or int

##############################################
# Define the methods used for training
##############################################

size_method = get_method_name("size", PARAMS)
clinical_method = get_method_name("clinical", PARAMS)
molecular_method = get_method_name("molecular", PARAMS)
merge_method = get_method_name("merge", PARAMS)

##############################################
# Fit a CoxPH model
##############################################

# # Initialize and train the Cox Proportional Hazards model
# cox = CoxPHSurvivalAnalysis()
# cox.fit(X_train, y_train)

# # Evaluate the model using Concordance Index IPCW
# cox_cindex_train = concordance_index_ipcw(y_train, y_train, cox.predict(X_train), tau=7)[0]
# cox_cindex_test = concordance_index_ipcw(y_train, y_test, cox.predict(X_test), tau=7)[0]
# print(f"Cox Proportional Hazard Model Concordance Index IPCW on train: {cox_cindex_train:.3f}")
# print(f"Cox Proportional Hazard Model Concordance Index IPCW on test: {cox_cindex_test:.3f}")
# cox_score_method = f"score_{cox_cindex_train:.3f}_{cox_cindex_test:.3f}"

# # Predict and save the results
# if GLOBAL["save_cox"]:
#     predict_and_save(X_eval, cox, method=f"{size_method}-{cox_score_method}-{clinical_method}-{molecular_method}-{merge_method}")


##############################################
# Fit a Gradient Boosting model
##############################################

xgb_params_method = "_".join([(str(key) + "=" + str(PARAMS['xgb'][key])) for key in PARAMS['xgb'].keys()])

xgb = GradientBoostingSurvivalAnalysis(max_features=PARAMS['xgb']['max_features'], subsample=PARAMS['xgb']['subsample'], n_estimators=PARAMS['xgb']['n_estimators'], learning_rate=PARAMS['xgb']['learning_rate'], max_depth=PARAMS['xgb']['max_depth'], random_state=42)
xgb.fit(X_train, y_train)
xgb_cindex_train = concordance_index_ipcw(y_train, y_train, xgb.predict(X_train), tau=7)[0]
xgb_cindex_test = concordance_index_ipcw(y_train, y_test, xgb.predict(X_test), tau=7)[0]
print(f"Gradient Boosting Survival Model Concordance Index IPCW on train: {xgb_cindex_train:.3f}")
print(f"Gradient Boosting Survival Model Concordance Index IPCW on test: {xgb_cindex_test:.3f}")
xgboost_score_method = f"score_{xgb_cindex_train:.3f}_{xgb_cindex_test:.3f}"


(14024, 12)
(14024, 15)
(14024, 17)
Index(['gene_ASXL1', 'gene_BCOR', 'gene_BCORL1', 'gene_BRCC3', 'gene_CBL',
       'gene_CEBPA', 'gene_CSF3R', 'gene_CTCF', 'gene_CUX1', 'gene_DDX41',
       ...
       'SKEW(molecular.START)', 'SKEW(molecular.VAF)', 'STD(molecular.DEPTH)',
       'STD(molecular.END)', 'STD(molecular.START)', 'STD(molecular.VAF)',
       'SUM(molecular.DEPTH)', 'SUM(molecular.END)', 'SUM(molecular.START)',
       'SUM(molecular.VAF)'],
      dtype='object', length=102)
Gradient Boosting Survival Model Concordance Index IPCW on train: 0.756
Gradient Boosting Survival Model Concordance Index IPCW on test: 0.704


In [17]:
X['sous_END_START'].skew()

20.614134246681406

In [None]:
import numpy as np
import pandas as pd
import shap
import random

def predict_function(data):
    if isinstance(data, np.ndarray):
        data = pd.DataFrame(data, columns=X_train.columns)
    return xgb.predict(data)

# 2) Create the SHAP Explainer using that function and a background dataset
explainer = shap.Explainer(predict_function, X_train)

# Assurez-vous que X_train est un DataFrame
if isinstance(X_train, np.ndarray):
    X_train = pd.DataFrame(X_train, columns=X.columns)
shap_values = explainer.shap_values(X_train)


random.random()

PermutationExplainer explainer: 2222it [02:54, 11.99it/s]                          


In [None]:
fig = shap.summary_plot(shap_values, X_train, plot_type="bar", max_display=1000)

In [None]:
fig = shap.summary_plot(shap_values, X_train, plot_type="bar", max_display=1000)

In [11]:
from src.utilities import create_entity, predict_and_save, split_data, get_method_name
from src.preprocess import process_missing_values, main_preprocess
from sklearn.model_selection import train_test_split
from sksurv.metrics import concordance_index_ipcw
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.ensemble import GradientBoostingSurvivalAnalysis
import lightgbm as lgb

import warnings
import logging

# Régler le logger de Featuretools au niveau ERROR
logging.getLogger('featuretools.entityset').setLevel(logging.ERROR)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore",message=".*Ill-conditioned matrix.*")

data = create_entity()

GLOBAL = {
    "save_cox": False,
    "save_xgb": True,
    "save_lgbm": False,
}

PARAMS = {
    "size": 0.7,
    "clinical": ["CYTOGENETICS"],#["CYTOGENETICS"], # Possible: ["CYTOGENETICS", "HB/PLT", "logMONOCYTES", "logWBC", "logANC"]
    "molecular": ["GENE"],#["END-START"], # Possible: ["GENE", "EFFECT", "ALT", "REF", "END-START"]
    "merge": ["featuretools", "gpt"], # Possible: ["featuretools", "gpt"]
    "xgb": {
        'max_depth': 2,
        'learning_rate': 0.05,
        'n_estimators': 260,
        'subsample': 1,
        'max_features': 'sqrt'
    },
    "lgbm": {
        'max_depth': 2,
        'learning_rate': 0.05,
        'verbose': 0
    },
    "rsf": {
    'n_estimators':260,  # Nombre d'arbres dans la forêt
    'max_depth':2,
    'min_samples_split':60,  # Nombre minimum d'échantillons requis pour splitter un nœud
    'min_samples_leaf':40,  # Nombre minimum d'échantillons par feuille
    'max_features':None,  # Sélection aléatoire des features
    'n_jobs':-1,  # Utilisation de tous les cœurs disponibles
    }
}

data = main_preprocess(data, PARAMS['clinical'], PARAMS['molecular'], PARAMS['merge'])
X, X_eval, y = split_data(data)
# Check if there are any columns that are not float or int in X
print(X.columns)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=(1 - PARAMS['size']), random_state=42)

X_train, X_test, X_eval = process_missing_values(X_train, X_test, X_eval, method="impute", strategy="median")


size_method = get_method_name("size", PARAMS)
clinical_method = get_method_name("clinical", PARAMS)
molecular_method = get_method_name("molecular", PARAMS)
merge_method = get_method_name("merge", PARAMS)

(14024, 12)
(14024, 15)
(14024, 17)
Index(['gene_ASXL1', 'gene_BCOR', 'gene_BCORL1', 'gene_BRCC3', 'gene_CBL',
       'gene_CEBPA', 'gene_CSF3R', 'gene_CTCF', 'gene_CUX1', 'gene_DDX41',
       ...
       'SKEW(molecular.START)', 'SKEW(molecular.VAF)', 'STD(molecular.DEPTH)',
       'STD(molecular.END)', 'STD(molecular.START)', 'STD(molecular.VAF)',
       'SUM(molecular.DEPTH)', 'SUM(molecular.END)', 'SUM(molecular.START)',
       'SUM(molecular.VAF)'],
      dtype='object', length=102)


In [2]:

xgb_params_method = "_".join([(str(key) + "=" + str(PARAMS['xgb'][key])) for key in PARAMS['xgb'].keys()])

xgb = GradientBoostingSurvivalAnalysis(subsample=PARAMS['xgb']['subsample'], n_estimators=PARAMS['xgb']['n_estimators'], learning_rate=PARAMS['xgb']['learning_rate'], max_depth=PARAMS['xgb']['max_depth'], random_state=42)
xgb.fit(X_train, y_train)
xgb_cindex_train = concordance_index_ipcw(y_train, y_train, xgb.predict(X_train), tau=7)[0]
xgb_cindex_test = concordance_index_ipcw(y_train, y_test, xgb.predict(X_test), tau=7)[0]
print(f"Gradient Boosting Survival Model Concordance Index IPCW on train: {xgb_cindex_train:.3f}")
print(f"Gradient Boosting Survival Model Concordance Index IPCW on test: {xgb_cindex_test:.3f}")
xgboost_score_method = f"score_{xgb_cindex_train:.3f}_{xgb_cindex_test:.3f}"

if GLOBAL["save_xgb"]:
    predict_and_save(X_eval, xgb, method=f"{size_method}-{xgboost_score_method}-{clinical_method}-{molecular_method}-{merge_method}-{xgb_params_method}")


Gradient Boosting Survival Model Concordance Index IPCW on train: 0.756
Gradient Boosting Survival Model Concordance Index IPCW on test: 0.704


In [7]:

from sksurv.ensemble import GradientBoostingSurvivalAnalysis, RandomSurvivalForest
rsf_params_method = "_".join([(str(key) + "=" + str(PARAMS['rsf'][key])) for key in PARAMS['rsf'].keys()])

rsf = RandomSurvivalForest(min_samples_split=PARAMS['rsf']['min_samples_split'], n_estimators=PARAMS['rsf']['n_estimators'], min_samples_leaf=PARAMS['rsf']['min_samples_leaf'], max_features=PARAMS['rsf']['max_features'], n_jobs=PARAMS['rsf']['n_jobs'], random_state=42)
rsf.fit(X_train, y_train)
rsf_cindex_train = concordance_index_ipcw(y_train, y_train, rsf.predict(X_train), tau=7)[0]
rsf_cindex_test = concordance_index_ipcw(y_train, y_test, rsf.predict(X_test), tau=7)[0]
print(f"Random Survival Forest Model Concordance Index IPCW on train: {rsf_cindex_train:.3f}")
print(f"Random Survival Forest Model Concordance Index IPCW on test: {rsf_cindex_test:.3f}")
rsf_score_method = f"score_{rsf_cindex_train:.3f}_{rsf_cindex_test:.3f}"


Random Survival Forest Model Concordance Index IPCW on train: 0.759
Random Survival Forest Model Concordance Index IPCW on test: 0.703


In [None]:
from sksurv.util import Surv
from sksurv.ensemble import GradientBoostingSurvivalAnalysis
import pandas as pd
import numpy as np

# Cible (colonne de survie)
target = "event"  # Remplacez par le nom de votre cible
tenure_column = "time"  # Remplacez par la colonne représentant le temps de survie


# Vérifiez que X_train et y_train sont convertis en DataFrame/Series
X_train = pd.DataFrame(X_train, columns=X.columns)
y_train = pd.DataFrame(y_train, columns=[target, tenure_column])
X_eval = pd.DataFrame(X_eval, columns=X.columns)

# Nombre de modèles à entraîner
NB_MODELS = 20

# Initialiser la colonne de soumission avec des zéros
submission = pd.DataFrame(index=X_eval.index, columns=["Survival_Prediction"])
submission["Survival_Prediction"] = 0


# Boucle pour entraîner plusieurs modèles avec des échantillons aléatoires
for i in range(NB_MODELS):
    print(f"Entraînement du modèle {i+1}/{NB_MODELS}...")

    # Réduire la taille de l'échantillon à 80 % des données disponibles
    sample_size = int(len(X_train) * 0.70)

    train_sample = pd.concat([X_train, y_train], axis=1).sample(sample_size, random_state=999 + i, replace=False)
    X_sample = train_sample.drop(columns=[target, tenure_column])
    y_sample = train_sample[[target, tenure_column]].copy()

    # Transformer les colonnes de survie en format compatible avec sksurv
    y_sample[target] = y_sample[target].astype(bool)
    y_surv = Surv.from_dataframe(event=target, time=tenure_column, data=y_sample)

    # Initialiser et entraîner le modèle Gradient Boosting Survival Analysis
    est_cph_tree = GradientBoostingSurvivalAnalysis(
        max_features=PARAMS['xgb']['max_features'],
        n_estimators=PARAMS['xgb']['n_estimators'],
        learning_rate=PARAMS['xgb']['learning_rate'],
        max_depth=PARAMS['xgb']['max_depth'],
        subsample=PARAMS['xgb']['subsample'],
        random_state=i
    )
    est_cph_tree.fit(X_sample, y_surv)

    # Prédictions de survie sur les données d'évaluation
    survival = est_cph_tree.predict(X_eval)

    # Ajouter les prédictions au total
    submission["Survival_Prediction"] += survival

# Moyenne des prédictions sur tous les modèles
submission["Survival_Prediction"] /= NB_MODELS

# Résultat final
print("Prédictions terminées.")
print(submission.head())

In [15]:
df_eval = pd.read_csv("data/X_test/clinical_test.csv")
submission['ID'] = df_eval['ID']

# Renommer la colonne
submission.rename(columns={'Survival_Prediction': 'risk_score'}, inplace=True)

# Mettre la colonne ID en première
cols = ['ID'] + [col for col in submission if col != 'ID']
submission = submission[cols]

submission.to_csv("20250128211854_boosted_20_models_size7_sample7_with_subsample_sqrt.csv", index=False)

In [9]:
from src.utilities import create_entity, predict_and_save, split_data, get_method_name
from src.preprocess import process_missing_values, main_preprocess
from sklearn.model_selection import train_test_split
from sksurv.metrics import concordance_index_ipcw
from sksurv.ensemble import GradientBoostingSurvivalAnalysis, RandomSurvivalForest
from sksurv.util import Surv
import pandas as pd
import numpy as np
import warnings
import logging

logging.getLogger('featuretools.entityset').setLevel(logging.ERROR)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", message=".*Ill-conditioned matrix.*")

data = create_entity()

GLOBAL = {
    "save_cox": False,
    "save_xgb": True,
    "save_lgbm": False,
}

PARAMS = {
    "size": 0.7,
    "clinical": ["CYTOGENETICS"],#["CYTOGENETICS"], # Possible: ["CYTOGENETICS", "HB/PLT", "logMONOCYTES", "logWBC", "logANC"]
    "molecular": ["GENE"],#["END-START"], # Possible: ["GENE", "EFFECT", "ALT", "REF", "END-START"]
    "merge": ["featuretools", "gpt"], # Possible: ["featuretools", "gpt"]
    "xgb": {
        'max_depth': 2,
        'learning_rate': 0.05,
        'n_estimators': 260,
        'subsample': 1,
        'max_features': None
    },
    "lgbm": {
        'max_depth': 2,
        'learning_rate': 0.05,
        'verbose': 0
    },
    "rsf": {
    'n_estimators':260,  # Nombre d'arbres dans la forêt
    'max_depth':2,
    'min_samples_split':60,  # Nombre minimum d'échantillons requis pour splitter un nœud
    'min_samples_leaf':40,  # Nombre minimum d'échantillons par feuille
    'max_features':None,  # Sélection aléatoire des features
    'n_jobs':-1,  # Utilisation de tous les cœurs disponibles
    }
}

data = main_preprocess(data, PARAMS['clinical'], PARAMS['molecular'], PARAMS['merge'])
X, X_eval, y = split_data(data)

# Optionnel : Afficher les colonnes
print(X.columns)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=(1 - PARAMS['size']), random_state=42)
X_train, X_test, X_eval = process_missing_values(X_train, X_test, X_eval, method="impute", strategy="median")

# Conversion en DataFrame/Series si besoin
X_train = pd.DataFrame(X_train, columns=X.columns)
X_test = pd.DataFrame(X_test, columns=X.columns)
X_eval = pd.DataFrame(X_eval, columns=X.columns)

y_train = pd.DataFrame(y_train, columns=["event", "time"])
y_test = pd.DataFrame(y_test, columns=["event", "time"])

NB_MODELS = 20

# DataFrame de soumission
submission = pd.DataFrame(index=X_eval.index, columns=["Survival_Prediction"])
submission["Survival_Prediction"] = 0.0

for i in range(NB_MODELS):
    # Échantillonner 90% des données
    sample_size = int(len(X_train) * 0.80)
    train_sample = pd.concat([X_train, y_train], axis=1).sample(sample_size, random_state=999 + i, replace=False)
    X_sample = train_sample.drop(columns=["event", "time"])
    y_sample = train_sample[["event", "time"]].copy()
    
    y_sample["event"] = y_sample["event"].astype(bool)
    y_surv = Surv.from_dataframe("event", "time", y_sample)

    # Alternance : 10 GradientBoostingSurvivalAnalysis et 10 RandomSurvivalForest
    if i < NB_MODELS//2:
        model = GradientBoostingSurvivalAnalysis(
            max_features=PARAMS['xgb']['max_features'],
            n_estimators=PARAMS['xgb']['n_estimators'],
            learning_rate=PARAMS['xgb']['learning_rate'],
            max_depth=PARAMS['xgb']['max_depth'],
            subsample=PARAMS['xgb']['subsample'],
            random_state=i
        )
    else:
        model = RandomSurvivalForest(
            n_estimators=PARAMS['rsf']['n_estimators'],
            max_depth=PARAMS['rsf']['max_depth'],
            min_samples_split=PARAMS['rsf']['min_samples_split'],
            min_samples_leaf=PARAMS['rsf']['min_samples_leaf'],
            max_features=PARAMS['rsf']['max_features'],
            n_jobs=PARAMS['rsf']['n_jobs'],
            random_state=i
        )

    print(f"Entraînement du modèle {i+1}/{NB_MODELS} : {model.__class__.__name__}")

    model.fit(X_sample, y_surv)
    
    # Prédiction de survie sur X_eval
    survival = model.predict(X_eval)

    # Normalisation des prédictions dans l'intervalle [0,1] pour éviter qu'un modèle domine l'autre
    survival = (survival - survival.min()) / (survival.max() - survival.min())

    submission["Survival_Prediction"] += survival

# Moyenne des prédictions
submission["Survival_Prediction"] /= NB_MODELS

print("Prédictions terminées.")
print(submission.head())

(14024, 12)
(14024, 15)
(14024, 17)
Index(['gene_ASXL1', 'gene_BCOR', 'gene_BCORL1', 'gene_BRCC3', 'gene_CBL',
       'gene_CEBPA', 'gene_CSF3R', 'gene_CTCF', 'gene_CUX1', 'gene_DDX41',
       ...
       'SKEW(molecular.START)', 'SKEW(molecular.VAF)', 'STD(molecular.DEPTH)',
       'STD(molecular.END)', 'STD(molecular.START)', 'STD(molecular.VAF)',
       'SUM(molecular.DEPTH)', 'SUM(molecular.END)', 'SUM(molecular.START)',
       'SUM(molecular.VAF)'],
      dtype='object', length=102)
Entraînement du modèle 1/20 : GradientBoostingSurvivalAnalysis
Entraînement du modèle 2/20 : GradientBoostingSurvivalAnalysis
Entraînement du modèle 3/20 : GradientBoostingSurvivalAnalysis
Entraînement du modèle 4/20 : GradientBoostingSurvivalAnalysis
Entraînement du modèle 5/20 : GradientBoostingSurvivalAnalysis
Entraînement du modèle 6/20 : GradientBoostingSurvivalAnalysis
Entraînement du modèle 7/20 : GradientBoostingSurvivalAnalysis
Entraînement du modèle 8/20 : GradientBoostingSurvivalAnalysis
Entr

In [7]:
pd.concat([X_train, y_train], axis=1)

Unnamed: 0,gene_ASXL1,gene_BCOR,gene_BCORL1,gene_BRCC3,gene_CBL,gene_CEBPA,gene_CSF3R,gene_CTCF,gene_CUX1,gene_DDX41,...,STD(molecular.DEPTH),STD(molecular.END),STD(molecular.START),STD(molecular.VAF),SUM(molecular.DEPTH),SUM(molecular.END),SUM(molecular.START),SUM(molecular.VAF),event,time
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,406.263092,5.606084e+07,5.606084e+07,0.141664,4904.0,407714249.0,407714246.0,1.9947,False,1.917808
1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,256.724950,1.939291e+07,1.939291e+07,0.282550,4281.0,207288172.0,207288165.0,2.2093,True,1.282192
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,291.327994,6.845612e+07,6.845612e+07,0.093338,2344.0,159589910.0,159589910.0,0.8360,True,1.490411
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,226.174711,1.012337e+07,1.012337e+07,0.013178,3054.0,129717448.0,129717448.0,1.6700,True,1.276712
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,282.584860,4.676600e+07,4.676600e+07,0.154015,988.0,7577142.0,7577142.0,0.1080,True,1.238356
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2216,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,184.659145,1.440462e+07,1.440458e+07,0.034585,4047.0,178342466.0,178342395.0,0.4862,True,3.465753
2217,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,282.584860,4.676600e+07,4.676600e+07,0.154015,1755.0,198266834.0,198266834.0,0.0900,False,3.000000
2218,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,397.508490,7.355232e+07,7.355232e+07,0.267528,1854.0,270079907.0,270079907.0,0.7770,False,8.635616
2219,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,324.580242,4.338831e+07,4.338831e+07,0.061678,2017.0,243368816.0,243368814.0,1.2944,False,0.476712


In [12]:
train_sample

Unnamed: 0,sous_END_START,VAF_mean,VAF_max,VAF_min,VAF_std,is_truncating_sum,is_non_synonymous_sum,is_splice_site_sum,mut_length_mean,mut_length_max,...,STD(molecular.DEPTH),STD(molecular.END),STD(molecular.START),STD(molecular.VAF),SUM(molecular.DEPTH),SUM(molecular.END),SUM(molecular.START),SUM(molecular.VAF),OS_STATUS,OS_YEARS
1472,0.0,0.170658,-0.289175,0.594002,-1.076510,0.389309,-0.585953,-0.344382,-0.314221,-0.307746,...,588.938876,4.339603e+07,4.339603e+07,0.039627,1347.0,243397747.0,243397747.0,1.0310,,
348,4.0,-0.138156,0.697108,-0.727981,0.534692,1.762745,-0.585953,-0.344382,-0.097451,-0.139626,...,823.717609,1.621032e+07,1.621032e+07,0.207747,6634.0,160606018.0,160606014.0,1.4633,,
869,7.0,1.546414,2.367484,1.513315,0.520881,1.762745,0.223352,-0.344382,0.001901,0.112554,...,414.440828,1.162215e+07,1.162215e+07,0.206306,6660.0,178335214.0,178335207.0,3.4254,,
481,5.0,0.490026,0.123820,-0.852169,0.233204,1.762745,0.223352,-0.344382,-0.088419,-0.139626,...,258.350473,5.483458e+07,5.483459e+07,0.176289,6283.0,533213587.0,533213582.0,2.3785,,
1736,16.0,-0.206126,-0.162605,-0.717766,0.377947,-0.297410,-0.585953,-0.344382,1.130911,0.953154,...,102.402799,2.430319e+07,2.430318e+07,0.191392,1795.0,94773987.0,94773971.0,0.8443,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
627,6.0,-0.554023,-0.078955,-0.771527,0.297880,0.389309,1.032658,5.144210,-0.081968,0.112554,...,387.319359,2.710421e+07,2.710421e+07,0.183037,11112.0,641789453.0,641789447.0,1.5678,,
1944,3.0,1.203217,2.097701,-0.631748,1.573618,1.762745,0.223352,2.399914,-0.198095,-0.223686,...,665.782998,5.211286e+07,5.211286e+07,0.316153,8785.0,568233576.0,568233573.0,3.5995,,
78,11.0,-0.287094,-0.262898,-0.930122,-0.090472,-0.297410,0.223352,-0.344382,0.281896,0.196614,...,175.940899,4.029565e+07,4.029565e+07,0.142515,4429.0,206302737.0,206302726.0,1.3403,,
1134,1.0,-0.108197,-0.039539,-0.536592,0.019743,-0.297410,0.223352,-0.344382,-0.223901,-0.223686,...,282.584860,4.676600e+07,4.676600e+07,0.154015,0.0,0.0,0.0,0.0000,,


In [10]:
X_train

Unnamed: 0,sous_END_START,VAF_mean,VAF_max,VAF_min,VAF_std,is_truncating_sum,is_non_synonymous_sum,is_splice_site_sum,mut_length_mean,mut_length_max,...,SKEW(molecular.START),SKEW(molecular.VAF),STD(molecular.DEPTH),STD(molecular.END),STD(molecular.START),STD(molecular.VAF),SUM(molecular.DEPTH),SUM(molecular.END),SUM(molecular.START),SUM(molecular.VAF)
0,3.0,0.505303,0.683969,0.067144,-0.098628,1.076027,-0.585953,2.399914,-0.151644,-0.223686,...,-0.194916,0.303001,406.263092,5.606084e+07,5.606084e+07,0.141664,4904.0,407714249.0,407714246.0,1.9947
1,7.0,0.765157,1.842371,-0.572611,1.251572,0.389309,0.223352,-0.344382,0.065126,0.196614,...,1.794115,0.639325,256.724950,1.939291e+07,1.939291e+07,0.282550,4281.0,207288172.0,207288165.0,2.2093
2,0.0,0.620700,0.144404,0.841303,-0.561765,-0.984128,0.223352,-0.344382,-0.314221,-0.307746,...,0.286075,0.199510,291.327994,6.845612e+07,6.845612e+07,0.093338,2344.0,159589910.0,159589910.0,0.8360
3,0.0,0.617672,-0.092094,1.104732,-1.329989,-0.984128,1.841963,-0.344382,-0.314221,-0.307746,...,0.403591,-0.587252,226.174711,1.012337e+07,1.012337e+07,0.013178,3054.0,129717448.0,129717448.0,1.6700
4,0.0,-1.256157,-1.502322,-0.470465,0.019743,-0.984128,-0.585953,-0.344382,-0.314221,-0.307746,...,0.286075,0.199510,282.584860,4.676600e+07,4.676600e+07,0.154015,988.0,7577142.0,7577142.0,0.1080
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2216,71.0,-0.928817,-1.151955,-0.390899,-1.124839,-0.297410,-0.585953,-0.344382,6.098554,5.492394,...,0.601867,-1.477019,184.659145,1.440462e+07,1.440458e+07,0.034585,4047.0,178342466.0,178342395.0,0.4862
2217,0.0,-1.365136,-1.581155,-0.567235,0.019743,-0.984128,-0.585953,-0.344382,-0.314221,-0.307746,...,0.286075,0.199510,282.584860,4.676600e+07,4.676600e+07,0.154015,1755.0,198266834.0,198266834.0,0.0900
2218,0.0,-0.341946,0.424698,-0.943562,1.107606,-0.984128,1.032658,-0.344382,-0.314221,-0.307746,...,-1.293961,0.811657,397.508490,7.355232e+07,7.355232e+07,0.267528,1854.0,270079907.0,270079907.0,0.7770
2219,2.0,0.702232,0.158419,0.912267,-0.865182,0.389309,-0.585953,-0.344382,-0.133580,-0.223686,...,-1.732050,-0.746090,324.580242,4.338831e+07,4.338831e+07,0.061678,2017.0,243368816.0,243368814.0,1.2944


array([(False, 1.91780822), ( True, 1.28219178), ( True, 1.49041096), ...,
       (False, 8.63561644), (False, 0.47671233), (False, 1.29041096)],
      dtype=[('event', '?'), ('time', '<f8')])

In [1]:
from src.utilities import create_entity, predict_and_save, split_data, get_method_name
from src.preprocess import process_missing_values, main_preprocess
from sklearn.model_selection import train_test_split
from sksurv.metrics import concordance_index_ipcw
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.ensemble import GradientBoostingSurvivalAnalysis
import lightgbm as lgb

import warnings
import logging

# Régler le logger de Featuretools au niveau ERROR
logging.getLogger('featuretools.entityset').setLevel(logging.ERROR)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore",message=".*Ill-conditioned matrix.*")

data = create_entity()

GLOBAL = {
    "save_cox": False,
    "save_xgb": True,
    "save_lgbm": False,
}

PARAMS = {
    "size": 0.7,
    "clinical": [],#["CYTOGENETICS"], # Possible: ["CYTOGENETICS", "HB/PLT", "logMONOCYTES", "logWBC", "logANC"]
    "molecular": [],#["END-START"], # Possible: ["GENE", "EFFECT", "ALT", "REF", "END-START"]
    "merge": ["gpt"], # Possible: ["featuretools", "gpt"]
    "xgb": {
        'max_depth': 2,
        'learning_rate': 0.05,
        'n_estimators': 200,
        'subsample': 0.5,
    },
    "lgbm": {
        'max_depth': 2,
        'learning_rate': 0.05,
        'verbose': 0
    }
}

data = main_preprocess(data, PARAMS['clinical'], PARAMS['molecular'], PARAMS['merge'])
X, X_eval, y = split_data(data)
# Check if there are any columns that are not float or int in X
print(X.columns)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=(1 - PARAMS['size']), random_state=42)


Index(['is_truncating_sum', 'is_non_synonymous_sum', 'is_splice_site_sum',
       'VAF_mean', 'VAF_min', 'VAF_max', 'VAF_std', 'VAF_skew', 'DEPTH_mean',
       'DEPTH_min', 'DEPTH_max', 'DEPTH_std', 'DEPTH_skew', 'END_mean',
       'END_min', 'END_max', 'mut_length_mean', 'mut_length_max',
       'AA_position_mean', 'AA_position_min', 'AA_position_max',
       'total_mutations', 'unique_genes', 'frac_vaf_gt_0_3', 'BM_BLAST', 'WBC',
       'ANC', 'MONOCYTES', 'HB', 'PLT', 'CYTOGENETICS'],
      dtype='object')


In [2]:
X_train

Unnamed: 0_level_0,is_truncating_sum,is_non_synonymous_sum,is_splice_site_sum,VAF_mean,VAF_min,VAF_max,VAF_std,VAF_skew,DEPTH_mean,DEPTH_min,...,total_mutations,unique_genes,frac_vaf_gt_0_3,BM_BLAST,WBC,ANC,MONOCYTES,HB,PLT,CYTOGENETICS
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P110821,1.076027,-0.585953,2.399914,0.505303,0.067144,0.683969,-0.098628,0.114624,0.105836,-0.006354,...,0.727396,0.593890,0.690886,3.0,3.50,0.840,1.050,9.1,150.0,
P121044,0.389309,0.223352,-0.344382,0.765157,-0.572611,1.842371,1.251572,0.385954,-0.158098,-0.606026,...,0.727396,1.164964,0.690886,15.0,4.00,2.000,0.000,11.0,45.0,"46,xy[20]"
P103014,-0.984128,0.223352,-0.344382,0.620700,0.841303,0.144404,-0.561765,,0.510845,0.644976,...,-0.668985,-0.548260,1.212453,6.0,2.70,0.945,0.170,6.9,132.0,"46,xy,del(5)(q13q31)[27]/46,xy[3]"
P120972,-0.984128,1.841963,-0.344382,0.617672,1.104732,-0.092094,-1.329989,-0.603593,-0.354460,-0.284853,...,0.261935,0.593890,1.212453,2.0,2.00,1.000,0.000,10.0,178.0,"45,xy,-7[18]/46,xy[1]"
P120934,-0.984128,-0.585953,-0.344382,-1.256157,-0.470465,-1.502322,,,0.121087,0.694387,...,-1.134445,-1.119334,-1.395384,10.0,2.00,1.000,0.000,10.0,53.0,"47,xx,+1,add(1))p36)x2,del(5)(q11q15),+11,-17[..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P118264,-0.297410,-0.585953,-0.344382,-0.928817,-0.390899,-1.151955,-1.124839,-1.321417,0.885775,1.033527,...,-0.203525,0.022815,-1.395384,3.0,2.80,1.400,0.220,7.4,82.0,"46,xx,del(5)(q11;q13)[16]/46,xx[4]"
P110902,-0.984128,-0.585953,-0.344382,-1.365136,-0.567235,-1.581155,,,1.745784,2.417040,...,-1.134445,-1.119334,-1.395384,7.0,4.90,2.890,0.340,9.9,234.0,"46,xx,del(5)(q13q33)[12]/46,xx[8]"
P110684,-0.984128,1.032658,-0.344382,-0.341946,-0.943562,0.424698,1.107606,0.524985,-0.662665,-1.026021,...,-0.203525,0.022815,-0.526105,0.0,3.30,1.730,0.200,10.9,174.0,"46,xx,del(5)(?q21q34)[18]/46,xx[2]"
P100054,0.389309,-0.585953,-0.344382,0.702232,0.912267,0.158419,-0.865182,-0.731736,-0.547573,-0.549877,...,-0.203525,-0.548260,1.212453,0.0,8.00,4.080,2.000,10.7,187.0,"46,xy[20]"


In [None]:
def aggregate_by_patient(df: pd.DataFrame) -> pd.DataFrame:
    """
    Agrège les mutations par ID patient pour obtenir un set de features complet.
    Exemple de features :
        - total_mutations
        - mean_vaf, max_vaf
        - nb_truncating
        - nb_non_synonymous
        - nb_unique_genes
        ...
    """

    agg_dict = {
        "is_truncating": "sum",         # nombre de variants truncants
        "is_non_synonymous": "sum",     # nombre de variants non-synonymes
        "is_splice_site": "sum",        # nombre de variants au site de splice
        "REF": "nunique",              # nombre de bases différentes touchées
        "ALT": "nunique",                 # nombre total de mutations
        "GENE": "nunique",              # nombre de gènes uniques
        "CHR": "nunique",               # nombre de chromosomes touchés 
        "VAF" : ["sum", "mean", "min", "max", "std", "skew"],   # VAF moyen, max, etc.
        "DEPTH": ["sum", "mean", "min", "max", "std", "skew"],  # profondeur moyenne, etc.
        "START": ["sum", "mean", "min", "max", "std", "skew"],  # position moyenne de début
        "END": ["sum", "mean", "min", "max", "std", "skew"],  # position moyenne de fin
        "mut_length": ["sum", "mean", "max"],  # taille moyenne d'indel
        "AA_position": ["sum", "mean", "min", "max", "std", "skew"] # position moyenne d'aa impacté
        # etc.
    }

    # On veut aussi compter le nombre total de mutations, 
    # donc on peut ajouter une colonne dummy pour compter
    df["count_mut"] = 1

    # On agrège
    df_agg = df.groupby("ID").agg(agg_dict).fillna(0)

    # Flatten les multi-index de colonnes
    df_agg.columns = ["_".join(col).strip() for col in df_agg.columns.values]

    # Nombre total de mutations
    df_agg["total_mutations"] = df.groupby("ID")["count_mut"].sum()

    # Nombre de gènes distincts
    df_agg["unique_genes"] = df.groupby("ID")["GENE"].nunique()

    # Autres features, ex. fraction de mutations avec VAF > 0.3
    df_agg["frac_vaf_gt_0_3"] = df.groupby("ID").apply(
        lambda group: (group["VAF"] > 0.3).mean()
    )

    df_agg = df_agg.reset_index()

    return df_agg

In [44]:
from sksurv.metrics import concordance_index_ipcw
from sksurv.ensemble import GradientBoostingSurvivalAnalysis, RandomSurvivalForest
from sksurv.util import Surv


from src.utilities import predict_and_save, split_data, get_method_name, score_method
from src.preprocess import process_missing_values, main_preprocess, create_entity
from sklearn.model_selection import train_test_split
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.ensemble import GradientBoostingSurvivalAnalysis, RandomSurvivalForest
import lightgbm as lgb
import pandas as pd

import warnings
import logging

# Régler le logger de Featuretools au niveau ERROR
logging.getLogger('featuretools.entityset').setLevel(logging.ERROR)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore",message=".*Ill-conditioned matrix.*")


GLOBAL = {
    "save_cox": False,
    "save_xgb": True,
    "save_lgbm": False,
    "save_rsf": False,
    "save_shap": False
}

PARAMS = {
    "size": 0.7,
    "clinical": ["CYTOGENETICS"],#["CYTOGENETICS"], # Possible: ["CYTOGENETICS", "HB/PLT", "logMONOCYTES", "logWBC", "logANC"] ["BM_BLAST+WBC", "BM_BLAST/HB", "HB*PLT", "HB/num_trisomies"]
    "molecular": ["GENE", "EFFECT"],#["END-START"], # Possible: ["GENE", "EFFECT", "ALT", "REF", "END-START"]
    "merge": ["featuretools", "gpt"], # Possible: ["featuretools", "gpt"]
    "additional": [
        #v['cadd', 'exon'],
    ],
    "xgb1": {
        'max_depth': 2,
        'learning_rate': 0.05,
        'n_estimators': 305,
        'subsample': 0.55,
        'max_features': None
    },
    "xgb2": {
        'max_depth': 2,
        'learning_rate': 0.05,
        'n_estimators': 335,
        'subsample': 0.55,
        'max_features': 'sqrt'
    },
    "lgbm": {
        'max_depth': 2,
        'learning_rate': 0.05,
        'verbose': 0
    },
    "rsf": {
    'n_estimators':260,  # Nombre d'arbres dans la forêt
    'max_depth':2,
    'min_samples_split':60,  # Nombre minimum d'échantillons requis pour splitter un nœud
    'min_samples_leaf':40,  # Nombre minimum d'échantillons par feuille
    'max_features':None,  # Sélection aléatoire des features
    'n_jobs':-1,  # Utilisation de tous les cœurs disponibles
    }
}


data = create_entity(PARAMS)
data = main_preprocess(data, PARAMS)
X, X_eval, y = split_data(data)

# Optionnel : Afficher les colonnes
print(X.columns)

# Supprime de manière random deux colonnes et print le nom des colonnes, cela doit être un chiffre entr 1 et la longueur de X.columns
X = X.drop(columns=[X.columns[1], X.columns[2]])

Index(['effect_2KB_upstream_variant', 'effect_3_prime_UTR_variant',
       'effect_ITD', 'effect_PTD', 'effect_complex_change_in_transcript',
       'effect_frameshift_variant', 'effect_inframe_codon_gain',
       'effect_inframe_codon_loss', 'effect_inframe_variant',
       'effect_initiator_codon_change',
       ...
       'SKEW(molecular.START)', 'SKEW(molecular.VAF)', 'STD(molecular.DEPTH)',
       'STD(molecular.END)', 'STD(molecular.START)', 'STD(molecular.VAF)',
       'SUM(molecular.DEPTH)', 'SUM(molecular.END)', 'SUM(molecular.START)',
       'SUM(molecular.VAF)'],
      dtype='object', length=118)


In [3]:
from sksurv.metrics import concordance_index_ipcw
from sksurv.ensemble import GradientBoostingSurvivalAnalysis, RandomSurvivalForest
from sksurv.util import Surv


from src.utilities import predict_and_save, split_data, get_method_name, score_method
from src.preprocess import process_missing_values, main_preprocess, create_entity
from sklearn.model_selection import train_test_split
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.ensemble import GradientBoostingSurvivalAnalysis, RandomSurvivalForest
import lightgbm as lgb
import pandas as pd

import warnings
import logging

# Régler le logger de Featuretools au niveau ERROR
logging.getLogger('featuretools.entityset').setLevel(logging.ERROR)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore",message=".*Ill-conditioned matrix.*")


GLOBAL = {
    "save_cox": False,
    "save_xgb": True,
    "save_lgbm": False,
    "save_rsf": False,
    "save_shap": False
}

PARAMS = {
    "size": 0.7,
    "clinical": ["CYTOGENETICS"],#["CYTOGENETICS"], # Possible: ["CYTOGENETICS", "HB/PLT", "logMONOCYTES", "logWBC", "logANC"] ["BM_BLAST+WBC", "BM_BLAST/HB", "HB*PLT", "HB/num_trisomies"]
    "molecular": ["GENE"],#["END-START"], # Possible: ["GENE", "EFFECT", "ALT", "REF", "END-START"]
    "merge": ["featuretools", "gpt"], # Possible: ["featuretools", "gpt"]
    "additional": [
        #v['cadd', 'exon'],
    ],
    "xgb1": {
        'max_depth': 2,
        'learning_rate': 0.05,
        'n_estimators': 305,
        'subsample': 0.55,
        'max_features': None
    },
    "xgb2": {
        'max_depth': 2,
        'learning_rate': 0.05,
        'n_estimators': 450,
        'subsample': 0.55,
        'max_features': 'sqrt',
        'random_state': 16
    },
    "lgbm": {
        'max_depth': 2,
        'learning_rate': 0.05,
        'verbose': 0
    },
    "rsf": {
    'n_estimators':260,  # Nombre d'arbres dans la forêt
    'max_depth':2,
    'min_samples_split':60,  # Nombre minimum d'échantillons requis pour splitter un nœud
    'min_samples_leaf':40,  # Nombre minimum d'échantillons par feuille
    'max_features':None,  # Sélection aléatoire des features
    'n_jobs':-1,  # Utilisation de tous les cœurs disponibles
    }
}


data = create_entity(PARAMS)
data = main_preprocess(data, PARAMS)
X, X_eval, y = split_data(data)

# Optionnel : Afficher les colonnes
print(X.columns)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=(1 - PARAMS['size']), random_state=42)
X_train, X_test, X_eval = process_missing_values(X_train, X_test, X_eval, method="impute", strategy="median")

# Conversion en DataFrame/Series si besoin
X_train = pd.DataFrame(X_train, columns=X.columns)
X_test = pd.DataFrame(X_test, columns=X.columns)
X_eval = pd.DataFrame(X_eval, columns=X.columns)

y_train = pd.DataFrame(y_train, columns=["event", "time"])
y_test = pd.DataFrame(y_test, columns=["event", "time"])

NB_MODELS = 10

# DataFrame de soumission
submission = pd.DataFrame(index=X_eval.index, columns=["Survival_Prediction"])
submission["Survival_Prediction"] = 0.0

test_predict = pd.DataFrame(index=X_test.index, columns=["Survival_Prediction"])
test_predict["Survival_Prediction"] = 0.0

train_predict = pd.DataFrame(index=X_train.index, columns=["Survival_Prediction"])
train_predict["Survival_Prediction"] = 0.0

for i in range(NB_MODELS):
    
    sample_size = int(len(X_train) * 0.95)
    train_sample = pd.concat([X_train, y_train], axis=1).sample(sample_size, random_state=999 + i, replace=False)
    X_sample = train_sample.drop(columns=["event", "time"])
    y_sample = train_sample[["event", "time"]].copy()
    
    y_sample["event"] = y_sample["event"].astype(bool)
    y_surv = Surv.from_dataframe("event", "time", y_sample)

    # Alternance : 10 GradientBoostingSurvivalAnalysis et 10 RandomSurvivalForest
    # if i < 0:
    #     param = PARAMS['xgb1']
    # else:
    param = PARAMS['xgb2']

    model = GradientBoostingSurvivalAnalysis(
        **param
    )

    print(f"Entraînement du modèle {i+1}/{NB_MODELS} : {model.__class__.__name__}")

    model.fit(X_sample, y_surv)

    # Normalisation des prédictions dans l'intervalle [0,1] pour éviter qu'un modèle domine l'autre
    #survival = (survival - survival.min()) / (survival.max() - survival.min())

    submission["Survival_Prediction"] += model.predict(X_eval)
    test_predict["Survival_Prediction"] += model.predict(X_test)
    train_predict["Survival_Prediction"] += model.predict(X_train)

# Moyenne des prédictions
submission["Survival_Prediction"] /= NB_MODELS
test_predict["Survival_Prediction"] /= NB_MODELS
train_predict["Survival_Prediction"] /= NB_MODELS


# Convert y_train and y_test to structured arrays
y_train_structured = Surv.from_dataframe("event", "time", y_train)
y_test_structured = Surv.from_dataframe("event", "time", y_test)

cindex_train = concordance_index_ipcw(y_train_structured, y_train_structured, train_predict["Survival_Prediction"].to_numpy(), tau=7)[0]
cindex_test = concordance_index_ipcw(y_train_structured, y_test_structured, test_predict["Survival_Prediction"].to_numpy(), tau=7)[0]
print(f"Model Concordance Index IPCW on train: {cindex_train:.3f}")
print(f"Model Concordance Index IPCW on test: {cindex_test:.3f}")
print("Prédictions terminées.")

Index(['gene_ASXL1', 'gene_BCOR', 'gene_BCORL1', 'gene_BRCC3', 'gene_CBL',
       'gene_CEBPA', 'gene_CSF3R', 'gene_CTCF', 'gene_CUX1', 'gene_DDX41',
       ...
       'SKEW(molecular.START)', 'SKEW(molecular.VAF)', 'STD(molecular.DEPTH)',
       'STD(molecular.END)', 'STD(molecular.START)', 'STD(molecular.VAF)',
       'SUM(molecular.DEPTH)', 'SUM(molecular.END)', 'SUM(molecular.START)',
       'SUM(molecular.VAF)'],
      dtype='object', length=102)
Entraînement du modèle 1/10 : GradientBoostingSurvivalAnalysis
Entraînement du modèle 2/10 : GradientBoostingSurvivalAnalysis
Entraînement du modèle 3/10 : GradientBoostingSurvivalAnalysis
Entraînement du modèle 4/10 : GradientBoostingSurvivalAnalysis
Entraînement du modèle 5/10 : GradientBoostingSurvivalAnalysis
Entraînement du modèle 6/10 : GradientBoostingSurvivalAnalysis
Entraînement du modèle 7/10 : GradientBoostingSurvivalAnalysis
Entraînement du modèle 8/10 : GradientBoostingSurvivalAnalysis
Entraînement du modèle 9/10 : GradientBo

In [4]:
df_eval = pd.read_csv("data/X_test/clinical_test.csv")
submission['ID'] = df_eval['ID']

# Renommer la colonne
submission.rename(columns={'Survival_Prediction': 'risk_score'}, inplace=True)

# Mettre la colonne ID en première
cols = ['ID'] + [col for col in submission if col != 'ID']
submission = submission[cols]

submission.to_csv("10_gradiant_7_95_sqrt_subsample55_450_rs_16.csv", index=False)

In [1]:
import pandas as pd
import numpy as np
import torchtuples as tt
from pycox.models import CoxPH
from sksurv.util import Surv
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from src.utilities import create_entity, predict_and_save, split_data, get_method_name
from src.preprocess import process_missing_values, main_preprocess
from sklearn.model_selection import train_test_split
from sksurv.metrics import concordance_index_ipcw
from sksurv.ensemble import GradientBoostingSurvivalAnalysis, RandomSurvivalForest
from sksurv.util import Surv
import pandas as pd
import numpy as np
import warnings
import logging

logging.getLogger('featuretools.entityset').setLevel(logging.ERROR)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", message=".*Ill-conditioned matrix.*")

data = create_entity()

GLOBAL = {
    "save_cox": False,
    "save_xgb": True,
    "save_lgbm": False,
}

PARAMS = {
    "size": 0.75,
    "clinical": ["CYTOGENETICS"],#["CYTOGENETICS"], # Possible: ["CYTOGENETICS", "HB/PLT", "logMONOCYTES", "logWBC", "logANC"]
    "molecular": ["GENE"],#["END-START"], # Possible: ["GENE", "EFFECT", "ALT", "REF", "END-START"]
    "merge": ["featuretools", "gpt"], # Possible: ["featuretools", "gpt"]
    "xgb1": {
        'max_depth': 2,
        'learning_rate': 0.05,
        'n_estimators': 260,
        'subsample': 0.6,
        'max_features': None
    },
    "xgb2": {
        'max_depth': 2,
        'learning_rate': 0.05,
        'n_estimators': 260,
        'subsample': 0.6,
        'max_features': "sqrt"
    }
}

data = main_preprocess(data, PARAMS['clinical'], PARAMS['molecular'], PARAMS['merge'])
X, X_eval, y = split_data(data)

# Optionnel : Afficher les colonnes
print(X.columns)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=(1 - PARAMS['size']), random_state=42)
X_train, X_test, X_eval = process_missing_values(X_train, X_test, X_eval, method="impute", strategy="median")


# 🔹 Charger et préparer les données
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train = pd.DataFrame(X_train, columns=X.columns)
X_test = pd.DataFrame(X_test, columns=X.columns)

y_train = pd.DataFrame(y_train)
y_test = pd.DataFrame(y_test)

# Convertir y_train et y_test au format `numpy` pour DeepSurv
y_train_np = (y_train["event"].values, y_train["time"].values)
y_test_np = (y_test["event"].values, y_test["time"].values)

# 🔹 Normaliser les features pour le réseau de neurones
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 🔹 Paramètres du modèle DeepSurv
net = tt.practical.MLPVanilla(
    in_features=X_train.shape[1], 
    num_nodes=[128, 64], 
    out_features=1,  # 🔥 Correction ici !
    batch_norm=True, 
    dropout=0.2, 
    output_bias=False
)

model = CoxPH(net, tt.optim.Adam(lr=0.001))

# 🔹 Entraînement du modèle
batch_size = 256
epochs = 100
log = model.fit(X_train_scaled, y_train_np, batch_size, epochs, callbacks=[tt.callbacks.EarlyStopping(patience=10)])

# 🔹 Prédiction sur les données de test
c_index = concordance_index_ipcw(y_test_np[1], y_test_np[0], -model.predict(X_test_scaled), tau=7)[0]

print(f"DeepSurv Concordance Index: {c_index:.3f}")



ImportError: cannot import name 'create_entity' from 'src.utilities' (/Users/julesmourgues/Documents/Programmation/Personal/qrt-challenge-2025/src/utilities.py)

In [1]:
from deepsurv import deep_surv
import pandas as pd
import numpy as np
import torchtuples as tt
from pycox.models import CoxPH
from sksurv.util import Surv
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from src.utilities import create_entity, predict_and_save, split_data, get_method_name
from src.preprocess import process_missing_values, main_preprocess
from sklearn.model_selection import train_test_split
from sksurv.metrics import concordance_index_ipcw
from sksurv.ensemble import GradientBoostingSurvivalAnalysis, RandomSurvivalForest
from sksurv.util import Surv
import pandas as pd
import numpy as np
import warnings
import logging

logging.getLogger('featuretools.entityset').setLevel(logging.ERROR)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", message=".*Ill-conditioned matrix.*")

data = create_entity()

GLOBAL = {
    "save_cox": False,
    "save_xgb": True,
    "save_lgbm": False,
}

PARAMS = {
    "size": 0.7,
    "clinical": ["CYTOGENETICS"],#["CYTOGENETICS"], # Possible: ["CYTOGENETICS", "HB/PLT", "logMONOCYTES", "logWBC", "logANC"]
    "molecular": ["GENE"],#["END-START"], # Possible: ["GENE", "EFFECT", "ALT", "REF", "END-START"]
    "merge": ["featuretools", "gpt"], # Possible: ["featuretools", "gpt"]
    "xgb1": {
        'max_depth': 2,
        'learning_rate': 0.05,
        'n_estimators': 260,
        'subsample': 0.6,
        'max_features': None
    },
    "xgb2": {
        'max_depth': 2,
        'learning_rate': 0.05,
        'n_estimators': 260,
        'subsample': 0.6,
        'max_features': "sqrt"
    }
}

data = main_preprocess(data, PARAMS['clinical'], PARAMS['molecular'], PARAMS['merge'])
X, X_eval, y = split_data(data)

# Optionnel : Afficher les colonnes
print(X.columns)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=(1 - PARAMS['size']), random_state=42)
X_train, X_test, X_eval = process_missing_values(X_train, X_test, X_eval, method="impute", strategy="median")


def dataframe_to_deepsurv_ds(df, event_col = 'Event', time_col = 'Time'):
    # Extract the event and time columns as numpy arrays
    e = df[event_col].values.astype(np.int32)
    t = df[time_col].values.astype(np.float32)

    # Extract the patient's covariates as a numpy array
    x_df = df.drop([event_col, time_col], axis = 1)
    x = x_df.values.astype(np.float32)
    
    # Return the deep surv dataframe
    return {
        'x' : x,
        'e' : e,
        't' : t
    }

  self.ctor = getattr(np, o_type.dtype)


AttributeError: module 'numpy' has no attribute 'bool'.
`np.bool` was a deprecated alias for the builtin `bool`. To avoid this error in existing code, use `bool` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.bool_` here.
The aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:
    https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations

In [12]:
# Clinical Data
df = pd.read_csv("data/X_train/clinical_train.csv")
df_eval = pd.read_csv("data/X_test/clinical_test.csv")

# Molecular Data
maf_df = pd.read_csv("data/X_train/molecular_train.csv")
maf_eval = pd.read_csv("data/X_test/molecular_test.csv")

target_df = pd.read_csv("data/target_train.csv")
target_df_test = pd.read_csv("data/random_submission_FRacdcw_v9kP4pP.csv")

In [15]:
df['CENTER'].value_counts()

CENTER
KI       900
DUS      455
PV       316
GESMD    246
RMCN     199
CCH      159
CGM      107
ROM      104
UOB       88
HMS       83
MUV       83
TUD       73
FUCE      73
ICO       71
FLO       68
DUTH      66
UOXF      50
HIAE      47
MSK       37
IHBT      33
VU        33
UMG       26
REL        6
Name: count, dtype: int64

In [1]:
from src.utilities import predict_and_save, split_data, get_method_name, score_method
from src.preprocess import process_missing_values, main_preprocess, create_entity
from sklearn.model_selection import train_test_split
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.ensemble import GradientBoostingSurvivalAnalysis, RandomSurvivalForest
import lightgbm as lgb
import pandas as pd

import warnings
import logging

# Régler le logger de Featuretools au niveau ERROR
logging.getLogger('featuretools.entityset').setLevel(logging.ERROR)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore",message=".*Ill-conditioned matrix.*")


GLOBAL = {
    "cox": {"run": False, "save":False, "shap": False},
    "xgb": {"run": True, "save":True, "shap": False},
    "lgbm": {"run": False, "save":False, "shap": False},
    "rsf": {"run": False, "save":False, "shap": False}
}

PARAMS = {
    "size": 0.7,
    "clinical": ["CYTOGENETICS"],#["CYTOGENETICS"], # Possible: ["CYTOGENETICS", "HB/PLT", "logMONOCYTES", "logWBC", "logANC"] ["BM_BLAST+WBC", "BM_BLAST/HB", "HB*PLT", "HB/num_trisomies"]
    "molecular": [],#["END-START"], # Possible: ["GENE", "EFFECT", "ALT", "REF", "END-START"]
    "merge": [], # Possible: ["featuretools", "gpt"]
    "additional": [
        #['cadd', 'phred'],
        # ['cadd', 'rawscore'],
        # # ['cadd', 'consequence'],
        # # ['cadd', 'bstatistic'],
        # # ['cadd', 'gerp', 'n'],
        # ['cadd', 'phast_cons', 'mammalian'],
        # ['cadd', 'phylop', 'mammalian'],
        # ['snpeff', 'putative_impact'],
        # # ['snpeff', 'rank'],
        # # ['snpeff', 'total'],
         #['cadd', 'exon'],
        # # ['cadd', 'cds', 'rel_cds_pos']
        ],
    "xgb": {
        'max_depth': 2,
        'learning_rate': 0.05,
        'n_estimators': 450,
        'subsample': 0.55,
        'max_features': 'sqrt',
        'random_state': 26
    },
    "lgbm": {
        'max_depth': 2,
        'learning_rate': 0.05,
        'verbose': 0
    },
    "rsf": {
    'n_estimators':300,  # Nombre d'arbres dans la forêt
    'max_depth':2,
    #'min_samples_split':60,  # Nombre minimum d'échantillons requis pour splitter un nœud
    #'min_samples_leaf':40,  # Nombre minimum d'échantillons par feuille
    'max_features':None,  # Sélection aléatoire des features
    'n_jobs':-1,  # Utilisation de tous les cœurs disponibles
    }
}


data = create_entity(PARAMS)
data = main_preprocess(data, PARAMS)
X, X_eval, y = split_data(data)

# Check if there are any columns that are not float or int in X
print(X.columns)
print(X.shape)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=(1 - PARAMS['size']), random_state=42)

X_train, X_test, X_eval = process_missing_values(X_train, X_test, X_eval, method="impute", strategy="median")

Index(['BM_BLAST', 'WBC', 'ANC', 'MONOCYTES', 'HB', 'PLT', 'num_subclones',
       'sex', 'avg_chromosomes', 'total_mitoses', 'num_translocations',
       'num_deletions', 'num_inversions', 'num_duplications', 'num_additions',
       'num_monosomies', 'num_trisomies', 'complexity_score'],
      dtype='object')
(3173, 18)


In [None]:
import os
from pathlib import Path
from autoviz import AutoViz_Class
from datetime import datetime

def create_EDA_report(
    df, 
    target_variables, 
    custom_plot_dir="report/eda",
    title="EDA_" + str(datetime.now().strftime("%Y%m%d_%H%M%S")),
    output_file=None,
    sep=",",
    header=0,
    verbose=0,
    lowess=False,
    chart_format="html"
):
    """
    Génère des visualisations AutoViz pour un DataFrame donné sur les variables cibles spécifiées,
    puis combine tous les graphiques générés dans un rapport HTML unique en utilisant des iframes.
    Le rapport présente un en-tête principal et, pour chaque variable cible, un titre centré en majuscules,
    suivi des graphiques dont les titres et dimensions sont définis via un dictionnaire.
    
    Parameters:
        df (pd.DataFrame): Le DataFrame à analyser.
        target_variables (list): Liste des variables cibles sur lesquelles réaliser l'EDA.
        custom_plot_dir (str): Répertoire principal pour sauvegarder les graphiques AutoViz.
        title (str): Titre (et nom de sous-dossier) du rapport.
        output_file (str): Chemin complet du rapport HTML combiné. Si None, le rapport sera sauvegardé dans
                           custom_plot_dir/<title>/ sous le nom 'EDA_report.html'.
        sep (str): Délimiteur utilisé dans les données (passé à AutoViz).
        header (int): Numéro de la ligne contenant les noms de colonnes (passé à AutoViz).
        verbose (int): Niveau de verbosité pour AutoViz.
        lowess (bool): Indique si le lissage Lowess doit être appliqué.
        chart_format (str): Format des graphiques générés par AutoViz.
    
    Returns:
        str: Chemin du rapport HTML généré.
    """
    # Dictionnaire des réglages pour chaque type de graphique (nom de fichier sans le préfixe)
    files_settings = {
        "distplots_cats.html": {"title": "CATEGORIES", "width": "100%", "height": "400"},
        "pair_scatters.html": {"title": "PAIR SCATTERS", "width": "100%", "height": "400"},
        "scatterplots.html": {"title": "SCATTERPLOTS", "width": "100%", "height": "400"},
        "cat_var_plots.html": {"title": "CATEGORICAL VARIABLES", "width": "100%", "height": "400"},
        "heatmaps.html": {"title": "HEATMAPS", "width": "100%", "height": "800"},
        "distplots_nums.html": {"title": "NUMERICAL DISTRIBUTIONS", "width": "100%", "height": "400"},
        "violinplots.html": {"title": "VIOLIN PLOTS", "width": "100%", "height": "800"},
        "kde_plots.html": {"title": "KDE PLOTS", "width": "100%", "height": "400"},
    }
    
    # Création d'une instance AutoViz_Class
    AV = AutoViz_Class()
    
    # Création du répertoire de sauvegarde pour le rapport (custom_plot_dir/title)
    custom_plot_dir = Path(custom_plot_dir) / title
    custom_plot_dir.mkdir(parents=True, exist_ok=True)
    
    # Pour chaque variable cible, exécuter AutoViz et sauvegarder les graphiques dans custom_plot_dir/<target_variable>
    for target_variable in target_variables:
        print(f"Analyse pour la variable cible '{target_variable}'")
        # On utilise un sous-dossier dédié à chaque variable cible
        save_dir = custom_plot_dir
        save_dir.mkdir(parents=True, exist_ok=True)
        _ = AV.AutoViz(
            filename="",
            sep=sep,
            depVar=target_variable,
            dfte=df,
            header=header,
            lowess=lowess,
            chart_format=chart_format,
            max_rows_analyzed=min([df.shape[0], 10**5]),
            max_cols_analyzed=min([df.shape[1], 50]),
            save_plot_dir=str(save_dir),
            verbose=verbose
        )
    
    # Construction du contenu HTML final
    final_contents = []
    # En-tête principal du rapport
    final_contents.append(f'<h1 style="text-align: center; margin: 30px 0;">Rapport EDA</h1>')
    
    for target_variable in target_variables:
        report_dir = custom_plot_dir / target_variable
        if report_dir.exists():
            # En-tête unique pour la variable cible (centré, en majuscules)
            final_contents.append(f'<h2 style="text-align: center; margin: 20px 0;">{target_variable.upper()}</h2>')
            # Container pour les graphiques de cette variable cible
            plots_html = '<div class="container">'
            for html_file in sorted(report_dir.glob("*.html")):
                file_name = html_file.name
                # Récupérer les réglages définis dans le dictionnaire ou utiliser des valeurs par défaut
                settings = files_settings.get(file_name, {"title": file_name, "width": "100%", "height": "800"})
                # Calculer le chemin relatif pour l'iframe
                relative_path = os.path.relpath(html_file, custom_plot_dir)
                plots_html += f"""
                <div class="plot">
                    <h3 style="margin: 0; padding: 5px 0;">{settings['title']}</h3>
                    <iframe src="{relative_path}" width="{settings['width']}" height="{settings['height']}" 
                            style="border: none; display: block; margin: 0; padding: 0;"></iframe>
                </div>
                """
            plots_html += "</div>"
            final_contents.append(plots_html)
        else:
            print(f"Le répertoire {report_dir} n'existe pas.")
    
    # Construction du template HTML final
    combined_html = f"""
    <!DOCTYPE html>
    <html>
    <head>
        <meta charset="utf-8">
        <title>{title} - Rapport EDA AutoViz</title>
        <style>
            body {{
                font-family: Arial, sans-serif;
                margin: 20px;
                background: #f5f5f5;
            }}
            h1, h2, h3 {{
                color: #333;
                text-align: center;
            }}
            .container {{
                display: flex;
                flex-wrap: wrap;
                gap: 20px;
                justify-content: center;
            }}
            .plot {{
                background: white;
                padding: 10px;
                border-radius: 8px;
                box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
                margin-bottom: 20px;
                width: 100%;
                max-width: 1100px;
            }}
            iframe {{
                width: 100%;
                border: none;
                border-radius: 8px;
            }}
        </style>
    </head>
    <body>
        {"".join(final_contents)}
    </body>
    </html>
    """
    
    # Définir le chemin de sortie du rapport
    if output_file is None:
        output_file = custom_plot_dir / "EDA_report.html"
    else:
        output_file = Path(output_file)
    
    # Sauvegarder le rapport HTML combiné
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(combined_html)
    
    print("Rapport combiné généré :", output_file)
    return str(output_file)


In [47]:
df_analyze = pd.concat([pd.DataFrame(X_train, columns=X.columns), pd.DataFrame(y_train, columns=["event", "time"])],axis=1)

bool_cols = df_analyze.select_dtypes(include=['bool']).columns
df_analyze[bool_cols] = df_analyze[bool_cols].astype(int)

report_path = create_EDA_report(df_analyze, target_variables=["event", "time"], verbose=-1)



Analyse pour la variable cible 'event'
Shape of your Data Set loaded: (2221, 20)
#######################################################################################
######################## C L A S S I F Y I N G  V A R I A B L E S  ####################
#######################################################################################
Classifying variables in data set...
    19 Predictors classified...
        No variables removed since no ID or low-information variables found in data set
Since Number of Rows in data 2221 exceeds maximum, randomly sampling 2221 rows for EDA...

################ Binary_Classification problem #####################




Saving scatterplots in HTML format
  0%|          | 0/19 [00:00<?, ?it/s]



 11%|█         | 2/19 [00:00<00:01, 14.76it/s]



 21%|██        | 4/19 [00:00<00:00, 16.49it/s]



 32%|███▏      | 6/19 [00:00<00:00, 17.51it/s]



 42%|████▏     | 8/19 [00:00<00:00, 17.32it/s]



 58%|█████▊    | 11/19 [00:00<00:00, 19.70it/s]



 74%|███████▎  | 14/19 [00:00<00:00, 20.60it/s]



 89%|████████▉ | 17/19 [00:00<00:00, 19.74it/s]



                                               

Saving pair_scatters in HTML format
                                                 

Saving distplots_nums in HTML format
                                               

KDE plot is erroring due to problems with DynamicMaps. Hence it is skipped


Saving violinplots in HTML format


Saving heatmaps in HTML format
Time to run AutoViz (in seconds) = 14
Analyse pour la variable cible 'time'
Shape of your Data Set loaded: (2221, 20)
#######################################################################################
######################## C L A S S I F Y I N G  V A R I A B L E S  ####################
#######################################################################################
Classifying variables in data set...
    19 Predictors classified...
        No variables removed since no ID or low-information variables found in data set
Since Number of Rows in data 2221 exceeds maximum, randomly sampling 2221 rows for EDA...

################ Regression problem #####################


Saving scatterplots in HTML format
                                                

Saving pair_scatters in HTML format
                                                 

Saving distplots_cats in HTML format
                                     

Saving distplots_nums in HTML format
                                               

Saving kde_plots in HTML format


Saving violinplots in HTML format


Saving heatmaps in HTML format


Saving cat_var_plots in HTML format
Time to run AutoViz (in seconds) = 10        
Rapport combiné généré : report/eda/EDA_20250306_161249/EDA_report.html


In [None]:
from src.report import EDAReport

df_analyze = pd.concat([pd.DataFrame(X_train, columns=X.columns), pd.DataFrame(y_train, columns=["event", "time"])],axis=1)

bool_cols = df_analyze.select_dtypes(include=['bool']).columns
df_analyze[bool_cols] = df_analyze[bool_cols].astype(int)

eda = EDAReport(df_analyze, target_variables=["event", "time"])
eda.generate_report()
eda.display()

In [48]:
import os
import webbrowser
from http.server import HTTPServer, SimpleHTTPRequestHandler
import threading
from IPython.display import HTML

def display_html_report(html_file):
    def start_server():
        server_address = ('', 8000)
        httpd = HTTPServer(server_address, SimpleHTTPRequestHandler)
        httpd.serve_forever()

    # Démarrer le serveur dans un thread séparé
    thread = threading.Thread(target=start_server)
    thread.daemon = True
    thread.start()

    # Style CSS amélioré avec arrière-plan
    html_content = f'''
    <div style="
        padding: 20px;
        background: #f5f5f5;
        border-radius: 10px;
        box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
    ">
        <iframe 
            src="http://localhost:8000/{html_file}" 
            width="100%" 
            height="800px"
            style="
                border: none;
                border-radius: 8px;
                background: white;
            "
        ></iframe>
    </div>
    '''

    display(HTML(html_content))

In [None]:
display_html_report(report_path)