In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from xgboost import XGBClassifier
import shap

# 1. LOAD THE ALREADY-PROCESSED FILE
# Note: the path is relative to the notebook's working directory; make sure it
# resolves to the processed training CSV.
df = pd.read_csv('../../data/processed/train_processed.csv')

print("‚úÖ Donn√©es trait√©es charg√©es !")
# DataFrame.info() writes its report to stdout and returns None, so it is called
# directly: wrapping it in print() would emit a stray "None" after the report.
# Use the report to check that every column is numeric (float/int).
df.info()

# 2. TARGET / FEATURE SEPARATION
target_col = 'heart_disease'

# If the target is not numeric yet (e.g. "Malade"/"Sain"), encode it as 0/1 with
# an explicit, case-insensitive mapping. Every label must be recognised: sending
# unknown labels to 0 by default can silently collapse the whole target into a
# single class, which makes training and every metric meaningless.
# NOTE(review): 'presence'/'absence' are assumed spellings for this dataset --
# confirm against the actual values in train_processed.csv.
if not pd.api.types.is_numeric_dtype(df[target_col]):
    label_map = {
        'malade': 1, 'yes': 1, '1': 1, 'presence': 1,
        'sain': 0, 'no': 0, '0': 0, 'absence': 0,
    }
    normalized = df[target_col].astype(str).str.strip().str.lower()
    encoded = normalized.map(label_map)
    if encoded.isna().any():
        unknown = sorted(normalized[encoded.isna()].unique())
        raise ValueError(
            f"Unrecognised labels in '{target_col}': {unknown}. "
            "Add them to label_map before training."
        )
    df[target_col] = encoded.astype(int)

y = df[target_col]
if y.nunique() < 2:
    # A single-class target cannot train a useful classifier, and ROC AUC is
    # undefined for it.
    raise ValueError(
        f"'{target_col}' contains a single class ({y.unique().tolist()}); "
        "check the label encoding."
    )

# 'id' is presumably a row identifier rather than a predictor (TODO confirm);
# it is excluded so the model cannot fit on row order. errors='ignore' keeps
# this working when the file has no 'id' column.
X = df.drop(columns=[target_col, 'id'], errors='ignore')

# 3. TRAIN / TEST SPLIT
# Stratified on y so both splits keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 4. TRAINING (no cleaning pipeline needed: the data is already processed)
print("\nüöÄ D√©marrage de l'entra√Ænement XGBoost sur donn√©es pr√©-trait√©es...")

# Hyperparameters gathered in one place so they are easy to read and tweak.
xgb_params = dict(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=5,
    eval_metric='logloss',
    random_state=42,
)

model = XGBClassifier(**xgb_params)
model.fit(X_train, y_train)
print("‚úÖ Mod√®le entra√Æn√© !")

# 5. EVALUATION
# Scores first (second column of predict_proba, fed to the AUC below), then the
# hard class predictions used by the classification report.
y_prob = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

print("\n--- R√âSULTATS ---")
report_text = classification_report(y_test, y_pred)
print(report_text)

auc_score = roc_auc_score(y_test, y_prob)
print(f"üéØ SCORE AUC : {auc_score:.4f}")

# 6. SHAP (interpretability)
# The model is a bare XGBClassifier (no preprocessing pipeline), so it can be
# passed to the tree explainer directly.
#
# shap.TreeExplainer parses the booster's saved configuration. With some
# shap/xgboost version combinations the base_score is serialised as a bracketed
# string (e.g. '[0E0]') that shap cannot convert to float, raising ValueError.
# In that case fall back to XGBoost's own per-feature contributions, which need
# no config parsing.
try:
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_test)
except ValueError:
    from xgboost import DMatrix  # local import: only needed on the fallback path

    explainer = None  # no shap explainer object is available on this path
    contribs = model.get_booster().predict(DMatrix(X_test), pred_contribs=True)
    # pred_contribs returns one column per feature plus a trailing bias column;
    # drop the bias so the array lines up with X_test's columns.
    shap_values = contribs[:, :-1]

plt.figure()
shap.summary_plot(shap_values, X_test, plot_type="bar")
plt.show()

‚úÖ Donn√©es trait√©es charg√©es !
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 630000 entries, 0 to 629999
Data columns (total 18 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   id                       630000 non-null  int64  
 1   age                      630000 non-null  int64  
 2   bp                       630000 non-null  int64  
 3   cholesterol              630000 non-null  int64  
 4   max_hr                   630000 non-null  int64  
 5   st_depression            630000 non-null  float64
 6   number_of_vessels_fluro  630000 non-null  int64  
 7   heart_disease            630000 non-null  object 
 8   risk_ratio               630000 non-null  float64
 9   sex_X1                   630000 non-null  int64  
 10  chest_pain_type_X2       630000 non-null  int64  
 11  chest_pain_type_X3       630000 non-null  int64  
 12  chest_pain_type_X4       630000 non-null  int64  
 13  fbs_over_120_X1         



ValueError: could not convert string to float: '[0E0]'