In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from xgboost import XGBClassifier
import shap

# End-to-end notebook cell: load the pre-processed training table, fit an
# XGBoost classifier on an 80/20 stratified split, report classification
# metrics and ROC AUC on the held-out part, then plot global SHAP importances.

# 1. LOAD THE ALREADY-PROCESSED FILE
# NOTE: the path is relative to the current working directory; make sure it
# resolves to the processed training set.
df = pd.read_csv('../data/processed/train_processed.csv')

print("âœ… DonnÃ©es traitÃ©es chargÃ©es !")
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line. Read the report to check that
# every column is numeric (float/int).
df.info()

# 2. TARGET / FEATURE SEPARATION
target_col = 'heart_disease'

# If the target is still text (e.g. "Malade"/"Sain"), convert it to 0/1.
# Both checks are needed: text columns are `object` on classic pandas but use
# the dedicated string dtype when string inference is enabled, in which case a
# plain `dtype == 'object'` comparison is False and the encoding is skipped.
target_is_text = (
    pd.api.types.is_object_dtype(df[target_col])
    or pd.api.types.is_string_dtype(df[target_col])
)
if target_is_text:
    positive_labels = ['Malade', 'Yes', '1']
    # Vectorized membership test. Every value outside `positive_labels`
    # (missing values included) is encoded as 0, exactly like the previous
    # row-by-row lambda did.
    df[target_col] = df[target_col].isin(positive_labels).astype('int64')

X = df.drop(columns=[target_col])
y = df[target_col]

# 3. TRAIN / TEST SPLIT
# stratify=y keeps the class ratio identical in both parts; the fixed seed
# makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# 4. TRAINING (no cleaning pipeline needed: the data is already processed)
print("\nðŸš€ DÃ©marrage de l'entraÃ®nement XGBoost sur donnÃ©es prÃ©-traitÃ©es...")

model = XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=5,
    eval_metric='logloss',
    random_state=42,
)

model.fit(X_train, y_train)
print("âœ… ModÃ¨le entraÃ®nÃ© !")

# 5. EVALUATION
y_pred = model.predict(X_test)
# Column 1 of predict_proba is the probability of the positive class (label 1).
y_prob = model.predict_proba(X_test)[:, 1]

print("\n--- RÃ‰SULTATS ---")
print(classification_report(y_test, y_pred))

# AUC is computed from the probabilities, not from the hard predictions.
auc_score = roc_auc_score(y_test, y_prob)
print(f"ðŸŽ¯ SCORE AUC : {auc_score:.4f}")

# 6. SHAP (interpretability)
# Straightforward here because the model is not wrapped in a pipeline.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

plt.figure()
# show=False stops SHAP from displaying the figure itself, so the explicit
# plt.show() below is the single place where the plot is rendered.
shap.summary_plot(shap_values, X_test, plot_type="bar", show=False)
plt.show()