In [1]:
# Cell 1: imports & config

# Seed passed as random_state to both train_test_split calls below, so the
# train / val / test partition is reproducible.
RANDOM_STATE = 42

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report, roc_auc_score, ConfusionMatrixDisplay
)

# NOTE(review): lgb is not referenced directly in the visible cells;
# presumably kept because the model loaded with joblib is a LightGBM
# estimator — confirm before removing.
import lightgbm as lgb
import joblib

# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option("display.max_columns", None)

print("OK imports.")


OK imports.


In [2]:
# Cell 2 — Load the prepared NHANES CSV

# The file is read from the notebook's working directory.
df = pd.read_csv("nhanes_ready.csv")
print("Shape df :", df.shape)

# Preview the first three rows.
df.iloc[:3]


Shape df : (6889, 38)


Unnamed: 0,SEQN,RIDAGEYR,RIAGENDR,LBXTC,BMXHT,BMXBMI,BMXWT,BMXWAIST,BMXARMC,PAD680,SLD012,DRKCAL,DRPROT,DRSUGR,DRFIBE,DRTFAT,DRTALCO,DRCARB,DRWATER,smoker,diabetes,PAD680_log,DRTALCO_log,DRWATER_log,DRSUGR_log,DRFIBE_log,DRCARB_log,DRPROT_log,DRTFAT_log,DRKCAL_log,BMXBMI_log,ethnicity_MexicanAmerican,ethnicity_NonHispanicAsian,ethnicity_NonHispanicBlack,ethnicity_NonHispanicWhite,ethnicity_OtherHispanic,ethnicity_OtherRace,chol_high
0,130378.0,43.0,1,264.0,179.5,27.0,86.9,98.3,35.7,360.0,9.5,1390.0,88.525,22.06,5.95,47.42,22.8,109.785,1020.0,1,0,5.888878,3.169686,6.928538,3.1381,1.938742,4.707591,4.494518,3.879913,7.237778,3.332205,False,True,False,False,False,False,1
1,130379.0,66.0,1,214.0,174.2,33.5,101.8,114.7,33.7,480.0,9.0,2484.5,82.62,113.495,24.15,73.19,60.3,280.44,120.0,1,0,6.175867,4.11578,4.795791,4.740531,3.224858,5.639919,4.426283,4.306629,7.818229,3.540959,False,False,False,True,False,False,0
2,130380.0,44.0,2,187.0,152.9,29.7,69.4,93.5,36.3,240.0,8.0,1735.0,67.115,110.92,21.4,55.685,5.397605e-79,245.325,1747.5,0,1,5.484797,5.397605e-79,7.466514,4.717784,3.109061,5.506652,4.221197,4.03751,7.459339,3.424263,False,False,False,False,True,False,0


In [3]:
# Cell 3 — Feature definition (identical to model 12)

USE_ETHNICITY = True
target = "chol_high"

# Feature groups; the ethnicity one-hot columns are optional.
core = ["RIDAGEYR", "RIAGENDR", "smoker"]
morpho = ["BMXBMI_log", "BMXWAIST"]
lifestyle = ["PAD680_log", "SLD012"]

ethn = [
    "ethnicity_MexicanAmerican", "ethnicity_OtherHispanic",
    "ethnicity_NonHispanicWhite", "ethnicity_NonHispanicBlack",
    "ethnicity_NonHispanicAsian", "ethnicity_OtherRace"
]

candidate_feats = core + morpho + lifestyle + (ethn if USE_ETHNICITY else [])

# Keep only the candidates actually present in df; report the others
# instead of failing.
features = [name for name in candidate_feats if name in df.columns]
missing = [name for name in candidate_feats if name not in df.columns]
if missing:
    print("Colonnes absentes (ignorées) :", missing)

X_full = df.loc[:, features].copy()
y = df[target].astype(int)

print(f"X_full : {X_full.shape} | y pos rate : {y.mean():.3f}")
print("Features utilisées :", features)


X_full : (6889, 13) | y pos rate : 0.090
Features utilisées : ['RIDAGEYR', 'RIAGENDR', 'smoker', 'BMXBMI_log', 'BMXWAIST', 'PAD680_log', 'SLD012', 'ethnicity_MexicanAmerican', 'ethnicity_OtherHispanic', 'ethnicity_NonHispanicWhite', 'ethnicity_NonHispanicBlack', 'ethnicity_NonHispanicAsian', 'ethnicity_OtherRace']


In [4]:
# Cell 4 — Stratified train / val / test split (same parameters as model 12)

# Step 1: hold out 20 % of the rows as the test set.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X_full,
    y,
    test_size=0.20,
    stratify=y,
    random_state=RANDOM_STATE,
)

# Step 2: take 25 % of the remaining rows as validation
# (i.e. 60 / 20 / 20 overall).
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    test_size=0.25,
    stratify=y_train_full,
    random_state=RANDOM_STATE,
)

print("=== Répartition ===")
print(f"Train : {len(X_train)} | Val : {len(X_val)} | Test : {len(X_test)}")
# Positive rate per subset, to check that stratification kept them aligned.
print(
    "Taux pos — Train/Val/Test :",
    *(round(labels.mean(), 3) for labels in (y_train, y_val, y_test)),
)


=== Répartition ===
Train : 4133 | Val : 1378 | Test : 1378
Taux pos — Train/Val/Test : 0.09 0.09 0.089


In [5]:
# Cell 5 — Binary vs. numeric columns + loading of the scaler & model

# Binary / indicator columns; every other feature column is treated as
# numeric and goes through the scaler.
binary_cols = ["RIAGENDR", "smoker"] + (list(ethn) if USE_ETHNICITY else [])
binary_cols = [name for name in binary_cols if name in X_train.columns]
num_cols = [name for name in X_train.columns if name not in binary_cols]

print(f"Colonnes binaires ({len(binary_cols)}) :", binary_cols)
print(f"Colonnes standardisées ({len(num_cols)}) :", num_cols)

# Load the scaler and the model saved by notebook 12.
# NOTE(review): joblib.load unpickles its input; only load artefacts that
# come from a trusted source.
scaler = joblib.load("artifacts_chol_no_bias/scaler_chol_no_bias.pkl")
model = joblib.load("artifacts_chol_no_bias/model_lightgbm_chol_no_bias.pkl")

print("Scaler et modèle chargés.")


Colonnes binaires (8) : ['RIAGENDR', 'smoker', 'ethnicity_MexicanAmerican', 'ethnicity_OtherHispanic', 'ethnicity_NonHispanicWhite', 'ethnicity_NonHispanicBlack', 'ethnicity_NonHispanicAsian', 'ethnicity_OtherRace']
Colonnes standardisées (5) : ['RIDAGEYR', 'BMXBMI_log', 'BMXWAIST', 'PAD680_log', 'SLD012']
Scaler et modèle chargés.


In [6]:
# Cell 6 — Standardise X_test (with the scaler from model 12)

# Only transform (never fit) here: the scaler was fitted elsewhere.
scaled_values = scaler.transform(X_test[num_cols])

# Work on a copy so X_test itself keeps the raw values.
X_test_scaled = X_test.copy()
X_test_scaled[num_cols] = scaled_values

print("X_test_scaled OK. Shape :", X_test_scaled.shape)


X_test_scaled OK. Shape : (1378, 13)


In [7]:
# Cell 7 — Model probabilities on the test set

from sklearn.metrics import confusion_matrix

# 1) Probability of the positive class (second column of predict_proba)
y_test_proba = model.predict_proba(X_test_scaled)[:, 1]

# 2) Hard predictions at the default 0.50 cut-off
at_or_above_half = y_test_proba >= 0.5
y_test_pred_05 = at_or_above_half.astype(int)

print("\n=== RAPPORT (seuil 0.50) ===")
print(classification_report(y_test, y_test_pred_05, digits=3))

print("Matrice de confusion (0.5) :")
print(confusion_matrix(y_test, y_test_pred_05))



=== RAPPORT (seuil 0.50) ===
              precision    recall  f1-score   support

           0      0.957     0.585     0.726      1255
           1      0.147     0.732     0.245       123

    accuracy                          0.598      1378
   macro avg      0.552     0.658     0.486      1378
weighted avg      0.885     0.598     0.683      1378

Matrice de confusion (0.5) :
[[734 521]
 [ 33  90]]


In [8]:
# Cell 8 — Threshold evaluation function

from sklearn.metrics import (
    precision_score, recall_score, f1_score,
    classification_report, confusion_matrix
)

def evaluate_threshold(threshold, y_true, probas):
    """
    Compute the main classification metrics for one decision threshold.

    Parameters
    ----------
    threshold : float
        Cut-off applied to ``probas``; a sample is predicted positive (1)
        when its probability is greater than or equal to this value.
    y_true : array-like
        Ground-truth binary labels.
    probas : array-like
        Predicted probabilities of the positive class, aligned with
        ``y_true``. Plain lists are accepted as well as arrays / Series.

    Returns
    -------
    dict
        ``threshold`` (the value passed in), ``precision``, ``recall`` and
        ``f1`` (computed with scikit-learn's default binary settings),
        ``report`` (text of ``classification_report`` with 3 digits) and
        ``cm`` (confusion matrix).
    """
    # np.asarray makes the comparison work for lists too, not only for
    # objects that already support element-wise ``>=``.
    y_pred = (np.asarray(probas) >= threshold).astype(int)

    # zero_division=0 everywhere: at extreme thresholds one class may never
    # be predicted; report 0 for the undefined metric instead of emitting
    # an UndefinedMetricWarning.
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    # Same policy as the scalar metrics above, so the textual report no
    # longer warns when a class has no predicted samples.
    report = classification_report(y_true, y_pred, digits=3, zero_division=0)
    cm = confusion_matrix(y_true, y_pred)

    return {
        "threshold": threshold,
        "precision": prec,
        "recall": rec,
        "f1": f1,
        "report": report,
        "cm": cm,
    }


In [9]:
# Cell 9 — Evaluate a grid of thresholds

# NOTE(review): the thresholds are compared on the test set (y_test /
# y_test_proba), while X_val / y_val from the split cell are not used in
# the visible cells. Picking an operating threshold from these numbers makes
# the reported test metrics optimistic — consider tuning on validation and
# keeping the test set for the final report.

# 0.05, 0.10, ..., 0.90 (18 values). Built from integers and divided by 100
# so every threshold is the exact float of its 2-decimal value; a float-step
# np.arange yields drifted values such as 0.15000000000000002 and a length
# that depends on rounding.
thresholds = np.arange(5, 95, 5) / 100

results = []
for th in thresholds:
    res = evaluate_threshold(th, y_test, y_test_proba)
    results.append(res)
    print("=" * 60)
    print(f"Seuil = {th:.2f}")
    print(res["report"])

# Summary DataFrame with the scalar metrics only (report text and confusion
# matrix stay available in `results`).
summary_keys = ("threshold", "precision", "recall", "f1")
df_thr = pd.DataFrame([{key: r[key] for key in summary_keys} for r in results])

df_thr


Seuil = 0.05
              precision    recall  f1-score   support

           0      0.000     0.000     0.000      1255
           1      0.089     1.000     0.164       123

    accuracy                          0.089      1378
   macro avg      0.045     0.500     0.082      1378
weighted avg      0.008     0.089     0.015      1378

Seuil = 0.10
              precision    recall  f1-score   support

           0      0.985     0.107     0.193      1255
           1      0.097     0.984     0.177       123

    accuracy                          0.185      1378
   macro avg      0.541     0.545     0.185      1378
weighted avg      0.906     0.185     0.191      1378

Seuil = 0.15
              precision    recall  f1-score   support

           0      0.992     0.193     0.323      1255
           1      0.107     0.984     0.193       123

    accuracy                          0.263      1378
   macro avg      0.549     0.588     0.258      1378
weighted avg      0.913     0.263  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])



Seuil = 0.20
              precision    recall  f1-score   support

           0      0.986     0.284     0.442      1255
           1      0.116     0.959     0.207       123

    accuracy                          0.345      1378
   macro avg      0.551     0.622     0.324      1378
weighted avg      0.909     0.345     0.421      1378

Seuil = 0.25
              precision    recall  f1-score   support

           0      0.980     0.354     0.520      1255
           1      0.123     0.927     0.218       123

    accuracy                          0.405      1378
   macro avg      0.552     0.640     0.369      1378
weighted avg      0.904     0.405     0.493      1378

Seuil = 0.30
              precision    recall  f1-score   support

           0      0.975     0.396     0.563      1255
           1      0.127     0.894     0.222       123

    accuracy                          0.440      1378
   macro avg      0.551     0.645     0.393      1378
weighted avg      0.899     0.440 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Unnamed: 0,threshold,precision,recall,f1
0,0.05,0.08926,1.0,0.163891
1,0.1,0.097424,0.98374,0.177289
2,0.15,0.106702,0.98374,0.192522
3,0.2,0.116142,0.95935,0.207199
4,0.25,0.123243,0.926829,0.217557
5,0.3,0.126728,0.894309,0.221998
6,0.35,0.130381,0.861789,0.226496
7,0.4,0.136364,0.829268,0.234214
8,0.45,0.141813,0.788618,0.240397
9,0.5,0.1473,0.731707,0.245232


In [10]:
# Cell 10: confusion matrix at threshold = 0.35

threshold = 0.35

# Positive when the predicted probability reaches the threshold.
y_pred_035 = np.greater_equal(y_test_proba, threshold).astype(int)

cm_035 = confusion_matrix(y_true=y_test, y_pred=y_pred_035)
print(cm_035)


[[548 707]
 [ 17 106]]
