In [1]:
import src.data.make_dataset as md
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import src.models.models as model
import os
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Load the cardio dataset through the project's cleaning helper and keep the
# result as `data`, the untouched starting point for the cells below.
# NOTE(review): the file name is relative, so where it resolves depends on how
# md.data_cleaning builds its path (presumably the kernel's working directory) — confirm.
data = md.data_cleaning("cardio_train.csv")

In [3]:
# Feature engineering: derive two composite features from the raw measurements,
# then discard the four columns they were computed from. `data` itself is left
# unmodified because `assign` works on a copy.
#   - BMI            = weight / (height / 100) ** 2   (height presumably in cm — confirm)
#   - Pulse Pressure = ap_hi - ap_lo
df = data.assign(
    **{
        "BMI": data["weight"] / ((data["height"] / 100) ** 2),
        "Pulse Pressure": data["ap_hi"] - data["ap_lo"],
    }
).drop(columns=["weight", "height", "ap_hi", "ap_lo"])

In [4]:
# Train / test split

# 1. Separate the target (y) from the features (X)
y = df["cardio"]               # the response (0 or 1)
X = df.drop(columns="cardio")  # every column except the response

# 2. Stratified hold-out split
# stratify=y keeps the same proportion of positive cases in the train and test
# partitions, which matters for medical data. random_state pins the shuffle so
# the split is reproducible; test_size=0.2 reserves 20% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Report the shape of each partition as a quick sanity check.
print(f"Taille Train : {X_train.shape}")
print(f"Taille Test  : {X_test.shape}")

Taille Train : (55869, 9)
Taille Test  : (13968, 9)


In [5]:
# Normalization
# Pass both feature partitions through the project helper and rebind the results
# to the same names. Because X_train / X_test are overwritten, re-running this
# cell on its own feeds already-normalized data back into the helper.
# NOTE(review): presumably the scaling statistics are fitted on the training
# partition only — confirm in md.data_normalization.
X_train, X_test = md.data_normalization(X_train, X_test)
# `df` is not passed to data_normalization, so this preview shows the engineered
# frame before scaling (raw values, and it still contains the `cardio` target).
df.head()

Unnamed: 0_level_0,age,gender,cholesterol,gluc,smoke,alco,active,cardio,BMI,Pulse Pressure
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,50,2,1,1,0,0,1,0,21.96712,30
1,55,1,3,1,0,0,1,1,34.927679,50
2,51,1,3,1,0,0,0,1,23.507805,60
3,48,2,1,1,0,0,1,1,28.710479,50
4,47,1,1,1,0,0,0,0,23.011177,40


In [6]:
# Hyper-parameter search space for each candidate classifier.
# Each entry maps a display name to its base estimator ('model') and the grid of
# values to search ('params'); the whole dict is handed to model.run_models.
# Every estimator with a stochastic component gets a fixed random_state so that
# Restart & Run All reproduces the same fitted models and scores.
models_config = {
    'LogisticRegression': {
        # random_state is used by the 'liblinear' solver, which shuffles the data.
        'model': LogisticRegression(max_iter=1000, random_state=42),
        'params': {
            'C': [0.1, 1, 10],
            'solver': ['liblinear', 'lbfgs'],
        },
    },
    'KNeighbors': {
        'model': KNeighborsClassifier(),
        'params': {
            'n_neighbors': [3, 5, 7, 9],
            'weights': ['uniform', 'distance'],
        },
    },
    'SVM_Stochastic': {
        # loss='hinge' makes SGDClassifier behave as a linear SVM.
        'model': SGDClassifier(loss='hinge', random_state=42, n_jobs=-1),
        'params': {
            # alpha plays the role of SVC's 'C' (inversely proportional to it).
            'alpha': [1e-4, 1e-3, 1e-2],
            'penalty': ['l2', 'l1'],
        },
    },
    # NOTE(review): despite the key, this is scikit-learn's
    # GradientBoostingClassifier, not the xgboost library. The key is left
    # unchanged because downstream helpers may look results up by this name.
    'XGBoost': {
        # random_state seeds each tree and the per-split feature permutation.
        'model': GradientBoostingClassifier(random_state=42),
        'params': {
            'n_estimators': [50, 100],
            'learning_rate': [0.01, 0.1],
        },
    },
}

In [7]:
# Run the hyper-parameter search for every entry in models_config.
# The training partition is used for fitting; the held-out test partition is
# passed through the helper's X_pred / y_pred arguments.
results = model.run_models(
    models_config,
    X_train=X_train,
    y_train=y_train,
    X_pred=X_test,
    y_pred=y_test,
)

Fitting 5 folds for each of 6 candidates, totalling 30 fits




Best Parameters: {'solver': 'liblinear', 'C': 0.1}
Best CV Score: 0.6920
Held-out Test Accuracy: 0.6987
Fitting 5 folds for each of 8 candidates, totalling 40 fits




Best Parameters: {'weights': 'uniform', 'n_neighbors': 9}
Best CV Score: 0.6726
Held-out Test Accuracy: 0.6723
Fitting 5 folds for each of 6 candidates, totalling 30 fits




Best Parameters: {'penalty': 'l1', 'alpha': 0.0001}
Best CV Score: 0.6891
Held-out Test Accuracy: 0.6896
Fitting 5 folds for each of 4 candidates, totalling 20 fits




Best Parameters: {'n_estimators': 100, 'learning_rate': 0.1}
Best CV Score: 0.7012
Held-out Test Accuracy: 0.7054


In [8]:
# Build a stacking model from the search results produced by run_models,
# using the same train / held-out partitions as the individual models.
meta = model.train_stacking_model(
    results,
    X_train=X_train,
    y_train=y_train,
    X_pred=X_test,
    y_pred=y_test,
)


 CONSTRUCTION DU STACKING MODEL
Modèles de base utilisés : ['lr', 'knn', 'svm']
Entraînement du méta-modèle en cours...
Stacking Train Score: 0.7071
Stacking Test Accuracy: 0.7017
