In [50]:
import pickle
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Lasso, Ridge, ElasticNet, LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.preprocessing import FunctionTransformer, KBinsDiscretizer, StandardScaler, MinMaxScaler, PolynomialFeatures, RobustScaler, OneHotEncoder

Importation des modules et rechargement du module helpers :

In [51]:
import importlib
import helpers

importlib.reload(helpers)
from helpers import categorize_imc

Chargement des données depuis un fichier CSV :

In [52]:
data_path = "Dataset_Brief.csv"
df = pd.read_csv(data_path)

Calcul du taux de valeurs manquantes et suppression des lignes avec des valeurs manquantes :

In [53]:
missing_rate = df.isna().sum()/df.shape[0]
missing_rate
df = df.dropna()

Identification et suppression des doublons :

In [54]:
duplicates = df[df.duplicated(keep=False)]
print(duplicates)
index_to_remove = 581
df = df.drop(index_to_remove)
df = df.reset_index(drop=True)
print(df.shape)

     age   sex    bmi  children smoker     region    charges
195   19  male  30.59         0     no  northwest  1639.5631
581   19  male  30.59         0     no  northwest  1639.5631
(1337, 7)


Définition des catégories pour la catégorisation de l'IMC (Indice de Masse Corporelle) :

In [55]:
categories = {
    'Underweight': (0, 18.5),
    'Normal Weight': (18.5, 24.9),
    'Overweight': (25, 29.9),
    'Obesity Class I': (30, 34.9),
    'Obesity Class II': (35, 39.9),
    'Obesity Class III': (40, float('inf'))
}

Catégorisation de l'IMC et suppression de la colonne 'bmi' :

In [56]:
df['imc_category'] = df['bmi'].apply(categorize_imc, categories=categories)
df = df.drop('bmi', axis=1)
df

Unnamed: 0,age,sex,children,smoker,region,charges,imc_category
0,19,female,0,yes,southwest,16884.92400,Overweight
1,18,male,1,no,southeast,1725.55230,Obesity Class I
2,28,male,3,no,southeast,4449.46200,Obesity Class I
3,33,male,0,no,northwest,21984.47061,Normal Weight
4,32,male,0,no,northwest,3866.85520,Overweight
...,...,...,...,...,...,...,...
1332,50,male,3,no,northwest,10600.54830,Obesity Class I
1333,18,female,0,no,northeast,2205.98080,Obesity Class I
1334,18,female,0,no,southeast,1629.83350,Obesity Class II
1335,21,female,0,no,southwest,2007.94500,Overweight


Sélection des caractéristiques (features) :

Séparation des données en ensembles d'entraînement et de test :

In [57]:
X = df.drop('charges', axis=1)
y = df.charges
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, train_size=0.85, random_state=42, stratify=X['smoker'])

Identification des colonnes numériques et catégorielles :

In [58]:
num_col = list(X.select_dtypes(include=[float,int]).columns)
cat_col = list(X.select_dtypes(include=[object]).columns)

Création des pipelines de prétraitement :

In [59]:
onehotscale_pipeline = make_pipeline(OneHotEncoder(), RobustScaler(with_centering=False))
scale_pipeline = make_pipeline(RobustScaler(with_centering=False))

Prétraitement des données avec des pipelines :

In [60]:
preprocessing = ColumnTransformer(
    transformers=[
        ('categorical', onehotscale_pipeline, cat_col),
        ('numerical', scale_pipeline, num_col)]
)

Définition de la grille de recherche d'hyperparamètres :

In [61]:
param_grid = {'lasso__alpha': list(range(36, 40))}

Construction du pipeline final :

In [62]:
polyscale_pipeline = make_pipeline(PolynomialFeatures(2))
my_final_pipeline = make_pipeline(preprocessing, polyscale_pipeline)

Entraînement du pipeline final sur les données d'entraînement :

In [63]:
my_final_pipeline.fit(X_train)

In [64]:
feature_names = my_final_pipeline.get_feature_names_out(X.columns)
feature_names

array(['1', 'categorical__sex_female', 'categorical__sex_male',
       'categorical__smoker_no', 'categorical__smoker_yes',
       'categorical__region_northeast', 'categorical__region_northwest',
       'categorical__region_southeast', 'categorical__region_southwest',
       'categorical__imc_category_Normal Weight',
       'categorical__imc_category_Obesity Class I',
       'categorical__imc_category_Obesity Class II',
       'categorical__imc_category_Obesity Class III',
       'categorical__imc_category_Overweight',
       'categorical__imc_category_Underweight',
       'categorical__imc_category_None', 'numerical__age',
       'numerical__children', 'categorical__sex_female^2',
       'categorical__sex_female categorical__sex_male',
       'categorical__sex_female categorical__smoker_no',
       'categorical__sex_female categorical__smoker_yes',
       'categorical__sex_female categorical__region_northeast',
       'categorical__sex_female categorical__region_northwest',
       'c

Création du modèle de régression Lasso avec GridSearchCV :

In [65]:
model = make_pipeline(
    my_final_pipeline,
    Lasso(random_state=42, max_iter=100000)
)

Entraînement du modèle Lasso avec recherche de grille :

In [66]:
model = GridSearchCV(model, param_grid,cv =5)
model.fit(X_train, y_train)

Évaluation du modèle sur les données d'entraînement :

In [67]:
grid_score = model.score(X_train, y_train)
best_alpha = model.best_params_['lasso__alpha']

Prédiction sur les données de test :

In [68]:
y_pred = model.predict(X_test)

Évaluation du modèle :

In [69]:
r2_cleaned = r2_score(y_test, y_pred)
mse_cleaned = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse_cleaned = np.sqrt(mse_cleaned)

print("Performance du modèle après suppression des valeurs extrêmes:")
print(grid_score)
print(model.best_params_)
print(f"R2 Score: {r2_cleaned:.4f}")
print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse_cleaned:.2f}")

Performance du modèle après suppression des valeurs extrêmes:
0.8559809626895564
{'lasso__alpha': 36}
R2 Score: 0.9231
MAE: 2051.44
RMSE: 3338.52


Sauvegarde du modèle :

In [70]:
with open('modele.pkl', 'wb') as file:
    pickle.dump(model, file)