In [1]:
import os
import pickle
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import resample
from sklearn.linear_model import Lasso, Ridge, ElasticNet, LinearRegression
from sklearn.preprocessing import FunctionTransformer, KBinsDiscretizer, StandardScaler, MinMaxScaler, PolynomialFeatures, RobustScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from helpers import select_bmi_column, transform_bmi_features
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_selection import f_classif, chi2, SelectKBest



In [2]:
# os.chdir('/home/utilisateur/projet/Assurance/Assurance_FE/')

Chargement des données depuis un fichier CSV :

In [3]:
# Load the raw dataset; the bare file name is resolved against the
# notebook's current working directory.
data_path = "Dataset_Brief.csv"
df = pd.read_csv(filepath_or_buffer=data_path)

Calcul du taux de valeurs manquantes :

In [4]:
# Share of missing values per column: the mean of the boolean NA mask is
# the NA count divided by the number of rows.
missing_rate = df.isna().mean()
missing_rate

age         0.0
sex         0.0
bmi         0.0
children    0.0
smoker      0.0
region      0.0
charges     0.0
dtype: float64

Suppression des lignes contenant des valeurs manquantes :

In [5]:
# Keep only rows in which every column is populated.
df = df.dropna(axis=0, how="any")

Identification et suppression des doublons :

In [6]:
# Two rows are considered duplicates when all six feature columns match,
# whatever value is recorded in `charges`.
duplicate_key = ["age", "sex", "bmi", "children", "smoker", "region"]

# Build the mask with the same key columns and the same keep policy as the
# removal below, so `duplicates_data` lists exactly the rows being discarded.
duplicates = df.duplicated(subset=duplicate_key, keep='first')
duplicates_data = df[duplicates]

# Keep the first occurrence of each feature combination.
df = df.drop_duplicates(subset=duplicate_key, keep='first')

In [7]:
# BMI class edges. With right=False each bin is the half-open interval
# [low, high), so the edges must be the class lower bounds 18.5 / 25 / 30 / 35.
# Using 24.9 / 29.9 / 34.9 as edges would push values such as 24.95 or 29.95
# into the next (heavier) class.
bins = [0, 18.5, 25, 30, 35, float('inf')]

# One label per bin, ordered from the lowest to the highest BMI range.
labels = ['Insuffisance pondérale', 'Poids normal', 'Surpoids', 'Obésité modérée', 'Obésité sévère']

# Derive an ordered categorical column from the numeric BMI.
df['bmi_category'] = pd.cut(df['bmi'], bins=bins, labels=labels, right=False)

print(df)


      age     sex     bmi  children smoker     region      charges  \
0      19  female  27.900         0    yes  southwest  16884.92400   
1      18    male  33.770         1     no  southeast   1725.55230   
2      28    male  33.000         3     no  southeast   4449.46200   
3      33    male  22.705         0     no  northwest  21984.47061   
4      32    male  28.880         0     no  northwest   3866.85520   
...   ...     ...     ...       ...    ...        ...          ...   
1333   50    male  30.970         3     no  northwest  10600.54830   
1334   18  female  31.920         0     no  northeast   2205.98080   
1335   18  female  36.850         0     no  southeast   1629.83350   
1336   21  female  25.800         0     no  southwest   2007.94500   
1337   61  female  29.070         0    yes  northwest  29141.36030   

         bmi_category  
0            Surpoids  
1     Obésité modérée  
2     Obésité modérée  
3        Poids normal  
4            Surpoids  
...            

Sélection des caractéristiques (features) :

In [8]:
# Predictor columns fed to the model; `charges` is the regression target.
selected_features = [
    'age',
    'sex',
    'bmi',
    'children',
    'smoker',
    'region',
]
X, y = df[selected_features], df['charges']

Division des données en ensembles d'entraînement et de test :

In [9]:
# 85 % train / 15 % test, shuffled with a fixed seed and stratified on the
# `smoker` column so both subsets keep the same smoker proportion.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.85,
    shuffle=True,
    stratify=X['smoker'],
    random_state=42,
)

Identification des colonnes numériques et catégorielles :

In [10]:
# Column names grouped by dtype: float/int columns are handled as numeric,
# object columns as categorical.
num_col = X.select_dtypes(include=[float, int]).columns.tolist()
cat_col = X.select_dtypes(include=[object]).columns.tolist()

Création des pipelines de prétraitement :

In [11]:
# Both branches scale with RobustScaler and centering disabled.
# Numeric branch: scaling only.
scale_pipeline = make_pipeline(RobustScaler(with_centering=False))

# Categorical branch: one-hot encode first, then apply the same scaler.
onehotscale_pipeline = make_pipeline(
    OneHotEncoder(),
    RobustScaler(with_centering=False),
)

ColumnTransformer (preprocessing) :

In [12]:
# Route each column group to its own sub-pipeline. The step names
# ('categorical', 'numerical') are kept as-is because they are part of the
# nested parameter names of the fitted estimator.
# No transformer is declared for 'bmi_category': that column is not among
# the features selected into X.
preprocessing = ColumnTransformer(
    [
        ('categorical', onehotscale_pipeline, cat_col),
        ('numerical', scale_pipeline, num_col),
    ]
)

Définition de la grille de recherche d'hyperparamètres :

In [13]:
# Candidate regularisation strengths for the pipeline step named 'lasso'.
param_grid = dict(lasso__alpha=[1, 33, 50, 100])

Création d'un pipeline pour les caractéristiques polynomiales :

In [14]:
# Degree-2 polynomial expansion of the preprocessed features.
polyscale_pipeline = make_pipeline(PolynomialFeatures(degree=2))

Création du pipeline final :

In [15]:
# Full feature pipeline: column-wise preprocessing, then polynomial expansion.
my_final_pipeline = make_pipeline(
    preprocessing,
    polyscale_pipeline,
)

Entraînement du pipeline final sur les données d'entraînement :

In [16]:
# Fit the feature pipeline on the training features only (no target passed).
# NOTE(review): GridSearchCV clones its estimator before fitting, so this fit
# is presumably only for inspection; confirm it is still needed.
my_final_pipeline.fit(X=X_train)

Création du modèle (pipeline de prétraitement suivi d'une régression Lasso) :

In [17]:
# Feature pipeline followed by a Lasso regressor. make_pipeline names the last
# step 'lasso', which is the prefix used by the keys of param_grid.
model = make_pipeline(
    my_final_pipeline,
    Lasso(max_iter=100_000, random_state=42),
)

Recherche par grille (GridSearchCV) pour trouver le meilleur hyperparamètre :

In [18]:
# Exhaustive search over param_grid with 5-fold cross-validation on the
# training split.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

Évaluation du modèle sur les données d'entraînement :

In [19]:
# Alpha selected by the grid search, and the score of the fitted search
# object on the training split.
best_alpha = grid_search.best_params_['lasso__alpha']
grid_score = grid_search.score(X_train, y_train)

Prédiction sur les données de test :

In [20]:
# Predict on the held-out split with the estimator refitted on the best
# parameters (the estimator GridSearchCV.predict delegates to).
y_pred = grid_search.best_estimator_.predict(X_test)

Évaluation de la performance du modèle sur les données de test :

In [21]:
# Held-out metrics for the selected model.
mae = mean_absolute_error(y_test, y_pred)
mse_cleaned = mean_squared_error(y_test, y_pred)
rmse_cleaned = np.sqrt(mse_cleaned)  # back in the units of `charges`
r2_cleaned = r2_score(y_test, y_pred)

# NOTE(review): the header below mentions removal of extreme values, but no
# outlier-removal step is visible in this notebook; confirm the wording.
print(
    "Performance du modèle après suppression des valeurs extrêmes:",
    grid_search.best_params_,
    f"R2 Score: {r2_cleaned:.4f}",
    f"MAE: {mae:.2f}",
    f"RMSE: {rmse_cleaned:.2f}",
    sep="\n",
)

Performance du modèle après suppression des valeurs extrêmes:
{'lasso__alpha': 33}
R2 Score: 0.8629
MAE: 2660.55
RMSE: 4466.03


Sauvegarde du modèle :

In [22]:
# Persist the whole fitted GridSearchCV object (search results and refitted
# best pipeline) to a pickle file in the working directory.
with open('modele.pkl', mode='wb') as model_file:
    pickle.dump(grid_search, model_file)