In [1]:
import pickle
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Lasso, Ridge, ElasticNet, LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.preprocessing import FunctionTransformer, KBinsDiscretizer, StandardScaler, MinMaxScaler, PolynomialFeatures, RobustScaler, OneHotEncoder

Chargement des données depuis un fichier CSV :

In [2]:
dataset_path = "Dataset_Brief.csv"
df = pd.read_csv(dataset_path)

Calcul du taux de valeurs manquantes et suppression des lignes avec des valeurs manquantes :

In [3]:
missing_rate = df.isna().sum()/df.shape[0]
missing_rate

df = df.dropna()

Suppression des doublons :

In [4]:
duplicates = df[df.duplicated(keep=False)]
print(duplicates)
index_to_remove = 581
df = df.drop(index_to_remove)
df = df.reset_index(drop=True)
print(df.shape)

     age   sex    bmi  children smoker     region    charges
195   19  male  30.59         0     no  northwest  1639.5631
581   19  male  30.59         0     no  northwest  1639.5631
(1337, 7)


Conversion des valeurs "yes" et "no" en 1 et 0 :

In [5]:
df['smoker'] = df['smoker'].replace({'yes': 1, 'no': 0})
df['sex'] = df['sex'].replace({'female': 1, 'male': 0})

Catégorisation de l'IMC (Indice de Masse Corporelle) :

In [6]:
categories = {
    'Underweight': (0, 18.5),
    'Normal Weight': (18.5, 24.9),
    'Overweight': (25, 29.9),
    'Obesity Class I': (30, 34.9),
    'Obesity Class II': (35, 39.9),
    'Obesity Class III': (40, float('inf'))
}

def categorize_imc(bmi):
    for category, (lower, upper) in categories.items():
        if lower <= bmi <= upper:
            return category

df['imc_category'] = df['bmi'].apply(categorize_imc)
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,imc_category
0,19,1,27.900,0,1,southwest,16884.92400,Overweight
1,18,0,33.770,1,0,southeast,1725.55230,Obesity Class I
2,28,0,33.000,3,0,southeast,4449.46200,Obesity Class I
3,33,0,22.705,0,0,northwest,21984.47061,Normal Weight
4,32,0,28.880,0,0,northwest,3866.85520,Overweight
...,...,...,...,...,...,...,...,...
1332,50,0,30.970,3,0,northwest,10600.54830,Obesity Class I
1333,18,1,31.920,0,0,northeast,2205.98080,Obesity Class I
1334,18,1,36.850,0,0,southeast,1629.83350,Obesity Class II
1335,21,1,25.800,0,0,southwest,2007.94500,Overweight


Encodage des variables catégorielles :

In [7]:
object_encodage = ['region', 'imc_category']

for col in object_encodage:
    unique_values = df[col].unique()
    for value in unique_values:
        df[f"{col}_{value}"] = (df[col] == value).astype(int)

df = df.drop(object_encodage, axis=1)
df

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_southwest,region_southeast,region_northwest,region_northeast,imc_category_Overweight,imc_category_Obesity Class I,imc_category_Normal Weight,imc_category_Obesity Class II,imc_category_Obesity Class III,imc_category_Underweight,imc_category_None
0,19,1,27.900,0,1,16884.92400,1,0,0,0,1,0,0,0,0,0,0
1,18,0,33.770,1,0,1725.55230,0,1,0,0,0,1,0,0,0,0,0
2,28,0,33.000,3,0,4449.46200,0,1,0,0,0,1,0,0,0,0,0
3,33,0,22.705,0,0,21984.47061,0,0,1,0,0,0,1,0,0,0,0
4,32,0,28.880,0,0,3866.85520,0,0,1,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1332,50,0,30.970,3,0,10600.54830,0,0,1,0,0,1,0,0,0,0,0
1333,18,1,31.920,0,0,2205.98080,0,0,0,1,0,1,0,0,0,0,0
1334,18,1,36.850,0,0,1629.83350,0,1,0,0,0,0,0,1,0,0,0
1335,21,1,25.800,0,0,2007.94500,1,0,0,0,1,0,0,0,0,0,0


Réorganisation des colonnes :

Description statistique du DataFrame :

In [8]:
df["charges"] = df.pop("charges")
df.describe()

Unnamed: 0,age,sex,bmi,children,smoker,region_southwest,region_southeast,region_northwest,region_northeast,imc_category_Overweight,imc_category_Obesity Class I,imc_category_Normal Weight,imc_category_Obesity Class II,imc_category_Obesity Class III,imc_category_Underweight,imc_category_None,charges
count,1337.0,1337.0,1337.0,1337.0,1337.0,1337.0,1337.0,1337.0,1337.0,1337.0,1337.0,1337.0,1337.0,1337.0,1337.0,1337.0,1337.0
mean,39.222139,0.495138,30.663452,1.095737,0.204936,0.243082,0.272251,0.242334,0.242334,0.281975,0.289454,0.165295,0.166791,0.068063,0.015707,0.0,13279.121487
std,14.044333,0.500163,6.100468,1.205571,0.403806,0.429104,0.445285,0.428655,0.428655,0.450129,0.453679,0.371586,0.372929,0.251948,0.124385,0.0,12110.359656
min,18.0,0.0,15.96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1121.8739
25%,27.0,0.0,26.29,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4746.344
50%,39.0,0.0,30.4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9386.1613
75%,51.0,1.0,34.7,2.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,16657.71745
max,64.0,1.0,53.13,5.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,63770.42801


Sauvegarde du DataFrame nettoyé dans un nouveau fichier CSV :

In [9]:
df.to_csv('Clean_Dataset_Brief.csv', index=False)