# Imports

In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Charger, observer et comprendre le dataset

In [13]:
df = pd.read_csv("insurance.csv")
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


# Transformer les données catégorielles en données numériques

In [14]:
from sklearn.preprocessing import OrdinalEncoder

ord_encoded = OrdinalEncoder()
df_categorielles = df.select_dtypes(include=['object', 'category']).columns
df[df_categorielles] = ord_encoded.fit_transform(df[df_categorielles])
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0.0,27.9,0,1.0,3.0,16884.924
1,18,1.0,33.77,1,0.0,2.0,1725.5523
2,28,1.0,33.0,3,0.0,2.0,4449.462
3,33,1.0,22.705,0,0.0,1.0,21984.47061
4,32,1.0,28.88,0,0.0,1.0,3866.8552


# Afficher et traiter les données manquantes
Pour traiter les données manquantes, regarder `SimpleImputer` ou `KNNImputer`

In [15]:
missing_values = df.isnull().sum()
print(missing_values)

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64


# Standardiser les données

In [16]:
features = df.columns[:-1]
X = df[features]
y = df['charges']

In [17]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X = scaler.fit_transform(X)
X_visu = pd.DataFrame(X, columns=features)
X_visu.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
0,-1.438764,-1.010519,-0.45332,-0.908614,1.970587,1.343905
1,-1.509965,0.989591,0.509621,-0.078767,-0.507463,0.438495
2,-0.797954,0.989591,0.383307,1.580926,-0.507463,0.438495
3,-0.441948,0.989591,-1.305531,-0.908614,-0.507463,-0.466915
4,-0.513149,0.989591,-0.292556,-0.908614,-0.507463,-0.466915


In [18]:
y.head()

Unnamed: 0,charges
0,16884.924
1,1725.5523
2,4449.462
3,21984.47061
4,3866.8552


# Séparer le dataset en train et test

In [19]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y)

results = []

# Régression Linéaire
Entrainer une régression linéaire avec les hyper-paramètres par défaut

In [20]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

lg = LinearRegression()
lg.fit(X_train, y_train)

y_pred_train = lg.predict(X_train)
y_pred_test = lg.predict(X_test)

r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

mse_train = mean_squared_error(y_train, y_pred_train)
mse_test = mean_squared_error(y_test, y_pred_test)

print('MSE train :', mse_train)
print('MSE test :', mse_test)
print('R² train :', r2_train)
print('R² test :', r2_test)

results.append({
        'Algorithm': "LinearRegression",
        'Best Hyperparameters': "default",
        'MSE train': mse_train,
        'MSE test':  mse_test,
        'R² train': r2_train,
        'R² test': r2_test
})

MSE train : 37007457.72622415
MSE test : 35345256.80316576
R² train : 0.7551612463624837
R² test : 0.7336872711428812


# KNN
Entrainer un KNN avec les hyper-paramètres par défaut
Avec `GridSearchCV` ou `RandomizedSearchCV`, optimiser les hyper-paramètres. Utiliser une validation croisée de 5 splits. Vous explorerez les paramètres suivants:
- k : 3, 5, 7, 9, 11,
- poids : uniform, distance,
- distances : euclidean, manhattan, minkowski

In [21]:
from sklearn.neighbors import KNeighborsRegressor

knn = KNeighborsRegressor()
knn.fit(X_train, y_train)

y_pred_train = knn.predict(X_train)
y_pred_test = knn.predict(X_test)

r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

mse_train = mean_squared_error(y_train, y_pred_train)
mse_test = mean_squared_error(y_test, y_pred_test)

print('MSE train :', mse_train)
print('MSE test :', mse_test)
print('R² train :', r2_train)
print('R² test :', r2_test)

results.append({
        'Algorithm': "KNN",
        'Best Hyperparameters': "default",
        'MSE train': mse_train,
        'MSE test':  mse_test,
        'R² train': r2_train,
        'R² test': r2_test
})

MSE train : 18976714.11020517
MSE test : 25922849.527965117
R² train : 0.8744513858463263
R² test : 0.804681436154496


In [22]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}

grid_search = GridSearchCV(KNeighborsRegressor(), param_grid, cv=5)

grid_search.fit(X_train, y_train)

best_knn = grid_search.best_estimator_

y_pred_train = best_knn.predict(X_train)
y_pred_test = best_knn.predict(X_test)

mse_train = mean_squared_error(y_train, y_pred_train)
mse_test = mean_squared_error(y_test, y_pred_test)

r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

print('MSE train :', mse_train)
print('MSE test :', mse_test)
print('R² train :', r2_train)
print('R² test :', r2_test)
print('Best parameters :', grid_search.best_params_)

results.append({
        'Algorithm': "KNN",
        'Best Hyperparameters': grid_search.best_params_,
        'MSE train': mse_train,
        'MSE test':  mse_test,
        'R² train': r2_train,
        'R² test': r2_test
})


MSE train : 21416341.402113523
MSE test : 25538317.219579086
R² train : 0.85831098220364
R² test : 0.8075787371686123
Best parameters : {'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'uniform'}


# SVM
Entrainer un SVM avec les hyper-paramètres par défaut
Avec `GridSearchCV` ou `RandomizedSearchCV`, optimiser les hyper-paramètres. Utiliser une validation croisée de 5 splits. Vous explorerez les paramètres suivants:
- C : 0.01, 0.1, 1, 10, 100,
- noyau : linear, poly, rbf, sigmoid,
- gamma : scale, auto, 0.001, 0.01, 0.1, 1,
- degrée du polynome : 2, 3, 4, 5

In [23]:
from sklearn.svm import SVR

svm = SVR()
svm.fit(X_train, y_train)

y_pred_train = svm.predict(X_train)
y_pred_test = svm.predict(X_test)

r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

mse_train = mean_squared_error(y_train, y_pred_train)
mse_test = mean_squared_error(y_test, y_pred_test)

print('MSE train :', mse_train)
print('MSE test :', mse_test)
print('R² train :', r2_train)
print('R² test :', r2_test)

results.append({
        'Algorithm': "SVM",
        'Best Hyperparameters': "default",
        'MSE train': mse_train,
        'MSE test':  mse_test,
        'R² train': r2_train,
        'R² test': r2_test
})

MSE train : 167226384.8526425
MSE test : 147351107.07911313
R² train : -0.10635807370294814
R² test : -0.11023313948143088


In [24]:
from sklearn.model_selection import RandomizedSearchCV

param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],
    'degree': [2, 3, 4, 5]
}

rand_search = RandomizedSearchCV(SVR(), param_grid, cv=5, n_iter=50)

rand_search.fit(X_train, y_train)

best_svm = rand_search.best_estimator_

y_pred_train = best_svm.predict(X_train)
y_pred_test = best_svm.predict(X_test)

mse_train = mean_squared_error(y_train, y_pred_train)
mse_test = mean_squared_error(y_test, y_pred_test)

r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

print('MSE train :', mse_train)
print('MSE test :', mse_test)
print('R² train :', r2_train)
print('R² test :', r2_test)
print('Best parameters :', rand_search.best_params_)

results.append({
        'Algorithm': "SVM",
        'Best Hyperparameters': rand_search.best_params_,
        'MSE train': mse_train,
        'MSE test':  mse_test,
        'R² train': r2_train,
        'R² test': r2_test
})

MSE train : 25976858.14096175
MSE test : 25303188.971938312
R² train : 0.8281389222220273
R² test : 0.8093503368378216
Best parameters : {'kernel': 'poly', 'gamma': 1, 'degree': 3, 'C': 100}


# DecisionTree
Entrainer un arbre de décision avec les hyper-paramètres par défaut
Avec `GridSearchCV` ou `RandomizedSearchCV`, optimiser les hyper-paramètres. Utiliser une validation croisée de 5 splits. Vous explorerez les hyper-paramètres suivants:
- critère : gini, entropy,
- profondeur maximale : None, 10, 20, 30, 40, 50,
- nombre minimum d'exemples par split : 2, 5, 10, 20,
- nombre minimum d'exemples par feuille : 1, 2, 4, 6,
- nombre de features maximum : None, sqrt, log2

In [25]:
from sklearn.tree import DecisionTreeRegressor

dt = DecisionTreeRegressor()
dt.fit(X_train, y_train)

y_pred_train = dt.predict(X_train)
y_pred_test = dt.predict(X_test)

r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

mse_train = mean_squared_error(y_train, y_pred_train)
mse_test = mean_squared_error(y_test, y_pred_test)

print('MSE train :', mse_train)
print('MSE test :', mse_test)
print('R² train :', r2_train)
print('R² test :', r2_test)

results.append({
        'Algorithm': "DecisionTree",
        'Best Hyperparameters': "default",
        'MSE train': mse_train,
        'MSE test':  mse_test,
        'R² train': r2_train,
        'R² test': r2_test
})

MSE train : 182648.17106092346
MSE test : 36711488.36188354
R² train : 0.9987916124666667
R² test : 0.7233932490431392


In [27]:
param_grid = {
    'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 6],
    'max_features': [None, 'sqrt', 'log2']
}

grid_search = GridSearchCV(DecisionTreeRegressor(), param_grid, cv=5)

grid_search.fit(X_train, y_train)

best_dt = grid_search.best_estimator_

y_pred_train = best_dt.predict(X_train)
y_pred_test = best_dt.predict(X_test)

mse_train = mean_squared_error(y_train, y_pred_train)
mse_test = mean_squared_error(y_test, y_pred_test)

r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

print('MSE train :', mse_train)
print('MSE test :', mse_test)
print('R² train :', r2_train)
print('R² test :', r2_test)
print('Best parameters :', grid_search.best_params_)

results.append({
        'Algorithm': "DecisionTree",
        'Best Hyperparameters': grid_search.best_params_,
        'MSE train': mse_train,
        'MSE test':  mse_test,
        'R² train': r2_train,
        'R² test': r2_test
})

MSE train : 20423549.286453396
MSE test : 18278659.546725214
R² train : 0.8648792254484909
R² test : 0.8622774271850072
Best parameters : {'criterion': 'absolute_error', 'max_depth': 50, 'max_features': None, 'min_samples_leaf': 6, 'min_samples_split': 2}


# RandomForest
Entrainer une forêt aléatoire avec les paramètres par défaut
Avec `GridSearchCV` ou `RandomizedSearchCV`, optimiser les hyper-paramètres. Utiliser une validation croisée de 5 splits. Vous explorerez les hyper-paramètres suivants:
- nombre d'arbres : 50, 100, 200, 300,
- critère : gini, entropy,
- profondeur maximale : None, 10, 20, 30, 40, 50,
- nombre minimum d'exemples par split : 2, 5, 10, 20,
- nombre minimum d'exemples par feuille : 1, 2, 4, 6,
- nombre de features maximum : None, sqrt, log2,
- remise des exemples : vrai, faux

In [28]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor()
rf.fit(X_train, y_train)

y_pred_train = rf.predict(X_train)
y_pred_test = rf.predict(X_test)

r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

mse_train = mean_squared_error(y_train, y_pred_train)
mse_test = mean_squared_error(y_test, y_pred_test)

print('MSE train :', mse_train)
print('MSE test :', mse_test)
print('R² train :', r2_train)
print('R² test :', r2_test)

results.append({
        'Algorithm': "RandomForest",
        'Best Hyperparameters': "default",
        'MSE train': mse_train,
        'MSE test':  mse_test,
        'R² train': r2_train,
        'R² test': r2_test
})

MSE train : 3555896.8893040125
MSE test : 20150800.28619537
R² train : 0.9764744347239022
R² test : 0.8481715766628453


In [29]:
param_grid = {
    'n_estimators': [50, 100, 200, 300],
    'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 6],
    'max_features': [None, 'sqrt', 'log2'],
    'bootstrap': [True, False]
}

rand_search_rd_forest = RandomizedSearchCV(RandomForestRegressor(), param_grid, cv=5, n_iter=50)

rand_search_rd_forest.fit(X_train, y_train)

best_rf = rand_search_rd_forest.best_estimator_

y_pred_train = best_rf.predict(X_train)
y_pred_test = best_rf.predict(X_test)

mse_train = mean_squared_error(y_train, y_pred_train)
mse_test = mean_squared_error(y_test, y_pred_test)

r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

print('MSE train :', mse_train)
print('MSE test :', mse_test)
print('R² train :', r2_train)
print('R² test :', r2_test)
print('Best parameters :', rand_search_rd_forest.best_params_)

results.append({
        'Algorithm': "RandomForest",
        'Best Hyperparameters': rand_search_rd_forest.best_params_,
        'MSE train': mse_train,
        'MSE test':  mse_test,
        'R² train': r2_train,
        'R² test': r2_test
})

MSE train : 19535863.858449303
MSE test : 18078413.623319197
R² train : 0.8707520901943634
R² test : 0.8637862021417644
Best parameters : {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 6, 'max_features': None, 'max_depth': None, 'criterion': 'absolute_error', 'bootstrap': True}


# Rapporter et analyser les résultats obtenir. Choisir le meilleur modèle

In [30]:
df_results = pd.DataFrame(results)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
df_results

Unnamed: 0,Algorithm,Best Hyperparameters,MSE train,MSE test,R² train,R² test
0,LinearRegression,default,37007460.0,35345260.0,0.755161,0.733687
1,KNN,default,18976710.0,25922850.0,0.874451,0.804681
2,KNN,"{'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'uniform'}",21416340.0,25538320.0,0.858311,0.807579
3,SVM,default,167226400.0,147351100.0,-0.106358,-0.110233
4,SVM,"{'kernel': 'poly', 'gamma': 1, 'degree': 3, 'C': 100}",25976860.0,25303190.0,0.828139,0.80935
5,DecisionTree,default,182648.2,36711490.0,0.998792,0.723393
6,DecisionTree,"{'criterion': 'absolute_error', 'max_depth': 50, 'max_features': None, 'min_samples_leaf': 6, 'min_samples_split': 2}",20423550.0,18278660.0,0.864879,0.862277
7,RandomForest,default,3555897.0,20150800.0,0.976474,0.848172
8,RandomForest,"{'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 6, 'max_features': None, 'max_depth': None, 'criterion': 'absolute_error', 'bootstrap': True}",19535860.0,18078410.0,0.870752,0.863786
