# Importation des modules

In [60]:
# modules pour manipuler et visualiser les données
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# modules pour séparer et évaluer les données
from sklearn.model_selection import train_test_split, cross_validate, KFold, learning_curve, GridSearchCV, RandomizedSearchCV

# modules pour préparer les données
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, RobustScaler, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# modules pour créer et entraîner un modèle
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, lasso_path
from sklearn.dummy import DummyRegressor

# modules pour créer un pipeline 
from sklearn import tree
from sklearn.tree import plot_tree, DecisionTreeRegressor
from sklearn.pipeline import make_pipeline

# pour utiliser des modules de maths et des métriques
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from math import sqrt
from statsmodels.api import OLS
import statsmodels.api as sm

# pour exporter notre modèle
import pickle



In [61]:
dataset = pd.read_csv("dataset.csv")

# On supprime le doublon
dataset = dataset.drop_duplicates()

# Affichage
dataset

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


# Préparation des données

In [62]:
var_num = ['age', 'children']
var_cat = ['sex' , 'smoker', 'region', 'bmi']

col_transform = ColumnTransformer([
    ("sclal", StandardScaler(), var_num),
    ("one_hot_encoder",OneHotEncoder(drop="if_binary"), var_cat),
])

# # On crée un pipeline
# preprocessor = Pipeline([
#     # on applique un transformateur de colonne 
#     ('normalizer', col_transform)
# ])

In [63]:
def convert_bmi_to_cat(bmi):
    if bmi < 18.5:
        return "underweight"
    elif bmi < 25:
        return "healthy"
    elif bmi <30:
        return "overweight"
    elif bmi < 40:
        return "obesity"
    else:
        return "morbid_obesity"

dataset['bmi'] = dataset['bmi'].astype(float).apply(lambda x : convert_bmi_to_cat(x))

In [64]:
# La target "charges"
Y = dataset["charges"]

# Les features
X = dataset.drop("charges",axis=1)

# Affichage
X

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,female,overweight,0,yes,southwest
1,18,male,obesity,1,no,southeast
2,28,male,obesity,3,no,southeast
3,33,male,healthy,0,no,northwest
4,32,male,overweight,0,no,northwest
...,...,...,...,...,...,...
1333,50,male,obesity,3,no,northwest
1334,18,female,obesity,0,no,northeast
1335,18,female,obesity,0,no,southeast
1336,21,female,overweight,0,no,southwest


In [65]:
# Séparation du dataset en train set (80%) et test set (20%) (stratify sert à bien répartir les fumeurs)
X_train, X_test, y_train, y_test = train_test_split(X, Y,shuffle=True, random_state=42, train_size=0.8,stratify=X[["smoker"]])

# Arbre de décision

In [66]:
arbre = make_pipeline(col_transform, DecisionTreeRegressor())
arbre.fit(X_train,y_train)
arbre.score(X_test,y_test)

0.751213166313419

In [67]:
# Définir les paramètres de recherche
param_grid = {
    'decisiontreeregressor__max_depth': [5, 10, 20],
    'decisiontreeregressor__min_samples_split': [2, 5],
    'decisiontreeregressor__min_samples_leaf': [1, 2],
    'decisiontreeregressor__min_weight_fraction_leaf': [0, 0.1],
    'decisiontreeregressor__max_features': [2, 4],
    'decisiontreeregressor__random_state': [42]
}

# Initialiser GridSearchCV
grid_search = GridSearchCV(arbre, param_grid, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')

# Lancer la recherche sur la grille
grid_search.fit(X_train, y_train)

In [68]:
best = grid_search.best_params_
best

{'decisiontreeregressor__max_depth': 5,
 'decisiontreeregressor__max_features': 2,
 'decisiontreeregressor__min_samples_leaf': 2,
 'decisiontreeregressor__min_samples_split': 5,
 'decisiontreeregressor__min_weight_fraction_leaf': 0,
 'decisiontreeregressor__random_state': 42}

In [69]:
# Prédire sur les données de test
y_pred = grid_search.predict(X_test)

# Calculer R2
r2 = r2_score(y_test, y_pred)
print("R2 score :", r2)

# Calculer MAE
mae = mean_absolute_error(y_test, y_pred)
print("MAE :", mae)

# Calculer MSE
mse = mean_squared_error(y_test, y_pred)
print("MSE :", mse)

R2 score : 0.8845579517350599
MAE : 2676.4555176651847
MSE : 16641609.065198183


In [77]:
plt.figure(figsize=(20,10))
plot_tree(arbre.named_steps['decisiontreeregressor'], filled=True)
plt.show()

NotFittedError: This DecisionTreeRegressor instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

<Figure size 2000x1000 with 0 Axes>