# Importation des modules

In [40]:
# modules pour manipuler et visualiser les données
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# modules pour séparer et évaluer les données
from sklearn.model_selection import train_test_split, cross_val_score, KFold

# modules pour préparer les données
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# modules pour créer et entraîner un modèle
from sklearn.linear_model import LinearRegression
from sklearn.dummy import DummyRegressor

# modules pour créer un pipeline de traitement et de modèle
from sklearn import pipeline
from sklearn.pipeline import make_pipeline


On prépare notre dataset nettoyé :

In [25]:
dataset = pd.read_csv("dataset.csv")

# On supprime le doublon
dataset = dataset.drop_duplicates()

# Affichage
dataset

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


# 1. Dummy modèle

## 1.1. Préparation des données

In [26]:
# La target "charges"
Y = dataset["charges"]

# Les features
X = dataset.drop("charges",axis=1)

# Affichage
X

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,female,27.900,0,yes,southwest
1,18,male,33.770,1,no,southeast
2,28,male,33.000,3,no,southeast
3,33,male,22.705,0,no,northwest
4,32,male,28.880,0,no,northwest
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest
1334,18,female,31.920,0,no,northeast
1335,18,female,36.850,0,no,southeast
1336,21,female,25.800,0,no,southwest


Dans notre étude, on considèrera la variable "bmi" comme une variable catégorielle :

In [27]:
def convert_bmi_to_cat(bmi):
    if bmi < 18.5:
        return "underweight"
    elif bmi < 25:
        return "healthy"
    elif bmi <30:
        return "overweight"
    elif bmi < 40:
        return "obesity"
    else:
        return "morbid_obesity"

In [33]:
dataset['bmi'] = dataset['bmi'].astype(float).apply(lambda x : convert_bmi_to_cat(x))
dataset

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,1,overweight,0,1,2,16884.92400
1,18,0,obesity,1,0,1,1725.55230
2,28,0,obesity,3,0,1,4449.46200
3,33,0,healthy,0,0,3,21984.47061
4,32,0,overweight,0,0,3,3866.85520
...,...,...,...,...,...,...,...
1333,50,0,obesity,3,0,3,10600.54830
1334,18,1,obesity,0,0,0,2205.98080
1335,18,1,obesity,0,0,1,1629.83350
1336,21,1,overweight,0,0,2,2007.94500


On sépare les variables numériques et catégorielle :

In [34]:
var_num = ['age','children']

var_cat = ['sex' , 'smoker', 'region', 'bmi']

On applique ensuite deux transformations:<br>
- "sclal" est une transformation de type RobustScaler qui va être appliquée aux colonnex de variables numériques pour normaliser les données en utilisant une échelle robuste aux outliers.<br>
- Le "one_hot_encoder" est une transformation qui va être appliquée aux colonnes de variables catégorielles pour les transformer en variables numériques.

In [45]:
col_transform = ColumnTransformer([
    ("sclal", RobustScaler(), var_num),
    
    ("one_hot_encoder",OneHotEncoder(handle_unknown='ignore'), var_cat),
])

## 1.2. Entraînement du modèle

In [46]:
# Séparation du dataset en train set (80%) et test set (20%) (stratify sert à bien répartir les fumeurs)
X_train, X_test, y_train, y_test = train_test_split(X, Y,shuffle=True, random_state=42, train_size=0.8,stratify=X[["smoker"]])

# On crée un modèle de régression 
dummy = DummyRegressor()

# On entraîne le modèle avec notre train set
dummy.fit(X_train, y_train)

# Coefficient de détermination
dummy.score(X_test, y_test)


-0.0010185684988295307

Un score négatif suggère que le modèle "dummy" a une performance plus que médiocre.

In [32]:
model = LinearRegression()
model.fit(X,Y) #entrainement sur le modèle
print("le modèle est fiable à ",model.score(X,Y)*100,"%")


ValueError: could not convert string to float: 'female'

In [None]:
def moncredit(model,age=23,sex=0,bmi=19.4,children=0,smoker=0,region=0):
  x=np.array([age,sex,bmi,children,smoker,region]).reshape(1,6)
  print(model.predict(x))
 

In [None]:
print("la predictions des charges correspondent à :")
Y_predict=moncredit(model)


la predictions des charges correspondent à :
[155.37618221]




Le resultat est pas conforme, la prediction est fausse

In [1]:
X_train, X_test , Y_train, Y_test = train_test_split(X,Y,test_size=0.2)

print('Train set', X_train.shape)
print('Test set', X_test.shape)

NameError: name 'train_test_split' is not defined

In [None]:
model.fit(X_train, Y_train)
print('Train score:', model.score(X_train,Y_train))
print('Test score:', model.score(X_test,Y_test))

Train score: 0.7493341762465824
Test score: 0.7490999167418126


In [None]:
cross_val_score(LinearRegression(), X_train,Y_train, cv=5, scoring ='accuracy').mean()

Traceback (most recent call last):
  File "/home/apprenant/miniconda3/envs/env/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/apprenant/miniconda3/envs/env/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 107, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "/home/apprenant/miniconda3/envs/env/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 268, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/apprenant/miniconda3/envs/env/lib/python3.10/site-packages/sklearn/utils/_param_validation.py", line 192, in wrapper
    return func(*args, **kwargs)
  File "/home/apprenant/miniconda3/envs/env/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 221, in accuracy_score
    y_type, y_true, y_pred = _check_targets(y_true, y_pred)
  File "/home/apprenant/miniconda3/e

nan

### Dummy modèle

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, shuffle=True, train_size=0.8, random_state=42)
dummy = DummyRegressor()
X_train.head()
dummy.fit(X_train, Y_train)
dummy.score(X_test, Y_test)

-0.0009192486886582252

Le score est négatif, cela suggère que le modèle "dummy" a une performance clairement médiocre.

In [None]:
cv= KFold(5, random_state=0, shuffle=True)
cross_val_score(LinearRegression(),X,Y, cv =cv)

array([0.79806191, 0.77752613, 0.65595805, 0.75180281, 0.71463349])

In [None]:
cross_val_score(LinearRegression(), X,Y,cv=3)

array([0.7501241 , 0.75436919, 0.72742911])

In [None]:
# err_hist= np.abs(Y-Y_predict )
# plt.hist(err_hist, bins=50)
# plt.show()

Preprossecing (transformation des données)


In [None]:
scaler= MinMaxScaler()
scaler.fit_transform(X)

array([[0.02173913, 1.        , 0.3212268 , 0.        , 1.        ,
        0.66666667],
       [0.        , 0.        , 0.47914985, 0.2       , 0.        ,
        0.33333333],
       [0.2173913 , 0.        , 0.45843422, 0.6       , 0.        ,
        0.33333333],
       ...,
       [0.        , 1.        , 0.56201238, 0.        , 0.        ,
        0.33333333],
       [0.06521739, 1.        , 0.26472962, 0.        , 0.        ,
        0.66666667],
       [0.93478261, 1.        , 0.35270379, 0.        , 1.        ,
        1.        ]])

In [None]:
model.fit(X_train,Y_train)

Version avec Pipeline


In [None]:
y= data['charges'] # Je prend la colonne des charges de monn dataset
x= data.drop('charges', axis = 1) # je prend toute les colonnes de mon dataset excepté la colonne charges
#Cela me permet d'avoir deux dataset et entrainer les paramètres (age,smoker,children,sex,region) sur la valeur charges

In [None]:
numerical_features = ['age','bmi']
categorical_features =  ['children','sex','region']

In [None]:
numerical_pipeline = make_pipeline(SimpleImputer(),StandardScaler())

categorical_pipeline = make_pipeline(SimpleImputer(strategy= 'most_frequent'),OneHotEncoder())

numerical_pipeline
categorical_pipeline

In [None]:
mae_column_transformer((numerical_pipeline, numerical_features),
                       (categorical_pipeline, categorical_features))

NameError: name 'mae_column_transformer' is not defined