In [2]:
# Topics: OneHotEncoder, Pipeline, StandardScaler, MinMaxScaler, GridSearchCV, HalvingGridSearchCV, train_test_split


import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler



In [3]:
# Load the insurance dataset, then show a preview and the column summary.
data = pd.read_csv('./scripts/insurance.csv')
print(data.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
data.info()

   age    sexe     bmi  children smoker     region       charge
0   19  female  27.900         0    yes  southwest  16884.92400
1   18    male  33.770         1     no  southeast   1725.55230
2   28    male  33.000         3     no  southeast   4449.46200
3   33    male  22.705         0     no  northwest  21984.47061
4   32    male  28.880         0     no  northwest   3866.85520
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sexe      1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charge    1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB
None


In [4]:
# Count the missing values in each column.
missing_per_column = data.isna().sum()
print(missing_per_column)

age         0
sexe        0
bmi         0
children    0
smoker      0
region      0
charge      0
dtype: int64


In [5]:
# Descriptive statistics (count, mean, std, quartiles, min/max) per column.
numeric_summary = data.describe()
print(numeric_summary)

               age          bmi     children        charge
count  1338.000000  1338.000000  1338.000000   1338.000000
mean     39.207025    30.663397     1.094918  13270.422265
std      14.049960     6.098187     1.205493  12110.011237
min      18.000000    15.960000     0.000000   1121.873900
25%      27.000000    26.296250     0.000000   4740.287150
50%      39.000000    30.400000     1.000000   9382.033000
75%      51.000000    34.693750     2.000000  16639.912515
max      64.000000    53.130000     5.000000  63770.428010


In [28]:
# Separate the explanatory variables (X) from the target variable (y).
y = data["charge"]
X = data.drop("charge", axis=1)

# Preview both pieces to confirm the split.
for preview in (X.head(), y.head()):
    print(preview)

   age    sexe     bmi  children smoker     region
0   19  female  27.900         0    yes  southwest
1   18    male  33.770         1     no  southeast
2   28    male  33.000         3     no  southeast
3   33    male  22.705         0     no  northwest
4   32    male  28.880         0     no  northwest
0    16884.92400
1     1725.55230
2     4449.46200
3    21984.47061
4     3866.85520
Name: charge, dtype: float64


In [29]:
# Turn the categorical columns into numeric indicator columns.
# The encoding starts again from `data` rather than from the current `X`, so
# re-running this cell does not raise a KeyError on already-encoded columns.
# drop_first=True drops the first level of each categorical column.
categorical_columns = ["sexe", "smoker", "region"]
X = pd.get_dummies(
    data.drop(columns=["charge"]),
    columns=categorical_columns,
    drop_first=True,
)

# Check the result of the transformation.
print(X.head())

   age     bmi  children  sexe_male  smoker_yes  region_northwest  \
0   19  27.900         0      False        True             False   
1   18  33.770         1       True       False             False   
2   28  33.000         3       True       False             False   
3   33  22.705         0       True       False              True   
4   32  28.880         0       True       False              True   

   region_southeast  region_southwest  
0             False              True  
1              True             False  
2              True             False  
3             False             False  
4             False             False  


In [30]:

# Split first, then scale: the scaler is fitted on the training rows only, so
# no statistics from the test rows are used during preprocessing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
# Apply the training-set mean/std to the test rows; the scaler is not refitted.
X_test = scaler.transform(X_test)

# Fit the linear baseline on the scaled training data.
model = LinearRegression()
model.fit(X_train, y_train)

# Predictions on the test rows, scored in a later cell.
y_pred = model.predict(X_test)


In [31]:
# Score the predictions against the test targets: mean squared error,
# mean absolute error and R^2, in that order.
mse, mae, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(f"mse: {mse:.2f}")
print(f"mae: {mae:.2f}")
print(f"r2: {r2:.4f}")

mse: 33596915.85
mae: 4181.19
r2: 0.7836


In [32]:
# Fit a random forest on the same train/test arrays used above.
# RandomForestRegressor is imported with the other imports at the top of the
# notebook. random_state fixes the forest's seed so a re-run gives the same fit.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

y_pred_rf = rf_model.predict(X_test)

# Same three metrics as for the linear model, computed on the same test targets.
mse_rf = mean_squared_error(y_test, y_pred_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print(f"mse_rf: {mse_rf:.2f}")
print(f"mae_rf: {mae_rf:.2f}")
print(f"r2_rf: {r2_rf:.4f}")

mse_rf: 20864569.51
mae_rf: 2543.98
r2_rf: 0.8656
