In [59]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import resample
from sklearn.linear_model import Lasso, Ridge, ElasticNet, LinearRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import make_pipeline

In [60]:
os.chdir('/home/utilisateur/projet/Assurance/Assurance_FE/')
print(os.listdir())
dataset_path = "Clean_Dataset_Brief.csv"
df_clean = pd.read_csv(dataset_path)

['NoteBook_Modelisation.ipynb', 'Clean_Dataset_Brief.csv', 'testmodel.ipynb', 'NoteBook_Netoyage.ipynb', '.git', 'Guide.txt', 'note.txt', 'Dataset_Brief.csv', 'README.md', 'requirements.txt', 'NoteBook_Analyse.ipynb', '.gitignore']


In [61]:
df_clean.describe()

Unnamed: 0,age,Jeune,Adulte,Adulte_moyen,Senior,Très_senior,sex,is_female,is_male,bmi,...,children_4,children_5,smoker,is_smoker,is_not_smoker,is_southwest,is_southeast,is_northwest,is_northeast,charges
count,1335.0,1335.0,1335.0,1335.0,1335.0,1335.0,1335.0,1335.0,1335.0,1335.0,...,1335.0,1335.0,1335.0,1335.0,1335.0,1335.0,1335.0,1335.0,1335.0,1335.0
mean,39.253933,0.0,0.408989,0.302622,0.28839,0.0,0.494382,0.494382,0.505618,30.658157,...,0.018727,0.013483,0.205243,0.205243,0.794757,0.243446,0.27191,0.242697,0.241948,13286.778216
std,14.030779,0.0,0.491831,0.459565,0.453183,0.0,0.500156,0.500156,0.500156,6.101456,...,0.135609,0.115375,0.404031,0.404031,0.404031,0.429323,0.445111,0.428874,0.428423,12115.61515
min,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.96,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1121.8739
25%,27.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,26.255,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,4746.69845
50%,39.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,30.4,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,9386.1613
75%,51.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,34.6875,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,16717.01075
max,64.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,53.13,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,63770.42801


Selection des features pour prédiction.

Définir target y.

In [62]:
selected_features = ['age', 'Jeune', 'Adulte', 'Adulte_moyen', 'Senior', 'Très_senior', 'sex', 'is_female', 'is_male', 'bmi', 'Insuffisance pondérale', 'Poids_normal', 'Surpoids', 'Obésité_de_classe_I_(modérée)', 'Obésité_de_classe_II_(sévère)', 'children', 'children_0', 'children_1', 'children_2', 'children_3', 'children_4', 'children_5', 'smoker','is_smoker', 'is_not_smoker', 'is_southwest', 'is_southeast', 'is_northwest', 'is_northeast']

x = df_clean[selected_features]
y = df_clean['charges']

Faire le split train/test avec pour ratio commun is_smoker.

In [63]:
X_train, X_test, y_train, y_test = train_test_split(x, y, shuffle=True, train_size=0.85, random_state=42, stratify=x['smoker'])

In [64]:
param_grid = {'linearregression__fit_intercept': [True, False]}

model = make_pipeline(
    StandardScaler(),
    PolynomialFeatures(degree=2),
    LinearRegression()
)

grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train, y_train)
fit_intercept_option = grid_search.best_params_['linearregression__fit_intercept']
y_pred = grid_search.predict(X_test)

r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(grid_search.best_params_)
print("Performance du modèle final:")
print(f"R2 Score: {r2:.4f}")
print(f"RMSE: {rmse:.2f}")

{'linearregression__fit_intercept': False}
Performance du modèle final:
R2 Score: -13234109422682859634688.0000
RMSE: 1387481514005408.75


In [65]:
param_grid = {'lasso__alpha': list(range(230, 244))}

model = make_pipeline(
    StandardScaler(),
    PolynomialFeatures(degree=2),
    Lasso(random_state=42, max_iter=100000)
)

grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_alpha = grid_search.best_params_['lasso__alpha']
y_pred = grid_search.predict(X_test)

r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(grid_search.best_params_)
print("Performance du modèle final:")
print(f"R2 Score: {r2:.4f}")
print(f"RMSE: {rmse:.2f}")


{'lasso__alpha': 230}
Performance du modèle final:
R2 Score: 0.8807
RMSE: 4166.67


In [66]:
param_grid = {'ridge__alpha': list(range(425, 475))}

model = make_pipeline(
    StandardScaler(),
    PolynomialFeatures(degree=2),
    Ridge(random_state=42, max_iter=100000)  
)

grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_alpha = grid_search.best_params_['ridge__alpha']  
y_pred = grid_search.predict(X_test)

r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(grid_search.best_params_)
print("Performance du modèle final:")
print(f"R2 Score: {r2:.4f}")
print(f"RMSE: {rmse:.2f}")


{'ridge__alpha': 439}
Performance du modèle final:
R2 Score: 0.8667
RMSE: 4403.40


In [67]:
param_grid = {'elasticnet__alpha': [0.1, 1, 10], 'elasticnet__l1_ratio': [0.1, 0.50, 1]}

model = make_pipeline(
    StandardScaler(),
    PolynomialFeatures(degree=2),
    ElasticNet(alpha=0.01, l1_ratio=0.01, random_state=42, max_iter=10000, tol=0.001)
)

grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train, y_train)

best_alpha = grid_search.best_params_['elasticnet__alpha']
best_l1_ratio = grid_search.best_params_['elasticnet__l1_ratio']

y_pred = grid_search.predict(X_test)

r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(grid_search.best_params_)
print("Performance du modèle final:")
print(f"R2 Score: {r2:.4f}")
print(f"RMSE: {rmse:.2f}")

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


{'elasticnet__alpha': 1, 'elasticnet__l1_ratio': 0.5}
Performance du modèle final:
R2 Score: 0.8675
RMSE: 4389.89


In [68]:
# Ajuster le pipeline sur les données d'entraînement
model.fit(X_train, y_train)

# Calculer les résidus sur l'ensemble d'entraînement
residuals_train = y_train - model.predict(X_train)

# Récupérer les indices des valeurs absolues des résidus supérieures à 10000
outlier_indices_train = np.where(np.abs(residuals_train) > 3000)[0]

# Afficher les indices des valeurs aberrantes sur l'ensemble d'entraînement
print("Indices des valeurs aberrantes dans l'ensemble d'entraînement :", outlier_indices_train)
# Afficher le nombre d'indices trouvés
print("Nombre d'indices trouvés :", len(outlier_indices_train))

Indices des valeurs aberrantes dans l'ensemble d'entraînement : [  13   23   25   30   38   49   57   66   82   93   98   99  108  113
  131  132  133  134  143  153  157  160  165  175  181  182  183  184
  185  187  188  193  194  206  224  232  237  240  250  253  261  262
  264  272  279  286  300  302  310  315  337  343  348  365  366  370
  387  389  390  391  392  400  405  414  415  418  426  429  432  455
  459  464  466  481  482  487  498  516  524  533  534  539  541  543
  545  552  563  570  580  582  591  594  595  598  602  613  623  632
  635  641  644  648  651  652  655  659  660  675  676  677  678  692
  695  699  700  704  705  714  729  735  742  760  767  791  792  796
  803  809  810  824  828  830  834  838  844  847  851  853  856  864
  865  867  870  908  921  922  924  930  933  934  935  950  954  965
  981  982  983  985 1011 1013 1023 1028 1037 1042 1044 1048 1049 1052
 1070 1071 1073 1075 1076 1086 1095 1098 1099 1101 1102 1104 1112 1122
 1123 1124 11

In [69]:
# Assuming X_train_cleaned and y_train_cleaned are your cleaned datasets
# mask = (y_train <= 10000)
# X_train_cleaned = X_train[mask]
# y_train_cleaned = y_train[mask]

# Split the cleaned data into training and testing sets
#X_train, X_test, y_train, y_test = train_test_split(X_train_cleaned, y_train_cleaned, shuffle=True, train_size=0.85, random_state=42, stratify=X_train_cleaned['is_smoker'])

# Filtrer les valeurs aberrantes de X_train et y_train_cleaned
X_train_cleaned = X_train.iloc[~X_train.index.isin(outlier_indices_train)]
y_train_cleaned = y_train.iloc[~y_train.index.isin(outlier_indices_train)]

In [70]:
param_grid = {'linearregression__fit_intercept': [True, False]}

model = make_pipeline(
    StandardScaler(),
    PolynomialFeatures(degree=2),
    LinearRegression()
)

grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train_cleaned, y_train_cleaned)
fit_intercept_option = grid_search.best_params_['linearregression__fit_intercept']
y_pred = grid_search.predict(X_test)

r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(grid_search.best_params_)
print("Performance du modèle final:")
print(f"R2 Score: {r2:.4f}")
print(f"RMSE: {rmse:.2f}")

{'linearregression__fit_intercept': False}
Performance du modèle final:
R2 Score: -3603144397542806519808.0000
RMSE: 723970180484636.00


In [71]:
param_grid = {'lasso__alpha': list(range(230, 244))}

model = make_pipeline(
    StandardScaler(),
    PolynomialFeatures(degree=2),
    Lasso(random_state=42, max_iter=100000)
)

grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train_cleaned, y_train_cleaned)
best_alpha = grid_search.best_params_['lasso__alpha']
y_pred = grid_search.predict(X_test)

r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(grid_search.best_params_)
print("Performance du modèle final:")
print(f"R2 Score: {r2:.4f}")
print(f"RMSE: {rmse:.2f}")

{'lasso__alpha': 243}
Performance du modèle final:
R2 Score: 0.8813
RMSE: 4154.98


In [72]:

param_grid = {'ridge__alpha': list(range(425, 475))}

model = make_pipeline(
    StandardScaler(),
    PolynomialFeatures(degree=2),
    Ridge(random_state=42, max_iter=100000)
)

grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train_cleaned, y_train_cleaned)
best_alpha = grid_search.best_params_['ridge__alpha']
y_pred = grid_search.predict(X_test)

r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(grid_search.best_params_)
print("Performance du modèle final:")
print(f"R2 Score: {r2:.4f}")
print(f"RMSE: {rmse:.2f}")

{'ridge__alpha': 474}
Performance du modèle final:
R2 Score: 0.8667
RMSE: 4403.42


In [73]:

param_grid = {'elasticnet__alpha': [0.1, 1, 10], 'elasticnet__l1_ratio': [0.1, 0.50, 1]}

model = make_pipeline(
    StandardScaler(),
    PolynomialFeatures(degree=2),
    ElasticNet(alpha=0.01, l1_ratio=0.01, random_state=42, max_iter=10000, tol=0.001)
)

grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train_cleaned, y_train_cleaned)

best_alpha = grid_search.best_params_['elasticnet__alpha']
best_l1_ratio = grid_search.best_params_['elasticnet__l1_ratio']

y_pred = grid_search.predict(X_test)

r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(grid_search.best_params_)
print("Performance du modèle final:")
print(f"R2 Score: {r2:.4f}")
print(f"RMSE: {rmse:.2f}")


{'elasticnet__alpha': 1, 'elasticnet__l1_ratio': 0.5}
Performance du modèle final:
R2 Score: 0.8668
RMSE: 4401.20
