In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score


# Path of the CSV file to load (relative to the current working directory).
file_path = 'data.csv'
data = pd.read_csv(file_path)

# Data-quality counters, both computed on the raw frame before any cleaning:
# number of fully duplicated rows and number of missing values per column.
duplicates = data.duplicated().sum()
missing_data = data.isna().sum()

# Remove duplicated rows, then preview the first rows of what is left.
data = data.drop_duplicates()
print(data.head())

# Discard every row that still contains at least one missing value.
data = data.dropna(axis=0)

# Separate the target column ('charges') from the explanatory columns.
y = data['charges']
X = data.drop(columns='charges')

# Edges of the BMI classes. pd.cut is called with right=False, i.e. every
# interval is closed on the left and open on the right: [0, 18.5), [18.5, 25),
# [25, 30), ... The edges therefore have to be the round thresholds 25/30/35/40;
# with 24.9/29.9/34.9/39.9 a BMI such as 24.9 or 29.95 fell into the next
# class up.
bins = [0, 18.5, 25, 30, 35, 40, float('inf')]

# One label per interval, in increasing BMI order.
labels = [
    'underweight', 'normal weight', 'overweight',
    'obesity class I', 'obesity class II', 'obesity class III',
]

# Assign each row to its BMI class, then expand the class into one indicator
# column per label. The class itself is only an intermediate value, so it is
# not stored in X.
BMI_category = pd.cut(X['bmi'], bins=bins, labels=labels, right=False)
BMI_dummies = pd.get_dummies(BMI_category)

# Append the indicator columns to the feature frame.
X = pd.concat([X, BMI_dummies], axis=1)

# Interaction feature: the BMI value for smokers, 0 for non-smokers. The 0/1
# smoker flag is derived from the 'smoker' column ('yes' -> 1, anything
# else -> 0) and is not kept as a column of its own.
X['bmi_smoker'] = X['bmi'] * (X['smoker'] == 'yes').astype(int)

# Show the feature frame with the BMI indicator and interaction columns.
print(X)
# Hold out part of the rows for evaluation: 85% of the rows go to the training
# set and the remaining 15% to the test set. Rows are shuffled with a fixed
# seed, and the split is stratified on the 'smoker' column.
split = train_test_split(
    X,
    y,
    train_size=0.85,
    shuffle=True,
    stratify=X['smoker'],
    random_state=42,
)
X_train, X_test, y_train, y_test = split

# Report the shape of each of the four resulting objects.
for title, part in (
    ("Train set X", X_train),
    ("Train set Y", y_train),
    ("Test set X", X_test),
    ("Test set Y", y_test),
):
    print(title, part.shape)


# Column groups, selected by dtype: int64/float64 columns are treated as
# numerical features, object (string) columns as categorical features.
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object']).columns


# Numerical features: degree-2 polynomial expansion, then standardisation.
numerical_pipeline = Pipeline([
    ('poly', PolynomialFeatures(2)),
    ('scaler', StandardScaler()),
])


# Categorical features: one-hot encoding, then degree-2 polynomial expansion
# of the indicator columns. handle_unknown='ignore' encodes a category that
# was not seen during fit as all zeros instead of raising at predict time.
categorial_pipeline = Pipeline([
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ('poly', PolynomialFeatures(2)),
])


# Route each column group through its own pipeline.
# Columns matched by neither selector (for example the indicator columns
# produced by pd.get_dummies, whose dtype is uint8 or bool depending on the
# pandas version) would be silently discarded by the default
# remainder='drop'; remainder='passthrough' forwards them unchanged so that
# they actually reach the model.
preprocessor = ColumnTransformer(
    transformers=[
        ('numerical', numerical_pipeline, numerical_cols),
        ('categorial', categorial_pipeline, categorical_cols),
    ],
    remainder='passthrough',
)




# Final model: the shared preprocessing step followed by an ordinary
# least-squares linear regression.
LR_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regression', LinearRegression()),
])

# Disabled alternatives: the same pipeline can be built with Lasso() or
# ElasticNet() as the final step instead of LinearRegression(), then fitted
# on (X_train, y_train) in the same way.

# Sanity check: features and target must have the same number of training rows.
for part in (X_train, y_train):
    print(len(part))

# Fit the preprocessing steps and the regression on the training split.
LR_pipeline.fit(X_train, y_train)

# Predict on the held-out split and derive the error metrics from it.
y_pred_LR = LR_pipeline.predict(X_test)

mse = mean_squared_error(y_test, y_pred_LR)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred_LR)

# Report the pipeline's own score on the test split, followed by the metrics
# computed above (one per line).
print(f" score du LR modèle : {LR_pipeline.score(X_test, y_test)}")
print(
    f"mse : {mse}",
    f"r2 : {r2}",
    f"rmse : {rmse}",
    sep="\n",
)




   age     sex     bmi  children smoker     region      charges
0   19  female  27.900         0    yes  southwest  16884.92400
1   18    male  33.770         1     no  southeast   1725.55230
2   28    male  33.000         3     no  southeast   4449.46200
3   33    male  22.705         0     no  northwest  21984.47061
4   32    male  28.880         0     no  northwest   3866.85520
      age     sex     bmi  children smoker     region  underweight  \
0      19  female  27.900         0    yes  southwest            0   
1      18    male  33.770         1     no  southeast            0   
2      28    male  33.000         3     no  southeast            0   
3      33    male  22.705         0     no  northwest            0   
4      32    male  28.880         0     no  northwest            0   
...   ...     ...     ...       ...    ...        ...          ...   
1333   50    male  30.970         3     no  northwest            0   
1334   18  female  31.920         0     no  northeast   