# Régression linéaire avec scikit-learn sur le jeu de données Assurance

## Import des packages

In [338]:
#%pip install pandas
#%pip install numpy
#%pip install matplotlib
#%pip install sklearn
%pip install scikit-learn




[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip





In [339]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.api.types import is_numeric_dtype
import seaborn as sns
import array
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler, PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from  sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import root_mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
import csv
import datetime
import itertools

- Chargement des données

In [340]:
def load_data(name):
    """Read the CSV source ``name`` with pandas and return the resulting DataFrame.

    Args:
        name: Anything ``pd.read_csv`` accepts (path or file-like object).

    Returns:
        pd.DataFrame: The parsed table.
    """
    return pd.read_csv(name)

- préparation (filtrage) des colonnes

In [341]:
def prepare_data(df):
    """Split ``df`` column-wise into a non-numeric part and a numeric part.

    A column goes to the numeric part when ``is_numeric_dtype`` accepts it;
    every other column goes to the categorical part. Column order is kept
    within each part.

    Args:
        df (pd.DataFrame): Input table.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: ``(df_category, df_numeric)`` where
        ``df_numeric`` has been cast to ``float``.
    """
    numeric_names = [name for name in df.columns if is_numeric_dtype(df[name])]
    category_names = [name for name in df.columns if not is_numeric_dtype(df[name])]
    return df[category_names], df[numeric_names].astype(float)

- transformation des catégories

In [342]:
def transform_data(data):
    """One-hot encode every column of ``data`` and return the encoded frame.

    Each original column is replaced by indicator columns produced by
    ``OneHotEncoder``. For "region", "age_group" and "bmi_category" all
    indicator columns are kept; for any other column only the first
    indicator column is kept.

    Args:
        data (pd.DataFrame): Categorical columns to encode.

    Returns:
        pd.DataFrame: Frame holding only the indicator columns, indexed like
        the input.
    """
    fully_expanded = ("region", "age_group", "bmi_category")
    encoder = OneHotEncoder()
    # data.columns is evaluated once, so the loop walks the ORIGINAL columns
    # even though `data` is rebound on every iteration.
    for column in data.columns:
        indicators = pd.DataFrame(
            encoder.fit_transform(data[[column]]).toarray(),
            columns=encoder.get_feature_names_out(),
            index=data.index,
        )
        kept = indicators if column in fully_expanded else indicators.iloc[:, 0]
        data = pd.concat([data, kept], axis=1).drop(columns=[column])
    return data

- la normalisation (centrage et réduction)

In [343]:
def normalize_data(data, normalisation_code):
    """Return a normalised copy of ``data``; the input frame is never modified.

    Args:
        data (pd.DataFrame): Data to normalise.
        normalisation_code (int): Normalisation method.
            - 0: manual z-score, using ``Series.mean`` and ``Series.std``
              column by column; a column whose std is 0 becomes all zeros.
            - 1: scikit-learn ``StandardScaler``.
            - 2: scikit-learn ``MinMaxScaler``.

    Returns:
        pd.DataFrame: Normalised data with the same columns and the same
        index as ``data``.

    Raises:
        ValueError: If ``data`` is not a DataFrame or the code is not 0, 1 or 2.
    """
    if not isinstance(data, pd.DataFrame):
        raise ValueError("Les données doivent être un DataFrame Pandas.")

    if normalisation_code not in [0, 1, 2]:
        raise ValueError("Code de normalisation invalide. Utiliser 0, 1 ou 2.")

    if normalisation_code == 0:
        # Work on a float copy: the caller's frame stays untouched and integer
        # columns can receive fractional z-scores.
        normalized = data.astype(float)
        for position in range(normalized.shape[1]):
            column = normalized.iloc[:, position]
            std = column.std()
            if std != 0:  # avoid dividing by zero
                normalized.iloc[:, position] = (column - column.mean()) / std
            else:
                # Constant column: every z-score is 0.
                normalized.iloc[:, position] = 0.0
        return normalized

    scaler = StandardScaler() if normalisation_code == 1 else MinMaxScaler()
    # Pass the index explicitly so the rows keep the caller's labels instead of
    # being renumbered from 0.
    return pd.DataFrame(scaler.fit_transform(data), columns=data.columns, index=data.index)


- Calcul de la performance

In [344]:
def calcul_Performane(y_test, y_pred):
    """Return the R² score of the predictions ``y_pred`` against ``y_test``.

    Args:
        y_test: True target values.
        y_pred: Predicted target values.

    Returns:
        float: The value of ``r2_score(y_test, y_pred)``.
    """
    # Only R² is reported; the MSE/RMSE that used to be computed here were
    # never returned or used.
    return r2_score(y_test, y_pred)

- Affichage

In [345]:
def plot_performance(J_all, num_epochs):
    """Draw ``J_all`` against ``num_epochs`` as a thin magenta line and show it.

    Args:
        J_all: Y values (the axis is labelled 'R2').
        num_epochs: X values (the axis is labelled 'Modeles').
    """
    plt.xlabel('Modeles')
    plt.ylabel('R2')
    plt.plot(num_epochs, J_all, 'm', linewidth=1)
    plt.show()

- modélisation des données

In [346]:
def model_data(data_X, data_cible_y, num_model):
    """Fit the model selected by ``num_model`` and return its R² on a held-out split.

    The data is split 80/20 with ``random_state=42``.

    Args:
        data_X: Feature matrix.
        data_cible_y: Target values.
        num_model (int): 0 = DummyRegressor (mean), 1 = LinearRegression,
            2 = Lasso(alpha=0.1), 3 = Ridge(alpha=0.1), 4 = ElasticNet,
            any other value = RandomForestRegressor.

    Returns:
        The R² score computed by ``calcul_Performane`` on the test split.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        data_X, data_cible_y, test_size=0.2, random_state=42
    )

    # Choose the estimator first; fitting and scoring are the same for all of them.
    if num_model == 0:
        estimator = DummyRegressor(strategy='mean')  # baseline: predicts the mean
    elif num_model == 1:
        estimator = LinearRegression()
    elif num_model == 2:
        estimator = linear_model.Lasso(alpha=0.1)
    elif num_model == 3:
        estimator = Ridge(alpha=0.1)
    elif num_model == 4:
        estimator = ElasticNet(random_state=0)
    else:
        estimator = RandomForestRegressor(random_state=42)

    estimator.fit(X_train, y_train)
    return calcul_Performane(y_test, estimator.predict(X_test))


In [347]:
def RRF(data_X, data_cible_y):
    """Return the test R² of a linear regression on degree-2 polynomial features.

    The data is split 80/20 with ``random_state=42``; the polynomial expansion
    is fitted on the training part only and then applied to the test part.

    Args:
        data_X: Feature matrix.
        data_cible_y: Target values.

    Returns:
        float: ``r2_score`` of the predictions on the test split.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        data_X, data_cible_y, test_size=0.2, random_state=42
    )

    expander = PolynomialFeatures(degree=2, include_bias=True)
    train_features = expander.fit_transform(X_train)
    test_features = expander.transform(X_test)

    regressor = LinearRegression()
    regressor.fit(train_features, y_train)
    predictions = regressor.predict(test_features)
    return r2_score(y_test, predictions)

## Main Function

In [348]:
# Load the dataset and split it into categorical and numeric parts.
df = load_data("df_assurance_clean_with_log.csv")
#df_assurance_clean_with_log
#print(df.info())
df_category, df_numeric = prepare_data(df)
# The target is taken to be the LAST numeric column; the remaining numeric
# columns are the numeric features.
df_cible = df_numeric.iloc[:,-1]
df_numeric = df_numeric[df_numeric.columns[:-1]]
#print(df_numeric)
# One-hot encode the categorical columns.
df_category_transforme = transform_data(df_category)
#print(df_category_transforme.info())

#print(df_cible)
# Display names for the model codes 0-4; not referenced anywhere in this cell.
model_dict ={
      0:'DummyRegr',
      1:'LinearReg',
      2:'Lasso',
      3:'Ridge',
      4:'ElasticNet',
             }

# Full feature matrix: numeric features followed by the encoded categorical ones.
combined_df = pd.concat([df_numeric, df_category_transforme], axis=1)
#print(combined_df.info())
# Show the combined columns
#print(combined_df.columns)

columns = combined_df.columns
#print(columns)

# Accumulators used only by the exhaustive-search branch below.
performances=[]
n_epochs = []
count = 0

#df_subset_normalizes = normalize_data(combined_df, 0)
#performances.append(model_data(df_subset_normalizes, df_cible, 3))
#print(performances)
date_now = datetime.datetime.today()
# Format the date so the file name contains no special characters
formatted_date = date_now.strftime("%Y-%m-%d_%H-%M-%S")
file_name = "results_RL_Norm1_" + formatted_date + ".csv"

# False: evaluate all features at once (else branch).
# True: evaluate every non-empty combination of columns (2**n - 1 fits).
Selection = False

# A new timestamped CSV is created on every run of this cell.
with open(file_name, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["i", "j", "k", "l", "result"])  # Header row
                    
    if Selection : 
        for r in range(1, len(columns) + 1):  # Subset size, from 1 to the number of columns
            combinaisons = list(itertools.combinations(columns, r))
            #print(combinaisons)
            for combinaison in combinaisons:
                df_subset = combined_df[list(combinaison)]  # Sub-frame restricted to this combination
                
                # Only one normalisation method is used for now
                #for normalisation in range(3):
                #    df_subset_normalizes = normalize_data(df_subset, normalisation)
                df_subset_normalizes = normalize_data(df_subset, 1)


                    # Only the polynomial linear regression (RRF) is evaluated for now
                    #for methode_modelisation in range(6):
                resultat = RRF(df_subset_normalizes, df_cible)
                performances.append(resultat)
                n_epochs.append(count)
                count +=1
                # Row: subset size, column names, then the constants 1 and 6
                # (presumably normalisation code and model id — TODO confirm), R².
                writer.writerow([r, combinaison, 1, 6, resultat])
        
        plot_performance(performances, n_epochs )
        #performances.append(r2)
        #print(performances)
    else:
        # Single run on all features with StandardScaler normalisation (code 1).
        combined_df_normalizes = normalize_data(combined_df, 1)
        resultat = RRF(combined_df_normalizes, df_cible)
        writer.writerow(['All','ALL', 1, 'RFF', resultat])
        print("performances R2: ", resultat)

performances R2:  0.7414517766020292


- Utilisation de PolynomialFeatures pour améliorer les performances

In [349]:
# Data preparation
#('age', 'bmi', 'children', 'smoker_no', 'region_northwest', 'region_southeast', 'region_southwest')
#combined_df_selection = combined_df[['age', 'bmi', 'children', 'smoker_no', 'region_northwest', 'region_southeast', 'region_southwest']]
#print(combined_df_selection.info)
# Standardise all features (StandardScaler, code 1), then split 80/20.
combined_df = normalize_data(combined_df, 1)
X_train, X_test, y_train, y_test = train_test_split(combined_df, df_cible, test_size=0.2, random_state=42)

# Degree-2 polynomial expansion, fitted on the training part only
poly = PolynomialFeatures(degree=2, include_bias=True)
#scaler = StandardScaler()
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Linear regression on the polynomial features.
# The estimator is deliberately NOT named `linear_model`: that name is the
# imported sklearn.linear_model module, which model_data() relies on for
# `linear_model.Lasso`. Rebinding it here would break model_data() for the
# rest of the session.
poly_lin_reg = LinearRegression()
poly_lin_reg.fit(X_train_poly, y_train)
y_pred_linear = poly_lin_reg.predict(X_test_poly)
rmse_linear = np.sqrt(mean_squared_error(y_test, y_pred_linear))
r2_linear = r2_score(y_test, y_pred_linear)
print(f"Régression linéaire R2: {r2_linear:.3f}")

# Random forest on the same polynomial features
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train_poly, y_train)
y_pred_rf = rf_model.predict(X_test_poly)
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
r2_rf= r2_score(y_test, y_pred_rf)
print(f"Forêt aléatoire R2: {r2_rf:.3f}")


Régression linéaire R2: 0.741
Forêt aléatoire R2: 0.879


- Utilisation des pipelines

In [350]:
# Column groups handled by the ColumnTransformer below.
numeric_features = ['age', 'bmi', 'children','log_age','log_bmi','log_children']
categorical_features = ['sex','smoker']
ordinal_features = ['region','age_group','bmi_category']

# Target = last column of the raw frame; features = every other column.
# Note: despite its name, df_numeric holds ALL feature columns here,
# including the non-numeric ones.
df_cible = df.iloc[:,-1]
df_numeric = df[df.columns[:-1]]
# 80/20 train/test split
X_train, X_test, y_train, y_test = train_test_split(df_numeric, df_cible, test_size=0.2, random_state=42)

# NOTE(review): remainder='passthrough' forwards every column not listed in
# the three groups above unchanged — confirm that none of them is derived
# from the target.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),  # scale numeric columns
        ('cat', OneHotEncoder(drop='first'), categorical_features),  # one-hot encode, first level dropped
        ('ord', OrdinalEncoder(), ordinal_features)  # integer-code these columns
    ], remainder='passthrough',
    )
#print(preprocessor)
# Each model is a pipeline: preprocessing (+ optional polynomial expansion) + estimator.
# The same `preprocessor` instance is passed to every pipeline below.

# Linear regression
lr_model = make_pipeline(preprocessor, PolynomialFeatures(degree=2), LinearRegression())
lr_model.fit(X_train, y_train)

# Random forest #n_estimators=100, max_depth=4,random_state=42
rf_model = make_pipeline(preprocessor, PolynomialFeatures(degree=2), RandomForestRegressor(n_estimators=30, max_depth=4, min_samples_split=5,random_state=42))
rf_model.fit(X_train, y_train)

# DecisionTreeRegressor
tr_model = make_pipeline(preprocessor, PolynomialFeatures(degree=2), DecisionTreeRegressor(max_depth=4, min_samples_split=4,random_state=42))
tr_model.fit(X_train, y_train)

# GradientBoostingRegressor with default hyper-parameters.
# random_state is fixed so the reported score is reproducible, as for the
# other estimators in this cell.
gb_model = make_pipeline(preprocessor, GradientBoostingRegressor(random_state=42))
gb_model.fit(X_train, y_train)

# GradientBoostingRegressor_reg: explicitly chosen hyper-parameters
gbreg_model = make_pipeline(preprocessor, GradientBoostingRegressor(n_estimators=41, max_depth=3, min_samples_split=4, random_state=42))
gbreg_model.fit(X_train, y_train)

In [351]:
# Predictions on the held-out test set
y_pred_lr = lr_model.predict(X_test)
y_pred_rf = rf_model.predict(X_test)
y_pred_tr = tr_model.predict(X_test)
y_pred_gb = gb_model.predict(X_test)
y_pred_gbreg = gbreg_model.predict(X_test)

# Evaluation
# root_mean_squared_error returns the RMSE (not the MSE), so the values are
# named and printed as RMSE.
rmse_lr = root_mean_squared_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)
# Score of the random-forest pipeline.
print("Score :", rf_model.score(X_test, y_test))

rmse_rf = root_mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

# Former variable names, kept so existing references keep working; they hold RMSE values.
mse_lr, mse_rf = rmse_lr, rmse_rf

r2_tr = r2_score(y_test, y_pred_tr)
r2_gb = r2_score(y_test, y_pred_gb)
r2_gbreg = r2_score(y_test, y_pred_gbreg)

# Report: one titled section per model
print("\nRésultats Régression Linéaire après Normalisation")
print("RMSE :", rmse_lr)
print("R² :", r2_lr)

print("\nRésultats Forêt Aléatoire après Normalisation")
print("RMSE :", rmse_rf)
print("R² :", r2_rf)

print("\nRésultats DecisionTreeRegressor après Normalisation")
print("R² :", r2_tr)

print("\nRésultats GradientBoostingRegressor après Normalisation")
print("R² :", r2_gb)

print("\nRésultats GradientBoostingRegressor_reg après Normalisation")
print("R² :", r2_gbreg)

Score : 0.8961983308010555

Résultats Régression Linéaire après Normalisation
MSE : 4535.709415963455
R² : 0.8880436408266874

Résultats Forêt Aléatoire après Normalisation
MSE : 4367.400390723785
R² : 0.8961983308010555

Résultats DecisionTreeRegressor après Normalisation
R² : 0.8729465334648522

Résultats DecisionTreeRegressor après Normalisation
R² : 0.9032852405274839

Résultats DecisionTreeRegressor après Normalisation
R² : 0.9013967105494541
