In [43]:
import json
import os
import sys 

import numpy as np
import pandas as pd
from joblib import dump
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# The project's helper modules live in ../src, which must be on sys.path
# before the local import below.
sys.path.append("../src")
from data_preparation import DataPreparation

# Import des données

In [2]:
# Read the raw train and test tables (in that order) from the data folder.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/train.csv", "../data/test.csv")
)

  train = pd.read_csv("../data/train.csv")
  test = pd.read_csv("../data/test.csv")


# Préparation des données

In [3]:
# Hand both raw frames to the project's DataPreparation helper and rebind
# `train` / `test` to whatever prepare_data() returns. Judging by the log it
# prints, it drops/imputes missing values and renames columns; the details
# live in ../src/data_preparation.py.
preprocess = DataPreparation(train, test)
prepared_frames = preprocess.prepare_data()
train, test = prepared_frames

Valeurs manquantes du train supprimées ✅
Valeurs manquantes du test supprimées ✅
Variables renommées ✅
Valeurs manquantes numériques imputées ✅
Valeurs manquantes catégorielles imputées ✅


# Standardisation des données

In [20]:
# Standardise the numeric features of train and test.
# StandardScaler is imported with the other dependencies at the top of the
# notebook.
target_col = "Ewltp_(g/km)"

# Every non-object column is scaled, except the target, which is left as is.
to_std = train.select_dtypes(exclude="object").columns.to_list()
to_std.remove(target_col)

# Object columns pass through untouched; the train side also carries the target.
not_std = train.select_dtypes(include="object").columns.to_list()
not_std_w_y = not_std + [target_col]

scaler = StandardScaler()

# Fit on train only, then apply the same transformation to test.
# The scaled arrays are wrapped with the ORIGINAL row index: pd.concat(axis=1)
# aligns on index labels, so a fresh RangeIndex would misalign rows (and
# introduce NaNs) whenever `train` / `test` no longer have a contiguous
# 0..n-1 index, e.g. if rows were dropped earlier without a reset.
data_scaled = pd.DataFrame(
    scaler.fit_transform(train[to_std]), columns=to_std, index=train.index
)
test_scaled = pd.DataFrame(
    scaler.transform(test[to_std]), columns=to_std, index=test.index
)

# Reassemble: scaled numeric columns first, then the untouched columns.
train = pd.concat([data_scaled, train[not_std_w_y]], axis=1)
test = pd.concat([test_scaled, test[not_std]], axis=1)

# Encoding des variables catégorielles

In [28]:
# One-hot encode the "Ft" column on both sets.
# The train dummies define the reference column set. The test dummies are
# reindexed onto it so both frames end up with exactly the same Ft_* columns:
# a category absent from test becomes an all-zero column, and a category seen
# only in test is discarded because nothing fitted on train can use it.
ft_dummies_train = pd.get_dummies(train[["Ft"]], dtype=int)
train = pd.concat([train.drop(columns=["Ft"]), ft_dummies_train], axis=1)

encoded = pd.get_dummies(test[["Ft"]], dtype=int).reindex(
    columns=ft_dummies_train.columns, fill_value=0
)
test = pd.concat([test.drop(columns=["Ft"]), encoded], axis=1)

In [29]:
# Impact (target-mean) encoding of the high-cardinality categorical columns.
# For each column, a category is replaced by the difference between its mean
# target value and the average of all category means.
# NOTE(review): the impacts are computed on the whole `train` frame, i.e.
# before the later train_test_split, so the hold-out rows contribute to their
# own encoding. Confirm whether that leakage is acceptable.
cat = ["Cn", "VFN", "Mk", "Man",'Tan', 'T', 'Va', 'Ve', 'Ct', 'Cr', "Country"]
impact_dicts = {}

for categorical_feature in cat:
    category_means = train.groupby(categorical_feature)['Ewltp_(g/km)'].mean()

    # Centre the per-category means around their own average.
    category_impacts = category_means - category_means.mean()
    impact_dicts[categorical_feature] = category_impacts.to_dict()

    train['encoded_' + categorical_feature] = train[categorical_feature].map(
        impact_dicts[categorical_feature]
    )

# Drop the raw columns in one go instead of mutating the frame in place.
train = train.drop(columns=cat)

for categorical_feature in cat:
    impacts = impact_dicts[categorical_feature]

    # Fallback for categories never seen in train: the "Default" entry if one
    # exists, otherwise 0 (a neutral impact, since impacts are centred).
    default_impact = impacts.get("Default", 0)

    # Vectorised lookup: unseen categories come out as NaN and are then filled
    # with the fallback. This replaces a per-row Python lambda.
    test['encoded_' + categorical_feature] = (
        test[categorical_feature].map(impacts).fillna(default_impact)
    )

test = test.drop(columns=cat)

# Sélection de variables

In [36]:
# Keep the non-object columns whose absolute Pearson correlation with the
# target exceeds 0.1. The target itself is removed from the resulting list.
numericals = train.select_dtypes(exclude="object").columns.to_list()
correlations = train[numericals].corr()["Ewltp_(g/km)"]

is_correlated = correlations.abs() > 0.1
selected_features = correlations.index[is_correlated].tolist()
selected_features.remove("Ewltp_(g/km)")

selected_features

['m_(kg)',
 'Mt',
 'ec_(cm3)',
 'Fuel_consumption_',
 'Electric_range_(km)',
 'Ft_DIESEL',
 'Ft_ELECTRIC/HYDROGEN',
 'Ft_HYBRID',
 'Ft_PETROL',
 'encoded_Cn',
 'encoded_VFN',
 'encoded_Mk',
 'encoded_Man',
 'encoded_Tan',
 'encoded_T',
 'encoded_Va',
 'encoded_Ve',
 'encoded_Ct',
 'encoded_Cr',
 'encoded_Country']

In [37]:
# Feature matrix restricted to the selected columns, and the target vector.
X, y = train[selected_features], train["Ewltp_(g/km)"]

In [38]:
# Hold out 20 % of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Modélisation Régression Linéaire

In [45]:
# 4-fold cross-validation of a linear regression on the training split,
# reporting the MAE of each fold and their average.
model = LinearRegression()

print("lancement !")

n_splits = 4
kfold = KFold(n_splits=n_splits)

mae_scores = []

# Run the cross-validation: each fold is held out once.
for train_index, test_index in kfold.split(X_train):
    X_train_fold, X_test_fold = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_fold, y_test_fold = y_train.iloc[train_index], y_train.iloc[test_index]

    # Fit on the remaining folds.
    print("fit en cours... ⏳")
    model.fit(X_train_fold, y_train_fold)
    print("fit terminé 🎉")

    # Predict the held-out fold.
    print("predict en cours... ⏳")
    y_pred_fold = model.predict(X_test_fold)
    print("predict effectué 🎉")

    # MAE for this fold.
    mae_fold = mean_absolute_error(y_test_fold, y_pred_fold)
    mae_scores.append(mae_fold)
    print(f"MAE {mae_fold}")

# Average MAE over all folds.
mae_mean = np.mean(mae_scores)
print(f"MAE moyenne de la régression linéaire avec {n_splits} plis : {round(mae_mean, 3)}")

# Without this refit, `model` would stay fitted on only the last fold's
# training part (3/4 of X_train). Refit on the whole training split so that
# any later evaluation or persistence uses a model trained on all of it.
model.fit(X_train, y_train)

lancement !
fit en cours... ⏳
fit terminé 🎉
predict en cours... ⏳
predict effectué 🎉
MAE 13.126180461560034
fit en cours... ⏳
fit terminé 🎉
predict en cours... ⏳
predict effectué 🎉
MAE 13.096683695429427
fit en cours... ⏳
fit terminé 🎉
predict en cours... ⏳
predict effectué 🎉
MAE 13.109883368609301
fit en cours... ⏳
fit terminé 🎉
predict en cours... ⏳
predict effectué 🎉
MAE 13.110994692041235
MAE moyenne de la régression linéaire avec 4 plis : 13.111


# Sauvegarde des Artefacts

In [42]:
# Persist the fitted model under ../artefacts/RL_<hold-out MAE>/.
# `mae` was not defined anywhere in the notebook, so this cell only worked
# with a leftover kernel variable. Compute it here on the hold-out split so
# the cell survives Restart & Run All.
mae = mean_absolute_error(y_test, model.predict(X_test))

artefact_dir = f"../artefacts/RL_{round(mae,2)}"

# makedirs creates the parent "../artefacts" folder too; exist_ok makes the
# cell safe to re-run and avoids the check-then-create race.
os.makedirs(artefact_dir, exist_ok=True)

dump(model, f"{artefact_dir}/model_RL.joblib")

['../artefacts/RL_13.13/model_RL.joblib']