In [19]:
import json
import os
import sys 

import numpy as np
import pandas as pd
import xgboost as xgb
from joblib import dump
from sklearn.base import clone
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

# The project-local module lives in ../src, so the path must be extended before importing it.
sys.path.append("../src")
from data_preparation import DataPreparation

# Import des données

In [9]:
# Read the raw training and test tables from the project's data directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/train.csv", "../data/test.csv")
)

  train = pd.read_csv("../data/train.csv")
  test = pd.read_csv("../data/test.csv")


# Data Preparation

In [10]:
# Run the project's preparation pipeline on both tables.
# NOTE: `train` and `test` are rebound to the prepared frames, so the raw
# frames are no longer reachable under these names after this cell runs.
preprocess = DataPreparation(train, test)
prepared_frames = preprocess.prepare_data()
train, test = prepared_frames

Valeurs manquantes du train supprimées ✅
Valeurs manquantes du test supprimées ✅
Variables renommées ✅
Valeurs manquantes numériques imputées ✅
Valeurs manquantes catégorielles imputées ✅


## Encoding
### One Hot Encoding

In [11]:
# Object-typed columns of the training frame are the categorical candidates.
categorical_columns = train.select_dtypes(include = 'object')

# Only columns with fewer than 20 distinct values are one-hot encoded; the
# remaining object columns are handled by the impact encoding further down.
selected_categoricals = [col for col in categorical_columns if train[col].nunique() < 20]

# Remember the columns that existed before encoding so the newly created
# indicator columns can be identified afterwards.
columns_before_encoding = set(train.columns)

train = pd.get_dummies(train, columns=selected_categoricals, dtype = 'int')
test = pd.get_dummies(test, columns=selected_categoricals, dtype = 'int')

# get_dummies only creates an indicator for categories present in the frame it
# is given. A category seen in train but absent from test would therefore leave
# test without that column, and selecting train-derived feature names from
# test would later raise a KeyError. Add those indicators to test as all-zero
# columns. Indicators that exist only in test are left in place; they are never
# selected because the feature list is derived from train.
missing_in_test = [
    col for col in train.columns
    if col not in columns_before_encoding and col not in test.columns
]
if missing_in_test:
    test = test.assign(**{col: 0 for col in missing_in_test})

### Impact encoding

In [12]:
# Mapping: column name -> {category value -> impact}, reused to encode the test set.
impact_dicts = {}

# Object columns still present in train (i.e. not one-hot encoded above).
categorical_columns = train.select_dtypes(include = 'object')

for categorical_feature in categorical_columns.columns:
    # Mean target per category, centred on the unweighted mean of those
    # per-category means (not on the overall target mean).
    target_by_category = train.groupby(categorical_feature)['Ewltp_(g/km)'].mean()
    centred_impacts = target_by_category - target_by_category.mean()
    impact_dicts[categorical_feature] = centred_impacts.to_dict()

    # Replace the raw categorical column with its numeric encoding.
    encoded_name = 'encoded_' + categorical_feature
    train[encoded_name] = train[categorical_feature].map(impact_dicts[categorical_feature])
    train.drop(columns= categorical_feature, inplace=True)

In [13]:
# Apply the impacts learned on train to the same columns of the test set.
for categorical_feature in categorical_columns.columns:
    feature_impacts = impact_dicts[categorical_feature]

    # Fallback for categories never seen in train: the impact stored under the
    # literal key "Default" if such a category exists, otherwise 0.
    default_impact = feature_impacts.get("Default", 0)

    encoded_name = 'encoded_' + categorical_feature
    test[encoded_name] = test[categorical_feature].apply(
        lambda value: feature_impacts.get(value, default_impact)
    )

    # The raw categorical column is no longer needed once encoded.
    test.drop(columns=categorical_feature, inplace=True)


## Feature Engineering

In [14]:
# Engineered feature: `m_(kg)` divided by `ec_(cm3)` (presumably vehicle mass
# per unit of engine capacity — confirm against the data dictionary).
# A zero `ec_(cm3)` makes the division produce +/-inf. Infinite values turn the
# column's correlation with the target into NaN (so the feature would be
# silently filtered out) and are not accepted by XGBoost, whereas NaN is
# treated as a missing value. Convert +/-inf to NaN for that reason.
infinite_values = [np.inf, -np.inf]
train["rapport_poids_puissance"] = (train["m_(kg)"]/train["ec_(cm3)"]).replace(infinite_values, np.nan)
test["rapport_poids_puissance"] = (test["m_(kg)"]/test["ec_(cm3)"]).replace(infinite_values, np.nan)

## Feature selection

In [15]:
# Correlation of every column with the target.
correlations = train.corr()["Ewltp_(g/km)"]

# Keep columns whose absolute correlation exceeds 0.20. Despite the variable
# name, negatively correlated columns are retained as well.
is_strongly_correlated = correlations.abs() > 0.20
positive_correlations = correlations[is_strongly_correlated].index.tolist()

# Snapshot of train restricted to the retained columns, target included.
clean_train_filtered = train[positive_correlations]

# The target correlates perfectly with itself; remove it from the feature list.
positive_correlations.remove("Ewltp_(g/km)")

# Model

In [16]:
# Target vector and feature matrix restricted to the retained columns.
y = train["Ewltp_(g/km)"]
X = train[positive_correlations]

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=889,
)

In [21]:
# Hyper-parameters, kept as constants because they are also written to the
# run's artefacts later in the notebook.
N_ESTIMATORS = 500
MAX_DEPTH = 30

# Unfitted regressor; every other XGBoost setting keeps its library default.
xgb_hyperparameters = {"n_estimators": N_ESTIMATORS, "max_depth": MAX_DEPTH}
xgb_model = xgb.XGBRegressor(**xgb_hyperparameters)

In [22]:
print("lancement !")

n_splits = 2
kfold = KFold(n_splits=n_splits)

mae_scores = []
best_mae = float('inf')
best_model = None

# Run the cross-validation on the training part of the hold-out split.
for train_index, test_index in kfold.split(X_train):
    X_train_fold, X_test_fold = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_fold, y_test_fold = y_train.iloc[train_index], y_train.iloc[test_index]

    # Fit an independent, unfitted copy for each fold. Refitting the single
    # shared `xgb_model` would make every stored reference point at the model
    # of the LAST fold, so "best_model" would not be the best-scoring one.
    fold_model = clone(xgb_model)

    # Fit on the training part of the fold
    print("fit en cours... ⏳")
    fold_model.fit(X_train_fold, y_train_fold)
    print("fit terminé 🎉")

    # Predict on the validation part of the fold
    print("predict en cours... ⏳")
    y_pred_fold = fold_model.predict(X_test_fold)
    print("predict effectué 🎉")

    # MAE for this fold
    mae_fold = mean_absolute_error(y_test_fold, y_pred_fold)
    mae_scores.append(mae_fold)
    print(f"MAE {mae_fold}")

    # Keep the model of the fold with the lowest MAE seen so far
    if mae_fold < best_mae:
        best_mae = mae_fold
        best_model = fold_model

mae_mean = np.mean(mae_scores)
print(f"MAE moyenne de XGBoost avec {n_splits} plis : {mae_mean}")

lancement !
fit en cours... ⏳
fit terminé 🎉
predict en cours... ⏳
predict effectué 🎉
MAE 2.9082179798084393
fit en cours... ⏳
fit terminé 🎉
predict en cours... ⏳
predict effectué 🎉
MAE 2.911776395534477
MAE moyenne de XGBoost avec 2 plis : 2.909997187671458


# Artefacts

In [23]:
# Description of the run (model family and hyper-parameters) that is saved
# next to the serialized model.
parameters_dictionnary = dict(
    Model="XGBoost",
    N_ESTIMATORS=N_ESTIMATORS,
    MAX_DEPTH=MAX_DEPTH,
)

In [24]:
# All artefacts of this run go into one directory named after the mean
# cross-validation MAE rounded to two decimals. Two runs with the same rounded
# score share the directory and overwrite each other's files.
artefact_dir = f"../artefacts/XGB_{round(mae_mean,2)}"

# Creates the parent "../artefacts" directory as needed and does nothing if the
# directory already exists, replacing the separate exists-then-create checks.
os.makedirs(artefact_dir, exist_ok=True)

# The hyper-parameters are stored as JSON in a file named "parameters" (no extension).
with open(f"{artefact_dir}/parameters", "w") as json_file:
    json.dump(parameters_dictionnary, json_file)

# Serialize the selected model; as the last expression of the cell, the list
# of written file names returned by joblib is displayed.
dump(best_model, f"{artefact_dir}/XGB.joblib")

['../artefacts/XGB_2.91/XGB.joblib']

# Prediction

In [None]:
# Restrict the test set to the features the model was trained on, then predict.
# (The `_rf` suffix is only a name; the model used here is the XGBoost one.)
test_rf = test.loc[:, positive_correlations]
prediction = best_model.predict(test_rf)

In [None]:
# Build the submission table: the test identifiers plus the predicted target.
# The output column is named "Ewltp (g/km)" with a space, unlike the
# underscore-separated name used for the target inside the notebook.
submission = test[["ID"]].assign(**{"Ewltp (g/km)": prediction})
submission.to_csv("../data/submission_XGB.csv", index=False)