In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import TargetEncoder
from sklearn.ensemble import GradientBoostingRegressor


# chargement datasets

In [2]:
clean_train = pd.read_pickle("../data/clean_train.p")
clean_test = pd.read_pickle("../data/clean_test.p")

# features selection et encoding

In [5]:
var_explicatives = ['VFN', 'Tan', 'T', 'Va', 'Ve', 'Cn', 'Ft', 'Fm', 'Fuel_consumption_', 'Electric_range_(km)']

X = clean_train[var_explicatives]
test_rf = clean_test[var_explicatives]
y = clean_train["Ewltp_(g/km)"]


In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)

In [7]:
encoder = TargetEncoder()
cat_col = clean_train[var_explicatives].select_dtypes(include="object")
for col in cat_col:
    X_train[col] = encoder.fit_transform(X_train[col].values.reshape(-1,1),y_train)
    X_test[col] = encoder.transform(X_test[col].values.reshape(-1,1))
    test_rf[col] = encoder.transform(test_rf[col].values.reshape(-1,1))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_rf[col] = encoder.transform(test_rf[col].values.reshape(-1,1))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_rf[col] = encoder.transform(test_rf[col].values.reshape(-1,1))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_rf[col] = encoder.transform(test_rf[col].values.reshape(-1,1))
A

# Modélisation

In [10]:
N_ESTIMATORS = 100
MAX_DEPTH = 20
N_ITERATION = 100
print("lancement !")
model_xgboost = GradientBoostingRegressor(n_estimators=N_ESTIMATORS, max_depth=MAX_DEPTH,random_state=10)
print("fit en cours... ⏳")
model_xgboost.fit(X_train,y_train)
print("fit terminé 🎉")
print("predict en cours... ⏳")
y_pred = model_xgboost.predict(X_test)
print("predict effectué 🎉")
mae_xgb = mean_absolute_error(y_test, y_pred)
print(f"MAE de XBG : {mae_xgb}")

lancement !
fit en cours... ⏳
fit terminé 🎉
predict en cours... ⏳
predict effectué 🎉
MAE de XBG : 3.1017684945523114


# sauvegarde des artefacts

In [11]:
parameters_dictionnary = {"Model" : "XGBoost", 
                          "N_ESTIMATORS" : N_ESTIMATORS, 
                          'MAX_DEPTH' : MAX_DEPTH,
                          'N_ITERATIONS' : N_ITERATION,
                          "MAE" : mae_xgb}

In [12]:
import os
import json
from joblib import dump

if not os.path.exists("../artefacts/"):
    os.makedirs("../artefacts/")

if not os.path.exists(f"../artefacts/XGBoost_{round(mae_xgb,2)}"):
    os.makedirs(f"../artefacts/XGBoost_{round(mae_xgb,2)}")

with open(f"../artefacts/XGBoost_{round(mae_xgb,2)}/parameters", "w") as json_file:
    json.dump(parameters_dictionnary, json_file)

dump(model_xgboost, f"../artefacts/XGBoost_{round(mae_xgb,2)}/RF.joblib")

['../artefacts/XGBoost_3.1/RF.joblib']

# Submission

In [13]:
prediction = model_xgboost.predict(test_rf)

submission = clean_test[["ID"]].copy()  
submission["Ewltp (g/km)"] = prediction
submission.to_csv("../data/sample_submission6.csv", index=False)