In [1]:
# =========================
# TRAIN XGBOOST + PREDICTION
# =========================

import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import joblib

In [2]:

# -------------------------
# Paramètres
# -------------------------
TRAIN_FILE = "./ressources/npyDS/DataSetLasso/train.csv"
VAL_FILE   = "./ressources/npyDS/DataSetLasso/val.csv"
TEST_FILE  = "./ressources/npyDS/DataSetLasso/test.csv"
SUBMISSION_FILE = "ressources/results/XGB_Lasso_Optuna.csv"
MODEL_FILE = "./ressources/models/xgb_lasso_optuna.joblib"

EARLY_STOPPING_ROUNDS = 50
NUM_BOOST_ROUND = 20000
RANDOM_STATE = 42

In [3]:

# -------------------------
# Chargement des données
# -------------------------
train_df = pd.read_csv(TRAIN_FILE)
val_df   = pd.read_csv(VAL_FILE)
test_df  = pd.read_csv(TEST_FILE)

y_train = train_df["y"].to_numpy()
X_train = train_df.drop(columns=["y"]).to_numpy()

y_val = val_df["y"].to_numpy()
X_val = val_df.drop(columns=["y"]).to_numpy()

X_test = test_df.to_numpy()

In [4]:
# -------------------------
# Dataset XGBoost
# -------------------------
dtrain = xgb.DMatrix(X_train, label=y_train)
dval   = xgb.DMatrix(X_val, label=y_val)
dtest  = xgb.DMatrix(X_test)

# -------------------------
# Paramètres XGBoost
# -------------------------
params = {
 'n_estimators': 2993, 'max_depth': 7, 'learning_rate': 0.08763675484099007, 'subsample': 0.9700702911996296, 'colsample_bytree': 0.852472208909238, 'min_child_weight': 9, 'reg_alpha': 0.018564423579985112, 'reg_lambda': 0.22538057440301235
}

#Meilleurs paramètres : {'n_estimators': 2993, 'max_depth': 7, 'learning_rate': 0.08763675484099007, 'subsample': 0.9700702911996296, 'colsample_bytree': 0.852472208909238, 'min_child_weight': 9, 'reg_alpha': 0.018564423579985112, 'reg_lambda': 0.22538057440301235}

In [5]:

# -------------------------
# Entrainement avec early stopping
# -------------------------
evals = [(dtrain, "train"), (dval, "val")]

model = xgb.train(
    params,
    dtrain,
    num_boost_round=NUM_BOOST_ROUND,
    evals=evals,
    early_stopping_rounds=EARLY_STOPPING_ROUNDS,
    verbose_eval=50
)

[0]	train-rmse:0.19256	val-rmse:0.19274


Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()


[50]	train-rmse:0.07175	val-rmse:0.07319
[100]	train-rmse:0.05985	val-rmse:0.06261
[150]	train-rmse:0.05483	val-rmse:0.05900
[200]	train-rmse:0.05136	val-rmse:0.05664
[250]	train-rmse:0.04885	val-rmse:0.05524
[300]	train-rmse:0.04665	val-rmse:0.05412
[350]	train-rmse:0.04491	val-rmse:0.05338
[400]	train-rmse:0.04343	val-rmse:0.05282
[450]	train-rmse:0.04193	val-rmse:0.05223
[500]	train-rmse:0.04077	val-rmse:0.05190
[550]	train-rmse:0.03967	val-rmse:0.05164
[600]	train-rmse:0.03861	val-rmse:0.05126
[650]	train-rmse:0.03766	val-rmse:0.05102
[700]	train-rmse:0.03672	val-rmse:0.05078
[750]	train-rmse:0.03579	val-rmse:0.05050
[800]	train-rmse:0.03509	val-rmse:0.05036
[850]	train-rmse:0.03433	val-rmse:0.05026
[900]	train-rmse:0.03362	val-rmse:0.05010
[950]	train-rmse:0.03296	val-rmse:0.05000
[1000]	train-rmse:0.03234	val-rmse:0.04990
[1050]	train-rmse:0.03179	val-rmse:0.04984
[1100]	train-rmse:0.03125	val-rmse:0.04974
[1150]	train-rmse:0.03073	val-rmse:0.04970
[1200]	train-rmse:0.03021	val-r

In [6]:

# -------------------------
# Sauvegarde du modèle
# -------------------------
joblib.dump(model, MODEL_FILE)

['./ressources/models/xgb_lasso_optuna.joblib']

In [7]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Prédiction sur la validation
y_val_pred = model.predict(dval)

# Métriques
rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
mae = mean_absolute_error(y_val, y_val_pred)
r2 = r2_score(y_val, y_val_pred)

print(f"RMSE : {rmse:.4f}")
print(f"MAE  : {mae:.4f}")
print(f"R²   : {r2:.4f}")

RMSE : 0.0493
MAE  : 0.0346
R²   : 0.9434


In [8]:

# -------------------------
# Prédiction sur test
# -------------------------
y_test_pred = model.predict(dtest)

# IDs depuis le dataset brut
df_test_raw = pd.read_csv("./ressources/test.csv")
ids = df_test_raw["id"]

# Création du CSV de submission
df_submission = pd.DataFrame({
    "id": ids,
    "wip": 0,
    "investissement": 0,
    "satisfaction": y_test_pred
})

df_submission.to_csv(SUBMISSION_FILE, index=False)

print(f"✅ Fichier {SUBMISSION_FILE} généré avec succès")

✅ Fichier ressources/results/XGB_Lasso_Optuna.csv généré avec succès
