In [22]:
# =========================
# TRAIN CATBOOST + PREDICTION
# =========================
import joblib
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.model_selection import cross_val_score, train_test_split

In [23]:

# -------------------------
# Parameters
# -------------------------
# Name of the target column in the train / validation CSV files.
Y_NAME = "y"

# Input CSV files, one per split.
TRAIN_FILE, VAL_FILE, TEST_FILE = (
    "../ressources/npyDS/DataSetLasso/train.csv",
    "../ressources/npyDS/DataSetLasso/val.csv",
    "../ressources/npyDS/DataSetLasso/test.csv",
)

# Presumably where the predictions are written -- not used in the cells shown here.
# NOTE(review): this path starts with "./" while the inputs start with "../";
# confirm both resolve against the intended working directory.
SUBMISSION_FILE = "./ressources/results/CATBOOST_Lasso_Optuna.csv"

# Training settings.
RANDOM_STATE = 42
NUM_ITERATIONS = 30000
EARLY_STOPPING_ROUNDS = 50

In [24]:

# -------------------------
# Data loading
# -------------------------
def _features_and_target(frame):
    """Split a frame into ``(X, y)`` NumPy arrays.

    ``y`` is the ``Y_NAME`` column; ``X`` is every other column.
    """
    target = frame[Y_NAME].to_numpy()
    features = frame.drop(columns=[Y_NAME]).to_numpy()
    return features, target


train_df = pd.read_csv(TRAIN_FILE)
val_df = pd.read_csv(VAL_FILE)
test_df = pd.read_csv(TEST_FILE)

X_train, y_train = _features_and_target(train_df)
X_val, y_val = _features_and_target(val_df)

# The test frame is converted as-is: no target column is dropped from it.
X_test = test_df.to_numpy()


In [25]:
def objective(trial):
    """Optuna objective: mean 3-fold cross-validated R² of a CatBoost regressor.

    Args:
        trial: Optuna trial used to sample ``depth``, ``learning_rate``,
            ``l2_leaf_reg``, ``subsample`` and ``colsample_bylevel``.

    Returns:
        float: mean R² over the 3 folds, computed on the module-level
        ``X_train`` / ``y_train`` arrays (higher is better).
    """
    params = {
        # Sampled hyperparameters. The Optuna names are kept identical to the
        # estimator keyword names so ``study.best_params`` can be unpacked
        # straight into the estimator.
        'depth': trial.suggest_int('depth', 4, 8),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'l2_leaf_reg': trial.suggest_int('l2_leaf_reg', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.6, 1.0),
        # Fixed settings shared by every trial.
        'iterations': 2000,
        'random_seed': RANDOM_STATE,
        'loss_function': 'Huber:delta=0.05',
        'eval_metric': 'MAE',
        'verbose': 0,
        'task_type': 'CPU',
    }

    # The previous code called ``xgb(**params)``; ``xgb`` is the xgboost module,
    # which is not callable. The keyword names above are CatBoost's, so the
    # CatBoost regressor is instantiated here.
    model = CatBoostRegressor(**params)

    # The model is trained with a Huber loss, but trials are ranked by R²
    # averaged over 3 cross-validation folds of the training data.
    fold_scores = cross_val_score(
        model,
        X_train,
        y_train,
        cv=3,
        scoring='r2',
    )
    return fold_scores.mean()

In [26]:
# Seed the sampler so that a fresh "Restart & Run All" replays the same trial
# sequence; an unseeded sampler draws different hyperparameters on every run.
# TPESampler is Optuna's default single-objective sampler, so the search
# algorithm itself is unchanged.
study = optuna.create_study(
    direction='maximize',  # the objective returns R², higher is better
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
)
study.optimize(objective, n_trials=50)  # 50 trials; raise for a finer search at a higher cost
print("Meilleurs paramètres :", study.best_params)
print("Meilleur score R² :", study.best_value)

[I 2025-12-17 22:36:41,988] A new study created in memory with name: no-name-7eb67e6e-2b88-46d9-b1aa-6bc357f590c1
[I 2025-12-17 22:37:18,557] Trial 0 finished with value: 0.8979516057590144 and parameters: {'depth': 6, 'learning_rate': 0.029019628514081634, 'l2_leaf_reg': 10, 'subsample': 0.8666756638108309, 'colsample_bylevel': 0.713204740950334}. Best is trial 0 with value: 0.8979516057590144.
[I 2025-12-17 22:37:43,953] Trial 1 finished with value: 0.8443685434471538 and parameters: {'depth': 5, 'learning_rate': 0.026971406516374394, 'l2_leaf_reg': 9, 'subsample': 0.6219775936734664, 'colsample_bylevel': 0.7915166180645159}. Best is trial 0 with value: 0.8979516057590144.
[I 2025-12-17 22:38:53,073] Trial 2 finished with value: 0.7880562395274565 and parameters: {'depth': 8, 'learning_rate': 0.05924582150684593, 'l2_leaf_reg': 7, 'subsample': 0.8755409787440621, 'colsample_bylevel': 0.8398712109372369}. Best is trial 0 with value: 0.8979516057590144.
[I 2025-12-17 22:39:22,532] Tria

Meilleurs paramètres : {'depth': 8, 'learning_rate': 0.037131908555001, 'l2_leaf_reg': 8, 'subsample': 0.9481957612344127, 'colsample_bylevel': 0.6774015121344121}
Meilleur score R² : 0.9085775993770402


In [27]:
# Disabled cell: refit a final model with the best Optuna parameters and
# monitor it on the validation split (X_val, y_val).
# NOTE(review): the update below switches the loss to RMSE, whereas the tuning
# objective used 'Huber:delta=0.05' with eval_metric 'MAE' -- confirm that the
# tuned parameters are meant to be reused with a different loss.
# best_params = study.best_params
# best_params.update({
#     'iterations': 2000,
#     'loss_function': 'RMSE',
#     'eval_metric': 'RMSE',
#     'verbose': 500,
#     'random_seed': 42
# })
#
# final_model = CatBoostRegressor(**best_params)
# final_model.fit(X_train, y_train, eval_set=(X_val, y_val))