In [17]:
#basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#ML libraries
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer 
from sklearn.preprocessing import StandardScaler, OneHotEncoder 
import sys
sys.path.append('../src')
import optuna
import joblib
RANDOM_STATE = 42

In [8]:
# Load preprocessed data
df = pd.read_csv("../../data/X_train_preprocessed.csv")

In [9]:
# Define features and target
target = "PrimeCommerciale"
X = df.drop(columns=[target])
y = df[target]

In [11]:
# split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
     X, y, test_size=0.2, random_state=RANDOM_STATE
 )

In [12]:
var_num = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
var_cat = X.select_dtypes(include=['object', 'category']).columns.tolist()

In [13]:
# Define preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
         ('num', StandardScaler(), var_num),           
         ('cat', OneHotEncoder(handle_unknown='ignore'), var_cat)
     ]
 )

In [16]:
# Evaluation function
def evaluate_regression(y_true, y_pred):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    return rmse, r2

In [18]:
# ===========================
# Optuna objective for XGBoost
# ===========================
def objective_xgb(trial):
    n_estimators = trial.suggest_int("n_estimators", 100, 500)
    max_depth = trial.suggest_int("max_depth", 3, 10)
    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.2)
    subsample = trial.suggest_float("subsample", 0.5, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.5, 1.0)
    
    model = Pipeline([
        ('preprocess', preprocessor),
        ('model', XGBRegressor(
            n_estimators=n_estimators,
            max_depth=max_depth,
            learning_rate=learning_rate,
            subsample=subsample,
            colsample_bytree=colsample_bytree,
            objective='reg:squarederror',
            random_state=42
        ))
    ])
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse, _ = evaluate_regression(y_test, y_pred)
    return rmse

# Optuna study for XGBoost
study_xgb = optuna.create_study(direction="minimize")
study_xgb.optimize(objective_xgb, n_trials=30)

print("Best XGB params:", study_xgb.best_params)

# Train best XGBoost model
best_xgb = Pipeline([
    ('preprocess', preprocessor),
    ('model', XGBRegressor(**study_xgb.best_params, objective='reg:squarederror', random_state=42))
])
best_xgb.fit(X_train, y_train)
y_pred_xgb = best_xgb.predict(X_test)
rmse_xgb, r2_xgb = evaluate_regression(y_test, y_pred_xgb)
print(f"\nXGBoost Regressor --> RMSE: {rmse_xgb:.4f}, R2: {r2_xgb:.4f}")

[I 2025-12-23 22:55:40,361] A new study created in memory with name: no-name-743fc8ee-e7ca-4702-8923-d35a31abc8e9
[I 2025-12-23 22:55:40,994] Trial 0 finished with value: 107.65372968301192 and parameters: {'n_estimators': 169, 'max_depth': 8, 'learning_rate': 0.04757170948730508, 'subsample': 0.8018012801794322, 'colsample_bytree': 0.572660232770942}. Best is trial 0 with value: 107.65372968301192.
[I 2025-12-23 22:55:42,163] Trial 1 finished with value: 115.45452326994757 and parameters: {'n_estimators': 302, 'max_depth': 8, 'learning_rate': 0.010168450872262837, 'subsample': 0.7853784768029484, 'colsample_bytree': 0.5553982886966744}. Best is trial 0 with value: 107.65372968301192.
[I 2025-12-23 22:55:43,304] Trial 2 finished with value: 107.09815351365413 and parameters: {'n_estimators': 364, 'max_depth': 8, 'learning_rate': 0.018392988461587708, 'subsample': 0.724758095013798, 'colsample_bytree': 0.9378601791636292}. Best is trial 2 with value: 107.09815351365413.
[I 2025-12-23 22

Best XGB params: {'n_estimators': 488, 'max_depth': 7, 'learning_rate': 0.0465993926653307, 'subsample': 0.6903876261245432, 'colsample_bytree': 0.8873702240299636}

XGBoost Regressor --> RMSE: 106.1646, R2: 0.7594


In [None]:
# Save the best model
joblib.dump(best_xgb, "../../models/best_xgb_regressor_model.pkl")

['../../models/best_xgb_regressor_model.pkl']