In [None]:
### Step 2: Training, testing, and selection of the best model
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
import numpy as np
import pandas as pd

X = pd.read_csv("X_encoded.csv")
y = pd.read_csv("y_log_price.csv").squeeze()

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Let's test different models naively and compare their test metrics on a single data split(train=0.8, test=0,2) and 5 in cross-validation
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "XGBoost": XGBRegressor(n_estimators=100, random_state=42, verbosity=0)
}

results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred_log = model.predict(X_test)
    y_pred = np.exp(y_pred_log)
    y_test_eur = np.exp(y_test)

    mae = mean_absolute_error(y_test_eur, y_pred)
    rmse = mean_squared_error(y_test_eur, y_pred) ** 0.5
    r2 = r2_score(y_test_eur, y_pred)

    scores = cross_val_score(model, X, y, cv=5, scoring="r2")

    results.append({
        "Modèle": name,
        "MAE (€)": round(mae, 2),
        "RMSE (€)": round(rmse, 2),
        "R² Score": round(r2, 4),
        "R² CV Mean": round(np.mean(scores), 4),
        "R² CV Std": round(np.std(scores), 4)
    })

results_df = pd.DataFrame(results).sort_values(by="R² Score", ascending=False)
results_df


Unnamed: 0,Modèle,MAE (€),RMSE (€),R² Score,R² CV Mean,R² CV Std
3,Random Forest,1033.59,1711.85,0.9454,0.4076,0.2703
4,XGBoost,1086.27,1721.63,0.9448,0.56,0.2089
2,Decision Tree,1205.98,2065.55,0.9205,0.0593,0.6955
0,Linear Regression,1636.66,2537.28,0.8801,0.7221,0.1708
1,Ridge Regression,1636.48,2537.8,0.8801,0.7231,0.1696


In [12]:
# We targeted xgboost but found that the average cross-validation was still quite low, 
# even though it was above 50, so we decided to do a grid search to test different combinations and obtain the best metrics and hyperparameters.
param_grid_xgb = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 6, 10],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

grid_xgb = GridSearchCV(
    estimator=XGBRegressor(random_state=42, verbosity=0),
    param_grid=param_grid_xgb,
    scoring='r2',
    cv=5,
    verbose=1,
    n_jobs=-1
)

grid_xgb.fit(X, y)
best_xgb = grid_xgb.best_estimator_

print("\nBest XGBoost hyperparameters (GridSearchCV):")
print(grid_xgb.best_params_)

Fitting 5 folds for each of 72 candidates, totalling 360 fits

Best XGBoost hyperparameters (GridSearchCV):
{'colsample_bytree': 0.8, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 300, 'subsample': 1.0}


In [13]:
# Final evaluation of the best XGBoost model
xgb_cv_scores = cross_val_score(best_xgb, X, y, cv=5, scoring='r2')
best_pred_log = best_xgb.predict(X_test)
best_pred = np.exp(best_pred_log)
best_r2 = r2_score(np.exp(y_test), best_pred)
best_mae = mean_absolute_error(np.exp(y_test), best_pred)
best_rmse = mean_squared_error(np.exp(y_test), best_pred) ** 0.5

print("\nOptimized XGBoost scores:")
print(f"R²: {best_r2:.4f}, MAE: {best_mae:.2f} €, RMSE: {best_rmse:.2f} €, R² CV Mean : {round(np.mean(xgb_cv_scores), 4)}, R² CV Std : {round(np.std(xgb_cv_scores), 4)}")


Optimized XGBoost scores:
R²: 0.9323, MAE: 1237.24 €, RMSE: 1906.86 €, R² CV Mean : 0.7174, R² CV Std : 0.1262


In [14]:
import joblib
from xgboost import XGBRegressor

final_model = XGBRegressor(
    colsample_bytree=0.8,
    learning_rate=0.05,
    max_depth=3,
    n_estimators=300,
    subsample=1.0,
    random_state=42,
    verbosity=0
)
final_model.fit(X, y)

joblib.dump(final_model, "xgboost_final_model.pkl")

print("XGBoost model (optimized) saved as 'xgboost_final_model.pkl'")

XGBoost model (optimized) saved as 'xgboost_final_model.pkl'
