# Notebook 4 : Model Training and Evaluation

## Environment Setup

In [6]:
# Standard libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import os
import pickle
import warnings
warnings.filterwarnings('ignore')

# Sklearn models
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Boosting libraries
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor

print("All libraries imported successfully!")

All libraries imported successfully!


## Parameters

In [7]:
# Reproducibility and cross-validation settings.
# R_seed seeds NumPy's global RNG here and is also used as the splitter's
# random_state, so the shuffled folds are the same on every run.
R_seed = 42
N_FOLDS = 5

np.random.seed(R_seed)

# Shuffled K-Fold splitter, reused by the grid searches further down
kfold = KFold(
    n_splits=N_FOLDS,
    shuffle=True,
    random_state=R_seed,
)

print(
    f"Random seed: {R_seed}",
    f"Cross-validation: {N_FOLDS}-Fold",
    sep="\n",
)

Random seed: 42
Cross-validation: 5-Fold


## Load Preprocessed Datasets

In [8]:
# Directory that holds the preprocessed dataset pickles.
DATA_PROCESSED_DIR = '../data/data_processed'

# Keys every dataset bundle must provide.
_SPLIT_KEYS = ('X_train', 'X_test', 'y_train', 'y_test')


def load_processed_dataset(filename):
    """
    Load one preprocessed dataset bundle and sanity-check its splits.

    Parameters:
    -----------
    filename : str, name of the pickle file inside DATA_PROCESSED_DIR

    Returns:
    --------
    dict : the unpickled bundle, guaranteed to contain the keys
           'X_train', 'X_test', 'y_train', 'y_test'

    Raises:
    -------
    KeyError : if one of the four split keys is missing
    ValueError : if features and target of a split differ in length
    """
    filepath = f"{DATA_PROCESSED_DIR}/{filename}"

    # NOTE(review): pickle.load can execute arbitrary code embedded in the
    # file. Only load pickles that were produced by this project.
    with open(filepath, 'rb') as f:
        bundle = pickle.load(f)

    missing = [key for key in _SPLIT_KEYS if key not in bundle]
    if missing:
        raise KeyError(f"{filepath} is missing expected keys: {missing}")

    for split in ('train', 'test'):
        n_rows, n_targets = len(bundle[f'X_{split}']), len(bundle[f'y_{split}'])
        if n_rows != n_targets:
            raise ValueError(
                f"{filepath}: X_{split} has {n_rows} rows but y_{split} has {n_targets}"
            )

    return bundle


# Load Dataset 1: Linear Models
data_linear = load_processed_dataset('dataset_linear_models.pkl')
X_train_linear, X_test_linear = data_linear['X_train'], data_linear['X_test']
y_train_linear, y_test_linear = data_linear['y_train'], data_linear['y_test']

print("Dataset 1 (Linear Models) loaded:")
print(f"  Train: {X_train_linear.shape}, Test: {X_test_linear.shape}")

# Load Dataset 2: Tree-based Models
data_tree = load_processed_dataset('dataset_tree_models.pkl')
X_train_tree, X_test_tree = data_tree['X_train'], data_tree['X_test']
y_train_tree, y_test_tree = data_tree['y_train'], data_tree['y_test']

print("\nDataset 2 (Tree Models) loaded:")
print(f"  Train: {X_train_tree.shape}, Test: {X_test_tree.shape}")

# Load Dataset 3: CatBoost (also carries the categorical column indices)
data_catboost = load_processed_dataset('dataset_catboost.pkl')
X_train_catboost, X_test_catboost = data_catboost['X_train'], data_catboost['X_test']
y_train_catboost, y_test_catboost = data_catboost['y_train'], data_catboost['y_test']
cat_features_idx = data_catboost['cat_features_idx']

print("\nDataset 3 (CatBoost) loaded:")
print(f"  Train: {X_train_catboost.shape}, Test: {X_test_catboost.shape}")
print(f"  Categorical features: {len(cat_features_idx)}")

Dataset 1 (Linear Models) loaded:
  Train: (16000, 57), Test: (4000, 57)

Dataset 2 (Tree Models) loaded:
  Train: (16000, 75), Test: (4000, 75)

Dataset 3 (CatBoost) loaded:
  Train: (16000, 67), Test: (4000, 67)
  Categorical features: 11


## Evaluation Function

In [9]:
def evaluate_model(model, X_train, y_train, X_test, y_test, model_name):
    """
    Evaluate a trained model on train and test sets and print a report.

    Parameters:
    -----------
    model : trained model exposing a predict(X) method
    X_train, y_train : training data
    X_test, y_test : test data
    model_name : str, name of the model (used in the report and the result)

    Returns:
    --------
    dict : 'Model' plus RMSE, MAE and R2 for each split, under the keys
           'Train_RMSE', 'Train_MAE', 'Train_R2',
           'Test_RMSE', 'Test_MAE', 'Test_R2'
    """

    def _split_metrics(prefix, y_true, y_pred):
        # Single implementation for both splits so they cannot drift apart.
        return {
            f'{prefix}_RMSE': np.sqrt(mean_squared_error(y_true, y_pred)),
            f'{prefix}_MAE': mean_absolute_error(y_true, y_pred),
            f'{prefix}_R2': r2_score(y_true, y_pred),
        }

    # Predictions + metrics (train first, then test, to keep the key order)
    metrics = {'Model': model_name}
    metrics.update(_split_metrics('Train', y_train, model.predict(X_train)))
    metrics.update(_split_metrics('Test', y_test, model.predict(X_test)))

    # Print results
    rule = '=' * 80
    print(f"\n{rule}")
    print(f"{model_name} - Performance Metrics")
    print(rule)
    for heading, prefix in (("Training Set:", 'Train'), ("\nTest Set:", 'Test')):
        rmse = metrics[f'{prefix}_RMSE']
        mae = metrics[f'{prefix}_MAE']
        r2 = metrics[f'{prefix}_R2']
        print(heading)
        print(f"  RMSE: {rmse:.4f}")
        print(f"  MAE:  {mae:.4f}")
        # The label was previously emitted as a mis-encoded byte sequence.
        print(f"  R²:   {r2:.4f}")
    print(rule)

    return metrics

# Accumulator for the per-model metric dicts; starts empty every time this
# cell is run.
all_results = list()

print("Evaluation function defined")

Evaluation function defined


---
# Model 1: Ridge Regression

In [5]:
print("Training Ridge Regression...")

# Search space: regularisation strength crossed with every listed solver
ridge_params = dict(
    alpha=[0.001, 0.01, 0.1, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0, 500.0, 1000.0],
    solver=['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
)

ridge = Ridge(max_iter=10000, random_state=R_seed)

# Exhaustive search over ridge_params, scored by negated RMSE on the K-Fold splitter
ridge_grid = GridSearchCV(
    estimator=ridge,
    param_grid=ridge_params,
    scoring='neg_root_mean_squared_error',
    cv=kfold,
    n_jobs=-1,
    verbose=2,
)
ridge_grid.fit(X_train_linear, y_train_linear)

# best_score_ holds a negated RMSE, hence the sign flip for display
print(
    f"\nBest parameters: {ridge_grid.best_params_}",
    f"Best CV RMSE: {-ridge_grid.best_score_:.4f}",
    sep="\n",
)

# Score the refitted best estimator on train and test, then record it
ridge_best = ridge_grid.best_estimator_
ridge_metrics = evaluate_model(
    model=ridge_best,
    X_train=X_train_linear,
    y_train=y_train_linear,
    X_test=X_test_linear,
    y_test=y_test_linear,
    model_name='Ridge Regression',
)
all_results.append(ridge_metrics)

Training Ridge Regression...
Fitting 5 folds for each of 77 candidates, totalling 385 fits

Best parameters: {'alpha': 0.01, 'solver': 'auto'}
Best CV RMSE: 3.4540

Ridge Regression - Performance Metrics
Training Set:
  RMSE: 3.4332
  MAE:  2.6907
  R²:   0.8038

Test Set:
  RMSE: 3.6508
  MAE:  2.8106
  R²:   0.7853


---
# Model 2: Lasso Regression

In [6]:
print("Training Lasso Regression...")

# Search space: L1 penalty strength crossed with the coordinate update order
lasso_params = dict(
    alpha=[0.001, 0.01, 0.1, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0],
    selection=['cyclic', 'random'],
)

lasso = Lasso(max_iter=10000, random_state=R_seed)

# Exhaustive search over lasso_params, scored by negated RMSE on the K-Fold splitter
lasso_grid = GridSearchCV(
    estimator=lasso,
    param_grid=lasso_params,
    scoring='neg_root_mean_squared_error',
    cv=kfold,
    n_jobs=-1,
    verbose=2,
)
lasso_grid.fit(X_train_linear, y_train_linear)

# best_score_ holds a negated RMSE, hence the sign flip for display
print(
    f"\nBest parameters: {lasso_grid.best_params_}",
    f"Best CV RMSE: {-lasso_grid.best_score_:.4f}",
    sep="\n",
)

# Score the refitted best estimator on train and test, then record it
lasso_best = lasso_grid.best_estimator_
lasso_metrics = evaluate_model(
    model=lasso_best,
    X_train=X_train_linear,
    y_train=y_train_linear,
    X_test=X_test_linear,
    y_test=y_test_linear,
    model_name='Lasso Regression',
)
all_results.append(lasso_metrics)

Training Lasso Regression...
Fitting 5 folds for each of 18 candidates, totalling 90 fits

Best parameters: {'alpha': 0.001, 'selection': 'cyclic'}
Best CV RMSE: 3.4541

Lasso Regression - Performance Metrics
Training Set:
  RMSE: 3.4337
  MAE:  2.6903
  R²:   0.8037

Test Set:
  RMSE: 3.6487
  MAE:  2.8083
  R²:   0.7856


---
# Model 3: ElasticNet

In [7]:
print("Training ElasticNet...")

# Search space: penalty strength x L1/L2 mix x coordinate update order
elasticnet_params = dict(
    alpha=[0.001, 0.01, 0.1, 0.5, 1.0, 5.0, 10.0, 50.0],
    l1_ratio=[0.1, 0.3, 0.5, 0.7, 0.9, 0.95, 0.99],
    selection=['cyclic', 'random'],
)

elasticnet = ElasticNet(max_iter=10000, random_state=R_seed)

# Exhaustive search over elasticnet_params, scored by negated RMSE on the K-Fold splitter
elasticnet_grid = GridSearchCV(
    estimator=elasticnet,
    param_grid=elasticnet_params,
    scoring='neg_root_mean_squared_error',
    cv=kfold,
    n_jobs=-1,
    verbose=2,
)
elasticnet_grid.fit(X_train_linear, y_train_linear)

# best_score_ holds a negated RMSE, hence the sign flip for display
print(
    f"\nBest parameters: {elasticnet_grid.best_params_}",
    f"Best CV RMSE: {-elasticnet_grid.best_score_:.4f}",
    sep="\n",
)

# Score the refitted best estimator on train and test, then record it
elasticnet_best = elasticnet_grid.best_estimator_
elasticnet_metrics = evaluate_model(
    model=elasticnet_best,
    X_train=X_train_linear,
    y_train=y_train_linear,
    X_test=X_test_linear,
    y_test=y_test_linear,
    model_name='ElasticNet',
)
all_results.append(elasticnet_metrics)

Training ElasticNet...
Fitting 5 folds for each of 112 candidates, totalling 560 fits

Best parameters: {'alpha': 0.001, 'l1_ratio': 0.99, 'selection': 'cyclic'}
Best CV RMSE: 3.4542

ElasticNet - Performance Metrics
Training Set:
  RMSE: 3.4338
  MAE:  2.6903
  R²:   0.8037

Test Set:
  RMSE: 3.6486
  MAE:  2.8082
  R²:   0.7856


---
# Model 4: Random Forest

In [None]:
print("Training Random Forest Regressor...")

# Hyperparameter grid: 432 combinations (2,160 fits with 5-fold CV)
rf_params = {
    'n_estimators': [100, 300, 500],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False]
}

# Parallelism is handled at the search level (GridSearchCV n_jobs=-1 below).
# Leaving n_jobs=-1 on the forest as well would start a full set of threads
# inside every search worker and oversubscribe the CPU, so the estimator
# itself stays single-threaded. n_jobs does not affect the fitted trees.
rf = RandomForestRegressor(random_state=R_seed, n_jobs=1)
rf_grid = GridSearchCV(
    rf,
    rf_params,
    cv=kfold,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    verbose=2
)

rf_grid.fit(X_train_tree, y_train_tree)

# best_score_ is a negated RMSE, so flip the sign for display
print(f"\nBest parameters: {rf_grid.best_params_}")
print(f"Best CV RMSE: {-rf_grid.best_score_:.4f}")

# Evaluate the refitted best estimator and record its metrics
rf_best = rf_grid.best_estimator_
rf_metrics = evaluate_model(
    rf_best,
    X_train_tree, y_train_tree,
    X_test_tree, y_test_tree,
    'Random Forest'
)
all_results.append(rf_metrics)

Training Random Forest Regressor...
Fitting 5 folds for each of 432 candidates, totalling 2160 fits

Best parameters: {'bootstrap': False, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
Best CV RMSE: 3.2242

Random Forest - Performance Metrics
Training Set:
  RMSE: 0.0000
  MAE:  0.0000
  R²:   1.0000

Test Set:
  RMSE: 3.2527
  MAE:  2.2860
  R²:   0.8296


---
# Model 5: XGBoost

In [10]:
print("Training XGBoost Regressor...")

# Hyperparameter grid: 2,916 combinations (14,580 fits with 5-fold CV)
xgb_params = {
    'n_estimators': [100, 300, 500],
    'max_depth': [3, 5, 7, 9],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9],
    'min_child_weight': [1, 3, 5],
    'gamma': [0, 0.1, 0.3]
}

xgb_model = xgb.XGBRegressor(
    random_state=R_seed,
    # Parallelism is handled at the search level (GridSearchCV n_jobs=-1
    # below). Requesting all cores here as well would oversubscribe the CPU,
    # so each individual fit stays single-threaded.
    n_jobs=1,
    tree_method='hist'
)

xgb_grid = GridSearchCV(
    xgb_model,
    xgb_params,
    cv=kfold,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    verbose=2
)

xgb_grid.fit(X_train_tree, y_train_tree)

# best_score_ is a negated RMSE, so flip the sign for display
print(f"\nBest parameters: {xgb_grid.best_params_}")
print(f"Best CV RMSE: {-xgb_grid.best_score_:.4f}")

# Evaluate the refitted best estimator and record its metrics
xgb_best = xgb_grid.best_estimator_
xgb_metrics = evaluate_model(
    xgb_best,
    X_train_tree, y_train_tree,
    X_test_tree, y_test_tree,
    'XGBoost'
)
all_results.append(xgb_metrics)

Training XGBoost Regressor...
Fitting 5 folds for each of 2916 candidates, totalling 14580 fits

Best parameters: {'colsample_bytree': 0.8, 'gamma': 0.1, 'learning_rate': 0.05, 'max_depth': 7, 'min_child_weight': 5, 'n_estimators': 500, 'subsample': 0.7}
Best CV RMSE: 2.2462

XGBoost - Performance Metrics
Training Set:
  RMSE: 0.7214
  MAE:  0.4704
  R²:   0.9913

Test Set:
  RMSE: 2.2524
  MAE:  1.4138
  R²:   0.9183


---
# Model 6: LightGBM

In [None]:
print("Training LightGBM Regressor...")

# Hyperparameter grid.
# NOTE(review): this is 17,496 combinations (87,480 fits with 5-fold CV), by
# far the largest search in the notebook. If it does not finish in reasonable
# time, trim the grid or switch to a randomized search.
lgb_params = {
    'n_estimators': [100, 300, 500],
    'max_depth': [5, 7, 10, -1],
    'learning_rate': [0.01, 0.05, 0.1],
    'num_leaves': [31, 63, 127],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9],
    'min_child_samples': [10, 20],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [0, 0.1, 0.5]
}

lgb_model = lgb.LGBMRegressor(
    random_state=R_seed,
    # LightGBM only applies row subsampling when subsample_freq > 0. With the
    # default of 0 the three 'subsample' values in the grid would all train
    # the same model and triple the search cost for nothing.
    subsample_freq=1,
    # Parallelism is handled at the search level (GridSearchCV n_jobs=-1
    # below). Requesting all cores here as well would oversubscribe the CPU.
    n_jobs=1,
    verbose=-1
)

lgb_grid = GridSearchCV(
    lgb_model,
    lgb_params,
    cv=kfold,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    verbose=2
)

lgb_grid.fit(X_train_tree, y_train_tree)

# best_score_ is a negated RMSE, so flip the sign for display
print(f"\nBest parameters: {lgb_grid.best_params_}")
print(f"Best CV RMSE: {-lgb_grid.best_score_:.4f}")

# Evaluate the refitted best estimator and record its metrics
lgb_best = lgb_grid.best_estimator_
lgb_metrics = evaluate_model(
    lgb_best,
    X_train_tree, y_train_tree,
    X_test_tree, y_test_tree,
    'LightGBM'
)
all_results.append(lgb_metrics)

Training LightGBM Regressor...
Fitting 5 folds for each of 17496 candidates, totalling 87480 fits


---
# Model 7: CatBoost

In [None]:
print("Training CatBoost Regressor...")

# Search space for the CatBoost grid search
catboost_params = dict(
    iterations=[100, 300, 500],
    depth=[4, 6, 8, 10],
    learning_rate=[0.01, 0.05, 0.1],
    l2_leaf_reg=[1, 3, 5],
    border_count=[64, 128],
    bagging_temperature=[0, 0.5, 1.0],
    random_strength=[1, 2],
)

# Base estimator: silent, multi-threaded, with the categorical column
# indices loaded alongside the CatBoost dataset
catboost_model = CatBoostRegressor(
    cat_features=cat_features_idx,
    thread_count=-1,
    verbose=0,
    random_state=R_seed,
)

# The search itself runs one candidate at a time (n_jobs=1) because the
# estimator already uses multiple threads via thread_count=-1
catboost_grid = GridSearchCV(
    estimator=catboost_model,
    param_grid=catboost_params,
    scoring='neg_root_mean_squared_error',
    cv=kfold,
    n_jobs=1,
    verbose=2,
)
catboost_grid.fit(X_train_catboost, y_train_catboost)

# best_score_ holds a negated RMSE, hence the sign flip for display
print(
    f"\nBest parameters: {catboost_grid.best_params_}",
    f"Best CV RMSE: {-catboost_grid.best_score_:.4f}",
    sep="\n",
)

# Score the refitted best estimator on train and test, then record it
catboost_best = catboost_grid.best_estimator_
catboost_metrics = evaluate_model(
    model=catboost_best,
    X_train=X_train_catboost,
    y_train=y_train_catboost,
    X_test=X_test_catboost,
    y_test=y_test_catboost,
    model_name='CatBoost',
)
all_results.append(catboost_metrics)

---
## Model Comparison

In [None]:
# Create comparison dataframe from the metric dicts collected in all_results
if not all_results:
    raise RuntimeError("No results to compare - run at least one model cell first.")

results_df = (
    pd.DataFrame(all_results)
    # Re-running a model cell appends a second entry for the same model;
    # keep only the most recent one so each model appears once.
    .drop_duplicates(subset='Model', keep='last')
    # Column names use underscores ('Test_RMSE', ...), matching the keys
    # produced by evaluate_model(); the space-separated names raised KeyError.
    .sort_values('Test_RMSE')
)

print("=" * 80)
print("COMPREHENSIVE MODEL COMPARISON")
print("=" * 80)
print("\nAll Models Ranked by Test RMSE:")
print(results_df.to_string(index=False))

# NOTE(review): ranking by test RMSE uses the hold-out set for model
# selection; prefer the cross-validated RMSE for choosing a model and keep
# the test score as the final, untouched estimate.
print("\n" + "=" * 80)
print("BEST MODEL:")
print("=" * 80)
best_model = results_df.iloc[0]  # first row = lowest test RMSE after sorting
print(f"\n🏆 {best_model['Model']}")
print(f"   Test RMSE: {best_model['Test_RMSE']:.4f}")
print(f"   Test MAE:  {best_model['Test_MAE']:.4f}")
print(f"   Test R²:   {best_model['Test_R2']:.4f}")
print(f"   Train RMSE: {best_model['Train_RMSE']:.4f}")
# Gap is test minus train so that a positive value means "worse on unseen data"
print(f"   Overfitting Gap (Test - Train RMSE): {best_model['Test_RMSE'] - best_model['Train_RMSE']:.4f}")

print("\n" + "=" * 80)

In [None]:
# Visualization of the comparison table (uses `plt` from the imports cell).
# Column names use underscores ('Test_RMSE', ...), matching the keys produced
# by evaluate_model(); the space-separated names raised KeyError.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Panels 1-3: one horizontal bar chart per test metric
test_panels = [
    (axes[0, 0], 'Test_RMSE', 'RMSE', 'Test RMSE by Model (Lower is Better)', 'steelblue'),
    (axes[0, 1], 'Test_MAE', 'MAE', 'Test MAE by Model (Lower is Better)', 'coral'),
    (axes[1, 0], 'Test_R2', 'R²', 'Test R² by Model (Higher is Better)', 'seagreen'),
]
for ax, column, xlabel, title, color in test_panels:
    ax.barh(results_df['Model'], results_df[column], color=color)
    ax.set_xlabel(xlabel)
    ax.set_title(title)
    ax.invert_yaxis()  # first row of results_df is drawn at the top

# Panel 4: Train vs Test RMSE side by side (Overfitting Check)
x = range(len(results_df))
width = 0.35
ax_gap = axes[1, 1]
ax_gap.bar([i - width/2 for i in x], results_df['Train_RMSE'], width, label='Train', color='lightblue')
ax_gap.bar([i + width/2 for i in x], results_df['Test_RMSE'], width, label='Test', color='darkblue')
ax_gap.set_xlabel('Model')
ax_gap.set_ylabel('RMSE')
ax_gap.set_title('Train vs Test RMSE (Overfitting Check)')
ax_gap.set_xticks(x)
ax_gap.set_xticklabels(results_df['Model'], rotation=45, ha='right')
ax_gap.legend()

plt.tight_layout()
plt.show()

---
## Save Best Models

In [None]:
# Persist the trained models and the comparison table
# (uses `os` and `pickle` from the imports cell).

# Create models directory if it doesn't exist
models_dir = '../models'
os.makedirs(models_dir, exist_ok=True)

# Fitted GridSearchCV objects, keyed by the file-name stem to save under.
# The LightGBM search is stored in `lgb_grid`; the previous `lgbm_grid`
# was never defined and raised NameError.
models_to_save = {
    'ridge': ridge_grid,
    'lasso': lasso_grid,
    'elasticnet': elasticnet_grid,
    'random_forest': rf_grid,
    'xgboost': xgb_grid,
    'lightgbm': lgb_grid,
    'catboost': catboost_grid
}

# Save only the refitted best estimator of each search
for model_name, model in models_to_save.items():
    filepath = os.path.join(models_dir, f'{model_name}_best.pkl')
    with open(filepath, 'wb') as f:
        pickle.dump(model.best_estimator_, f)
    print(f"✓ Saved {model_name} to {filepath}")

# Save the complete GridSearchCV objects (includes all hyperparameter search history)
for model_name, model in models_to_save.items():
    filepath = os.path.join(models_dir, f'{model_name}_gridsearch.pkl')
    with open(filepath, 'wb') as f:
        pickle.dump(model, f)
    print(f"✓ Saved {model_name} GridSearch to {filepath}")

# Save the comparison results
results_filepath = os.path.join(models_dir, 'model_comparison_results.csv')
results_df.to_csv(results_filepath, index=False)
print(f"\n✓ Saved comparison results to {results_filepath}")

print("\n" + "=" * 80)
print("All models saved successfully!")
print("=" * 80)