In [8]:
import os
import joblib
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error
from xgboost import XGBRegressor

In [None]:
# Load the selected-feature train/test inputs and the training target from
# ../data/processed (one pickle per object, read in the order listed).
# NOTE(review): joblib.load unpickles arbitrary objects -- only point this at
# files you produced yourself.
X_train_selected, X_test_selected, y_train = map(
    joblib.load,
    (
        '../data/processed/X_train_selected.pkl',
        '../data/processed/X_test_selected.pkl',
        '../data/processed/y_train.pkl',
    ),
)

# Ensure ../models exists; the cells below write their artifacts there.
os.makedirs('../models', exist_ok=True)

In [None]:
# Baseline candidates, keyed by display name. Each one gets a fixed seed so
# the comparison can be repeated.
models = dict(
    RandomForest=RandomForestRegressor(
        n_estimators=100, random_state=42, n_jobs=-1
    ),
    GradientBoosting=GradientBoostingRegressor(
        n_estimators=100, random_state=42
    ),
    Ridge=Ridge(alpha=1.0, random_state=42),
    XGBoost=XGBRegressor(
        n_estimators=100, learning_rate=0.1, max_depth=3,
        random_state=42, n_jobs=-1
    ),
)

# 5-fold cross-validated R^2 for every candidate; the per-fold score array is
# kept under the model's name and summarised on stdout as we go.
baseline_results = {}
for model_name, estimator in models.items():
    fold_r2 = cross_val_score(
        estimator, X_train_selected, y_train, scoring='r2', cv=5
    )
    baseline_results[model_name] = fold_r2
    print(f"{model_name} R^2 scores: {fold_r2}")
    print(f"{model_name} Mean R^2: {fold_r2.mean():.4f}\n")

# Persist the scores: one column per model, one row per fold.
baseline_table = pd.DataFrame(baseline_results)
baseline_table.to_csv('../models/baseline_results.csv', index=False)

In [None]:
# Search space for GradientBoostingRegressor: two values for each of six
# hyper-parameters, i.e. 64 candidates.
gb_params = dict(
    n_estimators=[400, 500],
    learning_rate=[0.03, 0.05],
    max_depth=[2, 3],
    min_samples_split=[2, 4],
    min_samples_leaf=[2, 4],
    subsample=[0.6, 0.8],
)

# Exhaustive search scored by R^2 with 3-fold CV; candidates are evaluated in
# parallel on all available cores.
gb_base = GradientBoostingRegressor(random_state=42)
gb_grid = GridSearchCV(
    gb_base,
    gb_params,
    cv=3,
    scoring='r2',
    verbose=2,
    n_jobs=-1,
)

print("--- Running GridSearchCV for GradientBoosting ---")
gb_grid.fit(X_train_selected, y_train)

print("Best R^2 (cv):", gb_grid.best_score_)
print("Best params:", gb_grid.best_params_)

# Persist, in order: the whole fitted search object, the refitted best
# estimator on its own, and the per-candidate CV table as CSV.
for artifact, target in (
    (gb_grid, '../models/gb_grid_full.pkl'),
    (gb_grid.best_estimator_, '../models/best_GradientBoosting.pkl'),
):
    joblib.dump(artifact, target)

gb_cv_table = pd.DataFrame(gb_grid.cv_results_)
gb_cv_table.to_csv('../models/gb_cv_results.csv', index=False)

In [None]:
# Search space for XGBRegressor: two values for each of eight
# hyper-parameters -> 256 candidates, each scored with 3-fold CV.
xgb_params = {
    'n_estimators': [150, 200],
    'learning_rate': [0.03, 0.05],
    'max_depth': [3, 4],
    'subsample': [0.7, 0.8],
    'colsample_bytree': [0.5, 0.6],
    'reg_alpha': [0, 0.1],
    'reg_lambda': [1, 5],
    'gamma': [0, 0.1]
}

# Parallelise at ONE level only. GridSearchCV already fans the candidate/fold
# fits out over every core (n_jobs=-1); if each XGBoost fit also asks for all
# cores, every worker spawns a full set of threads and they fight for the CPU
# (the XGBoost docs warn about exactly this when combining n_jobs with grid
# search). Each individual fit is therefore kept single-threaded here.
xgb_grid = GridSearchCV(
    estimator=XGBRegressor(random_state=42, n_jobs=1, verbosity=0),
    param_grid=xgb_params,
    scoring='r2',
    cv=3,
    n_jobs=-1,
    verbose=2
)

print("\n--- Running GridSearchCV for XGBoost ---")
xgb_grid.fit(X_train_selected, y_train)

print("Best R^2 (cv):", xgb_grid.best_score_)
print("Best params:", xgb_grid.best_params_)

# The single-thread setting only matters during the search. Put the refitted
# winner back on "use all cores" so the saved model keeps the same n_jobs
# configuration it had before this change.
xgb_grid.best_estimator_.set_params(n_jobs=-1)

# Save XGBoost model and CV results: the full search object, the best
# estimator on its own, and the per-candidate CV table as CSV.
joblib.dump(xgb_grid, '../models/xgb_grid_full.pkl')
joblib.dump(xgb_grid.best_estimator_, '../models/best_XGBoost.pkl')
pd.DataFrame(xgb_grid.cv_results_).to_csv('../models/xgb_cv_results.csv', index=False)