In [1]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

In [2]:
# Load the pre-split train/test features and targets from the processed-data
# directory (path is relative to the notebook's working directory).
DATA_DIR = '../data/processed'

X_train = pd.read_csv(f'{DATA_DIR}/X_train.csv')
X_test = pd.read_csv(f'{DATA_DIR}/X_test.csv')

# Assumes each y_*.csv holds a single target column — TODO confirm.
# squeeze("columns") turns that one-column frame into a Series, and unlike a
# bare squeeze() it never collapses a single-row file all the way to a scalar.
y_train = pd.read_csv(f'{DATA_DIR}/y_train.csv').squeeze("columns")
y_test = pd.read_csv(f'{DATA_DIR}/y_test.csv').squeeze("columns")


In [3]:
# Baseline: an untuned linear regression. Its R² on the held-out test split is
# the reference value for the models fitted in the following cells.
lr_model = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)
lr_r2 = r2_score(y_test, y_pred_lr)
print(f"Baseline R²: {lr_r2:.4f}")

Baseline R²: 0.6800


In [4]:

# Cell 4: Random Forest
# Step 1 fits a forest with default hyperparameters as a reference point.
# Step 2 tunes it with a 5-fold cross-validated grid search on the training
# split and evaluates the winning configuration on the test split.
# RandomForestRegressor, GridSearchCV and the metric helpers come from the
# notebook's import cell, so they are not re-imported here.

rfr_model = RandomForestRegressor(random_state=35)
rfr_model.fit(X_train, y_train)

y_pred = rfr_model.predict(X_test)

# Compute R² from the predictions already in hand; rfr_model.score() would
# call predict() on the test set a second time to arrive at the same number.
r2_rf = r2_score(y_test, y_pred)
mse_rf = mean_squared_error(y_test, y_pred)
rmse_rf = mse_rf ** 0.5

print("Random Forest R² Score:", r2_rf)
print(f'Random Forest RMSE: {rmse_rf}')

# Optional: Hyperparameter tuning with GridSearchCV
# 2 * 3 * 3 * 2 = 36 candidate settings, each fitted on 5 folds (180 fits).
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'max_features': ['sqrt', None]
}

grid_search = GridSearchCV(
    estimator=rfr_model,
    param_grid=param_grid,
    cv=5,
    # scikit-learn scorers are "higher is better", hence the negated MSE.
    scoring='neg_mean_squared_error',
    verbose=2,
    n_jobs=-1
)

# Model selection only ever sees the training split.
grid_search.fit(X_train, y_train)

# Winning configuration, evaluated once on the held-out test split.
best_rf = grid_search.best_estimator_
y_pred_best_rf = best_rf.predict(X_test)

rmse_best = np.sqrt(mean_squared_error(y_test, y_pred_best_rf))
r2_best = r2_score(y_test, y_pred_best_rf)

print("Best RF RMSE:", rmse_best)
print("Best RF R² Score:", r2_best)
print("Best Params:", grid_search.best_params_)

Random Forest R² Score: 0.815543881185804
Random Forest RMSE: 49881.915814310385
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best RF RMSE: 49861.39404730444
Best RF R² Score: 0.8156956230252463
Best Params: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 200}


In [5]:
# Cell 5: XGBoost
# Step 1 fits a gradient-boosted model with hand-picked hyperparameters.
# Step 2 tunes it with a 5-fold cross-validated grid search on the training
# split and evaluates the winning configuration on the test split.
# XGBRegressor, GridSearchCV and the metric helpers come from the notebook's
# import cell, so they are not re-imported here.

xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=35)
xgb_model.fit(X_train, y_train)

y_pred_xgb = xgb_model.predict(X_test)

rmse_xgb = np.sqrt(mean_squared_error(y_test, y_pred_xgb))
r2_xgb = r2_score(y_test, y_pred_xgb)

print("XGBoost R² Score:", r2_xgb)
print("XGBoost RMSE:", rmse_xgb)

# Optional: Hyperparameter tuning with GridSearchCV
# 2 * 4 * 3 * 2 * 2 = 96 candidate settings, each fitted on 5 folds (480 fits).
param_grid_xgb = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7, 10],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# NOTE(review): the estimator's own n_jobs is left at its default while the
# search below uses n_jobs=-1; if XGBoost also uses every core per fit this may
# oversubscribe the CPU. Consider pinning one of the two — confirm by timing.
grid_search_xgb = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid_xgb,
    cv=5,
    # scikit-learn scorers are "higher is better", hence the negated MSE.
    scoring='neg_mean_squared_error',
    verbose=2,
    n_jobs=-1
)

# Model selection only ever sees the training split.
grid_search_xgb.fit(X_train, y_train)

# Winning configuration, evaluated once on the held-out test split.
best_xgb = grid_search_xgb.best_estimator_
y_pred_best_xgb = best_xgb.predict(X_test)

rmse_best_xgb = np.sqrt(mean_squared_error(y_test, y_pred_best_xgb))
r2_best_xgb = r2_score(y_test, y_pred_best_xgb)

print("Best XGBoost RMSE:", rmse_best_xgb)
print("Best XGBoost R² Score:", r2_best_xgb)
print("Best XGBoost Params:", grid_search_xgb.best_params_)

XGBoost R² Score: 0.8204869317648358
XGBoost RMSE: 49209.00988890238
Fitting 5 folds for each of 96 candidates, totalling 480 fits
Best XGBoost RMSE: 45371.90450208818
Best XGBoost R² Score: 0.8473907581586055
Best XGBoost Params: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200, 'subsample': 1.0}


In [6]:
# Cell 6: Save Best Model
# Persist the grid-searched XGBoost estimator (best_xgb) with joblib.
# `os` is imported in the notebook's import cell.
MODEL_PATH = '../models/xgboost_model.pkl'

# Create the target directory on first run; exist_ok keeps re-runs harmless.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
joblib.dump(best_xgb, MODEL_PATH)
print("✅ Model saved!")

✅ Model saved!
