In [1]:
# 🌲 Random Forest – California Housing

# 📁 1. Data Loading
from pathlib import Path

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV

# Read the preprocessed training set, then separate the regression target
# ("median_house_value") from the remaining feature columns.
df = pd.read_csv("../data/train/housing_train_processed.csv")
y = df["median_house_value"]
X = df.drop(columns=["median_house_value"])

# 🤖 2. Model Fitting
# Baseline forest: 100 trees, fixed seed so repeated runs give the same model.
# `fit` returns the estimator itself, so construction and training are chained.
# NOTE(review): scikit-learn's cross-validation helpers refit clones of the
# estimator per fold, so this full-data fit should not influence the CV
# scores below — confirm whether a fitted `rf_reg` is needed by later cells.
rf_reg = RandomForestRegressor(random_state=42, n_estimators=100).fit(X, y)

# 🧪 3. Cross-Validation
# 5-fold CV of the baseline forest. scikit-learn scorers follow a
# "higher is better" convention, so the MSE comes back negated.
scores = cross_val_score(
    estimator=rf_reg,
    X=X,
    y=y,
    cv=5,
    scoring="neg_mean_squared_error",
)

# Flip the sign to recover per-fold MSE, then take the square root so the
# error is expressed in the same units as the target.
rmse_scores = (-scores) ** 0.5

print("Cross-validation RMSE scores:", rmse_scores)
print("Average RMSE:", rmse_scores.mean())

# 🔍 4. Hyperparameter Tuning
# Exhaustive search over 2 x 3 x 2 = 12 forest configurations, each scored
# with 3-fold cross-validated negative MSE.
param_grid = dict(
    n_estimators=[50, 100],
    max_depth=[10, 20, None],
    max_features=["sqrt", "log2"],
)
grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=3,
    n_jobs=-1,
).fit(X, y)

# Keep the winning estimator for persistence and report its settings.
best_rf = grid_search.best_estimator_
print("Best parameters:", grid_search.best_params_)

# 💾 5. Model Saving
# Persist the tuned estimator with joblib. joblib.dump does not create
# missing parent directories, so make sure ../models exists first; otherwise
# a fresh checkout fails with FileNotFoundError only after the whole grid
# search has already run.
model_path = Path("../models/random_forest_model.pkl")
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(best_rf, model_path)
print("✅ Model saved to models/random_forest_model.pkl")

Cross-validation RMSE scores: [49464.37245837 48957.41055168 49273.50871464 49645.46563933
 49741.01435692]
Average RMSE: 49416.35434418591
Best parameters: {'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 100}
✅ Model saved to models/random_forest_model.pkl
