In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score


# Load data
df = pd.read_csv("../data/cleaned_data_with_transaction_year.csv")


X = df.drop("price_per_unit_area", axis=1)
y = df["price_per_unit_area"]

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)

# Predict
y_pred = rf_model.predict(X_test)

# Evaluate
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("Random Forest RMSE:", rmse)
print("Random Forest R² Score:", r2)

# Define hyperparameter grid
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}

grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

# Best estimator and score
print("Best parameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_

# Evaluate best model
y_pred = best_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Tuned Random Forest RMSE:", rmse)
print("Tuned Random Forest R² Score:", r2)



Random Forest RMSE: 5.687590539891411
Random Forest R² Score: 0.8071725356880382
Best parameters: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 200}
Tuned Random Forest RMSE: 5.580686772721471
Tuned Random Forest R² Score: 0.8071725356880382
Best parameters: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 200}
Tuned Random Forest RMSE: 5.580686772721471
Tuned Random Forest R² Score: 0.8071725356880382
