In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import pickle
import os

In [None]:
# Read the feature-engineered dataset from the processed-data folder
# (the path is relative to the notebook's working directory).
data_path = "../data/processed/feature_engineered_data.csv"
df = pd.read_csv(data_path)

# Keep the (rows, columns) tuple as the last expression so the cell displays it.
df.shape

In [None]:
# Separate the regression target from the predictor columns.
# 'price_per_sqft' and 'log_price' are removed together with the target itself
# (presumably because they are derived from price and would leak it -- confirm
# against the feature-engineering step).
target_column = 'price'
excluded_columns = [target_column, 'price_per_sqft', 'log_price']

y = df[target_column]
X = df.drop(columns=excluded_columns)

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the shape of each feature partition.
for split_name, split_frame in (("Training", X_train), ("Testing", X_test)):
    print(f"{split_name} set: {split_frame.shape}")

In [None]:
# Hyperparameters of the forest, gathered in one place so they are easy to tune.
rf_params = {
    'n_estimators': 100,
    'max_depth': 15,
    'min_samples_split': 5,
    'random_state': 42,  # fixed seed for reproducible trees
    'n_jobs': -1,
}
rf_model = RandomForestRegressor(**rf_params)

# Train on the training partition; the call is left as the last expression so
# the notebook displays its return value.
rf_model.fit(X_train, y_train)

In [None]:
# Predict prices for the held-out rows; the metrics and plots in the following
# cells are all computed from these predictions.
y_pred = rf_model.predict(X=X_test)

In [None]:
# Evaluate the hold-out predictions with four complementary metrics.
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))  # square root of MSE -> same units as the target

# MAPE divides by the actual value, so a single row whose actual price is
# exactly 0 would turn the whole mean into inf/NaN. Compute it over the
# non-zero targets only; when no target is zero this is identical to the
# plain mean(|actual - predicted| / actual) * 100.
y_true_values = np.asarray(y_test, dtype=float)
y_pred_values = np.asarray(y_pred, dtype=float)
nonzero_mask = y_true_values != 0
if nonzero_mask.any():
    pct_errors = np.abs(
        (y_true_values[nonzero_mask] - y_pred_values[nonzero_mask]) / y_true_values[nonzero_mask]
    )
    mape = np.mean(pct_errors) * 100
else:
    # No usable denominators (empty test set or all-zero targets): MAPE is undefined.
    mape = float('nan')

print(f"R2 Score: {r2:.4f}")
print(f"MAE: {mae:.4f} Lakhs")
print(f"RMSE: {rmse:.4f} Lakhs")
print(f"MAPE: {mape:.2f}%")

In [None]:
# Build a 2x2 evaluation dashboard for the fitted model and save it as a PNG.
# Make sure the output folder exists before saving.
os.makedirs('../docs/images', exist_ok=True)

fig, axes = plt.subplots(2, 2, figsize=(14, 12))

# Top-left: actual vs predicted; the dashed red diagonal marks perfect predictions.
axes[0, 0].scatter(y_test, y_pred, alpha=0.5, color='#8e44ad')
axes[0, 0].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
axes[0, 0].set_xlabel('Actual Price (Lakhs)')
axes[0, 0].set_ylabel('Predicted Price (Lakhs)')
axes[0, 0].set_title('Actual vs Predicted Prices')

# Top-right: histogram of residuals with a reference line at zero error.
residuals = y_test - y_pred
axes[0, 1].hist(residuals, bins=50, color='#8e44ad', edgecolor='black', alpha=0.7)
axes[0, 1].axvline(x=0, color='red', linestyle='--', lw=2)
axes[0, 1].set_xlabel('Residual (Actual - Predicted)')
axes[0, 1].set_ylabel('Frequency')
axes[0, 1].set_title('Residual Distribution')

# Bottom-left: bar chart of the headline metrics computed in the previous cell.
# The first label previously contained mojibake ("RÂ²", a UTF-8 superscript two
# decoded as Latin-1); it is written with the proper character here.
metrics = ['R² Score', 'MAE (Lakhs)', 'RMSE (Lakhs)']
values = [r2, mae, rmse]
colors = ['#2ecc71', '#3498db', '#e74c3c']
bars = axes[1, 0].bar(metrics, values, color=colors, edgecolor='black')
axes[1, 0].set_ylabel('Value')
axes[1, 0].set_title('Model Performance Metrics')
for bar, val in zip(bars, values):
    # Centre each value label horizontally on its bar, 0.5 data units above the top.
    axes[1, 0].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.5, f'{val:.2f}', ha='center', fontsize=11, fontweight='bold')

# Bottom-right: residuals against predictions, with a reference line at zero.
axes[1, 1].scatter(y_pred, residuals, alpha=0.5, color='#8e44ad')
axes[1, 1].axhline(y=0, color='red', linestyle='--', lw=2)
axes[1, 1].set_xlabel('Predicted Price (Lakhs)')
axes[1, 1].set_ylabel('Residual')
axes[1, 1].set_title('Residual vs Predicted')

plt.suptitle('Random Forest Regressor Model Evaluation', fontsize=16, fontweight='bold')
plt.tight_layout()
# Save before show() so the written file contains the rendered figure.
plt.savefig('../docs/images/random_forest_results.png', dpi=150, bbox_inches='tight')
plt.show()
print("Graph saved to docs/images/random_forest_results.png")

In [None]:
# Pair every predictor column with the importance the forest assigned to it,
# then keep the 15 most important ones in descending order.
feature_importance = (
    pd.DataFrame({'feature': X.columns, 'importance': rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
    .head(15)
)

# Horizontal bar chart, one bar per retained feature.
bar_positions = range(len(feature_importance))
importance_fig, importance_ax = plt.subplots(figsize=(10, 8))
importance_ax.barh(bar_positions, feature_importance['importance'].values, color='#8e44ad')
importance_ax.set_yticks(bar_positions)
importance_ax.set_yticklabels(feature_importance['feature'].values)
importance_ax.set_xlabel('Importance')
importance_ax.set_title('Top 15 Feature Importance - Random Forest')
# Flip the y-axis so the most important feature is drawn at the top.
importance_ax.invert_yaxis()
importance_fig.tight_layout()
importance_fig.savefig('../docs/images/random_forest_feature_importance.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Persist the fitted forest to disk with pickle so it can be reloaded later.
# NOTE: only unpickle this file from a trusted location -- pickle.load can
# execute arbitrary code.
os.makedirs('../models', exist_ok=True)

model_path = '../models/random_forest.pkl'
with open(model_path, 'wb') as model_file:
    pickle.dump(rf_model, model_file)

print("Model saved to models/random_forest.pkl")