In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor

import pickle

import warnings
warnings.filterwarnings('ignore')

# Load the preprocessed dataset
df = pd.read_csv("preprocessed_data.csv")

# Define features (X) and target (y)
AREA_COL = 'Area (sqft)'
X = df[[AREA_COL, 'BHK', 'Bathrooms', 'Construction Status', 'City', 'Location_encoded']]
y = df['Price']

# Split the data BEFORE any scaling, so that the held-out rows cannot
# influence the mean/std learned by the scaler (avoids train/test leakage).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the area column: fit on the training rows only, then apply the same
# transform to both partitions. `assign` returns new frames (column order is
# preserved), so `df` and `X` keep the raw, unscaled area values.
# NOTE(review): if a later cell expects `df['Area (sqft)']` to be scaled,
# apply `area_scaler.transform` there explicitly.
area_scaler = StandardScaler()
X_train = X_train.assign(
    **{AREA_COL: area_scaler.fit_transform(X_train[[AREA_COL]]).ravel()}
)
X_test = X_test.assign(
    **{AREA_COL: area_scaler.transform(X_test[[AREA_COL]]).ravel()}
)

# Save the (train-fitted) scaler so the same transform can be reused later
with open('area_scaler.pkl', 'wb') as scaler_file:
    pickle.dump(area_scaler, scaler_file)

# Single seed for every stochastic component configured in this section, so
# that re-running the cell reproduces the same search result.
SEED = 42

# Custom scorers
# MAE is a loss, hence greater_is_better=False.
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)
# Defined for convenience; not passed to the search below.
r2_scorer = make_scorer(r2_score)

# Define the model
# Seeded: without random_state the forest's bootstrap/feature sampling differs
# on every run, so the selected hyperparameters and metrics are not repeatable.
model = RandomForestRegressor(random_state=SEED)

# Define the parameter space for RandomizedSearchCV
# (5 * 3 * 3 * 5 * 2 = 450 combinations; n_iter below samples 100 of them)
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_depth': [None, 10, 20, 30, 40],
    'bootstrap': [True, False]
}

# Set up RandomizedSearchCV: 100 sampled candidates x 3 folds = 300 fits,
# ranked by the MAE scorer and run on all available cores.
random_search = RandomizedSearchCV(estimator=model,
                                   param_distributions=param_grid,
                                   n_iter=100,
                                   cv=3,
                                   verbose=2,
                                   random_state=SEED,
                                   n_jobs=-1,
                                   scoring=mae_scorer)

# Run the hyperparameter search on the training split and keep the winning
# estimator (`fit` returns the search object itself, so the call can be chained).
best_model = random_search.fit(X_train, y_train).best_estimator_

# Persist the selected model to disk
with open('random_forest_best_model.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)

# Score the selected model on the held-out split
y_pred = best_model.predict(X_test)
mae, r2 = mean_absolute_error(y_test, y_pred), r2_score(y_test, y_pred)

# Report the chosen configuration and the hold-out metrics
print(f"Best Hyperparameters: {random_search.best_params_}")
print(f"Mean Absolute Error: {mae}")
print(f"R-squared: {r2}")

Fitting 3 folds for each of 100 candidates, totalling 300 fits
Best Hyperparameters: {'n_estimators': 300, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 30, 'bootstrap': True}
Mean Absolute Error: 2337224.558815011
R-squared: 0.8780246407705082
