In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pickle

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
import xgboost as xgb
from catboost import CatBoostRegressor
import lightgbm as lgb


from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, mean_absolute_error, r2_score

In [2]:
df = pd.read_csv("preprocessed_data.csv")

# Choose the train/test rows BEFORE scaling so the scaler never sees test data.
# Splitting row positions with the same test_size/random_state picks exactly the
# rows that train_test_split(X, y, test_size=0.2, random_state=42) would pick.
train_rows, test_rows = train_test_split(np.arange(len(df)), test_size=0.2, random_state=42)

# Standardise 'Area (sqft)' using statistics from the training rows only
# (fitting on the full frame would leak test-set mean/variance into training).
area_scaler = StandardScaler()
area_scaler.fit(df.iloc[train_rows][['Area (sqft)']])
df['Area (sqft)'] = area_scaler.transform(df[['Area (sqft)']]).ravel()

# Save the fitted scaler to a .pkl file so the same transform can be reapplied later
with open('area_scaler.pkl', 'wb') as scaler_file:
    pickle.dump(area_scaler, scaler_file)

# Define the features (X) and target (y)
X = df[['Area (sqft)', 'BHK', 'Bathrooms', 'Construction Status', 'City', 'Location_encoded']]
y = df['Price']

# Materialise the training and testing sets from the rows chosen above
X_train, X_test = X.iloc[train_rows], X.iloc[test_rows]
y_train, y_test = y.iloc[train_rows], y.iloc[test_rows]



In [3]:
# Scorer objects used by the hyperparameter searches.
# MAE is an error metric, so greater_is_better=False makes the scorer report its negative.
r2_scorer = make_scorer(r2_score)
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

In [4]:
# Collects one entry per model: chosen hyperparameters plus test-set MAE and R2.
results = {}

# Baseline: plain linear regression, fitted directly (nothing to tune).
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Score the baseline on the held-out split.
y_pred_lr = lr_model.predict(X_test)
r2_lr = r2_score(y_test, y_pred_lr)
mae_lr = mean_absolute_error(y_test, y_pred_lr)

results.update({
    'Linear Regression': {'Best Params': None, 'MAE': mae_lr, 'R2': r2_lr},
})

In [5]:
# Hyperparameter search spaces for the RandomizedSearchCV runs in this and later cells.
dt_param_dist = dict(
    max_depth=[3, 5, 10, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

rf_param_grid = dict(
    n_estimators=[200, 500, 800, 1000],
    max_depth=[10, 20, 30, 40, 50],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

gb_param_grid = dict(
    n_estimators=[200, 500, 800, 1000],
    max_depth=[3, 5, 10, 15],
    learning_rate=[0.1, 0.05, 0.01, 0.005],
    subsample=[0.8, 1.0],
)

svr_param_dist = dict(
    C=[0.1, 1, 10],
    kernel=['linear', 'rbf'],
    gamma=['scale', 'auto'],
)

knn_param_dist = dict(
    n_neighbors=[3, 5, 10],
    weights=['uniform', 'distance'],
)

# NOTE(review): no cell visible in this notebook section consumes mlp_param_dist.
mlp_param_dist = dict(
    hidden_layer_sizes=[(50, 50), (100, 50)],
    activation=['relu', 'tanh'],
    solver=['adam', 'sgd'],
    learning_rate=['constant', 'adaptive'],
)

# Decision tree: randomized search (default n_iter) with 5-fold CV, scored by MAE.
dt_model = DecisionTreeRegressor(random_state=42)
dt_random_search = RandomizedSearchCV(
    estimator=dt_model,
    param_distributions=dt_param_dist,
    scoring=mae_scorer,
    cv=5,
    random_state=42,
    n_jobs=-1,
)
dt_random_search.fit(X_train, y_train)

# Held-out evaluation with the refitted best estimator.
y_pred_dt = dt_random_search.predict(X_test)
r2_dt = r2_score(y_test, y_pred_dt)
mae_dt = mean_absolute_error(y_test, y_pred_dt)

results.update({
    'Decision Tree': {
        'Best Params': dt_random_search.best_params_,
        'MAE': mae_dt,
        'R2': r2_dt,
    },
})

In [None]:
# Random Forest: randomized search over rf_param_grid (50 candidates, 5-fold CV).
rf_model = RandomForestRegressor(random_state=42)
rf_random_search = RandomizedSearchCV(
    estimator=rf_model,  # reuse the estimator above instead of building a duplicate
    param_distributions=rf_param_grid,
    n_iter=50,  # Increase iterations for finer tuning
    cv=5,
    scoring='neg_mean_absolute_error',  # or 'r2' for R-squared
    random_state=42,
    n_jobs=-1
)
rf_random_search.fit(X_train, y_train)

# Predict on the held-out split with the refitted best estimator and evaluate
y_pred_rf = rf_random_search.predict(X_test)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

results['Random Forest'] = {
    'Best Params': rf_random_search.best_params_,
    'MAE': mae_rf,
    'R2': r2_rf
}

In [None]:
# Gradient Boosting: randomized search over gb_param_grid (50 candidates, 5-fold CV).
gb_model = GradientBoostingRegressor(random_state=42)
gb_random_search = RandomizedSearchCV(
    estimator=gb_model,  # reuse the estimator above instead of building a duplicate
    param_distributions=gb_param_grid,
    n_iter=50,
    cv=5,
    scoring='neg_mean_absolute_error',  # or 'r2' for R-squared
    random_state=42,
    n_jobs=-1
)
gb_random_search.fit(X_train, y_train)

# Predict on the held-out split with the refitted best estimator and evaluate
y_pred_gb = gb_random_search.predict(X_test)
mae_gb = mean_absolute_error(y_test, y_pred_gb)
r2_gb = r2_score(y_test, y_pred_gb)

results['Gradient Boosting'] = {
    'Best Params': gb_random_search.best_params_,
    'MAE': mae_gb,
    'R2': r2_gb
}

In [None]:
# SVR: randomized search over svr_param_dist (default n_iter) with 5-fold CV, scored by MAE.
svr_model = SVR()
svr_random_search = RandomizedSearchCV(
    estimator=svr_model,
    param_distributions=svr_param_dist,
    scoring=mae_scorer,
    cv=5,
    random_state=42,
    n_jobs=-1,
)
svr_random_search.fit(X_train, y_train)

# Held-out evaluation with the refitted best estimator.
y_pred_svr = svr_random_search.predict(X_test)
r2_svr = r2_score(y_test, y_pred_svr)
mae_svr = mean_absolute_error(y_test, y_pred_svr)

results.update({
    'SVR': {
        'Best Params': svr_random_search.best_params_,
        'MAE': mae_svr,
        'R2': r2_svr,
    },
})

In [None]:
# KNN
knn_model = KNeighborsRegressor()

# The KNN search space is smaller than RandomizedSearchCV's default n_iter=10,
# which makes scikit-learn warn and silently truncate. Ask for exactly as many
# candidates as the space contains so every combination is tried, warning-free.
knn_grid_size = 1
for candidate_values in knn_param_dist.values():
    knn_grid_size *= len(candidate_values)

knn_random_search = RandomizedSearchCV(
    estimator=knn_model,
    param_distributions=knn_param_dist,
    n_iter=knn_grid_size,
    cv=5,
    scoring=mae_scorer,
    n_jobs=-1,
    random_state=42
)
knn_random_search.fit(X_train, y_train)

# Predict on the held-out split with the refitted best estimator and evaluate
y_pred_knn = knn_random_search.predict(X_test)
mae_knn = mean_absolute_error(y_test, y_pred_knn)
r2_knn = r2_score(y_test, y_pred_knn)

results['KNN'] = {
    'Best Params': knn_random_search.best_params_,
    'MAE': mae_knn,
    'R2': r2_knn
}

In [None]:
# Candidate values sampled by the XGBoost randomized search.
xgb_param_grid = dict(
    n_estimators=[200, 500, 800, 1000],
    max_depth=[3, 5, 10, 15],
    learning_rate=[0.1, 0.05, 0.01, 0.005],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
)

# XGBoost: 50 sampled candidates, 5-fold CV, scored by negative MAE.
xgb_model = xgb.XGBRegressor(random_state=42)
xgb_random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=xgb_param_grid,
    scoring='neg_mean_absolute_error',
    n_iter=50,
    cv=5,
    n_jobs=-1,
    random_state=42,
)
xgb_random_search.fit(X_train, y_train)

# Held-out evaluation with the refitted best estimator.
y_pred_xgb = xgb_random_search.predict(X_test)
r2_xgb = r2_score(y_test, y_pred_xgb)
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)

results.update({
    'XGBoost': {
        'Best Params': xgb_random_search.best_params_,
        'MAE': mae_xgb,
        'R2': r2_xgb,
    },
})

In [None]:
# Candidate values sampled by the CatBoost randomized search.
cat_param_grid = dict(
    iterations=[200, 500, 800, 1000],
    depth=[4, 6, 10, 12],
    learning_rate=[0.1, 0.05, 0.01, 0.005],
    l2_leaf_reg=[1, 3, 5, 7],
)

# CatBoost (verbose=0 passed to the estimator): 50 sampled candidates, 5-fold CV,
# scored by negative MAE.
cat_model = CatBoostRegressor(random_state=42, verbose=0)
cat_random_search = RandomizedSearchCV(
    estimator=cat_model,
    param_distributions=cat_param_grid,
    scoring='neg_mean_absolute_error',
    n_iter=50,
    cv=5,
    n_jobs=-1,
    random_state=42,
)
cat_random_search.fit(X_train, y_train)

# Held-out evaluation with the refitted best estimator.
y_pred_cat = cat_random_search.predict(X_test)
r2_cat = r2_score(y_test, y_pred_cat)
mae_cat = mean_absolute_error(y_test, y_pred_cat)

results.update({
    'CatBoost': {
        'Best Params': cat_random_search.best_params_,
        'MAE': mae_cat,
        'R2': r2_cat,
    },
})

In [None]:
# Define hyperparameter search space for LightGBM
lgb_param_grid = {
    'n_estimators': [200, 500, 800, 1000],
    'num_leaves': [31, 50, 100],
    'learning_rate': [0.1, 0.05, 0.01, 0.005],
    'max_depth': [-1, 10, 20],
    'subsample': [0.8, 1.0]
}

# LightGBM Regressor
# LightGBM only applies `subsample` (bagging_fraction) when `subsample_freq`
# (bagging_freq) is > 0; with the default of 0 the 'subsample' values in the
# search space would have no effect. Re-bag every iteration so they matter.
lgb_model = lgb.LGBMRegressor(random_state=42, subsample_freq=1)
lgb_random_search = RandomizedSearchCV(
    estimator=lgb_model,
    param_distributions=lgb_param_grid,
    n_iter=50,
    cv=5,
    scoring='neg_mean_absolute_error',
    random_state=42,
    n_jobs=-1
)
lgb_random_search.fit(X_train, y_train)

# Predict on the held-out split with the refitted best estimator and evaluate
y_pred_lgb = lgb_random_search.predict(X_test)
mae_lgb = mean_absolute_error(y_test, y_pred_lgb)
r2_lgb = r2_score(y_test, y_pred_lgb)

results['LightGBM'] = {
    'Best Params': lgb_random_search.best_params_,
    'MAE': mae_lgb,
    'R2': r2_lgb
}

In [None]:
# Build the comparison table: one row per model, one column per recorded field.
results_df = pd.DataFrame(results).transpose()
print(results_df)

# Optional: persist the table for further analysis.
# results_df.to_csv('model_results.csv', index=True)

In [None]:
# One bar chart per metric: (results_df column, bar colour, title, y-axis label).
metric_charts = [
    ('MAE', 'skyblue', 'Mean Absolute Error (MAE) for Different Models', 'MAE'),
    ('R2', 'orange', 'R² Score for Different Models', 'R² Score'),
]

for metric_column, bar_color, chart_title, y_label in metric_charts:
    plt.figure(figsize=(10, 6))
    results_df[metric_column].plot(kind='bar', color=bar_color)
    plt.title(chart_title)
    plt.ylabel(y_label)
    plt.xticks(rotation=45)
    plt.show()