In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
import joblib

# Load preprocessed data
data = pd.read_csv('preprocessed_clean_v4.csv')

# Features and target
features = ['Rooms', 'Distance', 'Bedroom2', 'Bathroom', 'Car', 'Bedroom_Discrepancy']
X = data[features]
y = data['Price']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize scaler (for deployment)
scaler = StandardScaler()
X_train_scaled = X_train  # Already scaled
X_test_scaled = X_test    # Already scaled

# Function to evaluate and print metrics
def evaluate_model(y_true, y_pred, model_name):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    print(f"\n{model_name} Performance:")
    print(f"MAE: ${mae:,.2f}")
    print(f"RMSE: ${rmse:,.2f}")
    print(f"R²: {r2:.4f}")
    return mae, rmse, r2

# Random Forest: Expanded hyperparameter tuning
rf = RandomForestRegressor(random_state=42)
param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'max_features': ['auto', 'sqrt', 0.5]
}
random_search_rf = RandomizedSearchCV(rf, param_dist, n_iter=20, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1, random_state=42)
random_search_rf.fit(X_train_scaled, y_train)

# Best Random Forest model
best_rf = random_search_rf.best_estimator_
y_pred_rf = best_rf.predict(X_test_scaled)
rf_metrics = evaluate_model(y_test, y_pred_rf, "Random Forest")
print(f"Best Random Forest Parameters: {random_search_rf.best_params_}")

# Random Forest with log-transformed Price
y_train_log = np.log1p(y_train)
random_search_rf.fit(X_train_scaled, y_train_log)
best_rf_log = random_search_rf.best_estimator_
y_pred_rf_log = np.expm1(best_rf_log.predict(X_test_scaled))
rf_log_metrics = evaluate_model(y_test, y_pred_rf_log, "Random Forest (Log-Transformed)")
print(f"Best Random Forest Log Parameters: {random_search_rf.best_params_}")

# XGBoost
xgb = XGBRegressor(random_state=42)
param_dist_xgb = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1, 0.3],
    'subsample': [0.7, 0.8, 1.0]
}
random_search_xgb = RandomizedSearchCV(xgb, param_dist_xgb, n_iter=20, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1, random_state=42)
random_search_xgb.fit(X_train_scaled, y_train)

# Best XGBoost model
best_xgb = random_search_xgb.best_estimator_
y_pred_xgb = best_xgb.predict(X_test_scaled)
xgb_metrics = evaluate_model(y_test, y_pred_xgb, "XGBoost")
print(f"Best XGBoost Parameters: {random_search_xgb.best_params_}")

# Linear Regression (baseline)
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_test_scaled)
lr_metrics = evaluate_model(y_test, y_pred_lr, "Linear Regression")

# Select best model (highest R²)
models = {
    'Random Forest': (best_rf, rf_metrics),
    'Random Forest (Log)': (best_rf_log, rf_log_metrics),
    'XGBoost': (best_xgb, xgb_metrics),
    'Linear Regression': (lr, lr_metrics)
}
best_model_name = max(models, key=lambda k: models[k][1][2])
best_model, best_metrics = models[best_model_name]



35 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\222528192\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\222528192\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\base.py", line 1382, in wrapper
    estimator._validate_params()
    ~~~~~~~~~~~~~~~~~~~~~~~~~~^^
  File "C:\Users\222528192\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\base.py", line 436, in _validate_params
    val


Random Forest Performance:
MAE: $238,551.49
RMSE: $316,790.03
R²: 0.5257
Best Random Forest Parameters: {'n_estimators': 300, 'min_samples_split': 10, 'max_features': 0.5, 'max_depth': 30}


35 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\222528192\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\222528192\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\base.py", line 1382, in wrapper
    estimator._validate_params()
    ~~~~~~~~~~~~~~~~~~~~~~~~~~^^
  File "C:\Users\222528192\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\base.py", line 436, in _validate_params
    val


Random Forest (Log-Transformed) Performance:
MAE: $234,796.11
RMSE: $319,611.30
R²: 0.5172
Best Random Forest Log Parameters: {'n_estimators': 200, 'min_samples_split': 10, 'max_features': 0.5, 'max_depth': 30}

XGBoost Performance:
MAE: $226,510.18
RMSE: $300,721.59
R²: 0.5726
Best XGBoost Parameters: {'subsample': 0.8, 'n_estimators': 200, 'max_depth': 6, 'learning_rate': 0.1}

Linear Regression Performance:
MAE: $289,297.91
RMSE: $364,257.72
R²: 0.3729


In [4]:
# Save the best model and scaler
joblib.dump(best_model, 'best_model.joblib')
joblib.dump(scaler, 'scaler.joblib')
print(f"\nSaved best model ({best_model_name}) to 'best_model.joblib'")
print("Saved scaler to 'scaler.joblib'")


Saved best model (XGBoost) to 'best_model.joblib'
Saved scaler to 'scaler.joblib'
