In [1]:
# ======================
# 1. Imports & Setup
# ======================
import json
import os

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor

import matplotlib.pyplot as plt
import seaborn as sns
import joblib


In [3]:
# ======================
# 2. Load Data
# ======================
# Read the Melbourne housing CSV; the path is relative to the notebook's directory.
csv_path = "../data/melb_data.csv"
df = pd.read_csv(csv_path)

# Print the (rows, columns) tuple, then leave the first five rows as the cell's output.
print(df.shape)
df.head(5)


(13580, 21)


Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


In [4]:
# ======================
# 3. Basic Cleaning
# ======================
# Drop high-NA or irrelevant columns (adjust as needed).
# `df` is reassigned in this cell, so on a re-run the columns are already gone;
# errors="ignore" makes the drop a no-op in that case instead of raising KeyError.
df = df.drop(columns=["Address", "SellerG"], errors="ignore")

# Remove rows with target missing (a row without Price cannot be used for training
# or for scoring).
df = df.dropna(subset=["Price"])

# Define features and target
X = df.drop(columns=["Price"])
y = df["Price"]

# Train/test split: 20% of the rows are held out; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [5]:
# ======================
# 4. Preprocessing
# ======================
# Partition the feature columns by dtype: object columns are treated as categorical,
# int64/float64 columns as numeric.
categorical_features = X.select_dtypes(include=["object"]).columns
numeric_features = X.select_dtypes(include=["int64", "float64"]).columns

# Categorical branch: fill gaps with the most frequent value, then one-hot encode.
# handle_unknown="ignore" is passed so that categories not seen while fitting do not
# raise when transforming other data.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

# Numeric branch: fill gaps with the column median, then standardize.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Route each column group through its own branch.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])


In [6]:
# ======================
# 5. Candidate Models
# ======================
# Display name -> unfitted regressor. Estimators that accept a seed get
# random_state=42; the linear models are left at their defaults.
models = dict(
    LinearRegression=LinearRegression(),
    Ridge=Ridge(),
    Lasso=Lasso(),
    RandomForest=RandomForestRegressor(random_state=42),
    GradientBoosting=GradientBoostingRegressor(random_state=42),
    XGBoost=XGBRegressor(random_state=42, verbosity=0),
)


In [8]:
# ======================
# 6. Cross-Validation Comparison
# ======================
# Mean 5-fold MAE on the training split for every candidate, keyed by model name.
cv_results = {}

for name, model in models.items():
    # Preprocessing lives inside the pipeline, so it is re-fit on each training fold.
    pipe = Pipeline([("preprocessor", preprocessor), ("model", model)])

    scores = cross_val_score(
        pipe,
        X_train,
        y_train,
        scoring="neg_mean_absolute_error",
        cv=5,
    )
    # The scorer reports negated MAE; flip the sign back to a plain error value.
    cv_results[name] = -scores.mean()

# One row per model with a single "MAE" column, best (lowest) first.
cv_result_df = pd.Series(cv_results, name="MAE").to_frame().sort_values("MAE")


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


In [9]:
def _tunable_pipeline(estimator):
    """Chain the shared `preprocessor` with `estimator` under the step name "model".

    The step name matters: every grid key below is prefixed with "model__".
    """
    return Pipeline([("preprocessor", preprocessor), ("model", estimator)])


# Tuning configuration per model name: the pipeline to search over ("pipe") and the
# hyperparameter grid for it ("param_grid").
model_mapping = {
    "RandomForest": dict(
        pipe=_tunable_pipeline(RandomForestRegressor(random_state=42)),
        param_grid={
            "model__n_estimators": [100, 200, 300],
            "model__max_depth": [None, 10, 20, 30],
            "model__min_samples_split": [2, 5, 10],
        },
    ),
    "XGBoost": dict(
        pipe=_tunable_pipeline(
            XGBRegressor(
                random_state=42,
                objective="reg:squarederror",
                n_jobs=-1,
            )
        ),
        param_grid={
            "model__n_estimators": [100, 300, 500],
            "model__max_depth": [3, 5, 7],
            "model__learning_rate": [0.01, 0.05, 0.1],
            "model__subsample": [0.8, 1.0],
        },
    ),
    "GradientBoosting": dict(
        pipe=_tunable_pipeline(GradientBoostingRegressor(random_state=42)),
        param_grid={
            "model__n_estimators": [100, 200],
            "model__learning_rate": [0.05, 0.1, 0.2],
            "model__max_depth": [3, 5],
        },
    ),
}


In [10]:
# cv_result_df is ordered by ascending MAE (lower is better), so its first two
# index labels are the two best candidates.
top_models = list(cv_result_df.index[:2])
print("Top 2 models selected:", top_models)


Top 2 models selected: ['XGBoost', 'RandomForest']


In [11]:
# Fitted GridSearchCV object per tuned model name.
best_models = {}

for model_name in top_models:
    # top_models is chosen from every candidate, but model_mapping only holds grids
    # for some of them; skip (loudly) instead of dying on a bare KeyError.
    if model_name not in model_mapping:
        print(f"\n--- Skipping {model_name}: no tuning configuration in model_mapping ---")
        continue

    print(f"\n--- Hyperparameter tuning for {model_name} ---")

    config = model_mapping[model_name]
    pipe = config["pipe"]
    param_grid = config["param_grid"]

    # Exhaustive search over the grid with 3-fold CV. Scores are negated MAE, so
    # GridSearchCV's "highest score wins" rule selects the lowest MAE.
    grid_search = GridSearchCV(
        pipe,
        param_grid,
        cv=3,
        scoring="neg_mean_absolute_error",
        n_jobs=-1,
        verbose=2
    )

    grid_search.fit(X_train, y_train)
    best_models[model_name] = grid_search

    print(f"{model_name} best params: {grid_search.best_params_}")
    # Negate best_score_ to report a plain (positive) MAE.
    print(f"{model_name} best CV MAE: {-grid_search.best_score_:.2f}")



--- Hyperparameter tuning for XGBoost ---
Fitting 3 folds for each of 54 candidates, totalling 162 fits
XGBoost best params: {'model__learning_rate': 0.05, 'model__max_depth': 7, 'model__n_estimators': 500, 'model__subsample': 0.8}
XGBoost best CV MAE: 159688.32

--- Hyperparameter tuning for RandomForest ---
Fitting 3 folds for each of 36 candidates, totalling 108 fits
RandomForest best params: {'model__max_depth': None, 'model__min_samples_split': 2, 'model__n_estimators': 300}
RandomForest best CV MAE: 169476.22


In [12]:
# Make sure the output directory exists (path is relative to the notebook).
os.makedirs("../models", exist_ok=True)

# best_score_ is negated MAE (the searches use scoring="neg_mean_absolute_error"),
# so the largest score belongs to the model with the smallest CV MAE.
best_model_name = max(best_models, key=lambda m: best_models[m].best_score_)
best_gs = best_models[best_model_name]

# Summary of the winning search; the sign flip turns the score back into a plain MAE.
results_to_save = dict(
    best_model_name=best_model_name,
    best_params=best_gs.best_params_,
    best_CV_MAE=-best_gs.best_score_,
)

# Write the summary as indented JSON.
json_path = "../models/best_model_results.json"
with open(json_path, "w") as f:
    json.dump(results_to_save, f, indent=4)

print(f"Best model info saved to {json_path}")


Best model info saved to ../models/best_model_results.json
