In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor

In [2]:
# Read the modelling dataset from disk and discard rows that are exact
# duplicates of an earlier row.
df = pd.read_csv("model_data.csv").drop_duplicates()


In [3]:
# Remove outliers in price
# Tukey's rule: keep only rows whose PRICE lies within 1.5 * IQR of the first
# and third quartiles (both bounds inclusive).
Q1 = df['PRICE'].quantile(0.25)
Q3 = df['PRICE'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
# .copy() gives the filtered frame its own data, so the column assignments
# made on `df` afterwards do not raise SettingWithCopyWarning.
df = df.loc[df['PRICE'].between(lower_bound, upper_bound)].copy()

In [4]:
# Replace the listing date with a numeric feature: whole days elapsed since
# the earliest date present in the data.
listing_dates = pd.to_datetime(df['DATE ADDED'])
df['DAYS_SINCE'] = (listing_dates - listing_dates.min()).dt.days
df = df.drop(columns=['DATE ADDED'])

In [5]:
# Integer-encode the two categorical columns.

# LOCATION: every distinct value receives a code in order of first appearance.
locations = df['LOCATION'].unique()
location_dict = dict(zip(locations, range(len(locations))))
df['LOCATION'] = df['LOCATION'].map(location_dict)

# HOUSE_TYPE: fixed lookup table; a value absent from the table maps to NaN.
house_dict = dict(FLAT=1, HOUSE=2, DUPLEX=3, BUNGALOW=4)
df['HOUSE_TYPE'] = df['HOUSE_TYPE'].map(house_dict)

In [6]:

# Separate the model inputs (X) from the value being predicted (y).
X = df.loc[:, [
    'BEDROOMS',
    'BATHROOMS',
    'TOILETS',
    'LOCATION',
    'HOUSE_TYPE',
    'DAYS_SINCE',
]]
y = df.loc[:, 'PRICE']

In [7]:
# Two-stage split with a fixed seed:
#   1. hold out 20% of the rows as the test set;
#   2. hold out 25% of the remainder as the validation set.
# Net proportions are 60% train / 20% validation / 20% test.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.25,
    random_state=0,
)


In [8]:
# Standardise the features. The scaler's statistics are learned from the
# training split only and then applied unchanged to all three splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_val, X_test = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

In [9]:
# Search space for the XGBoost grid search: one list of candidate values per
# hyperparameter (3*3*3*3*3*2*3*3 = 4374 combinations in total).
params = dict(
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[5, 7, 9],
    min_child_weight=[1, 3, 5],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    n_estimators=[500, 1000],
    gamma=[0, 0.1, 0.3],
    reg_lambda=[1, 3, 5],
)

In [None]:
# Exhaustive search over every combination in `params`, scored by negative
# mean absolute error under 3-fold cross-validation on the training split.
# n_jobs=-1 runs the fits in parallel on all available cores.
xgb_model = XGBRegressor(random_state=0, objective='reg:squarederror')
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=params,
    scoring='neg_mean_absolute_error',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 4374 candidates, totalling 13122 fits


In [None]:
# Best model
# Take the estimator that came out on top of the grid search. With
# GridSearchCV's default refit behaviour this is a model already refit on the
# full training split using the parameter combination that ranked first under
# the search's scoring (negative MAE, 3-fold CV).
best_model = grid_search.best_estimator_

In [None]:
# Train the best model
# Refit the selected configuration while tracking MAE on both the training and
# validation splits. Early stopping watches the LAST entry of eval_set (the
# validation split) and halts once its MAE has not improved for 50 rounds.
eval_set = [(X_train, y_train), (X_val, y_val)]
# eval_metric and early_stopping_rounds are configured on the estimator rather
# than passed to fit(): the fit() keyword forms were deprecated in XGBoost 1.6
# and removed in 2.0, where they raise TypeError.
best_model.set_params(eval_metric='mae', early_stopping_rounds=50)
best_model.fit(X_train, y_train, eval_set=eval_set, verbose=False)


In [None]:
# Evaluate
# Validation-split metrics. The validation split also drives early stopping
# when the best model is trained, so these numbers are optimistic.
y_pred = best_model.predict(X_val)
print("MAE:", mean_absolute_error(y_val, y_pred))
print("MSE:", mean_squared_error(y_val, y_pred))
print("R2:", r2_score(y_val, y_pred))

# Held-out test-split metrics. The test split plays no part in fitting the
# scaler, the grid search, or early stopping, so this is the unbiased estimate
# of generalisation performance.
y_test_pred = best_model.predict(X_test)
print("Test MAE:", mean_absolute_error(y_test, y_test_pred))
print("Test MSE:", mean_squared_error(y_test, y_test_pred))
print("Test R2:", r2_score(y_test, y_test_pred))
