In [1]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np

In [3]:
# importing data and loading into a dataframe 
path = "https://raw.githubusercontent.com/10Dennisw/house-price-prediction/main/datasets/amsterdam-house-prices-clean.csv"

df = pd.read_csv(path)
df

Unnamed: 0,Address,Zip,Price,Area,Room,Lon,Lat,Price (log),Area Code,Borough,Index
0,"Blasiusstraat 8 2, Amsterdam",1091 CR,685000.0,64,3,4.907736,52.356157,13.437174,1091,Oost,0
1,"Kromme Leimuidenstraat 13 H, Amsterdam",1059 EL,475000.0,60,3,4.850476,52.348586,13.071070,1059,West,1
2,"Zaaiersweg 11 A, Amsterdam",1097 SM,850000.0,109,4,4.944774,52.343782,13.652992,1097,Oost,2
3,"Tenerifestraat 40, Amsterdam",1060 TH,580000.0,128,6,4.789928,52.343712,13.270783,1060,Nieuw West,3
4,"Winterjanpad 21, Amsterdam",1036 KN,720000.0,138,5,4.902503,52.410538,13.487006,1036,Noord,4
...,...,...,...,...,...,...,...,...,...,...,...
913,"Ringdijk, Amsterdam",1097 AE,750000.0,117,1,4.927757,52.354173,13.527828,1097,Oost,919
914,"Kleine Beerstraat 31, Amsterdam",1033 CP,350000.0,72,3,4.890612,52.414587,12.765688,1033,Noord,920
915,"Stuyvesantstraat 33 II, Amsterdam",1058 AK,350000.0,51,3,4.856935,52.363256,12.765688,1058,West,921
916,"John Blankensteinstraat 51, Amsterdam",1095 MB,599000.0,113,4,4.965731,52.375268,13.303017,1095,Oost,922


### Creating the XGBoost Model

In [4]:
# creating the prediction dataframe
prediction_df = df[['Price (log)','Area','Room','Borough']]
prediction_df

Unnamed: 0,Price (log),Area,Room,Borough
0,13.437174,64,3,Oost
1,13.071070,60,3,West
2,13.652992,109,4,Oost
3,13.270783,128,6,Nieuw West
4,13.487006,138,5,Noord
...,...,...,...,...
913,13.527828,117,1,Oost
914,12.765688,72,3,Noord
915,12.765688,51,3,West
916,13.303017,113,4,Oost


In [5]:
# one hot encoding the borough variable so that the model can interpret it
area_code_df = prediction_df['Borough']
area_code_df = pd.get_dummies(area_code_df, columns=['Borough'])
area_code_df = area_code_df.astype(int)

In [6]:
# merging the dataframe
prediction_df = pd.merge(prediction_df, area_code_df, left_index=True, right_index=True, how='left')
prediction_df.drop('Borough', axis=1, inplace=True)
prediction_df

Unnamed: 0,Price (log),Area,Room,Centrum,Nieuw West,Noord,Oost,West,Zuid,Zuidoost
0,13.437174,64,3,0,0,0,1,0,0,0
1,13.071070,60,3,0,0,0,0,1,0,0
2,13.652992,109,4,0,0,0,1,0,0,0
3,13.270783,128,6,0,1,0,0,0,0,0
4,13.487006,138,5,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
913,13.527828,117,1,0,0,0,1,0,0,0
914,12.765688,72,3,0,0,1,0,0,0,0
915,12.765688,51,3,0,0,0,0,1,0,0
916,13.303017,113,4,0,0,0,1,0,0,0


In [7]:
# importing modules to for cretaing, training and evaluating the model
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [8]:
# Splitting the data into features and target variables
X = prediction_df.drop('Price (log)', axis=1)
y = prediction_df['Price (log)']

In [9]:
# Split the data into training, validation and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [10]:
# importing for random and grid search for hyperparameter tuning
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [11]:
# creating XGBoost regressor
xgb_regressor = xgb.XGBRegressor(objective='reg:squarederror')

# creating parameter distribution with random variables with numpy
param_distributions = {
    'n_estimators': np.arange(100, 1000, 100),
    'learning_rate': np.linspace(0.01, 0.3, 10),
    'max_depth': np.arange(1, 10, 1),
    'subsample': np.linspace(0.5, 1.0, 6),
    'colsample_bytree': np.linspace(0.5, 1.0, 6),
    'min_child_weight': np.arange(1, 10, 1),
    'gamma': np.linspace(0, 0.5, 5),
    'reg_alpha': np.linspace(0, 1, 5),
    'reg_lambda': np.linspace(0, 1, 5),
}

# configuring RandomizedSearchCV
random_search = RandomizedSearchCV(xgb_regressor, param_distributions=param_distributions, n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)

# Performing the random search to find the best hyperparameters
random_search.fit(X_train, y_train)

# printing the best parameters 
print("Best parameters:", random_search.best_params_)

# Evaluating the best model on the val dataset
best_model = random_search.best_estimator_
predictions = best_model.predict(X_val)
r2 = r2_score(y_val, predictions)
print("The R^2 on Validation Set:", r2)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
Best parameters: {'subsample': 0.6, 'reg_lambda': 0.0, 'reg_alpha': 1.0, 'n_estimators': 500, 'min_child_weight': 5, 'max_depth': 3, 'learning_rate': 0.07444444444444444, 'gamma': 0.0, 'colsample_bytree': 0.7}
The R^2 on Validation Set: 0.8684099359251715


In [12]:
# creating parameter distribution with variables based on the random search results
param_distributions = {
    'n_estimators': [400, 500, 600],
    'learning_rate': [0.05, 0.075, 0.1],
    'max_depth': [2, 3, 4],
    'subsample': [0.5, 0.6, 0.7],
    'colsample_bytree': [0.6, 0.7, 0.8],
    'reg_lambda': [0.0],
    'reg_alpha': [1.0],
    'gamma': [0.0],
    'min_child_weight': [5]

}

# configuring and performing grid search 
grid_search = GridSearchCV(estimator=xgb_regressor, param_grid=param_distributions, cv=3, scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

# printing best parameters that were found based on parameters
print("Best parameters found: ", grid_search.best_params_)

# Evaluate the best model on the test set
best_model = grid_search.best_estimator_
predictions = best_model.predict(X_val)
r2 = r2_score(y_val, predictions)
print("The R^2 on Validation Set:", r2)

Fitting 3 folds for each of 243 candidates, totalling 729 fits
Best parameters found:  {'colsample_bytree': 0.8, 'gamma': 0.0, 'learning_rate': 0.075, 'max_depth': 3, 'min_child_weight': 5, 'n_estimators': 400, 'reg_alpha': 1.0, 'reg_lambda': 0.0, 'subsample': 0.7}
The R^2 on Validation Set: 0.8672437409165197


In [13]:
# Defining XGBoost model parameters
params = grid_search.best_params_

# creating the final training and test set
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Number of training iterations
num_round = 100

# Training the model
bst = xgb.train(params, dtrain, num_round)

# Creating predictions
preds = bst.predict(dtest)

# Evaluating the model based on r2 and MSE 
r2 = r2_score(y_test, preds)
print(f"R^2: {r2}")
rmse = mean_squared_error(y_test, preds, squared=False)
print(f"RMSE: {rmse}")

R^2: 0.8591113361875852
RMSE: 0.19223124959701504


Parameters: { "n_estimators" } are not used.

