In [1]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np

In [3]:
# importing data and loading into a dataframe 
path = "https://raw.githubusercontent.com/10Dennisw/house-price-prediction/main/datasets/amsterdam-house-prices-clean.csv"

df = pd.read_csv(path)
df

Unnamed: 0,Address,Zip,Price,Area,Room,Lon,Lat,Area Code,Borough,Price (log),Area (log),Index
0,"Blasiusstraat 8 2, Amsterdam",1091 CR,685000.0,64,3,4.907736,52.356157,1091,Oost,13.437174,4.158883,0
1,"Kromme Leimuidenstraat 13 H, Amsterdam",1059 EL,475000.0,60,3,4.850476,52.348586,1059,West,13.071070,4.094345,1
2,"Zaaiersweg 11 A, Amsterdam",1097 SM,850000.0,109,4,4.944774,52.343782,1097,Oost,13.652992,4.691348,2
3,"Tenerifestraat 40, Amsterdam",1060 TH,580000.0,128,6,4.789928,52.343712,1060,Nieuw West,13.270783,4.852030,3
4,"Winterjanpad 21, Amsterdam",1036 KN,720000.0,138,5,4.902503,52.410538,1036,Noord,13.487006,4.927254,4
...,...,...,...,...,...,...,...,...,...,...,...,...
912,"Ringdijk, Amsterdam",1097 AE,750000.0,117,1,4.927757,52.354173,1097,Oost,13.527828,4.762174,919
913,"Kleine Beerstraat 31, Amsterdam",1033 CP,350000.0,72,3,4.890612,52.414587,1033,Noord,12.765688,4.276666,920
914,"Stuyvesantstraat 33 II, Amsterdam",1058 AK,350000.0,51,3,4.856935,52.363256,1058,West,12.765688,3.931826,921
915,"John Blankensteinstraat 51, Amsterdam",1095 MB,599000.0,113,4,4.965731,52.375268,1095,Oost,13.303017,4.727388,922


### Creating the XGBoost Model

In [4]:
# creating the prediction dataframe
prediction_df = df[['Price (log)','Area (log)','Room','Borough']]
prediction_df

Unnamed: 0,Price (log),Area (log),Room,Borough
0,13.437174,4.158883,3,Oost
1,13.071070,4.094345,3,West
2,13.652992,4.691348,4,Oost
3,13.270783,4.852030,6,Nieuw West
4,13.487006,4.927254,5,Noord
...,...,...,...,...
912,13.527828,4.762174,1,Oost
913,12.765688,4.276666,3,Noord
914,12.765688,3.931826,3,West
915,13.303017,4.727388,4,Oost


In [5]:
# one hot encoding the borough variable so that the model can interpret it
area_code_df = prediction_df['Borough']
area_code_df = pd.get_dummies(area_code_df, columns=['Borough'])
area_code_df = area_code_df.astype(int)

In [6]:
# merging the dataframe
prediction_df = pd.merge(prediction_df, area_code_df, left_index=True, right_index=True, how='left')
prediction_df.drop('Borough', axis=1, inplace=True)
prediction_df

Unnamed: 0,Price (log),Area (log),Room,Centrum,Nieuw West,Noord,Oost,West,Zuid,Zuidoost
0,13.437174,4.158883,3,0,0,0,1,0,0,0
1,13.071070,4.094345,3,0,0,0,0,1,0,0
2,13.652992,4.691348,4,0,0,0,1,0,0,0
3,13.270783,4.852030,6,0,1,0,0,0,0,0
4,13.487006,4.927254,5,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
912,13.527828,4.762174,1,0,0,0,1,0,0,0
913,12.765688,4.276666,3,0,0,1,0,0,0,0
914,12.765688,3.931826,3,0,0,0,0,1,0,0
915,13.303017,4.727388,4,0,0,0,1,0,0,0


In [7]:
# importing modules to for cretaing, training and evaluating the model
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [8]:
# Splitting the data into features and target variables
X = prediction_df.drop('Price (log)', axis=1)
y = prediction_df['Price (log)']

In [9]:
# Split the data into training, validation and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [10]:
# importing for random and grid search for hyperparameter tuning
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [11]:
# defining XGBoost regressor
xgb_regressor = xgb.XGBRegressor(objective='reg:squarederror')

# creating search space 
param_distributions = {
    'n_estimators': np.arange(100, 1000, 100),
    'learning_rate': np.linspace(0.01, 0.3, 10),
    'max_depth': np.arange(1, 10, 1),
    'subsample': np.linspace(0.5, 1.0, 6),
    'colsample_bytree': np.linspace(0.5, 1.0, 6),
    'min_child_weight': np.arange(1, 10, 1),
    'gamma': np.linspace(0, 0.5, 5),
    'reg_alpha': np.linspace(0, 1, 5),
    'reg_lambda': np.linspace(0, 1, 5),
}

# configuring RandomizedSearchCV
random_search = RandomizedSearchCV(xgb_regressor, param_distributions=param_distributions, n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)

# Performing the random search to find the best hyperparameters
random_search.fit(X_train, y_train)

# printing the best parameters 
print("Best parameters:", random_search.best_params_)

# Evaluating the best model on the val dataset
best_model = random_search.best_estimator_
predictions = best_model.predict(X_val)
r2 = r2_score(y_val, predictions)
print("The R^2 on Validation Set:", r2)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
Best parameters: {'subsample': 0.5, 'reg_lambda': 0.25, 'reg_alpha': 0.0, 'n_estimators': 200, 'min_child_weight': 2, 'max_depth': 3, 'learning_rate': 0.07444444444444444, 'gamma': 0.0, 'colsample_bytree': 1.0}
The R^2 on Validation Set: 0.8367800451201158


In [12]:
# Creating parameter distribution with variables based on the random search results
param_distributions = {
    'n_estimators': [random_search.best_params_['n_estimators']-50, random_search.best_params_['n_estimators'], random_search.best_params_['n_estimators']+50],
    'learning_rate': [random_search.best_params_['learning_rate']-0.025, random_search.best_params_['learning_rate'], random_search.best_params_['learning_rate']+0.025],
    'max_depth': [random_search.best_params_['max_depth']-1, random_search.best_params_['max_depth'], random_search.best_params_['max_depth']+1],
    'subsample': [random_search.best_params_['subsample']-0.05, random_search.best_params_['subsample'], random_search.best_params_['subsample']+0.05],
    'colsample_bytree': [random_search.best_params_['colsample_bytree']-0.05, random_search.best_params_['colsample_bytree']],
    'reg_lambda': [random_search.best_params_['reg_lambda']-0.05, random_search.best_params_['reg_lambda'], random_search.best_params_['reg_lambda']+0.05],
    'reg_alpha': [random_search.best_params_['reg_alpha'], [random_search.best_params_['reg_alpha']+0.05]],
    'gamma': [random_search.best_params_['gamma'], random_search.best_params_['gamma']+0.05],
    'min_child_weight': [random_search.best_params_['min_child_weight']-1, random_search.best_params_['min_child_weight'], random_search.best_params_['min_child_weight']+1]
}

# Configuring and performing grid search 
grid_search = GridSearchCV(estimator=xgb_regressor, param_grid=param_distributions, cv=3, scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Printing best parameters that were found based on parameters
print("Best parameters found: ", grid_search.best_params_)

# Evaluate the best model on the test set
best_model = grid_search.best_estimator_
predictions = best_model.predict(X_val)
r2 = r2_score(y_val, predictions)
print("The R^2 on Validation Set:", r2)

Fitting 3 folds for each of 5832 candidates, totalling 17496 fits


8748 fits failed out of a total of 17496.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
8748 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\denni\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\denni\anaconda3\Lib\site-packages\xgboost\core.py", line 730, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "c:\Users\denni\anaconda3\Lib\site-packages\xgboost\sklearn.py", line 1090, in fit
    self._Booster = train(
                    ^^^^^^
  File "c:\Users\denni\anaconda3\Lib\site-packages\xgboost\core.py", line 730, in inner_f
    return func(**kwargs)
       

Best parameters found:  {'colsample_bytree': 1.0, 'gamma': 0.05, 'learning_rate': 0.04944444444444444, 'max_depth': 4, 'min_child_weight': 2, 'n_estimators': 150, 'reg_alpha': 0.0, 'reg_lambda': 0.2, 'subsample': 0.5}
The R^2 on Validation Set: 0.8493014267995798


         nan]


In [13]:
# Defining XGBoost model parameters, found with grid search
params = grid_search.best_params_

# creating the final training and test set
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Number of training iterations
num_round = 100

# Training the model
bst = xgb.train(params, dtrain, num_round)

# Creating predictions
preds = bst.predict(dtest)

# Evaluating the model based on r2 and MSE 
r2 = r2_score(y_test, preds)
print(f"R^2: {r2}")
rmse = mean_squared_error(y_test, preds, squared=False)
print(f"RMSE: {rmse}")

Parameters: { "n_estimators" } are not used.



R^2: 0.87410486993967
RMSE: 0.19031879413386416
