In this notebook a boosted tree ensemble model will be trained using Python's library xgboost for trying to predict the sale prices of the houses.

In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import cross_val_score, LeaveOneOut
import pickle

# 1. Load data

In [2]:
dataset = pd.read_csv('encoded_train_dataset.csv', sep = ',')

In [3]:
dataset

Unnamed: 0,Id,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,HeatingQC,CentralAir,KitchenQual,FireplaceQu,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,SalePrice
0,1,65.0,8450.0,7.0,5.0,2003.0,2003.0,196.0,706.0,0.0,...,5.0,1.0,4.0,0.0,2.0,3.0,3.0,2.0,0.0,208500.0
1,2,80.0,9600.0,6.0,8.0,1976.0,1976.0,0.0,978.0,0.0,...,5.0,1.0,3.0,3.0,2.0,3.0,3.0,2.0,0.0,181500.0
2,3,68.0,11250.0,7.0,5.0,2001.0,2002.0,162.0,486.0,0.0,...,5.0,1.0,4.0,3.0,2.0,3.0,3.0,2.0,0.0,223500.0
3,4,60.0,9550.0,7.0,5.0,1915.0,1970.0,0.0,216.0,0.0,...,4.0,1.0,4.0,4.0,1.0,3.0,3.0,2.0,0.0,140000.0
4,5,84.0,14260.0,8.0,5.0,2000.0,2000.0,350.0,655.0,0.0,...,5.0,1.0,4.0,3.0,2.0,3.0,3.0,2.0,0.0,250000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,62.0,7917.0,6.0,5.0,1999.0,2000.0,0.0,0.0,0.0,...,5.0,1.0,3.0,3.0,2.0,3.0,3.0,2.0,0.0,175000.0
1456,1457,85.0,13175.0,6.0,6.0,1978.0,1988.0,119.0,790.0,163.0,...,3.0,1.0,3.0,3.0,1.0,3.0,3.0,2.0,0.0,210000.0
1457,1458,66.0,9042.0,7.0,9.0,1941.0,2006.0,0.0,275.0,0.0,...,5.0,1.0,4.0,4.0,2.0,3.0,3.0,2.0,0.0,266500.0
1458,1459,68.0,9717.0,5.0,6.0,1950.0,1996.0,0.0,49.0,1029.0,...,4.0,1.0,4.0,0.0,1.0,3.0,3.0,2.0,0.0,142125.0


# 2. Train the model and check accuracy

The XGBRegressor algorithm chosen will be trained with default hyperparameters and the error metric was the negative mean squared error.\
The validation process was done using Leave One Out Cross Validation.

In [4]:
xg_reg = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.1, max_depth=3)

In [5]:
X = dataset.drop(columns = ['Id', 'SalePrice'])
y = dataset['SalePrice'].values

In [6]:
cv = LeaveOneOut()

In [7]:
scores = cross_val_score(xg_reg, X, y = y, scoring = 'neg_root_mean_squared_error', cv = cv)

In [8]:
print(f'RMSE = {-scores.mean()} \u00B1 {scores.std()}')

RMSE = 15948.46611729452 ± 21995.191201747948


# 3. Fit the model

In [9]:
xg_reg.fit(X,y)

# 4. Save the model

The fitted model is saved to a pickle file.

In [10]:
with open('models/xgboost_reg.pkl','wb') as f:
    pickle.dump(xg_reg,f)