In [1]:
#Loading all basic libraries
import pandas as pd
import numpy as np
import sklearn

In [2]:
#loading an inbuilt dataset from sklearn datasets module
from sklearn.datasets import load_boston
boston=load_boston()
print(boston.keys())
print(boston.data.shape)
print(boston.feature_names)
print(boston.DESCR)

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename'])
(506, 13)
['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']
.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Bos

In [3]:
data=pd.DataFrame(boston.data) #Using pandas calling the data into a dataframe
data.columns=boston.feature_names #Assigning the names to the dataframe using the original names 
data['Price']=boston.target #Adding the target variable to the dataset under the name 'Price'
print(data.head(3)) #exploring the data

      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
0  0.00632  18.0   2.31   0.0  0.538  6.575  65.2  4.0900  1.0  296.0   
1  0.02731   0.0   7.07   0.0  0.469  6.421  78.9  4.9671  2.0  242.0   
2  0.02729   0.0   7.07   0.0  0.469  7.185  61.1  4.9671  2.0  242.0   

   PTRATIO       B  LSTAT  Price  
0     15.3  396.90   4.98   24.0  
1     17.8  396.90   9.14   21.6  
2     17.8  392.83   4.03   34.7  


In [4]:
print(data.info()) #info is used to check str of data as in R

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
CRIM       506 non-null float64
ZN         506 non-null float64
INDUS      506 non-null float64
CHAS       506 non-null float64
NOX        506 non-null float64
RM         506 non-null float64
AGE        506 non-null float64
DIS        506 non-null float64
RAD        506 non-null float64
TAX        506 non-null float64
PTRATIO    506 non-null float64
B          506 non-null float64
LSTAT      506 non-null float64
Price      506 non-null float64
dtypes: float64(14)
memory usage: 55.4 KB
None


In [5]:
print(data.describe()) #describe is used to check summary function as in R

             CRIM          ZN       INDUS        CHAS         NOX          RM  \
count  506.000000  506.000000  506.000000  506.000000  506.000000  506.000000   
mean     3.613524   11.363636   11.136779    0.069170    0.554695    6.284634   
std      8.601545   23.322453    6.860353    0.253994    0.115878    0.702617   
min      0.006320    0.000000    0.460000    0.000000    0.385000    3.561000   
25%      0.082045    0.000000    5.190000    0.000000    0.449000    5.885500   
50%      0.256510    0.000000    9.690000    0.000000    0.538000    6.208500   
75%      3.677083   12.500000   18.100000    0.000000    0.624000    6.623500   
max     88.976200  100.000000   27.740000    1.000000    0.871000    8.780000   

              AGE         DIS         RAD         TAX     PTRATIO           B  \
count  506.000000  506.000000  506.000000  506.000000  506.000000  506.000000   
mean    68.574901    3.795043    9.549407  408.237154   18.455534  356.674032   
std     28.148861    2.1057

In [6]:
import xgboost as XGB #Package used to build xtreme gradient boosted models
from sklearn.metrics import mean_squared_error as MSE #A well used performance metric used in evaluating the model

In [7]:
#separating x and y variables
X,y =data.iloc[:,:13],data.iloc[:,-1:] #In python two variabels can be assigened different values in a single line of command

In [8]:
X.head(1) #Checking the separation

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98


In [9]:
y.head() #Checking the separation

Unnamed: 0,Price
0,24.0
1,21.6
2,34.7
3,33.4
4,36.2


In [10]:
dmatrix = XGB.DMatrix(data=X,label=y)#converting the data to a dmatrix format will help xgboost give better results

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [12]:
#XGBclassifer() and XGBregresser() are two classes that can be used for various problems
model = XGB.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                         max_depth = 5, alpha = 10, n_estimators = 10)

In [13]:
#Unlike simple model calls here we build a model then call our data into it separately
model.fit(X_train,y_train)

XGBRegressor(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.3, gamma=0,
       importance_type='gain', learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=10,
       n_jobs=1, nthread=None, objective='reg:squarederror',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=None, subsample=1, verbosity=1)

In [14]:
#Finally we now use the model to predict on our test data
preds=model.predict(X_test)

In [15]:
rmse = np.sqrt(MSE(y_test, preds)) #using the mse function previously called on actual and predicted values
print(rmse)
print("RMSE:  %f" % rmse) 

10.17003469346328
RMSE:  10.170035


In [16]:
#The RMSE Value obtained in 10.17. We can build a more robust model using the cross validation technique
params = {"objective":"reg:squarederror",'colsample_bytree': 0.3,'learning_rate': 0.1,
                'max_depth': 5, 'alpha': 10} #We define a param variable to hold the hyperparameters that were used for the previosu model

In [17]:
#the cross validation here has a fold value of 3, meaning 3 validation blocks will be created
cv_results = XGB.cv(dtrain=dmatrix, params=params, nfold=3, #here using the dmatrix format will help gain better results
                    num_boost_round=50,early_stopping_rounds=10,metrics="rmse", as_pandas=True, seed=123)
#num_boost_rounds just defines number of trees to build like the n_estimators earlier
#Early_stopping_rounds is used to stop building tree if the rmse value stops decreasing after a certain rounds
#finally we choose our evaluation metric to be rmse, and as_pandas means to return the results as a pandas dataframe

In [18]:
#The cv_results from above contains train and test rmse for each boosting round made
print(cv_results.head(50)) #there are as many rows as many trees generated during cv


    train-rmse-mean  train-rmse-std  test-rmse-mean  test-rmse-std
0         21.750760        0.036149       21.765523       0.028850
1         19.778533        0.077649       19.830758       0.031761
2         18.052812        0.118632       18.157338       0.116036
3         16.458958        0.169187       16.623974       0.191413
4         15.074781        0.183547       15.254608       0.213611
5         13.791091        0.216905       14.031675       0.250689
6         12.588550        0.184166       12.855077       0.235383
7         11.595904        0.184369       11.918407       0.259773
8         10.630265        0.156457       11.017993       0.236795
9          9.824976        0.202503       10.252325       0.269036
10         9.081573        0.151426        9.564405       0.248442
11         8.445415        0.201162        8.981438       0.280135
12         7.821793        0.150274        8.405444       0.262750
13         7.351428        0.162784        7.988541       0.26

In [19]:
print(cv_results['test-rmse-mean'].tail(1)) #the last tree generated tree boosted will usually have the lowest error

49    3.99692
Name: test-rmse-mean, dtype: float64


In [20]:
xg_reg = XGB.train(params=params, dtrain=dmatrix, num_boost_round=10)
#building another simple XGB model without any cv

In [21]:
# #Visualizing the XGB trees
# import seaborn as sns
# import matplotlib.pyplot as plt

# XGB.plot_tree(xg_reg,num_trees=2)
# plt.rcParams['figure.figsize'] = [50, 10]
# plt.show()

In [22]:
import matplotlib.pyplot as plt
XGB.plot_importance(model)
plt.rcParams['figure.figsize'] = [5,5]
plt.show()

<Figure size 640x480 with 1 Axes>