# Boosting

### Importing required libraries

In [1]:
import xgboost as xgb
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error



### Classifying using XGBoosted trees

In [2]:
# Train a gradient-boosted tree classifier on the iris data set and report
# its accuracy on a held-out test split.
iris = datasets.load_iris()
X, Y = iris.data, iris.target

# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=15)

# The iris target has three classes, so a multi-class objective is requested
# explicitly rather than 'binary:logistic'.
xg_cl = xgb.XGBClassifier(objective='multi:softprob', n_estimators=10, random_state=15)

xg_cl.fit(X_train, Y_train)
predictions = xg_cl.predict(X_test)

# Accuracy = fraction of test rows whose predicted label equals the true label.
accuracy = float(np.mean(predictions == Y_test))

print('Accuracy =', accuracy)

Accuracy = 1.0


### XGBoost using the scikit-learn API

In [6]:
# Fit an XGBoost regressor through the scikit-learn style API and report the
# root-mean-squared error on a held-out test split.

# Forward slashes keep the relative path valid on Windows, Linux and macOS.
file_path = '../datasets/boston_housing.csv'
housing = pd.read_csv(file_path)

# Every column except the last is used as a feature; the last column is the target.
X, Y = housing.iloc[:, :-1], housing.iloc[:, -1]

# Hold out 30% of the rows; the fixed random_state keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=15)

# 'reg:squarederror' is the current name of the squared-error objective
# ('reg:linear' is its deprecated alias).
xg_reg = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=10, random_state=15)

xg_reg.fit(X_train, Y_train)
predictions = xg_reg.predict(X_test)

# RMSE is the square root of the mean squared error, in the target's own units.
rmse = np.sqrt(mean_squared_error(Y_test, predictions))
print('RMSE =', rmse)

RMSE = 196984.859173


### Boosting using XGBoost API

In [12]:
# Train a linear booster through XGBoost's native API and report test RMSE.
# Reuses X_train / X_test / Y_train / Y_test produced by the train/test split
# in the preceding cell, so that cell must be run first.

# The native API consumes DMatrix objects that bundle features with labels.
DM_train = xgb.DMatrix(data=X_train, label=Y_train)
DM_test = xgb.DMatrix(data=X_test, label=Y_test)

# 'gblinear' selects a linear base learner instead of trees;
# 'reg:squarederror' replaces the deprecated 'reg:linear' alias.
parameters = {'booster': 'gblinear', 'objective': 'reg:squarederror'}

xg_reg = xgb.train(params=parameters, dtrain=DM_train, num_boost_round=10)

predictions = xg_reg.predict(DM_test)

rmse = np.sqrt(mean_squared_error(Y_test, predictions))
print('RMSE =', rmse)

RMSE = 109916.146589


## Tuning the model

#### Untuned model

In [40]:
# Baseline: 4-fold cross-validated RMSE of XGBoost with default hyperparameters.

# Relative path instead of a machine-specific absolute one.
# NOTE(review): assumes the CSV lives in the same ../datasets folder used by the
# other data file loaded in this notebook - confirm before running.
file_path = '../datasets/ames_housing_trimmed_processed.csv'
housing = pd.read_csv(file_path)

# Every column except the last is used as a feature; the last column is the target.
X, Y = housing.iloc[:, :-1], housing.iloc[:, -1]

housing_DMatrix = xgb.DMatrix(data=X, label=Y)

# Only the objective is set; everything else stays at XGBoost's defaults.
# 'reg:squarederror' replaces the deprecated 'reg:linear' alias.
untuned = {'objective': 'reg:squarederror'}

untuned_cv_results_rmse = xgb.cv(dtrain=housing_DMatrix, params=untuned, nfold=4, num_boost_round=10,
                                 metrics='rmse', as_pandas=True, seed=15)

print(type(untuned_cv_results_rmse))

# The last row holds the metric after the final boosting round. .iloc[-1] yields
# a scalar, avoiding the implicit float(Series) conversion that '%f' % Series
# depends on.
print('Untuned RMSE: %f' % (untuned_cv_results_rmse['test-rmse-mean'].iloc[-1]))

<class 'pandas.core.frame.DataFrame'>
Untuned RMSE: 36184.604004


In [41]:
# Cross-validate a hand-picked hyperparameter set on the same data.
# Reuses housing_DMatrix built in the preceding cell, so that cell must be run first.

# 'reg:squarederror' replaces the deprecated 'reg:linear' alias.
# NOTE(review): the recorded outputs in this notebook show this configuration
# scoring a higher RMSE (81235) than the untuned run (36184); presumably
# 10 boosting rounds are too few for learning_rate 0.1 - confirm and consider
# raising num_boost_round.
tuned = {'objective': 'reg:squarederror', 'colsample_bytree': 0.3, 'learning_rate': 0.1, 'max_depth': 5}

tuned_cv_results_rmse = xgb.cv(dtrain=housing_DMatrix, params=tuned, nfold=4, num_boost_round=10,
                               metrics='rmse', as_pandas=True, seed=15)

# The last row holds the metric after the final boosting round. .iloc[-1] yields
# a scalar, avoiding the implicit float(Series) conversion that '%f' % Series
# depends on.
print('Tuned RMSE: %f' % (tuned_cv_results_rmse['test-rmse-mean'].iloc[-1]))

Tuned RMSE: 81235.318360
