# XGBoost (Extreme Gradient Boosting)

In [1]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd

In [2]:
df = sns.load_dataset('titanic')

In [3]:
df.dropna(inplace=True)

## Data Preprocessing

In [4]:
X = df[['pclass', 'sex', 'age']]

In [5]:
from sklearn import preprocessing
lb = preprocessing.LabelBinarizer()

In [6]:
X['sex'] = lb.fit_transform(X['sex'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [7]:
y = df['survived']

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3) 

In [10]:
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [11]:
def print_score(clf, X_train, X_test, y_train, y_test, train = True):
    '''
    print accuracy score and confusion matrix
    '''
    
    if train:
        '''
        train performance
        '''
        print('Train Result: \n')
        print('Accuracy Score: {0:.4f}\n'.format(accuracy_score(y_train, clf.predict(X_train))))
        print('Classification Report: \n {} \n'.format(classification_report(y_train, clf.predict(X_train))))
        print('Confusion Matrix: \n {} \n'.format(confusion_matrix(y_train, clf.predict(X_train))))
        
        res = cross_val_score(clf, X_train, y_train, cv = 10, scoring='accuracy')
        print('Average Accuracy: \t {0:.4f}'.format(np.mean(res)))
        print('Average SD: \t\t {0:.4f}'.format(np.std(res)))
        
    elif train == False:
        '''
        test performance
        '''
        print('Test Result \n')
        print('Accuracy Score: {0:.4f}\n'.format(accuracy_score(y_test, clf.predict(X_test))))
        print('Classification Report: \n {} \n'.format(classification_report(y_test, clf.predict(X_test))))
        print('Confusion Matrix: \n {} \n'.format(confusion_matrix(y_test, clf.predict(X_test))))

## XGBoost

In [13]:
import xgboost as xgb

In [14]:
xgb_clf = xgb.XGBClassifier(max_depth = 3, n_estimators = 5000, learning_rate = 0.2)

In [15]:
xgb_clf.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.2,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=5000, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)

In [16]:
print_score(xgb_clf, X_train, X_test, y_train, y_test, train = True)

Train Result: 

Accuracy Score: 0.9370

Classification Report: 
               precision    recall  f1-score   support

           0       0.90      0.90      0.90        42
           1       0.95      0.95      0.95        85

   micro avg       0.94      0.94      0.94       127
   macro avg       0.93      0.93      0.93       127
weighted avg       0.94      0.94      0.94       127
 

Confusion Matrix: 
 [[38  4]
 [ 4 81]] 

Average Accuracy: 	 0.7474
Average SD: 		 0.1295


In [17]:
print_score(xgb_clf, X_train, X_test, y_train, y_test, train = False)

Test Result 

Accuracy Score: 0.8364

Classification Report: 
               precision    recall  f1-score   support

           0       0.79      0.65      0.71        17
           1       0.85      0.92      0.89        38

   micro avg       0.84      0.84      0.84        55
   macro avg       0.82      0.78      0.80        55
weighted avg       0.83      0.84      0.83        55
 

Confusion Matrix: 
 [[11  6]
 [ 3 35]] 



| Classifier | Decision Tree | Bagging | Random Forest | Optimized RF | Extra-Trees | AdaBoost(CART) | AdaBoost(RF) | Gradient Boosting |
|:-|:-|:-|:-|:-|:-|:-|:-|:-|
| Train Accuracy Score | 0.9669 | 0.9669 | 0.9134 | 0.8740 | 0.9370 | 0.9055 | 0.9291 | 0.9291 |
| Average Accuracy Score | 0.8198 | 0.8044 | 0.7478 | 0.7580 | 0.7407 | 0.7547 | 0|.8043 | 0.8210 |
| SD | 0.1217 | 0.1360 | 0.1003 | 0.0836 | 0.0584 |  0.0920 | 0.1032 | 0.0956 |
| Test Accuracy Score | 0.7213 | 0.7541 | 0.7636 | 0.8545 | 0.8182 |  0.6909 | 0.7636 | 0.7636 |

In [18]:
xgb_clf = xgb.XGBClassifier(max_depth = 5, n_estimators = 10000, learning_rate = 0.3)

In [19]:
xgb_clf.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.3,
       max_delta_step=0, max_depth=5, min_child_weight=1, missing=None,
       n_estimators=10000, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)

In [20]:
print_score(xgb_clf, X_train, X_test, y_train, y_test, train = True)

Train Result: 

Accuracy Score: 0.9370

Classification Report: 
               precision    recall  f1-score   support

           0       0.90      0.90      0.90        42
           1       0.95      0.95      0.95        85

   micro avg       0.94      0.94      0.94       127
   macro avg       0.93      0.93      0.93       127
weighted avg       0.94      0.94      0.94       127
 

Confusion Matrix: 
 [[38  4]
 [ 4 81]] 

Average Accuracy: 	 0.7314
Average SD: 		 0.1210


In [21]:
print_score(xgb_clf, X_train, X_test, y_train, y_test, train = False)

Test Result 

Accuracy Score: 0.8364

Classification Report: 
               precision    recall  f1-score   support

           0       0.79      0.65      0.71        17
           1       0.85      0.92      0.89        38

   micro avg       0.84      0.84      0.84        55
   macro avg       0.82      0.78      0.80        55
weighted avg       0.83      0.84      0.83        55
 

Confusion Matrix: 
 [[11  6]
 [ 3 35]] 

