# Applying XGBoost (Extreme Gradient Boosting) to Classification
Dataset reference (Pima Indians Diabetes Database, Kaggle): https://www.kaggle.com/uciml/pima-indians-diabetes-database/downloads/diabetes.csv/1

In [25]:
import numpy as np
import pandas as pd
import sklearn
import sklearn.metrics
import sklearn.model_selection
import xgboost as xgb


# Read Datasets

In [2]:
# Location of the diabetes CSV, relative to the notebook's working directory.
DATA_PATH = './datasets/step_0/diabetes.csv'

# Load the raw table into a DataFrame.
data = pd.read_csv(DATA_PATH)

In [3]:
# Show the (rows, columns) dimensions of the loaded frame.
data.shape

(768, 9)

In [4]:
# Preview the first five rows to check that the columns parsed as expected.
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [6]:
# Summary statistics for every numeric column.
# NOTE(review): the recorded output shows a minimum of 0 for Glucose,
# BloodPressure, SkinThickness, Insulin and BMI -- presumably zeros stand in
# for missing measurements; confirm against the dataset description.
data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [16]:
# Name of the label column; every other column is used as a feature.
TARGET_COLUMN = 'Outcome'

# Select features by label name rather than by position, so the split stays
# correct even if the CSV's column order changes (a positional `iloc[:, :-1]`
# would then silently leak the label into the features).
X_data = data.drop(columns=[TARGET_COLUMN])
y = data[TARGET_COLUMN]

In [17]:
# Confirm the feature selection still yields a DataFrame.
type(X_data)

pandas.core.frame.DataFrame

In [18]:
# Feature matrix dimensions: one column fewer than `data` (label removed).
X_data.shape

(768, 8)

In [19]:
# Label vector length; should match the number of rows in X_data.
y.shape

(768,)

In [14]:
# Preview the feature columns only.
X_data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [20]:
# Preview the first few labels.
y.head()

0    1
1    0
2    1
3    0
4    1
Name: Outcome, dtype: int64

# Split Dataset into Train/Test Datasets

In [27]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# shuffle (and therefore the split) repeatable across runs.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X_data, y, test_size=0.3, random_state=7
)

In [28]:
# Check the container type returned for the training features.
type(X_train)

pandas.core.frame.DataFrame

In [29]:
# Training partition dimensions.
X_train.shape

(537, 8)

In [30]:
# Test partition dimensions.
X_test.shape

(231, 8)

# Create XGBoost Classifiers

#### Using Default Settings

In [41]:
SEED=27

model_default = xgb.XGBClassifier(seed=SEED)
%time model_default.fit(X_train, y_train)
pred = model_default.predict(X_test)
accuracy = sklearn.metrics.accuracy_score(y_test, pred)

print()
print('Accuracy: {}'.format(accuracy))

CPU times: user 39.1 ms, sys: 230 µs, total: 39.4 ms
Wall time: 39.7 ms

Accuracy: 0.7792207792207793


In [42]:
# Per-class precision/recall/F1 for the default model. Predictions are
# recomputed from `model_default` instead of being read from the shared `pred`
# variable, which later cells overwrite; the report therefore stays tied to
# this model even if cells are re-run out of order.
print(sklearn.metrics.classification_report(y_test, model_default.predict(X_test)))

              precision    recall  f1-score   support

           0       0.83      0.82      0.83       147
           1       0.69      0.70      0.70        84

   micro avg       0.78      0.78      0.78       231
   macro avg       0.76      0.76      0.76       231
weighted avg       0.78      0.78      0.78       231



#### Setting 2: Deeper Trees (max_depth=8) with 50% Row Subsampling

In [48]:
model2 = xgb.XGBClassifier(n_estimators=100,  # number of trees you want to build.
                          max_depth=8,  # determines how deeply each tree is allowed to grow during any boosting round.
                          learning_rate=0.1,
                          subsample=0.5,   # percentage of samples used per tree. Low value can lead to underfitting.
                          seed=SEED)

%time model2.fit(X_train, y_train)

pred = model2.predict(X_test)

accuracy = sklearn.metrics.accuracy_score(y_test, pred)

print()
print('Accuracy: {}'.format(accuracy))

CPU times: user 63.7 ms, sys: 4.06 ms, total: 67.7 ms
Wall time: 68.1 ms

Accuracy: 0.7402597402597403


In [49]:
# Per-class precision/recall/F1 for model2. Predictions are recomputed from
# `model2` instead of being read from the shared `pred` variable, which other
# cells overwrite; the report therefore stays tied to this model even if cells
# are re-run out of order.
print(sklearn.metrics.classification_report(y_test, model2.predict(X_test)))

              precision    recall  f1-score   support

           0       0.79      0.81      0.80       147
           1       0.65      0.62      0.63        84

   micro avg       0.74      0.74      0.74       231
   macro avg       0.72      0.71      0.72       231
weighted avg       0.74      0.74      0.74       231



#### Setting 3: 1000 Trees with Row and Column Subsampling

In [47]:
model3 = xgb.XGBClassifier(learning_rate=0.1,
                         n_estimators=1000,
                         max_depth=5,
                         min_child_weight=1,
                         gamma=0,
                         subsample=0.8,
                         colsample_bytree=0.8,
                         objective='binary:logistic',
                         nthread=4,
                         scale_pos_weight=1,
                         seed=27)

%time model3.fit(X_train, y_train)

pred = model3.predict(X_test)

accuracy = sklearn.metrics.accuracy_score(y_test, pred)

print()
print('Accuracy: {}'.format(accuracy))

print(sklearn.metrics.classification_report(y_test, pred))

CPU times: user 816 ms, sys: 389 ms, total: 1.2 s
Wall time: 606 ms

Accuracy: 0.7142857142857143
              precision    recall  f1-score   support

           0       0.77      0.78      0.78       147
           1       0.61      0.60      0.60        84

   micro avg       0.71      0.71      0.71       231
   macro avg       0.69      0.69      0.69       231
weighted avg       0.71      0.71      0.71       231

