# Boosting

In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn import metrics

from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
import matplotlib as mpl
%matplotlib inline

### Prepare Dataset

In [None]:
# Load the diabetes data set from the notebook's data directory.
df = pd.read_csv("data/diabetes.csv")

# Use some weak features on purpose, so that the effect of boosting is visible
X = df[['age','serum_insulin']]
y = df['class'].values

# Split dataset into train and test sets *before* scaling.
# random_state makes the split (and every score below) reproducible on
# Restart & Run All; stratify keeps the class ratio equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Normalize: learn mean / variance from the training rows only, then apply the
# same transform to the test rows, so no test-set statistics leak into training.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Keep `X` as a scaled ndarray, as before, for any later cell that reads it.
X = scaler.transform(X)

### AdaBoosting

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Shared by the cells below: 5-fold stratified CV and the number of boosting rounds.
kfold = StratifiedKFold(n_splits=5)
num_trees = 100

# Decision Tree with 5 fold cross validation
# lets restrict max_depth to 1 to have more impure leaves
DT = DecisionTreeClassifier(max_depth=1).fit(X_train, y_train)
results = cross_val_score(DT, X_train, y_train, cv=kfold)
print("Decision Tree (stand alone) - CV Train : %.2f" % results.mean())
# accuracy_score takes (y_true, y_pred); this line reports the *train* score.
print("Decision Tree (stand alone) - Train : %.2f" % metrics.accuracy_score(y_train, DT.predict(X_train)))
print("Decision Tree (stand alone) - Test : %.2f" % metrics.accuracy_score(y_test, DT.predict(X_test)))

# Using Adaptive Boosting of 100 iterations.
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2, and the positional form
# is accepted by both old and new releases.
AdaBoost_DT = AdaBoostClassifier(DT, n_estimators=num_trees, learning_rate=0.1).fit(X_train, y_train)
results = cross_val_score(AdaBoost_DT, X_train, y_train, cv=kfold)
print("\nDecision Tree (AdaBoosting) - CV Train : %.2f" % results.mean())
print("Decision Tree (AdaBoosting) - Train : %.2f" % metrics.accuracy_score(y_train, AdaBoost_DT.predict(X_train)))
print("Decision Tree (AdaBoosting) - Test : %.2f" % metrics.accuracy_score(y_test, AdaBoost_DT.predict(X_test)))

The AdaBoost algorithm gives, on average, a higher accuracy score on both the train and test datasets than the standalone decision tree model.

# Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with `num_trees` boosting stages (100 iterations),
# fitted on the training split.
GBT = GradientBoostingClassifier(learning_rate=0.1, n_estimators=num_trees)
GBT.fit(X_train, y_train)

# Cross-validated accuracy on the training split, using the shared folds.
results = cross_val_score(GBT, X_train, y_train, cv=kfold)

# Accuracy of the fitted model on the train and test splits.
gbt_train_acc = metrics.accuracy_score(GBT.predict(X_train), y_train)
gbt_test_acc = metrics.accuracy_score(GBT.predict(X_test), y_test)

print("\nGradient Boosting - CV Train : %.2f" % results.mean())
print("Gradient Boosting - Train : %.2f" % gbt_train_acc)
print("Gradient Boosting - Test : %.2f" % gbt_test_acc)

# XGBoost

In [None]:
from xgboost.sklearn import XGBClassifier

num_rounds = 100

kfold = StratifiedKFold(n_splits=5)

# Hold out part of the *training* data to monitor early stopping. Using the
# test set for this would tune the number of rounds on the very data the
# final score is reported on, which biases the test accuracy upwards.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# use early_stopping_rounds to stop training when there is no score improvement
# on the validation set. In current xgboost releases this is an estimator
# parameter (the fit() keyword was deprecated and later removed).
XGB = XGBClassifier(n_estimators=num_rounds, objective='binary:logistic', early_stopping_rounds=20)
XGB.fit(X_fit, y_fit, eval_set=[(X_val, y_val)], verbose=False)

# Cross-validate with a plain estimator: cross_val_score passes no eval_set,
# so an estimator configured for early stopping cannot be fitted inside it.
XGB_cv = XGBClassifier(n_estimators=num_rounds, objective='binary:logistic')
results = cross_val_score(XGB_cv, X_train, y_train, cv=kfold)
print("\nxgBoost - CV Train : %.2f" % results.mean())
# Train accuracy is measured on the rows the model was actually fitted on.
print("xgBoost - Train : %.2f" % metrics.accuracy_score(y_fit, XGB.predict(X_fit)))
print("xgBoost - Test : %.2f" % metrics.accuracy_score(y_test, XGB.predict(X_test)))