# Bagging

In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn import metrics

from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
import matplotlib as mpl
%matplotlib inline

### Prepare Dataset

In [None]:
# Load the diabetes dataset: the first 8 columns are used as features and
# the 'class' column as the target.
df = pd.read_csv("data/diabetes.csv")

# Features & Target
X = df.iloc[:, :8].values
y = df['class'].values

# Split dataset into train and test sets. The seed is fixed (same value as the
# two-feature split further down) so the reported scores are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2017
)

# Normalize: fit the scaler on the training rows only, so that no statistics
# of the held-out rows leak into the features the models are trained on.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Keep the full matrix on the same scale for the later cells that slice X.
X = scaler.transform(X)

### Bagged Decision Trees for Classification

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# K-fold 5 splits
kfold = StratifiedKFold(n_splits=5)

num_trees = 100

# Decision Tree with 5 fold cross validation.
# "Train" is the mean cross-validation score over the 5 folds of the training
# split; "Test" scores the tree fitted on the whole training split against
# the held-out split.
DT = DecisionTreeClassifier().fit(X_train, y_train)
results = cross_val_score(DT, X_train, y_train, cv=kfold)
print("Decision Tree (stand alone) - Train : ", results.mean())
# accuracy_score is documented as (y_true, y_pred): ground truth goes first.
print("Decision Tree (stand alone) - Test : ", metrics.accuracy_score(y_test, DT.predict(X_test)))

# Using Bagging and build 100 decision tree models.
# The base estimator is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, so the positional form works on both old and new releases.
bag_DT = BaggingClassifier(DT, n_estimators=num_trees).fit(X_train, y_train)
results = cross_val_score(bag_DT, X_train, y_train, cv=kfold)
print("\nDecision Tree (Bagging) - Train : ", results.mean())
print("Decision Tree (Bagging) - Test : ", metrics.accuracy_score(y_test, bag_DT.predict(X_test)))

### Feature Importance

In [None]:
# Per-feature importances reported by the fitted tree `DT`.
feature_importance = DT.feature_importances_

# Rescale so that the most important feature scores 100.
top_score = feature_importance.max()
feature_importance = 100.0 * (feature_importance / top_score)

# Ascending order, so the highest-scoring feature becomes the topmost bar.
sorted_idx = feature_importance.argsort()

# Bar centres: 0.5, 1.5, 2.5, ...
pos = 0.5 + np.arange(len(sorted_idx))

plt.subplot(1, 2, 2)
plt.barh(pos, feature_importance[sorted_idx], align='center')
plt.yticks(pos, df.columns[sorted_idx])
plt.title('Variable Importance')
plt.xlabel('Relative Importance')
plt.show()

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# K-fold 5 splits
kfold = StratifiedKFold(n_splits=5)

num_trees = 100

# Random forest of `num_trees` trees, fitted on the whole training split.
RF = RandomForestClassifier(n_estimators=num_trees).fit(X_train, y_train)

# "Train" is the mean cross-validation score over the 5 folds of the
# training split.
results = cross_val_score(RF, X_train, y_train, cv=kfold)

print("\nRandom Forest - Train : ", results.mean())
# accuracy_score is documented as (y_true, y_pred): ground truth goes first.
print("Random Forest - Test : ", metrics.accuracy_score(y_test, RF.predict(X_test)))

### Extra Trees

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# K-fold 5 splits
kfold = StratifiedKFold(n_splits=5)

num_trees = 100

# Extra-trees ensemble of `num_trees` trees, fitted on the whole training split.
ET = ExtraTreesClassifier(n_estimators=num_trees).fit(X_train, y_train)

# "Train" is the mean cross-validation score over the 5 folds of the
# training split.
results = cross_val_score(ET, X_train, y_train, cv=kfold)

print("\nExtraTree - Train : ", results.mean())
# accuracy_score is documented as (y_true, y_pred): ground truth goes first.
print("ExtraTree - Test : ", metrics.accuracy_score(y_test, ET.predict(X_test)))

### What the Decision Boundaries Look Like

In [None]:
# Keep only the last two feature columns so the models can be drawn in 2-D.
X = X[:, -2:]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2017)

kfold = StratifiedKFold(n_splits=5)

num_trees = 100

# For every model below, "Train" is the mean cross-validation score over the
# 5 folds of the training split, and "Test" scores the model fitted on the
# whole training split against the held-out split. accuracy_score is called
# in its documented (y_true, y_pred) order.

# Decision Tree with 5 fold cross validation
DT = DecisionTreeClassifier().fit(X_train, y_train)
results = cross_val_score(DT, X_train, y_train, cv=kfold)
print("Decision Tree (stand alone) - Train : ", results.mean())
print("Decision Tree (stand alone) - Test : ", metrics.accuracy_score(y_test, DT.predict(X_test)))

# Using Bagging and build 100 decision tree models.
# The base estimator is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, so the positional form works on both old and new releases.
bag_DT = BaggingClassifier(DT, n_estimators=num_trees).fit(X_train, y_train)
results = cross_val_score(bag_DT, X_train, y_train, cv=kfold)
print("\nDecision Tree (Bagging) - Train : ", results.mean())
print("Decision Tree (Bagging) - Test : ", metrics.accuracy_score(y_test, bag_DT.predict(X_test)))

# Using Random Forest with 100 trees
RF = RandomForestClassifier(n_estimators=num_trees).fit(X_train, y_train)
results = cross_val_score(RF, X_train, y_train, cv=kfold)

print("\nRandom Forest - Train : ", results.mean())
print("Random Forest  - Test : ", metrics.accuracy_score(y_test, RF.predict(X_test)))

# Using Extra Trees with 100 trees
ET = ExtraTreesClassifier(n_estimators=num_trees).fit(X_train, y_train)
results = cross_val_score(ET, X_train, y_train, cv=kfold)

print("\nExtraTree - Train : ", results.mean())
print("ExtraTree - Test : ", metrics.accuracy_score(y_test, ET.predict(X_test)))

def plot_decision_regions(X, y, classifier, resolution=0.02):
    """Draw a classifier's decision regions over a two-feature dataset.

    The classifier is evaluated on a regular grid spanning the data (padded
    by 1 on every side). Its predictions are drawn as filled contours on the
    current matplotlib axes, and the samples are scattered on top with one
    marker style per class.

    Args:
        X: 2-D array; column 0 is drawn on the x axis, column 1 on the y axis.
        y: 1-D array of class labels aligned with the rows of X.
        classifier: Object whose ``predict`` accepts an (n, 2) array and
            returns one value per row.
        resolution: Step of the evaluation grid, in the units of X.
    """
    markers = ('s', 'x', 'o', '^', 'v')

    # plot the decision surface
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    # One row per grid node, in the same order as the flattened mesh.
    grid_points = np.column_stack((xx1.ravel(), xx2.ravel()))
    Z = classifier.predict(grid_points).reshape(xx1.shape)
    plt.contourf(xx1, xx2, Z, alpha=0.4)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    # Overlay the samples. Markers are reused cyclically so that more classes
    # than marker styles does not raise IndexError.
    for idx, cl in enumerate(np.unique(y)):
        plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1], alpha=0.8,
                    marker=markers[idx % len(markers)], label=cl)

# Plot the decision boundary of each model on its own panel of a 2x2 grid.
# Each entry: (subplot position, fitted model, panel title, draw legend?).
# The legend is drawn on every panel except the first.
panels = (
    (221, DT, 'Decision Tree (Stand alone)', False),
    (222, bag_DT, 'Decision Tree (Bagging - 100 trees)', True),
    (223, RF, 'RandomForest Tree (100 trees)', True),
    (224, ET, 'Extreme Random Tree (100 trees)', True),
)

plt.figure(figsize=(10, 6))
for position, model, panel_title, with_legend in panels:
    plt.subplot(position)
    plot_decision_regions(X, y, model)
    plt.title(panel_title)
    plt.xlabel('X')
    plt.ylabel('Y')
    if with_legend:
        plt.legend(loc='best')
plt.tight_layout()