In [102]:
import numpy as np
import pandas as pd
import random
import math

import matplotlib.pyplot as plt
import sklearn.metrics as metrics

from sklearn import datasets
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler  

from sklearn import cross_validation
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_validate

from sklearn.model_selection import train_test_split  
from sklearn.model_selection import cross_val_predict

from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings('ignore')

In [85]:
def load_dataset(input_path):
    """Read a CSV file and return its rows in a reproducibly shuffled order.

    Parameters
    ----------
    input_path
        Location handed straight to ``pd.read_csv``.

    Returns
    -------
    The loaded table after ``shuffle`` has been applied with a fixed
    ``random_state`` of 102, so the row order is the same on every call.
    """
    frame = pd.read_csv(input_path)
    return shuffle(frame, random_state = 102)

def import_dataset(id):
    """Load one of the bundled datasets selected by a numeric identifier.

    Parameters
    ----------
    id
        1 -> iris, 2 -> glass, 3 -> wine. Any other value falls back to the
        Pima Indians diabetes file.

    Returns
    -------
    Whatever ``load_dataset`` returns for the selected CSV path.
    """
    known_paths = (
        (1, "Datasets/iris.csv"),
        (2, "Datasets/glass.csv"),
        (3, "Datasets/wine.csv"),
    )

    # Start from the fallback and override it on the first matching id.
    csv_path = "Datasets/pima-indians-diabetes.csv"
    for number, path in known_paths:
        if id == number:
            csv_path = path
            break

    return load_dataset(csv_path)

def split_data_on_x_y(dataset):
    """Separate the last column (the target) from the preceding feature columns.

    Parameters
    ----------
    dataset
        Two-dimensional table whose final column holds the labels.

    Returns
    -------
    (X, y)
        ``X`` is every column except the last; ``y`` is the last column.
        Both are the pieces produced by ``np.split`` along axis 1, so ``y``
        is a single-column slice rather than a flat vector.
    """
    # Cutting at index -1 along axis 1 yields exactly two pieces.
    features, target = np.split(dataset, [-1], axis=1)
    return features, target

def split_data_on_training_test(X, y, test_size):
    """Split features and labels into train and test parts with a fixed seed.

    Parameters
    ----------
    X, y
        Features and labels passed to ``train_test_split``.
    test_size
        Forwarded unchanged as ``test_size``.

    Returns
    -------
    (X_train, X_test, y_train, y_test)
        The two label parts are flattened with ``np.ravel``.
    """
    parts = train_test_split(X, y, test_size = test_size, random_state=142)
    X_train, X_test = parts[0], parts[1]
    # Flatten the label pieces (train first, then test) to 1-D vectors.
    y_train, y_test = (np.ravel(labels) for labels in parts[2:])
    return X_train, X_test, y_train, y_test

def do_normalization(X, X_train, X_test):
    """Standardise three feature sets with a single scaler fitted on ``X``.

    Parameters
    ----------
    X
        Data the ``StandardScaler`` is fitted on.
    X_train, X_test
        Additional feature sets transformed with that same fitted scaler.

    Returns
    -------
    (X, X_train, X_test) after ``scaler.transform``.

    Notes
    -----
    The scaling statistics come from ``X`` only; ``X_train`` and ``X_test``
    are never used for fitting.
    """
    scaler = StandardScaler().fit(X)
    return tuple(scaler.transform(part) for part in (X, X_train, X_test))


In [86]:
def plot_cross_validation(y1, y2, folds_number):
    """Draw a grouped bar chart comparing K-Fold and Stratified K-Fold F1 scores.

    Parameters
    ----------
    y1
        K-Fold scores, one per entry of ``folds_number``.
    y2
        Stratified K-Fold scores, one per entry of ``folds_number``.
    folds_number
        Sequence of fold counts; its length sets the number of bar groups
        and its values are used as the x tick labels.

    Side effects
    ------------
    Switches the global matplotlib style to ``'ggplot'`` and shows the figure.
    """
    plt.style.use('ggplot')

    fig, ax = plt.subplots(figsize=(20, 10))
    ax.set_ylabel('F1-Score')
    ax.set_title('K-Fold vs Stratified K-Fold')

    positions = np.arange(len(folds_number))
    width = 0.25
    stratified_color = list(plt.rcParams['axes.prop_cycle'])[2]['color']

    # Centre the bars explicitly so the tick placement below does not depend
    # on the matplotlib version's default bar alignment.
    ax.bar(positions, y1, width, align='center', label='K-Fold')
    ax.bar(positions + width, y2, width, align='center',
           color=stratified_color, label='Stratified K-Fold')

    # A pair of centred bars at x and x + width spans [x - w/2, x + 3w/2];
    # the label belongs at the midpoint x + w/2, not under the second bar.
    ax.set_xticks(positions + width / 2)
    ax.set_xticklabels(folds_number)
    ax.legend()

    plt.show()

In [105]:
def return_f1_macro_mean_score(scores):
    """Return the mean of the ``'test_f1_macro'`` entry of a score mapping.

    Parameters
    ----------
    scores
        Mapping exposing ``items()``, e.g. the result of ``cross_validate``.

    Returns
    -------
    ``np.mean`` of the values stored under ``'test_f1_macro'``, or ``None``
    when the mapping has no such key.
    """
    matches = (np.mean(values) for name, values in scores.items()
               if name == 'test_f1_macro')
    return next(matches, None)
        
def cross_validation(data, target, cv, clf):
    """Cross-validate ``clf`` and return its mean macro-averaged F1 score.

    Parameters
    ----------
    data, target
        Features and labels forwarded to ``cross_validate``.
    cv
        Splitter (or any value ``cross_validate`` accepts for ``cv``).
    clf
        Estimator to evaluate. It is passed through untouched, so the score
        reflects the model the caller supplied rather than a hard-coded one.

    Returns
    -------
    Mean of the per-fold ``test_f1_macro`` scores, as computed by
    ``return_f1_macro_mean_score``.
    """
    scoring = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
    scores = cross_validate(clf, data, target, cv=cv, scoring=scoring, return_train_score=False)
    return return_f1_macro_mean_score(scores)
    
def do_cross_validation_test(X, y, X_test, y_test, classifier, folds_number = [2, 3, 4, 5, 6, 7, 8, 9]):
    """Compare plain and stratified K-Fold F1 scores over several fold counts.

    For every ``k`` in ``folds_number`` the classifier is scored once with
    ``KFold(n_splits=k)`` and once with ``StratifiedKFold(n_splits=k)``; one
    line per ``k`` is printed and the two score series are plotted at the end.

    Parameters
    ----------
    X, y
        Features and labels; ``y`` is flattened with ``np.ravel`` before use.
    X_test, y_test
        Accepted for signature compatibility; not used by this function.
    classifier
        Estimator handed to ``cross_validation``.
    folds_number
        Fold counts to evaluate (never modified here).
    """
    plain_scores, stratified_scores = [], []

    for k in folds_number:
        labels = np.ravel(y)

        plain = cross_validation(X, labels, KFold(n_splits=k), classifier)
        plain_scores.append(plain)

        stratified = cross_validation(X, labels, StratifiedKFold(n_splits=k), classifier)
        stratified_scores.append(stratified)

        print("K = {}\t{:.3f}\t{:.3f}".format(k, plain, stratified))

    plot_cross_validation(plain_scores, stratified_scores, folds_number)

def plot_confusion_matrix(Y_test, Y_pred):
    """Show the confusion matrix of a prediction as an annotated heat map.

    Parameters
    ----------
    Y_test
        True labels.
    Y_pred
        Predicted labels.

    The matrix comes from ``metrics.confusion_matrix(Y_test, Y_pred)``; each
    cell is coloured with the ``Blues`` colour map and labelled with its value.
    """
    cm = metrics.confusion_matrix(Y_test, Y_pred)
    img = plt.matshow(cm, cmap=plt.cm.Blues)
    plt.colorbar(img, fraction=0.045)
    # A boolean is required here: the string 'off' is truthy and would turn
    # the grid on instead of hiding it.
    plt.grid(False)

    n_rows, n_cols = cm.shape
    for row in range(n_rows):
        for col in range(n_cols):
            # matshow draws cm[row, col] at horizontal position `col` and
            # vertical position `row`, so the text must use (col, row).
            plt.text(col, row, "%0.2f" % cm[row, col],
                     size=12, ha="center", va="center")
    plt.show()


In [148]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import ParameterGrid


def cross_validation(data, target, cv, clf):
    """Score ``clf`` with cross-validation and return its mean macro F1.

    Parameters
    ----------
    data, target
        Features and labels forwarded to ``cross_validate``.
    cv
        Value passed as the ``cv`` argument of ``cross_validate``.
    clf
        Estimator to evaluate.

    Returns
    -------
    The result of ``return_f1_macro_mean_score`` on the collected scores.
    """
    metric_names = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
    fold_results = cross_validate(
        clf,
        data,
        target,
        cv=cv,
        scoring=metric_names,
        return_train_score=False,
    )
    return return_f1_macro_mean_score(fold_results)
    
    
def do_bagging_test(X, y, X_test, n_fold, params):
    """Score a bagged decision tree for every parameter setting in ``params``.

    Parameters
    ----------
    X, y
        Features and labels; ``y`` is flattened with ``np.ravel``.
    X_test
        Accepted for signature compatibility; not used by this function.
    n_fold
        Number of splits for ``StratifiedKFold``.
    params
        Iterable of dicts; each dict is expanded into ``BaggingClassifier``
        keyword arguments.

    For each setting, one tab-separated line ``key, value, score`` is printed
    per key in the dict.
    """
    labels = np.ravel(y)
    for param in params:
        # The base estimator is passed positionally: the keyword was renamed
        # from `base_estimator` to `estimator` across scikit-learn releases,
        # while the first positional slot is the same in both.
        clf = BaggingClassifier(DecisionTreeClassifier(), **param)
        # Named `score` so the imported `f1_score` function is not shadowed.
        score = cross_validation(X, labels, StratifiedKFold(n_splits=n_fold), clf)
        for key, value in param.items():
            print("\t{}\t{}\t{:.3f}".format(key, value, score))

def do_boosting_test(X, y, X_test, n_fold, params):
    """Score an AdaBoost-ed decision tree for every setting in ``params``.

    Parameters
    ----------
    X, y
        Features and labels; ``y`` is flattened with ``np.ravel``.
    X_test
        Accepted for signature compatibility; not used by this function.
    n_fold
        Number of splits for ``StratifiedKFold``.
    params
        Iterable of dicts; each dict is expanded into ``AdaBoostClassifier``
        keyword arguments.

    For each setting, one tab-separated line ``key, value, score`` is printed
    per key in the dict.
    """
    labels = np.ravel(y)
    for param in params:
        # The base estimator is passed positionally: the keyword was renamed
        # from `base_estimator` to `estimator` across scikit-learn releases,
        # while the first positional slot is the same in both.
        clf = AdaBoostClassifier(DecisionTreeClassifier(), **param)
        # Named `score` so the imported `f1_score` function is not shadowed.
        score = cross_validation(X, labels, StratifiedKFold(n_splits=n_fold), clf)
        for key, value in param.items():
            print("\t{}\t{}\t{:.3f}".format(key, value, score))

def do_random_forest_test(X, y, X_test, n_fold, params):
    """Score a random forest for every parameter setting in ``params``.

    Parameters
    ----------
    X, y
        Features and labels; ``y`` is flattened with ``np.ravel``.
    X_test
        Accepted for signature compatibility; not used by this function.
    n_fold
        Number of splits for ``StratifiedKFold``.
    params
        Iterable of dicts expanded into ``RandomForestClassifier`` keywords.

    For each setting, one tab-separated line ``key, value, score`` is printed
    per key in the dict.
    """
    for settings in params:
        forest = RandomForestClassifier(**settings)
        mean_f1 = cross_validation(
            X,
            np.ravel(y),
            StratifiedKFold(n_splits=n_fold),
            forest,
        )
        for name, setting in settings.items():
            print("\t{}\t{}\t{:.3f}".format(name, setting, mean_f1))

def do_best_test(X, y, X_test, n_fold, clf):
    """Print the stratified cross-validated macro F1 of a ready-made estimator.

    Parameters
    ----------
    X, y
        Features and labels; ``y`` is flattened with ``np.ravel``.
    X_test
        Accepted for signature compatibility; not used by this function.
    n_fold
        Number of splits for ``StratifiedKFold``.
    clf
        Estimator to evaluate.
    """
    flat_labels = np.ravel(y)
    splitter = StratifiedKFold(n_splits=n_fold)
    mean_f1 = cross_validation(X, flat_labels, splitter, clf)
    print("{:.3f}".format(mean_f1))
            
def _build_best_classifiers():
    """Return fresh instances of the three hand-picked ensembles, in evaluation order."""
    # Base estimators are passed positionally: the keyword was renamed from
    # `base_estimator` to `estimator` across scikit-learn releases, while the
    # first positional slot is the same in both.
    return [
        BaggingClassifier(DecisionTreeClassifier(),
                          bootstrap_features=False,
                          bootstrap=True,
                          max_samples=1.0,
                          max_features=1.0,
                          n_estimators=20),
        AdaBoostClassifier(DecisionTreeClassifier(),
                           n_estimators=20,
                           learning_rate=2,
                           algorithm='SAMME.R'),
        RandomForestClassifier(n_estimators=20,
                               criterion='gini',
                               min_samples_leaf=1),
    ]


def main(dataset_number, n_folds, test_size, isActive):
    """Run the selected ensemble experiments on one dataset.

    Parameters
    ----------
    dataset_number
        Identifier passed to ``import_dataset``.
    n_folds
        Number of stratified folds used by the parameter sweeps and the
        "best configuration" runs.
    test_size
        Forwarded to ``split_data_on_training_test``.
    isActive
        Indexable of seven truthy/falsy flags:
        [0] standardise the features before any experiment,
        [1] K-Fold vs Stratified K-Fold comparison with a default bagging model,
        [2] bagging parameter sweeps,
        [3] AdaBoost parameter sweeps,
        [4] random-forest parameter sweeps,
        [5] score the three hand-picked configurations,
        [6] standardise, then score the three hand-picked configurations.

    All results are printed (and, for flag 1, plotted); nothing is returned.
    """
    ######## Preparing Data ########

    # Load Data
    dataset = import_dataset(dataset_number)

    # Split Data
    X, y = split_data_on_x_y(dataset)
    X_train, X_test, y_train, y_test = split_data_on_training_test(X, y, test_size)

    # Normalize Data
    if isActive[0]:
        X, X_train, X_test = do_normalization(X, X_train, X_test)

    # Cross Validation Test
    if isActive[1]:
        baggingClassifier = BaggingClassifier(DecisionTreeClassifier())
        do_cross_validation_test(X, y, X_test, y_test, baggingClassifier)

    # Bagging: one single-parameter sweep per grid, in this order.
    if isActive[2]:
        bagging_grids = (
            {'bootstrap_features': [True, False]},
            {'bootstrap': [True, False]},
            # NOTE(review): integer max_samples/max_features are presumably
            # treated by BaggingClassifier as absolute counts, not fractions —
            # confirm that such small counts are intended.
            {'max_samples': [1, 3, 5, 8, 10]},
            {'max_features': [1, 2, 5, 7]},
            {'n_estimators': [1, 5, 10, 15, 20, 40, 50, 100, 300]},
        )
        for grid in bagging_grids:
            do_bagging_test(X, y, X_test, n_folds, ParameterGrid(grid))

    # AdaBoost sweeps.
    if isActive[3]:
        boosting_grids = (
            {'n_estimators': [1, 5, 10, 15, 20, 40, 50, 100, 300]},
            {'learning_rate': [0.1, 0.5, 1, 2, 5, 10, 20, 50]},
            {'algorithm': ['SAMME', 'SAMME.R']},
        )
        for grid in boosting_grids:
            do_boosting_test(X, y, X_test, n_folds, ParameterGrid(grid))

    # Random-forest sweeps.
    if isActive[4]:
        forest_grids = (
            {'n_estimators': [1, 5, 10, 15, 20, 40, 50, 100, 300]},
            {'criterion': ['gini', 'entropy']},
            {'min_samples_leaf': [1, 3, 5, 10, 20, 50]},
        )
        for grid in forest_grids:
            do_random_forest_test(X, y, X_test, n_folds, ParameterGrid(grid))

    # Hand-picked configurations on the data as prepared above.
    if isActive[5]:
        for clf in _build_best_classifiers():
            do_best_test(X, y, X_test, n_folds, clf)
        print()
        print()

    # Same configurations, after standardising the features.
    if isActive[6]:
        X, X_train, X_test = do_normalization(X, X_train, X_test)
        for clf in _build_best_classifiers():
            do_best_test(X, y, X_test, n_folds, clf)

    print()

In [150]:
# Run only the bagging parameter sweeps (flag index 2) on datasets 2, 3 and 4,
# each with 5 folds and a test fraction of 0.2.
for dataset_number in (2, 3, 4):
    main(dataset_number, 5, 0.2, [0, 0, 1, 0, 0, 0, 0])

	bootstrap_features	True	0.666
	bootstrap_features	False	0.687
	bootstrap	True	0.685
	bootstrap	False	0.648
	max_samples	1	0.083
	max_samples	3	0.163
	max_samples	5	0.233
	max_samples	8	0.280
	max_samples	10	0.335
	max_features	1	0.350
	max_features	2	0.457
	max_features	5	0.603
	max_features	7	0.677
	n_estimators	1	0.620
	n_estimators	5	0.600
	n_estimators	10	0.634
	n_estimators	15	0.700
	n_estimators	20	0.723
	n_estimators	40	0.678
	n_estimators	50	0.708
	n_estimators	100	0.706
	n_estimators	300	0.709

	bootstrap_features	True	0.962
	bootstrap_features	False	0.929
	bootstrap	True	0.952
	bootstrap	False	0.911
	max_samples	1	0.157
	max_samples	3	0.565
	max_samples	5	0.769
	max_samples	8	0.856
	max_samples	10	0.892
	max_features	1	0.753
	max_features	2	0.944
	max_features	5	0.946
	max_features	7	0.962
	n_estimators	1	0.902
	n_estimators	5	0.912
	n_estimators	10	0.963
	n_estimators	15	0.962
	n_estimators	20	0.951
	n_estimators	40	0.957
	n_estimators	50	0.962
	n_estimators	100	0.962
	n_es