<h1>Import all libraries and reading explored data into Dataframe</h1>

In [1]:
import re
import io
import time

#General libraries needed
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#Libraries for data pre-processing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn import preprocessing

#Libraries for data pre-processing (Log Loss)
from sklearn.datasets import make_blobs
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import log_loss

#For Decision Tree implementation
from scipy.stats import entropy
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

#For KNN implementation
from sklearn.neighbors import KNeighborsClassifier

#For Bagging implementation
from sklearn.ensemble import BaggingClassifier

#For AdaBoost implementation
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB

#For Random Forest implementation
from sklearn.ensemble import RandomForestClassifier

#For Baseline implementation
from sklearn.dummy import DummyClassifier

#For Logistic Regression
from sklearn.linear_model import LogisticRegression

#For Ensemble
from sklearn.ensemble import VotingClassifier

#Settings
%matplotlib inline
pd.options.display.float_format = '{:.2f}'.format
np.set_printoptions(threshold=np.nan)
sns.set()

def printModelAccuracy(y_test, y_pred):
    # Find the confusion matrix of the result
    cm = pd.DataFrame(confusion_matrix(y_test, y_pred, labels=[1, 2, 3, 4, 5]), \
        index=['true:1', 'true:2', 'true:3', 'true:4', 'true:5'], 
        columns=['pred:1', 'pred:2', 'pred:3', 'pred:4', 'pred:5'])
    print("Confusion Matrix:")
    print(cm)

    # Find the accuracy and F1 score of the result
    asr = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='micro')
    print("Accuracy:", asr)
    print("F1:", f1)
    """
    # Log loss
    score = log_loss(y_test, y_pred)
    print("Log Loss:", score)
    """


<h1>AdaBoost (with Decision Tree)</h1>

Refer to the following links on for detail explanation on the implementation:
- [AdaBoost Classifier SKLearn Documentation](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html)
- [DataCamp Implementation](https://www.datacamp.com/community/tutorials/adaboost-classifier-python)
- [Setting Learning Rate and N Estimators](https://stats.stackexchange.com/questions/82323/shrinkage-parameter-in-adaboost)
- [Parameter Tuning](https://machinelearningmastery.com/tune-learning-rate-for-gradient-boosting-with-xgboost-in-python/)

*Note that the default AdaBoost implementation in SKLearn is Decision Tree 


### Experiment D
Concatenation of (1) Categorical & Numerical attributes, and (2) Lower Dimension Vector after performing dimension reduction on original TF-IDF vector

In [2]:
# Read from dataframe
X_testD = pd.read_pickle("Experiment D Train Test Split/X_test")
X_trainD = pd.read_pickle("Experiment D Train Test Split/X_train")
y_testD = pd.read_pickle("Experiment D Train Test Split/y_test")
y_trainD = pd.read_pickle("Experiment D Train Test Split/y_train")

In [3]:
start = time.time()

#Create the AdaBoost classifier. Default base classifiers is Decision Tree. 
# - n_estimator is the number of base classifiers (i.e. weak learners)
# - learning_rate controls the weight adjustments of each base classifiers. Default is 1
# - learning_rate controls the weight adjustments of each base classifiers. Default is 1
parametersD = {"base_estimator__max_depth" : [9], #list(range(5, 10))
              "base_estimator__criterion" : ["gini"], #["gini", "entropy"]
              "base_estimator__splitter" :   ["random"], #["best", "random"]
              "n_estimators": [300], #[50, 100, 300]
              "learning_rate": [0.001] #[0.001, 0.25, 0.5]
             }

adaboostTreeD = GridSearchCV(AdaBoostClassifier(DecisionTreeClassifier()), cv=5, param_grid=parametersD)

#Fit the training feature Xs and training label Ys
adaboostTreeD.fit(X_trainD, y_trainD.values.ravel())

#Use the trained model to predict the test data
y_predD = adaboostTreeD.predict(X_testD)

# Find the confusion matrix, the accuracy, and F1 score of the result
printModelAccuracy(y_testD, y_predD)
print("Best Parameters:",adaboostTreeD.best_params_)

# meansD = adaboostTreeD.cv_results_['mean_test_score']
# stdsD = adaboostTreeD.cv_results_['std_test_score']
# paramsD = adaboostTreeD.cv_results_['params']
# for mean, stdev, param in zip(meansD, stdsD, paramsD):
#     print("%f (%f) with: %r" % (mean, stdev, param))
    
end = time.time()
print(end - start)

Confusion Matrix:
        pred:1  pred:2  pred:3  pred:4  pred:5
true:1     488     195      85      19       2
true:2     425     210     113      48       6
true:3     285     152     172     147      30
true:4     196      85     118     246     154
true:5     120      44      38     113     421
Accuracy: 0.39289366053169733
F1: 0.39289366053169733
Best Parameters: {'base_estimator__criterion': 'gini', 'base_estimator__max_depth': 9, 'base_estimator__splitter': 'random', 'learning_rate': 0.001, 'n_estimators': 300}
16795.475114107132
