<h1>Import all libraries and read the explored data into DataFrames</h1>

In [8]:
import io
import pickle
import re
import sys
import time

#General libraries needed
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#Libraries for data splitting, model selection, evaluation metrics and pre-processing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn import preprocessing

#Libraries for probability calibration and log loss
from sklearn.datasets import make_blobs
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import log_loss

#For Decision Tree implementation
from scipy.stats import entropy
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

#For KNN implementation
from sklearn.neighbors import KNeighborsClassifier

#For Bagging implementation
from sklearn.ensemble import BaggingClassifier

#For AdaBoost implementation
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB

#For Random Forest implementation
from sklearn.ensemble import RandomForestClassifier

#For Baseline implementation
from sklearn.dummy import DummyClassifier

#For Logistic Regression
from sklearn.linear_model import LogisticRegression

#For Ensemble
from sklearn.ensemble import VotingClassifier

#Settings
%matplotlib inline
# Show floats with two decimals whenever pandas renders them
pd.options.display.float_format = '{:.2f}'.format
# NaN is rejected as a print threshold by current NumPy (ValueError);
# sys.maxsize is the documented value for always printing arrays in full.
np.set_printoptions(threshold=sys.maxsize)
sns.set()

def printModelAccuracy(y_test, y_pred, labels=(1, 2, 3, 4, 5), average='micro'):
    """Print the confusion matrix, accuracy and F1 score of a set of predictions.

    Parameters
    ----------
    y_test : array-like
        True class labels.
    y_pred : array-like
        Predicted class labels, aligned with ``y_test``.
    labels : sequence, optional
        Class labels, in the order used for the rows and columns of the
        confusion matrix. Defaults to the five classes 1-5.
    average : str, optional
        Averaging mode forwarded to ``f1_score``. Defaults to ``'micro'``.

    Returns
    -------
    None
        All results are written to standard output.
    """
    labels = list(labels)

    # Find the confusion matrix of the result (rows: true class, columns: predicted class)
    cm = pd.DataFrame(
        confusion_matrix(y_test, y_pred, labels=labels),
        index=['true:{}'.format(label) for label in labels],
        columns=['pred:{}'.format(label) for label in labels])
    print("Confusion Matrix:")
    print(cm)

    # Find the accuracy and F1 score of the result.
    # Note: for single-label multiclass predictions, micro-averaged F1 is
    # numerically equal to accuracy; pass average='macro' or 'weighted' for a
    # score that carries additional information.
    asr = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average=average)
    print("Accuracy:", asr)
    print("F1:", f1)

    # Log loss is intentionally not reported here: sklearn's log_loss expects
    # predicted class probabilities (e.g. from predict_proba), not the hard
    # labels this function receives.


<h1>AdaBoost (with Decision Tree)</h1>

Refer to the following links for a detailed explanation of the implementation:
- [AdaBoost Classifier SKLearn Documentation](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html)
- [DataCamp Implementation](https://www.datacamp.com/community/tutorials/adaboost-classifier-python)
- [Setting Learning Rate and N Estimators](https://stats.stackexchange.com/questions/82323/shrinkage-parameter-in-adaboost)
- [Parameter Tuning](https://machinelearningmastery.com/tune-learning-rate-for-gradient-boosting-with-xgboost-in-python/)

*Note that the default base estimator of the AdaBoost implementation in SKLearn is a Decision Tree.*


### Experiment E
Concatenation of (1) Categorical & Numerical attributes, and (2) Lower Dimension Vector after performing dimension reduction on original TF-IDF vector (Experiment C without casts, directors and overview)

In [9]:
# Load the persisted Experiment E split (test/train features, then test/train labels)
X_testE, X_trainE, y_testE, y_trainE = (
    pd.read_pickle(path)
    for path in ("Experiment E Train Test Split/X_test",
                 "Experiment E Train Test Split/X_train",
                 "Experiment E Train Test Split/y_test",
                 "Experiment E Train Test Split/y_train")
)

# Report the dimensions of each piece, in the same order as they were loaded
for part in (X_testE, X_trainE, y_testE, y_trainE):
    print(part.shape)

(3912, 1021)
(15648, 1021)
(3912, 1)
(15648, 1)


In [5]:
start = time.time()

# Fixed seed: the base trees use splitter="random" and boosting itself draws
# random numbers, so without a seed every run yields a different model and
# different scores.
SEED_E = 42

#Create the AdaBoost classifier. Default base classifiers is Decision Tree.
# - base_estimator__* entries are forwarded to the Decision Tree base classifier
# - n_estimators is the number of base classifiers (i.e. weak learners)
# - learning_rate controls the weight adjustments of each base classifiers. Default is 1
# Each list holds the value(s) searched; the trailing comments are presumably
# the wider candidate ranges explored earlier.
# NOTE(review): scikit-learn >= 1.2 renames AdaBoostClassifier's `base_estimator`
# argument to `estimator`; on such versions the keys below need the
# `estimator__` prefix instead - confirm against the installed version.
parametersE = {"base_estimator__max_depth" : [9], #list(range(5, 10))
              "base_estimator__criterion" : ["gini"], #["gini", "entropy"]
              "base_estimator__splitter" :   ["random"], #["best", "random"]
              "n_estimators": [300], #[50, 100, 300]
              "learning_rate": [0.001] #[0.001, 0.25, 0.5]
             }

adaboostTreeE = GridSearchCV(
    AdaBoostClassifier(DecisionTreeClassifier(random_state=SEED_E), random_state=SEED_E),
    cv=5, param_grid=parametersE)

#Fit the training feature Xs and training label Ys
# (labels are flattened to 1-D via ravel() before fitting)
adaboostTreeE.fit(X_trainE, y_trainE.values.ravel())

#Use the trained model to predict the test data
y_predE = adaboostTreeE.predict(X_testE)

# Find the confusion matrix, the accuracy, and F1 score of the result
printModelAccuracy(y_testE, y_predE)
print("Best Parameters:",adaboostTreeE.best_params_)

# Uncomment to list the cross-validated mean/std score of every parameter combination
# meansE = adaboostTreeE.cv_results_['mean_test_score']
# stdsE = adaboostTreeE.cv_results_['std_test_score']
# paramsE = adaboostTreeE.cv_results_['params']
# for mean, stdev, param in zip(meansE, stdsE, paramsE):
#     print("%f (%f) with: %r" % (mean, stdev, param))

# Elapsed wall-clock time in seconds for search, fit, prediction and reporting
end = time.time()
print(end - start)

Confusion Matrix:
        pred:1  pred:2  pred:3  pred:4  pred:5
true:1     551     145      62      28       3
true:2     453     177     125      40       7
true:3     306     124     204     135      17
true:4     214      47     134     273     131
true:5     132      24      34     143     403
Accuracy: 0.4110429447852761
F1: 0.4110429447852761
Best Parameters: {'base_estimator__criterion': 'gini', 'base_estimator__max_depth': 9, 'base_estimator__splitter': 'random', 'learning_rate': 0.001, 'n_estimators': 300}
1669.4794654846191


<h1>Feature Importance Ranking</h1>

In [6]:
# Rank the input columns by the importance reported by the refitted best model
bestModelE = adaboostTreeE.best_estimator_
feature_importances = pd.DataFrame(
    {'Importance': bestModelE.feature_importances_},
    index=X_trainE.columns,
).sort_values(by='Importance', ascending=False)

# Display the 20 highest-ranked features
feature_importances.head(20)

Unnamed: 0,Importance
budget,0.45
vote_count,0.23
documentary,0.1
drama,0.03
paramount,0.01
universalpictures,0.01
columbiapictures,0.01
family,0.01
comedy,0.01
thriller,0.01


<h1>Save the best model</h1>

In [7]:
# Persist the fitted grid-search object (which includes the refitted best model).
# The context manager closes the file even if pickle.dump raises.
# NOTE: only load this file back with pickle.load if it comes from a trusted
# source - unpickling can execute arbitrary code.
with open("adaboostTreeE.pickle", "wb") as save_classifier:  # binary write
    pickle.dump(adaboostTreeE, save_classifier)