<h1>Import all libraries and read the explored data into DataFrames</h1>

In [1]:
import io
import re
import sys

#General libraries needed
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

#Libraries for data pre-processing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn import preprocessing

#Libraries for data pre-processing (Log Loss)
from sklearn.datasets import make_blobs
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import log_loss

#For Decision Tree implementation
from scipy.stats import entropy
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

#For KNN implementation
from sklearn.neighbors import KNeighborsClassifier

#For Bagging implementation
from sklearn.ensemble import BaggingClassifier

#For AdaBoost implementation
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB

#For Random Forest implementation
from sklearn.ensemble import RandomForestClassifier

#For Baseline implementation
from sklearn.dummy import DummyClassifier

#For Logistic Regression
from sklearn.linear_model import LogisticRegression

#For Ensemble
from sklearn.ensemble import VotingClassifier

#Settings
%matplotlib inline
pd.options.display.float_format = '{:.2f}'.format
np.set_printoptions(threshold=np.nan)
sns.set()

def printModelAccuracy(y_test, y_pred, labels=(1, 2, 3, 4, 5)):
    """Print the confusion matrix, accuracy and micro-averaged F1 of a prediction.

    Parameters
    ----------
    y_test : array-like
        True class labels.
    y_pred : array-like
        Predicted class labels, aligned with ``y_test``.
    labels : sequence, optional
        Class labels, in the order used for the rows and columns of the
        confusion matrix. Defaults to the five classes 1-5.

    Returns
    -------
    None
        The metrics are printed, not returned.

    Notes
    -----
    For single-label multiclass predictions the micro-averaged F1 coincides
    with plain accuracy, so the two printed scores are expected to match.
    """
    labels = list(labels)

    # Rows are the true classes, columns the predicted classes
    cm = pd.DataFrame(confusion_matrix(y_test, y_pred, labels=labels),
        index=['true:{}'.format(label) for label in labels],
        columns=['pred:{}'.format(label) for label in labels])
    print("Confusion Matrix:")
    print(cm)

    # Find the accuracy and F1 score of the result
    asr = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='micro')
    print("Accuracy:", asr)
    print("F1:", f1)

    # Log loss is intentionally not reported here: log_loss expects predicted
    # class probabilities (predict_proba output), not the hard labels in y_pred.
    
# Load the pickled X/y train and test objects (paths are relative to the working directory)
X_test, X_train, y_test, y_train = (
    pd.read_pickle(path) for path in ("X_test", "X_train", "y_test", "y_train")
)

<h1>Random Forest</h1>

In [2]:
# Base estimator to tune. A fixed random_state makes the bootstrap sampling and
# feature sub-sampling repeatable, so re-running the cell gives the same scores.
rf_base = RandomForestClassifier(random_state=42)

# Hyperparameter grid explored by the cross-validated search
parameters = { 
    'n_estimators': [800, 1400], #500, 900 1000 not good
    'criterion': ['entropy'] #gini not good
}

# 5-fold grid search scored by micro-averaged F1. The search object is kept
# under the name `randomforest`.
# y_train.values.ravel() flattens the labels to 1-D
# (assumes y_train is a single-column DataFrame — TODO confirm).
randomforest = GridSearchCV(rf_base, cv=5, param_grid=parameters, scoring='f1_micro')
randomforest.fit(X_train, y_train.values.ravel())

#Use the trained model to predict the test data
y_pred = randomforest.predict(X_test)

# Find the confusion matrix, the accuracy, and F1 score of the result
printModelAccuracy(y_test, y_pred)

# Best hyperparameters to use for model
print("Best Parameters:", randomforest.best_params_)

Confusion Matrix:
        pred:1  pred:2  pred:3  pred:4  pred:5
true:1     344     230     171      32      12
true:2     285     257     184      48      28
true:3     237     193     167      74     115
true:4     159     131     118      74     317
true:5     101      49      52      34     500
Accuracy: 0.3430470347648262
F1: 0.3430470347648262
Best Parameters: {'criterion': 'entropy', 'n_estimators': 1400}


<h1>Logistic Regression</h1>

In [7]:
# Single-point grid: only C=0.1 is evaluated
parameters = {'C': [0.1]}

# Multinomial logistic regression (lbfgs solver, up to 1000 iterations) wrapped
# in a 5-fold grid search scored by micro-averaged F1.
# NOTE(review): newer scikit-learn releases deprecate `multi_class` — confirm
# against the installed version before upgrading.
log_reg = GridSearchCV(
    LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000),
    param_grid=parameters,
    scoring='f1_micro',
    cv=5,
)
log_reg.fit(X_train, y_train.values.ravel())

# Predict the test data, then report confusion matrix, accuracy and F1
y_pred = log_reg.predict(X_test)
printModelAccuracy(y_test, y_pred)

# Best hyperparameters found by the search
print("Best Parameters:", log_reg.best_params_)

Confusion Matrix:
        pred:1  pred:2  pred:3  pred:4  pred:5
true:1     458     203     105      21       2
true:2     374     256     133      31       8
true:3     277     194     186      99      30
true:4     174     122     105     212     186
true:5     104      48      49     117     418
Accuracy: 0.3911042944785276
F1: 0.39110429447852757
Best Parameters: {'C': 0.1}


<h1>Ensemble (Hard voting with all models)</h1>

As I mentioned in lecture, it is possible to ensemble different models. So how can we do that in Python? Check out the following link and try it for your project:
https://machinelearningmastery.com/ensemble-machine-learning-algorithms-python-scikit-learn/ 

In [2]:
# All stochastic estimators below get a fixed random_state so that re-running
# the cell reproduces the same models and scores.

#Adaboost(DecisionTree) with best parameters
adaboostTree = AdaBoostClassifier(
    DecisionTreeClassifier(criterion='gini', max_depth=9, splitter='random', random_state=42),
    learning_rate=0.001, n_estimators=300, random_state=42)
adaboostTree.fit(X_train, y_train.values.ravel())

#random forest with best parameters
randomforest = RandomForestClassifier(criterion='entropy', n_estimators=1400, random_state=42)
randomforest.fit(X_train, y_train.values.ravel())

#logistic regression with best parameters (multinomial, lbfgs solver)
log_reg = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000, C=0.1)
# Fit it standalone like the two models above. VotingClassifier fits clones of
# its estimators, so without this call `log_reg` would still be unfitted after
# the cell runs, and any later use of it (e.g. pickling it) would get an
# untrained model.
log_reg.fit(X_train, y_train.values.ravel())

#create a list of (name, estimator) pairs for the voting classifier
estimators=[('adaboostTree', adaboostTree),
            ('randomforest', randomforest), 
            ('log_reg', log_reg)]

#create our voting classifier, inputting our models, voting hard means asking classifers to make predictions by majority vote
ensemble = VotingClassifier(estimators, voting='hard')

#fit model to training data (this refits a clone of each estimator)
ensemble.fit(X_train, y_train.values.ravel())

#Use the trained model to predict the test data
y_pred = ensemble.predict(X_test)

# Find the confusion matrix, the accuracy, and F1 score of the result
printModelAccuracy(y_test, y_pred)

Confusion Matrix:
        pred:1  pred:2  pred:3  pred:4  pred:5
true:1     567     170      36      11       5
true:2     500     197      70      24      11
true:3     378     134     139      95      40
true:4     224     105     107     154     209
true:5     134      45      30      67     460
Accuracy: 0.38778118609406953
F1: 0.38778118609406953


<h1>Save the best model</h1>

In [20]:
#save classifier
# NOTE(review): `log_reg` is assigned in more than one cell above — make sure it
# refers to a fitted model when this cell runs.
# The with-block closes the file even if pickle.dump raises.
with open("logregmodel.pickle", "wb") as save_classifier: #binary write
    pickle.dump(log_reg, save_classifier)