<h1>Import all libraries and read the explored data into DataFrames</h1>

In [None]:
import io
import re
import sys

#General libraries needed
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#Libraries for model selection, evaluation metrics and data pre-processing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn import preprocessing

#Libraries for probability calibration and log-loss evaluation
from sklearn.datasets import make_blobs
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import log_loss

#For Decision Tree implementation
from scipy.stats import entropy
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

#For KNN implementation
from sklearn.neighbors import KNeighborsClassifier

#For Bagging implementation
from sklearn.ensemble import BaggingClassifier

#For AdaBoost implementation
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB

#For Random Forest implementation
from sklearn.ensemble import RandomForestClassifier

#For Baseline implementation
from sklearn.dummy import DummyClassifier

#For Logistic Regression
from sklearn.linear_model import LogisticRegression

#For Ensemble
from sklearn.ensemble import VotingClassifier

#Settings
%matplotlib inline
pd.options.display.float_format = '{:.2f}'.format
np.set_printoptions(threshold=np.nan)
sns.set()

def printModelAccuracy(y_test, y_pred, labels=(1, 2, 3, 4, 5)):
    """Print the confusion matrix, accuracy and micro-averaged F1 of a prediction.

    Parameters
    ----------
    y_test : array-like
        True class labels.
    y_pred : array-like
        Predicted class labels for the same samples.
    labels : sequence, optional
        Class labels, in the order used for the rows and columns of the
        confusion matrix. Defaults to the five classes 1..5 that were
        previously hard-coded, so existing two-argument calls are unchanged.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    labels = list(labels)

    # Rows are the true classes, columns the predicted classes.
    cm = pd.DataFrame(
        confusion_matrix(y_test, y_pred, labels=labels),
        index=['true:{}'.format(label) for label in labels],
        columns=['pred:{}'.format(label) for label in labels],
    )
    print("Confusion Matrix:")
    print(cm)

    # Find the accuracy and F1 score of the result
    asr = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='micro')
    print("Accuracy:", asr)
    print("F1:", f1)
    # Log loss is deliberately not reported: it is computed from class
    # probabilities (predict_proba), not from the hard labels in y_pred.
    
# Load the pickled train/test frames from the working directory.
# NOTE: unpickling executes arbitrary code -- only load files you produced yourself.
X_test, X_train, y_test, y_train = (
    pd.read_pickle(split_name)
    for split_name in ("X_test", "X_train", "y_test", "y_train")
)

<h1>Random Forest</h1>

In [None]:
# Hyperparameter grid: only the number of trees is searched.
parameters = {
    'n_estimators': [800, 1000]
}

# 3-fold grid search scored by micro-averaged F1. The forest is seeded so the
# selected parameters and the reported scores are the same on every
# Restart & Run All; n_jobs=-1 builds the trees on all available cores.
randomforest = GridSearchCV(
    RandomForestClassifier(random_state=42, n_jobs=-1),
    param_grid=parameters,
    cv=3,
    scoring='f1_micro',
)

# Fit on the training features; the labels are flattened to a 1-D array.
randomforest.fit(X_train, y_train.values.ravel())

# Predict the test data with the best estimator found by the search
# (GridSearchCV refits it on the full training set by default).
y_pred = randomforest.predict(X_test)

# Find the confusion matrix, the accuracy, and F1 score of the result
printModelAccuracy(y_test, y_pred)

# Best hyperparameters to use for model
print("Best Parameters:", randomforest.best_params_)

<h1>Logistic Regression</h1>

In [None]:
# Multinomial logistic regression. The multinomial option needs a solver that
# supports it ('lbfgs', 'sag' or 'newton-cg'); 'lbfgs' is used here with the
# iteration limit set to 1000.
log_reg = LogisticRegression(
    solver='lbfgs',
    multi_class='multinomial',
    max_iter=1000,
)

# Train on the training split; the labels are flattened to a 1-D array.
log_reg.fit(X_train, y_train.values.ravel())

# Predict the held-out test split and report confusion matrix, accuracy and F1.
y_pred = log_reg.predict(X_test)
printModelAccuracy(y_test, y_pred)

# regularisation C -- note left by the author; presumably a reminder to tune
# the regularisation strength (TODO confirm)

<h1>Ensemble (Hard voting with all models)</h1>

As I mentioned in the lecture, it is possible to ensemble different models. So how can we do that in Python? Check out the following link and try it for your project:
https://machinelearningmastery.com/ensemble-machine-learning-algorithms-python-scikit-learn/ 

In [None]:
# Flatten the training labels once; every estimator below is fitted on them.
y_train_flat = y_train.values.ravel()

# The bagging, boosting and forest models below are seeded (random_state=42)
# so the ensemble gives the same result on every Restart & Run All.
# NOTE: VotingClassifier.fit trains its own clones of the estimators it is
# given, so the individual .fit calls here only matter if the standalone
# models are inspected or reused elsewhere.

#knn with best parameters
knn = KNeighborsClassifier(algorithm='auto', leaf_size=3, n_jobs=-1, n_neighbors=9)
knn.fit(X_train, y_train_flat)

#baggingTree with best parameters
baggingTree = BaggingClassifier(DecisionTreeClassifier(max_depth=8), max_features=0.7,
                                max_samples=0.5, n_estimators=100, random_state=42)
baggingTree.fit(X_train, y_train_flat)

#baggingknn with best parameters
baggingknn = BaggingClassifier(knn, max_features=0.5, max_samples=0.7,
                               n_estimators=200, random_state=42)
baggingknn.fit(X_train, y_train_flat)

#Adaboost(DecisionTree) with best parameters
adaboostTree = AdaBoostClassifier(DecisionTreeClassifier(criterion='gini', max_depth=8, splitter='best'),
                                  learning_rate=2, n_estimators=1, random_state=42)
adaboostTree.fit(X_train, y_train_flat)

#random forest with best parameters
randomforest = RandomForestClassifier(criterion='gini', max_depth=6, max_features='log2',
                                      n_estimators=500, random_state=42)
randomforest.fit(X_train, y_train_flat)

#create the list of (name, model) pairs for the voting classifier
# log_reg is the logistic regression model created in the Logistic Regression
# cell, so that cell must have been run before this one.
estimators=[('knn', knn), 
            ('baggingTree', baggingTree),
            ('baggingknn', baggingknn),
            ('adaboostTree', adaboostTree),
            #('naivebayes', naivebayes),
            #('adaboostnaivebayes', adaboostnaivebayes),
            ('randomforest', randomforest), 
            ('log_reg', log_reg)]

#create our voting classifier, inputting our models, voting hard means asking classifiers to make predictions by majority vote
ensemble = VotingClassifier(estimators, voting='hard')

#fit model to training data
ensemble.fit(X_train, y_train_flat)

#Use the trained model to predict the test data
y_pred = ensemble.predict(X_test)

# Find the confusion matrix, the accuracy, and F1 score of the result
printModelAccuracy(y_test, y_pred)