## Imports

In [3]:
import numpy as np
import matplotlib.pyplot as plt
import struct
import time

# from sklearn import metrics

import pandas as pd
import seaborn as sns

from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
# from sklearn.preprocessing import StandardScaler
# from sklearn.preprocessing import Binarizer, PowerTransformer, Normalizer
# from sklearn.preprocessing import QuantileTransformer
from sklearn.model_selection import KFold, GridSearchCV
# from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LogisticRegression
# from xgboost import XGBRegressor
# from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

import pickle

## Functions

## Data

In [4]:
# Load the MNIST training images and labels from their IDX files.
#
# IDX layout (big-endian unsigned 32-bit header fields, then uint8 payload):
#   images file: magic, count, rows, cols -> 16-byte header, one byte per pixel
#   labels file: magic, count             ->  8-byte header, one byte per label
#
# The previous version of this cell also read 16 header bytes from the labels
# file. That swallowed the first 8 labels (and overwrote nrows/ncols with
# them), and the first 8 images were then dropped to re-align both arrays.
# The label header is now read correctly and the same 8 leading samples are
# skipped explicitly, so data_images and data_labels hold exactly the same
# values as before.
# NOTE(review): presumably the pickled models loaded later in this notebook
# were fitted on this reduced sample set - confirm before setting
# N_SKIPPED_SAMPLES to 0 to use every sample.
N_SKIPPED_SAMPLES = 8

# images: array of shape (count - N_SKIPPED_SAMPLES, nrows, ncols), dtype uint8
with open('train-images.idx3-ubyte', 'rb') as f:
    magic, size  = struct.unpack(">II", f.read(8))
    nrows, ncols = struct.unpack(">II", f.read(8))
    data_images  = np.fromfile(f, dtype=np.uint8)
    data_images  = data_images.reshape((size, nrows, ncols))
    data_images  = data_images[N_SKIPPED_SAMPLES:]

# labels of the images loaded above, stored as strings ('0' ... '9')
with open('train-labels.idx1-ubyte', 'rb') as f:
    magic, size = struct.unpack(">II", f.read(8))
    data_labels = np.fromfile(f, dtype=np.uint8)
    data_labels = data_labels[N_SKIPPED_SAMPLES:].astype('str')

# every image must have exactly one label
assert len(data_images) == len(data_labels), "images and labels are misaligned"

## ETL and batching 

In [5]:
# Flatten every image into a single feature row and split into train and test
# sets. The row count is taken from the array itself (and the row width is
# inferred with -1) instead of being hard-coded, so this cell keeps working if
# the number or size of the loaded images changes.
# train_size=0.005 puts only 0.5% of the samples in the train split; all the
# remaining samples go to the test split.
X_train, X_test, y_train, y_test = train_test_split(data_images.reshape((len(data_images), -1)), data_labels,
                                                    train_size=0.005, random_state=42)

## Model Loading

In [7]:
# loading the trained models
def _load_pickled_model(path):
    """Unpickle the object stored at `path`, closing the file afterwards.

    SECURITY: unpickling can execute arbitrary code, so only load files that
    come from a trusted source.
    """
    with open(path, 'rb') as model_file:
        return pickle.load(model_file)


lr_model   = _load_pickled_model('LogisticRegression.sav')

dt_model   = _load_pickled_model('DecisionTree.sav')

rf_model   = _load_pickled_model('RandomForest.sav')

xgrb_model = _load_pickled_model('XGRB.sav')

## Model Prediction

In [8]:
# Run each loaded model on both partitions, test split first and train split
# second: y_pred_* holds the test-set predictions, y_train_* the train-set ones.

# Logistic Regression
y_pred_lr, y_train_lr     = lr_model.predict(X_test), lr_model.predict(X_train)

# Decision Tree
y_pred_dt, y_train_dt     = dt_model.predict(X_test), dt_model.predict(X_train)

# Random Forest
y_pred_rf, y_train_rf     = rf_model.predict(X_test), rf_model.predict(X_train)

# XG Boost
y_pred_xgrb, y_train_xgrb = xgrb_model.predict(X_test), xgrb_model.predict(X_train)

## Prediction Score and Result Dataframe

In [24]:
# Accuracy of every model on the test and train splits, rounded to 2 decimals.
def _accuracy_2dp(y_true, y_hat):
    """Return the accuracy of `y_hat` against `y_true`, rounded to two decimals."""
    return round(metrics.accuracy_score(y_true, y_hat), 2)


# Logistic Regression
accuracy_lr         = _accuracy_2dp(y_test, y_pred_lr)
accuracy_lr_train   = _accuracy_2dp(y_train, y_train_lr)

# Decision Tree
accuracy_dt         = _accuracy_2dp(y_test, y_pred_dt)
accuracy_dt_train   = _accuracy_2dp(y_train, y_train_dt)

# Random Forest
accuracy_fr         = _accuracy_2dp(y_test, y_pred_rf)
accuracy_fr_train   = _accuracy_2dp(y_train, y_train_rf)

# XG Boost: the string labels are cast to float64 before being compared with
# this model's predictions
accuracy_xgbr       = _accuracy_2dp(y_test.astype(np.float64), y_pred_xgrb)
accuracy_xgbr_train = _accuracy_2dp(y_train.astype(np.float64), y_train_xgrb)

In [25]:
# Summary table: one row per model with its accuracy on the test and train splits.
model_accuracy = pd.DataFrame(
    [
        ("LR",   accuracy_lr,   accuracy_lr_train),
        ("DT",   accuracy_dt,   accuracy_dt_train),
        ("RF",   accuracy_fr,   accuracy_fr_train),
        ("XGRB", accuracy_xgbr, accuracy_xgbr_train),
    ],
    columns=["Model", "Test_AC", "Train_AC"],
)

In [52]:
# Weighted-average F1 of every model on the test split, rounded to 2 decimals.
def _weighted_f1_2dp(y_true, y_hat):
    """Return the weighted-average F1 of `y_hat` against `y_true`, rounded to two decimals."""
    return round(metrics.f1_score(y_true, y_hat, average='weighted'), 2)


f1_lr   = _weighted_f1_2dp(y_test, y_pred_lr)

f1_dt   = _weighted_f1_2dp(y_test, y_pred_dt)

f1_fr   = _weighted_f1_2dp(y_test, y_pred_rf)

# XG Boost: the string labels are cast to float64 before being compared with
# this model's predictions
f1_xgbr = _weighted_f1_2dp(y_test.astype(np.float64), y_pred_xgrb)

In [53]:
# Summary table: one row per model with its weighted F1 score on the test split.
model_f1 = pd.DataFrame(
    [
        ("LR",   f1_lr),
        ("DT",   f1_dt),
        ("RF",   f1_fr),
        ("XGRB", f1_xgbr),
    ],
    columns=["Model", "Test_F1"],
)

In [42]:
# One dataframe per model pairing the true test labels with the predictions.
def _prediction_frame(y_true, y_hat):
    """Pair true and predicted digits and flag the misclassified rows.

    Both inputs are cast to int. The returned frame has the columns 'real'
    and 'pred', plus 'pred_false', which is set to 1 on rows where the two
    differ and left missing (NaN) on correctly classified rows.
    """
    frame = pd.DataFrame({'real': y_true.astype(int), 'pred': y_hat.astype(int)})
    frame.loc[frame['pred'] != frame['real'], 'pred_false'] = 1
    return frame


# Logistic Regression
pred_lr   = _prediction_frame(y_test, y_pred_lr)

# Decision Tree
pred_dt   = _prediction_frame(y_test, y_pred_dt)

# Random Forest
pred_rf   = _prediction_frame(y_test, y_pred_rf)

# XG Boost
pred_xgrb = _prediction_frame(y_test, y_pred_xgrb)

In [43]:
# Number of misclassified samples per true digit, one frame per model
# (Logistic Regression, Decision Tree, Random Forest, XG Boost - in that order).
# Dropping 'pred' leaves only the 'pred_false' flag to be summed within each
# 'real' group.
pred_lr_ms, pred_dt_ms, pred_rf_ms, pred_xgrb_ms = (
    frame.drop(columns=['pred']).groupby(['real']).sum()
    for frame in (pred_lr, pred_dt, pred_rf, pred_xgrb)
)

In [56]:
# Text classification report (per-class precision / recall / F1) for each model.

# Logistic Regression, Decision Tree and Random Forest are scored against the
# string labels as they are.
pred_lr_cr, pred_dt_cr, pred_rf_cr = (
    metrics.classification_report(y_test, y_hat)
    for y_hat in (y_pred_lr, y_pred_dt, y_pred_rf)
)

# XG Boost: the string labels are cast to float64 before being compared with
# this model's predictions
pred_xgrb_cr = metrics.classification_report(y_test.astype(np.float64), y_pred_xgrb)

In [57]:
# The report is one multi-line string: print it so the table renders line by
# line instead of showing up as a quoted repr with escaped "\n" sequences.
print(pred_xgrb_cr)

'              precision    recall  f1-score   support\n\n         0.0       0.84      0.92      0.88      5891\n         1.0       0.87      0.87      0.87      6715\n         2.0       0.79      0.70      0.74      5926\n         3.0       0.75      0.81      0.78      6095\n         4.0       0.70      0.84      0.77      5803\n         5.0       0.79      0.65      0.71      5395\n         6.0       0.91      0.75      0.82      5899\n         7.0       0.86      0.83      0.85      6231\n         8.0       0.68      0.78      0.73      5815\n         9.0       0.76      0.74      0.75      5923\n\n    accuracy                           0.79     59693\n   macro avg       0.80      0.79      0.79     59693\nweighted avg       0.80      0.79      0.79     59693\n'