In [1]:
#import library
import os
import numpy as np
import pickle
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import pandas as pd
import matplotlib.pyplot as plt
#import cv2
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline

2023-04-22 23:38:11.080535: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [7]:
# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
# Read the augmented dataset back from disk; the pickle unpacks into the
# train/test feature arrays and their label arrays (four objects in total).
# NOTE: pickle.load can execute arbitrary code -- only open files you trust.
with open('../data/external/combined_augmented_data_v2.pkl','rb') as pkl_file:
    X_train, y_train, X_test, y_test = pickle.load(pkl_file)

In [8]:
# Show the dimensions of every split, one line per array
print(
    f'X_train Shape: {X_train.shape}',
    f'y_train Shape: {y_train.shape}',
    f'X_test Shape: {X_test.shape}',
    f'y_test Shape: {y_test.shape}',
    sep='\n',
)

X_train Shape: (109820, 28, 28)
y_train Shape: (109820,)
X_test Shape: (28688, 28, 28)
y_test Shape: (28688,)


In [9]:
#function to find the indices given a label
def find_indices(data, label):
    """Locate every position in ``data`` whose value equals ``label``.

    Parameters
    ----------
    data : np.ndarray or pd.Series
        Container of values to search. Subclasses of either type are accepted.
    label : scalar
        Value compared element-wise against ``data`` using ``==``.

    Returns
    -------
    tuple of np.ndarray
        For array input: the result of ``np.where(data == label)``, i.e. one
        index array per dimension of ``data``.
    pd.Index
        For Series input: the index labels of the matching entries.

    Raises
    ------
    TypeError
        If ``data`` is neither a NumPy array nor a pandas Series.
    """
    # isinstance instead of an exact type() comparison so subclasses of
    # ndarray / Series are handled rather than rejected
    if isinstance(data, np.ndarray):
        return np.where(data == label)
    if isinstance(data, pd.Series):
        return data[data == label].index
    # TypeError derives from Exception, so existing `except Exception`
    # handlers around this function keep working
    raise TypeError('Not supported data type for this function.')

In [10]:
#letters used as class names (the list contains no 'J' and no 'Z')
letters = ['A','B','C','D','E','F','G','H','I','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y']
#numbers: the distinct integer class ids present in the training labels
#(np.unique already returns them in ascending order, so no extra sort is needed)
numbers = list(np.unique(y_train.astype(int)))
#zip() stops at the shorter input, so a missing or extra class id would
#silently shift or drop letters -- fail loudly instead
if len(numbers) != len(letters):
    raise ValueError(
        f'Found {len(numbers)} distinct class ids in y_train but {len(letters)} letters.'
    )
#dictionary of labels: class id -> letter
labels = dict(zip(numbers, letters))

In [11]:
# Flatten every sample to a single row: (num_samples, ...) -> (num_samples, features).
# The label arrays are used as loaded, so they are not reassigned here.
X_train = X_train.reshape(X_train.shape[0], -1)
X_test = X_test.reshape(X_test.shape[0], -1)
# Print the shapes after flattening
print(f'X_train shape: {X_train.shape}')
print(f'y_train shape: {y_train.shape}')
print(f'X_test shape: {X_test.shape}')
print(f'y_test shape: {y_test.shape}')

X_train shape: (109820, 784)
y_train shape: (109820,)
X_test shape: (28688, 784)
y_test shape: (28688,)


In [13]:
# Standardised features.
# The scaler is fitted on the training split only; the same fitted scaler is
# then applied to both the training and the test split.
sc = StandardScaler()
sc.fit(X_train)
X_train_sc = sc.transform(X_train)
X_test_sc = sc.transform(X_test)

In [12]:
# Normalised features: every value divided by 255
# (presumably 8-bit pixel intensities, giving a 0-1 range -- TODO confirm)
X_train_norm, X_test_norm = X_train / 255, X_test / 255

#### Applying Naive Bayes

In [14]:
# Gaussian naive Bayes classifier created with its default arguments;
# the evaluation cells below each re-fit this same object before predicting.
gnb = GaussianNB()

In [17]:
# Naive Bayes on the normalised features: fit on the training split,
# then score the predictions on that same training split
gnb.fit(X_train_norm, y_train)
y_pred_nm = gnb.predict(X_train_norm)

print(f"Accuracy: {accuracy_score(y_train, y_pred_nm)}")
print("Classification report:")
print(
    classification_report(
        y_train,
        y_pred_nm,
        target_names=letters,
    )
)

Accuracy: 0.21761063558550356
Classification report:
              precision    recall  f1-score   support

           A       0.45      0.19      0.27      4504
           B       0.46      0.19      0.27      4040
           C       0.63      0.43      0.51      4576
           D       0.37      0.08      0.14      4784
           E       0.27      0.34      0.30      3828
           F       0.30      0.13      0.18      4816
           G       0.47      0.29      0.36      4360
           H       0.67      0.13      0.22      4052
           I       0.18      0.38      0.24      4648
           K       0.25      0.30      0.27      4456
           L       0.38      0.25      0.30      4964
           M       0.34      0.07      0.12      4220
           N       0.43      0.04      0.08      4604
           O       0.39      0.26      0.31      4784
           P       0.10      0.65      0.17      4352
           Q       0.40      0.28      0.33      5116
           R       0.18     

In [18]:
# Naive Bayes on the normalised features: fit on the training split,
# then score the predictions on the test split
gnb.fit(X_train_norm, y_train)
y_pred_nm = gnb.predict(X_test_norm)

print(f"Accuracy: {accuracy_score(y_test, y_pred_nm)}")
print("Classification report:")
print(
    classification_report(
        y_test,
        y_pred_nm,
        target_names=letters,
    )
)

Accuracy: 0.19739960959286112
Classification report:
              precision    recall  f1-score   support

           A       0.44      0.17      0.24      1324
           B       0.58      0.15      0.24      1728
           C       0.55      0.31      0.40      1240
           D       0.35      0.09      0.15       980
           E       0.38      0.30      0.34      1992
           F       0.29      0.14      0.19       988
           G       0.43      0.30      0.35      1392
           H       0.79      0.12      0.21      1744
           I       0.11      0.28      0.16      1152
           K       0.25      0.27      0.26      1324
           L       0.30      0.21      0.25       836
           M       0.30      0.05      0.09      1576
           N       0.28      0.04      0.07      1164
           O       0.28      0.21      0.24       984
           P       0.13      0.71      0.22      1388
           Q       0.21      0.25      0.22       656
           R       0.09     

In [19]:
# Naive Bayes on the standardised features: fit on the training split,
# then score the predictions on that same training split
gnb.fit(X_train_sc, y_train)
y_pred_sc = gnb.predict(X_train_sc)

print(f"Accuracy: {accuracy_score(y_train, y_pred_sc)}")
print("Classification report:")
print(
    classification_report(
        y_train,
        y_pred_sc,
        target_names=letters,
    )
)

Accuracy: 0.21761063558550356
Classification report:
              precision    recall  f1-score   support

           A       0.45      0.19      0.27      4504
           B       0.46      0.19      0.27      4040
           C       0.63      0.43      0.51      4576
           D       0.37      0.08      0.14      4784
           E       0.27      0.34      0.30      3828
           F       0.30      0.13      0.18      4816
           G       0.47      0.29      0.36      4360
           H       0.67      0.13      0.22      4052
           I       0.18      0.38      0.24      4648
           K       0.25      0.30      0.27      4456
           L       0.38      0.25      0.30      4964
           M       0.34      0.07      0.12      4220
           N       0.43      0.04      0.08      4604
           O       0.39      0.26      0.31      4784
           P       0.10      0.65      0.17      4352
           Q       0.40      0.28      0.33      5116
           R       0.18     

In [20]:
# Naive Bayes on the standardised features: fit on the training split,
# then score the predictions on the test split
gnb.fit(X_train_sc, y_train)
y_pred_sc = gnb.predict(X_test_sc)

print(f"Accuracy: {accuracy_score(y_test, y_pred_sc)}")
print("Classification report:")
print(
    classification_report(
        y_test,
        y_pred_sc,
        target_names=letters,
    )
)

Accuracy: 0.19739960959286112
Classification report:
              precision    recall  f1-score   support

           A       0.44      0.17      0.24      1324
           B       0.58      0.15      0.24      1728
           C       0.55      0.31      0.40      1240
           D       0.35      0.09      0.15       980
           E       0.38      0.30      0.34      1992
           F       0.29      0.14      0.19       988
           G       0.43      0.30      0.35      1392
           H       0.79      0.12      0.21      1744
           I       0.11      0.28      0.16      1152
           K       0.25      0.27      0.26      1324
           L       0.30      0.21      0.25       836
           M       0.30      0.05      0.09      1576
           N       0.28      0.04      0.07      1164
           O       0.28      0.21      0.24       984
           P       0.13      0.71      0.22      1388
           Q       0.21      0.25      0.22       656
           R       0.09     

### Logistic Regression

In [21]:
# Logistic-regression classifier with the solver's iteration cap set to 5000;
# the evaluation cells below each re-fit this same object before predicting.
lr = LogisticRegression(max_iter=5000)

In [22]:
# Logistic regression on the standardised features: fit on the training
# split, then score the predictions on the test split
lr.fit(X_train_sc, y_train)
y_pred_lr_sc = lr.predict(X_test_sc)

print(f"Accuracy: {accuracy_score(y_test, y_pred_lr_sc)}")
print("Classification report:")
print(
    classification_report(
        y_test,
        y_pred_lr_sc,
        target_names=letters,
    )
)


Accuracy: 0.5139431121026213
Classification report:
              precision    recall  f1-score   support

           A       0.56      0.64      0.60      1324
           B       0.75      0.65      0.70      1728
           C       0.75      0.71      0.73      1240
           D       0.43      0.53      0.48       980
           E       0.73      0.57      0.64      1992
           F       0.48      0.52      0.50       988
           G       0.62      0.57      0.59      1392
           H       0.78      0.57      0.66      1744
           I       0.42      0.45      0.43      1152
           K       0.55      0.52      0.53      1324
           L       0.50      0.74      0.60       836
           M       0.50      0.40      0.45      1576
           N       0.36      0.38      0.37      1164
           O       0.57      0.54      0.55       984
           P       0.77      0.72      0.75      1388
           Q       0.48      0.70      0.57       656
           R       0.10      

In [23]:
# Logistic regression on the standardised features: fit on the training
# split, then score the predictions on that same training split
lr.fit(X_train_sc, y_train)
y_pred_lr = lr.predict(X_train_sc)

print(f"Accuracy: {accuracy_score(y_train, y_pred_lr)}")
print("Classification report:")
print(
    classification_report(
        y_train,
        y_pred_lr,
        target_names=letters,
    )
)


Accuracy: 0.6823711527954835
Classification report:
              precision    recall  f1-score   support

           A       0.71      0.75      0.73      4504
           B       0.77      0.78      0.77      4040
           C       0.87      0.87      0.87      4576
           D       0.64      0.60      0.62      4784
           E       0.74      0.70      0.72      3828
           F       0.69      0.67      0.68      4816
           G       0.77      0.76      0.76      4360
           H       0.72      0.68      0.70      4052
           I       0.66      0.70      0.68      4648
           K       0.68      0.71      0.69      4456
           L       0.67      0.74      0.70      4964
           M       0.64      0.58      0.61      4220
           N       0.64      0.61      0.63      4604
           O       0.78      0.77      0.77      4784
           P       0.75      0.79      0.77      4352
           Q       0.77      0.79      0.78      5116
           R       0.57      

In [24]:
# Normalised features: every value divided by 255.
# This repeats the computation of the earlier normalisation cell, so the
# logistic-regression cells below can run without relying on that cell.
X_train_norm, X_test_norm = X_train / 255, X_test / 255

In [25]:
# Logistic regression on the normalised features: fit on the training
# split, then score the predictions on the test split
lr.fit(X_train_norm, y_train)
y_pred_lr_nm = lr.predict(X_test_norm)

print(f"Accuracy: {accuracy_score(y_test, y_pred_lr_nm)}")
print("Classification report:")
print(
    classification_report(
        y_test,
        y_pred_lr_nm,
        target_names=letters,
    )
)


Accuracy: 0.5329754601226994
Classification report:
              precision    recall  f1-score   support

           A       0.57      0.65      0.61      1324
           B       0.77      0.66      0.71      1728
           C       0.75      0.71      0.73      1240
           D       0.47      0.55      0.50       980
           E       0.74      0.58      0.65      1992
           F       0.49      0.56      0.52       988
           G       0.63      0.58      0.61      1392
           H       0.81      0.58      0.68      1744
           I       0.44      0.47      0.45      1152
           K       0.54      0.49      0.51      1324
           L       0.51      0.78      0.62       836
           M       0.55      0.44      0.49      1576
           N       0.37      0.39      0.38      1164
           O       0.60      0.55      0.57       984
           P       0.80      0.78      0.79      1388
           Q       0.50      0.69      0.58       656
           R       0.11      

In [26]:
# Logistic regression on the normalised features: fit on the training
# split, then score the predictions on that same training split
lr.fit(X_train_norm, y_train)
y_pred_lr_nm = lr.predict(X_train_norm)

print(f"Accuracy: {accuracy_score(y_train, y_pred_lr_nm)}")
print("Classification report:")
print(
    classification_report(
        y_train,
        y_pred_lr_nm,
        target_names=letters,
    )
)


Accuracy: 0.6718175195774905
Classification report:
              precision    recall  f1-score   support

           A       0.70      0.75      0.73      4504
           B       0.76      0.77      0.77      4040
           C       0.87      0.86      0.86      4576
           D       0.63      0.58      0.61      4784
           E       0.73      0.69      0.71      3828
           F       0.68      0.66      0.67      4816
           G       0.76      0.75      0.75      4360
           H       0.71      0.67      0.69      4052
           I       0.64      0.69      0.66      4648
           K       0.67      0.69      0.68      4456
           L       0.65      0.73      0.69      4964
           M       0.64      0.57      0.60      4220
           N       0.64      0.60      0.62      4604
           O       0.77      0.76      0.76      4784
           P       0.74      0.78      0.76      4352
           Q       0.76      0.77      0.77      5116
           R       0.56      