## Importing Libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix

## Importing the Dataset

In [2]:
#Reading the data from CSV file
data=pd.read_csv('A_Z Handwritten Data.csv')

#Generating the features
X=data.iloc[:,1:].values

#Generating the output values
Y=data.iloc[:,0].values

## Scaling the Dataset

In [3]:
#Converting the values into 0-1 with threshold 128
X=X//128

## Splitting the data into train and test

In [4]:
#Splitting the data into 80% training and 20% test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2,random_state=0)

## Training the Naive Bayes model on the Training set

In [5]:
#Creating a dictionary to store the training data information which will be helpful in reducing the time complexity for predicting new values
info = {}

#Storing the size of training data
info["total_data"] = len(Y_train)

#List of values which the output can take
class_values = list(range(1,27))

#Running a loop over each class
for current_class in class_values:
    
    #Creating a nested dictionary to store current class information
    info[current_class] = {}
    
    #Taking those indices for which output is current class
    current_indexes = (Y_train == current_class)
    
    #Taking those points for which output is current class
    X_train_current = X_train[current_indexes]
    
    #Calculating the number of features
    num_features = X_train.shape[1]
    
    #Storing the size of data for which output is current class
    info[current_class]["total_count"] = np.sum(current_indexes)
    
    #Running a loop over each feature
    for j in range(1, num_features + 1):
        
        #Creating a nested dictionary to store the information of each feature for current class
        info[current_class][j] = {}
        
        #Running a loop over binary values 0 and 1 which each feature can take
        for i in range(0,2):
            
            #Storing the count of points where output is current class(current_class) and feature is the current feature(j) and its value is current value(i)
            info[current_class][j][i] = np.sum((X_train_current[:, j - 1] == i))

## Predicting the test set results

In [7]:
#Function to calculate the weight of current class given the point is x
def getWeight(x, current_class):
    
    #Variable to store the final value
    output = (info[current_class]["total_count"])/(info["total_data"])
    num_features = len(info[current_class].keys()) - 1
    
    #Running a loop over the features
    for j in range(1, num_features + 1):
        xj = x[j - 1]
        
        #Applying the laplace-smoothening
        count = info[current_class][j][xj] + 1
        count_current_class = info[current_class]["total_count"] + 2
        
        #Multiplying the output by the probability of feature j being xj and class being current_class
        output *= (count/count_current_class)
        
    #Returning the output
    return output

#Function to predict the value of a single point x
def predict(x):
    classes = info.keys()
    best_p = -1
    best_class = -1
    
    #Running a loop over each class
    for current_class in classes:
        if (current_class == "total_data"):
            continue
        
        #Calculating the weight of current class given the input is x
        p_current_class = getWeight(x, current_class)
        
        #Checking if current value is better than the best seen till now
        if (p_current_class > best_p):
            best_p = p_current_class
            best_class = current_class
    return best_class
Y_pred=[]

#Predicting the output of test data
for i in X_test:
    Y_pred.append(predict(i))

#Calculating the number of correct values predicted
correct=np.sum(Y_pred==Y_test)

#Printing the accuracy
print("Accuracy: "+str((correct)*100/len(Y_test)))

Accuracy: 68.39173043361525


## Generating the Confusion Matrix

In [8]:
#A confusion matrix is a technique for summarizing the performance of a classification algorithm.
cm=np.zeros((26,26),dtype=int)

for i in range(len(Y_test)):
    true_value=Y_test[i]
    pred_value=Y_pred[i]
    cm[true_value-1][pred_value-1]+=1
cm

array([[1315,   11,   46,   32,    0,   12,   57,    0,   32,    2,    5,
          19,   24,   41,   16,   12,   26,   35,    0,    2,    0,    5,
           2,   24,   39,    0],
       [  79, 3334,   12,  154,   16,   57,   11,    2,   34,   67,  207,
          33,   42,  255,   81,   43,   12,    1,   33,   63,    0,  179,
           1,   14,    6,    0],
       [  90,    6, 1377,    0,    0,    4,    1,    1,   98,    0,   11,
          21,   24,  219,   35,   24,    3,   11,    1,   31,    0,   28,
           1,   10,   26,    0],
       [ 148,   67,    4, 1362,   37,   56,    2,    8,    4,  134,   95,
          23,   31,   12,   46,    2,   98,   38,   13,   17,    0,   13,
          10,   15,   28,    0],
       [   0,    0,    0,    5,  201,    0,    0,    0,    0,    1,    0,
           0,    0,    0,   15,    0,    1,    0,    1,    1,    0,    0,
           0,    1,    0,    0],
       [  25,   18,    2,   27,    1,  854,    8,    0,   17,    3,    6,
           9,   20,  