In [5]:
import numpy as np
import scipy as sp 
import pandas as pd
import h5py as h5py
import math

In [98]:
def load_dataset(dataset = 0):

    """ Returns the shuffled dataset from the .h5 file and calculates the class frequences. 

    Parameters
    ----------
    dataset : Integer to denoted the type of dataset to be loaded, default = 0

    Returns
    --------
    X :  2-dimensional numpy array of shape (n_samples, n_features) which acts as data.
    Y : 1-dimensional numpy array of shape (n_samples,) which acts as labels.

    """
    ## Dataset A
    if (dataset == 0):
        hf = h5py.File('Dataset/part_A_train.h5', 'r') #read the dataset 
        
        X = np.array(hf['X']) # X data  
        Y = np.array(hf['Y']) # class labels of the form [0,0,0,1,0,,..]
        print(X.shape,Y.shape)

        """ To calculate the class frequencies """

        print("The class frequencies are : ")

        for i in range(Y.shape[1]):
            freq = np.sum(Y[:,i])
            print("The frequency of class " + str(i) + " is " + str(freq) + " / " + str(Y.shape[0]) )

        """ Converting the binary class labels into single valued blabels """
        y = []
        for i in range(Y.shape[0]):
            for j in range(Y.shape[1]):
                if(Y[i,j] == 1):
                    y.append(j)
        
        y = np.array(y)
        Y = y.reshape(-1,1)
        Y = np.squeeze(Y)

        """ Shuffling the dataset """
        np.random.seed(123)
        index = np.random.permutation(X.shape[0])
        np.take(X, index, axis = 0, out = X)
        np.take(Y, index, axis = 0, out = Y)

    ## dataset B
    elif(dataset == 1):

        hf = h5py.File('Dataset/part_B_train.h5', 'r') #read the dataset 

        X = np.array(hf['X']) # X data  
        Y = np.array(hf['Y']) # class labels of the form [0,0,0,1,0,,..]
        print(X.shape,Y.shape)

        """ To calculate the class frequencies """

        print("The class frequencies are : ")

        for i in range(Y.shape[1]):
            freq = np.sum(Y[:,i])
            print("The frequency of class " + str(i) + " is " + str(freq) + " / " + str(Y.shape[0]) )

        """ Converting the binary class labels into single valued blabels """
        y = []
        for i in range(Y.shape[0]):
            for j in range(Y.shape[1]):
                if(Y[i,j] == 1):
                    y.append(j)

        y = np.array(y)
        Y = y.reshape(-1,1)
        Y = np.squeeze(Y)

        """ Shuffling the dataset """
        np.random.seed(123)
        index = np.random.permutation(X.shape[0])
        np.take(X, index, axis = 0, out = X)
        np.take(Y, index, axis = 0, out = Y)

    return X,Y

In [99]:
X,y = load_dataset(0)
print(y)

(4200, 784) (4200, 10)
The class frequencies are : 
The frequency of class 0 is 400.0 / 4200
The frequency of class 1 is 494.0 / 4200
The frequency of class 2 is 393.0 / 4200
The frequency of class 3 is 424.0 / 4200
The frequency of class 4 is 416.0 / 4200
The frequency of class 5 is 398.0 / 4200
The frequency of class 6 is 441.0 / 4200
The frequency of class 7 is 431.0 / 4200
The frequency of class 8 is 410.0 / 4200
The frequency of class 9 is 393.0 / 4200
[0 5 5 ... 4 4 5]


In [100]:
print(X[1])

[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0  16 207 237
 152 194 167 131  98   7   7   7   7   7   4   0   0   0   0   0   0   0
   0   0   0   0   0   0   0  83 254 254 254 254 254 254 254 254 254 254
 254 254 148   0   0   0   0   0   0   0   0   0   0   0   0   0   0  11
 121 112 112 135 240 254 254 254 254 254 254 238  7

In [130]:
class MyGaussianNaiveBayes():
    
    def __init__(self):
        pass
    
    def fit(self, X,y):
        self.X = X
        self.y = y
        self.num_classes = np.unique(y) #number of unique classes in y
        model_parameters = {} #mean and variance of each feature for each class
        
        for idx,class_i in enumerate(self.num_classes):
            X_c = X[np.where(y==class_i)] #all training samples when label = class_i
            
            class_parameters = [] #storing mean and variance of each feature of class_i
            for i in range(X_c.shape[1]):
                feature_mean = X_c[:,i].mean()
                feature_var = X_c[:,i].var()
                class_parameters.append({"mean" : feature_mean, "var" : feature_var})
            model_parameters[class_i] = class_parameters
        self.model_parameters = model_parameters
        print(self.model_parameters)
    
    
    def predict(self, X):
        
        y_pred = []
        
        for k in range(X.shape[0]):
            sample = X[k,:]
            probabilities = []
            for idx,class_i in enumerate(self.num_classes):

                p = list(self.y).count(class_i)/len(self.y)
                print(class_i)
                feature_list = self.model_parameters[class_i]
                epsilon = 10**-4
#                 print(len(feature_list))
#                 break
                for i in range(len(feature_list)):
                    curr_feature = feature_list[i]
                    curr_mean = curr_feature["mean"]
                    curr_var = curr_feature["var"]
                    curr_x = sample[i]
                    gaussian_estimate_c = 1/(2*math.pi*curr_var + epsilon)**0.5
                    gaussian_estimate_exp = np.exp(-1*((curr_x - curr_mean)**2/(2*curr_var + epsilon)))
                    print(gaussian_estimate_c)
                    print(gaussian_estimate_exp)
                    p*=gaussian_estimate_c*gaussian_estimate_exp
#                     print(p)

                probabilities.append(p)
            print(probabilities)
            y_pred.append(np.argmax(probabilities))
        return y_pred
        

In [131]:
nb = MyGaussianNaiveBayes()
nb.fit(X,y)

{0: [{'mean': 0.0, 'var': 0.0}, {'mean': 0.0, 'var': 0.0}, {'mean': 0.0, 'var': 0.0}, {'mean': 0.0, 'var': 0.0}, {'mean': 0.0, 'var': 0.0}, {'mean': 0.0, 'var': 0.0}, {'mean': 0.0, 'var': 0.0}, {'mean': 0.0, 'var': 0.0}, {'mean': 0.0, 'var': 0.0}, {'mean': 0.0, 'var': 0.0}, {'mean': 0.0, 'var': 0.0}, {'mean': 0.0, 'var': 0.0}, {'mean': 0.0, 'var': 0.0}, {'mean': 0.0, 'var': 0.0}, {'mean': 0.0, 'var': 0.0}, {'mean': 0.0, 'var': 0.0}, {'mean': 0.0, 'var': 0.0}, {'mean': 0.0, 'var': 0.0}, {'mean': 0.0, 'var': 0.0}, {'mean': 0.0, 'var': 0.0}, {'mean': 0.0, 'var': 0.0}, {'mean': 0.0, 'var': 0.0}, {'mean': 0.0, 'var': 0.0}, {'mean': 0.0, 'var': 0.0}, {'mean': 0.0, 'var': 0.0}, {'mean': 0.0, 'var': 0.0}, {'mean': 0.0, 'var': 0.0}, {'mean': 0.0, 'var': 0.0}, {'mean': 0.0, 'var': 0.0}, {'mean': 0.0, 'var': 0.0}, {'mean': 0.0, 'var': 0.0}, {'mean': 0.0, 'var': 0.0}, {'mean': 0.0, 'var': 0.0}, {'mean': 0.0, 'var': 0.0}, {'mean': 0.0, 'var': 0.0}, {'mean': 0.0, 'var': 0.0}, {'mean': 0.0, 'var': 0.

In [132]:
y_pred = nb.predict(X[0:200])
# print(y_pred)
# print(y[:200])

0
100.0
1.0
1
100.0
1.0
2
100.0
1.0
3
100.0
1.0
4
100.0
1.0
5
100.0
1.0
6
100.0
1.0
7
100.0
1.0
8
100.0
1.0
9
100.0
1.0
[0.09523809523809523, 0.11761904761904762, 0.09357142857142857, 0.10095238095238095, 0.09904761904761905, 0.09476190476190476, 0.105, 0.10261904761904762, 0.09761904761904762, 0.09357142857142857]
0
100.0
1.0
1
100.0
1.0
2
100.0
1.0
3
100.0
1.0
4
100.0
1.0
5
100.0
1.0
6
100.0
1.0
7
100.0
1.0
8
100.0
1.0
9
100.0
1.0
[0.09523809523809523, 0.11761904761904762, 0.09357142857142857, 0.10095238095238095, 0.09904761904761905, 0.09476190476190476, 0.105, 0.10261904761904762, 0.09761904761904762, 0.09357142857142857]
0
100.0
1.0
1
100.0
1.0
2
100.0
1.0
3
100.0
1.0
4
100.0
1.0
5
100.0
1.0
6
100.0
1.0
7
100.0
1.0
8
100.0
1.0
9
100.0
1.0
[0.09523809523809523, 0.11761904761904762, 0.09357142857142857, 0.10095238095238095, 0.09904761904761905, 0.09476190476190476, 0.105, 0.10261904761904762, 0.09761904761904762, 0.09357142857142857]
0
100.0
1.0
1
100.0
1.0
2
100.0
1.0
3
100.0
1.0
4

1
100.0
1.0
2
100.0
1.0
3
100.0
1.0
4
100.0
1.0
5
100.0
1.0
6
100.0
1.0
7
100.0
1.0
8
100.0
1.0
9
100.0
1.0
[0.09523809523809523, 0.11761904761904762, 0.09357142857142857, 0.10095238095238095, 0.09904761904761905, 0.09476190476190476, 0.105, 0.10261904761904762, 0.09761904761904762, 0.09357142857142857]
0
100.0
1.0
1
100.0
1.0
2
100.0
1.0
3
100.0
1.0
4
100.0
1.0
5
100.0
1.0
6
100.0
1.0
7
100.0
1.0
8
100.0
1.0
9
100.0
1.0
[0.09523809523809523, 0.11761904761904762, 0.09357142857142857, 0.10095238095238095, 0.09904761904761905, 0.09476190476190476, 0.105, 0.10261904761904762, 0.09761904761904762, 0.09357142857142857]
0
100.0
1.0
1
100.0
1.0
2
100.0
1.0
3
100.0
1.0
4
100.0
1.0
5
100.0
1.0
6
100.0
1.0
7
100.0
1.0
8
100.0
1.0
9
100.0
1.0
[0.09523809523809523, 0.11761904761904762, 0.09357142857142857, 0.10095238095238095, 0.09904761904761905, 0.09476190476190476, 0.105, 0.10261904761904762, 0.09761904761904762, 0.09357142857142857]
0
100.0
1.0
1
100.0
1.0
2
100.0
1.0
3
100.0
1.0
4
100.0
1.0
5

100.0
1.0
4
100.0
1.0
5
100.0
1.0
6
100.0
1.0
7
100.0
1.0
8
100.0
1.0
9
100.0
1.0
[0.09523809523809523, 0.11761904761904762, 0.09357142857142857, 0.10095238095238095, 0.09904761904761905, 0.09476190476190476, 0.105, 0.10261904761904762, 0.09761904761904762, 0.09357142857142857]
0
100.0
1.0
1
100.0
1.0
2
100.0
1.0
3
100.0
1.0
4
100.0
1.0
5
100.0
1.0
6
100.0
1.0
7
100.0
1.0
8
100.0
1.0
9
100.0
1.0
[0.09523809523809523, 0.11761904761904762, 0.09357142857142857, 0.10095238095238095, 0.09904761904761905, 0.09476190476190476, 0.105, 0.10261904761904762, 0.09761904761904762, 0.09357142857142857]
0
100.0
1.0
1
100.0
1.0
2
100.0
1.0
3
100.0
1.0
4
100.0
1.0
5
100.0
1.0
6
100.0
1.0
7
100.0
1.0
8
100.0
1.0
9
100.0
1.0
[0.09523809523809523, 0.11761904761904762, 0.09357142857142857, 0.10095238095238095, 0.09904761904761905, 0.09476190476190476, 0.105, 0.10261904761904762, 0.09761904761904762, 0.09357142857142857]
0
100.0
1.0
1
100.0
1.0
2
100.0
1.0
3
100.0
1.0
4
100.0
1.0
5
100.0
1.0
6
100.0
1.0
7
1

6
100.0
1.0
7
100.0
1.0
8
100.0
1.0
9
100.0
1.0
[0.09523809523809523, 0.11761904761904762, 0.09357142857142857, 0.10095238095238095, 0.09904761904761905, 0.09476190476190476, 0.105, 0.10261904761904762, 0.09761904761904762, 0.09357142857142857]
0
100.0
1.0
1
100.0
1.0
2
100.0
1.0
3
100.0
1.0
4
100.0
1.0
5
100.0
1.0
6
100.0
1.0
7
100.0
1.0
8
100.0
1.0
9
100.0
1.0
[0.09523809523809523, 0.11761904761904762, 0.09357142857142857, 0.10095238095238095, 0.09904761904761905, 0.09476190476190476, 0.105, 0.10261904761904762, 0.09761904761904762, 0.09357142857142857]
0
100.0
1.0
1
100.0
1.0
2
100.0
1.0
3
100.0
1.0
4
100.0
1.0
5
100.0
1.0
6
100.0
1.0
7
100.0
1.0
8
100.0
1.0
9
100.0
1.0
[0.09523809523809523, 0.11761904761904762, 0.09357142857142857, 0.10095238095238095, 0.09904761904761905, 0.09476190476190476, 0.105, 0.10261904761904762, 0.09761904761904762, 0.09357142857142857]
0
100.0
1.0
1
100.0
1.0
2
100.0
1.0
3
100.0
1.0
4
100.0
1.0
5
100.0
1.0
6
100.0
1.0
7
100.0
1.0
8
100.0
1.0
9
100.0
1.0
[

100.0
1.0
8
100.0
1.0
9
100.0
1.0
[0.09523809523809523, 0.11761904761904762, 0.09357142857142857, 0.10095238095238095, 0.09904761904761905, 0.09476190476190476, 0.105, 0.10261904761904762, 0.09761904761904762, 0.09357142857142857]
0
100.0
1.0
1
100.0
1.0
2
100.0
1.0
3
100.0
1.0
4
100.0
1.0
5
100.0
1.0
6
100.0
1.0
7
100.0
1.0
8
100.0
1.0
9
100.0
1.0
[0.09523809523809523, 0.11761904761904762, 0.09357142857142857, 0.10095238095238095, 0.09904761904761905, 0.09476190476190476, 0.105, 0.10261904761904762, 0.09761904761904762, 0.09357142857142857]
0
100.0
1.0
1
100.0
1.0
2
100.0
1.0
3
100.0
1.0
4
100.0
1.0
5
100.0
1.0
6
100.0
1.0
7
100.0
1.0
8
100.0
1.0
9
100.0
1.0
[0.09523809523809523, 0.11761904761904762, 0.09357142857142857, 0.10095238095238095, 0.09904761904761905, 0.09476190476190476, 0.105, 0.10261904761904762, 0.09761904761904762, 0.09357142857142857]
0
100.0
1.0
1
100.0
1.0
2
100.0
1.0
3
100.0
1.0
4
100.0
1.0
5
100.0
1.0
6
100.0
1.0
7
100.0
1.0
8
100.0
1.0
9
100.0
1.0
[0.095238095238

1.0
9
100.0
1.0
[0.09523809523809523, 0.11761904761904762, 0.09357142857142857, 0.10095238095238095, 0.09904761904761905, 0.09476190476190476, 0.105, 0.10261904761904762, 0.09761904761904762, 0.09357142857142857]
0
100.0
1.0
1
100.0
1.0
2
100.0
1.0
3
100.0
1.0
4
100.0
1.0
5
100.0
1.0
6
100.0
1.0
7
100.0
1.0
8
100.0
1.0
9
100.0
1.0
[0.09523809523809523, 0.11761904761904762, 0.09357142857142857, 0.10095238095238095, 0.09904761904761905, 0.09476190476190476, 0.105, 0.10261904761904762, 0.09761904761904762, 0.09357142857142857]
0
100.0
1.0
1
100.0
1.0
2
100.0
1.0
3
100.0
1.0
4
100.0
1.0
5
100.0
1.0
6
100.0
1.0
7
100.0
1.0
8
100.0
1.0
9
100.0
1.0
[0.09523809523809523, 0.11761904761904762, 0.09357142857142857, 0.10095238095238095, 0.09904761904761905, 0.09476190476190476, 0.105, 0.10261904761904762, 0.09761904761904762, 0.09357142857142857]
0
100.0
1.0
1
100.0
1.0
2
100.0
1.0
3
100.0
1.0
4
100.0
1.0
5
100.0
1.0
6
100.0
1.0
7
100.0
1.0
8
100.0
1.0
9
100.0
1.0
[0.09523809523809523, 0.117619047

5
100.0
1.0
6
100.0
1.0
7
100.0
1.0
8
100.0
1.0
9
100.0
1.0
[0.09523809523809523, 0.11761904761904762, 0.09357142857142857, 0.10095238095238095, 0.09904761904761905, 0.09476190476190476, 0.105, 0.10261904761904762, 0.09761904761904762, 0.09357142857142857]
0
100.0
1.0
1
100.0
1.0
2
100.0
1.0
3
100.0
1.0
4
100.0
1.0
5
100.0
1.0
6
100.0
1.0
7
100.0
1.0
8
100.0
1.0
9
100.0
1.0
[0.09523809523809523, 0.11761904761904762, 0.09357142857142857, 0.10095238095238095, 0.09904761904761905, 0.09476190476190476, 0.105, 0.10261904761904762, 0.09761904761904762, 0.09357142857142857]


In [115]:
print(y_pred)
print(y[:200])
c = 0
for i in range(len(y[:200])):
    if(y[i] == y_pred[i]):
        c+=1
print(c/len(y[:200]))

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0 5 5 8 3 9 4 9 7 6 3 7 5 0 6 4 5 6 5 2 5 9 8 1 9 9 1 2 3 5 4 9 7 2 7 8 7
 4 4 8 5 1 9 1 1 9 5 9 5 6 8 8 0 7 9 8 3 3 7 7 9 9 7 8 5 7 0 6 8 7 8 9 9 8
 8 6 9 2 5 2 2 2 2 9 8 8 2 6 3 2 6 5 5 4 7 3 8 7 0 0 4 3 3 3 4 2 3 9 6 0 3
 1 7 8 8 3 8 0 2 3 5 2 7 3 4 0 2 1 8 0 6 2 8 5 9 2 6 9 0 3 5 5 6 1 8 4 6 0
 9 3 0 3 2 5 3 7 2 3 8 9 2 3 6 2 6 0 8 6 2 8 5 5 2 8 2 9 8 7 1 9 6 0 3 4 2
 4 5 1 8 7 5 6 1 6 4 2 9

In [51]:
from sklearn.naive_bayes import GaussianNB

In [55]:
gnb = GaussianNB()
y_pred = gnb.fit(X, y).predict(X)

In [58]:
print(y_pred)
print(y)
c = 0
for i in range(len(y)):
    if(y[i] == y_pred[i]):
        c+=1
print(c/len(y))

[0 9 8 ... 0 9 8]
[0 5 5 ... 4 4 5]
0.5952380952380952
