## Naive Bayes Algorithm for Classification on Iris Dataset

#### Author - Agyeya Mishra

In [None]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")

In [None]:
# Colab-only cell: prompt for a file upload (the dataset CSV) into the runtime.
# NOTE(review): the recorded output shows the upload being saved as
# "iris (1).csv", while the loading cell reads "iris.csv" — confirm that the
# intended copy of the file is the one being read.
from google.colab import files
uploaded = files.upload()

Saving iris.csv to iris (1).csv


In [None]:
class NaiveBayesClassifier():
    """
    Gaussian Naive Bayes classifier.

    Bayes theorem:
        P(y|X) = P(X|y) * P(y) / P(X)

    Every feature is assumed to be normally distributed within each class and
    conditionally independent of the other features given the class. P(X) is
    the same for every class, so it is dropped when comparing posteriors.

    Attributes set by ``fit``:
        classes      -- sorted unique class labels
        count        -- number of classes
        feature_nums -- number of feature columns
        rows         -- number of training rows
        mean, var    -- per-class feature means / population variances,
                        arrays of shape (count, feature_nums)
        prior        -- per-class prior probabilities, shape (count,)
    """

    # Lower bound applied to variances when evaluating the density, so that a
    # feature that is constant within a class does not cause a division by
    # zero (which would turn every posterior into NaN).
    _MIN_VARIANCE = 1e-9

    def calc_prior(self, features, target):
        """
        Compute the prior probability P(y) of every class.

        The prior is the fraction of rows of ``features`` belonging to each
        class; classes are ordered by sorted label (same order as
        ``self.classes``). Stores the result in ``self.prior`` and returns it.
        """
        class_counts = features.groupby(target).size()
        self.prior = (class_counts / features.shape[0]).to_numpy()

        return self.prior

    def calc_statistics(self, features, target):
        """
        Compute the per-class mean and variance of every feature column.

        Stores two arrays of shape (n_classes, n_features) in ``self.mean`` and
        ``self.var`` and returns them as a tuple.
        """
        grouped = features.groupby(target)
        # Use the groupby reductions directly instead of apply(np.mean/np.var):
        # the result of calling NumPy reductions on a DataFrame depends on the
        # pandas version. ddof=0 keeps the population variance np.var computes.
        self.mean = grouped.mean().to_numpy()
        self.var = grouped.var(ddof=0).to_numpy()

        return self.mean, self.var

    def _log_gaussian_density(self, class_idx, x):
        """
        Log of the Gaussian density of ``x`` under class ``class_idx``.

        log N(x; μ, σ²) = -0.5 * log(2πσ²) - (x - μ)² / (2σ²)

        Working in log space avoids exp() underflowing to 0 for samples far
        from the class mean, which would give log(0) = -inf.
        """
        mean = self.mean[class_idx]
        var = np.maximum(self.var[class_idx], self._MIN_VARIANCE)
        return -0.5 * np.log(2.0 * np.pi * var) - (x - mean) ** 2 / (2.0 * var)

    def gaussian_density(self, class_idx, x):
        """
        Gaussian probability density of ``x`` for class ``class_idx``.

        Each feature value is assumed to be normally distributed within a
        class:

            (1 / sqrt(2πσ²)) * exp(-(x - μ)² / (2σ²))

        where μ is the class mean and σ² the class variance of the feature.
        Returns one density per feature.
        """
        return np.exp(self._log_gaussian_density(class_idx, x))

    def calc_posterior(self, x):
        """
        Return the class label with the highest posterior for one sample ``x``.

        Log prior and per-feature log likelihoods are summed instead of
        multiplying raw probabilities, which is numerically more stable.
        """
        log_prior = np.log(self.prior)
        log_likelihood = np.array([
            np.sum(self._log_gaussian_density(i, x)) for i in range(self.count)
        ])
        # return class with highest (unnormalized) log posterior
        return self.classes[np.argmax(log_prior + log_likelihood)]

    def fit(self, features, target):
        """
        Estimate class labels, per-class statistics and priors.

        features -- DataFrame of numeric feature columns
        target   -- Series of class labels aligned with ``features``
        """
        self.classes = np.unique(target)
        self.count = len(self.classes)
        self.feature_nums = features.shape[1]
        self.rows = features.shape[0]

        self.calc_statistics(features, target)
        self.calc_prior(features, target)

    def predict(self, features):
        """
        Predict a class label for every row of ``features``.

        Accepts a DataFrame or any 2-D array-like of numeric values and returns
        a list with one label per row.
        """
        samples = np.asarray(features, dtype=float)
        return [self.calc_posterior(sample) for sample in samples]

    def accuracy(self, y_test, y_pred):
        """
        Fraction of predictions in ``y_pred`` that equal the labels in ``y_test``.

        Both arguments are converted to arrays first so the comparison is
        element-wise for any mix of lists, arrays and Series (comparing two
        plain lists with ``==`` would yield a single bool).

        Raises ValueError if the two inputs differ in shape.
        """
        actual = np.asarray(y_test)
        predicted = np.asarray(y_pred)
        if actual.shape != predicted.shape:
            raise ValueError(
                "y_test and y_pred must have the same shape, got "
                f"{actual.shape} and {predicted.shape}"
            )
        return float(np.mean(actual == predicted))

    def visualize(self, y_true, y_pred, target):
        """
        Plot side-by-side count plots of the true and the predicted labels.

        target -- column name used for the labels (shown on the x axis)
        """
        # Build the frames from plain arrays so the column is always populated,
        # regardless of the name or index of the incoming Series.
        tr = pd.DataFrame({target: np.asarray(y_true)})
        pr = pd.DataFrame({target: np.asarray(y_pred)})

        fig, ax = plt.subplots(1, 2, sharex='col', sharey='row', figsize=(15,6))

        sns.countplot(x=target, data=tr, ax=ax[0], palette='viridis', alpha=0.7, hue=target, dodge=False)
        sns.countplot(x=target, data=pr, ax=ax[1], palette='viridis', alpha=0.7, hue=target, dodge=False)

        fig.suptitle('True vs Predicted Comparison', fontsize=20)

        ax[0].tick_params(labelsize=12)
        ax[1].tick_params(labelsize=12)
        ax[0].set_title("True values", fontsize=18)
        ax[1].set_title("Predicted values", fontsize=18)
        plt.show()

In [None]:
# Pre-process the dataset for training.

# Load the Iris dataset from the CSV file. The recorded output of this cell
# shows shape (150, 6): an Id column, four measurements and the Species label.
df = pd.read_csv("iris.csv")
# Shuffle all rows reproducibly (fixed seed) and renumber the index from 0.
df = df.sample(frac=1, random_state=1).reset_index(drop=True)
# Print the DataFrame shape.
print(df.shape)
# Features are every column except the last; the target is the last column.
# NOTE(review): this keeps the "Id" column as a feature. The per-class Id means
# recorded later in the notebook (~27 / 77 / 124) suggest Id tracks the class,
# which would leak the label into the model — consider dropping it.
X, y = df.iloc[:, :-1], df.iloc[:, -1]


# Split by position: the first 100 shuffled rows train, the remaining rows test.
X_train, X_test, y_train, y_test = X[:100], X[100:], y[:100], y[100:]

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(150, 6)
(100, 5) (100,)
(50, 5) (50,)


In [None]:
# Display the shuffled DataFrame.
df

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,15,5.8,4.0,1.2,0.2,Iris-setosa
1,99,5.1,2.5,3.0,1.1,Iris-versicolor
2,76,6.6,3.0,4.4,1.4,Iris-versicolor
3,17,5.4,3.9,1.3,0.4,Iris-setosa
4,132,7.9,3.8,6.4,2.0,Iris-virginica
...,...,...,...,...,...,...
145,134,6.3,2.8,5.1,1.5,Iris-virginica
146,138,6.4,3.1,5.5,1.8,Iris-virginica
147,73,6.3,2.5,4.9,1.5,Iris-versicolor
148,141,6.7,3.1,5.6,2.4,Iris-virginica


In [None]:
# Shape of the training features as (rows, columns).
X_train.shape

(100, 5)

In [None]:
# Display the training feature frame.
X_train

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
0,15,5.8,4.0,1.2,0.2
1,99,5.1,2.5,3.0,1.1
2,76,6.6,3.0,4.4,1.4
3,17,5.4,3.9,1.3,0.4
4,132,7.9,3.8,6.4,2.0
...,...,...,...,...,...
95,146,6.7,3.0,5.2,2.3
96,88,6.3,2.3,4.4,1.3
97,149,6.2,3.4,5.4,2.3
98,110,7.2,3.6,6.1,2.5


In [None]:
# train the model
# Create the classifier, then fit it on the training split.
x = NaiveBayesClassifier()


x.fit(X_train, y_train)

In [None]:
# Predict a class label for every row of the test split (returns a list).
predictions = x.predict(X_test)

In [None]:
# Inspect what fit() stored: class labels, feature count, training rows, class count.
x.classes, x.feature_nums, x.rows, x.count

(array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object),
 5,
 100,
 3)

In [None]:
# Recompute the class priors on the training split; this also overwrites x.prior.
x.calc_prior(X_train, y_train)

array([0.31, 0.32, 0.37])

In [None]:
# Stored class priors, one entry per class in the order of x.classes.
x.prior

array([0.31, 0.32, 0.37])

In [None]:
# Recompute per-class means and variances on the training split; returns (mean, var)
# and overwrites x.mean / x.var.
x.calc_statistics(X_train, y_train)

(array([[ 26.58064516,   5.08387097,   3.50322581,   1.46129032,
           0.24193548],
        [ 77.03125   ,   5.9125    ,   2.790625  ,   4.275     ,
           1.33125   ],
        [123.72972973,   6.71891892,   2.98918919,   5.63243243,
           2.05675676]]),
 array([[1.93920916e+02, 1.13610822e-01, 1.09344433e-01, 2.43080125e-02,
         9.53173777e-03],
        [2.30967773e+02, 2.12968750e-01, 8.27246094e-02, 1.85625000e-01,
         3.21484375e-02],
        [2.00899927e+02, 3.56669102e-01, 1.13396640e-01, 3.28677867e-01,
         5.92111030e-02]]))

In [None]:
# Stored per-class feature means: one row per class, one column per feature.
x.mean

array([[ 26.58064516,   5.08387097,   3.50322581,   1.46129032,
          0.24193548],
       [ 77.03125   ,   5.9125    ,   2.790625  ,   4.275     ,
          1.33125   ],
       [123.72972973,   6.71891892,   2.98918919,   5.63243243,
          2.05675676]])

In [None]:
# Stored per-class feature variances: one row per class, one column per feature.
x.var

array([[1.93920916e+02, 1.13610822e-01, 1.09344433e-01, 2.43080125e-02,
        9.53173777e-03],
       [2.30967773e+02, 2.12968750e-01, 8.27246094e-02, 1.85625000e-01,
        3.21484375e-02],
       [2.00899927e+02, 3.56669102e-01, 1.13396640e-01, 3.28677867e-01,
        5.92111030e-02]])

In [None]:
# Re-run prediction on the test split (same call as the earlier prediction cell).
predictions = x.predict(X_test)

In [None]:
# Fraction of test rows whose predicted label equals the true label.
x.accuracy(y_test, predictions)

0.98

In [None]:
# Relative frequency of each class in the test labels.
y_test.value_counts(normalize=True)

Iris-setosa        0.38
Iris-versicolor    0.36
Iris-virginica     0.26
Name: Species, dtype: float64