In [2]:
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from sklearn.base import BaseEstimator, ClassifierMixin
from collections import Counter
import math
from sklearn.metrics import accuracy_score
from math import exp, pi, sqrt

# Implementation of Gaussian Naive Bayes Classifier

In [3]:
class MyGaussianNB(BaseEstimator, ClassifierMixin):
    """Naive Bayes classifier for a mix of numeric and categorical features.

    Feature columns are typed from the first training row: a column whose
    first value is a float is modelled with one Gaussian per class, every
    other column (strings, integers, ...) is treated as categorical and
    modelled with per-class relative frequencies.

    Attributes set by ``fit``:
        Xt, yt            -- the training features / labels as DataFrames.
        class_prob_dict   -- ``{class: prior probability}``.
        conditional_probs -- ``{feature position + 1: Series}`` whose
                             MultiIndex is ``(class, value)`` and whose values
                             are P(value | class) for a categorical feature.
        class_dict        -- ``{feature position + 1: {class: {'mean', 'std'}}}``
                             for numeric features ('std' is the pandas default,
                             i.e. the sample standard deviation).

    The ``+ 1`` offset exists because the label column is placed at
    position 0 of the combined frame the statistics are computed from.
    No smoothing is applied: a categorical value never seen with a class,
    or a numeric feature with no spread inside a class, gives that class a
    likelihood of zero.
    """

    @staticmethod
    def _as_frame(data):
        """Return ``data`` as a DataFrame with positional (0..n-1) labels.

        pandas inputs are converted to plain arrays first so that their own
        index / column labels cannot break the positional lookups below.
        """
        if isinstance(data, (pd.DataFrame, pd.Series)):
            data = data.to_numpy()
        return pd.DataFrame(data)

    def fit(self, Xt, yt):
        """Estimate class priors and per-feature likelihood parameters.

        Parameters:
            Xt -- 2-D array-like of training features (rows are samples).
            yt -- 1-D array-like of class labels, one per row of ``Xt``.

        Returns:
            self

        Raises:
            ValueError -- if ``Xt`` is empty or its length differs from ``yt``.
        """
        self.Xt = self._as_frame(Xt)
        self.yt = self._as_frame(yt)

        if self.Xt.empty:
            raise ValueError("Cannot fit MyGaussianNB on an empty feature matrix.")
        if len(self.Xt) != len(self.yt):
            raise ValueError(
                "Xt and yt must contain the same number of rows (got %d and %d)."
                % (len(self.Xt), len(self.yt))
            )

        # Type every column after its first value and remember the schema so
        # that predict() interprets the test columns the same way.
        self._feature_types = [type(self.Xt[col].iloc[0]) for col in self.Xt.columns]
        for col, kind in zip(self.Xt.columns, self._feature_types):
            self.Xt[col] = self.Xt[col].astype(kind)

        # Class priors: relative frequency of each label.
        label_counts = Counter(self.yt[0])
        n_samples = sum(label_counts.values())
        self.class_prob_dict = {
            label: count / n_samples for label, count in label_counts.items()
        }

        # Combined frame: column 0 is the label, columns 1..n are the features.
        full_df = pd.concat([self.yt, self.Xt], axis=1, ignore_index=True)

        # Classify the *feature* columns only, so the dtype of the label
        # column can never shift which features count as categorical/numeric.
        features = full_df.iloc[:, 1:]
        numeric = list(features.select_dtypes(include=[float]).columns)
        categorical = [col for col in features.columns if col not in numeric]

        # P(value | class) for every categorical feature.
        grouped = full_df.groupby(by=[0])
        self.conditional_probs = {}
        for col in categorical:
            self.conditional_probs[col] = grouped[col].value_counts(normalize=True)

        # Per-class mean and standard deviation for every numeric feature.
        classes = self.yt[0].unique()
        self.class_dict = {}
        for col in numeric:
            per_class = {}
            for label in classes:
                values = full_df.loc[full_df[0] == label, col]
                per_class[label] = {'mean': values.mean(), 'std': values.std()}
            self.class_dict[col] = per_class

        return self

    def predict(self, Xtest):
        """Predict the most probable class for every row of ``Xtest``.

        Scores are accumulated as log-probabilities so that multiplying many
        small likelihoods cannot underflow to zero for every class.

        Parameters:
            Xtest -- 2-D array-like with the same column layout as the
                     training features.

        Returns:
            list with one predicted label per row. If several classes tie
            (e.g. all have zero likelihood) the class that appears first in
            the training labels is returned.

        Raises:
            ValueError -- if the number of columns differs from training.
        """
        self.Xtest = self._as_frame(Xtest)

        if len(self.Xtest.columns) != len(self._feature_types):
            raise ValueError(
                "Xtest has %d feature columns but the model was fitted with %d."
                % (len(self.Xtest.columns), len(self._feature_types))
            )

        # Apply the training schema rather than re-guessing from the test data.
        for col, kind in zip(self.Xtest.columns, self._feature_types):
            self.Xtest[col] = self.Xtest[col].astype(kind)

        # Everything below is identical for all rows, so compute it once.
        classes = list(self.yt[0].unique())
        log_priors = {label: math.log(self.class_prob_dict[label]) for label in classes}
        category_tables = {
            key: probs.to_dict() for key, probs in self.conditional_probs.items()
        }

        predictions = []
        for row in self.Xtest.itertuples(index=False, name=None):
            scores = {}
            for label in classes:
                score = log_priors[label]
                for position, value in enumerate(row):
                    key = position + 1  # offset: label sits at column 0 in fit()
                    if key in self.class_dict:
                        stats = self.class_dict[key][label]
                        score += self._log_gaussian(value, stats['mean'], stats['std'])
                    else:
                        # A value never observed with this class has probability 0.
                        prob = category_tables[key].get((label, value), 0.0)
                        score += math.log(prob) if prob > 0 else -math.inf
                scores[label] = score

            # max() returns the first maximal key, i.e. ties go to the class
            # that appears first in the training labels.
            predictions.append(max(scores, key=scores.get))

        return predictions

    @staticmethod
    def _log_gaussian(x, mean, stdev):
        """Log of the normal density N(x; mean, stdev**2).

        Returns ``-inf`` (zero likelihood) when the standard deviation is
        zero, negative or NaN, which is what a class with no usable spread
        for this feature yields.
        """
        variance = stdev * stdev
        if not (stdev > 0 and variance > 0):
            return -math.inf
        diff = x - mean
        return -0.5 * math.log(2 * pi * variance) - (diff * diff) / (2 * variance)

    # probability function
    def calculate_probability(self, x, mean, stdev):
        """Return the normal density N(x; mean, stdev**2).

        Raises ZeroDivisionError when ``stdev`` is zero.
        """
        exponent = exp(-((x - mean) ** 2 / (2 * stdev ** 2)))
        return (1 / (sqrt(2 * pi * stdev ** 2)) * exponent)



# Testing using variety of datasets

## Penguins Data Set (Numerical and Categorical features)

In [4]:
# Load the penguins data, using the first CSV column as the row index.
penguins_af = pd.read_csv('penguins_af.csv', index_col = 0)

In [5]:
# Separate the target from the features without mutating penguins_af,
# so this cell gives the same result every time it is re-run.
y = penguins_af['species'].values
X = penguins_af.drop(columns='species').values

In [6]:
# Hold out half of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2, test_size=1/2)

In [7]:
# Instantiate our own Naive Bayes implementation.
GBC = MyGaussianNB()

In [8]:
# Learn the class priors and per-feature parameters from the training split.
GBC.fit(X_train,y_train)

MyGaussianNB()

In [9]:
# Inspect the per-class mean/std learned for each numeric feature
# (keys are the feature's column position + 1).
GBC.class_dict

{2: {'Chinstrap': {'mean': 48.9764705882353, 'std': 3.1156553689226625},
  'Gentoo': {'mean': 47.78928571428569, 'std': 2.9531910981745937},
  'Adelie': {'mean': 38.922368421052624, 'std': 2.658149666426405}},
 3: {'Chinstrap': {'mean': 18.66176470588235, 'std': 1.120339425266953},
  'Gentoo': {'mean': 15.014285714285716, 'std': 0.9964352045582884},
  'Adelie': {'mean': 18.293421052631576, 'std': 1.2699958557744235}},
 4: {'Chinstrap': {'mean': 196.85294117647058, 'std': 6.592614568205835},
  'Gentoo': {'mean': 217.625, 'std': 7.636187768537625},
  'Adelie': {'mean': 189.25, 'std': 6.515878042648333}},
 5: {'Chinstrap': {'mean': 3736.764705882353, 'std': 405.93304857907145},
  'Gentoo': {'mean': 5075.892857142857, 'std': 517.861956986132},
  'Adelie': {'mean': 3695.3947368421054, 'std': 479.00435847557435}}}

In [10]:
# Predict a label for every row of the held-out split.
GBC.predict(X_test)

['Chinstrap',
 'Gentoo',
 'Adelie',
 'Gentoo',
 'Adelie',
 'Adelie',
 'Gentoo',
 'Adelie',
 'Adelie',
 'Adelie',
 'Adelie',
 'Adelie',
 'Gentoo',
 'Adelie',
 'Adelie',
 'Chinstrap',
 'Adelie',
 'Gentoo',
 'Adelie',
 'Gentoo',
 'Gentoo',
 'Gentoo',
 'Chinstrap',
 'Gentoo',
 'Gentoo',
 'Adelie',
 'Gentoo',
 'Chinstrap',
 'Adelie',
 'Gentoo',
 'Adelie',
 'Adelie',
 'Adelie',
 'Adelie',
 'Gentoo',
 'Gentoo',
 'Adelie',
 'Adelie',
 'Adelie',
 'Chinstrap',
 'Adelie',
 'Chinstrap',
 'Adelie',
 'Gentoo',
 'Gentoo',
 'Adelie',
 'Gentoo',
 'Gentoo',
 'Adelie',
 'Gentoo',
 'Gentoo',
 'Chinstrap',
 'Gentoo',
 'Adelie',
 'Chinstrap',
 'Chinstrap',
 'Gentoo',
 'Adelie',
 'Gentoo',
 'Adelie',
 'Gentoo',
 'Chinstrap',
 'Chinstrap',
 'Adelie',
 'Chinstrap',
 'Adelie',
 'Chinstrap',
 'Adelie',
 'Adelie',
 'Gentoo',
 'Gentoo',
 'Gentoo',
 'Adelie',
 'Adelie',
 'Chinstrap',
 'Adelie',
 'Chinstrap',
 'Adelie',
 'Gentoo',
 'Gentoo',
 'Gentoo',
 'Adelie',
 'Adelie',
 'Gentoo',
 'Gentoo',
 'Adelie',
 'Adelie'

In [11]:
# Accuracy on the held-out split (score() is inherited from ClassifierMixin).
GBC.score(X_test, y_test)

0.9820359281437125

A very high accuracy score of 98% is achieved when both categorical and numerical features are included. Compared with the score obtained below using the numerical features only, this indicates that the inclusion of the categorical features marginally improves the predictive power of the Gaussian Naive Bayes model for this dataset.

## Limiting to numerical features only 

We aim to test the performance of our implementation of the Gaussian Naive Bayes classifier against the scikit-learn implementation. Therefore, we will limit the penguins dataset to numerical features only and test again.

In [None]:
# Reload the penguins data from disk for the numeric-only comparison and display it.
penguins = pd.read_csv('penguins_af.csv', index_col = 0)
penguins

In [None]:
# Keep only the target column and the four measurement columns.
penguins = penguins[['species', 'bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']]
penguins

In [None]:
# Separate the target from the features without mutating penguins (safe to re-run),
# then hold out half of the rows for testing with a fixed random_state.
y = penguins['species'].values
X = penguins.drop(columns='species').values
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2, test_size=1/2)

In [None]:
# Fit our classifier on the numeric-only training split and predict the test split.
GBC = MyGaussianNB()
GBC.fit(X_train,y_train)
GBC.predict(X_test)

In [None]:
# Accuracy of our classifier on the numeric-only test split.
GBC.score(X_test, y_test)

In [None]:
# Reference model: scikit-learn's GaussianNB fitted and scored on the same split.
# (The intermediate predict() call was dropped: its result was never used or displayed.)
gnb = GaussianNB()
gnb.fit(X_train, y_train)
gnb.score(X_test, y_test)

In [None]:
# Fraction of test rows on which our classifier and scikit-learn's GaussianNB predict the same label.
accuracy_score(GBC.predict(X_test), gnb.predict(X_test))

The results produced by the MyGaussianNB implementation match those produced by the scikit-learn implementation exactly; both produce scores of 96%. This is a good sign, as it shows our implementation is working as it should. Comparing the two sets of predictions directly gives an accuracy score of 1.0, showing that our model predicts exactly the same values as the sklearn implementation on the test set.

## Diabetes Data Set 

In [None]:
# Load the diabetes data and display it.
diabetes = pd.read_csv('diabetes.csv')
diabetes

In [None]:
# Separate the target from the features without mutating diabetes,
# so this cell gives the same result every time it is re-run.
y = diabetes['neg_pos'].values
X = diabetes.drop(columns='neg_pos').values

In [None]:
# Hold out half of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2, test_size=1/2)

In [None]:
# Fresh instance of our classifier for the diabetes data.
GBC_diabetes = MyGaussianNB()

In [None]:
# Learn the class priors and per-feature parameters from the training split.
GBC_diabetes.fit(X_train,y_train)

In [None]:
# Predict a label for every row of the held-out split.
GBC_diabetes.predict(X_test)

In [None]:
# Accuracy of our classifier on the diabetes test split.
GBC_diabetes.score(X_test, y_test)

In [None]:
# Reference model: scikit-learn's GaussianNB fitted and scored on the same split.
# (The intermediate predict() call was dropped: its result was never used or displayed.)
gnb = GaussianNB()
gnb.fit(X_train, y_train)
gnb.score(X_test, y_test)

In [None]:
# Fraction of test rows on which our classifier and scikit-learn's GaussianNB predict the same label.
accuracy_score(GBC_diabetes.predict(X_test), gnb.predict(X_test))

Once again, a very similar accuracy score is achieved by both our implementation (74%) and the scikit-learn implementation (73.7%). For this dataset, our model produced a marginally better result, but this margin is negligible in the grand scheme of things. Once again, this is an indicator that our model is predicting to the same standard as the scikit-learn model. Comparing the two sets of predictions directly gives an agreement (accuracy score) of 99.4% between our model and the sklearn implementation.

## Glass Data Set

In [None]:
# Load the glass data.
glass = pd.read_csv('glassV2.csv')

In [None]:
# Distribution of the target classes, counting missing values as well.
glass['Type'].value_counts(dropna=False)

Due to the class imbalance here, we will remove Types 3, 5 and 7 from the dataset so that it becomes a binary classification problem.

In [None]:
# Drop every row labelled Type 3, 5 or 7 with a single boolean mask.
glass = glass[~glass['Type'].isin([3, 5, 7])]

In [None]:
# Separate the target from the features without mutating glass,
# so this cell gives the same result every time it is re-run.
y = glass['Type'].values
X = glass.drop(columns='Type').values

In [None]:
# Hold out half of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2, test_size=1/2)

In [None]:
# Fresh instance of our classifier for the glass data.
GBC_glass = MyGaussianNB()

In [None]:
# Learn the class priors and per-feature parameters from the training split.
GBC_glass.fit(X_train,y_train)

In [None]:
# Predict a label for every row of the held-out split.
GBC_glass.predict(X_test)

In [None]:
# Accuracy of our classifier on the glass test split.
GBC_glass.score(X_test, y_test)

In [None]:
# Reference model: scikit-learn's GaussianNB fitted and scored on the same split.
# (The intermediate predict() call was dropped: its result was never used or displayed.)
gnb = GaussianNB()
gnb.fit(X_train, y_train)
gnb.score(X_test, y_test)

In [None]:
# Fraction of test rows on which our classifier and scikit-learn's GaussianNB predict the same label.
accuracy_score(GBC_glass.predict(X_test), gnb.predict(X_test))

Once again, a very similar score is achieved by both our implementation and the scikit-learn implementation. This result is only achieved after the class imbalance is dealt with; before dealing with this imbalance, the sklearn model was producing a score of around 35% and our model a score of around 60%. Comparing the two sets of predictions directly gives an accuracy score of 1.0, showing that our model and the sklearn implementation predict exactly the same values on the test set.