In [579]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB

In [580]:
def preprocessing(path, class_col):
        # convert text to pandas dataframe
        df = pd.read_csv(path, sep=' ', header=None)
        df[class_col] = df.iloc[:,57]
        df[class_col] = df[class_col].map({1:'spam', 0:'ham'})
        df.drop(columns=df.columns[57], inplace=True)
        return df

def reshapeXTestTrain(x_test, x_train):
    # Reshape the dataset in order to properly train and test the model
    return x_test.iloc[:,:-1], x_train.iloc[:,:-1]

class GaussianNaiveBayes:
    def calculatePriors(self, testset, class_col):
        # Calculate prior probability for both classes
        self.priors = testset.groupby(class_col).apply(lambda x: len(x)/testset.shape[0]).to_numpy()

    def calculateEpsilon(self, var_list, smoothing_factor):
        epsilon = (np.amax(var_list) * smoothing_factor)
        smoothed_var = var_list + epsilon
        return smoothed_var, epsilon

    def calculateMeanAndVar(self, testset, class_col, smoothing_factor=10**-9):
        # Calculate columnar means and variances for both classes, and increment variances by a factor to ensure non-zero variances
        self.mu = testset.groupby(class_col).mean().to_numpy()
        vars = testset.groupby(class_col).var(ddof=0).to_numpy()
        self.sig2, self.epsilon = self.calculateEpsilon(vars, smoothing_factor)

    def calculateGaussianProbability(self, x, classId):
        # Calculate the gaussian probability of a feature given its mean and variance
        # pdf = 1/SQRT(2 * pi * sig^2) * e^(-((x-mu)/(2*sig^2)))
        numer = np.exp(-((x-self.mu[classId])/(2 * self.sig2[classId])))
        denom = np.sqrt(2 * np.pi * self.sig2[classId])
        print('{}\t{}'.format(numer,denom))
        return (numer/denom)

    def calculatePosteriorProbability(self, x, class_dict):
        posteriorProbs = []
        for i in [0, 1]:
            priorProb = np.log(self.priors[i])
            conditionalProb = np.sum(np.log(self.calculateGaussianProbability(x, i)))
            posteriorProbs.append(priorProb + conditionalProb)
        return class_dict[np.argmax(posteriorProbs)]

    def fit(self, X_train, fieldname, smoothing_factor=10**-9):
        self.calculateMeanAndVar(X_train, fieldname, smoothing_factor)
        self.calculatePriors(X_train, fieldname)
        
    def predict(self, X_test):
        preds = []
        for f in X_test.to_numpy():
            fPred = self.calculatePosteriorProbability(f, {1:'spam', 0:'ham'})
            preds.append(fPred)
        return preds

In [581]:
import os
path = os.getcwd() + '/spamData.txt'
smoothing_factor = 10 ** -9


In [582]:
X = preprocessing(path, 'email_type')
y = X.iloc[:,-1]

X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True)

gnb = GaussianNB()
gnb.fit(X_test.iloc[:,:-1], y_test)


In [583]:
hbGaussianNB = GaussianNaiveBayes()
hbGaussianNB.fit(X_train, 'email_type')

In [584]:
X_test2, X_train2 = reshapeXTestTrain(X_test, X_train)

preds = hbGaussianNB.predict(X_test2)

-49.90136207623835	30.455059368357134
-3.257962330182131	-16.244685669496768
-18.25846504351004	-inf
-15.904222643384255	30.008383929136027
-43.537144477800986	38.26824769911232
-12.766891341703756	36.662300962957154
-112.20739362434225	27.359374045601687
-3.0512387397929523	41.26667205743953
-2.7013520823484813	24.846844845733333
-7.777177191368143	-inf
-6.404949381140538	35.42927539640947
-39.15033922876312	-9.81622517882392
-89.27347774137229	24.31230758135138
-1.9067089712535013	-inf
-61.73191314168758	26.922953622653083
-142.7475424936831	7.2759950927273715
-10.415245623817665	-898.0247593492209
-7.374853195631937	39.47368224089301
-6.45339485172722	-inf
-4.834949442710963	-110.59285025006331
-57.16733696297694	37.927751794954204
-8.55610142673738	-804.9782429894123
-33.40270176535393	-898.8857865044138
-3.131675501826414	30.321846531400332
-9.95960264479246	-244.2643271843175
-3.9482560086556395	-inf
-1.451071867149778	-inf
-151.18227448208197	29.82113946812923
-15.26024826145580

  conditionalProb = np.sum(np.log(self.calculateGaussianProbability(x, i)))


In [585]:
skpreds = gnb.predict(X_test2)

In [586]:
accuracy_score(y_test, preds)

0.8079930495221547

In [587]:
accuracy_score(y_test, skpreds)

0.8245004344048653