In [590]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB

In [591]:
def preprocessing(path, class_col):
        # convert text to pandas dataframe
        df = pd.read_csv(path, sep=' ', header=None)
        df[class_col] = df.iloc[:,57]
        df[class_col] = df[class_col].map({1:'spam', 0:'ham'})
        df.drop(columns=df.columns[57], inplace=True)
        return df

def reshapeXTestTrain(x_test, x_train):
    # Reshape the dataset in order to properly train and test the model
    return x_test.iloc[:,:-1], x_train.iloc[:,:-1]

class GaussianNaiveBayes:
    def calculatePriors(self, testset, class_col):
        # Calculate prior probability for both classes
        self.priors = testset.groupby(class_col).apply(lambda x: len(x)/testset.shape[0]).to_numpy()

    def calculateEpsilon(self, var_list, smoothing_factor):
        epsilon = (np.amax(var_list) * smoothing_factor)
        smoothed_var = var_list + epsilon
        return smoothed_var, epsilon

    def calculateMeanAndVar(self, testset, class_col, smoothing_factor=10**-9):
        # Calculate columnar means and variances for both classes, and increment variances by a factor to ensure non-zero variances
        self.mu = testset.groupby(class_col).mean().to_numpy()
        vars = testset.groupby(class_col).var(ddof=0).to_numpy()
        self.sig2, self.epsilon = self.calculateEpsilon(vars, smoothing_factor)

    def calculateGaussianProbability(self, x, classId):
        # Calculate the gaussian probability of a feature given its mean and variance
        # pdf = 1/SQRT(2 * pi * sig^2) * e^(-((x-mu)/(2*sig^2)))
        numer = np.exp(-((x-self.mu[classId])/(2 * self.sig2[classId])))
        denom = np.sqrt(2 * np.pi * self.sig2[classId])
        print('{}\t{}'.format(numer,denom))
        return (numer/denom)

    def calculatePosteriorProbability(self, x, class_dict):
        posteriorProbs = []
        for i in [0, 1]:
            priorProb = np.log(self.priors[i])
            conditionalProb = np.sum(np.log(self.calculateGaussianProbability(x, i)))
            posteriorProbs.append(priorProb + conditionalProb)
        return class_dict[np.argmax(posteriorProbs)]

    def fit(self, X_train, fieldname, smoothing_factor=10**-9):
        self.calculateMeanAndVar(X_train, fieldname, smoothing_factor)
        self.calculatePriors(X_train, fieldname)
        
    def predict(self, X_test):
        preds = []
        for f in X_test.to_numpy():
            fPred = self.calculatePosteriorProbability(f, {1:'spam', 0:'ham'})
            preds.append(fPred)
        return preds

In [592]:
import os
path = os.getcwd() + '/spamData.txt'
smoothing_factor = 10 ** -9


In [593]:
X = preprocessing(path, 'email_type')
y = X.iloc[:,-1]

X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True)

gnb = GaussianNB()
gnb.fit(X_test.iloc[:,:-1], y_test)


In [594]:
hbGaussianNB = GaussianNaiveBayes()
hbGaussianNB.fit(X_train, 'email_type')

In [595]:
X_test2, X_train2 = reshapeXTestTrain(X_test, X_train)

preds = hbGaussianNB.predict(X_test2)

[1.49251661e+00 1.04501214e+00 4.92152239e-02 1.53512348e+00
 1.27118177e+00 1.52577800e+00 1.70726495e+00 1.39042758e+00
 1.32347333e-08 1.20120009e+00 1.67434084e+00 1.32088428e+00
 1.79069875e+00 1.36796000e+00 1.51001000e+00 1.08982658e+00
 1.57986421e+00 1.35516466e+00 5.96574534e-01 1.54620305e+00
 1.25422714e+00 1.06513326e+00 2.52918906e+00 1.10406221e+00
 1.10594751e+00 1.17882192e+00 1.03460811e+00 1.26494984e+00
 1.14491311e+00 1.12497213e-02 1.26726605e+00 1.24187339e+00
 1.14528343e+00 1.24266984e+00 1.18181708e+00 1.32163977e+00
 1.47387335e+00 1.11359009e+00 1.23967824e+00 1.24583059e+00
 1.18624290e+00 1.12291206e+00 1.55974518e+00 1.10465173e+00
 1.15482507e+00 1.10530376e+00 1.45930135e+00 1.18709098e+00
 1.30247383e+00 2.96531746e+00 1.66998298e+00 1.07132490e+00
 3.69356661e+00 1.16843741e+00 1.06358499e+00 1.00644546e+00
 1.00050749e+00]	[7.43148394e-01 4.56372386e+00 1.21610497e+00 9.05692910e-02
 1.52682130e+00 5.79035879e-01 2.05568784e-01 5.92751572e-01
 5.2186

  conditionalProb = np.sum(np.log(self.calculateGaussianProbability(x, i)))


In [596]:
skpreds = gnb.predict(X_test2)

In [597]:
accuracy_score(y_test, preds)

0.8088618592528236

In [598]:
accuracy_score(y_test, skpreds)

0.8349261511728931