In [18]:
# Antonio Emanuele Cinà
# Simple spam filter using Naive Bayes classifier
import numpy as np
from sklearn.model_selection import cross_val_score

TRAINING_SET = "spambase/spambase.data"

def load_csv(filename):
    fread = open(filename, "r")
    data = np.loadtxt(fread, delimiter=",")
    return data

email = load_csv(TRAINING_SET)

In [19]:
from sklearn.base import BaseEstimator
class NaiveBayesClassifier(BaseEstimator):

    def score(self, X, Y):
        p_x_spam_i = (2*np.pi*self.var_spam)**(-1./2) * np.exp(-1./(2*self.var_spam)*np.power(X-self.mu_spam,2))
        p_x_ham_i = (2*np.pi*self.var_ham)**(-1./2) * np.exp(-1./(2*self.var_ham)*np.power(X-self.mu_ham,2))
        
        p_x_spam = np.prod(p_x_spam_i, axis= 1)
        p_x_ham = np.prod(p_x_ham_i, axis= 1)
        
        #p_x = (p_x_spam * self.p_spam + p_x_ham * self.p_ham)# + 1e-130
        p_spam_x = p_x_spam * self.p_spam#/p_x
        p_ham_x = p_x_ham * self.p_ham#/p_x
                           
        predicted_labels = np.argmax([p_ham_x,p_spam_x], axis = 0)
        return np.mean(predicted_labels == Y)

    def fit(self, X, Y, **kwargs):
        self.spam = X[Y == 1,:54]
        self.ham = X[Y == 0,:54]
        
        self.N = float(self.spam.shape[0] + self.ham.shape[0])
        self.k_spam = self.spam.shape[0] # frequency of spam
        self.k_ham = self.ham.shape[0] # frequency of ham

        self.p_spam = self.k_spam/self.N
        self.p_ham = self.k_ham/self.N
        
        self.mu_spam = np.mean(self.spam, axis=0)
        self.mu_ham = np.mean(self.ham, axis=0)
        
        # Avoid division by zero adding a small costant
        self.var_spam = np.var(self.spam, axis=0)+1e-128
        self.var_ham = np.var(self.ham, axis=0)+1e-128

In [20]:
np.random.shuffle(email)# shuffle dataset
Y = email[:,57] 
X = email[:,:54]

In [21]:
scores = cross_val_score(NaiveBayesClassifier(), X, Y, cv = 10)

In [22]:
print("Min Accuracy: " + str(scores.min())+"\n")
print("Mean Accuracy: " + str(scores.mean())+"\n")
print("Max Accuracy: " + str(scores.max())+"\n")
print("Variance/Std Accuracy: " + str(scores.var()) +" / " +str(scores.std())+"\n")

print("=================================")

Min Accuracy: 0.7717391304347826

Mean Accuracy: 0.8056898990851644

Max Accuracy: 0.8347826086956521

Variance/Std Accuracy: 0.0003930480038227552 / 0.019825438300899056



In [23]:
# Apply 10-Way Cross validation 'run' times and get all the scores 
def eval_model(data, classifier, run = 10):
    scores = np.array([])
    for i in range(run):
        np.random.shuffle(data)
        Y = email[:,57] 
        X = email[:,:54]
        scores = np.append(scores,cross_val_score(classifier, X, Y, cv = 10))
    return scores

In [29]:
scores_run = eval_model(email, NaiveBayesClassifier(),run = 20)
print("Min Accuracy: " + str(scores_run.min())+"\n")
print("Mean Accuracy: " + str(scores_run.mean())+"\n")
print("Max Accuracy: " + str(scores_run.max())+"\n")
print("Variance/Std Accuracy: " + str(scores_run.var()) +" / " +str(scores_run.std())+"\n")
print("=================================")

Min Accuracy: 0.7086956521739131

Mean Accuracy: 0.8039997170612091

Max Accuracy: 0.8652173913043478

Variance/Std Accuracy: 0.0008362055152846054 / 0.028917218318583227



In [None]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)

clf = NaiveBayesClassificator()
md = clf.fit(x_train,y_train)

print("Accuracy: "+str(clf.score(x_test, y_test)))

In [None]:
print("p(spam): "+str(clf.p_spam))
print("p(ham): "+str(clf.p_ham))

In [None]:
print("mu spam: "+str(np.round(clf.mu_spam,3)))
print("var spam: "+str(np.round(clf.var_spam,3)))

In [None]:
print("mu spam: "+str(np.round(clf.mu_ham,3)))
print("var spam: "+str(np.round(clf.var_ham,3)))