In [7]:
import pandas as pd
import string

In [8]:
class Naive_Bayes:
    
    def __init__(self):
        self.spam, self.ham = {}, {}
        self.spam_count, self.ham_count = 0, 0
        self.V = set()
                
    def tokenize(self, s):
        s = s.lower()
        tokens = s.split()
        only_letters = lambda w: all([c in string.ascii_letters for c in w])
        words = list(filter(only_letters, tokens))
        return words

    def train(self, X, Y):
        for x,y in zip(X,Y):
            x = self.tokenize(x)
            if y == 0:
                self.ham_count += 1
                for word in x:
                    self.ham[word] = self.ham.get(word, 0) + 1
                    self.V.add(word)
            if y == 1:
                self.spam_count += 1
                for word in x:
                    self.spam[word] = self.spam.get(word, 0) + 1
                    self.V.add(word)

    def test(self, X):
        Y, V, spam, ham, spam_count, ham_count = [], self.V, self.spam, self.ham, self.spam_count, self.ham_count
        for x in X:
            x = self.tokenize(x)
            p = spam_count/(spam_count + ham_count)
            for word in x:
                p *= (spam.get(word, 0) + 1)/(spam_count + len(V))
            q = ham_count/(spam_count + ham_count)
            for word in x:
                q *= (ham.get(word, 0) + 1)/(ham_count + len(V))
            Y.append(1 if p >= q else 0)
        return Y

In [9]:
def train_test_split(df, split):
    df = df.sample(frac=1)
    df = df.reset_index(drop=True)
    idx = round(split*len(df))
    train_x, train_y = df.loc[:idx,'message'].values, df.loc[:idx,'label'].values
    test_x, test_y = df.loc[idx:,'message'].values, df.loc[idx:,'label'].values
    return train_x, train_y, test_x, test_y

In [10]:
df = pd.read_csv('spam.csv', encoding = 'latin-1')
df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, inplace=True)
df.drop_duplicates(inplace=True)
df.rename(columns = {'v1': 'label', 'v2': 'message'}, inplace=True)
df['label'] = df['label'].map({'ham': 0, 'spam': 1})
df.head()

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [12]:
split = .7
train_x, train_y, test_x, test_y = train_test_split(df, split)
if split == 1:
    test_x, test_y = train_x, train_y
print(len(train_y), len(test_y))
model = Naive_Bayes()
model.train(train_x, train_y)
yy = model.test(test_x)
count_wrong = lambda test_y,yy: sum([a != b for a,b in zip(test_y, yy)])
num_errs = count_wrong(yy, test_y)
# total = spam_count + ham_count
total = len(test_y)
print('Accuracy {:.4f}, Errors: {}'.format(1 - num_errs/total, num_errs))
model.test(['Congratulations ur awarded $500 '])

3619 1551
Accuracy 0.9458, Errors: 84


[1]