In [19]:
import pandas as pd
import string

In [20]:
class Naive_Bayes:

    def __init__(self):
        self.spam, self.ham = {}, {}
        self.spam_c, self.ham_c = 0, 0
        self.V = set()

    def preprocess(self, s):
        s = s.lower() # Make all into lower case
        tokens = s.split()
        only_letters = lambda w: all([c in string.ascii_letters for c in w])
        words = list(filter(only_letters, tokens))
        return words
    

In [21]:
def train(model, X, Y):
    for x,y in zip(X,Y):
        x = model.preprocess(x)
        if y == 0:
            model.ham_c += 1
            for word in x:
                model.ham[word] = model.ham.get(word, 0) + 1
                model.V.add(word)
        if y == 1:
            model.spam_c += 1
            for word in x:
                model.spam[word] = model.spam.get(word, 0) + 1
                model.V.add(word)
    return model

In [22]:
def test(model, X):
    Y, V, spam, ham, spam_c, ham_c = [], model.V, model.spam, model.ham, model.spam_c, model.ham_c
    for x in X:
        x = model.preprocess(x)
        p = spam_c/(spam_c + ham_c)
        for word in x:
            p *= (spam.get(word, 0) + 1)/(spam_c )
        q = ham_c/(spam_c + ham_c)
        for word in x:
            q *= (ham.get(word, 0) + 1)/(ham_c )
        Y.append(1 if p >= q else 0)
    return Y

In [23]:
def load_data():
    df = pd.read_csv('spam_ham.csv', encoding = 'latin-1')
    df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, inplace=True)
    # print(df.head())
    df.drop_duplicates(inplace=True)
    df.rename(columns = {'v1': 'label', 'v2': 'message'}, inplace=True)
    df['label'] = df['label'].map({'ham': 0, 'spam': 1})
    return df

In [24]:
def train_test_split(df, split):
    df = df.sample(frac = 1)
    df = df.reset_index(drop = True)
    index = round(split* len(df))
    train_x, train_y = df.loc[:index, 'message'].values, df.loc[:index, 'label'].values
    test_x, test_y = df.loc[index:, 'message'].values, df.loc[index:, 'label'].values
    return train_x, train_y, test_x, test_y

In [25]:
def confusion_matrix(predictions, actual):
    tp, tn, fp, fn = 0, 0, 0, 0
    for i in range(len(actual)):
        tp += int(actual[i] == 1 and predictions[i] == 1)
        tn += int(actual[i] == 0 and predictions[i] == 0)
        fp += int(actual[i] == 0 and predictions[i] == 1)
        fn += int(actual[i] == 1 and predictions[i] == 0)

    print('   Actual Values')
    l = [['', 'P', 'N'], ['P', tp, fp], ['N', fn, tn]]
    print('\n'.join(['\t'.join([str(cell) for cell in row]) for row in l]))

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    fScore = 2 * precision * recall / (precision + recall)
    accuracy = (tp + tn) / (tp + tn + fp + fn)

    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F-score: ", fScore)
    print("Accuracy: ", accuracy)

    

In [26]:
def main():
    df = load_data()
    # print(df.head())
    split = 0.75
    train_x, train_y, test_x, test_y = train_test_split(df, split)
    print(len(train_y), len(test_y))
    model = Naive_Bayes()
    model = train(model, train_x, train_y)
    yy = test(model, test_x)
    count_wrong = lambda test_y,yy: sum([a != b for a,b in zip(test_y, yy)])
    num_errs = count_wrong(yy, test_y)
    # total = spam_count + ham_count
    total = len(test_y)
    print('Accuracy {:.4f}, Errors: {} out of {}'.format(1 - num_errs/total, num_errs, total))
    confusion_matrix(yy, test_y)
    # print(model.test(['Congratulations ur awarded $500 ']))
    # print(model.test(['Hello']))

In [27]:
if __name__ == '__main__':
    main()

3878 1292
Accuracy 0.7577, Errors: 313 out of 1292
   Actual Values
	P	N
P	150	309
N	4	829
Precision:  0.32679738562091504
Recall:  0.974025974025974
F-score:  0.4893964110929853
Accuracy:  0.7577399380804953
