First we need to import Numpy and CSV so we can parse the spam test data

In [112]:
import csv
import re
from math import floor

Now lets open the spam.csv and read the data

First lets create a list to store the 2 categories of messages

0 <- ham/not spam

1 <- spam messages

In [113]:
messages = [[], []]

In [114]:
with open('spam.csv', newline='') as csvfile:
    reader = csv.reader(csvfile)

    header = next(reader)
    
    for lines in reader:
        # ham (means regular message)
        if lines[0] == 'ham':
            messages[0].append(lines[1])
            
        elif lines[0] == 'spam':
            messages[1].append(lines[1])

Now we will take a random 70/30 split from each category for train and test

In [115]:
# training split params
train = 0.7
total_good = len(messages[0])
total_spam = len(messages[1])

# Split for normal messages
x_train_good = floor(train * total_good)
x_test_good = total_good - x_train_good

# Split for spam messages
x_train_spam = floor(train * total_spam)
x_test_spam = total_spam - x_train_spam

print(x_train_good, x_test_good, x_train_spam, x_test_spam)


3377 1448 522 225


Now we will split into a train and test dataset

can also shuffle the list to randomize order if needed but we will just pick the first x amount

In [116]:
# Mark if a word appears in a spam message
def parse_message_into_vectors(message):
    # first let's lower the message
    lowered_message = message.lower()
    
    # Now lets remove symbols
    # alphabet_numbers = re.sub(r'[^A-Za-z0-9]', '', lowered_message)
    
    # Now Numbers
    alphabet = re.sub(r'[^A-Za-z ]', '', lowered_message)
    
    # Split into a list so we can iterate through
    with_spaces = alphabet.split(" ")
    cleaned = [w for w in with_spaces if w.strip() != '']
    
    
    return with_spaces

Next we need to go through the train data for good and spam messages and get the totals for each word

first we keep a counter of total words

so first we loop through all 

In [117]:
# Compute a word count for each message and combine them
def create_word_count(messages):
    
    total_msg_counts = len(messages)
    
    returned_dict = {}
    
    for i in messages:
        out = parse_message_into_vectors(i)
        
        # Append the keys to the dictionary
        for d in out:
            if d not in returned_dict:
                returned_dict[d] = 1
            else:
                returned_dict[d] += 1
                
    # compute the probability that a word appears in spam messages
    for i in returned_dict.keys():
        returned_dict[i] = returned_dict[i]/total_msg_counts
                
    return returned_dict
    
    

In [118]:
# Create the train Jsons
x_train_good_word_counts = create_word_count(messages[0][:x_train_good])
x_train_spam_word_counts = create_word_count(messages[1][:x_train_spam])

for i in x_train_good_word_counts.keys():
    if i not in x_train_spam_word_counts:
        x_train_spam_word_counts[i] = 0

for i in x_train_spam_word_counts.keys():
    if i not in x_train_good_word_counts:
        x_train_good_word_counts[i] = 0

#print(x_train_good_word_counts)
# print(x_train_spam_word_counts)


Now we need to make a method that can use Bayes Theorem to calculate given a sequence of words, good words counts, spam words counts

In [119]:
# Bayes Theorem given a list of words, the spam word probabilities and good word probabilities
def computeProbabilityOfSpam(listOfWords, spamWords, goodWords):
    top = 1
    bottomLeft = 1
    bottomRight = 1
    
    for i in listOfWords:
        if i in spamWords and i in goodWords:
            top*= spamWords[i]
            bottomLeft *= spamWords[i]
            bottomRight *= goodWords[i]
            
    if (bottomLeft + bottomRight) == 0:
        return 0
    
    return top/(bottomLeft+bottomRight)


In [120]:
# Now we need to compute the probabilities for messages that count as spam
probability_scores = []

for i in messages[1][:x_train_spam]:
    formatted_msg = parse_message_into_vectors(i)
    
    print("Message: ", formatted_msg)
    
    
    prob = computeProbabilityOfSpam(formatted_msg, x_train_spam_word_counts, x_train_good_word_counts)
    print("Probability of Spam: ", prob)
    
    probability_scores.append(prob)

avg = 0
for i in probability_scores:
    avg += i
avg /= len(probability_scores)

print("Average Probability of Spam: ", avg)
    

Message:  ['free', 'entry', 'in', '', 'a', 'wkly', 'comp', 'to', 'win', 'fa', 'cup', 'final', 'tkts', 'st', 'may', '', 'text', 'fa', 'to', '', 'to', 'receive', 'entry', 'questionstd', 'txt', 'ratetcs', 'apply', 'overs']
Probability of Spam:  1.0
Message:  ['freemsg', 'hey', 'there', 'darling', 'its', 'been', '', 'weeks', 'now', 'and', 'no', 'word', 'back', 'id', 'like', 'some', 'fun', 'you', 'up', 'for', 'it', 'still', 'tb', 'ok', 'xxx', 'std', 'chgs', 'to', 'send', '', 'to', 'rcv']
Probability of Spam:  1.0
Message:  ['winner', 'as', 'a', 'valued', 'network', 'customer', 'you', 'have', 'been', 'selected', 'to', 'receivea', '', 'prize', 'reward', 'to', 'claim', 'call', '', 'claim', 'code', 'kl', 'valid', '', 'hours', 'only']
Probability of Spam:  1.0
Message:  ['had', 'your', 'mobile', '', 'months', 'or', 'more', 'u', 'r', 'entitled', 'to', 'update', 'to', 'the', 'latest', 'colour', 'mobiles', 'with', 'camera', 'for', 'free', 'call', 'the', 'mobile', 'update', 'co', 'free', 'on', '']
P

Validate The Model We Created On The Test Data

In [121]:
cutoff = avg

correct = 0

# Test Good Messages
for i in messages[0][x_train_good:]:
    formatted_msg = parse_message_into_vectors(i)
    
    # print("Message: ", formatted_msg)
    prob = computeProbabilityOfSpam(formatted_msg, x_train_spam_word_counts, x_train_good_word_counts)
    
    if prob < cutoff:
        correct += 1

# Test Spam Messages  
for i in messages[1][x_train_spam:]:
    formatted_msg = parse_message_into_vectors(i)
    
    # print("Message: ", formatted_msg)
    prob = computeProbabilityOfSpam(formatted_msg, x_train_spam_word_counts, x_train_good_word_counts)
    
    if prob >= cutoff:
        correct += 1
        
print("Correct: ", correct)
print("Accuracy: ", correct/(x_test_good + x_test_spam))

Correct:  1598
Accuracy:  0.9551703526598924
