Project: Building a Spam Filter with Naive Bayes

In [1]:
import pandas as pd
import numpy as np

In [2]:
msgs = pd.read_csv('SMSSpamCollection',sep='\t',header=None,names=['Label', 'SMS'])

In [3]:
msgs.head()

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


ham means non-spam

In [4]:
msgs['Label'].value_counts(normalize=True)*100

ham     86.593683
spam    13.406317
Name: Label, dtype: float64

there are fewer spams. 13.4% to be exact.
There are roughly 87% non-spams.

In [5]:
msgs.shape

(5572, 2)

there are 5572 rows/messages and 2 columns.

In [6]:
import math
eighty = math.floor(5572*0.8)
twenty = 5572 - eighty
randomized = msgs.sample(random_state=1,frac=1)

training_set = randomized.sample(eighty, random_state=1)
training_set.reset_index(inplace=True)
training_set.drop('index',axis=1,inplace=True)
test_set = randomized.sample(twenty,random_state=1)
test_set.reset_index(inplace=True)
test_set.drop('index',axis=1,inplace=True)

randomized the data set and split them into two different sets; training and test sets.

In [7]:
training_set['Label'].value_counts(normalize=True)*100

ham     86.67265
spam    13.32735
Name: Label, dtype: float64

In [8]:
test_set['Label'].value_counts(normalize=True)*100

ham     86.188341
spam    13.811659
Name: Label, dtype: float64

The percentages of spam and ham(non-spam) in both sets; test and training are similar.

Transforming the sms to a suitable format.

In [9]:
import re
test_set['SMS'] = test_set['SMS'].apply(lambda a: re.sub('\W',' ',a))
training_set['SMS'] = training_set['SMS'].apply(lambda a: re.sub('\W',' ',a))

In [10]:
test_set['SMS'].head()

0         Good night my dear   Sleepwell amp Take care
1    Sen told that he is going to join his uncle fi...
2    Thank you baby  I cant wait to taste the real ...
3                                 When can ü come out 
4                 No  Thank you  You ve been wonderful
Name: SMS, dtype: object

In [11]:
test_set['SMS'] = test_set['SMS'].str.lower()
training_set['SMS'] = training_set['SMS'].str.lower()

There some double spaces that need to be removed from the SMS column.

In [12]:
test_set['SMS'] = test_set['SMS'].apply(lambda a: re.sub('\s+',' ',a))

In [13]:
training_set['SMS'] = training_set['SMS'].apply(lambda a: re.sub('\s+',' ',a))

Transforming the SMS column to represent the number of times a particular word appears.

In [14]:
vocabulary = []
test = list(test_set['SMS'])
training = list(training_set['SMS'])
test.extend(training)

In [15]:
test[0:5]

['good night my dear sleepwell amp take care',
 'sen told that he is going to join his uncle finance in cbe',
 'thank you baby i cant wait to taste the real thing ',
 'when can ü come out ',
 'no thank you you ve been wonderful']

In [16]:
len(test)

5572

In [17]:
vocab = []
for i in test:
    x= i.split(' ')
    for t in x:
        vocab.append(t)

In [18]:
vocabulary = set(vocab)
vocabulary.discard('')#removed empty string from the set.
vocabulary = list(vocabulary)

In [19]:
vocabulary

['losing',
 'east',
 'projects',
 'wun',
 'outreach',
 'carlos',
 'grow',
 'sozi',
 'ac',
 'notifications',
 'weakness',
 'effect',
 'upgrading',
 '4years',
 'win150ppmx3age16',
 'tunji',
 'self',
 'tortilla',
 '1thing',
 'including',
 'receipt',
 'num',
 'nagar',
 'mumhas',
 'challenge',
 'interested',
 'saeed',
 'thou',
 'pai',
 'wizzle',
 'lots',
 'afghanistan',
 'hairdressers',
 'yep',
 'drinking',
 'print',
 'viva',
 'ystrday',
 'in',
 'instructions',
 'stress',
 '50award',
 'snowball',
 'deviousbitch',
 'sony',
 'wisheds',
 'howu',
 '8p',
 '13',
 'rest',
 'hex',
 'oblivious',
 'torrents',
 'colleg',
 'kr',
 '09066364589',
 'between',
 'gin',
 'wife',
 'mushy',
 'conference',
 'rental',
 'amanda',
 'normally',
 'gaps',
 'warwick',
 'possessiveness',
 'challenging',
 '150',
 'downs',
 'uworld',
 'yifeng',
 'deny',
 'dialogue',
 'jas',
 'increments',
 'suply',
 'department',
 'relation',
 'nationwide',
 '08448350055',
 'conversations',
 'taylor',
 'prepayment',
 '700',
 'whether',
 

Converted to set to remove duplicates.

In [20]:
word_counts_per_sms = {unique_word: [0] * len(training_set['SMS']) for unique_word in vocabulary}

In [21]:
for index, sms in enumerate(training_set['SMS']):
    for word in sms:
        if word not in word_counts_per_sms:
            continue
        word_counts_per_sms[word][index] += 1

In [22]:
df = pd.DataFrame(word_counts_per_sms)
training_set_2 = pd.concat([training_set,df],axis=1)

In [23]:
spam_num = sum(training_set_2['Label']=='spam')
non_spam_num = sum(training_set_2['Label']=='ham')

In [24]:
spam_num

594

In [25]:
non_spam_num

3863

In [26]:
len(vocabulary)

7711

Firstly i need to calculate the probabilty of having a spam and non-spam message respectively.

In [27]:
tot_messages = training_set_2.shape[0]
tot_messages

4457

In [None]:
p_spam = spam_num/tot_messages

p_non_spam = non_spam_num/tot_messages

In [None]:
spam = {}
non_spam = {}
words = training_set_2.columns[2:]
for col in words:
    sum_spam = training_set_2[training_set_2['Label']=='spam'][col].sum()
    sum_non_spam = training_set_2[training_set_2['Label']=='non_spam'][col].sum()
    spam[col] = sum_spam
    non_spam[col] = sum_non_spam
    

In [None]:
# Create dictionaries to store word counts for spam and non-spam messages
spam_word_counts = {word: 0 for word in vocabulary}
non_spam_word_counts = {word: 0 for word in vocabulary}
n_spam = sum(spam.values())
n_non_spam = sum(non_spam.values())

# Iterate through each word in the vocabulary
for word in vocabulary:
    # Calculate Nwi|Spam: Number of times word appears in spam messages
    spam_word_counts[word] = sum(training_set_2[training_set_2['Label'] == 'spam'][word])
    
    # Calculate Nwi|Non-Spam: Number of times word appears in non-spam messages
    non_spam_word_counts[word] = sum(training_set_2[training_set_2['Label'] == 'non_spam'][word])

# Calculate P(spam|words) using the Naive Bayes formula
# P(spam|words) = P(spam) * Π[P(wi|spam)] / [P(spam) * Π[P(wi|spam)] + P(non_spam) * Π[P(wi|non_spam)]]

# Calculate the probability of spam and non-spam messages
p_spam = spam_num / tot_messages
p_non_spam = non_spam_num / tot_messages

# Initialize the numerator and denominator
numerator = p_spam
denominator = p_spam + p_non_spam

numerator2 = p_non_spam
denominator2 = p_spam + p_non_spam
# Iterate through each word in the message
for index, words in enumerate(training_set['SMS']):
    # Calculate P(wi|spam) and P(wi|non_spam)
      for word in words:
        p_word_given_spam = (spam_word_counts[word] + 1) / (n_spam + len(vocabulary))
        p_word_given_non_spam = (non_spam_word_counts[word] + 1) / (n_non_spam + len(vocabulary))
    
    # Update the numerator and denominator
        numerator *= p_word_given_spam
        denominator *= (p_word_given_spam + p_word_given_non_spam)
    
        numerator2 *= p_word_given_non_spam
        denominator2 *= p_word_given_non_spam + p_word_given_spam
# Calculate the final probability
p_spam_given_words = numerator / denominator
p_non_spam_given_words = numerator2 / denominator2

In [None]:

def classify(message):

    message = re.sub('\W', ' ', message)
    message = message.lower()
    message = message.split()
    p_spam_given_message = p_spam
    p_ham_given_message = p_ham
    
    for word in message:
        if word in vocabulary:
            p_spam_given_message*= spam_word_counts[word]
            p_ham_given_message*= non_spam_word_counts[word]
    '''    
    This is where we calculate:

    p_spam_given_message = ?
    p_ham_given_message = ?
    '''    

    print('P(Spam|message):', p_spam_given_message)
    print('P(Ham|message):', p_ham_given_message)

    if p_ham_given_message > p_spam_given_message:
        print('Label: Ham')
    elif p_ham_given_message < p_spam_given_message:
        print('Label: Spam')
    else:
        print('Equal proabilities, have a human classify this!')