In [115]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Project Scope

- We are going to build a spam filter!
- DataSet: SMS Spam Collection Data Set (created by Federal University of Sao Carlos)
- Method: Naive Bayes
- Accuracy goal: >80%

In [116]:
SMSSpamCollection = pd.read_csv('SMSSpamCollection.csv', sep = '\t', header=None, names=['Label', 'SMS'])

In [117]:
SMSSpamCollection.head()

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [118]:
SMSSpamCollection.info

<bound method DataFrame.info of      Label                                                SMS
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...
5     spam  FreeMsg Hey there darling it's been 3 week's n...
6      ham  Even my brother is not like to speak with me. ...
7      ham  As per your request 'Melle Melle (Oru Minnamin...
8     spam  WINNER!! As a valued network customer you have...
9     spam  Had your mobile 11 months or more? U R entitle...
10     ham  I'm gonna be home soon and i don't want to tal...
11    spam  SIX chances to win CASH! From 100 to 20,000 po...
12    spam  URGENT! You have won a 1 week FREE membership ...
13     ham  I've been searching for the right words to tha...
14     ham                I HAVE A DAT

In [119]:
SMSSpamCollection.describe

<bound method NDFrame.describe of      Label                                                SMS
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...
5     spam  FreeMsg Hey there darling it's been 3 week's n...
6      ham  Even my brother is not like to speak with me. ...
7      ham  As per your request 'Melle Melle (Oru Minnamin...
8     spam  WINNER!! As a valued network customer you have...
9     spam  Had your mobile 11 months or more? U R entitle...
10     ham  I'm gonna be home soon and i don't want to tal...
11    spam  SIX chances to win CASH! From 100 to 20,000 po...
12    spam  URGENT! You have won a 1 week FREE membership ...
13     ham  I've been searching for the right words to tha...
14     ham                I HAVE A D

In [120]:
SMSSpamCollection.shape

(5572, 2)

In [121]:
SMSSpamCollection['Label'].value_counts()

ham     4825
spam     747
Name: Label, dtype: int64

In [122]:
percentage_spam = (len(SMSSpamCollection[SMSSpamCollection['Label'] == 'spam']) * 100) /len(SMSSpamCollection['Label']) 
percentage_spam

13.406317300789663

In [123]:
SMSSpamCollection.isna().sum()

Label    0
SMS      0
dtype: int64

# Data Description

- The data set is comprised of 5,572 rows and 2 Columns (The text column and the label column)
- Thankfully all rows have already been labelled (either 'ham' or 'spam')
- 13,5% of the data is spam
- There are no NaN values

# Training and Testing Set

- We are going to split the data into a training and testing set
- The training will contain 80% of the data whilst testing the remainder
- Method: randomise using DataFrame.sample() method
- Method: split the data 80-20%
- 80% of 5572 ~= 4,458

In [124]:
randomised_dataset = SMSSpamCollection.sample(frac=1, random_state = 1)

In [125]:
spam_training = randomised_dataset.iloc[:4458,:].reset_index(drop=True)
len(spam_training)

4458

In [126]:
spam_testing = randomised_dataset.iloc[4458:,:].reset_index(drop=True)
len(spam_testing)

1114

# Check both training and testing sets

- Purpose: make sure both are reprensatitve and evenly split
- Method: compare % of spam in both data-sets

In [127]:
spam_training_spam_perct = (len(spam_training[spam_training['Label'] == 'spam']) * 100) /len(spam_training['Label'])
spam_training_spam_perct

13.458950201884253

In [128]:
spam_testing_spam_perct = (len(spam_testing[spam_testing['Label'] == 'spam']) * 100) /len(spam_testing['Label'])
spam_testing_spam_perct

13.195691202872531

# Both Data Set Representative

- The testing and training datasets show very similar % of spam
- We therefore confirm their relevance and move ahead

# Counting words per mail

- Method: Create 1 column for each unique word, then count # of words per mail
- Method: Each row is 1 email
- Data preparation: remove all special characters
- Data preparation: transform all letters to lower-case

In [129]:
import re

In [130]:
spam_training['SMS'] = spam_training['SMS'].str.replace('\W', ' ').str.lower()

In [131]:
spam_training['SMS'].head()

0                         yep  by the pretty sculpture
1        yes  princess  are you going to make me moan 
2                           welp apparently he retired
3                                              havent 
4    i forgot 2 ask ü all smth   there s a card on ...
Name: SMS, dtype: object

# Getting unique words

- Goal: extract all unique words from column 'SMS'
- Method_step_1: transform each SMS message into a string
- Method_step_2: iterate through the SMS column and append each unique word to an empty list
- Method_step_3: transform the list into a set - this will remove duplicates - then back to a list

In [132]:
spam_training['SMS'] = spam_training['SMS'].str.split()

In [133]:
vocabulary = []

for rows in spam_training['SMS']:
    for row in rows:
        vocabulary.append(row)

vocabulary = list(set(vocabulary))

In [134]:
vocabulary

['neighbors',
 'nat',
 'finds',
 '400',
 'weirdest',
 '150p16',
 'paris',
 'questioned',
 'looked',
 'lubly',
 '08002986906',
 'usmle',
 'contacted',
 'stoners',
 'improve',
 '08712300220',
 'yck',
 'web2mobile',
 'boggy',
 'aiyah',
 'events',
 'child',
 'loosing',
 'navigate',
 'music',
 'dreams',
 'log',
 'b4',
 'monkey',
 'syllabus',
 'forgt',
 'sat',
 'easier',
 'liver',
 'secondary',
 'kerala',
 'othrs',
 '021',
 'film',
 'rudi',
 'sambar',
 '8am',
 'lotto',
 'her',
 'outsider',
 'edwards',
 'bettersn',
 '600',
 'dirtiest',
 'james',
 'spinout',
 'move',
 'moves',
 'sharing',
 '08712400200',
 'plaza',
 'wrk',
 'surgical',
 'amla',
 '09050000301',
 'keep',
 '2years',
 'browse',
 'unable',
 'collection',
 'enjoying',
 'atlast',
 'meaningful',
 'bill',
 '08709222922',
 'tomorrow',
 'ruthful',
 'president',
 'mid',
 'freephone',
 '5pm',
 '08719181259',
 'department',
 'fffff',
 '4get',
 'most',
 'admirer',
 'yor',
 'hols',
 'fight',
 'walsall',
 'ou',
 '02',
 '09099725823',
 'interest

# Creating a dataframe

- Goal: counting each unique word per SMS
- Method: first create a dictionary that from the vocabulary list then transform it into a df

In [135]:
# create a dictionary that has 0s for each row in smap_training['SMS']
word_counts_per_sms = {new_list:[0] * len(spam_training['SMS']) for new_list in vocabulary}

In [136]:
# loop through each SMS, take the index + words and append them to the dictionary

for index, sms in enumerate(spam_training['SMS']):
    for word in sms:
        word_counts_per_sms[word][index] += 1

In [137]:
# Now that we have all the values in the dictionary, let's tranform it into a df

df_sms = pd.DataFrame(word_counts_per_sms)

In [138]:
df_sms.head()

Unnamed: 0,neighbors,nat,finds,400,weirdest,150p16,paris,questioned,looked,lubly,...,15pm,shame,duvet,byatch,bergkamp,goggles,accept,nyc,ny,stifled
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [139]:
# Combine both df horizontally using pd.concat([x,y], axis=1) so that we get the SMS and Label Columns

SMS_data = pd.concat([spam_training, df_sms], axis=1)
SMS_data.head()

Unnamed: 0,Label,SMS,neighbors,nat,finds,400,weirdest,150p16,paris,questioned,...,15pm,shame,duvet,byatch,bergkamp,goggles,accept,nyc,ny,stifled
0,ham,"[yep, by, the, pretty, sculpture]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[yes, princess, are, you, going, to, make, me,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[welp, apparently, he, retired]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,[havent],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[i, forgot, 2, ask, ü, all, smth, there, s, a,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Naive Bayes Formula

- Probability Spam =  P(wi|Spam) = (N[wi|spam] + 1) / (Nspam + 1 * N[vocabulary])
- Probability non-Spam = P(wi|Ham) = N[wi|ham] + 1 / Nham + 1 * N[vocabulary])

# Terms 

- P(Spam) = probability of SMS being a spam
- P(Ham) = probability of SMS not being a spam
- Nspam = number of all words in all spam messages
- Nham = number of all words in all non-spam messages
- wi|Spam = probability of having the word given it's a spam

# Preparing the variables for Naive Bayes

- Before jumping straight into the calculations we need a couple of variables
- Probability that a SMS is Spam: P(Spam)
- Probability that a SMS is not Spam: P(Ham)
- Since we will use Laplace smoothing, we'll set a variable alpha = 1

In [140]:
alpha = 1

In [141]:
spams= SMS_data[SMS_data['Label'] == 'spam']
len_spams = len(spams)

In [142]:
hams = SMS_data[SMS_data['Label'] == 'ham']
len_hams = len(hams)

In [143]:
P_spam = len_spams / (len_spams + len_hams) 
P_spam 

0.13458950201884254

In [144]:
P_ham = len_hams / (len_spams + len_hams) 
P_ham

0.8654104979811574

# Further variables

- Now that we have probabilities we need Nspam, Nham, Nvocabulary

In [145]:
words_in_spam = spams['SMS'].apply(len)
words_in_spam

16      31
18      26
56      24
60      24
61      25
62      32
70      27
71      32
84      28
89      27
98      27
106     29
113     32
142     28
144      6
158     30
159     24
162      8
164     25
165     27
166     29
179     19
181     13
186     21
191     21
200     24
203     27
206     30
218     22
219     24
        ..
4297    25
4298    20
4306    25
4312    13
4318    29
4331    27
4332    24
4350    27
4353    13
4354    29
4357    13
4359    17
4373    24
4377    27
4379    32
4383    33
4387    14
4388    28
4390    30
4392    29
4401    24
4403    25
4407    32
4414    30
4433    36
4437    20
4439    25
4443    25
4449    30
4455    28
Name: SMS, Length: 600, dtype: int64

In [146]:
Nspam = words_in_spam.sum()
Nspam

15190

In [147]:
words_in_ham = hams['SMS'].apply(len)
words_in_ham

0         5
1         9
2         4
3         1
4        26
5        15
6        17
7         6
8         4
9         7
10       10
11        2
12        5
13       16
14       12
15        8
17        9
19        8
20       14
21       11
22       11
23        8
24        7
25       11
26        6
27        4
28       33
29       66
30        6
31       18
       ... 
4422     26
4423      8
4424     10
4425     30
4426     10
4427     28
4428      6
4429      8
4430      6
4431      4
4432     19
4434      9
4435      4
4436      7
4438     11
4440      6
4441      6
4442    100
4444      9
4445     11
4446      5
4447     30
4448      8
4450      4
4451     25
4452      6
4453     17
4454     34
4456     27
4457      4
Name: SMS, Length: 3858, dtype: int64

In [148]:
Nham = words_in_ham.sum()
Nham

57237

# Proceed with calculation of P(wi|Spam) and P(wi|Ham)

- Strength of Bayes is that it calculates all probabilities beforehand making it faster when getting new email
- Therefore we'll calculate each word probability of Spam and Ham (P(wi|Spam) and P(wi|Ham))
- This amounts to twice the total of our vocabulary = 2 * 7,783 words = 15,566

In [149]:
p_wi_spam_dictionary = {each_word:0  for each_word in vocabulary}
p_wi_ham_dictionary = {each_word:0  for each_word in vocabulary}

In [150]:
length_voca = len(vocabulary)

In [151]:
vocabulary

['neighbors',
 'nat',
 'finds',
 '400',
 'weirdest',
 '150p16',
 'paris',
 'questioned',
 'looked',
 'lubly',
 '08002986906',
 'usmle',
 'contacted',
 'stoners',
 'improve',
 '08712300220',
 'yck',
 'web2mobile',
 'boggy',
 'aiyah',
 'events',
 'child',
 'loosing',
 'navigate',
 'music',
 'dreams',
 'log',
 'b4',
 'monkey',
 'syllabus',
 'forgt',
 'sat',
 'easier',
 'liver',
 'secondary',
 'kerala',
 'othrs',
 '021',
 'film',
 'rudi',
 'sambar',
 '8am',
 'lotto',
 'her',
 'outsider',
 'edwards',
 'bettersn',
 '600',
 'dirtiest',
 'james',
 'spinout',
 'move',
 'moves',
 'sharing',
 '08712400200',
 'plaza',
 'wrk',
 'surgical',
 'amla',
 '09050000301',
 'keep',
 '2years',
 'browse',
 'unable',
 'collection',
 'enjoying',
 'atlast',
 'meaningful',
 'bill',
 '08709222922',
 'tomorrow',
 'ruthful',
 'president',
 'mid',
 'freephone',
 '5pm',
 '08719181259',
 'department',
 'fffff',
 '4get',
 'most',
 'admirer',
 'yor',
 'hols',
 'fight',
 'walsall',
 'ou',
 '02',
 '09099725823',
 'interest

In [153]:
for words in vocabulary:
    number_words_spam = spams[words].sum()
    number_words_ham = hams[words].sum()
    p_wi_spam = (number_words_spam + alpha) / (Nspam + alpha * length_voca)
    p_wi_ham = (number_words_ham + alpha) / (Nham + alpha * length_voca)
    p_wi_spam_dictionary[words] = p_wi_spam
    p_wi_ham_dictionary[words] = p_wi_ham

# Creating the Spam filter

- Now that we have trained the model, we can now put it in use
- We need to create a function that takes in new SMS and classify them


In [154]:
import re

In [161]:
def classify(message):

    message = re.sub('\W', ' ', message)
    message = message.lower()
    message = message.split()
        
    p_spam_given_message = P_spam
    p_ham_given_message = P_ham

    for word in message:
        if word in p_wi_spam_dictionary:
            p_spam_given_message *= p_wi_spam_dictionary[word]
            
        if word in p_wi_ham_dictionary:
            p_ham_given_message *= p_wi_ham_dictionary[word] 

    print('P(Spam|message):', p_spam_given_message)
    print('P(Ham|message):', p_ham_given_message)

    if p_ham_given_message > p_spam_given_message:
        print('Label: Ham')
    elif p_ham_given_message < p_spam_given_message:
        print('Label: Spam')
    else:
        print('Equal proabilities, have a human classify this!')

# Testing the function

In [162]:
classify('WINNER!! This is the secret code to unlock the money: C3421.')

P(Spam|message): 1.3481290211300841e-25
P(Ham|message): 1.9368049028589875e-27
Label: Spam


In [163]:
classify("Sounds good, Tom, then see u there")

P(Spam|message): 2.4372375665888117e-25
P(Ham|message): 3.687530435009238e-21
Label: Ham


# Applying the function on the test data-set

In [170]:
# Let's first convert the function with a return statement instead of print 
# This way we can .apply it on a column



def classify_2(message):

    message = re.sub('\W', ' ', message)
    message = message.lower()
    message = message.split()
        
    p_spam_given_message = P_spam
    p_ham_given_message = P_ham

    for word in message:
        if word in p_wi_spam_dictionary:
            p_spam_given_message *= p_wi_spam_dictionary[word]
            
        if word in p_wi_ham_dictionary:
            p_ham_given_message *= p_wi_ham_dictionary[word] 

    if p_ham_given_message > p_spam_given_message:
        return 'ham'
    elif p_ham_given_message < p_spam_given_message:
        return 'spam'
    else:
        return 'Needs further checking'

In [171]:
# Using the function on the test data
# We update an additional column to then compare both the function and original value

spam_testing['predicted']  = spam_testing['SMS'].apply(classify_2)
spam_testing.head()

Unnamed: 0,Label,SMS,predicted
0,ham,Later i guess. I needa do mcat study too.,ham
1,ham,But i haf enuff space got like 4 mb...,ham
2,spam,Had your mobile 10 mths? Update to latest Oran...,spam
3,ham,All sounds good. Fingers . Makes it difficult ...,ham
4,ham,"All done, all handed in. Don't know if mega sh...",ham


In [193]:
comparison = spam_testing[spam_testing.Label == spam_testing.predicted]
print(comparison)

     Label                                                SMS predicted
0      ham          Later i guess. I needa do mcat study too.       ham
1      ham             But i haf enuff space got like 4 mb...       ham
2     spam  Had your mobile 10 mths? Update to latest Oran...      spam
3      ham  All sounds good. Fingers . Makes it difficult ...       ham
4      ham  All done, all handed in. Don't know if mega sh...       ham
5      ham  But my family not responding for anything. Now...       ham
6      ham                                           U too...       ham
7      ham  Boo what time u get out? U were supposed to ta...       ham
8      ham  Genius what's up. How your brother. Pls send h...       ham
9      ham                             I liked the new mobile       ham
10     ham                          For my family happiness..       ham
11     ham  If i let you do this, i want you in the house ...       ham
12     ham  Do you know why god created gap between your f...   

In [199]:
length = len(spam_testing)
comparison_length = len(comparison)

In [201]:
accuracy = comparison_length / length
accuracy

0.9874326750448833

# The Spam Filter scores a ~99% accuracy

- The Naive Bayes Algorithm has proved succesful in correctly scoring most of the Spams

# Implementation in real-life and limitations

- The data was based on one person texting, therefore the algorithm is limited to the person's writing style and people/ institutions he/she communicated with

- Practically, to implement this in real life we would give the Algorithm more data to ensure that this encompasses more inputs that are relevant to an overall population vs one individual