# Building a Spam Filter with Naive Bayes
The aim is to build a spam filter to classify SMS messages as either SPAM or otherwise. 
We will using the multinominal Naive Bayes algorithm to do this.


In [1]:
import pandas as pd
import numpy as np
message_dataset = pd.read_csv("SMSSpamCollection", sep ='\t', header = None, names=['Label','SMS'])
message_dataset.head()

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [2]:
message_dataset["Label"].value_counts(normalize = True)*100

ham     86.593683
spam    13.406317
Name: Label, dtype: float64

Non spam messages are the majority approximately 87% and around 13% of the messages are considered spam. 

Now let us split the database into 2,
1. Training dataset
2. Test dataset 

We will use the training dataset to train the algorithm to test it on the test dataset. 

80% of the data will go to the training dataset and the remaining 20% in the test dataset

In [3]:
index = round(message_dataset.shape[0]*0.8)
message_randomized = message_dataset.sample(frac=1, random_state=1)
training_set = message_randomized[0:index]
training_set = training_set.reset_index(drop = True)
test_set = message_randomized[index:]
test_set = test_set.reset_index(drop = True)

In [4]:
training_set["Label"].value_counts(normalize = True)*100
# training_set.head()

ham     86.54105
spam    13.45895
Name: Label, dtype: float64

In [5]:
test_set["Label"].value_counts(normalize = True)*100
# test_set.head()

ham     86.804309
spam    13.195691
Name: Label, dtype: float64

In [6]:
training_set['SMS'] = training_set['SMS'].str.replace('\W', ' ').str.lower()
training_set.head()

Unnamed: 0,Label,SMS
0,ham,yep by the pretty sculpture
1,ham,yes princess are you going to make me moan
2,ham,welp apparently he retired
3,ham,havent
4,ham,i forgot 2 ask ü all smth there s a card on ...


In [7]:
training_set['SMS'] = training_set["SMS"].str.split()
vocabulary = [ ]
for i in training_set['SMS']:
    for j in i:
        if j not in vocabulary:
            vocabulary.append(j)
print(len(vocabulary))

7783


In [8]:
word_counts_per_sms = {unique_word: [0] * len(training_set['SMS']) for unique_word in vocabulary}

for index, sms in enumerate(training_set['SMS']):
    for word in sms:
        word_counts_per_sms[word][index] += 1
word_counts = pd.DataFrame(word_counts_per_sms)
word_counts.head()

Unnamed: 0,0,00,000,000pes,008704050406,0089,01223585334,02,0207,02072069400,...,zindgi,zoe,zogtorius,zouk,zyada,é,ú1,ü,〨ud,鈥
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,0


In [9]:
clean_training_set = pd.concat([training_set, word_counts], axis=1)
clean_training_set.head()

Unnamed: 0,Label,SMS,0,00,000,000pes,008704050406,0089,01223585334,02,...,zindgi,zoe,zogtorius,zouk,zyada,é,ú1,ü,〨ud,鈥
0,ham,"[yep, by, the, pretty, sculpture]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[yes, princess, are, you, going, to, make, me,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[welp, apparently, he, retired]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,[havent],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[i, forgot, 2, ask, ü, all, smth, there, s, a,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,0


In [10]:
p_spam = clean_training_set[clean_training_set["Label"]=="spam"].shape[0]/len(clean_training_set["Label"])
p_ham = clean_training_set[clean_training_set["Label"]=="ham"].shape[0]/len(clean_training_set["Label"])
print("Probability SPAM",p_spam)
print("Probability not SPAM",p_ham)

Probability SPAM 0.13458950201884254
Probability not SPAM 0.8654104979811574


In [11]:
# add = 0
# for i in clean_training_set[clean_training_set["Label"]=="spam"]["SMS"]:
#     add+=len(i)
# print(add)
def summation_words(ser):
    add = 0
    for i in ser:
        add+=len(i)
    return add
n_spam = summation_words(clean_training_set[clean_training_set["Label"]=="spam"]["SMS"])
n_ham = summation_words(clean_training_set[clean_training_set["Label"]=="ham"]["SMS"])
print("Number of words in all SPAM messages",n_spam)
print("Number of words in all SPAM messages",n_ham)
print("Number of words in the Vocabulary", len(vocabulary))
alpha = 1

Number of words in all SPAM messages 15190
Number of words in all SPAM messages 57237
Number of words in the Vocabulary 7783


In [12]:
spam_messages = clean_training_set[clean_training_set["Label"]=="spam"]
ham_messages = clean_training_set[clean_training_set["Label"]=="ham"]
parameters_spam = {}
parameters_ham = { }
for word in vocabulary:
    n_word_given_spam = spam_messages[word].sum()   
    p_word_given_spam = (n_word_given_spam + alpha) / (n_spam + alpha*len(vocabulary))
    parameters_spam[word] = p_word_given_spam
    
    n_word_given_ham = ham_messages[word].sum()   
    p_word_given_ham = (n_word_given_ham + alpha) / (n_ham + alpha*len(vocabulary))
    parameters_ham[word] = p_word_given_ham

In [13]:
import re

def classify(message):

    message = re.sub('\W', ' ', message)
    message = message.lower()
    message = message.split()

    p_spam_given_message = p_spam
    p_ham_given_message = p_ham

    for word in message:
        if word in parameters_spam:
            p_spam_given_message *= parameters_spam[word]
            
        if word in parameters_ham:
            p_ham_given_message *= parameters_ham[word]
            
#     print('P(Spam|message):', p_spam_given_message)
#     print('P(Ham|message):', p_ham_given_message)
    if p_ham_given_message > p_spam_given_message:
        return 'ham'
    elif p_ham_given_message < p_spam_given_message:
        return 'spam'
    else:
        return 'Equal proabilities, have a human classify this!'

In [14]:
classify("WINNER!! This is the secret code to unlock the money: C3421.")

'spam'

In [15]:
classify("Sounds good, Tom, then see u there")

'ham'

Classify is a function we use to classify whether a given message is SPAM or otherwise. 

We have tested our algorithm on 2 example messages above:
1. WINNER!! This is the secret code to unlock the money: C3421. -- **SPAM**
2. Sounds good, Tom, then see u there -- ** Not SPAM**




In [16]:
test_set["Predicted"] = test_set["SMS"].apply(classify)
test_set[["Predicted","Label"]]
test_set[test_set["Predicted"] == test_set["Label"]].shape[0]/test_set.shape[0]*100

98.74326750448833

Accuracy of our classification is 98.74%, so we can say our algorithm is pretty accurate. We have 14 messages from the test algorithm where we had a dispute in classifying the message. 

Let us take a deeper look at them. 

In [17]:
test_set[test_set["Predicted"] != test_set["Label"]]["Predicted"].value_counts()

ham                                                8
spam                                               5
Equal proabilities, have a human classify this!    1
Name: Predicted, dtype: int64

- 8 messages that have been classifed as SPAM, were incorrectly classfied as ham by our algorithm.
- 5 messages that have been classfied as HAM, were incorrectly classified as spam by our algorithm. 
- 1 message requires human intervention to classify it as either SPAM or HAM.

In [18]:
li = test_set[test_set["Predicted"] != test_set["Label"]]
spam_detected = li[li["Label"]=="spam"]["SMS"]
for i in spam_detected:
    print(i)

Not heard from U4 a while. Call me now am here all night with just my knickers on. Make me beg for it like U did last time 01223585236 XX Luv Nikiyu4.net
More people are dogging in your area now. Call 09090204448 and join like minded guys. Why not arrange 1 yourself. There's 1 this evening. A£1.50 minAPN LS278BB
Oh my god! I've found your number again! I'm so glad, text me back xafter this msgs cst std ntwk chg £1.50
Hi babe its Chloe, how r u? I was smashed on saturday night, it was great! How was your weekend? U been missing me? SP visionsms.com Text stop to stop 150p/text
0A$NETWORKS allow companies to bill for SMS, so they are responsible for their "suppliers", just as a shop has to give a guarantee on what they sell. B. G.
RCT' THNQ Adrian for U text. Rgds Vatian
2/2 146tf150p
Hello. We need some posh birds and chaps to user trial prods for champneys. Can i put you down? I need your address and dob asap. Ta r


In [19]:
ham_detected = li[li["Label"]=="ham"]["SMS"]
for i in ham_detected:
    print(i)

Unlimited texts. Limited minutes.
26th OF JULY
Nokia phone is lovly..
A Boy loved a gal. He propsd bt she didnt mind. He gv lv lttrs, Bt her frnds threw thm. Again d boy decided 2 aproach d gal , dt time a truck was speeding towards d gal. Wn it was about 2 hit d girl,d boy ran like hell n saved her. She asked 'hw cn u run so fast?' D boy replied "Boost is d secret of my energy" n instantly d girl shouted "our energy" n Thy lived happily 2gthr drinking boost evrydy Moral of d story:- I hv free msgs:D;): gud ni8
No calls..messages..missed calls
We have sent JD for Customer Service cum Accounts Executive to ur mail id, For details contact us


- By observing the messages marked HAM incorrectly, we can observe that these messgaes are long and wordy, they contain many *normal* words, which is why I believe they get marked as ham by our algorithmn. But we can make a few observations, details like Money and links are mentioned.
- The messages that have been marked SPAM incorrectly are short in nature, the reason they get marked as SPAM, could possibly be because the word doesn't exist in respevtive libraries, sometimes these words are typos or mispelt, or they are capitalized.


We can maybe try to avoid making all the words in the message into lowercase, to check if our probability gets affected in anyway.

In [25]:

def classify_exp(message):

    message = re.sub('\W', ' ', message)
    message = message.split()
#     print(message)
    p_spam_given_message = p_spam
    p_ham_given_message = p_ham

    for word in message:
        if word in parameters_spam:
            p_spam_given_message *= parameters_spam[word]
            
        if word in parameters_ham:
            p_ham_given_message *= parameters_ham[word]
            
#     print('P(Spam|message):', p_spam_given_message)
#     print('P(Ham|message):', p_ham_given_message)
    if p_ham_given_message > p_spam_given_message:
        return 'ham'
    elif p_ham_given_message < p_spam_given_message:
        return 'spam'
    else:
        return 'Equal proabilities, have a human classify this!'

In [23]:
classify_exp("26th OF JULY")

['26th', 'OF', 'JULY']
P(Spam|message): 1.1717189920240502e-05
P(Ham|message): 1.3309912303616694e-05


'ham'

We can make the algorithm more case sensitive by removing all the str.lower() cases, but I do not believe we will get a better accuracy rate than what we have currently. 

We have made a small correction to the classify function by making the function case sensitivity, let us have a look at the accuracy.

In [29]:
test_set_exp1 = test_set.copy()

test_set_exp1["Predicted"] = test_set_exp1["SMS"].apply(classify_exp)
test_set_exp1[["Predicted","Label"]]
test_set_exp1[test_set_exp1["Predicted"] == test_set_exp1["Label"]].shape[0]/test_set_exp1.shape[0]*100

98.56373429084381

There is a small decrease in probability, now let us confirm by including case sensitivity throughout the project and see if it has any sort of impact. 
We cannot discern by the accuracy given above

In [32]:
training_set_exp = message_randomized[:4458].reset_index(drop=True)
test_set_exp = message_randomized[4458:].reset_index(drop=True)
training_set_exp['SMS'] = training_set_exp['SMS'].str.replace('\W', ' ')

vocabulary_exp = []
for sms in training_set_exp['SMS']:
    for word in sms:
        vocabulary_exp.append(word)
vocabulary_exp = list(set(vocabulary_exp))

word_counts_per_sms_exp = {unique_word: [0] * len(training_set_exp['SMS']) for unique_word in vocabulary_exp}
for index, sms in enumerate(training_set_exp['SMS']):
    for word in sms:
        word_counts_per_sms_exp[word][index]+=1
        
word_counts_exp = pd.DataFrame(word_counts_per_sms_exp)

training_set_final_exp = pd.concat([training_set_exp, word_counts_exp], axis=1)
    
spam_sms_exp = training_set_final_exp[training_set_final_exp['Label']=='spam']
ham_sms_exp = training_set_final_exp[training_set_final_exp['Label']=='ham']

p_spam_exp = training_set_final_exp['Label'].value_counts()['spam']/len(training_set_final_exp)
p_ham_exp = training_set_final_exp['Label'].value_counts()['ham']/len(training_set_final_exp)

n_spam_exp = 0
n_ham_exp = 0
for i in range(len(training_set_final_exp)):
    row = list(training_set_final_exp.iloc[i].values)
    for j in range(2,len(row)):
        if row[0]=='spam':
            n_spam_exp+=row[j]
        else:
            n_ham_exp+=row[j]
            
n_vocabulary_exp = len(vocabulary_exp)
alpha = 1

p_wi_spam_exp = {}
p_wi_ham_exp = {}
for word in vocabulary_exp:
    p_wi_spam_exp[word] = (spam_sms_exp[word].sum()+alpha)/(n_spam_exp+alpha*n_vocabulary_exp)
    p_wi_ham_exp[word] = (ham_sms_exp[word].sum()+alpha)/(n_ham_exp+alpha*n_vocabulary_exp)
    
def classify_test_set_exp(message):
    message = re.sub('\W', ' ', message)
    message = message.split()
    p_spam_given_message_exp = p_spam_exp
    p_ham_given_message_exp = p_ham_exp
    for word in message:
        if word in p_wi_spam_exp:
            p_spam_given_message_exp*=p_wi_spam_exp[word]
        if word in p_wi_ham_exp:
            p_ham_given_message_exp*=p_wi_ham_exp[word]
    if p_ham_given_message_exp > p_spam_given_message_exp:
        return 'ham'
    elif p_spam_given_message_exp > p_ham_given_message_exp:
        return 'spam'
    else:
        return 'needs human classification'
test_set_exp['Predicted'] = test_set_exp['SMS'].apply(classify_test_set_exp)

correct_exp = 0
total_exp = len(test_set_exp)

for row in test_set_exp.iterrows():
    if row[1]['Predicted']==row[1]['Label']:
        correct_exp+=1
accuracy_exp = correct_exp/total_exp*100
print(accuracy_exp)

85.27827648114902


On completely including case sensitivity we see that the accuracy decreases significantly. Including this added no significant value in fact it worsened our accuracy by 14%.

My understanding is that we continue with the initial classification algorithm, that gave us around 99% accuracy. The remaining 1-1.5% that get marked incorrecty, should require human intervention to confirm their classification.  