# Building a Spam Filter with Naive Bayes

In this project, I'm going to build a spam filter for SMS messages using the multinomial Naive Bayes algorithm. My goal is to write a program that classifies new messages with an accuracy greater than 80% — so I expect that more than 80% of the new messages will be classified correctly as spam or ham (non-spam).

To train the algorithm, I'll use a dataset of 5,572 SMS messages that are already classified by humans. The dataset was put together by Tiago A. Almeida and José María Gómez Hidalgo, and it can be downloaded from the The UCI Machine Learning Repository.


In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# Read the dataset
df=pd.read_csv('SMSSpamCollection',sep='\t',header=None,names=['Label','SMS'])
print(df.shape)
df.head()

(5572, 2)


Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
## We see that about 87% of the messages are ham, and the remaining 13% are spam
df['Label'].value_counts(normalize=True)

ham     0.865937
spam    0.134063
Name: Label, dtype: float64

In [4]:
# Randomize the dataset
df=df.sample(frac=1,random_state=1)
#Training/Test split (80% for traning and 20% for test)
training_set=df[:4458].reset_index(drop=True)
test_set=df[4459:].reset_index(drop=True)

print(training_set.shape)
print(test_set.shape)

(4458, 2)
(1113, 2)


In [5]:
# Training and test sets are well randomized
training_set['Label'].value_counts(normalize=True)
test_set['Label'].value_counts(normalize=True)

ham     0.867925
spam    0.132075
Name: Label, dtype: float64

## Data Cleaning


In [6]:
# Before cleaning
training_set.head()

Unnamed: 0,Label,SMS
0,ham,"Yep, by the pretty sculpture"
1,ham,"Yes, princess. Are you going to make me moan?"
2,ham,Welp apparently he retired
3,ham,Havent.
4,ham,I forgot 2 ask ü all smth.. There's a card on ...


In [7]:
# After cleaning (removing punctutation and setting lower cases)
training_set['SMS']=training_set['SMS'].str.replace(r'\W+',' ').str.lower()
training_set.head()

Unnamed: 0,Label,SMS
0,ham,yep by the pretty sculpture
1,ham,yes princess are you going to make me moan
2,ham,welp apparently he retired
3,ham,havent
4,ham,i forgot 2 ask ü all smth there s a card on da...


In [8]:
# Creating a vocabulary list containing unique words
training_set['SMS']=training_set['SMS'].str.split()
vocabulary=[]
for row in training_set['SMS']:
    for word in row:
        vocabulary.append(word)
vocabulary=set(vocabulary)
vocabulary=list(vocabulary)
len(vocabulary)
# There are 7,783 unique words in all the messages of the training set

7783

In [9]:
# Creating a new dataset that counts word in each message
word_counts_per_sms = {unique_word: [0] * len(training_set['SMS']) for unique_word in vocabulary}

for index, sms in enumerate(training_set['SMS']):
    for word in sms:
        word_counts_per_sms[word][index] += 1

In [10]:
word_counts_df=pd.DataFrame(word_counts_per_sms)
word_counts_df.head()

Unnamed: 0,mail,duvet,stamped,sarcasm,fuuuuck,pls,prasad,dance,bags,name1,...,three,ls1,fuckinnice,4fil,prize,noooooooo,instituitions,09050000878,bloomberg,transfered
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Concatenate the two data sets (training_set and word_counts_df)
training_set_clean=pd.concat([training_set,word_counts_df],axis=1)
training_set_clean.head()

Unnamed: 0,Label,SMS,mail,duvet,stamped,sarcasm,fuuuuck,pls,prasad,dance,...,three,ls1,fuckinnice,4fil,prize,noooooooo,instituitions,09050000878,bloomberg,transfered
0,ham,"[yep, by, the, pretty, sculpture]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[yes, princess, are, you, going, to, make, me,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[welp, apparently, he, retired]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,[havent],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[i, forgot, 2, ask, ü, all, smth, there, s, a,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Calculating Parameters

In [12]:
# Probability of spam and ham
p_spam=training_set_clean['Label'].value_counts(normalize=True)[1]
p_ham=training_set_clean['Label'].value_counts(normalize=True)[0]

In [13]:
# Number of words in spam messages
n_spam=0
for row in training_set_clean['SMS'][training_set_clean['Label']=='spam']:
        n_spam += len(row)

# Number of words in ham messages      
n_ham=0
for row in training_set_clean['SMS'][training_set_clean['Label']=='ham']:
        n_ham += len(row)

# Number of unique words        
n_vocabulary=len(vocabulary)
alpha=1

In [14]:
# Isolating spam and ham messages
spam_messages=training_set_clean[training_set_clean['Label']=='spam']
ham_messages=training_set_clean[training_set_clean['Label']=='ham']

# Initiate paramters
parameter_spam={}
parameter_ham={}

# Caculate parameters
for word in vocabulary:
    # Calculate probability of a word given spam messages
    n_word_given_spam=spam_messages[word].sum()
    p_word_given_spam=(n_word_given_spam+alpha)/(n_spam+alpha*n_vocabulary)
    parameter_spam[word]=p_word_given_spam
    
    # Calculate probability of a word given ham messages
    n_word_given_ham=ham_messages[word].sum()
    p_word_given_ham=(n_word_given_ham+alpha)/(n_ham+alpha*n_vocabulary)
    parameter_ham[word]=p_word_given_ham
    

## Creating A Function to Classify A New Message

In [15]:
# Create a classifying function
def classify_test_set(message):
    message = re.sub(r'\W+', ' ', message)
    message = message.lower()
    message = message.split()
    p_spam_given_message=p_spam
    p_ham_given_message=p_ham 
    for word in message:
        if word in vocabulary:
            p_word_given_spam=parameter_spam[word]
            p_spam_given_message *= p_word_given_spam
            
            p_word_given_ham=parameter_ham[word]
            p_ham_given_message *= p_word_given_ham
      
    if p_ham_given_message > p_spam_given_message:
        return 'ham'
    elif p_ham_given_message < p_spam_given_message:
        return 'spam'
    else:
        return 'needs human classification'

## Measuring the Spam Filter's Accuracy Using the Test Set

In [16]:
# Create a column showing the predictions of the filter
test_set['predicted'] = test_set['SMS'].apply(classify_test_set)
test_set.head()

Unnamed: 0,Label,SMS,predicted
0,ham,But i haf enuff space got like 4 mb...,ham
1,spam,Had your mobile 10 mths? Update to latest Oran...,spam
2,ham,All sounds good. Fingers . Makes it difficult ...,ham
3,ham,"All done, all handed in. Don't know if mega sh...",ham
4,ham,But my family not responding for anything. Now...,ham


In [17]:
# Write a function to measure the accuracy of the spam filter 
correct=0
total=test_set.shape[0]
for index,row in test_set.iterrows():
    if row['Label']==row['predicted']:
        correct += 1


In [18]:
# The accuracy is close to 98.74%, which is really good.
print('Correct:', correct)
print('Incorrect:', total - correct)
print('Accuracy:', correct/total)

Correct: 1099
Incorrect: 14
Accuracy: 0.9874213836477987


In [19]:
test_set[test_set['Label']!=test_set['predicted']]

Unnamed: 0,Label,SMS,predicted
113,spam,Not heard from U4 a while. Call me now am here...,ham
134,spam,More people are dogging in your area now. Call...,ham
151,ham,Unlimited texts. Limited minutes.,spam
158,ham,26th OF JULY,spam
283,ham,Nokia phone is lovly..,spam
292,ham,A Boy loved a gal. He propsd bt she didnt mind...,needs human classification
301,ham,No calls..messages..missed calls,spam
318,ham,We have sent JD for Customer Service cum Accou...,spam
503,spam,Oh my god! I've found your number again! I'm s...,ham
545,spam,"Hi babe its Chloe, how r u? I was smashed on s...",ham
