# A spam filter based on the Naive Bayes algorithm

Spam messages are a nuissance to consumers, and effective spam filtering represents a real advantage for any service provider. In this project, a dataset of text messages will be used to build a basic spam filter based on the Naive Bayes algorithm. 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Exploration

In [2]:
msgs = pd.read_csv('SMSSpamCollection', sep='\t', header=None, names=['Label', 'SMS'])

In [3]:
msgs.shape

(5572, 2)

In [4]:
msgs.head()

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
msgs['Label'].value_counts(normalize = True)

ham     0.865937
spam    0.134063
Name: Label, dtype: float64

## Randomization & Train/Test split

In [6]:
msgs_random = msgs.sample(frac=1, random_state=1)

In [7]:
split_index = round(len(msgs_random) * 0.8)
msgs_train = msgs_random[:split_index].reset_index(drop=True)
msgs_test = msgs_random[split_index:].reset_index(drop=True)

In [8]:
msgs_train['Label'].value_counts(normalize=True)

ham     0.86541
spam    0.13459
Name: Label, dtype: float64

In [9]:
msgs_test['Label'].value_counts(normalize=True)

ham     0.868043
spam    0.131957
Name: Label, dtype: float64

## Data cleaning

In [10]:
msgs_train_cln = msgs_train.copy()

In [11]:
msgs_train_cln['SMS'] = msgs_train_cln['SMS'].str.replace('\W', ' ')
msgs_train_cln['SMS'] = msgs_train_cln['SMS'].str.lower()

In [12]:
msgs_train_cln.head()

Unnamed: 0,Label,SMS
0,ham,yep by the pretty sculpture
1,ham,yes princess are you going to make me moan
2,ham,welp apparently he retired
3,ham,havent
4,ham,i forgot 2 ask ü all smth there s a card on ...


In [13]:
msgs_train_cln['SMS'] = msgs_train_cln['SMS'].str.split()

In [14]:
msgs_train_cln.head()

Unnamed: 0,Label,SMS
0,ham,"[yep, by, the, pretty, sculpture]"
1,ham,"[yes, princess, are, you, going, to, make, me,..."
2,ham,"[welp, apparently, he, retired]"
3,ham,[havent]
4,ham,"[i, forgot, 2, ask, ü, all, smth, there, s, a,..."


In [15]:
vocabulary = []
for row in msgs_train_cln['SMS']:
    for item in row:
        vocabulary.append(item)
        
len(vocabulary)

72427

In [16]:
vocabulary = list(set(vocabulary))
len(vocabulary)

7783

In [17]:
word_counts_per_sms = {word: [0] * len(msgs_train_cln['SMS']) for word in vocabulary}

In [18]:
for index, sms in enumerate(msgs_train_cln['SMS']):
    for word in sms:
        word_counts_per_sms[word][index] += 1

In [19]:
word_count_df = pd.DataFrame(word_counts_per_sms)

In [20]:
final_training = pd.concat([msgs_train_cln, word_count_df], axis=1)
final_training

Unnamed: 0,Label,SMS,0,00,000,000pes,008704050406,0089,01223585334,02,...,zindgi,zoe,zogtorius,zouk,zyada,é,ú1,ü,〨ud,鈥
0,ham,"[yep, by, the, pretty, sculpture]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[yes, princess, are, you, going, to, make, me,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[welp, apparently, he, retired]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,[havent],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[i, forgot, 2, ask, ü, all, smth, there, s, a,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,0
5,ham,"[ok, i, thk, i, got, it, then, u, wan, me, 2, ...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,ham,"[i, want, kfc, its, tuesday, only, buy, 2, mea...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,ham,"[no, dear, i, was, sleeping, p]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,ham,"[ok, pa, nothing, problem]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,ham,"[ill, be, there, on, lt, gt, ok]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
proportions = final_training['Label'].value_counts(normalize = True)
p_spam = proportions['spam']
p_ham = proportions['ham']

In [32]:
train_spam = final_training[final_training['Label'] == 'spam']
spam_len = train_spam['SMS'].apply(len)
n_spam = spam_len.sum()

train_ham = final_training[final_training['Label'] == 'ham']
ham_len = train_ham['SMS'].apply(len)
n_ham = ham_len.sum()

alpha = 1

print(p_spam, p_ham, n_spam, n_ham)

0.13458950201884254 0.8654104979811574 15190 57237


In [46]:
p_given_spam = {}
p_given_ham = {}

for word in vocabulary:
    p_given_spam[word] = (train_spam[word].sum() + alpha) / (n_spam + alpha * len(vocabulary))
    p_given_ham[word] = (train_ham[word].sum() + alpha) / (n_ham + alpha * len(vocabulary))

In [47]:
p_given_spam

{'forth': 4.3529360553693465e-05,
 'closed': 4.3529360553693465e-05,
 'financial': 4.3529360553693465e-05,
 'diff': 4.3529360553693465e-05,
 'across': 4.3529360553693465e-05,
 'gucci': 4.3529360553693465e-05,
 'anymore': 4.3529360553693465e-05,
 'eaten': 4.3529360553693465e-05,
 'kg': 4.3529360553693465e-05,
 'fools': 4.3529360553693465e-05,
 'surely': 4.3529360553693465e-05,
 'smart': 8.705872110738693e-05,
 'wan2': 8.705872110738693e-05,
 '08719181513': 0.0001305880816610804,
 '255': 4.3529360553693465e-05,
 'info': 0.0004352936055369347,
 'thk': 4.3529360553693465e-05,
 'rimac': 4.3529360553693465e-05,
 'red': 0.00017411744221477386,
 'cares': 4.3529360553693465e-05,
 'shadow': 4.3529360553693465e-05,
 'buns': 4.3529360553693465e-05,
 'woman': 8.705872110738693e-05,
 'murder': 4.3529360553693465e-05,
 'specially': 0.0003482348844295477,
 'bare': 4.3529360553693465e-05,
 'ages': 8.705872110738693e-05,
 'settle': 4.3529360553693465e-05,
 'lodging': 4.3529360553693465e-05,
 'lookin': 8

In [55]:
import re

def classify(message):
    message = re.sub('\W', ' ', message).lower().split()
   
    p_spam_given_message = p_spam
    p_ham_given_message = p_ham
    
    for word in message:
        if word in p_given_spam:
            p_spam_given_message *= p_given_spam[word]
        if word in p_given_ham:
            p_ham_given_message *= p_given_ham[word]
    
    if p_ham_given_message > p_spam_given_message:
        return 'ham'
    elif p_ham_given_message < p_spam_given_message:
        return 'spam'
    else:
        return 'needs human classification'

In [56]:
classify('WINNER!! This is the secret code to unlock the money: C3421.')

'spam'

In [57]:
classify("Sounds good, Tom, then see u there")

'ham'

In [62]:
msgs_test['prediction'] = msgs_test['SMS'].apply(classify)

In [63]:
correct = 0
total = msgs_test.shape[0]

for element in msgs_test.iterrows():
    row = element[]
    if row['Label'] == row['prediction']:
        correct += 1

TypeError: tuple indices must be integers, not str