In [1]:
import pandas as pd

In [2]:
sms_spam_collection = pd.read_csv('smsspamcollection\SMSSpamCollection', sep='\t', header = None, names = ['Label', 'SMS'])
sms_spam_collection.head()

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
sms_spam_collection.Label.value_counts(normalize = True) * 100

ham     86.593683
spam    13.406317
Name: Label, dtype: float64

# Training and Test Set

In [4]:
randomized_dataset = sms_spam_collection.sample(frac = 1, random_state = 1)

training_index = round(len(randomized_dataset) * 0.8)

training_set = randomized_dataset[:training_index].reset_index(drop = 'index')
test_set = randomized_dataset[training_index:].reset_index(drop = 'index')

In [5]:
training_set.head()

Unnamed: 0,Label,SMS
0,ham,"Yep, by the pretty sculpture"
1,ham,"Yes, princess. Are you going to make me moan?"
2,ham,Welp apparently he retired
3,ham,Havent.
4,ham,I forgot 2 ask ü all smth.. There's a card on ...


In [6]:
training_set.Label.value_counts(normalize = True) * 100

ham     86.54105
spam    13.45895
Name: Label, dtype: float64

In [7]:
test_set.head()

Unnamed: 0,Label,SMS
0,ham,Later i guess. I needa do mcat study too.
1,ham,But i haf enuff space got like 4 mb...
2,spam,Had your mobile 10 mths? Update to latest Oran...
3,ham,All sounds good. Fingers . Makes it difficult ...
4,ham,"All done, all handed in. Don't know if mega sh..."


In [8]:
test_set.Label.value_counts(normalize = True) * 100

ham     86.804309
spam    13.195691
Name: Label, dtype: float64

# Letter Case and Punctuation 

In [9]:
training_set.SMS = training_set.SMS.str.replace('\W', ' ').str.lower()

In [10]:
training_set.head()

Unnamed: 0,Label,SMS
0,ham,yep by the pretty sculpture
1,ham,yes princess are you going to make me moan
2,ham,welp apparently he retired
3,ham,havent
4,ham,i forgot 2 ask ü all smth there s a card on ...


In [11]:
test_set.head()

Unnamed: 0,Label,SMS
0,ham,Later i guess. I needa do mcat study too.
1,ham,But i haf enuff space got like 4 mb...
2,spam,Had your mobile 10 mths? Update to latest Oran...
3,ham,All sounds good. Fingers . Makes it difficult ...
4,ham,"All done, all handed in. Don't know if mega sh..."


# Creating the Vocabulary

In [12]:
# For training_set

training_SMS_list = training_set.SMS.str.split().tolist()
training_SMS_list[:5]

[['yep', 'by', 'the', 'pretty', 'sculpture'],
 ['yes', 'princess', 'are', 'you', 'going', 'to', 'make', 'me', 'moan'],
 ['welp', 'apparently', 'he', 'retired'],
 ['havent'],
 ['i',
  'forgot',
  '2',
  'ask',
  'ü',
  'all',
  'smth',
  'there',
  's',
  'a',
  'card',
  'on',
  'da',
  'present',
  'lei',
  'how',
  'ü',
  'all',
  'want',
  '2',
  'write',
  'smth',
  'or',
  'sign',
  'on',
  'it']]

In [13]:
vocabulary = set()

for sms in training_SMS_list:
    for word in sms:
        if word not in vocabulary:
            vocabulary.add(word)

vocabulary = list(vocabulary)

# The Final Training Set 

In [14]:
word_count_per_sms = {unique_word : [0] * len(training_set) for unique_word in vocabulary}

for index, sms in enumerate(training_SMS_list):
    for word in sms:
        word_count_per_sms[word][index] += 1

In [15]:
word_count_per_sms_df = pd.DataFrame(word_count_per_sms)

In [16]:
word_count_per_sms_df.head()

Unnamed: 0,prakasamanu,cab,recd,occasion,his,choosing,owo,flirtparty,st,brothas,...,an,uniform,house,dict,last,value,happenin,cc100p,spanish,adding
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
training_set_word_count = pd.concat([training_set, word_count_per_sms_df], axis = 1)
training_set_word_count.head()

Unnamed: 0,Label,SMS,prakasamanu,cab,recd,occasion,his,choosing,owo,flirtparty,...,an,uniform,house,dict,last,value,happenin,cc100p,spanish,adding
0,ham,yep by the pretty sculpture,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,yes princess are you going to make me moan,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,welp apparently he retired,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,havent,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,i forgot 2 ask ü all smth there s a card on ...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Calculating Constants 

In [18]:
p_ham = training_set.Label.value_counts(normalize = True)[0]
p_spam = training_set.Label.value_counts(normalize = True)[1]

In [19]:
p_ham

0.8654104979811574

In [20]:
p_spam

0.13458950201884254

In [21]:
training_set_word_count['words_per_sms'] = training_set_word_count.iloc[:,2:].sum(axis = 1)

In [22]:
training_set_word_count['words_per_sms']

0        5
1        9
2        4
3        1
4       26
        ..
4453    17
4454    34
4455    28
4456    27
4457     4
Name: words_per_sms, Length: 4458, dtype: int64

In [23]:
ham_messages = training_set_word_count[training_set_word_count['Label'] == 'ham']
n_ham = ham_messages['words_per_sms'].sum()
n_ham

57237

In [24]:
spam_messages = training_set_word_count[training_set_word_count['Label'] == 'spam']
n_spam = spam_messages['words_per_sms'].sum()
n_spam

15190

In [25]:
training_set_word_count['words_per_sms'].sum()

72427

In [26]:
n_vocabulary = len(vocabulary)
alpha = 1 

# Calculating Parameters 

In [27]:
# Initialize 
ham_parameters = {unique_word: 0 for unique_word in vocabulary}
spam_parameters = {unique_word: 0 for unique_word in vocabulary}

In [28]:
# Ham Parameters
for word in ham_parameters:
    ham_parameters[word] = (ham_messages[word].sum() + alpha) / (n_ham + alpha*n_vocabulary)
    
# Spam Parameters    
for word in spam_parameters:
    spam_parameters[word] = (spam_messages[word].sum() + alpha) / (n_spam + alpha*n_vocabulary)

In [29]:
list(ham_parameters.items())[0:5]

[('prakasamanu', 3.075976622577668e-05),
 ('cab', 3.075976622577668e-05),
 ('recd', 1.537988311288834e-05),
 ('occasion', 3.075976622577668e-05),
 ('his', 0.0007689941556444171)]

In [30]:
list(spam_parameters.items())[0:5]

[('prakasamanu', 4.3529360553693465e-05),
 ('cab', 4.3529360553693465e-05),
 ('recd', 0.0001305880816610804),
 ('occasion', 4.3529360553693465e-05),
 ('his', 4.3529360553693465e-05)]

# Classifying A New Message 

In [31]:
import re

def classify(message):
    
    message = re.sub('\W', ' ', message).lower().split()
    
    p_ham_given_message = p_ham 
    
    for word in message:
        if word not in vocabulary:
            pass
        else:
            p_ham_given_message *= ham_parameters[word]
        
    p_spam_given_message = p_spam
    
    for word in message:
        if word not in vocabulary:
            pass
        else:
            p_spam_given_message *= spam_parameters[word]
    
    print('P(Ham|message):', p_ham_given_message)
    print('P(Spam|message):', p_spam_given_message)
    print('\n')
    
    if p_ham_given_message > p_spam_given_message:
        print('Label : Ham')
    elif p_ham_given_message < p_spam_given_message:
        print('Label : Spam')
    else:
        print('Please let a human classify this') 

In [32]:
classify('WINNER!! This is the secret code to unlock the money: C3421.')

P(Ham|message): 1.9368049028589875e-27
P(Spam|message): 1.3481290211300841e-25


Label : Spam


In [33]:
classify("Sounds good, Tom, then see u there")

P(Ham|message): 3.687530435009238e-21
P(Spam|message): 2.4372375665888117e-25


Label : Ham


# Measuring the Spam Filter's Accuracy 

In [34]:
def classify_test_set(message):
    
    message = re.sub('\W', ' ', message).lower().split()
    
    p_ham_given_message = p_ham 
    
    for word in message:
        if word not in vocabulary:
            pass
        else:
            p_ham_given_message *= ham_parameters[word]
        
    p_spam_given_message = p_spam
    
    for word in message:
        if word not in vocabulary:
            pass
        else:
            p_spam_given_message *= spam_parameters[word]
    
    if p_ham_given_message > p_spam_given_message:
        return 'ham'
    elif p_ham_given_message < p_spam_given_message:
        return 'spam'
    else:
        return 'Please let a human classify this'

In [35]:
test_set['predicted'] = test_set['SMS'].apply(classify_test_set)

In [36]:
test_set

Unnamed: 0,Label,SMS,predicted
0,ham,Later i guess. I needa do mcat study too.,ham
1,ham,But i haf enuff space got like 4 mb...,ham
2,spam,Had your mobile 10 mths? Update to latest Oran...,spam
3,ham,All sounds good. Fingers . Makes it difficult ...,ham
4,ham,"All done, all handed in. Don't know if mega sh...",ham
...,...,...,...
1109,ham,"We're all getting worried over here, derek and...",ham
1110,ham,Oh oh... Den muz change plan liao... Go back h...,ham
1111,ham,CERI U REBEL! SWEET DREAMZ ME LITTLE BUDDY!! C...,ham
1112,spam,Text & meet someone sexy today. U can find a d...,spam


In [37]:
correct = test_set['Label'] == test_set['predicted']
correct.value_counts(normalize = True, dropna = False) * 100

True     98.743268
False     1.256732
dtype: float64