In [44]:
import pandas as pd
import numpy as np
import warnings
from gensim.models.phrases import Phrases

In [45]:
df = pd.read_csv('spam.csv', encoding="ISO-8859-1")
df.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], inplace=True)
df.rename(columns={"v1": "Label", "v2": "Email"}, inplace=True)
df.head(10)

Unnamed: 0,Label,Email
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [46]:
df['Label'].value_counts(normalize=True)

ham     0.865937
spam    0.134063
Name: Label, dtype: float64

In [47]:
# Randomize the dataset
data_randomized = df.sample(frac=1, random_state=1)

# Calculate index for split
training_test_index = round(len(data_randomized) * 0.8)

# Split into training and test sets
training_set = data_randomized[:training_test_index].reset_index(drop=True)
test_set = data_randomized[training_test_index:].reset_index(drop=True)

print(training_set.shape)
print(test_set.shape)

(4458, 2)
(1114, 2)


In [48]:
training_set['Label'].value_counts(normalize=True)

ham     0.864065
spam    0.135935
Name: Label, dtype: float64

In [49]:
test_set['Label'].value_counts(normalize=True)

ham     0.873429
spam    0.126571
Name: Label, dtype: float64

In [50]:
training_set['Email'] = training_set['Email'].str.replace(
   '\W', ' ') # Removes punctuation
training_set['Email'] = training_set['Email'].str.lower()
training_set.head(10)

  training_set['Email'] = training_set['Email'].str.replace(


Unnamed: 0,Label,Email
0,ham,convey my regards to him
1,ham,û_ anyway many good evenings to u s
2,ham,my sort code is and acc no is the bank is n...
3,ham,sorry i din lock my keypad
4,spam,hi babe its chloe how r u i was smashed on s...
5,ham,ok i thk i got it then u wan me 2 come now or...
6,ham,oi when you gonna ring
7,ham,will be office around 4 pm now i am going hos...
8,ham,have you heard about that job i m going to th...
9,ham,oh my god i m almost home


In [51]:
training_set['Email'] = training_set['Email'].str.split()

vocabulary = []
for email in training_set['Email']:
    for word in email:
        vocabulary.append(word)

vocabulary = list(set(vocabulary))

In [52]:
word_counts_per_email = {unique_word: [0] * len(training_set['Email']) for unique_word in vocabulary}

for index, email in enumerate(training_set['Email']):
    for word in email:
        word_counts_per_email[word][index] += 1

In [53]:
word_counts = pd.DataFrame(word_counts_per_email)
print(word_counts.shape)
word_counts.head()

(4458, 7766)


Unnamed: 0,somone,cougar,arestaurant,habit,clos1,disturbing,dats,whole,86021,disc,...,address,revision,funny,aberdeen,wasnåõt,blacko,artists,comb,atleast,guessing
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [54]:
training_set_clean = pd.concat([training_set, word_counts], axis=1)
training_set_clean.head(10)

Unnamed: 0,Label,Email,somone,cougar,arestaurant,habit,clos1,disturbing,dats,whole,...,address,revision,funny,aberdeen,wasnåõt,blacko,artists,comb,atleast,guessing
0,ham,"[convey, my, regards, to, him]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[û_, anyway, many, good, evenings, to, u, s]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[my, sort, code, is, and, acc, no, is, the, ba...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,"[sorry, i, din, lock, my, keypad]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,spam,"[hi, babe, its, chloe, how, r, u, i, was, smas...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,ham,"[ok, i, thk, i, got, it, then, u, wan, me, 2, ...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,ham,"[oi, when, you, gonna, ring]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,ham,"[will, be, office, around, 4, pm, now, i, am, ...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,ham,"[have, you, heard, about, that, job, i, m, goi...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,ham,"[oh, my, god, i, m, almost, home]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [55]:
spam_messages = training_set_clean[training_set_clean['Label'] == 'spam']
ham_messages = training_set_clean[training_set_clean['Label'] == 'ham']

# P(Spam) and P(Ham)
p_spam = len(spam_messages) / len(training_set_clean)
p_ham = len(ham_messages) / len(training_set_clean)

# N_Spam
n_words_per_spam_message = spam_messages['Email'].apply(len)
n_spam = n_words_per_spam_message.sum()

# N_Ham
n_words_per_ham_message = ham_messages['Email'].apply(len)
n_ham = n_words_per_ham_message.sum()

# N_Vocabulary
n_vocabulary = len(vocabulary)

# Laplace smoothing
alpha = 1

In [56]:
# Initiate parameters
parameters_spam = {unique_word:0 for unique_word in vocabulary}
parameters_ham = {unique_word:0 for unique_word in vocabulary}

# Calculate parameters
for word in vocabulary:
    n_word_given_spam = spam_messages[word].sum() # spam_messages already defined
    p_word_given_spam = (n_word_given_spam + alpha) / (n_spam + n_vocabulary) 
    parameters_spam[word] = p_word_given_spam

    n_word_given_ham = ham_messages[word].sum() # ham_messages already defined
    p_word_given_ham = (n_word_given_ham + alpha) / (n_ham + n_vocabulary)
    parameters_ham[word] = p_word_given_ham

In [57]:
import re

def classify(message):
    '''
    message: a string
    '''

    message = re.sub('\W', ' ', message)
    message = message.lower().split()

    p_spam_given_message = p_spam
    p_ham_given_message = p_ham

    for word in message:
        if word in parameters_spam:
            p_spam_given_message *= parameters_spam[word]

        if word in parameters_ham: 
            p_ham_given_message *= parameters_ham[word]

    print('P(Spam|message):', p_spam_given_message)
    print('P(Ham|message):', p_ham_given_message)

    if p_ham_given_message > p_spam_given_message:
        print('Label: Ham')
    elif p_ham_given_message < p_spam_given_message:
        print('Label: Spam')
    else:
        print('Equal proabilities, have a human classify this!')

In [58]:
classify("Sounds good, Tom, then see u there")

P(Spam|message): 4.914731817244035e-25
P(Ham|message): 4.052730830087535e-21
Label: Ham


In [59]:
def classify_test_set(message):
    '''
    message: a string
    '''

    message = re.sub('\W', ' ', message)
    message = message.lower().split()

    p_spam_given_message = p_spam
    p_ham_given_message = p_ham

    for word in message:
        if word in parameters_spam:
            p_spam_given_message *= parameters_spam[word]
        
        else:
            p_spam_given_message *= (0 + alpha) / (n_spam + alpha*n_vocabulary) 

        if word in parameters_ham:
            p_ham_given_message *= parameters_ham[word]
        
        else:
            p_ham_given_message *= (0 + alpha) / (n_ham + alpha*n_vocabulary) 

    if p_ham_given_message > p_spam_given_message:
        return 'ham'
    elif p_spam_given_message > p_ham_given_message:
        return 'spam'
    else:
        return 'needs human classification'

In [60]:
test_set['predicted'] = test_set['Email'].apply(classify_test_set)
test_set.head()

Unnamed: 0,Label,Email,predicted
0,ham,S...from the training manual it show there is ...,ham
1,spam,Do you want a new Video phone? 600 anytime any...,spam
2,ham,True. Its easier with her here.,ham
3,ham,Midnight at the earliest,ham
4,ham,"All done, all handed in. Don't know if mega sh...",ham


In [61]:
correct = 0
total = test_set.shape[0]

for row in test_set.iterrows():
    row = row[1]
    if row['Label'] == row['predicted']:
        correct += 1

print('Correct:', correct)
print('Incorrect:', total - correct)
print('Accuracy:', correct/total)

Correct: 1103
Incorrect: 11
Accuracy: 0.9901256732495511


In [62]:
# Assumption: Positive => Spam and. Negative => Ham

# Actual Values
actual_positive = len(test_set[test_set['Label'] == 'spam'])
actual_negative = len(test_set[test_set['Label'] == 'ham'])

# Predicted Values
predicted_positive = len(test_set[test_set['predicted'] == 'spam'])
predicted_negative = len(test_set[test_set['predicted'] == 'ham'])

# True Predictions
true_positive = len(test_set[(test_set['Label'] == 'spam') & (test_set['predicted'] == 'spam')])
true_negative = len(test_set[(test_set['Label'] == 'ham') & (test_set['predicted'] == 'ham')])

# False Predictions
false_negative = len(test_set[(test_set['Label'] == 'spam') & (test_set['predicted'] == 'ham')])
false_positive = len(test_set[(test_set['Label'] == 'ham') & (test_set['predicted'] == 'spam')])

# Lets see the values
print(f'Actual Positive: {actual_positive}')
print(f'Actual Negative: {actual_negative}')
print(f'Predicted Positives: {predicted_positive}')
print(f'Predicted Negative: {predicted_negative}' )

# Confusion Matrix
print()
print('Confusion Matrix')
np.array([[true_positive, false_negative], [false_positive, true_negative]])

Actual Positive: 141
Actual Negative: 973
Predicted Positives: 146
Predicted Negative: 968

Confusion Matrix


array([[138,   3],
       [  8, 965]])

In [63]:
# Acuraccy, Precision, Recall and F1
accuracy = (true_positive + true_negative) / (false_positive + false_negative + true_positive + true_negative)
precision = true_positive / (true_positive + false_positive)
recall = true_positive / (true_positive + false_negative)
f1_score =2 *  (precision * recall) / (precision + recall)

# Lets see our calculated metrics
print(f'Accuracy: {accuracy*100:.2f}%')
print(f'Precision: {precision*100:.2f}%')
print(f'Recall: {recall*100:.2f}%')
print(f'F1 Score: {f1_score*100:.2f}%')

Accuracy: 99.01%
Precision: 94.52%
Recall: 97.87%
F1 Score: 96.17%
