In [97]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [98]:
df = pd.read_csv('spam.csv')

---
#### Explore and Clean our dataset

In [99]:
df.head(10)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
5,spam,FreeMsg Hey there darling it's been 3 week's n...,,,
6,ham,Even my brother is not like to speak with me. ...,,,
7,ham,As per your request 'Melle Melle (Oru Minnamin...,,,
8,spam,WINNER!! As a valued network customer you have...,,,
9,spam,Had your mobile 11 months or more? U R entitle...,,,


In [100]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [101]:
# Drop the useless columns
df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], inplace=True)

In [102]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [103]:
# rename columns to meaningful names
df.rename(columns={'v1': 'Class', 'v2': 'Email'}, inplace=True)
df.head(10)

Unnamed: 0,Class,Email
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [104]:
# Remove punctuations from our emails 
df["Email"] = df['Email'].str.replace('[^\w\s]','')
df

Unnamed: 0,Class,Email
0,ham,Go until jurong point crazy Available only in ...
1,ham,Ok lar Joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor U c already then say
4,ham,Nah I dont think he goes to usf he lives aroun...
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will _ b going to esplanade fr home
5569,ham,Pity was in mood for that Soany other suggest...
5570,ham,The guy did some bitching but I acted like id ...


In [105]:
# Lower case all the emails
df.Email = df.Email.str.lower()
df

Unnamed: 0,Class,Email
0,ham,go until jurong point crazy available only in ...
1,ham,ok lar joking wif u oni
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor u c already then say
4,ham,nah i dont think he goes to usf he lives aroun...
...,...,...
5567,spam,this is the 2nd time we have tried 2 contact u...
5568,ham,will _ b going to esplanade fr home
5569,ham,pity was in mood for that soany other suggest...
5570,ham,the guy did some bitching but i acted like id ...


In [106]:
# Check duplication
duplicateRows = df[df.duplicated()] 
duplicateRows

Unnamed: 0,Class,Email
102,ham,as per your request melle melle oru minnaminun...
153,ham,as per your request melle melle oru minnaminun...
206,ham,as i entered my cabin my pa said happy bday b...
222,ham,sorry ill call later
325,ham,no callsmessagesmissed calls
...,...,...
5535,ham,i know you are thinkin malaria but relax child...
5539,ham,just sleepingand surfing
5547,spam,had your contract mobile 11 mnths latest motor...
5553,ham,hahahause your brain dear


In [107]:
# Remove duplication
df.drop_duplicates(keep = False, inplace = True)

---
### Implement Naive Bayes: With laplacian smoothin to avoid overfitting 

In [108]:
df['Count'] = [len(sent.split(" ")) for sent in df['Email']]
df.reset_index(inplace=True)
df.drop('index', axis=1, inplace=True)
df.head()

Unnamed: 0,Class,Email,Count
0,ham,go until jurong point crazy available only in ...,20
1,ham,ok lar joking wif u oni,6
2,ham,u dun say so early hor u c already then say,11
3,ham,nah i dont think he goes to usf he lives aroun...,13
4,spam,freemsg hey there darling its been 3 weeks now...,32


In [109]:
# getting total number of distinct words in the emails (aka number of classes)
email_tokens = []
for index, row in df.iterrows():
    email_tokens.append(row['Email'].split(' '))

email_tokens = [j for sub in email_tokens for j in sub]

num_of_classes = len(set(email_tokens))
num_of_classes

8908

In [110]:
# Total number of words in spam emails
num_words_spam = df[df['Class'] == 'spam'].Count.sum()
num_words_spam

12762

In [111]:
# Total number of words in non-spam emails
num_words_ham = df[df['Class'] == 'ham'].Count.sum()
num_words_ham

60893

In [112]:
# Probability of spam class without laplacian smoothing
p_spam = len(df[df['Class'] == 'spam']) / len(df)
p_spam

0.11248710010319918

In [113]:
# Probability of spam class with laplacian smoothing (K = 1)
p_spam_smothed = (len(df[df['Class'] == 'spam']) + 1) / (len(df) + 2)
p_spam_smothed

0.11264699814318135

In [114]:
# Probability of ham class without laplacian smoothing
p_ham = len(df[df['Class'] == 'ham']) / len(df)
p_ham

0.8875128998968008

In [115]:
# Probability of ham class with laplacian smoothing (K = 1)
p_ham_smothed = (len(df[df['Class'] == 'ham']) + 1) / (len(df) + 2)
p_ham_smothed

0.8873530018568186

In [116]:
# Calculate the probability that an email is spam
def naive_bayes_spam(email):
    email = email.lower()
    p_email_given_spam = total_liklehood(email, 1)
    p_email = total_liklehood(email, 1) * p_spam + total_liklehood(email, 0) * p_ham
    prob_naive_bayes_spam = (p_email_given_spam * p_spam) / (p_email)
    
    return prob_naive_bayes_spam

In [117]:
# Calculate the probability that an email is spam with Laplacian Smoothing
def naive_bayes_spam_smoothed(email, k):
    email = email.lower()
    p_email_given_spam = total_liklehood_smothed(email, 1, k)
    p_email = total_liklehood_smothed(email, 1, 1) * p_spam_smothed + total_liklehood_smothed(email, 0, 1) * p_ham_smothed
    prob_naive_bayes_spam = (p_email_given_spam * p_spam_smothed) / (p_email)
    
    return prob_naive_bayes_spam

In [118]:
# Calculate product of liklehoods of an email
def total_liklehood(email, spam_ham):
    total_liklehood = 1
    tokens = email.split(" ")
    for token in tokens:
        total_liklehood *= likelihood(token, spam_ham)
    
    return total_liklehood

In [119]:
# Calculate product od liklehoods of an email (smothed)
def total_liklehood_smothed(email, spam_ham, k):
    total_liklehood = 1
    tokens = email.split(" ")
    for token in tokens:
        total_liklehood *= likelihood_smothed(token, spam_ham, k)
    
    return total_liklehood

In [120]:
# Calculate liklehood probability
def likelihood(token, spam_ham):
    if spam_ham == 1:
        num_words = num_words_spam
        token_count = sum([sent.split(" ").count(token) for sent in df[df['Class'] == 'spam']['Email']])        
    else:
        num_words = num_words_ham
        token_count = sum([sent.split(" ").count(token) for sent in df[df['Class'] == 'ham']['Email']])
        
    return token_count / num_words

In [121]:
# Calculate liklehood probability
def likelihood_smothed(token, spam_ham, k):
    if spam_ham == 1:
        num_words = num_words_spam
        token_count = sum([sent.split(" ").count(token) for sent in df[df['Class'] == 'spam']['Email']])        
    else:
        num_words = num_words_ham
        token_count = sum([sent.split(" ").count(token) for sent in df[df['Class'] == 'ham']['Email']])
        
    return (token_count + k) / (num_words + k * num_of_classes)

In [122]:
def spam_ham_classify(email, threshold):
    return 'spam' if naive_bayes_spam(email) >= threshold else 'ham'

# Apply naive bayes to the dataset 
# Since this is imbalanced classification problem, 0.5 threshhold will be a mistake, 0.8 will be good
df['Classification'] = df['Email'].apply(lambda x: spam_ham_classify(x, 0.7))
df.head(10)

Unnamed: 0,Class,Email,Count,Classification
0,ham,go until jurong point crazy available only in ...,20,ham
1,ham,ok lar joking wif u oni,6,ham
2,ham,u dun say so early hor u c already then say,11,ham
3,ham,nah i dont think he goes to usf he lives aroun...,13,ham
4,spam,freemsg hey there darling its been 3 weeks now...,32,spam
5,ham,even my brother is not like to speak with me t...,16,ham
6,ham,im gonna be home soon and i dont want to talk ...,21,ham
7,ham,ive been searching for the right words to than...,37,ham
8,ham,i have a date on sunday with will,8,ham
9,spam,xxxmobilemovieclub to use your credit click th...,19,spam


In [123]:
df['Real'] = df['Class']
df.drop('Class', inplace=True, axis=1)
df.head(10)

Unnamed: 0,Email,Count,Classification,Real
0,go until jurong point crazy available only in ...,20,ham,ham
1,ok lar joking wif u oni,6,ham,ham
2,u dun say so early hor u c already then say,11,ham,ham
3,nah i dont think he goes to usf he lives aroun...,13,ham,ham
4,freemsg hey there darling its been 3 weeks now...,32,spam,spam
5,even my brother is not like to speak with me t...,16,ham,ham
6,im gonna be home soon and i dont want to talk ...,21,ham,ham
7,ive been searching for the right words to than...,37,ham,ham
8,i have a date on sunday with will,8,ham,ham
9,xxxmobilemovieclub to use your credit click th...,19,spam,spam


### Acuraccy, Precision, Recall and F1

In [124]:
# Assumption: Positive => Spam and. Negative => Ham

# Actual Values
actual_positive = len(df[df['Real'] == 'spam'])
actual_negative = len(df[df['Real'] == 'ham'])

# Predicted Values
predicted_positive = len(df[df['Classification'] == 'spam'])
predicted_negative = len(df[df['Classification'] == 'ham'])

# True Predictions
true_positive = len(df[(df['Real'] == 'spam') & (df['Classification'] == 'spam')])
true_negative = len(df[(df['Real'] == 'ham') & (df['Classification'] == 'ham')])

# False Predictions
false_negative = len(df[(df['Real'] == 'spam') & (df['Classification'] == 'ham')])
false_positive = len(df[(df['Real'] == 'ham') & (df['Classification'] == 'spam')])

# Lets see the values
print(f'Actual Positive: {actual_positive}')
print(f'Actual Negative: {actual_negative}')
print(f'Predicted Positives: {predicted_positive}')
print(f'Predicted Negative: {predicted_negative}' )

# Confusion Matrix
print()
print('Confusion Matrix')
np.array([[true_positive, false_negative], [false_positive, true_negative]])

Actual Positive: 545
Actual Negative: 4300
Predicted Positives: 547
Predicted Negative: 4298

Confusion Matrix


array([[ 544,    1],
       [   3, 4297]])

In [125]:
# Acuraccy, Precision, Recall and F1
accuracy = (true_positive + true_negative) / (false_positive + false_negative + true_positive + true_negative)
precision = true_positive / (true_positive + false_positive)
recall = true_positive / (true_positive + false_negative)
f1_score =2 *  (precision * recall) / (precision + recall)

# Lets see our calculated metrics
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1_score}')

Accuracy: 0.9991744066047472
Precision: 0.9945155393053017
Recall: 0.998165137614679
F1 Score: 0.9963369963369965
