In [69]:
import pandas as pd

In [71]:
df = pd.read_csv('spam.csv')

---
#### Explore and Clean our dataset

In [72]:
df.head(10)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
5,spam,FreeMsg Hey there darling it's been 3 week's n...,,,
6,ham,Even my brother is not like to speak with me. ...,,,
7,ham,As per your request 'Melle Melle (Oru Minnamin...,,,
8,spam,WINNER!! As a valued network customer you have...,,,
9,spam,Had your mobile 11 months or more? U R entitle...,,,


In [73]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [74]:
# Drop the useless columns
df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], inplace=True)

In [75]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [76]:
# rename columns to meaningful names
df.rename(columns={'v1': 'Class', 'v2': 'Email'}, inplace=True)
df.head(10)

Unnamed: 0,Class,Email
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [77]:
# Remove punctuations from our emails 
df["Email"] = df['Email'].str.replace('[^\w\s]','')
df

  df["Email"] = df['Email'].str.replace('[^\w\s]','')


Unnamed: 0,Class,Email
0,ham,Go until jurong point crazy Available only in ...
1,ham,Ok lar Joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor U c already then say
4,ham,Nah I dont think he goes to usf he lives aroun...
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will _ b going to esplanade fr home
5569,ham,Pity was in mood for that Soany other suggest...
5570,ham,The guy did some bitching but I acted like id ...


In [78]:
# Lower case all the emails
df.Email = df.Email.str.lower()
df

Unnamed: 0,Class,Email
0,ham,go until jurong point crazy available only in ...
1,ham,ok lar joking wif u oni
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor u c already then say
4,ham,nah i dont think he goes to usf he lives aroun...
...,...,...
5567,spam,this is the 2nd time we have tried 2 contact u...
5568,ham,will _ b going to esplanade fr home
5569,ham,pity was in mood for that soany other suggest...
5570,ham,the guy did some bitching but i acted like id ...


In [79]:
# Check duplication
duplicateRows = df[df.duplicated()] 
duplicateRows

Unnamed: 0,Class,Email
102,ham,as per your request melle melle oru minnaminun...
153,ham,as per your request melle melle oru minnaminun...
206,ham,as i entered my cabin my pa said happy bday b...
222,ham,sorry ill call later
325,ham,no callsmessagesmissed calls
...,...,...
5535,ham,i know you are thinkin malaria but relax child...
5539,ham,just sleepingand surfing
5547,spam,had your contract mobile 11 mnths latest motor...
5553,ham,hahahause your brain dear


In [80]:
# Remove duplication
df.drop_duplicates(keep = False, inplace = True)

---
### Implement Naive Bayes: With laplacian smoothin to avoid overfitting 

In [86]:
df['Count'] = [len(sent.split(" ")) for sent in df['Email']]
df

Unnamed: 0,Class,Email,Count
0,ham,go until jurong point crazy available only in ...,20
1,ham,ok lar joking wif u oni,6
3,ham,u dun say so early hor u c already then say,11
4,ham,nah i dont think he goes to usf he lives aroun...,13
5,spam,freemsg hey there darling its been 3 weeks now...,32
...,...,...,...
5567,spam,this is the 2nd time we have tried 2 contact u...,30
5568,ham,will _ b going to esplanade fr home,8
5569,ham,pity was in mood for that soany other suggest...,10
5570,ham,the guy did some bitching but i acted like id ...,26


In [87]:
# getting total number of distinct words in the emails (aka number of classes)
email_tokens = []
for index, row in df.iterrows():
    email_tokens.append(row['Email'].split(' '))

email_tokens = [j for sub in email_tokens for j in sub]

num_of_classes = len(set(email_tokens))
num_of_classes

8908

In [88]:
# Total number of words in spam emails
num_words_spam = df[df['Class'] == 'spam'].Count.sum()
num_words_spam

12762

In [89]:
# Total number of words in non-spam emails
num_words_ham = df[df['Class'] == 'ham'].Count.sum()
num_words_ham

60893

In [90]:
# Probability of spam class without laplacian smoothing
p_spam = len(df[df['Class'] == 'spam']) / len(df)
p_spam

0.11248710010319918

In [93]:
# Probability of spam class with laplacian smoothing (K = 1)
p_spam_smothed = (len(df[df['Class'] == 'spam']) + 1) / (len(df) + 2)
p_spam_smothed

0.11264699814318135

In [94]:
# Probability of ham class without laplacian smoothing
p_ham = len(df[df['Class'] == 'ham']) / len(df)
p_ham

0.8875128998968008

In [95]:
# Probability of ham class with laplacian smoothing (K = 1)
p_ham_smothed = (len(df[df['Class'] == 'ham']) + 1) / (len(df) + 2)
p_ham_smothed

0.8873530018568186

In [96]:
# Calculate the probability that an email is spam
def naive_bayes_spam(email):
    email = email.lower()
    p_email_given_spam = total_liklehood(email, 1)
    p_email = total_liklehood(email, 1) * p_spam + total_liklehood(email, 0) * p_ham
    prob_naive_bayes_spam = (p_email_given_spam * p_spam) / (p_email)
    
    return prob_naive_bayes_spam

In [97]:
# Calculate the probability that an email is spam with Laplacian Smoothing
def naive_bayes_spam_smoothed(email, k):
    email = email.lower()
    p_email_given_spam = total_liklehood_smothed(email, 1, k)
    p_email = total_liklehood_smothed(email, 1, 1) * p_spam_smothed + total_liklehood_smothed(email, 0, 1) * p_ham_smothed
    prob_naive_bayes_spam = (p_email_given_spam * p_spam_smothed) / (p_email)
    
    return prob_naive_bayes_spam

In [98]:
# Calculate product of liklehoods of an email
def total_liklehood(email, spam_ham):
    total_liklehood = 1
    tokens = email.split(" ")
    for token in tokens:
        total_liklehood *= likelihood(token, spam_ham)
    
    return total_liklehood

In [99]:
# Calculate product od liklehoods of an email (smothed)
def total_liklehood_smothed(email, spam_ham, k):
    total_liklehood = 1
    tokens = email.split(" ")
    for token in tokens:
        total_liklehood *= likelihood_smothed(token, spam_ham, k)
    
    return total_liklehood

In [101]:
# Calculate liklehood probability
def likelihood(token, spam_ham):
    if spam_ham == 1:
        num_words = num_words_spam
        token_count = sum([sent.split(" ").count(token) for sent in df[df['Class'] == 'spam']['Email']])        
    else:
        num_words = num_words_ham
        token_count = sum([sent.split(" ").count(token) for sent in df[df['Class'] == 'ham']['Email']])
        
    return token_count / num_words

In [102]:
# Calculate liklehood probability
def likelihood_smothed(token, spam_ham, k):
    if spam_ham == 1:
        num_words = num_words_spam
        token_count = sum([sent.split(" ").count(token) for sent in df[df['Class'] == 'spam']['Email']])        
    else:
        num_words = num_words_ham
        token_count = sum([sent.split(" ").count(token) for sent in df[df['Class'] == 'ham']['Email']])
        
    return (token_count + k) / (num_words + k * num_of_classes)