In [67]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [68]:
df = pd.read_csv('spam.csv')

In [69]:
df.head(10)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
5,spam,FreeMsg Hey there darling it's been 3 week's n...,,,
6,ham,Even my brother is not like to speak with me. ...,,,
7,ham,As per your request 'Melle Melle (Oru Minnamin...,,,
8,spam,WINNER!! As a valued network customer you have...,,,
9,spam,Had your mobile 11 months or more? U R entitle...,,,


In [70]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [71]:
df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], inplace=True)

In [72]:
# rename columns to meaningful names
df.rename(columns={'v1': 'Class', 'v2': 'Email'}, inplace=True)
df.head(10)

Unnamed: 0,Class,Email
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [73]:
# Remove punctuations from our emails 
df["Email"] = df['Email'].str.replace('[^\w\s]','')
df

Unnamed: 0,Class,Email
0,ham,Go until jurong point crazy Available only in ...
1,ham,Ok lar Joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor U c already then say
4,ham,Nah I dont think he goes to usf he lives aroun...
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will _ b going to esplanade fr home
5569,ham,Pity was in mood for that Soany other suggest...
5570,ham,The guy did some bitching but I acted like id ...


In [74]:
# Lower case all the emails
df.Email = df.Email.str.lower()
df.head()

Unnamed: 0,Class,Email
0,ham,go until jurong point crazy available only in ...
1,ham,ok lar joking wif u oni
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor u c already then say
4,ham,nah i dont think he goes to usf he lives aroun...


In [75]:
# Check duplication
duplicateRows = df[df.duplicated()] 
duplicateRows

Unnamed: 0,Class,Email
102,ham,as per your request melle melle oru minnaminun...
153,ham,as per your request melle melle oru minnaminun...
206,ham,as i entered my cabin my pa said happy bday b...
222,ham,sorry ill call later
325,ham,no callsmessagesmissed calls
...,...,...
5535,ham,i know you are thinkin malaria but relax child...
5539,ham,just sleepingand surfing
5547,spam,had your contract mobile 11 mnths latest motor...
5553,ham,hahahause your brain dear


In [76]:
# Remove duplication
df.drop_duplicates(keep = False, inplace = True)

In [77]:
df['Count'] = [len(sent.split(" ")) for sent in df['Email']]
df.reset_index(inplace=True)
df.drop('index', axis=1, inplace=True)
df.head()

Unnamed: 0,Class,Email,Count
0,ham,go until jurong point crazy available only in ...,20
1,ham,ok lar joking wif u oni,6
2,ham,u dun say so early hor u c already then say,11
3,ham,nah i dont think he goes to usf he lives aroun...,13
4,spam,freemsg hey there darling its been 3 weeks now...,32


In [78]:
# getting total number of distinct words in the emails (aka number of classes)
email_tokens = []
for index, row in df.iterrows():
    email_tokens.append(row['Email'].split(' '))

email_tokens = [j for sub in email_tokens for j in sub]

num_of_classes = len(set(email_tokens))
num_of_classes

8908

In [79]:
# Total number of words in spam emails
num_words_spam = df[df['Class'] == 'spam'].Count.sum()
num_words_spam

12762

In [80]:
# Total number of words in non-spam emails
num_words_ham = df[df['Class'] == 'ham'].Count.sum()
num_words_ham

60893

In [81]:
# Probability of spam class without laplacian smoothing
p_spam = len(df[df['Class'] == 'spam']) / len(df)
p_spam

0.11248710010319918

In [82]:
# Probability of ham class without laplacian smoothing
p_ham = len(df[df['Class'] == 'ham']) / len(df)
p_ham

0.8875128998968008

---
#### Split the data to train/test sets

In [83]:
split = int(0.75*len(df))
df_train = df[:split]
df_test = df[split+1:]

In [84]:
df_train

Unnamed: 0,Class,Email,Count
0,ham,go until jurong point crazy available only in ...,20
1,ham,ok lar joking wif u oni,6
2,ham,u dun say so early hor u c already then say,11
3,ham,nah i dont think he goes to usf he lives aroun...,13
4,spam,freemsg hey there darling its been 3 weeks now...,32
...,...,...,...
3628,ham,im good have you registered to vote,7
3629,ham,hmm ok ill stay for like an hour cos my eye is...,14
3630,ham,dear got bus directly to calicut,6
3631,ham,mm umma ask vava also to come tell him can pla...,13


In [85]:
df_test

Unnamed: 0,Class,Email,Count
3634,ham,ltgt w jetton ave if you forgot,8
3635,ham,ok im coming home now,5
3636,ham,can not use foreign stamps in this country,8
3637,ham,sorry its a lot of friendofafriend stuff im ju...,21
3638,spam,cmon babe make me horny turn me on txt me your...,30
...,...,...,...
4840,spam,this is the 2nd time we have tried 2 contact u...,30
4841,ham,will _ b going to esplanade fr home,8
4842,ham,pity was in mood for that soany other suggest...,10
4843,ham,the guy did some bitching but i acted like id ...,26


---
#### Training

In [86]:
# Calculate liklehood probability
def likelihood(df, token, spam_ham):
    if spam_ham == 1:
        num_words = num_words_spam
        token_count = sum([sent.split(" ").count(token) for sent in df[df['Class'] == 'spam']['Email']])        
    else:
        num_words = num_words_ham
        token_count = sum([sent.split(" ").count(token) for sent in df[df['Class'] == 'ham']['Email']])
        
    return token_count / num_words

In [96]:
# Training Approach: Calculating likehood of every distinct word in all emails in the training dataset

likelihood_spam = {}
likelihood_ham = {}

train_email_tokens = []
for index, row in df_train.iterrows():
    train_email_tokens.append(row['Email'].split(' '))
    
train_email_tokens = [j for sub in train_email_tokens for j in sub]

tokens = set(train_email_tokens)
for word in tokens:
    likelihood_ham[word] = likelihood(df_train, word, 0)
    likelihood_spam[word] = likelihood(df_train, word, 1)

----
#### Testing

In [118]:
# Calculate product of liklehoods of an email
def total_liklehood(email, spam_ham):
    total = 1
    
    if spam_ham == 0:
        words = email.split(" ")
        for word in words:
            if word in likelihood_ham:
                total *= likelihood_ham['word']
    elif spam_ham == 1:
        words = email.split(" ")
        for word in words:
            if word in likelihood_spam:
                total *= likelihood_spam['word']


    return total

In [119]:
# Calculate the probability that an email is spam
def naive_bayes_spam(email):
    email = email.lower()
    p_email_given_spam = total_liklehood(email, 1)
#     print(p_email_given_spam)
    p_email = total_liklehood(email, 1) * p_spam + total_liklehood(email, 0) * p_ham
    print(p_email)
    prob_naive_bayes_spam = (p_email_given_spam * p_spam) / (p_email)
    
    return prob_naive_bayes_spam

In [120]:
def spam_ham_classify(email, threshold):
#     print(naive_bayes_spam(email))
    return 'spam' if naive_bayes_spam(email) >= threshold else 'ham'

# Apply naive bayes to the dataset 
# Since this is imbalanced classification problem, 0.5 threshhold will be a mistake, 0.7 will be good
df_test['Classification'] = df_test['Email'].apply(lambda x: spam_ham_classify(x, 0.7))
df_test.head(10)

3.975821558248423e-23
5.3549356554603165e-17
3.42685144564196e-26
5.762110975474395e-63
2.362790889124965e-81
1.6302867703825892e-41
6.204271457788388e-109
8.998285816015186e-54
3.42685144564196e-26
6.685096388091307e-60
5.3549356554603165e-17
3.975821558248423e-23
8.998285816015186e-54
1.4051993787285787e-44
1.8914290714091943e-38
5.3549356554603165e-17
2.5459049929848763e-32
5.3549356554603165e-17
7.755927282256581e-57
4.6130626872419715e-20
5.3549356554603165e-17
2.1944016341686676e-35
2.1944016341686676e-35
2.5459049929848763e-32
7.755927282256581e-57
1.8914290714091943e-38
4.6130626872419715e-20
2.9537135607967343e-29
5.762110975474395e-63
4.966558590363448e-66
5.3549356554603165e-17
6.235109128196901e-14
1.6302867703825892e-41
5.3549356554603165e-17
6.685096388091307e-60
6.685096388091307e-60
1.5130285471425918e-90
5.3549356554603165e-17
2.9537135607967343e-29
1.0439647598550091e-50
1.5130285471425918e-90
3.975821558248423e-23
4.966558590363448e-66
1.2111889332108396e-47
7.404437

Unnamed: 0,Class,Email,Count,Classification
3634,ham,ltgt w jetton ave if you forgot,8,spam
3635,ham,ok im coming home now,5,spam
3636,ham,can not use foreign stamps in this country,8,spam
3637,ham,sorry its a lot of friendofafriend stuff im ju...,21,spam
3638,spam,cmon babe make me horny turn me on txt me your...,30,spam
3639,ham,wylie update my weed dealer carlos went to fre...,15,spam
3640,ham,are you happy baby are you alright did you t...,35,spam
3641,ham,c movie is juz last minute decision mah juz wa...,18,spam
3642,ham,how are you enjoying this semester take care b...,9,spam
3643,spam,important information 4 orange user 0796xxxxxx...,22,spam


In [109]:
df_test[df_test['Classification'] == 'ham']

Unnamed: 0,Class,Email,Count,Classification
3720,ham,gwr,1,ham
3883,ham,erutupalam thandiyachu,2,ham
4026,ham,havent,1,ham
4076,ham,anytime,1,ham
4104,ham,keng rocking in ashes,4,ham
4286,ham,kkcongratulation,2,ham
4574,ham,er,1,ham
4706,ham,east coast,2,ham
