### Eucledian Repesentation of the Email data

In [1]:
#Using PorterStemmer to do stemming
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from glob import glob
import pandas as pd

In [2]:
porter=PorterStemmer()

In [3]:
#Getting hold of all the files in ham and spam folder in a list form
all_ham_files = glob(".\ham\*.txt")
all_spam_files = glob(".\spam\*.txt")

In [4]:
#A function to tokenize a email and stem it
def stemSentence(sentence):
    token_words=word_tokenize(sentence)
    stem_sentence=[]
    for word in token_words:
        stem_sentence.append(porter.stem(word))
        stem_sentence.append(" ")
    return "".join(stem_sentence)


In [5]:
#Stemming all emails in ham folder to ham_stemmed

for file_name in all_ham_files:
    with open(file_name) as file:
        data = file.read().replace("\n", " ")
        stemmed = stemSentence(data)
        filename = file_name.split("\\")[-1]
        with open(f"./ham_stemmed/{filename}", "w") as stem:
            stem.write(data)

In [6]:
#Stemming all emails in spam folder to spam_stemmed

for file_name in all_spam_files:
    with open(file_name, encoding='utf-8', errors='ignore' ) as file:
        data = file.read().replace("\n", " ")
        stemmed = stemSentence(data)
        filename = file_name.split("\\")[-1]
        with open(f"./spam_stemmed/{filename}", "w") as stem:
            stem.write(data)

In [7]:
#Getting hold of all files in ham_stemmed and spam_stemmed in a list
ham_stemmed= glob(".\ham_stemmed\*.txt")
spam_stemmed = glob(".\spam_stemmed\*.txt")

In [8]:
#Make a dictionary with all the words in all emails as key and a value of zero

word_dict = {}

for file_name in spam_stemmed + ham_stemmed:
    with open(file_name, encoding='utf-8', errors='ignore' ) as file:
        sentence = file.read()
        words = word_tokenize(sentence)
        for word in words:
            if word not in word_dict.keys():
                word_dict[word] = 0
                
len(word_dict)

50540

In [9]:
#Create a vector of words for each spam email

spam_emails_vectors = {}

for file in spam_stemmed:
    with open(file) as file:
        sentence = file.read()
        words = word_tokenize(sentence)
        email_word_dict = word_dict.copy()
        for word in words:
            email_word_dict[word] += 1
        
        spam_emails_vectors[file] = [value for key, value in email_word_dict.items()]
            
len(spam_emails_vectors)

1500

In [10]:
#Create a vector of words for each ham email

ham_emails_vectors = {}

for file in ham_stemmed:
    with open(file) as file:
        sentence = file.read()
        words = word_tokenize(sentence)
        email_word_dict = word_dict.copy()
        for word in words:
            email_word_dict[word] += 1
        
        ham_emails_vectors[file] = [value for key, value in email_word_dict.items()]
            
len(ham_emails_vectors)

3672

In [11]:
#Now we have a Eucledian representation of each email in ham_emails_vectors and spam_emails_vectors.
#Each email is a  single entry with the email name as key and email vector as the key

### Developing a Spam Classifier

#### Naive Bayes

In [14]:
import numpy as np

In [16]:
spam_vector =[]
for value in spam_emails_vectors.values():
    spam_vector.append(value)

In [19]:
spam_vector = np.array(spam_vector)
spam_vector.shape

(1500, 50540)

In [20]:
ham_vector =[]
for value in ham_emails_vectors.values():
    ham_vector.append(value)

In [21]:
ham_vector = np.array(ham_vector)
ham_vector.shape

(3672, 50540)

In [23]:
#Ham and spam will be the count of a word when it is ham or spam respectively( (Y=1|x_i) and (Y=0|x_0))
spam = np.zeros((1, 50540))
ham = np.zeros((1, 50540))

In [26]:
for i in range(1500):
    for j in range(50540):
        spam[0, j] += spam_vector[i, j]

In [27]:
for i in range(3672):
    for j in range(50540):
        ham[0, j] += ham_vector[i, j]

In [30]:
# Now we calculate probabiliy of each word x_i when a message is spam or not.

prob_spam_xi = spam/50540
prob_ham_xi = ham/50540

In [31]:
prob_spam_per_word = np.zeros((1, 50540))
prob_ham_per_word = np.zeros((1, 50540))
prob_spam = 0.3
prob_ham = 0.7
for i in range(prob_spam_per_word.shape[1]):
    spam_a = (prob_spam_xi[0,i]*prob_spam) 
    spam_b = spam_a + (prob_ham_xi[0,i]*prob_ham)
    spam_c = spam_a/spam_b
    
    ham_a = (prob_ham_xi[0,i]*prob_ham) 
    ham_b = ham_a + (prob_spam_xi[0,i]*prob_spam)
    ham_c = ham_a/ham_b
    
    prob_spam_per_word[0,i] = spam_c
    prob_ham_per_word[0,i] = ham_c

In [34]:
prob_spam_per_word

array([[0.14898689, 0.09782922, 1.        , ..., 0.        , 0.        ,
        0.        ]])

In [65]:
pred_spam = np.zeros((spam_vector.shape[0], 1))
for t in range(spam_vector.shape[0]):
    conditional_prob_spam = prob_spam
    conditional_prob_ham = prob_ham
    
    for i in range(prob_spam_per_word.shape[1]):
        if spam_vector[t][i] == 1:
            conditional_prob_spam = conditional_prob_spam * prob_spam_per_word[0,i]
            conditional_prob_ham = conditional_prob_ham * prob_ham_per_word[0,i]
            
    if conditional_prob_spam != 0:
        prob = conditional_prob_spam / (conditional_prob_spam + conditional_prob_ham) * 100
    else:
        prob = 0.0
    
    if prob > 0.5 * 100:
        pred_spam[t, 0] = 1  
    else:
        pred_spam[t, 0] = 0
        
pred_spam    

array([[1.],
       [1.],
       [1.],
       ...,
       [1.],
       [1.],
       [1.]])

In [64]:
pred_ham = np.zeros((ham_vector.shape[0], 1))
for t in range(ham_vector.shape[0]):
    conditional_prob_spam = prob_spam
    conditional_prob_ham = prob_ham
    
    for i in range(prob_ham_per_word.shape[1]):
        if ham_vector[t][i] == 1:
            conditional_prob_spam = conditional_prob_spam * prob_spam_per_word[0,i]
            conditional_prob_ham = conditional_prob_ham * prob_ham_per_word[0,i]
            
    if conditional_prob_spam != 0:
        prob = conditional_prob_spam / (conditional_prob_spam + conditional_prob_ham) * 100
    else:
        prob = 0.0
    
    if prob > 0.5 * 100:
        pred_ham[t, 0] = 1 
    else:
        pred_ham[t, 0] = 0
        
pred_ham

array([[0.],
       [0.],
       [0.],
       ...,
       [0.],
       [0.],
       [0.]])

#### k nearest Neighbors

In [46]:
prob_spam_per_word

array([[0.14898689, 0.09782922, 1.        , ..., 0.        , 0.        ,
        0.        ]])

In [47]:
prob_ham_per_word

array([[0.85101311, 0.90217078, 0.        , ..., 1.        , 1.        ,
        1.        ]])

In [85]:
#calculate manhattan distance
man_dist = np.abs(spam_vector - prob_spam_per_word)
man_dist.shape

(1500, 50540)

In [86]:
manhattan_sum = np.zeros((1500,1))
for j in range(1500):
    manhattan_sum[j, 0] = man_dist[j].sum()
    
manhattan_sum/50540

array([[0.65113103],
       [0.6505021 ],
       [0.65033063],
       ...,
       [0.65007533],
       [0.65013028],
       [0.65158603]])

In [59]:
man_dist = np.abs(ham_vector - prob_ham_per_word)
man_dist.shape

(3672, 50540)

In [61]:
manhattan_sum_ham = np.zeros((3672,1))
for j in range(3672):
    manhattan_sum_ham[j, 0] = man_dist[j].sum()
    
manhattan_sum_ham/50540

array([[0.34999635],
       [0.37432886],
       [0.34996341],
       ...,
       [0.34988539],
       [0.35262034],
       [0.35346798]])

####  Accuracy

In [76]:
import math

k_neigh_manhattan_pred_ham = np.zeros((3672, 1))
for i in range(3672):
    if (manhattan_sum_ham/50540)[i] >= 0.5:
        k_neigh_manhattan_pred_ham[i] = 1
    else:
        k_neigh_manhattan_pred_ham[i] = 0

array([[0.],
       [0.],
       [0.],
       ...,
       [0.],
       [0.],
       [0.]])

In [87]:
import math
k_neigh_manhattan_pred_spam = np.zeros((1500, 1))
for i in range(1500):
    if (manhattan_sum/50540)[i] >= 0.5:
        k_neigh_manhattan_pred_spam[i] = 1
    else:
        k_neigh_manhattan_pred_spam[i] = 0
        
k_neigh_manhattan_pred_spam

array([[1.],
       [1.],
       [1.],
       ...,
       [1.],
       [1.],
       [1.]])

In [72]:
#naive bayes
print("Naive Bayes Spam Prediction Accuracy: ",(sum(pred_spam)/1500) * 100)
print("Naive Bayes Ham Prediction Accuracy: ",100 - (sum(pred_ham)/3672) * 100)

Naive Bayes Spam Prediction Accuracy:  [94.46666667]
Naive Bayes Ham Prediction Accuracy:  [100.]


In [88]:
print("K Neighbors Spam Prediction Accuracy: ",(sum(k_neigh_manhattan_pred_spam)/1500) * 100)
print("K Neighbors Ham Prediction Accuracy: ",100 - (sum(k_neigh_manhattan_pred_ham)/3672) * 100)


K Neighbors Spam Prediction Accuracy:  [100.]
K Neighbors Ham Prediction Accuracy:  [100.]


In [89]:
#The above show the train accuracy.The naive bayes has a better confediantilty in its result as compared to k neighbors but 
# k neighbors has a general better accuracy.