In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

In [2]:
emails_df = pd.read_csv("../input/spam-filter/emails.csv")
emails_df

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1
...,...,...
5723,Subject: re : research and development charges...,0
5724,"Subject: re : receipts from visit jim , than...",0
5725,Subject: re : enron case study update wow ! a...,0
5726,"Subject: re : interest david , please , call...",0


In [3]:
emails_df.shape

(5728, 2)

In [4]:
emails_df = emails_df.drop_duplicates(keep = 'last')
emails_df.shape 

(5695, 2)

In [5]:
# Describing the values in the Spam column
emails_df.groupby('spam').describe()

Unnamed: 0_level_0,text,text,text,text
Unnamed: 0_level_1,count,unique,top,freq
spam,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,4327,4327,"Subject: hello guys , i ' m "" bugging you "" f...",1
1,1368,1368,Subject: naturally irresistible your corporate...,1


In [6]:
from sklearn.model_selection import train_test_split
X = emails_df.drop(columns=['spam'])
Y = emails_df.spam
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)

In [7]:
#unigram
from sklearn.feature_extraction.text import CountVectorizer
corpus = X_train['text'].to_numpy()
vectorizer = CountVectorizer(min_df = 50)
C = vectorizer.fit_transform(corpus)

In [8]:
#bigram
vectorizer2 = CountVectorizer(analyzer='word', ngram_range=(2, 2), min_df = 50)
C2 = vectorizer2.fit_transform(corpus)

In [9]:
X_train_unigram = X_train.drop(columns=['text'])
X_train_bigram = X_train_unigram.copy()

vocab_unigram = vectorizer.get_feature_names()
counts_unigram = C.toarray()

vocab_bigram = vectorizer2.get_feature_names()
counts_bigram = C2.toarray()

#Vectorizing the training datas accordingly
for i in range(len(vocab_unigram)):
    X_train_unigram[vocab_unigram[i]] = counts_unigram[:,i]
for i in range(len(vocab_bigram)):
    X_train_bigram[vocab_bigram[i]] = counts_bigram[:,i]

  if sys.path[0] == '':
  


In [10]:
def P_words(spam,ham,vocabulary):
    
    dict_prob = {} #For each word in vocabulary consist P(word|spam) and P(word|ham)
    for word in vocabulary:
        #P(word|spam) = [num(word,spam) + @] / [num(spam) + @*len(vocabulary)]
        #P(word|ham) = [num(word,ham) + @]/ [num(ham) + @*len(vocabulary)]
        ##num(X) --> number of occurences of X. We chose @ = 1.
        p_word_given_spam = (len(spam[spam[word] != 0])+1)/(len(spam)+len(vocabulary))
        p_word_given_ham = (len(ham[ham[word] != 0])+1)/(len(ham)+len(vocabulary))
        
        dict_prob[word] = {'spam': p_word_given_spam, 'ham': p_word_given_ham}

    return dict_prob

In [11]:
def Naive_Bayes(X_train,y_train,X_test,vocabulary,unigram):
    
    training_set = X_train.copy()
    training_set['spam'] = y_train #Adding the label to training set.

    spam = training_set[training_set['spam'] == 1] #All mails in training set that are spam
    ham = training_set[training_set['spam'] == 0]  #All mails in training set that are ham

    dict_prob = P_words(spam,ham,vocabulary)

    P_spam = len(spam)/len(training_set)#P(spam)
    P_ham = len(ham)/len(training_set)#P(ham)  
    
    #Vectorizing the test set
    corpus = X_test['text'].to_numpy()
    if(unigram):
        vectorizer_test = CountVectorizer(min_df = 20)
    else:
        vectorizer_test = CountVectorizer(analyzer='word', ngram_range=(2, 2), min_df = 20)
        
    C_test = vectorizer_test.fit_transform(corpus)    
    
    test_set = C_test.toarray() #Counts of words in test_vocab for each mail
    test_vocab = vectorizer_test.get_feature_names() #Vocabulary of test set
    
    results = list() 
    for row in test_set: #For each mail
        P_spamWord = math.log(P_spam)
        P_hamWord = math.log(P_ham)
        for w_index in range(len(row)):
            if(row[w_index] != 0): #If word exists in the mail
                word = test_vocab[w_index]
                if(word in dict_prob): #If this word is also in training vocabulary
                    P_spamWord += math.log(dict_prob[word]['spam'])
                    P_hamWord += math.log(dict_prob[word]['ham'])
                else:
                    continue  #Ignoring the words we have not seen.
            else:
                continue
        if(P_spamWord>P_hamWord):
            results.append(1) #Spam
        else:
            results.append(0) #Ham
            
    return results

In [12]:
results = Naive_Bayes(X_train_unigram,y_train,X_test,vocab_unigram,True)  ##Unigram
results_2 = Naive_Bayes(X_train_bigram,y_train,X_test,vocab_bigram,False) ##Bigram

In [13]:
print("unigram accuracy:", accuracy_score(y_test,results))
print("unigram precision:", precision_score(y_test,results))
print("unigram recall:", recall_score(y_test,results))
print("unigram f1_score:", f1_score(y_test,results))

print('')

print("bigram accuracy:", accuracy_score(y_test,results_2))
print("bigram precision:", precision_score(y_test,results_2))
print("bigram recall:", recall_score(y_test,results_2))
print("bigram f1_score:", f1_score(y_test,results_2))

unigram accuracy: 0.7585601404741001
unigram precision: 1.0
unigram recall: 0.07094594594594594
unigram f1_score: 0.13249211356466875

bigram accuracy: 0.7936786654960492
bigram precision: 1.0
bigram recall: 0.20608108108108109
bigram f1_score: 0.34173669467787116
