# CA02: Spam eMail Detection 
## using the Naive Bayes Classification Algorithm

In [150]:
#import libraries
import os
import numpy as np
from collections import Counter
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Data Preparation

In [151]:
# specifying the data paths: folders with the training and test e-mails,
# given relative to the current working directory ('./')
train_data = './train-mails' 
test_data = './test-mails'

'/' means root directory

**'./' means current directory**

'../' means parent directory


## Creating a Dictionary of the most common words (function breakdown)

Before writing the function, let's walk through the code step by step using the training data (`train_data`).

In [152]:
all_words = [] # creating an empty list for all of the words from the emails
emails = [os.path.join(train_data,f) for f in os.listdir(train_data)] # list comprehension, creating a list of paths to every file in the training folder
emails[0:5] # show the first five paths

['./train-mails\\3-1msg1.txt',
 './train-mails\\3-1msg2.txt',
 './train-mails\\3-1msg3.txt',
 './train-mails\\3-375msg1.txt',
 './train-mails\\3-378msg1.txt']

In [153]:
# the same list can be built with an explicit for loop instead of a list comprehension
emails = []
for f in os.listdir(train_data): # f is a bare file name from the training folder
    file_path = os.path.join(train_data,f) # prepend the folder so the file can be opened later
    emails.append(file_path)
emails[0:5]

['./train-mails\\3-1msg1.txt',
 './train-mails\\3-1msg2.txt',
 './train-mails\\3-1msg3.txt',
 './train-mails\\3-375msg1.txt',
 './train-mails\\3-378msg1.txt']

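Notice the delimiter '\\\\' in the path './train-mails\\\\3-1msg1.txt'. This is the path separator that `os.path.join` inserts on Windows; on other systems it is '/'. If your delimiter is different, you might have to change it wherever a path is typed out by hand.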

In [154]:
# looking at one file
# os.path.join inserts the path separator of the current OS, and the with-statement
# closes the file handle as soon as the contents have been printed
with open(os.path.join(train_data, '6-380msg1.txt'), "r") as f:
    print(f.read())
# we can see that the first line is the subject

Subject: re : 6 . 199 ipa

why must kind stuff decide vote ? since obviously ipa membership equal interest matter never , achieve us join ipa force issue , " pack " , never representative . why let invisible hand " market " idea operate freely instead ? fewer fewer ipa 's made-up symbol , either organization become completely irrelevant , own mind respond " market force " , perhap group step propose system manifestly better anyone else 's , achieve standardization . really few top name phonetics together editor few journal , probably something . simply anywhere else : someone publish truly superior system start . finally , reality , seem certain trend occur anyway particular resistance hachek correspond ipa symbol wane . put inconsistency american v . canadian v . british spell , probably here . gonna worry something , worry those case where te same symbol different commonly meaning ' j ' ' y ' . alexis mr



In [155]:
#getting all the words from all emails
all_words = [] #start from an empty list so re-running the cell does not duplicate words
for mail in emails: #loop through all the emails
    with open(mail) as m: #open each email; the with block closes it again
        for line in m: #loop through each line
            words = line.split() #split lines into words separated by a whitespace
            all_words += words #extend the list of all words with the words of this line
print(all_words[0:20]) #preview the first 20 tokens

['Subject:', 're', ':', '2', '.', '882', 's', '-', '>', 'np', 'np', '>', 'deat', ':', 'sun', ',', '15', 'dec', '91', '2']


We can see that the subject line is included. Let's fix that by reading only the third line of each file, which holds the message body.

In [156]:
# same loop as in the previous cell, but only the line at index 2 of each file is used
all_words = []
for mail in emails: #loop through all the emails
    with open(mail) as m: #open each email
        for i, line in enumerate(m): #loop through each line; i is the 0-based line number
            if i == 2: #use only the 3rd line [2] -- this condition is the fix that leaves out the subject line
                words = line.split() #split lines into words separated by a whitespace
                all_words += words #append each word to the list of all words
print(all_words[0:20])

['>', 'deat', ':', 'sun', ',', '15', 'dec', '91', '2', ':', '25', ':', '2', 'est', '>', ':', 'michael', '<', 'mmorse', '@']


In [157]:
# creating a counter object
dictionary = Counter(all_words) #Counter() creates a dictionary-like object {'element': count}, e.g., {'success1': 169, 'plan': 169}
dictionary #display every word together with its count

Counter({'>': 1605,
         'deat': 50,
         ':': 5011,
         'sun': 17,
         ',': 18338,
         '15': 208,
         'dec': 18,
         '91': 4,
         '2': 866,
         '25': 134,
         'est': 21,
         'michael': 36,
         '<': 69,
         'mmorse': 1,
         '@': 752,
         'vm1': 2,
         '.': 28559,
         'yorku': 1,
         'ca': 126,
         'subject': 209,
         're': 190,
         '864': 1,
         'query': 57,
         'wlodek': 1,
         'zadrozny': 1,
         'ask': 206,
         '"': 4410,
         'anything': 93,
         'interest': 362,
         'construction': 74,
         's': 576,
         'np': 27,
         'second': 132,
         'much': 318,
         'relate': 77,
         'consider': 91,
         'form': 380,
         'discuss': 49,
         'list': 935,
         'late': 37,
         'reduplication': 54,
         '?': 1102,
         'logical': 13,
         'sense': 36,
         'john': 109,
         'mcnamara': 3,
 

In [158]:
# keys cannot be deleted from a dictionary while we are looping over that same dictionary,
# so we first copy the keys into a separate list and loop over the copy instead
list_to_remove = list(dictionary) #creating a list of keys (or words only) without the counts
list_to_remove[0:10]
#notice how each value is unique, because the list is based on the Counter

['>', 'deat', ':', 'sun', ',', '15', 'dec', '91', '2', '25']

In [159]:
# prune the Counter with the help of list_to_remove: a token is dropped when it
# contains anything other than letters, or when it is just one character long
for item in list_to_remove:
    if not item.isalpha() or len(item) == 1:
        del dictionary[item]  # remove the entry stored under this key

dictionary

Counter({'deat': 50,
         'sun': 17,
         'dec': 18,
         'est': 21,
         'michael': 36,
         'mmorse': 1,
         'yorku': 1,
         'ca': 126,
         'subject': 209,
         're': 190,
         'query': 57,
         'wlodek': 1,
         'zadrozny': 1,
         'ask': 206,
         'anything': 93,
         'interest': 362,
         'construction': 74,
         'np': 27,
         'second': 132,
         'much': 318,
         'relate': 77,
         'consider': 91,
         'form': 380,
         'discuss': 49,
         'list': 935,
         'late': 37,
         'reduplication': 54,
         'logical': 13,
         'sense': 36,
         'john': 109,
         'mcnamara': 3,
         'name': 878,
         'tautologous': 4,
         'thus': 33,
         'level': 192,
         'indistinguishable': 1,
         'here': 397,
         'support': 117,
         'those': 345,
         'semantics': 35,
         'irrelevant': 7,
         'natural': 80,
         'language': 1

In [160]:
# the last thing the function does is it picks only 3000 of the most common words
# note: this rebinds `dictionary` from a Counter to a list of (word, count) tuples,
# so the cell cannot be run a second time without re-creating the Counter first
dictionary = dictionary.most_common(3000) # .most_common is a method of Counter

### All above steps in a function make_Dictionary

In [161]:
# make_Dictionary (modified)
# Only the line at index 2 of every e-mail (the message body) is read, so the
# subject line never ends up in the vocabulary.
def make_Dictionary(root_dir, vocab_size=3000):
    """Build the vocabulary of the most frequent words in a folder of e-mails.

    Parameters
    ----------
    root_dir : str
        Folder that contains one text file per e-mail.
    vocab_size : int, optional
        Maximum number of words to keep. Defaults to 3000, the value used
        throughout this notebook.

    Returns
    -------
    list of (str, int)
        (word, count) pairs ordered from most to least frequent. Tokens that
        contain non-letters and single-character tokens are excluded.
    """
    word_counts = Counter()
    # os.listdir gives no ordering guarantee; sorting makes the order in which
    # equally frequent words appear (and which of them survive the cut-off)
    # reproducible from run to run.
    for file_name in sorted(os.listdir(root_dir)):
        with open(os.path.join(root_dir, file_name)) as m:
            for i, line in enumerate(m):
                if i == 2:  # 3rd line = message body
                    word_counts.update(line.split())  # split on whitespace and count
                    break  # nothing after the body line is used

    # Keep purely alphabetic tokens of at least two characters. Building a new
    # Counter avoids deleting keys from the one being inspected.
    kept = Counter({word: count for word, count in word_counts.items()
                    if word.isalpha() and len(word) > 1})
    return kept.most_common(vocab_size)

## Creating feature columns and labels

In [162]:
# extract_features() modified
# - the spam label comes from 'spmsg' in the file name, so it works with any path separator
# - the vocabulary is converted to a word -> column lookup once, instead of being scanned for every word
# - unused bookkeeping (count, wordID default, redundant label reset) was removed
def extract_features(mail_dir, vocabulary=None, n_features=3000):
    """Turn a folder of e-mails into a word-count matrix and a label vector.

    Parameters
    ----------
    mail_dir : str
        Folder that contains one text file per e-mail.
    vocabulary : sequence of (word, count), optional
        Output of make_Dictionary(); the position of a word in this sequence
        is its column in the matrix. When omitted, the notebook-level variable
        `dictionary` is used, as in the original version of this function.
    n_features : int, optional
        Number of columns of the matrix (default 3000). Vocabulary entries
        beyond this position are ignored.

    Returns
    -------
    features_matrix : numpy.ndarray, shape (number of files, n_features)
        features_matrix[r, c] is how often vocabulary word c occurs in the
        body line (line index 2) of e-mail r.
    train_labels : numpy.ndarray, shape (number of files,)
        1 for files whose name contains 'spmsg', 0 otherwise.
    """
    if vocabulary is None:
        vocabulary = dictionary  # global created by the make_Dictionary cell

    # word -> column index, built once so each lookup is a single dict access
    column_of = {entry[0]: col for col, entry in enumerate(vocabulary)
                 if col < n_features}

    # sorted so that the row order does not depend on the order os.listdir returns
    file_names = sorted(os.listdir(mail_dir))
    features_matrix = np.zeros((len(file_names), n_features))
    train_labels = np.zeros(len(file_names))  # 0 = non spam by default

    for docID, file_name in enumerate(file_names):
        with open(os.path.join(mail_dir, file_name)) as fi:
            for line_no, line in enumerate(fi):
                if line_no == 2:  # use only the message body
                    for word, count in Counter(line.split()).items():
                        col = column_of.get(word)
                        if col is not None:  # word is part of the vocabulary
                            features_matrix[docID, col] = count
                    break  # remaining lines are not used

        # test the file name only: a folder name containing 'spmsg' must not
        # turn every e-mail in it into spam
        if 'spmsg' in file_name:
            train_labels[docID] = 1
    return features_matrix, train_labels

# Running preprocessing functions

### make_Dictionary

In [163]:
# Running preprocessing functions
# the vocabulary is built from the training e-mails only; the result is stored in the
# notebook-level variable `dictionary`, which extract_features() reads
print('Creating a training dictionary...', end='')
dictionary = make_Dictionary(train_data)
print('Done!')

Creating a training dictionary...Done!


### extract_features

In [164]:
# build the word-count matrices and label vectors for both data sets
print ("reading and processing emails from TRAIN and TEST folders")
print('Creating a feature_matrix for training data...', end='')
features_matrix, labels = extract_features(train_data) # extracts feature matrix and labels and saves into the respective variables
print('Done!')
print('Creating a feature_matrix for testing data...', end='')
test_features_matrix, test_labels = extract_features(test_data) # the test e-mails are encoded with the dictionary built from the training e-mails
print('Done!')

reading and processing emails from TRAIN and TEST folders
Creating a feature_matrix for training data...Done!
Creating a feature_matrix for testing data...Done!


In [165]:
#let's take a look at the test_features_matrix (one row per test e-mail, one column per dictionary word)
test_features_matrix # the numbers represent the counts of words (columns) in documents (rows)

array([[ 1.,  0.,  0., ...,  0.,  0.,  0.],
       [ 1.,  1.,  0., ...,  0.,  0.,  0.],
       [ 1.,  0.,  0., ...,  0.,  0.,  0.],
       ...,
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [17.,  2.,  0., ...,  0.,  0.,  0.]])

## Training

In [166]:
# NOTE(review): GaussianNB treats every feature as normally distributed within a class,
# while the features here are word counts -- worth comparing against MultinomialNB; TODO confirm
model = GaussianNB() # we choose Gaussian model, because we assume normal distribution
print ("Training Model using Gaussian Naibe Bayes algorithm .....")
model.fit(features_matrix, labels) # we fit the model X = features_matrix, y = labels
print ("Training completed")

Training Model using Gaussian Naibe Bayes algorithm .....
Training completed


## Classifying

In [167]:
# classify the test e-mails with the model fitted in the training cell
print ("testing trained model to predict Test Data labels")
predicted_labels = model.predict(test_features_matrix) # one predicted label per row of test_features_matrix
print ("Completed classification of the Test Data")

testing trained model to predict Test Data labels
Completed classification of the Test Data


## Accuracy

In [168]:
print (accuracy_score(test_labels, predicted_labels)) # fraction (between 0 and 1) of test e-mails whose predicted label equals the true label

0.9615384615384616
