# CMSC 197 (Machine Learning) - Problem Set 2

### Earl James Q. Rentillo

In [211]:
# import preliminary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sklearn
import sklearn.metrics
import re
import email
from collections import Counter

# import sklearn libraries
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix 

## Preprocessing

In [7]:
# create emails dataframe
emails = pd.DataFrame()
labels = []
locs = []

# read labels file, split the label and file location, then append lists as part of the dataframe
with open('labels') as f:
    for line in f:
        label, loc = line.split()
        labels.append(label)
        locs.append(loc.replace('../data/', ''))

emails['location'] = locs
emails['label'] = labels
emails.head()
            

Unnamed: 0,location,label
0,000/000,ham
1,000/001,spam
2,000/002,spam
3,000/003,ham
4,000/004,spam
5,000/005,ham
6,000/006,ham
7,000/007,spam
8,000/008,spam
9,000/009,spam


In [150]:
# reading stopwords.txt

with open('stop_words.txt', 'r') as f:
    stopwords = f.readlines()
    
# remove '\n'
stopwords = [word[:-1] for word in stopwords]

# check first 5 words
print(stopwords[:5])

['a', 'able', 'about', 'above', 'abst']


In [158]:
# defining functions to get email body and clean an email
 
def get_email_body(message):
    msg = email.message_from_file(message)

    # if the email is multipart
    if msg.is_multipart():
        for part in msg.walk():
            # find part with the text/plain content type
            if part.get_content_type() == 'text/plain':
                # extract part as a string
                body = part.get_payload()
                return body

    # if the message is single part
    else:
        body = msg.get_payload()
        return body

def clean_email(content):
    # remove html tags, non-alphabets, and links
    content = re.sub(r'<[^>]*>', '', content)
    content = re.sub(r'[^a-zA-Z\n ]','', content)
    content = re.sub(r'http\S+', '', content)
    content = re.sub(r'www', '', content)
    content = re.sub(r'goo\S+', '', content)
    content = content.lower().split()
    
    # remove stopwords in email content
    content = [word for word in content if word not in stopwords]
    
    content = ' '.join(content)
    # remove stop words and other unwanted words 
    unwanted_words = ['mime', 'mimeversion', 'contenttransferencoding', 'contenttransfer' , 'contenttype',
                      'textplain', 'texthtml', 'charsetiso', 'formatflowed', 'multipart message' ]
    for word in unwanted_words:
        content = content.replace(word, '')
   
    return content

In [159]:
# reading the email files and cleaning 

contents = []

for loc in emails['location']:

    with open(f'data/{loc}', 'r', encoding = 'latin-1') as f:
        content = clean_email(str(get_email_body(f)))
        contents.append((content))
    

In [160]:
emails['email_content'] = contents
emails.head()

Unnamed: 0,location,label,email_content
0,000/000,ham,mailing list queried weeks ago running set arc...
1,000/001,spam,luxury watches buy rolex rolex cartier bvlgari...
2,000/002,spam,academic qualifications prestigious nonacc red...
3,000/003,ham,greetings verify subscription planfans list ch...
4,000/004,spam,chauncey conferred luscious continued tonsillitis


In [184]:
# set emails in folders 0-70 to train set
train_set = emails[emails['location'] < '071']

# set emails in folders 71-127 to test set
test_set = emails[emails['location'] >= '071']

# check length of train and test set
print(f'Train set length: {len(train_set)}')
print(f'Test set length: {len(test_set)}')

Train set length: 21300
Test set length: 16522


In [197]:
# splitting the training set into ham and spam

train_ham = emails[emails['label'] == 'ham'].reset_index()
train_spam = emails[emails['label'] == 'spam'].reset_index()

In [198]:
# extract the most common 10000 words from the training dataset
most_used_words = Counter(" ".join(train_set['email_content']).split()).most_common(10000)

top_used_words = pd.DataFrame(most_used_words, columns = ['words', 'num_of_occurences'])
top_used_words.head(5)

Unnamed: 0,words,num_of_occurences
0,will,11304
1,bb,7395
2,board,5148
3,company,4533
4,price,4497


## Creating the Feature Matrices

In [199]:
# creating the feature dictionary for ham and spam

ham_word_counts = {unique_words: [0] * len(train_ham) for unique_words, _ in most_used_words}
spam_word_counts = {unique_words: [0] * len(train_spam) for unique_words, _ in most_used_words}

top_words_list = [key for key, _ in most_used_words]

In [200]:
# spam feature set
spam_word_count = pd.DataFrame(spam_word_counts)

# loop through the train spam set index
for i in train_spam.index:
    # count the word frequency per row in the train spam set
    frequency = dict(Counter(train_spam['email_content'][i].split()))
    for key, val in frequency.items():
        if key in top_words_list:
            spam_word_count.loc[i, key] += val

spam_feat_matrix = spam_word_count.to_numpy()
spam_feat_matrix

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [203]:
# ham feature set
ham_word_count = pd.DataFrame(ham_word_counts)

for i in train_ham.index:
    frequency = dict(Counter(train_ham['email_content'][i].split()))
    # count the word frequency per row in the train ham set
    for key, val in frequency.items():
        if key in top_words_list:  # add the word frequency to the row and column where the word is found
            ham_word_count.loc[i, key] += val

ham_feat_matrix = ham_word_count.to_numpy()
ham_feat_matrix

array([[2, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

## Computing the ham and spam priors

In [204]:
# number of spam emails
spam_train_size = train_spam.shape[0]
# number of ham emails
ham_train_size = train_ham.shape[0]
# total number of emails for training
total_train_size = train_set.shape[0]      

spam_prior =  spam_train_size / total_train_size
ham_prior = ham_train_size / total_train_size

print("Prior probabilities for spam: ", spam_prior)
print("Prior probabilities for ham: ", ham_prior)

Prior probabilities for spam:  1.1695774647887325
Prior probabilities for ham:  0.6061032863849766


## Computing the likelihood of each word

In [205]:
# sum of each words in spam
spam_train_words_sum = np.sum(spam_feat_matrix, axis=0)
# sum of each words in ham
ham_train_words_sum = np.sum(ham_feat_matrix, axis=0)

# total sum of words in spam
spam_train_word_total = spam_train_words_sum.sum()
# total sum of words in ham
ham_train_word_total = ham_train_words_sum.sum()

# laplace smoothing
lambda_val = 1 
ham_likelihood = {}
spam_likelihood = {}

for i in range(len(top_words_list)):
    c_ham = (ham_train_words_sum[i]+lambda_val) / (ham_train_word_total + lambda_val*len(top_words_list))
    c_spam = (spam_train_words_sum[i]+lambda_val) / (spam_train_word_total + lambda_val*len(top_words_list))
    
    ham_likelihood[top_words_list[i]] = c_ham
    spam_likelihood[top_words_list[i]] = c_spam

## Classifying the emails

In [206]:
# defining function to classify emails

def classify_emails(email_content, spam_prior, ham_prior, spam_likelihood, ham_likelihood, top_words_list):
    
    # get log values of spam and ham probabilities
    spam_prob_log = np.log(spam_prior)
    ham_prob_log = np.log(ham_prior)
    
    email_words = str(email_content).split()
    
    for word in email_words:
        if word in top_words_list:
            ham_prob_log += np.log(ham_likelihood[word])
            spam_prob_log += np.log(spam_likelihood[word])
            
    if spam_prob_log > ham_prob_log:
        return 'spam'
    else:
        return 'ham'

## Testing the classifier

In [208]:
# creating a dictionary with the predicted label and respective location

predicted_dict = {'location':[], 'predicted_label': []}

for path, content in zip(test_set['location'], test_set['email_content']):
    predicted_dict['location'].append(path) 
    pred_label = classify_emails(content, spam_prior, ham_prior, spam_likelihood, ham_likelihood, top_words_list)
    
    #add the predicted label by the classfying_emails function
    predicted_dict['predicted_label'].append(pred_label) 

In [216]:
predicted_test = pd.DataFrame.from_dict(predicted_dict)

merged_emails = pd.merge(test_set, predicted_test, on='location')
merged_emails.head()

Unnamed: 0,location,label,email_content,predicted_label
0,071/000,spam,hesitantly derive perverse satisfaction clodho...,spam
1,071/001,ham,things perform experiment display will remain ...,ham
2,071/002,spam,best offer month viggra ci ialis vaiium xa naa...,spam
3,071/003,spam,de ar wne cr doesnt matter ow real st mmed ia ...,spam
4,071/004,spam,special offer adobe video collection adobe pre...,spam


## Performance Evaluation

In [214]:
# performance evaluation for accuracy, recall, precision and f1 score

actual_label = merged_emails['label'].to_numpy()
predicted_label = merged_emails['predicted_label'].to_numpy()

print('Performance Evaluation on Test Set\n')
print('Accuracy score: ', accuracy_score(actual_label, predicted_label))
print("Recall score:", recall_score(actual_label, predicted_label, pos_label="spam"))
print("Precision score:", precision_score(actual_label, predicted_label, pos_label="spam"))
print("F1 score:", f1_score(actual_label, predicted_label, pos_label="spam"))

Performance Evaluation on Test Set

Accuracy score:  0.9391720130734778
Recall score: 0.9311180960933992
Precision score: 0.9775598717706958
F1 score: 0.9537739754381123


In [215]:
# confusion matrix

c_matrix = confusion_matrix(merged_emails['label'].to_numpyb(), merged_emails['predicted_label'].to_numpy(), labels=["spam", "ham"])
c_matrix

array([[10368,   767],
       [  238,  5149]], dtype=int64)