In [None]:
# -*- coding: utf-8 -*-
# Code here inspired by this kernel on Kaggle: https://www.kaggle.com/veleon/spam-classification
# 
# The code performs some pre-processing tasks on the "ham" and "spam" sample mail
# provided by the above Kaggle project
#
# Namely: 
# - Strips email headers
# - Extracts email content
# - Trims word endings, normalizes capitalization, removes punctuation
# - Discards non-text content (attachments, multimedia)
import os
import email
import email.policy
from bs4 import BeautifulSoup # Process HTML-formatted emails

# Directory paths
hamdir = './hamnspam/ham/'
spamdir = './hamnspam/spam/'

# Message files are recognised purely by name length: every email filename is
# assumed to be exactly this many characters long, and any other directory
# entry is skipped.
EMAIL_FILENAME_LENGTH = 37


def list_email_filenames(directory, name_length=EMAIL_FILENAME_LENGTH):
    """Return the sorted entries of `directory` whose name is `name_length` characters long.

    os.listdir() returns entries in arbitrary order, so the names are sorted to
    keep the corpus order (and therefore the seeded train/test split performed
    further down) identical from run to run.
    """
    return sorted(name for name in os.listdir(directory) if len(name) == name_length)


# Get a list of ham and spam filenames
ham_filenames = list_email_filenames(hamdir)
spam_filenames = list_email_filenames(spamdir)

# Output some debugging info
ham_count = len(ham_filenames)
spam_count = len(spam_filenames)

print('SPAM email count: {}'.format(spam_count))
print('HAM email count: {}'.format(ham_count))
# The value is spam relative to ham (not spam as a share of all mail), so it is
# labelled as such; guard the division so an empty ham directory does not crash.
if ham_count:
    print('SPAM/HAM ratio: {ratio:.2f}%'.format(ratio=spam_count / ham_count * 100))
else:
    print('SPAM/HAM ratio: n/a (no HAM emails found)')

In [None]:
# Read in ham + spam email messages
def load_emails(directory, filenames):
    """Parse the files named in `filenames` (located in `directory`) into email messages.

    Args:
        directory: directory path prefix, including its trailing separator.
        filenames: names of the message files inside `directory`.

    Returns:
        list: one parsed message per readable file, in the order of `filenames`.
        Files that cannot be opened or parsed are reported and skipped, so the
        list may be shorter than `filenames`.
    """
    messages = []
    for filename in filenames:
        path = directory + filename
        try:
            # The context manager closes the file even when parsing fails.
            with open(path, 'rb') as f:
                # Equivalent to email.parser.BytesParser(policy=...).parse(f), but
                # does not depend on the email.parser submodule having been
                # imported by some other module first.
                msg = email.message_from_binary_file(f, policy=email.policy.default)
        except Exception:
            # Exception (not a bare except) so Ctrl-C still interrupts the cell.
            print('ERROR: Unable to read email {}'.format(path))
            continue

        messages.append(msg)
    return messages


ham_emails = load_emails(hamdir, ham_filenames)
spam_emails = load_emails(spamdir, spam_filenames)
        
# Print out the first (non-multipart) ham + spam messages in each list, as a sanity check
def print_first_simple_message(messages):
    """Print the headers and body of the first text/plain or text/html message.

    Messages of any other content type are skipped. A message whose body cannot
    be retrieved is skipped as well, so one bad email does not abort the check.
    Prints nothing if no message in `messages` qualifies.
    """
    for item in messages:
        if item.get_content_type() not in ('text/plain', 'text/html'):
            continue
        try:
            # get_content() can fail on some messages (the text-extraction code
            # later in this notebook guards the same call), so fetch the body
            # before printing anything for this message.
            body = item.get_content()
        except Exception:
            continue
        print('*** HEADERS ***\n')
        print(item.values())
        print('\n*** BODY ***\n')
        print(body)
        break


print('****** First HAM message ******\n')
print_first_simple_message(ham_emails)
print('\n')

print('*** First SPAM message ***\n')
print_first_simple_message(spam_emails)
print('\n')


In [None]:
# To tell spam from ham, we need features that separate the two classes.
# In our case, we'll use the text in the subject and body of each email.
# We extract that text from our emails below, ignoring everything except
# plaintext and HTML-formatted content
# (meaning we ignore images, video, and other binary formats)
from bs4 import BeautifulSoup as bs

# Convert email content to plaintext
def html_to_text(html):
    """Return the text of an HTML document with the markup stripped.

    Args:
        html: HTML source of an email body.

    Returns:
        str: the document's text, or '' (after printing an error) when the
        HTML could not be converted.

    Raises:
        bs4.FeatureNotFound: if the 'lxml' parser is not installed. That is an
        environment problem rather than a bad email, so it is not swallowed.
    """
    from bs4 import FeatureNotFound

    try:
        # Parsing happens inside the try block: building the soup is the step
        # that can fail, so it must be covered by the handler below.
        return bs(html, 'lxml').text
    except FeatureNotFound:
        raise
    except Exception:
        print('ERROR: Unable to convert email\'s HTML content to text')
        return ''
    
def email_to_text(email):
    """Flatten an email message into a single plain-text string.

    The result is the subject line (when the message has one) followed by the
    content of every text/plain and text/html part yielded by email.walk(),
    each piece wrapped in newlines. HTML parts are converted with
    html_to_text(); parts of any other content type are ignored.

    Args:
        email: a parsed message supporting ['subject'], walk(), and parts that
            provide get_content_type() and get_content(). Note that inside this
            function the name shadows the standard-library `email` module.

    Returns:
        str: the concatenated text, or '' when nothing could be extracted.
        Parts whose content cannot be retrieved are reported and skipped.
    """
    chunks = []

    try:
        subject = email['subject']
    except Exception:
        print('ERROR: Unable to parse email\'s subject line')
        subject = None
    # A message without a Subject header yields None; that is not an error,
    # there is simply no subject text to add.
    if subject is not None:
        chunks.append('\n' + str(subject) + '\n')

    for part in email.walk():
        content_type = part.get_content_type()
        if content_type not in ('text/plain', 'text/html'):
            continue

        try:
            content = part.get_content()
        except Exception:
            if content_type == 'text/plain':
                print('ERROR: get_content() method failed on plaintext email, possibly an encoding issue')
            else:
                print('ERROR: Unable to get content of HTML email message')
            continue

        if content_type == 'text/html':
            content = html_to_text(content)
        chunks.append('\n' + content + '\n')

    # Join once at the end instead of growing a string with repeated +=.
    return ''.join(chunks)

# Reduce every raw ham and spam message to its text-only form
ham_emails_textonly = list(map(email_to_text, ham_emails))
spam_emails_textonly = list(map(email_to_text, spam_emails))

# Sanity check: print the first entry of each corpus under its banner
for banner, corpus in (
    ('========= HAM =========', ham_emails_textonly),
    ('========= SPAM =========', spam_emails_textonly),
):
    print(banner)
    print(corpus[0])


In [None]:
# Once we have extracted the text, we need to score each word for "spamminess". 
#
# One popular way to do this is by using Naive Bayes classification to judge emails
# based on the "spamminess" of words used in the body. What we will do here is 
# to use ComplementNB (Complement Naive Bayes) from scikit-learn:
#
# https://scikit-learn.org/stable/modules/naive_bayes.html#complement-naive-bayes
#
# We're using ComplementNB because it is said to perform better than ordinary
# Naive Bayes, especially for text classification tasks

# Turn the ham and spam text into word-count feature vectors (CountVectorizer)
from sklearn.feature_extraction.text import CountVectorizer

# Ham first, then spam: the label vector below relies on this ordering.
# NOTE(review): the vocabulary is fitted on the whole corpus here, i.e. before
# the train/test split done in the next cell - confirm that is intended.
corpus = ham_emails_textonly + spam_emails_textonly
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

# Labels aligned with the corpus order (0 = ham, 1 = spam)
y = [0] * len(ham_emails_textonly) + [1] * len(spam_emails_textonly)

In [None]:
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import ComplementNB

# Hold out 25% of the data for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=12
)

# Fit the Complement Naive Bayes model on the training portion
cnb = ComplementNB()
cnb.fit(X_train, y_train)

# Compute precision and recall on the held-out test set
y_predicted = cnb.predict(X_test)

precision_pct = 100 * precision_score(y_test, y_predicted)
print("Precision: {:.2f}%".format(precision_pct))
recall_pct = 100 * recall_score(y_test, y_predicted)
print("Recall: {:.2f}%".format(recall_pct))

In [None]:
# Quick sanity check: hand-write one ham-like and one spam-like email and see
# how the trained model labels each of them

fake_ham = '''Hey Frank,

I know it might not be the best time but I need to have a talk with you about the model we've been working on. It seems like Naive Bayes might not be the best fit for our use-case. Would you be willing to consider another model such as a linear classifier?
'''.strip()

fake_spam = '''
Dear sir or madam,

Congratulations! You have received an invite to be part of the money making scheme of the century! Earn $$$ from the comfort of your own home, selling replica luxury goods online. Simply make the calls! WE provide the customers and WE close the deal. All you have to do is call!

Join today
'''.strip()

# Vectorize with the already-fitted vocabulary (transform only, no refitting)
X_fakeham, X_fakespam = [
    vectorizer.transform([text]) for text in (fake_ham, fake_spam)
]

ham_prediction = cnb.predict(X_fakeham)
print('Fake HAM is classified as (0 for ham, 1 for spam): {}'.format(ham_prediction))
spam_prediction = cnb.predict(X_fakespam)
print('Fake SPAM is classified as (0 for ham, 1 for spam): {}'.format(spam_prediction))