In [1]:
# python ≥3.5 required
import sys
assert sys.version_info >= (3, 5)

# scikit-Learn ≥0.20 required
import sklearn
assert sklearn.__version__ >= "0.20"

# common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# plotting
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)


### Spam Classifier

Fetch data:

In [2]:
import tarfile
import urllib
import os

DOWNLOAD_ROOT = "http://spamassassin.apache.org/old/publiccorpus/"
HAM_URL = DOWNLOAD_ROOT + "20030228_easy_ham.tar.bz2"
SPAM_URL = DOWNLOAD_ROOT + "20030228_spam.tar.bz2"
SPAM_PATH = os.path.join("spam")

def fetch_spam_data(spam_url=SPAM_URL, spam_path=SPAM_PATH):
    if not os.path.isdir(spam_path):
        os.makedirs(spam_path)
    for filename, url in (('ham.tar.bz2', HAM_URL), ('spam.tar.bz2', SPAM_URL)):
        path = os.path.join(spam_path, filename)
        if not os.path.isfile(path):
            urllib.request.urlretrieve(url, path)
        tar_bz2_file = tarfile.open(path)
        tar_bz2_file.extractall(path=SPAM_PATH)
        tar_bz2_file.close()

In [3]:
fetch_spam_data()

In [4]:
# loading all emails
HAM_DIR = os.path.join(SPAM_PATH, 'easy_ham')
SPAM_DIR = os.path.join(SPAM_PATH, 'spam')
ham_filenames = [name for name in sorted(os.listdir(HAM_DIR)) if len(name) > 20]
spam_filenames = [name for name in sorted(os.listdir(SPAM_DIR)) if len(name) > 20]

In [5]:
len(ham_filenames)

2500

In [6]:
len(spam_filenames)

500

In [7]:
# using python Email module to parse emails (headers, encoding et al)
import email
import email.policy

def load_email(is_spam, filename, spam_path=SPAM_PATH):
    directory = 'spam' if is_spam else 'easy_ham'
    with open(os.path.join(spam_path, directory, filename), 'rb') as f:
        return email.parser.BytesParser(policy=email.policy.default).parse(f)

In [8]:
ham_emails = [load_email(is_spam=False, filename=name) for name in ham_filenames]
spam_emails = [load_email(is_spam=True, filename=name) for name in spam_filenames]

In [9]:
# let's see an example of each email
print(ham_emails[0].get_content().strip())

Date:        Wed, 21 Aug 2002 10:54:46 -0500
    From:        Chris Garrigues <cwg-dated-1030377287.06fa6d@DeepEddy.Com>
    Message-ID:  <1029945287.4797.TMDA@deepeddy.vircio.com>


  | I can't reproduce this error.

For me it is very repeatable... (like every time, without fail).

This is the debug log of the pick happening ...

18:19:03 Pick_It {exec pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace} {4852-4852 -sequence mercury}
18:19:03 exec pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace 4852-4852 -sequence mercury
18:19:04 Ftoc_PickMsgs {{1 hit}}
18:19:04 Marking 1 hits
18:19:04 tkerror: syntax error in expression "int ...

Note, if I run the pick command by hand ...

delta$ pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace  4852-4852 -sequence mercury
1 hit

That's where the "1 hit" comes from (obviously).  The version of nmh I'm
using is ...

delta$ pick -version
pick -- nmh-1.0.4 [compiled on fuchsia.cs.mu.OZ.AU at Sun Mar 17 14:55:56 

In [10]:
print(spam_emails[0].get_content().strip())

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
<HTML><HEAD>
<META content="text/html; charset=windows-1252" http-equiv=Content-Type>
<META content="MSHTML 5.00.2314.1000" name=GENERATOR></HEAD>
<BODY><!-- Inserted by Calypso -->
<TABLE border=0 cellPadding=0 cellSpacing=2 id=_CalyPrintHeader_ rules=none 
style="COLOR: black; DISPLAY: none" width="100%">
  <TBODY>
  <TR>
    <TD colSpan=3>
      <HR color=black noShade SIZE=1>
    </TD></TR></TD></TR>
  <TR>
    <TD colSpan=3>
      <HR color=black noShade SIZE=1>
    </TD></TR></TBODY></TABLE><!-- End Calypso --><!-- Inserted by Calypso --><FONT 
color=#000000 face=VERDANA,ARIAL,HELVETICA size=-2><BR></FONT></TD></TR></TABLE><!-- End Calypso --><FONT color=#ff0000 
face="Copperplate Gothic Bold" size=5 PTSIZE="10">
<CENTER>Save up to 70% on Life Insurance.</CENTER></FONT><FONT color=#ff0000 
face="Copperplate Gothic Bold" size=5 PTSIZE="10">
<CENTER>Why Spend More Than You Have To?
<CENTER><FONT color=#ff0000 face="Copp

Some of the emails are multipart, with images and attachments (themselves having other attachments). Let's look at the different structures we have.

In [11]:
def get_email_structure(email):
    if isinstance(email, str):
        return email
    payload = email.get_payload()
    if isinstance(payload, list):
        return ('multipart({})'.format(', ').join([
            get_email_structure(sub_email)
            for sub_email in payload
        ]))
    else:
        return email.get_content_type()

In [12]:
from collections import Counter

def structures_counter(emails):
    structures = Counter()
    for email in emails:
        structure = get_email_structure(email)
        structures[structure] += 1
    return structures

In [13]:
structures_counter(ham_emails).most_common()

[('text/plain', 2411),
 ('text/plainmultipart(, )application/pgp-signature', 66),
 ('text/plainmultipart(, )text/html', 8),
 ('text/plainmultipart(, )text/plain', 5),
 ('text/plainmultipart(, )application/octet-stream', 2),
 ('text/plainmultipart(, )text/enriched', 1),
 ('text/plainmultipart(, )application/ms-tnefmultipart(, )text/plain', 1),
 ('text/plainmultipart(, )text/plainmultipart(, )text/plainmultipart(, )application/pgp-signature',
  1),
 ('text/plainmultipart(, )video/mng', 1),
 ('text/plainmultipart(, )application/x-pkcs7-signature', 1),
 ('text/plainmultipart(, )text/plainmultipart(, )text/plainmultipart(, )text/rfc822-headers',
  1),
 ('text/plainmultipart(, )text/plainmultipart(, )text/plainmultipart(, )text/plainmultipart(, )application/x-pkcs7-signature',
  1),
 ('text/plainmultipart(, )application/x-java-applet', 1)]

In [14]:
structures_counter(spam_emails).most_common()

[('text/plain', 237),
 ('text/html', 208),
 ('text/plainmultipart(, )text/html', 45),
 ('text/plainmultipart(, )image/jpeg', 3),
 ('text/htmlmultipart(, )application/octet-stream', 2),
 ('text/plainmultipart(, )application/octet-stream', 1),
 ('text/htmlmultipart(, )text/plain', 1),
 ('text/htmlmultipart(, )application/octet-streammultipart(, )image/jpeg', 1),
 ('text/plainmultipart(, )text/htmlmultipart(, )image/gif', 1),
 ('multipart/alternative', 1)]

It seems that the ham emails are more often plain text, while spam has quite a lot of HTML. Moreover, quite a few ham emails are signed using PGP, while no spam is. In short, it seems that the email structure is useful information to have.

Now let's take a look at the email headers:

In [15]:
for header, value in spam_emails[0].items():
    print(header,":",value)

Return-Path : <12a1mailbot1@web.de>
Delivered-To : zzzz@localhost.spamassassin.taint.org
Received : from localhost (localhost [127.0.0.1])	by phobos.labs.spamassassin.taint.org (Postfix) with ESMTP id 136B943C32	for <zzzz@localhost>; Thu, 22 Aug 2002 08:17:21 -0400 (EDT)
Received : from mail.webnote.net [193.120.211.219]	by localhost with POP3 (fetchmail-5.9.0)	for zzzz@localhost (single-drop); Thu, 22 Aug 2002 13:17:21 +0100 (IST)
Received : from dd_it7 ([210.97.77.167])	by webnote.net (8.9.3/8.9.3) with ESMTP id NAA04623	for <zzzz@spamassassin.taint.org>; Thu, 22 Aug 2002 13:09:41 +0100
From : 12a1mailbot1@web.de
Received : from r-smtp.korea.com - 203.122.2.197 by dd_it7  with Microsoft SMTPSVC(5.5.1775.675.6);	 Sat, 24 Aug 2002 09:42:10 +0900
To : dcek1a1@netsgo.com
Subject : Life Insurance - Why Pay More?
Date : Wed, 21 Aug 2002 20:31:57 -1600
MIME-Version : 1.0
Message-ID : <0103c1042001882DD_IT7@dd_it7>
Content-Type : text/html; charset="iso-8859-1"
Content-Transfer-Encoding : qu

There's probably a lot of useful information in there, such as the sender's email address (12a1mailbot1@web.de looks fishy), but we will just focus on the Subject header:

In [16]:
spam_emails[0]["Subject"]

'Life Insurance - Why Pay More?'

In [18]:
# split train and test set
import numpy as np
from sklearn.model_selection import train_test_split

X = np.array(ham_emails + spam_emails)
y = np.array([0] * len(ham_emails) + [1] * len(spam_emails))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

First, we will need a function to convert HTML to plain text. Let's hack a quick & dirty solution using regular. The following function first drops the <head> section, then converts all <a> tags to the word HYPERLINK, then it gets rid of all HTML tags, leaving only the plain text. For readability, it also replaces multiple newlines with single newlines, and finally it unescapes html entities (such as &gt; or &nbsp;):

In [23]:
import re
from html import unescape

def html_to_plain_text(html):
    text = re.sub('<head.*?>.*?</head>', '', html, flags=re.M | re.S | re.I)
    text = re.sub('<a\s.*?>', ' HYPERLINK ', text, flags=re.M | re.S | re.I)
    text = re.sub('<.*?>', '', text, flags=re.M | re.S)
    text = re.sub(r'(\s*\n)+', '\n', text, flags=re.M | re.S)
    return unescape(text)

In [24]:
html_spam_emails = [email for email in X_train[y_train==1]
                   if get_email_structure(email) == 'text/html']

sample_html_spam = html_spam_emails[5]
print(sample_html_spam.get_content().strip()[:1000], "...")

<HR>
<html>
<div bgcolor="#FFFFCC">

  <p align="center"><a
href="http://www.webbasedmailing.com"><img border="0"
src="http://www.webbasedmailing.com/Toners2goLogo.jpg"
width="349" height="96"></a></p>
<p align="center"><font size="6" face="Arial MT
Black"><i>Tremendous Savings</i>
on Toners,&nbsp;</font></p>
<p align="center"><font size="6" face="Arial MT
Black">
Inkjets, FAX, and Thermal Replenishables!!</font></p>
<p><a href="http://www.webbasedmailing.com">Toners 2 Go
</a>is your secret
weapon to lowering your cost for <a
href="http://www.webbasedmailing.com">High Quality,
Low-Cost</a> printer
supplies!&nbsp; We have been in the printer
replenishables business since 1992,
and pride ourselves on rapid response and outstanding
customer service.&nbsp;
What we sell are 100% compatible replacements for
Epson, Canon, Hewlett Packard,
Xerox, Okidata, Brother, and Lexmark; products that
meet and often exceed
original manufacturer's specifications.</p>
<p><i><font size="4">Check out these
p

In [25]:
print(html_to_plain_text(sample_html_spam.get_content())[:1000], "...")


   HYPERLINK
Tremendous Savings
on Toners, 
Inkjets, FAX, and Thermal Replenishables!!
 HYPERLINK Toners 2 Go
is your secret
weapon to lowering your cost for  HYPERLINK High Quality,
Low-Cost printer
supplies!  We have been in the printer
replenishables business since 1992,
and pride ourselves on rapid response and outstanding
customer service. 
What we sell are 100% compatible replacements for
Epson, Canon, Hewlett Packard,
Xerox, Okidata, Brother, and Lexmark; products that
meet and often exceed
original manufacturer's specifications.
Check out these
prices!
        Epson Stylus
Color inkjet cartridge
(SO20108):     Epson's Price:
$27.99    
Toners2Go price: $9.95!
         HP
LaserJet 4 Toner Cartridge
(92298A):           
HP's
Price:
$88.99           
Toners2Go
  price: $41.75!
 
Come visit us on the web to check out our hundreds
of similar bargains at  HYPERLINK Toners
2 Go!
  request to be excluded by visiting  HYPERLINK HERE
beverley
 ...


Let's now write a function that takes an email as input and returns its content as plain text, whatever its format is:

In [26]:
def email_to_text(email):
    html = None
    for part in email.walk():
        ctype = part.get_content_type()
        if not ctype in ('text/plain', 'text/html'):
            continue
        try:
            content = part.get_content()
        except: # in case of encoding issues
            content = str(part.get_payload())
        if ctype == 'text/plain':
            return content
        else:
            html = content
    if html:
        return html_to_plain_text(html)

In [27]:
print(email_to_text(sample_html_spam)[:100], "...")


   HYPERLINK
Tremendous Savings
on Toners, 
Inkjets, FAX, and Thermal Replenishables!!
 HYPERLINK T ...


nltk (Natural Language Toolkit):

In [32]:
try:
    import nltk

    stemmer = nltk.PorterStemmer()
    for word in ("Computations", "Computation", "Computing", "Computed", "Compute", "Compulsive"):
        print(word, "=>", stemmer.stem(word))
except ImportError:
    print("Error: stemming requires the NLTK module.")
    stemmer = None

Computations => comput
Computation => comput
Computing => comput
Computed => comput
Compute => comput
Compulsive => compuls


We also need a way to replace URLs with the word "URL". 

In [33]:
pip install urlextract

Collecting urlextract
  Downloading https://files.pythonhosted.org/packages/06/db/23b47f32d990dea1d9852ace16d551a0003bdfc8be33094cfd208757466e/urlextract-0.14.0-py3-none-any.whl
Collecting appdirs (from urlextract)
  Downloading https://files.pythonhosted.org/packages/56/eb/810e700ed1349edde4cbdc1b2a21e28cdf115f9faf263f6bbf8447c1abf3/appdirs-1.4.3-py2.py3-none-any.whl
Collecting uritools (from urlextract)
  Downloading https://files.pythonhosted.org/packages/eb/1a/5995c0a000ef116111b9af9303349ba97ec2446d2c9a79d2df028a3e3b19/uritools-3.0.0-py3-none-any.whl
Installing collected packages: appdirs, uritools, urlextract
Successfully installed appdirs-1.4.3 uritools-3.0.0 urlextract-0.14.0
Note: you may need to restart the kernel to use updated packages.
