# Chapter 3: Exercise 4
## Build an Email Spam Classifier

This is based on the solution [here](https://github.com/ageron/handson-ml/blob/master/03_classification.ipynb), which I reimplemented as an educational exercise.

### Downloading the Data

In [0]:
# Defining a function to get the data, taken from solution.

import os
import tarfile
from six.moves import urllib

DOWNLOAD_URL = 'https://spamassassin.apache.org/old/publiccorpus/'
HAM_URL = DOWNLOAD_URL + '20021010_easy_ham.tar.bz2'
SPAM_URL = DOWNLOAD_URL + '20021010_spam.tar.bz2'
SPAM_PATH = os.path.join('datasets', 'spam')

def fetch_data():
  if not os.path.isdir(SPAM_PATH):
    os.makedirs(SPAM_PATH)
  for fname, url in [('ham.tar.bz2', HAM_URL), ('spam.tar.bz2', SPAM_URL)]:
    path = os.path.join(SPAM_PATH, fname)
    if not os.path.isfile(path):
      urllib.request.urlretrieve(url, path)
    tar_bz2_file = tarfile.open(path)
    tar_bz2_file.extractall(path=SPAM_PATH)
    tar_bz2_file.close()

In [0]:
fetch_data()

In [0]:
# Getting a list of the filenames

HAM_DIR = os.path.join(SPAM_PATH, 'easy_ham')
SPAM_DIR = os.path.join(SPAM_PATH, 'spam')

ham_filenames = [name for name in sorted(os.listdir(HAM_DIR))
                 if len(name) > 20]
spam_filenames = [name for name in sorted(os.listdir(SPAM_DIR))
                  if len(name) > 20]

In [5]:
print('Number of ham:', len(ham_filenames))
print('Number of spam:', len(spam_filenames))

Number of ham: 2551
Number of spam: 501


### Parsing the Emails in the Dataset

In [0]:
# Define an email parsing function, taken from solution.

import email
import email.policy

def load_email(is_spam, filename, spam_path=SPAM_PATH):
  directory = 'spam' if is_spam else 'easy_ham'
  with open(os.path.join(spam_path, directory, filename), 'rb') as f:
    return email.parser.BytesParser(policy=email.policy.default).parse(f)

In [0]:
# Opening and parsing the email files.

ham_emails = [load_email(is_spam=False, filename=fname)
              for fname in ham_filenames]
spam_emails = [load_email(is_spam=True, filename=fname)
               for fname in spam_filenames][1:]

In [8]:
# Example of ham.

print(ham_emails[1].get_content().strip())

Martin A posted:
Tassos Papadopoulos, the Greek sculptor behind the plan, judged that the
 limestone of Mount Kerdylio, 70 miles east of Salonika and not far from the
 Mount Athos monastic community, was ideal for the patriotic sculpture. 
 
 As well as Alexander's granite features, 240 ft high and 170 ft wide, a
 museum, a restored amphitheatre and car park for admiring crowds are
planned
---------------------
So is this mountain limestone or granite?
If it's limestone, it'll weather pretty fast.

------------------------ Yahoo! Groups Sponsor ---------------------~-->
4 DVDs Free +s&p Join Now
http://us.click.yahoo.com/pt6YBB/NXiEAA/mG3HAA/7gSolB/TM
---------------------------------------------------------------------~->

To unsubscribe from this group, send an email to:
forteana-unsubscribe@egroups.com

 

Your use of Yahoo! Groups is subject to http://docs.yahoo.com/info/terms/


In [9]:
# Example of spam.

print(spam_emails[6].get_content().strip())

Help wanted.  We are a 14 year old fortune 500 company, that is
growing at a tremendous rate.  We are looking for individuals who
want to work from home.

This is an opportunity to make an excellent income.  No experience
is required.  We will train you.

So if you are looking to be employed from home with a career that has
vast opportunities, then go:

http://www.basetel.com/wealthnow

We are looking for energetic and self motivated people.  If that is you
than click on the link and fill out the form, and one of our
employement specialist will contact you.

To be removed from our link simple go to:

http://www.basetel.com/remove.html


4139vOLW7-758DoDY1425FRhM1-764SMFc8513fCsLl40


In [0]:
# Function for breaking down the email structure.

def get_email_structure(email):
  if isinstance(email, str):
    return email
  payload = email.get_payload()
  if isinstance(payload, list):
    return 'multipart({})'.format(', '.join(
        get_email_structure(sub_email)
        for sub_email in payload
    ))
  return email.get_content_type()

In [0]:
# Use the Counter class to keep track of how many emails
# have a given structure.

from collections import Counter

def count_structures(emails):
  structures = Counter()
  for email in emails:
    struct = get_email_structure(email)
    structures[struct] += 1
  return structures

In [12]:
count_structures(ham_emails).most_common()

[('text/plain', 2453),
 ('multipart(text/plain, application/pgp-signature)', 72),
 ('multipart(text/plain, text/html)', 8),
 ('multipart(text/plain, text/plain)', 4),
 ('multipart(text/plain)', 3),
 ('multipart(text/plain, application/octet-stream)', 2),
 ('multipart(text/plain, text/enriched)', 1),
 ('multipart(text/plain, application/ms-tnef, text/plain)', 1),
 ('multipart(multipart(text/plain, text/plain, text/plain), application/pgp-signature)',
  1),
 ('multipart(text/plain, video/mng)', 1),
 ('multipart(text/plain, multipart(text/plain))', 1),
 ('multipart(text/plain, application/x-pkcs7-signature)', 1),
 ('multipart(text/plain, multipart(text/plain, text/plain), text/rfc822-headers)',
  1),
 ('multipart(text/plain, multipart(text/plain, text/plain), multipart(multipart(text/plain, application/x-pkcs7-signature)))',
  1),
 ('multipart(text/plain, application/x-java-applet)', 1)]

In [13]:
count_structures(spam_emails).most_common()

[('text/plain', 221),
 ('text/html', 181),
 ('multipart(text/plain, text/html)', 45),
 ('multipart(text/html)', 19),
 ('multipart(text/plain)', 19),
 ('multipart(multipart(text/html))', 5),
 ('multipart(text/plain, image/jpeg)', 3),
 ('multipart(text/html, application/octet-stream)', 2),
 ('multipart(text/plain, application/octet-stream)', 1),
 ('multipart(text/html, text/plain)', 1),
 ('multipart(multipart(text/html), application/octet-stream, image/jpeg)', 1),
 ('multipart(multipart(text/plain, text/html), image/gif)', 1),
 ('multipart/alternative', 1)]

In [14]:
# Taking the look at the headers in the spam emails.

for header, value in spam_emails[0].items():
  print(header + ':', value)

Return-Path: <12a1mailbot1@web.de>
Delivered-To: zzzz@localhost.example.com
Received: from localhost (localhost [127.0.0.1])	by phobos.labs.example.com (Postfix) with ESMTP id 136B943C32	for <zzzz@localhost>; Thu, 22 Aug 2002 08:17:21 -0400 (EDT)
Received: from mail.webnote.net [193.120.211.219]	by localhost with POP3 (fetchmail-5.9.0)	for zzzz@localhost (single-drop); Thu, 22 Aug 2002 13:17:21 +0100 (IST)
Received: from dd_it7 ([210.97.77.167])	by webnote.net (8.9.3/8.9.3) with ESMTP id NAA04623	for <zzzz@example.com>; Thu, 22 Aug 2002 13:09:41 +0100
From: 12a1mailbot1@web.de
Received: from r-smtp.korea.com - 203.122.2.197 by dd_it7  with Microsoft SMTPSVC(5.5.1775.675.6);	 Sat, 24 Aug 2002 09:42:10 +0900
To: dcek1a1@netsgo.com
Subject: Life Insurance - Why Pay More?
Date: Wed, 21 Aug 2002 20:31:57 -1600
MIME-Version: 1.0
Message-ID: <0103c1042001882DD_IT7@dd_it7>
Content-Type: text/html; charset="iso-8859-1"
Content-Transfer-Encoding: quoted-printable


In [0]:
# Splitting the data into a train and test set.

import numpy as np
from sklearn.model_selection import train_test_split

X = np.array(ham_emails + spam_emails)
y = np.array([0] * len(ham_emails) + [1] * len(spam_emails))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                   random_state=42)

In [0]:
# Parsing the HTML in the emails into plaintext.

import re
from html import unescape

def html_to_plaintext(html):
  text = re.sub('<head.*?>.*?</head>', '', html, flags=re.M | re.S | re.I)
  text = re.sub('<a\s.*?>', ' HYPERLINK ', text, flags=re.M | re.S | re.I)
  text = re.sub('<.*?>', '', text, flags=re.M | re.S)
  text = re.sub(r'(\s*\n)+', '\n', text, flags=re.M | re.S)
  return unescape(text)

In [17]:
# An example of HTML spam.

html_spam_emails = [email for email in X_train[y_train == 1]
                    if get_email_structure(email) == 'text/html']
sample_html_spam = html_spam_emails[8]
print(sample_html_spam.get_content().strip(), '...')

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2//EN">
<HTML>

<HEAD>
	<META NAME="GENERATOR" Content="Visual Page 1.0 for Windows">
	<META HTTP-EQUIV="Content-Type" CONTENT="text/html;CHARSET=iso-8859-1">
	<TITLE>untitled</TITLE>
</HEAD>

<BODY onLoad="(window.open('http://sam.hostcentrel.com/norton/'))">

<P ALIGN="CENTER"><FONT COLOR="#0000FF" face="Arial"><B>ATTENTION: This is a MUST for ALL Computer Users!!!<BR>
</B></FONT><FONT COLOR="#000000" face="Arial"><BR>
</FONT><FONT face="Arial"><B>*NEW-Special Package Deal!*<BR>
</B></FONT><FONT COLOR="#000000" face="Arial"><BR>
</FONT><FONT COLOR="#FF0000" face="Arial"><B>Norton SystemWorks 2002 Software Suite -Professional Edition-

Includes Six - Yes 6! - Feature-Packed Utilities
ALL For 1 Special LOW Price!6 Feature-Packed Utilities...1 Great Price!
A $300+ Combined Retail Value!

FREE Shipping!</B></FONT></P>

<P ALIGN="CENTER"><FONT COLOR="#FF0000" face="Arial"><B><BR>
</B></FONT><A HREF="http://sam.hostcentrel.com/norton/"><FONT face="

In [18]:
print(html_to_plaintext(sample_html_spam.get_content().strip()))


ATTENTION: This is a MUST for ALL Computer Users!!!
*NEW-Special Package Deal!*
Norton SystemWorks 2002 Software Suite -Professional Edition-
Includes Six - Yes 6! - Feature-Packed Utilities
ALL For 1 Special LOW Price!6 Feature-Packed Utilities...1 Great Price!
A $300+ Combined Retail Value!
FREE Shipping!
 HYPERLINK Click Here Now!



In [0]:
# Defining a function which returns the text content of the email.

def email_to_text(email):
  html = None
  for part in email.walk():
    ctype = part.get_content_type()
    if not ctype in ('text/plain', 'text/html'):
      continue
    try:
      content = part.get_content()
    except:
      content = str(part.get_payload())
    if ctype == 'text/plain':
      return content
    html = content
  if html:
    return html_to_plaintext(html)

In [23]:
print(email_to_text(sample_html_spam))


ATTENTION: This is a MUST for ALL Computer Users!!!
*NEW-Special Package Deal!*
Norton SystemWorks 2002 Software Suite -Professional Edition-
Includes Six - Yes 6! - Feature-Packed Utilities
ALL For 1 Special LOW Price!6 Feature-Packed Utilities...1 Great Price!
A $300+ Combined Retail Value!
FREE Shipping!
 HYPERLINK Click Here Now!



In [24]:
# Create a PorterStemmer to transform words into their stems.

import nltk

stemmer = nltk.PorterStemmer()
for word in ("Computations", "Computation", "Computing", "Computed", "Compute",
             "Compulsive"):
  print(word, '=>', stemmer.stem(word))

Computations => comput
Computation => comput
Computing => comput
Computed => comput
Compute => comput
Compulsive => compuls


In [25]:
!pip3 install urlextract



In [26]:
# Instantiating the URLExtract class from the urlextract module.

import urlextract

url_extractor = urlextract.URLExtract()
print(url_extractor.find_urls(
    "Will it detect github.com and https://youtu.be/7Pq-S557XQU?t=3m32s"))

['github.com', 'https://youtu.be/7Pq-S557XQU?t=3m32s']


In [0]:
# Putting it together into a Scikit-Learn transformer, taken from solution.

from sklearn.base import BaseEstimator, TransformerMixin

class EmailToWordCountTransformer(BaseEstimator, TransformerMixin):
  def __init__(self, lower_case=True, replace_urls=True, replace_numbers=True,
               replace_punctuation=True, stemming=True):
    self.lower_case = lower_case
    self.replace_urls = replace_urls
    self.replace_numbers = replace_numbers
    self.replace_punctuation = replace_punctuation
    self.stemming = stemming
  def fit(self, X, y=None):
    return self
  def transform(self, X, y=None):
    X_transformed = []
    for email in X:
      text = email_to_text(email) or ''
      if self.lower_case:
        text = text.lower()
      if self.replace_urls:
        urls = list(set(url_extractor.find_urls(text)))
        urls.sort(key=lambda url: len(url), reverse=True)
        for url in urls:
          text = text.replace(url, ' URL ')
      if self.replace_numbers:
        text = re.sub(r'\d+(?:\.\d*(?:[eE]\d+))?', 'NUMBER', text)
      if self.replace_punctuation:
        text = re.sub(r'\W+', ' ', text, flags=re.M)
      word_counts = Counter(text.split())
      if self.stemming:
        stemmed_word_counts = Counter()
        for word, count in word_counts.items():
          stem = stemmer.stem(word)
          stemmed_word_counts[stem] += count
        word_counts = stemmed_word_counts
      X_transformed.append(word_counts)
    return np.array(X_transformed)

In [32]:
# Testing out EmailToWordCountTransformer.

X_few = X_train[:3]
X_few_wordcounts = EmailToWordCountTransformer().fit_transform(X_few)
print(X_few_wordcounts)

[Counter({'it': 4, 'pay': 3, 't': 3, 'the': 2, 'you': 2, 'without': 2, 's': 2, 'i': 2, 'a': 2, 'can': 2, 'look': 2, 'at': 2, 'if': 1, 'creator': 1, 'didnt': 1, 'say': 1, 'could': 1, 'have': 1, 'theft': 1, 'so': 1, 'simpl': 1, 'hell': 1, 'that': 1, 'even': 1, 'in': 1, 'all': 1, 'major': 1, 'holi': 1, 'book': 1, 'wow': 1, 've': 1, 'got': 1, 'great': 1, 'idea': 1, 'll': 1, 'hire': 1, 'skywrit': 1, 'to': 1, 'write': 1, 'thi': 1, 'then': 1, 'lock': 1, 'up': 1, 'everybodi': 1, 'who': 1, 'and': 1, 'didn': 1, 'fail': 1, 'jesu': 1, 'is': 1, 'on': 1, 'my': 1, 'side': 1, 'url': 1})
 Counter({'number': 8, 'i': 8, 'to': 5, 'the': 5, 'url': 4, 'of': 4, 'we': 3, 'realli': 3, 'look': 3, 'it': 3, 'that': 3, 'with': 3, 'date': 2, 't': 2, 'but': 2, 'is': 2, 'what': 2, 'would': 2, 'time': 2, 'for': 2, 'a': 2, 'have': 2, 'm': 2, 'not': 2, 'will': 2, 'tue': 1, 'aug': 1, 'from': 1, 'brent': 1, 'welch': 1, 'messag': 1, 'id': 1, 'if': 1, 'are': 1, 'allow': 1, 'assum': 1, 'or': 1, 'higher': 1, 'which': 1, 'can'

In [0]:
# Building a transformer to turn the word counts into a vector.

from scipy.sparse import csr_matrix

class WordCounterToVectorTransformer(BaseEstimator, TransformerMixin):
  def __init__(self, vocabulary_size=1000):
    self.vocabulary_size = vocabulary_size
  def fit(self, X, y=None):
    total_count = Counter()
    for word_count in X:
      for word, count in word_count.items():
        total_count[word] += min(count, 10)
    most_common = total_count.most_common()[:self.vocabulary_size]
    self.most_common_ = most_common
    self.vocabulary_ = {word: i + 1 for i, (word, _) in enumerate(most_common)}
    return self
  def transform(self, X, y=None):
    rows, cols, data = [], [], []
    for row, word_count in enumerate(X):
      for word, count in word_count.items():
        rows.append(row)
        cols.append(self.vocabulary_.get(word, 0))
        data.append(count)
    return csr_matrix((data, (rows, cols)),
                      shape=(len(X), self.vocabulary_size + 1))

In [35]:
# Testing out WordCounterToVectorTransformer.

vocab_transformer = WordCounterToVectorTransformer(vocabulary_size=10)
X_few_vectors = vocab_transformer.fit_transform(X_few_wordcounts)
print(X_few_vectors.toarray())

[[ 53   2   0   2   4   1   1   3   1   2   1]
 [109   8   8   5   3   0   5   2   1   3   4]
 [ 51   0   1   1   1   6   0   1   3   0   0]]


In [36]:
vocab_transformer.vocabulary_

{'all': 8,
 'i': 1,
 'in': 5,
 'it': 4,
 'look': 9,
 'number': 2,
 't': 7,
 'the': 3,
 'to': 6,
 'url': 10}

In [0]:
# Building a preprocessing Pipeline for the model.

from sklearn.pipeline import Pipeline

preprocess_pipeline = Pipeline([
    ('email_to_wordcount', EmailToWordCountTransformer()),
    ('word_count_to_vector', WordCounterToVectorTransformer()),
])

X_train_transformed = preprocess_pipeline.fit_transform(X_train)

In [40]:
# Testing out LogisticRegression on the data, getting over 98.6% accuracy.

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

log_clf = LogisticRegression(solver='liblinear', random_state=42)
score = cross_val_score(log_clf, X_train_transformed, y_train, cv=3, verbose=3)
score.mean()

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  ................................................................
[CV] ....................... , score=0.9889434889434889, total=   0.2s
[CV]  ................................................................
[CV] ........................ , score=0.990159901599016, total=   0.2s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.4s remaining:    0.0s


[CV] ........................ , score=0.981549815498155, total=   0.1s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.5s finished


0.9868844020135533

In [43]:
# Measuring precision and recall on the test set.

from sklearn.metrics import precision_score, recall_score

X_test_transformed = preprocess_pipeline.transform(X_test)

log_clf = LogisticRegression(solver='liblinear', random_state=42)
log_clf.fit(X_train_transformed, y_train)

y_pred = log_clf.predict(X_test_transformed)

print('Precision score: {:.2f}'.format(100 * precision_score(y_test, y_pred)))
print('Recall score: {:.2f}'.format(100 * recall_score(y_test, y_pred)))

Precision score: 93.81
Recall score: 95.79
