SPAM classifier

In [176]:
# Load the dataset
import tarfile
from pathlib import Path
import urllib.request
import urllib.parse


def fetch_spam_data():
    spam_root = "http://spamassassin.apache.org/old/publiccorpus/"
    ham_url = spam_root + "20030228_easy_ham.tar.bz2"
    spam_url = spam_root + "20030228_spam.tar.bz2"

    spam_path = Path() / "datasets" / "spam"
    spam_path.mkdir(parents=True, exist_ok=True)
    for dir_name, tar_name, url in (("easy_ham", "ham", ham_url),
                                    ("spam", "spam", spam_url)):
        if not (spam_path / dir_name).is_dir():
            path = (spam_path / tar_name).with_suffix(".tar.bz2")
            print("Downloading", path)
            urllib.request.urlretrieve(url, path)
            tar_bz2_file = tarfile.open(path)
            tar_bz2_file.extractall(path=spam_path)
            tar_bz2_file.close()
    return [spam_path / dir_name for dir_name in ("easy_ham", "spam")]

In [177]:
ham_dir, spam_dir = fetch_spam_data()

In [178]:
# Load the emails
ham_filenames = [f for f in sorted(ham_dir.iterdir()) if len(f.name) > 20]
spam_filenames = [f for f in sorted(spam_dir.iterdir()) if len(f.name) > 20]

In [179]:
# Parse emails
import email
import email.policy

def load_email(filepath):
    with open(filepath, "rb") as f:
        return email.parser.BytesParser(policy=email.policy.default).parse(f)

In [180]:
ham_emails = [load_email(filepath) for filepath in ham_filenames]
spam_emails = [load_email(filepath) for filepath in spam_filenames]

In [181]:
# Visualise the parsed content
# print(ham_emails[1].get_content().strip())
print(spam_emails[1].get_content().strip())

1) Fight The Risk of Cancer!
http://www.adclick.ws/p.cfm?o=315&s=pk007

2) Slim Down - Guaranteed to lose 10-12 lbs in 30 days
http://www.adclick.ws/p.cfm?o=249&s=pk007

3) Get the Child Support You Deserve - Free Legal Advice
http://www.adclick.ws/p.cfm?o=245&s=pk002

4) Join the Web's Fastest Growing Singles Community
http://www.adclick.ws/p.cfm?o=259&s=pk007

5) Start Your Private Photo Album Online!
http://www.adclick.ws/p.cfm?o=283&s=pk007

Have a Wonderful Day,
Offer Manager
PrizeMama













If you wish to leave this list please use the link below.
http://www.qves.com/trim/?ilug@linux.ie%7C17%7C114258


-- 
Irish Linux Users' Group: ilug@linux.ie
http://www.linux.ie/mailman/listinfo/ilug for (un)subscription information.
List maintainer: listmaster@linux.ie


In [182]:
# Analyse the structure of the email
def get_email_structure(email):
    if isinstance(email, str):
        return email
    payload = email.get_payload()
    if isinstance(payload, list):
        multipart = ", ".join([get_email_structure(sub_email)
                               for sub_email in payload])
        return f"multipart({multipart})"
    else:
        return email.get_content_type()

In [183]:
from collections import Counter

def structures_counter(emails):
    structures = Counter()
    for email in emails:
        structure = get_email_structure(email)
        structures[structure] += 1
    return structures

In [184]:
# Ham email structure
structures_counter(ham_emails).most_common()

[('text/plain', 2408),
 ('multipart(text/plain, application/pgp-signature)', 66),
 ('multipart(text/plain, text/html)', 8),
 ('multipart(text/plain, text/plain)', 4),
 ('multipart(text/plain)', 3),
 ('multipart(text/plain, application/octet-stream)', 2),
 ('multipart(text/plain, text/enriched)', 1),
 ('multipart(text/plain, application/ms-tnef, text/plain)', 1),
 ('multipart(multipart(text/plain, text/plain, text/plain), application/pgp-signature)',
  1),
 ('multipart(text/plain, video/mng)', 1),
 ('multipart(text/plain, multipart(text/plain))', 1),
 ('multipart(text/plain, application/x-pkcs7-signature)', 1),
 ('multipart(text/plain, multipart(text/plain, text/plain), text/rfc822-headers)',
  1),
 ('multipart(text/plain, multipart(text/plain, text/plain), multipart(multipart(text/plain, application/x-pkcs7-signature)))',
  1),
 ('multipart(text/plain, application/x-java-applet)', 1)]

In [185]:
# Spam email structure
structures_counter(spam_emails).most_common()

[('text/plain', 218),
 ('text/html', 183),
 ('multipart(text/plain, text/html)', 45),
 ('multipart(text/html)', 20),
 ('multipart(text/plain)', 19),
 ('multipart(multipart(text/html))', 5),
 ('multipart(text/plain, image/jpeg)', 3),
 ('multipart(text/html, application/octet-stream)', 2),
 ('multipart(text/plain, application/octet-stream)', 1),
 ('multipart(text/html, text/plain)', 1),
 ('multipart(multipart(text/html), application/octet-stream, image/jpeg)', 1),
 ('multipart(multipart(text/plain, text/html), image/gif)', 1),
 ('multipart/alternative', 1)]

In [186]:
# Header data: Spam
for header, value in spam_emails[0].items():
    print(header, ":", value)

Return-Path : <12a1mailbot1@web.de>
Delivered-To : zzzz@localhost.spamassassin.taint.org
Received : from localhost (localhost [127.0.0.1])	by phobos.labs.spamassassin.taint.org (Postfix) with ESMTP id 136B943C32	for <zzzz@localhost>; Thu, 22 Aug 2002 08:17:21 -0400 (EDT)
Received : from mail.webnote.net [193.120.211.219]	by localhost with POP3 (fetchmail-5.9.0)	for zzzz@localhost (single-drop); Thu, 22 Aug 2002 13:17:21 +0100 (IST)
Received : from dd_it7 ([210.97.77.167])	by webnote.net (8.9.3/8.9.3) with ESMTP id NAA04623	for <zzzz@spamassassin.taint.org>; Thu, 22 Aug 2002 13:09:41 +0100
From : 12a1mailbot1@web.de
Received : from r-smtp.korea.com - 203.122.2.197 by dd_it7  with Microsoft SMTPSVC(5.5.1775.675.6);	 Sat, 24 Aug 2002 09:42:10 +0900
To : dcek1a1@netsgo.com
Subject : Life Insurance - Why Pay More?
Date : Wed, 21 Aug 2002 20:31:57 -1600
MIME-Version : 1.0
Message-ID : <0103c1042001882DD_IT7@dd_it7>
Content-Type : text/html; charset="iso-8859-1"
Content-Transfer-Encoding : qu

In [187]:
# Header data: Ham
for header, value in ham_emails[0].items():
    print(header, ":", value)

Return-Path : <exmh-workers-admin@spamassassin.taint.org>
Delivered-To : zzzz@localhost.netnoteinc.com
Received : from localhost (localhost [127.0.0.1])	by phobos.labs.netnoteinc.com (Postfix) with ESMTP id D03E543C36	for <zzzz@localhost>; Thu, 22 Aug 2002 07:36:16 -0400 (EDT)
Received : from phobos [127.0.0.1]	by localhost with IMAP (fetchmail-5.9.0)	for zzzz@localhost (single-drop); Thu, 22 Aug 2002 12:36:16 +0100 (IST)
Received : from listman.spamassassin.taint.org (listman.spamassassin.taint.org [66.187.233.211]) by    dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g7MBYrZ04811 for    <zzzz-exmh@spamassassin.taint.org>; Thu, 22 Aug 2002 12:34:53 +0100
Received : from listman.spamassassin.taint.org (localhost.localdomain [127.0.0.1]) by    listman.redhat.com (Postfix) with ESMTP id 8386540858; Thu, 22 Aug 2002    07:35:02 -0400 (EDT)
Delivered-To : exmh-workers@listman.spamassassin.taint.org
Received : from int-mx1.corp.spamassassin.taint.org (int-mx1.corp.spamassassin.taint.org 

In [188]:
# Splitting the data
import numpy as np
from sklearn.model_selection import train_test_split

X = np.array(ham_emails + spam_emails, dtype=object)
y = np.array([0] * len(ham_emails) + [1] * len(spam_emails))
print(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)

[0 0 0 ... 1 1 1]


In [189]:
print(X_train.shape)
print(X_train[0])

(2400,)
Return-Path: <fork-admin@xent.com>
Delivered-To: yyyy@localhost.spamassassin.taint.org
Received: from localhost (jalapeno [127.0.0.1])
	by jmason.org (Postfix) with ESMTP id DEA8516F03
	for <jm@localhost>; Thu, 19 Sep 2002 13:26:35 +0100 (IST)
Received: from jalapeno [127.0.0.1]
	by localhost with IMAP (fetchmail-5.9.0)
	for jm@localhost (single-drop); Thu, 19 Sep 2002 13:26:35 +0100 (IST)
Received: from xent.com ([64.161.22.236]) by dogma.slashnull.org
    (8.11.6/8.11.6) with ESMTP id g8JCFfC18989 for <jm@jmason.org>;
    Thu, 19 Sep 2002 13:15:42 +0100
Received: from lair.xent.com (localhost [127.0.0.1]) by xent.com (Postfix)
    with ESMTP id B101E294108; Thu, 19 Sep 2002 05:12:05 -0700 (PDT)
Delivered-To: fork@spamassassin.taint.org
Received: from argote.ch (argote.ch [80.65.224.17]) by xent.com (Postfix)
    with ESMTP id 19FF329409E for <fork@xent.com>; Thu, 19 Sep 2002 05:11:32
    -0700 (PDT)
Received: by argote.ch (Postfix, from userid 500) id CA1F9C44D;
    Thu, 19 S

In [190]:
# Convert the html email to plain text
import re
from html import unescape

def html_to_plain_text(html):
    text = re.sub('<head.*?>.*?</head>', '', html, flags=re.M | re.S | re.I)
    text = re.sub('<a\s.*?>', ' HYPERLINK ', text, flags=re.M | re.S | re.I)
    text = re.sub('<.*?>', '', text, flags=re.M | re.S)
    text = re.sub(r'(\s*\n)+', '\n', text, flags=re.M | re.S)
    return unescape(text)

def email_to_text(email):
    html = None
    for part in email.walk():
        ctype = part.get_content_type()
        if not ctype in ("text/plain", "text/html"):
            continue
        try:
            content = part.get_content()
        except: # in case of encoding issues
            content = str(part.get_payload())
        if ctype == "text/plain":
            return content
        else:
            html = content
    if html:
        return html_to_plain_text(html)

In [191]:
# url extractor

%pip install -q -U urlextract
import urlextract

url_extractor = urlextract.URLExtract()

In [192]:
# stemming
import nltk

stemmer = nltk.PorterStemmer()
for word in ("Computations", "Computation", "Computing", "Computed", "Compute",
             "Compulsive"):
    print(word, "=>", stemmer.stem(word))

Computations => comput
Computation => comput
Computing => comput
Computed => comput
Compute => comput
Compulsive => compuls


In [193]:
# Creating a custom transformer

# split the email into words
# replace the url content with the word URL
# remove noise (ex: grammar)
# convert to lower_case

from sklearn.base import BaseEstimator, TransformerMixin

class EmailToWordCounterTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, strip_headers=True, lower_case=True,
                 remove_punctuation=True, replace_urls=True,
                 replace_numbers=True, stemming=True):
        self.strip_headers = strip_headers
        self.lower_case = lower_case
        self.remove_punctuation = remove_punctuation
        self.replace_urls = replace_urls
        self.replace_numbers = replace_numbers
        self.stemming = stemming
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X_transformed = []
        for email in X:
            text = email_to_text(email) or ""
            if self.lower_case:
                text = text.lower()
            if self.replace_urls and url_extractor is not None:
                urls = list(set(url_extractor.find_urls(text)))
                urls.sort(key=lambda url: len(url), reverse=True)
                for url in urls:
                    text = text.replace(url, " URL ")
            if self.replace_numbers:
                text = re.sub(r'\d+(?:\.\d*)?(?:[eE][+-]?\d+)?', 'NUMBER', text)
            if self.remove_punctuation:
                text = re.sub(r'\W+', ' ', text, flags=re.M)
            word_counts = Counter(text.split())
            if self.stemming and stemmer is not None:
                stemmed_word_counts = Counter()
                for word, count in word_counts.items():
                    stemmed_word = stemmer.stem(word)
                    stemmed_word_counts[stemmed_word] += count
                word_counts = stemmed_word_counts
            X_transformed.append(word_counts)
        return np.array(X_transformed)

In [194]:
# # output of the transformer (vocabulary)
# email_to_word_transformer = EmailToWordCounterTransformer()
# X_train_email_to_words = email_to_word_transformer.fit_transform(X_train[:5])

In [195]:
# Creating a vector from the transformed data
# idea is to create a vector for each word
# create a row for each email word counts
# for example ([Counter({'chuck': 1, 'murcko': 1, 'wrote': 1, 'stuff': 1, 'yawn': 1, 'r': 1}))
# Counter({'the': 11, 'murcko': 9, 'and': 8, 'all': 3, 'christian': 3, 'to': 3, 'by': 3,
# the best i can imagine is each email will be a row and the word will be the columns and the value(saclar) will the occurance
# [
#   [1,], [1,], [1,], [1,], [1,], [1,],
#   [0,], [9,], [0,], [0,], [0,], [0,],
#   # ...
# ]
# from sklearn.pipeline import Pipeline

# vocabulary = output[0].keys()
# count_vectorise = TfidfVectorizer(vocabulary=vocabulary)
# email_data = X_train[:1]


# vectorise_pipeline = Pipeline(
#     [
#         ('count_vectorise', count_vectorise)
#     ]
# )

# email_contents = [email.get_content() for email in email_data]

# output_vectorised = vectorise_pipeline.fit_transform(email_contents)

# # output_vectorised = vectorise.fit_transform(output)
# print(output)
# print(output_vectorised)


Now we have the word counts, and we need to convert them to vectors. For this, we will build another transformer whose fit() method will build the vocabulary (an ordered list of the most common words) and whose transform() method will use the vocabulary to convert word counts to vectors. The output is a sparse matrix.

In [196]:
# Create a custom transformer for vectorisation
from sklearn.base import BaseEstimator, TransformerMixin
from collections import Counter
from scipy.sparse import csr_matrix

class VectoriseEmailTransformer(BaseEstimator, TransformerMixin):
  def __init__(self, vocabulary_size=1000):
    self.vocabulary_size = vocabulary_size

  def fit(self, X, y=None):
    total_count = Counter()
    for word_count in X:
      for word, count in word_count.items():
        total_count[word] += min(count, 10) # limt the max count
    most_common = total_count.most_common()[:self.vocabulary_size]

    self.vocabulary_ = {word: index + 1
                            for index, (word, count) in enumerate(most_common)}

    return self

  def transform(self, X, y=None):
    rows = []
    columns = []
    data = []
    for row, word_count in enumerate(X):
      for word, count in word_count.items():
        rows.append(row)
        columns.append(self.vocabulary_.get(word, 0))
        data.append(count)
    return csr_matrix((data, (rows, columns)), shape=(len(X), self.vocabulary_size + 1))


In [197]:
# vectorise_email = VectoriseEmailTransformer(vocabulary_size=10)
# X_few_vectors = vectorise_email.fit_transform(X_train_email_to_words)
# print(X_few_vectors.toarray())
# print(X_train_email_to_words[0])
# print(X[0])

In [198]:
# Pipeline for email transformation

from sklearn.pipeline import Pipeline

steps = {
    ('email_to_words', EmailToWordCounterTransformer()),
    ('vectorised_email', VectoriseEmailTransformer())
}

pipeline = Pipeline(steps)

email_transformed = pipeline.fit_transform(X_train)

In [203]:
from sklearn.metrics import precision_score, recall_score
from sklearn.linear_model import LogisticRegression

X_test_transformed = pipeline.transform(X_test)
X_train_transformed = pipeline.transform(X_train)
log_clf = LogisticRegression(max_iter=1000, random_state=42)
log_clf.fit(X_train_transformed, y_train)

Precision: 96.88%
Recall: 97.89%


In [204]:
# Scoring
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_validate

scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, average='weighted'),
    'recall': make_scorer(recall_score, average='weighted'),
    'f1': make_scorer(f1_score, average='weighted')
}
results = cross_validate(log_clf, X_test_transformed, y_test, cv=5, scoring=scoring, return_train_score=True)

accuracy_scores = results['test_accuracy']
precision_scores = results['test_precision']
recall_scores = results['test_recall']
f1_scores = results['test_f1']


print(f'Accuracy scores: {accuracy_scores}')
print(f'Precision scores: {precision_scores}')
print(f'Recall scores: {recall_scores}')
print(f'F1 scores: {f1_scores}')

Accuracy scores: [1.         0.975      0.95       0.95833333 0.95833333]
Precision scores: [1.         0.97572115 0.9528379  0.95773237 0.95941667]
Recall scores: [1.         0.975      0.95       0.95833333 0.95833333]
F1 scores: [1.         0.9741115  0.951      0.9568525  0.95876387]
