In [369]:
import os
import urllib.request
import tarfile
import re
import email

import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter

In [5]:
DOWNLOAD_ROOT = 'https://spamassassin.apache.org/old/publiccorpus/'
SPAM_PATHS = [
    '20021010_easy_ham.tar.bz2',
    '20021010_hard_ham.tar.bz2',
    '20021010_spam.tar.bz2',
    '20030228_easy_ham.tar.bz2',
    '20030228_easy_ham_2.tar.bz2',
    '20030228_hard_ham.tar.bz2',
    '20030228_spam.tar.bz2',
    '20030228_spam_2.tar.bz2',
    '20050311_spam_2.tar.bz2',
]

### Concatenating all datasets might not work!! Too much data

In [587]:
def fetch_spam_data(download_root=DOWNLOAD_ROOT, spam_paths=SPAM_PATHS):
    if not os.path.isdir('data'):
        os.makedirs('data')
    for path in SPAM_PATHS:
        url = DOWNLOAD_ROOT + path
        tgz_path = os.path.join('data', path)
        urllib.request.urlretrieve(url, tgz_path)
        spam_tgz = tarfile.open(tgz_path)
        spam_tgz.extractall(path='data')
    spam_tgz.close()

In [588]:
fetch_spam_data()

### One spam example + one ham example should do it

In [6]:
def load_data():
    ham_path = os.path.join(os.getcwd(), 'data', 'easy_ham_2')
    spam_path = os.path.join(os.getcwd(), 'data', 'spam_2')
    
    x_spam = []
    x_ham = []
    
    for filename in os.listdir(ham_path):
        abs_path = os.path.join(ham_path, filename)
        with open(abs_path, 'r', encoding='utf8', errors='ignore') as f:
            x_ham.append(f.read())

    for filename in os.listdir(spam_path):
        abs_path = os.path.join(spam_path, filename)
        with open(abs_path, 'r', encoding='utf8', errors='ignore') as f:
            x_spam.append(f.read())
            
    return np.array(x_spam, dtype='object'), np.array(x_ham, dtype='object')

## Warning! Don't use the function below

In [238]:
def load_all_data():
    """
    If OpenAI ever wants to test their TPUs, this is a good place to start.
    Not feasible on a normal computer though :(
    """
    x_spam = []
    x_ham = []
    path = os.path.join(os.getcwd(), 'data')
    for dirname in os.listdir(path):
        if 'tar' not in dirname:
            if 'spam' in dirname:
                for filename in os.listdir(os.path.join(path, dirname)):
                    abs_path = os.path.join(path, dirname, filename)
                    with open(abs_path, 'rb') as f:
                        x_spam.append(f.read().split())
            else:
                for filename in os.listdir(os.path.join(path, dirname)):
                    abs_path = os.path.join(path, dirname, filename)
                    with open(abs_path, 'rb') as f:
                        x_ham.append(f.read().split())
    return np.array(x_spam), np.array(x_ham)

In [7]:
x_spam, x_ham = load_data()
y_spam = np.ones(x_spam.shape)
y_ham = np.zeros(x_ham.shape)

In [8]:
x = np.concatenate([x_spam, x_ham])
y = np.concatenate([y_spam, y_ham])

##### Define a custom preprocessing steps to convert to lowercase and remove: 
- URLs
- Numbers

In [392]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.base import BaseEstimator, TransformerMixin
from nltk.stem.snowball import SnowballStemmer
from sklearn.pipeline import make_pipeline

In [276]:
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

In [277]:
for train_index, test_index in sss.split(x, y):
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]

In [632]:
class EmailTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, strip_headers=False, convert_lower=True):
        self.strip_headers = strip_headers
        self.convert_lower = convert_lower
    
    def fit(self, x, y=None):
        return self
    
    def transform(self, x):
        x_out = np.empty((x.shape), dtype=x.dtype)
        for i, e in enumerate(x):
            msg = email.message_from_string(e)

            if self.strip_headers:
                for k in msg.keys():
                    del msg[k]

                    
            payload = msg.get_payload()
            
            if self.convert_lower:
                if isinstance(payload, list):
                    payload_str = ''.join(p.as_string().lower() for p in payload)
                    msg.set_payload(payload_str)
                else:
                    msg.set_payload(msg.get_payload().lower())
            
            x_out[i] = msg.as_string()

        return x_out

In [704]:
class EmailReplacer(BaseEstimator, TransformerMixin):
    def __init__(self, replace_url=False, replace_number=False):
        self.replace_url = replace_url
        self.replace_number = replace_number
    
    def fit(self, x, y=None):
        return self
    
    def transform(self, x):
        x_out = np.empty((x.shape), dtype=x.dtype)
        
        for i, e in enumerate(x):
            if isinstance(e, bytes):
                e = e.decode('ISO-8859-1')
                
            if self.replace_url:
                e = re.sub(r'http\S+|www\S+', 'URL', e)
            
            if self.replace_number:
                e = re.sub(r'\d+', 'NUMBER', e)
                
            e = e.lower()
            
            x_out[i] = e
        
        return x_out

In [705]:
class EmailStemmer(BaseEstimator, TransformerMixin):
    def __init__(self, use_porter=False):
        self.use_porter = use_porter
        
    def fit(self, x, y=None):
        return self
    
    def transform(self, x):
        x_out = np.empty((x.shape), dtype=x.dtype)

        for i, e in enumerate(x):
            if self.use_porter:
                x_out[i] = SnowballStemmer('porter').stem(e)
            else:
                x_out[i] = SnowballStemmer('english').stem(e)
                
        return x_out

##### Don't use this 🔫

In [713]:
# This is useless since we have CountVectorizer
class TransformBOW(BaseEstimator, TransformerMixin):
    def fit(self, x, y=None):
        return self

    def transform(self, x):
        vectorizer = CountVectorizer(encoding='ISO-8859-1')
        x_bow_sparse = vectorizer.fit_transform(x)
        x_bow = x_bow_sparse.toarray()
        return x_bow

In [716]:
pipeline = make_pipeline(EmailTransformer(), EmailReplacer(replace_number=True, replace_url=True), EmailStemmer(), verbose=3)

In [717]:
x_tr = pipeline.fit_transform(x)

[Pipeline] .. (step 1 of 3) Processing emailtransformer, total=   4.4s
[Pipeline] ..... (step 2 of 3) Processing emailreplacer, total=   0.4s
[Pipeline] ...... (step 3 of 3) Processing emailstemmer, total=   1.4s


In [718]:
vectorizer = CountVectorizer(encoding='ISO-8859-1')
vectorizer.fit(x_tr)

In [719]:
x_train_tr = vectorizer.transform(x_train)
x_test_tr = vectorizer.transform(x_test)

In [709]:
x_train_tr = pipeline.transform(x_train)
x_test_tr = pipeline.transform(x_test)

In [737]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [742]:
gb = GradientBoostingClassifier()
gb.fit(x_train_tr, y_train)

In [743]:
y_pred = gb.predict(x_test_tr)
score = accuracy_score(y_test, y_pred)

In [745]:
score

0.9839285714285714

In [753]:
y_test[4]

1.0

In [750]:
x_test[4]

'From wholesaleman6002@eudoramail.com  Tue Aug  6 10:58:55 2002\nReturn-Path: <wholesaleman6002@eudoramail.com>\nDelivered-To: yyyy@localhost.netnoteinc.com\nReceived: from localhost (localhost [127.0.0.1])\n\tby phobos.labs.netnoteinc.com (Postfix) with ESMTP id EEB224411C\n\tfor <jm@localhost>; Tue,  6 Aug 2002 05:53:54 -0400 (EDT)\nReceived: from mail.webnote.net [193.120.211.219]\n\tby localhost with POP3 (fetchmail-5.9.0)\n\tfor jm@localhost (single-drop); Tue, 06 Aug 2002 10:53:54 +0100 (IST)\nReceived: from relais.videotron.ca (relais.videotron.ca [24.201.245.36])\n\tby webnote.net (8.9.3/8.9.3) with ESMTP id KAA07661\n\tfor <jm@netnoteinc.com>; Sat, 3 Aug 2002 10:44:23 +0100\nFrom: wholesaleman6002@eudoramail.com\nReceived: from ntserver2.CIM.ORG ([207.253.176.67]) by\n          relais.videotron.ca (Videotron-Netscape Messaging Server v4.15\n          MTA-PRD2) with ESMTP id H09HPE01.MS6; Sat, 3 Aug 2002 05:44:02 -0400 \nReceived: from 221 (218.4.51.134 [218.4.51.134]) by ntser

In [746]:
y_pred

array([0., 0., 0., 0., 1., 1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0.,
       1., 0., 1., 0., 1., 1., 0., 0., 0., 1., 1., 0., 0., 0., 0., 1., 1.,
       0., 0., 1., 0., 1., 0., 1., 1., 0., 0., 0., 0., 1., 1., 1., 1., 1.,
       1., 0., 1., 0., 1., 1., 0., 1., 1., 0., 0., 1., 0., 1., 0., 1., 0.,
       0., 1., 0., 1., 1., 0., 1., 0., 1., 1., 0., 1., 0., 0., 0., 1., 0.,
       1., 1., 1., 1., 1., 1., 0., 0., 0., 1., 1., 0., 1., 0., 1., 1., 0.,
       1., 1., 0., 0., 1., 1., 1., 1., 0., 0., 1., 0., 0., 0., 0., 1., 1.,
       0., 1., 1., 0., 0., 1., 0., 0., 0., 0., 1., 0., 1., 1., 1., 0., 0.,
       0., 0., 1., 1., 0., 1., 1., 0., 1., 1., 1., 0., 0., 0., 1., 0., 0.,
       1., 1., 1., 0., 1., 0., 1., 1., 1., 0., 1., 1., 1., 1., 1., 0., 0.,
       1., 1., 0., 1., 0., 0., 1., 0., 0., 1., 0., 0., 1., 0., 0., 1., 0.,
       1., 0., 0., 0., 1., 0., 1., 0., 1., 0., 1., 1., 0., 1., 0., 0., 0.,
       1., 1., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1., 1., 0., 1.,
       0., 1., 0., 0., 1.