In [64]:
import os
import urllib.request
import tarfile
import re
import email

import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter

In [2]:
DOWNLOAD_ROOT = 'https://spamassassin.apache.org/old/publiccorpus/'
SPAM_PATHS = [
    '20021010_easy_ham.tar.bz2',
    '20021010_hard_ham.tar.bz2',
    '20021010_spam.tar.bz2',
    '20030228_easy_ham.tar.bz2',
    '20030228_easy_ham_2.tar.bz2',
    '20030228_hard_ham.tar.bz2',
    '20030228_spam.tar.bz2',
    '20030228_spam_2.tar.bz2',
    '20050311_spam_2.tar.bz2',
]

### Concatenating all datasets might not work!! Too much data

In [3]:
def fetch_spam_data(download_root=DOWNLOAD_ROOT, spam_paths=SPAM_PATHS):
    if not os.path.isdir('data'):
        os.makedirs('data')
    for path in SPAM_PATHS:
        url = DOWNLOAD_ROOT + path
        tgz_path = os.path.join('data', path)
        urllib.request.urlretrieve(url, tgz_path)
        spam_tgz = tarfile.open(tgz_path)
        spam_tgz.extractall(path='data')
    spam_tgz.close()

In [4]:
fetch_spam_data()

### One spam example + one ham example should do it

In [2]:
def load_data():
    ham_path = os.path.join(os.getcwd(), 'data', 'easy_ham_2')
    spam_path = os.path.join(os.getcwd(), 'data', 'spam_2')
    
    x_spam = []
    x_ham = []
    
    for filename in os.listdir(ham_path):
        abs_path = os.path.join(ham_path, filename)
        with open(abs_path, 'r', encoding='utf8', errors='ignore') as f:
            x_ham.append(f.read())

    for filename in os.listdir(spam_path):
        abs_path = os.path.join(spam_path, filename)
        with open(abs_path, 'r', encoding='utf8', errors='ignore') as f:
            x_spam.append(f.read())
            
    return np.array(x_spam, dtype='object'), np.array(x_ham, dtype='object')

## Warning! Don't use the function below

In [31]:
def load_all_data():
    """
    If OpenAI ever wants to test their TPUs, this is a good place to start.
    Not feasible on a normal computer though :(
    """
    x_spam = []
    x_ham = []
    path = os.path.join(os.getcwd(), 'data')
    for dirname in os.listdir(path):
        if 'tar' not in dirname:
            if 'spam' in dirname:
                for filename in os.listdir(os.path.join(path, dirname)):
                    abs_path = os.path.join(path, dirname, filename)
                    with open(abs_path, 'rb') as f:
                        x_spam.append(f.read().split())
            else:
                for filename in os.listdir(os.path.join(path, dirname)):
                    abs_path = os.path.join(path, dirname, filename)
                    with open(abs_path, 'rb') as f:
                        x_ham.append(f.read().split())
    return np.array(x_spam), np.array(x_ham)

In [5]:
x_spam, x_ham = load_data()
y_spam = np.ones(x_spam.shape)
y_ham = np.zeros(x_ham.shape)

In [6]:
x = np.concatenate([x_spam, x_ham])
y = np.concatenate([y_spam, y_ham])

##### Define a custom preprocessing steps to convert to lowercase and remove: 
- URLs
- Numbers

In [7]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.base import BaseEstimator, TransformerMixin
from nltk.stem.snowball import SnowballStemmer
from sklearn.pipeline import make_pipeline

In [8]:
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

In [9]:
for train_index, test_index in sss.split(x, y):
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]

In [86]:
class EmailTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, strip_headers=False, convert_lower=True):
        self.strip_headers = strip_headers
        self.convert_lower = convert_lower
    
    def fit(self, x, y=None):
        return self
    
    def transform(self, x):
        x_out = np.empty((x.shape), dtype=x.dtype)
        for i, e in enumerate(x):
            msg = email.message_from_string(e)

            if self.strip_headers:
                for k in msg.keys():
                    del msg[k]

                    
            payload = msg.get_payload()
            
            if self.convert_lower:
                if isinstance(payload, list):
                    payload_str = ''.join(p.as_string().lower() for p in payload)
                    msg.set_payload(payload_str)
                else:
                    msg.set_payload(msg.get_payload().lower())
            
            x_out[i] = msg.as_string()

        return x_out

In [87]:
class EmailReplacer(BaseEstimator, TransformerMixin):
    def __init__(self, replace_url=False, replace_number=False):
        self.replace_url = replace_url
        self.replace_number = replace_number
    
    def fit(self, x, y=None):
        return self
    
    def transform(self, x):
        x_out = np.empty((x.shape), dtype=x.dtype)
        
        for i, e in enumerate(x):
            if isinstance(e, bytes):
                e = e.decode('ISO-8859-1')
                
            if self.replace_url:
                e = re.sub(r'http\S+|www\S+', 'URL', e)
            
            if self.replace_number:
                e = re.sub(r'\d+', 'NUMBER', e)
                
            e = e.lower()
            
            x_out[i] = e
        
        return x_out

In [88]:
class EmailStemmer(BaseEstimator, TransformerMixin):
    def __init__(self, use_porter=False):
        self.use_porter = use_porter
        
    def fit(self, x, y=None):
        return self
    
    def transform(self, x):
        x_out = np.empty((x.shape), dtype=x.dtype)

        for i, e in enumerate(x):
            if self.use_porter:
                x_out[i] = SnowballStemmer('porter').stem(e)
            else:
                x_out[i] = SnowballStemmer('english').stem(e)
                
        return x_out

##### Don't use this 🔫

In [89]:
# This is useless since we have CountVectorizer
class TransformBOW(BaseEstimator, TransformerMixin):
    def fit(self, x, y=None):
        return self

    def transform(self, x):
        vectorizer = CountVectorizer(encoding='ISO-8859-1')
        x_bow_sparse = vectorizer.fit_transform(x)
        x_bow = x_bow_sparse.toarray()
        return x_bow

In [90]:
pipeline = make_pipeline(EmailTransformer(), EmailReplacer(replace_number=True, replace_url=True), EmailStemmer(), verbose=3)

In [91]:
x_tr = pipeline.fit_transform(x)

[Pipeline] .. (step 1 of 3) Processing emailtransformer, total=   4.5s
[Pipeline] ..... (step 2 of 3) Processing emailreplacer, total=   0.4s
[Pipeline] ...... (step 3 of 3) Processing emailstemmer, total=   1.5s


In [92]:
vectorizer = CountVectorizer(encoding='ISO-8859-1')
vectorizer.fit(x_tr)

In [17]:
x_train_tr = pipeline.transform(x_train)
x_test_tr = pipeline.transform(x_test)

In [18]:
x_train_tr = vectorizer.transform(x_train_tr)
x_test_tr = vectorizer.transform(x_test_tr)

In [19]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [20]:
gb = GradientBoostingClassifier()
gb.fit(x_train_tr, y_train)

In [21]:
y_pred = gb.predict(x_test_tr)
score = accuracy_score(y_test, y_pred)

In [22]:
score

0.9857142857142858

In [99]:
email_str = 'From bruces@yami.57thstreet.com  Tue Aug  6 23:43:54 2002\nReturn-Path: <bruces@yami.57thstreet.com>\nDelivered-To: yyyy@localhost.netnoteinc.com\nReceived: from localhost (localhost [127.0.0.1])\n\tby phobos.labs.netnoteinc.com (Postfix) with ESMTP id E7763440A8\n\tfor <jm@localhost>; Tue,  6 Aug 2002 18:43:53 -0400 (EDT)\nReceived: from phobos [127.0.0.1]\n\tby localhost with IMAP (fetchmail-5.9.0)\n\tfor jm@localhost (single-drop); Tue, 06 Aug 2002 23:43:53 +0100 (IST)\nReceived: from yami.57thstreet.com ([66.100.224.110]) by\n    dogma.slashnull.org (8.11.6/8.11.6) with SMTP id g76MiNk21740 for\n    <jm@jmason.org>; Tue, 6 Aug 2002 23:44:24 +0100\nReceived: (qmail 18139 invoked by uid 1045); 6 Aug 2002 22:40:55 -0000\nDate: 6 Aug 2002 22:40:55 -0000\nMessage-Id: <20020806224055.18137.qmail@yami.57thstreet.com>\nFrom: Bruce Sterling <bruces@well.com>\nTo: yyyy@spamassassin.taint.org\nSubject: Viridian Note 00326:  Air-Conditioned Tokyo\n\nKey concepts:  Tokyo, urban overheating,\nclimate change remediation\n\nAttention Conservation Notice: a weird,\nhand-waving Nipponese mega-scheme.\n\nLinks:\nhttp://http://www.viridiandesign.org/products/furniture.htm\nFrom:Laurence Aurbach <translucent*spamcop.net?>\nSubject:Viridian Furniture List\n\nThe Viridian Furniture List is now online in the \n"Recommended Products" section of the Viridian website. \nDavid Bergman did a yeoman-like job assembling this list \nand adding comments. He\'s also mirroring the list on his \nown furniture site, Fire and Water.\nhttp://cyberg.com/fw/ecofurn.htm \n\nMaybe you\'ll find a woven bamboo buffet or a biopolymer \nmesh coffee table. == L.J. Aurbach\n\n\n---------------------------------------------------\nEntries in the Global Civil Society Design Contest.\n\nFrom: Steven W. Schuldt <swschuldt*mac.com>\nhttp://www.americanrobotz.com/images2/Soon_GlobalCivilSocietyLaptop.jpg\n\nFrom: Ben Davis <bend*earthlink.net>\nhttp://www.digitaleverything.com/GlobalComputer.htm\n\nFrom: Joerg F. Wittenberger <Joerg.Wittenberger*pobox.com>\nhttp://www.askemos.org/ \nhttp://www.askemos.org:9080/RomePaper.pdf\n\nFrom: Scott Vandehey <scot*spaceninja.com >\nhttp://spaceninja.com/viridian/notebook.html\n\nFrom: Bob Morris <bob*bomoco.com>\nhttp://viridianrepository.com/GlobalCivil/\n\nFrom: Anonymous\nhttp://home.freiepresse.de/befis/zx2000.html\nhttp://apollo.spaceports.com/~bodo4all/zx/zx97.htm\nhttp://www.vkb.co.il/\n\nFrom: Jim Thompson <jim*musenki.com>\nhttp://www.simputer.org\nhttp://www.cnn.com/2002/TECH/ptech/07/05/india.simputer.reut/index.html\n\nFrom: Mike Rosing <eresrch*eskimo.com>\nhttp://www.eskimo.com/~eresrch/viridian\n\nFrom: Till Westermayer <till*tillwe.de>\nhttp://www.westermayer.de/till/projekte/02gcsdl.htm\n\nFrom:Duncan Stewart <stewarts*stewarts.org?>\nhttp://www.stewarts.org/viridian/GCS\n\nFrom: R. Charles Flickinger <idlewild*mac.com>\nhttp://homepage.mac.com/iHUG/GCS2000.html\n\nFrom:"Kevin Prichard" <kevin*indymedia.org>\n\n"I  nominate Rop Gonggrijp\'s Secure Notebook, which was \nshown recently at H2K2. (http://www.h2k2.net).\n\nhttp://www.nah6.com/\nhttp://www.nah6.com/nah6-h2k2_files/v3_document.html\n\n"The premise is both important and hilarious. The Secure \nNotebook provides a Secure Windows XP installation. \nWindows has a long history of being secure neither from \nattack nor privacy incursion, so this is something. \n\n"Nothing gets in and nothing gets out, without it being \nfirewalled,  filtered, proxied, and encrypted. How is this \ndone? A modified Debian  Linux boots first, running custom \nNAH6 crypto device drivers, and then  boots XP within \nvmware."\n\nSincerely yours, \nKevin Prichard \nkevin*indymedia.org\n\nThis contest expires in nine days:  August 15, 2002. \n----------------------------------------------------\n\nSource: Planet Ark\n\nhttp://www.planetark.org/dailynewsstory.cfm/newsid/17160/story.htm\n\n"Cooler Tokyo summers may be just a pipe dream away\nby Elaine Lies\n\nJAPAN: August 5, 2002\n\n   "TOKYO == In what could be the ultimate in public works \nprojects, a Japanese panel of experts has proposed \nrelieving the misery of steamy Tokyo summers by cooling \nthe huge city with sea water and a labyrinth of \nunderground pipes. \n\n   "Though summers are hard in any city, Tokyo\'s narrow \nstreets, hordes of people and clusters of massive \nskyscrapers, largely unrelieved by greenery, produce a \nspecial brand of discomfort.\n\n   "And it gets worse every year.  (((Oh yeah.  You bet it \ndoes.))) The number of nights when temperatures stay above \n25 Celsius (77 Fahrenheit) in Tokyo has doubled over the \nlast 30 years, while average temperatures have shot up by \n2.9 degrees C over the last century. Relief, however \ndistant, could be on the way.  ((("Great news, weather \nsufferers!  We live in the high-tech capital of a G-7 \nstate!")))\n\n   "At the behest of the Construction Ministry, the panel \nhas drawn up a plan that would use a network of buried \npipes, and water pumped from the sea, to cool things down. \n\'In the very best conditions, certain areas could in \ntheory become as much as 2.6 degrees Celsius cooler,\' said \nYujin Minobe, a ministry planner.\n\n    "The huge air-conditioning systems currently used to \ncool buildings get rid of the heat they take out of the \nstructure by venting it into the outside air, raising \ntemperatures still further and creating a \'heat island\' \nphenomenon in large cities.  (((Soon whole *cities* will \ndo it and vent their heat straight into the rising seas! \nLook out, Antarctica.)))\n\n    "Under the plan, this heat would be transferred to \nwater in large underground tanks, and the water then \npumped through a six-km (3.7-mile) network of underground \npipes to a cooling plant on the Tokyo waterfront.\n\n    "There the heat from this water would be transferred \nto cooler sea water before the then-cooled water was \npumped back through the underground pipes. The sea water, \nnow warmed, would be released into the waters of Tokyo \nBay.\n\n    "COSTLY PLAN.  (((That\'s unsurprising.)))  Minobe said \nthe plan would cover some 123 hectares (304 acres) in the \ncentre of Tokyo, including the Marunouchi business \ndistrict and the posh Ginza shopping area, and would \ninitially cost around 41 billion yen ($344 million).\n\n    "\'Savings on reduced energy usage would eventually \nhelp pay for this,\' he said.  (((A real nest of ironies \nhere, folks.))) Officials quoted in the English-language \nJapan Times said energy savings would total more than 1 \nbillion yen a year, meaning the system would pay for \nitself in a bit over 30 years.\n\n    "However, Minobe said many problems remained with the \nplan, which has only been under discussion since April \nlast year. One of the most serious problems is whether \nwarmer water being returned to Tokyo Bay would damage the \nfragile marine ecosystem, a point Minobe said still \nrequired more study.  (((Give it 30 years and there won\'t \nbe any ecosystem left to study.)))\n\n    "He said the average temperature cut is likely to be \nonly around 0.4 degrees. \'I\'m not even sure people would \nbe able to feel that difference,\' he said. Any such plan, \nhowever, would likely produce a gleam in the eyes of \nJapan\'s huge construction industry, known for its \npropensity for public works projects. Although several are \ndecried as wasteful, public works projects have long been \nused by the government in attempts to stimulate the \neconomy.  (((Nice use of the word "attempts.")))\n\n    "Frankly, I think this plan is still really more of a \ndream than anything else," Minobe said. \n\nO=c=O O=c=O O=c=O O=c=O\nTOKYO STAYS COOL\nAS DEADLY HEATWAVE BAKES \nKOBE, OSAKA, KYOTO\nO=c=O O=c=O O=c=O O=c=O\n\n\n'
emails = np.array([email_str], dtype='O')

In [100]:
msg = email.message_from_string(email_str)

In [110]:
x_test[400]



In [111]:
y_test[400]

0.0

In [101]:
msg.get_payload()

'Key concepts:  Tokyo, urban overheating,\nclimate change remediation\n\nAttention Conservation Notice: a weird,\nhand-waving Nipponese mega-scheme.\n\nLinks:\nhttp://http://www.viridiandesign.org/products/furniture.htm\nFrom:Laurence Aurbach <translucent*spamcop.net?>\nSubject:Viridian Furniture List\n\nThe Viridian Furniture List is now online in the \n"Recommended Products" section of the Viridian website. \nDavid Bergman did a yeoman-like job assembling this list \nand adding comments. He\'s also mirroring the list on his \nown furniture site, Fire and Water.\nhttp://cyberg.com/fw/ecofurn.htm \n\nMaybe you\'ll find a woven bamboo buffet or a biopolymer \nmesh coffee table. == L.J. Aurbach\n\n\n---------------------------------------------------\nEntries in the Global Civil Society Design Contest.\n\nFrom: Steven W. Schuldt <swschuldt*mac.com>\nhttp://www.americanrobotz.com/images2/Soon_GlobalCivilSocietyLaptop.jpg\n\nFrom: Ben Davis <bend*earthlink.net>\nhttp://www.digitaleverythin