In [228]:
import os
import urllib.request
import tarfile
import re

import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter

In [586]:
# Base URL that every archive name below is appended to.
DOWNLOAD_ROOT = 'https://spamassassin.apache.org/old/publiccorpus/'

# Archive file names (ham and spam alike) to fetch from DOWNLOAD_ROOT.
SPAM_PATHS = [
    stem + '.tar.bz2'
    for stem in (
        '20021010_easy_ham',
        '20021010_hard_ham',
        '20021010_spam',
        '20030228_easy_ham',
        '20030228_easy_ham_2',
        '20030228_hard_ham',
        '20030228_spam',
        '20030228_spam_2',
        '20050311_spam_2',
    )
]

### Note: concatenating all datasets may not be feasible — there is too much data for a normal computer

In [587]:
def fetch_spam_data(download_root=DOWNLOAD_ROOT, spam_paths=SPAM_PATHS):
    """Download each corpus archive and unpack it into the local ``data`` directory.

    Args:
        download_root: Base URL; each archive name is appended to it verbatim.
        spam_paths: Iterable of archive file names to download and extract.
            An empty iterable only creates the ``data`` directory.

    Every archive is saved as ``data/<name>`` and extracted into ``data``.
    Errors from the download or from opening/extracting an archive propagate
    to the caller.
    """
    os.makedirs('data', exist_ok=True)
    # Use the arguments rather than the module-level constants so callers can
    # actually override the source URL and the archive list.
    for path in spam_paths:
        url = download_root + path
        tgz_path = os.path.join('data', path)
        urllib.request.urlretrieve(url, tgz_path)
        # NOTE(review): extractall() writes whatever member paths the archive
        # contains; for untrusted archives consider an extraction filter on
        # Python versions that provide one.
        # The context manager closes every archive, not just the last one.
        with tarfile.open(tgz_path) as spam_tgz:
            spam_tgz.extractall(path='data')

In [588]:
# Download and unpack every archive with the default URL and archive list (performs network requests).
fetch_spam_data()

### One spam dataset + one ham dataset should be enough

In [237]:
def _read_emails(directory):
    """Return the text of every regular file in ``directory``, ordered by file name.

    Files are decoded as UTF-8 and undecodable bytes are dropped. Entries that
    are not regular files (e.g. sub-directories) are skipped.
    """
    emails = []
    # os.listdir() order is arbitrary; sorting keeps results reproducible.
    for filename in sorted(os.listdir(directory)):
        abs_path = os.path.join(directory, filename)
        if not os.path.isfile(abs_path):
            continue
        with open(abs_path, 'r', encoding='utf8', errors='ignore') as f:
            emails.append(f.read())
    return emails


def load_data():
    """Load the ``easy_ham_2`` and ``spam_2`` e-mails from ``<cwd>/data``.

    Returns:
        A ``(x_spam, x_ham)`` tuple of 1-D object arrays; each element is the
        full text of one file as a ``str``, ordered by file name.

    Raises:
        OSError: If either directory is missing or a file cannot be read.
    """
    data_dir = os.path.join(os.getcwd(), 'data')
    x_ham = _read_emails(os.path.join(data_dir, 'easy_ham_2'))
    x_spam = _read_emails(os.path.join(data_dir, 'spam_2'))
    return np.array(x_spam, dtype='object'), np.array(x_ham, dtype='object')

## Warning! Don't use the function below

In [238]:
def _read_token_lists(directory):
    """Return the whitespace-split raw bytes of every regular file in ``directory``."""
    token_lists = []
    # Sorted so the result does not depend on the arbitrary os.listdir() order.
    for filename in sorted(os.listdir(directory)):
        abs_path = os.path.join(directory, filename)
        if os.path.isfile(abs_path):
            with open(abs_path, 'rb') as f:
                token_lists.append(f.read().split())
    return token_lists


def _as_object_array(items):
    """Pack ``items`` into a 1-D object array holding one item per element."""
    packed = np.empty(len(items), dtype=object)
    # Element-wise assignment keeps each token list intact; np.array(items)
    # would reject ragged lists or silently build a 2-D array for equal lengths.
    for index, item in enumerate(items):
        packed[index] = item
    return packed


def load_all_data():
    """Load every extracted dataset found under ``<cwd>/data``.

    Warning: this reads all datasets into memory at once, which is not
    feasible on a normal computer for the full corpus.

    Entries whose name contains ``'tar'`` and entries that are not directories
    are skipped. A directory whose name contains ``'spam'`` is treated as
    spam; every other directory is treated as ham.

    Returns:
        A ``(x_spam, x_ham)`` tuple of 1-D object arrays. Each element is the
        list of whitespace-separated ``bytes`` tokens of one file.

    Raises:
        OSError: If the ``data`` directory is missing or a file cannot be read.
    """
    x_spam = []
    x_ham = []
    path = os.path.join(os.getcwd(), 'data')
    for dirname in sorted(os.listdir(path)):
        dir_path = os.path.join(path, dirname)
        if 'tar' in dirname or not os.path.isdir(dir_path):
            continue
        target = x_spam if 'spam' in dirname else x_ham
        target.extend(_read_token_lists(dir_path))
    return _as_object_array(x_spam), _as_object_array(x_ham)

In [240]:
# Load both corpora, then build one label per e-mail: 1.0 marks spam, 0.0 marks ham.
x_spam, x_ham = load_data()
y_spam, y_ham = np.ones(x_spam.shape), np.zeros(x_ham.shape)

In [223]:
# Spam comes first and ham second in both arrays, so e-mails and labels stay aligned.
x = np.concatenate((x_spam, x_ham), axis=0)
y = np.concatenate((y_spam, y_ham), axis=0)

### Build a bag-of-words matrix (TODO: define a custom tokenizer to ignore non-UTF-8 characters)

In [292]:
# Fit a CountVectorizer on all e-mails and transform them in one step.
# NOTE(review): `encoding` presumably only matters for bytes/file input; if x already
# holds decoded str objects it may have no effect here — confirm against the sklearn docs.
vectorizer = CountVectorizer(encoding='ISO-8859-1')
emails_bow_sparse = vectorizer.fit_transform(x)

In [296]:
# Convert the vectorizer output to an array via .toarray().
# NOTE(review): presumably this is a dense copy of a sparse matrix and may need a lot of memory — confirm it fits.
emails_bow = emails_bow_sparse.toarray()

numpy.ndarray