# Studying Mr. Géron's Spam Classifier Notebook

Much of the code is borrowed from [Aurélien Géron's famous Jupyter Notebook on Classification.](https://github.com/ageron/handson-ml/blob/master/03_classification.ipynb)

Data can be pulled from [Apache SpamAssassin's old corpus.](http://spamassassin.apache.org/old/publiccorpus/)

In [1]:
import os
import sys 
import json
import nltk
import time
import pickle
import numpy as np
import scipy.sparse

from datetime import datetime
from nltk.stem import WordNetLemmatizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

import custom_functions as F # see custom module for code

# Reference point for measuring how long the notebook takes to run.
start_time = time.time()

# Current local time rendered as 'YYYY-MM-DD HH:MM:SS' (sub-second part dropped),
# then separated into its date and time-of-day halves.
dt_object = datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')
Date, StartTime = dt_object.split(' ')
print('Revised on: {}'.format(Date))

Revised on: 2020-07-22


## Purpose 

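Preprocess the training and test data once and save the results to disk, so that the original and no-stopwords pipelines can be compared without re-running the slow preprocessing step.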

### Data Ingestion

In [2]:
# Fetch the corpus through the custom helper (see custom_functions).
# NOTE(review): the arguments look like the spam archive name, the ham archive
# name and the corpus date tag (20030228) — confirm against the helper's signature.
F.get_data_if_needed('spam', 'easy_ham', '20030228')

Data successfully downloaded.


In [3]:
# Where the extracted corpus lives on disk.
data_dir = 'data'
spam_dir = os.path.join(data_dir, 'spam')
ham_dir = os.path.join(data_dir, 'easy_ham')

# Every entry in each folder except the 'cmds' file, in sorted order.
ham_filenames = sorted(entry for entry in os.listdir(ham_dir) if entry != 'cmds')
spam_filenames = sorted(entry for entry in os.listdir(spam_dir) if entry != 'cmds')

print('There are {} ham emails and {} spam emails.'.format(len(ham_filenames), len(spam_filenames)))

There are 2500 ham emails and 500 spam emails.


In [4]:
# Extract the emails for each class via the custom helper, passing the class
# directory and the filenames collected above.
# NOTE(review): both results are concatenated with `+` and measured with len()
# in the next cell, so they are presumably lists — confirm in custom_functions.
spam = F.extract_emails(_path=spam_dir, _names=spam_filenames)
ham = F.extract_emails(_path=ham_dir, _names=ham_filenames)

### Split into Training and Test datasets

We need to split the data into training and test sets before learning too much about the test set; otherwise we risk biasing ourselves when designing features for the training set.

In [5]:
# Ham emails come first, then spam, so the label vector below lines up by position.
X = np.array(ham + spam)
# Labels: 0 for every ham email, 1 for every spam email.
y = np.array([0] * len(ham) + [1] * len(spam))

# Hold out 20% of the emails as the test set; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Preprocess Train and Test sets - save

In [98]:
def _make_preprocess_pipeline(remove_stopwords):
    """Build the two-step preprocessing pipeline (email -> word counts -> vector).

    `remove_stopwords` is forwarded unchanged to
    F.EmailToWordCounterTransformer_revised.
    """
    return Pipeline([
        ("email_to_wordcount", F.EmailToWordCounterTransformer_revised(remove_stopwords=remove_stopwords)),
        ("wordcount_to_vector", F.WordCounterToVectorTransformer()),
    ])


# Mr. Geron's pipeline - stop words are kept (remove_stopwords=False)
preprocess_pipeline_original = _make_preprocess_pipeline(remove_stopwords=False)

# New pipeline - stop words are removed (remove_stopwords=True)
preprocess_pipeline_nostopwords = _make_preprocess_pipeline(remove_stopwords=True)

In [99]:
# setup directory 
# Directory that holds the cached vocabularies and preprocessed matrices.
path = 'processed_data'
# exist_ok=True makes this cell safe to re-run and avoids the check-then-create
# race of `if not os.path.exists(...)`; it still raises if `path` exists but is
# a regular file rather than a directory.
os.makedirs(path, exist_ok=True)

In [100]:
# File locations of every cached artifact, all inside `path`.

# vocabularies (JSON)
vocab_path_original = os.path.join(path, 'vocabulary_original.json')
vocab_path_nostopwords = os.path.join(path, 'vocabulary_nostopwords.json')

# NOTE: 'nostopwrods' in the .npz names below is a misspelling, kept on purpose
# so the names keep matching the files already saved under that spelling.

# training matrices (.npz)
X_train_path_original = os.path.join(path, 'X_train_processed_original.npz')
X_train_path_nostopwords = os.path.join(path, 'X_train_processed_nostopwrods.npz')

# test matrices (.npz)
X_test_path_original = os.path.join(path, 'X_test_processed_original.npz')
X_test_path_nostopwords = os.path.join(path, 'X_test_processed_nostopwrods.npz')

In [101]:
def load_vocabulary(vocabulary_path):
    """Read a vocabulary from a JSON file.

    Parameters
    ----------
    vocabulary_path : str
        Path of the JSON file to read.

    Returns
    -------
    The decoded JSON content, or None when the file does not exist (in which
    case the error is printed instead of raised).
    """
    try:
        with open(vocabulary_path, 'r') as handle:
            loaded = json.load(handle)
    except FileNotFoundError as missing:
        # Best effort: a missing cache file is reported, not fatal.
        print(missing)
        return None
    return loaded

In [102]:
def load_sparse_matrix(filepath):
    """Load a sparse matrix stored as COO components in an ``.npz`` archive.

    The archive is expected to contain the arrays ``row``, ``col``, ``data``
    and ``shape`` (the layout written by ``save_sparse_matrix``).

    Parameters
    ----------
    filepath : str
        Path of the ``.npz`` file to read.

    Returns
    -------
    scipy.sparse.coo_matrix or None
        The reconstructed matrix, or None when the file does not exist (in
        which case the error is printed instead of raised).
    """
    try:
        # np.load returns an NpzFile that keeps the file open; the context
        # manager closes it once the arrays have been read.
        with np.load(filepath) as archive:
            # Hand scipy a plain tuple of ints rather than an ndarray.
            shape = tuple(int(dim) for dim in archive['shape'])
            return scipy.sparse.coo_matrix(
                (archive['data'], (archive['row'], archive['col'])),
                shape=shape,
            )
    except FileNotFoundError as e:
        # Best effort: a missing cache file is reported, not fatal.
        print(e)
        return None

In [103]:
# Cached matrices for the original pipeline (train first, then test).
X_train_transformed_original, X_test_transformed_original = [
    load_sparse_matrix(cache_file)
    for cache_file in (X_train_path_original, X_test_path_original)
]

# Cached matrices for the no-stopwords pipeline (train first, then test).
X_train_transformed_nostopwords, X_test_transformed_nostopwords = [
    load_sparse_matrix(cache_file)
    for cache_file in (X_train_path_nostopwords, X_test_path_nostopwords)
]

In [16]:
## preprocess training and test data (6.5 mins)
## Disabled because the results are cached on disk; uncomment to regenerate them.
## NOTE: fit each pipeline on the training set only. The test set goes through
## .transform() so it is encoded by the already-fitted pipeline; calling
## fit_transform on X_test would refit the pipeline on the test emails.
#
## original
#X_train_transformed_original = preprocess_pipeline_original.fit_transform(X_train)
#X_test_transformed_original = preprocess_pipeline_original.transform(X_test)
#
## no stopwords
#X_train_transformed_nostopwords = preprocess_pipeline_nostopwords.fit_transform(X_train)
#X_test_transformed_nostopwords = preprocess_pipeline_nostopwords.transform(X_test)

In [105]:
# save vocabulary
def save_vocabulary(vocabulary_path, vocabulary_):
    """Write a vocabulary to a JSON file.

    Parameters
    ----------
    vocabulary_path : str
        Destination file; it is created or overwritten.
    vocabulary_ : JSON-serializable object
        The vocabulary to store.

    Raises
    ------
    TypeError
        If `vocabulary_` contains values that json cannot serialize.
    """
    with open(vocabulary_path, 'w') as fp:
        # json.dump needs the target file object; without it the call raises
        # TypeError and leaves an empty file behind.
        json.dump(vocabulary_, fp)

In [71]:
def save_sparse_matrix(filepath, X):
    """Store a sparse matrix as its COO components in an ``.npz`` archive.

    The matrix is converted with ``X.tocoo()`` and its ``row``, ``col``,
    ``data`` and ``shape`` are written under those keys with ``np.savez``.

    Parameters
    ----------
    filepath : str
        Destination passed to ``np.savez``.
    X : sparse matrix
        Any object providing ``tocoo()``.
    """
    coo = X.tocoo()
    np.savez(filepath, row=coo.row, col=coo.col, data=coo.data, shape=coo.shape)

In [75]:
# Write each matrix to its cache file: train (original, no-stopwords),
# then test (original, no-stopwords).
for _cache_file, _matrix in (
    (X_train_path_original, X_train_transformed_original),
    (X_train_path_nostopwords, X_train_transformed_nostopwords),
    (X_test_path_original, X_test_transformed_original),
    (X_test_path_nostopwords, X_test_transformed_nostopwords),
):
    save_sparse_matrix(_cache_file, _matrix)

In [95]:
# sparse matrices in processed_data directory
# Compare the real file extension: splitting on '.' and indexing [1] raises
# IndexError for names without a dot and misreads names with several dots.
# Sorted so the listing does not depend on the order os.listdir returns.
sorted(x for x in os.listdir(path) if os.path.splitext(x)[1] == '.npz')

['X_test_processed_nostopwrods.npz',
 'X_test_processed_original.npz',
 'X_train_processed_nostopwrods.npz',
 'X_train_processed_original.npz',
 'X_train_processed_sample1.npz']

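### Open question: how do we get the `vocabulary_`?

Presumably it is an attribute of the fitted `wordcount_to_vector` step, e.g. `preprocess_pipeline_original.named_steps['wordcount_to_vector'].vocabulary_` — to be confirmed against `custom_functions`.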