In [1]:
import matplotlib.pyplot as plt
import numpy as np
import scipy.io as scio
from sklearn import svm

import processEmail as pe
import emailFeatures as ef

In [2]:
plt.ion()
np.set_printoptions(formatter={'float': '{: 0.6f}'.format})

In [3]:
import numpy as np
import re
import nltk, nltk.stem.porter


def get_vocab_list():
    vocab_dict = {}
    with open('vocab.txt') as f:
        for line in f:
            (val, key) = line.split()
            vocab_dict[int(val)] = key

    return vocab_dict


def process_email(email_contents):
    vocab_list = get_vocab_list()

    word_indices = np.array([], dtype=np.int64)

    # ===================== Preprocess Email =====================

    email_contents = email_contents.lower()

    email_contents = re.sub('<[^<>]+>', ' ', email_contents)

    # Any numbers get replaced with the string 'number'
    email_contents = re.sub('[0-9]+', 'number', email_contents)

    # Anything starting with http or https:// replaced with 'httpaddr'
    email_contents = re.sub('(http|https)://[^\s]*', 'httpaddr', email_contents)

    # Strings with "@" in the middle are considered emails --> 'emailaddr'
    email_contents = re.sub('[^\s]+@[^\s]+', 'emailaddr', email_contents)

    # The '$' sign gets replaced with 'dollar'
    email_contents = re.sub('[$]+', 'dollar', email_contents)

    # ===================== Tokenize Email =====================

    # Output the email
    # print('==== Processed Email ====')

    stemmer = nltk.stem.porter.PorterStemmer()

    # print('email contents : {}'.format(email_contents))

    tokens = re.split('[@$/#.-:&*+=\[\]?!(){\},\'\">_<;% ]', email_contents)

    idx_list = []
    for token in tokens:
        token = re.sub('[^a-zA-Z0-9]', '', token)
        token = stemmer.stem(token)

        if len(token) < 1:
            continue

        # ===================== Your Code Here =====================
        # Instructions : Fill in this function to add the index of token to
        #                word_indices if it is in the vocabulary. At this point
        #                of the code, you have a stemmed word frome email in
        #                the variable token. You should look up token in the
        #                vocab_list. If a match exists, you should add the
        #                index of the word to the word_indices nparray.
        #                Concretely, if token == 'action', then you should
        #                look up the vocabulary list the find where in vocab_list
        #                'action' appears. For example, if vocab_list[18] == 'action'
        #                then you should add 18 to the word_indices array.



        # ==========================================================
        for idx, s in vocab_list.items():
            if s == token:
                idx_list.append(idx)

    word_indices = np.array(idx_list)
    # print('==================')

    return word_indices


In [4]:
# ===================== Part 1: Email Preprocessing =====================
# To use an SVM to classify emails into spam v. non-spam, you first need to
# convert each email into a vector of features. In this part, you will
# implement the preprocessing steps for each email. You should
# complete the code in processEmail.py to produce a word indices vector
# for a given email.

print('Preprocessing sample email (emailSample1.txt) ...')

file_contents = open('emailSample1.txt', 'r').read()
word_indices = process_email(file_contents)

Preprocessing sample email (emailSample1.txt) ...


In [5]:
# Print stats
print('Word Indices: ')
print(word_indices)

Word Indices: 
[  86  916  794 1077  883  370 1699  790 1822 1831  883  431 1171  794
 1002 1893 1364  592 1676  238  162   89  688  945 1663 1120 1062 1699
  375 1162  479 1893 1510  799 1182 1237  810 1895 1440 1547  181 1699
 1758 1896  688 1676  992  961 1477   71  530 1699  531]


In [6]:
import numpy as np


def email_features(word_indices):
    # Total number of words in the dictionary
    n = 1899

    # You need to return the following variables correctly.
    # Since the index of numpy array starts at 0, to align with the word indices we make n + 1 size array
    features = np.zeros(n + 1)

    # ===================== Your Code Here =====================
    # Instructions : Fill in this function to return a feature vector for the
    #                given email (word_indices). To help make it easier to
    #                process the emails, we have already pre-processed each
    #                email and converted each word in the email into an index in
    #                a fixed dictionary (of 1899 words). The variable
    #                word_indices contains the list of indices of the words
    #                which occur in one email.
    #
    #                Concretely, if an email has the text:
    #
    #                   The quick brown fox jumped over the lazy dog.
    #
    #                Then, the word_indices vector for this text might look
    #                like:
    #
    #                   60  100   33  44  10      53  60  58  5
    #
    #                where, we have mapped each word onto a number, for example:
    #
    #                   the     --  60
    #                   quick   --  100
    #                   ...
    #
    #                Your task is take one such word_indices vector and construct
    #                a binary feature vector that indicates whether a particular
    #                word occurs in the email. That is, features[i] = 1 when word i
    #                is present in the email. Concretely, if the word 'the' (say,
    #                index 60) appears in the email, then features[60] = 1. The feature
    #                vector should look like:
    #
    #                features = [0, 0, 0, 0, 1, 0, 0, 0, ... 0, 0, 0, 1, ... 0, 0, 0, 1, 0]
    #
    #

    for idx in word_indices:
        features[idx] = 1
    # ==========================================================

    return features


In [7]:
# ===================== Part 2: Feature Extraction =====================
# Now, you will convert each email into a vector of features in R^n.
# You should complete the code in emailFeatures.py to produce a feature
# vector for a given mail

print('Extracting Features from sample email (emailSample1.txt) ... ')

# Extract features
features = email_features(word_indices)

Extracting Features from sample email (emailSample1.txt) ... 


In [8]:
# Print stats
print('Length of feature vector: {}'.format(features.size))
print('Number of non-zero entries: {}'.format(np.flatnonzero(features).size))

Length of feature vector: 1900
Number of non-zero entries: 45


In [9]:
# ===================== Part 3: Train Linear SVM for Spam Classification =====================
# In this section, you will train a linear classifier to determine if an
# email is Spam or Not-spam.

# Load the Spam Email dataset
# You will have X, y in your environment
data = scio.loadmat('spamTrain.mat')
X = data['X']
y = data['y'].flatten()

print('Training Linear SVM (Spam Classification)')
print('(this may take 1 to 2 minutes)')

Training Linear SVM (Spam Classification)
(this may take 1 to 2 minutes)


In [10]:
c = 0.1
clf = svm.SVC(c, kernel='linear')
clf.fit(X, y)

p = clf.predict(X)

print('Training Accuracy: {}'.format(np.mean(p == y) * 100))

Training Accuracy: 99.825


In [11]:
# ===================== Part 4: Test Spam Classification =====================
# After training the classifier, we can evaluate it on a test set. We have
# included a test set in spamTest.mat

# Load the test dataset
data = scio.loadmat('spamTest.mat')
Xtest = data['Xtest']
ytest = data['ytest'].flatten()

print('Evaluating the trained linear SVM on a test set ...')

p = clf.predict(Xtest)

print('Test Accuracy: {}'.format(np.mean(p == ytest) * 100))

Evaluating the trained linear SVM on a test set ...
Test Accuracy: 98.9


In [12]:
# ===================== Part 5: Top Predictors of Spam =====================
# Since the model we are training is a linear SVM, we can inspect the w
# weights learned by the model to understand better how it is determining
# whether an email is spam or not. The following code finds the words with
# the highest weights in the classifier. Informally, the classifier
# 'thinks' that these words are the most likely indicators of spam.
#

vocab_list = pe.get_vocab_list()
indices = np.argsort(clf.coef_).flatten()[::-1]
print(indices)

for i in range(15):
    print('{} ({:0.6f})'.format(vocab_list[indices[i]], clf.coef_.flatten()[indices[i]]))


[1190  297 1397 ... 1764 1665 1560]
otherwis (0.500614)
clearli (0.465916)
remot (0.422869)
gt (0.383622)
visa (0.367710)
base (0.345064)
doesn (0.323632)
wife (0.269724)
previous (0.267298)
player (0.261169)
mortgag (0.257298)
natur (0.253941)
ll (0.253467)
futur (0.248297)
hot (0.246404)
