# Programming Exercise 6: Support Vector Machines
# 2 Spam Classification
## 2.1 Preprocessing Emails

In [1]:
from nltk import stem

# Shared Porter stemmer instance; process_email() reads this module-level name.
stemmer = stem.PorterStemmer()

# Quick demonstration of the stems produced for a few sample words.
# print(...) with a single argument behaves the same on Python 2 and 3.
for sample_word in ('computer', 'programming', 'university'):
    print(stemmer.stem(sample_word))

comput
program
univers


In [2]:
import re
import string

def process_email(email_contents):
    """Normalize a raw email body into a list of stemmed word tokens.

    Processing steps, applied in this order:
      1. lower-case the text;
      2. replace HTML-like tags with a space;
      3. replace digit runs, http(s) URLs, email addresses and dollar signs
         with the tokens 'number', 'httpaddr', 'emailaddr' and 'dollar';
      4. split on whitespace and drop every non-alphanumeric character
         from each token;
      5. stem each token with the module-level ``stemmer`` and discard
         tokens that end up empty.

    Args:
        email_contents: the email text as a string.

    Returns:
        A list of stemmed tokens, in order of appearance.
    """
    text = email_contents.lower()

    # Order matters: digits are rewritten before URLs/addresses, so those
    # patterns see 'number' in place of any digits.
    substitutions = (
        (r'<[^<>]+>', ' '),
        (r'\d+', 'number'),
        (r'(http|https)://[^\s]*', 'httpaddr'),
        (r'[^\s]+@[^\s]+', 'emailaddr'),
        (r'[$]+', 'dollar'),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    stemmed_words = []
    for token in text.split():
        # This single pass also removes all punctuation, so no separate
        # string.punctuation filter is needed.
        token = re.sub(r'[^a-zA-Z0-9]', '', token)
        stemmed = stemmer.stem(token)
        if stemmed:  # drop tokens that were reduced to the empty string
            stemmed_words.append(stemmed)

    # An explicit list (not a lazy filter object) so callers can iterate
    # over the result more than once on any Python version.
    return stemmed_words

In [3]:
# Read the first sample email; email1 is reused by later cells.
with open('../matlab/emailSample1.txt') as f:
    email1 = f.read()

# Only the read needs the open file, so processing happens after it is closed.
processed_email = ' '.join(process_email(email1))
print(processed_email)

anyon know how much it cost to host a web portal well it depend on how mani visitor your expect thi can be anywher from less than number buck a month to a coupl of dollarnumb you should checkout httpaddr or perhap amazon ecnumb if your run someth big to unsubscrib yourself from thi mail list send an email to emailaddr


In [4]:
# Expected preprocessing result for the sample email, written as adjacent
# string literals (each piece keeps its trailing space so the joined value
# is one single-line string).
answer = (
    'anyon know how much it cost to host a web portal well it depend on '
    'how mani visitor your expect thi can be anywher from less than number buck a month '
    'to a coupl of dollarnumb you should checkout httpaddr or perhap amazon ecnumb if '
    'your run someth big to unsubscrib yourself from thi mail list send an email to emailaddr'
)
# Last expression of the cell: shows True when the preprocessing matches.
answer == processed_email

True

### 2.1.1 Vocabulary List

In [5]:
import csv

# Build the word -> index lookup from the tab-separated vocabulary file;
# column 0 of each row holds the integer index and column 1 the word.
vocab = {}
with open('../matlab/vocab.txt') as f:
    for fields in csv.reader(f, delimiter='\t'):
        vocab[fields[1]] = int(fields[0])

In [6]:
# Tokenize the sample email, then map each token to its vocabulary index.
with open('../matlab/emailSample1.txt') as f:
    words = process_email(f.read())

### YOUR CODE HERE ###
# Tokens that are not in the vocabulary are skipped; repeated tokens are
# kept, so the list follows the order of the email.
word_indices = [vocab[word] for word in words if word in vocab]
######################
print(word_indices)

[86, 916, 794, 1077, 883, 370, 1699, 790, 1822, 1831, 883, 431, 1171, 794, 1002, 1895, 592, 1676, 238, 162, 89, 688, 945, 1663, 1120, 1062, 1699, 375, 1162, 479, 1893, 1510, 799, 1182, 1237, 810, 1895, 1440, 1547, 181, 1699, 1758, 1896, 688, 1676, 992, 961, 1477, 71, 530, 1699, 531]


## 2.2 Extracting Features from Emails

In [7]:
import numpy as np

def email_features(email_contents):
    """Return a binary bag-of-words feature vector for an email.

    The email is tokenized with process_email(). For every token found in
    ``vocab`` the entry at position ``vocab[token] - 1`` is set to 1
    (vocabulary indices start at 1, array positions at 0). Tokens that are
    not in the vocabulary are ignored, and repeated tokens still yield 1.

    Args:
        email_contents: the raw email text as a string.

    Returns:
        A 1-D float numpy array of length ``len(vocab)`` containing 0s and 1s.
    """
    ### YOUR CODE HERE ###
    features = np.zeros(len(vocab))
    for word in process_email(email_contents):
        if word in vocab:
            features[vocab[word] - 1] = 1
    return features
    ######################

# Feature vector of the first sample email; its sum is the number of
# distinct vocabulary words present.
X = email_features(email1)
print(X.sum())

# Spot-check three words: show their vocabulary indices, then the matching
# feature entries (array positions are the vocabulary index minus one).
probe_indices = (vocab['anyon'], vocab['know'], vocab['how'])
print('%s %s %s' % probe_indices)
print(X[[probe_indices[0] - 1, probe_indices[1] - 1, probe_indices[2] - 1]])

44.0
86 916 794
[ 1.  1.  1.]


## 2.3 Training SVM for Spam Classification

In [8]:
import scipy.io

# Load the training and test sets from the MATLAB files.
train = scipy.io.loadmat('../matlab/spamTrain.mat')
test = scipy.io.loadmat('../matlab/spamTest.mat')

# Show which variables each file provides; list(...) keeps the output a
# plain list on Python 3 as well, where keys() returns a view.
print(list(train.keys()))
print(list(test.keys()))

['y', 'X', '__version__', '__header__', '__globals__']
['ytest', 'Xtest', '__version__', '__header__', '__globals__']


In [9]:
# Feature matrices and label vectors; [:, 0] takes the first column of the
# stored label arrays so y and ytest are 1-D.
# NOTE: this rebinds X, which an earlier cell used for a single email's features.
X = train['X']
y = train['y'][:, 0]
Xtest = test['Xtest']
ytest = test['ytest'][:, 0]
print('%s %s' % (X.shape, Xtest.shape))

(4000, 1899) (1000, 1899)


In [10]:
from sklearn.svm import SVC

# Train a linear-kernel SVM on the training set.
clf = SVC(kernel='linear', C=0.1)
clf.fit(X, y)

# Predict on both splits and report the share of correct predictions.
y_pred = clf.predict(X)
ytest_pred = clf.predict(Xtest)
train_accuracy = (y_pred == y).mean() * 100
test_accuracy = (ytest_pred == ytest).mean() * 100
print('train accuracy = %.2f%%' % train_accuracy)
print('test accuracy = %.2f%%' % test_accuracy)

train accuracy = 99.83%
test accuracy = 98.90%


## 2.4 Top Predictors for Spam

In [11]:
# Weights of the fitted linear SVM (one row, one entry per feature).
print(clf.coef_)

[[ 0.00793208  0.01563324  0.05546492 ..., -0.08670606 -0.00661274
   0.06506632]]


In [12]:
# Number of highest-weighted words to display.
n_top = 15

coef = clf.coef_[0]
# Feature positions ordered from the largest weight to the smallest.
top_indices = sorted(range(len(coef)), key=lambda idx: coef[idx], reverse=True)
# Reverse lookup: vocabulary index -> word.
inverse_table = dict((index, word) for word, index in vocab.items())
# Vocabulary indices start at 1 while array positions start at 0, hence + 1.
print([inverse_table[idx + 1] for idx in top_indices[:n_top]])

['our', 'click', 'remov', 'guarante', 'visit', 'basenumb', 'dollar', 'will', 'price', 'pleas', 'most', 'nbsp', 'lo', 'ga', 'hour']


## 2.5 Optional (ungraded) exercise: Try your own emails

In [13]:
# Sample emails to classify with the trained model.
sample_files = ('emailSample1.txt', 'emailSample2.txt',
                'spamSample1.txt', 'spamSample2.txt')

# One feature row per file; the row count follows the file list instead of
# being hard-coded. NOTE: this rebinds X (previously the training matrix).
X = np.zeros((len(sample_files), len(vocab)))
for i, filename in enumerate(sample_files):
    with open('../matlab/%s' % filename) as f:
        X[i, :] = email_features(f.read())

print(clf.predict(X))

[0 0 1 1]
