## Programming Exercise 6: Support Vector Machines


In [182]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.io 
from sklearn import svm 
import re 
import pandas as pd
from stemming.porter2 import stem
import nltk, nltk.stem.porter
import pandas as pd

## 2.1 Preprocessing Emails

In [39]:
email = ''
print("emailSample1.txt:")
email_sample = open("ex6/emailSample1.txt")
print(email_sample.read())


emailSample1.txt:
> Anyone knows how much it costs to host a web portal ?
>
Well, it depends on how many visitors you're expecting.
This can be anywhere from less than 10 bucks a month to a couple of $100. 
You should checkout http://www.rackspace.com/ or perhaps Amazon EC2 
if youre running something big..

To unsubscribe yourself from this mailing list, send an email to:
groupname-unsubscribe@egroups.com




In [86]:
def preProcess( email ):
    email = email.lower()
    email = re.sub("<[^<>]+>", "", email)
    email = re.sub("(http|https)://[^\s]*", "httpaddr", email)
    email = re.sub("[^\s]+@[^\s]+", 'emailaddr', email)
    email = re.sub("[\d]+", "number", email)
    email = re.sub("[\$]+", "dollar", email)
    return email

In [87]:
with open("ex6/emailSample2.txt") as f:
    print(preProcess(f.read()))

folks,
 
my first time posting - have a bit of unix experience, but am new to linux.

 
just got a new pc at home - dell box with windows xp. added a second hard disk
for linux. partitioned the disk and have installed suse number.number from cd, which went
fine except it didn't pick up my monitor.
 
i have a dell branded enumberfpp number" lcd flat panel monitor and a nvidia geforcenumber
tinumber video card, both of which are probably too new to feature in suse's default
set. i downloaded a driver from the nvidia website and installed it using rpm.
then i ran saxnumber (as was recommended in some postings i found on the net), but
it still doesn't feature my video card in the available list. what next?
 
another problem. i have a dell branded keyboard and if i hit caps-lock twice,
the whole machine crashes (in linux, not windows) - even the on/off switch is
inactive, leaving me to reach for the power cable instead.
 
if anyone can help me in any way with these probs., i'd be really gra

In [90]:
help(nltk.stem.porter.PorterStemmer())

Help on PorterStemmer in module nltk.stem.porter object:

class PorterStemmer(nltk.stem.api.StemmerI)
 |  PorterStemmer(mode='NLTK_EXTENSIONS')
 |  
 |  A word stemmer based on the Porter stemming algorithm.
 |  
 |      Porter, M. "An algorithm for suffix stripping."
 |      Program 14.3 (1980): 130-137.
 |      
 |  See http://www.tartarus.org/~martin/PorterStemmer/ for the homepage
 |  of the algorithm.
 |      
 |  Martin Porter has endorsed several modifications to the Porter
 |  algorithm since writing his original paper, and those extensions are
 |  included in the implementations on his website. Additionally, others
 |  have proposed further improvements to the algorithm, including NLTK
 |  contributors. There are thus three modes that can be selected by
 |  passing the appropriate constant to the class constructor's `mode`
 |  attribute:
 |  
 |      PorterStemmer.ORIGINAL_ALGORITHM
 |      - Implementation that is faithful to the original paper.
 |      
 |        Note that M

In [114]:
def email2TokenList( raw_email ):
    stemmer = nltk.stem.porter.PorterStemmer()
    raw_email = preProcess(raw_email)
    tokens = re.split("[ \`\~\!\@\#\$\%\^\&\*\(\)\-\_\=\+\[\]\{\}\;\:\'\"\|\\\,\<\>\.\/\?\s]", raw_email)
    tokenlist = []
    for token in tokens:
        token = re.sub("^a-zA-Z0-9", "", token)
        stemmed = stemmer.stem(token)
        if len(token) == 0:
            continue
        tokenlist.append(stemmed)
    return tokenlist

In [146]:
raw_email = open("ex6/emailSample1.txt", 'r')
#raw_email = email2TokenList(raw_email.read())
raw_email = raw_email.read()

## 2.1.1 Vocabulary List


In [138]:
def getVocabDict(reverse = False):
    vocab_dict = dict()
    with open("ex6\\vocab.txt") as f:
        for line in f:
            value, key = line.split()
            if not reverse:
                vocab_dict[key] = int(value)
            else:
                vocab_dict[int(value)] = key
    return vocab_dict

In [147]:
vocab_dict = getVocabDict(False)
vocab_dict['emailaddr']

531

In [148]:
def email2VocabIndices(raw_email, vocab_dict):
    token_list = email2TokenList(raw_email)
    vocab_indices = [vocab_dict[token] for token in token_list if token in vocab_dict]
    return vocab_indices

In [149]:
email2VocabIndices(raw_email, vocab_dict )


[86,
 916,
 794,
 1077,
 883,
 370,
 1699,
 790,
 1822,
 1831,
 883,
 431,
 1171,
 794,
 1002,
 1893,
 1364,
 592,
 1676,
 238,
 162,
 89,
 688,
 945,
 1663,
 1120,
 1062,
 1699,
 375,
 1162,
 479,
 1893,
 1510,
 799,
 1182,
 1237,
 810,
 1895,
 1440,
 1547,
 181,
 1699,
 1758,
 1896,
 688,
 1676,
 992,
 961,
 1477,
 71,
 530,
 1699,
 531]

## 2.2 Extracting Features from Emails


In [162]:
def email2FeatureVector(raw_email, vocab_dict):
    n = len(vocab_dict)
    vocab_list = np.zeros((n, 1))
    vocab_indices = email2VocabIndices(raw_email, vocab_dict)
    for vocab in vocab_indices:
        #print(vocab)
        vocab_list[vocab] = 1
    return vocab_list

In [163]:
len(email2FeatureVector(raw_email, vocab_dict ))

1899

In [166]:
vocab_dict = getVocabDict()
email_contents = open("ex6\\emailSample1.txt").read()
feature_vector = email2FeatureVector(email_contents, vocab_dict)

print("Length of feature vector: {}".format(len(feature_vector)))
print("Number of 1's entries: {}".format(sum(feature_vector==1)[0]))

Length of feature vector: 1899
Number of 1's entries: 45


## 2.3 Training SVM for spam classification

In [168]:
datafile = 'ex6/spamTrain.mat'
mat = scipy.io.loadmat( datafile )
X, y = mat['X'], mat['y']
#X =     np.insert(X    ,0,1,axis=1)

datafile = 'ex6/spamTest.mat'
mat = scipy.io.loadmat( datafile )
Xtest, ytest = mat['Xtest'], mat['ytest']

In [171]:
pos = np.array([X[i] for i in range(X.shape[0]) if y[i] == 1])
neg = np.array([X[i] for i in range(X.shape[0]) if y[i] == 0])
print("Total number of training samples: {}".format(X.shape[0]))
print("Number of training spam emails: {}".format(pos.shape[0]))
print("Number of training nonspam emails: {}".format(neg.shape[0]))

Total number of training samples: 4000
Number of training spam emails: 1277
Number of training nonspam emails: 2723


In [170]:
linear_svm = svm.SVC(C=0.1, kernel='linear')

linear_svm.fit(X, y.flatten())

SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [178]:
train_prediction = linear_svm.predict(X).reshape((y.shape[0], 1))
train_acc = 100 * np.sum(train_prediction==y) / y.shape[0]
print("Training Accuracy: ", train_acc)

test_prediction = linear_svm.predict(Xtest).reshape((ytest.shape[0], 1))
test_acc = 100 * np.sum(test_prediction==ytest) / ytest.shape[0]
print("Test Accuracy: ", test_acc)

Training Accuracy:  99.825
Test Accuracy:  98.9


## Top predictors for Spam

In [213]:
vocab_dict_flipped = getVocabDict(reverse=True)

sorted_indices = np.argsort( linear_svm.coef_, axis=None )[::-1]

most_important_features = pd.DataFrame([ vocab_dict_flipped[x] for x in sorted_indices[:15] ], columns=["Most Important"])

least_important_features = pd.DataFrame([ vocab_dict_flipped[x] for x in sorted_indices[-15:] ], columns=["Least Important"])

print(most_important_features)
print("\n")
print(least_important_features)
print("\n")
# Most common word (mostly to debug):
most_common_word = vocab_dict_flipped[sorted_indices[0]]
print('# of spam containing \"%s\" = %d/%d = %0.2f%%'% \
    (most_common_word, sum(pos[:,1190]),pos.shape[0],  \
     100.*float(sum(pos[:,1190]))/pos.shape[0]))
print('# of NON spam containing \"%s\" = %d/%d = %0.2f%%'% \
    (most_common_word, sum(neg[:,1190]),neg.shape[0],      \
     100.*float(sum(neg[:,1190]))/neg.shape[0]))

   Most Important
0        otherwis
1         clearli
2           remot
3              gt
4            visa
5            base
6           doesn
7            wife
8        previous
9          player
10        mortgag
11          natur
12             ll
13          futur
14            hot


   Least Important
0             http
1             toll
2               xp
3            ratio
4           august
5       unsubscrib
6          useless
7         numberth
8            round
9            linux
10         datapow
11           wrong
12          urgent
13            that
14            spam


# of spam containing "otherwis" = 804/1277 = 62.96%
# of NON spam containing "otherwis" = 301/2723 = 11.05%
