In [1]:
%pylab inline

from __future__ import print_function
from __future__ import division

import pandas
import numpy
import matplotlib
import seaborn
import sklearn
import sys

from matplotlib import pyplot

Populating the interactive namespace from numpy and matplotlib


In [2]:
def index_merge(a, b):
    return a.merge(b, left_index = True, right_index = True)
pandas.DataFrame.index_merge = index_merge

# Limiting memory use

In [3]:
import resource

# Limit the memory usage to 6GByte.
resource.setrlimit(resource.RLIMIT_AS, (6e9, 6e9))

# Merging

In [47]:
subject_words = pandas.read_csv('dev/subject_word_features.csv', index_col = 'num')
body_words = pandas.read_csv('dev/body_word_features.csv', index_col = 'num')
features = features.index_merge(subject_words).index_merge(body_words)

In [None]:
features.to_csv('dev/features.csv', index = True, header = True)

# Getting Features

In [4]:
features = pandas.read_csv('dev/features.csv', index_col = 'num').sample(frac = 1, random_state = 0)

# Testing different methods

In [5]:
from sklearn.cross_validation import train_test_split, cross_val_score, cross_val_predict
from sklearn.tree import DecisionTreeClassifier

In [6]:
X = features.drop('spam', axis = 1)
y = features.spam
%xdel features

In [7]:
word_bag = X.columns[X.columns.str.contains('_contains_')]
existence = X.columns[X.columns.str.contains('_exists')]
categorization = X.columns[X.columns.str.contains('=')]

features = ['_length', '_words', '_fields', '_avgWordLength', '_avgFieldLength']
header_features = X.columns[reduce(lambda x, y: x | y, map(X.columns.str.contains, features))]

assert (X.columns ^ (word_bag | header_features | existence | categorization)).size == 0, 'Not all features were categorized!'

In [8]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)

In [9]:
folds = 10

In [10]:
def data(a):
    # print(u'µ = {}\nσ = {}\nmin = {}\nmax = {}\nmedian = {}'.format(a.mean(), a.std(), a.min(), a.max(), numpy.median(a)))
    print(pandas.Series(a).describe(percentiles = [.5]).drop('count'))
    return a

## Decision Tree

### As large as possible

In [10]:
# This takes WAY too long!

dt = DecisionTreeClassifier(random_state = 0)
data(cross_val_score(dt, X, y, cv = folds))

KeyboardInterrupt: 

### A little more comprehensive

In [16]:
dt = DecisionTreeClassifier(max_features = 'sqrt', max_depth = 5, random_state = 0)
data(cross_val_score(dt, X, y, cv = folds))

mean    0.973728
std     0.002218
min     0.971852
50%     0.972716
max     0.978395
dtype: float64


array([ 0.97839506,  0.97308642,  0.97234568,  0.97320988,  0.97234568,
        0.97185185,  0.97679012,  0.97222222,  0.97234568,  0.97469136])

In [19]:
sklearn.tree.export_graphviz(
    dt.tree_,
    feature_names = X.columns,
    class_names = ['ham', 'spam'],
    filled = True,
    proportion = True
)

AttributeError: 'NoneType' object has no attribute 'tree_'

## Random Forest

In [12]:
from sklearn.ensemble import RandomForestClassifier

In [14]:
rf = RandomForestClassifier(max_depth = 15)
data(cross_val_score(rf, X, y, cv = folds))

mean    0.993148
std     0.001312
min     0.991111
50%     0.993210
max     0.995432
dtype: float64


array([ 0.99382716,  0.99469136,  0.99259259,  0.99197531,  0.99197531,
        0.99111111,  0.9954321 ,  0.99345679,  0.99296296,  0.99345679])

## Naïve Bayes

In [30]:
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB

### Gaussian

In [26]:
nb = GaussianNB()
data(cross_val_score(nb, X, y, cv = folds))

mean    0.914012
std     0.002359
min     0.911111
50%     0.913642
max     0.917407
dtype: float64


array([ 0.91185185,  0.91358025,  0.91148148,  0.91740741,  0.91283951,
        0.91432099,  0.91111111,  0.91641975,  0.9137037 ,  0.91740741])

### Multinomial

In [31]:
nb = MultinomialNB(alpha = 1, fit_prior = False)
data(cross_val_score(nb, X, y, cv = folds))

mean    0.903457
std     0.002926
min     0.898642
50%     0.903025
max     0.908395
dtype: float64


array([ 0.90246914,  0.90061728,  0.89864198,  0.90530864,  0.9037037 ,
        0.90320988,  0.90283951,  0.90839506,  0.90209877,  0.90728395])

### Bernoulli

In [63]:
nb = BernoulliNB(alpha = 1, fit_prior = True)
data(cross_val_score(nb, X, y, cv = folds))

mean    0.951210
std     0.001046
min     0.949877
50%     0.951049
max     0.953086
dtype: float64


array([ 0.95012346,  0.95271605,  0.95148148,  0.95308642,  0.95061728,
        0.95135802,  0.95148148,  0.94987654,  0.95061728,  0.95074074])

## Neighbours

In [12]:
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid

### K Nearest Neighbors

In [66]:
kn = KNeighborsClassifier(5, weights = 'uniform')
data(cross_val_score(kn, X, y, cv = folds))

KeyboardInterrupt: 

In [18]:
kn = KNeighborsClassifier(5, weights = 'uniform')


In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [19]:
kn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [72]:
%timeit kn.score(X_test, y_test)

1 loop, best of 3: 52.8 s per loop


In [15]:
kn4 = KNeighborsClassifier(4, weights = 'uniform')
# data(cross_val_score(kn, X, y, cv = folds))

In [16]:
kn4.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=4, p=2,
           weights='uniform')

In [17]:
%timeit kn4.score(X_test, y_test)

1 loop, best of 3: 53 s per loop


In [20]:
kn.score(X_test, y_test)

0.95703703703703702

In [21]:
kn4.score(X_test, y_test)

0.95772839506172835

### Nearest Centroid

In [17]:
nc = NearestCentroid()
data(cross_val_score(nc, X, y, cv = folds))

mean    0.471185
std     0.003756
min     0.463951
50%     0.471173
max     0.476420
dtype: float64


array([ 0.47358025,  0.46395062,  0.47111111,  0.46703704,  0.47123457,
        0.46987654,  0.47641975,  0.47395062,  0.46987654,  0.47481481])

## Support Vector Machines

In [11]:
from sklearn.svm import SVC, LinearSVC

### Linear kernel

In [12]:
lsvc = LinearSVC()
lsvc.fit(X_train, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [13]:
lsvc.score(X_test, y_test)

0.97511111111111115

In [14]:
primalsvc = LinearSVC(dual = False).fit(X_train, y_train)
primalsvc.score(X_test, y_test)

0.9790123456790123

In [12]:
data(cross_val_score(LinearSVC(dual = False), X, y, cv = folds))

mean    0.968210
std     0.002235
min     0.964938
50%     0.968333
max     0.973457
dtype: float64


array([ 0.96851852,  0.96703704,  0.96691358,  0.96876543,  0.9691358 ,
        0.96666667,  0.97345679,  0.96839506,  0.96493827,  0.9682716 ])

### Pipelining some normalization

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, Normalizer, FunctionTransformer

In [13]:
pipe = Pipeline([
    ('scale', StandardScaler()),
    ('vector', LinearSVC(dual = False))
])
pipe.fit(X_train, y_train)

NameError: name 'X_train' is not defined

In [19]:
pipe.score(X_test, y_test)

0.99244444444444446

In [22]:
pipe = Pipeline([
    ('transform', FunctionTransformer(numpy.log1p)),
    ('scale', StandardScaler()),
    ('vector', LinearSVC(dual = False))
])
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

0.99190123456790125

### Estimating C

In [13]:
from sklearn.grid_search import GridSearchCV

In [14]:
estimator = Pipeline([
    ('scale', StandardScaler()),
    ('vector', LinearSVC(dual = False, random_state = 0))
])

gs = GridSearchCV(
    estimator,
    {
        'vector__penalty': ['l2'],
        'vector__C': [.1, 1, 10]
    },
    cv = 3,
    refit = False,
    verbose = 2
)

In [15]:
gs.fit(X, y)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] vector__penalty=l2, vector__C=0.1 ...............................
[CV] ...................... vector__penalty=l2, vector__C=0.1 -  19.6s
[CV] vector__penalty=l2, vector__C=0.1 ...............................
[CV] ...................... vector__penalty=l2, vector__C=0.1 -  23.4s
[CV] vector__penalty=l2, vector__C=0.1 ...............................
[CV] ...................... vector__penalty=l2, vector__C=0.1 -  24.2s
[CV] vector__penalty=l2, vector__C=1 .................................
[CV] ........................ vector__penalty=l2, vector__C=1 -  35.9s
[CV] vector__penalty=l2, vector__C=1 .................................
[CV] ........................ vector__penalty=l2, vector__C=1 -  31.9s
[CV] vector__penalty=l2, vector__C=1 .................................
[CV] ........................ vector__penalty=l2, vector__C=1 -  39.5s
[CV] vector__penalty=l2, vector__C=10 ................................
[CV] .............

[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  5.6min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('vector', LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,
     verbose=0))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'vector__penalty': ['l2'], 'vector__C': [0.1, 1, 10]},
       pre_dispatch='2*n_jobs', refit=False, scoring=None, verbose=2)

In [16]:
gs.best_params_

{'vector__C': 10, 'vector__penalty': 'l2'}

In [19]:
gs.grid_scores_

[mean: 0.99226, std: 0.00040, params: {'vector__penalty': 'l2', 'vector__C': 0.1},
 mean: 0.99236, std: 0.00025, params: {'vector__penalty': 'l2', 'vector__C': 1},
 mean: 0.99238, std: 0.00025, params: {'vector__penalty': 'l2', 'vector__C': 10}]

In [None]:
estimator = Pipeline([
    ('scale', StandardScaler()),
    ('vector', LinearSVC(dual = False, random_state = 0))
])

gs2 = GridSearchCV(
    estimator,
    {
        'vector__penalty': ['l1'],
        'vector__C': [1, 10, 25]
    },
    cv = 3,
    refit = False,
    verbose = 2
)
gs2.fit(X, y)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] vector__penalty=l1, vector__C=1 .................................


### Polynomial kernel

In [None]:
psvc = SVC(kernel = 'poly')
psvc.fit(X_train, y_train)

In [None]:
psvc.score(X_test, y_test)

## Voting Classifier

In [11]:
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid

### With a Decision Tree

In [15]:
vc = VotingClassifier(
    [
        ('tree', DecisionTreeClassifier(max_features = 'sqrt', max_depth = 5)),
        ('bayes', BernoulliNB()),
        # ('vector', SVC(kernel = 'linear', probability = True)),
        ('neighbours', KNeighborsClassifier(5, weights = 'uniform'))
    ],
    voting = 'soft'
)
vc.fit(X_train, y_train)

VotingClassifier(estimators=[('tree', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features='sqrt', max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')), ('ba...owski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'))],
         voting='soft', weights=None)

In [16]:
vc.score(X_test, y_test)

0.97990123456790124

### With a Random Forest

In [25]:
vc = VotingClassifier(
    [
        ('forest', RandomForestClassifier(n_estimators = 40)),
        ('bayes', BernoulliNB()),
        # ('neighbours', KNeighborsClassifier(5, weights = 'uniform'))
    ],
    voting = 'soft'
)
vc.fit(X_train, y_train)

VotingClassifier(estimators=[('forest', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=40, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)), ('bayes', BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True))],
         voting='soft', weights=None)

In [26]:
vc.score(X_test, y_test)

0.97851851851851857

In [24]:
rf = RandomForestClassifier(n_estimators = 40, criterion = 'gini')
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=40, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [23]:
rf.score(X_test, y_test)

0.99683950617283945

# Principal Value Decomposition

In [12]:
from sklearn.decomposition import PCA, RandomizedPCA

In [None]:
pca = PCA(5)
fp = pca.fit_transform(X)

In [20]:
X.shape[1]

1068

In [13]:
rpca = RandomizedPCA(numpy.sqrt(X.shape[1]))
fpr = rpca.fit_transform(X)

In [17]:
rpca.explained_variance_ratio_

array([  9.72033839e-01,   2.35379236e-02,   3.94438650e-03,
         3.50238824e-04,   9.39514866e-05])

In [None]:
fpr_train, fpr_test, y_train, y_test = train_test_split(fpr, y)
SVC(kernel = 'poly').fit(fpr_train, y_train).score(fpr_test, y_test)

# Whitespace

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

