In [2]:
from __future__ import unicode_literals
import os
import codecs
import pandas as pd
import numpy as np
from email.parser import Parser

%matplotlib inline
from matplotlib import pyplot as plt

import seaborn as sns
sns.set(color_codes=True)

import regex as re

## I - Data Extraction

### Reading in of csv file

In [9]:
msg = pd.read_csv('message_df.csv', encoding='utf-8')

## II- Data Exploration & Feature Engineering

In [10]:
import spacy
import string
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS as stopwords 

nlp = spacy.load('en')
punc = string.punctuation

stopw = [x for x in stopwords]
stopw.extend(['what','when','who','why', 'X', 'article', 'thing', 'way'])

In [15]:
def clean_text(msg):
    msg = nlp(msg)
    tokens = [str(token.lemma_) for token in msg]
    tokens = [tok for tok in tokens if (tok not in stopw and tok not in punc)] 
    return tokens

## IV - Data Modelling

### Split dataset into training and testing
- Full text dataset
- Noun dataset

In [16]:
x = msg['Body']
y = msg['Category']

x_text = x.apply(clean_text).apply(str)
# x_noun = x.apply(clean_text).apply(str)

TypeError: Argument 'string' has incorrect type (expected unicode, got float)

In [38]:
pd.options.display.max_colwidth = -1
x

0        ['misc.entrepreneurs,misc.wanted,pnw.forsale,uw.pc.ibm,seattle.forsale,uw..forsale,misc.forsale,misc.forsale.computers.d,misc.forsale.computers.pc-clone,misc.forsale.coomputers.other,distribution', 'worldfollowup', 'from:yuri@atmos.washington.edureply-to', 'yuri@atmos.washington.eduorganization', 'subject', 'simms', 'sipp', 'mb', 'neededkeyword', 'simms', 'sipp', 'mb', 'price', 'offer', 'ave', 'home', 'work', 'fax)internet', 'yuri@atmos.washington.eduuucp', 'beaver!atmos.washington.edu!yuri']                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              

In [32]:
# split X and y into training and testing sets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)

### Vectorizer
 - CountVectorizer
 - TfidfVectorizer (norm ='l1') : Manhattan distance 
 - TfidfVectorizer (norm ='l2') : Euclidean norm
 
CountVectorizer: Convert a collection of text documents to a matrix of token counts. <br>

TfidfVectorizer normalizes its result : Equivalent to CountVectorizer followed by TfidfTransformer.

In [27]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect = TfidfVectorizer()

### Naive Bayes 
- Utilize Pipeline for cross-validation across different vectorizers

In [24]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
nb = MultinomialNB()

In [28]:
# create a pipeline of CountVectorizer and Naive Bayes
nb_count = make_pipeline(count_vect, nb)
nb_tfidf = make_pipeline(tfidf_vect, nb)

In [34]:
# cross_val splits data into training and testing 
# cross-validate the entire pipeline
# cv specifies the number of folds
from sklearn.model_selection import cross_val_score

print('Score for CountVectorizer and NB:')
print(cross_val_score(nb_count, x, y, cv=6, scoring='accuracy').mean())

Score for CountVectorizer and NB:
0.727165153378


In [35]:
print('Score for l1 and NB:')
print(cross_val_score(nb_tfidf, x, y, cv=6, scoring='accuracy').mean())

Score for l1 and NB:
0.756710521628


### K-Nearest-Neighbours (KNN)
 - **Non-parametric**: required no prior knowledge of the distribution of the data
 - **Instanace-based**: memorizes the training instance
 - Minimal training but expensive testing (memory + computational cost): classifying a given observation = run down of whole data set. **Not optimal for huge datasets**
 
 Cons: 
 - Can suffer from skewed class distributions when a certain class dominates the majority voting of the new example
 - Accuracy degrade with high-dimension data due to little difference between nearest and farthest neighbour

Improvement:
- skewed class distance: weighed voting
- changing distance metric: hamming distance for text

In [50]:
# use KNN with K=800
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()

In [51]:
knn_count = make_pipeline(count_vect, knn)
knn_tfidf = make_pipeline(tfidf_vect, knn)

In [131]:
print('Score for CountVectorizer and KNN:')
print(cross_val_score(knn_count, x, y, cv=6, scoring='accuracy').mean())

Score for CountVectorizer and KNN:
0.140066221256


In [130]:
print('Score for l1 and KNN:')
print(cross_val_score(knn_tfidf, x, y, cv=6, scoring='accuracy').mean())

Score for l1 and KNN:
0.0640616425167


### Support Vector Machine (SVM)

RBF kernel is usully for images, not for text.
Linear kernels ar egood for text and compute faster 

In [47]:
# use SVM with a linear kernel
from sklearn import svm
svm = svm.SVC()

In [48]:
svm_count = make_pipeline(count_vect, svm)
svm_tfidf = make_pipeline(tfidf_vect, svm)

In [138]:
print('Score for CountVectorizer and SVM:')
print(cross_val_score(svm_count, x, y, cv=6, scoring='accuracy').mean())

Score for CountVectorizer and SVM:
0.653541222005


In [40]:
print('Score for l1 and SVM:')
print(cross_val_score(svm_tfidf, x, y, cv=6, scoring='accuracy').mean())

Score for l1 and SVM:


KeyboardInterrupt: 

### GridSearchCV with Pipeline

- **GridSearchCV** is usually used to locate optimal tuning parameters by performing an "exhaustive grid search" of different parameter combinations to obtain the best cross-validated accuracy.

- Passing **Pipeline** to **GridSearchCV** allows us to optimize tuning parameters for both the vectorizer and model.

Tfidf:

   max_df is used for removing terms that appear too frequently, also known as "corpus-specific stop words". For example:

max_df = 0.50 means "ignore terms that appear in more than 50% of the documents".
max_df = 25 means "ignore terms that appear in more than 25 documents".
The default max_df is 1.0, which means "ignore terms that appear in more than 100% of the documents". Thus, the default setting does not ignore any terms.

min_df is used for removing terms that appear too infrequently. For example:

min_df = 0.01 means "ignore terms that appear in less than 1% of the documents".
min_df = 5 means "ignore terms that appear in less than 5 documents".
The default min_df is 1, which means "ignore terms that appear in less than 1 document". Thus, the default setting does not ignore any terms. 

In [34]:
from sklearn.model_selection import GridSearchCV

In [35]:
# Paramsfor countvectorizer 
param_grid_nb = {'countvectorizer__token_pattern':[r"\b\w\w+\b", r"'([a-z ]+)'"],
                 'multinomialnb__alpha':[0,0.025,0.05,0.1,0.3]}

param_grid_knn = {'countvectorizer__token_pattern':[r"\b\w\w+\b", r"'([a-z ]+)'"],
                  'kneighborsclassifier__n_neighbors':[10,200,500,800,1200],
                  'kneighborsclassifier__weights':['uniform','distance'],
                  'kneighborsclassifier__metric':['euclidean','minkowski']}

param_grid_svm = {'countvectorizer__token_pattern':[r"\b\w\w+\b", r"'([a-z ]+)'"],
                  'svc__C':[1.0,20.5,100.0,250.0,500.0],
                  'svc__tol':[0.0001,0.001,0.01,0.1,0.011]}

In [39]:
nb_tfidf.get_params().keys()

['tfidfvectorizer__lowercase',
 'tfidfvectorizer__vocabulary',
 'tfidfvectorizer__dtype',
 'tfidfvectorizer__binary',
 'tfidfvectorizer__encoding',
 'tfidfvectorizer__decode_error',
 'tfidfvectorizer__stop_words',
 'tfidfvectorizer__norm',
 'multinomialnb',
 'tfidfvectorizer__max_features',
 'tfidfvectorizer__use_idf',
 'tfidfvectorizer__preprocessor',
 'tfidfvectorizer__tokenizer',
 'multinomialnb__class_prior',
 'tfidfvectorizer__strip_accents',
 'tfidfvectorizer__analyzer',
 'tfidfvectorizer',
 'tfidfvectorizer__max_df',
 'tfidfvectorizer__smooth_idf',
 'multinomialnb__fit_prior',
 'tfidfvectorizer__input',
 'tfidfvectorizer__sublinear_tf',
 'tfidfvectorizer__min_df',
 'multinomialnb__alpha',
 'tfidfvectorizer__ngram_range',
 'tfidfvectorizer__token_pattern',
 'steps']

In [44]:
# Params for tfidfvectorizer
param_grid_nb_tfidf = {'tfidfvectorizer__token_pattern':[r"\b\w\w+\b", r"'([a-z ]+)'"],
                       'multinomialnb__alpha':[0,0.025,0.05,0.1,0.3],
                       'tfidfvectorizer__norm':['l1','l2'],
                       'tfidfvectorizer__min_df':[0.0,0.03,0.1],
                       'tfidfvectorizer__use_idf':[True,False]}


param_grid_knn_tfidf = {'tfidfvectorizer__token_pattern':[r"\b\w\w+\b", r"'([a-z ]+)'"],
                        'kneighborsclassifier__n_neighbors':[10,200,500,800,1200],
                        'kneighborsclassifier__weights':['uniform','distance'],
                        'kneighborsclassifier__metric':['euclidean','minkowski'],
                        'tfidfvectorizer__norm':['l1','l2'],
                       'tfidfvectorizer__min_df':[10,15.5,20]}

param_grid_svm_tfidf = {'tfidfvectorizer__token_pattern':[r"\b\w\w+\b", r"'([a-z ]+)'"],
                        'svc__C':[1.0,20.5,100.0,250.0,500.0],
                        'svc__tol':[0.0001,0.001,0.01,0.1,0.011],
                        'tfidfvectorizer__norm':['l1','l2'],
                       'tfidfvectorizer__min_df':[10,15.5,20]}

In [37]:
# Tfidf Naive Bayes
grid_nb_tfidf = GridSearchCV(nb_tfidf, param_grid_nb_tfidf, cv=5, scoring='accuracy')
grid_nb_tfidf.fit(x_train,y_train)

print(grid_nb_tfidf.best_score_)
print(grid_nb_tfidf.best_params_)

  self.feature_log_prob_ = (np.log(smoothed_fc) -


0.770354070814
{u'tfidfvectorizer__token_pattern': u'\\b\\w\\w+\\b', u'tfidfvectorizer__norm': u'l2', u'multinomialnb__alpha': 0.05, u'tfidfvectorizer__min_df': 0.0, u'tfidfvectorizer__use_idf': True}


In [5]:
# CountVectorizer Naive Bayes
grid_nb_count = GridSearchCV(nb_count, param_grid_nb, cv=5, scoring='accuracy')
%time grid_nb_count.fit(x_train,y_train)

print(grid_nb_count.best_score_)
print(grid_nb_count.best_params_)

NameError: name 'nb_count' is not defined

In [53]:
"""Takes forever"""
# Tfidf Knn
grid_knn_tfidf = GridSearchCV(knn_tfidf, param_grid_knn_tfidf, cv=5, scoring='accuracy')
grid_knn_tfidf.fit(x_train,y_train)

print(grid_knn_tfidf.best_score_)
print(grid_knn_tfidf.best_params_)

KeyboardInterrupt: 

In [203]:
# CountVectorizer Knn
grid_knn_count = GridSearchCV(knn_count, param_grid_knn, cv=5, scoring='accuracy')
grid_knn_count.fit(x_train,y_train)
print(grid_knn_count.best_score_)
print(grid_knn_count.best_params_)

0.428964344652
{u'countvectorizer__token_pattern': u"'([a-z ]+)'", u'kneighborsclassifier__n_neighbors': 10, u'kneighborsclassifier__weights': u'distance'}


In [None]:
# Tfidf SVM
grid_svm_tfidf = GridSearchCV(svm_tfidf, param_grid_svm_tfidf, cv=5, scoring='accuracy')
grid_svm_tfidf.fit(x_train,y_train)

print(grid_svm_tfidf.best_score_)
print(grid_svm_tfidf.best_params_)

In [None]:
# CountVectorizer SVM
grid_svm_count = GridSearchCV(svm_count, param_grid_svm, cv=5, scoring='accuracy')
grid_svm_count.fit(x_train,y_train)
print(grid_svm_count.best_score_)
print(grid_svm_count.best_params_)

### RandomizedSearchCV with Pipeline
- It is too computationally infeasible to search all possible combination of parameter values.
- **RandomizedSearchCV** searches a sample of the parameter values with control on computational "budget".

In [42]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Tfidf Naive Bayes
random_nb_tfidf = RandomizedSearchCV(nb_tfidf, param_grid_nb_tfidf, cv=5, scoring='accuracy', n_iter=5, random_state=1)
random_nb_tfidf.fit(x_train,y_train)
print(random_nb_tfidf.best_score_)
print(random_nb_tfidf.best_params_)

0.756484630259
{u'tfidfvectorizer__token_pattern': u'\\b\\w\\w+\\b', u'tfidfvectorizer__norm': u'l2', u'multinomialnb__alpha': 0.1, u'tfidfvectorizer__min_df': 0.0, u'tfidfvectorizer__use_idf': False}


In [None]:
# Tfidf SVM
random_svm_tfidf = RandomizedSearchCV(svm_tfidf, param_grid_svm_tfidf, cv =5, scoring='accuracy', n_iter=5, random_state=1)
random_svm_tfidf.fit(x_train,y_train)
print(random_svm_tfidf.best_score_)
print(random_svm_tfidf.best_params_)

### Ensembling models

Combining of several predictive models to produce a combined model that is better than any individual model.
- **Regression:** average the predictions made by the individual models
- **Classification:** let the models "vote" and use the most common prediction, or average the predicted probabilities

For ensembling to work well, the models must have the following characteristics:

- **Accurate:** they outperform the null model
- **Independent:** their predictions are generated using different "processes", such as:
    - different types of models
    - different features
    - different tuning parameters

### Null Model

- For **classification** problems, the null model always predicts the most frequent class from the training data.
- For **regression problems**, the null model always predicts the mean of the response value from the training data.
- It can be a useful **baseline model** against which our model is measured.

In [146]:
# calculate null model accuracy
y_test.value_counts().head(1)/y_test.shape

talk.politics.misc    0.0586
Name: Category, dtype: float64