## Play with complete dataset from uci.edu

The objective of this is to better my learning with the complete datasest that is got from uci.edu archieves

In [1]:
import io
import numpy as np
import pandas as pd
import codecs # For file reading
from IPython.display import FileLink # For displaying file link in cell-output

## Read raw source and save it as csv file

In [None]:
# How do the lines look like in the target file?
!head -n 10 ./data/uci_edu/SMSSpamCollection.txt

In [None]:
# Read input source
with codecs.open('./data/uci_edu/SMSSpamCollection.txt') as f:
    labels, messages = zip(*[line.split('\t', maxsplit=2) for line in f.readlines()])

In [None]:
# Persist input source as CSV file
df = pd.DataFrame(data={ 'Message' : messages, 'Label' : labels})
df.to_csv('data/uci_edu/{}'.format('SMSSpamCollection.csv'), index=False)
FileLink('data/uci_edu/{}'.format('SMSSpamCollection.csv'))
df.head()

## Shuffle-N-Split input source for Train-N-Test

In [None]:
df.isnull().sum() # Basic sanity check for null values

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(n_splits=2, random_state=42, train_size=.7)
splits = sss.split(df, df['Label'])
train_indices, test_indices = [*splits][0]

In [None]:
print('Total Count = ', df.shape[0])
print()

train_mask = df.index.isin(train_indices)
train = df[train_mask]
print('Train Count = ', train.shape[0])
print('Train value counts :\n', train['Label'].value_counts())
print()

test = df[~train_mask]
print('Test Count = ', test.shape[0])
print('Test value counts :\n', test['Label'].value_counts())

In [None]:
train.to_csv('data/uci_edu/train.csv', index_label=['Id'])
test.to_csv('data/uci_edu/test.csv', index_label=['Id'])

## Feature Extraction and Data Modeling

In [2]:
train = pd.read_csv('data/uci_edu/train.csv', index_col='Id')
test = pd.read_csv('data/uci_edu/test.csv', index_col='Id')

In [3]:
# Vectorizer or Feature Extractors
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer

# Classifiers
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import RidgeClassifier, RidgeClassifierCV, PassiveAggressiveClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.calibration import CalibratedClassifierCV

  from numpy.core.umath_tests import inner1d


In [4]:
def vectorize_fit_score(vectorizer, classifier, train_df, test_df):
    desc = '{0} and {1}'.format(vectorizer.__class__.__name__, classifier.__class__.__name__)
#     print('Processing with {}'.format(desc))
    
    # 1. vectorize and fit
    vectorized_text = vectorizer.fit_transform(train_df['Message'])
    classifier.fit(vectorized_text, train_df['Label'])
    # 2. vectorize and score
    vectorized_text = vectorizer.transform(test_df['Message']) # Not `fit_transform` but just `transform`
    score = classifier.score(vectorized_text, test_df['Label'])
    
    return [desc, score]

In [None]:
results = []
results.append(vectorize_fit_score(CountVectorizer(), BernoulliNB(), train, test))
results.append(vectorize_fit_score(TfidfVectorizer(), BernoulliNB(), train, test))
results.append(vectorize_fit_score(HashingVectorizer(), BernoulliNB(), train, test))
results.append(vectorize_fit_score(CountVectorizer(), MultinomialNB(), train, test))
results.append(vectorize_fit_score(TfidfVectorizer(), MultinomialNB(), train, test))
results.append(vectorize_fit_score(HashingVectorizer(non_negative=True), MultinomialNB(), train, test))
results

In [6]:
import itertools

vectorizers = [ 
    CountVectorizer(), 
    TfidfVectorizer(), 
    HashingVectorizer(alternate_sign=False)
]

classifiers = [
    DummyClassifier(), # Just for fun :)
    BernoulliNB(), 
    MultinomialNB(),
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    ExtraTreeClassifier(),
    ExtraTreesClassifier(),
    BaggingClassifier(),    
    RandomForestClassifier(n_estimators=100, n_jobs=-1), 
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    RidgeClassifier(),
    RidgeClassifierCV(),
    PassiveAggressiveClassifier(max_iter=100),
    SGDClassifier(max_iter=100),
    LogisticRegression(),
    OneVsRestClassifier(SVC(kernel='linear')),
    OneVsRestClassifier(LogisticRegression()),    
    CalibratedClassifierCV(),    
]

# cvs = [(c,v) for c in classifiers for v in vectorizers]
# Doing it lazily with generators
cvs = itertools.product(classifiers, vectorizers)
results = []
for cv in cvs:
    res = vectorize_fit_score(cv[1], cv[0], train, test)
    results.append(res)
results    

Processing with CountVectorizer and DummyClassifier
Processing with TfidfVectorizer and DummyClassifier
Processing with HashingVectorizer and DummyClassifier




Processing with CountVectorizer and BernoulliNB
Processing with TfidfVectorizer and BernoulliNB
Processing with HashingVectorizer and BernoulliNB
Processing with CountVectorizer and MultinomialNB




Processing with TfidfVectorizer and MultinomialNB
Processing with HashingVectorizer and MultinomialNB




Processing with CountVectorizer and KNeighborsClassifier
Processing with TfidfVectorizer and KNeighborsClassifier
Processing with HashingVectorizer and KNeighborsClassifier




Processing with CountVectorizer and DecisionTreeClassifier
Processing with TfidfVectorizer and DecisionTreeClassifier
Processing with HashingVectorizer and DecisionTreeClassifier




Processing with CountVectorizer and ExtraTreeClassifier
Processing with TfidfVectorizer and ExtraTreeClassifier
Processing with HashingVectorizer and ExtraTreeClassifier




Processing with CountVectorizer and ExtraTreesClassifier
Processing with TfidfVectorizer and ExtraTreesClassifier
Processing with HashingVectorizer and ExtraTreesClassifier




Processing with CountVectorizer and BaggingClassifier
Processing with TfidfVectorizer and BaggingClassifier
Processing with HashingVectorizer and BaggingClassifier




Processing with CountVectorizer and RandomForestClassifier
Processing with TfidfVectorizer and RandomForestClassifier
Processing with HashingVectorizer and RandomForestClassifier




Processing with CountVectorizer and AdaBoostClassifier
Processing with TfidfVectorizer and AdaBoostClassifier
Processing with HashingVectorizer and AdaBoostClassifier




Processing with CountVectorizer and GradientBoostingClassifier
Processing with TfidfVectorizer and GradientBoostingClassifier
Processing with HashingVectorizer and GradientBoostingClassifier




Processing with CountVectorizer and RidgeClassifier
Processing with TfidfVectorizer and RidgeClassifier
Processing with HashingVectorizer and RidgeClassifier




Processing with CountVectorizer and RidgeClassifierCV
Processing with TfidfVectorizer and RidgeClassifierCV
Processing with HashingVectorizer and RidgeClassifierCV




Processing with CountVectorizer and PassiveAggressiveClassifier
Processing with TfidfVectorizer and PassiveAggressiveClassifier
Processing with HashingVectorizer and PassiveAggressiveClassifier




Processing with CountVectorizer and SGDClassifier
Processing with TfidfVectorizer and SGDClassifier
Processing with HashingVectorizer and SGDClassifier




Processing with CountVectorizer and LogisticRegression
Processing with TfidfVectorizer and LogisticRegression
Processing with HashingVectorizer and LogisticRegression




Processing with CountVectorizer and OneVsRestClassifier
Processing with TfidfVectorizer and OneVsRestClassifier
Processing with HashingVectorizer and OneVsRestClassifier




Processing with CountVectorizer and OneVsRestClassifier
Processing with TfidfVectorizer and OneVsRestClassifier
Processing with HashingVectorizer and OneVsRestClassifier




Processing with CountVectorizer and CalibratedClassifierCV
Processing with TfidfVectorizer and CalibratedClassifierCV
Processing with HashingVectorizer and CalibratedClassifierCV




[['CountVectorizer and DummyClassifier', 0.7435744172145846],
 ['TfidfVectorizer and DummyClassifier', 0.7567244471010162],
 ['HashingVectorizer and DummyClassifier', 0.7686790197250448],
 ['CountVectorizer and BernoulliNB', 0.9778840406455469],
 ['TfidfVectorizer and BernoulliNB', 0.9778840406455469],
 ['HashingVectorizer and BernoulliNB', 0.8661087866108786],
 ['CountVectorizer and MultinomialNB', 0.9844590555887627],
 ['TfidfVectorizer and MultinomialNB', 0.9605499103407054],
 ['HashingVectorizer and MultinomialNB', 0.8852361028093245],
 ['CountVectorizer and KNeighborsClassifier', 0.9252839210998207],
 ['TfidfVectorizer and KNeighborsClassifier', 0.917513448894202],
 ['HashingVectorizer and KNeighborsClassifier', 0.9234907352062164],
 ['CountVectorizer and DecisionTreeClassifier', 0.9701135684399282],
 ['TfidfVectorizer and DecisionTreeClassifier', 0.9677226539151226],
 ['HashingVectorizer and DecisionTreeClassifier', 0.9742976688583384],
 ['CountVectorizer and ExtraTreeClassifier'

In [11]:
results_df = pd.DataFrame(data=results, columns=['Description', 'Score'])
results_df.to_csv('data/uci_edu/outputs/results.csv', header=True)
FileLink('data/uci_edu/outputs/results.csv')
results_df[results_df.Score > 0.985].sort_values(['Score'],ascending=[False])

Unnamed: 0,Description,Score
49,TfidfVectorizer and OneVsRestClassifier,0.987448
55,TfidfVectorizer and CalibratedClassifierCV,0.987448
41,HashingVectorizer and PassiveAggressiveClassifier,0.98685
43,TfidfVectorizer and SGDClassifier,0.98685
56,HashingVectorizer and CalibratedClassifierCV,0.98685
34,TfidfVectorizer and RidgeClassifier,0.986252
37,TfidfVectorizer and RidgeClassifierCV,0.986252
40,TfidfVectorizer and PassiveAggressiveClassifier,0.985655
44,HashingVectorizer and SGDClassifier,0.985655
54,CountVectorizer and CalibratedClassifierCV,0.985655


## Playground

In [None]:
import itertools
a = [1,2,3]
b = ['a','b','c']
for p in itertools.product(a,b):
    print(p[])