In [1]:
from utils import css_from_file
css_from_file('style/style.css')

In [2]:
import re
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import (
    CountVectorizer, TfidfVectorizer, HashingVectorizer
)

import warnings
warnings.filterwarnings("ignore")

Read csv file into a dataframe

In [3]:
df = pd.read_csv("data/ebaytitles.csv")
df = df.sample(frac=0.1) # delete this line if you are brave and have many GBs of RAM
df.head()

Unnamed: 0,title,category_name
274003,Retro Flag - Spain Stripe Premium Faux Leather...,Mobile Phones & Communication
565955,TOWING 7 PIN PLUG FOR Talbot Samba MODELS CARA...,Vehicle Parts & Accessories
70145,Greys Prowla Pop up pears LRG 5pcs Pike fishin...,Sporting Goods
169285,Motorbike/Motorcycle DID Chain & Sprocket Kit ...,Vehicle Parts & Accessories
5792,VW AUDI SEAT TFSI AIR BOX TO T.I.P LINK PIPE 0...,Vehicle Parts & Accessories


Print out unique values of a column

In [4]:
df.category_name.unique()

array(['Mobile Phones & Communication', 'Vehicle Parts & Accessories',
       'Sporting Goods', 'Home, Furniture & DIY',
       'Clothes, Shoes & Accessories', 'Toys & Games',
       'Jewellery & Watches', 'Sound & Vision', 'Crafts',
       'Computers/Tablets & Networking', 'Business, Office & Industrial',
       'Garden & Patio', 'Collectibles', 'Music', 'Health & Beauty',
       'Cameras & Photography', 'DVDs, Films & TV', 'Pet Supplies',
       'Video Games & Consoles', 'Books, Comics & Magazines', 'Art',
       'Musical Instruments & Gear', 'Baby', 'Sports Memorabilia',
       'Antiques', 'Wholesale & Job Lots', 'Coins & Paper Money',
       'Dolls & Bears', 'Pottery, Porcelain & Glass',
       'Consumer Electronics', 'Everything Else', 'Stamps',
       'Cell Phones & Accessories', 'Entertainment Memorabilia', 'Travel',
       'Holidays & Travel'], dtype=object)

Split the data into train and test observations - there is a column

In [5]:
X = df.title.values
y = df.category_name.values

X_tr, X_te, y_tr, y_te = train_test_split(X, 
                                          y,
                                          test_size=0.1,
                                          random_state=0)

Exercise 
------------------

1. Count how many titles are in each category (```pandas.DataFrame.groupby```). Print out most common at the top

In [6]:
df.groupby('category_name').count().sort_values('title', ascending=False)
# df.loc[:,'category_name'].value_counts()

Unnamed: 0_level_0,title
category_name,Unnamed: 1_level_1
Vehicle Parts & Accessories,23116
"Clothes, Shoes & Accessories",16679
"Home, Furniture & DIY",12799
Computers/Tablets & Networking,6793
Jewellery & Watches,6275
Sporting Goods,4763
Mobile Phones & Communication,3811
Crafts,3421
Health & Beauty,3334
Toys & Games,3081


<a>Double click to show the solution</a>
<div class='spoiler'>

frequencies = df.groupby("category_name")["title"].count()
frequencies.sort_values(inplace=True,ascending=False)
print(frequencies)

# or faster

df.category_name.value_counts()

</div>

Bag of words
--------------------

Different types of vectorizers:

<ul>
<li>```sklearn.feature_extraction.text.CountVectorizer``` - Counts the number of times a word appears in the text</li>
<li>```sklearn.feature_extraction.text.TfidfVectorizer``` - Weighs the words according to the importance of the word in the context of whole collection. Is the word ```the``` important if it appears in all documents?</li>
<li>```sklearn.feature_extraction.text.HashingVectorizer``` - Useful when you don't know the vocabulary upfront. Feature number is calculated as ```hash(token) % vocabulary_size```.</li>
</ul>

Exercise
-------------------
1. Use ```CountVectorizer``` / ```TfidfVectorizer``` to fit the collection of documents
2. How many unique tokens are there in text? Print some examples (ie first few hundred).
3. What methods you can use to reduce this number? 
   - Check out and experiment with the arguments: ```ngram_range```, ```min_df```. How the vocabulary size changes with each change?
   - What would you replace / delete from the text?
4. Write a custom function `clean_text` that accepts a text as input and transforms it (remove/hash numbers, delete short/long words etc.)
5. (Extra points) When would you use ```HashingVectorizer```?

In [7]:
def cleaner(txt):
    txt = txt.lower()
    txt = ' '.join(txt.split()) # great to get rid of tabs, double space, etc.
    txt = txt.strip(' ')
    txt = txt.lstrip('\t')
    txt= re.sub("[^A-Za-z0-9]"," ",txt)
#     txt = re.sub("[0-9]+","#",txt)
    return txt
    
cleaner('\ttest')

'test'

In [8]:
vectorizers = [
#     ('Count Vectorizer', 
#      CountVectorizer(stop_words=None,
#                     token_pattern=r'(?u)\b\w\w+\b')),
    ('Count Vectorizer', 
     CountVectorizer(stop_words=None,
                    token_pattern=r'(?u)\b\w+\b')),
#     ('Count Vectorizer', 
#      CountVectorizer(stop_words='english',
#                     token_pattern=r'(?u)\b\w\w+\b')),
#     ('Count Vectorizer', 
#      CountVectorizer(stop_words='english',
#                     token_pattern=r'(?u)\b\w+\b')),
#     ('Count Vectorizer min_df=5', 
#      CountVectorizer(stop_words=None,
#                     token_pattern=r'(?u)\b\w+\b',
#                     min_df=5)),
    ('Count Vectorizer max_df=1', 
     CountVectorizer(stop_words=None,
                    token_pattern=r'(?u)\b\w+\b',
                    max_df=1)),
#     ('Count Vectorizer ngram_range=(1,3)', 
#      CountVectorizer(preprocessor=cleaner,
#                     analyzer='char',
#                     ngram_range=(1,3))),
#     ('Count Vectorizer ngram_range=(2,4)', 
#      CountVectorizer(preprocessor=cleaner,
#                     analyzer='char',
#                     ngram_range=(2,4))),
]

# ('Hashing Vectorizer',
#      HashingVectorizer(token_pattern=r'(?u)\b\w+\b',
#                       n_features=100000,
#                       stop_words='english'))

for name, vect in vectorizers:
    print(name)
    vect.fit(X_tr)

    print(list(vect.get_feature_names())[:25])
    print(len(vect.get_feature_names()), '\n', '-'*100)

Count Vectorizer
['0', '00', '000', '0000', '00000', '00001', '00003', '00003g', '000051446b', '00005320c2', '000060', '000091', '0001', '000106', '00012', '000122', '000146', '000150', '000171528', '0002', '000211', '000223', '000228', '000237', '00027']
72268 
 ----------------------------------------------------------------------------------------------------
Count Vectorizer max_df=1
['00000', '00001', '00003', '00003g', '000051446b', '00005320c2', '000060', '000091', '0001', '00012', '000122', '000146', '000150', '000171528', '0002', '000211', '000223', '000237', '00027', '000348', '00036a', '00037', '000379', '00038', '000393']
43112 
 ----------------------------------------------------------------------------------------------------


<a>Double click to show the solution</a>
<div class='spoiler'>
from sklearn.feature_extraction.text import CountVectorizer
import re

def clean_text(t):
    t = t.lower()
    t = re.sub("[^A-Za-z0-9]"," ",t)
    t = re.sub("[0-9]+","#",t)
    return t

vectorizers = [
     ("vanilla",
          CountVectorizer())
    ,("preprocessing",
          CountVectorizer(preprocessor=clean_text))
    ,("preprocessing + min_df=10",
          CountVectorizer(preprocessor=clean_text,
                          min_df=10))
]

for vect_name, vect in vectorizers:
    print(vect_name)
    vect.fit(X_tr)
    
    print(list(vect.get_feature_names())[:10])
    print(len(vect.get_feature_names()))
</div>

Stemming
------------------

Linguistic normalization in which variant forms are reduced to a common form

    connection
    connections
    connective     --->   connect
    connected
    connecting
    
Usage:

    import snowballstemmer

    stemmer = snowballstemmer.stemmer('english')
    print(stemmer.stemWords("We are the world".split()))

Putting it into a pipeline
----------------------

Now that we know how to transform text data, let's put it into a pipeline.

1. Create a pipeline with `CountVectorizer`, `StandardScaler` and `SGDClassifier` as your final algorithm
    a) use alternative format for pipeline definition when you name the steps - refer to the documentation how to do this
2. Using ```sklearn.metrics.classification_report``` create a report about your classifier

In [9]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.cross_validation import cross_val_predict
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.grid_search import GridSearchCV

In [10]:
classifiers = [
    ('Classifier #1',
    make_pipeline(
        CountVectorizer(
            preprocessor=cleaner,
            stop_words='english',
            token_pattern=r'(?u)\b\w+\b'
        ),
        StandardScaler(
            with_mean=False,
            with_std=False
        ),
        SGDClassifier()
    )),
    ('Classifier #2',
    make_pipeline(
        CountVectorizer(
            preprocessor=cleaner,
            stop_words='english',
            token_pattern=r'(?u)\b\w+\b',
            min_df=5
        ),
        StandardScaler(
            with_mean=False,
            with_std=False
        ),
        SGDClassifier()
    )),
#     ('Classifier #2',
#     make_pipeline(
#         CountVectorizer(
#             preprocessor=cleaner,
#             analyzer='char',
#             ngram_range=(2,5)
#         ),
#         StandardScaler(
#             with_mean=False,
#             with_std=False
#         ),
#         SGDClassifier()
#     )),
]



for name, classifier in classifiers:
    print(name)
    classifier.fit(X_tr, y_tr)
    y_pred = classifier.predict(X_te)

    print(classification_report(y_te, y_pred))

#     preds = cross_val_predict(classifier, 
#                               X_tr, 
#                               y_tr, 
#                               cv=8, n_jobs=-1, verbose=True)

#     print(classification_report(y_tr, preds))
          
    print('-'*50)

Classifier #1
                                precision    recall  f1-score   support

                      Antiques       0.67      0.40      0.50        15
                           Art       0.72      0.74      0.73        58
                          Baby       0.87      0.72      0.79        67
     Books, Comics & Magazines       0.72      0.76      0.74        41
 Business, Office & Industrial       0.79      0.62      0.69       280
         Cameras & Photography       0.97      0.77      0.86        94
     Cell Phones & Accessories       1.00      0.17      0.29         6
  Clothes, Shoes & Accessories       0.92      0.97      0.95      1640
           Coins & Paper Money       0.89      0.85      0.87        20
                  Collectibles       0.85      0.71      0.78       272
Computers/Tablets & Networking       0.95      0.95      0.95       659
          Consumer Electronics       0.80      0.40      0.53        10
                        Crafts       0.89      0.

<a>Double click to show the solution</a>
<div class='spoiler'>

from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.cross_validation import cross_val_predict
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.grid_search import GridSearchCV

clf = Pipeline([('vect', CountVectorizer(min_df=10, preprocessor=clean_text)),
                ('scaling', StandardScaler(with_mean=False)),
                ('clf', SGDClassifier())])

preds = cross_val_predict(clf, 
                          X_tr, 
                          y_tr, 
                          cv=8, n_jobs=-1, verbose=True)

print(classification_report(y_tr, preds)


</div>


Grid search
--------------------------

Scikit-learn has `GridSearchCV` and `RandomizedSearchCV`. Both have the same functionality and can be used to find good parameters for the models. What is great about both these classes that they are both transformers - they return an estimator so you can chain them and put in your pipeline.

**GridSearchCV** - you specify the exact values of the parameters you want to test
**RandomizedSearchCV** - you specify ranges of parameters

Exercise
----------------------

1. Use `GridSearchCV` or `RandomizedSearchCV` to find the best parameters for the models. Check at least 2 parameters.

2. Inspect the attribute `cv_results_` after fitting. It gives a nice representation of the learning.

In [14]:
clf = Pipeline([
    ('vect', CountVectorizer(min_df=5)),
    ('scaling', StandardScaler(with_mean=False)),
    ('classifier', SGDClassifier())
])

grid = GridSearchCV(
    clf,
    param_grid={
#         'vect__min_df': [5, 10],
        'vect__max_df': [10, 20],
        'classifier__alpha': [0.5, 0.1, 0.01]
    },
    n_jobs=-1)

grid.fit(X_tr, y_tr)
y_pred = grid.predict(X_te)

print(classification_report(y_te, y_pred))

                                precision    recall  f1-score   support

                      Antiques       0.33      0.20      0.25        15
                           Art       0.50      0.24      0.33        58
                          Baby       0.61      0.21      0.31        67
     Books, Comics & Magazines       0.24      0.10      0.14        41
 Business, Office & Industrial       0.56      0.32      0.40       280
         Cameras & Photography       0.76      0.50      0.60        94
     Cell Phones & Accessories       0.00      0.00      0.00         6
  Clothes, Shoes & Accessories       0.67      0.39      0.49      1640
           Coins & Paper Money       0.58      0.55      0.56        20
                  Collectibles       0.54      0.29      0.37       272
Computers/Tablets & Networking       0.76      0.41      0.53       659
          Consumer Electronics       0.50      0.10      0.17        10
                        Crafts       0.70      0.40      0.51  

In [15]:
print(grid.best_estimator_)
print(grid.best_params_)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=20, max_features=None, min_df=5,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_...='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False))])
{'classifier__alpha': 0.1, 'vect__max_df': 20}


<a>Double click to show the solution</a>
<div class='spoiler'>

from sklearn.grid_search import GridSearchCV, RandomizedSearchCV

print("Grid search")
print()

params = {'vect__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4)],
          'vect__analyzer': ["word","char"],
          'vect__binary': [True, False]}

grid_clf = GridSearchCV(clf, params, n_jobs=1, verbose=True)
grid_clf.fit(X_tr, y_tr)

best_params = sorted(grid_clf.grid_scores_, key=lambda x: -x[1])


for params, score, _ in best_params:
    print(score, params) 
    
print("Randomized search")
print()
    
params = {'vect__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4)],
          'vect__analyzer': ["word","char"],
          'model__lr__dimensions': [100, 200]}

grid_clf = RandomizedSearchCV(clf, params, n_jobs=1, verbose=True, n_iter=8)
grid_clf.fit(np.array(X_tr[:10000]), y_tr[:10000])

best_params = sorted(grid_clf.grid_scores_, key=lambda x: -x[1])

for params, score, _ in best_params:
    print(score, params)

</div>


Useful materials

1. http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
2. http://scikit-learn.org/stable/auto_examples/model_selection/randomized_search.html