This script estimates the performance of SVM classification (RBF, linear and polynomial kernels, selected by grid search) in a document-topic model's feature space

In [1]:
%load_ext autoreload
%autoreload 2
import time
start_time = time.time()

import json

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

import numpy as np

import sys
sys.path.append("../libraries/")
from selector import split_data
from semanticClassifiers import docTopTransformer2, docTopTransformer, doc2Bow

In [2]:
def preprocess_data(doc_set):
    """Tokenize, stop-word filter and stem a collection of text fields.

    Each entry of ``doc_set`` (e.g. a summary or reviewText field) is
    lower-cased, split into ``\\w+`` tokens, stripped of English stop words
    and Porter-stemmed. Falsy entries (``None`` or ``''``) are treated as a
    single blank, so they still produce a result.

    Returns a list holding one list of stemmed tokens per input entry, in
    input order.
    """
    word_tokenizer = RegexpTokenizer(r'\w+')
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    cleaned_docs = []
    for doc in doc_set:
        # a missing/empty field becomes a blank so .lower() is always valid
        words = word_tokenizer.tokenize((doc or ' ').lower())
        # add a bias term, will work as a kind of prior, important for
        # empty reviews (it is appended before filtering and stemming)
        words.append('null__')
        cleaned_docs.append(
            [stemmer.stem(word) for word in words if word not in stop_words]
        )
    return cleaned_docs

def combine_reviews(review_text, asins, uniq_asins, ratings):
    """Merge per-review token lists into one token array per product.

    Parameters
    ----------
    review_text : sequence of token lists, one per review.
    asins : product label of each review, aligned with ``review_text``.
    uniq_asins : the labels to produce output for. Typically
        ``np.unique(asins)``, but a subset may be given so that different
        engines can process different labels in parallel.
    ratings : rating of each review, aligned with ``review_text``.

    Returns
    -------
    (product_text, avg_ratings) : two lists ordered like ``uniq_asins``.
        ``product_text[k]`` is the concatenation (a NumPy array) of every
        review's tokens for label ``k``; ``avg_ratings[k]`` is the mean of
        that label's ratings.

    A label with no matching review makes ``np.concatenate`` raise
    ``ValueError``.
    """
    # arrays allow selecting all rows of one product with a single index
    text_arr = np.array(review_text, dtype=object)
    rating_arr = np.array(ratings)
    label_arr = np.array(asins)

    merged_text = []
    mean_ratings = []
    for label in uniq_asins:
        rows = np.where(label == label_arr)
        merged_text.append(np.concatenate(list(text_arr[rows])))
        mean_ratings.append(np.mean(rating_arr[rows]))

    return (merged_text, mean_ratings)

def list_concat(nparray_of_lists):
    """Collapse a list of sequences into a one-element list, in place.

    The elements are joined with ``+`` from the right
    (``a[0] + (a[1] + (... + a[-1]))``), so for lists the result is the
    plain left-to-right concatenation. ``nparray_of_lists`` itself is
    modified so that it ends up holding just the joined sequence, and the
    same list object is returned.

    An empty or single-element input is returned unchanged. The join is
    iterative, so the number of elements is not limited by the interpreter's
    recursion depth.
    """
    # nothing to join: avoid indexing element -2 of a short list
    if len(nparray_of_lists) < 2:
        return nparray_of_lists

    merged = nparray_of_lists[-1]
    for chunk in reversed(nparray_of_lists[:-1]):
        merged = chunk + merged

    # keep the in-place contract: the caller's list now has one element
    nparray_of_lists[:] = [merged]
    return nparray_of_lists

In [3]:
# Read in the training reviews file and split it 80/20
# (presumably 80 is the percentage that goes to json_dat -- confirm
# against split_data)
# we won't use val_dat at all
json_dat, val_dat = split_data('../data/Sports_and_Outdoors_Reviews_training.json', 80)
# cap the working set at the first 300000 records
json_dat = json_dat[:300000]

In [4]:
# build three parallel lists with one entry per review: the rating
# ('overall'), the summary text and the product id ('asin').
# .get() is used, so presumably a missing field yields None
rating = [review.get('overall') for review in json_dat]
summary = [review.get('summary') for review in json_dat]
prod_id = [review.get('asin') for review in json_dat]

In [5]:
# takes ~96 CPU minutes

# this cell runs things in parallel. make sure to start an 
# ipython cluster from the notebook dashboard's IPython Cluster
# tab before running
import ipyparallel as ipp

# connect to the running cluster and take a view over all of its engines
rc = ipp.Client()
dview = rc[:]
# the pushed functions execute on the engines, so the names they use
# (tokenizer, stop words, stemmer, numpy) must be imported there as well
dview.execute('from nltk.tokenize import RegexpTokenizer;' +
              'from nltk.corpus import stopwords; ' + 
              'from nltk.stem.porter import PorterStemmer;' +
              'import numpy as np;')

t0 = time.time()


# clean text
# ship the function to every engine, then give each engine its own slice
# of the summaries
dview.push(dict(preprocess_data=preprocess_data))
dview.scatter('summary', summary) # partitions data

# each engine cleans its own slice; gather collects the per-engine results
# back into a single local list
%px cleaned_reviews = preprocess_data(summary)
cleaned_reviews = dview.gather('cleaned_reviews').get()

# combine text
# every engine receives the complete cleaned text, ratings and product ids;
# only the list of unique product ids is partitioned, so each engine merges
# the reviews of its own subset of products
dview.push(dict(cleaned_reviews=cleaned_reviews, combine_reviews=combine_reviews,
               rating=rating, prod_id=prod_id))
uniq_prod_id = np.unique(prod_id)
dview.scatter('uniq_prod_id', uniq_prod_id) # partition target labels

%px total_text, avg_ratings = combine_reviews(cleaned_reviews, prod_id, uniq_prod_id, rating)
# one merged token array and one mean rating per unique product id
total_text = dview.gather('total_text').get()
avg_ratings = dview.gather('avg_ratings').get()



t1 = time.time()

# wall-clock seconds spent cleaning and combining
print(t1-t0)

12.074710845947266


In [6]:
import gc

# the raw records and summaries are not referenced by the cells below;
# drop them and force a collection to release their memory
del summary, json_dat, val_dat

gc.collect()

9695

In [7]:
# assemble the training set: one sample per product
# binary target: True when the product's mean rating is at least 4.5
train_lbls = np.array(avg_ratings) >= 4.5
# merged, tokenized review text of each product
train_text = total_text
# NOTE(review): prod_id has one entry per review (not per product) and lbls
# is not used again in the visible cells -- confirm it is still needed
lbls = prod_id

In [8]:
# run this cell tweaking the search space a bit until you're satisfied you're
# at a local minimum, and not up against the edge of the search space
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import cross_val_predict
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingClassifier
from gensim.corpora import Dictionary
from sklearn.pipeline import Pipeline

# three sub-grids, one per kernel; the 'clf__' prefix routes each parameter
# to the 'clf' step of the pipeline built at the end of this cell
tuned_parameters = [{'clf__kernel': ['rbf'], 
                     'clf__gamma': [1e-1, 1e-2, 1e-3],
                     'clf__C': [100, 1000, 10000]},
                     {'clf__kernel': ['linear'], 
                      'clf__C': [100, 1000, 10000]},
                     {'clf__kernel': ['poly'], 
                      'clf__C': [100, 1000, 10000],
                      'clf__degree': [2]}]

# manually update the search space to be centered on the local
# minimum found above, under the assumption that loss function
# is smooth over hyperparameters (not assured, or even likely, 
# but best we can do).
# this time run it on twice as much data, and do that twice
# averaging over the results using a bagging classifier
# compute CV metrics, and get a final model
# NOTE(review): no bagging is set up in this cell; a BaggingClassifier only
# appears in the last cell of the notebook -- confirm the lines above are
# still current

# gensim dictionary built from the per-product token sequences
# NOTE(review): it is built from all of train_text, i.e. outside the
# cross-validation folds used in the following cells
my_dict = Dictionary(train_text)

# project-local transformers; presumably tokens -> bag-of-words and
# bag-of-words -> document-topic features (see semanticClassifiers)
prod2Bow = doc2Bow()
bow2Top = docTopTransformer(this_dict=my_dict)

# pipeline under search: topic transform -> Normalizer -> SVC
baseClf = SVC()
estimators = [('bow2Top', bow2Top), ('norm', Normalizer()), ('clf', baseClf)]
semClf = Pipeline(estimators)

In [15]:
# inner loop: 5-fold grid search over tuned_parameters, scored by
# macro-averaged F1; kept serial (n_jobs=1) because the outer
# cross-validation below already runs 10 jobs
clf = GridSearchCV(semClf, tuned_parameters, cv=5, n_jobs=1, scoring='f1_macro')

# create bag of words mapping outside gridsearchcv loop for speedup
# NOTE: the second step is named 'bagClf' but holds the grid search, not a
# bagging ensemble
finalClf = Pipeline([('prod2Bow', prod2Bow), ('bagClf', clf)])

# cross validate the procedure
# outer loop: 10-fold out-of-fold predictions, so the whole grid search is
# refit inside every outer fold
t0 = time.time()
sum_predict = cross_val_predict(finalClf, train_text, train_lbls, cv=10, n_jobs=10)
t1 = time.time()
print(t1-t0)


7352.557163476944


In [16]:
from sklearn.metrics import classification_report

# score the cross-validated predictions against the true labels
report = classification_report(train_lbls, sum_predict)
print('Summary Text Prediction')
print(report)

# chance baseline: the same predictions in random order, which keeps the
# predicted class balance but breaks the pairing with the samples
report = classification_report(train_lbls, np.random.permutation(sum_predict))
print('Null Prediction')
print(report)

Summary Text Prediction
              precision    recall  f1-score   support

       False       0.77      0.75      0.76      5903
        True       0.73      0.75      0.74      5355

    accuracy                           0.75     11258
   macro avg       0.75      0.75      0.75     11258
weighted avg       0.75      0.75      0.75     11258

Null Prediction
              precision    recall  f1-score   support

       False       0.52      0.51      0.52      5903
        True       0.47      0.49      0.48      5355

    accuracy                           0.50     11258
   macro avg       0.50      0.50      0.50     11258
weighted avg       0.50      0.50      0.50     11258



In [9]:
# obtain a final model fit
# same 5-fold, macro-F1 grid search as in the cross-validated run, but fit
# once on all of the training data, with the search itself parallelised
# (n_jobs=16)

clf = GridSearchCV(semClf, tuned_parameters, cv=5, n_jobs=16, scoring='f1_macro')

# create bag of words mapping outside gridsearchcv loop for speedup
# NOTE: the second step is named 'bagClf' but holds the grid search
finalClf = Pipeline([('prod2Bow', prod2Bow), ('bagClf', clf)])

t0 = time.time()
finalClf = finalClf.fit(train_text,train_lbls)
t1 = time.time()
# wall-clock seconds for the final fit
print(t1-t0)

1073.51616024971


In [13]:
# save fitted classifier
#
# classifier will project tokenized test data to the pretrained 
# document-topic embedding, apply two SVM classifiers, which may
# be linear, polynomial or RBF kernel classifiers, with various
# hyperparameters, average the results, and give you the 
# resulting prediction. The selected kernels and hyperparameters
# are chosen to optimize f1 score
#
# NOTE(review): "two SVM classifiers ... average the results" describes a
# bagged model; the finalClf built in the final-fit cell wraps a single
# GridSearchCV -- confirm which object is meant to be saved here
# NOTE: pickle files execute code when loaded; only load this file from a
# trusted location
import pickle
with open('hypOptSVM_300k.clf',mode='wb') as f:
    pickle.dump(finalClf,f)

In [None]:
# baseline without a hyperparameter search: the same pipeline with SVC()
# left at its default settings, evaluated with the same 10-fold
# cross_val_predict scheme ('bagClf' is only the step name)
defaultClf = Pipeline([('prod2Bow', prod2Bow), ('bagClf', semClf)])
predict_default = cross_val_predict(defaultClf, train_text, train_lbls, cv=10, n_jobs=10)

report = classification_report(train_lbls, predict_default)
print('Prediction using hyperparameter defaults')
print(report)

In [None]:
# rerun the classifier to get hyperparameter samplings
# (rebuilds the dictionary, transformers and pipeline from scratch, and
# overwrites the previously fitted finalClf)
my_dict = Dictionary(train_text)

prod2Bow = doc2Bow()
bow2Top = docTopTransformer(this_dict=my_dict)

baseClf = SVC()
estimators = [('bow2Top', bow2Top), ('norm', Normalizer()), ('clf', baseClf)]
semClf = Pipeline(estimators)

# serial grid search; parallelism comes from the bagging wrapper below
clf = GridSearchCV(semClf, tuned_parameters, cv=5, n_jobs=1, scoring='f1_macro')

# fit n_estimators grid-searched models, each on a random 80% subsample
# drawn without replacement (bootstrap=False, max_samples=0.8); the
# BaggingClassifier combines their predictions
# NOTE(review): `base_estimator` was renamed to `estimator` in newer
# scikit-learn releases -- update the keyword when upgrading
n_estimators=10
bagClf = BaggingClassifier(base_estimator=clf, 
                        bootstrap=False, max_samples = 0.8, n_estimators=n_estimators,
                        n_jobs=10)

finalClf = Pipeline([('prod2Bow', prod2Bow), ('bagClf', bagClf)])

t0 = time.time()
finalClf.fit(train_text, train_lbls)
t1 = time.time()

# print out the hyperparameter fits for each bag
# finalClf[1] is the fitted BaggingClassifier; iterating it yields its
# fitted member estimators (here the GridSearchCV objects)
for this_clf in finalClf[1]:
    print(this_clf.best_estimator_)
    
print(t1-t0)