This notebook estimates the cross-validated performance of an AdaBoost classifier trained in a document-topic model's feature space

In [1]:
%load_ext autoreload
%autoreload 2
import time
start_time = time.time()

import json

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

import numpy as np

import sys
sys.path.append("../libraries/")
from selector import split_data
from semanticClassifiers import docTopTransformer2, docTopTransformer, doc2Bow

In [2]:
# scans dat forward from start_offset and reports the index of the
# first element equal to target; yields None when nothing matches.
# handy for walking a sorted list, since each lookup can resume
# where the previous one stopped.
def linearSearch(dat, target, start_offset=0):
    candidates = range(start_offset, len(dat))
    return next((pos for pos in candidates if target == dat[pos]), None)

def avgByLbl(X, lbls):
    """Average the values in `X` that share the same label.

    Parameters
    ----------
    X : sequence of numbers, length n
        One value per observation (e.g. one rating per review).
    lbls : sequence, length n
        Label of each observation; `lbls[k]` belongs to `X[k]`.

    Returns
    -------
    avg_X : numpy.ndarray of float, length m
        Mean of `X` for each distinct label.
    uniq_lbls : list, length m
        The distinct labels in ascending order; `uniq_lbls[k]` is the
        label `avg_X[k]` was computed for. Input order is not preserved.

    Raises
    ------
    ValueError
        If `X` and `lbls` do not have the same length.
    """
    X = np.asarray(X, dtype=float)
    lbls = np.asarray(lbls)
    if len(X) != len(lbls):
        raise ValueError(
            "X and lbls must have the same length, got %d and %d"
            % (len(X), len(lbls)))
    if X.size == 0:
        # nothing to average: keep the (array, list) return shape
        return np.zeros(0), []

    # np.unique sorts the labels and, via return_inverse, tells us which
    # sorted label every observation belongs to
    uniq_lbls, group = np.unique(lbls, return_inverse=True)

    # per-label sum and per-label count in one vectorised pass each
    totals = np.bincount(group, weights=X, minlength=len(uniq_lbls))
    counts = np.bincount(group, minlength=len(uniq_lbls))

    return totals / counts, list(uniq_lbls)

def getProdRatings(target_prod_id,
                   ratings_path='../data/Sports_and_Outdoors_Ratings_training.csv'):
    """Return the mean rating of each requested product.

    Reads the ratings CSV at `ratings_path` (columns `asin` and `overall`
    are used), averages `overall` per `asin`, and returns the averages
    ordered to match `target_prod_id`.

    Parameters
    ----------
    target_prod_id : sequence of str
        Product ids (asins) to look up. Order is preserved and duplicates
        are allowed.
    ratings_path : str, optional
        Location of the ratings CSV. Defaults to the training ratings file
        this notebook has always read.

    Returns
    -------
    numpy.ndarray of float
        `result[k]` is the mean rating of `target_prod_id[k]`, or NaN when
        that product has no rows in the CSV.
    """
    import pandas as pd

    # asin is read as text so that ids consisting only of digits are not
    # converted to integers and still compare equal to string ids
    ratings = pd.read_csv(ratings_path, delimiter=',',
                          usecols=['asin', 'overall'],
                          dtype={'asin': str})

    mean_by_asin = ratings.groupby('asin')['overall'].mean()

    # reindex aligns by label, so the result follows target_prod_id exactly
    # (no sort / inverse-sort bookkeeping) and missing ids become NaN
    # instead of shifting every later rating by one position
    aligned = mean_by_asin.reindex(np.asarray(target_prod_id))
    return aligned.to_numpy(dtype=float)

In [3]:
# normalises a collection of raw text fields (e.g. summary or reviewText):
# every field is lower-cased, split into word tokens, stripped of English
# stop words and Porter-stemmed. Returns a list with one token list per
# input field, in input order
def preprocess_data(doc_set):
    word_tokenizer = RegexpTokenizer(r'\w+')
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    processed = []
    for doc in doc_set:
        # falsy entries (None, '') are handled as a blank document
        text = doc if doc else ' '
        words = word_tokenizer.tokenize(text.lower())
        # bias token: works as a kind of prior, important for empty reviews
        words.append('null__')
        # drop stop words, then stem what is left
        kept_words = [word for word in words if word not in stop_words]
        processed.append([stemmer.stem(word) for word in kept_words])
    return processed

def combine_reviews(review_text, asins, uniq_asins, ratings):
    """Merge the reviews of each product and average their ratings.

    Parameters
    ----------
    review_text : sequence of list
        One token list per review.
    asins : sequence
        Product label of each review; `asins[k]` belongs to
        `review_text[k]` and `ratings[k]`.
    uniq_asins : sequence
        Product labels to build output for. Typically `np.unique(asins)`,
        but a subset may be given so that different subsets can be
        processed by different engines in parallel.
    ratings : sequence of numbers
        Rating of each review.

    Returns
    -------
    (product_text, avg_ratings) : tuple of two lists
        `product_text[k]` is a numpy array holding the concatenated tokens
        of every review of `uniq_asins[k]` (in review order) and
        `avg_ratings[k]` is the mean rating of those reviews.

    Raises
    ------
    ValueError
        If a label in `uniq_asins` has no review in `asins`.
    """
    # cast to array for easier indexing
    review_text = np.array(review_text, dtype=object)
    ratings = np.array(ratings)
    asins = np.array(asins)

    # Index the reviews by product once. Scanning `asins` again for every
    # product costs O(reviews * products); this single pass is O(reviews).
    rows_by_asin = {}
    for row, asin in enumerate(asins.tolist()):
        rows_by_asin.setdefault(asin, []).append(row)

    product_text = []
    avg_ratings = []
    for this_asin in uniq_asins:
        # numpy scalars are unwrapped so they hash like the plain values
        # stored by tolist() above
        key = this_asin.item() if isinstance(this_asin, np.generic) else this_asin
        rows = rows_by_asin.get(key)
        if not rows:
            raise ValueError("no reviews found for product %r" % (this_asin,))
        product_text.append(np.concatenate(list(review_text[rows])))
        avg_ratings.append(np.mean(ratings[rows]))

    return (product_text, avg_ratings)

def list_concat(nparray_of_lists):
    """Collapse a list of lists into a list holding one merged list.

    The argument is modified in place and also returned: afterwards it
    contains a single element, the concatenation of all the original
    elements in their original order. Despite the parameter name, the
    argument must be a mutable Python list whose elements are lists.

    An input with fewer than two elements has nothing to merge and is
    returned unchanged (previously this raised IndexError). The merge is
    iterative, so long inputs no longer hit the recursion limit and the
    work is linear in the total number of items.
    """
    if len(nparray_of_lists) < 2:
        return nparray_of_lists

    merged = [item for sub_list in nparray_of_lists for item in sub_list]
    # slice assignment keeps the caller's list object, as the original did
    nparray_of_lists[:] = [merged]
    return nparray_of_lists

In [4]:
# Load the review records from the training file and split them with
# split_data(path, 80) -- presumably an 80/20 split, confirm against
# selector.split_data. Only json_dat is used below; val_dat is never read.
json_dat, val_dat = split_data('../data/Sports_and_Outdoors_Reviews_training.json', 80)

In [5]:
# pull the fields needed downstream out of every review record, keeping
# three parallel lists (element k of each list comes from review k)
rating = [review.get('overall') for review in json_dat]
summary = [review.get('summary') for review in json_dat]
prod_id = [review.get('asin') for review in json_dat]

In [6]:
# takes ~96 CPU minutes

# This cell cleans the review summaries and merges them per product on an
# ipyparallel cluster. Start an IPython cluster from the notebook
# dashboard's "IPython Cluster" tab before running it; ipp.Client() needs
# running engines to connect to.
import ipyparallel as ipp

rc = ipp.Client()
# view over every engine the client can see
dview = rc[:]
# run the imports on the engines: preprocess_data and combine_reviews are
# pushed as bare functions and refer to these names
dview.execute('from nltk.tokenize import RegexpTokenizer;' +
              'from nltk.corpus import stopwords; ' + 
              'from nltk.stem.porter import PorterStemmer;' +
              'import numpy as np;')

t0 = time.time()


# Step 1 - clean text.
# Every engine receives the function plus its own slice of `summary`.
dview.push(dict(preprocess_data=preprocess_data))
dview.scatter('summary', summary) # partitions data

# each engine cleans its slice; gather stitches the slices back together
%px cleaned_reviews = preprocess_data(summary)
cleaned_reviews = dview.gather('cleaned_reviews').get()

# Step 2 - combine text.
# Every engine receives the complete cleaned reviews, labels and ratings,
# but only a slice of the product ids it has to produce output for.
dview.push(dict(cleaned_reviews=cleaned_reviews, combine_reviews=combine_reviews,
               rating=rating, prod_id=prod_id))
uniq_prod_id = np.unique(prod_id)
dview.scatter('uniq_prod_id', uniq_prod_id) # partition target labels

# On the engines `uniq_prod_id` is the scattered slice; the local
# `uniq_prod_id` remains the full array.
%px total_text, avg_ratings = combine_reviews(cleaned_reviews, prod_id, uniq_prod_id, rating)
total_text = dview.gather('total_text').get()
# NOTE(review): a later cell reassigns avg_ratings from getProdRatings, so
# this gathered value is only used until then.
avg_ratings = dview.gather('avg_ratings').get()



t1 = time.time()

# elapsed wall-clock seconds for both steps
print(t1-t0)

347.48716711997986


In [7]:
import gc

# summary, json_dat and val_dat are not referenced by any later cell shown
# here; drop the names and run a collection pass to release their memory
del summary, json_dat, val_dat

gc.collect()

16063

In [10]:
# Replace the per-product averages gathered from the engines with averages
# computed by getProdRatings from the ratings CSV, ordered to match
# uniq_prod_id (and therefore total_text).
avg_ratings = getProdRatings(uniq_prod_id)

In [11]:
# assemble the training set: the merged token lists are the inputs, and the
# label of a product is True when its average rating is at least 4.5
train_text = total_text
train_lbls = np.greater_equal(np.array(avg_ratings), 4.5)

In [12]:
# Builds the classification pipeline and cross-validates it on the
# per-product summary text.
# Original intent: run this cell tweaking the search space a bit until
# you're satisfied you're at a local minimum, and not up against the edge
# of the search space.
# NOTE(review): no grid search is executed any more (see the commented-out
# tuned_parameters below); LogisticRegression, GridSearchCV and
# DecisionTreeClassifier are imported but only referenced from comments.
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
from gensim.corpora import Dictionary
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

# gensim dictionary built from the token lists of every product
my_dict = Dictionary(train_text)

#tuned_parameters = [{'bow2Top__this_dict': [my_dict],
#                     'bow2Top__d': [200, 300, 400],
#                     'clf__base_estimator': [LogisticRegression(), DecisionTreeClassifier(max_depth=1)],
#                     'clf__n': [25, 50, 100]}]

# Original plan: manually update the search space to be centered on the
# local minimum found above, under the assumption that the loss function
# is smooth over hyperparameters (not assured, or even likely, but the
# best we can do), then run it on twice as much data, twice, averaging
# the results with a bagging classifier, compute CV metrics, and get a
# final model.
# NOTE(review): the code below does not build a bagging classifier; it
# cross-validates a single AdaBoost pipeline with default settings.

# project transformers from semanticClassifiers; presumably doc2Bow maps
# token lists to bag-of-words and docTopTransformer maps bag-of-words to
# topic space -- confirm against the library
prod2Bow = doc2Bow()
bow2Top = docTopTransformer(this_dict=my_dict)

#baseClf = LogisticRegression()
# NOTE(review): no random_state is set, so repeated runs may not reproduce
# the recorded scores exactly
boostedClf = AdaBoostClassifier()
# topic features -> normalisation -> boosted classifier
estimators = [('bow2Top', bow2Top), ('norm', Normalizer()), ('clf', boostedClf)]
semClf = Pipeline(estimators)

# The bag-of-words step is kept as its own outer stage. It was split out
# so that a grid search over semClf would not repeat it; no grid search is
# run in this cell.
finalClf = Pipeline([('prod2Bow', prod2Bow), ('semClf', semClf)])

# cross validate the whole procedure: 5 folds, 5 parallel jobs; sum_predict
# holds one cross-validated prediction per product
t0 = time.time()
sum_predict = cross_val_predict(finalClf, train_text, train_lbls, cv=5, n_jobs=5)
t1 = time.time()
# elapsed wall-clock seconds
print(t1-t0)


388.49509286880493


In [13]:
from sklearn.metrics import classification_report

# scores of the cross-validated predictions against the true labels
report = classification_report(train_lbls, sum_predict)
print('Summary Text Prediction')
print(report)

# Chance-level baseline: the same predictions in shuffled order, which
# keeps the share of each predicted class but breaks the pairing between
# prediction and product. A fixed seed makes the baseline reproducible
# across runs and leaves the global numpy random state untouched.
null_rng = np.random.RandomState(0)
report = classification_report(train_lbls, null_rng.permutation(sum_predict))
print('Null Prediction')
print(report)

Summary Text Prediction
              precision    recall  f1-score   support

       False       0.72      0.83      0.77     43075
        True       0.59      0.43      0.50     24263

    accuracy                           0.69     67338
   macro avg       0.66      0.63      0.64     67338
weighted avg       0.68      0.69      0.67     67338

Null Prediction
              precision    recall  f1-score   support

       False       0.64      0.74      0.68     43075
        True       0.36      0.26      0.30     24263

    accuracy                           0.56     67338
   macro avg       0.50      0.50      0.49     67338
weighted avg       0.54      0.56      0.55     67338

