This script estimates the performance of naive bayes classification in a document-topic model's feature space

In [1]:
# Enable IPython's autoreload extension in mode 2 so edited modules are
# re-imported without restarting the kernel.
%load_ext autoreload
%autoreload 2
import time
start_time = time.time()  # timestamp taken when this first cell is executed

import json
import numpy as np
import csv

import sys
# Add the sibling libraries directory to the import path; the selector module
# imported on the next line is presumably located there.
sys.path.append("../libraries/")
from selector import split_data

In [2]:
def combine_reviews(review_text, asins, uniq_asins, ratings):
    """Merge per-review lists by product label and average their ratings.

    Parameters
    ----------
    review_text : sequence of lists
        One list per review (e.g. tokens or reviewer IDs).
    asins : sequence
        Product label of every review, aligned with ``review_text``.
    uniq_asins : iterable
        Labels to produce output for.  Typically ``np.unique(asins)``, but a
        subset may be given so that different engines can process different
        partitions in parallel.
    ratings : sequence of numbers
        Rating of every review, aligned with ``review_text``.

    Returns
    -------
    tuple(list, list)
        ``(product_text, avg_ratings)``: for each label in ``uniq_asins``, in
        order, the concatenation of all matching review lists and the mean of
        the matching ratings.
    """
    # arrays allow selecting all reviews of one product with a single index
    text_arr = np.array(review_text, dtype=object)
    rating_arr = np.array(ratings)
    label_arr = np.array(asins)

    merged_text, mean_ratings = [], []
    for label in uniq_asins:
        hits = np.where(label == label_arr)
        pieces = list(text_arr[hits])
        merged_text.append(np.concatenate(pieces))
        mean_ratings.append(np.mean(rating_arr[hits]))

    return (merged_text, mean_ratings)

In [3]:
# Read the training review JSON through the project helper split_data.
# The second argument (80) presumably requests an 80/20 split -- confirm
# against the selector module. Only the first part (json_dat) is used below;
# val_dat is not used at all.
json_dat, val_dat = split_data('../data/Sports_and_Outdoors_Reviews_training.json', 80)
# Uncomment to work on the first 4000 reviews only:
#json_dat = json_dat[:4000]

In [8]:
# Pull the per-review fields out of json_dat into parallel lists, one entry
# per review. summary only holds an empty placeholder list for every review.
rating = [review_rec.get('overall') for review_rec in json_dat]
summary = [[] for _ in json_dat]
reviewerID = [review_rec.get('reviewerID') for review_rec in json_dat]
prod_id = [review_rec.get('asin') for review_rec in json_dat]

In [16]:
# takes ~96 CPU minutes

# Merge the reviewer IDs of every product into one array per product and
# average each product's ratings, with the work spread over ipyparallel engines.
# This cell runs things in parallel: make sure to start an
# ipython cluster from the notebook dashboard's IPython Cluster
# tab before running.
import ipyparallel as ipp

rc = ipp.Client()
dview = rc[:]
# Run the imports on the engines. combine_reviews as defined in this notebook
# only references np; the nltk names are not used by it.
dview.execute('from nltk.tokenize import RegexpTokenizer;' +
              'from nltk.corpus import stopwords; ' + 
              'from nltk.stem.porter import PorterStemmer;' +
              'import numpy as np;')

t0 = time.time()


# Send the complete review data and the function to the engines. Each reviewer
# ID is wrapped in a one-element list so combine_reviews can concatenate the
# IDs of all reviews of a product.
dview.push(dict(reviewerID=[[rid] for rid in reviewerID], combine_reviews=combine_reviews,
               rating=rating, prod_id=prod_id))
# The notebook keeps the full uniq_prod_id array; the engines get partitions.
uniq_prod_id = np.unique(prod_id)
dview.scatter('uniq_prod_id', uniq_prod_id) # partition target labels

# Each engine calls combine_reviews with its own uniq_prod_id partition ...
%px total_text, avg_ratings = combine_reviews(reviewerID, prod_id, uniq_prod_id, rating)
# ... and the per-engine results are gathered back into the notebook.
# NOTE(review): downstream cells assume the gathered order matches
# uniq_prod_id -- confirm against the ipyparallel scatter/gather docs.
total_text = dview.gather('total_text').get()
avg_ratings = dview.gather('avg_ratings').get()



t1 = time.time()

# elapsed wall-clock seconds for the parallel merge
print(t1-t0)

378.04977083206177


In [174]:
#parallel_avg_ratings = avg_ratings
#parallel_total_text = total_text

In [176]:
# uncomment this box to run serially

#uniq_prod_id = np.unique(prod_id)
#total_text, avg_ratings = combine_reviews([[rid] for rid in reviewerID], prod_id, uniq_prod_id, rating)

In [None]:
#print(parallel_avg_ratings == avg_ratings)
#all([all(parallel_total_text[i] == text) for i, text in enumerate(total_text)])

In [None]:
import pandas as pd

t0 = time.time()

# Load the per-person rating table and extract the three columns used below
# as numpy arrays (one entry per rating).
with open('../data/Sports_and_Outdoors_Ratings_training.csv') as file:
    reader = pd.read_csv(file, delimiter=',')
    prsn_rating = np.array(reader['overall'].tolist())
    prsn_id = np.array(reader['reviewerID'].tolist())
    prsn_asin = np.array(reader['asin'].tolist())

t1 = time.time()
print(t1-t0)

# only retain ratings for products not in training json
# crude way of ensuring we're not double dipping when testing the
# reviewer model on training dataset product ratings.
# np.isin checks the whole array in one call; the previous per-row
# `pid not in uniq_prod_id` scanned uniq_prod_id once for every rating.
idx = np.isin(prsn_asin, uniq_prod_id, invert=True)
prsn_rating = prsn_rating[idx]
prsn_id = prsn_id[idx]
prsn_asin = prsn_asin[idx]

# number of ratings left after the filter (displayed as the cell output)
len(prsn_rating)

In [None]:
t0 = time.time()


def _mean_rating_by_person(person_ids, ratings):
    """Return {person id: mean of that person's ratings}.

    person_ids and ratings are aligned 1-D numpy arrays. The ratings are
    grouped with a single stable sort instead of one full boolean-mask scan
    per unique person. A stable sort keeps each person's ratings in their
    original order, so every group passed to np.mean holds the same values
    in the same order as ratings[person == person_ids] would.
    """
    order = np.argsort(person_ids, kind='stable')
    # in the sorted ids, the first occurrence of each id is where its group starts
    uniq_ids, group_start = np.unique(person_ids[order], return_index=True)
    rating_groups = np.split(ratings[order], group_start[1:])
    return {person: np.mean(grp) for person, grp in zip(uniq_ids, rating_groups)}


# profile maps every reviewer in prsn_id to the mean of their ratings
profile = _mean_rating_by_person(prsn_id, prsn_rating)

t1 = time.time()
print(t1-t0)

In [None]:
def mkProfile(prsn_ratings, prsn_ids, uniq_prsn_id):
    """Average the ratings of each requested person.

    Parameters
    ----------
    prsn_ratings : numpy array of length n
        One rating per entry.
    prsn_ids : numpy array of length n
        Person ID of every rating, aligned with ``prsn_ratings``.
    uniq_prsn_id : iterable of length m (n >= m)
        The person IDs for whom an average rating is wanted.

    Returns
    -------
    tuple(list, iterable)
        The list of mean ratings, in the order of ``uniq_prsn_id``, and
        ``uniq_prsn_id`` itself (returned unchanged so the averages can be
        paired with their IDs after a parallel gather).
    """
    avg_rating = [
        np.mean(prsn_ratings[person == prsn_ids]) for person in uniq_prsn_id
    ]
    return avg_rating, uniq_prsn_id
    
# Build the reviewer profile (person id -> mean rating) on the ipyparallel
# engines. ipp is the alias imported in the earlier parallel cell, so that
# cell must have been run first.
uniq_prsn_id = np.unique(prsn_id)

rc = ipp.Client()
dview = rc[:]
# mkProfile references np, so numpy has to be imported on the engines
dview.execute('import numpy as np;')

# NOTE(review): t0 is recorded here but this cell never prints an elapsed time.
t0 = time.time()


# send the full rating data and the function to the engines
dview.push(dict(prsn_rating=prsn_rating, prsn_id=prsn_id, mkProfile=mkProfile))
dview.scatter('uniq_prsn_id', uniq_prsn_id) # partition target labels
# each engine averages the ratings of the persons in its own partition
%px avg_rating, uniq_prsn_id = mkProfile(prsn_rating, prsn_id, uniq_prsn_id)

# pair the gathered person ids with the gathered averages
profile = dict(zip(dview.gather('uniq_prsn_id').get(), dview.gather('avg_rating').get()))

In [199]:
import gc

# Release the large intermediates that are no longer needed. The names are
# removed only if they exist: `review` is never assigned in this notebook, so
# a plain `del review` raises NameError after Restart & Run All, and this cell
# could not be run a second time.
for stale_name in ('summary', 'review', 'json_dat', 'val_dat'):
    globals().pop(stale_name, None)
del stale_name

# gc.collect() returns the number of unreachable objects found (cell output)
gc.collect()

3866623

In [None]:
# Predict each product's rating as the mean profile value (expected rating)
# of the people who reviewed it. Reviewers without a profile entry are
# skipped; a product with no profiled reviewer gets np.mean([]), i.e. nan.
pred_rating = [
    np.mean([profile[rid] for rid in text if rid in profile])
    for text in total_text
]

In [None]:
# Ground truth: a product is a positive example when its observed average
# rating is at least 4.5.
train_text = total_text
train_lbls = np.array(avg_ratings) >= 4.5

# Turn the predictions into an array and flag the finite entries, so that
# products whose prediction is nan/inf can be excluded from the evaluation.
pred_rating = np.array(pred_rating)
idx = np.isfinite(pred_rating)

In [None]:
from sklearn.metrics import classification_report

# Score only the products with a finite prediction (idx). The prediction is
# binarised with the same rule that defines train_lbls (rating >= 4.5); it
# previously used a strict `>`, so a predicted rating of exactly 4.5 was
# classified differently from a true average of exactly 4.5.
report = classification_report(train_lbls[idx], pred_rating[idx] >= 4.5)
print('True recall is sensitivity, false recall is specificity')
print(report)