This script estimates the performance of naive bayes classification in a document-topic model's feature space

In [1]:
%load_ext autoreload
%autoreload 2
import time
start_time = time.time()

import json
import numpy as np
import csv

import sys
sys.path.append("../libraries/")
from selector import split_data

In [330]:
def combine_reviews(text, asins, ratings):
    # this script uses an algorithm that requires sorted asins, so enforce it
    # introduces overhead if arrays are already sorted (as they are by default)
    # but makes for a more reliable function
    dat = zip(text, asins, ratings)
    dat = sorted(dat, key=lambda id: id[1]) 
    dat = [[i for i,j,k in dat], [j for i,j,k in dat], [k for i,j,k in dat]]
    text = dat[0]
    asins = dat[1]
    ratings = dat[2]
    
    products = [asins[0]]
    combined_text = [text[0]]
    average_rating = []
    total_rating = ratings[0]
    count = 1
    
    #combine all the summaries into a single text and avg the review ratings for each product
    for i in range(1, len(asins)):
        last_element_index = len(products) - 1
        if(asins[i] == products[last_element_index]):
            combined_text[last_element_index] = combined_text[last_element_index] + text[i]
            total_rating += ratings[i]
            count += 1
        else:
            average_rating.append(total_rating/count)
            products.append(asins[i])
            combined_text.append(text[i])
            total_rating = ratings[i]
            count = 1
    average_rating.append(total_rating/count)
    
    return (combined_text, products, average_rating)

# searches for first match to target in dat, beginning
# search at start_offset
# useful for searching sorted lists.
def linearSearch(dat, target, start_offset=0):
    for i in range(start_offset, len(dat)):
        if target == dat[i]:
            return i

# returns elements of list1 not in list2.
#
# list1 - n x 1 array (possibly of tuples)
# list2 - m x 1 array of labels
# col - col of list1 tuples to match to list 2
#
# returns: list1 without elements also in list2
def getUnique(list1, list2, col=0):
    list1 = sorted(list1, key=lambda id: id[col])
    list2 = sorted(np.unique(list2))
    list1_target = np.array([i[col] for i in list1])
    
    idx = 0
    nonunique = []
    for elem in list2:
        new_idx = linearSearch(list1_target, elem, idx)
        
        if new_idx:
            idx = new_idx
            while idx < len(list1_target) and list1_target[idx] == elem:
                nonunique.append(idx)
                idx += 1
    
    nonunique.reverse()
    [list1.pop(i) for i in nonunique]
    
    return list1
    

# takes n x 1 vectors of prsn_ratings and matching prsn_id,
# and an m x 1 (n >= m) vector of uniq_prsn_ids for whom we
# want to get average X. Does not preserve order.
# returns new uniq_lbls corresponding to order of avg_X
# O( n log(n) )
def avgByLbl(X, lbls):
    uniq_lbls = np.unique(lbls)
    
    # sort data for efficient averaging
    dat = sorted(list(zip(X,lbls)), key=lambda id: id[1])
    dat = [[i for i,j in dat], [j for i,j in dat]]
    X = dat[0]
    lbls = dat[1]
    
    uniq_lbls = sorted(uniq_lbls)
    
    t0 = time.time()
    # use an averaging algorithm that assumes sorted entries
    # for both search targets and search list.
    # this algorithm never traverses the same element of the
    # search list twice, but carries the overhead of a pre-
    # sorted target list and search list. Thankfully those
    # can use the O(n log(n)) python sort implementation
    avg_X = np.zeros(len(uniq_lbls))
    idx = 0
    for i,this_id in enumerate(uniq_lbls):
        idx = linearSearch(lbls, this_id, idx)
        n = 0.0
        while idx < len(lbls) and lbls[idx] == this_id:
            avg_X[i] += X[idx]
            n += 1.0
            idx += 1
        avg_X[i] /= n
            
    t1 = time.time()
    print(t1-t0)
    return avg_X, uniq_lbls

In [None]:
from sklearn.base import TransformerMixin, BaseEstimator

# this classifier will predict each rating based on whatever the subjects average rating is.
# wrap this in a GroupKFold cross validator to make predictions based on ratings on other products
# and then average over ratings for each product outside the CV loop to get product
# ratings
class subjProfileClf(BaseEstimator):
    # asumes X is tuple (prsn_id, prsn_rating, prod_id)
    def fit(self, X, y=None):
        prsn_id = [i[0] for i in X]
        prsn_rating = [i[1] for i in X]
        
        avg_rating, uniq_prsn_id = avgByLbl(prsn_rating, prsn_id)
        self.profile = dict(zip(uniq_prsn_id, avg_rating))
        
        return self
        
    def predict(self, X):
        prsn_id = [i[0] for i in X]
        prod_id = [i[2] for i in X]
        
        meanValue = np.mean(np.array(list(self.profile.values())))
        
        y = [meanValue]*len(prod_id)
        for i in range(0,len(prod_id)):
            y[i] = np.mean([self.profile[rid] for rid in prsn_id if prsn_id in profile])
        
        return y

In [3]:
# Read in testing data for 80/20 split
# we won't use val_dat at all
json_dat, val_dat = split_data('../data/Sports_and_Outdoors_Reviews_training.json', 80)
#json_dat = json_dat[:4000]

In [8]:
# create a list, doc_list, with one review per elem
reviewerID = []
summary = []
rating = []
prod_id = []
for i in range(0,len(json_dat)):
    rating.append(json_dat[i].get('overall'))
    summary.append([])
    reviewerID.append(json_dat[i].get('reviewerID'))
    prod_id.append(json_dat[i].get('asin'))

In [195]:
combined_text, products, average_rating = combine_reviews([[rid] for rid in reviewerID], prod_id, rating)

29.605073928833008


In [349]:
import pandas as pd

t0 = time.time()

prsn_asin = []
prsn_id = []
prsn_rating = []
with open('../data/Sports_and_Outdoors_Ratings_training.csv') as file:
    reader = pd.read_csv(file, delimiter=',')
    prsn_rating = np.array([item[1] for item in reader['overall'].items()])
    prsn_id = np.array([item[1] for item in reader['reviewerID'].items()])
    prsn_asin = np.array([item[1] for item in reader['asin'].items()])

t1 = time.time()
print(t1-t0)

20.309006452560425


In [None]:
t0 = time.time()
# only retain ratings for products not in training json
# crude way of ensuring we're not double dipping when testing the
# reviewer model on training dataset product ratings
dat = list(zip(prsn_asin, prsn_rating, prsn_id))
dat = getUnique(dat, products)
dat = [[i for i,j,k in dat], [j for i,j,k in dat], [k for i,j,k in dat]] 
prsn_asin = dat[0]
prsn_rating = dat[1]
prsn_id = dat[2]

t1 = time.time()
print(t1-t0)

len(prsn_rating)

In [320]:
t0 = time.time()

avg_rating, uniq_prsn_id = avgByLbl(prsn_rating, prsn_id)
profile = dict(zip(uniq_prsn_id,avg_rating))

t1 = time.time()
print(t1-t0)

7.928819417953491
32.49006271362305


In [None]:
t1 = time.time()
print(t1-t0)

import gc

del summary
del review
del json_dat
del val_dat

gc.collect()

In [321]:
# compute a predicted rating by averaging over the expected
# ratings of all people who reviewed the product.
pred_rating = []
for text in total_text:
    pred_rating.append(np.mean([profile[rid] for rid in text if rid in profile]))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [322]:
# vectorize training data
train_lbls = np.array(avg_ratings) >= 4.5
train_text = total_text

pred_rating = np.array(pred_rating)
idx = np.isfinite(pred_rating)

In [326]:
from sklearn.metrics import classification_report

report = classification_report(train_lbls[idx], pred_rating[idx] > 4.5)
print('True recall is sensitivity, false recall is specificity')
print(report)

True recall is sensitivity, false recall is specificity
              precision    recall  f1-score   support

       False       0.58      0.65      0.61     34937
        True       0.56      0.49      0.52     32301

    accuracy                           0.57     67238
   macro avg       0.57      0.57      0.57     67238
weighted avg       0.57      0.57      0.57     67238



In [327]:
from sklearn.metrics import classification_report

report = classification_report(train_lbls[idx], np.random.permutation(pred_rating[idx] > 4.5))
print('True recall is sensitivity, false recall is specificity')
print(report)

True recall is sensitivity, false recall is specificity
              precision    recall  f1-score   support

       False       0.52      0.59      0.55     34937
        True       0.48      0.42      0.45     32301

    accuracy                           0.50     67238
   macro avg       0.50      0.50      0.50     67238
weighted avg       0.50      0.50      0.50     67238

