In [17]:
import time
start_time = time.time()

import json

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

import numpy as np

import sys
sys.path.append("../libraries/")
from selector import split_data

In [18]:
#########################
### Utility Functions ###
#########################

def linearSearch(dat, target, start_offset=0):
    """Return the index of the first element of ``dat`` equal to ``target``.

    The scan begins at position ``start_offset`` and walks forward to the end
    of ``dat``. Being able to resume from an earlier hit is what makes this
    convenient on sorted lists.

    Returns ``None`` when no element from ``start_offset`` onward matches.
    """
    pos = start_offset
    end = len(dat)
    while pos < end:
        if target == dat[pos]:
            return pos
        pos += 1
    return None
    

def avgByLbl(X, lbls):
    """Average the observations in ``X`` within each distinct label.

    Parameters
    ----------
    X : sequence of n scalars, or of n equal-shaped arrays
        Observations to average (e.g. one rating per review).
    lbls : sequence of n labels
        Label of each observation (e.g. a product or reviewer id).
        Observations and labels are paired with ``zip``, so if the two
        inputs differ in length the longer one is truncated.

    Returns
    -------
    avg_X : np.ndarray
        Float array whose first axis has one entry per distinct label;
        ``avg_X[i]`` is the mean of the observations carrying
        ``uniq_lbls[i]``. Trailing axes match the shape of one observation.
    uniq_lbls : list
        Distinct labels in ascending order. The input order is not preserved.

    Notes
    -----
    No smoothing is applied; this is a plain arithmetic mean per label.
    """
    pairs = list(zip(X, lbls))
    if not pairs:
        # Nothing to average: empty result in the same types as the normal path.
        return np.zeros(0), []

    values = np.array([value for value, _ in pairs], dtype=float)
    labels = np.array([label for _, label in pairs])

    # uniq is sorted; inverse[k] is the position in uniq of labels[k].
    uniq, inverse, counts = np.unique(labels, return_inverse=True, return_counts=True)
    inverse = inverse.reshape(-1)

    # np.add.at accumulates unbuffered, so repeated indices are all summed
    # (a plain ``sums[inverse] += values`` would keep only one per index).
    sums = np.zeros((len(uniq),) + values.shape[1:])
    np.add.at(sums, inverse, values)

    # Reshape counts so it broadcasts over any trailing observation axes.
    avg_X = sums / counts.reshape((-1,) + (1,) * (values.ndim - 1))
    return avg_X, list(uniq)

def preprocess_data(doc_set):
    """Tokenize, stop-word filter and stem every text in ``doc_set``.

    ``doc_set`` is an iterable of text fields (e.g. the ``summary`` or
    ``reviewText`` of each review). The result is a list with one list of
    stemmed tokens per input text, in input order.
    """
    word_tokenizer = RegexpTokenizer(r'\w+')
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    def clean(doc):
        # Falsy entries (None, '') are replaced by a blank so they still tokenize.
        words = word_tokenizer.tokenize((doc or ' ').lower())
        # Bias term: acts as a kind of prior and matters for empty reviews.
        words.append('null__')
        # Drop stop words first, then stem whatever is left.
        return [stemmer.stem(word) for word in words if word not in stop_words]

    return [clean(doc) for doc in doc_set]

def combine_reviews(text, asins):
    """Merge consecutive reviews that share a product id.

    Parameters
    ----------
    text : sequence
        Per-review token lists (anything that supports ``+`` concatenation),
        aligned index-for-index with ``asins``.
    asins : sequence
        Product id of each review. Only *adjacent* equal ids are merged, so
        reviews must already be grouped (e.g. sorted) by product id.

    Returns
    -------
    (combined_text, products) : tuple of two lists
        ``products[k]`` is the id of the k-th run of equal ids and
        ``combined_text[k]`` is the concatenation of that run's texts.
        Both lists are empty when ``asins`` is empty.
    """
    # len() rather than truthiness so numpy arrays are accepted too.
    if len(asins) == 0:
        return ([], [])

    products = [asins[0]]
    combined_text = [text[0]]

    for i in range(1, len(asins)):
        if asins[i] == products[-1]:
            # ``+`` builds a new object, so the caller's lists are never mutated.
            combined_text[-1] = combined_text[-1] + text[i]
        else:
            products.append(asins[i])
            combined_text.append(text[i])

    return (combined_text, products)

In [19]:
# NOTE: linearSearch is already defined in the utility-function cell earlier
# in this notebook; this cell re-defines it with the same behavior.
def linearSearch(dat, target, start_offset=0):
    """Return the index of the first element of ``dat`` equal to ``target``.

    The scan begins at position ``start_offset`` and walks forward to the end
    of ``dat``, which makes it convenient for resuming a search on a sorted
    list. Returns ``None`` when nothing from ``start_offset`` onward matches.
    """
    pos = start_offset
    end = len(dat)
    while pos < end:
        if target == dat[pos]:
            return pos
        pos += 1
    return None

# NOTE: avgByLbl is also defined in the utility-function cell earlier in this
# notebook. Because this later definition is the one in effect once the cell
# runs, it accepts scalar observations as well as array-valued ones.
def avgByLbl(X, lbls):
    """Average the observations in ``X`` within each distinct label.

    Parameters
    ----------
    X : sequence of n scalars, or of n equal-shaped arrays
        Observations to average (e.g. one rating per review).
    lbls : sequence of n labels
        Label of each observation (e.g. a product or reviewer id).
        Observations and labels are paired with ``zip``, so if the two
        inputs differ in length the longer one is truncated.

    Returns
    -------
    avg_X : np.ndarray
        Float array whose first axis has one entry per distinct label;
        ``avg_X[i]`` is the mean of the observations carrying
        ``uniq_lbls[i]``. Trailing axes match the shape of one observation.
    uniq_lbls : list
        Distinct labels in ascending order. The input order is not preserved.

    Notes
    -----
    No smoothing is applied; this is a plain arithmetic mean per label.
    """
    pairs = list(zip(X, lbls))
    if not pairs:
        # Nothing to average: empty result in the same types as the normal path.
        return np.zeros(0), []

    values = np.array([value for value, _ in pairs], dtype=float)
    labels = np.array([label for _, label in pairs])

    # uniq is sorted; inverse[k] is the position in uniq of labels[k].
    uniq, inverse, counts = np.unique(labels, return_inverse=True, return_counts=True)
    inverse = inverse.reshape(-1)

    # np.add.at accumulates unbuffered, so repeated indices are all summed
    # (a plain ``sums[inverse] += values`` would keep only one per index).
    sums = np.zeros((len(uniq),) + values.shape[1:])
    np.add.at(sums, inverse, values)

    # Reshape counts so it broadcasts over any trailing observation axes.
    avg_X = sums / counts.reshape((-1,) + (1,) * (values.ndim - 1))
    return avg_X, list(uniq)

def getProdRatings(target_prod_id,
                   ratings_path='../data/Sports_and_Outdoors_Ratings_training.csv'):
    """Return the mean rating of each requested product, in request order.

    Parameters
    ----------
    target_prod_id : sequence of product ids
        Ids (``asin`` values) to look up. May be in any order and may
        contain repeats; the output lines up with it element by element.
    ratings_path : str, optional
        CSV file with at least an ``asin`` and an ``overall`` column, one
        row per individual rating. Defaults to the training ratings file.

    Returns
    -------
    np.ndarray
        Float array with ``result[i]`` equal to the mean ``overall`` value
        of all rows whose ``asin`` is ``target_prod_id[i]``.

    Raises
    ------
    KeyError
        If any requested id has no row in the ratings file.
    """
    import pandas as pd

    # Read ids as strings so numeric-looking ids keep leading zeros and
    # compare equal to string ids coming from elsewhere.
    ratings = pd.read_csv(ratings_path, delimiter=',',
                          usecols=['asin', 'overall'], dtype={'asin': str})
    mean_by_asin = ratings.groupby('asin')['overall'].mean()

    targets = pd.Index(target_prod_id)
    known = targets.isin(mean_by_asin.index)
    if not known.all():
        missing = targets[~known]
        raise KeyError('{} product id(s) have no ratings in {}; first few: {}'.format(
            len(missing), ratings_path, list(missing[:5])))

    # Direct lookup by id keeps the caller's order; no sort/un-sort needed.
    return mean_by_asin.reindex(targets).to_numpy(dtype=float)


In [20]:
import pickle
# Load the previously saved classifier object from the working directory
# (presumably a fitted logistic-regression model, judging by the file name).
# SECURITY: pickle.load can execute arbitrary code embedded in the file;
# only load a file that you produced yourself or otherwise trust.
with open('logisticRegression.clf', mode='rb') as f:
    clf = pickle.load(f)

In [21]:
# Load the review data and split it (split_data is called with 80, i.e. the
# 80/20 split). Only the second part, val_dat, is evaluated in this
# notebook, so the first part is released immediately.
json_dat, val_dat = split_data('../data/Sports_and_Outdoors_Reviews_training.json', 80)
del json_dat

# Order the reviews by product id so each product's reviews are adjacent.
val_dat = sorted(val_dat, key=lambda review: review['asin'])

# Pull the fields used downstream into parallel lists; a review that lacks
# one of the keys contributes None for that field.
doc_list = [review.get('summary') for review in val_dat]
rating = [review.get('overall') for review in val_dat]
asin = [review.get('asin') for review in val_dat]

In [22]:

import ipyparallel as ipp

# Connect to an ipyparallel cluster and take a view over its engines.
# NOTE(review): this presumably requires a cluster that is already running
# (e.g. started with `ipcluster start`) — confirm before Run All.
rc = ipp.Client()
dview = rc[:]
# Run the imports on the engines so the names used inside preprocess_data
# resolve in each engine's namespace.
dview.execute('from nltk.tokenize import RegexpTokenizer;' +
              'from nltk.corpus import stopwords; ' + 
              'from nltk.stem.porter import PorterStemmer;' +
              'import numpy as np;')


# clean text: ship the function to the engines, split doc_list across them,
# preprocess each piece remotely, then collect the pieces back locally.
dview.push(dict(preprocess_data=preprocess_data))
dview.scatter('doc_list', doc_list) # partitions data

# %px executes the statement on the engines, where doc_list is each
# engine's own partition.
%px cleaned_reviews = preprocess_data(doc_list)
# NOTE(review): the next step assumes gather returns the pieces in the
# original scatter order, so cleaned_reviews[i] still matches asin[i] — confirm.
cleaned_reviews = dview.gather('cleaned_reviews').get()

# combine text: merge the token lists of adjacent reviews that share an asin
# (asin is in sorted order because val_dat was sorted by it).
total_text, uniq_prod_id = combine_reviews(cleaned_reviews, asin)

In [76]:
# A product is labelled "awesome" when its mean rating is strictly above this.
AWESOME_RATING_THRESHOLD = 4.5

# get model predictions for test data (one entry per product in total_text)
pred_proba = clf.predict_proba(total_text)
clf_pred_lbl = clf.predict(total_text)

# Re-derive the label from the class probabilities.
# NOTE(review): assumes pred_proba has exactly two columns ordered
# (not awesome, awesome) — confirm against clf.classes_.
pred_lbl = [awesome > not_awesome for not_awesome, awesome in pred_proba]

# Sanity check: both ways of obtaining the label must agree.
assert all(np.array(clf_pred_lbl) == pred_lbl), \
    'labels derived from predict_proba disagree with clf.predict'

# Observed labels: mean rating of each product, thresholded.
prod_rating_true = getProdRatings(uniq_prod_id)
assert len(prod_rating_true) == len(pred_lbl), \
    'observed ratings and predictions must cover the same products'
obs_lbl = prod_rating_true > AWESOME_RATING_THRESHOLD

In [79]:
from sklearn.metrics import classification_report

# Chance-level baseline: score the observed labels against a shuffled copy
# of the model's predictions. Shuffling keeps the predicted class balance
# but breaks the pairing between prediction and product.
# The shuffle is seeded so this report is identical on every re-run.
shuffled_pred_lbl = np.random.RandomState(0).permutation(clf_pred_lbl)
report = classification_report(obs_lbl, shuffled_pred_lbl)
print('True recall is sensitivity, false recall is specificity')
print(report)

True recall is sensitivity, false recall is specificity
              precision    recall  f1-score   support

       False       0.66      0.51      0.58     10842
        True       0.34      0.49      0.40      5569

    accuracy                           0.51     16411
   macro avg       0.50      0.50      0.49     16411
weighted avg       0.55      0.51      0.52     16411



In [26]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the model's predicted labels against
# the observed labels.
report = classification_report(y_true=obs_lbl, y_pred=pred_lbl)
print('True recall is sensitivity, false recall is specificity', report, sep='\n')

True recall is sensitivity, false recall is specificity
              precision    recall  f1-score   support

       False       0.84      0.65      0.73     10842
        True       0.53      0.76      0.62      5569

    accuracy                           0.69     16411
   macro avg       0.68      0.70      0.68     16411
weighted avg       0.73      0.69      0.69     16411

Accuracy: 0.686
