In [7]:
import time
start_time = time.time()

import json

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

import numpy as np

import sys
sys.path.append("../libraries/")
from selector import split_data

In [8]:
def preprocess_data(doc_set):
    """Tokenize, stop-word filter and stem a collection of text fields.

    Takes a list of text fields (e.g. the summary or reviewText fields) and
    returns a list of token lists, one list per review. Falsy entries
    (``None`` or an empty string) are treated as blank text, so they still
    produce a list that carries the bias token.
    """
    word_tokenizer = RegexpTokenizer(r'\w+')          # regex tokenizer
    english_stop_words = set(stopwords.words('english'))  # English stop words
    stemmer = PorterStemmer()

    def clean(document):
        # missing/empty reviews are handled as a single blank
        raw = (document or ' ').lower()
        tokens = word_tokenizer.tokenize(raw)
        # add a bias term, will work as a kind of prior, important for empty reviews
        tokens.append('null__')
        # drop stop words, then stem what is left
        return [stemmer.stem(tok) for tok in tokens if tok not in english_stop_words]

    return [clean(document) for document in doc_set]

def combine_reviews(text, asins):
    """Merge the token lists of consecutive reviews that share a product id.

    Only adjacent entries with the same id are merged, so ``asins`` (and the
    matching ``text``) must already be grouped/sorted by product id for every
    product to end up in exactly one output entry.

    Parameters
    ----------
    text : sequence of lists
        One token list per review, aligned index-by-index with ``asins``.
    asins : sequence
        Product id of each review.

    Returns
    -------
    (combined_text, products) : tuple of two lists
        ``combined_text[k]`` is the concatenation of the token lists in the
        k-th run of equal ids and ``products[k]`` is that id. Both lists are
        empty when no reviews are given. The input lists are never modified
        and never shared with the result.
    """
    products = []
    combined_text = []
    for idx in range(len(asins)):
        if products and asins[idx] == products[-1]:
            # same product as the previous review: grow its list in place
            # (avoids re-copying the accumulated tokens on every merge)
            combined_text[-1].extend(text[idx])
        else:
            # first review of a new run: start from a copy so the caller's
            # list is not aliased by the result
            products.append(asins[idx])
            combined_text.append(list(text[idx]))
    return (combined_text, products)

In [9]:
import pickle

# Restore the classifier object stored in 'logisticRegression.clf'
# (presumably the model fitted in a separate training notebook).
# NOTE: unpickling can execute arbitrary code, so only load model files
# that come from a trusted source.
with open('logisticRegression.clf', 'rb') as model_file:
    clf = pickle.load(model_file)

In [18]:
import json

# import and prepare test data
# The file holds one JSON object per line. Read it as UTF-8 explicitly so the
# result does not depend on the platform's default encoding, stream the lines
# instead of materialising them with readlines(), and skip blank lines
# (e.g. a trailing newline) instead of failing inside json.loads.
with open('../data/Sports_and_Outdoors_Reviews_test.json', 'r', encoding='utf-8') as fp:
    json_dat = [json.loads(line) for line in fp if line.strip()]

# Sort by product id so that all reviews of one product are adjacent;
# combine_reviews only merges neighbouring entries.
json_dat = sorted(json_dat, key=lambda k: k['asin'])

# Per-field lists aligned with the sorted records; a missing field becomes None.
doc_list = [review.get('summary') for review in json_dat]
asin = [review.get('asin') for review in json_dat]
test_reviewer_id = [review.get('reviewerID') for review in json_dat]
test_unixreviewtime = [review.get('unixReviewTime') for review in json_dat]

In [19]:
# takes ~96 CPU minutes

# this cell runs things in parallel. make sure to start an 
# ipython cluster from the notebook dashboard's IPython Cluster
# tab before running
import ipyparallel as ipp

# connect to the running cluster and take a view over all of its engines
rc = ipp.Client()
dview = rc[:]
# import, on every engine, the names that preprocess_data refers to
# NOTE(review): execute/push/scatter are not waited on here; presumably the
# engines process requests in submission order -- confirm
dview.execute('from nltk.tokenize import RegexpTokenizer;' +
              'from nltk.corpus import stopwords; ' + 
              'from nltk.stem.porter import PorterStemmer;' +
              'import numpy as np;')


# clean text
# send the locally defined function to the engines under the same name
dview.push(dict(preprocess_data=preprocess_data))
dview.scatter('doc_list', doc_list) # partitions data

# each engine preprocesses its own slice of doc_list
%px cleaned_reviews = preprocess_data(doc_list)
# pull the per-engine results back into one local list
# assumes gather returns the pieces in the order scatter split them, so
# cleaned_reviews stays aligned with asin -- TODO confirm
cleaned_reviews = dview.gather('cleaned_reviews').get()

# combine text
# merges adjacent reviews with the same asin; this relies on json_dat having
# been sorted by asin when the test data was loaded
total_text, uniq_prod_id = combine_reviews(cleaned_reviews, asin)

In [20]:
# get model predictions for test data
# total_text holds one combined token list per product (see combine_reviews)
# NOTE(review): presumably clf is a pipeline that vectorizes token lists
# itself and returns one label per entry of total_text -- confirm
pred_lbls = clf.predict(total_text)

In [30]:
# pair each product id with its predicted label (cast to int) as two columns
# NOTE(review): stacking string ids with integers presumably yields an
# all-string array, which is why both columns are later written with '%s' -- confirm
dat = np.column_stack((uniq_prod_id, pred_lbls.astype(int)))

In [31]:
# Write the predictions as CSV with a plain header row.
# comments='' stops np.savetxt from prefixing the header with its default
# '# ' marker, which would otherwise make the first column name '# asin'.
np.savetxt("Sports_and_Outdoors_Ratings_test.csv", dat, delimiter=",", fmt=['%s', '%s'],
           header='asin,awesomeReview', comments='')

In [54]:
# Spot-check: predictions 11..19 next to the product ids at the same positions.
# np.unique returns the ids in sorted order; presumably this matches the order
# of uniq_prod_id because the reviews were sorted by asin -- verify.
print(pred_lbls[11:20])
uniq_asin = np.unique(asin)
print(uniq_asin[11:20])
# show (asin, summary) pairs for a slice of the individual reviews
np.vstack((asin[350:400], doc_list[350:400])).T

[False  True False  True  True  True False False  True]
['002230FD1C13F1EE3B7A487E6C505B67' '00228A1FECFB78004B66CF6559E3E979'
 '0023FE6BED9B69EB14ECC494FA3F88BF' '00244A1D92CA0B9D9C5C846C96C36E5E'
 '002EB10683D96CCC7A27DC7D1190C772' '0030C44CE7D39FB47268DFFB1D0A9514'
 '0032EA58744E6633A5E822DC6BD23E4B' '003E7AFEE9A641B36151F4B1E09F8B5F'
 '00402D5DB3A9832918645CB72A130E8D']


array([['00204B63156848D7B5AE05AC221D3B6F', 'Excelente'],
       ['00204B63156848D7B5AE05AC221D3B6F', 'Very good basic gloves'],
       ['00204B63156848D7B5AE05AC221D3B6F', 'Two Stars'],
       ['00204B63156848D7B5AE05AC221D3B6F', 'excelent'],
       ['00204B63156848D7B5AE05AC221D3B6F', 'Interesting...'],
       ['00204B63156848D7B5AE05AC221D3B6F', 'Great feel'],
       ['002230FD1C13F1EE3B7A487E6C505B67',
        "5 *'s on ease of use and effective pain relief!"],
       ['002230FD1C13F1EE3B7A487E6C505B67', "It's so convenient"],
       ['002230FD1C13F1EE3B7A487E6C505B67', 'Five Stars'],
       ['002230FD1C13F1EE3B7A487E6C505B67', 'Perfect'],
       ['002230FD1C13F1EE3B7A487E6C505B67', 'Five Stars'],
       ['002230FD1C13F1EE3B7A487E6C505B67', 'Three Stars'],
       ['00228A1FECFB78004B66CF6559E3E979',
        'Love this case for my Verizon white iPhone 4'],
       ['00228A1FECFB78004B66CF6559E3E979', 'Perfect Fit'],
       ['00228A1FECFB78004B66CF6559E3E979', 'Perfect'],
       ['002