In [1]:
import time
start_time = time.time()

import json

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

import numpy as np

import sys
sys.path.append("../libraries/")
from selector import split_data

In [2]:
# The training file is read as JSON Lines: one JSON object per line.
# Upper bound on the number of reviews to load; 16000 keeps prototyping fast.
# Set to None to load the whole training file.
MAX_TRAIN_REVIEWS = 16000

json_dat = []
with open('../data/Sports_and_Outdoors_Reviews_training.json', 'r', encoding='utf-8') as fp:
    # Stream the file and stop at the cap, rather than reading and decoding
    # every line only to throw away everything past the first MAX_TRAIN_REVIEWS.
    for line_no, line in enumerate(fp):
        if MAX_TRAIN_REVIEWS is not None and line_no >= MAX_TRAIN_REVIEWS:
            break
        json_dat.append(json.loads(line))

In [3]:
# Split the parsed records into three parallel lists, so that position k in
# each list refers to the same review. dict.get yields None for a record that
# lacks the requested key.
doc_list = [review.get('reviewText') for review in json_dat]
rating = [review.get('overall') for review in json_dat]
prod_id = [review.get('asin') for review in json_dat]

In [4]:
def preprocess_data(doc_set):
    """
    Input  : document list (iterable of raw review strings; a falsy entry such
             as None or '' is treated as a single space)
    Purpose: preprocess text (lowercase, tokenize into word-character runs,
             append the 'null__' bias token, remove English stop words, and
             Porter-stem what is left)
    Output : list holding one list of stemmed tokens per input document
    """
    word_tokenizer = RegexpTokenizer(r'\w+')
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    def _clean(doc):
        # Substitute a blank for missing/empty text so .lower() always has a string.
        tokens = word_tokenizer.tokenize((doc or ' ').lower())
        # Bias term: acts as a kind of prior and guarantees that even an empty
        # review contributes one token.
        tokens.append('null__')
        # Filter stop words first, then stem only the surviving tokens.
        return [stemmer.stem(tok) for tok in tokens if tok not in stop_words]

    return [_clean(doc) for doc in doc_set]

In [5]:
# Tokenize, stop-word filter and stem the training reviews; the result holds
# one list of stemmed tokens per entry of doc_list.
train_text=preprocess_data(doc_list)

In [6]:
# this is the code we used for Deliverable #3, however there are
# better ways to implement what this does using pipelines and
# sklearn transformers. I've updated the other scripts to 
# use that approach, but am leaving this here as a record

from sklearn.utils.validation import check_is_fitted

from gensim.models import LsiModel as lsi
from gensim.corpora import Dictionary
from gensim import matutils

from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

class semantic_SVM(SVC):
    """
    Support-vector classifier that operates on LSI topic features of reviews.

    ``fit`` builds a gensim ``Dictionary`` and a 300-topic ``LsiModel`` from the
    training reviews, projects those reviews into the topic space and trains
    the inherited ``SVC`` on the dense document x topic array. ``predict``
    projects new reviews through the same dictionary and LSI model and then
    delegates to ``SVC.predict``.

    Reviews must be a list of reviews, where each review is itself a 'cleaned'
    list of words (lemmatized, no stop words, etc).
    """

    # Placeholders only; fit() replaces both on the instance.
    semSpace = None   # LSI model built by fit()
    this_dict = None  # gensim Dictionary built by fit() from the training reviews

    def _to_bow(self, reviews):
        """Convert tokenized reviews to bag-of-words vectors using the fitted dictionary."""
        return [self.this_dict.doc2bow(text) for text in reviews]

    def _to_topic_array(self, bow_corpus):
        """Project a bag-of-words corpus into LSI space as a dense doc x topic array."""
        # Apply the semantic model to the bag of words (fast)
        topic_corpus = self.semSpace[bow_corpus]

        # Convert from TransformedCorpus datatype to a numpy doc x topic array.
        # num_terms pins the number of topic rows: without it corpus2csc infers
        # the size from the largest topic index present in this particular
        # corpus, so the feature width seen by predict() could differ from the
        # width the SVC was trained on.
        topics_csc = matutils.corpus2csc(topic_corpus, num_terms=self.semSpace.num_topics)
        return topics_csc.T.toarray()

    def fit(self, train_reviews, train_lbls):
        """
        Build the dictionary and LSI space from ``train_reviews`` and train the SVC.

        train_reviews : list of tokenized reviews (each a list of words)
        train_lbls    : labels, one per review; should be a boolean array
        Returns whatever ``SVC.fit`` returns.
        """
        # train a document-topic model
        self.this_dict = Dictionary(train_reviews)

        # transform corpus (train) into a 2d array word counts (a 'bag of words')
        bow_corpus = self._to_bow(train_reviews)

        # construct a semantic model based on document-topic similarity (15-20 min for 1500k reviews?)
        self.semSpace = lsi(bow_corpus, id2word=self.this_dict, num_topics=300, chunksize=100000, distributed=False)

        feat_train_np = self._to_topic_array(bow_corpus)

        # fit the support-vector classifier to the training topic features
        return super().fit(feat_train_np, train_lbls)

    def predict(self, test_reviews):
        """
        Predict labels for tokenized ``test_reviews`` using the fitted LSI space and SVC.

        Raises through ``check_is_fitted`` when called before ``fit``.
        """
        check_is_fitted(self)

        # Apply semantic model to test set
        feat_test_np = self._to_topic_array(self._to_bow(test_reviews))

        return super().predict(feat_test_np)

In [7]:
# Boolean target: True where a review's overall rating is at least 4.5.
train_lbls = np.asarray(rating) >= 4.5

# fit() returns the fitted estimator, so construction and training are chained.
clf = semantic_SVM().fit(train_text, train_lbls)

In [8]:
from sklearn.model_selection import GroupKFold, cross_val_predict

# Build 10 folds grouped by product id (prod_id), so the reviews of a single
# product are kept together rather than spread across folds.
gkf = GroupKFold(n_splits=10)
cv = gkf.split(train_text, train_lbls, groups=prod_id)

# Out-of-fold prediction for every training review, using up to 10 parallel jobs.
predict = cross_val_predict(
    estimator=clf,
    X=train_text,
    y=train_lbls,
    cv=cv,
    n_jobs=10,
)

In [9]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the cross-validated predictions
# against the training labels.
report = classification_report(y_true=train_lbls, y_pred=predict)
print('Summary Text Prediction', report, sep='\n')

Summary Text Prediction
              precision    recall  f1-score   support

       False       0.71      0.38      0.50      5313
        True       0.75      0.92      0.83     10687

    accuracy                           0.74     16000
   macro avg       0.73      0.65      0.66     16000
weighted avg       0.74      0.74      0.72     16000



In [14]:
# import and prepare test data (read as JSON Lines: one JSON object per line)
with open('../data/Sports_and_Outdoors_Reviews_test.json', 'r', encoding='utf-8') as fp:
    json_dat = [json.loads(line) for line in fp]

# NOTE: json_dat and doc_list are rebound to the test set here, replacing the
# training data held under the same names by the earlier cells.
doc_list = []
asin = []
test_reviewer_id = []
test_unixreviewtime = []
for review in json_dat:
    # dict.get yields None when a record lacks the key
    doc_list.append(review.get('reviewText'))
    asin.append(review.get('asin'))
    test_reviewer_id.append(review.get('reviewerID'))
    test_unixreviewtime.append(review.get('unixReviewTime'))

# Same preprocessing as the training reviews, so the fitted model sees
# tokens produced the same way.
test_text = preprocess_data(doc_list)

In [15]:
# get model predictions for test data
pred_lbls = clf.predict(test_text)

In [16]:
# Assemble one output row per test review, with columns in this order:
# asin, reviewerID, predicted label cast to int, unixReviewTime.
# NOTE(review): the columns mix string ids with integers, so numpy presumably
# coerces the whole array to a string dtype — confirm if numeric columns are needed.
dat = np.column_stack((asin, test_reviewer_id, pred_lbls.astype(int), test_unixreviewtime))

In [17]:
# Write the predictions as CSV. comments='' is required because np.savetxt
# otherwise prepends '# ' to the header line, which would turn the first
# column name into '# asin' for any CSV reader.
np.savetxt("Sports_and_Outdoors_Ratings_test.csv", dat, delimiter=",", fmt=['%s','%s', '%s', '%s'],
           header='asin,reviewerID,awesomeReview,unixReviewTime', comments='')