This script estimates the performance of SVM classification (the `semantic_SVM` classifier) in a document-topic model's feature space

In [1]:
# Start the wall-clock timer first so the elapsed time printed by the last
# cell (end_time - start_time) also covers the imports below.
import time
start_time = time.time()

import json

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

import numpy as np

# Extend the module search path before importing the project-local helpers;
# presumably selector and semanticClassifiers live in ../libraries/ (relative
# to the notebook's working directory) -- verify if the import fails.
import sys
sys.path.append("../libraries/")
from selector import split_data
from semanticClassifiers import semantic_SVM

In [2]:
# Load the reviews through split_data(path, 80); this notebook describes the
# call as an 80/20 split (TODO confirm against selector.split_data).
# Only the first returned part is used below; val_dat is never touched.
json_dat, val_dat = split_data('../data/Sports_and_Outdoors_Reviews_training.json', 80)
# Prototype on the first 4000 reviews only; remove this line for a full run.
json_dat = json_dat[:4000]

In [3]:
# Collect the text, star rating and product id of every review that has text.
# The three lists stay index-aligned: entry k of each describes the same review.
doc_list = []
rating = []
prod_id = []
for review in json_dat:
    review_text = review.get('reviewText')
    if not review_text:  # not all reviews have text; skip those entirely
        continue
    rating.append(review.get('overall'))
    doc_list.append(review_text)
    prod_id.append(review.get('asin'))

In [4]:
def preprocess_data(doc_set):
    """
    Input  : iterable of document strings
    Purpose: lower-case each document, split it into word-character tokens,
             drop English stop words, then Porter-stem what is left
    Output : list holding one list of stemmed tokens per input document,
             in the same order as the input
    """
    # Build the three helpers once; they are shared by every document.
    word_tokenizer = RegexpTokenizer(r'\w+')
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    def _clean(document):
        # Stop words are filtered on the raw (unstemmed) token, then the
        # surviving tokens are stemmed.
        words = word_tokenizer.tokenize(document.lower())
        return [stemmer.stem(word) for word in words if word not in stop_words]

    return [_clean(document) for document in doc_set]

In [5]:
# One list of stemmed, stop-word-free tokens per review, aligned with doc_list.
clean_text = preprocess_data(doc_list)

In [6]:
# estimate classifier accuracy
from sklearn.model_selection import GroupKFold, cross_val_predict

# Binary target: True when the star rating is at least 4.5.
train_lbls = np.array(rating) >= 4.5
train_text = clean_text

# GroupKFold gives you a KFold partitioner that abides by
# product labels so that products are only ever in a single
# fold
gkf = GroupKFold(n_splits=5)
# split() returns a one-shot generator; materialise it into a list so `cv`
# can still be inspected or reused after cross_val_predict has consumed it.
cv = list(gkf.split(train_text, train_lbls, groups=prod_id))

clf = semantic_SVM()
# cross_val_predict returns one out-of-fold prediction per review (each made
# by a model fitted on the other folds), not one score per fold.
pred_lbls = cross_val_predict(clf, train_text, train_lbls, cv=cv)

In [7]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the cross-validated predictions.
report = classification_report(train_lbls, pred_lbls)
print('True recall is sensitivity, false recall is specificity', report, sep='\n')

# Accuracy pooled over all out-of-fold predictions. This is not exactly the
# mean of the per-fold accuracies, but it is close and much cheaper to compute.
error_rate = np.mean(pred_lbls != train_lbls)
acc = 1 - error_rate
print("Accuracy: %0.3f" % acc)

True recall is sensitivity, false recall is specificity
              precision    recall  f1-score   support

       False       0.68      0.24      0.35      1252
        True       0.73      0.95      0.83      2747

    accuracy                           0.73      3999
   macro avg       0.71      0.59      0.59      3999
weighted avg       0.72      0.73      0.68      3999

Accuracy: 0.726


In [8]:
# Wall-clock seconds elapsed since start_time was taken in the first cell.
end_time = time.time()
print(end_time - start_time)

71.19845271110535
