This notebook estimates the cross-validated performance of a bagged linear SVM classifier in a document-topic model's feature space

In [1]:
%load_ext autoreload
%autoreload 2
import time
start_time = time.time()

import json

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

import numpy as np

import sys
sys.path.append("../libraries/")
from selector import split_data
from semanticClassifiers import docTopTransformer2, doc2Bow

In [2]:
# Read the review data and split it.
# NOTE(review): the file name says this is the *training* set (an earlier
# version of this comment said "testing"); the 80 is presumably the percentage
# kept in the first part of an 80/20 split -- confirm against selector.split_data.
# val_dat (the second part) is not used for anything in this notebook.
json_dat, val_dat = split_data('../data/Sports_and_Outdoors_Reviews_training.json', 80)

In [3]:
# Build four parallel lists with one entry per usable review: full review
# text, summary, star rating and product id (asin). Records that lack either
# a review text or a summary are skipped.
# (assumes json_dat is a sequence of dict-like records -- verify against split_data)
review, summary, rating, prod_id = [], [], [], []
for record in json_dat:
    review_text = record.get('reviewText')
    summary_text = record.get('summary')
    if not (review_text and summary_text):
        continue  # not all reviews have text
    rating.append(record.get('overall'))
    summary.append(summary_text)
    review.append(review_text)
    prod_id.append(record.get('asin'))

In [4]:
def preprocess_data(doc_set, asins, ratings):
    """
    Input  : doc_set - list of raw document strings
             asins   - product id of each document (parallel to doc_set)
             ratings - rating of each document (parallel to doc_set)
    Purpose: preprocess every document (tokenize, remove stop words, stem)
             with preprocess_datum, then hand the token lists together with
             asins and ratings to combine_reviews
    Output : tuple (combined_reviews, products, avg_ratings) as produced by
             combine_reviews
    """
    # tokenizer that keeps runs of word characters
    word_tokenizer = RegexpTokenizer(r'\w+')
    # English stop words, as a set for fast membership tests
    english_stopwords = set(stopwords.words('english'))
    porter = PorterStemmer()

    # one token list per input document, in input order
    token_lists = [
        preprocess_datum(document, word_tokenizer, porter, english_stopwords)
        for document in doc_set
    ]

    merged_text, product_ids, mean_ratings = combine_reviews(token_lists, asins, ratings)
    return (merged_text, product_ids, mean_ratings)

def preprocess_datum(doc, tk, st, sw):
    """
    Input  : doc - raw document string
             tk  - tokenizer object exposing tokenize(str)
             st  - stemmer object exposing stem(token)
             sw  - container of stop words (tested with `in`)
    Purpose: lower-case and tokenize doc, drop stop words, stem the rest
    Output : list of stemmed tokens in document order
    """
    lowered = doc.lower()
    # filter and stem in a single pass over the tokens
    return [st.stem(word) for word in tk.tokenize(lowered) if word not in sw]

def combine_reviews(text, asins, ratings):
    """
    Input  : text    - one entry per review (e.g. a list of tokens); entries
                       belonging to the same product are joined with `+`
             asins   - product id of each review (parallel to text); ids must
                       be hashable
             ratings - numeric rating of each review (parallel to text)
    Purpose: combine all the reviews of a product into a single text and
             average the review ratings for each product
    Output : tuple (combined_text, products, average_rating) of three parallel
             lists holding one entry per distinct product, ordered by the
             product's first appearance in asins

    Reviews are grouped by product id wherever they occur, so the input does
    not have to be sorted by product; for input that is already grouped the
    result is the same as merging adjacent runs. Empty input yields three
    empty lists instead of raising.
    """
    products = []
    combined_text = []
    rating_totals = []
    review_counts = []
    slot_of = {}  # product id -> position of that product in the lists above

    for i, asin in enumerate(asins):
        slot = slot_of.get(asin)
        if slot is None:
            # first review seen for this product: open a new slot
            slot_of[asin] = len(products)
            products.append(asin)
            combined_text.append(text[i])
            rating_totals.append(ratings[i])
            review_counts.append(1)
        else:
            # `+` builds a new object, so the caller's text entries are never mutated
            combined_text[slot] = combined_text[slot] + text[i]
            rating_totals[slot] += ratings[i]
            review_counts[slot] += 1

    average_rating = [total / n for total, n in zip(rating_totals, review_counts)]
    return (combined_text, products, average_rating)

In [5]:
# Preprocess the review *summaries* (the full review texts are not used here)
# and merge them per product together with the matching ids and ratings.
total_text, asins, avg_ratings = preprocess_data(summary, prod_id, rating)

In [6]:
# number of combined texts returned by preprocess_data
print(len(total_text))

67338


In [7]:
import gc

# Drop the references to the raw records and the per-review text lists, then
# force a garbage collection pass.
del summary, review, json_dat, val_dat

gc.collect()

40

In [8]:
from gensim.corpora import Dictionary
import gensim.matutils as matutils
# Import csc_matrix from the public scipy.sparse namespace; the
# scipy.sparse.csc submodule path is deprecated in recent SciPy releases.
from scipy.sparse import csc_matrix

# Build the training set: a product is a positive example when its average
# rating is at least 4.5; the inputs are the combined, preprocessed texts.
train_lbls = np.array(avg_ratings) >= 4.5
train_text = total_text

In [12]:
# estimate classifier accuracy
from sklearn.model_selection import cross_val_predict, KFold, GridSearchCV
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer, FunctionTransformer

# Plain, unshuffled 10-fold cross-validation over the combined texts.
# NOTE(review): this is KFold, not GroupKFold. That is only leak-free if every
# product occupies a single row of train_text -- confirm that the reviews were
# fully merged per product upstream.
kf = KFold(n_splits=10)
# Pass the splitter object itself rather than the one-shot generator returned
# by kf.split(); cross_val_predict calls split() on it and it stays reusable.
cv = kf

time0 = time.time()

# initialize a transformer mapping from bow to latent semantic features
doc2Top = docTopTransformer2()

# initialize a normalization transformer
norm_transformer = Normalizer()

# pick a classifier
baseClf = LinearSVC()

# Each of the n_estimators base classifiers is fitted on a 1/n_estimators
# subsample drawn without replacement.
# The base estimator is passed positionally because its keyword was renamed
# from `base_estimator` to `estimator` (scikit-learn 1.2; old name removed in
# 1.4); the positional form works on both sides of that change.
# random_state makes the subsampling (and therefore the reported scores)
# repeatable.
n_estimators=10
clf = BaggingClassifier(baseClf, n_estimators=n_estimators,
                        max_samples=1.0/n_estimators, bootstrap=False,
                        n_jobs=1, random_state=0)

# create a pipeline that transforms data to semantic space,
# and then classifies them by averaging over n_estimators of
# type baseClf
#
# Note, you could bag over n semantic models too by creating
# a pipeline using doc2Top and baseClf, and then passing that
# in as the base estimator of a BaggingClassifier instance.
# If you think bagging classification of reviews will lead to
# better classification performance for theoretical reasons,
# this would be the way to go; however, the purpose of bagging
# here is only to sidestep the slow SVM fitting procedure by
# fitting each SVM on a small subsample.
estimators = [('projection', doc2Top), ('normalization', norm_transformer), ('clf', clf)]
semClf = Pipeline(estimators)

# cross validate over the pipeline using k-fold CV; every sample gets a
# prediction from the model that did not see it during fitting
pred_lbls = cross_val_predict(semClf, train_text, train_lbls, cv=cv, n_jobs=10)
time1 = time.time()

# elapsed wall-clock seconds for setup + cross-validation
print(time1-time0)

297.3754200935364


In [13]:
from sklearn.metrics import classification_report

# Per-class precision / recall / f1 of the cross-validated predictions.
report = classification_report(train_lbls, pred_lbls)
# Reading aid for the table below: recall of the True class is the
# sensitivity, recall of the False class is the specificity.
print('True recall is sensitivity, false recall is specificity')
print(report)

# Accuracy pooled over all out-of-fold predictions (1 - misclassification rate).
# this is not exactly the same as the average of each CV folds accuracy, 
# but it's close and much faster to compute
acc = 1-np.mean(pred_lbls != train_lbls)
print("Accuracy: %0.3f" % acc)

True recall is sensitivity, false recall is specificity
              precision    recall  f1-score   support

       False       0.77      0.74      0.75     34994
        True       0.73      0.76      0.74     32344

    accuracy                           0.75     67338
   macro avg       0.75      0.75      0.75     67338
weighted avg       0.75      0.75      0.75     67338

Accuracy: 0.749


In [None]:
# Fit the final model on the complete training set, using 5 parallel jobs for
# the bagged base classifiers.
# The base estimator is passed positionally because its keyword was renamed
# from `base_estimator` to `estimator` (scikit-learn 1.2; old name removed in
# 1.4); the positional form works on both sides of that change.
# random_state makes the subsampling, and so the saved model, repeatable.
clf = BaggingClassifier(baseClf, n_estimators=n_estimators,
                        max_samples=1.0/n_estimators, bootstrap=False,
                        n_jobs=5, random_state=0)

estimators = [('projection', doc2Top), ('normalization', norm_transformer), ('clf', clf)]
semClf = Pipeline(estimators)

semClf = semClf.fit(train_text,train_lbls)

In [None]:
# save fitted classifier
#
# semClf is the fitted Pipeline with three steps: 'projection' (the
# docTopTransformer2 instance), 'normalization' (Normalizer) and 'clf' (a
# BaggingClassifier over LinearSVC base estimators). Calling predict on it
# runs tokenized input through those steps in order.
#
# NOTE(review): an earlier version of this comment described two SVMs with
# linear/polynomial/RBF kernels and hyperparameters tuned for f1; nothing in
# this notebook performs such a search, so that description was dropped.
# NOTE: only unpickle this file from a trusted source -- pickle.load can
# execute arbitrary code.
import pickle
with open('linearSVM.clf',mode='wb') as f:
    pickle.dump(semClf,f)