This script demonstrates how to use cross-validation to estimate the performance of a Naive Bayes classifier built on a document-topic model of review content. Important points to note:

* cross validation groups by product id to ensure independence assumptions across CV folds are satisfied
* document topic model construction is incorporated into CV model training
* the REAL validation data is never actually used here. After separating the training and validation data, I work only with the training data. For the sake of this script I make a second subdivision of the training data to obtain a stand-in validation set, because this is not the final product and I don't want to touch the real validation data until the final product is ready

In [1]:
import json

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
#from nltk import bigrams

import numpy as np
#import itertools

#import matplotlib.pyplot as plt

import sys
sys.path.append("../libraries/")
from selector import split_data
from semanticClassifiers import semantic_NB

In [2]:
# Read in the training data file and make an 80/20 split.
# val_dat is returned by split_data() but we won't use it at all in this script.
json_dat, val_dat = split_data('../data/Sports_and_Outdoors_Reviews_training.json', 80)

In [3]:
# Build three parallel lists with one entry per review that has text:
#   doc_list - the review text
#   rating   - the 'overall' rating of the review
#   prod_id  - the 'asin' product identifier of the review
doc_list = []
rating = []
prod_id = []
for review in json_dat:
    text = review.get('reviewText')
    if text:  # not all reviews have text; skip missing or empty ones
        rating.append(review.get('overall'))
        doc_list.append(text)
        prod_id.append(review.get('asin'))

In [4]:
def preprocess_data(doc_set):
    """
    Input  : document list (an iterable of strings)
    Purpose: preprocess text (lower-casing, tokenizing, removing English
             stop words, and Porter stemming)
    Output : list with one list of stemmed tokens per input document
    """
    # tokenizer that keeps runs of word characters and drops everything else
    word_tokenizer = RegexpTokenizer(r'\w+')
    # English stop words, held in a set for fast membership tests
    stop_words = set(stopwords.words('english'))
    # Porter stemmer used to reduce each surviving token to its stem
    stemmer = PorterStemmer()

    processed_docs = []
    for document in doc_set:
        # clean and tokenize the document string
        words = word_tokenizer.tokenize(document.lower())
        # drop stop words, then stem what is left
        kept_words = [word for word in words if word not in stop_words]
        processed_docs.append([stemmer.stem(word) for word in kept_words])
    return processed_docs

In [5]:
# Tokenize, remove stop words from, and stem the first 100,000 reviews only.
clean_text = preprocess_data(doc_list[:100000])

In [6]:
# First let's make an 80/20 split, so we can estimate our classifier performance
# in that 80% and then compare it with the 20%. We don't want to use val_dat from
# split_data() here, because we don't want to start peeking at our REAL validation
# data at this point.
from sklearn.model_selection import GroupKFold

# Only the reviews that were actually preprocessed take part in the split.
# Deriving the count from clean_text (rather than repeating a literal) keeps the
# texts, ratings and product ids the same length as each other.
n_docs = len(clean_text)

# Grouping by product id keeps all reviews of a product on the same side of the
# split; taking the first of the 5 folds gives the 80/20 partition.
gkf = GroupKFold(n_splits=5)
groupIterator = gkf.split(clean_text, rating[:n_docs], groups=prod_id[:n_docs])
train, val = next(groupIterator)

We will use the custom semantic_NB class here. It lives in the libraries folder; take a moment to look at it before reading the code any further.

In [7]:
# estimate classifier accuracy
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GroupKFold

# train holds indices only, so extract the corresponding data elements
train_text = [clean_text[idx] for idx in train]
# binary label: True when the review's rating is at least 4.5
train_lbls = [rating[idx] >= 4.5 for idx in train]
train_prod_id = [prod_id[idx] for idx in train]

# GroupKFold gives you a KFold partitioner that abides by
# product labels so that products are only ever in a single
# fold
gkf = GroupKFold(n_splits=5)

clf = semantic_NB()
# Hand the splitter itself (plus the groups) to cross_val_score instead of a
# one-shot generator from gkf.split(); the folds are the same.
# There will be k estimates here, one for each partitioning of the data.
scores = cross_val_score(clf, train_text, train_lbls,
                         groups=train_prod_id, cv=gkf)

In [8]:
# Measure classifier accuracy on the held-out validation fold.

# val holds indices only, so gather the matching texts and binary labels
# (True when the review's rating is at least 4.5)
val_text = [clean_text[idx] for idx in val]
val_lbls = [rating[idx] >= 4.5 for idx in val]

# fit a fresh classifier on the training portion, then label the validation texts
predictor = semantic_NB()
predictor.fit(train_text, train_lbls)
pred_lbl = predictor.predict(val_text)

In [9]:
# Mean of the per-fold cross-validation scores.
print("Estimated classifier accuracy based on CV estimator: %0.5f" % np.mean(scores))

# Convert both label sequences to arrays so the comparison is always
# elementwise, whatever sequence type predict() returned; accuracy is then
# the fraction of matching labels.
acc = np.mean(np.asarray(val_lbls) == np.asarray(pred_lbl))
print("Classifier accuracy in validation data: %0.5f" % acc)

Estimated classifier accuracy based on CV estimator: 0.64575
Classifier accuracy in validation data: 0.64535
