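This notebook estimates the cross-validated performance of a classifier in a document-topic model's feature space. (The description originally named naive Bayes classification; the pipeline below actually trains a bagged linear SVM.)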

In [2]:
%load_ext autoreload
%autoreload 2
import time
start_time = time.time()

import json

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

import numpy as np

import sys
sys.path.append("../libraries/")
from selector import split_data
from semanticClassifiers import docTopTransformer2

In [3]:
# Read in the reviews from the *training* file and split them (80/20 per the
# original note). split_data lives in ../libraries/selector; presumably the
# `80` is the percentage kept in the first return value -- TODO confirm.
# we won't use val_dat at all
json_dat, val_dat = split_data('../data/Sports_and_Outdoors_Reviews_training.json', 80)

In [4]:
# Build four index-aligned lists (review text, summary, rating, product id),
# keeping only the records that carry both a review body and a summary --
# not every review has text.
review = []
summary = []
rating = []
prod_id = []
for record in json_dat:
    body = record.get('reviewText')
    headline = record.get('summary')
    if not (body and headline):
        continue
    rating.append(record.get('overall'))
    summary.append(headline)
    review.append(body)
    prod_id.append(record.get('asin'))

In [9]:
# Module-level preprocessing tools, looked up as globals by preprocess_data.
# Porter stemmer used to reduce tokens to their stems
p_stemmer = PorterStemmer()
# English stop words, held in a set for fast membership tests
en_stop = set(stopwords.words('english'))
# tokenizer that keeps runs of word characters and drops everything else
tokenizer = RegexpTokenizer(r'\w+')
    
def preprocess_data(doc_set):
    """Turn raw document strings into lists of stemmed, stop-word-free tokens.

    Relies on three globals being in scope wherever this runs: `tokenizer`
    (provides .tokenize), `en_stop` (container of stop words) and `p_stemmer`
    (provides .stem).

    Parameters
    ----------
    doc_set : iterable of str
        One document per element.

    Returns
    -------
    list of list
        For each document, in order, the stems of its lower-cased tokens with
        stop words removed.
    """
    return [
        [p_stemmer.stem(word)
         for word in tokenizer.tokenize(document.lower())
         if word not in en_stop]
        for document in doc_set
    ]

In [10]:
# This cell preprocesses the summaries in parallel. Make sure to start an
# ipython cluster from the notebook dashboard's IPython Cluster
# tab before running.
import ipyparallel as ipp

rc = ipp.Client()
dview = rc[:]
# preprocess_data looks these three names up as globals, so every engine
# needs them in its namespace before the function runs there.
dview.push(dict(tokenizer=tokenizer, p_stemmer=p_stemmer,
                en_stop=set(stopwords.words('english'))), block=True)

t0 = time.time()
# Give each engine one contiguous slice of the summaries. Calling
# apply_sync(preprocess_data, summary) would instead make *every* engine
# process the *whole* list, so nothing would actually be parallelised.
n_engines = max(1, len(rc.ids))
chunk_size = max(1, -(-len(summary) // n_engines))  # ceiling division
summary_chunks = [summary[start:start + chunk_size]
                  for start in range(0, len(summary), chunk_size)]
chunk_results = dview.map_sync(preprocess_data, summary_chunks) if summary_chunks else []
# map_sync returns the per-chunk results in input order; flatten them back
# into one token list per summary, aligned with `rating` and `prod_id`.
summary_text = [tokens for chunk in chunk_results for tokens in chunk]
t1 = time.time()


# binary target: True for reviews rated 4.5 or higher
awesome = np.array(rating) >= 4.5

print(t1-t0)

206.26088070869446


In [14]:
# Drop the references to the large raw inputs and ask the garbage collector
# to reclaim them; kernel memory otherwise persists across cells.
import gc

del summary, review, json_dat, val_dat

gc.collect()

27368980

In [211]:
from sklearn.base import TransformerMixin, BaseEstimator

from gensim.models import LsiModel as lsi
from gensim.corpora import Dictionary
from gensim import matutils

class center(TransformerMixin, BaseEstimator):
    """Shift every group's vectors so that the group mean lands on the grand mean.

    Input to fit/transform is a sequence of (vector, group_label) pairs.
    After fitting, `lbl_mean` maps each group label seen in training to the
    mean of that group's vectors and `grand_mean` is the mean of those group
    means.
    """

    def _breakTuple(self, X):
        """Split (vector, label) pairs into an object array of vectors and an array of labels."""
        vectors = [vec for vec, _ in X]
        labels = [lbl for _, lbl in X]
        return np.array(vectors, dtype=object), np.array(labels)

    # X must be a sequence of tuples with the second element being a group identifier
    def fit(self, X, y=None):
        """Learn one mean vector per group label plus the mean of those means."""
        vectors, labels = self._breakTuple(X)

        unique_labels = np.unique(labels)
        group_means = [
            np.mean(vectors[np.where(labels == label)], axis=0)
            for label in unique_labels
        ]

        self.lbl_mean = dict(zip(unique_labels, group_means))
        self.grand_mean = np.mean(group_means, axis=0)

        return self

    # Vectors whose label was seen during fit are recentred; vectors that
    # belong to labels unseen during fit are passed through untouched.
    def transform(self, X, y=None):
        """Return a list with one (possibly recentred) vector per input pair."""
        vectors, labels = self._breakTuple(X)

        recentred = []
        for vector, label in zip(vectors, labels):
            if label in self.lbl_mean:
                recentred.append(vector - self.lbl_mean[label] + self.grand_mean)
            else:
                recentred.append(vector)

        return recentred
    

# similar to docTopTransformer except it takes a corpus as input and trains dictionaries and computes BOWs internally
class docTopTransformer3(TransformerMixin, BaseEstimator):
    """Project tokenized documents into an LSI document-topic space.

    Input to fit/transform is a sequence of (tokens, label) pairs. The label
    is ignored while fitting and is re-attached, unchanged, to each projected
    vector by transform so that later pipeline steps can still see it.

    Parameters
    ----------
    d : int, default 300
        Number of topics requested from the LSI model.
    distributed : bool, default False
        Forwarded to gensim's LsiModel.
    """

    def __init__(self, d=300, distributed=False):
        self.this_dict = []
        self.d = d
        self.distributed = distributed

    def _breakTuple(self, X):
        """Split (tokens, label) pairs into an object array of token lists and an array of labels."""
        X_, X__ = [[x1 for x1,x2 in X], [x2 for x1,x2 in X]]
        return np.array(X_, dtype=object), np.array(X__)

    def _getBOW(self,X):
        """Convert each token list to a bag-of-words using the fitted dictionary."""
        bow = [self.this_dict.doc2bow(text) for text in X]

        return bow

    # takes corpus as input
    def fit(self, X, y=None):
        """Build the dictionary from the training documents and fit the LSI model on their bags of words."""
        X, _ = self._breakTuple(X)

        # train a document-topic model
        self.this_dict = Dictionary(X)

        bow = self._getBOW(X)

        # construct a semantic model based on document-topic similarity (15-20 min for 1500k reviews?)
        self.semSpace = lsi(bow, id2word=self.this_dict, num_topics=self.d,
                            chunksize=100000, distributed=self.distributed)

        return self

    def transform(self, X, y=None):
        """Return a list of (topic_vector, label) pairs, one per input document."""
        X, lbl = self._breakTuple(X)

        bow = self._getBOW(X)

        # Apply the semantic model to the bags of words (fast)
        feat = self.semSpace[bow]

        # convert from TransformedCorpus datatype to numpy doc x topic array (medium speed, needs more benchmarking)
        # num_terms pins the number of topic columns; without it corpus2csc
        # infers the width from the highest topic id present in this batch,
        # so train and test matrices could end up with different widths.
        topics_csc = matutils.corpus2csc(feat, num_terms=self.semSpace.num_topics)
        X_ = topics_csc.T.toarray()

        X_ = list(zip(X_,lbl))
        return X_

In [120]:
from gensim.corpora import Dictionary
import gensim.matutils as matutils
# import from the public scipy.sparse namespace; the scipy.sparse.csc
# submodule path is deprecated
from scipy.sparse import csc_matrix

# work on the first n_train reviews only; the three slices stay index-aligned
n_train = 10000
train_text = summary_text[0:n_train]
train_lbls = awesome[0:n_train]
this_prod_id = prod_id[0:n_train]

In [221]:
# estimate classifier accuracy
from sklearn.model_selection import cross_val_predict, GroupKFold
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier
from sklearn.pipeline import Pipeline

# GroupKFold gives you a KFold partitioner that abides by
# product labels so that products are only ever in a single
# fold
gkf = GroupKFold(n_splits=10)
cv = gkf.split(train_text, train_lbls, groups=this_prod_id)

time0 = time.time()

# initialize a transformer mapping from tokenized text to latent semantic features
doc2Top3 = docTopTransformer3()
# pick a classifier
baseClf = LinearSVC()

n_estimators=10
# each of the n_estimators classifiers is fit on a 4/n_estimators fraction
# of the samples, drawn without replacement
clf = BaggingClassifier(base_estimator=baseClf, 
                        bootstrap=False, max_samples = 4/n_estimators, n_estimators=n_estimators,
                        n_jobs=1)

# create a pipeline that transforms data to semantic space, 
# and then classifies them by averaging over n_estimators of 
# type baseClf
#
# Note, you could bag over n semantic models too by creating
# a pipeline using doc2Top3 and baseClf, and then passing that
# in as the base_estimator argument of a BaggingClassifier
# instance. If you think bagging classification of reviews will
# lead to better classification performance for theoretical
# reasons, this would be the way to go; however, the purpose
# of bagging here is to bypass the slow SVM fitting
# procedure
estimators = [('projection', doc2Top3), ('center', center()), ('clf', clf)]
semClf = Pipeline(estimators)

# cross validate over the pipeline using group k-fold CV
# Pair each document with the product id from the same 10000-row subset used
# for the CV groups (this_prod_id). Zipping with the full prod_id list only
# lined up because zip stops at the shorter input.
pred_lbls = cross_val_predict(semClf, list(zip(train_text, this_prod_id)), train_lbls, cv=cv, n_jobs=5)
#pred_lbls = cross_val_predict(semClf, train_text, train_lbls, cv=cv, n_jobs=5)
time1 = time.time()

print(time1-time0)

25.662643671035767


In [222]:
# features learned on center product vectors
from sklearn.metrics import classification_report

# per-class precision / recall / f1 of the cross-validated predictions
report = classification_report(y_true=train_lbls, y_pred=pred_lbls)
print('True recall is sensitivity, false recall is specificity', report, sep='\n')

# Pooled accuracy over all held-out predictions. This is not exactly the
# same as the average of each CV fold's accuracy, but it's close and much
# faster to compute.
error_rate = np.mean(pred_lbls != train_lbls)
acc = 1 - error_rate
print("Accuracy: %0.3f" % acc)

True recall is sensitivity, false recall is specificity
              precision    recall  f1-score   support

       False       0.72      0.54      0.61      3406
        True       0.79      0.89      0.84      6594

    accuracy                           0.77     10000
   macro avg       0.75      0.71      0.72     10000
weighted avg       0.76      0.77      0.76     10000

Accuracy: 0.770


In [220]:
# features learned on raw data
from sklearn.metrics import classification_report

# text summary of precision, recall and f1 for each class
report = classification_report(train_lbls, pred_lbls)
for line in ('True recall is sensitivity, false recall is specificity', report):
    print(line)

# Accuracy pooled across every prediction: not exactly the same as the
# average of each CV fold's accuracy, but it's close and much faster to
# compute.
misclassified = pred_lbls != train_lbls
acc = 1 - np.mean(misclassified)
print("Accuracy: %0.3f" % acc)

True recall is sensitivity, false recall is specificity
              precision    recall  f1-score   support

       False       0.73      0.53      0.62      3406
        True       0.79      0.90      0.84      6594

    accuracy                           0.77     10000
   macro avg       0.76      0.72      0.73     10000
weighted avg       0.77      0.77      0.76     10000

Accuracy: 0.774


In [29]:
# Fit the final pipeline on the training subset.
# doc2Top was never created anywhere in this notebook (it only existed as
# leftover kernel state), so build it here; the saved pipeline repr shows a
# default-constructed docTopTransformer2.
doc2Top = docTopTransformer2()

# each of the n_estimators classifiers sees a 1/n_estimators fraction of the
# samples, drawn without replacement, and the fits run in 5 parallel jobs
clf = BaggingClassifier(base_estimator=baseClf, 
                        bootstrap=False, max_samples = 1.0/n_estimators, n_estimators=n_estimators,
                        n_jobs=5)

estimators = [('projection', doc2Top), ('clf', clf)]
semClf = Pipeline(estimators)

semClf = semClf.fit(train_text,train_lbls)

In [31]:
# Persist the fitted pipeline (semClf) to disk.
#
# The pickled object is the whole pipeline, so loading it gives back both
# the projection step and the bagged classifier in one piece. Only unpickle
# this file from a trusted source: pickle can execute arbitrary code on load.
import pickle
with open('linearSVM.clf', 'wb') as clf_file:
    pickle.dump(semClf, clf_file)

Pipeline(steps=[('projection', docTopTransformer2()),
                ('clf',
                 BaggingClassifier(base_estimator=LinearSVC(), bootstrap=False,
                                   max_samples=0.1, n_jobs=5))])