This notebook estimates the performance of a bagged decision-tree classifier in a document-topic model's feature space

In [88]:
%load_ext autoreload
%autoreload 2
import time
start_time = time.time()

import json

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

import numpy as np

import sys
sys.path.append("../libraries/")
from selector import split_data
from semanticClassifiers import docTopTransformer, doc2Bow

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [89]:
# Load the training reviews through an 80/20 split. The second partition
# (val_dat) is not used anywhere in this notebook, so it is dropped right away.
json_dat, val_dat = split_data('../data/Sports_and_Outdoors_Reviews_training.json', 80)
del val_dat
# keep only the first 100000 reviews -- use this for prototyping on smaller subsets
json_dat = json_dat[:100000]

In [90]:
# Build three parallel lists with one entry per review that has summary text:
# doc_list (the summary), rating (the 'overall' field) and prod_id (the 'asin' field).
doc_list, rating, prod_id = [], [], []
for review in json_dat:
    summary = review.get('summary')
    if not summary:
        # not all reviews have summary text; skip them so the lists stay aligned
        continue
    rating.append(review.get('overall'))
    doc_list.append(summary)
    prod_id.append(review.get('asin'))

In [91]:
def preprocess_data(doc_set):
    """
    Input  : document list (iterable of raw text strings)
    Purpose: preprocess text (lowercase, tokenize on word characters,
             remove English stop words, and Porter-stem what is left)
    Output : list with one list of stemmed tokens per input document
    """
    # the tokenizer, stop word set and stemmer are built once and shared by all documents
    word_tokenizer = RegexpTokenizer(r'\w+')
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    def clean(document):
        # tokenize the lowercased text, then filter stop words before stemming
        words = word_tokenizer.tokenize(document.lower())
        return [stemmer.stem(word) for word in words if word not in stop_words]

    return [clean(document) for document in doc_set]

In [92]:
from multiprocessing import Pool

# Split the corpus into one contiguous slice per worker. Mapping over
# [doc_list] (a single-element list) hands the whole corpus to one worker as a
# single task and leaves every other process in the pool idle.
n_workers = 16
chunk_len = max(1, -(-len(doc_list) // n_workers))  # ceiling division, never 0
doc_chunks = [doc_list[start:start + chunk_len]
              for start in range(0, len(doc_list), chunk_len)]

pool = Pool(n_workers)
try:
    # pool.map returns results in input order, so x[k] holds the cleaned
    # documents of doc_chunks[k]
    x = pool.map(preprocess_data, doc_chunks)
finally:
    # always shut the workers down, even if preprocessing raised
    pool.close()
    pool.join()

# flatten the per-chunk results back into one list that stays aligned,
# element for element, with doc_list
clean_text = [tokens for chunk in x for tokens in chunk]

In [93]:
# training inputs are the cleaned token lists; training labels are the star
# ratings binarized into a boolean array (True when the rating is >= 4.5)
train_text = clean_text
train_lbls = np.asarray(rating) >= 4.5

In [94]:
# estimate classifier accuracy
from sklearn.model_selection import cross_val_predict, GroupKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import GridSearchCV

# similar to sklearn's but allows us to do some balancing operations that are
# important for decision trees
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler

# Fit the bag-of-words generator on the cleaned text and keep its dictionary;
# the topic-space transformer below is constructed with it.
# NOTE(review): the dictionary is fit on all of train_text, i.e. before the CV
# split below -- confirm that sharing the vocabulary across folds is acceptable.
bowGen = doc2Bow()
bowGen = bowGen.fit(train_text)
my_dict = bowGen.this_dict

# GroupKFold gives you a KFold partitioner that abides by
# product labels so that products are only ever in a single
# fold
gkf = GroupKFold(n_splits=10)
cv = gkf.split(train_text, train_lbls, groups=prod_id)

time0 = time.time()

# hyperparameter grid searched separately inside every bagged estimator
params = {
    'baseClf__max_depth': [5, 7, 10, 15, 20],
    'baseClf__min_samples_leaf': [1],
    'baseClf__max_leaf_nodes': [20],
}

# base model: projection into topic space followed by a decision tree
bow2Top = docTopTransformer(this_dict=my_dict)
baseClf = DecisionTreeClassifier()

estimators = [('projection', bow2Top), ('baseClf', baseClf)]
semClf = Pipeline(estimators)

# NOTE(review): this inner 5-fold search is not group-aware, unlike the outer
# GroupKFold -- reviews of one product can land on both sides of an inner split.
grid_DT = GridSearchCV(estimator=semClf, param_grid=params, scoring='f1_macro',
                       cv=5, verbose=1, n_jobs=1)

# Bag n_estimators independent grid searches. Each one is fit on a
# 1/n_estimators subsample drawn without replacement (bootstrap=False), which
# keeps every individual topic-model + tree fit small.
# random_state pins the subsampling and the seeds that BaggingClassifier hands
# down to the nested estimators, so repeated runs give the same predictions.
# NOTE: `base_estimator` was renamed to `estimator` in scikit-learn 1.2; the
# old name is kept for the version this notebook was run with.
n_estimators = 10
clf = BaggingClassifier(base_estimator=grid_DT,
                        bootstrap=False, max_samples=1.0/n_estimators, n_estimators=n_estimators,
                        n_jobs=1, random_state=0)

# NOTE(review): undersamp is created but never added to either pipeline, so no
# class balancing actually happens. Wire it in as a pipeline step if balancing
# is intended.
undersamp = RandomUnderSampler(replacement=False)

# outer pipeline: token lists -> bag-of-words -> bagged (topic projection + tree)
estimators = [('projection', bowGen), ('clf', clf)]
semBagClf = Pipeline(estimators)

# cross validate over the pipeline using group k-fold CV
pred_lbls = cross_val_predict(semBagClf, train_text, train_lbls, cv=cv, n_jobs=10)

time1 = time.time()
print(time1-time0)

2461.181562423706


In [95]:
from sklearn.metrics import classification_report

# per-class precision / recall / F1 of the out-of-fold predictions
report = classification_report(train_lbls, pred_lbls)
print('True recall is sensitivity, false recall is specificity')
print(report)

# Accuracy pooled over every out-of-fold prediction. This is not exactly the
# same as the average of each CV fold's accuracy, but it's close and much
# faster to compute.
mismatch_rate = np.mean(pred_lbls != train_lbls)
acc = 1-mismatch_rate
print("Accuracy: %0.3f" % acc)

True recall is sensitivity, false recall is specificity
              precision    recall  f1-score   support

       False       0.86      0.35      0.50     33325
        True       0.75      0.97      0.85     66649

    accuracy                           0.77     99974
   macro avg       0.81      0.66      0.67     99974
weighted avg       0.79      0.77      0.73     99974

Accuracy: 0.766


In [96]:
# Rebuild the bagged ensemble for the final fit on all training data, this
# time fitting the n_estimators grid searches in parallel (n_jobs=10).
# Each bagged estimator is fit on a 1/n_estimators subsample drawn without
# replacement (bootstrap=False).
# random_state pins the subsampling and the seeds that BaggingClassifier hands
# down to the nested estimators, so the saved model can be reproduced.
# NOTE: `base_estimator` was renamed to `estimator` in scikit-learn 1.2; the
# old name is kept for the version this notebook was run with.
clf = BaggingClassifier(base_estimator=grid_DT,
                        bootstrap=False, max_samples=1.0/n_estimators, n_estimators=n_estimators,
                        n_jobs=10, random_state=0)

# NOTE(review): undersamp is created but never added to the pipeline, so no
# class balancing actually happens. Wire it in as a pipeline step if balancing
# is intended.
undersamp = RandomUnderSampler(replacement=False)

# pipeline: token lists -> bag-of-words -> bagged (topic projection + tree)
estimators = [('projection', bowGen), ('clf', clf)]
semBagClf = Pipeline(estimators)

semBagClf = semBagClf.fit(train_text, train_lbls)

In [97]:
# step 1 of the pipeline is the bagging ensemble; iterating over it visits each
# fitted grid search, whose winning pipeline is printed
bagging_step = semBagClf[1]
for fitted_search in bagging_step:
    print(fitted_search.best_estimator_)

Pipeline(steps=[('projection',
                 docTopTransformer(this_dict=<gensim.corpora.dictionary.Dictionary object at 0x2ab147fbfbb0>)),
                ('baseClf',
                 DecisionTreeClassifier(max_depth=15, max_leaf_nodes=20,
                                        random_state=876777618))])
Pipeline(steps=[('projection',
                 docTopTransformer(this_dict=<gensim.corpora.dictionary.Dictionary object at 0x2ab173f6a8b0>)),
                ('baseClf',
                 DecisionTreeClassifier(max_depth=10, max_leaf_nodes=20,
                                        random_state=1033613485))])
Pipeline(steps=[('projection',
                 docTopTransformer(this_dict=<gensim.corpora.dictionary.Dictionary object at 0x2ab173f6aaf0>)),
                ('baseClf',
                 DecisionTreeClassifier(max_depth=15, max_leaf_nodes=20,
                                        random_state=1940349266))])
Pipeline(steps=[('projection',
                 docTopTransformer

In [98]:
# Persist the fitted pipeline to disk as a pickle.
# Only unpickle this file from a trusted source: pickle.load can execute arbitrary code.
import pickle
with open('ensembleDT.clf', 'wb') as out_file:
    pickle.dump(semBagClf, out_file)