This notebook estimates the performance of a bagged decision-tree classifier, tuned by grid search with group k-fold cross-validation, in a document-topic model's feature space

In [62]:
import time
start_time = time.time()

import json

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

import numpy as np

import sys
sys.path.append("../libraries/")
from selector import split_data
from semanticClassifiers import docTopTransformer

In [63]:
# Load the training reviews and split them 80/20; only the first part is used
# in this notebook (val_dat is intentionally left untouched).
# NOTE(review): assumes split_data(path, 80) returns (80% part, 20% part) -- confirm in selector
json_dat, val_dat = split_data('../data/Sports_and_Outdoors_Reviews_training.json', 80)
# keep at most the first 50,000 reviews while prototyping on a smaller subset
json_dat = json_dat[:50000]

In [64]:
# Build three parallel lists (review text, star rating, product id),
# keeping only the reviews that actually contain text.
# (Swap 'reviewText' for 'summary' to work on the review summaries instead.)
doc_list = []
rating = []
prod_id = []
for review in json_dat:
    review_text = review.get('reviewText')
    if not review_text:
        # not all reviews have text
        continue
    rating.append(review.get('overall'))
    doc_list.append(review_text)
    prod_id.append(review.get('asin'))

In [65]:
def preprocess_data(doc_set):
    """
    Input  : document list (iterable of strings)
    Purpose: preprocess text (lower-case, tokenize on \\w+, drop English
             stop words, Porter-stem the remaining tokens)
    Output : list with one list of stemmed tokens per input document,
             in the same order as the input
    """
    word_tokenizer = RegexpTokenizer(r'\w+')
    # a set gives constant-time membership checks while filtering
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    def stem_document(document):
        # lower-case first so stop-word matching is case-insensitive
        words = word_tokenizer.tokenize(document.lower())
        return [stemmer.stem(word) for word in words if word not in stop_words]

    return [stem_document(document) for document in doc_set]

In [66]:
# tokenize, stop-word filter and stem every review text
clean_text = preprocess_data(doc_list)

In [67]:
from gensim.corpora import Dictionary
import gensim.matutils as matutils

# binary target: True for reviews rated 4.5 stars or higher
train_lbls = np.array(rating) >= 4.5
train_text = clean_text

# build the token <-> integer-id vocabulary from the cleaned training text
this_dict = Dictionary(train_text)

# bag of words: per-document sparse (token_id, count) pairs ...
bow_sparse = [this_dict.doc2bow(tokens) for tokens in train_text]
# ... densified into a 2d count array and transposed
# (corpus2dense puts documents in columns, so after .T each row is one document)
bow_train = matutils.corpus2dense(bow_sparse, len(this_dict)).T

In [85]:
# estimate classifier accuracy
from sklearn.model_selection import cross_val_predict, GroupKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# GroupKFold gives you a KFold partitioner that abides by
# product labels so that products are only ever in a single
# fold
gkf = GroupKFold(n_splits=5)
# gkf.split() returns a one-shot generator; materialize the (train, test)
# index pairs so the same folds can be handed to more than one consumer
# (e.g. GridSearchCV below and the optional cross_val_predict call).
cv = list(gkf.split(bow_train, train_lbls, groups=prod_id))

# Fixed seed: the bagging ensemble draws a random 10% subset of samples per
# tree, so without a seed every run of this cell scores differently.
RANDOM_SEED = 0

time0 = time.time()

# initialize a transformer mapping from bow to latent semantic features
# NOTE(review): d=300 is presumably the number of latent dimensions -- confirm in semanticClassifiers
bow2Top = docTopTransformer(this_dict=this_dict, d=300, distributed=False)

# Hyper-parameter grid: only the tree depth is varied; every other setting
# is pinned to a single value.
params = {
    'clf__base_estimator__max_depth': [3, 5, 7, 10, 15],
    'clf__base_estimator__min_samples_leaf': [1],
    'clf__base_estimator__max_leaf_nodes': [20],
    'clf__n_estimators': [10],
    'clf__max_samples': [0.1]
}

# pick a classifier
baseClf = DecisionTreeClassifier(random_state=RANDOM_SEED)

# NOTE: scikit-learn >= 1.2 renames BaggingClassifier's `base_estimator`
# argument to `estimator` (the old name is removed in 1.4). On those versions
# both this keyword and the `clf__base_estimator__*` grid keys above must be
# renamed together.
clf = BaggingClassifier(base_estimator=baseClf, bootstrap=False, n_jobs=4,
                        random_state=RANDOM_SEED)

# create a pipeline that transforms data to semantic space,
# and then classifies them using clf
estimators = [('projection', bow2Top), ('clf', clf)]
semClf = Pipeline(estimators)

# score each candidate on four metrics; the final refit on all data uses the
# parameters with the best mean F1
grid_DT = GridSearchCV(estimator=semClf, param_grid=params,
                       scoring=['f1', 'accuracy', 'precision', 'recall'],
                       refit='f1', cv=cv, verbose=1, n_jobs=4)

grid_DT.fit(bow_train, train_lbls)

# cross validate over the pipeline using group k-fold CV
#pred_lbls = cross_val_predict(semClf, bow_train, train_lbls, cv=cv)

time1 = time.time()
# wall-clock seconds spent on the grid search
print(time1-time0)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  25 out of  25 | elapsed: 26.1min finished


1659.9721484184265


In [93]:
from sklearn.metrics import classification_report

# Raw per-candidate cross-validation results, followed by the refitted pipeline
print(grid_DT.cv_results_)
print(grid_DT.best_estimator_)

# Headline attributes of the search, each printed as a label line followed by its value
summary_fields = [
    ("best score:", "best_score_"),
    ("best params:", "best_params_"),
    ("best index:", "best_index_"),
    ("scorer:", "scorer_"),
    ("refit time:", "refit_time_"),
]
for label, attr_name in summary_fields:
    print(label)
    print(getattr(grid_DT, attr_name))


#report = classification_report(train_lbls, pred_lbls)
#print('True recall is sensitivity, false recall is specificity')
#print(report)

# this is not exactly the same as the average of each CV folds accuracy, 
# but it's close and much faster to compute
#acc = 1-np.mean(pred_lbls != train_lbls)
#print("Accuracy: %0.3f" % acc)

{'mean_fit_time': array([242.39180713, 233.35279417, 242.97810702, 229.0049798 ,
       218.64869013]), 'std_fit_time': array([ 6.59821453,  4.88109719,  8.38355339, 11.24679564, 66.04149978]), 'mean_score_time': array([5.90632501, 4.93922744, 4.61299577, 5.00871372, 4.72566414]), 'std_score_time': array([2.13824477, 1.22147921, 1.09948707, 1.51876656, 1.91014334]), 'param_clf__base_estimator__max_depth': masked_array(data=[3, 5, 7, 10, 15],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_clf__base_estimator__max_leaf_nodes': masked_array(data=[20, 20, 20, 20, 20],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_clf__base_estimator__min_samples_leaf': masked_array(data=[1, 1, 1, 1, 1],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_clf__max_samples': masked_array(data=[0.1, 0.1, 0.1, 0.1, 0.1],
 

In [94]:
# Wall-clock seconds since start_time was recorded in the first cell.
# This spans everything between the two cell executions, including any time
# the kernel sat idle, so it is an upper bound on actual compute time.
end_time = time.time()
print(end_time - start_time)

16680.05286502838
