This script estimates the performance of linear SVM classification in a document-topic model's feature space

In [13]:
import time
start_time = time.time()

import json

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

import numpy as np

import sys
sys.path.append("../libraries/")
from selector import split_data
from semanticClassifiers import docTopTransformer

In [14]:
# Read in testing data for 80/20 split
# we won't use val_dat at all
json_dat, val_dat = split_data('../data/Sports_and_Outdoors_Reviews_training.json', 80)
json_dat = json_dat[0:20000] # use this for prototyping on smaller subsets

In [15]:
# create a list, doc_list, with one review per elem
doc_list = []
rating = []
prod_id = []
for i in range(0,len(json_dat)):
    #if json_dat[i].get('reviewText'): #not all reviews have text
    if json_dat[i].get('summary'): #not all reviews have summary text
        rating.append(json_dat[i].get('overall'))
        doc_list.append(json_dat[i].get('summary'))
        #doc_list.append(json_dat[i].get('reviewText'))
        prod_id.append(json_dat[i].get('asin'))

In [16]:
def preprocess_data(doc_set):
    """
    Input  : docuemnt list
    Purpose: preprocess text (tokenize, removing stopwords, and stemming)
    Output : preprocessed text
    """
    # initialize regex tokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    # create English stop words list
    en_stop = set(stopwords.words('english'))
    # Create p_stemmer of class PorterStemmer
    p_stemmer = PorterStemmer()
    # list for tokenized documents in loop
    texts = []
    # loop through document list
    for i in doc_set:
        # clean and tokenize document string
        raw = i.lower()
        tokens = tokenizer.tokenize(raw)
        # remove stop words from tokens
        stopped_tokens = [i for i in tokens if not i in en_stop]
        # stem tokens
        stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
        # add tokens to list
        texts.append(stemmed_tokens)
    return texts

In [17]:
clean_text=preprocess_data(doc_list)

In [18]:
from gensim.corpora import Dictionary
import gensim.matutils as matutils

# vectorize training data
train_lbls = np.array(rating) >= 4.5
train_text = clean_text

# train a document-topic model        
this_dict = Dictionary(train_text)

# transform corpus (train) into a 2d array word counts (a 'bag of words')
bow_train = [this_dict.doc2bow(text) for text in train_text]
bow_train = np.transpose(matutils.corpus2dense(bow_train, len(this_dict)))

In [19]:
# estimate classifier accuracy
from sklearn.model_selection import cross_val_predict, GroupKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# GroupKFold gives you a KFold partitioner that abides by
# product labels so that products are only ever in a single
# fold
gkf = GroupKFold(n_splits=10)
cv = gkf.split(bow_train, train_lbls, groups=prod_id)

time0 = time.time()

# initialize a transformer mapping from bow to latent semantic features
bow2Top = docTopTransformer(this_dict=this_dict, d=300, distributed=False)

params = {
    'clf__base_estimator__max_depth': [5, 7, 10, 15],
    'clf__n_estimators': [10],
    'clf__max_samples': [0.1]
}

# pick a classifier
baseClf = DecisionTreeClassifier()

clf = BaggingClassifier(base_estimator=baseClf, bootstrap=False, n_jobs = 4)

# create a pipeline that transforms data to semantic space, 
# and then classifies them using clf
estimators = [('projection', bow2Top), ('clf', clf)]
semClf = Pipeline(estimators)

grid_DT = GridSearchCV(estimator = semClf, param_grid = params, scoring = ['f1', 'accuracy', 'precision', 'recall'], 
                       refit = 'f1', cv = cv, verbose = 40, n_jobs = 4)

grid_DT.fit(bow_train, train_lbls)

# cross validate over the pipeline using group k-fold CV
#pred_lbls = cross_val_predict(semClf, bow_train, train_lbls, cv=cv)

time1 = time.time()
print(time1-time0)

Fitting 10 folds for each of 4 candidates, totalling 40 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:   15.9s
[Parallel(n_jobs=4)]: Done   2 tasks      | elapsed:   16.1s
[Parallel(n_jobs=4)]: Done   3 tasks      | elapsed:   16.1s
[Parallel(n_jobs=4)]: Done   4 tasks      | elapsed:   16.5s
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:   29.2s
[Parallel(n_jobs=4)]: Done   6 tasks      | elapsed:   29.7s
[Parallel(n_jobs=4)]: Done   7 tasks      | elapsed:   30.1s
[Parallel(n_jobs=4)]: Done   8 tasks      | elapsed:   30.6s
[Parallel(n_jobs=4)]: Done   9 tasks      | elapsed:   43.8s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:   44.3s
[Parallel(n_jobs=4)]: Done  11 tasks      | elapsed:   44.8s
[Parallel(n_jobs=4)]: Done  12 tasks      | elapsed:   45.0s
[Parallel(n_jobs=4)]: Done  13 tasks      | elapsed:   57.5s
[Parallel(n_jobs=4)]: Done  14 tasks      | elapsed:   58.0s
[Parallel(n_jobs=4)]: Done  15 tasks      | elapsed:   58.4s
[Parallel(

159.02924251556396


In [20]:
from sklearn.metrics import classification_report

#print(grid_DT.cv_results_)
print(grid_DT.best_estimator_)
print("\n")
print("best score:")
print(grid_DT.best_score_)
print("\n")
print("best params:")
print(grid_DT.best_params_)
print("\n")
print("best index:")
print(grid_DT.best_index_)
print("\n")
print("scorer:")
print(grid_DT.scorer_)
print("\n")
print("refit time:")
print(grid_DT.refit_time_)


#report = classification_report(train_lbls, pred_lbls)
#print('True recall is sensitivity, false recall is specificity')
#print(report)

# this is not exactly the same as the average of each CV folds accuracy, 
# but it's close and much faster to compute
#acc = 1-np.mean(pred_lbls != train_lbls)
#print("Accuracy: %0.3f" % acc)

Pipeline(memory=None,
         steps=[('projection',
                 docTopTransformer(d=300, distributed=False,
                                   this_dict=<gensim.corpora.dictionary.Dictionary object at 0x0000024F4EFF4EC8>)),
                ('clf',
                 BaggingClassifier(base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                                         class_weight=None,
                                                                         criterion='gini',
                                                                         max_depth=5,
                                                                         max_features=None,
                                                                         max_leaf_nodes=None,
                                                                         min_impurity_decrease=0.0,
                                                                         min_impurity_split=None,
     

In [21]:
#Prediction cell
import json
import math

#read in the file
path = '../data/Sports_and_Outdoors_Reviews_test.json'
with open(path, 'r') as fp:
    all_reviews = [json.loads(x) for x in fp.readlines()]

# create a list, doc_list, with one review per elem
test_doc_list = []
test_prod_id = []
test_reviewer_id = []
test_unixreviewtime = []
for i in range(0,len(all_reviews)):
    #if json_dat[i].get('reviewText'): #not all reviews have text
    if all_reviews[i].get('summary'): #not all reviews have summary text
        test_doc_list.append(all_reviews[i].get('summary'))
        #test_doc_list.append(all_reviews[i].get('reviewText'))
        test_prod_id.append(all_reviews[i].get('asin'))
        test_reviewer_id.append(all_reviews[i].get('reviewerID'))
        test_unixreviewtime.append(all_reviews[i].get('unixReviewTime'))

clean_text_test=preprocess_data(test_doc_list)

# Create prediction array
prediction = grid_DT.predict(clean_text_test)

# Create output file
dat = np.column_stack((test_prod_id, test_reviewer_id, prediction.astype(int), test_unixreviewtime))
np.savetxt("Sports_and_Outdoors_Ratings_test.csv", dat, delimiter=",", fmt=['%s','%s'], 
           header='asin,reviewerID,awesomeReview,unixReviewTime')

AttributeError: 'list' object has no attribute 'flat'

In [None]:
end_time = time.time()
print(end_time-start_time)