This notebook fits a bagged, grid-searched decision-tree classifier (not a linear SVM) in a document-topic model's feature space and writes its predictions for the test reviews to a CSV file

In [20]:
import time
start_time = time.time()

import json

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

import numpy as np

import sys
sys.path.append("../libraries/")
from selector import split_data
from semanticClassifiers import docTopTransformer, docTopTransformer2, doc2Bow

In [21]:
# Load the *training* reviews through the project's split_data helper.
# NOTE(review): the second argument (80) is presumably the percentage kept in
# the first partition, i.e. an 80/20 split -- confirm against selector.split_data.
json_dat, val_dat = split_data('../data/Sports_and_Outdoors_Reviews_training.json', 80)

# The held-out partition is not used anywhere in this notebook, so drop it
# straight away.
del val_dat

# Prototype on (at most) the first 32000 reviews of the training partition.
json_dat = json_dat[:32000]

In [22]:
# Build three parallel lists with one entry per review that has summary text:
#   doc_list -> the summary string
#   rating   -> the review's 'overall' value
#   prod_id  -> the review's 'asin' value
doc_list, rating, prod_id = [], [], []
for review in json_dat:
    summary = review.get('summary')
    if not summary:
        # missing or empty summary: the review is left out of all three lists
        continue
    rating.append(review.get('overall'))
    doc_list.append(summary)
    prod_id.append(review.get('asin'))

In [23]:
def preprocess_data(doc_set):
    """
    Normalize a collection of raw text documents into lists of stemmed tokens.

    Each document is lower-cased, split into tokens with the regex ``\\w+``,
    stripped of NLTK's English stop words, and finally reduced to stems with
    the Porter stemmer.

    Input  : iterable of document strings
    Output : list with one list of stemmed tokens per input document,
             in the same order as the input
    """
    word_tokenizer = RegexpTokenizer(r'\w+')
    # a set gives constant-time stop-word membership tests
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    def _clean(document):
        # tokenize the lower-cased text, drop stop words, then stem what is left
        words = word_tokenizer.tokenize(document.lower())
        kept_words = [word for word in words if word not in stop_words]
        return [stemmer.stem(word) for word in kept_words]

    return [_clean(document) for document in doc_set]

In [24]:
from multiprocessing import Pool

# Disabled experiment: preprocessing through a 4-worker process pool. Pool is
# only referenced by these commented-out lines.
# NOTE(review): as written, the map call invokes preprocess_data() with no
# arguments instead of passing the function object, so it would raise a
# TypeError if re-enabled; it also submits the whole doc_list as a single task.
#pool = Pool(4)
#x = pool.map(preprocess_data(), [doc_list])
#pool.close()
#pool.join()

#clean_text = x[0]
# Active path: tokenize, stop-word filter and stem every summary in-process.
clean_text = preprocess_data(doc_list)

In [25]:
# Model input: the preprocessed (stemmed) token lists, one per review
train_text = clean_text
# Binary target: True where the review's 'overall' rating is at least 4.5
train_lbls = np.greater_equal(np.array(rating), 4.5)

In [39]:
# Fit the final classifier: bag-of-words -> bagging ensemble of grid-searched
# (topic projection -> decision tree) pipelines.
from sklearn.model_selection import cross_val_predict, GroupKFold, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

# similar to sklearn's but allows us to do some balancing operations that are
# important for decision trees
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler

# Fixed seed for the bagging ensemble so that the bootstrap draws are the same
# on every Restart & Run All.
RANDOM_SEED = 0

# Fit the doc2Bow transformer on the training tokens and keep the object it
# exposes as `this_dict` for the topic transformer.
# NOTE(review): presumably a token dictionary -- confirm in semanticClassifiers.
bowGen = doc2Bow()
bowGen = bowGen.fit(train_text)
my_dict = bowGen.this_dict

# GroupKFold gives you a KFold partitioner that abides by
# product labels so that products are only ever in a single
# fold
# NOTE(review): `cv` is never passed to anything below; the grid search uses
# a plain cv=5, so product grouping is NOT enforced during model selection.
gkf = GroupKFold(n_splits=10)
cv = gkf.split(train_text, train_lbls, groups=prod_id)

time0 = time.time()

# Hyper-parameter grid for the decision-tree step of the inner pipeline;
# only max_depth actually varies (5 candidates).
params = {
    'baseClf__max_depth': [3, 5, 7, 10, 15],
    'baseClf__min_samples_leaf': [1],
    'baseClf__max_leaf_nodes': [20],
}

# Inner pipeline: project into the topic space, then classify with a tree
bow2Top = docTopTransformer(this_dict=my_dict)
baseClf = DecisionTreeClassifier()

estimators = [('projection', bow2Top), ('baseClf', baseClf)]
semClf = Pipeline(estimators)

# Grid search over the inner pipeline, scored by macro-averaged F1 with 5 folds
grid_DT = GridSearchCV(estimator=semClf, param_grid=params, scoring='f1_macro',
                       cv=5, verbose=1, n_jobs=1)

# Bag n_estimators independent grid searches. Each member is fitted on a
# bootstrap sample holding 3/n_estimators (= 30%) of the training rows, so
# every member re-fits the topic projection as well as the tree.
# NOTE(review): `base_estimator` was renamed to `estimator` in scikit-learn 1.2
# and removed in 1.4; switch the keyword when the environment is upgraded.
n_estimators = 10
clf = BaggingClassifier(base_estimator=grid_DT,
                        bootstrap=True, max_samples=3.0/n_estimators, n_estimators=n_estimators,
                        n_jobs=1, random_state=RANDOM_SEED)

# Outer pipeline: convert token lists with the fitted doc2Bow transformer, then
# hand the result to the bagging ensemble defined above. No SVM is involved in
# this notebook; bagging over sub-samples keeps each individual fit small.
# NOTE(review): `undersamp` is created but never added to a pipeline, so no
# class re-balancing actually takes place.
undersamp = RandomUnderSampler(replacement=False)
estimators = [('projection', bowGen), ('clf', clf)]
semBagClf = Pipeline(estimators)
semBagClf.fit(train_text, train_lbls)


# wall-clock seconds spent building and fitting the model
time1 = time.time()
print(time1-time0)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:  3.1min finished


Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:  3.5min finished


Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:  3.2min finished


Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:  3.2min finished


Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:  3.4min finished


Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:  3.2min finished


Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:  3.1min finished


Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:  3.0min finished


Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:  2.8min finished


Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:  3.2min finished


2024.9758689403534


In [None]:
#Prediction cell
import json
import math

# Read the test reviews: one JSON object per line. Blank lines (for example a
# trailing newline at the end of the file) are skipped instead of crashing
# json.loads.
path = '../data/Sports_and_Outdoors_Reviews_test.json'
with open(path, 'r') as fp:
    all_reviews = [json.loads(line) for line in fp if line.strip()]

# Build parallel lists with one entry per test review that has summary text.
# NOTE(review): reviews without a summary get no row in the output file --
# confirm that this is acceptable for whoever consumes the CSV.
test_doc_list = []
test_prod_id = []
test_reviewer_id = []
test_unixreviewtime = []
for review in all_reviews:
    summary = review.get('summary')
    if summary: #not all reviews have summary text
        test_doc_list.append(summary)
        test_prod_id.append(review.get('asin'))
        test_reviewer_id.append(review.get('reviewerID'))
        test_unixreviewtime.append(review.get('unixReviewTime'))

# Same preprocessing as the training data, then predict with the fitted pipeline
clean_text_test = preprocess_data(test_doc_list)
prediction = semBagClf.predict(clean_text_test)


# Create output file
# Columns: asin, reviewerID, prediction as 0/1, unixReviewTime.
dat = np.column_stack((test_prod_id, test_reviewer_id, prediction.astype(int), test_unixreviewtime))
# fmt must be a single format (applied to every column) or have exactly one
# entry per column; the previous two-element list did not match the four
# columns and made np.savetxt raise. comments='' stops savetxt from prefixing
# the header row with '# ', which would corrupt the first column name.
np.savetxt("Sports_and_Outdoors_Ratings_test.csv", dat, delimiter=",", fmt='%s',
           header='asin,reviewerID,awesomeReview,unixReviewTime', comments='')

In [None]:
# Report the wall-clock seconds elapsed since start_time was recorded in the
# first cell of the notebook.
end_time = time.time()
elapsed = end_time - start_time
print(elapsed)