This notebook estimates the performance of SVM classification (RBF, linear and polynomial kernels are searched) in a document-topic model's feature space

In [6]:
import time
start_time = time.time()

import json

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

import numpy as np

import sys
sys.path.append("../libraries/")
from selector import split_data
from semanticClassifiers import docTopTransformer2

In [7]:
# Load the review records from the training file and split them
# (the 80 is presumably the percentage kept in the first partition, i.e. an
# 80/20 split -- confirm against selector.split_data).
# Only the first partition is used here, so the second one is discarded at once.
json_dat, val_dat = split_data('../data/Sports_and_Outdoors_Reviews_training.json', 80)
del val_dat
# cap the working set at the first 18000 records
json_dat = json_dat[:18000]

In [8]:
# Build four parallel lists (one entry per usable review): summary text,
# full review text, star rating and product id.
summary, review, rating, prod_id = [], [], [], []
for record in json_dat:
    review_text = record.get('reviewText')
    summary_text = record.get('summary')
    # not all reviews have text; skip records missing either field
    if not (review_text and summary_text):
        continue
    rating.append(record.get('overall'))
    summary.append(summary_text)
    review.append(review_text)
    prod_id.append(record.get('asin'))

In [9]:
from spellchecker import SpellChecker

def preprocess_data(doc_set):
    """
    Input  : document list (iterable of strings)
    Purpose: preprocess text (lower-case, tokenize, remove stopwords, stem)
    Output : list with one list of stemmed tokens per input document
    """
    # initialize regex tokenizer (runs of word characters become tokens)
    tokenizer = RegexpTokenizer(r'\w+')
    # create English stop words set for fast membership tests
    en_stop = set(stopwords.words('english'))
    # Create p_stemmer of class PorterStemmer
    p_stemmer = PorterStemmer()
    # NOTE: spelling auto-correction is intentionally disabled, so no
    # SpellChecker is built here (it was previously constructed on every
    # call without ever being used).
    texts = []
    for doc in doc_set:
        # clean and tokenize document string
        tokens = tokenizer.tokenize(doc.lower())
        # drop stop words, then stem what is left
        texts.append([p_stemmer.stem(tok) for tok in tokens if tok not in en_stop])
    return texts

In [10]:
# tokenize / stop-word filter / stem both text fields (summaries first, then reviews)
summary_tok, review_tok = map(preprocess_data, (summary, review))
# binary target: True when the rating is at least 4.5
awesome = np.array([score >= 4.5 for score in rating])

In [None]:
# Exploratory hyperparameter search on a random subsample of the summaries.
# Re-run this cell, tweaking the search space a bit each time, until you're
# satisfied the best parameters sit at a local optimum and are not up against
# the edge of the search space.

from sklearn.model_selection import cross_val_predict, GroupKFold
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import GridSearchCV

from sklearn.pipeline import Pipeline
from copy import deepcopy

# draw 2**12 reviews at random (no seed is set, so each run samples afresh)
idx = np.random.permutation(len(summary_tok))
sample_idx = idx[0:np.power(2,12)]

train_lbls = awesome[sample_idx]
this_prod_id = [prod_id[i] for i in sample_idx]
sum_train_text = deepcopy([summary_tok[i] for i in sample_idx])
#rev_train_text = deepcopy([review_tok[i] for i in sample_idx])

# Folds are grouped by product id. The splits are materialised into a list
# because a bare generator is exhausted after a single pass, which would make
# the search object unusable for a second fit.
gkf = GroupKFold(n_splits=5)
cv = list(gkf.split(sum_train_text, train_lbls, groups=this_prod_id))

#tuned_parameters = [{'doc2Top__d': [100,200,300,400],
#                     'clf__C': [1, 10, 100, 1000]}]
tuned_parameters = [{'clf__kernel': ['rbf'], 
                     'clf__gamma': [1e-2, 1e-3, 1e-4],
                     'clf__C': [100, 1000, 10000]},
                     {'clf__kernel': ['linear'], 
                      'clf__C': [100, 1000, 10000]},
                     {'clf__kernel': ['poly'], 
                      'clf__C': [100, 1000, 10000],
                      'clf__degree': [2]}]

# pipeline: document-topic projection followed by an SVM
doc2Top = docTopTransformer2()
baseClf = SVC(kernel='rbf')
estimators = [('doc2Top', doc2Top), ('clf', baseClf)]
semClf = Pipeline(estimators)
clf = GridSearchCV(semClf, tuned_parameters, cv=cv)
# best_params_ only exists once the search has been fitted
clf.fit(sum_train_text, train_lbls)
print(clf.best_params_)

In [None]:
# Final model: update the search space to be centred on the optimum found
# above, run the search on twice as much data (2**13 reviews), and do it twice,
# averaging over the results using a bagging classifier.
# Compute cross-validated predictions for the metrics, and fit a final model.

# BaggingClassifier is not imported anywhere else in the notebook
from sklearn.ensemble import BaggingClassifier

idx = np.random.permutation(len(summary_tok))
sample_idx = idx[0:np.power(2,13)]

train_lbls = awesome[sample_idx]
this_prod_id = [prod_id[i] for i in sample_idx]
sum_train_text = deepcopy([summary_tok[i] for i in sample_idx])
#rev_train_text = deepcopy([review_tok[i] for i in sample_idx])

# outer CV: folds grouped by product id
gkf = GroupKFold(n_splits=5)

#tuned_parameters = [{'doc2Top__d': [100,200,300,400],
#                     'clf__C': [1, 10, 100, 1000]}]
tuned_parameters = [{'clf__kernel': ['rbf'], 
                     'clf__gamma': [1e-2, 1e-3, 1e-4],
                     'clf__C': [100, 1000, 10000]},
                     {'clf__kernel': ['linear'], 
                      'clf__C': [100, 1000, 10000]},
                     {'clf__kernel': ['poly'], 
                      'clf__C': [100, 1000, 10000],
                      'clf__degree': [2]}]

doc2Top = docTopTransformer2()
baseClf = SVC(kernel='rbf')
estimators = [('doc2Top', doc2Top), ('clf', baseClf)]
semClf = Pipeline(estimators)
n_estimators=2

# The inner search uses a plain 5-fold CV instead of the outer fold generator:
# a generator is single-use, cannot be copied when the ensemble clones its
# estimator, and its indices would refer to the full sample rather than to the
# subset each bagged search is fitted on.
# NOTE(review): the inner folds are therefore not grouped by product id, since
# the groups are not forwarded through the bagging wrapper here.
clf = GridSearchCV(semClf, tuned_parameters, cv=5, scoring='f1_macro')
# The estimator is passed positionally because its keyword name differs
# between scikit-learn releases (base_estimator vs. estimator).
bag = BaggingClassifier(clf,
                        bootstrap=False, max_samples = 1.0/n_estimators, n_estimators=n_estimators,
                        n_jobs=1)

# time the outer, product-grouped cross-validation
t0 = time.time()
sum_predict = cross_val_predict(bag, sum_train_text, train_lbls, groups=this_prod_id, cv=gkf, n_jobs=3)
t1 = time.time()
print(t1-t0)
# final model, fitted on the whole sample (this is what gets pickled later)
clf = bag.fit(sum_train_text,train_lbls)

In [47]:
from sklearn.metrics import classification_report

# per-class precision / recall / F1 of the cross-validated summary predictions
report = classification_report(y_true=train_lbls, y_pred=sum_predict)
print('Summary Text Prediction', report, sep='\n')

Summary Text Prediction
              precision    recall  f1-score   support

       False       0.78      0.42      0.55      1346
        True       0.77      0.94      0.85      2750

    accuracy                           0.77      4096
   macro avg       0.77      0.68      0.70      4096
weighted avg       0.77      0.77      0.75      4096

{'clf__C': 1000, 'clf__gamma': 0.001}


In [43]:
# Baseline for comparison: cross-validate the plain pipeline (semClf, no grid
# search) on the same sample using 5 folds grouped by product id.
gkf = GroupKFold(n_splits=5)
cv = gkf.split(sum_train_text, train_lbls, groups=this_prod_id)
sum_predict_default = cross_val_predict(
    semClf,
    sum_train_text,
    train_lbls,
    groups=this_prod_id,
    cv=cv,
    n_jobs=3,
)

report = classification_report(y_true=train_lbls, y_pred=sum_predict_default)
print('Prediction using hyperparameter defaults', report, sep='\n')

Summary Text Prediction
              precision    recall  f1-score   support

       False       0.70      0.51      0.59      1380
        True       0.78      0.89      0.83      2716

    accuracy                           0.76      4096
   macro avg       0.74      0.70      0.71      4096
weighted avg       0.76      0.76      0.75      4096

{'clf__C': 1, 'doc2Top__d': 400}


In [None]:
# Persist the fitted classifier (`clf`) to disk.
#
# Per the cells above, `clf` is the fitted bagging ensemble: it projects
# tokenized text into the document-topic space, applies two grid-searched SVM
# pipelines (linear, polynomial or RBF kernel, with the hyperparameters the
# search selected for f1 score) and combines their predictions.
import pickle
with open('hypOptSVM.clf', 'wb') as out_file:
    pickle.dump(clf, out_file)