this script uses neural nets in the following places
* for part of speech classification in lemmatization
* word2Vec semantic embeddings

Ideally we can do several things here:
* use existing word embeddings to classify summary text
* use more sentiment analysis to classify review text
* have a classifier that enforces True iff review text is true and summary text is true (true = awesome)
* assign sentiments to nouns, identify relevant/irrelevant nouns, and classify based on their sentiments

sentiment analysis alg idea:
find adj-noun pairs
train classifier on training set based on noun semantics. Identify relevant nouns by a threshold over the SVM coefficients
train classifer on training set based on verb semantics.
predict outcome based on verbs attached to relevant nouns.

In [1]:
%load_ext autoreload
%autoreload 2
import time
start_time = time.time()

import json

import numpy as np

import sys
sys.path.append("../../libraries/")
from selector import split_data
from semanticClassifiers import docTopTransformer2,bigramsPhraser

In [2]:
# Read in testing data for 80/20 split
# we won't use val_dat at all
json_dat, val_dat = split_data('../../data/Sports_and_Outdoors_Reviews_training.json', 80)
json_dat = json_dat[0:141792]
del val_dat

In [3]:
# create lists with one review per elem
summary = []
review = []
rating = []
prod_id = []
for i in range(0,len(json_dat)):
    if json_dat[i].get('summary'): #not all reviews have text
        rating.append(json_dat[i].get('overall'))
        summary.append(json_dat[i].get('summary'))
        prod_id.append(json_dat[i].get('asin'))
        
del json_dat

In [4]:
# the get_wordnet_pos() function relies on a neural method, 
# and preprocess_data_lemmatize therefore does too. Implemented
# here for some benchmarking only

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from spellchecker import SpellChecker


def get_wordnet_pos(tag):
    """Map POS tag to first character lemmatize() accepts"""
    tag = tag[0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)


def preprocess_data_lemmatize(doc_set):
    """
    Input  : docuemnt list
    Purpose: preprocess text (tokenize, removing stopwords, and stemming)
    Output : preprocessed text
    """
    # initialize regex tokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    # create spell checker
    sc = SpellChecker()
    # create English stop words list
    en_stop = set(stopwords.words('english'))
    # create spell checker
    sp = SpellChecker()
    # Create lemmatizer class
    wordnet_lemmatizer = WordNetLemmatizer()    
    # list for tokenized documents in loop
    texts = []
    # loop through document list
    for i in doc_set:
        # clean and tokenize document string
        raw = i.lower()
        tokens = tokenizer.tokenize(raw)
        # autocorrect spelling
        tokens = [sp.correction(this_tok) for this_tok in tokens]
        # get part of speech tag
        this_pos = [tag[1] for tag in nltk.pos_tag(tokens)]
        # remove stop words from tokens, also return associatd tags
        stopped_tokens = [(this_tok, this_pos[i]) for i,this_tok in enumerate(tokens)
                          if not this_tok in en_stop]        
        # lemmatize tokens
        lemmatized_tokens = [wordnet_lemmatizer.lemmatize(this_tok[0], pos=get_wordnet_pos(this_tok[1]))
                             for this_tok in stopped_tokens]
        # add tokens to list
        texts.append([(this_tok, this_pos[i]) for i,this_tok in enumerate(lemmatized_tokens)])
    return texts

In [None]:
import time
t0 = time.time()
awesome = np.array(rating) >= 4.5
lem_summary_text = preprocess_data_lemmatize(summary[0:18000])
t1 = time.time()
#del summary
#del review

print(t1-t0)
lem_summary_text

In [5]:
from multiprocessing import Pool

pool = Pool(16)
try:
    x = pool.map(preprocess_data_lemmatize, [summary])
finally:
    pool.close()
    pool.join()

lem_summary_text = x[0]
awesome = np.array([x >= 4.5 for x in rating])

In [6]:
i = 1
print(summary[i])
print(lem_summary_text[i])

Much better than plastic caps.
[('much', 'RB'), ('good', 'JJR'), ('plastic', 'IN'), ('cap', 'NN')]


In [8]:
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.cross_decomposition import PLSRegression

# expects corpus of tuples as input, (word, part-of-speach)
# nltk.pos_tagger has the following pos_tag convention
# N* - noun
# J* - adjective
# V* - verb
# R* - adverb
# use nltk.help.upenn_tagset('CD') to reverse lookup tags, 
# where CD is whatever your tag is
class posDoc(TransformerMixin, BaseEstimator):
    def __init__(self, pos_tag="N"):
        self.pos_tag = pos_tag
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X_ = []
        for doc in X:
            this_doc = [word[0] for word in doc 
                        if word[1][:len(self.pos_tag)] == self.pos_tag]
            this_doc.append('null') # prior value to avoid null set return
            X_.append(this_doc)
        return X_
    
# see here regarding this function:
# https://stackoverflow.com/questions/41881086/valueerror-with-scikit-learn-plsregression-when-used-in-pipeline
class PLSRegressionWrapper(PLSRegression):
    def transform(self, X):
        return super().transform(X)

    def fit_transform(self, X, Y):
        return self.fit(X,Y).transform(X)

In [None]:
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.decomposition import PCA

from gensim.models import Word2Vec, KeyedVectors
    
class getLatentWord(TransformerMixin, BaseEstimator):
    
    def __init__(self):
        self.model = KeyedVectors.load_word2vec_format('../../../GoogleNews-vectors-negative300.bin', binary=True)
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        cumWord = []
        for review in X:
            review_word_vec = [self.model[word] for word in review 
                                   if self.model.vocab.__contains__(word)]
            review_word_vec.append(self.model['null']) # works as prior so that we can score empty reviews
            cumWord.append(np.sum(review_word_vec, axis=0))
        return cumWord
    
            
# assumes words are already in latent representation
class getLatentTopics(TransformerMixin, BaseEstimator):
    
    # takes words as input
    def fit(self, X, y=None):
        
        # train a document-topic model        
        self.pca = PCA(n_components = 100)
        self.pca.fit(X)
        
        return self
    
    def transform(self, X, y=None):
        
        return self.pca.transform(X)
    
class docTopTransformer(BaseEstimator):
    

In [16]:
# fit some example models to see what's being misclassified or correctly classified

from sklearn.model_selection import cross_val_predict, GroupKFold
from sklearn.svm import LinearSVC, SVC
from sklearn import tree
from sklearn.ensemble import BaggingClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
    
from sklearn.pipeline import Pipeline
from copy import deepcopy

idx = np.random.permutation([x for x in range(0, len(lem_summary_text))])
sample_idx = idx[0:np.power(2,16)]

train_lbls = awesome[sample_idx]
this_prod_id = [prod_id[i] for i in sample_idx]
lem_train_text = deepcopy([lem_summary_text[i] for i in sample_idx])

gkf = GroupKFold(n_splits=5)
cv = gkf.split(lem_train_text, train_lbls, groups=this_prod_id)
n_dim = 2

nounSpace = Pipeline([('getNoun', posDoc(pos_tag="N")), ('doc2Top',docTopTransformer2(d=75)), 
                      ('dRedux', PLSRegressionWrapper(n_components=n_dim))])
verbSpace = Pipeline([('getNoun', posDoc(pos_tag="V")), ('doc2Top',docTopTransformer2(d=75)), 
                      ('dRedux', PLSRegressionWrapper(n_components=n_dim))])
advSpace = Pipeline([('getNoun', posDoc(pos_tag="R")), ('doc2Top',docTopTransformer2(d=75)), 
                      ('dRedux', PLSRegressionWrapper(n_components=n_dim))])
adjSpace = Pipeline([('getNoun', posDoc(pos_tag="J")), ('doc2Top',docTopTransformer2(d=75)), 
                      ('dRedux', PLSRegressionWrapper(n_components=n_dim))])
numSpace = Pipeline([('getNoun', posDoc(pos_tag="CD")), ('doc2Top',docTopTransformer2(d=10)), 
                      ('dRedux', PLSRegressionWrapper(n_components=n_dim))])
featureExtract = FeatureUnion([('nouns', nounSpace), ('verbs', verbSpace), ('adv', advSpace), 
                               ('adj', adjSpace), ('numbers', numSpace)])

clf = SVC(kernel='poly', degree=2)

estimators = [('getFeatures', featureExtract), ('classify', clf)]
semClf = Pipeline(estimators)

t0 = time.time()
predict = cross_val_predict(semClf, lem_train_text, train_lbls, cv=cv, n_jobs=5)
t1 = time.time()

print(t1-t0)

print(1 - np.sum(predict != train_lbls)/len(train_lbls))


147.31337714195251
0.758697509765625


In [17]:
from sklearn.metrics import classification_report

report = classification_report(train_lbls, predict)
print('Summary Text Prediction')
print(report)

Summary Text Prediction
              precision    recall  f1-score   support

       False       0.88      0.31      0.45     21538
        True       0.74      0.98      0.85     43998

    accuracy                           0.76     65536
   macro avg       0.81      0.64      0.65     65536
weighted avg       0.79      0.76      0.72     65536



In [None]:
test_idx = [i for i in range(0,18000) if i not in sample_idx]
#lem_train_text = deepcopy([lem_summary_text[i] for i in sample_idx])
#doc2wordEmb = getLatentWord()
semClf = semClf.fit(lem_train_text, train_lbls)

test_lbls = awesome[test_idx]
test_text = deepcopy([lem_summary_text[i] for i in test_idx])

In [None]:
predict2 = semClf.predict(test_text)
print(1 - np.sum(predict2 != test_lbls)/len(test_lbls))