This script estimates the performance of naive bayes classification in a document-topic model's feature space

In [2]:
%load_ext autoreload
%autoreload 2
import time
start_time = time.time()

import json

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

import numpy as np

import ipyparallel as ipp

import gc

import pickle

import pandas as pd

from gensim.models import LsiModel as lsi
from gensim.corpora import Dictionary
from gensim import matutils
from gensim.models.phrases import Phrases, Phraser

from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.model_selection import cross_val_predict, KFold, GroupKFold, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer, FunctionTransformer
from sklearn.metrics import classification_report, roc_curve, auc


In [3]:
##########################################
### Transformers for sklearn pipelines ###
##########################################


# this is a class to accommodate semantic space mappings. It takes
# a bow representation as input and returns features in a latent
# semantic space as output
#
# The class is a valid sklearn transformer and can be used as such
# in sklearn pipelines. For details refer to,
# https://scikit-learn.org/stable/modules/compose.html
#
# Also, this is another useful reference,
# https://scikit-learn.org/stable/auto_examples/compose/plot_compare_reduction.html
class docTopTransformer(TransformerMixin, BaseEstimator):
    """sklearn transformer mapping bag-of-words rows into an LSI topic space.

    Parameters
    ----------
    this_dict : optional
        Passed to the LSI model as ``id2word``.
    d : int
        Number of topics requested from the LSI model.
    distributed : bool
        Passed through to the LSI model's ``distributed`` flag.
    """

    def __init__(self, this_dict=None, d=300, distributed=False):
        self.this_dict = this_dict
        self.d = d
        self.distributed = distributed

    def fit(self, X, y=None):
        """Train the LSI model on X (one row per document); returns self."""
        # Dense2Corpus is handed the transpose of the row-per-document matrix
        corpus = matutils.Dense2Corpus(np.transpose(X))

        # construct a semantic model based on document-topic similarity (15-20 min for 1500k reviews?)
        self.semSpace = lsi(corpus, id2word=self.this_dict, num_topics=self.d,
                            chunksize=20000, distributed=self.distributed)

        return self

    def transform(self, X, y=None):
        """Return a dense (documents x topics) array for the rows of X."""
        corpus = matutils.Dense2Corpus(np.transpose(X))

        # Apply the semantic model to the bag of words (fast)
        feat = self.semSpace[corpus]

        # convert from TransformedCorpus datatype to a numpy doc x topic array.
        # num_terms is fixed to the model's topic count: without it the row
        # count is inferred from the largest topic id actually present, so a
        # batch whose trailing topics are all absent would yield fewer columns
        # than the batch the downstream classifier was fitted on.
        topics_csc = matutils.corpus2csc(feat, num_terms=self.semSpace.num_topics)
        X_ = topics_csc.T.toarray()

        return X_
    

# transforms documents to bag of word representations
class doc2Bow(TransformerMixin, BaseEstimator):
    """sklearn transformer turning tokenized documents into dense count rows.

    ``fit`` builds a gensim ``Dictionary`` from the documents; ``transform``
    converts each document to bag-of-words form with that dictionary and
    returns a dense array with one row per document.
    """

    # takes corpus as input
    def fit(self, X, y=None):
        # the vocabulary is learned from the fitted corpus only
        self.this_dict = Dictionary(X)
        return self

    def _getBOW(self, X):
        # one sparse bag-of-words entry per document, using the fitted dictionary
        return [self.this_dict.doc2bow(tokens) for tokens in X]

    def transform(self, X, y=None):
        sparse_docs = self._getBOW(X)
        vocab_size = len(self.this_dict)

        # corpus2dense output is transposed so that documents become rows
        dense = matutils.corpus2dense(sparse_docs, vocab_size)
        return np.transpose(dense)

    
# computes a profile for each subject specifying their likelihood of
# submitting a particular rating. Transforms features (subject identifiers)
# into a feature vector of log likelihoods representing the likelihood
# of a particular rating given the identity of the subject providing the
# rating
class subjLogLikProfiles(TransformerMixin, BaseEstimator):
    """Per-subject rating profiles expressed as log likelihoods.

    ``fit(X, y)`` takes subject identifiers X and their ratings y and stores,
    for every distinct subject, the vector
    ``log(smoothed count of that subject's ratings per class) - log(class count)``
    in ``self.profile``. ``transform(X)`` looks those vectors up; subjects
    not seen during ``fit`` receive the mean of all stored profiles.

    Parameters
    ----------
    laplaceSmooth : dict or None
        Smoothing settings with keys ``alpha``, ``d`` and ``mu``. ``mu`` may
        be a sequence of length ``d`` or one of the strings ``'empirical'``
        (class frequencies of y) or ``'uniform'`` (``1/d`` each). The dict is
        never modified; string values are resolved on a private copy at fit
        time so the estimator can be refitted on new data.
    """

    def __init__(self, laplaceSmooth=None):
        self.laplaceSmooth = laplaceSmooth

    def _sumByLbl(self, X, lbls, laplaceSmooth=None):
        """Sum the rows of X per distinct label and apply optional smoothing.

        Returns ``(cum_X, uniq_lbls)`` where row i of ``cum_X`` belongs to
        ``uniq_lbls[i]`` and ``uniq_lbls`` is the sorted list of distinct labels.
        """
        # precompute any smoothing factors (SF)
        if laplaceSmooth is not None:
            denSF = laplaceSmooth['alpha'] * laplaceSmooth['d']
            numSF = np.multiply(denSF, laplaceSmooth['mu'])
        else:
            numSF = 0
            denSF = 0

        X = np.asarray(X, dtype=float)
        uniq_lbls, inverse = np.unique(lbls, return_inverse=True)
        inverse = inverse.ravel()

        # unbuffered scatter-add: every row is added to its label's accumulator
        cum_X = np.zeros((len(uniq_lbls), X.shape[1]))
        np.add.at(cum_X, inverse, X)

        # n[i] is the number of rows carrying label i
        n = np.bincount(inverse, minlength=len(uniq_lbls)).astype(float)

        cum_X += numSF
        cum_X *= (n / (n + denSF))[:, np.newaxis]

        return cum_X, list(uniq_lbls)

    def _resolveSmoothing(self, y, yLbl):
        """Return a copy of the smoothing settings with ``mu`` made numeric."""
        if not self.laplaceSmooth:
            return None

        smooth = dict(self.laplaceSmooth)
        mu = smooth['mu']
        if isinstance(mu, str):
            d = smooth['d']
            if mu == 'empirical':
                # estimate mu from the empirical incidence of each class label
                smooth['mu'] = np.array([np.mean(y == yLbl[i]) for i in range(d)])
            elif mu == 'uniform':
                smooth['mu'] = [1.0 / d] * d
            else:
                raise ValueError(
                    "laplaceSmooth['mu'] must be 'empirical', 'uniform' or a "
                    "sequence of numbers, got %r" % (mu,))
        return smooth

    # X holds the subject identifier of each rating, y the rating itself
    def fit(self, X, y):
        y = np.asarray(y)
        yLbl = np.unique(y)

        smooth = self._resolveSmoothing(y, yLbl)

        # one-hot expansion of y: column j is True where y == yLbl[j]
        yExp = y[:, np.newaxis] == yLbl[np.newaxis, :]

        subjectClassIncidence, uniq_prsn_id = self._sumByLbl(yExp, X, smooth)

        # total number of ratings per class, broadcast against every subject row
        classIncidence = np.sum(yExp, axis=0)

        logLik = np.log(subjectClassIncidence) - np.log(classIncidence)

        self.profile = dict(zip(uniq_prsn_id, logLik))

        return self

    def transform(self, X):
        # subjects that were not seen during fit fall back to the mean profile
        meanValue = np.mean(np.array(list(self.profile.values())), axis=0)

        return np.vstack([self.profile.get(rid, meanValue) for rid in X])
    
# a function designed for loglikelihood normalization across reviews
# converts likelihoods to posterior probabilities using a uniform prior
class logLik2PostProb(TransformerMixin, BaseEstimator):
    """Normalize each row of log likelihoods into probabilities summing to 1.

    Equivalent to ``exp(X) / sum(exp(X), axis=1)``, i.e. a posterior under a
    uniform prior, computed in log space.
    """

    def fit(self, X, y=None):
        # stateless: nothing to learn
        return self

    def transform(self, X, y=None):
        X = np.asarray(X, dtype=float)

        # shift every row by its maximum before exponentiating so that very
        # negative log likelihoods do not all underflow to zero; the shift
        # cancels in the normalization. Rows without a finite maximum are
        # left unshifted.
        row_max = np.max(X, axis=1, keepdims=True)
        row_max = np.where(np.isfinite(row_max), row_max, 0.0)
        shifted = X - row_max

        marginal = np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))

        return np.exp(shifted - marginal)

In [4]:
# Splits the data into (training, testing). "testing" in this context is a validation
# dataset we use internally as a sanity check
def split_data(path, percent):
    """Split a JSON-lines file into (training, test) lists of records.

    The first ``percent`` percent of the records go to training and the rest
    to test. Records in the training part that share the ``'asin'`` of the
    first test record are moved to test as well.

    Parameters
    ----------
    path : str
        File with one JSON object per line; blank lines are ignored.
    percent : number
        Share of records (0-100) assigned to training.

    Returns
    -------
    tuple
        ``(training, test)``; when the split point falls at or beyond the
        last record everything is training and test is empty.
    """
    import json
    import math

    with open(path, 'r') as fp:
        all_objs = [json.loads(line) for line in fp if line.strip()]

    index = math.floor((percent / 100) * len(all_objs))
    index = max(0, min(index, len(all_objs)))

    # no record sits at the boundary, so there is nothing to hold back
    if index == len(all_objs):
        return (list(all_objs), [])

    boundary_asin = all_objs[index]['asin']
    head = all_objs[:index]

    training = [x for x in head if x['asin'] != boundary_asin]
    test = [x for x in head if x['asin'] == boundary_asin]
    test.extend(all_objs[index:])

    return (training, test)

# takes list of text fields (e.g. summary or reviewText fields) and
# tokenizes, removes stop words and stems. Returns result as array of 
# lists, one list per review
def preprocess_data(doc_set):
    """Tokenize, stop-word filter and stem every document in doc_set.

    Each document is lower-cased and split on ``\\w+`` runs; a ``'null__'``
    bias token is appended (it acts as a kind of prior and matters for empty
    reviews), English stop words are dropped and the remaining tokens are
    Porter-stemmed. Falsy documents are treated as a single space.

    Returns a list with one list of stemmed tokens per input document.
    """
    word_tokenizer = RegexpTokenizer(r'\w+')
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    cleaned_docs = []
    for doc in doc_set:
        lowered = (doc if doc else ' ').lower()

        doc_tokens = word_tokenizer.tokenize(lowered)
        doc_tokens.append('null__')

        # filter and stem in one pass; the order of surviving tokens is kept
        cleaned_docs.append(
            [stemmer.stem(tok) for tok in doc_tokens if tok not in stop_words])

    return cleaned_docs

# takes a list of token lists (text), the matching product labels (asins)
# and the matching ratings, and merges consecutive entries that share the
# same product label: their token lists are concatenated and their ratings
# averaged. Returns the merged lists, the product label of each merged
# entry, and the averaged ratings. Only adjacent entries are merged, so the
# input should be sorted by product label. There is no uniq_labels argument;
# to parallelize, call this on separate label-sorted subsets (one per engine)
def combine_reviews(text, asins, ratings):
    """Merge consecutive reviews of the same product.

    Parameters
    ----------
    text : sequence
        One entry per review; entries of one product are joined with ``+``.
    asins : sequence
        Product label of each review. Only adjacent equal labels are merged.
    ratings : sequence of numbers
        Rating of each review.

    Returns
    -------
    tuple
        ``(combined_text, products, average_rating)`` with one element per
        run of equal labels. Empty input yields three empty lists.
    """
    if len(asins) == 0:
        return ([], [], [])

    products = [asins[0]]
    combined_text = [text[0]]
    average_rating = []
    total_rating = ratings[0]
    count = 1

    # combine all the summaries into a single text and avg the review ratings for each product
    for i in range(1, len(asins)):
        if asins[i] == products[-1]:
            # `+` builds a new object, so the caller's entries are not mutated
            combined_text[-1] = combined_text[-1] + text[i]
            total_rating += ratings[i]
            count += 1
        else:
            # label changed: close the finished product and start the next one
            average_rating.append(total_rating / count)
            products.append(asins[i])
            combined_text.append(text[i])
            total_rating = ratings[i]
            count = 1
    average_rating.append(total_rating / count)

    return (combined_text, products, average_rating)
    
# similar to combine_reviews but without the rating averaging, for test data. Test data has no
# rating field, so there is nothing to average
def combine_test_reviews(text, asins):
    """Merge consecutive reviews of the same product, without ratings.

    Parameters
    ----------
    text : sequence
        One entry per review; entries of one product are joined with ``+``.
    asins : sequence
        Product label of each review. Only adjacent equal labels are merged.

    Returns
    -------
    tuple
        ``(combined_text, products)`` with one element per run of equal
        labels. Empty input yields two empty lists.
    """
    if len(asins) == 0:
        return ([], [])

    products = [asins[0]]
    combined_text = [text[0]]

    # combine all the summaries of a product into a single text
    for i in range(1, len(asins)):
        if asins[i] == products[-1]:
            # `+` builds a new object, so the caller's entries are not mutated
            combined_text[-1] = combined_text[-1] + text[i]
        else:
            products.append(asins[i])
            combined_text.append(text[i])

    return (combined_text, products)

In [5]:
# searches for first match to target in dat, beginning
# search at start_offset
# useful for searching sorted lists.
def linearSearch(dat, target, start_offset=0):
    """Return the first index >= start_offset where dat equals target.

    Scans forward one element at a time; returns None when no element from
    start_offset onwards matches.
    """
    position = start_offset
    end = len(dat)
    while position < end:
        if target == dat[position]:
            return position
        position += 1
    return None

# takes a length-n vector of values X (e.g. prsn_ratings) and a matching
# length-n vector of labels (e.g. prsn_id) and averages X within each
# distinct label. Does not preserve input order.
# returns the averages and the sorted unique labels (uniq_lbls) they correspond to
# O( n log(n) )
def avgByLbl(X, lbls):
    """Average the entries of X that share a label.

    Parameters
    ----------
    X : array-like, shape (n,) or (n, ...)
        Values to average; the first axis runs over observations, so rows of
        a 2-D array (e.g. per-class probabilities) are averaged element-wise.
    lbls : sequence of length n
        Label of each observation.

    Returns
    -------
    avg_X : numpy.ndarray, shape (n_labels,) + X.shape[1:]
        Mean of X for each distinct label.
    uniq_lbls : list
        Sorted distinct labels; ``avg_X[i]`` belongs to ``uniq_lbls[i]``.
    """
    X = np.asarray(X, dtype=float)

    # np.unique returns the labels sorted; `inverse` maps every observation
    # to the position of its label in that sorted array
    uniq_lbls, inverse = np.unique(lbls, return_inverse=True)
    inverse = inverse.ravel()

    # unbuffered scatter-add so repeated labels accumulate correctly
    sums = np.zeros((len(uniq_lbls),) + X.shape[1:])
    np.add.at(sums, inverse, X)

    counts = np.bincount(inverse, minlength=len(uniq_lbls))
    # reshape counts so they broadcast over any trailing axes of X
    avg_X = sums / counts.reshape((-1,) + (1,) * (X.ndim - 1))

    return avg_X, list(uniq_lbls)

# computes expected rating based on probability of each class label
def expRating(prob, values=(1, 2, 3, 4, 5)):
    """Return the expected rating of each row of class probabilities.

    Parameters
    ----------
    prob : array-like, shape (n, m)
        Probability of each of the m class labels, one row per observation.
    values : sequence of m numbers
        Rating value associated with each column; defaults to 1..5.

    Returns
    -------
    numpy.ndarray, shape (n,)
        ``sum_j prob[i, j] * values[j]`` for every row i.

    Raises
    ------
    ValueError
        If prob is not 2-D or its column count differs from ``len(values)``.
    """
    prob = np.asarray(prob)
    values = np.asarray(values)
    if prob.ndim != 2 or prob.shape[1] != values.shape[0]:
        raise ValueError(
            'prob must have shape (n, %d), got %s' % (values.shape[0], prob.shape))

    # values broadcasts across rows, replacing the explicit tile/reshape
    return np.sum(np.multiply(prob, values), axis=1)

def getAllRatings(path='../data/Sports_and_Outdoors_Ratings_training.csv'):
    """Load the ratings CSV and return its columns as parallel arrays.

    Parameters
    ----------
    path : str
        Comma-separated file with (at least) the columns ``overall``,
        ``reviewerID`` and ``asin``. Defaults to the training ratings file.

    Returns
    -------
    tuple of numpy.ndarray
        ``(prsn_rating, prsn_id, prsn_asin)``, one entry per CSV row.
    """
    with open(path) as file:
        ratings = pd.read_csv(file, delimiter=',')

    # go through Python lists so numpy picks the element dtype itself
    prsn_rating = np.array(ratings['overall'].tolist())
    prsn_id = np.array(ratings['reviewerID'].tolist())
    prsn_asin = np.array(ratings['asin'].tolist())

    return prsn_rating, prsn_id, prsn_asin


def getProdRatings(target_prod_id):
    """Return the average rating of each product in target_prod_id.

    Ratings are loaded with ``getAllRatings`` and averaged per product with
    ``avgByLbl``; the result is aligned with target_prod_id, element by
    element, in the order given (duplicates allowed).

    Parameters
    ----------
    target_prod_id : sequence
        Product identifiers to look up.

    Returns
    -------
    numpy.ndarray
        Average rating for every entry of target_prod_id.

    Raises
    ------
    KeyError
        If a requested product has no rating in the ratings file.
    """
    prsn_rating, prsn_id, prsn_asin = getAllRatings()

    prod_rating, prod_asin = avgByLbl(prsn_rating, prsn_asin)

    # a direct lookup replaces the sort / filter / un-sort sequence, which
    # only lined up when every target was unique and present in the ratings
    rating_by_asin = dict(zip(prod_asin, prod_rating))

    return np.array([rating_by_asin[asin] for asin in target_prod_id])

In [6]:
def readNRC(filename, stemmed=True):
    """ Reads the NRC lexicon into a dictionary.

        The file is read as tab-separated lines of three fields; lines with a
        different field count are skipped. The first field is the word and
        the third an integer flag. Flags are collected in groups of ten
        consecutive valid lines and stored under the word of each line
        (Porter-stemmed when `stemmed` is true), so the file is expected to
        list ten lines per word.

        Returns a dict mapping word -> numpy array of flags.
    """
    wordToEmotions = dict()
    # the stemmer is only needed (and only constructed) when stemming is requested
    p_stemmer = PorterStemmer() if stemmed else None
    count = 0
    with open(filename, 'r') as fp:
        # Loop through lines
        for line in fp:
            words = line.strip('\n').split('\t')
            if len(words) != 3:
                continue
            # Stem word
            word = p_stemmer.stem(words[0]) if stemmed else words[0]
            # parse the value field itself rather than the last character of the line
            val = int(words[2])
            # Store the emotions associated with the word; count == 0 marks
            # the first line of a new group of ten
            if count == 0:
                wordToEmotions[word] = np.array([val])
            else:
                wordToEmotions[word] = np.append(wordToEmotions[word], val)

            count = (count + 1) % 10
    return wordToEmotions

# module-level Porter stemmer used by getEmotions (created once, not per call)
p_stemmer = PorterStemmer()
def getEmotions(words, lexicon):
    """ Returns a length-10 array of emotion scores for a list of words.

        Every word is stemmed and, if the stem is in `lexicon`, its emotion
        array is added to a running total. A non-zero total is divided by the
        sum of its entries so the scores add up to 1; an all-zero total is
        returned unchanged.
    """
    totals = np.zeros(10, dtype=int)
    for raw_word in words:
        stem = p_stemmer.stem(raw_word)
        if stem in lexicon:
            totals = totals + lexicon[stem]

    # normalize by the total number of emotion hits, if there were any
    hits = sum(totals)
    if hits > 0:
        return totals / hits

    return totals

def getDocEmotions(docList, lexicon):
    """ docList is a list of list of words, lexicon is a dictionary of the NRC Word-Emotion Lexicon
        Returns a list of emotion arrays.
    """
    return [getEmotions(x, lexicon) for x in docList]

In [7]:
def _semPipeline(final_clf):
    """Put final_clf behind the topic projection and normalization steps."""
    return Pipeline([('projection', docTopTransformer()),
                     ('normalization', Normalizer()),
                     ('clf', final_clf)])


def _baggedTenths(base, jobs, n_estimators=10):
    """Bag `base` over n_estimators samples of 1/n_estimators of the data, drawn without replacement."""
    # NOTE(review): newer scikit-learn releases rename `base_estimator` to
    # `estimator` -- confirm against the installed version.
    return BaggingClassifier(base_estimator=base,
                             bootstrap=False, max_samples=1.0 / n_estimators,
                             n_estimators=n_estimators, n_jobs=jobs)


def pick_classifier(type, jobs=1):
    """Build the classifier that consumes bag-of-words features.

    Every returned estimator projects its input with ``docTopTransformer``
    and normalizes it with ``Normalizer`` before classifying.

    Parameters
    ----------
    type : str
        One of ``'LinearSVM'``, ``'SVC'``, ``'BaggedDT'``, ``'RandomForest'``,
        ``'Boosted'`` or ``'LogisticRegression'``; any other value selects a
        bagged Gaussian naive Bayes classifier.
    jobs : int
        Parallel jobs for the outermost bagging or grid-search step.

    Returns
    -------
    A ``Pipeline``, or a ``GridSearchCV`` wrapping one for ``'SVC'``,
    ``'BaggedDT'`` and ``'RandomForest'``.
    """
    if type == 'LinearSVM':
        # C is tuned inside every bag, so the inner search stays single-job
        grid_LSVC = GridSearchCV(estimator=LinearSVC(),
                                 param_grid={'C': [1, 10, 100, 1000]},
                                 scoring='f1_macro', cv=10, verbose=1, n_jobs=1)
        return _semPipeline(_baggedTenths(grid_LSVC, jobs))

    elif type == 'SVC':
        tuned_parameters = [{'clf__kernel': ['rbf'],
                             'clf__gamma': [1e-1, 1e-2, 1e-3],
                             'clf__C': [100, 1000, 10000]},
                            {'clf__kernel': ['linear'],
                             'clf__C': [100, 1000, 10000]},
                            {'clf__kernel': ['poly'],
                             'clf__C': [100, 1000, 10000],
                             'clf__degree': [2]}]

        return GridSearchCV(_semPipeline(SVC()), tuned_parameters,
                            cv=10, n_jobs=jobs, scoring='f1_macro')

    elif type == 'BaggedDT':
        # the number of bagged trees is what the grid search tunes; the
        # sample fraction per tree stays at one tenth
        bagClf = BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                   bootstrap=False, max_samples=1.0 / 10, n_jobs=1)

        return GridSearchCV(estimator=_semPipeline(bagClf),
                            param_grid={'clf__n_estimators': [5, 10, 100]},
                            scoring='f1_macro', cv=10, verbose=1, n_jobs=jobs)

    elif type == 'RandomForest':
        param_grid = {
            # Number of trees in random forest
            'clf__n_estimators': [10, 25, 50, 100],
            # Maximum number of levels in tree (None = unlimited)
            'clf__max_depth': [5, 7, 10, 14, None],
            # Minimum number of samples required to split a node
            'clf__min_samples_split': [2, 5, 10],
            # Minimum number of samples required at each leaf node
            'clf__min_samples_leaf': [1, 2, 4],
        }

        # the grid search itself is the classifier handed back to the caller
        return GridSearchCV(estimator=_semPipeline(RandomForestClassifier()),
                            param_grid=param_grid, scoring='f1_macro',
                            cv=10, verbose=1, n_jobs=jobs)

    elif type == 'Boosted':
        # imported here because the file-level imports do not include it
        from sklearn.ensemble import AdaBoostClassifier

        return _semPipeline(AdaBoostClassifier())

    elif type == 'LogisticRegression':
        baseClf = LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.5)

        grid_LR = GridSearchCV(estimator=baseClf, param_grid={'C': [10, 15, 20]},
                               scoring='f1_macro', cv=10, verbose=1, n_jobs=1)
        return _semPipeline(_baggedTenths(grid_LR, jobs))

    else:
        return _semPipeline(_baggedTenths(GaussianNB(), jobs))

In [None]:
def classifyByReviewerID():
    """Estimate how well reviewer identity alone predicts ratings.

    Loads all ratings, then with 10-fold cross validation grouped by product
    (1) predicts individual review ratings, (2) predicts whether a review is
    "awesome" (rating > 4.5) and (3) predicts whether a product is awesome
    from the averaged review probabilities. Prints classification reports
    (each with a permuted "null" baseline) and draws the matching plots.
    Returns nothing.
    """
    from matplotlib import pyplot as plt
    # imported here because the file-level imports do not include it
    from sklearn.calibration import CalibratedClassifierCV

    prsn_rating, prsn_id, prsn_asin = getAllRatings()

    X = prsn_id
    y = prsn_rating
    # grouping labels: the product each review belongs to
    lbl = prsn_asin

    # GroupKFold gives you a KFold partitioner that abides by
    # product labels so that products are only ever in a single
    # fold
    gkf = GroupKFold(n_splits=10)

    # alpha should be optimized
    getLogLikProfiles = subjLogLikProfiles(laplaceSmooth=dict(alpha=0.2, d=5, mu='empirical'))

    # NB isn't a great classifier but it's very fast
    nb = GaussianNB()

    # get subject class likelihood profiles, convert to log likelihood, convert reviews
    # to products (sum log likelihoods, average ratings and threshold for awesome),
    # then classify ratings based on sum of log likelihoods (i.e. product of likelihoods)
    # GaussianNB should automatically apply empirical class priors
    reviewClf = Pipeline([('getLogLikProfiles', getLogLikProfiles), ('normalize', logLik2PostProb()), ('clf', nb)])

    pred_review_scores = cross_val_predict(reviewClf, X, y, groups=lbl, cv=gkf, n_jobs=10)

    report = classification_report(y, pred_review_scores)
    print('Subject Profile Prediction')
    print(report)

    report = classification_report(y, np.random.permutation(pred_review_scores))
    print('Null Prediction')
    print(report)

    # plot predicted vs. observed ratings
    xx = y
    yy = pred_review_scores
    plt.rcParams.update({'font.size': 18})
    plt.figure(figsize=[6, 6])
    plt.plot(xx, yy, '.')
    plt.xlabel('Obs')
    plt.ylabel('Pred')
    plt.xlim([0, 6])
    plt.ylim([0, 6])
    plt.plot(np.unique(xx), np.poly1d(np.polyfit(xx, yy, 1))(np.unique(xx)))
    plt.title('Review rating\npredited by reviewer likelihood profile\n')

    # above we predicted individual review ratings, here we're going to predict
    # the overall awesome vs. not awesome rating
    getLogLikProfiles = subjLogLikProfiles(laplaceSmooth=dict(alpha=0.2, d=5, mu='empirical'))
    nb = GaussianNB()
    calClf = CalibratedClassifierCV(nb)
    reviewClf = Pipeline([('getLogLikProfiles', getLogLikProfiles), ('normalize', logLik2PostProb()), ('clf', calClf)])

    prob_review_rating = cross_val_predict(reviewClf, X, y, groups=lbl, cv=gkf,
                                           method='predict_proba', n_jobs=10)
    pred_review_rating = expRating(prob_review_rating)
    pred_awesome = pred_review_rating > 4.5

    obs_awesome = y > 4.5

    report = classification_report(obs_awesome, pred_awesome)
    print('Awesome Review Predicted by Subject Profile')
    print(report)
    report = classification_report(obs_awesome, np.random.permutation(pred_awesome))
    print('Null Prediction')
    print(report)

    review_fpr, review_tpr, _ = roc_curve(obs_awesome, pred_review_rating)
    review_auc = auc(review_fpr, review_tpr)

    xx = review_fpr
    yy = review_tpr
    plt.figure(figsize=[6, 6])
    plt.plot(xx, yy, label='ROC curve (AUC = %0.2f)' % review_auc, color='red')
    plt.plot([0, 1], [0, 1], color='gray')
    plt.xlabel('FPR')
    plt.ylabel('TPR')
    plt.title('Review predicted as awesome')
    plt.legend(loc='lower right')
    plt.show()

    # above we predicted reviews, now let's predict products.
    # the class probabilities are averaged one column at a time, so only
    # 1-D averaging is required of avgByLbl; every call returns the products
    # in the same sorted order, which keeps the columns aligned
    prob_prod_rating = np.column_stack(
        [avgByLbl(prob_review_rating[:, k], lbl)[0]
         for k in range(prob_review_rating.shape[1])])
    pred_prod_rating = expRating(prob_prod_rating)
    pred_prod_awesome = pred_prod_rating > 4.5

    obs_rating, new_lbl = avgByLbl(y, lbl)
    obs_awesome = obs_rating > 4.5

    report = classification_report(obs_awesome, pred_prod_awesome)
    print('Awesome Product Prediction by Subject Profile')
    print(report)
    report = classification_report(obs_awesome, np.random.permutation(pred_prod_awesome))
    print('Null Prediction')
    print(report)

    prod_fpr, prod_tpr, _ = roc_curve(obs_awesome, pred_prod_rating)
    prod_auc = auc(prod_fpr, prod_tpr)

    xx = prod_fpr
    yy = prod_tpr
    plt.figure(figsize=[6, 6])
    plt.plot(xx, yy, label='ROC curve (AUC = %0.2f)' % prod_auc, color='red')
    plt.plot([0, 1], [0, 1], color='gray')
    plt.xlabel('FPR')
    plt.ylabel('TPR')
    plt.title('Review product as awesome')
    plt.legend(loc='lower right')
    plt.show()

In [12]:
def preprocessJson(json_dat, test_dat=False):
    """Turn review records into per-product token lists and labels.

    Records are sorted by ``'asin'``, their ``'summary'`` texts are cleaned
    in parallel on an ipyparallel cluster with ``preprocess_data``, and the
    reviews of each product are merged with ``combine_reviews``.

    Parameters
    ----------
    json_dat : list of dict
        Review records with ``'asin'`` and ``'summary'`` keys, plus
        ``'overall'`` unless test_dat is true.
    test_dat : bool
        When true the ``'overall'`` field is not read and every rating is
        left at 0, so the returned labels are all False.

    Returns
    -------
    tuple
        ``(train_text, train_lbls, uniq_prod_id)``: merged token lists, a
        boolean array (average rating >= 4.5) and the product id of each
        merged entry.
    """
    # sort data by asin so that reviews of one product are adjacent
    json_dat = sorted(json_dat, key=lambda k: k['asin'])

    summary = [rec.get('summary') for rec in json_dat]
    prod_id = [rec.get('asin') for rec in json_dat]
    rating = np.zeros(len(json_dat))
    if not test_dat:
        for i, rec in enumerate(json_dat):
            rating[i] = rec.get('overall')

    # this cell runs things in parallel. make sure to start an
    # ipython cluster from the notebook dashboard's IPython Cluster
    # tab before running
    rc = ipp.Client()
    dview = rc[:]
    dview.execute('from nltk.tokenize import RegexpTokenizer;' +
                  'from nltk.corpus import stopwords; ' +
                  'from nltk.stem.porter import PorterStemmer;' +
                  'import numpy as np;')

    # clean text
    dview.push(dict(preprocess_data=preprocess_data))
    dview.scatter('summary', summary)  # partitions data

    # run the cleaning on the engines that received the data and wait for it;
    # unlike the %px magic this is also valid outside an IPython session
    dview.execute('cleaned_reviews = preprocess_data(summary)', block=True)
    cleaned_reviews = dview.gather('cleaned_reviews').get()

    # combine text
    total_text, uniq_prod_id, avg_ratings = combine_reviews(cleaned_reviews, prod_id, rating)

    gc.collect()

    # vectorize training data
    train_lbls = np.array(avg_ratings) >= 4.5
    train_text = total_text

    return train_text, train_lbls, uniq_prod_id

In [None]:
def main():
    """Run the whole experiment: cross-validate, validate, predict the test set.

    Reads the training reviews, estimates classifier performance with 10-fold
    cross validation, refits on all training products, reports performance on
    the held-out validation split and finally writes the test-set predictions
    to ``Sports_and_Outdoors_Ratings_test.csv``.
    """

    ####### data input and preprocessing ############

    # Read in training data for an 80/20 split; val_dat is only used for the
    # validation report further down
    json_dat, val_dat = split_data('../data/Sports_and_Outdoors_Reviews_training.json', 80)

    train_text, train_lbls, uniq_prod_id = preprocessJson(json_dat)

    ######## estimate classifier accuracy #############

    # pick a classifier
    clf = pick_classifier('LogisticRegression')

    # create a pipeline that transforms data to a bag of words
    # representation, and passes that bag off to a classifier
    # that may do any of a number of things to those features.
    # most perform a dimensionality reduction using a document
    # topic model before applying some canonical algorithm
    # in the document topic model space
    estimators = [('doc2Bow', doc2Bow()), ('clf', clf)]
    semClf = Pipeline(estimators)

    # cross validate over the pipeline using 10-fold CV
    pred_lbls = cross_val_predict(semClf, train_text, train_lbls, cv=10, n_jobs=4)

    # NOTE(review): labels are thresholded with >= 4.5 in preprocessJson but
    # with > 4.5 here -- confirm which one is intended.
    true_lbls = getProdRatings(uniq_prod_id) > 4.5

    # get classifier performance estimates
    report = classification_report(true_lbls, pred_lbls)
    print('Classifier performance')
    print(report)

    # get null performance estimates
    report = classification_report(true_lbls, np.random.permutation(pred_lbls))
    print('Null performance')
    print(report)

    ############# prepare final model ##############

    semClf.fit(train_text, train_lbls)

    ############# make predictions on validation data ################

    val_text, val_lbls, val_prod_id = preprocessJson(val_dat)

    true_val_lbls = getProdRatings(val_prod_id) > 4.5

    val_pred = semClf.predict(val_text)

    # get classifier performance estimates
    report = classification_report(true_val_lbls, val_pred)
    print('Classifier performance')
    print(report)

    # get null performance estimates
    report = classification_report(true_val_lbls, np.random.permutation(val_pred))
    print('Null performance')
    print(report)

    ############# make predictions in test data ######################

    # import and prepare test data
    with open('../data/Sports_and_Outdoors_Reviews_test.json', 'r') as fp:
        test_dat = [json.loads(x) for x in fp]

    # test records carry no rating, so the 'overall' field must not be read
    test_text, _, test_prod_id = preprocessJson(test_dat, test_dat=True)

    # get model predictions for test data
    test_lbls = semClf.predict(test_text)

    # create output csv file from the test products and their predictions
    # NOTE(review): np.savetxt prefixes the header line with '# ' by default;
    # pass comments='' if a plain CSV header is wanted.
    dat = np.column_stack((test_prod_id, test_lbls.astype(int)))
    np.savetxt("Sports_and_Outdoors_Ratings_test.csv", dat, delimiter=",", fmt=['%s', '%s'],
               header='asin,awesomeReview')

In [None]:
# run the full experiment defined in main() (cross validation, validation report, test-set predictions)
main()