This script works at a higher level of abstraction than the word-based semantic space. Instead of using similarity between word coocurrence to compute it's space, the space is defined based on the similarity of "documents" (aka reviews). The advantage of this approach is that each review ends up represented as a vector, eliminating the need to categorize words. The implementation is less transparent though and instead demonstrates the usage of gensim tools.

This script also implements naive bayes. Classification accuracy is around 66% in a hold out sample.

In [12]:
import json

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk import bigrams

import numpy as np
import itertools

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA


In [2]:
this_list = []
for line in open('../data/Sports_and_Outdoors_Reviews_training.json', 'r'):
    this_list.append(json.loads(line))

In [3]:
# create a list, doc_list, with one review per elem
doc_list = []
rating = []
for i in range(0,len(this_list)):
    if this_list[i].get('reviewText'): #not all reviews have text
        rating.append(this_list[i].get('overall'))
        doc_list.append(this_list[i].get('reviewText'))

In [4]:
def preprocess_data(doc_set):
    """
    Input  : docuemnt list
    Purpose: preprocess text (tokenize, removing stopwords, and stemming)
    Output : preprocessed text
    """
    # initialize regex tokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    # create English stop words list
    en_stop = set(stopwords.words('english'))
    # Create p_stemmer of class PorterStemmer
    p_stemmer = PorterStemmer()
    # list for tokenized documents in loop
    texts = []
    # loop through document list
    for i in doc_set:
        # clean and tokenize document string
        raw = i.lower()
        tokens = tokenizer.tokenize(raw)
        # remove stop words from tokens
        stopped_tokens = [i for i in tokens if not i in en_stop]
        # stem tokens
        stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
        # add tokens to list
        texts.append(stemmed_tokens)
    return texts

In [5]:
clean_text=preprocess_data(doc_list)

In [190]:
from gensim.models import LsiModel as lsi
from gensim.corpora import Dictionary

n_doc = 2000

# mapping of words to numbers
this_dict = Dictionary(clean_text[0:n_doc])

# transform corpus (clean_text) into a 2d array word counts
bow_corpus = [this_dict.doc2bow(text) for text in clean_text[0:n_doc]]

# construct a semantic space based on document-topic similarity
semSpace = lsi(bow_corpus, id2word=this_dict, num_topics=300)

# convert documents into vectors in topic feature space
vectors = semSpace[bow_corpus]

In [191]:
# convert from TransformedCorpus datatype to numpy doc x topic array
from gensim import matutils
all_topics_csr = matutils.corpus2csc(vectors)
all_topics_numpy = all_topics_csr.T.toarray()

In [192]:
from sklearn.naive_bayes import GaussianNB

lbl = [i >4.5 for i in rating]

n_train = int(np.floor(n_doc*0.75))
n_test = int(n_doc - n_train)

gnb = GaussianNB()
X_train = all_topics_numpy[0:n_train]
y_train = lbl[0:n_train]
X_test = all_topics_numpy[(n_train+1):(n_train+n_test)]
y_test = lbl[(n_train+1):(n_train+n_test)]
y_pred = gnb.fit(X_train, y_train).predict(X_test)
print("Number of mislabeled points out of a total %d points : %d"
      % (X_test.shape[0], (y_test != y_pred).sum()))


Number of mislabeled points out of a total 1249 points : 435
