In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import ast
import re
import time

import nltk
import gensim

# Read in Tokenized Reviews from file

In [22]:
reviews_df = pd.read_csv("reviews_all_tokenized.csv")

#Reviews are stored as one string. Need to convert to a list of strings.
reviews_df.review_tokenized = reviews_df.review_tokenized.map(ast.literal_eval)

print(reviews_df.shape)
reviews_df.review_tokenized.head()

(59274, 9)


0    [grits, know, best, so, take, it, from, this, ...
1    [brunch, was, enjoyable, from, the, mimosa, st...
2    [if_you're_looking, for, a, delicious, meal, o...
3    [the, best, way, i, can, describe, this_place,...
4    [had, dinner, here, last_night, and, i'm, stil...
Name: review_tokenized, dtype: object

Define remove phrases function in the event that phrase removal is desired.

In [21]:
def phrase_clear(text_block):
    """
    Removes underscores from tokens in tokenized data. Returns re-tokenized text.
    INPUT:
    text_block = tokenized text block containing n-grams to be removed.
    OUTPUT:
    Tokenized text block that has n-grams transformed back into individual words.
    """
    cleared = []
    for idx, word in enumerate(text_block):
        cleared.extend(word.split("_"))
    return cleared

# Generate Word Embeddings

In [44]:
data = reviews_df.review_tokenized
target = reviews_df.star_rating

In [31]:
print("Generate Word Vectors: ", end='')
start = time.time()
model = gensim.models.Word2Vec(data,size=100,window=5,min_count=1,workers=4)
model.train(data,total_examples=model.corpus_count,epochs=10)
end = time.time()
print(round(end-start,2),"seconds")

Generate Word Vectors: 65.74 seconds


In [42]:
#Save embedded word vector space
wv = model.wv
print(len(wv.vocab),"unique words in the dataset.")

63025 unique words in the dataset.


# Predict Ratings

In [33]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

In [34]:
#Creating Mean Word Embeddings using Mean Embedding Vectorizer class
class W2vVectorizer(object):
    
    def __init__(self, model):
        self.w2v = model.wv
        self.dimensions = model.vector_size
    
    # Need to implement a fit method as required for sklearn Pipeline.
    def fit(self, X, y):
        return self
            
    def transform(self, X):
        return np.array([
            np.mean([self.w2v.get_vector(w) for w in words], axis=0) for words in X])

In [35]:
rf =  Pipeline([("Word2Vec Vectorizer", W2vVectorizer(wv)),
              ("Random Forest", RandomForestClassifier(n_estimators=100, verbose=True))])
svc = Pipeline([("Word2Vec Vectorizer", W2vVectorizer(wv)),
                ('Support Vector Machine', SVC(gamma='auto'))])
lr = Pipeline([("Word2Vec Vectorizer", W2vVectorizer(wv)),
              ('Logistic Regression', LogisticRegression(multi_class='auto'))])

models = [('Random Forest', rf),
          ("Support Vector Machine", svc),
          ("Logistic Regression", lr)]

  """


In [46]:
print("Cross Validation Scores: ", end='')
start = time.time()
scores = [(name, cross_val_score(model, data, target, cv=2).mean()) for name, model, in models]
end = time.time()
print(round(end-start,2),"seconds")

scores

Cross Validation Scores: 

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   31.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   30.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.7s finished


600.99 seconds


[('Random Forest', 0.5499374790351649),
 ('Support Vector Machine', 0.5982049244306866),
 ('Logistic Regression', 0.5882006478752827)]

### Predict Ratings using GloVe Word Vectors

In [47]:
#Using GloVe Vectors
glove = {}
with open('glove.6B.50d.txt', 'rb') as f:
    for line in f:
        parts = line.split()
        word = parts[0].decode('utf-8')
        if word in total_vocabulary:
            vector = np.array(parts[1:], dtype=np.float32)
            glove[word] = vector

In [48]:
#Creating Mean Word Embeddings using Mean Embedding Vectorizer class
class W2vVectorizer_glove(object):
    
    def __init__(self, w2v):
        # takes in a dictionary of words and vectors as input
        self.w2v = w2v
        if len(w2v) == 0:
            self.dimensions = 0
        else:
            self.dimensions = len(w2v[next(iter(glove))])
    
    # Note from Mike: Even though it doesn't do anything, it's required that this object implement a fit method or else
    # It can't be used in a sklearn Pipeline. 
    def fit(self, X, y):
        return self
            
    def transform(self, X):
        return np.array([
            np.mean([self.w2v[w] for w in words if w in self.w2v]
                   or [np.zeros(self.dimensions)], axis=0) for words in X])

In [49]:
rf =  Pipeline([("Word2Vec Vectorizer", W2vVectorizer_glove(glove)),
              ("Random Forest", RandomForestClassifier(n_estimators=100, verbose=True))])
svc = Pipeline([("Word2Vec Vectorizer", W2vVectorizer_glove(glove)),
                ('Support Vector Machine', SVC(gamma='auto'))])
lr = Pipeline([("Word2Vec Vectorizer", W2vVectorizer_glove(glove)),
              ('Logistic Regression', LogisticRegression(multi_class='auto'))])

models = [('Random Forest', rf),
          ("Support Vector Machine", svc),
          ("Logistic Regression", lr)]

In [50]:
print("Cross Validation Scores: ", end='')
start = time.time()
scores = [(name, cross_val_score(model, data, target, cv=2).mean()) for name, model, in models]
end = time.time()
print(round(end-start,2),"seconds")
scores

Cross Validation Scores: 

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   23.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   21.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.7s finished


416.65 seconds


[('Random Forest', 0.4827917019687721),
 ('Support Vector Machine', 0.46592108185976805),
 ('Logistic Regression', 0.5072376697259633)]

# Word Embeddings - Deep Learning

In [51]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dense, LSTM, Embedding
from keras.layers import Dropout, Activation, Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.preprocessing import text, sequence

Using Theano backend.


In [52]:
y = pd.get_dummies(target).values

In [63]:
#tokenizer = text.Tokenizer(num_words=20000)
#tokenizer.fit_on_texts(list(reviews_df.review))
#tokenized_reviews = tokenizer.texts_to_sequences(reviews_df.review)
#X_t = sequence.pad_sequences(tokenized_reviews, maxlen=100)

tokenizer = text.Tokenizer(num_words=20000)
#tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(data)
tokenized_reviews = tokenizer.texts_to_sequences(data)
X_t = sequence.pad_sequences(tokenized_reviews, maxlen=100)

In [64]:
embedding_size = 128
input_ = Input(shape=(100,))
x = Embedding(20000, embedding_size)(input_)
x = LSTM(25, return_sequences=True)(x)
x = GlobalMaxPool1D()(x)
x = Dropout(0.5)(x)
x = Dense(50, activation='relu')(x)
x = Dropout(0.5)(x)
# There are 41 different possible classes, so we use 41 neurons in our output layer
x = Dense(5, activation='softmax')(x)

model = Model(inputs=input_, outputs=x)

In [65]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 100, 128)          2560000   
_________________________________________________________________
lstm_2 (LSTM)                (None, 100, 25)           15400     
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 25)                0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 25)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 50)                1300      
_________________________________________________________________
dropout_4 (Dropout)          (None, 50)                0   

In [67]:
model.fit(X_t, y, epochs=5, batch_size=32, validation_split=0.1)

Train on 53346 samples, validate on 5928 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1a59631ba8>