### Import Packages

In [22]:
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.python.framework.errors_impl import ResourceExhaustedError

In [23]:
from keras.layers import Dense
from keras.models import Model
from keras.models import Sequential
from keras import backend as K

In [24]:
from glob import glob
import json
import numpy as np
import os
import pandas as pd
#import plotly.graph_objs as go
#import plotly.plotly as py
import spacy
from sklearn.cross_decomposition import CCA

### Load Data

In [25]:
DATA_PATH = os.path.join("data", "small_example.csv")
full_dataset = pd.read_csv(DATA_PATH)

In [26]:
full_dataset = full_dataset[np.logical_not(full_dataset["reviewText"].isnull().values)]
full_dataset.head()

Unnamed: 0.1,Unnamed: 0,asin,helpful,overall,reviewTime,reviewerID,summary,reviewText,reviewDoc
0,0,1881509818,"[0, 0]",5,26 01 2014,AIXZKN4ACSKI,Woks very good,This came in on time and I am veru happy with ...,This came in on time and I am veru happy with ...
1,1,1881509818,"[1, 1]",5,02 02 2012,A1L5P841VIO02V,Works as well as the factory tool,I had a factory Glock tool that I was using fo...,I had a factory Glock tool that I was using fo...
2,2,1881509818,"[2, 2]",4,28 02 2012,AB2W04NI4OEAD,"It's a punch, that's all.",If you do not have a 3/32 punch or would like ...,If you do not have a 3/32 punch or would like ...
3,3,1881509818,"[0, 0]",4,05 02 2012,A148SVSWKTJKU6,It's a punch with a Glock logo.,This works no better than any 3/32 punch you w...,This works no better than any 3/32 punch you w...
4,4,1881509818,"[0, 0]",4,23 04 2013,AAAWJ6LW9WMOO,"Ok,tool does what a regular punch does.",I purchased this thinking maybe I need a speci...,I purchased this thinking maybe I need a speci...


In [27]:
# Star Rating Statistics
star_stats = full_dataset['overall'].value_counts().to_dict()
star_stats

{5: 32243, 4: 10582, 3: 3878, 2: 1774, 1: 1493}

In [28]:
# Train/Val Split
train_x = []
train_y = []

val_x = []
val_y = []

for key, val in star_stats.items():
    key_reviews = full_dataset[full_dataset["overall"] == key]
    train_idxs = np.random.choice(len(key_reviews), int(len(key_reviews) / 2), replace = False)

    train_reviews = full_dataset.iloc[train_idxs]["reviewText"].tolist()
    val_reviews = full_dataset[~ full_dataset.index.isin(train_idxs)]["reviewText"].tolist()
    
    train_x += train_reviews
    val_x += val_reviews
    
    train_y += list([key] * len(train_reviews))
    val_y += list([key] * len(val_reviews))

### Setup Pre-Trained AutoEncoder

In [30]:
# Download Module
module_url = "https://tfhub.dev/google/universal-sentence-encoder/2"
embed = hub.Module(module_url)

## Run, Save and Load Data

In [32]:
BATCHES = 750
EMBEDDINGS_PATH = os.path.join("data", "embeddings")

In [31]:
# for memory ease
try:
    for i in range(int(len(train_x) / BATCHES) + 1):
        with tf.Session() as session:
            session.run([tf.global_variables_initializer(), tf.tables_initializer()])
            train_sentence_embeddings = session.run(embed(train_x[i * BATCHES: (i+1) * BATCHES]))
            np.save(os.path.join(EMBEDDINGS_PATH, "train_sent_embeddings_{0}.npy".format(i)), train_sentence_embeddings)
except ResourceExhaustedError:
    pass
        
#for memory ease
try:
    for i in range(int(len(val_x) / BATCHES) + 1):
        with tf.Session() as session:
            session.run([tf.global_variables_initializer(), tf.tables_initializer()])
            val_sentence_embeddings = session.run(embed(val_x[i * BATCHES: (i+1) * BATCHES]))
            np.save(os.path.join(EMBEDDINGS_PATH, "val_sent_embeddings_{0}.npy".format(i)), val_sentence_embeddings)
except ResourceExhaustedError:
    pass

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0312 11:55:21.990252 15852 saver.py:1483] Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0312 11:58:28.780424 15852 saver.py:1483] Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0312 12:01:32.991943 15852 saver.py:1483] Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0312 12:04:40.834889 15852 saver.py:1483] Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0312 12:07:59.025969 15852 saver.py:1483] Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0312 12:28:37.508083 15852 saver.py:1483] Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0312 12:32:18.840153 15852 saver.py:1483] Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0312 12:36:07.856875 15852 saver.py:1483] Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0312 12:39:53.656727 15852 saver.py:1483] Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0312 12:44:44.607013 15852 saver.py:1483] Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0312 12:49:25.071828 15852 saver.py:1483] Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0312 12:53:34.452924 15852 saver.py:1483] Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0312 12:57:44.470756 15852 saver.py:1483] Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0312 13:02:28.160290 15852 saver.py:1483] Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0312 13:07:30.068809 15852 saver.py:1483] Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0312 13:12:45.033691 15852 saver.py:1483] Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0312 13:42:58.463275 15852 saver.py:1483] Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0312 13:52:13.014434 15852 saver.py:1483] Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0312 14:02:20.031887 15852 saver.py:1483] Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0312 14:12:38.614241 15852 saver.py:1483] Saver not created because there are no variables in the graph to restore


KeyboardInterrupt: 

In [33]:
# Load Saved Training Data
train_files = glob(os.path.join(EMBEDDINGS_PATH, "train*"))
train_files.sort(key=lambda s : int(s.split(".")[0].split("_")[3]))

train_sent_list = []
for file in train_files:
    train_sent_list.append(np.load(file))
    
train_sentence_embeddings = np.concatenate(train_sent_list, axis = 0)
train_y = train_y[:len(train_sentence_embeddings)]

In [34]:
# Load Saved Validation Data
val_files = glob(os.path.join(EMBEDDINGS_PATH, "val*"))
val_files.sort(key=lambda s : int(s.split(".")[0].split("_")[3]))

val_sent_list = []
for file in val_files:
    val_sent_list.append(np.load(file))
    
val_sentence_embeddings = np.concatenate(val_sent_list, axis = 0)
val_y = val_y[:len(val_sentence_embeddings)]

In [35]:
# CONSTANTS
NUM_TRAIN = train_sentence_embeddings.shape[0]
NUM_VAL = val_sentence_embeddings.shape[0]
EMBEDDING_DIM = train_sentence_embeddings.shape[1] # 512
STAR_CLASSES = 5

### Predict Helpfullness Rating

In [36]:
# Prepare y (0-indexed and one-hot)
train_y_hot = np.zeros((NUM_TRAIN, STAR_CLASSES))
train_y_hot[range(NUM_TRAIN), np.array(train_y) - 1] = 1
val_y_hot = np.zeros((NUM_VAL, STAR_CLASSES))
val_y_hot[range(NUM_VAL), np.array(val_y) - 1] = 1

In [37]:
# Creating Star Rating Classifier
output1_dim = 256
output2_dim = 128
NUM_EPOCHS = 2
BATCH_SIZE = 32

star_classifier = Sequential()
star_classifier.add(Dense(output1_dim, input_dim=EMBEDDING_DIM, kernel_initializer='normal', activation='relu'))
star_classifier.add(Dense(output2_dim, kernel_initializer='normal', activation='relu'))
star_classifier.add(Dense(STAR_CLASSES, kernel_initializer='normal', activation='softmax'))

star_classifier.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
print(star_classifier.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_7 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_8 (Dense)              (None, 128)               32896     
_________________________________________________________________
dense_9 (Dense)              (None, 5)                 645       
Total params: 164,869
Trainable params: 164,869
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# Train Star Rating Classifier
star_classifier.fit(train_sentence_embeddings, train_y_hot, 
                    epochs = NUM_EPOCHS, batch_size = BATCH_SIZE,
                    validation_data=(val_sentence_embeddings, val_y_hot), verbose = 1)

Train on 24984 samples, validate on 15000 samples
Epoch 1/2


In [None]:
# Get Star Hidden Repr.
hidden_repr = Model(inputs = star_classifier.layers[0].input, outputs = star_classifier.layers[1].output)
val_star_embeddings = hidden_repr.predict(val_sentence_embeddings)

### CCA - Original & Star Embeddings

In [None]:
# CONSTANTS
K_comps = 2

In [None]:
# TODO
shuffler = list(range(val_star_embeddings.shape[0]))
np.random.shuffle(shuffler)
shuffled_val_embeddings = val_sentence_embeddings[shuffler[:1000], :]
shuffled_star_embeddings = val_star_embeddings[shuffler[:1000], :]

In [None]:
cca = CCA(n_components=K)
cca.fit(shuffled_val_embeddings, shuffled_star_embeddings)

In [None]:
# TODO
# Get Embeddings
# Train classifier on Star Ratings, Topics available in each sentence
# Show what happens if we cluster using these labels vs. using CCA on the hidden representations

In [None]:
shuffled_val_embeddings2 = val_sentence_embeddings[shuffler[1000:], :]
shuffled_star_embeddings2 = val_star_embeddings[shuffler[1000:], :]
val_sent_low, val_star_low = cca.transform(shuffled_val_embeddings2, shuffled_star_embeddings2)

## Plot CCA Results

In [None]:
# low-dim sentence representations
COLORS = ['red', 'orange', 'green', 'blue', 'black']

data = []
val_y = np.array(val_y)
for i in range(STAR_CLASSES):
    star_x = val_sent_low[[val_y == i + 1], 0]
    star_y = val_sent_low[[val_y == i + 1], 1]
    data.append(go.Scatter(x = star_x,
                           y = star_y,
                           mode = 'markers',
                           marker = dict(color=COLORS[i])
    ))
py.plot(data)