In [1]:
from __future__ import print_function, division

import os
import os.path
import pandas as pd
from io import StringIO
import io
import unicodedata
import re

import tensorflow as tf
import numpy as np
np.set_printoptions(threshold = 10000)
import collections
import random

#from tensorflow.contrib.rnn import LSTMCell as Cell
from gru import GRUCell as Cell #custom implementation with normalization
from tensorflow.python.ops.rnn import dynamic_rnn as rnn
from tensorflow.python.ops.rnn import bidirectional_dynamic_rnn as bi_rnn

from attention import attention
from ordloss import *
from utils import *
from dataUtilstest import *


from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from scipy import stats
from sklearn.metrics import accuracy_score
 

In [2]:
# Configuration. These values need to be updated to match the parameters of
# the trained model being loaded.

# Input limits: SEQUENCE_LENGTH is the maximum length of a sentence in words,
# SEQUENCE_LENGTH_D is the maximum length of a document in sentences.
SEQUENCE_LENGTH, SEQUENCE_LENGTH_D = 65, 40

# System parameters -- need to be the same as in han_train.
HIDDEN_SIZE, ATTENTION_SIZE = 50, 10       # used by the 'sentence' scope
HIDDEN_SIZE_D, ATTENTION_SIZE_D = 15, 5    # used by the 'document' scope
KEEP_PROB = 0.7
BATCH_SIZE = 10

# True: ordinal regression head; False: logistic (softmax) regression head.
ordinal = False

# The checkpoint name is keyed by the sum of the two hidden sizes.
MODEL_PATH = "model/model%d" % (HIDDEN_SIZE + HIDDEN_SIZE_D)

In [3]:
import csv

# Load the vocabulary saved for this model size from a two-column CSV: the
# first column becomes the key and the second the value (values stay strings,
# exactly as csv.reader yields them). The file is opened in a `with` block so
# the handle is closed as soon as the rows have been read.
dict_path = 'data/dict_han%d.csv' % (HIDDEN_SIZE + HIDDEN_SIZE_D)
dictionary = {}
with open(dict_path) as dict_file:
    for key, val in csv.reader(dict_file):
        dictionary[key] = val

In [4]:
# Width of each embedding vector; the embedding matrix below has one row per
# dictionary entry.
EMBEDDING_DIM = 200
vocabulary_size = len(dictionary)
NUM_WORDS = vocabulary_size

In [5]:
# The test data set: a CSV file whose text column is labelled 'text'
# (a 'stars' column is also selected when the frame is previewed below).
df_test = pd.read_csv('data/test.csv')

In [12]:
# Perform basic text clean-up, lower-casing, and British -> US spelling
# conversion by passing every review through the imported clean_ helper.
# The column is walked once instead of materialising a row Series per index
# with df_test.iloc[i], which gives the same values far more cheaply.
# NOTE: this overwrites df_test['text'] in place, so re-running the cell
# applies clean_ to text that has already been cleaned.
df_test['text'] = [clean_(raw_text) for raw_text in df_test['text']]
df_test[['text','stars']].head()

Unnamed: 0,text,stars
0,this review is solely for the vanilla cone . h...,5
1,good arcadia/biltmore area place that 's in a ...,4
2,i've flown in and out of sky harbor many times...,3
3,i love this place ! not only does it make me f...,4
4,opening night of star wars ! the theater handl...,4


In [7]:
# Encode the cleaned test frame with the loaded dictionary. The result is used
# below as an array (its .shape[0] is read) and handed to test_batch_generator.
# NOTE(review): presumably read_test_set pads/truncates to SEQUENCE_LEN words per
# sentence and SEQUENCE_LEN_D sentences per document -- confirm in dataUtilstest.
X_test_ = read_test_set(df_test, dictionary, SEQUENCE_LEN_D = SEQUENCE_LENGTH_D, SEQUENCE_LEN = SEQUENCE_LENGTH)

len of test set:  7608


In [8]:
# Clear the default graph before the model is rebuilt below, so that re-running
# the graph-building cell starts from an empty graph rather than adding a second
# copy of every op and variable to the existing one.
tf.reset_default_graph()

In [9]:
# Placeholders.
# NOTE(review): statement order matters in this cell -- the tf.Variable objects
# are unnamed, so their graph names depend on creation order, and the checkpoint
# restored later is presumably matched by those names. Do not reorder.
num_classes = 5
# Integer token ids; each row is SEQUENCE_LENGTH wide (presumably one row per sentence).
batch_ph = tf.placeholder(tf.int32, [None, SEQUENCE_LENGTH])
# NOTE(review): ind_list_ph is not referenced anywhere else in this notebook.
ind_list_ph = tf.placeholder(tf.int32, [None])
# Per-example label vector with num_classes entries; only the loss (and y_true in
# the ordinal branch) reads it, and it is not fed during the inference run below.
target_ph = tf.placeholder(tf.float32, [None,num_classes])
# Lengths passed to the 'sentence' encoder (one entry per row of batch_ph).
seq_len_ph = tf.placeholder(tf.int32, [None])
# Lengths passed to the 'document' encoder (one entry per document).
seq_len_ph_d = tf.placeholder(tf.int32, [None])
# Dropout keep probability; fed as 1.0 during the inference run below.
keep_prob_ph = tf.placeholder(tf.float32)


# Embedding layer: one EMBEDDING_DIM-wide row per dictionary entry. The random
# initial values are presumably replaced when the checkpoint is restored.
embeddings_var = tf.Variable(tf.random_uniform([vocabulary_size, EMBEDDING_DIM], -1.0, 1.0), trainable=True)
batch_embedded = tf.nn.embedding_lookup(embeddings_var, batch_ph)

# (Bi-)RNN layer(-s)
# Sentence level: bidirectional RNN over the embedded rows, followed by attention;
# `alphas` holds the attention weights that are fetched for visualisation later.
with tf.variable_scope('sentence'):
    rnn_outputs, _ = bi_rnn(Cell(HIDDEN_SIZE), Cell(HIDDEN_SIZE), inputs=batch_embedded, sequence_length=seq_len_ph, dtype=tf.float32)
    attention_output, alphas = attention(rnn_outputs, ATTENTION_SIZE, seq_len_ph, return_alphas=True)
    # Regroup the attention outputs into [document, sentence, feature].
    # NOTE(review): the first dimension is hard-coded to BATCH_SIZE, so every fed
    # batch must supply exactly enough rows for BATCH_SIZE documents of
    # SEQUENCE_LENGTH_D sentences; a smaller final batch cannot be run.
    attention_output = tf.reshape(attention_output,[BATCH_SIZE, SEQUENCE_LENGTH_D, HIDDEN_SIZE*2])
    
# Document level: same structure applied to the regrouped sentence vectors;
# `alphas_d` holds the document-level attention weights fetched later.
with tf.variable_scope('document'):
    rnn_outputs_d, _ = bi_rnn(Cell(HIDDEN_SIZE_D), Cell(HIDDEN_SIZE_D), inputs=attention_output, sequence_length=seq_len_ph_d, dtype=tf.float32)
    attention_output_d, alphas_d = attention(rnn_outputs_d, ATTENTION_SIZE_D, seq_len_ph_d, return_alphas=True)

# Dropout on the document representation (keep probability comes from keep_prob_ph).
drop = tf.nn.dropout(attention_output_d, keep_prob_ph)

# Output head: ordinal regression when `ordinal` is True, otherwise a softmax
# classifier. Both branches define W, b, y_hat_, y_preds, loss, c and str_score.
# NOTE(review): c and str_score are assigned but not used in this notebook.
if ordinal:
    # For ordinal regression, same weights for each class
    # W is a single weight vector over the features of `drop`.
    W = tf.Variable(tf.truncated_normal([drop.get_shape()[1].value], stddev=0.1))
    # Repeat W (num_classes - 1) times and transpose, giving a
    # [features, num_classes - 1] matrix whose columns are all equal to W.
    W_ = tf.transpose(tf.reshape(tf.tile(W,[num_classes - 1]),[num_classes - 1, drop.get_shape()[1].value]))
    # One bias per column, initialised to 0, 1, ..., num_classes - 2.
    b = tf.Variable(tf.cast(tf.range(num_classes - 1), dtype = tf.float32))
    # Note the sign: the shared weights are negated before the affine map.
    y_hat_ = tf.nn.xw_plus_b(drop, tf.negative(W_), b)

    # Predicted labels and logits
    # preds is an imported helper; it is given BATCH_SIZE explicitly.
    y_preds, logits = preds(y_hat_,BATCH_SIZE)
    y_true = tf.argmax(target_ph, axis = 1)

    # Ordinal loss
    # ordloss_m is an imported helper; it is also given BATCH_SIZE explicitly.
    loss = ordloss_m(y_hat_, target_ph, BATCH_SIZE)
    c = stats.spearmanr
    str_score = "Spearman rank:"

else:
    # Dense layer mapping the features of `drop` to num_classes logits.
    W = tf.Variable(tf.truncated_normal([drop.get_shape()[1].value, num_classes], stddev=0.1))  
    b = tf.Variable(tf.constant(0., shape=[num_classes]))
    y_hat_ = tf.nn.xw_plus_b(drop, W, b)
    #y_hat_ = tf.squeeze(y_hat)
    # Cross-entropy loss and optimizer initialization
    # The predicted class is the index of the largest logit.
    y_preds = tf.argmax(y_hat_, axis = 1)
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=y_hat_, labels=target_ph))
    c = accuracy_score
    # NOTE(review): the label below is misspelled ("Accucary:"); it is a runtime
    # string, so it is left untouched here.
    str_score = "Accucary:"

# NOTE(review): the optimizer op is built but never run in this notebook (only
# y_preds, alphas and alphas_d are fetched). It is created before the Saver, so
# any variables it adds are part of what the Saver tries to restore --
# presumably kept to mirror the training graph; confirm before removing.
optimizer = tf.train.AdamOptimizer(learning_rate=1e-3).minimize(loss)
# Batch generators
# Generator over the encoded test set; each next() call yields one batch.

test_batch_generator_ = test_batch_generator(X_test_, BATCH_SIZE, seq_len = SEQUENCE_LENGTH_D)

# Created after the graph is complete; used below to restore MODEL_PATH.
saver = tf.train.Saver()

# Restore the trained model and run it over the test set, collecting for every
# batch the predictions together with the sentence-scope attention weights
# (alphas) and the document-scope attention weights (alphas_d).


def _sentence_lengths(rows):
    """Return, for each row, (index of its first 0) + 1.

    A row that contains no 0 at all (no padding) gets the full row width
    instead of raising, which is what the "+ 1" rule would give if the first
    0 sat in the last column.
    """
    rows = np.asarray(rows)
    is_pad = rows == 0
    # argmax gives the first True; rows without padding fall back to width - 1.
    first_pad = np.where(is_pad.any(axis=1), is_pad.argmax(axis=1), rows.shape[1] - 1)
    return first_pad + 1


def _document_lengths(rows, doc_len):
    """Return, per group of `doc_len` consecutive rows, how many rows precede
    the first row that starts with 0; `doc_len` if no row in the group does.
    """
    rows = np.asarray(rows)
    starts_with_pad = (rows[:, 0] == 0).reshape(-1, doc_len)
    return np.where(starts_with_pad.any(axis=1), starts_with_pad.argmax(axis=1), doc_len)


# Only whole batches are evaluated: the graph reshapes to a fixed BATCH_SIZE,
# so trailing rows that do not fill a complete batch are left out.
num_batches = X_test_.shape[0] // (BATCH_SIZE*SEQUENCE_LENGTH_D)
a = []       # one `alphas` array per batch
a_d = []     # one `alphas_d` array per batch
preds_ = []

with tf.Session() as sess:
    saver.restore(sess, MODEL_PATH)

    for bx in range(num_batches):
        x_batch = next(test_batch_generator_)
        seq_len = _sentence_lengths(x_batch)  # actual lengths of sequences
        seq_len_d = _document_lengths(x_batch, SEQUENCE_LENGTH_D)

        # keep_prob 1.0 disables dropout for inference.
        pred, alph, alph_d = sess.run([y_preds, alphas, alphas_d],
                                      feed_dict={batch_ph: x_batch,
                                                 seq_len_ph: seq_len,
                                                 seq_len_ph_d: seq_len_d,
                                                 keep_prob_ph: 1.0})
        a.append(alph)
        a_d.append(alph_d)
        preds_.extend(pred)

preds_ = np.array(preds_).flatten()
 

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See tf.nn.softmax_cross_entropy_with_logits_v2.

INFO:tensorflow:Restoring parameters from model/model65


In [10]:
# Build colour-highlighted HTML for the word-level and sentence-level attention
# of every document that was actually scored.
from IPython.display import display, HTML

try:
    from html import escape as _escape_html
except ImportError:  # Python 2 has no html.escape
    from cgi import escape as _escape_html

# The background opacity (%f) carries the normalised attention weight.
WORD_SPAN = '<font style="background: rgba(153, 204, 255, %f)">%s</font>'
SENT_SPAN = '<font style="background: rgba(255, 178, 102, %f)">%s</font>'

# Attention is only available for the batches that were run, so render exactly
# that many documents instead of indexing past the end of `a` / `a_d` or
# trimming by a hard-coded amount.
n_scored_docs = len(a_d) * BATCH_SIZE

HTML_str_all_sent = []   # per document: words shaded by word-level attention
HTML_str_all_doc = []    # per document: sentences shaded by sentence-level attention
for sc, doc in enumerate(df_test['text'].iloc[:n_scored_docs]):
    sentences = sent_tokenize(doc)[:SEQUENCE_LENGTH_D]
    batch_idx, doc_in_batch = divmod(sc, BATCH_SIZE)

    # Word-level attention: a[batch] holds SEQUENCE_LENGTH_D consecutive rows
    # per document, one row per sentence slot.
    first_row = doc_in_batch * SEQUENCE_LENGTH_D
    word_pieces = []
    for bc, sentence in enumerate(sentences):
        att = a[batch_idx][first_row + bc]
        att = att/att.max()
        # Literal 'START' tokens are not rendered and do not consume a weight.
        # NOTE(review): if read_test_set feeds a START token to the model,
        # att[0] belongs to it and the words should be read from att[1:] --
        # confirm against dataUtilstest.
        words = [w for w in str(sentence).split() if w != 'START']
        for count, w in enumerate(words[:SEQUENCE_LENGTH]):
            # Escape the token so characters such as '<' or '&' in the review
            # text cannot break the generated markup.
            word_pieces.append(WORD_SPAN % (att[count], _escape_html(w) + ' '))
    HTML_str_all_sent.append(''.join(word_pieces))

    # Sentence-level attention: one weight per sentence of this document.
    att_d = a_d[batch_idx][doc_in_batch][:len(sentences)]
    if len(sentences):
        # Guarded: .max() of an empty array raises for documents with no sentences.
        att_d = att_d/att_d.max()
    HTML_str_all_doc.append(''.join(
        SENT_SPAN % (weight, _escape_html(sentence) + ' ')
        for weight, sentence in zip(att_d, sentences)))
    

In [11]:
# Show both attention views (word-level first, then sentence-level) for the
# document at position i.
i = 0
for rendered in (HTML_str_all_sent[i], HTML_str_all_doc[i]):
    display(HTML(rendered))
