# Running trained HAN model for predicting text complexity

This notebook loads the pre-trained hierarchical attention network (HAN) described in *Hierarchical Attention Networks for Document Classification* (http://www.cs.cmu.edu/~./hovy/papers/16HLT-hierarchical-attention-networks.pdf) and runs it on a test set; there is also the option to visualize attention at the word and sentence level.
The model has been trained to map grades K-12 to 6 levels:

K-1 -> 0

2-3 -> 1

4-5 -> 2

6-8 -> 3

9-10 -> 4

11-12 -> 5.

The pretrained model can be obtained from https://www.dropbox.com/sh/ik2pnukue00g9ay/AADl5gmsqsC-si8_9w72-vXDa?dl=0. A test set is also available at https://sites.google.com/site/nadeemf0755/research/linguistic-complexity. 

In [1]:
from __future__ import print_function, division

import os
import os.path
import pandas as pd
from io import StringIO
import io
import unicodedata
import re

import tensorflow as tf
import numpy as np
np.set_printoptions(threshold = 10000)
import collections
import random

#from tensorflow.contrib.rnn import LSTMCell as Cell
from gru import GRUCell as Cell #custom implementation with normalization
from tensorflow.python.ops.rnn import dynamic_rnn as rnn
from tensorflow.python.ops.rnn import bidirectional_dynamic_rnn as bi_rnn

from attention import attention
#from bca import *
from ordloss import *
from utils import *
#from dataUtils import *
from dataUtilsbca import *


from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from scipy import stats
from sklearn.metrics import accuracy_score
 

In [7]:
# Load the dictionary shipped in the pre-trained model folder.
# Each CSV row must hold exactly two fields (key, value); both are kept as the
# strings csv.reader yields.
# NOTE(review): presumably this maps tokens to vocabulary indices -- confirm
# against read_test_set, which consumes it.
import csv

dictionary = {}
# The context manager closes the file handle as soon as parsing finishes
# (the original left it open until garbage collection).
with open('trainedhanmodel/han_dict_trained.csv') as dict_file:
    for key, val in csv.reader(dict_file):
        dictionary[key] = val

In [9]:
# Read the evaluation set from CSV; the documents to score are expected in a
# column named 'text'.
df_test = pd.read_csv(filepath_or_buffer='test.csv')

In [10]:
# Run every document through clean_ (basic text clean-up, lower-casing and
# British -> US spelling conversion) and write the result back to 'text'.
# Iterating the column directly yields the same values as the former
# df_test.iloc[i]['text'] index loop without materialising a full row Series
# for each document.
text = [clean_(t) for t in df_test['text']]
df_test['text'] = text
df_test.head()

Unnamed: 0,grade,text
0,6,what is movement ?
1,4,what is economics ?
2,4,what is produced ?
3,4,3 . what is evaporation ?
4,4,13 . what is condensation ?


In [11]:
# Number of documents fed to the model per batch.
BATCH_SIZE = 10
# Number of sentence slots per document.
SEQUENCE_LENGTH_D = 40
# Number of word slots per sentence.
SEQUENCE_LENGTH = 65

In [12]:
# Encode the cleaned test documents with the trained dictionary.
# NOTE(review): read_test_set is an imported helper; presumably it returns a
# zero-padded index array with SEQUENCE_LENGTH_D rows per document -- confirm.
X_test_ = read_test_set(
    df_test,
    dictionary,
    SEQUENCE_LEN_D=SEQUENCE_LENGTH_D,
    SEQUENCE_LEN=SEQUENCE_LENGTH
)

len of test set:  5444


In [15]:
# Width of each embedding vector (second dimension of the embedding matrix
# built in the model cell).
EMBEDDING_DIM = 300
# One embedding row per entry of the loaded dictionary.
vocabulary_size = len(dictionary)

In [16]:
# Clear TensorFlow's default graph so the model is built into an empty graph.
# NOTE(review): the model-building cell below calls this again as its first
# statement, so this standalone cell is redundant.
tf.reset_default_graph()

In [17]:
"""
Reload the trained model; get reading level predictions for test set
"""
# Build into an empty graph so that re-running this cell does not create a
# second copy of every variable under new auto-generated names.
tf.reset_default_graph()

# Hyper-parameters of the trained network. Several of these (NUM_WORDS,
# INDEX_FROM, DELTA, hl2) are not read anywhere in this cell; they are kept so
# the notebook namespace is unchanged.
NUM_WORDS = vocabulary_size
INDEX_FROM = 3
HIDDEN_SIZE = 50
ATTENTION_SIZE = 10
HIDDEN_SIZE_D = 20
ATTENTION_SIZE_D = 5
BATCH_SIZE = 10
DELTA = 0.75
hl = 0.0   # weight of the Huber term in the combined loss (0.0 -> ordinal loss only)
hl2 = 0.0

# Placeholders. batch_ph holds one sentence per row; the reshape in the
# 'sentence' scope below regroups rows into
# [BATCH_SIZE, SEQUENCE_LENGTH_D, ...], so every fed batch must contain exactly
# BATCH_SIZE * SEQUENCE_LENGTH_D rows.
num_classes = 6
batch_ph = tf.placeholder(tf.int32, [None, SEQUENCE_LENGTH])
ind_list_ph = tf.placeholder(tf.int32, [None])
target_ph = tf.placeholder(tf.float32, [None, num_classes])
seq_len_ph = tf.placeholder(tf.int32, [None])
seq_len_ph_d = tf.placeholder(tf.int32, [None])
keep_prob_ph = tf.placeholder(tf.float32)

# NOTE: the order in which variables are created below is left exactly as in
# the original cell. The unnamed tf.Variable objects get auto-generated names,
# and the Saver matches checkpoint entries by name, so reordering or removing
# any of them (including the otherwise unused b2 and the optimizer) could
# break saver.restore.

# Embedding layer
embeddings_var = tf.Variable(tf.random_uniform([vocabulary_size, EMBEDDING_DIM], -1.0, 1.0), trainable=True)
batch_embedded = tf.nn.embedding_lookup(embeddings_var, batch_ph)

# Word-level bidirectional RNN + attention: one vector per sentence.
with tf.variable_scope('sentence'):
    rnn_outputs, _ = bi_rnn(Cell(HIDDEN_SIZE), Cell(HIDDEN_SIZE), inputs=batch_embedded, sequence_length=seq_len_ph, dtype=tf.float32)
    attention_output, alphas = attention(rnn_outputs, ATTENTION_SIZE, seq_len_ph, return_alphas=True)
    attention_output = tf.reshape(attention_output, [BATCH_SIZE, SEQUENCE_LENGTH_D, HIDDEN_SIZE*2])

# Sentence-level bidirectional RNN + attention: one vector per document.
with tf.variable_scope('document'):
    rnn_outputs_d, _ = bi_rnn(Cell(HIDDEN_SIZE_D), Cell(HIDDEN_SIZE_D), inputs=attention_output, sequence_length=seq_len_ph_d, dtype=tf.float32)
    attention_output_d, alphas_d = attention(rnn_outputs_d, ATTENTION_SIZE_D, seq_len_ph_d, return_alphas=True)

drop = tf.nn.dropout(attention_output_d, keep_prob_ph)

# For ordinal regression, same weights for each class: a single weight vector
# W is tiled into num_classes - 1 identical columns, with one bias per
# threshold.
W = tf.Variable(tf.truncated_normal([drop.get_shape()[1].value], stddev=0.1))
W_ = tf.transpose(tf.reshape(tf.tile(W, [num_classes - 1]), [num_classes - 1, drop.get_shape()[1].value]))
b = tf.Variable(tf.cast(tf.range(num_classes - 1), dtype=tf.float32))
b2 = tf.Variable(tf.cast(tf.range(num_classes - 1), dtype=tf.float32))
y_hat_ = tf.nn.xw_plus_b(drop, tf.negative(W_), b)
y_hat_2 = tf.nn.xw_plus_b(drop, tf.negative(W_), b2)

# Predicted labels and logits
y_preds, logits = preds(y_hat_, BATCH_SIZE)
y_true = tf.argmax(target_ph, axis=1)

# Huber loss between predicted and true level
h_loss = tf.losses.huber_loss(labels=y_true, predictions=y_preds)

# Ordinal loss, blended with the Huber loss according to hl. The loss and
# optimizer are never run during inference (see the NOTE above on why they
# are still built).
ordloss = tf.add(tf.multiply((1.-hl), ordloss_m(y_hat_, target_ph, BATCH_SIZE)), tf.multiply(hl, h_loss))
optimizer = tf.train.AdamOptimizer(learning_rate=1e-3).minimize(ordloss)

MODEL_PATH = "trainedhanmodel/dualAttModelpad93"

saver = tf.train.Saver()

test_batch_generator_ = test_batch_generator(X_test_, BATCH_SIZE, seq_len=SEQUENCE_LENGTH_D)


def _sentence_lengths(x_batch, max_len):
    """Return, per row, the index of the first 0 token plus one.

    Rows that contain no 0 at all get ``max_len``. The original
    ``list(row).index(0) + 1`` produced the same value for rows with a 0 and
    raised ValueError for rows without one.
    """
    is_pad = np.asarray(x_batch) == 0
    return np.where(is_pad.any(axis=1), is_pad.argmax(axis=1) + 1, max_len)


def _document_lengths(x_batch, sentences_per_doc):
    """Return, per document, the position of its first row that starts with 0.

    Rows are grouped ``sentences_per_doc`` at a time; a document with no such
    row gets ``sentences_per_doc``. The row count must be a multiple of
    ``sentences_per_doc``.
    """
    starts_with_pad = (np.asarray(x_batch)[:, 0] == 0).reshape(-1, sentences_per_doc)
    return np.where(starts_with_pad.any(axis=1), starts_with_pad.argmax(axis=1), sentences_per_doc)


# Run the restored model over every full batch of the test set, collecting
# predictions, logits and both levels of attention weights.
with tf.Session() as sess:
    saver.restore(sess, MODEL_PATH)

    # Only whole batches are run (floor division): rows left over after the
    # last full batch are not scored.
    num_batches = X_test_.shape[0] // (BATCH_SIZE*SEQUENCE_LENGTH_D)
    a = []        # word-level attention, one array per batch
    a_d = []      # sentence-level attention, one array per batch
    true = []     # never filled here (no labels are fed); kept for namespace compatibility
    preds_ = []   # predicted levels, flattened after the loop
    log_ = []     # logits, one array per batch
    for bx in range(num_batches):
        x_batch = next(test_batch_generator_)
        seq_len = _sentence_lengths(x_batch, SEQUENCE_LENGTH)
        seq_len_d = _document_lengths(x_batch, SEQUENCE_LENGTH_D)

        # keep_prob 1.0 disables dropout at inference time.
        log, pred, alph, alph_d = sess.run([logits, y_preds, alphas, alphas_d],
                     feed_dict={batch_ph: x_batch,
                                seq_len_ph: seq_len,
                                seq_len_ph_d: seq_len_d,
                                keep_prob_ph: 1.0})
        a.append(alph)
        a_d.append(alph_d)
        log_.append(log)
        preds_.extend(pred)

    preds_ = np.array(preds_).flatten()
 

INFO:tensorflow:Restoring parameters from trainedhanmodel/dualAttModelpad93


In [18]:
# Build one HTML snippet per test document for word-level attention
# (HTML_str_all_sent) and one for sentence-level attention (HTML_str_all_doc).
# The background opacity of each word / sentence is its attention weight
# divided by the largest weight in the same sentence / document.
# IPython.display is the public import location for display and HTML.
from IPython.display import display, HTML

# NOTE(review): words and sentences are inserted into the markup without HTML
# escaping; text containing '<' or '&' may render incorrectly.

# --- Word-level attention -------------------------------------------------
# Attention weights exist only for documents that went through a full batch,
# so stop there instead of indexing past the end of `a`.
n_docs_word = min(len(df_test), len(a) * BATCH_SIZE)
HTML_str_all_sent = []
for sc, doc_text in enumerate(df_test['text'].iloc[:n_docs_word]):
    sentences = sent_tokenize(doc_text)
    s_len = min(len(sentences), SEQUENCE_LENGTH_D)

    batch_att = a[sc // BATCH_SIZE]
    # First row of this document inside its batch of sentence rows.
    row_offset = (sc % BATCH_SIZE) * SEQUENCE_LENGTH_D

    pieces = []
    for bc, sentence in enumerate(sentences[:s_len]):
        att = batch_att[row_offset + bc]
        att = att / att.max()
        # Tokens equal to 'START' are skipped, and at most SEQUENCE_LENGTH
        # words are rendered, as before.
        # NOTE(review): weights are matched to words starting at att[0]; if
        # the model input had a leading START token the weights would be
        # shifted by one -- confirm against read_test_set.
        words = [w for w in str(sentence).split() if w != 'START'][:SEQUENCE_LENGTH]
        for count, w in enumerate(words):
            pieces.append('<font style="background: rgba(153, 204, 255, %f)">%s</font>' % (att[count], w+' '))
    HTML_str_all_sent.append(''.join(pieces))

# --- Sentence-level attention ---------------------------------------------
# Same bound as above, replacing the former hard-coded [:-10] slice.
n_docs_sent = min(len(df_test), len(a_d) * BATCH_SIZE)
HTML_str_all_doc = []
for sc, doc_text in enumerate(df_test['text'].iloc[:n_docs_sent]):
    sentences = sent_tokenize(doc_text)
    s_len = min(len(sentences), SEQUENCE_LENGTH_D)
    if s_len == 0:
        # Nothing to render; also avoids calling max() on an empty array.
        HTML_str_all_doc.append('')
        continue

    att = a_d[sc // BATCH_SIZE][sc % BATCH_SIZE][:s_len]
    att = att / att.max()
    HTML_str_all_doc.append(''.join(
        '<font style="background: rgba(255, 178, 102, %f)">%s</font>' % (weight, sentence+' ')
        for weight, sentence in zip(att, sentences[:s_len])
    ))



In [19]:
# Index of the test document to visualise.
i = 0
# Render the word-level view first, then the sentence-level view.
for rendered in (HTML_str_all_sent[i], HTML_str_all_doc[i]):
    display(HTML(rendered))
