# Running trained BCA model for predicting text complexity

This notebook loads the pre-trained BCA model and runs it on a test set; there is also the option to visualize attention at the word and sentence level.
The model has been trained to map grades K-12 to 12 levels:

K-1 -> 0

2-12 -> 1-11

The pretrained model can be obtained from https://1drv.ms/f/s!Ag4UUgKkf0ZPu55vBR6_-6WOuAO-Ug. A test set is also available at https://sites.google.com/site/nadeemf0755/research/linguistic-complexity. 

In [2]:
from __future__ import print_function, division

import os
import os.path
import pandas as pd
from io import StringIO
import io
import unicodedata
import re

import tensorflow as tf
import numpy as np
np.set_printoptions(threshold = 10000)
import collections
import random

from tensorflow.contrib.rnn import LSTMCell as Cell #for GRU: custom implementation with normalization
from tensorflow.python.ops.rnn import dynamic_rnn as rnn
from tensorflow.python.ops.rnn import bidirectional_dynamic_rnn as bi_rnn
from tensorflow.contrib.rnn import DropoutWrapper

from attention import attention as attention
from bca import *
from ordloss import *
from utils import *
#from dataUtils_snli import *
from dataUtils_gr import *


from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from scipy import stats
from sklearn.metrics import accuracy_score

In [3]:
# Load the dictionary stored in the pre-trained model folder.
# Each CSV row is a (key, value) pair; values are kept as the strings read
# from the file, exactly as before.
import csv

dictionary = {}
# A context manager guarantees the file handle is closed once the rows have
# been consumed (the bare open() call previously left it open).
with open('BCA_grades/dict_bca_gr250.csv') as dict_file:
    for key, val in csv.reader(dict_file):
        dictionary[key] = val

In [5]:
# Test data set: a CSV file whose text column is labelled 'text'.
df_test = pd.read_csv(filepath_or_buffer='test.csv')

In [7]:
# Basic text clean-up with clean_ (per the original note: lower casing and
# British -> US spelling conversion), applied to every document.
#
# The column is iterated directly instead of indexing each row with
# df_test.iloc[i]['text'], which builds a temporary row object per document.
#
# NOTE: this overwrites df_test['text'] in place, so re-running the cell
# applies clean_ to text that has already been cleaned.
text = [clean_(raw_text) for raw_text in df_test['text']]
df_test['text'] = text
df_test.head()

Unnamed: 0.1,Unnamed: 0,label,text
0,0,2.0,who are you ? the lion yelled at my father .
1,1,2.0,my name is elmer elevator .
2,2,2.0,where do you think you are going ?
3,3,2.0,"i 'm going home , said my father ."
4,4,2.0,that 's what you think ! said the lion . ordin...


In [8]:
# Fixed shape constants shared by the data pipeline and the model graph.
BATCH_SIZE = 20          # documents per batch
SEQUENCE_LENGTH_D = 25   # sentence slots per document
SEQUENCE_LENGTH = 40     # token slots per sentence

In [9]:
# Encode the cleaned test documents with the model dictionary; the helper
# also prints the set size and in-/out-of-vocabulary counts.
X_test_ = read_test_set(
    df_test,
    dictionary,
    SEQUENCE_LEN_D=SEQUENCE_LENGTH_D,
    SEQUENCE_LEN=SEQUENCE_LENGTH,
)

len of test set:  3830
IV in test set:  87624
OOV in test set:  2057


In [10]:
# Embedding table dimensions: EMBEDDING_DIM columns, one row per dictionary entry.
EMBEDDING_DIM = 300
vocabulary_size = len(dictionary)

In [11]:
# Clear TensorFlow's default graph so the model cell below builds its
# variables in an empty graph, even when this notebook is re-run in the same
# kernel.
tf.reset_default_graph()

In [12]:
"""
Reload the trained model; get reading level predictions for test set 
"""

# Rebuild the graph of the trained model so that its checkpoint can be
# restored by the Saver created at the end of this cell.
#
# NOTE(review): most variables below are created without explicit names, so
# their checkpoint names presumably depend on creation order -- avoid
# reordering statements in this cell without confirming against the checkpoint.

# NUM_WORDS, INDEX_FROM, KEEP_PROB, NUM_EPOCHS and DELTA are not referenced by
# any other code shown in this notebook (presumably training leftovers).
NUM_WORDS = vocabulary_size
INDEX_FROM = 3
#EMBEDDING_DIM = embedding_dim
# system parameters
HIDDEN_SIZE = 150       # units per direction, sentence-level Bi-LSTM
HIDDEN_SIZE_D = 100     # units per direction, document-level Bi-LSTM
ATTENTION_SIZE = 75     # size argument of the word-level attention
ATTENTION_SIZE_D = 50   # size argument of the sentence-level attention
#PROJECTION_SIZE = 100
KEEP_PROB = 0.5
NUM_EPOCHS = 5  # max val_acc at __
DELTA = 0.75
ordinal = True          # selects the ordinal-regression head below


#Different placeholders
num_classes = 12
# Integer token ids, SEQUENCE_LENGTH per row (presumably one sentence per row).
batch_ph = tf.placeholder(tf.int32, [None, SEQUENCE_LENGTH])
# ind_list_ph and doc_size_ph are declared but not used elsewhere in this cell.
ind_list_ph = tf.placeholder(tf.int32, [None])
# One-hot targets; only consumed by y_true / loss, not fed by the inference cell.
target_ph = tf.placeholder(tf.float32, [None,num_classes])
# Lengths passed to the sentence-level RNN and word-level attention.
seq_len_ph = tf.placeholder(tf.int32, [None])
# Lengths passed to the document-level RNN and sentence-level attention.
seq_len_ph_d = tf.placeholder(tf.int32, [None])
# Dropout keep probability; the inference cell feeds 1.0 (dropout disabled).
keep_prob_ph = tf.placeholder(tf.float32)
doc_size_ph = tf.placeholder(tf.int32,[None])
# Embedding layer
# The random initial values are replaced when the checkpoint is restored.
embeddings_var = tf.Variable(tf.random_uniform([vocabulary_size, EMBEDDING_DIM], -1.0, 1.0), trainable=True)
batch_embedded = tf.nn.embedding_lookup(embeddings_var, batch_ph)
batch_embedded = tf.nn.dropout(batch_embedded, keep_prob_ph)

# Weight matrix handed to cross_attention below.
W_omega = tf.Variable(tf.random_uniform([HIDDEN_SIZE*2, HIDDEN_SIZE*2], -1.0, 1.0))
# (Bi-)RNN layer(-s)
with tf.variable_scope('sentence'):
    fw_cell = Cell(HIDDEN_SIZE)
    bw_cell = Cell(HIDDEN_SIZE)
    
    # Variational dropout on inputs, outputs and state of both directions.
    fw_cell = DropoutWrapper(fw_cell, input_keep_prob=keep_prob_ph, 
                             output_keep_prob=keep_prob_ph,state_keep_prob=keep_prob_ph,
                             variational_recurrent=True, input_size=batch_embedded.get_shape()[-1], 
                             dtype = tf.float32)
    bw_cell = DropoutWrapper(bw_cell, input_keep_prob=keep_prob_ph, 
                             output_keep_prob=keep_prob_ph,state_keep_prob= keep_prob_ph,
                             variational_recurrent=True, input_size=batch_embedded.get_shape()[-1], 
                             dtype = tf.float32)
    rnn_output, _ = bi_rnn(fw_cell, bw_cell, inputs=batch_embedded, sequence_length=seq_len_ph, dtype=tf.float32)

    # cross_attention is an imported project helper (presumably from bca).
    rnn_outputs_ = cross_attention(rnn_output, SEQUENCE_LENGTH_D, seq_len_ph, BATCH_SIZE, W_omega)
    # alphas_ are the word-level attention weights fetched by the inference cell.
    attention_output_, alphas_ = attention(rnn_outputs_ , ATTENTION_SIZE, seq_len_ph, return_alphas = True)
    # Regroup the per-row vectors by document. BATCH_SIZE is hard-coded in the
    # shape, so every fed batch must contain exactly BATCH_SIZE documents.
    # NOTE(review): the factor 3 presumably reflects what cross_attention
    # concatenates -- TODO confirm.
    attention_output_ = tf.reshape(attention_output_,[BATCH_SIZE, -1, HIDDEN_SIZE*2*3])
    
with tf.variable_scope('document'):
    fw_cell_d = Cell(HIDDEN_SIZE_D)
    bw_cell_d = Cell(HIDDEN_SIZE_D)
    
    # Same variational dropout set-up as at the sentence level.
    fw_cell_d = DropoutWrapper(fw_cell_d, input_keep_prob=keep_prob_ph, 
                             output_keep_prob=keep_prob_ph,state_keep_prob=keep_prob_ph,
                             variational_recurrent=True, input_size=attention_output_.get_shape()[-1], 
                             dtype = tf.float32)
    bw_cell_d = DropoutWrapper(bw_cell_d, input_keep_prob=keep_prob_ph, 
                             output_keep_prob=keep_prob_ph,state_keep_prob= keep_prob_ph,
                             variational_recurrent=True, input_size=attention_output_.get_shape()[-1], 
                             dtype = tf.float32)
    rnn_outputs_d, _ = bi_rnn(fw_cell_d, bw_cell_d, inputs=attention_output_, 
                              sequence_length=seq_len_ph_d, dtype=tf.float32)
    
    #rnn_outputs_d, _ = bi_rnn(Cell(HIDDEN_SIZE_D), Cell(HIDDEN_SIZE_D), inputs=attention_output, sequence_length=seq_len_ph_d, dtype=tf.float32)
    # alphas_d are the sentence-level attention weights fetched by the inference cell.
    attention_output_d, alphas_d = attention(rnn_outputs_d, ATTENTION_SIZE_D, seq_len_ph_d, return_alphas=True)

# Dropout
drop = tf.nn.dropout(attention_output_d, keep_prob_ph)


if ordinal:
    # For ordinal regression, same weights for each class
    # W is one weight vector; W_ tiles it into num_classes - 1 identical
    # columns, so the outputs differ only through the per-threshold bias b
    # (initialised to 0, 1, ..., num_classes - 2).
    W = tf.Variable(tf.truncated_normal([drop.get_shape()[1].value], stddev=0.1))
    W_ = tf.transpose(tf.reshape(tf.tile(W,[num_classes - 1]),[num_classes - 1, drop.get_shape()[1].value]))
    b = tf.Variable(tf.cast(tf.range(num_classes - 1), dtype = tf.float32))
    y_hat_ = tf.nn.xw_plus_b(drop, tf.negative(W_), b)

    # Predicted labels and logits
    # preds is an imported project helper (presumably from ordloss).
    y_preds, logits = preds(y_hat_,BATCH_SIZE)
    y_true = tf.argmax(target_ph, axis = 1)

    # Ordinal loss
    loss = ordloss_m(y_hat_, target_ph, BATCH_SIZE)
    # c / str_score select the evaluation metric; neither is used by the code
    # shown in the rest of this notebook.
    c = stats.spearmanr
    str_score = "Spearman rank:"

else:
    # Plain softmax classifier head. This branch does not define `logits`,
    # which the inference cell fetches.
    W = tf.Variable(tf.truncated_normal([drop.get_shape()[1].value, num_classes], stddev=0.1))  
    b = tf.Variable(tf.truncated_normal([num_classes]))
    y_hat_ = tf.nn.xw_plus_b(drop, W, b)
    # Cross-entropy loss (no optimizer is created in this cell)
    y_preds = tf.argmax(y_hat_, axis = 1)
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=y_hat_, labels=target_ph))
    c = accuracy_score
    str_score = "accucary:"
    


# Saver used by the inference cell to restore the checkpoint into this graph.
saver = tf.train.Saver()

In [13]:
# Restore the trained checkpoint and run the model over the test set,
# collecting predictions, logits and both levels of attention weights.
MODEL_PATH = "BCA_grades/model250-21720"

test_batch_generator_ = test_batch_generator(X_test_, BATCH_SIZE, seq_len = SEQUENCE_LENGTH_D)


def _sentence_lengths(x_batch):
    """Return the length fed to the sentence-level RNN for every row.

    The length is the position of the first 0 in the row plus one
    (0 is presumably the padding id -- TODO confirm). A row containing no 0
    used to raise ValueError; it is now treated as completely filled, and
    the result is clamped to the row length.
    """
    lengths = []
    for row in x_batch:
        tokens = list(row)
        first_zero = tokens.index(0) if 0 in tokens else len(tokens)
        lengths.append(min(first_zero + 1, len(tokens)))
    return np.array(lengths)


def _document_lengths(x_batch, sents_per_doc):
    """Return, per document, the number of rows before the first empty row.

    x_batch is read in consecutive groups of `sents_per_doc` rows, one group
    per document. A row whose first entry is 0 is considered empty; a group
    without an empty row gets the number of rows it contains.
    """
    lengths = []
    for start in range(0, len(x_batch), sents_per_doc):
        doc_rows = x_batch[start:start + sents_per_doc]
        n_sents = len(doc_rows)
        for offset, row in enumerate(doc_rows):
            if row[0] == 0:
                n_sents = offset
                break
        lengths.append(n_sents)
    return np.array(lengths)


a = []        # word-level attention weights, one array per batch
a_d = []      # sentence-level attention weights, one array per batch
preds_ = []   # predicted class per document
log_ = []     # logits, one array per batch

# NOTE(review): the outputs recorded with this notebook report a test set of
# 3830 items but only 1560 predictions; confirm that this batch count really
# covers every document in X_test_.
num_batches = X_test_.shape[0] // (BATCH_SIZE*SEQUENCE_LENGTH_D)

with tf.Session() as sess:
    saver.restore(sess, MODEL_PATH)

    # One extra iteration for the trailing partial batch; this relies on the
    # generator still yielding a full-size batch (TODO confirm), because the
    # graph reshapes with a hard-coded BATCH_SIZE.
    for _ in range(num_batches + 1):
        x_batch = next(test_batch_generator_)
        seq_len = _sentence_lengths(x_batch)
        seq_len_d = _document_lengths(x_batch, SEQUENCE_LENGTH_D)

        # keep_prob 1.0 disables dropout at inference time.
        log, pred, alph, alph_d = sess.run([logits, y_preds, alphas_, alphas_d],
                     feed_dict={batch_ph: x_batch,
                                seq_len_ph: seq_len,
                                seq_len_ph_d: seq_len_d,
                                keep_prob_ph: 1.0})
        a.append(alph)
        a_d.append(alph_d)
        log_.append(log)
        preds_.extend(pred)

preds_ = np.array(preds_).flatten()
 

INFO:tensorflow:Restoring parameters from BCA_grades/model250-21720


In [14]:
# Number of predictions collected by the inference cell above.
len(preds_)

1560

In [None]:
# Build HTML snippets that visualise word- and sentence-level attention.
# IPython.display is the supported import location; importing `display` from
# IPython.core.display is deprecated.
from IPython.display import display, HTML

_WORD_TEMPLATE = '<font style="background: rgba(153, 204, 255, %f)">%s</font>'
_SENT_TEMPLATE = '<font style="background: rgba(255, 178, 102, %f)">%s</font>'


def _attention_html(tokens, weights, template):
    """Return one <font> span per token, shaded by its attention weight.

    tokens   -- strings to render, in order
    weights  -- indexable attention weights; weights[k] shades tokens[k]
    template -- '%'-format string taking (weight, text)

    NOTE(review): tokens are inserted into the HTML without escaping.
    """
    return ''.join(template % (weights[k], tok + ' ')
                   for k, tok in enumerate(tokens))


def _marked_sentences(doc_text):
    """Split a document into sentences framed by START_SENT / END_SENT,
    truncated to the SEQUENCE_LENGTH_D sentence slots of the model."""
    sentences = sent_tokenize(doc_text)
    sentences.insert(0, 'START_SENT')
    sentences.append('END_SENT')
    return sentences[:SEQUENCE_LENGTH_D]


# Word-level attention: one HTML string per document.
# Document number d lives in batch d // BATCH_SIZE; its sentence rows start at
# (d % BATCH_SIZE) * SEQUENCE_LENGTH_D within that batch's attention array.
HTML_str_all_sent = []
for doc_idx, doc_text in enumerate(df_test['text']):
    batch_alphas = a[doc_idx // BATCH_SIZE]
    first_row = (doc_idx % BATCH_SIZE) * SEQUENCE_LENGTH_D

    doc_html = []
    for sent_idx, sentence in enumerate(_marked_sentences(doc_text)):
        # Real sentences are wrapped in START/END markers (presumably
        # mirroring the encoding done by read_test_set -- TODO confirm).
        if sentence != 'START_SENT' and sentence != 'END_SENT':
            sentence = 'START ' + str(sentence) + ' END'
        att = batch_alphas[first_row + sent_idx]
        att = att / att.max()   # rescale so the strongest word is fully opaque
        words = sentence.split()[:SEQUENCE_LENGTH]
        doc_html.append(_attention_html(words, att, _WORD_TEMPLATE))
    HTML_str_all_sent.append(''.join(doc_html))


# Sentence-level attention: one HTML string per document.
# NOTE(review): the last 10 documents are skipped here, unlike in the
# word-level loop above; the reason is not evident from this notebook.
HTML_str_all_doc = []
for doc_idx, doc_text in enumerate(df_test['text'][:-10]):
    sentences = _marked_sentences(doc_text)
    att = a_d[doc_idx // BATCH_SIZE][doc_idx % BATCH_SIZE][:len(sentences)]
    att = att / att.max()       # rescale so the strongest sentence is fully opaque
    HTML_str_all_doc.append(_attention_html(sentences, att, _SENT_TEMPLATE))
    
 

In [None]:
# Render the word-level and then the sentence-level attention heat map of one
# document (index i into the lists built in the previous cell).
i = 1000
display(HTML(HTML_str_all_sent[i]), HTML(HTML_str_all_doc[i]))


In [None]:
# Rank correlation between predicted levels and the labelled grades.
# Predictions are cut to the number of test documents, which drops any extra
# entries produced by the final batch.
#
# NOTE: assigning `preds` shadows the `preds` function called in the
# model-building cell, so that cell fails if re-run after this one without
# re-running the imports. The name is kept because the plotting cell reads it.
n_docs = len(df_test)
preds = preds_[:n_docs]
true = np.array(df_test['label'])
stats.spearmanr(preds, true)

In [None]:
import matplotlib.pyplot as plt; plt.rcdefaults()
%matplotlib inline
import matplotlib.pyplot as plt
 
objects = [i for i in range(12)]
y_pos = np.arange(len(objects))
performance = [sum([p==i for p in preds]) for i in y_pos]
t = [sum([p==i+1 for p in true]) for i in y_pos]


plt.bar(y_pos-0.15, performance, width = 0.3, align='center', alpha=0.8, label = 'Predicted')
plt.bar(y_pos+0.15, t, width = 0.3,align='center', alpha=0.8, label = 'True')

plt.xticks(y_pos, [o+1 for o in objects])
plt.ylabel('Number of items')
plt.xlabel('Grades')
plt.title('Reading level for CCS paragraphs')
plt.legend()
plt.show()