## Notebook for running a DM-BCA model

This notebook can be used to run a BCA model pretrained on the discourse marker prediction task, and trained on TOEFL LDC data (bca_dm_toefl_model). Please use the script TOEFL_dataParse.py to preprocess the TOEFL data. 

In [None]:
from __future__ import print_function, division

import os
import os.path
import pandas as pd
from io import StringIO
import io
import unicodedata
import re
import random

import tensorflow as tf
import numpy as np
np.set_printoptions(threshold = 10000)
import collections
import random

from tensorflow.contrib.rnn import LSTMCell as Cell #for GRU: custom implementation with normalization
from tensorflow.python.ops.rnn import dynamic_rnn as rnn
from tensorflow.python.ops.rnn import bidirectional_dynamic_rnn as bi_rnn
from tensorflow.contrib.rnn import DropoutWrapper

from attention import attention as attention
from bca_ import *
from ordloss import *
from utils import *
from datautilsbca import *


from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from scipy import stats
from sklearn.metrics import accuracy_score

In [None]:
#read data; SEQUENCE_LENGTH is maximum length of sentence in words, SEQUENCE_LENGTH_D is maximum length of document in sentences. 
# Both limits are applied further down: zero_pad_ cuts/pads every sentence row
# to SEQUENCE_LENGTH, and read_test_set is called with SEQUENCE_LENGTH_D as the
# number of rows per document.
SEQUENCE_LENGTH = 40
SEQUENCE_LENGTH_D = 25
max_vocab = 75000  # not referenced in the visible part of this notebook
train_split = 0.95  # not referenced in the visible part of this notebook
# Documents per batch. The graph reshapes with this fixed value, which is why
# the test data is padded to whole batches before evaluation.
BATCH_SIZE = 20

# system parameters
# NOTE(review): these sizes presumably have to match the values the restored
# checkpoint was trained with - confirm before changing any of them.
HIDDEN_SIZE = 150  # units per direction of the sentence-level bi-LSTM
HIDDEN_SIZE_D = 150  # units per direction of the document-level bi-LSTM
ATTENTION_SIZE = 75  # passed to attention() at sentence level
ATTENTION_SIZE_D = 50  # passed to attention() at document level
LAYER_1 = 500  # width of the first dense layer of the auxiliary classifiers
LAYER_2 = 250  # width of the second dense layer of the auxiliary classifiers
LAYER_3 = 100  # not referenced in the visible part of this notebook
KEEP_PROB = 0.7  # not referenced here; the evaluation loop feeds keep_prob 1.0
#NUM_EPOCHS = 1  # max val_acc at __
DELTA = 0.75  # not referenced in the visible part of this notebook

In [None]:
# Folder holding the preprocessed TOEFL data; test.csv is read from here.
fpath = 'data/TOEFL'

#add dict name
dict_name = 'bca_dm_toefl_model/dict.csv'
# load the dictionary from the pre-trained model folder
import csv 
# Every CSV row must contain exactly two fields (key, value); a row of any
# other length makes the tuple unpacking raise ValueError. If a key occurs
# more than once, the last row wins.
# NOTE(review): csv.reader yields strings, so the stored values are strings
# (e.g. '17'), while the padding added later is the integer 0 - confirm that
# the helper modules cast the indices before they are compared or fed.
# The file is opened in a `with` block so the handle is closed once the
# dictionary is built (the previous bare open() never closed it).
with open(dict_name) as dict_file:
    dictionary = {key: val for key, val in csv.reader(dict_file)}

In [None]:
# the test data set; the format is csv. read_test_set (defined below) reads the
# documents from the 'text1' column and the targets from the 'label' column.
df_test = pd.read_csv(os.path.join(fpath,'test.csv'))

In [None]:
def read_data(raw_text):
    """Tokenise *raw_text* on whitespace.

    Args:
        raw_text: the string to split.

    Returns:
        A 1-D numpy array with one entry per whitespace-separated token
        (an empty array when the string contains no tokens).
    """
    tokens = raw_text.split()  # default split: any run of whitespace
    return np.array(tokens).reshape(-1)

In [None]:
def read_test_set(df_val, dictionary, SEQUENCE_LEN_D = 40, SEQUENCE_LEN = 65, BATCH_SIZE = 10):
    """Encode the documents of *df_val* as index rows and one-hot encode its labels.

    Every document contributes exactly SEQUENCE_LEN_D rows to the output:
    a START_SENT row, up to SEQUENCE_LEN_D - 2 sentence rows (each wrapped in
    START ... END, with out-of-dictionary words mapped to UNK), an END_SENT
    row, and then ``[0]`` filler rows for documents with fewer sentences.

    Args:
        df_val: table with a 'text1' column (documents) and a 'label' column.
        dictionary: mapping from token to index; must contain the keys
            'START_SENT', 'END_SENT', 'START', 'END' and 'UNK'.
        SEQUENCE_LEN_D: number of rows produced per document.
        SEQUENCE_LEN: accepted for interface compatibility; not used here
            (rows are left at their natural length).
        BATCH_SIZE: only used for the informational print.

    Returns:
        (rows, y_test): the flat list of index rows for all documents, and
        the one-hot label matrix produced by fitting OneHotEncoder on the
        'label' column.

    NOTE(review): sent_tokenize is not defined in this notebook; it presumably
    arrives through one of the star imports - confirm.
    """
    max_sentences = SEQUENCE_LEN_D - 2  # two rows are taken by the document markers
    rows = []

    for document in df_val['text1']:
        sentences = sent_tokenize(document)

        rows.append([dictionary['START_SENT']])
        for sentence in sentences[:max_sentences]:
            tokens = read_data(str(sentence).lower())
            encoded = [dictionary['START']]
            encoded.extend(
                dictionary[token] if token in dictionary else dictionary['UNK']
                for token in tokens
            )
            encoded.append(dictionary['END'])
            rows.append(encoded)
        rows.append([dictionary['END_SENT']])

        # Filler rows so that short documents also occupy SEQUENCE_LEN_D rows.
        n_filler = max(SEQUENCE_LEN_D - (len(sentences) + 2), 0)
        rows.extend([0] for _ in range(n_filler))

    print('len of test set: ', len(rows)//BATCH_SIZE)

    labels = np.array(df_val['label'])
    label_column = labels.reshape(len(labels), 1)  # encoder expects a 2-D column
    y_test = OneHotEncoder(sparse=False).fit_transform(label_column)

    return rows, y_test

In [None]:
# Encode the test documents as index rows and one-hot encode their labels.
# NOTE(review): BATCH_SIZE is not forwarded, so the count printed inside
# read_test_set is computed with its default of 10, not the notebook's BATCH_SIZE.
X_test, y_test = read_test_set(df_test, dictionary, SEQUENCE_LEN_D = SEQUENCE_LENGTH_D, SEQUENCE_LEN = SEQUENCE_LENGTH)

In [None]:
# Number of entries in the loaded dictionary; used as the number of rows of the
# embedding matrix.
doc_vocab_size = len(dictionary)
NUM_WORDS = doc_vocab_size  # alias; not referenced in the visible part of this notebook
EMBEDDING_DIM = 300  # number of columns of the embedding matrix

In [None]:
# Number of real test documents, recorded before y_test is padded to a whole
# number of batches; the evaluation cell uses it to drop the padded entries.
y_test_len = len(y_test)

#use ordinal regression; logistic regression if False
ordinal = True

In [None]:
def zero_pad_(X, seq_len):
    """Cut or zero-pad every row of *X* to exactly *seq_len* entries.

    At most ``seq_len - 1`` original entries are kept, so each returned row
    ends with at least one 0.

    Args:
        X: iterable of lists of indices.
        seq_len: target row length.

    Returns:
        A numpy array built from the padded rows.
    """
    padded_rows = []
    for row in X:
        kept = row[:seq_len - 1]
        n_zeros = max(seq_len - len(row), 1)  # always at least one trailing 0
        padded_rows.append(kept + [0] * n_zeros)
    return np.array(padded_rows)

In [None]:
# Sequences preprocessing
vocabulary_size = doc_vocab_size 
# Bring every sentence row to SEQUENCE_LENGTH entries; zero_pad_ guarantees that
# each row ends with at least one 0.
X_test = zero_pad_(X_test, SEQUENCE_LENGTH)

#batch size padding 
# zero_pad_test is defined outside this notebook.
# NOTE(review): presumably it appends zero rows until the first dimension is a
# multiple of the given size (BATCH_SIZE*SEQUENCE_LENGTH_D sentence rows per
# batch for X_test, BATCH_SIZE labels per batch for y_test) - confirm in the
# helper module.
X_test = zero_pad_test(X_test, BATCH_SIZE*SEQUENCE_LENGTH_D)
y_test = zero_pad_test(y_test, BATCH_SIZE)

In [None]:
# Start from an empty default graph so that re-running the graph-building cells
# does not add a second copy of the model to an existing graph.
tf.reset_default_graph()

In [None]:
#Different placeholders
# Number of target classes is taken from the width of the one-hot test labels.
# NOTE(review): the one-hot encoder is fitted on the test labels, so this only
# equals the checkpoint's class count if every class occurs in test.csv - confirm.
num_classes = y_test.shape[1]
num_classes_s = 8  # classes of the first auxiliary classifier
num_classes_s1 = 4  # classes of the second auxiliary classifier
batch_ph = tf.placeholder(tf.int32, [None, SEQUENCE_LENGTH])  # one row of word indices per sentence
ind_list_ph = tf.placeholder(tf.int32, [None])  # not used by the visible graph code
target_ph = tf.placeholder(tf.float32, [None,num_classes])  # one-hot targets of the main task
target_ph_s = tf.placeholder(tf.float32, [None,num_classes_s])  # targets for loss_s
target_ph_s1 = tf.placeholder(tf.float32, [None,num_classes_s1])  # targets for loss_s1

seq_len_ph = tf.placeholder(tf.int32, [None])  # per-sentence lengths for the sentence-level RNN
seq_len_ph_d = tf.placeholder(tf.int32, [None])  # per-document lengths for the document-level RNN
keep_prob_ph = tf.placeholder(tf.float32)  # dropout keep probability (fed 1.0 during evaluation)
doc_size_ph = tf.placeholder(tf.int32,[None])  # fed by the evaluation loop; not used by the visible graph code

In [None]:
# Builds the inference graph: embedding -> sentence-level bi-LSTM with cross
# attention -> document-level bi-LSTM with attention, plus two auxiliary
# classifier heads.
# NOTE(review): none of the tf.Variable objects below is given an explicit
# name, so TensorFlow auto-numbers them in creation order and the Saver
# restores by those names - keep the statement order unchanged unless the
# checkpoint is regenerated (confirm against the training script).

# Embedding layer
embeddings_var = tf.Variable(tf.random_uniform([vocabulary_size, EMBEDDING_DIM], -1.0, 1.0), trainable=True)
batch_embedded = tf.nn.embedding_lookup(embeddings_var, batch_ph)
batch_embedded = tf.nn.dropout(batch_embedded, keep_prob_ph)

# Weight matrix handed to both cross_attention calls below.
W_omega = tf.Variable(tf.random_uniform([HIDDEN_SIZE*2, HIDDEN_SIZE*2], -1.0, 1.0))
# (Bi-)RNN layer(-s)
with tf.variable_scope('sentence'):
    fw_cell = Cell(HIDDEN_SIZE)
    bw_cell = Cell(HIDDEN_SIZE)
    
    # Variational dropout on inputs, outputs and state of both directions.
    fw_cell = DropoutWrapper(fw_cell, input_keep_prob=keep_prob_ph, 
                             output_keep_prob=keep_prob_ph,state_keep_prob=keep_prob_ph,
                             variational_recurrent=True, input_size=batch_embedded.get_shape()[-1], 
                             dtype = tf.float32)
    bw_cell = DropoutWrapper(bw_cell, input_keep_prob=keep_prob_ph, 
                             output_keep_prob=keep_prob_ph,state_keep_prob= keep_prob_ph,
                             variational_recurrent=True, input_size=batch_embedded.get_shape()[-1], 
                             dtype = tf.float32)
    rnn_output, _ = bi_rnn(fw_cell, bw_cell, inputs=batch_embedded, sequence_length=seq_len_ph, dtype=tf.float32)

    # cross_attention and attention are defined outside this notebook.
    # First pair (window argument 2) feeds the auxiliary classifiers below;
    # second pair (window argument SEQUENCE_LENGTH_D) feeds the document encoder.
    rnn_outputs = cross_attention(rnn_output, 2 , seq_len_ph, BATCH_SIZE, W_omega, time_major=False, return_alphas=False)
    attention_output, alphas = attention(rnn_outputs, ATTENTION_SIZE, seq_len_ph, return_alphas=True)
    rnn_outputs_ = cross_attention(rnn_output, SEQUENCE_LENGTH_D, seq_len_ph, BATCH_SIZE, W_omega)
    attention_output_, alphas_ = attention(rnn_outputs_ , ATTENTION_SIZE, seq_len_ph, return_alphas = True)
    # Regroup the per-sentence vectors into one sequence per document.
    # NOTE(review): the last dimension assumes each sentence vector has
    # HIDDEN_SIZE*2*3 features after cross_attention/attention - TODO confirm.
    attention_output_ = tf.reshape(attention_output_,[BATCH_SIZE, -1, HIDDEN_SIZE*2*3])
    
with tf.variable_scope('document'):
    fw_cell_d = Cell(HIDDEN_SIZE_D)
    bw_cell_d = Cell(HIDDEN_SIZE_D)
    
    fw_cell_d = DropoutWrapper(fw_cell_d, input_keep_prob=keep_prob_ph, 
                             output_keep_prob=keep_prob_ph,state_keep_prob=keep_prob_ph,
                             variational_recurrent=True, input_size=attention_output_.get_shape()[-1], 
                             dtype = tf.float32)
    bw_cell_d = DropoutWrapper(bw_cell_d, input_keep_prob=keep_prob_ph, 
                             output_keep_prob=keep_prob_ph,state_keep_prob= keep_prob_ph,
                             variational_recurrent=True, input_size=attention_output_.get_shape()[-1], 
                             dtype = tf.float32)
    rnn_outputs_d, _ = bi_rnn(fw_cell_d, bw_cell_d, inputs=attention_output_, 
                              sequence_length=seq_len_ph_d, dtype=tf.float32)
    
    #rnn_outputs_d, _ = bi_rnn(Cell(HIDDEN_SIZE_D), Cell(HIDDEN_SIZE_D), inputs=attention_output, sequence_length=seq_len_ph_d, dtype=tf.float32)
    attention_output_d, alphas_d = attention(rnn_outputs_d, ATTENTION_SIZE_D, seq_len_ph_d, return_alphas=True)

# Dropout
drop = tf.nn.dropout(attention_output_d, keep_prob_ph)



#first classifier for first task using the representation from attention_outputs
#adding more layers... 
# The layers are chained xw_plus_b calls with no activation in between.
# loss_s and loss_s1 are not fetched by the evaluation loop in this notebook.
attention_output_sentorder = tf.reshape(attention_output, [BATCH_SIZE, -1])
W_s1_ = tf.Variable(tf.truncated_normal([HIDDEN_SIZE*2*2*3, LAYER_1], stddev=0.1))  
b_s1_ = tf.Variable(tf.truncated_normal([LAYER_1]))
y_hat_s1_ = tf.nn.xw_plus_b(attention_output_sentorder, W_s1_, b_s1_)
W_s2 = tf.Variable(tf.truncated_normal([LAYER_1, LAYER_2], stddev=0.1))  
b_s2 = tf.Variable(tf.truncated_normal([LAYER_2]))
y_hat_s2 = tf.nn.xw_plus_b(y_hat_s1_, W_s2, b_s2)

W_s = tf.Variable(tf.truncated_normal([LAYER_2, num_classes_s], stddev=0.1))  
b_s = tf.Variable(tf.truncated_normal([num_classes_s]))
y_hat_s = tf.nn.xw_plus_b(y_hat_s2, W_s, b_s)
y_preds_s = tf.argmax(y_hat_s, axis = 1)
loss_s = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=y_hat_s, labels=target_ph_s))

#second classifier for second task using the representation from attention_outputs
W_s1__ = tf.Variable(tf.truncated_normal([HIDDEN_SIZE*2*2*3, LAYER_1], stddev=0.1))  
b_s1__ = tf.Variable(tf.truncated_normal([LAYER_1]))
y_hat_s1__ = tf.nn.xw_plus_b(attention_output_sentorder, W_s1__, b_s1__)
W_s2_ = tf.Variable(tf.truncated_normal([LAYER_1, LAYER_2], stddev=0.1))  
b_s2_ = tf.Variable(tf.truncated_normal([LAYER_2]))
y_hat_s2_ = tf.nn.xw_plus_b(y_hat_s1__, W_s2_, b_s2_)

W_s1 = tf.Variable(tf.truncated_normal([LAYER_2, num_classes_s1], stddev=0.1))  
b_s1 = tf.Variable(tf.truncated_normal([num_classes_s1]))
y_hat_s1 = tf.nn.xw_plus_b(y_hat_s2_, W_s1, b_s1)
y_preds_s1 = tf.argmax(y_hat_s1, axis = 1)
loss_s1 = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=y_hat_s1, labels=target_ph_s1))


# Main prediction head on top of the document representation `drop`.
# Both branches define y_hat_, y_preds, loss, the scoring function `c` and the
# label `str_score` that is printed with the score; only the ordinal branch
# additionally defines `logits` and `y_true`.
# The variables are created in the same order as before (W, then b) so that
# their auto-generated names are unchanged.
if ordinal:
    # For ordinal regression, same weights for each class
    # A single weight vector W is tiled into num_classes - 1 identical columns;
    # only the per-threshold biases b differ (initialised to 0, 1, 2, ...).
    W = tf.Variable(tf.truncated_normal([drop.get_shape()[1].value], stddev=0.1))
    W_ = tf.transpose(tf.reshape(tf.tile(W,[num_classes - 1]),[num_classes - 1, drop.get_shape()[1].value]))
    b = tf.Variable(tf.cast(tf.range(num_classes - 1), dtype = tf.float32))
    y_hat_ = tf.nn.xw_plus_b(drop, tf.negative(W_), b)

    # Predicted labels and logits
    # preds and ordloss_m are defined outside this notebook.
    y_preds, logits = preds(y_hat_,BATCH_SIZE)
    y_true = tf.argmax(target_ph, axis = 1)

    # Ordinal loss
    loss = ordloss_m(y_hat_, target_ph, BATCH_SIZE)
    c = stats.spearmanr
    str_score = "Spearman rank:"

else:
    W = tf.Variable(tf.truncated_normal([drop.get_shape()[1].value, num_classes], stddev=0.1))  
    b = tf.Variable(tf.truncated_normal([num_classes]))
    y_hat_ = tf.nn.xw_plus_b(drop, W, b)
    # Cross-entropy loss and optimizer initialization
    y_preds = tf.argmax(y_hat_, axis = 1)
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=y_hat_, labels=target_ph))
    c = accuracy_score
    # Label printed in front of the score (previously misspelled "accucary:").
    str_score = "accuracy:"
    
# Training ops for the three losses. None of them is run by the evaluation loop
# in this notebook.
# NOTE(review): they are still built before the Saver is created, which
# presumably keeps the set of variables (including the optimizer's own) in line
# with what the checkpoint contains - confirm before removing them.

# Calculate and clip gradients
max_gradient_norm = 5
lr = 1e-4
params = tf.trainable_variables()
gradients = tf.gradients(loss, params)
clipped_gradients, _ = tf.clip_by_global_norm(gradients, max_gradient_norm)
optimizer_ = tf.train.AdamOptimizer(learning_rate=lr)
optimizer = optimizer_.apply_gradients(
    zip(clipped_gradients, params))

#second optimizer for sentence order
# Uses loss_s (first auxiliary classifier) with the same Adam instance.
gradients_s = tf.gradients(loss_s, params)
clipped_gradients_s, _ = tf.clip_by_global_norm(gradients_s, max_gradient_norm)
optimizer_s = optimizer_.apply_gradients(
    zip(clipped_gradients_s, params))

#third optimizer, for the second auxiliary classifier (loss_s1)
gradients_s1 = tf.gradients(loss_s1, params)
clipped_gradients_s1, _ = tf.clip_by_global_norm(gradients_s1, max_gradient_norm)
optimizer_s1 = optimizer_.apply_gradients(
    zip(clipped_gradients_s1, params))


In [None]:
# Saver created without a var_list, i.e. over the variables defined so far;
# it is used below to restore the pre-trained checkpoint.
saver = tf.train.Saver()

In [None]:
#testing
# batch_generator is defined outside this notebook; the evaluation loop pulls
# (x_batch, y_batch) pairs from it with next(). shuffle=False is passed so the
# batches are not shuffled.
# NOTE(review): presumably each x_batch holds BATCH_SIZE*SEQUENCE_LENGTH_D
# sentence rows and each y_batch BATCH_SIZE labels - confirm in the helper module.
test_batch_generator = batch_generator(X_test, y_test, BATCH_SIZE, seq_len = SEQUENCE_LENGTH_D, shuffle = False)

In [None]:
# Checkpoint path handed to saver.restore below; it lives in the same folder as
# the dictionary file loaded earlier.
MODEL_PATH = "bca_dm_toefl_model/model300-22275" 

In [None]:
# Open a session on the default graph and load the pre-trained weights into it.
# The session is kept open for the evaluation cell below; nothing in the visible
# part of the notebook closes it.
sess = tf.Session()
saver.restore(sess, MODEL_PATH)

In [None]:
#testing on the test set
# Runs the restored model over every full batch of the padded test set, collects
# predictions, gold labels, attention weights and (for the ordinal model) logits,
# then prints several agreement statistics computed on the real documents only.
num_batches = X_test.shape[0] // (BATCH_SIZE*SEQUENCE_LENGTH_D)
true = []      # gold class index per document
ypreds = []    # predicted class index per document
a = []         # sentence-level attention weights, one entry per batch
a_d = []       # document-level attention weights, one entry per batch
preds_ = []    # kept for compatibility with later cells; not filled here
log_ = []      # logits per batch (ordinal model only)
doc_size_np = np.array([0]*SEQUENCE_LENGTH_D)

# `logits` is only defined when the ordinal head was built, so it is fetched
# only in that case; fetching it unconditionally raised NameError for the
# non-ordinal model.
fetches = [y_preds, loss, alphas_, alphas_d]
if ordinal:
    fetches.append(logits)

for bx in range(num_batches):
    x_batch, y_batch = next(test_batch_generator)
    # Position of the first 0 in each row, plus one; filler rows therefore get
    # length 1.
    seq_len = np.array([list(x).index(0) + 1 for x in x_batch])  # actual lengths of sequences
    seq_len_d = []
    l = SEQUENCE_LENGTH_D
    # Each consecutive group of l rows is one document. Its length is the offset
    # of the first filler row (a row starting with 0), or l if it has none.
    for i in range(0,len(x_batch),l):
        for j in range(i,i+l):
            if list(x_batch[j]).index(0) == 0:
                seq_len_d.append(j%l)
                break
        else:
            # no filler row found: the document uses all l rows
            seq_len_d.append(l)

    seq_len_d = np.array(seq_len_d)

    outputs = sess.run(fetches,
                  feed_dict={batch_ph: x_batch,
                        target_ph: y_batch,
                        seq_len_ph: seq_len,
                        seq_len_ph_d: seq_len_d,
                        doc_size_ph: doc_size_np,
                        keep_prob_ph: 1.0})  # keep probability 1.0 disables dropout
    y_preds_, loss_t, alph, alph_d = outputs[:4]
    ypreds.extend(y_preds_)
    t = np.argmax(y_batch, axis = 1)
    true.extend(t)
    a.append(alph)
    a_d.append(alph_d)
    if ordinal:
        log = outputs[4]
        log_.append(log)

# Drop the entries that belong to the batch padding added to y_test.
true = true[:y_test_len]
ypreds = ypreds[:y_test_len]

spr = c(true, ypreds)

if ordinal:
    # spearmanr returns (correlation, p-value); keep the correlation only
    spr = spr[0]
print('Test set '+ str_score + str(spr))

rank = stats.spearmanr
print('sp rho')
print(rank(true, ypreds))

from sklearn.metrics import cohen_kappa_score as kappa
print('qwk')
print(kappa(true, ypreds, weights="quadratic"))

from scipy.stats import pearsonr
print('pearson')
print(pearsonr(true,ypreds))

print('kappa')
print(kappa(true, ypreds, weights=None))