**Duplicate Question Pair Detection using Deep Learning.**

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from multiprocessing import cpu_count
from itertools import chain
from collections import Counter
from tensorflow.contrib.metrics import streaming_accuracy as accuracy
from tqdm import trange, tnrange
from sklearn.metrics import accuracy_score

In [74]:
def length(sequence):
    used = tf.sign(tf.reduce_max(tf.abs(sequence), reduction_indices=2))
    length = tf.reduce_sum(used, reduction_indices=1)
    length = tf.cast(length, tf.int32)
    return length
def rmse_loss(outputs, targets):
    return tf.sqrt(tf.reduce_mean(tf.square(tf.sub(targets, outputs))))
def activate(outputs, weight_shape, bias_shape, activation=None):
    weights = tf.get_variable(
        "weights", shape=weight_shape, initializer=tf.random_normal_initializer(mean=0.0, stddev=1.0))
    biases = tf.get_variable("biases", shape=bias_shape,
                             initializer=tf.constant_initializer(0.0))
    affine = tf.matmul(outputs, weights) + biases
    if not activation:
        return affine
    else:
        return activation(affine)

In [82]:
# config
vec = np.load('data/embed.npz')['embed']
batch_size = 64
max_length = 272
dim = 300
reg_lambda = 1e-6
is_train=True
resume_from_checkpoint=True
save = True
lr = 1e-3
print_step = 100

In [83]:
tf.reset_default_graph()
with tf.Graph().as_default() as graph:
    embeds = tf.constant(vec, dtype=tf.float32)
    input1 = tf.placeholder(shape=[batch_size, max_length], dtype=tf.int64)
    input2 = tf.placeholder(shape=[batch_size, max_length], dtype=tf.int64)
    target = tf.placeholder(shape=[batch_size], dtype=tf.float32)
    input1_embed = tf.nn.embedding_lookup(embeds, input1)
    input2_embed = tf.nn.embedding_lookup(embeds, input2)    
    
    def sentence_op(inputs):
        with tf.variable_scope('lstm1'):
            cell1 = tf.contrib.rnn.LSTMCell(dim)
            o, _ = tf.nn.dynamic_rnn(cell=cell1, inputs=input1_embed,
                        sequence_length=length(inputs), dtype=tf.float32)
        with tf.variable_scope('lstm2'):
            cell2 = tf.contrib.rnn.LSTMCell(dim)
            o, s = tf.nn.dynamic_rnn(cell=cell2, inputs=o,
                        sequence_length=length(inputs), dtype=tf.float32)     
        return s[0]
    
    with tf.variable_scope('similarity') as scope:
        s1 = sentence_op(input1_embed)
        scope.reuse_variables()
        s2 = sentence_op(input2_embed)
        d = tf.concat([tf.abs(tf.subtract(s1, s2)), tf.multiply(s1, s2)], 1)
#         dist = tf.expand_dims(tf.sqrt(tf.reduce_sum(tf.square(tf.subtract(s1, s2)), axis=1)), axis=1)
#         angle = tf.expand_dims(tf.reduce_mean(tf.multiply(s1, s2), axis=1), axis=1)
#         d = tf.concat([dist, angle], axis=1)
        
    with tf.variable_scope('dense1'):
        h1 = tf.squeeze(activate(d, [dim * 2, 200], [200], activation=tf.nn.tanh))
    with tf.variable_scope('dense2'):
        h2 = tf.squeeze(activate(h1, [200, 50], [50], activation=tf.nn.tanh))
    with tf.variable_scope('dense3'):
        preds = tf.squeeze(activate(h2, [50, 1], [1], activation=None))
        preds_sig = tf.nn.sigmoid(preds)
    
    loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=preds, labels=target)) +\
        tf.reduce_sum([reg_lambda * tf.nn.l2_loss(x) for x in tf.trainable_variables()])
    optimize_op = tf.train.AdamOptimizer(lr).minimize(loss)
    
    saver = tf.train.Saver()
    with tf.Session() as sess:
        if resume_from_checkpoint:
            saver = tf.train.import_meta_graph('saves/model.ckpt.meta')
            saver.restore(sess, tf.train.latest_checkpoint('./saves'))
        else:
            sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())

        if not is_train:
            p, t = [], []
            for i in tnrange(0, 10000, batch_size):
                batch_data = np.load('data_test/'+str(i)+'.npz')
                a, b, c = batch_data['ques1'], batch_data['ques2'], batch_data['label']
                if a.shape[0] != batch_size:
                    continue
                p.extend(sess.run(preds_sig,  {input1:a, input2:b, target:c}))
                t.extend(c)
            p = (np.array(p) >= 0.5).astype(np.int32)
            t = np.array(t).astype(np.int32)
            print(accuracy_score(p, t))
        else: 
            p, t = [], []
            avg_loss = 0
            tr = tnrange(6000)
            for i in tr:
                batch_data = np.load('data/'+str(i)+'.npz')
                a, b, c = batch_data['ques1'], batch_data['ques2'], batch_data['label']
                _, l, x = sess.run([optimize_op, loss, preds_sig], 
                                   {input1: a, input2:b, target: c})
                avg_loss += l
                p.extend(x)
                print((np.array(p) >= 0.5).astype(np.int32))
#                 print(c)
                t.extend(c)
                if i % print_step == 0 and i != 0:
                    if save:
                        saver.save(sess, 'saves/model.ckpt')
                    p = (np.array(p) >= 0.5).astype(np.int32)
                    t = np.array(t).astype(np.int32)
                    tr.set_description('Loss = {0}, Training batch accuracy = {1}'.
                                       format(avg_loss/print_step, accuracy_score(p, t)))
                    p, t = [], []
                    avg_loss = 0

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

KeyboardInterrupt: 

In [None]:
1513