**Duplicate Question Pair Detection using Deep Learning.**

In [2]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
import tensorflow as tf
from multiprocessing import cpu_count
from itertools import chain
from collections import Counter
from tensorflow.contrib.metrics import streaming_accuracy as accuracy
from tqdm import trange
from sklearn.metrics import accuracy_score

In [3]:
def length(sequence):
    """Return, for each row of a rank-3 batch, the number of non-zero steps.

    A position along axis 1 counts as used when the largest absolute value
    along axis 2 at that position is non-zero. The counts are summed over
    axis 1 and returned as an int32 tensor. In this notebook the function is
    fed embedded question batches and its result is passed to
    `tf.nn.dynamic_rnn` as `sequence_length`.
    """
    # abs() is non-negative, so sign() yields 1.0 for positions with any
    # non-zero component and 0.0 for all-zero positions.
    step_mask = tf.sign(tf.reduce_max(tf.abs(sequence), reduction_indices=2))
    used_steps = tf.reduce_sum(step_mask, reduction_indices=1)
    return tf.cast(used_steps, tf.int32)
def rmse_loss(outputs, targets):
    """Return the root-mean-square error between `targets` and `outputs`.

    The mean is taken over every element of the squared difference, so the
    result is a scalar tensor.
    """
    squared_error = tf.square(tf.sub(targets, outputs))
    mean_squared_error = tf.reduce_mean(squared_error)
    return tf.sqrt(mean_squared_error)
def activate(outputs, weight_shape, bias_shape, activation=tf.nn.softmax):
    """Apply an affine projection to `outputs` and pass it through `activation`.

    Args:
        outputs: tensor that is matrix-multiplied with the weights.
        weight_shape: shape of the "weights" variable.
        bias_shape: shape of the "biases" variable.
        activation: callable applied to `outputs @ weights + biases`;
            defaults to `tf.nn.softmax`.

    Returns:
        The activated projection.

    The variables are obtained with `tf.get_variable` under the names
    "weights" and "biases" in whatever variable scope is active at call time.
    """
    # Weights are created before biases, matching the original creation order.
    weights = tf.get_variable(
        "weights",
        shape=weight_shape,
        initializer=tf.random_normal_initializer(),
    )
    biases = tf.get_variable(
        "biases",
        shape=bias_shape,
        initializer=tf.constant_initializer(0.0),
    )
    affine = tf.matmul(outputs, weights) + biases
    return activation(affine)

In [6]:
# Embedding matrix read from the 'embed' array of the archive; the graph cell
# wraps it in a tf.constant and uses it as the embedding lookup table.
vec = np.load('data/embed.npz')['embed']

# Run-mode switches consumed by the session cell.
is_train = False                 # False -> evaluate only, True -> run the training loop
resume_from_checkpoint = True    # True -> restore variables from ./saves before running

# Shapes used when building the graph.
batch_size = 10      # first dimension of the input/target placeholders
max_length = 237     # second dimension of the input placeholders
dim = 200            # GRU cell size; the output layer consumes 2 * dim features

# Coefficient applied to the L2 penalty of every trainable variable.
reg_lambda = 1e-4

In [7]:
def _load_batch(index):
    """Return the (ques1, ques2, label) arrays stored in data/<index>.npz."""
    # np.load on an .npz archive keeps the file open until the NpzFile is
    # closed; the context manager releases the handle after each batch.
    with np.load('data/' + str(index) + '.npz') as batch_data:
        return batch_data['ques1'], batch_data['ques2'], batch_data['label']


def _binary_accuracy(probabilities, labels):
    """Threshold `probabilities` at 0.5 and return accuracy against `labels`."""
    predicted = (np.array(probabilities) >= 0.5).astype(np.int32)
    actual = np.array(labels).astype(np.int32)
    # accuracy_score takes (y_true, y_pred); the value is symmetric, but the
    # documented order is used here.
    return accuracy_score(actual, predicted)


# Batch files [0, n_train_batches) are used for training and
# [n_train_batches, n_total_batches) for evaluation.
n_train_batches = 1500
n_total_batches = 2000
checkpoint_every = 100   # training steps between checkpoint + accuracy report
prob_epsilon = 1e-7      # keeps tf.log away from log(0) in the loss

tf.reset_default_graph()
with tf.Graph().as_default() as graph:
    # NOTE: tf.sub / tf.mul / tf.concat(axis, values) are the pre-1.0
    # TensorFlow spellings this notebook was written against.
    embeds = tf.constant(vec, dtype=tf.float32)
    input1 = tf.placeholder(shape=[batch_size, max_length], dtype=tf.int64)
    input2 = tf.placeholder(shape=[batch_size, max_length], dtype=tf.int64)
    target = tf.placeholder(shape=[batch_size], dtype=tf.float32)
    input1_embed = tf.nn.embedding_lookup(embeds, input1)
    input2_embed = tf.nn.embedding_lookup(embeds, input2)

    # Compute each question's length once and reuse it for both GRU layers.
    len1 = length(input1_embed)
    len2 = length(input2_embed)

    cell1 = tf.nn.rnn_cell.GRUCell(dim)
    cell2 = tf.nn.rnn_cell.GRUCell(dim)

    # Both questions go through the same weights: the second dynamic_rnn call
    # in each scope runs after reuse_variables().
    with tf.variable_scope('gru1') as scope1:
        o11, _ = tf.nn.dynamic_rnn(cell=cell1, inputs=input1_embed,
                                   sequence_length=len1, dtype=tf.float32)
        scope1.reuse_variables()
        o21, _ = tf.nn.dynamic_rnn(cell=cell1, inputs=input2_embed,
                                   sequence_length=len2, dtype=tf.float32)

    with tf.variable_scope('gru2') as scope2:
        o12, s12 = tf.nn.dynamic_rnn(cell=cell2, inputs=o11,
                                     sequence_length=len1, dtype=tf.float32)
        scope2.reuse_variables()
        o22, s22 = tf.nn.dynamic_rnn(cell=cell2, inputs=o21,
                                     sequence_length=len2, dtype=tf.float32)

        # Pair features: |s12 - s22| concatenated with s12 * s22 along axis 1.
        d = tf.concat(1, [tf.abs(tf.sub(s12, s22)), tf.mul(s12, s22)])

    preds = tf.squeeze(activate(d, [dim * 2, 1], [1], activation=tf.nn.sigmoid))

    # A float32 sigmoid saturates to exactly 0.0 / 1.0, which makes the
    # hand-written log loss produce inf or NaN. Clip before taking logs.
    safe_preds = tf.clip_by_value(preds, prob_epsilon, 1.0 - prob_epsilon)
    cross_entropy = -tf.reduce_mean(
        target * tf.log(safe_preds) + (1 - target) * tf.log(1 - safe_preds))
    l2_penalty = tf.reduce_sum(
        [reg_lambda * tf.nn.l2_loss(x) for x in tf.trainable_variables()])
    loss = cross_entropy + l2_penalty

    optimize_op = tf.train.AdamOptimizer(1e-3).minimize(loss)

    # Created after the optimizer so its slot variables are checkpointed too.
    saver = tf.train.Saver()
    with tf.Session() as sess:
        if resume_from_checkpoint:
            # Restore with the Saver built for this graph. Calling
            # tf.train.import_meta_graph here would import a second copy of
            # the model into a graph that already contains it.
            checkpoint_path = tf.train.latest_checkpoint('./saves')
            if checkpoint_path is None:
                raise IOError(
                    "No checkpoint found in './saves'; set "
                    "resume_from_checkpoint=False to start from scratch.")
            saver.restore(sess, checkpoint_path)
        else:
            sess.run(tf.global_variables_initializer())
        # Kept from the original cell; needed only if streaming metrics
        # (which create local variables) are added to the graph.
        sess.run(tf.local_variables_initializer())

        if not is_train:
            # Evaluation: collect predictions over the held-out batch files.
            p, t = [], []
            for i in trange(n_train_batches, n_total_batches):
                a, b, c = _load_batch(i)
                p.extend(sess.run(preds, {input1: a, input2: b, target: c}))
                t.extend(c)
            print(_binary_accuracy(p, t))
        else:
            # Training: one optimizer step per batch file, with a checkpoint
            # and running-accuracy report every `checkpoint_every` steps.
            p, t = [], []
            for i in trange(n_train_batches):
                a, b, c = _load_batch(i)
                x = sess.run([optimize_op, preds],
                             {input1: a, input2: b, target: c})[1]
                p.extend(x)
                t.extend(c)
                if i % checkpoint_every == 0:
                    saver.save(sess, 'saves/model.ckpt')
                    print(_binary_accuracy(p, t))
                    p, t = [], []
            # Persist the steps taken since the last periodic checkpoint;
            # otherwise the updates from the final batches are lost.
            saver.save(sess, 'saves/model.ckpt')
            if p:
                print(_binary_accuracy(p, t))
                p, t = [], []

100%|██████████| 500/500 [03:52<00:00,  2.28it/s]

0.7442



