In [None]:
from utils import *
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import itertools


In [None]:
# Input locations, relative to the notebook's working directory.
train_file = 'data/train.csv'
test_file = 'data/test.csv'
glove_file = 'wordvectors/glove.6B.300d.txt'
# Placeholder tokens: unk_char is handed to filter_unknown and pad_char to pad
# by the data helpers; get_embedding_matrix gives each of them its own index
# and an extra row in the embedding matrix.
unk_char = '<UNK>'
pad_char = '<PAD>'


In [None]:
def get_train_data(train_file, word_to_index, max_len=None):
    """Read the training file and turn its sentence pairs into index sequences.

    The pairs returned by ``read_train_data`` are passed through
    ``filter_unknown`` (with the module-level ``unk_char``), optionally through
    ``pad`` (with the module-level ``pad_char``) and finally ``word2index``.

    Args:
        train_file: path handed to ``read_train_data``.
        word_to_index: vocabulary mapping forwarded to the helpers.
        max_len: when truthy, the pairs are padded via ``pad`` to this length;
            when ``None`` (or 0) no padding is applied.

    Returns:
        A 3-tuple ``(indexed_pairs, labels, lengths)`` where the last two are
        ``np.array`` copies of the label and sequence-length values read from
        the file.
    """
    raw_pairs, labels, lengths = read_train_data(train_file)
    pairs = filter_unknown(raw_pairs, word_to_index, unk_char)
    pairs = pad(pairs, pad_char, max_len) if max_len else pairs

    indexed_pairs = word2index(pairs, word_to_index)
    return indexed_pairs, np.array(labels), np.array(lengths)

def get_test_data(test_file, word_to_index):
    """Read the test file and turn its sentence pairs into index sequences.

    The pairs returned by ``read_test_data`` are passed through
    ``filter_unknown`` (with the module-level ``unk_char``) and then
    ``word2index``. No padding is applied here.

    Args:
        test_file: path handed to ``read_test_data``.
        word_to_index: vocabulary mapping forwarded to the helpers.

    Returns:
        A 2-tuple ``(indexed_pairs, lengths)`` where ``lengths`` is an
        ``np.array`` copy of the sequence-length values read from the file.
    """
    raw_pairs, lengths = read_test_data(test_file)
    known_pairs = filter_unknown(raw_pairs, word_to_index, unk_char)

    indexed_pairs = word2index(known_pairs, word_to_index)
    return indexed_pairs, np.array(lengths)


def get_embedding_matrix(glove_file, unk_char, pad_char):
    """Load the GloVe vectors and extend them with unknown/padding tokens.

    ``unk_char`` and ``pad_char`` are registered in both lookup tables with the
    next two free indices, and two all-zero rows are appended to the matrix
    returned by ``read_glove_vecs``.
    (Assumes the existing indices are 0-based and contiguous so the new
    indices line up with the appended rows — TODO confirm in read_glove_vecs.)

    Args:
        glove_file: path handed to ``read_glove_vecs``.
        unk_char: token to register for out-of-vocabulary words.
        pad_char: token to register for padding.

    Returns:
        ``(emb_matrix, word_to_index, index_to_word)``; the two lookup tables
        are the objects returned by ``read_glove_vecs``, updated in place.
    """
    word_to_index, index_to_word, emb_matrix = read_glove_vecs(glove_file)

    # Forward table first: each token takes the current vocabulary size as index.
    word_to_index[unk_char] = len(word_to_index)
    word_to_index[pad_char] = len(word_to_index)

    # Reverse table: the last two positions of the (now larger) vocabulary.
    vocab_size = len(word_to_index)
    unk_index, pad_index = vocab_size - 2, vocab_size - 1
    index_to_word[unk_index] = unk_char
    index_to_word[pad_index] = pad_char

    # One zero vector per special token, appended below the GloVe rows.
    zero_rows = [[0] * emb_matrix.shape[1] for _ in range(2)]
    extended_matrix = np.append(emb_matrix, zero_rows, axis=0)

    return extended_matrix, word_to_index, index_to_word

In [None]:
# Load the GloVe vectors and register the unknown/padding tokens
# (get_embedding_matrix appends two zero rows for them).
emb_matrix, word_to_index, index_to_word = get_embedding_matrix(glove_file, unk_char, pad_char)
print('emb_matrix shape: {}'.format(emb_matrix.shape))

In [None]:
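# One-off preprocessing, kept for reference: encode the training set and cache it as .npy files.
# NOTE: this writes 'data/x_train_pad.npy', whereas the loading cell further down reads 'data/x_train.npy'.
# max_len = 20
# x_train, y_train, train_sequlence_length = get_train_data(train_file, word_to_index, max_len=max_len)
# np.save('data/x_train_pad.npy', x_train)
# np.save('data/y_train.npy', y_train)
# np.save('data/train_seq_len.npy', train_sequlence_length)
# print(len(x_train))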

In [None]:
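# One-off preprocessing, kept for reference: encode the test set and cache it as .npy files.
# NOTE: this writes 'data/test.npy', whereas the loading cell further down reads 'data/x_test.npy'.
# x_test, test_sequence_length = get_test_data(test_file, word_to_index)
# np.save('data/test.npy', x_test)
# np.save('data/test_seq_len.npy', test_sequence_length)
# print(len(x_test))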

In [None]:
import tensorflow as tf
from tensorflow.keras.backend import binary_crossentropy
from tensorflow.contrib.layers import xavier_initializer
from tensorflow.nn import bidirectional_dynamic_rnn, embedding_lookup, dropout
from tensorflow.contrib.rnn import LSTMCell, MultiRNNCell
from tqdm import tqdm_notebook

class QuestionPairDuplicated:
    """Siamese (Bi)LSTM that scores whether two questions are duplicates.

    Both sentences are looked up in the same embedding matrix and encoded with
    the same encoder weights. The two sentence vectors are compared with
    ``exp(-L1 distance)`` and that score is trained against the target with a
    mean-squared-error loss.

    Typical use: construct, call ``build()`` once, then ``fit()`` or
    ``restore()``, then ``predict()``.
    """

    def __init__(self, emb_matrix, learning_rate, emb_trainable=False):
        """Create the placeholders and the embedding variable.

        Note: this resets the default TensorFlow graph, so any previously
        built model in the same process is discarded.

        Args:
            emb_matrix: array used as the initial value (and shape) of the
                embedding variable.
            learning_rate: learning rate handed to the Adam optimizer in
                ``build()``.
            emb_trainable: whether the embedding variable is trainable.
        """
        tf.reset_default_graph()
        self.learning_rate = learning_rate
        # Set by fit()/restore(); kept as an attribute so predict() can reuse it.
        self.sess = None

        # input
        self.input_sentA = tf.placeholder(tf.int32, shape=[None, None])  # (batch_size, time_step)
        self.input_sentB = tf.placeholder(tf.int32, shape=[None, None])  # (batch_size, time_step)
        self.input_seq_lenA = tf.placeholder(tf.int32, shape=[None, ])  # (batch_size, )
        self.input_seq_lenB = tf.placeholder(tf.int32, shape=[None, ])  # (batch_size, )

        # output
        self.target = tf.placeholder(dtype=tf.float32, shape=[None, ])  # (batch_size, )

        # embedding matrix
        self.embedding_matrix = tf.get_variable(shape=emb_matrix.shape,
                                    initializer=tf.constant_initializer(emb_matrix, dtype=tf.float32),
                                    dtype=tf.float32,
                                    trainable=emb_trainable,
                                    name='embeddings_matrix')

    def _open_session(self):
        """Start a fresh session, closing a previous one so it is not leaked."""
        previous = getattr(self, 'sess', None)
        if previous is not None:
            previous.close()
        self.sess = tf.Session()

    def embedding_layer(self, sequence):
        """Look up the embedding vector of every word index in ``sequence``."""
        return embedding_lookup(self.embedding_matrix, sequence)  # (batch_size, time_step, emb_dim)

    def bilstm(self, sequence, sequence_length, lstm_unit, reuse=None):
        """Encode ``sequence`` with a bidirectional LSTM.

        Args:
            sequence: embedded input, (batch_size, num_step, emb_dim).
            sequence_length: true length of every sequence, (batch_size, ).
            lstm_unit: number of units per direction.
            reuse: ``None`` for the first encoder call, ``True`` for later
                calls that should share the weights.

        Returns:
            Forward and backward outputs concatenated on the last axis,
            (batch_size, num_step, lstm_unit * 2).
        """
        with tf.variable_scope('BiLSTM', reuse=reuse, dtype=tf.float32):
            cell_fw = LSTMCell(num_units=lstm_unit, reuse=tf.get_variable_scope().reuse)
            cell_bw = LSTMCell(num_units=lstm_unit, reuse=tf.get_variable_scope().reuse)

        # NOTE(review): the RNN is unrolled outside the 'BiLSTM' scope, so weight
        # sharing presumably relies on the `reuse` flag given to the cells.
        # Left as-is on purpose: moving the call would rename the variables and
        # invalidate existing checkpoints.
        ((output_fw, output_bw), _) = bidirectional_dynamic_rnn(cell_fw, cell_bw, sequence, dtype=tf.float32, sequence_length=sequence_length)

        return tf.concat([output_fw, output_bw], axis=2)  # (batch_size, num_step, lstm_unit * 2)

    def lstm(self, sequence, sequence_length, lstm_unit, n_layers=1, reuse=None):
        """Encode ``sequence`` with a single LSTM and return its final hidden state.

        Args:
            sequence: embedded input, (batch_size, num_step, emb_dim).
            sequence_length: true length of every sequence, (batch_size, ).
            lstm_unit: number of LSTM units.
            n_layers: currently unused (layer stacking is disabled).
            reuse: ``None`` for the first encoder call, ``True`` for later
                calls that should share the weights.

        Returns:
            The second element of the final LSTM state (the hidden state of the
            (c, h) pair), (batch_size, lstm_unit).
        """
        with tf.variable_scope('LSTM', reuse=reuse, dtype=tf.float32):
            cell = tf.contrib.rnn.LSTMCell(num_units=lstm_unit, activation='tanh', reuse=tf.get_variable_scope().reuse)

        # NOTE(review): as in bilstm(), the RNN is unrolled outside the variable
        # scope; kept unchanged so variable names match existing checkpoints.
        _, state = tf.nn.dynamic_rnn(cell, sequence, dtype=tf.float32, sequence_length=sequence_length)
        return state[1]  # (batch_size, lstm_unit)

    def manhattan_distance(self, vecA, vecB):
        """Return the similarity ``exp(-||vecA - vecB||_1)`` per row, (batch_size, )."""
        diff = tf.reduce_sum(tf.abs(tf.subtract(vecA, vecB)), axis=1)  # (batch_size, )
        return tf.exp(-diff)

    def loss_function(self, output):
        """Mean squared error between ``self.target`` and ``output`` (a scalar)."""
        diff = tf.subtract(self.target, output)  # (batch_size, )
        return tf.reduce_mean(tf.square(diff))

    def build(self, lstm_unit=256, hidden_unit=16, output_unit=1, encoder='lstm'):
        """Assemble the graph: embeddings -> shared encoder -> similarity -> loss.

        Sets ``self.output``, ``self.loss`` and ``self.optimizer``.

        Args:
            lstm_unit: number of LSTM units (per direction for 'bilstm').
            hidden_unit: currently unused; kept for interface compatibility.
            output_unit: currently unused; kept for interface compatibility.
            encoder: 'lstm' (final hidden state) or 'bilstm' (outputs summed
                over time).

        Raises:
            ValueError: if ``encoder`` is not one of the supported names.
        """
        if encoder not in ('lstm', 'bilstm'):
            raise ValueError("encoder must be 'lstm' or 'bilstm', got {!r}".format(encoder))

        word_embA = self.embedding_layer(self.input_sentA)  # (batch_size, num_step, emb_dim)
        word_embB = self.embedding_layer(self.input_sentB)  # (batch_size, num_step, emb_dim)

        if encoder == 'lstm':
            repA = self.lstm(word_embA, self.input_seq_lenA, lstm_unit, reuse=None)  # (batch_size, lstm_unit)
            repB = self.lstm(word_embB, self.input_seq_lenB, lstm_unit, reuse=True)  # (batch_size, lstm_unit)
        else:
            repA = self.bilstm(word_embA, self.input_seq_lenA, lstm_unit, None)  # (batch_size, num_step, lstm_unit * 2)
            # Sentence B must be encoded from its own embeddings (the second
            # call previously received sentence A's embeddings by mistake).
            repB = self.bilstm(word_embB, self.input_seq_lenB, lstm_unit, True)  # (batch_size, num_step, lstm_unit * 2)
            repA = tf.reduce_sum(repA, axis=1)  # (batch_size, lstm_unit * 2)
            repB = tf.reduce_sum(repB, axis=1)  # (batch_size, lstm_unit * 2)

        self.output = self.manhattan_distance(repA, repB)  # (batch_size, )
        self.loss = self.loss_function(self.output)
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.loss)

    def fit(self, train_data, val_data=None, epoch_size=1, batch_size=128, word_to_index=None, model_name='model'):
        """Train the model and save a checkpoint to ``models/<model_name>.ckpt``.

        All variables are (re-)initialized at the start, so every call trains
        from the initial weights. The session stays open in ``self.sess`` so
        ``predict()`` can be called afterwards.

        Args:
            train_data: sequence ``(x, y, sequence_length)``.
            val_data: optional sequence ``(x, y, sequence_length)``; when
                falsy, no validation pass is run.
            epoch_size: number of passes over the training data.
            batch_size: batch size handed to ``next_batch_with_pad``.
            word_to_index: vocabulary mapping handed to ``next_batch_with_pad``.
            model_name: base name of the checkpoint file.
        """
        import os  # local import: only needed to prepare the checkpoint directory

        def learn(X, Y, sequence_length, epoch, mode):
            """Run one pass over the data; weights are updated only when mode is 'train'."""
            training = mode == 'train'
            fetches = [self.loss, self.output]
            if training:
                fetches.append(self.optimizer)

            tn = tqdm_notebook(total=len(X))
            try:
                if training:
                    tn.set_description('Epoch: {}/{}'.format(epoch, epoch_size))
                for sentA, sentB, seq_lenA, seq_lenB, target in next_batch_with_pad(X, Y, sequence_length, word_to_index, batch_size):
                    feed_dict = {
                        self.input_sentA: sentA,
                        self.input_sentB: sentB,
                        self.input_seq_lenA: seq_lenA,
                        self.input_seq_lenB: seq_lenB,
                        self.target: target
                    }
                    loss, output = self.sess.run(fetches, feed_dict)[:2]

                    tn.set_postfix(loss=loss, accuracy=accuracy(output, target), mode=mode)
                    # Advance by the real batch size so the bar does not
                    # overshoot on a short final batch.
                    tn.update(n=len(output))
            finally:
                tn.close()

        # Make sure the checkpoint can be written before spending time on training.
        checkpoint_path = 'models/{}.ckpt'.format(model_name)
        checkpoint_dir = os.path.dirname(checkpoint_path)
        if not os.path.isdir(checkpoint_dir):
            os.makedirs(checkpoint_dir)

        saver = tf.train.Saver()
        self._open_session()
        self.sess.run(tf.global_variables_initializer())

        x_train, y_train, train_sequence_length = train_data[0], train_data[1], train_data[2]
        if val_data:
            x_val, y_val, val_sequence_length = val_data[0], val_data[1], val_data[2]

        print('Train on {} samples, validate on {} samples'.format(len(x_train), len(x_val) if val_data else 0))
        for epoch in range(1, epoch_size + 1):
            x_train, y_train, train_sequence_length = shuffle_data(x_train, y_train, train_sequence_length)
            # train
            learn(x_train, y_train, train_sequence_length, epoch, 'train')

            # validate
            if val_data:
                learn(x_val, y_val, val_sequence_length, epoch, 'validate')

        save_path = saver.save(self.sess, checkpoint_path)
        print('Model was saved in {}'.format(save_path))

    def restore(self, model_path):
        """Load the weights stored at ``model_path`` into a new session."""
        saver = tf.train.Saver()
        self._open_session()
        saver.restore(self.sess, model_path)

    def predict(self, X, sequence_length, word_to_index, batch_size=100):
        """Return the similarity score for every sentence pair in ``X``.

        Args:
            X: sentence pairs, consumed through ``next_batch_with_pad``.
            sequence_length: sequence lengths matching ``X``.
            word_to_index: vocabulary mapping handed to ``next_batch_with_pad``.
            batch_size: number of pairs scored per session run.

        Returns:
            ``np.ndarray`` of shape ``(len(X), )`` with one score per pair.

        Raises:
            RuntimeError: if neither ``fit()`` nor ``restore()`` has been
                called, i.e. there is no session holding weights.
        """
        if getattr(self, 'sess', None) is None:
            raise RuntimeError('No active session: call fit() or restore() before predict().')

        y_empty = np.empty(0)  # the batch generator expects labels; none exist at prediction time
        prediction = np.empty((len(X), ))
        offset = 0
        tn = tqdm_notebook(total=len(X))
        try:
            tn.set_postfix(mode='predict')
            for sentA, sentB, seq_lenA, seq_lenB, _ in next_batch_with_pad(X, y_empty, sequence_length, word_to_index, batch_size):
                feed_dict = {
                    self.input_sentA: sentA,
                    self.input_sentB: sentB,
                    self.input_seq_lenA: seq_lenA,
                    self.input_seq_lenB: seq_lenB,
                }
                output = self.sess.run(self.output, feed_dict)
                # A running offset stays correct even if a batch is shorter
                # than batch_size.
                prediction[offset: offset + len(output)] = output
                offset += len(output)
                tn.update(n=len(output))
        finally:
            tn.close()

        return prediction
        
        


In [None]:
# Load the cached training arrays from disk.
X, Y, sequence_length = np.load('data/x_train.npy'), np.load('data/y_train.npy'), np.load('data/train_seq_len.npy')
# Split into train/validation parts. The trailing 0.0 is presumably the
# validation fraction (i.e. nothing held out) — TODO confirm against
# split_train_val_data.
x_train, y_train, train_sequlence_length, x_val, y_val, val_sequlence_length = split_train_val_data(X, Y, sequence_length, 0.0)

In [None]:
# Build the graph: the embedding matrix is trainable (emb_trainable=True) and
# both sentences are encoded with the single-direction LSTM encoder, 256 units.
learning_rate = 0.001
model = QuestionPairDuplicated(emb_matrix, learning_rate, emb_trainable=True)
model.build(lstm_unit=256, encoder='lstm')

In [None]:
# Training hyper-parameters.
epoch_size = 15
batch_size = 128
train_data = [x_train, y_train, train_sequlence_length]
val_data = [x_val, y_val, val_sequlence_length]
# val_data is assembled above, but None is passed as the validation argument,
# so fit() only trains. The checkpoint is written as 'models/model-manhattan.ckpt'.
model.fit(train_data, None, epoch_size, batch_size, word_to_index, 'model-manhattan')

In [None]:
# Reload the checkpoint written by fit() into a new session (restore() opens
# its own tf.Session).
model_path = 'models/model-manhattan.ckpt'
model.restore(model_path)

In [None]:
# Load the cached test arrays (sentence pairs and, presumably, their sequence lengths).
x_test, test_sequence_length = np.load('data/x_test.npy'), np.load('data/test_seq_len.npy')

In [None]:
# Score every test pair; predict() returns one value per entry of x_test.
prediction = model.predict(x_test, test_sequence_length, word_to_index)

In [None]:
# Write the predictions to the submission file (prediction2csv is not defined
# in this notebook — presumably provided by utils).
prediction2csv(prediction, 'data/submit.csv')