In [1]:
# import dependencies
from __future__ import division 
import numpy as np
import pandas as pd
import gc
import pickle
import re
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from keras.layers import Merge, TimeDistributed, Lambda, Conv1D, GlobalMaxPooling1D, GlobalAveragePooling1D, MaxPooling1D
from keras.optimizers import Adadelta
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras import backend as K
from keras.layers.advanced_activations import PReLU
from keras.preprocessing import sequence, text

from keras.layers import Input, Bidirectional, LSTM, dot, Flatten, Dense, Reshape, add, Dropout, BatchNormalization, concatenate
from keras.models import Model

import matplotlib
%matplotlib inline

import matplotlib.pyplot as plt

Using TensorFlow backend.


In [2]:
# Load the pre-processed question pairs (a pickled DataFrame) from disk.
# `pickle` is already imported in the import cell at the top of the notebook,
# so it is not re-imported here.
# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only open pickle files that you created yourself or otherwise trust.
with open('question_pair.pickle', 'rb') as f:
    df = pickle.load(f)

In [3]:
# the maximum length of each sequence
max_len = 30

# Keras Tokenizer restricted to the top 200000 words that occur in the dataset
tk = text.Tokenizer(num_words=200000)

# Cast each question column to str once and reuse the result below
question1_texts = df.question1.values.astype(str)
question2_texts = df.question2.values.astype(str)

# Build the vocabulary from the questions on both sides of every pair
tk.fit_on_texts(list(question1_texts) + list(question2_texts))

# Convert the text to sequences of word ids, then pad the sequences so that
# every question has the same length (max_len)
x1 = sequence.pad_sequences(tk.texts_to_sequences(question1_texts), maxlen=max_len)
x2 = sequence.pad_sequences(tk.texts_to_sequences(question2_texts), maxlen=max_len)

# word_index maps the words in our dataset to their integer ids
word_index = tk.word_index

In [4]:
# Target variable: the values of the is_duplicate column
y = df['is_duplicate'].values

In [5]:
# Hold out 10% of the question pairs as a test set; the fixed random_state
# keeps the split the same on every run.
(x1_train, x1_test,
 x2_train, x2_test,
 y_train, y_test) = train_test_split(x1, x2, y, test_size=0.1, random_state=2017)

In [6]:
# Load the pretrained weights for the words in our dataset.
# This file was created during preprocessing.
embedding_matrix = np.loadtxt('embeddings.txt')

# The Embedding layer built below is sized (len(word_index) + 1, 300) and is
# initialised from this matrix, so fail early with a readable message if the
# file does not line up with the tokenizer vocabulary.
expected_shape = (len(word_index) + 1, 300)
assert embedding_matrix.shape == expected_shape, (
    'embeddings.txt has shape %s, expected %s' % (embedding_matrix.shape, expected_shape))

In [8]:
# Model variables
n_hidden = 50  # number of units in the shared LSTM
gradient_clipping_norm = 1.25  # passed to the Adadelta optimizer as clipnorm
batch_size = 500  # batch size passed to malstm.fit
n_epoch = 100  # upper bound on training epochs passed to malstm.fit

def exponent_neg_manhattan_distance(left, right):
    ''' Similarity estimate for a pair of LSTM outputs.

    Takes the element-wise absolute difference of `left` and `right`, sums it
    over axis 1 (keeping that axis), and returns exp(-sum).
    '''
    l1_distance = K.sum(K.abs(left - right), axis=1, keepdims=True)
    return K.exp(-l1_distance)

# Two inputs, one per question; each is a padded sequence of max_len word ids.
# max_len (defined with the tokenizer settings) replaces the previously
# hard-coded 30 so the model always agrees with the padding length.
ques1 = Input(shape=(max_len,))
ques2 = Input(shape=(max_len,))

# Frozen (non-trainable) embedding layer initialised from the pretrained
# matrix; the same layer instance is applied to both questions.
embedding_layer = Embedding(len(word_index) + 1,
                            300,
                            weights=[embedding_matrix],
                            input_length=max_len,
                            trainable=False)

# Embedded version of the inputs
encoded_left = embedding_layer(ques1)
encoded_right = embedding_layer(ques2)

# Since this is a siamese network, both sides share the same LSTM
shared_lstm = LSTM(n_hidden)

left_output = shared_lstm(encoded_left)
right_output = shared_lstm(encoded_right)

# Calculates the distance as defined by the MaLSTM model.
# output_shape reports (batch dimension of the first input, 1).
malstm_distance = Lambda(
    lambda x: exponent_neg_manhattan_distance(x[0], x[1]),
    output_shape=lambda x: (x[0][0], 1)
)([left_output, right_output])

# Pack it all up into a model
malstm = Model([ques1, ques2], [malstm_distance])

# Adadelta optimizer, with gradient clipping by norm
optimizer = Adadelta(clipnorm=gradient_clipping_norm)

# Save to benchmark.h5 only when val_acc improves on the best value so far.
checkpoint = ModelCheckpoint('benchmark.h5', monitor='val_acc', save_best_only=True, verbose=2)

# Stop training once val_loss has not improved by at least 0.0001 for 20 epochs.
# Note that this watches val_loss while the checkpoint above watches val_acc.
estop = EarlyStopping(monitor='val_loss', min_delta=0.0001, patience=20, verbose=0, mode='auto')

malstm.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['accuracy'])
malstm.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_3 (InputLayer)             (None, 30)            0                                            
____________________________________________________________________________________________________
input_4 (InputLayer)             (None, 30)            0                                            
____________________________________________________________________________________________________
embedding_2 (Embedding)          (None, 30, 300)       26510400    input_3[0][0]                    
                                                                   input_4[0][0]                    
____________________________________________________________________________________________________
lstm_1 (LSTM)                    (None, 50)            70200       embedding_2[0][0]       

In [None]:
# Train the siamese model. 10% of the training arrays is held out for
# validation; the checkpoint and early-stopping callbacks run every epoch.
malstm_trained = malstm.fit(
    [x1_train, x2_train],
    y_train,
    batch_size=batch_size,
    epochs=n_epoch,
    validation_split=0.1,
    shuffle=True,
    callbacks=[checkpoint, estop],
    verbose=1)

Train on 327474 samples, validate on 36387 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100


Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100

In [9]:
# Restore the weights written to benchmark.h5 by the checkpoint callback
malstm.load_weights('benchmark.h5')

# predict() returns one similarity score per question pair, not a class label
preds = malstm.predict([x1_train, x2_train])

# f1_score needs hard class labels (0/1), so threshold the scores at 0.5.
# The comparison is vectorised over the whole array instead of looping over
# every prediction in Python, and ravel() yields a 1-D label vector that
# matches the shape of y_train.
predicted_classes = (preds >= 0.500).astype(int).ravel()

# using sklearn f1_score function to find the score
print('F1 Score on train_set: '+ str(f1_score(y_train,predicted_classes)))

F1 Score on train_set: 0.767615470929


In [11]:
# Restore the weights written to benchmark.h5 by the checkpoint callback
malstm.load_weights('benchmark.h5')

# predict() returns one similarity score per question pair, not a class label
preds = malstm.predict([x1_test, x2_test])

# f1_score needs hard class labels (0/1), so threshold the scores at 0.5.
# The comparison is vectorised over the whole array instead of looping over
# every prediction in Python, and ravel() yields a 1-D label vector that
# matches the shape of y_test.
predicted_classes = (preds >= 0.500).astype(int).ravel()

# using sklearn f1_score function to find the score
print('F1 Score on test_set: '+ str(f1_score(y_test,predicted_classes)))

F1 Score on test_set: 0.749760700123
