# reference: https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html

In [1]:
import pandas as pd
import numpy as np

In [2]:
from __future__ import print_function

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.data_utils import get_file

Using TensorFlow backend.


In [37]:
# Read the labelled training pairs and the unlabelled test pairs from CSV.
data, test_set = (pd.read_csv(csv_path) for csv_path in ('ttrain.csv', 'testt.csv'))

In [38]:
# Preview the first rows of the training frame to sanity-check the loaded columns.
data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [39]:
# Coerce both question columns to plain `str` in the training and test frames.
# Original note: under Python 2 this is enough for the tokenizer; no explicit
# unicode / 'utf-8' handling is needed.
for frame in (data, test_set):
    for column in ('question1', 'question2'):
        frame[column] = frame[column].apply(str)

In [40]:
# Label column: the `is_duplicate` value of each training pair.
target=data['is_duplicate']

In [41]:
# Pull the question columns out as plain Python lists for the tokenizer.
question1 = data['question1'].tolist()
question2 = data['question2'].tolist()

test1 = test_set['question1'].tolist()
test2 = test_set['question2'].tolist()

In [42]:
# Report the number of training pairs, then the number of test pairs (one per line).
print(len(question1), len(test1), sep='\n')

343635
60643


# Tokenize the corpus, then transform each sentence into a sequence of integers corresponding to the tokenizer's word index

In [None]:
# Fit a single tokenizer on every question from both the training and test
# sets, so all four text lists are encoded against one shared word index.
# num_words=200000 is the vocabulary cap handed to the Keras Tokenizer.
tokenizer = Tokenizer(num_words=200000)
tokenizer.fit_on_texts(question1+question2+test1+test2)

In [None]:
# Encode each training question as a list of word-index integers.
question1_word_sequences, question2_word_sequences = (
    tokenizer.texts_to_sequences(texts) for texts in (question1, question2)
)

# Mapping word -> integer index built from the fitted corpus (training and test sets).
word_index = tokenizer.word_index

print("Words in index: %d" % len(word_index))

In [None]:
# Encode the test questions with the same fitted tokenizer.
test1_word_sequences, test2_word_sequences = (
    tokenizer.texts_to_sequences(texts) for texts in (test1, test2)
)


In [None]:
# Pad (or truncate) every encoded question to a fixed length of 25 tokens.
q1_data, q2_data, test1_data, test2_data = (
    pad_sequences(sequences, maxlen=25)
    for sequences in (
        question1_word_sequences,
        question2_word_sequences,
        test1_word_sequences,
        test2_word_sequences,
    )
)

# Integer label vector aligned with the training pairs.
labels = np.array(target, dtype=int)

print('Shape of question1 data tensor:', q1_data.shape)
print('Shape of question2 data tensor:', q2_data.shape)
print('Shape of label tensor:', labels.shape)

# Word embedding dictionary

In [None]:
# Build a {token: float32 vector} lookup from the pre-trained GloVe text file.
embeddings_index = {}
# `with` closes the file even if a line fails to parse.
with open('glove.840B.300d.txt') as f:
    for line in f:
        # Split from the right so exactly the trailing 300 fields are treated
        # as the vector. A few tokens in the 840B file contain spaces, and a
        # plain split() would put part of the token into the numeric fields,
        # making the float conversion fail.
        values = line.rstrip().rsplit(' ', 300)
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

In [None]:
# One 300-d row per word index; row 0 is left as zeros because word_index
# values are offset by one in the matrix size (len(word_index) + 1 rows).
vocab_rows = len(word_index) + 1
embedding_matrix = np.zeros((vocab_rows, 300))
for token, row in word_index.items():
    vector = embeddings_index.get(token)
    if vector is None:
        # Words missing from the embedding index keep their all-zero row.
        continue
    embedding_matrix[row] = vector

In [None]:
# Display the (rows, columns) shape of the embedding matrix built above.
embedding_matrix.shape

# Save as NumPy arrays

In [None]:
# Persist the padded inputs, labels and embedding matrix so the modelling
# half of the notebook can start from disk.
# Passing the path (rather than an open() handle that is never closed) lets
# NumPy open and close each file itself; the names already end in '.npy',
# so the resulting file names are unchanged.
np.save('q1_train.npy', q1_data)
np.save('q2_train.npy', q2_data)
np.save('test1.npy', test1_data)
np.save('test2.npy', test2_data)

np.save('label_train.npy', labels)
np.save('word_embedding_matrix.npy', embedding_matrix)

In [None]:
import numpy as np
import pandas as pd
import keras
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation,GlobalAveragePooling1D,Lambda,Bidirectional
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam, RMSprop
from keras import backend as B

In [None]:
# Reload the preprocessed training tensors, labels and embedding matrix.
# Loading by path avoids leaking the file handles that open(...) left unclosed.
q1_data = np.load('q1_train.npy')
q2_data = np.load('q2_train.npy')

labels = np.load('label_train.npy')
embedding_matrix = np.load('word_embedding_matrix.npy')


In [None]:

# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed
# in 0.20; prefer `model_selection` and fall back only on old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Stack the two question tensors along a new axis 1 so each pair is split
# together: X[:, 0] is question 1 and X[:, 1] is question 2.
X = np.stack((q1_data, q2_data), axis=1)
target = labels

# 75/25 split, stratified on the label, with a fixed seed for repeatability.
X_train, X_val, y_train, y_val = train_test_split(X, target, test_size=0.25, random_state=126, stratify=target)
Q1_train = X_train[:,0]
Q2_train = X_train[:,1]
Q1_val = X_val[:,0]
Q2_val = X_val[:,1]

In [None]:
def vec_distance(vects):
    """Return the squared Euclidean distance between a pair of tensors.

    `vects` is a two-element sequence; the element-wise difference is squared
    and summed along axis 1 with `keepdims=True`.
    """
    left, right = vects
    difference = left - right
    return B.sum(B.square(difference), axis=1, keepdims=True)
# Don't take the square root of the sum; it doesn't give a good range to feed to the dense layer.

In [None]:
def vec_output_shape(shapes):
    """Output shape for the distance Lambda: (batch dimension of the first input, 1)."""
    first_shape, _second_shape = shapes
    batch_dim = first_shape[0]
    return (batch_dim, 1)


In [None]:
from keras.layers.embeddings import Embedding

# Take the vocabulary size and vector width from the loaded matrix instead of
# hard-coding 137077+1, so the layer always matches the weights it is given.
nb_words, embedding_dim = embedding_matrix.shape
max_sentence_len=25
embedding_layer = Embedding(nb_words, embedding_dim,
        weights=[embedding_matrix],
        input_length=max_sentence_len,
        # Keep the pre-trained vectors frozen: don't train this layer.
        trainable=False)

In [None]:
# Siamese encoder: the same LSTM instance (and the same embedding layer) is
# applied to both questions, so the two branches share weights.
lstm_layer = LSTM(128)

sequence_1_input = Input(shape=(max_sentence_len,), dtype='int32')
embedded_sequences_1 = embedding_layer(sequence_1_input)
x1 = lstm_layer(embedded_sequences_1)

sequence_2_input = Input(shape=(max_sentence_len,), dtype='int32')
embedded_sequences_2 = embedding_layer(sequence_2_input)
y1 = lstm_layer(embedded_sequences_2)

# Collapse the two encodings into one squared-distance feature per pair.
distance = Lambda(vec_distance, output_shape=vec_output_shape)([x1, y1])
dense1 = Dense(16, activation='sigmoid')(distance)
dense1 = Dropout(0.3)(dense1)

bn2 = BatchNormalization()(dense1)
prediction = Dense(1, activation='sigmoid')(bn2)

# Keras 2 keyword names are `inputs` / `outputs`; the singular `input` /
# `output` spellings are the deprecated Keras 1 form.
model = Model(inputs=[sequence_1_input, sequence_2_input], outputs=prediction)

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# Optimizer notes:
# - According to Keras, RMSprop (adaptive learning rate) is a good choice for
#   recurrent networks. It divides the learning rate by an exponentially
#   decaying average of squared gradients.
# - Adam also computes adaptive learning rates for each parameter. In addition
#   to storing an exponentially decaying average of past squared gradients
#   (v_t), like Adadelta and RMSprop, it keeps an exponentially decaying
#   average of past gradients (m_t).
# - Adam is computationally efficient, has small memory requirements, is
#   invariant to diagonal rescaling of the gradients, and is well suited to
#   problems that are large in terms of data and/or parameters.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc'],
)

In [None]:

# Callback that monitors validation loss with a patience of 3 epochs.
early_stopping =EarlyStopping(monitor='val_loss', patience=3)


In [None]:
#optional: try calculating class weights
#source: stack exchange, J.Guillaumin

import math

# labels_dict : {ind_label: count_label}
# mu : parameter to tune 

def create_class_weight(labels_dict,mu=2):
    """Compute a log-scaled weight for each class from its sample count.

    Parameters
    ----------
    labels_dict : dict
        Mapping {class label: number of samples with that label}.
    mu : number, optional
        Tuning factor applied inside the logarithm (default 2).

    Returns
    -------
    dict
        Mapping {class label: weight}, where weight is
        log(mu * total / count) floored at 1.0.
    """
    # Use the built-in sum: it handles both the list returned by
    # dict.values() on Python 2 and the view returned on Python 3, whereas
    # np.sum does not reduce a dict view to a number.
    total = float(sum(labels_dict.values()))
    class_weight = dict()

    for key, count in labels_dict.items():
        score = math.log(mu * total / float(count))
        # Never weight a class below 1.0.
        class_weight[key] = score if score > 1.0 else 1.0

    return class_weight


In [None]:
# Optional: derive per-class weights because the labels (0, 1) are imbalanced.
class_labels, class_counts = np.unique(target, return_counts=True)
labels_dict = dict(zip(class_labels, class_counts))

target_weight = create_class_weight(labels_dict)
target_weight

In [None]:
# Train on the question pairs and validate on the held-out split.
# `epochs` is the Keras 2 name for the deprecated Keras 1 `nb_epoch` argument.
# class_weight=None: the optional class weights computed earlier are not applied.
hist = model.fit(
    [Q1_train, Q2_train], y_train,
    validation_data=([Q1_val, Q2_val], y_val),
    verbose=1,
    epochs=1,
    batch_size=256,
    shuffle=True,
    class_weight=None,
    callbacks=[early_stopping],
)
# Takes a long time to initiate.
# Uses a Dense() layer with sigmoid activation.

In [None]:
from keras.models import model_from_json

In [None]:
# Persist the model as two sibling files: the architecture as JSON and the
# weights as HDF5.
with open("brnn_model_distance_128_d16_d05.json", "w") as json_file:
    json_file.write(model.to_json())
model.save_weights("brnn_model_distance_128_d16_d05.h5")
print("Saved model to disk")

In [None]:
# Rebuild the model from its JSON architecture file.
# `with` closes the file even if reading fails, matching the style used when
# the JSON was written.
with open('brnn_model_distance_128_d16_d05.json', 'r') as json_file:
    loaded_model_json = json_file.read()
model = model_from_json(loaded_model_json)
# Load the saved weights into the rebuilt model.
model.load_weights("brnn_model_distance_128_d16_d05.h5")
print("Loaded model from disk")
 

In [None]:
# Reload the padded test tensors. Loading by path avoids leaking the file
# handles that open(...) left unclosed.
test1_data = np.load('test1.npy')
test2_data = np.load('test2.npy')

In [None]:
# Run the model on the test question pairs.
pred=model.predict([test1_data, test2_data],verbose=1)


In [None]:
# Load the submission template and preview the clipped predictions.
submission=pd.read_csv('sample.csv')
# print() call form: the notebook enables `print_function` from __future__,
# so the Python 2 print statement is a syntax error here.
print(pred.clip(1e-5, 0.99999))


In [None]:
# Clip predictions into [1e-5, 0.99999], store them in the template and write the CSV.
clipped_pred = np.clip(pred, 1e-5, 0.99999)
submission['is_duplicate'] = clipped_pred
submission.to_csv('lstm_submission(13).csv', index=False)