In [None]:
import numpy as np
import pandas as pd
from keras.preprocessing import sequence
from keras.models import Sequential, model_from_json
from keras.layers import Embedding, Dense, Dropout, Merge, BatchNormalization
from keras.layers import TimeDistributed, Lambda, LSTM, Convolution1D, GlobalMaxPooling1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import backend as K

In [None]:
# Load the training pairs; missing values are replaced with empty strings.
train = pd.read_csv("train_Quora.csv").fillna("")
# Fix: the second question has to come from its own column. Both names used to
# be bound to train["question1"], so every training pair compared a question
# with itself.
question1, question2 = train["question1"], train["question2"]

In [None]:
# Fit a single vocabulary over the texts of both question columns.
# Fix: `question1 + question2` on two pandas Series is element-wise string
# concatenation (row i becomes q1[i] + q2[i] with no separator), which glues the
# last word of one question to the first word of the other whenever no filtered
# character sits between them. Build one flat list of texts instead.
questions = list(question1) + list(question2)
token = Tokenizer(num_words=200000)
token.fit_on_texts(questions)

# Encode each question as a sequence of integer word ids.
question1_word_sequences = token.texts_to_sequences(question1)
question2_word_sequences = token.texts_to_sequences(question2)

# word -> integer id mapping learned by the tokenizer.
word_index = token.word_index

In [None]:
# Vocabulary size used for the embedding table: the number of known words,
# capped at 200000.
words_param = min(200000, len(word_index))

# Pad/truncate both sets of id sequences to a fixed length of 20.
q1_data, q2_data = (
    pad_sequences(word_ids, maxlen=20)
    for word_ids in (question1_word_sequences, question2_word_sequences)
)

# Target column as an integer array.
labels = train["is_duplicate"].values.astype(int)

In [None]:
# Targets are the labels array under a shorter name.
y = labels

# Put the two padded question matrices side by side along a new axis 1, then
# slice them back out as the two model inputs.
X = np.stack([q1_data, q2_data], axis=1)
Q1_train, Q2_train = X[:, 0], X[:, 1]

In [None]:
def _build_question_encoder(vocab_size, embedding_dim=250, seq_len=20):
    """Build one question tower: frozen embedding, per-token dense, max over axis 1.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: width of the embedding and of the per-token dense layer.
        seq_len: length of the padded id sequences fed to the tower.

    Returns:
        A ``Sequential`` model whose declared output shape is ``(embedding_dim,)``.
    """
    encoder = Sequential()
    # NOTE(review): the embedding is frozen (trainable=False) but no `weights=`
    # are supplied here, so it appears to keep its initial values for the whole
    # run -- confirm this is intended (e.g. pretrained vectors were meant to be
    # loaded).
    encoder.add(Embedding(vocab_size, embedding_dim, input_length=seq_len, trainable=False))
    encoder.add(TimeDistributed(Dense(embedding_dim, activation="relu")))
    # Collapse axis 1 by taking the element-wise maximum.
    encoder.add(Lambda(lambda x: K.max(x, axis=1), output_shape=(embedding_dim, )))
    return encoder


# Two structurally identical towers, one per question. They are separate
# models, so they do not share weights.
model_td1 = _build_question_encoder(words_param + 1)
model_td2 = _build_question_encoder(words_param + 1)

# Classifier head: concatenate both towers, then two dropout/dense/batch-norm
# stages and a single sigmoid output.
model = Sequential()
model.add(Merge([model_td1, model_td2], mode="concat"))
model.add(BatchNormalization())
for _ in range(2):
    # NOTE(review): a Dropout rate of 0.9 is unusually high -- confirm it is
    # not meant to be a keep-probability.
    model.add(Dropout(0.9))
    model.add(Dense(5000, activation="relu"))
    model.add(BatchNormalization())

model.add(Dense(1, activation="sigmoid"))

In [None]:
# Drop the names of intermediates that are no longer needed. Only the names go
# away: `y` still refers to the labels array, and Q1_train/Q2_train were sliced
# from X, so those objects remain referenced.
del train, question1, question2
del question1_word_sequences, question2_word_sequences
del q1_data, q2_data
del labels, X

In [None]:
# Binary classification setup: cross-entropy loss, RMSprop optimizer, and
# accuracy reported as a metric.
model.compile(
    optimizer="rmsprop",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Train on the two question inputs, holding out 10% of the samples for
# validation.
model.fit(
    [Q1_train, Q2_train],
    y,
    batch_size=100,
    epochs=500,
    verbose=2,
    validation_split=0.1,
)

In [None]:
# Load the test pairs; missing values are replaced with empty strings.
test = pd.read_csv("test_Quora.csv").fillna("")
# Fix: read the second question from its own column. Both names used to be
# bound to test["question1"], so predictions were made on a question paired
# with itself.
question1, question2 = test["question1"], test["question2"]

In [None]:
# Encode the test questions with the tokenizer that was fitted earlier.
question1_word_sequences, question2_word_sequences = (
    token.texts_to_sequences(texts) for texts in (question1, question2)
)

In [None]:
# Pad/truncate the test id sequences to a fixed length of 20.
q1_data, q2_data = (
    pad_sequences(word_ids, maxlen=20)
    for word_ids in (question1_word_sequences, question2_word_sequences)
)

In [None]:
# Stack the two padded test matrices along a new axis 1 and slice them back out
# as the two model inputs.
X = np.stack([q1_data, q2_data], axis=1)
Q1_test, Q2_test = X[:, 0], X[:, 1]

In [None]:
# Start from the sample submission file and overwrite its prediction column.
submission = pd.read_csv("sample_submission_Quora.csv")

# Take column 0 of the model output as a 1-D array of predictions.
duplicate_prob = model.predict([Q1_test, Q2_test], batch_size=100, verbose=1)[:, 0]

# Assign the array directly instead of wrapping it in a DataFrame: a DataFrame
# is aligned on the index and would silently leave NaN on a row-count mismatch,
# whereas a plain array raises if the lengths differ.
submission["is_duplicate"] = duplicate_prob
submission.to_csv("mysub.csv", index=False)