In [26]:
import pandas as pd
import numpy as np

from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.layers import Dense,LSTM,concatenate,Input,SimpleRNN, Embedding, Conv1D, MaxPooling1D, Flatten
from keras.models import Model, load_model

from sklearn.model_selection import train_test_split

import sys
sys.path.insert(0, '../common/')
import csv_utils

import os

In [27]:
# The labelled training pairs are read from the ``data`` directory one level
# above the notebook's working directory.
train_csv_parts = ('..', 'data', 'train_data_v2.csv')
path = os.path.join(*train_csv_parts)
train_data = pd.read_csv(path)

In [28]:
# Preview the first rows to sanity-check the columns that were loaded.
train_data.head()

Unnamed: 0,id,question1,question2,is_duplicate
0,0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [35]:
MAX_LENGTH = 150  # default number of token ids every question is padded/truncated to


def prep_data(train, test, max_length=MAX_LENGTH):
    """Tokenise and pad the question pairs of a train/test split.

    A fresh ``Tokenizer`` is fitted on the *training* questions only and is
    then used to encode the questions of both splits.

    Parameters
    ----------
    train : indexable
        ``train[0]`` and ``train[1]`` are the question1 / question2 texts and
        ``train[2]`` the labels, which are passed through untouched.
    test : indexable
        ``test[0]`` and ``test[1]`` are the question texts. When ``test`` has
        exactly three entries, ``test[2]`` is passed through as labels.
    max_length : int, optional
        Length every encoded question is padded/truncated to. Defaults to
        ``MAX_LENGTH``.

    Returns
    -------
    tuple
        ``(train, test, vocab_size)`` where ``train`` is
        ``(q1_ids, q2_ids, labels)``, ``test`` is ``(q1_ids, q2_ids[, labels])``
        and ``vocab_size`` is the number of distinct words seen while fitting
        plus one (index 0 is never assigned to a word by the tokenizer).
    """
    tokenizer = Tokenizer()
    # list() forces *list* concatenation. With numpy arrays (or pandas Series)
    # ``train[0] + train[1]`` is an elementwise string concatenation, which
    # fits the tokenizer on fused "q1q2" strings and can glue the last word of
    # question1 to the first word of question2.
    tokenizer.fit_on_texts(list(train[0]) + list(train[1]))

    def _encode(texts):
        # Map words to integer ids, then pad/truncate to a fixed length.
        return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=max_length)

    q1_train, q2_train = _encode(train[0]), _encode(train[1])
    q1_test, q2_test = _encode(test[0]), _encode(test[1])

    train = q1_train, q2_train, train[2]
    if len(test) == 3:
        # Labelled hold-out split: keep the labels next to the encoded questions.
        test = q1_test, q2_test, test[2]
    else:
        # Unlabelled test set: only the two encoded question arrays.
        test = q1_test, q2_test

    return train, test, len(tokenizer.word_index) + 1

In [36]:
# Hold out 33% of the labelled pairs (fixed seed for reproducibility), then
# transpose each part so that index 0 is question1, index 1 is question2 and
# index 2 is is_duplicate.
data = train_data[['question1', 'question2', 'is_duplicate']].values
train, test = train_test_split(data, test_size=0.33, random_state=42)
train, test = train.T, test.T

In [37]:
# Encode both splits; `vocab` is later used as the Embedding input size.
# NOTE: this rebinds `train`/`test` to the encoded tuples, so re-running this
# cell without re-running the split cell would feed padded id arrays back into
# prep_data instead of raw question texts.
train, test, vocab = prep_data(train, test)

In [47]:
# Two-branch 1-D CNN: each question is embedded and convolved by its own
# (non-shared) stack of layers; the two flattened feature vectors are then
# concatenated and classified by a small dense head.
input_q1 = Input(shape=(MAX_LENGTH,), dtype='float32')
input_q2 = Input(shape=(MAX_LENGTH,), dtype='float32')

# One 200-dimensional embedding table per question.
embd_q1 = Embedding(vocab, 200, input_length=MAX_LENGTH)(input_q1)
embd_q2 = Embedding(vocab, 200, input_length=MAX_LENGTH)(input_q2)

# Three Conv1D(128, 5) -> MaxPooling1D(2) stages per branch. Within a stage
# the layers are created in the order q1-conv, q2-conv, q1-pool, q2-pool.
conv_q1, conv_q2 = embd_q1, embd_q2
for _stage in range(3):
    conv_q1 = Conv1D(128, 5, activation='relu')(conv_q1)
    conv_q2 = Conv1D(128, 5, activation='relu')(conv_q2)
    conv_q1 = MaxPooling1D(2)(conv_q1)
    conv_q2 = MaxPooling1D(2)(conv_q2)

flatten_q1 = Flatten()(conv_q1)
flatten_q2 = Flatten()(conv_q2)

# Merge both branches and apply two 64-unit ReLU layers.
concat = concatenate([flatten_q1, flatten_q2], axis=-1)
concat = Dense(64, activation='relu')(concat)
concat = Dense(64, activation='relu')(concat)

# Single sigmoid unit: score for the pair being a duplicate.
output = Dense(1, activation='sigmoid')(concat)

model = Model(inputs=[input_q1, input_q2], outputs=output)
model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['binary_accuracy'],
)

In [None]:
# Train on the encoded training split: the two question arrays are the inputs
# and the is_duplicate column is the target.
model.fit(x=[train[0], train[1]], y=train[2], batch_size=32, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 4/5
 25056/216517 [==>...........................] - ETA: 1:35:41 - loss: 0.5641 - binary_accuracy: 0.7607

In [51]:
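# Optional checkpointing (disabled): uncomment the first line to save the
# trained model, or the second to reload a previously saved one.
#model.save('embedding_NN_model.h5')
#model = load_model('embedding_NN_model.h5')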

In [13]:
# Show the labels of the values that model.evaluate() returns, in order.
model.metrics_names

['loss', 'binary_accuracy']

In [50]:
# Score the model on the held-out split; the result is a list ordered like
# model.metrics_names.
model.evaluate(x=[test[0], test[1]], y=test[2], batch_size=32)



[0.6867834903568718, 0.7317311028434725]

In [15]:
# Raw sigmoid scores for every question pair of the held-out split.
predicted = model.predict(x=[test[0], test[1]], batch_size=32)

In [16]:
# Turn the model scores into hard 0/1 labels (1 only when the score is strictly
# above 0.5). np.asarray makes the cell safe to re-run: after the first run
# `predicted` is a plain list, which has no .ravel(); converting first keeps
# the result unchanged (1 -> 1, 0 -> 0) instead of raising AttributeError.
predicted = [1 if score > 0.5 else 0 for score in np.asarray(predicted).ravel()]

## Retraining on the full training set and predicting on the real test set

In [17]:
# The unlabelled test pairs are read from the same ``data`` directory, one
# level above the notebook's working directory.
test_csv_parts = ('..', 'data', 'test_data.csv')
path = os.path.join(*test_csv_parts)
test_data = pd.read_csv(path)

In [19]:
# Column-major arrays: train[0]/train[1] are the question texts and train[2]
# the labels; the test set only has the two question columns.
train = train_data[['question1', 'question2', 'is_duplicate']].values.T
test = test_data[['question1', 'question2']].values.T

In [20]:
# prep_data returns (train, test, vocab_size); unpacking into only two names
# raises "ValueError: too many values to unpack".
# NOTE(review): prep_data fits a brand-new Tokenizer on the full training set,
# so the word -> id mapping and vocabulary size differ from the ones the
# existing `model` was built with (its Embedding layers use `vocab`). Rebuild
# the model with `full_vocab` before fitting if ids can exceed `vocab - 1`.
train, test, full_vocab = prep_data(train, test)

In [21]:
# Fit on the complete labelled set. `model` is not rebuilt between the two
# fit calls, so this continues training the already-fitted network.
# NOTE(review): the ids fed here come from a Tokenizer fitted on the full
# training set, not the one used for the first fit -- confirm that is intended.
model.fit(x=[train[0], train[1]], y=train[2], batch_size=32, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f09820aee10>

In [22]:
# Raw sigmoid scores for every question pair of the real test set.
predicted = model.predict(x=[test[0], test[1]], batch_size=32)

In [23]:
# Turn the model scores into hard 0/1 labels (1 only when the score is strictly
# above 0.5). np.asarray makes the cell safe to re-run: after the first run
# `predicted` is a plain list, which has no .ravel(); converting first keeps
# the result unchanged (1 -> 1, 0 -> 0) instead of raising AttributeError.
predicted = [1 if score > 0.5 else 0 for score in np.asarray(predicted).ravel()]

In [24]:
# Hand the 0/1 predictions and the matching test ids to the project helper.
# NOTE(review): create_csvs lives in ../common/csv_utils and is not visible
# here; judging by the cell output it writes a timestamped submission CSV
# under data/submissions -- confirm against the helper.
csv_utils.create_csvs(predicted, test_data.test_id.values)

saved in:  /home/zenbook/Work/github/quora_npl/models/../data/submissions/submission_0843PM-November-23-2018.csv
