In [1]:
import pandas as pd
import numpy as np

from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.layers import Dense,LSTM,subtract,Input,SimpleRNN, Embedding, Conv1D, MaxPooling1D, Flatten, Dropout
from keras.models import Model, load_model
from keras import regularizers

from sklearn.model_selection import train_test_split

import sys
sys.path.insert(0, '../common/')
import csv_utils

import os

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
path = os.path.join('..','data','train_data_v2.csv')
train_data = pd.read_csv(path)

In [3]:
#train_data = train_data.loc[:10000,:]

In [4]:
train_data.head()

Unnamed: 0,id,question1,question2,is_duplicate
0,0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [5]:
MAX_LENGTH = 100 # Quora only allows 150 characters
'''
input:
    - train: raw text of training
    - test: raw text of testing
ouput:
    - train: processed training
    - test: processed testing
    - vocab: Number of vocaboluary
'''
def prep_data(train, test):
    tokenizer = Tokenizer()
    # Training the tokenizer with the words from all questions from training
    tokenizer.fit_on_texts(train[0] + train[1])

    # Convert each word to a integer according to the tokenizer
    q1_train = tokenizer.texts_to_sequences(train[0])
    q2_train = tokenizer.texts_to_sequences(train[1])
    q1_test = tokenizer.texts_to_sequences(test[0])
    q2_test = tokenizer.texts_to_sequences(test[1])

    # Add a left pad to make all the question have the same length
    q1_train = pad_sequences(q1_train, maxlen=MAX_LENGTH)
    q2_train = pad_sequences(q2_train, maxlen=MAX_LENGTH)
    q1_test = pad_sequences(q1_test, maxlen=MAX_LENGTH)
    q2_test = pad_sequences(q2_test, maxlen=MAX_LENGTH)

    train = q1_train, q2_train, train[2]
    # In case that the testing comes with labels, we need to return the labels as well
    if len(test) == 3:
        test = q1_test, q2_test, test[2]
    else:
        test = q1_test, q2_test
    
    return train, test, len(tokenizer.word_index) + 1

In [82]:
data = train_data[['question1', 'question2', 'is_duplicate']].values
train, test = train_test_split(data, test_size=0.33, random_state=42)
train = train.T
test = test.T

In [83]:
train, test, vocab = prep_data(train, test)

In [6]:
EMBEDDING_DIM = 100
DROPOUT_RATE = 0.5
LAMBDA_REGULARIZER = 0.03

def get_model(vocab):
    # Network design
    #
    # Q1 -> Embedding -> Conv1 -> MaxPooling -> Flatten \
    #                                                    -> Concatenate -> FC layers -> output
    # Q2 -> Embedding -> Conv1 -> MaxPooling -> Flatten /
    #
    input_q1 = Input(shape=(MAX_LENGTH,), dtype='int32')
    input_q2 = Input(shape=(MAX_LENGTH,), dtype='int32')

    embd_q1 = Embedding(vocab, EMBEDDING_DIM, input_length=MAX_LENGTH)(input_q1)
    embd_q2 = Embedding(vocab, EMBEDDING_DIM, input_length=MAX_LENGTH)(input_q2)
    
    drop_q1 = Dropout(DROPOUT_RATE)(embd_q1)
    drop_q2 = Dropout(DROPOUT_RATE)(embd_q2)
    
    conv_q1 = Conv1D(256, 3, activation='relu', padding='same')(drop_q1)
    conv_q2 = Conv1D(256, 3, activation='relu', padding='same')(drop_q2)
    
    for i in range(3):
        conv_q1 = Conv1D(256, 3, activation='relu', padding='same')(conv_q1)
        conv_q2 = Conv1D(256, 3, activation='relu', padding='same')(conv_q2)
        conv_q1 = MaxPooling1D(2)(conv_q1)
        conv_q2 = MaxPooling1D(2)(conv_q2)

    flatten_q1 = Flatten()(conv_q1)
    flatten_q2 = Flatten()(conv_q2)
    
    concat = concatenate([flatten_q1,flatten_q2])
    concat = Dropout(DROPOUT_RATE)(concat)
    
    for i in range(3):
        concat = Dense(units=16, activation='relu')(concat)

    output = Dense(1, activation='sigmoid', kernel_regularizer=regularizers.l2(LAMBDA_REGULARIZER))(concat)

    model = Model(inputs=[input_q1, input_q2], outputs=output)
    model.compile(optimizer='adadelta',
            loss='binary_crossentropy',
            metrics=['binary_accuracy'])
    return model

In [None]:
model = get_model(vocab)
model.fit([train[0], train[1]],
          train[2],
          validation_data = ([test[0], test[1]],test[2]),
          batch_size=256,
          epochs=20)

In [128]:
#model.save('embedding_NN_model.h5')
#model = load_model('embedding_NN_model.h5')

In [None]:
model.evaluate(x=[test[0], test[1]], y=test[2], batch_size=256) # 

In [15]:
predicted = model.predict([test[0], test[1]], batch_size=256)

In [16]:
predicted = predicted.ravel()
predicted = list(map(lambda x: 1 if x > 0.5 else 0, predicted))

## Playing with the real test now

In [7]:
path = os.path.join('..','data','test_data.csv')
test_data = pd.read_csv(path)

In [8]:
train = train_data[['question1', 'question2', 'is_duplicate']].values
test = test_data[['question1', 'question2']].values
train = train.T
test = test.T

In [9]:
train, test, vocab = prep_data(train, test)

In [10]:
model = load_model('embedding_NN_model_v2.h5') # Model trained in Colab Google
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 100)          0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 100)          0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 100, 100)     8682800     input_3[0][0]                    
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 100, 100)     8682800     input_4[0][0]                    
__________________________________________________________________________________________________
dropout_4 

In [11]:
predicted = model.predict([test[0], test[1]], batch_size=256)

In [12]:
predicted = predicted.ravel()
predicted = list(map(lambda x: 1 if x > 0.5 else 0, predicted))

In [13]:
csv_utils.create_csvs(predicted, test_data.test_id.values)

saved in:  /home/zenbook/Work/github/quora_npl/models/../data/submissions/submission_0230PM-November-25-2018.csv


### Score in Kaggle: 0.78318