In [1]:
import pandas as pd
import numpy as np
import io

from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.layers import Dense,LSTM,concatenate,Input,SimpleRNN, Embedding, Conv1D, MaxPooling1D, Flatten, Dropout, GlobalMaxPooling1D, BatchNormalization, PReLU
from keras.models import Model, load_model
from keras import regularizers
from keras.utils import plot_model

from sklearn.model_selection import train_test_split
#import pydot

import sys
sys.path.insert(0, '../common/')
import csv_utils

import os

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Load the labelled training pairs from ../data (path is relative to this notebook)
path = os.path.join('..','data','train_data_v2.csv')
train_data = pd.read_csv(path)

In [3]:
#train_data = train_data.loc[:10000,:]

In [4]:
# Preview the first rows to check which columns were loaded
train_data.head()

Unnamed: 0,id,question1,question2,is_duplicate
0,0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [5]:
# Keep only the two question columns and the label, as a plain NumPy array
data = train_data.loc[:, ['question1', 'question2', 'is_duplicate']].values

# Hold out 33% of the rows (fixed seed), then transpose each split so that
# index 0 is question1, index 1 is question2 and index 2 is the label column
train, test = (
    split.T
    for split in train_test_split(data, test_size=0.33, random_state=42)
)

In [6]:
# Number of token ids kept per question after padding (this is a token count;
# the original note here was "Quora only allows 150 characters")
MAX_LENGTH = 100

def prep_data(train, test):
    '''
    Tokenize and pad the question pairs of the training and testing sets.

    input:
        - train: raw training data; train[0] and train[1] are the question
                 texts, train[2] the labels
        - test: raw testing data; test[0] and test[1] are the question texts
                and, optionally, test[2] the labels
    output:
        - train: (padded question1, padded question2, labels)
        - test: (padded question1, padded question2), plus the labels as a
                third element when the input had three rows
        - tokenizer: the Tokenizer fitted on the training questions
    '''
    tokenizer = Tokenizer()
    # The vocabulary is learned from the training questions only (both columns)
    tokenizer.fit_on_texts(np.concatenate((train[0], train[1]), axis=0))

    def encode(texts):
        # Words -> integer ids according to the tokenizer, then pad every
        # question to MAX_LENGTH (pad_sequences defaults, i.e. a left pad)
        return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=MAX_LENGTH)

    train_out = (encode(train[0]), encode(train[1]), train[2])

    test_out = (encode(test[0]), encode(test[1]))
    # In case the testing data comes with labels, hand them back as well
    if len(test) == 3:
        test_out = test_out + (test[2],)

    return train_out, test_out, tokenizer

In [7]:
def prep_embd(fname, tokenizer):
    '''
    Build an embedding matrix for the tokenizer's vocabulary from a .vec file.

    The file is read as text: a header line with two integers (number of
    vectors, vector dimension) followed by one line per word of the form
    "word v1 v2 ... vd", separated by single spaces.

    input:
        - fname: path of the embeddings file
        - tokenizer: object exposing word_index (dict: word -> integer id)
    ouput:
        - vocab: number of rows of the matrix, len(word_index) + 1
        - d: embedding dimension read from the header
        - embedding_matrix: array of shape (vocab, d); rows of words that do
                            not appear in the file stay at zero
    '''
    word_index = tokenizer.word_index

    # The context manager closes the file even if parsing fails; the handle
    # used to be left open for the garbage collector to clean up.
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        # Only the dimension is needed; the vector count in the header is unused
        _, d = map(int, fin.readline().split())

        # One extra row: ids come straight from word_index and row 0 is kept
        # (presumably the padding id, which the Keras Tokenizer never assigns)
        vocab = len(word_index) + 1
        embedding_matrix = np.zeros((vocab, d))

        for line in fin:
            tokens = line.rstrip().split(' ')
            row = word_index.get(tokens[0])
            if row is not None:
                embedding_matrix[row] = np.asarray(tokens[1:], dtype='float32')

    return vocab, d, embedding_matrix

In [13]:
# Tokenize and pad both splits; the tokenizer is fitted on the training split.
# NOTE(review): this rebinds `train`/`test` to the processed tuples, so the
# cell is not idempotent -- re-running it without re-running the split cell
# would feed the already-padded id arrays back into the tokenizer.
train, test, tokenizer = prep_data(train, test)

In [17]:
# Build the embedding matrix for the tokenizer's vocabulary from the
# pre-trained vectors file; get_model reads these module-level constants.
path = os.path.join('..','data','wiki-news-300d-1M.vec')
vocab, FASTTEXT_EMBEDDING_DIM, FASTTEXT_EMBEDDING_MATRIX = prep_embd(path, tokenizer)

In [8]:
# Rate passed to every Dropout layer created in get_model
DROPOUT_RATE = 0.2
# Disabled settings, not referenced by the code in this notebook
# (presumably leftovers from earlier experiments):
#LAMBDA_REGULARIZER = 0.03
#EMBEDDING_DIM = 100
#FILTERS = 256
#KERNEL_SIZE = 5

def get_model_non_trainable_embeddings(vocab, input_q):
    '''
    Embed one question with the frozen fastText matrix and flatten the result.

    input:
        - vocab: number of rows of the embedding matrix
        - input_q: input tensor holding MAX_LENGTH token ids per question
    output:
        - the flattened output of the embedding layer

    Every call creates its own Embedding layer, initialised from
    FASTTEXT_EMBEDDING_MATRIX and excluded from training.
    '''
    frozen_embedding = Embedding(
        input_dim=vocab,
        output_dim=FASTTEXT_EMBEDDING_DIM,
        weights=[FASTTEXT_EMBEDDING_MATRIX],
        trainable=False,
        input_length=MAX_LENGTH,
    )
    embedded = frozen_embedding(input_q)
    return Flatten()(embedded)

def get_model(vocab):
    '''
    Build and compile the two-input duplicate-question classifier.

    input:
        - vocab: number of rows of the embedding matrix
    output:
        - compiled Model taking [question1 ids, question2 ids] and producing
          one value per pair

    Both questions are embedded with frozen fastText vectors and flattened,
    the two results are concatenated, and the merged vector goes through
    dropout and three Dense(32) + PReLU + Dropout stages before the output unit.
    '''
    # One integer-id input per question, each MAX_LENGTH tokens long
    input_q1, input_q2 = (
        Input(shape=(MAX_LENGTH,), dtype='int32') for _ in range(2)
    )
    branches = [
        get_model_non_trainable_embeddings(vocab, question)
        for question in (input_q1, input_q2)
    ]

    merged = concatenate(branches)
    hidden = Dropout(DROPOUT_RATE)(merged)

    for _ in range(3):
        hidden = Dense(units=32)(hidden)
        hidden = PReLU()(hidden)
        hidden = Dropout(DROPOUT_RATE)(hidden)

    # NOTE(review): hard_sigmoid is a piecewise-linear approximation of the
    # sigmoid; its flat regions may stall training of saturated examples --
    # consider 'sigmoid' here, to be confirmed by retraining.
    prediction = Dense(1, activation='hard_sigmoid')(hidden)

    model = Model(inputs=[input_q1, input_q2], outputs=prediction)
    model.compile(
        optimizer='adadelta',
        loss='binary_crossentropy',
        metrics=['binary_accuracy'],
    )
    return model

In [None]:
# Build a fresh model and train it on the padded question pairs, using the
# held-out split as validation data.
model = get_model(vocab)
model.fit([train[0], train[1]],
          train[2],
          validation_data = ([test[0], test[1]],test[2]),
          batch_size=1024,
          epochs=100)

In [128]:
#model.save('embedding_NN_model.h5')
#model = load_model('embedding_NN_model.h5')

In [None]:
# Score the model on the held-out split (loss plus the metrics it was compiled with)
model.evaluate(x=[test[0], test[1]], y=test[2], batch_size=256)

In [15]:
# Raw model outputs for the held-out pairs; they are thresholded at 0.5 in the next cell
predicted = model.predict([test[0], test[1]], batch_size=256)

In [16]:
# Flatten the predictions and turn each score into a 0/1 label
# (1 when strictly above 0.5), kept as a plain list of ints
predicted = (predicted.ravel() > 0.5).astype(int).tolist()

## Predicting on the real (Kaggle) test set

In [9]:
# Load the test pairs to predict (no is_duplicate column is read from this file below)
path = os.path.join('..','data','test_data.csv')
test_data = pd.read_csv(path)

In [10]:
# Use the whole labelled set for training this time; transpose so that
# index 0 is question1 and index 1 is question2 (index 2 = labels, train only)
train = train_data.loc[:, ['question1', 'question2', 'is_duplicate']].values.T
test = test_data.loc[:, ['question1', 'question2']].values.T

In [11]:
# Re-fit the tokenizer on the full training set and pad both sets; `test` has
# no labels here, so it comes back as a 2-tuple.
# NOTE(review): rebinds `train`/`test`, so re-running this cell alone will not work.
train, test, tokenizer = prep_data(train, test)

In [12]:
# NOTE(review): the tokenizer was re-fitted in the previous cell; confirm its
# word_index matches the one used when this saved model was trained, otherwise
# token ids will point at the wrong embedding rows.
model = load_model('embedding_NN_model_v3.h5') # model trained in Google Colab
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, 100)          0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            (None, 100)          0                                            
__________________________________________________________________________________________________
embedding_9 (Embedding)         (None, 100, 300)     28678800    input_5[0][0]                    
__________________________________________________________________________________________________
embedding_10 (Embedding)        (None, 100, 300)     28678800    input_6[0][0]                    
__________________________________________________________________________________________________
flatten_7 

In [13]:
# Raw model outputs for the test pairs; they are thresholded at 0.5 in the next cell
predicted = model.predict([test[0], test[1]], batch_size=256)

In [14]:
# Flatten the predictions and turn each score into a 0/1 label
# (1 when strictly above 0.5), kept as a plain list of ints
predicted = (predicted.ravel() > 0.5).astype(int).tolist()

In [15]:
# Write the submission file from the 0/1 labels and the matching test ids
# (csv_utils is the project helper imported from ../common; it reports where it saved)
csv_utils.create_csvs(predicted, test_data.test_id.values)

saved in:  /home/zenbook/Work/github/quora_npl/models/../data/submissions/submission_0934PM-November-27-2018.csv


### Kaggle score: 0.68705