In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.layers import Dense, LSTM, Embedding
from keras.layers import Conv1D, MaxPooling1D, Flatten, GlobalMaxPooling1D, BatchNormalization
from keras.layers import Concatenate, Subtract, Add, Multiply
from keras.layers import Input, Dropout, PReLU, SpatialDropout1D
from keras.models import Model, load_model
from keras.callbacks import EarlyStopping, ModelCheckpoint
import keras.backend as K
from keras import optimizers

import os
import io

Using TensorFlow backend.


In [2]:
# Mount Google Drive at /content/drive so that files stored there can be read
# by later cells. force_remount=True is passed so an existing mount is redone.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [0]:
# Resolve both CSVs from the absolute Drive mount point ('/content/drive')
# instead of a path relative to the current working directory, so the cell
# keeps working if the working directory is not /content.
QUORA_DIR = os.path.join('/content/drive', 'My Drive', 'quora')

# Read the two raw question columns as strings.
QUESTION_DTYPES = {'question1': 'str', 'question2': 'str'}

path = os.path.join(QUORA_DIR, 'train_data_v3_processed (1).csv')
train_data = pd.read_csv(path, dtype=QUESTION_DTYPES)

path = os.path.join(QUORA_DIR, 'test_data_v3_processed.csv')
test_data = pd.read_csv(path, dtype=QUESTION_DTYPES)

In [0]:
# Temporary: remove these feature columns from the test frame before features
# are extracted (presumably they have no counterpart in the training CSV --
# TODO confirm).
# The frame is rebound instead of mutated with inplace=True, and
# errors='ignore' is passed, so re-running this cell no longer raises KeyError
# for columns that were already removed by a previous run.
test_data = test_data.drop(
    columns=['num_stop_words_q1', 'num_stop_words_q2',
             'numerics_q1', 'numerics_q2',
             'uppercase_q1', 'uppercase_q2'],
    errors='ignore',
)

In [5]:
# Preview the first rows of the training frame.
train_data.head()

Unnamed: 0,id,question1,question2,is_duplicate,question1_lemma,question1_tag,question2_lemma,question2_tag,fuzz_ratio,fuzz_partial_ratio,...,len_q1,len_q2,diff_len,len_char_q1,len_char_q2,len_word_q1,len_word_q2,common_words,avg_word_q1,avg_word_q2
0,0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,what be the step by step guide to invest in sh...,WP VBZ DT NN IN NN NN TO VB IN NN NN IN NN .,what be the step by step guide to invest in sh...,WP VBZ DT NN IN NN NN TO VB IN NN NN .,93,98,...,66,57,9,20,20,14,12,12,3.785714,3.833333
1,1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,what be the story of kohinoor ( koh - i - noor...,WP VBZ DT NN IN NNP -LRB- NNP HYPH NNP HYPH NN...,what would happen if the indian government ste...,WP MD VB IN DT JJ NN VBD DT NNP -LRB- NNP HYPH...,68,78,...,51,88,-37,21,29,8,13,11,5.5,5.846154
2,2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,how can -PRON- increase the speed of -PRON- in...,WRB MD PRP VB DT NN IN PRP$ NN NN IN VBG DT NNP .,how can internet speed be increase by hack thr...,WRB MD VB NN VB VBN IN VBG IN NN .,35,44,...,73,59,14,25,24,14,10,6,4.285714,5.0
3,3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,why be -PRON- mentally very lonely ? how can -...,WRB VBP PRP RB RB JJ . WRB MD PRP VB PRP .,find the remainder when [ math]23^{24}[/math ]...,VB DT NN WRB -LRB- NN -RRB- VBZ VBN IN CD .,17,22,...,50,65,-15,19,26,11,9,2,3.636364,6.333333
4,4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,"which one dissolve in water quikly sugar , sal...","WDT CD NN IN NN RB NN , NN , NN CC NN FW NN .",which fish would survive in salt water ?,WDT NN MD VB IN NN NN .,37,55,...,76,39,37,25,18,13,7,5,4.923077,4.714286


In [0]:
!wget http://nlp.stanford.edu/data/glove.840B.300d.zip
!unzip glove.840B.300d.zip -d .

In [0]:
# Positions of each field inside the transposed `train` / `val` / `test`
# arrays (the test array stops at FEATURES; it has no IS_DUPLICATE entry).
(RAW_Q1, RAW_Q2,
 LEMMA_Q1, LEMMA_Q2,
 TAGS_Q1, TAGS_Q2,
 FEATURES,
 IS_DUPLICATE) = range(8)

In [0]:
def process_questions(q1_train, q2_train, q1_val, q2_val, q1_test, q2_test):
    """Turn six collections of question strings into padded integer sequences.

    A case-sensitive Keras ``Tokenizer`` is fitted on the two *training*
    collections only; the validation and test collections are encoded with
    that same vocabulary.

    Returns:
        A 7-tuple ``(q1_train, q2_train, q1_val, q2_val, q1_test, q2_test,
        tokenizer)`` where the first six items are the outputs of
        ``pad_sequences(..., maxlen=MAX_LENGTH)`` and the last is the fitted
        tokenizer.
    """
    # NOTE(review): the tokenizer uses its default `filters`; confirm that is
    # intended for every kind of text passed in (e.g. POS tags like 'PRP$').
    tokenizer = Tokenizer(lower=False)
    # The vocabulary is learned from every training question (q1 and q2).
    training_corpus = np.concatenate((q1_train, q2_train), axis=0)
    tokenizer.fit_on_texts(training_corpus)

    def encode(texts):
        # words -> integer ids, then pad to a common length of MAX_LENGTH
        # (the original comment describes this as padding on the left).
        sequences = tokenizer.texts_to_sequences(texts)
        return pad_sequences(sequences, maxlen=MAX_LENGTH)

    encoded = [encode(texts) for texts in (q1_train, q2_train,
                                           q1_val, q2_val,
                                           q1_test, q2_test)]
    return tuple(encoded) + (tokenizer,)

In [0]:
MAX_LENGTH = 35  # `maxlen` used when padding every tokenized question


def prep_data(train, val, test):
    """Encode the raw, lemma and tag views of each split and bundle model inputs.

    Args:
        train: array indexed by the RAW_Q1 ... IS_DUPLICATE constants (one
            entry per field, each holding that field for every sample).
        val: same layout as ``train``.
        test: same layout, but without the IS_DUPLICATE entry.

    Returns:
        A 6-tuple ``(train, val, test, raw_tokenizer, lemma_vocab, tags_vocab)``:

        - ``train`` / ``val``: 8-tuples ``(raw_q1, raw_q2, lemma_q1, lemma_q2,
          tags_q1, tags_q2, features, labels)``.
        - ``test``: the same without ``labels`` (7 items).
        - ``raw_tokenizer``: tokenizer fitted on the raw training questions.
        - ``lemma_vocab`` / ``tags_vocab``: ``len(word_index) + 1`` of the
          lemma and tag tokenizers.
    """
    def encode_view(q1_field, q2_field):
        # One independent tokenizer per view, fitted on the training split.
        return process_questions(train[q1_field], train[q2_field],
                                 val[q1_field], val[q2_field],
                                 test[q1_field], test[q2_field])

    def feature_matrix(split):
        # Each FEATURES entry is a list; stack them into a 2-D numeric array.
        return np.array(split[FEATURES].tolist())

    def label_vector(split):
        # The split arrays come from a mixed-type `.values` matrix, so the
        # label row is object-dtype; cast it to a plain numeric array before
        # it is handed to Keras.
        return np.asarray(split[IS_DUPLICATE]).astype('float32')

    (raw_q1_train, raw_q2_train,
     raw_q1_val, raw_q2_val,
     raw_q1_test, raw_q2_test,
     raw_tokenizer) = encode_view(RAW_Q1, RAW_Q2)

    (lemma_q1_train, lemma_q2_train,
     lemma_q1_val, lemma_q2_val,
     lemma_q1_test, lemma_q2_test,
     lemma_tokenizer) = encode_view(LEMMA_Q1, LEMMA_Q2)

    (tags_q1_train, tags_q2_train,
     tags_q1_val, tags_q2_val,
     tags_q1_test, tags_q2_test,
     tags_tokenizer) = encode_view(TAGS_Q1, TAGS_Q2)

    train_out = (raw_q1_train, raw_q2_train,
                 lemma_q1_train, lemma_q2_train,
                 tags_q1_train, tags_q2_train,
                 feature_matrix(train), label_vector(train))
    val_out = (raw_q1_val, raw_q2_val,
               lemma_q1_val, lemma_q2_val,
               tags_q1_val, tags_q2_val,
               feature_matrix(val), label_vector(val))
    test_out = (raw_q1_test, raw_q2_test,
                lemma_q1_test, lemma_q2_test,
                tags_q1_test, tags_q2_test,
                feature_matrix(test))

    # +1 because Keras word indices start at 1 (index 0 is the padding value).
    lemma_vocab = len(lemma_tokenizer.word_index) + 1
    tags_vocab = len(tags_tokenizer.word_index) + 1

    return train_out, val_out, test_out, raw_tokenizer, lemma_vocab, tags_vocab

In [0]:
def prep_embd(fname, tokenizer, embedding_dim=300):
    """Build an embedding matrix for the tokenizer's vocabulary from a GloVe file.

    Args:
        fname: path to a GloVe-style text file, one ``word v1 v2 ... vN`` entry
            per line, separated by single spaces.
        tokenizer: object exposing ``word_index`` (word -> positive int index).
        embedding_dim: number of vector components per line (default 300,
            the value that used to be hard-coded).

    Returns:
        ``(vocab, embedding_dim, embedding_matrix)`` where ``vocab`` is
        ``len(tokenizer.word_index) + 1`` and ``embedding_matrix`` has shape
        ``(vocab, embedding_dim)``. Rows of words that do not occur in the file
        (and row 0) stay all-zero.
    """
    word_index = tokenizer.word_index
    vocab = len(word_index) + 1
    embedding_matrix = np.zeros((vocab, embedding_dim))

    # `with` closes the file even on error; the encoding is explicit so the
    # result does not depend on the platform default.
    with open(fname, 'r', encoding='utf-8') as f:
        for line in f:
            # Split from the right: the last `embedding_dim` fields are the
            # vector and everything before them is the token, which may itself
            # contain spaces.
            parts = line.rstrip().rsplit(' ', embedding_dim)
            if len(parts) != embedding_dim + 1:
                # Blank or malformed line: nothing usable.
                continue
            i = word_index.get(parts[0])
            if i is None:
                continue
            embedding_matrix[i] = np.asarray(parts[1:], dtype='float32')

    return vocab, embedding_dim, embedding_matrix

In [0]:
def get_features(df):
    """Return the hand-crafted feature columns of ``df`` as a NumPy array.

    The six text columns are removed first. Then, if the frame has an ``id``
    column, ``id`` and ``is_duplicate`` are removed; otherwise, if it has a
    ``test_id`` column, that one is removed. Whatever remains is returned via
    ``.values``. The input frame is not modified.
    """
    text_columns = ['question1', 'question2',
                    'question1_lemma', 'question2_lemma',
                    'question1_tag', 'question2_tag']
    remaining = df.drop(columns=text_columns)

    if 'id' in remaining.columns:
        # Training layout: identifier plus target column.
        remaining = remaining.drop(columns=['id', 'is_duplicate'])
    elif 'test_id' in remaining.columns:
        # Test layout: identifier only.
        remaining = remaining.drop(columns='test_id')

    return remaining.values

In [0]:
# Build the per-sample feature lists and the transposed train/val/test arrays.
#
# A 'features' column is added to both frames below. Any such column left by a
# previous run of this cell is dropped before calling get_features, otherwise
# a re-run would feed the list-valued column back into the feature matrix.
train_features = get_features(train_data.drop(columns='features', errors='ignore'))
train_data['features'] = train_features.tolist()
data = train_data[['question1', 'question2', 
                   'question1_lemma', 'question2_lemma', 
                   'question1_tag', 'question2_tag', 
                   'features', 'is_duplicate']].values

# Fixed random_state so the 80/20 split is the same on every run.
train, val = train_test_split(data, test_size=0.2, random_state=19)
# Transpose so that indexing with RAW_Q1 ... IS_DUPLICATE selects one field
# for every sample.
train = train.T
val = val.T

test_features = get_features(test_data.drop(columns='features', errors='ignore'))
test_data['features'] = test_features.tolist()
test = test_data[['question1', 'question2', 
                  'question1_lemma', 'question2_lemma', 
                  'question1_tag', 'question2_tag', 
                  'features']].values
test = test.T

In [0]:
# Tokenize and pad every split.
# NOTE: `train`, `val` and `test` are rebound from the raw transposed arrays to
# the processed tuples returned by prep_data, so re-running this cell on its
# own would feed already-encoded data back into prep_data; re-run the cell that
# builds `train` / `val` / `test` first.
train, val, test, raw_tokenizer, LEMMA_VOCAB, TAGS_VOCAB = prep_data(train, val, test)

In [0]:
# GloVe vectors file, looked up relative to the current working directory.
path = 'glove.840B.300d.txt'
RAW_VOCAB, GLOVE_EMBEDDING_DIM, GLOVE_EMBEDDING_MATRIX = prep_embd(fname=path,
                                                                   tokenizer=raw_tokenizer)

In [0]:
# Output size of the trainable Embedding layers (lemma / tag inputs).
EMBEDDING_DIM = 200
# Rate shared by every Dropout layer in the model.
DROPOUT_RATE = 0.2
# NOTE(review): FILTERS and KERNEL_SIZE are not referenced by the model code
# visible in this notebook -- presumably left over from a Conv1D variant.
FILTERS = 16
KERNEL_SIZE = 5

def get_model_non_trainable_embeddings(input_q):
    """Embed a raw-token question with frozen GloVe weights and flatten it.

    Args:
        input_q: Keras input tensor of token ids with length MAX_LENGTH.

    Returns:
        The flattened output of a non-trainable Embedding layer initialised
        from GLOVE_EMBEDDING_MATRIX.
    """
    embd = Embedding(input_dim = RAW_VOCAB,
                     output_dim = GLOVE_EMBEDDING_DIM, 
                     weights = [GLOVE_EMBEDDING_MATRIX],
                     trainable = False,
                     input_length=MAX_LENGTH)(input_q)
    # The previous version also applied an LSTM to `embd` here, but its output
    # was never used (the function returned the flattened embedding), so the
    # layer was not connected to the model output. The dead layer is removed;
    # the returned tensor is unchanged.
    # NOTE(review): if the LSTM output was meant to be returned instead, that
    # is a model change and should be made deliberately.
    flatten = Flatten()(embd)
    return flatten

def get_model_embeddings(input_q, vocab):
    """Embed a token sequence with a trainable embedding, apply dropout, flatten.

    Args:
        input_q: Keras input tensor of token ids with length MAX_LENGTH.
        vocab: vocabulary size used as the Embedding ``input_dim``.

    Returns:
        The flattened, dropout-regularised embedding tensor.
    """
    # A new Embedding layer is created on every call, so the q1 and q2 branches
    # built with this function do not share weights.
    embedding_layer = Embedding(input_dim=vocab,
                                output_dim=EMBEDDING_DIM,
                                input_length=MAX_LENGTH)
    embedded = embedding_layer(input_q)
    regularised = Dropout(DROPOUT_RATE)(embedded)
    return Flatten()(regularised)
  
def get_model_features(features_input):
    """Batch-normalise the feature vector and pass it through four
    Dense(200, relu) layers, each followed by Dropout(DROPOUT_RATE)."""
    hidden = BatchNormalization()(features_input)
    n_hidden_layers = 4
    for _ in range(n_hidden_layers):
        dense_out = Dense(units=200, activation='relu')(hidden)
        hidden = Dropout(DROPOUT_RATE)(dense_out)
    return hidden

def get_model():
    """Build and compile the duplicate-question classifier.

    The model takes seven inputs, in this order: raw q1, raw q2, lemma q1,
    lemma q2, tags q1, tags q2 (each a length-MAX_LENGTH int32 sequence) and a
    float32 feature vector whose width is ``train_features.shape[1]``.

    For each text view the q1 branch is subtracted from the q2 branch; the
    three differences are concatenated with the feature branch, then passed
    through BatchNormalization, Dropout, three Dense(200, relu) + Dropout
    blocks and a final sigmoid unit.

    Returns:
        A Keras ``Model`` compiled with Nadam (lr=0.0005), binary
        cross-entropy loss and the ``binary_accuracy`` metric.
    """
    def token_input():
        return Input(shape=(MAX_LENGTH,), dtype='int32')

    # Inputs, created in the same order they are listed in Model(inputs=...).
    raw_input_q1 = token_input()
    raw_input_q2 = token_input()
    lemma_input_q1 = token_input()
    lemma_input_q2 = token_input()
    tags_input_q1 = token_input()
    tags_input_q2 = token_input()
    features_input = Input(shape=(train_features.shape[1],), dtype='float32')

    # One branch per input.
    raw_q1 = get_model_non_trainable_embeddings(raw_input_q1)
    raw_q2 = get_model_non_trainable_embeddings(raw_input_q2)
    lemma_q1 = get_model_embeddings(lemma_input_q1, LEMMA_VOCAB)
    lemma_q2 = get_model_embeddings(lemma_input_q2, LEMMA_VOCAB)
    tags_q1 = get_model_embeddings(tags_input_q1, TAGS_VOCAB)
    tags_q2 = get_model_embeddings(tags_input_q2, TAGS_VOCAB)
    feature_branch = get_model_features(features_input)

    # Per-view difference (q2 - q1), in raw, lemma, tags order.
    branch_pairs = ((raw_q1, raw_q2), (lemma_q1, lemma_q2), (tags_q1, tags_q2))
    differences = [Subtract()([q2, q1]) for q1, q2 in branch_pairs]

    merged = Concatenate()(differences + [feature_branch])
    merged = BatchNormalization()(merged)
    merged = Dropout(DROPOUT_RATE)(merged)

    for _ in range(3):
        merged = Dense(units=200, activation='relu')(merged)
        merged = Dropout(DROPOUT_RATE)(merged)

    output = Dense(1, activation='sigmoid')(merged)

    all_inputs = [raw_input_q1, raw_input_q2,
                  lemma_input_q1, lemma_input_q2,
                  tags_input_q1, tags_input_q2,
                  features_input]
    model = Model(inputs=all_inputs, outputs=output)

    model.compile(optimizer=optimizers.Nadam(lr=0.0005),
                  loss='binary_crossentropy',
                  metrics=['binary_accuracy'])
    return model

In [0]:
# Build a fresh, compiled (untrained) model.
model = get_model()

In [18]:
BATCH_SIZE = 512

# Stop once val_loss has not improved for 3 epochs. Independently, keep on disk
# the full model (not only the weights) with the best validation accuracy.
early_stopping = EarlyStopping(monitor='val_loss', patience=3)
model_checkpoint = ModelCheckpoint(
    'embedding_NN_model_v7.h5',
    monitor='val_binary_accuracy',
    mode='max',
    save_best_only=True,
    save_weights_only=False,
)

# Every element but the last one (the labels) is a model input.
train_inputs, train_labels = list(train[:-1]), train[IS_DUPLICATE]
val_inputs, val_labels = list(val[:-1]), val[IS_DUPLICATE]

model.fit(x=train_inputs,
          y=train_labels,
          validation_data=(val_inputs, val_labels),
          batch_size=BATCH_SIZE,
          epochs=15,
          callbacks=[early_stopping, model_checkpoint])

Train on 258311 samples, validate on 64578 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15


<keras.callbacks.History at 0x7f6ffa16a5c0>

In [0]:
# Replace the in-memory model with the checkpoint file written during training
# (ModelCheckpoint kept the epoch with the best val_binary_accuracy).
model = load_model('embedding_NN_model_v7.h5')

In [21]:
# Evaluate the reloaded model on the validation split; the displayed list is
# [loss, binary_accuracy], matching the loss/metrics passed to compile().
model.evaluate(x=list(val[:-1]),y=val[IS_DUPLICATE], batch_size=BATCH_SIZE)



[0.456110086419516, 0.8198767381877528]

In [0]:
# Predicted duplicate probabilities for the test split, flattened to 1-D and
# thresholded at 0.5 (strictly greater -> 1, otherwise 0) into a list of ints.
probabilities = model.predict(list(test), batch_size=BATCH_SIZE).ravel()
predicted = (probabilities > 0.5).astype(int).tolist()

In [0]:
import csv
import datetime
from os.path import join,abspath,curdir
import numpy as np
import pandas as pd

def create_csvs(predicted, test_ids, expected_rows=81126):
    """Write a Kaggle submission CSV with columns ``test_id,is_duplicate``.

    The file is created in the current working directory and is named
    ``submission_<time>-<month>-<day>-<year>.csv``; its full path is printed.

    When this function lives in ``./common/csv_utils.py`` it can be used with::

        import sys
        sys.path.insert(0, './common/')
        import csv_utils
        csv_utils.create_csvs(predicted, test_ids)

    Args:
        predicted: one prediction per test row, e.g. ``[0, 1, 0, 0, 1]``.
        test_ids: the matching test ids, e.g. ``[12, 32, 43, 44, 11]``.
        expected_rows: required number of rows. Defaults to 81126, the value
            that used to be hard-coded; pass ``None`` to skip this check.

    Raises:
        AssertionError: if ``test_ids`` does not have ``expected_rows`` entries
            or ``predicted`` and ``test_ids`` differ in length.
    """
    n_ids = len(test_ids)
    # Raised explicitly (instead of `assert`) so the checks still run under
    # `python -O`, while keeping the exception type callers already see.
    if expected_rows is not None and n_ids != expected_rows:
        raise AssertionError(
            'expected %d test ids, got %d' % (expected_rows, n_ids))
    if len(predicted) != n_ids:
        raise AssertionError(
            'got %d predictions for %d test ids' % (len(predicted), n_ids))

    current_path = abspath(curdir)

    merged = {'test_id': test_ids}
    merged['is_duplicate'] = predicted

    filename = 'submission_' + datetime.datetime.now().strftime("%I%M%p-%B-%d-%Y") + '.csv'
    df = pd.DataFrame.from_dict(merged)

    # test_id becomes the index so it is written as the first CSV column.
    df.set_index('test_id', inplace=True)

    full_path = join(current_path, filename)

    df.to_csv(path_or_buf=full_path, sep=',')

    print('saved in: ', full_path)


In [25]:
# Write the submission file for the thresholded test predictions, keyed by the
# test frame's test_id column.
create_csvs(predicted, test_data.test_id.values)

saved in:  /content/submission_0135PM-December-09-2018.csv
