In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.layers import Dense, LSTM, Embedding
from keras.layers import Conv1D, MaxPooling1D, Flatten, GlobalMaxPooling1D, BatchNormalization
from keras.layers import Concatenate, Subtract, Multiply
from keras.layers import Input, Dropout, PReLU, SpatialDropout1D
from keras.models import Model, load_model
from keras.callbacks import EarlyStopping, ModelCheckpoint
import keras.backend as K
from keras import optimizers

import os
import io

Using TensorFlow backend.


In [2]:
# Mount Google Drive (Colab only) at /content/drive so the preprocessed CSVs
# stored under "My Drive" can be read by the data-loading cell.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [0]:
# Preprocessed training set; question1/question2 are read with dtype 'str'.
# NOTE(review): the relative 'drive/...' path presumably relies on the working
# directory being /content (where Drive is mounted) — confirm.
path = os.path.join('drive', 'My Drive', 'quora', 'train_data_v3_processed.csv')
train_data = pd.read_csv(path, dtype={'question1': 'str', 'question2': 'str'})

# Preprocessed test set, read the same way.
path = os.path.join('drive', 'My Drive', 'quora', 'test_data_v3_processed.csv')
test_data = pd.read_csv(path, dtype={'question1': 'str', 'question2': 'str'})

In [4]:
# Fetch the GloVe 840B / 300-d vectors (~2 GB zip).
# `wget -c` resumes a partial download and does nothing when the archive is
# already complete; `unzip -n` never overwrites an already-extracted file.
# Together they make this cell cheap to re-run (e.g. on "Restart & Run All").
!wget -c https://nlp.stanford.edu/data/glove.840B.300d.zip
!unzip -n glove.840B.300d.zip -d .

--2018-12-08 21:11:00--  http://nlp.stanford.edu/data/glove.840B.300d.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.840B.300d.zip [following]
--2018-12-08 21:11:01--  https://nlp.stanford.edu/data/glove.840B.300d.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2176768927 (2.0G) [application/zip]
Saving to: ‘glove.840B.300d.zip’


2018-12-08 21:14:32 (9.88 MB/s) - ‘glove.840B.300d.zip’ saved [2176768927/2176768927]

Archive:  glove.840B.300d.zip
  inflating: ./glove.840B.300d.txt   


In [0]:
# Row indices into the transposed train/val/test arrays. The order must match
# the column order used when those arrays are selected from the DataFrames.
(RAW_Q1, RAW_Q2,
 LEMMA_Q1, LEMMA_Q2,
 TAGS_Q1, TAGS_Q2,
 IS_DUPLICATE) = range(7)

In [0]:
def process_questions(q1_train, q2_train, q1_val, q2_val, q1_test, q2_test):
    """Tokenize and pad six collections of question texts.

    A case-sensitive Keras ``Tokenizer`` is fitted on the training questions
    only (q1 and q2 concatenated); the same tokenizer then encodes the
    validation and test questions.

    Args:
        q1_train, q2_train: training texts for question 1 / question 2.
        q1_val, q2_val: validation texts.
        q1_test, q2_test: test texts.

    Returns:
        A 7-tuple: the six encoded collections in the same order as the
        arguments, each padded/truncated to ``MAX_LENGTH`` ids, followed by
        the fitted tokenizer.
    """
    tokenizer = Tokenizer(lower=False)
    # The vocabulary comes from the training questions only.
    tokenizer.fit_on_texts(np.concatenate((q1_train, q2_train), axis=0))

    def encode(texts):
        # Words -> integer ids, then pad (pad_sequences defaults) to a fixed length.
        return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=MAX_LENGTH)

    encoded = [encode(texts)
               for texts in (q1_train, q2_train, q1_val, q2_val, q1_test, q2_test)]

    return (*encoded, tokenizer)

In [0]:
# Number of token ids per question: used as `maxlen` when padding sequences and
# as the input length of the model's Input/Embedding layers.
MAX_LENGTH = 35
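'''
input:
    - train: raw training columns, indexed by the RAW_*/LEMMA_*/TAGS_*/IS_DUPLICATE constants
    - val: raw validation columns, same layout as train
    - test: raw test columns, same layout as train but without IS_DUPLICATE
output:
    - train: processed training sequences plus the labels
    - val: processed validation sequences plus the labels
    - test: processed testing sequences
    - raw_tokenizer: tokenizer fitted on the raw training questions
    - lemma vocab, tags vocab: vocabulary sizes (number of words + 1)
'''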
def prep_data(train, val, test):
    """Encode the raw, lemma and tag views of every question.

    Each view (raw text, lemmas, POS tags) gets its own tokenizer, fitted by
    ``process_questions`` on the training split of that view.

    Args:
        train: training columns, indexable by the ``RAW_*``, ``LEMMA_*``,
            ``TAGS_*`` and ``IS_DUPLICATE`` constants.
        val: validation columns, same layout as ``train``.
        test: test columns, same layout minus ``IS_DUPLICATE``.

    Returns:
        ``(train, val, test, raw_tokenizer, lemma_vocab, tags_vocab)`` where
        ``train``/``val`` are 7-tuples (six padded arrays + labels), ``test``
        is a 6-tuple of padded arrays, and the vocab sizes are
        ``len(word_index) + 1`` of the lemma and tag tokenizers.
    """
    views = ((RAW_Q1, RAW_Q2), (LEMMA_Q1, LEMMA_Q2), (TAGS_Q1, TAGS_Q2))

    train_parts, val_parts, test_parts, tokenizers = [], [], [], []
    for q1_idx, q2_idx in views:
        (q1_tr, q2_tr,
         q1_va, q2_va,
         q1_te, q2_te,
         view_tokenizer) = process_questions(train[q1_idx], train[q2_idx],
                                             val[q1_idx], val[q2_idx],
                                             test[q1_idx], test[q2_idx])
        train_parts.extend((q1_tr, q2_tr))
        val_parts.extend((q1_va, q2_va))
        test_parts.extend((q1_te, q2_te))
        tokenizers.append(view_tokenizer)

    raw_tokenizer, lemma_tokenizer, tags_tokenizer = tokenizers

    # Labels are carried through unchanged as the last element of train/val.
    train_out = (*train_parts, train[IS_DUPLICATE])
    val_out = (*val_parts, val[IS_DUPLICATE])
    test_out = tuple(test_parts)

    lemma_vocab = len(lemma_tokenizer.word_index) + 1
    tags_vocab = len(tags_tokenizer.word_index) + 1
    return train_out, val_out, test_out, raw_tokenizer, lemma_vocab, tags_vocab

In [0]:
def prep_embd(fname, tokenizer, d=300):
    """Build an embedding matrix for the tokenizer's vocabulary from a GloVe file.

    Args:
        fname: path to a GloVe-style text file; each line is a token followed
            by ``d`` space-separated floats.
        tokenizer: object exposing ``word_index`` (word -> positive int id).
        d: dimensionality of the vectors in the file (default 300).

    Returns:
        ``(vocab, d, embedding_matrix)`` where ``vocab`` is
        ``len(tokenizer.word_index) + 1`` and ``embedding_matrix`` has shape
        ``(vocab, d)``. Rows for words missing from the file stay all-zero.
    """
    word_index = tokenizer.word_index
    vocab = len(word_index) + 1
    embedding_matrix = np.zeros((vocab, d))

    # `with` guarantees the (multi-GB) file is closed; the encoding is explicit
    # so decoding does not depend on the platform default.
    with open(fname, 'r', encoding='utf-8') as f:
        for line in f:
            # Split from the right: a few GloVe tokens themselves contain
            # spaces, so only the last `d` fields are vector components.
            tokens = line.rstrip('\n').rsplit(' ', d)
            if len(tokens) != d + 1:
                continue  # header or malformed line: not a (token, vector) pair
            i = word_index.get(tokens[0])
            if i is not None:
                embedding_matrix[i] = np.asarray(tokens[1:], dtype='float32')

    return vocab, d, embedding_matrix

In [0]:
def get_features(df):
    """Return the hand-crafted feature columns of ``df`` as a NumPy array.

    Text columns, the label and the row identifier are removed; everything
    that remains is treated as a feature.

    Args:
        df: DataFrame of a train split (has ``id`` / ``is_duplicate``) or a
            test split (has ``test_id`` and no ``is_duplicate``).

    Returns:
        ``df.values`` after dropping the non-feature columns.
    """
    non_feature_cols = ['question1', 'question2',
                        'question1_lemma', 'question2_lemma',
                        'question1_tag', 'question2_tag',
                        'is_duplicate']
    if 'id' in df.columns:
        non_feature_cols.append('id')
    elif 'test_id' in df.columns:
        non_feature_cols.append('test_id')

    # errors='ignore': the test frame has no 'is_duplicate' column, and a
    # plain drop would raise KeyError on it.
    return df.drop(non_feature_cols, axis=1, errors='ignore').values

In [0]:
# The column order below must line up with the RAW_*/LEMMA_*/TAGS_*/IS_DUPLICATE
# index constants: the arrays are transposed and then indexed by those constants.
data = train_data[['question1', 'question2',
                   'question1_lemma', 'question2_lemma',
                   'question1_tag', 'question2_tag',
                   'is_duplicate']].values

# 80/20 train/validation split with a fixed seed; transpose so that each row of
# `train` / `val` holds one column of the original frame.
train, val = (subset.T
              for subset in train_test_split(data, test_size=0.2, random_state=19))

# Test data has the same question columns but no label.
test = test_data[['question1', 'question2',
                  'question1_lemma', 'question2_lemma',
                  'question1_tag', 'question2_tag']].values.T

In [0]:
# Encode every question view. NOTE: this rebinds `train`/`val`/`test` from raw
# text columns to tuples of padded id arrays, so the cell is not idempotent —
# re-run the split cell before re-running this one.
train, val, test, raw_tokenizer, LEMMA_VOCAB, TAGS_VOCAB = prep_data(train, val, test)

In [0]:
# GloVe vectors extracted into the working directory by the download cell.
path = 'glove.840B.300d.txt'
RAW_VOCAB, GLOVE_EMBEDDING_DIM, GLOVE_EMBEDDING_MATRIX = prep_embd(path, raw_tokenizer)

In [0]:
# Hyper-parameters read by the model-building functions below.
EMBEDDING_DIM = 200  # output size of the trainable embeddings in get_model_embeddings
DROPOUT_RATE = 0.25  # LSTM recurrent dropout and Dropout-layer rate
FILTERS = 16         # NOTE(review): not referenced in the visible cells
KERNEL_SIZE = 5      # NOTE(review): not referenced in the visible cells

def get_model_non_trainable_embeddings(input_q):
    """Encode a raw-question input with frozen GloVe embeddings and an LSTM.

    Every call creates a new Embedding and a new LSTM layer, so two calls do
    not share weights.

    Args:
        input_q: Keras tensor of token ids with ``MAX_LENGTH`` ids per sample.

    Returns:
        Output tensor of the 96-unit LSTM.
    """
    glove_vectors = Embedding(input_dim=RAW_VOCAB,
                              output_dim=GLOVE_EMBEDDING_DIM,
                              weights=[GLOVE_EMBEDDING_MATRIX],
                              trainable=False,  # keep the pre-trained vectors fixed
                              input_length=MAX_LENGTH)(input_q)
    return LSTM(96, recurrent_dropout=DROPOUT_RATE)(glove_vectors)

def get_model_embeddings(input_q, vocab):
    """Encode a token-id input with a trainable embedding and an LSTM.

    Every call creates a new Embedding and a new LSTM layer, so two calls do
    not share weights.

    Args:
        input_q: Keras tensor of token ids with ``MAX_LENGTH`` ids per sample.
        vocab: vocabulary size, used as the embedding's ``input_dim``.

    Returns:
        Output tensor of the 96-unit LSTM.
    """
    learned_vectors = Embedding(input_dim=vocab,
                                output_dim=EMBEDDING_DIM,
                                input_length=MAX_LENGTH)(input_q)
    return LSTM(96, recurrent_dropout=DROPOUT_RATE)(learned_vectors)
  
def get_model_features(features_input):
    """Dense branch for hand-crafted features.

    Applies batch normalization followed by two (Dense 200 ReLU -> Dropout)
    stages.

    Args:
        features_input: Keras tensor holding the feature vector.

    Returns:
        Output tensor of the last Dropout layer.
    """
    hidden = BatchNormalization()(features_input)
    for _ in range(2):
        hidden = Dense(units=200, activation='relu')(hidden)
        hidden = Dropout(DROPOUT_RATE)(hidden)
    return hidden

def get_model():
    """Build and compile the six-input duplicate-question classifier.

    Inputs, in order: raw q1, raw q2, lemma q1, lemma q2, tags q1, tags q2
    (each ``MAX_LENGTH`` int32 ids). Every input gets its own embedding + LSTM
    encoder; for each view the q1 encoding is subtracted from the q2 encoding,
    the three differences are concatenated and passed through
    BatchNorm -> Dropout -> 2 x (Dense 150 ReLU -> Dropout) -> sigmoid.

    The hand-crafted features branch (``get_model_features``) is not wired in.

    Returns:
        A compiled Keras ``Model`` (nadam, binary cross-entropy, binary accuracy).
    """
    # Six integer-id inputs, created in the order listed in the docstring.
    question_inputs = [Input(shape=(MAX_LENGTH,), dtype='int32') for _ in range(6)]
    raw_q1, raw_q2, lemma_q1, lemma_q2, tags_q1, tags_q2 = question_inputs

    # One encoder per input: frozen GloVe for raw text, trainable embeddings
    # for lemmas and tags.
    enc_raw_q1 = get_model_non_trainable_embeddings(raw_q1)
    enc_raw_q2 = get_model_non_trainable_embeddings(raw_q2)
    enc_lemma_q1 = get_model_embeddings(lemma_q1, LEMMA_VOCAB)
    enc_lemma_q2 = get_model_embeddings(lemma_q2, LEMMA_VOCAB)
    enc_tags_q1 = get_model_embeddings(tags_q1, TAGS_VOCAB)
    enc_tags_q2 = get_model_embeddings(tags_q2, TAGS_VOCAB)

    # Per-view difference between the two question encodings (q2 - q1).
    encoder_pairs = ((enc_raw_q1, enc_raw_q2),
                     (enc_lemma_q1, enc_lemma_q2),
                     (enc_tags_q1, enc_tags_q2))
    differences = [Subtract()([enc_q2, enc_q1]) for enc_q1, enc_q2 in encoder_pairs]

    hidden = Concatenate()(differences)
    hidden = BatchNormalization()(hidden)
    hidden = Dropout(DROPOUT_RATE)(hidden)

    for _ in range(2):
        hidden = Dense(units=150, activation='relu')(hidden)
        hidden = Dropout(DROPOUT_RATE)(hidden)

    output = Dense(1, activation='sigmoid')(hidden)

    model = Model(inputs=question_inputs, outputs=output)
    model.compile(optimizer='nadam',
                  loss='binary_crossentropy',
                  metrics=['binary_accuracy'])
    return model

In [0]:
# Build a fresh, compiled (untrained) model.
model = get_model()

In [14]:
BATCH_SIZE = 1024

# Stop once validation accuracy has not improved for 5 epochs. Independently,
# keep on disk the full model (not just weights) with the lowest validation loss.
early_stopping = EarlyStopping(patience=5, monitor='val_binary_accuracy')
model_checkpoint = ModelCheckpoint('embedding_NN_model_v7.h5',
                                   monitor='val_loss',
                                   mode='min',
                                   save_best_only=True,
                                   save_weights_only=False)

# Everything before IS_DUPLICATE in `train` / `val` is a model input, in the
# order the model's inputs were declared; the IS_DUPLICATE entry is the label.
model.fit(list(train[:IS_DUPLICATE]), train[IS_DUPLICATE],
          validation_data=(list(val[:IS_DUPLICATE]), val[IS_DUPLICATE]),
          batch_size=BATCH_SIZE,
          epochs=15,
          callbacks=[early_stopping, model_checkpoint])

Train on 258311 samples, validate on 64578 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15


<keras.callbacks.History at 0x7f060800fd30>

In [0]:
# Replace the in-memory model with the checkpoint file written during training
# (saved with save_best_only=True on val_loss).
model = load_model('embedding_NN_model_v7.h5')

In [23]:
# Score the reloaded model on the validation split: every element of `val`
# except the last is an input, the last is the label. With the compile settings
# used in get_model the result is [loss, binary_accuracy].
model.evaluate(x=list(val[:-1]),y=val[IS_DUPLICATE], batch_size=BATCH_SIZE)



[0.4599608550606606, 0.7913221220941504]

In [0]:
# Sigmoid outputs for the test set, flattened to 1-D ...
predicted = model.predict(list(test), batch_size=BATCH_SIZE).ravel()
# ... then thresholded: strictly greater than 0.5 -> 1, otherwise 0.
predicted = [1 if probability > 0.5 else 0 for probability in predicted]

In [0]:
import csv
import datetime
from os.path import join,abspath,curdir
import numpy as np
import pandas as pd

'''
Use this with 
import sys
sys.path.insert(0, './common/')
import csv_utils
csv_utils.create_csvs(predicted, test_ids)

Given the predicted label for each test row and the matching test ids
(both sequences must have the same length):
predicted = [0,1,0,0,1]
test_ids = [12,32,43,44,11]
Create the csv to submit to kaggle
'''

def create_csvs(predicted, test_ids, expected_rows=81126):
    """Write a Kaggle submission CSV into the current working directory.

    The file has a ``test_id`` index column and an ``is_duplicate`` column and
    is named ``submission_<HHMM><AM|PM>-<Month>-<DD>-<YYYY>.csv`` from the
    current local time. The full path is printed after writing.

    Args:
        predicted: sequence of predicted labels, one per test row.
        test_ids: sequence of test ids, same length as ``predicted``.
        expected_rows: required number of rows (defaults to the competition's
            test-set size). Pass ``None`` to skip this check.

    Raises:
        AssertionError: if the row count differs from ``expected_rows`` or
            ``predicted`` and ``test_ids`` differ in length.
    """
    n_ids = len(test_ids)
    if expected_rows is not None:
        assert n_ids == expected_rows, \
            'expected %d test ids, got %d' % (expected_rows, n_ids)
    assert len(predicted) == n_ids, \
        'got %d predictions for %d test ids' % (len(predicted), n_ids)

    submission = pd.DataFrame({'test_id': test_ids,
                               'is_duplicate': predicted}).set_index('test_id')

    filename = 'submission_' + datetime.datetime.now().strftime("%I%M%p-%B-%d-%Y") + '.csv'
    full_path = join(abspath(curdir), filename)

    submission.to_csv(path_or_buf=full_path, sep=',')

    print('saved in: ', full_path)


In [25]:
# Write the thresholded test predictions, keyed by the test set's `test_id`
# column, to a submission CSV in the working directory.
create_csvs(predicted, test_data.test_id.values)

saved in:  /content/submission_1114PM-December-08-2018.csv
