In [1]:
import pickle
import random

#import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
#import h5py
#import json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import spacy
from tqdm import tqdm
#% matplotlib inline

random.seed(42)

In [2]:
import keras
import keras.backend as K
from keras.layers import InputSpec, Input, Dense, Flatten, Permute#, Layer
from keras.layers import Lambda, Masking, Activation, Dropout, Embedding, TimeDistributed
from keras.layers import Bidirectional, GRU, LSTM
from keras.layers.advanced_activations import ELU
from keras.layers.merge import Add, Dot, add, concatenate, dot
from keras.layers.noise import GaussianNoise
from keras.layers.normalization import BatchNormalization
from keras.layers.pooling import GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.engine.topology import Layer
from keras.models import Sequential, Model, model_from_json
from keras.regularizers import l2
from keras.optimizers import Adam

Using TensorFlow backend.


# Keras Model 1

In [3]:
def euclidean_distance(vects):
    x, y = vects
    return K.sqrt(K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True), K.epsilon()))

def eucl_dist_output_shape(shapes):
    shape1, shape2 = shapes
    return (shape1[0], 1)

def create_base_network(input_dim, hidden_dim, dropout=0.0):
    '''Base network to be shared (eq. to feature extraction).'''
    seq = Sequential()
    seq.add(Dense(hidden_dim, input_shape=(input_dim,), activation='relu'))
    seq.add(Dropout(dropout))
    seq.add(Dense(hidden_dim, activation='relu'))
    seq.add(Dropout(dropout))
    seq.add(Dense(hidden_dim, activation='relu'))
    return seq

def cosine_distance(vests):
    x, y = vests
    x = K.l2_normalize(x, axis=-1)
    y = K.l2_normalize(y, axis=-1)
    return -K.mean(x * y, axis=-1, keepdims=True)

def cos_dist_output_shape(shapes):
    shape1, shape2 = shapes
    return (shape1[0],1)

In [4]:
def create_network(input_dim, hidden_dim, settings):    
    # network definition
    base_network = create_base_network(input_dim, hidden_dim, settings['dropout'])
    
    input_a = Input(shape=(input_dim,))
    input_b = Input(shape=(input_dim,))
    
    # because we re-use the same instance `base_network`,
    # the weights of the network
    # will be shared across the two branches
    processed_a = base_network(input_a)
    processed_b = base_network(input_b)
    
    distance = Lambda(euclidean_distance, output_shape=eucl_dist_output_shape)([processed_a, processed_b])
    
    model = Model(inputs=[input_a, input_b], outputs=distance)
    return model

# Keras Model 2

In [3]:
def create_base_network(vectors, shape, settings):
    max_length, nr_hidden, nr_class = shape
    
    # Inputs
    #ids1 = Input(shape=(max_length,), dtype='int32', name='words1')
    #ids2 = Input(shape=(max_length,), dtype='int32', name='words2')
    
    seq = Sequential()
    
    seq.add(Embedding(vectors.shape[0],
                      vectors.shape[1],
                      input_length=max_length,
                      weights=[vectors],
                      name='embed',
                      trainable=False))
    
    seq.add(LSTM(nr_hidden, dropout=settings['dropout']))
    
    seq.add(Dense(nr_hidden, activation='relu', dropout=settings['dropout']))
    
    return(seq)

In [1]:
def euclidean_distance(vects):
    x, y = vects
    return K.sqrt(K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True), K.epsilon()))

def eucl_dist_output_shape(shapes):
    shape1, shape2 = shapes
    return (shape1[0], 1)

In [None]:
def create_network(input_dim, shape, settings):
    # network definition
    base_network = create_base_network(input_dim, shape, settings['dropout'])
    
    input_a = Input(shape=(input_dim,))
    input_b = Input(shape=(input_dim,))
    
    # because we re-use the same instance `base_network`,
    # the weights of the network
    # will be shared across the two branches
    processed_a = base_network(input_a)
    processed_b = base_network(input_b)
    
    distance = Lambda(euclidean_distance, output_shape=eucl_dist_output_shape)([processed_a, processed_b])
    
    model = Model(inputs=[input_a, input_b], outputs=distance)
    return model

# Load data

In [5]:
train_df = pd.read_csv('../data/input/train.csv', index_col='id')

train_df['question1'].fillna('', inplace=True)
train_df['question2'].fillna('', inplace=True)

# filter out question shorter than 10 characters
train_df['q1_len'] = train_df['question1'].str.len()
train_df['q2_len'] = train_df['question2'].str.len()

train_df = train_df.loc[lambda df: (df['q1_len'] > 10) & (df['q2_len'] > 10)]

# subset data
train_df = train_df.loc[0:199999]

train_df['is_duplicate'].value_counts()

0    125461
1     74472
Name: is_duplicate, dtype: int64

In [None]:
nlp = spacy.load('en')

print('Splitting data set')
X1_train, X1_val, X2_train, X2_val, y_train, y_val = train_test_split(train_df['question1'].values, 
                                                                      train_df['question2'].values,
                                                                      train_df['is_duplicate'].values, 
                                                                      test_size=0.2, random_state=42, 
                                                                      stratify=train_df['is_duplicate'].values)

print('Get doc vectors')
print('Train 1')
train_X1 = get_word_ids(list(nlp.pipe(X1_train.tolist(), n_threads=20, batch_size=10000)),
                        max_length=30,
                        rnn_encode=False,
                        tree_truncate=False)
print('Train 2')
train_X2 = get_word_ids(list(nlp.pipe(X2_train.tolist(), n_threads=20, batch_size=10000)),
                        max_length=30,
                        rnn_encode=False,
                        tree_truncate=False)
print('Val 1')
val_X1 = get_word_ids(list(nlp.pipe(X1_val.tolist(), n_threads=20, batch_size=10000)),
                        max_length=30,
                        rnn_encode=False,
                        tree_truncate=False)
print('Val 2')
val_X2 = get_word_ids(list(nlp.pipe(X2_val.tolist(), n_threads=20, batch_size=10000)),
                        max_length=30,
                        rnn_encode=False,
                        tree_truncate=False)

print('Building model')
shape = (30, 200, 1)
settings = {
    'lr': 0.0001,
    'dropout': 0.2
}

model = create_network(300, shape, settings)
model.compile(
        optimizer=Adam(lr=settings['lr']),
        loss='binary_crossentropy',
        #loss='categorical_crossentropy',
        metrics=['accuracy'])

# Train Model

In [6]:
nlp = spacy.load('en')

print('Splitting data set')
X1_train, X1_val, X2_train, X2_val, y_train, y_val = train_test_split(train_df['question1'].values, 
                                                                      train_df['question2'].values,
                                                                      train_df['is_duplicate'].values, 
                                                                      test_size=0.2, random_state=42, 
                                                                      stratify=train_df['is_duplicate'].values)

print('Get doc vectors')
print('Train 1')
train_X1 = np.array([doc.vector for doc in nlp.pipe(X1_train.tolist(), n_threads=20, batch_size=10000)])
print('Train 2')
train_X2 = np.array([doc.vector for doc in nlp.pipe(X2_train.tolist(), n_threads=20, batch_size=10000)])
print('Val 1')
val_X1 = np.array([doc.vector for doc in nlp.pipe(X1_val.tolist(), n_threads=20, batch_size=10000)])
print('Val 2')
val_X2 = np.array([doc.vector for doc in nlp.pipe(X2_val.tolist(), n_threads=20, batch_size=10000)])

Splitting data set
Get doc vectors
Train 1
Train 2
Val 1
Val 2


In [7]:
print('Building model')
shape = (30, 200, 1)
settings = {
    'lr': 0.0001,
    'dropout': 0.2
}

model = create_network(300, 128, settings)
model.compile(
        optimizer=Adam(lr=settings['lr']),
        loss='binary_crossentropy',
        #loss='categorical_crossentropy',
        metrics=['accuracy'])

Building model


In [8]:
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 300)           0                                            
____________________________________________________________________________________________________
input_2 (InputLayer)             (None, 300)           0                                            
____________________________________________________________________________________________________
sequential_1 (Sequential)        (None, 128)           71552       input_1[0][0]                    
                                                                   input_2[0][0]                    
____________________________________________________________________________________________________
lambda_1 (Lambda)                (None, 1)             0           sequential_1[1][0]      

In [9]:
callbacks = [
    keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.00001, patience=5, verbose=1),
    keras.callbacks.CSVLogger(filename='..\logs\checkpoints\model_log.csv'),
    keras.callbacks.ModelCheckpoint('..\models\checkpoints\weights.{epoch:03d}-{val_loss:0.4f}.hdf5',
                                    save_best_only=True,
                                    save_weights_only=True,
                                    period=10),
    keras.callbacks.TensorBoard(log_dir='..\logs\\tensorboard',
                                histogram_freq=1)
]

# Train model
model.fit([train_X1, train_X2],
          y_train,
          validation_data=([val_X1, val_X2], y_val),
          epochs=100,
          batch_size=256, 
          verbose=2,
          callbacks=callbacks)

Train on 159946 samples, validate on 39987 samples
INFO:tensorflow:Summary name dense_1/kernel:0 is illegal; using dense_1/kernel_0 instead.
INFO:tensorflow:Summary name dense_1/bias:0 is illegal; using dense_1/bias_0 instead.
INFO:tensorflow:Summary name dense_2/kernel:0 is illegal; using dense_2/kernel_0 instead.
INFO:tensorflow:Summary name dense_2/bias:0 is illegal; using dense_2/bias_0 instead.
INFO:tensorflow:Summary name dense_3/kernel:0 is illegal; using dense_3/kernel_0 instead.
INFO:tensorflow:Summary name dense_3/bias:0 is illegal; using dense_3/bias_0 instead.
Epoch 1/100
5s - loss: 0.7447 - acc: 0.5389 - val_loss: 0.9449 - val_acc: 0.6275
Epoch 2/100
5s - loss: 0.6548 - acc: 0.6292 - val_loss: 0.9143 - val_acc: 0.6274
Epoch 3/100
5s - loss: 0.6355 - acc: 0.6463 - val_loss: 0.8985 - val_acc: 0.6274
Epoch 4/100
5s - loss: 0.6238 - acc: 0.6578 - val_loss: 0.9086 - val_acc: 0.6276
Epoch 5/100
5s - loss: 0.6160 - acc: 0.6670 - val_loss: 0.9261 - val_acc: 0.6279
Epoch 6/100
5s -

<keras.callbacks.History at 0xb97ff84ef0>

## Save model

In [10]:
model.save('../models/model_siamese_network.h5')

In [6]:
model = keras.models.load_model('../models/model_siamese_network.h5')

# Prediciton

In [7]:
pred = model.predict([val_X1, val_X2])

In [12]:
np.sum(pred > 1)
len(pred)

39987