In [1]:
%matplotlib inline
from __future__ import print_function
import numpy as np
import pandas as pd
import datetime, time, json
from keras.models import Model
from keras.layers import Input, TimeDistributed, Dense, Lambda, concatenate, Dropout, BatchNormalization
from keras.layers.embeddings import Embedding
from keras.regularizers import l2
from keras.callbacks import Callback, ModelCheckpoint
from keras import backend as K
from sklearn.cross_validation import train_test_split
from tqdm import tqdm

Using TensorFlow backend.


In [2]:
# --- Input artefacts (read by the data-loading cell) -------------------------
Q1_TRAINING_DATA_FILE = 'q1_train.npy'
Q2_TRAINING_DATA_FILE = 'q2_train.npy'
LABEL_TRAINING_DATA_FILE = 'label_train.npy'
WORD_EMBEDDING_MATRIX_FILE = 'word_embedding_matrix.npy'
NB_WORDS_DATA_FILE = 'nb_words.json'

# --- Output artefacts --------------------------------------------------------
# NOTE(review): not referenced by any cell visible in this notebook.
MODEL_WEIGHTS_FILE = 'question_pairs_weights.h5'

# --- Model dimensions --------------------------------------------------------
MAX_SEQUENCE_LENGTH = 25   # length of each question input fed to the model
EMBEDDING_DIM = 300        # width of the embedding and per-token Dense layers
DROPOUT = 0.1              # rate passed to every Dropout layer

# --- Data splitting ----------------------------------------------------------
TEST_SPLIT = 0.1           # test_size for train_test_split
VALIDATION_SPLIT = 0.1     # validation_split for model.fit
RNG_SEED = 13371447        # random_state for train_test_split

# --- Training ----------------------------------------------------------------
# NOTE(review): the training cell passes a literal epoch count instead of
# NB_EPOCHS; confirm which value is intended.
NB_EPOCHS = 25
BATCH_SIZE = 32

In [3]:
# Load the pre-computed arrays. Passing the path (rather than an already
# opened file object) lets np.load open AND close the file itself, so no
# file handle is left open for the lifetime of the kernel.
q1_data = np.load(Q1_TRAINING_DATA_FILE)
q2_data = np.load(Q2_TRAINING_DATA_FILE)
labels = np.load(LABEL_TRAINING_DATA_FILE)
word_embedding_matrix = np.load(WORD_EMBEDDING_MATRIX_FILE)

# The JSON file stores a single count under the 'nb_words' key; the model
# cell uses nb_words + 1 as the Embedding layer's input dimension.
with open(NB_WORDS_DATA_FILE, 'r') as f:
    nb_words = json.load(f)['nb_words']

In [12]:
# Stack the two question arrays along a new axis 1 so that each sample keeps
# its (question 1, question 2) rows together while being shuffled and split.
X = np.stack((q1_data, q2_data), axis=1)
print(X[1])
y = labels

# Hold out TEST_SPLIT of the pairs; the fixed random_state makes the
# partition repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SPLIT,
    random_state=RNG_SEED
)

# Undo the pairing: index 0 on axis 1 is question 1, index 1 is question 2.
Q1_train, Q2_train = X_train[:, 0], X_train[:, 1]
Q1_test, Q2_test = X_test[:, 0], X_test[:, 1]
print(Q1_train[1])

[[    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     2     3     1   559    10 14300 13598     5 21311
   4565]
 [    0     0     0     0     0     0     0     0     0     0     2    43
    182    25     1    82   237 11296     1 14300 13598     5 21311  4565
    202]]
[    0     0     0     0     0     0     0     0     0     0     0     0
     0     0     2    11     1    18   150    20  1215 10449    29  3822
  1417]


In [5]:
def _encode_question(token_ids):
    """Map one question input tensor to a fixed-size vector.

    The tensor is passed through a frozen Embedding layer initialised from
    ``word_embedding_matrix``, then a per-time-step ReLU Dense layer of width
    EMBEDDING_DIM, and finally reduced with an element-wise max over axis 1.
    Every call creates its own layers, so nothing is shared between the two
    question branches.
    """
    embedded = Embedding(nb_words + 1,
                         EMBEDDING_DIM,
                         weights=[word_embedding_matrix],
                         input_length=MAX_SEQUENCE_LENGTH,
                         trainable=False)(token_ids)
    projected = TimeDistributed(Dense(EMBEDDING_DIM, activation='relu'))(embedded)
    return Lambda(lambda x: K.max(x, axis=1), output_shape=(EMBEDDING_DIM, ))(projected)


def _dense_stack(features, n_blocks=4):
    """Apply ``n_blocks`` rounds of Dense(200, relu) -> Dropout -> BatchNormalization."""
    out = features
    for _block in range(n_blocks):
        out = Dense(200, activation='relu')(out)
        out = Dropout(DROPOUT)(out)
        out = BatchNormalization()(out)
    return out


# Two inputs of MAX_SEQUENCE_LENGTH values each, one per question.
question1 = Input(shape=(MAX_SEQUENCE_LENGTH,))
question2 = Input(shape=(MAX_SEQUENCE_LENGTH,))

# Encode each question independently (separate weights per branch).
q1 = _encode_question(question1)
q2 = _encode_question(question2)

# Join both encodings and run them through the fully connected stack.
merged = _dense_stack(concatenate([q1,q2]))

# Single sigmoid unit as the model output.
is_duplicate = Dense(1, activation='sigmoid')(merged)

model = Model(inputs=[question1,question2], outputs=is_duplicate)

In [6]:
# Configure training: Adam optimizer, binary cross-entropy loss, and accuracy
# reported as an additional metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

In [7]:
# Print the layer-by-layer table (type, output shape, parameter count,
# inbound connections) for the model built above.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 25)           0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 25)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 25, 300)      28679100    input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 25, 300)      28679100    input_2[0][0]                    
__________________________________________________________________________________________________
time_distr

In [8]:
# Train the model and report wall-clock duration.
# NOTE(review): the epoch count below is the literal 16, while the config cell
# defines NB_EPOCHS = 25 that is never used; confirm which value is intended.
# NOTE(review): the imported ModelCheckpoint callback is not used, so no
# intermediate weights are saved during training.
print("Starting training at", datetime.datetime.now())
t0 = time.time()

fit_options = dict(
    epochs=16,
    validation_split=VALIDATION_SPLIT,  # fraction of the training arrays held out for validation
    verbose=2,                          # one summary line per epoch
    batch_size=BATCH_SIZE
)
history = model.fit([Q1_train, Q2_train], y_train, **fit_options)

t1 = time.time()
print("Training ended at", datetime.datetime.now())
elapsed_minutes = (t1 - t0) / 60.
print("Minutes elapsed: %f" % elapsed_minutes)

Starting training at 2018-06-11 11:47:31.207049
Train on 327474 samples, validate on 36387 samples
Epoch 1/16
 - 337s - loss: 0.5377 - acc: 0.7283 - val_loss: 0.4852 - val_acc: 0.7587
Epoch 2/16
 - 332s - loss: 0.4826 - acc: 0.7643 - val_loss: 0.4744 - val_acc: 0.7641
Epoch 3/16
 - 333s - loss: 0.4547 - acc: 0.7809 - val_loss: 0.4423 - val_acc: 0.7818
Epoch 4/16
 - 332s - loss: 0.4351 - acc: 0.7929 - val_loss: 0.4406 - val_acc: 0.7860
Epoch 5/16
 - 344s - loss: 0.4179 - acc: 0.8039 - val_loss: 0.4339 - val_acc: 0.7901
Epoch 6/16
 - 340s - loss: 0.4049 - acc: 0.8115 - val_loss: 0.4299 - val_acc: 0.7952
Epoch 7/16
 - 379s - loss: 0.3910 - acc: 0.8198 - val_loss: 0.4198 - val_acc: 0.8017
Epoch 8/16
 - 362s - loss: 0.3774 - acc: 0.8273 - val_loss: 0.4142 - val_acc: 0.8053
Epoch 9/16
 - 349s - loss: 0.3672 - acc: 0.8338 - val_loss: 0.4164 - val_acc: 0.8027
Epoch 10/16
 - 350s - loss: 0.3580 - acc: 0.8392 - val_loss: 0.4147 - val_acc: 0.8032
Epoch 11/16
 - 351s - loss: 0.3458 - acc: 0.8459 -

In [9]:
import h5py


In [10]:
# Persist the trained model to 'my_model.h5' in the working directory.
# NOTE(review): the config cell defines MODEL_WEIGHTS_FILE
# ('question_pairs_weights.h5') but it is not used here; confirm whether this
# literal path is intentional.
model.save('my_model.h5')