In [85]:
import numpy as np
import datetime, time, json
import tensorflow as tf
from sklearn.model_selection import train_test_split
from keras.models import Model
from keras.layers import Input, TimeDistributed, Dense, Lambda, concatenate, Dropout, BatchNormalization
from keras.layers.embeddings import Embedding
from keras.regularizers import l2
from keras.callbacks import Callback, ModelCheckpoint
from keras import backend as K
from keras.preprocessing.sequence import pad_sequences
import pandas as pd

In [110]:
# Root of the project checkout; every artefact loaded below lives underneath it.
PROJECT_DIR = "/Users/zhang/MscProject_tweak2vec"

# Quora question corpus and the duplicate / non-duplicate labels.
# The questions have different lengths, so the corpus is presumably stored as
# an object array; NumPy >= 1.16.3 refuses to load those unless pickling is
# explicitly allowed. Only enable this for files you produced yourself.
quora_corpus = np.load(PROJECT_DIR + "/corpus/quora_corpus_int5.npy", allow_pickle=True)
labels = np.load(PROJECT_DIR + "/corpus/quora_labels.npy")

# Pre-trained embedding matrices (one row per vocabulary entry).
w2v_google_50d = np.load(PROJECT_DIR + "/word2vecModel/w2v_google_50d.npy")
w2v_quora_50d_5m = np.load(PROJECT_DIR + "/word2vecModel/w2v_quora_50d_5m.npy")
# Alias kept so that code still referring to the previous variable name works.
w2v_pivots100_50d = w2v_quora_50d_5m

#w2v_retrain = np.load("/Users/zhang/MscProject_tweak2vec/word2vecModel/w2v_retrain5.npy")

# Two ways of combining the embeddings: column-wise concatenation and the
# element-wise mean. `concat_vec` keeps a handle on the raw concatenation.
w2v_concat = concat_vec = np.concatenate([w2v_google_50d, w2v_quora_50d_5m], axis=1)
# NOTE(review): this line used to read `w2v_quora_50d`, which is not defined
# anywhere in the notebook; the Quora matrix loaded above is assumed to be the
# intended operand - confirm.
w2v_avg = (w2v_google_50d + w2v_quora_50d_5m) / 2

In [111]:
from sklearn.decomposition import PCA

# Reduce the concatenated embedding to 50 dimensions.
# The projection is fitted on `concat_vec` (the untouched concatenation built
# when the embeddings were loaded) instead of on `w2v_concat` itself, so
# re-running this cell always starts from the same input rather than
# re-projecting data that has already been reduced.
# `random_state` pins the result when scikit-learn selects a randomized SVD
# solver; the seed matches the one used for the train/test split.
pca = PCA(n_components=50, random_state=2018)
w2v_concat = pca.fit_transform(concat_vec)
w2v_concat.shape

(30300, 50)

In [113]:
# Embedding matrix fed to the model. Basic indexing on an ndarray returns a
# view, so this shares memory with `w2v_concat` rather than copying it.
word_embedding = w2v_concat[...]

In [39]:
# The corpus interleaves the two questions of every pair:
# even positions hold question 1, odd positions hold question 2.
n_pairs = len(quora_corpus) // 2
question1 = list(quora_corpus[0:2 * n_pairs:2])
question2 = list(quora_corpus[1:2 * n_pairs:2])

# Pad / truncate every question to a fixed length of 25 tokens.
q1_data, q2_data = (pad_sequences(batch, maxlen=25) for batch in (question1, question2))

In [6]:
# Length of every sentence in the corpus, then the longest and the mean.
l_avg = [len(sentence) for sentence in quora_corpus]
l = max(l_avg, default=0)
print('max length:{0}, average length:{1}'.format(l, np.mean(np.array(l_avg))))

max length:190, average length:8.62675648734686


In [40]:
# Model hyperparameters
# ---------------------
# Number of rows in the embedding matrix (one per vocabulary entry).
vocab_size = word_embedding.shape[0]
# Width of each word vector and of the per-token Dense projection.
embed_dim = 50
# Fixed number of tokens per question fed to the network.
max_sentence_len = 25
# Dropout applied after every hidden Dense layer.
dropout_rate = 0.1

In [41]:
# Hold out 10% of the question pairs as a test set; the rest is used for
# training (a validation slice is taken from it at fit time).
# Stacking along axis 1 keeps both questions of a pair in the same row, so a
# single split call keeps them aligned with their label.
questions = np.stack([q1_data, q2_data], axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    questions,
    labels,
    test_size=0.1,
    random_state=2018,
)
Q1_train, Q2_train = X_train[:, 0], X_train[:, 1]
Q1_test, Q2_test = X_test[:, 0], X_test[:, 1]

### Define the model

In [114]:
def _encode_question(token_ids):
    """Turn a batch of padded token-id sequences into one vector per question.

    Pipeline: frozen pre-trained embedding -> per-token Dense(relu) ->
    max over the time axis. Every call instantiates fresh layers, so the two
    questions do not share weights.
    """
    embedded = Embedding(input_dim=vocab_size,
                         output_dim=embed_dim,
                         weights=[word_embedding],
                         input_length=max_sentence_len,
                         trainable=False)(token_ids)
    projected = TimeDistributed(Dense(embed_dim, activation='relu'))(embedded)
    return Lambda(lambda x: K.max(x, axis=1), output_shape=(embed_dim, ))(projected)


# One input per question, each a sequence of `max_sentence_len` token ids.
question1 = Input(shape=(max_sentence_len,))
question2 = Input(shape=(max_sentence_len,))

q1 = _encode_question(question1)
q2 = _encode_question(question2)

# Classifier head: four identical Dense -> Dropout -> BatchNorm blocks on the
# concatenated question vectors, followed by a sigmoid output unit.
merged = concatenate([q1, q2])
for block_idx in range(4):
    merged = Dense(200, activation='relu')(merged)
    merged = Dropout(dropout_rate)(merged)
    merged = BatchNormalization()(merged)

is_duplicate = Dense(1, activation='sigmoid')(merged)

model = Model(inputs=[question1, question2], outputs=is_duplicate)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


### Train the model

In [115]:
# Training configuration
# ----------------------
# Where the checkpoint callback stores the best weights seen so far.
MODEL_WEIGHTS_FILE = '/Users/zhang/MscProject_tweak2vec/Max_BOE_weights/concat_50d_5m_weights.h5'
# Number of passes over the training data and mini-batch size.
n_epoch, batch_size = 50, 32
# Fraction of the training data held out for validation during fitting.
val_split = 0.1

In [116]:
# Keep only the weights of the epoch with the best validation accuracy.
checkpoint = ModelCheckpoint(MODEL_WEIGHTS_FILE, monitor='val_acc', save_best_only=True)
callbacks = [checkpoint]

print("Starting training at", datetime.datetime.now())
t0 = time.time()
history = model.fit(
    x=[Q1_train, Q2_train],
    y=y_train,
    batch_size=batch_size,
    epochs=n_epoch,
    verbose=2,
    callbacks=callbacks,
    validation_split=val_split,
)
t1 = time.time()
print("Training ended at", datetime.datetime.now())

# Wall-clock duration of the fit call, reported in minutes.
elapsed_minutes = (t1 - t0) / 60.
print("Minutes elapsed: %f" % elapsed_minutes)

Starting training at 2018-07-06 00:19:31.724102
Train on 327472 samples, validate on 36386 samples
Epoch 1/50
 - 79s - loss: 0.5457 - acc: 0.7199 - val_loss: 0.4969 - val_acc: 0.7518
Epoch 2/50
 - 75s - loss: 0.5050 - acc: 0.7481 - val_loss: 0.4789 - val_acc: 0.7665
Epoch 3/50
 - 75s - loss: 0.4909 - acc: 0.7562 - val_loss: 0.4714 - val_acc: 0.7678
Epoch 4/50
 - 75s - loss: 0.4811 - acc: 0.7622 - val_loss: 0.4639 - val_acc: 0.7751
Epoch 5/50
 - 75s - loss: 0.4738 - acc: 0.7666 - val_loss: 0.4589 - val_acc: 0.7750
Epoch 6/50
 - 74s - loss: 0.4684 - acc: 0.7700 - val_loss: 0.4509 - val_acc: 0.7813
Epoch 7/50
 - 75s - loss: 0.4635 - acc: 0.7738 - val_loss: 0.4559 - val_acc: 0.7775
Epoch 8/50
 - 75s - loss: 0.4592 - acc: 0.7771 - val_loss: 0.4496 - val_acc: 0.7842
Epoch 9/50
 - 75s - loss: 0.4554 - acc: 0.7788 - val_loss: 0.4461 - val_acc: 0.7840
Epoch 10/50
 - 75s - loss: 0.4529 - acc: 0.7800 - val_loss: 0.4430 - val_acc: 0.7826
Epoch 11/50
 - 75s - loss: 0.4496 - acc: 0.7818 - val_loss: 

### Evaluate the model with best validation accuracy on the test partition

In [22]:
# Restore the checkpointed weights before scoring the held-out test pairs.
model.load_weights(MODEL_WEIGHTS_FILE)
test_inputs = [Q1_test, Q2_test]
loss, accuracy = model.evaluate(test_inputs, y_test, verbose=0)
print('loss = {0:.4f}, accuracy = {1:.4f}'.format(loss, accuracy))

loss = 0.4726, accuracy = 0.8038


In [27]:
# Print the layer-by-layer architecture (output shapes, parameter counts, connections).
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, 25)           0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            (None, 25)           0                                            
__________________________________________________________________________________________________
embedding_5 (Embedding)         (None, 25, 50)       1515000     input_5[0][0]                    
__________________________________________________________________________________________________
embedding_6 (Embedding)         (None, 25, 50)       1515000     input_6[0][0]                    
__________________________________________________________________________________________________
time_distr