In [24]:
import datetime, time, json
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from keras import backend as K
from keras.callbacks import Callback, ModelCheckpoint
from keras.layers import Input, TimeDistributed, Dense, Lambda, concatenate, Dropout, BatchNormalization
from keras.layers.embeddings import Embedding
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.regularizers import l2
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

In [25]:
# Root of the project checkout. The default is the location the notebook was
# written against; set the TWEAK2VEC_DIR environment variable to run it from
# another machine without editing every path below.
PROJECT_DIR = os.environ.get("TWEAK2VEC_DIR", "/Users/zhang/MscProject_tweak2vec")
CORPUS_DIR = os.path.join(PROJECT_DIR, "corpus")
W2V_DIR = os.path.join(PROJECT_DIR, "word2vecModel")

# Question corpus and its label array.
# NOTE(review): if the corpus was saved as a ragged object array, recent NumPy
# versions need allow_pickle=True here -- confirm against the .npy file.
quora_corpus = np.load(os.path.join(CORPUS_DIR, "quora_corpus_int5.npy"))
labels = np.load(os.path.join(CORPUS_DIR, "quora_labels.npy"))

# Candidate embedding matrices; the cell that sets `word_embedding` picks one.
w2v_google_50d = np.load(os.path.join(W2V_DIR, "w2v_google_50d.npy"))
w2v_quora_50d = np.load(os.path.join(W2V_DIR, "w2v_quora5_50d.npy"))
w2v_pivots100_50d = np.load(os.path.join(W2V_DIR, "w2v_pivots100_50d.npy"))

# Disabled alternatives, kept for reference.
# NOTE(review): `w2v_google` / `w2v_quora` are not defined in the visible cells.
# w2v_retrain = np.load("/Users/zhang/MscProject_tweak2vec/word2vecModel/w2v_retrain5.npy")


#w2v_concat = concat_vec = np.concatenate([w2v_google, w2v_quora], axis=1)
#w2v_avg = (w2v_google + w2v_quora)/2  

In [16]:
# Embedding matrix used by the model. Slicing an ndarray with [:] only yields a
# view onto the same buffer, so take a real copy to keep `word_embedding`
# independent of the loaded `w2v_pivots100_50d` array.
word_embedding = w2v_pivots100_50d.copy()

In [26]:
# Concatenate the two loaded embedding matrices column-wise and project the
# result back down to 50 components with PCA.
# NOTE(review): the original cell transformed `w2v_concat` in place although no
# cell defines it; building it from the *_50d matrices is the presumed intent
# (assumes both matrices have one row per vocabulary entry, in the same order)
# -- confirm.
w2v_concat_raw = np.concatenate([w2v_google_50d, w2v_quora_50d], axis=1)

# Fixed seed so a randomized SVD solver, if selected, gives repeatable output.
pca = PCA(n_components=50, random_state=2018)

# Separate input/output names keep the cell idempotent on re-run.
w2v_concat = pca.fit_transform(w2v_concat_raw)
w2v_concat.shape

In [5]:
# The corpus interleaves each pair: even positions hold question1 and the
# following odd position holds its question2. A trailing unpaired entry is
# ignored.
n_pairs = len(quora_corpus) // 2
question1 = [quora_corpus[idx] for idx in range(0, 2 * n_pairs, 2)]
question2 = [quora_corpus[idx] for idx in range(1, 2 * n_pairs, 2)]

# Bring every question to a fixed length of 25 entries.
q1_data, q2_data = (pad_sequences(side, maxlen=25) for side in (question1, question2))

In [6]:
# Length of every entry in the corpus.
l_avg = [len(sentence) for sentence in quora_corpus]

# Longest entry; stays 0 for an empty corpus, like the running-maximum loop.
l = max(l_avg, default=0)

print('max length:{0}, average length:{1}'.format(l, np.mean(np.array(l_avg))))

max length:190, average length:8.62675648734686


In [7]:
# hyperparameter setup
max_sentence_len = 25   # sequence length the model's Input layers expect
embed_dim = 50          # embedding width; also reused as the per-token Dense width
dropout_rate = 0.1
vocab_size = len(word_embedding)

# These constants repeat facts fixed elsewhere (the width of the chosen
# embedding matrix and the maxlen used when padding). Fail here with a clear
# message instead of a shape error deep inside Keras.
assert word_embedding.ndim == 2 and word_embedding.shape[1] == embed_dim, (
    "embed_dim={0} does not match embedding matrix shape {1}".format(embed_dim, word_embedding.shape))
assert q1_data.shape[1] == max_sentence_len and q2_data.shape[1] == max_sentence_len, (
    "max_sentence_len={0} does not match padded question length".format(max_sentence_len))

In [8]:
# Hold out 10% of the question pairs as a test set. Stacking both padded
# questions along axis 1 lets a single split keep each pair and its label
# together.
questions = np.stack([q1_data, q2_data], axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    questions,
    labels,
    test_size=0.1,
    random_state=2018,
)

# Undo the stacking: column 0 is question1, column 1 is question2.
Q1_train, Q2_train = X_train[:, 0], X_train[:, 1]
Q1_test, Q2_test = X_test[:, 0], X_test[:, 1]

### Define the model

In [17]:
def _encode_question(tokens):
    """Build one question branch on top of the given input tensor.

    Frozen embedding lookup (initialised from `word_embedding`), then a
    per-token Dense+ReLU, then a max over the sequence axis, giving one
    `embed_dim`-wide vector per question.
    """
    embedded = Embedding(input_dim=vocab_size,
                         output_dim=embed_dim,
                         weights=[word_embedding],
                         input_length=max_sentence_len,
                         trainable=False)(tokens)
    projected = TimeDistributed(Dense(embed_dim, activation='relu'))(embedded)
    return Lambda(lambda x: K.max(x, axis=1), output_shape=(embed_dim, ))(projected)


question1 = Input(shape=(max_sentence_len,))
question2 = Input(shape=(max_sentence_len,))

# Every call instantiates new layers, so the two branches have separate weights.
q1 = _encode_question(question1)
q2 = _encode_question(question2)

# Classifier head: four identical Dense(200) -> Dropout -> BatchNorm stages.
merged = concatenate([q1, q2])
for _ in range(4):
    merged = Dense(200, activation='relu')(merged)
    merged = Dropout(dropout_rate)(merged)
    merged = BatchNormalization()(merged)

# Single sigmoid unit as the model output.
is_duplicate = Dense(1, activation='sigmoid')(merged)

model = Model(inputs=[question1, question2], outputs=is_duplicate)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [10]:
# Print the layer-by-layer summary (output shapes, parameter counts, connections) of the model built above.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 25)           0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 25)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 25, 50)       1515000     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 25, 50)       1515000     input_2[0][0]                    
__________________________________________________________________________________________________
time_distr

### Train the model

In [18]:
# Training configuration.
n_epoch = 50        # passed to model.fit as `epochs`
val_split = 0.1     # passed to model.fit as `validation_split`
batch_size = 32
MODEL_WEIGHTS_FILE = 'SN_weights/pivots100_50d_weights.h5'

# The checkpoint callback writes to this path during training; make sure the
# target directory exists so the first save does not fail on a fresh checkout.
os.makedirs(os.path.dirname(MODEL_WEIGHTS_FILE), exist_ok=True)

In [19]:
print("Starting training at", datetime.datetime.now())
t0 = time.time()

# Checkpoint the weights only when the monitored 'val_acc' metric improves.
callbacks = [ModelCheckpoint(MODEL_WEIGHTS_FILE, monitor='val_acc', save_best_only=True)]
fit_options = dict(
    epochs=n_epoch,
    validation_split=val_split,
    verbose=2,
    batch_size=batch_size,
    callbacks=callbacks,
)
history = model.fit([Q1_train, Q2_train], y_train, **fit_options)

t1 = time.time()
print("Training ended at", datetime.datetime.now())
elapsed_minutes = (t1 - t0) / 60.
print("Minutes elapsed: %f" % elapsed_minutes)

Starting training at 2018-07-01 19:03:53.089696
Train on 327472 samples, validate on 36386 samples
Epoch 1/50
 - 56s - loss: 0.5395 - acc: 0.7268 - val_loss: 0.4892 - val_acc: 0.7552
Epoch 2/50
 - 59s - loss: 0.4947 - acc: 0.7538 - val_loss: 0.4647 - val_acc: 0.7724
Epoch 3/50
 - 64s - loss: 0.4782 - acc: 0.7632 - val_loss: 0.4580 - val_acc: 0.7762
Epoch 4/50
 - 61s - loss: 0.4682 - acc: 0.7704 - val_loss: 0.4500 - val_acc: 0.7834
Epoch 5/50
 - 61s - loss: 0.4621 - acc: 0.7733 - val_loss: 0.4457 - val_acc: 0.7822
Epoch 6/50
 - 64s - loss: 0.4556 - acc: 0.7770 - val_loss: 0.4450 - val_acc: 0.7855
Epoch 7/50
 - 64s - loss: 0.4504 - acc: 0.7807 - val_loss: 0.4427 - val_acc: 0.7855
Epoch 8/50
 - 65s - loss: 0.4472 - acc: 0.7839 - val_loss: 0.4348 - val_acc: 0.7903
Epoch 9/50
 - 61s - loss: 0.4434 - acc: 0.7853 - val_loss: 0.4382 - val_acc: 0.7891
Epoch 10/50
 - 63s - loss: 0.4409 - acc: 0.7867 - val_loss: 0.4333 - val_acc: 0.7905
Epoch 11/50
 - 66s - loss: 0.4380 - acc: 0.7888 - val_loss: 

### Plot training and validation accuracy

In [None]:
# One row per epoch, with the recorded epoch indices shifted by +1 for the x axis.
acc = pd.DataFrame({'epoch': [epoch + 1 for epoch in history.epoch],
                    'training': history.history['acc'],
                    'validation': history.history['val_acc']})

# figsize must be an ordered (width, height) pair in inches; the previous set
# literal {5,8} has no defined order. Landscape suits the epoch axis.
ax = acc.plot(x='epoch', figsize=(8, 5), grid=True)
ax.set_ylabel("accuracy")
ax.set_ylim([0.0, 1.0]);

### Evaluate the model with best validation accuracy on the test partition

In [22]:
# Restore the checkpointed weights before scoring on the held-out test split.
model.load_weights(MODEL_WEIGHTS_FILE)

test_inputs = [Q1_test, Q2_test]
loss, accuracy = model.evaluate(test_inputs, y_test, verbose=0)
print('loss = {0:.4f}, accuracy = {1:.4f}'.format(loss, accuracy))

loss = 0.4726, accuracy = 0.8038
