In [53]:
# Run this cell to mount your Google Drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# Siamese network
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from time import time
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.layers import Layer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
import itertools
from tensorflow.python.keras.models import Model, Sequential
from tensorflow.python.keras.layers import Input, Embedding, Lambda,LSTM, GRU, Conv1D, Conv2D, GlobalMaxPool1D, Dense, Dropout

In [0]:
class ManDist(Layer):
    """
    Keras Custom Layer that calculates Manhattan Distance.
    """

    # initialize the layer, No need to include inputs parameter!
    def __init__(self, **kwargs):
        self.result = None
        super(ManDist, self).__init__(**kwargs)

    # input_shape will automatic collect input shapes to build layer
    def build(self, input_shape):
        super(ManDist, self).build(input_shape)

    # This is where the layer's logic lives.
    def call(self, x, **kwargs):
        self.result = K.exp(-K.sum(K.abs(x[0] - x[1]), axis=1, keepdims=True))
        return self.result

    # return output shape
    def compute_output_shape(self, input_shape):
        return K.int_shape(self.result)

- https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/
- https://stackoverflow.com/questions/53496095/keras-reports-typeerror-unsupported-operand-types-for-nonetype-and-int

# With Embeddings

In [56]:
import pandas as pd
train = pd.read_csv("/content/drive/My Drive/Siamese/train.csv")
train = train.dropna()
train = train.drop(['id','qid1','qid2'],axis=1)
train.head()

Unnamed: 0,question1,question2,is_duplicate
0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [57]:
X_train, X_test, y_train, y_test = train_test_split(train.drop(['is_duplicate'],axis=1),train['is_duplicate'],test_size=0.2)
X_test.head()

Unnamed: 0,question1,question2
321132,What is the best thing in your life?,What's the best thing to do in life?
319109,What are some tips on making it through the jo...,What are some tips on making it through the jo...
311738,What do you think about the rapist Donald Trump?,What do you think about Donald Trump in Septem...
139448,How do we change the system that gave us such ...,What will happen if I don't have 75% attendanc...
10475,Did Trump injure a teacher with punch to face ...,Did Trump punch his elementary teacher in the ...


In [58]:
max_length = max(train.question1.map(lambda x: len(x.split())).max(),train.question2.map(lambda x: len(x.split())).max())
max_length

237

In [0]:
from keras.preprocessing.text import Tokenizer
t = Tokenizer()
t.fit_on_texts(train.question1)
t.fit_on_texts(train.question2)
vocab_size = len(t.word_index) + 1

In [60]:
vocab_size

95593

In [61]:
encoded_docs_train_left = t.texts_to_sequences(X_train.question1)
print(encoded_docs_train_left[0])
encoded_docs_train_right = t.texts_to_sequences(X_train.question2)
print(encoded_docs_train_right[0])

[36, 93, 458, 178, 128, 288, 8, 35]
[3, 19, 380, 258, 1308, 8, 484, 128, 33, 1133, 36, 19, 178, 60, 8, 17, 658, 8, 35]


In [62]:
encoded_docs_test_left = t.texts_to_sequences(X_test.question1)
print(encoded_docs_test_left[0])
encoded_docs_test_right = t.texts_to_sequences(X_test.question2)
print(encoded_docs_test_right[0])

[2, 3, 1, 18, 158, 8, 34, 61]
[75, 1, 18, 158, 7, 9, 8, 61]


In [63]:
padded_docs_train_left = pad_sequences(encoded_docs_train_left, maxlen=max_length, padding='post')
print(padded_docs_train_left.shape)
padded_docs_train_right = pad_sequences(encoded_docs_train_right, maxlen=max_length, padding='post')
print(padded_docs_train_right.shape)

(323429, 237)
(323429, 237)


In [64]:
padded_docs_test_left = pad_sequences(encoded_docs_test_left, maxlen=max_length, padding='post')
print(padded_docs_test_left.shape)
padded_docs_test_right = pad_sequences(encoded_docs_test_right, maxlen=max_length, padding='post')
print(padded_docs_test_right.shape)

(80858, 237)
(80858, 237)


In [65]:
# load the whole embedding into memory
embeddings_index = dict()
f = open('/content/drive/My Drive/Siamese/glove.6B.50d.txt')
for line in f:
	values = line.split()
	word = values[0]
	coefs = np.asarray(values[1:], dtype='float32')
	embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [0]:
# create a weight matrix for words in training docs
embedding_matrix = np.zeros((vocab_size, 50))
for word, i in t.word_index.items():
	embedding_vector = embeddings_index.get(word)
	if embedding_vector is not None:
		embedding_matrix[i] = embedding_vector

In [72]:
# Model variables
gpus = 2
batch_size = 1024 * gpus
n_epoch = 20
n_hidden = 50

# Define the shared model
x = Sequential()
x.add(Embedding(vocab_size,50, weights=[embedding_matrix], input_length=4, trainable=False))
x.add(LSTM(n_hidden))
x.add(Dropout(0.7))
shared_model = x

W0709 03:25:45.235687 140373054867328 nn_ops.py:4224] Large dropout rate: 0.7 (>0.5). In TensorFlow 2.x, dropout() uses dropout rate instead of keep_prob. Please ensure that this is intended.


In [0]:
# The visible layer
left_input = Input(shape=(max_length,), dtype='int32')
right_input = Input(shape=(max_length,), dtype='int32')

In [74]:
# Pack it all up into a Manhattan Distance model
malstm_distance = ManDist()([shared_model(left_input), shared_model(right_input)])
model = Model(inputs=[left_input, right_input], outputs=[malstm_distance])

W0709 03:25:45.453423 140373054867328 nn_ops.py:4224] Large dropout rate: 0.7 (>0.5). In TensorFlow 2.x, dropout() uses dropout rate instead of keep_prob. Please ensure that this is intended.
W0709 03:25:45.648065 140373054867328 nn_ops.py:4224] Large dropout rate: 0.7 (>0.5). In TensorFlow 2.x, dropout() uses dropout rate instead of keep_prob. Please ensure that this is intended.


In [75]:
model.compile(loss='mean_squared_error', optimizer=tf.keras.optimizers.Adam(), metrics=['accuracy'])
model.summary()
shared_model.summary()

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            [(None, 237)]        0                                            
__________________________________________________________________________________________________
input_8 (InputLayer)            [(None, 237)]        0                                            
__________________________________________________________________________________________________
sequential_3 (Sequential)       (None, 50)           4799850     input_7[0][0]                    
                                                                 input_8[0][0]                    
__________________________________________________________________________________________________
man_dist_3 (ManDist)            (None, 1)            0           sequential_3[1][0]         

In [76]:
# Start trainings
training_start_time = time()
malstm_trained = model.fit([padded_docs_train_left,padded_docs_train_right], y_train,
                           batch_size=batch_size, epochs=n_epoch, verbose=2)
training_end_time = time()
print("Training time finished.\n%d epochs in %12.2f" % (n_epoch,training_end_time - training_start_time))

Epoch 1/20
323429/323429 - 128s - loss: 0.2470 - acc: 0.6091
Epoch 2/20
323429/323429 - 126s - loss: 0.2369 - acc: 0.6226
Epoch 3/20
323429/323429 - 125s - loss: 0.2370 - acc: 0.6224
Epoch 4/20
323429/323429 - 126s - loss: 0.2373 - acc: 0.6226
Epoch 5/20
323429/323429 - 125s - loss: 0.2371 - acc: 0.6229
Epoch 6/20
323429/323429 - 125s - loss: 0.2370 - acc: 0.6228
Epoch 7/20
323429/323429 - 126s - loss: 0.2369 - acc: 0.6228
Epoch 8/20
323429/323429 - 126s - loss: 0.2370 - acc: 0.6225
Epoch 9/20
323429/323429 - 125s - loss: 0.2369 - acc: 0.6227
Epoch 10/20
323429/323429 - 125s - loss: 0.2371 - acc: 0.6225
Epoch 11/20
323429/323429 - 126s - loss: 0.2371 - acc: 0.6224
Epoch 12/20
323429/323429 - 123s - loss: 0.2371 - acc: 0.6223
Epoch 13/20
323429/323429 - 127s - loss: 0.2370 - acc: 0.6229
Epoch 14/20
323429/323429 - 125s - loss: 0.2369 - acc: 0.6229
Epoch 15/20
323429/323429 - 126s - loss: 0.2371 - acc: 0.6227
Epoch 16/20
323429/323429 - 125s - loss: 0.2369 - acc: 0.6226
Epoch 17/20
32342

In [78]:
prediction = model.predict([padded_docs_test_left,padded_docs_test_right],verbose=1)
print(prediction[0:5])

[[1.]
 [1.]
 [1.]
 [1.]
 [1.]]


In [0]:
y_test

In [80]:
loss, accuracy = model.evaluate([padded_docs_test_left,padded_docs_test_right], y_test, verbose=1)
print('Accuracy: %f' % (accuracy*100))

Accuracy: 36.802790


In [81]:
from collections import Counter 
Counter(y_train)

Counter({0: 203921, 1: 119508})

In [0]:
for i in prediction:
  if(i==0):
    print(0)