In [None]:
from google.colab import drive
drive.mount('/content/drive')

## Importing relevant modules

In [20]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from nltk.tokenize import word_tokenize
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model

In [21]:
df = pd.read_csv('/content/drive/MyDrive/questions.csv')

In [22]:
labels = df["is_duplicate"]
labels

0         0
1         0
2         0
3         0
4         0
         ..
404346    0
404347    1
404348    0
404349    0
404350    0
Name: is_duplicate, Length: 404351, dtype: int64

## Splitting data into train, validation and test

In [23]:
from sklearn.model_selection import train_test_split
X=df.to_numpy()
y=np.asarray(labels)
X_train,X_test,y_train,y_test=train_test_split(df,labels,test_size=0.1,random_state=45)
X_train, X_validate, y_train, y_validate = train_test_split(X_train, y_train, test_size=0.2, random_state=45)

In [24]:
print(X_train.shape)
print(X_validate.shape)
print(X_test.shape)
print(y_train.shape)
print(y_validate.shape)
print(y_test.shape)

(291132, 6)
(72783, 6)
(40436, 6)
(291132,)
(72783,)
(40436,)


## Setting up Keras tokenizer

In [25]:
MAX_NB_WORDS = 200
tokenizer = Tokenizer(num_words = MAX_NB_WORDS)
tokenizer.fit_on_texts(list(X_train['question1'].values.astype(str))+list(X_train['question2'].values.astype(str)))

## Creating feature vectors

In [26]:
X_train_q1 = tokenizer.texts_to_sequences(X_train['question1'].values.astype(str))
X_train_q1 = pad_sequences(X_train_q1, maxlen = 30, padding='post')
X_train_q2 = tokenizer.texts_to_sequences(X_train['question2'].values.astype(str))
X_train_q2 = pad_sequences(X_train_q2, maxlen = 30, padding='post')

In [27]:
X_validate_q1 = tokenizer.texts_to_sequences(X_validate['question1'].values.astype(str))
X_validate_q1 = pad_sequences(X_validate_q1, maxlen = 30, padding='post')
X_validate_q2 = tokenizer.texts_to_sequences(X_validate['question2'].values.astype(str))
X_validate_q2 = pad_sequences(X_validate_q2, maxlen = 30, padding='post')

In [28]:
X_test_q1 = tokenizer.texts_to_sequences(X_test['question1'].values.astype(str))
X_test_q1 = pad_sequences(X_test_q1,maxlen = 30, padding='post')

X_test_q2 = tokenizer.texts_to_sequences(X_test['question2'].values.astype(str))
X_test_q2 = pad_sequences(X_test_q2, maxlen = 30, padding='post')



## Creating dictionary of words

In [29]:
word_index = tokenizer.word_index
word_index

{'the': 1,
 'what': 2,
 'is': 3,
 'how': 4,
 'i': 5,
 'a': 6,
 'to': 7,
 'in': 8,
 'do': 9,
 'of': 10,
 'are': 11,
 'and': 12,
 'can': 13,
 'for': 14,
 'you': 15,
 'why': 16,
 'best': 17,
 'my': 18,
 'it': 19,
 'on': 20,
 'does': 21,
 'or': 22,
 'which': 23,
 'if': 24,
 'be': 25,
 'some': 26,
 'have': 27,
 'that': 28,
 'with': 29,
 'get': 30,
 'should': 31,
 'an': 32,
 'from': 33,
 'your': 34,
 'india': 35,
 'will': 36,
 'when': 37,
 'people': 38,
 'like': 39,
 'who': 40,
 'at': 41,
 'good': 42,
 'would': 43,
 'there': 44,
 'as': 45,
 'about': 46,
 'not': 47,
 'between': 48,
 'one': 49,
 'most': 50,
 'we': 51,
 'make': 52,
 'did': 53,
 'quora': 54,
 'way': 55,
 'where': 56,
 'by': 57,
 'any': 58,
 'was': 59,
 'life': 60,
 'me': 61,
 'so': 62,
 'after': 63,
 'time': 64,
 'they': 65,
 'this': 66,
 'money': 67,
 'know': 68,
 'difference': 69,
 'has': 70,
 'learn': 71,
 'am': 72,
 'new': 73,
 'much': 74,
 "what's": 75,
 'use': 76,
 'their': 77,
 'think': 78,
 'many': 79,
 'work': 80,
 'all

## Creating embedding matrix

In [30]:
embedding_index = {}

with open('/content/drive/My Drive/glove.6B.200d.txt') as f :
    for line in f:
        values = line.split()
        word = values[0]
        vectors = np.asarray(values[1:], 'float32')
        embedding_index[word] = vectors
    f.close()



In [31]:
embedding_matrix = np.random.random((len(word_index)+1, 200))
for word, i in word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector


In [32]:
embedding_matrix.shape

(82195, 200)

## Creating model architectures

In [33]:
# Model for Q1
import tensorflow as tf
from tensorflow.keras.layers import BatchNormalization
model_q1 = tf.keras.Sequential()
model_q1.add(Embedding(input_dim = len(word_index)+1,
                       output_dim = 200,
                      weights = [embedding_matrix],
                      input_length = 30))
model_q1.add(LSTM(128, activation = 'tanh', return_sequences = True))
model_q1.add(Dropout(0.2))
model_q1.add(LSTM(128, return_sequences = True))
model_q1.add(LSTM(128))
model_q1.add(Dense(60, activation = 'tanh'))
model_q1.add(Dense(2, activation = 'sigmoid'))

In [34]:
# Model for Q2
model_q2 = tf.keras.Sequential()
model_q2.add(Embedding(input_dim = len(word_index)+1,
                       output_dim = 200,
                      weights = [embedding_matrix],
                      input_length = 30))
model_q2.add(LSTM(128, activation = 'tanh', return_sequences = True))
model_q2.add(Dropout(0.2))
model_q2.add(LSTM(128, return_sequences = True))
model_q2.add(LSTM(128))
model_q2.add(Dense(60, activation = 'tanh'))
model_q2.add(Dense(2, activation = 'sigmoid'))

In [35]:
# Merging the output of the two models,i.e, model_q1 and model_q2
mergedOut = Multiply()([model_q1.output, model_q2.output])

mergedOut = Flatten()(mergedOut)
mergedOut = Dense(100, activation = 'relu')(mergedOut)
mergedOut = Dropout(0.2)(mergedOut)
mergedOut = Dense(50, activation = 'relu')(mergedOut)
mergedOut = Dropout(0.2)(mergedOut)
mergedOut = Dense(2, activation = 'sigmoid')(mergedOut)

## Fitting with various parameters

#### Adam with 2 epochs

In [37]:
new_model = Model([model_q1.input, model_q2.input], mergedOut)
new_model.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy',
                 metrics = ['accuracy'])
history = new_model.fit([X_train_q1,X_train_q2],y_train, batch_size = 2000, epochs = 2, validation_data=([X_validate_q1, X_validate_q2], y_validate))



Epoch 1/2
Epoch 2/2


#### Adam with 5 epochs

In [None]:
new_model = Model([model_q1.input, model_q2.input], mergedOut)
new_model.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy',
                 metrics = ['accuracy'])
history = new_model.fit([X_train_q1,X_train_q2],y_train, batch_size = 2000, epochs = 5, validation_data=([X_validate_q1, X_validate_q2], y_validate))



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


#### Adam with 20 epochs

In [None]:


new_model = Model([model_q1.input, model_q2.input], mergedOut)
new_model.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy',
                 metrics = ['accuracy'])
history = new_model.fit([X_train_q1,X_train_q2],y_train, batch_size = 2000, epochs = 20)



Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


### Testing accuracy with 20 epochs in Adam

In [None]:
test_loss, test_acc = new_model.evaluate([X_test_q1, X_test_q2], y_test)
print('Test Loss: {}'.format(test_loss))
print('Test Accuracy: {}'.format(test_acc))

Test Loss: 0.5502656102180481
Test Accuracy: 0.7284317016601562


#### SGD with 2 epochs

In [38]:
new_model2 = Model([model_q1.input, model_q2.input], mergedOut)
new_model2.compile(optimizer = 'SGD', loss = 'sparse_categorical_crossentropy',
                 metrics = ['accuracy'])
history2 = new_model2.fit([X_train_q1,X_train_q2],y_train, batch_size = 2000, epochs = 2, validation_data=([X_validate_q1, X_validate_q2], y_validate))

Epoch 1/2
Epoch 2/2


### SGD with 10 epochs

In [39]:
new_model3 = Model([model_q1.input, model_q2.input], mergedOut)
new_model3.compile(optimizer = 'SGD', loss = 'sparse_categorical_crossentropy',
                 metrics = ['accuracy'])
history3 = new_model2.fit([X_train_q1,X_train_q2],y_train, batch_size = 2000, epochs = 10, validation_data=([X_validate_q1, X_validate_q2], y_validate))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### Testing accuracy with 10 epochs in SGD

In [48]:
test_loss, test_acc = new_model2.evaluate([X_test_q1, X_test_q2], y_test)
print('Test Loss: {}'.format(test_loss))
print('Test Accuracy: {}'.format(test_acc))

Test Loss: 0.5923463702201843
Test Accuracy: 0.685799777507782
