In [29]:
import pandas as pd
import numpy as np
import re
import gensim.downloader
import itertools
import datetime
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, Input, LSTM, GRU, Lambda, Dense, Dropout
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adadelta
import tensorflow.keras.backend as K
from tensorflow import keras
from time import time
from sklearn.metrics import precision_score
import matplotlib.pyplot as plt
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [30]:
# Load the Quora question-pair dataset from a tab-separated file.
# - dtype forces qid1/qid2 to be read as strings rather than inferred numbers.
# - on_bad_lines='skip' silently drops malformed lines instead of raising,
#   so the resulting frame may hold fewer rows than the file has lines.
# - low_memory=False parses the file in one pass (no chunked type inference).
# NOTE(review): encoding is pinned to ISO-8859-1 - confirm this matches the
# file on disk; a UTF-8 file read this way garbles non-ASCII characters.
df = pd.read_csv(
    '../data/quora_duplicate_questions.tsv',
    sep='\t',
    encoding='ISO-8859-1',
    on_bad_lines='skip',
    dtype={'qid1': str, 'qid2': str},
    low_memory=False
)
# Preview the first five rows.
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0.0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0.0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0.0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0.0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0.0


In [31]:
# (rows, columns) of df.
df.shape

(291177, 6)

In [32]:
# Per-column dtype and non-null count summary.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 291177 entries, 0 to 291176
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   id            291177 non-null  object 
 1   qid1          291174 non-null  object 
 2   qid2          291158 non-null  object 
 3   question1     291144 non-null  object 
 4   question2     291123 non-null  object 
 5   is_duplicate  291111 non-null  float64
dtypes: float64(1), object(5)
memory usage: 13.3+ MB


In [33]:
# Number of rows that exactly repeat an earlier row (all columns equal).
df.duplicated().sum()

5

In [34]:
# Missing-value count for each column.
df.isna().sum()

id               0
qid1             3
qid2            19
question1       33
question2       54
is_duplicate    66
dtype: int64

In [35]:
# Remove exact duplicate rows, then discard every pair that is missing either
# question text or its label.
df = (
    df
    .drop_duplicates()
    .dropna(subset=['question1', 'question2', 'is_duplicate'])
)

In [36]:
def clean(text):
    """Normalise a question string for tokenisation.

    The input is coerced to ``str`` and lower-cased, then every character that
    is not a lowercase ASCII letter, a digit or whitespace is deleted (not
    replaced by a space, so ``"koh-i-noor"`` becomes ``"kohinoor"``).

    Args:
        text: Any object; it is converted with ``str()`` first.

    Returns:
        The cleaned, lower-cased string.
    """
    lowered = str(text).lower()
    return re.sub(r"[^a-z0-9\s]", "", lowered)

In [37]:
# Normalise both question columns with clean(), overwriting the raw text,
# then preview the result.
for question_col in ('question1', 'question2'):
    df[question_col] = df[question_col].apply(clean)
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,what is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...,0.0
1,1,3,4,what is the story of kohinoor kohinoor diamond,what would happen if the indian government sto...,0.0
2,2,5,6,how can i increase the speed of my internet co...,how can internet speed be increased by hacking...,0.0
3,3,7,8,why am i mentally very lonely how can i solve it,find the remainder when math2324math is divide...,0.0
4,4,9,10,which one dissolve in water quikly sugar salt ...,which fish would survive in salt water,0.0


In [14]:
# Fit one tokenizer on the text of both question columns so that question1
# and question2 share a single vocabulary / index space.
questions = pd.concat([df[name] for name in ('question1', 'question2')])
tokenizer = Tokenizer()
tokenizer.fit_on_texts(questions)
# Mapping token -> integer index produced by the fitted tokenizer.
word_index = tokenizer.word_index

In [38]:
# Every question is converted to a sequence of token indices and
# padded/truncated to exactly `maxlen` entries.
maxlen = 50
q1_seq, q2_seq = (
    pad_sequences(tokenizer.texts_to_sequences(df[name]), maxlen=maxlen)
    for name in ('question1', 'question2')
)
# Target vector, row-aligned with q1_seq / q2_seq.
labels = df['is_duplicate'].to_numpy()

In [16]:
# Fetch (or load from gensim's local cache) the pretrained Google News
# word2vec vectors.
print("Loading Word2Vec (Google News 300d)...")
vectors = gensim.downloader.load('word2vec-google-news-300')
# Take the dimensionality from the loaded vectors instead of hard-coding it,
# so the embedding matrix always matches whichever model was loaded.
embedding_dim = vectors.vector_size

Loading Word2Vec (Google News 300d)...


In [18]:
# Row i holds the pretrained vector for the token whose tokenizer index is i.
# Row 0 (the padding index, never assigned by the Keras Tokenizer) and tokens
# missing from the pretrained vocabulary stay all-zero.
# Allocated as float32 rather than NumPy's float64 default: the matrix is only
# used as Keras layer weights (float32 by default), so this halves its memory
# footprint.
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim), dtype=np.float32)
for word, i in word_index.items():
    if word in vectors:
        embedding_matrix[i] = vectors[word]

In [19]:
# Hold out 10% of the question pairs for validation. The three arrays are
# split with the same shuffle so rows stay aligned, and the fixed seed keeps
# the split reproducible.
(q1_train, q1_val,
 q2_train, q2_val,
 y_train, y_val) = train_test_split(
    q1_seq,
    q2_seq,
    labels,
    random_state=42,
    test_size=0.1,
)

In [20]:
def exponent_neg_manhattan_distance(left, right):
    """Return ``exp(-d)`` where ``d`` is the L1 distance between the inputs.

    The absolute element-wise differences are summed over axis 1 with
    ``keepdims=True``, so the reduced axis is kept with length 1. Identical
    inputs give 1; the value decays towards 0 as the distance grows.

    Args:
        left: First tensor.
        right: Second tensor, subtracted element-wise from ``left``.

    Returns:
        Tensor of ``exp(-sum(|left - right|))`` along axis 1.
    """
    l1_distance = K.sum(K.abs(left - right), axis=1, keepdims=True)
    return K.exp(-l1_distance)

In [39]:
def build_siamese_model(rnn_layer, units=100, dense_units=64, dropout_rate=0.2):
    """Build and compile a Siamese recurrent model for question-pair matching.

    Both inputs pass through the same frozen, pretrained embedding layer and
    the same recurrent encoder (weights are shared between the two branches).
    The two encodings are compared with ``exponent_neg_manhattan_distance``
    and the resulting similarity feeds a small dense head that ends in a
    single sigmoid unit.

    Reads the module-level ``maxlen``, ``word_index``, ``embedding_dim`` and
    ``embedding_matrix``.

    Args:
        rnn_layer: Recurrent layer class (e.g. ``LSTM`` or ``GRU``); it is
            instantiated as ``rnn_layer(units)``.
        units: Size of the shared recurrent encoder's output.
        dense_units: Width of the hidden dense layer after the similarity.
        dropout_rate: Dropout rate applied after the hidden dense layer.

    Returns:
        A ``Model`` taking ``[input_a, input_b]`` and producing one sigmoid
        output, compiled with binary cross-entropy, Adam and accuracy.
    """
    input_a = Input(shape=(maxlen,))
    input_b = Input(shape=(maxlen,))

    # A single embedding instance is applied to both inputs; it is initialised
    # from the pretrained matrix and kept frozen during training.
    embedding_layer = Embedding(
        input_dim=len(word_index) + 1,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        trainable=False
    )
    encoded_a = embedding_layer(input_a)
    encoded_b = embedding_layer(input_b)

    # One recurrent layer instance reused for both branches => shared weights.
    shared_rnn = rnn_layer(units)
    out_a = shared_rnn(encoded_a)
    out_b = shared_rnn(encoded_b)

    # Similarity of the two encodings: exp(-L1 distance).
    malstm_dist = Lambda(lambda x: exponent_neg_manhattan_distance(x[0], x[1]))([out_a, out_b])

    dense = Dense(dense_units, activation='relu')(malstm_dist)
    drop = Dropout(dropout_rate)(dense)
    output = Dense(1, activation='sigmoid')(drop)

    model = Model([input_a, input_b], output)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [24]:
# Build one model per recurrent cell type (LSTM first, then GRU); everything
# else about the two architectures is the same.
siamese_lstm, siamese_gru = (build_siamese_model(cell) for cell in (LSTM, GRU))

In [40]:
def _make_callbacks(checkpoint_path):
    """Return the training callbacks for one model.

    The list holds a ``ModelCheckpoint`` that writes the best model (lowest
    ``val_loss``) to ``checkpoint_path`` and an ``EarlyStopping`` that stops
    after 3 epochs without ``val_loss`` improvement and restores the best
    weights.
    """
    checkpoint = ModelCheckpoint(checkpoint_path, save_best_only=True, monitor='val_loss', mode='min')
    early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
    return [checkpoint, early_stop]


callbacks_lstm = _make_callbacks('siamese_lstm_best.keras')
callbacks_gru = _make_callbacks('siamese_gru_best.keras')

In [27]:
# Train the LSTM variant and report the wall-clock training time.
print("Training Siamese LSTM...")
lstm_start = time()
history_lstm = siamese_lstm.fit(
    x=[q1_train, q2_train],
    y=y_train,
    validation_data=([q1_val, q2_val], y_val),
    epochs=25,
    batch_size=128,
    callbacks=callbacks_lstm,
)
lstm_elapsed = datetime.timedelta(seconds=time() - lstm_start)
print(f"Siamese LSTM training finished in {lstm_elapsed}")

Training Siamese LSTM...
Epoch 1/25
[1m2815/4094[0m [32m━━━━━━━━━━━━━[0m[37m━━━━━━━[0m [1m2:07[0m 100ms/step - accuracy: 0.7044 - loss: 0.5779

KeyboardInterrupt: 

In [None]:
# Train the GRU variant and report the wall-clock training time.
print("Training Siamese GRU...")
gru_start = time()
history_gru = siamese_gru.fit(
    [q1_train, q2_train], y_train,
    batch_size=128,
    epochs=25,
    validation_data=([q1_val, q2_val], y_val),
    callbacks=callbacks_gru,
)
# Elapsed time is printed as a datetime.timedelta, the same format the LSTM
# training cell uses, so the two timings can be compared directly.
print(f"Siamese GRU training finished in {datetime.timedelta(seconds=time()-gru_start)}")

In [None]:
# Score the LSTM model on the held-out validation pairs.
print("Evaluating LSTM...")
lstm_preds = siamese_lstm.predict([q1_val, q2_val])
# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions.
lstm_preds_bin = (lstm_preds > 0.5).astype(int)
# Flatten the predictions to 1-D before comparing them with the labels.
lstm_acc = np.mean(lstm_preds_bin.ravel() == y_val)
lstm_prec = precision_score(y_val, lstm_preds_bin)
print(f"Siamese LSTM: Accuracy = {lstm_acc:.4f}, Precision = {lstm_prec:.4f}")

In [None]:
# Score the GRU model on the held-out validation pairs.
print("Evaluating GRU...")
gru_preds = siamese_gru.predict([q1_val, q2_val])
# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions.
gru_preds_bin = (gru_preds > 0.5).astype(int)
# Flatten the predictions to 1-D before comparing them with the labels.
gru_acc = np.mean(gru_preds_bin.ravel() == y_val)
gru_prec = precision_score(y_val, gru_preds_bin)
print(f"Siamese GRU:  Accuracy = {gru_acc:.4f}, Precision = {gru_prec:.4f}")

In [None]:
# Side-by-side comparison of the two models' per-epoch validation curves.
fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: validation accuracy.
ax_acc.plot(history_lstm.history['val_accuracy'], label='LSTM val_acc')
ax_acc.plot(history_gru.history['val_accuracy'], label='GRU val_acc')
ax_acc.set_xlabel('Epoch')
ax_acc.set_ylabel('Validation Accuracy')
ax_acc.legend()
ax_acc.set_title('Validation Accuracy Comparison')

# Right panel: validation loss.
ax_loss.plot(history_lstm.history['val_loss'], label='LSTM val_loss')
ax_loss.plot(history_gru.history['val_loss'], label='GRU val_loss')
ax_loss.set_xlabel('Epoch')
ax_loss.set_ylabel('Validation Loss')
ax_loss.legend()
ax_loss.set_title('Validation Loss Comparison')

fig.tight_layout()
plt.show()