# 1. Imports


In [4]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input
from tensorflow.keras.models import Model

# 2. Data Loading
## toy dataset


In [5]:
Q1_train = ["How are you?", "What is AI?", "What is your name?"]
Q2_train = ["How do you do?", "Define artificial intelligence", "Tell me your name"]
y_train = [1, 1, 1]  # duplicates

Q1_test = ["How old are you?", "What is ML?"]
Q2_test = ["What’s your age?", "What is machine learning?"]
y_test = [1, 1]

# 3. Tokenization + Padding

In [6]:
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(Q1_train + Q2_train + Q1_test + Q2_test)
max_len = 10

def tokenize_pad(texts):
    seq = tokenizer.texts_to_sequences(texts)
    return tf.keras.preprocessing.sequence.pad_sequences(seq, maxlen=max_len, padding='post')

X1_train = tokenize_pad(Q1_train)
X2_train = tokenize_pad(Q2_train)
X1_test = tokenize_pad(Q1_test)
X2_test = tokenize_pad(Q2_test)

# 4. Siamese Model

In [7]:
embedding_dim = 50
lstm_dim = 64

# Inputs
input_q1 = Input(shape=(max_len,))
input_q2 = Input(shape=(max_len,))

embedding = Embedding(input_dim=len(tokenizer.word_index)+1,
                      output_dim=embedding_dim, input_length=max_len)

# Shared LSTM encoder
shared_lstm = LSTM(lstm_dim)

v1 = shared_lstm(embedding(input_q1))
v2 = shared_lstm(embedding(input_q2))

# Outputs
output = tf.keras.layers.Dot(axes=1, normalize=True)([v1, v2])  # cosine similarity
model = Model(inputs=[input_q1, input_q2], outputs=output)

model.compile(optimizer='adam', loss='mse', metrics=['accuracy'])
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 10)]         0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 10)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 10, 50)       1050        ['input_1[0][0]',                
                                                                  'input_2[0][0]']                
                                                                                                  
 lstm (LSTM)                    (None, 64)           29440       ['embedding[0][0]',          

# 5. Training

In [8]:
history = model.fit([X1_train, X2_train], np.array(y_train, dtype=np.float32),
                    epochs=10, batch_size=2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# 6. Evaluation

In [9]:
loss, acc = model.evaluate([X1_test, X2_test], np.array(y_test, dtype=np.float32))
print(f"Test Accuracy: {acc:.4f}")

Test Accuracy: 1.0000


# 7. Prediction helper

In [10]:
def predict(question1, question2, threshold=0.7, verbose=True):
    X1 = tokenize_pad([question1])
    X2 = tokenize_pad([question2])
    d = model.predict([X1, X2])[0][0]
    res = d > threshold
    
    if verbose:
        print("Q1:", question1)
        print("Q2:", question2)
        print("Similarity:", d)
        print("Duplicate?", res)
    return res

# 8. Test

In [11]:
predict("What is AI?", "Define artificial intelligence")
predict("How old are you?", "What’s your age?")
predict("What is AI?", "What is your name?")

Q1: What is AI?
Q2: Define artificial intelligence
Similarity: 0.9996928
Duplicate? True
Q1: How old are you?
Q2: What’s your age?
Similarity: 0.9989886
Duplicate? True
Q1: What is AI?
Q2: What is your name?
Similarity: 0.99902123
Duplicate? True


True