## Data preprocessing

In [23]:
import pandas as pd 

# Load the sentence-pair dataset from the CSV file
csv_path = 'sample_data.csv'
df = pd.read_csv(csv_path)

# Preview the first rows to check which columns were read
df.head()

Unnamed: 0,index,sentences1,sentences2,is_similar
0,0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


### Normalizing data

In [24]:
import re

# Function to clean sentences: remove punctuation and convert to lowercase
def clean_sentence(sentence):
    """Normalize a sentence by stripping punctuation and lowercasing it.

    Args:
        sentence: Text to clean. ``None`` and NaN (the value pandas uses for
            a missing CSV cell) are treated as an empty sentence; any other
            non-string value is converted with ``str()`` before cleaning.

    Returns:
        str: The lowercased text with every character removed that is
        neither a word character (letter, digit, underscore) nor whitespace.
    """
    # Missing values would make re.sub raise TypeError; NaN is detected by
    # the fact that it is the only float not equal to itself.
    if sentence is None or (isinstance(sentence, float) and sentence != sentence):
        return ''
    if not isinstance(sentence, str):
        sentence = str(sentence)
    sentence = re.sub(r'[^\w\s]', '', sentence)  # Remove punctuation
    sentence = sentence.lower()  # Convert to lowercase
    return sentence

# Derive a cleaned copy of each sentence column, keeping the raw text as-is
for raw_col, clean_col in (('sentences1', 'clean_sentence1'),
                           ('sentences2', 'clean_sentence2')):
    df[clean_col] = df[raw_col].apply(clean_sentence)

df.head()

Unnamed: 0,index,sentences1,sentences2,is_similar,clean_sentence1,clean_sentence2
0,0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,what is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...
1,1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,what is the story of kohinoor kohinoor diamond,what would happen if the indian government sto...
2,2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,how can i increase the speed of my internet co...,how can internet speed be increased by hacking...
3,3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,why am i mentally very lonely how can i solve it,find the remainder when math2324math is divide...
4,4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,which one dissolve in water quikly sugar salt ...,which fish would survive in salt water


## Neural Network Architecture

In [32]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import numpy as np
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, Lambda, Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K

# Combine all sentences so both columns share a single vocabulary
all_sentences = df['clean_sentence1'].tolist() + df['clean_sentence2'].tolist()

# Initialize the tokenizer and fit it on every sentence
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_sentences)

# Convert sentences to sequences of integer word ids
sequences1 = tokenizer.texts_to_sequences(df['clean_sentence1'])
sequences2 = tokenizer.texts_to_sequences(df['clean_sentence2'])

# Pad the sequences at the end ('post') up to the longest sequence in either column
max_length = max(len(seq) for seq in sequences1 + sequences2)
padded_sequences1 = pad_sequences(sequences1, maxlen=max_length, padding='post')
padded_sequences2 = pad_sequences(sequences2, maxlen=max_length, padding='post')

# Convert labels to numpy array
labels = np.array(df['is_similar'])

# Split the data into training, validation, and testing sets (80-10-10 split).
# Both sentence arrays and the labels go through the SAME call, so every
# (sentence1, sentence2, label) triple is guaranteed to land in the same
# subset instead of relying on two separate calls happening to shuffle alike.
X_train1, X_temp1, X_train2, X_temp2, y_train, y_temp = train_test_split(
    padded_sequences1, padded_sequences2, labels, test_size=0.2, random_state=42
)

# Halve the held-out 20% into validation and test, again in a single call
X_val1, X_test1, X_val2, X_test2, y_val, y_test = train_test_split(
    X_temp1, X_temp2, y_temp, test_size=0.5, random_state=42
)

# Define the LSTM model
def create_base_network(input_shape):
    """Build the shared sentence encoder used by both Siamese branches.

    The encoder stacks an embedding, two LSTM layers and a small dense head
    (with dropout in between) and ends in a 64-unit ReLU layer.

    Args:
        input_shape: Shape tuple handed to ``Input`` (without the batch axis).

    Returns:
        Model: Keras model mapping the input tensor to the encoded vector.
    """
    # Vocabulary size comes from the notebook-level tokenizer; +1 leaves room
    # for index 0, which the tokenizer's word_index never assigns to a word.
    vocab_size = len(tokenizer.word_index) + 1

    # NOTE(review): the embedding is created without mask_zero, so if the
    # inputs are zero-padded the LSTMs also read the padding - confirm intended.
    tokens = Input(shape=input_shape)
    features = tokens
    for layer in (
        Embedding(input_dim=vocab_size, output_dim=128),
        LSTM(64, return_sequences=True),
        LSTM(64),
        Dense(128, activation='relu'),
        Dropout(0.2),
        Dense(64, activation='relu'),
    ):
        features = layer(features)
    return Model(tokens, features)

def create_bidirectional_base_network(input_shape):
    """Build an alternative encoder: embedding followed by a bidirectional LSTM.

    Args:
        input_shape: Shape tuple handed to ``Input`` (without the batch axis).

    Returns:
        Model: Keras model mapping the input tensor to the output of the
        bidirectional LSTM (64 units per direction, final step only).
    """
    tokens = Input(shape=input_shape)
    # Vocabulary size is taken from the notebook-level tokenizer
    embedding = Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128)
    encoder = Bidirectional(LSTM(units=64, return_sequences=False))
    return Model(tokens, encoder(embedding(tokens)))

# Build the Siamese network: one encoder instance whose weights are shared
# by both sentence branches
input_shape = (max_length, )
base_network = create_base_network(input_shape)

# One input tensor per sentence of the pair
input_a, input_b = (Input(shape=input_shape) for _ in range(2))

# Run both inputs through the same encoder
processed_a, processed_b = (base_network(branch) for branch in (input_a, input_b))

# Compute the Euclidean distance between the two vectors
def euclidean_distance(vectors):
    """Return the Euclidean distance between two batches of vectors.

    Args:
        vectors: Pair ``(x, y)`` of tensors to compare.

    Returns:
        Tensor holding the square root of the squared differences summed
        over axis 1 (that axis is kept with size 1).
    """
    left, right = vectors
    squared_gap = K.square(left - right)
    total = K.sum(squared_gap, axis=1, keepdims=True)
    # Clamp to epsilon so the square root is never taken at exactly zero
    clamped = K.maximum(total, K.epsilon())
    return K.sqrt(clamped)

# Compute the cosine distance between the two vectors
def cosine_distance(vectors):
    """Return the cosine distance (1 - cosine similarity) of two vector batches.

    Args:
        vectors: Pair ``(x, y)`` of tensors to compare.

    Returns:
        Tensor with one minus the dot product of the L2-normalized inputs,
        summed over the last axis (that axis is kept with size 1).
    """
    # Normalize both sides to unit length along the last axis
    unit_a, unit_b = (K.l2_normalize(v, axis=-1) for v in vectors)
    similarity = K.sum(unit_a * unit_b, axis=-1, keepdims=True)
    return 1 - similarity

def l1_norm(vectors):
    """Return one minus the element-wise absolute difference of two tensors.

    Despite the name, no reduction is applied: the result has one value per
    element of ``x - y`` rather than a single norm per vector.

    Args:
        vectors: Pair ``(x, y)`` of tensors to compare.

    Returns:
        Tensor equal to ``1 - |x - y|``, computed element-wise.
    """
    first, second = vectors
    gap = K.abs(first - second)
    return 1 - gap

# Siamese head: distance between the two encoded sentences, passed through a
# one-unit sigmoid layer to produce an output in (0, 1)
distance = Lambda(euclidean_distance)([processed_a, processed_b])
output = Dense(1, activation='sigmoid')(distance)

model = Model(inputs=[input_a, input_b], outputs=output)

# Binary classification setup: cross-entropy loss, Adam optimizer, accuracy metric
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Fit on the training pairs, evaluating on the validation pairs after each epoch
train_inputs = [X_train1, X_train2]
val_inputs = [X_val1, X_val2]
history = model.fit(
    train_inputs,
    y_train,
    validation_data=(val_inputs, y_val),
    epochs=15,
    batch_size=50,
)

# Score the trained model on the held-out test pairs
loss, accuracy = model.evaluate([X_test1, X_test2], y_test)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')


Epoch 1/15
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 236ms/step - accuracy: 0.3703 - loss: 0.6970 - val_accuracy: 0.6400 - val_loss: 0.6923
Epoch 2/15
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 107ms/step - accuracy: 0.4528 - loss: 0.6933 - val_accuracy: 0.6600 - val_loss: 0.6911
Epoch 3/15
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 115ms/step - accuracy: 0.5860 - loss: 0.6920 - val_accuracy: 0.6600 - val_loss: 0.6901
Epoch 4/15
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 116ms/step - accuracy: 0.6205 - loss: 0.6907 - val_accuracy: 0.6600 - val_loss: 0.6890
Epoch 5/15
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 122ms/step - accuracy: 0.5870 - loss: 0.6908 - val_accuracy: 0.6600 - val_loss: 0.6881
Epoch 6/15
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 111ms/step - accuracy: 0.6166 - loss: 0.6893 - val_accuracy: 0.6600 - val_loss: 0.6871
Epoch 7/15
[1m8/8[0m [32m━━━━━━━━━━━━