<a href="https://colab.research.google.com/github/FarrahTharwat/Sentence-Similarity/blob/main/Sentence_Similarity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Libraries**

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input, Lambda
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

**Sentence pairs and similarity scores**

In [None]:
# Sample sentences and their similarity scores.
# Each entry is a tuple (first sentence, second sentence, similarity score).
# The scores listed here lie between 0.1 and 0.9: paraphrases are given high
# scores, contradictory or unrelated pairs low ones.
sentences = [
    ("The sun is shining", "The weather is beautiful today", 0.8),
    ("The sun is shining", "It is raining cats and dogs", 0.2),
    ("The car is red", "The apple is red", 0.7),
    ("This is a sentence", "This is a completely different sentence", 0.3),
    ("He enjoys playing football", "Football is his favorite sport", 0.9),
    ("The book was laid on the table", "The table held several books", 0.7),
    ("She loves to read horror novels", "Reading horror stories is her hobby", 0.9),
    ("Birds fly south in the winter", "In winter, birds migrate south", 0.8),
    ("He is a software engineer", "He writes code", 0.6),
    ("Climate change is a global issue", "Global warming affects the earth", 0.8),
    ("Water boils at 100 degrees Celsius", "Boiling point of water is 100°C", 0.9),
    ("She moved to New York last year", "Last year, she relocated to New York", 0.9),
    ("The museum is closed on Mondays", "On Mondays, the museum isn't open", 0.9),
    ("I love eating strawberries", "Strawberries are my favorite fruit", 0.8),
    ("They won their first soccer match", "Their team lost the soccer game", 0.2),
    ("He has two siblings", "He is an only child", 0.1),
    ("The film started at nine o'clock", "The movie began at 9 PM", 0.9),
    ("Our cat is very old", "Our pet is quite young", 0.2),
    ("She's studying to become a lawyer", "She is attending law school", 0.9),
    ("He's allergic to peanuts", "Peanut allergies affect him", 0.9),
    ("They're looking forward to the trip", "The upcoming trip excites them", 0.8),
    ("I need to charge my phone", "My phone battery is dead", 0.6),
    ("Can you call me later?", "Please phone me afterwards", 0.8),
    ("The coffee is too hot to drink", "The drink is cold", 0.2),
    ("Rainforests are located near the equator", "Equatorial regions have rainforests", 0.9),
    ("Mathematics is challenging for many students", "Many students struggle with math", 0.8),
    ("The Earth orbits the Sun", "The Sun is orbited by the Earth", 0.9),
    ("The chef cooked a delicious meal", "A tasty dinner was prepared by the chef", 0.9),
    ("He is reading a novel", "He is watching a movie", 0.2),
    ("My favorite season is spring", "I love the springtime", 0.9),
    ("The dog barked loudly", "A loud noise was made by the dog", 0.8),
    ("The concert starts at eight o'clock", "The show begins at 8 PM", 0.9),
    ("She cut her hair short", "She has long hair", 0.1),
    ("They painted the room blue", "The room was painted red", 0.2),
    ("He runs every morning", "Running is his morning routine", 0.8),
    ("Lightning usually precedes thunder", "Thunder follows lightning", 0.9),
    ("The boy broke the window", "The window was broken by the girl", 0.3),
    ("It rained the whole day", "The rain lasted all day", 0.9),
    ("The cake recipe calls for eggs", "Eggs are needed for the cake", 0.9),
    ("She took a flight to Rome", "She drove to Rome", 0.2),
    ("Whales are mammals", "Whales are not fish", 0.8),
    ("The meal was very satisfying", "Dinner was quite disappointing", 0.2),
    ("I turned off the light", "The light was switched off", 0.9),
    ("He forgot his wallet at home", "He left his wallet at home", 0.9),
    ("The planet Mars is red", "Mars is known as the Red Planet", 0.9),
    ("The building was very tall", "It was a short building", 0.2),
    ("The painting is a masterpiece", "The artwork is mediocre", 0.3),
    ("She adopted a puppy", "She adopted a kitten", 0.3),
    ("The laptop is new", "The computer is old", 0.2),
    ("I learned a lot from the lecture", "The lecture was informative", 0.8),
    ("The phone is ringing", "Someone is calling", 0.7),
    ("The exam was very difficult", "The test was easy", 0.2),
    ("He likes to travel", "Traveling is his hobby", 0.9),
    ("The tree was very old", "The ancient tree was tall", 0.7)
]


**Data preprocessing**

In [None]:
def preprocess_sentence(sentence):
    """Normalize a sentence: lowercase it, then trim surrounding whitespace."""
    lowered = sentence.lower()
    return lowered.strip()

def get_sequences(tokenizer, sentences, max_length):
    """Encode sentences as integer id sequences, padded at the end to ``max_length``.

    Args:
        tokenizer: Object exposing ``texts_to_sequences`` (a fitted Tokenizer here).
        sentences: List of sentence strings to encode.
        max_length: Target length passed to ``pad_sequences`` as ``maxlen``.

    Returns:
        The result of ``pad_sequences`` for the encoded sentences.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(sentences),
        maxlen=max_length,
        padding='post',
    )

In [None]:
# Split the labelled pairs into two parallel lists of normalized sentences,
# plus one combined list (first sentences followed by second sentences).
text_data1 = [preprocess_sentence(first) for first, _, _ in sentences]
text_data2 = [preprocess_sentence(second) for _, second, _ in sentences]
all_text_data = [*text_data1, *text_data2]

In [None]:
# Fit one tokenizer on every sentence so both branches share a vocabulary
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_text_data)

# One more than the number of known words: sized for the embedding table
known_words = tokenizer.word_index
vocab_size = 1 + len(known_words)

In [None]:
# Set a consistent sequence length.
# Length is measured on the tokenizer's own output rather than on
# whitespace-separated words: the Tokenizer's punctuation filtering can turn
# one "word" (e.g. "9-5") into several tokens, and an undercounted length
# would make pad_sequences truncate such sentences.
encoded_lengths = [len(seq) for seq in tokenizer.texts_to_sequences(all_text_data)]
# Never go below 10 so short training data still leaves room for longer inputs.
max_length = max(max(encoded_lengths, default=0), 10)

In [None]:
# Encode and pad both sides of every pair with the shared tokenizer
sequences1, sequences2 = (
    get_sequences(tokenizer, texts, max_length)
    for texts in (text_data1, text_data2)
)

In [None]:
# Target similarity score of each pair (third tuple element) as a float array
labels = np.array([float(pair[2]) for pair in sentences])

**MODEL**

In [None]:
def siamese_model(vocab_size, embedding_dim, lstm_units, max_length):
    """Build and compile a Siamese LSTM that scores the similarity of two sentences.

    Both inputs go through the same Embedding and LSTM layer objects, so the
    two branches share weights. The element-wise absolute difference of the
    two encodings feeds a single sigmoid unit.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Dimension of each word embedding.
        lstm_units: Number of units in the shared LSTM encoder.
        max_length: Length of each (padded) input sequence.

    Returns:
        A compiled Keras ``Model`` taking ``[input_1, input_2]`` and producing
        one sigmoid score per pair.
    """
    # The sequence length is fixed by the Input layers below; Embedding's
    # `input_length` argument is deprecated in Keras 3, so it is not passed.
    embedding_layer = Embedding(vocab_size, embedding_dim)
    lstm_layer = LSTM(lstm_units)

    input_1 = Input(shape=(max_length,))
    input_2 = Input(shape=(max_length,))

    # Same layer instances on both inputs -> shared weights.
    encoded_1 = lstm_layer(embedding_layer(input_1))
    encoded_2 = lstm_layer(embedding_layer(input_2))

    distance = Lambda(lambda tensors: tf.abs(tensors[0] - tensors[1]))([encoded_1, encoded_2])
    outputs = Dense(1, activation='sigmoid')(distance)

    model = Model(inputs=[input_1, input_2], outputs=outputs)
    # The targets are continuous scores, not 0/1 classes, so 'accuracy'
    # (exact match against a thresholded prediction) carries no information.
    # Mean absolute error reports how far predictions are from the scores.
    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['mae'],
    )
    return model

**Hyperparameters**

In [None]:
# Model hyperparameters: word-embedding width and LSTM state size
embedding_dim, lstm_units = 128, 64

**Train Model**

In [None]:
# Create and train the model.
# Seed Python, NumPy and TensorFlow before the model is built so that weight
# initialization and batch shuffling are repeatable on Restart & Run All
# (GPU kernels may still introduce small run-to-run differences).
tf.keras.utils.set_random_seed(42)

model = siamese_model(vocab_size, embedding_dim, lstm_units, max_length)
# Keep the History object (per-epoch loss and metrics) for later inspection
# instead of echoing its repr as the cell output.
history = model.fit([sequences1, sequences2], labels, epochs=137, batch_size=64)

Epoch 1/137
Epoch 2/137
Epoch 3/137
Epoch 4/137
Epoch 5/137
Epoch 6/137
Epoch 7/137
Epoch 8/137
Epoch 9/137
Epoch 10/137
Epoch 11/137
Epoch 12/137
Epoch 13/137
Epoch 14/137
Epoch 15/137
Epoch 16/137
Epoch 17/137
Epoch 18/137
Epoch 19/137
Epoch 20/137
Epoch 21/137
Epoch 22/137
Epoch 23/137
Epoch 24/137
Epoch 25/137
Epoch 26/137
Epoch 27/137
Epoch 28/137
Epoch 29/137
Epoch 30/137
Epoch 31/137
Epoch 32/137
Epoch 33/137
Epoch 34/137
Epoch 35/137
Epoch 36/137
Epoch 37/137
Epoch 38/137
Epoch 39/137
Epoch 40/137
Epoch 41/137
Epoch 42/137
Epoch 43/137
Epoch 44/137
Epoch 45/137
Epoch 46/137
Epoch 47/137
Epoch 48/137
Epoch 49/137
Epoch 50/137
Epoch 51/137
Epoch 52/137
Epoch 53/137
Epoch 54/137
Epoch 55/137
Epoch 56/137
Epoch 57/137
Epoch 58/137
Epoch 59/137
Epoch 60/137
Epoch 61/137
Epoch 62/137
Epoch 63/137
Epoch 64/137
Epoch 65/137
Epoch 66/137
Epoch 67/137
Epoch 68/137
Epoch 69/137
Epoch 70/137
Epoch 71/137
Epoch 72/137
Epoch 73/137
Epoch 74/137
Epoch 75/137
Epoch 76/137
Epoch 77/137
Epoch 78

<keras.src.callbacks.History at 0x7fb717719480>

**Predict**

In [None]:
# Predicting similarity for new sentences
def predict_similarity(model, sentence1, sentence2, tokenizer, max_length):
    """Return the similarity of two sentences as a Python float.

    Both sentences are normalized with ``preprocess_sentence``. If they are
    identical after normalization the function returns 1.0 without calling
    the model; otherwise they are encoded with ``get_sequences`` and scored
    by ``model.predict``.

    Args:
        model: Trained model accepting a pair of padded sequences.
        sentence1: First sentence (raw text).
        sentence2: Second sentence (raw text).
        tokenizer: Tokenizer used to encode the sentences.
        max_length: Padded sequence length expected by the model.

    Returns:
        float: 1.0 for sentences that normalize to the same text, otherwise
        the model's prediction for the pair.
    """
    preprocessed1 = preprocess_sentence(sentence1)
    preprocessed2 = preprocess_sentence(sentence2)

    # Compare the normalized text (what the model would actually see), and do
    # it before encoding so identical inputs skip tokenization and inference.
    if preprocessed1 == preprocessed2:
        return 1.0

    sequence1 = get_sequences(tokenizer, [preprocessed1], max_length)
    sequence2 = get_sequences(tokenizer, [preprocessed2], max_length)
    # Single pair in, one row with one score out; convert to a plain float so
    # both return paths have the same type.
    return float(model.predict([sequence1, sequence2])[0][0])

In [None]:
# Example usage: score one unseen pair and print the result to four decimals
new_sentence1, new_sentence2 = "its raining", "the weather is cloudy"
predicted_similarity = predict_similarity(
    model=model,
    sentence1=new_sentence1,
    sentence2=new_sentence2,
    tokenizer=tokenizer,
    max_length=max_length,
)
print(f"Predicted similarity score: {predicted_similarity:.4f}")

Predicted similarity score: 0.5845
