In [106]:
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Subtract, Lambda
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
import tensorflow.keras.backend as K
import tensorflow as tf

In [107]:
# Toy corpus: a single (abstract, query) pair. Swap in the real dataset here.
manual_abstracts = [
    "In the modern technology industry, the utilization of machine learning "
    "and natural language processing is increasingly dominant, with "
    "applications like facial recognition leveraging machine learning "
    "algorithms to understand and interpret users' natural language."
]
keyword_queries = [
    "machine learning OR natural language processing",
]

In [108]:
# Build one vocabulary that covers both the abstracts and the queries,
# so the two sides of the network share the same word -> id mapping.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([*manual_abstracts, *keyword_queries])

In [109]:
# Fixed sequence length: every tokenised text is padded to this many ids
# (it is the `maxlen` handed to pad_sequences and the Input layer width).
max_length = 100

In [110]:
def preprocess_text(text):
    """Encode one raw string as a fixed-length batch of token ids.

    The string is converted to integer ids with the notebook-level
    ``tokenizer`` and then padded at the end (``padding='post'``) up to
    ``max_length``.

    Args:
        text: A single string to encode.

    Returns:
        The array returned by ``pad_sequences`` for a one-element batch;
        callers index it with ``[0]`` to obtain the single encoded row.
    """
    id_batch = tokenizer.texts_to_sequences([text])
    return pad_sequences(id_batch, maxlen=max_length, padding='post')

In [111]:
# Encode every text and collect the single encoded row of each into one array
# per side (abstracts / queries).
manual_abstracts_seq = np.array(
    [encoded[0] for encoded in map(preprocess_text, manual_abstracts)]
)
keyword_queries_seq = np.array(
    [encoded[0] for encoded in map(preprocess_text, keyword_queries)]
)

In [112]:
# Hyper-parameters of the shared (Siamese) encoder
# vocab_size is one more than the tokenizer's word count.
# NOTE(review): the extra slot is presumably for id 0, the pad_sequences filler — confirm.
vocab_size = 1 + len(tokenizer.word_index)
# Width of each token's embedding vector.
embedding_dim = 50

In [113]:
# Token-embedding table shared by both branches of the Siamese network.
# - `input_length` is no longer passed: Keras 3 deprecates the argument, and
#   the sequence length is already fixed by the Input layers.
# - `mask_zero=True` marks id 0 (the filler that pad_sequences appends) as
#   padding, so the downstream LSTM skips those timesteps instead of encoding
#   them as if they were real tokens.
embedding_layer = Embedding(
    vocab_size,
    embedding_dim,
    mask_zero=True,
    trainable=True,
)



In [114]:
# A single recurrent encoder instance; it is applied to both inputs below,
# so the two branches share the same weights.
lstm_layer = LSTM(units=50)

In [115]:
# Two symbolic inputs of identical shape: one padded id sequence per branch
sequence_shape = (max_length,)
input_abstract = Input(shape=sequence_shape)
input_query = Input(shape=sequence_shape)

In [116]:
# Run each input through the same embedding + LSTM stack (abstract first,
# then query) to obtain one fixed-size encoding per side.
embedded_abstract = embedding_layer(input_abstract)
encoded_abstract = lstm_layer(embedded_abstract)

embedded_query = embedding_layer(input_query)
encoded_query = lstm_layer(embedded_query)

In [118]:
# Element-wise absolute difference |abstract - query| between the two
# encodings, built only from stock Keras layers: max(a - b, b - a) == |a - b|.
# The previous inline Python `lambda` inside a Lambda layer is stored as raw
# bytecode when the model is saved, which Keras documents as non-portable and
# potentially unsafe to reload; Subtract/Maximum are serialised by name.
abs_difference = tf.keras.layers.Maximum()([
    Subtract()([encoded_abstract, encoded_query]),
    Subtract()([encoded_query, encoded_abstract]),
])
# A single sigmoid unit maps the difference vector to a score in (0, 1).
similarity = Dense(1, activation='sigmoid')(abs_difference)

In [119]:
# Wire the two inputs to the similarity score to form the trainable model
model = Model([input_abstract, input_query], similarity)

In [120]:
# Training set-up: binary cross-entropy on the sigmoid output, Adam optimizer,
# accuracy reported alongside the loss.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [121]:
# Placeholder target for the single sample pair — replace with real labels
labels = np.ones(1, dtype=int)

In [122]:
# Fit on the placeholder pair for a few epochs (demonstration only).
# Left as a bare expression so the returned History object is displayed.
model.fit(
    x=[manual_abstracts_seq, keyword_queries_seq],
    y=labels,
    epochs=10,
)

Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - accuracy: 0.0000e+00 - loss: 0.6931
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 113ms/step - accuracy: 1.0000 - loss: 0.6926
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 139ms/step - accuracy: 1.0000 - loss: 0.6921
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 138ms/step - accuracy: 1.0000 - loss: 0.6916
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step - accuracy: 1.0000 - loss: 0.6911
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step - accuracy: 1.0000 - loss: 0.6907
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 136ms/step - accuracy: 1.0000 - loss: 0.6902
Epoch 8/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step - accuracy: 1.0000 - loss: 0.6897
Epoch 9/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

<keras.src.callbacks.history.History at 0x78ad89252110>

In [123]:
# Persist the trained network to an HDF5 file in the working directory,
# then confirm on stdout.
model.save(filepath='similarity_model.h5')
print("Model saved as similarity_model.h5")



Model saved as similarity_model.h5
