In [53]:
# Required libraries 
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub

import warnings
warnings.filterwarnings('ignore')



In [54]:
# Data import: read the sentence-pair dataset from a path relative to the
# working directory. Later cells read its 'text1', 'text2' and
# 'similarity_score' columns.
df = pd.read_csv("input_data.csv")
df.head()

Unnamed: 0,text1,text2,similarity_score
0,broadband challenges tv viewing the number of ...,gardener wins double in glasgow britain s jaso...,0.185374
1,rap boss arrested over drug find rap mogul mar...,amnesty chief laments war failure the lack of ...,0.066526
2,player burn-out worries robinson england coach...,hanks greeted at wintry premiere hollywood sta...,0.086851
3,hearts of oak 3-2 cotonsport hearts of oak set...,redford s vision of sundance despite sporting ...,0.123677
4,sir paul rocks super bowl crowds sir paul mcca...,mauresmo opens with victory in la amelie maure...,0.102747


In [55]:
# Shape of the dataframe as (rows, columns)
df.shape

(3000, 3)

In [56]:
# Encoding text to tensors

# Load the Universal Sentence Encoder (version 4) module from TF Hub.
# `embed` is the callable the rest of the notebook uses to turn a list of
# strings into embedding vectors.
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")


def get_embeddings(sentences, batch_size=256):
    """Encode sentences with the module-level `embed` encoder, in batches.

    Passing every sentence to the encoder in one call makes peak memory grow
    with the size of the dataset. Encoding fixed-size chunks and concatenating
    the results keeps the memory needed per call bounded.

    Args:
        sentences: Sized, sliceable sequence of strings (e.g. a list).
        batch_size: Maximum number of sentences sent to `embed` per call.

    Returns:
        A tensor with one row per input sentence, in input order.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    total = len(sentences)
    if total <= batch_size:
        # Small (or empty) input: a single encoder call, as before.
        return embed(sentences)

    chunks = [
        embed(sentences[start:start + batch_size])
        for start in range(0, total, batch_size)
    ]
    # Stack the per-batch results back into one (total, dim) tensor.
    return tf.concat(chunks, axis=0)

# Encode both text columns, 'text1' first and then 'text2', so the two
# results line up row-for-row with the dataframe.
embeddings1, embeddings2 = (
    get_embeddings(df[column].tolist()) for column in ('text1', 'text2')
)


In [57]:
# Size of the embedding tensor: (number of sentences, embedding width)
embeddings1.shape

TensorShape([3000, 512])

In [58]:
# Model architecture
#
# A two-input regressor over pre-computed sentence embeddings. The network
# sees the element-wise difference of the two embeddings and its absolute
# value, and maps them through two ReLU layers to a single sigmoid unit.

# Seed Python, NumPy and TensorFlow so weight initialisation and the shuffling
# done during training are repeatable from run to run. Some GPU kernels may
# still be non-deterministic.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Width of one sentence embedding; matches the second dimension of the
# embedding tensors produced earlier in the notebook.
EMBEDDING_DIM = 512

input_1 = tf.keras.Input(shape=(EMBEDDING_DIM,), dtype='float32')
input_2 = tf.keras.Input(shape=(EMBEDDING_DIM,), dtype='float32')

# Signed difference and its magnitude are the pair features.
diff = tf.keras.layers.Subtract()([input_1, input_2])
abs_diff = tf.keras.layers.Lambda(lambda x: tf.abs(x))(diff)

concatenated = tf.keras.layers.Concatenate()([diff, abs_diff])


dense1 = tf.keras.layers.Dense(256, activation='relu')(concatenated)
dense2 = tf.keras.layers.Dense(128, activation='relu')(dense1)
# Sigmoid keeps the predicted score inside (0, 1).
output = tf.keras.layers.Dense(1, activation='sigmoid')(dense2)

model = tf.keras.Model(inputs=[input_1, input_2], outputs=output)

# Regression objective: minimise squared error, report mean absolute error.
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

model.summary()


Model: "model_6"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_13 (InputLayer)       [(None, 512)]                0         []                            
                                                                                                  
 input_14 (InputLayer)       [(None, 512)]                0         []                            
                                                                                                  
 subtract_6 (Subtract)       (None, 512)                  0         ['input_13[0][0]',            
                                                                     'input_14[0][0]']            
                                                                                                  
 lambda_6 (Lambda)           (None, 512)                  0         ['subtract_6[0][0]']    

In [59]:
# Model training: fit on the two embedding tensors against the labelled
# similarity scores, with a fifth of the samples held out for validation.
y_train = df['similarity_score'].values
X_train = [embeddings1, embeddings2]

model.fit(
    x=X_train,
    y=y_train,
    validation_split=0.2,
    epochs=20,
)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x226b68d3730>

In [60]:
# Saving the trained model to the relative path 'similarity_model' for future use
model.save('similarity_model')

INFO:tensorflow:Assets written to: similarity_model\assets


INFO:tensorflow:Assets written to: similarity_model\assets


In [61]:
# Loading the saved model back from 'similarity_model' into `loaded_model`
loaded_model = tf.keras.models.load_model('similarity_model')

In [62]:
def predict_similarity(sentence1, sentence2, model):
    """Score a pair of sentences with `model`.

    Each sentence is encoded on its own by the module-level `embed` encoder
    and kept as a one-row batch; the two batches are passed to
    `model.predict` as a two-element list.

    Args:
        sentence1: First sentence (a string).
        sentence2: Second sentence (a string).
        model: Object whose `predict` accepts a list of two embedding batches.

    Returns:
        The element at position [0][0] of the prediction array.
    """
    # Encoding a one-item list already yields a single-row batch, so the
    # result can be fed to the model without any further reshaping.
    pair_batches = [embed([text]).numpy() for text in (sentence1, sentence2)]

    prediction = model.predict(pair_batches)
    return prediction[0][0]


sentence1 = "The weather is nice today"
sentence2 = "Today, the weather is good"


# Score with the model reloaded from disk rather than the in-memory one, so
# this cell actually exercises the save/load round trip.
similarity_score = predict_similarity(sentence1, sentence2, loaded_model)
print(f"Similarity score: {similarity_score}")
# Same score rounded to one decimal place for readability.
print(f"Similarity score: {round(float(similarity_score), 1)}")



Similarity score: 0.25830793380737305
Similarity score: 0.3
