In [None]:
import warnings
import os
import numpy as np
import requests
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.util import ngrams
from nltk.translate.bleu_score import sentence_bleu

In [None]:
# Install a global "ignore" filter: every warning raised after this cell is suppressed.
# NOTE(review): presumably meant to keep cell output uncluttered; it also hides any
# warnings emitted by numpy / nltk in later cells — confirm that is intended.
warnings.filterwarnings("ignore")

In [None]:
# Reference question that every rewritten query is compared against.
query = "What are the approaches to Task Decomposition?"

# Rewritten variants of `query`; each one is scored against it in the cells below.
transformed_queries = [
    "How can Task Decomposition be approached?",
    "What are the different methods for Task Decomposition?",
    "What are the various approaches to decomposing tasks?",
]

In [None]:
def calculate_similarity_scores(query, transformed_queries):
    """Score each transformed query against the original query.

    Text is tokenised by lowercasing and splitting on whitespace, so
    punctuation stays attached to its word ("decomposition?" and
    "decomposition" are different tokens).

    Args:
        query: The original query string.
        transformed_queries: Iterable of rewritten query strings.

    Returns:
        A tuple ``(query_similarity_scores, word_overlap_scores, bleu_scores)``
        of three lists with one entry per transformed query, in input order.

        * ``query_similarity_scores`` and ``word_overlap_scores`` both hold the
          Jaccard index of the two token sets (|intersection| / |union|), so
          they always contain equal values. When neither string has any
          tokens the score is 0.0 instead of a ZeroDivisionError.
        * ``bleu_scores`` holds ``sentence_bleu`` of the transformed tokens
          with the query tokens as the single reference, using that
          function's default arguments.
    """
    # The query never changes inside the loop, so tokenise it once.
    query_tokens = query.lower().split()
    query_words = set(query_tokens)

    query_similarity_scores = []
    word_overlap_scores = []
    bleu_scores = []

    for transformed_query in transformed_queries:
        transformed_tokens = transformed_query.lower().split()
        transformed_words = set(transformed_tokens)

        # Both word-level metrics are the same Jaccard index; compute it once
        # and guard the empty-union case (both strings empty / whitespace).
        union = query_words | transformed_words
        jaccard = len(query_words & transformed_words) / len(union) if union else 0.0
        query_similarity_scores.append(jaccard)
        word_overlap_scores.append(jaccard)

        # BLEU score
        bleu_scores.append(sentence_bleu([query_tokens], transformed_tokens))

    return query_similarity_scores, word_overlap_scores, bleu_scores

In [None]:
# Compute the three lexical metrics for every transformed query, then report each list.
query_similarity_scores, word_overlap_scores, bleu_scores = calculate_similarity_scores(
    query, transformed_queries
)

print(f"Query Similarity Scores: {query_similarity_scores}")
print(f"Word Overlap Scores: {word_overlap_scores}")
print(f"BLEU Scores: {bleu_scores}")

Query Similarity Scores: [0.08333333333333333, 0.5, 0.5]
Word Overlap Scores: [0.08333333333333333, 0.5, 0.5]
BLEU Scores: [9.853445011990208e-232, 5.614021910443866e-78, 5.614021910443866e-78]


### Semantic Similarity:
Use pre-trained word embeddings (GloVe) to compute the semantic similarity between the original query and each transformed version, capturing how close the queries are in meaning.

In [None]:
def load_glove_model(glove_file):
    """Read a whitespace-separated text embedding file into a dict.

    Every non-blank line is split on whitespace: the first field is used as
    the word and the remaining fields are parsed as a float32 vector. Blank
    or whitespace-only lines (for example a trailing empty line) are skipped
    instead of raising IndexError. If a word appears more than once, the last
    occurrence wins.

    Args:
        glove_file: Path of the UTF-8 text file to read.

    Returns:
        dict mapping each word to a 1-D ``numpy.ndarray`` of dtype float32.
    """
    print("Loading GloVe Model")
    word_to_vec = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Nothing to parse on an empty line.
                continue
            word_to_vec[values[0]] = np.array(values[1:], dtype='float32')
    print("Done.", len(word_to_vec), " words loaded!")
    return word_to_vec

def _mean_embedding(text, word_to_vec):
    """Average the vectors of the in-vocabulary tokens of ``text``; None if there are none."""
    vectors = [word_to_vec[word] for word in text.lower().split() if word in word_to_vec]
    if not vectors:
        return None
    return np.mean(vectors, axis=0)


def compute_semantic_similarity(query, transformed_query, word_to_vec):
    """Cosine similarity between the mean word vectors of two strings.

    Each string is lowercased and split on whitespace; tokens missing from
    ``word_to_vec`` are ignored and the remaining vectors are averaged.

    Args:
        query: The original query string.
        transformed_query: The rewritten query string.
        word_to_vec: Mapping from token to its embedding vector.

    Returns:
        The cosine of the angle between the two mean vectors (a NumPy scalar).
        Returns the Python float 0.0 when the similarity is undefined: either
        string has no in-vocabulary token, either mean vector is entirely
        NaN, or either mean vector has zero norm.
    """
    query_embedding = _mean_embedding(query, word_to_vec)
    transformed_query_embedding = _mean_embedding(transformed_query, word_to_vec)

    # No known token on one side: there is no vector to compare.
    if query_embedding is None or transformed_query_embedding is None:
        return 0.0

    if np.all(np.isnan(query_embedding)) or np.all(np.isnan(transformed_query_embedding)):
        return 0.0

    denominator = np.linalg.norm(query_embedding) * np.linalg.norm(transformed_query_embedding)
    if denominator == 0:
        # A zero-length mean vector would make the cosine 0/0 (NaN).
        return 0.0

    similarity_score = np.dot(query_embedding, transformed_query_embedding) / denominator
    return similarity_score

In [None]:
# Load the GloVe vectors from the local text file.
glove_txt_path = "glove.6B.50d.txt"
word_to_vec = load_glove_model(glove_txt_path)

# One embedding-based similarity score per transformed query, in list order.
semantic_similarity_scores = [
    compute_semantic_similarity(query, candidate, word_to_vec)
    for candidate in transformed_queries
]

# Report the scores with 1-based numbering.
for position, score in enumerate(semantic_similarity_scores, start=1):
    print("Semantic Similarity Score for transformed query", position, ":", score)

Loading GloVe Model
Done. 400000  words loaded!
Semantic Similarity Score for transformed query 1 : 0.9197151
Semantic Similarity Score for transformed query 2 : 0.97984254
Semantic Similarity Score for transformed query 3 : 0.9665459


### Semantic Similarity utilizing custom embeddings:
Use custom word embeddings to compute the semantic similarity between the original query and each transformed version, capturing how close the queries are in meaning.

**TODO:** integrate this with the embeddings used in the previous section.

In [None]:
# NOTE(review): this cell previously ran `from langchain import LangChain`. That name was
# never used here and does not appear to be exported by the langchain package, so the cell
# failed on import. It also re-imported cosine_similarity, which is already imported at the
# top of the notebook and unused here. Both lines are dropped.

custom_embeddings_file = "custom/embeddings.txt"

# The scores below must come from the custom vectors; the cell previously passed the GloVe
# `word_to_vec`, so it reported GloVe scores under a "custom embeddings" label.
# Assumes the custom file uses the same "<word> <v1> ... <vn>" text layout that
# load_glove_model parses — TODO confirm against how custom/embeddings.txt is produced.
# (The loader prints "Loading GloVe Model" even though the file is the custom one.)
word_to_vec_custom = load_glove_model(custom_embeddings_file)

semantic_similarity_scores_custom = [
    compute_semantic_similarity(query, transformed_query, word_to_vec_custom)
    for transformed_query in transformed_queries
]

for i, score in enumerate(semantic_similarity_scores_custom, start=1):
    print("Semantic Similarity Score for transformed query utilizing custom embeddings", i, ":", score)