In [2]:
from sentence_transformers import CrossEncoder, SentenceTransformer
import numpy as np

In [3]:
# Implement Cosine Similarity function
def cosine_similarity(a: list, b: list) -> float:
    """
    Calculates the cosine similarity between two vectors.

    Args:
        a (list): The first vector.
        b (list): The second vector.

    Returns:
        float: The cosine similarity between the two vectors.
    """
    # Convert lists to numpy arrays for efficient computation
    a = np.array(a)
    b = np.array(b)
    
    # Compute the dot product of the vectors
    dot_product = np.dot(a, b)
    
    # Compute the magnitude (L2 norm) of each vector
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    
    # Prevent division by zero by checking norms
    if norm_a == 0 or norm_b == 0:
        return 0.0
    
    # Compute cosine similarity
    similarity = dot_product / (norm_a * norm_b)
    
    return similarity


In [15]:
# test of function
vector1 = [4, 5, 5]
vector2 = [4, 4, 5]

similarity = cosine_similarity(vector1, vector2)
print("Cosine Similarity:", similarity)


Cosine Similarity: 0.9945358423571875


In [7]:
query = "What is the French most visited monument ?"

answers = [
    "The Effeil Tower is the most visited monument in France, and is located in Paris", # True
    "France is an amazing country, where you should live in", # False
    "How know's the French most visited monument ? " # False
    ]

In [16]:
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
answer_embedding = embedding_model.encode(answers)
query_embedding = embedding_model.encode(query)

In [9]:
scores = {cosine_similarity(query_embedding, embedding): answers[i] for i, embedding in enumerate(answer_embedding)}
scores

{np.float32(0.62850946): 'The Effeil Tower is the most visited monument in France, and is located in Paris',
 np.float32(0.46081764): 'France is an amazing country, where you should live in',
 np.float32(0.9634501): "How know's the French most visited monument ? "}

In [10]:
cross_encoder_model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-12-v2")
scores = {
    cross_encoder_model.predict([(query, answers[i])])[0]: answers[i] 
    for i in range(len(answers))
}
scores

{np.float32(9.420457): 'The Effeil Tower is the most visited monument in France, and is located in Paris',
 np.float32(-7.764552): 'France is an amazing country, where you should live in',
 np.float32(5.4101825): "How know's the French most visited monument ? "}