In [7]:
from hnswlib import Index
import numpy as np
from sentence_transformers import SentenceTransformer
import time
import pickle
import hnswlib
import torch

In [2]:
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

## Cosie search

In [9]:

def search_similar_sentences(query, index, doc_list, k=5):
    query_embedding = model.encode([query])

    labels, distances = index.knn_query(query_embedding, k=k)
    similar_sentences_with_scores = [(doc_list[label], 1 - distance) for label, distance in zip(labels[0], distances[0])]

    return similar_sentences_with_scores


with open("embeddings.pkl", "rb") as fIn:
    stored_data = pickle.load(fIn)
    stored_sentences = stored_data["sentences"]
    stored_embeddings = stored_data["embeddings"]

# Ensure stored_embeddings is on CPU and converted to NumPy
if isinstance(stored_embeddings, torch.Tensor):
    stored_embeddings = stored_embeddings.cpu().numpy()  # Convert to CPU and NumPy array

dimension = stored_embeddings.shape[1]
p = hnswlib.Index(space='cosine', dim=dimension)
p.init_index(max_elements=10000, ef_construction=200, M=16)
p.add_items(stored_embeddings)  # Now stored_embeddings is a NumPy array on CPU
p.set_ef(50)  # Setting ef, which controls the recall

new_sentence = "My sister's leg broke"
top_similar_sentences = search_similar_sentences(new_sentence, p, stored_sentences, k=3)

print("Top 3 similar sentences are:")
for i, sentence in enumerate(top_similar_sentences):
    print(f"{i+1}. {sentence}")


Top 3 similar sentences are:
1. ('  We describe a new algorithm, the $(k,\\ell)$-pebble game with colors, and use\nit obtain a characterization of the family of $(k,\\ell)$-sparse graphs and\nalgorithmic solutions to a family of problems concerning tree decompositions of\ngraphs. Special instances of sparse graphs appear in rigidity theory and have\nreceived increased attention in recent years. In particular, our colored\npebbles generalize and strengthen the previous results of Lee and Streinu and\ngive a new proof of the Tutte-Nash-Williams characterization of arboricity. We\nalso present a new decomposition that certifies sparsity based on the\n$(k,\\ell)$-pebble game with colors. Our work also exposes connections between\npebble game algorithms and previous sparse graph algorithms by Gabow, Gabow and\nWestermann and Hendrickson.\n', 0.028408825397491455)
2. ("  The evolution of Earth-Moon system is described by the dark matter field\nfluid model proposed in the Meeting of Division 

## IP

In [10]:

def search_similar_sentences(query, index, doc_list, k=5):
    query_embedding = model.encode([query])

    labels, distances = index.knn_query(query_embedding, k=k)
    similar_sentences_with_scores = [(doc_list[label], 1 - distance) for label, distance in zip(labels[0], distances[0])]

    return similar_sentences_with_scores


with open("embeddings.pkl", "rb") as fIn:
    stored_data = pickle.load(fIn)
    stored_sentences = stored_data["sentences"]
    stored_embeddings = stored_data["embeddings"]

# Ensure stored_embeddings is on CPU and converted to NumPy
if isinstance(stored_embeddings, torch.Tensor):
    stored_embeddings = stored_embeddings.cpu().numpy()  # Convert to CPU and NumPy array

dimension = stored_embeddings.shape[1]
p = hnswlib.Index(space='ip', dim=dimension)
p.init_index(max_elements=10000, ef_construction=200, M=16)
p.add_items(stored_embeddings)  # Now stored_embeddings is a NumPy array on CPU
p.set_ef(50)  # Setting ef, which controls the recall

new_sentence = "My sister's leg broke"
top_similar_sentences = search_similar_sentences(new_sentence, p, stored_sentences, k=3)

print("Top 3 similar sentences are:")
for i, sentence in enumerate(top_similar_sentences):
    print(f"{i+1}. {sentence}")


Top 3 similar sentences are:
1. ('  We describe a new algorithm, the $(k,\\ell)$-pebble game with colors, and use\nit obtain a characterization of the family of $(k,\\ell)$-sparse graphs and\nalgorithmic solutions to a family of problems concerning tree decompositions of\ngraphs. Special instances of sparse graphs appear in rigidity theory and have\nreceived increased attention in recent years. In particular, our colored\npebbles generalize and strengthen the previous results of Lee and Streinu and\ngive a new proof of the Tutte-Nash-Williams characterization of arboricity. We\nalso present a new decomposition that certifies sparsity based on the\n$(k,\\ell)$-pebble game with colors. Our work also exposes connections between\npebble game algorithms and previous sparse graph algorithms by Gabow, Gabow and\nWestermann and Hendrickson.\n', 0.053950369358062744)
2. ("  The evolution of Earth-Moon system is described by the dark matter field\nfluid model proposed in the Meeting of Division 

## L2

In [12]:

def search_similar_sentences(query, index, doc_list, k=5):
    query_embedding = model.encode([query])

    labels, distances = index.knn_query(query_embedding, k=k)
    similar_sentences_with_scores = [(doc_list[label], 1 - distance) for label, distance in zip(labels[0], distances[0])]

    return similar_sentences_with_scores


with open("embeddings.pkl", "rb") as fIn:
    stored_data = pickle.load(fIn)
    stored_sentences = stored_data["sentences"]
    stored_embeddings = stored_data["embeddings"]

# Ensure stored_embeddings is on CPU and converted to NumPy
if isinstance(stored_embeddings, torch.Tensor):
    stored_embeddings = stored_embeddings.cpu().numpy()  # Convert to CPU and NumPy array

dimension = stored_embeddings.shape[1]
p = hnswlib.Index(space='l2', dim=dimension)
p.init_index(max_elements=10000, ef_construction=200, M=16)
p.add_items(stored_embeddings)  # Now stored_embeddings is a NumPy array on CPU
p.set_ef(50)  # Setting ef, which controls the recall

new_sentence = "My sister's leg broke"
top_similar_sentences = search_similar_sentences(new_sentence, p, stored_sentences, k=3)

print("Top 3 similar sentences are:")
for i, sentence in enumerate(top_similar_sentences):
    print(f"{i+1}. {sentence}")


Top 3 similar sentences are:
1. ("  The evolution of Earth-Moon system is described by the dark matter field\nfluid model proposed in the Meeting of Division of Particle and Field 2004,\nAmerican Physical Society. The current behavior of the Earth-Moon system agrees\nwith this model very well and the general pattern of the evolution of the\nMoon-Earth system described by this model agrees with geological and fossil\nevidence. The closest distance of the Moon to Earth was about 259000 km at 4.5\nbillion years ago, which is far beyond the Roche's limit. The result suggests\nthat the tidal friction may not be the primary cause for the evolution of the\nEarth-Moon system. The average dark matter field fluid constant derived from\nEarth-Moon system data is 4.39 x 10^(-22) s^(-1)m^(-1). This model predicts\nthat the Mars's rotation is also slowing with the angular acceleration rate\nabout -4.38 x 10^(-22) rad s^(-2).\n", -3.1173319816589355)
2. ('  We describe a new algorithm, the $(k,\\ell)