In [1]:
!pip install datasets evaluate transformers[sentencepiece]
!pip install faiss-cpu
!pip install -U sentence-transformers

zsh:1: no matches found: transformers[sentencepiece]


In [2]:
from datasets import load_dataset

ds = load_dataset("sentence-transformers/squad")

In [3]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("multi-qa-mpnet-base-dot-v1")

In [15]:
questions = ds["train"]["question"]
answers = ds["train"]["answer"]


In [None]:

# # Generate embeddings for questions and answers
# question_embeddings = model.encode(questions, convert_to_tensor=True)
# answer_embeddings = model.encode(answers, convert_to_tensor=True)


In [None]:
# import pickle

# # Save the embeddings to a pickle file
# with open('embeddings.pkl', 'wb') as f:
#     pickle.dump({'question_embeddings': question_embeddings, 'answer_embeddings': answer_embeddings}, f)

In [4]:
import pickle

with open('embeddings.pkl', 'rb') as f:
    embeddings = pickle.load(f)

question_embeddings = embeddings['question_embeddings']
answer_embeddings = embeddings['answer_embeddings']

In [15]:
import torch


def search_query_top_k(query, corpus_embedding, corpus, top_k=5):
    query_embedding = model.encode(query, convert_to_tensor=True)
    cosine_scores = model.similarity(query_embedding, corpus_embedding)[0]
    scores, indices = torch.topk(cosine_scores, k=top_k)

    print("\nQuery:", query)
    print("Top 5 most similar sentences in corpus:")

    for score, idx in zip(scores, indices):
        print(corpus[idx], f"(Score: {score:.4f})")

In [None]:
search_query_top_k("What year was it restituted to France?", question_embeddings, questions, top_k=5)


Query: What year was it restituted to France?
Top 5 most similar sentences in corpus:
What year was it restituted to France? (Score: 32.5987)
Who withdrew recognition from the Grand Orient de France? (Score: 24.3168)
Who did France restore into power? (Score: 24.2572)
What month did France sue for peace? (Score: 24.1915)
Which country acquired New France from France? (Score: 24.1437)


In [18]:
search_query_top_k(
    "What year was it restituted to France?", answer_embeddings, answers, top_k=5
)


Query: What year was it restituted to France?
Top 5 most similar sentences in corpus:
With the Treaty of the Pyrenees (1659), Spain ceded the northern part of Catalonia to France, and soon thereafter the local Catalan varieties came under the influence of French, which in 1700 became the sole official language of the region. (Score: 22.7375)
With the Treaty of the Pyrenees (1659), Spain ceded the northern part of Catalonia to France, and soon thereafter the local Catalan varieties came under the influence of French, which in 1700 became the sole official language of the region. (Score: 22.7375)
With the Treaty of the Pyrenees (1659), Spain ceded the northern part of Catalonia to France, and soon thereafter the local Catalan varieties came under the influence of French, which in 1700 became the sole official language of the region. (Score: 22.7375)
With the Treaty of the Pyrenees (1659), Spain ceded the northern part of Catalonia to France, and soon thereafter the local Catalan varieti

In [19]:
search_query_top_k(
    "France?", answer_embeddings, answers, top_k=5
)


Query: France?
Top 5 most similar sentences in corpus:
The Region of Île de France, including Paris and its surrounding communities, is governed by the Regional Council, which has its headquarters in the 7th arrondissement of Paris. It is composed of 209 members representing the different communes within the region. On December 15, 2015, a list of candidates of the Union of the Right, a coalition of centrist and right-wing parties, led by Valérie Pécresse, narrowly won the regional election, defeating a coalition of Socialists and ecologists. The Socialists had governed the region for seventeen years. In 2016, the new regional council will have 121 members from the Union of the Right, 66 from the Union of the Left and 22 from the extreme right National Front. (Score: 20.2113)
The Region of Île de France, including Paris and its surrounding communities, is governed by the Regional Council, which has its headquarters in the 7th arrondissement of Paris. It is composed of 209 members repr

faiss

In [7]:
from datasets import Dataset
import numpy as np

# Create a dataset from the answers
answers_dataset = Dataset.from_dict({"answers": ds["train"]["answer"]})

# Convert embeddings to a list of lists
answer_embeddings_list = [embedding.tolist() for embedding in answer_embeddings]

# Add embeddings to the dataset
answers_dataset = answers_dataset.add_column("embeddings", answer_embeddings_list)

# Add FAISS index to the dataset
answers_dataset.add_faiss_index(column="embeddings")


  0%|          | 0/88 [00:00<?, ?it/s]

Dataset({
    features: ['answers', 'embeddings'],
    num_rows: 87599
})

In [8]:
questions_dataset = Dataset.from_dict({"questions": ds["train"]["question"]})   

question_embeddings_list = [embedding.tolist() for embedding in question_embeddings]

questions_dataset = questions_dataset.add_column("embeddings", question_embeddings_list)

questions_dataset.add_faiss_index(column="embeddings")

  0%|          | 0/88 [00:00<?, ?it/s]

Dataset({
    features: ['questions', 'embeddings'],
    num_rows: 87599
})

In [9]:
import pickle

# Save the datasets to a pickle file
with open('datasets.pkl', 'wb') as f:
    pickle.dump({'answers_dataset': answers_dataset, 'questions_dataset': questions_dataset}, f)

In [10]:
import pandas as pd

In [None]:
def search_query_top_k_faiss(query, dataset, column, top_k=5):

    query_embedding = model.encode(query, convert_to_tensor=False)
    scores, samples = dataset.get_nearest_examples(
        "embeddings", query_embedding, k=top_k
    )
    samples_df = pd.DataFrame.from_dict(samples)
    samples_df["scores"] = scores
    samples_df.sort_values("scores", ascending=False, inplace=True)

    for _, row in samples_df.iterrows():
        print(row[column], f"(Score: {row['scores']:.4f})")
        print("=" * 50)
        print()

In [22]:

search_query_top_k_faiss("What year was it restituted to France?", answers_dataset, "answers", top_k=5)

With the Treaty of the Pyrenees (1659), Spain ceded the northern part of Catalonia to France, and soon thereafter the local Catalan varieties came under the influence of French, which in 1700 became the sole official language of the region. (Score: 31.9532)

With the Treaty of the Pyrenees (1659), Spain ceded the northern part of Catalonia to France, and soon thereafter the local Catalan varieties came under the influence of French, which in 1700 became the sole official language of the region. (Score: 31.9532)

With the Treaty of the Pyrenees (1659), Spain ceded the northern part of Catalonia to France, and soon thereafter the local Catalan varieties came under the influence of French, which in 1700 became the sole official language of the region. (Score: 31.9532)

With the Treaty of the Pyrenees (1659), Spain ceded the northern part of Catalonia to France, and soon thereafter the local Catalan varieties came under the influence of French, which in 1700 became the sole official langua

In [23]:
search_query_top_k_faiss(
    "What year was it restituted to France?", questions_dataset, "questions", top_k=5
)

Which territory did France control? (Score: 13.1974)

In what year was the Fall of France? (Score: 13.1916)

Who did France restore into power? (Score: 12.8509)

Which country acquired New France from France? (Score: 11.5127)

What year was it restituted to France? (Score: 0.0000)

