# 1 - Preprocessing Contexts

In [None]:
!pip install beir datasets



We download the SQuAD v2 Dataset

In [None]:
from datasets import load_dataset
# Load the "squad_v2" dataset; its "validation" split provides the questions and contexts used below.
datasets = load_dataset("squad_v2")

Reusing dataset squad_v2 (/root/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d)


  0%|          | 0/2 [00:00<?, ?it/s]

We download the DBPedia Dataset

In [None]:
from beir import util
from beir.datasets.data_loader import GenericDataLoader

# Download the BEIR "dbpedia-entity" archive and unzip it, passing "datasets" as the output directory
dataset = "dbpedia-entity"
url = f"https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{dataset}.zip"
data_path = util.download_and_unzip(url, "datasets")

# Load the test split; only `corpus` is consumed by the later cells
loader = GenericDataLoader(data_folder=data_path)
corpus, queries, qrels = loader.load(split="test")

  0%|          | 0/4635922 [00:00<?, ?it/s]

Then we combine their contexts: the unique SQuAD v2 validation contexts plus DBPedia passages longer than 50 words

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Reduce the validation contexts to their distinct values (Series.unique() keeps order of first appearance).
contexts = pd.Series(datasets["validation"]["context"]).unique()

In [None]:
# Pad the SQuAD v2 validation contexts with DBPedia passages: scan the first
# 14000 corpus entries and keep the texts with more than 50 whitespace-separated
# tokens, which brings the pool to roughly 10000 texts.
to_add = [
  doc["text"]
  for doc in list(corpus.values())[:14000]
  if len(doc["text"].split()) > 50
]
# np.unique removes duplicates and returns the texts in sorted order
contexts = np.unique(np.concatenate((contexts, np.array(to_add))))

In [None]:
# Report how many distinct texts are in the combined pool
print("Nombre de textes:", len(contexts), sep="\n")

Nombre de textes:
10604


# 2 - Indexing contexts

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
# Load the 'msmarco-distilbert-base-tas-b' encoder; the same model embeds both the contexts and the questions below.
model = SentenceTransformer('msmarco-distilbert-base-tas-b')

In [None]:
# Embed every context. The retrieval cells index `contexts` with row positions of
# `embeddings`, so the two must stay aligned (re-run this cell if `contexts` changes).
embeddings = model.encode(contexts)

# 3 - Exact Nearest Neighbors

In [None]:
# Evaluation pairs: each validation question (x) and the context it should retrieve (y)
x_test, y_test = datasets["validation"]["question"], datasets["validation"]["context"]

In [None]:
# Embed the validation questions with the same model that embedded the contexts
x_test_embeddings = model.encode(x_test)

In [None]:
# Exact nearest neighbors by dot product
def get_nn(v, k):
  """Return the indices of the `k` rows of `embeddings` scoring highest against `v`.

  The score is the dot product between `v` and each row of the notebook-level
  `embeddings` matrix. Indices are ordered from the highest score to the lowest.
  """
  scores = v @ embeddings.T
  ascending = np.argsort(scores)
  descending = np.flip(ascending, axis=0)
  return descending[:k]

In [None]:
# Evaluation: mean reciprocal rank of the expected context within the exact top-k
k = 5
results = []
for question_vec, expected in zip(x_test_embeddings, y_test):
  retrieved = contexts[get_nn(question_vec, k)]
  # 1-based position of the first retrieved text equal to the expected context, if any
  rank = next((pos for pos, text in enumerate(retrieved, start=1) if text == expected), None)
  # A miss (expected context absent from the top-k) scores 0
  results.append(0 if rank is None else 1 / rank)
print("MRR:", np.array(results).mean(), sep="\n")

MRR:
0.6990749319183582


# 4 - Approximate Nearest Neighbors

In [None]:
!pip install annoy



In [None]:
from annoy import AnnoyIndex

In [None]:
# Build an Annoy index over the context embeddings using the dot-product metric.
# The vector size is read from `embeddings` rather than hard-coded to 768, so the
# cell keeps working if the encoder is swapped for one with another output width.
nn = AnnoyIndex(embeddings.shape[1], "dot")
for i, embedding in enumerate(embeddings):
  # Item id == row position in `embeddings`, so returned ids can index `contexts` directly
  nn.add_item(i, embedding)
# Build a forest of 2000 trees; kept as the last expression so the cell still shows the return value
nn.build(2000)

True

In [None]:
# Evaluation: mean reciprocal rank of the expected context within the approximate top-k
k = 5
results = []
for question_vec, expected in zip(x_test_embeddings, y_test):
  neighbor_ids = nn.get_nns_by_vector(question_vec, k)
  retrieved = contexts[neighbor_ids]
  # 1-based position of the first retrieved text equal to the expected context, if any
  rank = next((pos for pos, text in enumerate(retrieved, start=1) if text == expected), None)
  # A miss (expected context absent from the top-k) scores 0
  results.append(0 if rank is None else 1 / rank)
print("MRR:", np.array(results).mean(), sep="\n")

MRR:
0.6674106516185182
