# NLP(DEEP) - Lab07
## Authors
- Eliot LECLAIR
- Alex POIRON
- Tom THIL
- Aurélien VISENTIN

In [None]:
%pip install beir sentence-transformers hnswlib

## Imports

In [None]:
#Imports for Data
from beir import util as util_beir
from beir.datasets.data_loader import GenericDataLoader

#Keep same values
from random import seed, sample

#Imports for the pre-trained model
from sentence_transformers import SentenceTransformer, util

#Lib for the ANN
import hnswlib

## Load Data and explore structure

In [None]:
# Name of the BEIR dataset to fetch
dataset = "dbpedia-entity"

# The archive URL is derived from the dataset name
url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip".format(dataset)

# Download and unzip the archive into the local "datasets" folder
data_path = util_beir.download_and_unzip(url, "datasets")

# Load the corpus, the queries and the relevance judgments of the test split
loader = GenericDataLoader(data_folder=data_path)
corpus, queries, qrels = loader.load(split="test")

In [None]:
print("Length of the corpus :", len(corpus))

Length of the corpus : 4635922


In [None]:
list(corpus.items())[:3]

[('<dbpedia:Animalia_(book)>',
  {'text': "Animalia is an illustrated children's book by Graeme Base. It was originally published in 1986, followed by a tenth anniversary edition in 1996, and a 25th anniversary edition in 2012. Over three million copies have been sold.   A special numbered and signed anniversary edition was also published in 1996, with an embossed gold jacket.",
   'title': 'Animalia (book)'}),
 ('<dbpedia:Academy_Award_for_Best_Production_Design>',
  {'text': "The Academy Awards are the oldest awards ceremony for achievements in motion pictures. The Academy Award for Best Production Design recognizes achievement in art direction on a film. The category's original name was Best Art Direction, but was changed to its current name in 2012 for the 85th Academy Awards.  This change resulted from the Art Director's branch of the Academy being renamed the Designer's branch.",
   'title': 'Academy Award for Best Production Design'}),
 ('<dbpedia:An_American_in_Paris>',
  {'tex

In [None]:
list(queries.items())[:3]

[('INEX_LD-2009022', 'Szechwan dish food cuisine'),
 ('INEX_LD-2009039', 'roman architecture'),
 ('INEX_LD-2009053', 'finland car industry manufacturer saab sisu')]

In [None]:
list(qrels.items())[0]

('INEX_LD-2009022',
 {'<dbpedia:Afghan_cuisine>': 0,
  '<dbpedia:Akan_cuisine>': 0,
  '<dbpedia:Ambuyat>': 0,
  '<dbpedia:American_Chinese_cuisine>': 1,
  '<dbpedia:Ants_climbing_a_tree>': 2,
  '<dbpedia:Baingan_bharta>': 1,
  '<dbpedia:Bamischijf>': 0,
  '<dbpedia:Black_cardamom>': 0,
  '<dbpedia:Brazilian_cuisine>': 0,
  '<dbpedia:British_cuisine>': 0,
  '<dbpedia:Caribbean_cuisine>': 0,
  '<dbpedia:Cellophane_noodles>': 1,
  '<dbpedia:Ceviche>': 0,
  '<dbpedia:Chana_masala>': 0,
  '<dbpedia:Chen_Kenichi>': 1,
  '<dbpedia:Chen_Kenmin>': 1,
  '<dbpedia:Chicago-style_pizza>': 0,
  '<dbpedia:Chicken_(food)>': 0,
  '<dbpedia:Chifle>': 0,
  '<dbpedia:Chili_oil>': 2,
  '<dbpedia:Chinatown,_Los_Angeles>': 0,
  '<dbpedia:Chinatown>': 1,
  '<dbpedia:Chinese_cuisine>': 2,
  '<dbpedia:Churumuri_(food)>': 0,
  '<dbpedia:Cookbook>': 0,
  '<dbpedia:Cooking>': 0,
  '<dbpedia:Couscous>': 0,
  '<dbpedia:Cuban_cuisine>': 0,
  '<dbpedia:Cuisine>': 0,
  '<dbpedia:Cuisine_of_Jharkhand>': 0,
  '<dbpedia:C

## Reduce the Dataset
We keep only the documents that are relevant for at least one query (value >= 1 in qrels) and add 100K randomly sampled documents that are not relevant for any query.

In [None]:
def select_relevants(qrels: dict, VALUE_RELEVANT=1) -> set:
  """
    Collect the ids of all documents judged relevant for at least one query.
    Args:
          - qrels (dict): maps each query id to a dict {document id: relevance value}
          - VALUE_RELEVANT (int): smallest relevance value that counts as relevant
    Returns:
          - docs_ids (set): ids of the documents whose relevance value is
            >= VALUE_RELEVANT for at least one query
  """
  # A single set comprehension walks every (document, value) pair of every query;
  # the set removes documents that are relevant for several queries.
  return {
      doc_id
      for judgments in qrels.values()
      for doc_id, grade in judgments.items()
      if grade >= VALUE_RELEVANT
  }

In [None]:
relevants = select_relevants(qrels)

In [None]:
len(relevants)

14877

In [None]:
seed(23)

def select_irrelevants(relevants: set, corpus: dict, K=100000) -> set:
  """
    Returns a random sample of at most K document ids of the corpus that are not
    in the set of relevant ids.
    Args:
          - relevants (set): Set of relevants keys in the original corpus
          - corpus (dict): the original corpus (its keys must be mutually orderable,
            e.g. all strings, because they are sorted before sampling)
          - K (int): number of ids to draw; when fewer than K candidates exist,
            all of them are returned
    Returns:
          - random_sample (set): the random sample of irrelevants docs
  """
  # Sort the candidates before sampling: the iteration order of a set of strings
  # changes from one interpreter run to the next (hash randomisation), so sampling
  # from list(set) would give a different result on every run even with a fixed seed.
  candidates = sorted(set(corpus) - relevants)

  # random.sample raises ValueError when asked for more items than available,
  # so never request more than the number of candidates.
  sample_size = min(K, len(candidates))

  return set(sample(candidates, sample_size))


In [None]:
irrelevants = select_irrelevants(relevants, corpus)

In [None]:

def get_reduced_corpus(corpus: dict, keys_to_keep: set) -> dict:
  """
    Build a reduced corpus made only of the documents whose id is in keys_to_keep.
    The corpus given as argument is not modified: a new dict is returned, which
    references the same document objects and keeps the order of the original corpus.
    (Delete the original corpus afterwards if its memory has to be released.)
    Args:
        - corpus (dict): original corpus {document id: document}
        - keys_to_keep (set): ids of the documents to keep; ids that are not in the
          corpus are ignored
    Returns:
        - reduced_corpus (dict): new corpus reduced.
  """
  # One pass over the corpus with O(1) membership tests in the set; nothing is
  # deleted from the caller's dict.
  return {doc_id: doc for doc_id, doc in corpus.items() if doc_id in keys_to_keep}

In [None]:
keys_to_keep = relevants.union(irrelevants)
reduced_corpus = get_reduced_corpus(corpus, keys_to_keep)

In [None]:
lg = len(reduced_corpus)
print(lg)

114877


Now that we have a reduced corpus, we can work with it.

## Creation of the model and embeddings

In [None]:
# Keep only the 'text' field of every document of the reduced corpus: this is what is given to the model
corpus_text = [doc['text'] for doc in reduced_corpus.values()]

# Text of the queries, in the iteration order of the queries dict
queries_text = [*queries.values()]

In [None]:
# Load the pre-trained checkpoint by its name
model = SentenceTransformer('msmarco-distilbert-base-v4')

# Encode the query texts and the document texts with the same model
queries_embedding = model.encode(queries_text)
corpus_embedding = model.encode(corpus_text)

## MAP @ 100

In [None]:
# Semantic search over the embeddings: for every query embedding, keep the top 100 hits among the corpus embeddings
hits = util.semantic_search(queries_embedding, corpus_embedding, top_k=100)

First, we create a function that determines whether a document in the hits variable is relevant or not.

In [None]:
IRRELEVANT_VALUE = 0

def is_relevant(index_q: int, hit: dict, qrels: dict, reduced_corpus: dict) -> int:
  """
      Tell whether the document of a hit is relevant for the query at position index_q.
  Args:
        - index_q (int): Index in the list of hits. It is used as the position of the
          query id in the keys of the global `queries` dict.
        - hit (dict): Contains {corpus_id, score}. 'corpus_id' is the position of the
          document in the keys of reduced_corpus.
        - qrels(dict): relevance judgments {query id: {document id: relevance value}}
        - reduced_corpus(dict): the reduced corpus
  Returns:
        1 if the document has a relevance value different from IRRELEVANT_VALUE for
        this query, 0 otherwise (also when the query or the document has no judgment).
  """
  # NOTE: the key lists are rebuilt on every call, which is linear in the size of the
  # dicts; `queries` is read from the notebook's global scope.
  query_tag = list(queries.keys())[index_q]

  # Read the corpus position through its key instead of "first value of the dict",
  # so the result does not depend on the order in which the hit dict was built.
  corpus_tag = list(reduced_corpus.keys())[hit['corpus_id']]

  # .get() with defaults covers both a query without any judgment and a document
  # that was not judged for this query: both are treated as irrelevant.
  relevance = qrels.get(query_tag, {}).get(corpus_tag, IRRELEVANT_VALUE)

  return 0 if relevance == IRRELEVANT_VALUE else 1

In [None]:
def mean_average_precision(hits: list, qrels: dict, reduced_corpus: dict) -> float:
  """
    Get the mean average precision (MAP) of a list of hits. The depth of the metric
    is the number of hits given per query (100 hits per query gives the MAP@100).
    For each query, the average precision is the mean of the precision values
    measured at the rank of every relevant hit; it is normalised by the number of
    relevant documents found in the hits of the query, not by the total number of
    relevant documents in qrels. A query without any relevant hit counts for 0.
    Args:
          - hits (list): hits obtain from the semantic search (one list of hits per query)
          - qrels (dict): the qrels dict
          - reduced_corpus (dict): the reduced corpus
    Returns:
          - MAP (float) : the score of MAP (between 0 and 1); 0.0 when hits is empty.
  """
  # Without any query the mean is undefined: return 0.0 instead of dividing by zero
  if not hits:
    return 0.0

  sum_ap = 0.0

  for index_q, query_hits in enumerate(hits):
    sum_precisions = 0.0
    nb_relevants = 0

    # rank is the 1-based position of the hit, i.e. the number of documents seen so far
    for rank, hit in enumerate(query_hits, start=1):
      if is_relevant(index_q, hit, qrels, reduced_corpus) == 1:
        nb_relevants += 1
        # Precision at this rank, only accumulated on relevant documents
        sum_precisions += nb_relevants / rank

    # Avoid the division by 0 when no hit of the query is relevant
    if nb_relevants > 0:
      sum_ap += sum_precisions / nb_relevants

  # Mean over all the queries
  return sum_ap / len(hits)

In [None]:
MAP = mean_average_precision(hits, qrels, reduced_corpus)
print("MAP =", MAP)

MAP = 0.6384498705369226


## Approximate nearest neighbours

In [None]:
# Size of one embedding vector and number of vectors to index
embedding_size = corpus_embedding.shape[1]
nb_vectors = len(corpus_embedding)

# Create the index object from the hnswlib library, using the cosine space
index = hnswlib.Index(space='cosine', dim=embedding_size)

# Initialise it with its capacity and the two construction hyperparameters
index.init_index(max_elements=nb_vectors, ef_construction=500, M=64)

# Add the corpus embeddings; each vector is labelled with its position in the corpus
index.add_items(corpus_embedding, list(range(nb_vectors)))

In [None]:
# Query the index: for every query embedding, get the labels of its 100 nearest corpus vectors
# NOTE(review): with space='cosine' the second returned array presumably holds distances rather
# than similarities (it is converted with 1 - value in the next cell) — confirm in the hnswlib docs.
corpus_ids, scores = index.knn_query(queries_embedding, k=100)

In [None]:
# Put the ANN results in the same layout as the hits of the semantic search:
# one list per query, holding one {'corpus_id', 'score'} dict per neighbour.
# The value returned by the index is turned into a score with 1 - value.
hits_ann = [
    [{'corpus_id': doc_idx, 'score': 1 - dist} for doc_idx, dist in zip(row_ids, row_dists)]
    for row_ids, row_dists in zip(corpus_ids, scores)
]

In [None]:
# Compute the MAP@100 from the hits obtained with the approximate nearest neighbours
MAP_ANN = mean_average_precision(hits_ann, qrels, reduced_corpus)
# The MAP is a fraction between 0 and 1, not a percentage: print it without a "%"
# suffix, in the same format as the MAP of the semantic search.
print("MAP with ANN =", MAP_ANN)

MAP with ANN = 0.6351560243791695%
