In [1]:
import os
from tqdm import tqdm
import numpy as np
from load_dataset import load_corpus, load_queries
from crossencoder_bm25 import CustomCrossEncoder
from sentence_transformers.cross_encoder import CrossEncoder
import pickle


  from .autonotebook import tqdm as notebook_tqdm


On lit les passages retrieve par BM25 (top 1000) :

In [6]:
data_folder = "./data/msmarco-passage/"
corpus = load_corpus(os.path.join(data_folder, "collection.tsv"))
queries = load_queries(os.path.join(data_folder, "queries.dev.small.tsv"))

import gzip


def load_retrieval(retrieval_filepath, corpus, queries):
    retrieval_samples = {}
    with gzip.open(retrieval_filepath, "rt") as fIn:
        for line in tqdm(fIn, unit_scale=True):
            qid, corpus_id, rank, bm25_score = line.strip().split("\t")

            query = queries[qid]
            passage = corpus[corpus_id]

            if qid in retrieval_samples:
                retrieval_samples[qid].append([query, passage])
            else:
                retrieval_samples[qid] = [[query, passage]]

    return retrieval_samples


retrieval_samples = load_retrieval(
    os.path.join(data_folder, "msmarco.bm25.dev.small.tsv.gz"), corpus, queries
)


8.84Mit [00:14, 600kit/s]
6.98kit [00:00, 1.23Mit/s]
6.97Mit [00:17, 390kit/s] 


Initialisation du Cross-Encoder (CEBM25CAT) :

In [5]:
model_name = "output/training_ms-marco_cross-encoder-microsoft-MiniLM-L12-H384-uncased-2023-05-09_18-21-03-latest"
model = CrossEncoder(model_name)

Re-Ranking :

In [6]:
similarity_scores = {}
for qid in tqdm(queries):
    similarity_scores[qid] = [model.predict(retrieval_samples[qid])]
    

  0%|          | 0/6980 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
 58%|█████▊    | 4034/6980 [1:01:48<46:18,  1.06it/s]

On save parce que c'est long :

In [2]:
if not os.path.exists("reranking_baseline.pkl"):
    with open('reranking_baseline.pkl', 'wb') as f:
        pickle.dump(similarity_scores, f)
else:
    with open('reranking_baseline.pkl', 'rb') as f:
        similarity_scores = pickle.load(f)

Maintenant on prépare les données pour pouvoir calculer les métriques...

In [3]:
dev_filepath = "runs/run.msmarco-passage.bm25tuned.txt"
dev = {}

with open(dev_filepath, "r") as fIn:
    for line in tqdm(fIn, unit_scale=True):
        qid, corpus_id, rank = line.strip().split("\t")
        
        if qid in dev:
            dev[qid].append([qid, corpus_id])
        else:
            dev[qid] = [[qid, corpus_id]]



6.97Mit [00:12, 558kit/s] 


Il ne faut pas oublier de sort :

In [7]:
results = [np.concatenate((np.array(dev[qid]), similarity_scores[qid][0].reshape(-1, 1)), axis=1) for qid in queries]


In [8]:
results[0]

array([['1048585', '7187158', '0.99066484'],
       ['1048585', '7187157', '0.9708628'],
       ['1048585', '7187163', '0.94394267'],
       ...,
       ['1048585', '5771111', '0.00012813006'],
       ['1048585', '6073381', '0.000110037196'],
       ['1048585', '6339403', '0.000116001946']], dtype='<U32')

In [9]:
results_sorted = [results[i][results[i][:, 2].argsort()[::-1]] for i in range(len(results))]

In [10]:
results_sorted[0]

array([['1048585', '7187155', '0.99207896'],
       ['1048585', '7187158', '0.99066484'],
       ['1048585', '7187160', '0.9873302'],
       ...,
       ['1048585', '7012534', '0.000107882675'],
       ['1048585', '8593160', '0.00010783897'],
       ['1048585', '6516178', '0.00010779846']], dtype='<U32')

Et on peut enfin sauvegarder les résultats pour Pyserini :

In [11]:
import csv

with open("run.msmarco-passage.reranking.baseline.txt", "w") as fOut:
    writer = csv.writer(fOut, delimiter="\t", lineterminator="\n")
    for result in results_sorted:
        for rank, line in enumerate(result):
            qid = line[0]
            corpus_id = line[1]
            writer.writerow([qid, corpus_id, rank + 1])