Training is done in the `distilbert+bm25_msmarco.ipynb` notebook

# Test

In [5]:
max_seq_length = 512
model_name = "distilbert-base-uncased" 
dataset = "msmarco_tiny"

dataset_path = "../datasets/msmarco_tiny/"
corpus_file = "tiny_collection.json"
queries_file = "topics.dl20.txt"
qrels_test_file = "qrels.dl20-passage.txt"
training_set = "msmarco_triples.train.tiny.tsv"

In [2]:

from sentence_transformers import losses, models, SentenceTransformer
from beir import util, LoggingHandler
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.search.lexical import BM25Search as BM25
from beir.retrieval.evaluation import EvaluateRetrieval
from beir.retrieval.train import TrainRetriever
import pathlib, os, tqdm
import logging

#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

  from tqdm.autonotebook import tqdm, trange


In [3]:
data_path = f"../datasets/{dataset}"
corpus, queries, qrels = GenericDataLoader(data_path).load(split="train")

2024-06-02 13:18:34 - Loading Corpus...


100%|██████████| 5183/5183 [00:00<00:00, 16771.97it/s]

2024-06-02 13:18:35 - Loaded 5183 TRAIN Documents.
2024-06-02 13:18:35 - Doc Example: {'text': 'Alterations of the architecture of cerebral white matter in the developing human brain can affect cortical development and result in functional disabilities. A line scan diffusion-weighted magnetic resonance imaging (MRI) sequence with diffusion tensor analysis was applied to measure the apparent diffusion coefficient, to calculate relative anisotropy, and to delineate three-dimensional fiber architecture in cerebral white matter in preterm (n = 17) and full-term infants (n = 7). To assess effects of prematurity on cerebral white matter development, early gestation preterm infants (n = 10) were studied a second time at term. In the central white matter the mean apparent diffusion coefficient at 28 wk was high, 1.8 microm2/ms, and decreased toward term to 1.2 microm2/ms. In the posterior limb of the internal capsule, the mean apparent diffusion coefficients at both times were similar (1.2 ver




In [6]:
%%bash

curl -sX GET "localhost:9200/"

{
  "name" : "ab",
  "cluster_name" : "elasticsearch",
  "cluster_uuid" : "JS2mz8c3RdOol8iLxCkFHA",
  "version" : {
    "number" : "7.9.2",
    "build_flavor" : "oss",
    "build_type" : "tar",
    "build_hash" : "d34da0ea4a966c4e49417f2da2f244e3e97b4e6e",
    "build_date" : "2020-09-23T00:45:33.626720Z",
    "build_snapshot" : false,
    "lucene_version" : "8.6.2",
    "minimum_wire_compatibility_version" : "6.8.0",
    "minimum_index_compatibility_version" : "6.0.0-beta1"
  },
  "tagline" : "You Know, for Search"
}


In [7]:
#### Lexical Retrieval using Bm25 (Elasticsearch) ####

## elasticsearch settings
hostname = "localhost" #localhost
index_name = dataset # scifact
initialize = True # True - Delete existing index and re-index all documents from scratch 

number_of_shards = 1
model = BM25(index_name=index_name, hostname=hostname, initialize=initialize, number_of_shards=number_of_shards)

2024-06-02 13:19:57 - Activating Elasticsearch....
2024-06-02 13:19:57 - Elastic Search Credentials: {'hostname': 'localhost', 'index_name': 'scifact', 'keys': {'title': 'title', 'body': 'txt'}, 'timeout': 100, 'retry_on_timeout': True, 'maxsize': 24, 'number_of_shards': 1, 'language': 'english'}
2024-06-02 13:19:57 - Deleting previous Elasticsearch-Index named - scifact
2024-06-02 13:19:59 - Creating fresh Elasticsearch-Index named - scifact


In [42]:
bm25 = EvaluateRetrieval(model)

#### Index passages into the index (seperately)
bm25.retriever.index(corpus)

triplets = []
qids = list(qrels) 
hard_negatives_max = 10

#### Retrieve BM25 hard negatives => Given a positive document, find most similar lexical documents
for idx in tqdm.tqdm(range(len(qids)), desc="Retrieve Hard Negatives using BM25"):
    query_id, query_text = qids[idx], queries[qids[idx]]
    pos_docs = [doc_id for doc_id in qrels[query_id] if qrels[query_id][doc_id] > 0]
    pos_doc_texts = [corpus[doc_id]["title"] + " " + corpus[doc_id]["text"] for doc_id in pos_docs]
    hits = bm25.retriever.es.lexical_multisearch(texts=pos_doc_texts, top_hits=hard_negatives_max+1)
    for (pos_text, hit) in zip(pos_doc_texts, hits):
        for (neg_id, _) in hit.get("hits"):
            if neg_id not in pos_docs:
                neg_text = corpus[neg_id]["title"] + " " + corpus[neg_id]["text"]
                triplets.append([query_text, pos_text, neg_text])


  0%|          | 0/5183 [00:00<?, ?docs/s]             
Retrieve Hard Negatives using BM25: 100%|██████████| 809/809 [00:08<00:00, 93.34it/s] 


In [43]:
#### Provide any sentence-transformers or HF model
word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

#### Provide a high batch-size to train better with triplets!
retriever = TrainRetriever(model=model, batch_size=12)



2024-05-31 01:32:47 - Use pytorch device_name: cuda


In [10]:
#### Provide model save path
model_save_path = os.path.join(os.getcwd(), "../output", "{}-v2-{}-bm25-hard-negs".format(model_name, dataset))
os.makedirs(model_save_path, exist_ok=True)

In [45]:
len(triplets), type(triplets)

(9121, list)

In [51]:
#### Validation set can't be provider to retriever.fit()

# train_cutoff = 2800
# #### Prepare triplets-train samples
# valid_samples = retriever.load_train_triplets(triplets=triplets[:train_cutoff])
# valid_dataloader = retriever.prepare_train_triplets(valid_samples)

#### Prepare triplets-validation samples
train_samples = retriever.load_train_triplets(triplets=triplets)
train_dataloader = retriever.prepare_train_triplets(train_samples)

#### Training SBERT with cosine-product
train_loss = losses.MultipleNegativesRankingLoss(model=retriever.model)

#### Prepare dev evaluator
# ir_evaluator = retriever.load_ir_evaluator(dev_corpus, dev_queries, dev_qrels)

#### If no dev set is present from above use dummy evaluator
ir_evaluator = retriever.load_dummy_evaluator()



Adding Input Examples: 100%|██████████| 761/761 [00:00<00:00, 94990.34it/s]

2024-05-31 01:37:15 - Loaded 9121 training pairs.





In [52]:
# #### Configure Train params
# num_epochs = 10
# evaluation_steps = 5000
# warmup_steps = int(len(train_samples) * num_epochs / retriever.batch_size * 0.1)

# retriever.fit(train_objectives=[(train_dataloader, train_loss)], 
#                 evaluator=ir_evaluator, 
#                 epochs=num_epochs,
#                 output_path=model_save_path,
#                 warmup_steps=warmup_steps,
#                 evaluation_steps=evaluation_steps,
#                 use_amp=True)


2024-05-31 01:37:16 - Starting to Train...




Step,Training Loss,Validation Loss


KeyboardInterrupt: 

# Evaluate-distilBert

In [8]:
# Loading test set
corpus, queries, qrels = GenericDataLoader(data_path).load(split="test")

2024-06-02 13:20:32 - Loading Corpus...


100%|██████████| 5183/5183 [00:00<00:00, 11253.96it/s]

2024-06-02 13:20:33 - Loaded 5183 TEST Documents.
2024-06-02 13:20:33 - Doc Example: {'text': 'Alterations of the architecture of cerebral white matter in the developing human brain can affect cortical development and result in functional disabilities. A line scan diffusion-weighted magnetic resonance imaging (MRI) sequence with diffusion tensor analysis was applied to measure the apparent diffusion coefficient, to calculate relative anisotropy, and to delineate three-dimensional fiber architecture in cerebral white matter in preterm (n = 17) and full-term infants (n = 7). To assess effects of prematurity on cerebral white matter development, early gestation preterm infants (n = 10) were studied a second time at term. In the central white matter the mean apparent diffusion coefficient at 28 wk was high, 1.8 microm2/ms, and decreased toward term to 1.2 microm2/ms. In the posterior limb of the internal capsule, the mean apparent diffusion coefficients at both times were similar (1.2 vers




In [11]:
from beir.retrieval.evaluation import EvaluateRetrieval
from beir.retrieval import models
from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES

## Load retriever from saved model

model = DRES(models.SentenceBERT(model_save_path), batch_size=128)
retriever = EvaluateRetrieval(model, score_function="cos_sim")

#### Retrieve dense results (format of results is identical to qrels)
results = retriever.retrieve(corpus, queries)

2024-06-02 13:20:54 - Use pytorch device_name: cuda
2024-06-02 13:20:54 - Load pretrained SentenceTransformer: /mnt/c/D_drive/UCSD/Quarters/Q3/DSC253-Adv_txt_mining/Project/slm4search/src/../output/distilbert-base-uncased-v2-scifact-bm25-hard-negs
2024-06-02 13:21:11 - Encoding Queries...


Batches: 100%|██████████| 3/3 [00:17<00:00,  5.69s/it]


2024-06-02 13:21:28 - Sorting Corpus by document length (Longest first)...
2024-06-02 13:21:28 - Scoring Function: Cosine Similarity (cos_sim)
2024-06-02 13:21:28 - Encoding Batch 1/1...


Batches:  32%|███▏      | 13/41 [00:12<00:26,  1.07it/s]


KeyboardInterrupt: 

In [55]:
import json

output_path = "../output/"
with open(f"{output_path}{dataset}_distilBert_results.json", 'w') as f:
    json.dump(results, f)

In [12]:
import json

output_path = "../output/"
with open(f"{output_path}{dataset}_distilBert_results.json", 'r') as f:
    results = json.load(f)

In [13]:
#### Evaluate your retrieval using NDCG@k, MAP@K ...
logging.info("Retriever evaluation for k in: {}".format(retriever.k_values))
ndcg, _map, recall, precision = retriever.evaluate(qrels, results, retriever.k_values)
ndcg, _map, recall, precision

2024-06-02 13:21:46 - Retriever evaluation for k in: [1, 3, 5, 10, 100, 1000]
2024-06-02 13:21:46 - For evaluation, we ignore identical query and document ids (default), please explicitly set ``ignore_identical_ids=False`` to ignore this.
2024-06-02 13:21:46 - 

2024-06-02 13:21:46 - NDCG@1: 0.6767
2024-06-02 13:21:46 - NDCG@3: 0.7256
2024-06-02 13:21:46 - NDCG@5: 0.7348
2024-06-02 13:21:46 - NDCG@10: 0.7430
2024-06-02 13:21:46 - NDCG@100: 0.7634
2024-06-02 13:21:46 - NDCG@1000: 0.7725
2024-06-02 13:21:46 - 

2024-06-02 13:21:46 - MAP@1: 0.6409
2024-06-02 13:21:46 - MAP@3: 0.7052
2024-06-02 13:21:46 - MAP@5: 0.7131
2024-06-02 13:21:46 - MAP@10: 0.7168
2024-06-02 13:21:46 - MAP@100: 0.7207
2024-06-02 13:21:46 - MAP@1000: 0.7210
2024-06-02 13:21:46 - 

2024-06-02 13:21:46 - Recall@1: 0.6409
2024-06-02 13:21:46 - Recall@3: 0.7594
2024-06-02 13:21:46 - Recall@5: 0.7855
2024-06-02 13:21:46 - Recall@10: 0.8089
2024-06-02 13:21:46 - Recall@100: 0.9061
2024-06-02 13:21:46 - Recall@1000: 0.9782

({'NDCG@1': 0.67667,
  'NDCG@3': 0.72557,
  'NDCG@5': 0.73475,
  'NDCG@10': 0.74305,
  'NDCG@100': 0.76343,
  'NDCG@1000': 0.77253},
 {'MAP@1': 0.64094,
  'MAP@3': 0.70522,
  'MAP@5': 0.71306,
  'MAP@10': 0.71684,
  'MAP@100': 0.72071,
  'MAP@1000': 0.72102},
 {'Recall@1': 0.64094,
  'Recall@3': 0.75939,
  'Recall@5': 0.7855,
  'Recall@10': 0.80894,
  'Recall@100': 0.90611,
  'Recall@1000': 0.97822},
 {'P@1': 0.67667,
  'P@3': 0.27667,
  'P@5': 0.17267,
  'P@10': 0.08933,
  'P@100': 0.01013,
  'P@1000': 0.0011})

# Evaluate-bm25

In [14]:
from beir.retrieval.search.lexical import BM25Search as BM25
from beir.retrieval.evaluation import EvaluateRetrieval

model_bm25 = BM25(index_name=index_name, hostname=hostname, initialize=initialize)
retriever_bm25 = EvaluateRetrieval(model_bm25)

#### Retrieve dense results (format of results is identical to qrels)
results_bm25 = retriever_bm25.retrieve(corpus, queries)

2024-06-02 13:21:48 - Activating Elasticsearch....
2024-06-02 13:21:48 - Elastic Search Credentials: {'hostname': 'localhost', 'index_name': 'scifact', 'keys': {'title': 'title', 'body': 'txt'}, 'timeout': 100, 'retry_on_timeout': True, 'maxsize': 24, 'number_of_shards': 'default', 'language': 'english'}
2024-06-02 13:21:48 - Deleting previous Elasticsearch-Index named - scifact
2024-06-02 13:21:51 - Creating fresh Elasticsearch-Index named - scifact


  0%|          | 0/5183 [00:00<?, ?docs/s]             
que: 100%|██████████| 3/3 [00:03<00:00,  1.09s/it]


In [15]:
#### Evaluate your retrieval using NDCG@k, MAP@K ...
ndcg, _map, recall, precision = retriever_bm25.evaluate(qrels, results_bm25, retriever_bm25.k_values)
ndcg, _map, recall, precision

2024-06-02 13:22:04 - For evaluation, we ignore identical query and document ids (default), please explicitly set ``ignore_identical_ids=False`` to ignore this.
2024-06-02 13:22:04 - 

2024-06-02 13:22:04 - NDCG@1: 0.5767
2024-06-02 13:22:04 - NDCG@3: 0.6345
2024-06-02 13:22:04 - NDCG@5: 0.6632
2024-06-02 13:22:04 - NDCG@10: 0.6886
2024-06-02 13:22:04 - NDCG@100: 0.7119
2024-06-02 13:22:04 - NDCG@1000: 0.7192
2024-06-02 13:22:04 - 

2024-06-02 13:22:04 - MAP@1: 0.5559
2024-06-02 13:22:04 - MAP@3: 0.6127
2024-06-02 13:22:04 - MAP@5: 0.6297
2024-06-02 13:22:04 - MAP@10: 0.6422
2024-06-02 13:22:04 - MAP@100: 0.6476
2024-06-02 13:22:04 - MAP@1000: 0.6479
2024-06-02 13:22:04 - 

2024-06-02 13:22:04 - Recall@1: 0.5559
2024-06-02 13:22:04 - Recall@3: 0.6759
2024-06-02 13:22:04 - Recall@5: 0.7446
2024-06-02 13:22:04 - Recall@10: 0.8164
2024-06-02 13:22:04 - Recall@100: 0.9192
2024-06-02 13:22:04 - Recall@1000: 0.9767
2024-06-02 13:22:04 - 

2024-06-02 13:22:04 - P@1: 0.5767
2024-06-02 13:22:04

({'NDCG@1': 0.57667,
  'NDCG@3': 0.63447,
  'NDCG@5': 0.66322,
  'NDCG@10': 0.68862,
  'NDCG@100': 0.71187,
  'NDCG@1000': 0.71921},
 {'MAP@1': 0.55594,
  'MAP@3': 0.61266,
  'MAP@5': 0.62966,
  'MAP@10': 0.64216,
  'MAP@100': 0.64763,
  'MAP@1000': 0.64792},
 {'Recall@1': 0.55594,
  'Recall@3': 0.67594,
  'Recall@5': 0.74456,
  'Recall@10': 0.81644,
  'Recall@100': 0.91922,
  'Recall@1000': 0.97667},
 {'P@1': 0.57667,
  'P@3': 0.24,
  'P@5': 0.16133,
  'P@10': 0.09033,
  'P@100': 0.0104,
  'P@1000': 0.00111})

# Ensemble

In [16]:
def get_maxmin(results):
    max_score = -1
    min_score = 999999
    for q_id, q in results.items():
        for doc_id, score in q.items():
            max_score = max(score, max_score)
            min_score = min(score, min_score)

    return min_score, max_score

# Get range to normalize both
min_distilbert_score, max_distilbert_score = get_maxmin(results)
min_bm25_score, max_bm25_score = get_maxmin(results_bm25)

min_distilbert_score, max_distilbert_score, min_bm25_score, max_bm25_score

(0.00429678987711668, 0.9599123001098633, 0.5297587, 120.60852)

In [17]:
# Normalize
def normalize_results(results, min_score, max_score):
    for q_id, q in results.items():
        for doc_id, score in q.items():
            results[q_id][doc_id] = (score-min_score)/(max_score-min_score)

    return results

results = normalize_results(results, min_distilbert_score, max_distilbert_score)
results_bm25 = normalize_results(results_bm25, min_bm25_score, max_bm25_score)
# results

In [18]:
# results_bm25

In [19]:
def ensemble_score(x,y):
    mu = 0.5
    return mu*x + (1-mu)*y

combined_result = {}

for q_id_1, q_1 in results.items():
        combined_result[q_id_1] = {}
        for doc_id_1, score_1 in q_1.items():
            
            score_2 = 0
            if results_bm25[q_id_1].get(doc_id_1,None)!=None:
                score_2 = results_bm25[q_id_1][doc_id_1]
                del results_bm25[q_id_1][doc_id_1] # So that same query-doc pair is not added to combined result twice
            
            combined_score = ensemble_score(score_1, score_2)
            combined_result[q_id_1][doc_id_1] = combined_score


# Now add remaining bm25 results in combined dict
for q_id_2, q_2 in results_bm25.items():
    for doc_id_2, score_2 in q_2.items():
         score_1 = 0
         combined_score = ensemble_score(score_1, score_2)
         combined_result[q_id_1][doc_id_1] = combined_score

In [21]:
ndcg, _map, recall, precision = retriever_bm25.evaluate(qrels, combined_result, retriever_bm25.k_values)

2024-06-02 13:22:31 - For evaluation, we ignore identical query and document ids (default), please explicitly set ``ignore_identical_ids=False`` to ignore this.
2024-06-02 13:22:31 - 

2024-06-02 13:22:31 - NDCG@1: 0.7400
2024-06-02 13:22:31 - NDCG@3: 0.7794
2024-06-02 13:22:31 - NDCG@5: 0.7998
2024-06-02 13:22:31 - NDCG@10: 0.8109
2024-06-02 13:22:31 - NDCG@100: 0.8260
2024-06-02 13:22:31 - NDCG@1000: 0.8296
2024-06-02 13:22:31 - 

2024-06-02 13:22:31 - MAP@1: 0.7026
2024-06-02 13:22:31 - MAP@3: 0.7611
2024-06-02 13:22:31 - MAP@5: 0.7761
2024-06-02 13:22:31 - MAP@10: 0.7814
2024-06-02 13:22:31 - MAP@100: 0.7852
2024-06-02 13:22:31 - MAP@1000: 0.7854
2024-06-02 13:22:31 - 

2024-06-02 13:22:31 - Recall@1: 0.7026
2024-06-02 13:22:31 - Recall@3: 0.8061
2024-06-02 13:22:31 - Recall@5: 0.8573
2024-06-02 13:22:31 - Recall@10: 0.8885
2024-06-02 13:22:31 - Recall@100: 0.9519
2024-06-02 13:22:31 - Recall@1000: 0.9782
2024-06-02 13:22:31 - 

2024-06-02 13:22:31 - P@1: 0.7400
2024-06-02 13:22:31