In [1]:
import transformers
from datasets import load_dataset
import pandas as pd
import numpy as np
import torch
import time
from tqdm import tqdm_notebook as tqdm

In [2]:
from beir import util, LoggingHandler
from beir.datasets.data_loader import GenericDataLoader
import logging
import pathlib, os

#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

In [3]:
import pathlib, os
from beir import util

dataset = "nfcorpus"
url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip".format(dataset)
out_dir = os.path.join(os.getcwd(), "datasets")
data_path = util.download_and_unzip(url, out_dir)
print("Dataset downloaded here: {}".format(data_path))

Dataset downloaded here: /home/toghrul/ada/ml/final/datasets/nfcorpus


In [4]:
!ls datasets/scifact/

corpus.csv	      embeddings.csv  qrels.csv      retrieval_results.csv
corpus.jsonl	      hybrid	      queries.csv
cosine		      keyword_rerank  queries.jsonl
cross_encoder_rerank  qrels	      results.csv


In [5]:
corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="test")

2024-05-23 20:45:29 - Loading Corpus...


  0%|          | 0/3633 [00:00<?, ?it/s]

2024-05-23 20:45:30 - Loaded 3633 TEST Documents.
2024-05-23 20:45:30 - Doc Example: {'text': 'Recent studies have suggested that statins, an established drug group in the prevention of cardiovascular mortality, could delay or prevent breast cancer recurrence but the effect on disease-specific mortality remains unclear. We evaluated risk of breast cancer death among statin users in a population-based cohort of breast cancer patients. The study cohort included all newly diagnosed breast cancer patients in Finland during 1995–2003 (31,236 cases), identified from the Finnish Cancer Registry. Information on statin use before and after the diagnosis was obtained from a national prescription database. We used the Cox proportional hazards regression method to estimate mortality among statin users with statin use as time-dependent variable. A total of 4,151 participants had used statins. During the median follow-up of 3.25 years after the diagnosis (range 0.08–9.0 years) 6,011 participants die

In [6]:
corpus_idx = list(corpus.keys())
corpus_vals = list(corpus.values())

corpus_df = pd.DataFrame(corpus_vals, index=corpus_idx)

In [7]:
corpus_df

Unnamed: 0,text,title
MED-10,"Recent studies have suggested that statins, an...",Statin Use and Breast Cancer Survival: A Natio...
MED-14,BACKGROUND: Preclinical studies have shown tha...,Statin use after diagnosis of breast cancer an...
MED-118,The aims of this study were to determine the c...,Alkylphenols in human milk and their relations...
MED-301,Epilepsy or seizure disorder is one of the mos...,Methylmercury: A Potential Environmental Risk ...
MED-306,Hit Reaction Time latencies (HRT) in the Conti...,Sensitivity of Continuous Performance Test (CP...
...,...,...
MED-917,Scottish-grown red raspberries are a rich sour...,Effect of freezing and storage on the phenolic...
MED-941,BACKGROUND: Common warts (verruca vulgaris) ar...,Topical vitamin A treatment of recalcitrant co...
MED-942,Apple cider vinegar products are advertised in...,Esophageal injury by apple cider vinegar table...
MED-952,The use of cannabis is embedded within many so...,Cannabis and the lung.


In [8]:
queries_df = pd.Series(queries)

In [9]:
from typing import List
import logging
from pydantic import BaseModel
from sentence_transformers import SentenceTransformer
# from rag import insert_document_and_embeddings, find_similar_embeddings, preprocess
from datetime import datetime
import re
from nltk import tokenize
import unicodedata
import string
import logging
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity

MAX_WORD_COUNT = 256
MAX_TOKEN_COUNT = 512


model_name = "mixedbread-ai/mxbai-embed-large-v1"

tokenizer = AutoTokenizer.from_pretrained(model_name)
EMBED_MODEL = SentenceTransformer(model_name, truncate_dim=MAX_TOKEN_COUNT, device="cuda")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
EMBED_MODEL.to(device)


def encoding(text: str) -> str:
    """
    Remove unicoded data
    """
    text = unicodedata.normalize("NFKD", text)

    return text


def remove_URL(text: str) -> str:
    """
    Remove URLs
    """
    return re.sub(r"https?://\S+|www\.\S+", "", text)


def remove_non_ascii(text: str) -> str:
    """
    Remove non-ASCII characters
    """
    return re.sub(r"[^\x00-\x7f]", r"", text)


def remove_html(text: str) -> str:
    """
    Remove the html
    """
    html = re.compile(r"<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});")
    return re.sub(html, "", text)


def remove_punct(text: str) -> str:
    """
    Remove the punctuation
    """
    #     return re.sub(r'[]!"$%&\'()*+,./:;=#@?[\\^_`{|}~-]+', "", text)
    return text.translate(str.maketrans("", "", string.punctuation))


def preprocess(text: str) -> str:
    """
    Preprocess the text
    """

    text = encoding(text)
    text = remove_URL(text)
    text = remove_non_ascii(text)
    text = remove_html(text)
    # text = remove_punct(text)
    return text


def ingest_input(user_input):
    user_input = preprocess(user_input)
    # logging.info(f"Preprocessed user input")

    # Generate sentence tokens
    sentence_tokens = tokenize.sent_tokenize(user_input)
    model_input = []
    temp_input: str = ""
    if len(user_input.split(" ")) > MAX_WORD_COUNT:
        logging.info(
            f"Input contains more than {MAX_WORD_COUNT} words. Splitting the input into chunks"
        )

        # Split the input into chunks based on the sentence tokens
        for i, sent in enumerate(sentence_tokens):
            num_words_sent = len(sent.split(" "))

            # Check if the new chunk would exceed the maximum word count
            if len(temp_input.split(" ")) + num_words_sent > MAX_WORD_COUNT:

                # Append the chunk to the model input
                model_input.append(temp_input.strip())
                logging.info(
                    f"Number of words in the chunk: {len(temp_input.split(' '))}"
                )
                temp_input = sent
            else:
                temp_input += " " + sent

        # Append the last chunk to the model input
        logging.info(
            f"Number of words in the last chunk: {len(temp_input.split(' '))}"
        )
        model_input.append(temp_input)
    else:
        model_input = [user_input]

    return model_input


def read_pdf_doc(filepath):
    doc = fitz.open(filepath)
    text = ""
    for page_index, page in enumerate(doc):
        logging.info(f"page {page_index+1} out of {len(doc)}")
        tp = page.get_textpage()
        words = tp.extractWORDS()

        page_text = " ".join([word[4] for word in words])
        text += page_text + " "
    doc.close()
    return text


def generate_embeddings(text, device):
    # inputs = tokenizer(
    #     text,
    #     return_tensors="pt",
    #     truncation=True,
    #     max_length=MAX_TOKEN_COUNT,
    #     padding="max_length",
    # )
    with torch.no_grad():
        outputs = EMBED_MODEL.encode(text, device=device)

    # outputs = outputs.cpu().numpy()
    # Scale the embeddings to be between 0 and 1
    outputs = (outputs - outputs.min()) / (outputs.max() - outputs.min())
    return outputs


2024-05-23 20:45:31 - Load pretrained SentenceTransformer: mixedbread-ai/mxbai-embed-large-v1


# Embedding Generation

In [11]:
text_chunks_list = []
embeddings_list = []
doc_ids = []

batch_size = 128
batch_no = 0
text_batch = []
ids_batch = []

for idx, doc in tqdm(corpus_df.iterrows(), total=len(corpus_df), desc="Processing documents"):
    text_chunks = ingest_input(doc['text'])

    start = time.time()
    text_batch.extend(text_chunks)
    ids_batch.extend([idx] * len(text_chunks))

    while len(text_batch) >= batch_size:
        sub_text_batch = text_batch[:batch_size]
        sub_ids_batch = ids_batch[:batch_size]

        embeddings = generate_embeddings(sub_text_batch, device)
        embeddings_list.append(embeddings)
        text_chunks_list.extend(sub_text_batch)
        doc_ids.extend(sub_ids_batch)

        logging.info(f">>> Generated embeddings for batch {batch_no}")
        logging.info(f"Shape of embeddings: {embeddings.shape}")
        logging.info(f"Time taken for the batch: {time.time() - start}")

        text_batch = text_batch[batch_size:]
        ids_batch = ids_batch[batch_size:]
        batch_no += 1

# Handle any remaining batches
if len(text_batch) > 0:
    torch.cuda.empty_cache()  # Clear CUDA cache before final batch
    logging.info(f"Processing remaining batch of size {len(text_batch)}")
    sub_batch_size = 128  # Same sub-batch size as the main batch
    for i in range(0, len(text_batch), sub_batch_size):
        sub_text_batch = text_batch[i:i + sub_batch_size]
        sub_ids_batch = ids_batch[i:i + sub_batch_size]

        embeddings = generate_embeddings(sub_text_batch, device)
        embeddings_list.append(embeddings)
        text_chunks_list.extend(sub_text_batch)
        doc_ids.extend(sub_ids_batch)

        logging.info(f">>> Generated embeddings for sub-batch {batch_no}-{i // sub_batch_size}")
        logging.info(f"Shape of embeddings: {embeddings.shape}")
        logging.info(f"Time taken for the sub-batch: {time.time() - start}")

        torch.cuda.empty_cache()  # Clear CUDA cache after each sub-batch

embeddings_list = np.concatenate(embeddings_list, axis=0)
doc_ids = np.array(doc_ids)


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for idx, doc in tqdm(corpus_df.iterrows(), total=len(corpus_df), desc="Processing documents"):


Processing documents:   0%|          | 0/3633 [00:00<?, ?it/s]

2024-05-23 18:33:32 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:33:32 - Number of words in the chunk: 233
2024-05-23 18:33:32 - Number of words in the last chunk: 30
2024-05-23 18:33:32 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:33:32 - Number of words in the chunk: 245
2024-05-23 18:33:32 - Number of words in the last chunk: 38
2024-05-23 18:33:32 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:33:32 - Number of words in the chunk: 242
2024-05-23 18:33:32 - Number of words in the last chunk: 33
2024-05-23 18:33:32 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:33:32 - Number of words in the chunk: 240
2024-05-23 18:33:32 - Number of words in the last chunk: 46
2024-05-23 18:33:32 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:33:32 - Number of words in the chunk: 245
2024-05-23 18:33:32 - Number o

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 18:33:36 - >>> Generated embeddings for batch 0
2024-05-23 18:33:36 - Shape of embeddings: (128, 512)
2024-05-23 18:33:36 - Time taken for the batch: 4.781296491622925
2024-05-23 18:33:36 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:33:36 - Number of words in the chunk: 253
2024-05-23 18:33:36 - Number of words in the last chunk: 116
2024-05-23 18:33:36 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:33:36 - Number of words in the chunk: 256
2024-05-23 18:33:36 - Number of words in the last chunk: 22
2024-05-23 18:33:36 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:33:36 - Number of words in the chunk: 251
2024-05-23 18:33:36 - Number of words in the last chunk: 186
2024-05-23 18:33:36 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:33:36 - Number of words in the chunk: 256
2024-05-23 18:33:36 - Number of words in the last chunk

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 18:33:41 - >>> Generated embeddings for batch 1
2024-05-23 18:33:41 - Shape of embeddings: (128, 512)
2024-05-23 18:33:41 - Time taken for the batch: 4.103841781616211
2024-05-23 18:33:41 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:33:41 - Number of words in the chunk: 245
2024-05-23 18:33:41 - Number of words in the last chunk: 20
2024-05-23 18:33:41 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:33:41 - Number of words in the chunk: 244
2024-05-23 18:33:41 - Number of words in the last chunk: 52
2024-05-23 18:33:41 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:33:41 - Number of words in the chunk: 246
2024-05-23 18:33:41 - Number of words in the last chunk: 54
2024-05-23 18:33:41 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:33:41 - Number of words in the chunk: 234
2024-05-23 18:33:41 - Number of words in the last chunk: 

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 18:33:45 - >>> Generated embeddings for batch 2
2024-05-23 18:33:45 - Shape of embeddings: (128, 512)
2024-05-23 18:33:45 - Time taken for the batch: 4.1856303215026855
2024-05-23 18:33:45 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:33:45 - Number of words in the chunk: 246
2024-05-23 18:33:45 - Number of words in the last chunk: 38
2024-05-23 18:33:45 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:33:45 - Number of words in the chunk: 237
2024-05-23 18:33:45 - Number of words in the last chunk: 25
2024-05-23 18:33:45 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:33:45 - Number of words in the chunk: 255
2024-05-23 18:33:45 - Number of words in the last chunk: 7
2024-05-23 18:33:45 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:33:45 - Number of words in the chunk: 225
2024-05-23 18:33:45 - Number of words in the last chunk: 

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 18:33:49 - >>> Generated embeddings for batch 3
2024-05-23 18:33:49 - Shape of embeddings: (128, 512)
2024-05-23 18:33:49 - Time taken for the batch: 3.669057846069336
2024-05-23 18:33:49 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:33:49 - Number of words in the chunk: 244
2024-05-23 18:33:49 - Number of words in the last chunk: 90
2024-05-23 18:33:49 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:33:49 - Number of words in the chunk: 244
2024-05-23 18:33:49 - Number of words in the last chunk: 23
2024-05-23 18:33:49 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:33:49 - Number of words in the chunk: 255
2024-05-23 18:33:49 - Number of words in the last chunk: 13
2024-05-23 18:33:49 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:33:49 - Number of words in the chunk: 231
2024-05-23 18:33:49 - Number of words in the last chunk: 

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 18:33:53 - >>> Generated embeddings for batch 4
2024-05-23 18:33:53 - Shape of embeddings: (128, 512)
2024-05-23 18:33:53 - Time taken for the batch: 4.145297527313232
2024-05-23 18:33:53 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:33:53 - Number of words in the chunk: 253
2024-05-23 18:33:53 - Number of words in the last chunk: 12
2024-05-23 18:33:53 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:33:53 - Number of words in the chunk: 245
2024-05-23 18:33:53 - Number of words in the last chunk: 34
2024-05-23 18:33:53 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:33:53 - Number of words in the chunk: 244
2024-05-23 18:33:53 - Number of words in the last chunk: 46
2024-05-23 18:33:53 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:33:53 - Number of words in the chunk: 245
2024-05-23 18:33:53 - Number of words in the last chunk: 

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 18:33:57 - >>> Generated embeddings for batch 5
2024-05-23 18:33:57 - Shape of embeddings: (128, 512)
2024-05-23 18:33:57 - Time taken for the batch: 4.307407379150391
2024-05-23 18:33:57 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:33:57 - Number of words in the chunk: 226
2024-05-23 18:33:57 - Number of words in the last chunk: 41
2024-05-23 18:33:57 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:33:57 - Number of words in the chunk: 214
2024-05-23 18:33:57 - Number of words in the last chunk: 66
2024-05-23 18:33:57 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:33:57 - Number of words in the chunk: 248
2024-05-23 18:33:57 - Number of words in the last chunk: 20
2024-05-23 18:33:57 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:33:57 - Number of words in the chunk: 210
2024-05-23 18:33:57 - Number of words in the last chunk: 

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 18:34:02 - >>> Generated embeddings for batch 6
2024-05-23 18:34:02 - Shape of embeddings: (128, 512)
2024-05-23 18:34:02 - Time taken for the batch: 4.458921670913696
2024-05-23 18:34:02 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:34:02 - Number of words in the chunk: 240
2024-05-23 18:34:02 - Number of words in the last chunk: 85
2024-05-23 18:34:02 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:34:02 - Number of words in the chunk: 252
2024-05-23 18:34:02 - Number of words in the last chunk: 26
2024-05-23 18:34:02 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:34:02 - Number of words in the chunk: 252
2024-05-23 18:34:02 - Number of words in the last chunk: 8
2024-05-23 18:34:02 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:34:02 - Number of words in the chunk: 232
2024-05-23 18:34:02 - Number of words in the last chunk: 2

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 18:34:06 - >>> Generated embeddings for batch 7
2024-05-23 18:34:06 - Shape of embeddings: (128, 512)
2024-05-23 18:34:06 - Time taken for the batch: 4.7276999950408936
2024-05-23 18:34:06 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:34:06 - Number of words in the chunk: 250
2024-05-23 18:34:06 - Number of words in the last chunk: 36
2024-05-23 18:34:06 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:34:06 - Number of words in the chunk: 242
2024-05-23 18:34:06 - Number of words in the last chunk: 36
2024-05-23 18:34:06 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:34:06 - Number of words in the chunk: 244
2024-05-23 18:34:06 - Number of words in the last chunk: 54
2024-05-23 18:34:06 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:34:06 - Number of words in the chunk: 223
2024-05-23 18:34:06 - Number of words in the last chunk:

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 18:34:11 - >>> Generated embeddings for batch 8
2024-05-23 18:34:11 - Shape of embeddings: (128, 512)
2024-05-23 18:34:11 - Time taken for the batch: 4.184152364730835
2024-05-23 18:34:11 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:34:11 - Number of words in the chunk: 236
2024-05-23 18:34:11 - Number of words in the last chunk: 74
2024-05-23 18:34:11 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:34:11 - Number of words in the chunk: 254
2024-05-23 18:34:11 - Number of words in the last chunk: 50
2024-05-23 18:34:11 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:34:11 - Number of words in the chunk: 253
2024-05-23 18:34:11 - Number of words in the last chunk: 13
2024-05-23 18:34:11 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:34:11 - Number of words in the chunk: 254
2024-05-23 18:34:11 - Number of words in the last chunk: 

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 18:34:15 - >>> Generated embeddings for batch 9
2024-05-23 18:34:15 - Shape of embeddings: (128, 512)
2024-05-23 18:34:15 - Time taken for the batch: 4.688470363616943
2024-05-23 18:34:15 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:34:15 - Number of words in the chunk: 253
2024-05-23 18:34:15 - Number of words in the last chunk: 46
2024-05-23 18:34:15 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:34:15 - Number of words in the chunk: 201
2024-05-23 18:34:15 - Number of words in the last chunk: 147
2024-05-23 18:34:15 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:34:15 - Number of words in the chunk: 242
2024-05-23 18:34:15 - Number of words in the last chunk: 18
2024-05-23 18:34:15 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:34:15 - Number of words in the chunk: 236
2024-05-23 18:34:15 - Number of words in the last chunk:

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 18:34:19 - >>> Generated embeddings for batch 10
2024-05-23 18:34:19 - Shape of embeddings: (128, 512)
2024-05-23 18:34:19 - Time taken for the batch: 4.2091779708862305
2024-05-23 18:34:19 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:34:19 - Number of words in the chunk: 232
2024-05-23 18:34:19 - Number of words in the last chunk: 37
2024-05-23 18:34:19 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:34:19 - Number of words in the chunk: 253
2024-05-23 18:34:19 - Number of words in the chunk: 246
2024-05-23 18:34:19 - Number of words in the chunk: 253
2024-05-23 18:34:19 - Number of words in the last chunk: 174
2024-05-23 18:34:20 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:34:20 - Number of words in the chunk: 226
2024-05-23 18:34:20 - Number of words in the last chunk: 85
2024-05-23 18:34:20 - Input contains more than 256 words. Splitting the input into chunk

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 18:34:24 - >>> Generated embeddings for batch 11
2024-05-23 18:34:24 - Shape of embeddings: (128, 512)
2024-05-23 18:34:24 - Time taken for the batch: 4.117032527923584
2024-05-23 18:34:24 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:34:24 - Number of words in the chunk: 255
2024-05-23 18:34:24 - Number of words in the last chunk: 8
2024-05-23 18:34:24 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:34:24 - Number of words in the chunk: 250
2024-05-23 18:34:24 - Number of words in the last chunk: 99
2024-05-23 18:34:24 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:34:24 - Number of words in the chunk: 241
2024-05-23 18:34:24 - Number of words in the last chunk: 79
2024-05-23 18:34:24 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:34:24 - Number of words in the chunk: 248
2024-05-23 18:34:24 - Number of words in the last chunk: 

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 18:34:28 - >>> Generated embeddings for batch 12
2024-05-23 18:34:28 - Shape of embeddings: (128, 512)
2024-05-23 18:34:28 - Time taken for the batch: 4.168704032897949
2024-05-23 18:34:28 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:34:28 - Number of words in the chunk: 250
2024-05-23 18:34:28 - Number of words in the last chunk: 16
2024-05-23 18:34:28 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:34:28 - Number of words in the chunk: 248
2024-05-23 18:34:28 - Number of words in the last chunk: 145
2024-05-23 18:34:28 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:34:28 - Number of words in the chunk: 229
2024-05-23 18:34:28 - Number of words in the last chunk: 42
2024-05-23 18:34:28 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:34:28 - Number of words in the chunk: 241
2024-05-23 18:34:28 - Number of words in the last chunk

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 18:34:32 - >>> Generated embeddings for batch 13
2024-05-23 18:34:32 - Shape of embeddings: (128, 512)
2024-05-23 18:34:32 - Time taken for the batch: 4.245055198669434
2024-05-23 18:34:32 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:34:32 - Number of words in the chunk: 253
2024-05-23 18:34:32 - Number of words in the last chunk: 12
2024-05-23 18:34:32 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:34:32 - Number of words in the chunk: 253
2024-05-23 18:34:32 - Number of words in the last chunk: 15
2024-05-23 18:34:32 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:34:32 - Number of words in the chunk: 249
2024-05-23 18:34:32 - Number of words in the last chunk: 37
2024-05-23 18:34:32 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:34:32 - Number of words in the chunk: 228
2024-05-23 18:34:32 - Number of words in the last chunk:

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 18:34:37 - >>> Generated embeddings for batch 14
2024-05-23 18:34:37 - Shape of embeddings: (128, 512)
2024-05-23 18:34:37 - Time taken for the batch: 4.345192909240723
2024-05-23 18:34:37 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:34:37 - Number of words in the chunk: 212
2024-05-23 18:34:37 - Number of words in the last chunk: 79
2024-05-23 18:34:37 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:34:37 - Number of words in the chunk: 255
2024-05-23 18:34:37 - Number of words in the last chunk: 102
2024-05-23 18:34:37 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:34:37 - Number of words in the chunk: 234
2024-05-23 18:34:37 - Number of words in the last chunk: 30
2024-05-23 18:34:37 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:34:37 - Number of words in the chunk: 240
2024-05-23 18:34:37 - Number of words in the last chunk

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 18:34:41 - >>> Generated embeddings for batch 15
2024-05-23 18:34:41 - Shape of embeddings: (128, 512)
2024-05-23 18:34:41 - Time taken for the batch: 4.177549123764038
2024-05-23 18:34:41 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:34:41 - Number of words in the chunk: 241
2024-05-23 18:34:41 - Number of words in the last chunk: 31
2024-05-23 18:34:41 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:34:41 - Number of words in the chunk: 231
2024-05-23 18:34:41 - Number of words in the last chunk: 34
2024-05-23 18:34:41 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:34:41 - Number of words in the chunk: 253
2024-05-23 18:34:41 - Number of words in the last chunk: 16
2024-05-23 18:34:41 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:34:41 - Number of words in the chunk: 252
2024-05-23 18:34:41 - Number of words in the last chunk:

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 18:34:45 - >>> Generated embeddings for batch 16
2024-05-23 18:34:45 - Shape of embeddings: (128, 512)
2024-05-23 18:34:45 - Time taken for the batch: 4.459033727645874
2024-05-23 18:34:45 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:34:45 - Number of words in the chunk: 230
2024-05-23 18:34:45 - Number of words in the last chunk: 56
2024-05-23 18:34:45 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:34:45 - Number of words in the chunk: 227
2024-05-23 18:34:45 - Number of words in the last chunk: 100
2024-05-23 18:34:45 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:34:45 - Number of words in the chunk: 235
2024-05-23 18:34:45 - Number of words in the last chunk: 34
2024-05-23 18:34:45 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:34:45 - Number of words in the chunk: 234
2024-05-23 18:34:45 - Number of words in the last chunk

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 18:34:49 - >>> Generated embeddings for batch 17
2024-05-23 18:34:49 - Shape of embeddings: (128, 512)
2024-05-23 18:34:49 - Time taken for the batch: 3.8520395755767822
2024-05-23 18:34:49 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:34:49 - Number of words in the chunk: 217
2024-05-23 18:34:49 - Number of words in the last chunk: 65
2024-05-23 18:34:49 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:34:49 - Number of words in the chunk: 250
2024-05-23 18:34:49 - Number of words in the last chunk: 22
2024-05-23 18:34:49 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:34:49 - Number of words in the chunk: 230
2024-05-23 18:34:49 - Number of words in the last chunk: 97
2024-05-23 18:34:49 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:34:49 - Number of words in the chunk: 228
2024-05-23 18:34:49 - Number of words in the last chunk

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 18:34:53 - >>> Generated embeddings for batch 18
2024-05-23 18:34:53 - Shape of embeddings: (128, 512)
2024-05-23 18:34:53 - Time taken for the batch: 4.234833717346191
2024-05-23 18:34:53 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:34:53 - Number of words in the chunk: 243
2024-05-23 18:34:53 - Number of words in the last chunk: 124
2024-05-23 18:34:53 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:34:53 - Number of words in the chunk: 249
2024-05-23 18:34:53 - Number of words in the last chunk: 30
2024-05-23 18:34:53 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:34:53 - Number of words in the chunk: 246
2024-05-23 18:34:53 - Number of words in the last chunk: 116
2024-05-23 18:34:53 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:34:53 - Number of words in the chunk: 225
2024-05-23 18:34:53 - Number of words in the last chun

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 18:34:57 - >>> Generated embeddings for batch 19
2024-05-23 18:34:57 - Shape of embeddings: (128, 512)
2024-05-23 18:34:57 - Time taken for the batch: 3.839906692504883
2024-05-23 18:34:57 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:34:57 - Number of words in the chunk: 251
2024-05-23 18:34:57 - Number of words in the last chunk: 57
2024-05-23 18:34:57 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:34:57 - Number of words in the chunk: 249
2024-05-23 18:34:57 - Number of words in the last chunk: 27
2024-05-23 18:34:57 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:34:57 - Number of words in the chunk: 255
2024-05-23 18:34:57 - Number of words in the last chunk: 16
2024-05-23 18:34:57 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:34:57 - Number of words in the chunk: 248
2024-05-23 18:34:57 - Number of words in the last chunk:

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 18:35:02 - >>> Generated embeddings for batch 20
2024-05-23 18:35:02 - Shape of embeddings: (128, 512)
2024-05-23 18:35:02 - Time taken for the batch: 4.496845006942749
2024-05-23 18:35:02 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:35:02 - Number of words in the chunk: 252
2024-05-23 18:35:02 - Number of words in the last chunk: 38
2024-05-23 18:35:02 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:35:02 - Number of words in the chunk: 226
2024-05-23 18:35:02 - Number of words in the last chunk: 102
2024-05-23 18:35:02 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:35:02 - Number of words in the chunk: 246
2024-05-23 18:35:02 - Number of words in the last chunk: 47
2024-05-23 18:35:02 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:35:02 - Number of words in the chunk: 244
2024-05-23 18:35:02 - Number of words in the last chunk

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 18:35:06 - >>> Generated embeddings for batch 21
2024-05-23 18:35:06 - Shape of embeddings: (128, 512)
2024-05-23 18:35:06 - Time taken for the batch: 4.127668142318726
2024-05-23 18:35:06 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:35:06 - Number of words in the chunk: 256
2024-05-23 18:35:06 - Number of words in the last chunk: 49
2024-05-23 18:35:06 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:35:06 - Number of words in the chunk: 241
2024-05-23 18:35:06 - Number of words in the last chunk: 87
2024-05-23 18:35:06 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:35:06 - Number of words in the chunk: 223
2024-05-23 18:35:06 - Number of words in the last chunk: 80
2024-05-23 18:35:06 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:35:06 - Number of words in the chunk: 240
2024-05-23 18:35:06 - Number of words in the last chunk:

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 18:35:10 - >>> Generated embeddings for batch 22
2024-05-23 18:35:10 - Shape of embeddings: (128, 512)
2024-05-23 18:35:10 - Time taken for the batch: 4.139651775360107
2024-05-23 18:35:10 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:35:10 - Number of words in the chunk: 243
2024-05-23 18:35:10 - Number of words in the last chunk: 18
2024-05-23 18:35:10 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:35:10 - Number of words in the chunk: 249
2024-05-23 18:35:10 - Number of words in the last chunk: 20
2024-05-23 18:35:10 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:35:10 - Number of words in the chunk: 253
2024-05-23 18:35:10 - Number of words in the last chunk: 15
2024-05-23 18:35:10 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:35:10 - Number of words in the chunk: 235
2024-05-23 18:35:10 - Number of words in the last chunk:

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 18:35:15 - >>> Generated embeddings for batch 23
2024-05-23 18:35:15 - Shape of embeddings: (128, 512)
2024-05-23 18:35:15 - Time taken for the batch: 4.461026668548584
2024-05-23 18:35:15 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:35:15 - Number of words in the chunk: 254
2024-05-23 18:35:15 - Number of words in the last chunk: 27
2024-05-23 18:35:15 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:35:15 - Number of words in the chunk: 240
2024-05-23 18:35:15 - Number of words in the last chunk: 18
2024-05-23 18:35:15 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:35:15 - Number of words in the chunk: 248
2024-05-23 18:35:15 - Number of words in the last chunk: 146
2024-05-23 18:35:15 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:35:15 - Number of words in the chunk: 239
2024-05-23 18:35:15 - Number of words in the last chunk

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 18:35:19 - >>> Generated embeddings for batch 24
2024-05-23 18:35:19 - Shape of embeddings: (128, 512)
2024-05-23 18:35:19 - Time taken for the batch: 4.57558012008667
2024-05-23 18:35:19 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:35:19 - Number of words in the chunk: 240
2024-05-23 18:35:19 - Number of words in the last chunk: 31
2024-05-23 18:35:19 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:35:19 - Number of words in the chunk: 246
2024-05-23 18:35:19 - Number of words in the last chunk: 74
2024-05-23 18:35:19 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:35:19 - Number of words in the chunk: 188
2024-05-23 18:35:19 - Number of words in the last chunk: 131
2024-05-23 18:35:19 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:35:19 - Number of words in the chunk: 248
2024-05-23 18:35:19 - Number of words in the last chunk:

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 18:35:24 - >>> Generated embeddings for batch 25
2024-05-23 18:35:24 - Shape of embeddings: (128, 512)
2024-05-23 18:35:24 - Time taken for the batch: 4.162595510482788
2024-05-23 18:35:24 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:35:24 - Number of words in the chunk: 238
2024-05-23 18:35:24 - Number of words in the last chunk: 27
2024-05-23 18:35:24 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:35:24 - Number of words in the chunk: 202
2024-05-23 18:35:24 - Number of words in the last chunk: 170
2024-05-23 18:35:24 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:35:24 - Number of words in the chunk: 221
2024-05-23 18:35:24 - Number of words in the last chunk: 111
2024-05-23 18:35:24 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:35:24 - Number of words in the chunk: 252
2024-05-23 18:35:24 - Number of words in the last chun

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 18:35:28 - >>> Generated embeddings for batch 26
2024-05-23 18:35:28 - Shape of embeddings: (128, 512)
2024-05-23 18:35:28 - Time taken for the batch: 4.223910570144653
2024-05-23 18:35:28 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:35:28 - Number of words in the chunk: 253
2024-05-23 18:35:28 - Number of words in the last chunk: 23
2024-05-23 18:35:28 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:35:28 - Number of words in the chunk: 241
2024-05-23 18:35:28 - Number of words in the last chunk: 61
2024-05-23 18:35:28 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:35:28 - Number of words in the chunk: 230
2024-05-23 18:35:28 - Number of words in the last chunk: 41
2024-05-23 18:35:28 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:35:28 - Number of words in the chunk: 225
2024-05-23 18:35:28 - Number of words in the last chunk:

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 18:35:32 - >>> Generated embeddings for batch 27
2024-05-23 18:35:32 - Shape of embeddings: (128, 512)
2024-05-23 18:35:32 - Time taken for the batch: 4.315373182296753
2024-05-23 18:35:32 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:35:32 - Number of words in the chunk: 230
2024-05-23 18:35:32 - Number of words in the last chunk: 38
2024-05-23 18:35:32 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:35:32 - Number of words in the chunk: 256
2024-05-23 18:35:32 - Number of words in the last chunk: 26
2024-05-23 18:35:32 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:35:32 - Number of words in the chunk: 238
2024-05-23 18:35:32 - Number of words in the last chunk: 102
2024-05-23 18:35:32 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:35:32 - Number of words in the chunk: 208
2024-05-23 18:35:32 - Number of words in the last chunk

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 18:35:37 - >>> Generated embeddings for batch 28
2024-05-23 18:35:37 - Shape of embeddings: (128, 512)
2024-05-23 18:35:37 - Time taken for the batch: 4.34032416343689
2024-05-23 18:35:37 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:35:37 - Number of words in the chunk: 232
2024-05-23 18:35:37 - Number of words in the last chunk: 44
2024-05-23 18:35:37 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:35:37 - Number of words in the chunk: 242
2024-05-23 18:35:37 - Number of words in the last chunk: 26
2024-05-23 18:35:37 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:35:37 - Number of words in the chunk: 250
2024-05-23 18:35:37 - Number of words in the last chunk: 83
2024-05-23 18:35:37 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:35:37 - Number of words in the chunk: 255
2024-05-23 18:35:37 - Number of words in the last chunk: 

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 18:35:41 - >>> Generated embeddings for batch 29
2024-05-23 18:35:41 - Shape of embeddings: (128, 512)
2024-05-23 18:35:41 - Time taken for the batch: 4.476076602935791
2024-05-23 18:35:41 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:35:41 - Number of words in the chunk: 252
2024-05-23 18:35:41 - Number of words in the last chunk: 14
2024-05-23 18:35:41 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:35:41 - Number of words in the chunk: 244
2024-05-23 18:35:41 - Number of words in the last chunk: 14
2024-05-23 18:35:41 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:35:41 - Number of words in the chunk: 250
2024-05-23 18:35:41 - Number of words in the last chunk: 18
2024-05-23 18:35:41 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:35:41 - Number of words in the chunk: 230
2024-05-23 18:35:41 - Number of words in the chunk: 248


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 18:35:45 - >>> Generated embeddings for batch 30
2024-05-23 18:35:45 - Shape of embeddings: (128, 512)
2024-05-23 18:35:45 - Time taken for the batch: 4.311082363128662
2024-05-23 18:35:45 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:35:45 - Number of words in the chunk: 216
2024-05-23 18:35:45 - Number of words in the last chunk: 46
2024-05-23 18:35:45 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:35:45 - Number of words in the chunk: 188
2024-05-23 18:35:45 - Number of words in the last chunk: 163
2024-05-23 18:35:45 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:35:45 - Number of words in the chunk: 228
2024-05-23 18:35:45 - Number of words in the last chunk: 79
2024-05-23 18:35:45 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:35:45 - Number of words in the chunk: 221
2024-05-23 18:35:45 - Number of words in the last chunk

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 18:35:50 - >>> Generated embeddings for batch 31
2024-05-23 18:35:50 - Shape of embeddings: (128, 512)
2024-05-23 18:35:50 - Time taken for the batch: 4.588411331176758
2024-05-23 18:35:50 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:35:50 - Number of words in the chunk: 230
2024-05-23 18:35:50 - Number of words in the last chunk: 40
2024-05-23 18:35:50 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:35:50 - Number of words in the chunk: 238
2024-05-23 18:35:50 - Number of words in the last chunk: 93
2024-05-23 18:35:50 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:35:50 - Number of words in the chunk: 245
2024-05-23 18:35:50 - Number of words in the last chunk: 66
2024-05-23 18:35:50 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:35:50 - Number of words in the chunk: 229
2024-05-23 18:35:50 - Number of words in the last chunk:

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 18:35:55 - >>> Generated embeddings for batch 32
2024-05-23 18:35:55 - Shape of embeddings: (128, 512)
2024-05-23 18:35:55 - Time taken for the batch: 4.873944997787476
2024-05-23 18:35:55 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:35:55 - Number of words in the chunk: 242
2024-05-23 18:35:55 - Number of words in the last chunk: 21
2024-05-23 18:35:55 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:35:55 - Number of words in the chunk: 235
2024-05-23 18:35:55 - Number of words in the last chunk: 44
2024-05-23 18:35:55 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:35:55 - Number of words in the chunk: 246
2024-05-23 18:35:55 - Number of words in the last chunk: 31
2024-05-23 18:35:55 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:35:55 - Number of words in the chunk: 252
2024-05-23 18:35:55 - Number of words in the last chunk:

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 18:35:59 - >>> Generated embeddings for batch 33
2024-05-23 18:35:59 - Shape of embeddings: (128, 512)
2024-05-23 18:35:59 - Time taken for the batch: 4.47705864906311
2024-05-23 18:35:59 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:35:59 - Number of words in the chunk: 246
2024-05-23 18:35:59 - Number of words in the last chunk: 19
2024-05-23 18:35:59 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:35:59 - Number of words in the chunk: 210
2024-05-23 18:35:59 - Number of words in the last chunk: 51
2024-05-23 18:35:59 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:35:59 - Number of words in the chunk: 238
2024-05-23 18:35:59 - Number of words in the last chunk: 47
2024-05-23 18:35:59 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:35:59 - Number of words in the chunk: 237
2024-05-23 18:35:59 - Number of words in the last chunk: 

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 18:36:04 - >>> Generated embeddings for batch 34
2024-05-23 18:36:04 - Shape of embeddings: (128, 512)
2024-05-23 18:36:04 - Time taken for the batch: 4.36688756942749
2024-05-23 18:36:04 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:36:04 - Number of words in the chunk: 234
2024-05-23 18:36:04 - Number of words in the chunk: 241
2024-05-23 18:36:04 - Number of words in the last chunk: 61
2024-05-23 18:36:04 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:36:04 - Number of words in the chunk: 244
2024-05-23 18:36:04 - Number of words in the last chunk: 64
2024-05-23 18:36:04 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:36:04 - Number of words in the chunk: 255
2024-05-23 18:36:04 - Number of words in the last chunk: 140
2024-05-23 18:36:04 - Input contains more than 256 words. Splitting the input into chunks
2024-05-23 18:36:04 - Number of words in the chunk: 230


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 18:36:08 - >>> Generated embeddings for sub-batch 35-0
2024-05-23 18:36:08 - Shape of embeddings: (117, 512)
2024-05-23 18:36:08 - Time taken for the sub-batch: 3.7609269618988037


In [12]:
torch.cuda.empty_cache()

In [13]:
# Convert the embeddings and doc_ids to a DataFrame
embeddings_df = pd.DataFrame({
    'text': text_chunks_list,
    'embedding': embeddings_list.tolist(),
    'doc_id': doc_ids
})

In [14]:
embeddings_df.to_csv(os.path.join(data_path, "embeddings.csv"), index=False)

# Retrieval

In [94]:
embeddings_df = pd.read_csv(os.path.join(data_path, "embeddings.csv"))

In [95]:
embeddings_df['embedding'][0]

'[0.4984809  0.5466461  0.49245688 0.57978016 0.52682066 0.4503061\n 0.5709185  0.60514444 0.55339444 0.71970266 0.6294538  0.33047262\n 0.5528569  0.47545147 0.5932326  0.6088847  0.49650493 0.37622535\n 0.4121623  0.6784761  0.48273173 0.6050354  0.3916255  0.56188047\n 0.40015402 0.5913703  0.600369   0.4440864  0.74994075 0.7629428\n 0.4217622  0.44717902 0.6703402  0.26622513 0.49527502 0.48728397\n 0.56444687 0.34291488 0.43436667 0.5312977  0.57477367 0.5058694\n 0.73954165 0.42584348 0.49210918 0.44601497 0.4197642  0.3051098\n 0.623024   0.49866796 0.5673945  0.52327645 0.5038981  0.3188306\n 0.48575872 0.5769357  0.6573103  0.6469505  0.37061575 0.62314355\n 0.48759896 0.403409   0.61190706 0.2401353  0.8512474  0.6503232\n 0.58221394 0.43596855 0.41126993 0.54587555 0.4703908  0.3592393\n 0.37011188 0.4410236  0.52131045 0.41811013 0.6694076  0.54730594\n 0.37192568 0.44322154 0.58059716 0.48982763 0.6321952  0.63248056\n 0.36404496 0.56164885 0.62324667 0.4684238  0.6235964

In [96]:
embeddings_df["embedding"] = embeddings_df["embedding"].apply(lambda x: x[1:-1].replace("'", "").replace("\n", "").split())
embeddings_df["embedding"] = embeddings_df["embedding"].apply(lambda x: np.array(x).astype(float))

In [97]:
embeddings_df["embedding"][0]

array([0.4984809 , 0.5466461 , 0.49245688, 0.57978016, 0.52682066,
       0.4503061 , 0.5709185 , 0.60514444, 0.55339444, 0.71970266,
       0.6294538 , 0.33047262, 0.5528569 , 0.47545147, 0.5932326 ,
       0.6088847 , 0.49650493, 0.37622535, 0.4121623 , 0.6784761 ,
       0.48273173, 0.6050354 , 0.3916255 , 0.56188047, 0.40015402,
       0.5913703 , 0.600369  , 0.4440864 , 0.74994075, 0.7629428 ,
       0.4217622 , 0.44717902, 0.6703402 , 0.26622513, 0.49527502,
       0.48728397, 0.56444687, 0.34291488, 0.43436667, 0.5312977 ,
       0.57477367, 0.5058694 , 0.73954165, 0.42584348, 0.49210918,
       0.44601497, 0.4197642 , 0.3051098 , 0.623024  , 0.49866796,
       0.5673945 , 0.52327645, 0.5038981 , 0.3188306 , 0.48575872,
       0.5769357 , 0.6573103 , 0.6469505 , 0.37061575, 0.62314355,
       0.48759896, 0.403409  , 0.61190706, 0.2401353 , 0.8512474 ,
       0.6503232 , 0.58221394, 0.43596855, 0.41126993, 0.54587555,
       0.4703908 , 0.3592393 , 0.37011188, 0.4410236 , 0.52131

In [98]:
embeddings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4597 entries, 0 to 4596
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   text                4597 non-null   object
 1   embedding           4597 non-null   object
 2   doc_id              4597 non-null   object
 3   sentence_embedding  4597 non-null   object
dtypes: object(4)
memory usage: 143.8+ KB


In [99]:
from sentence_transformers import SentenceTransformer, util


def sentence_similarity(text1, text2):
    embedding_1 = model.encode(text1, convert_to_tensor=True)
    embedding_2 = model.encode(text2, convert_to_tensor=True)
    o = util.pytorch_cos_sim(embedding_1, embedding_2)
    return o.item()


model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

2024-05-23 22:00:36 - Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
2024-05-23 22:00:38 - Use pytorch device_name: cuda


## Sentence Embedding Generation

In [102]:
text_batch = []
sentence_embeddings_list = []
batch_size = 128
batch_no = 0

for idx, row in tqdm(embeddings_df.iterrows(), total=len(embeddings_df), desc="Processing queries"):
    text_batch.append(row['text'])

    while len(text_batch) >= batch_size:
        sub_text_batch = text_batch[:batch_size]

        start = time.time()
        embeddings = model.encode(sub_text_batch)
        sentence_embeddings_list.extend(embeddings.tolist())

        logging.info(f">>> Generated embeddings for batch {batch_no}")
        logging.info(f"Shape of embeddings: {embeddings.shape}")
        logging.info(f"Time taken for the batch: {time.time() - start}")

        text_batch = text_batch[batch_size:]
        batch_no += 1

# Handle any remaining batches
if len(text_batch) > 0:
    torch.cuda.empty_cache()  # Clear CUDA cache before final batch
    logging.info(f"Processing remaining batch of size {len(text_batch)}")
    sub_batch_size = 128  # Same sub-batch size as the main batch
    for i in range(0, len(text_batch), sub_batch_size):
        sub_text_batch = text_batch[i:i + sub_batch_size]
        start = time.time()
        embeddings = model.encode(sub_text_batch)
        sentence_embeddings_list.extend(embeddings.tolist())

        logging.info(f">>> Generated embeddings for sub-batch {batch_no}-{i // sub_batch_size}")
        logging.info(f"Shape of embeddings: {embeddings.shape}")
        logging.info(f"Time taken for the sub-batch: {time.time() - start}")

        torch.cuda.empty_cache()  # Clear CUDA cache after each sub-batch

embeddings_df["sentence_embedding"] = sentence_embeddings_list

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for idx, row in tqdm(embeddings_df.iterrows(), total=len(embeddings_df), desc="Processing queries"):


Processing queries:   0%|          | 0/4597 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 22:01:15 - >>> Generated embeddings for batch 0
2024-05-23 22:01:15 - Shape of embeddings: (128, 384)
2024-05-23 22:01:15 - Time taken for the batch: 1.3640639781951904


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 22:01:15 - >>> Generated embeddings for batch 1
2024-05-23 22:01:15 - Shape of embeddings: (128, 384)
2024-05-23 22:01:15 - Time taken for the batch: 0.20400714874267578


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 22:01:15 - >>> Generated embeddings for batch 2
2024-05-23 22:01:15 - Shape of embeddings: (128, 384)
2024-05-23 22:01:15 - Time taken for the batch: 0.20573806762695312


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 22:01:15 - >>> Generated embeddings for batch 3
2024-05-23 22:01:15 - Shape of embeddings: (128, 384)
2024-05-23 22:01:15 - Time taken for the batch: 0.1975998878479004


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 22:01:16 - >>> Generated embeddings for batch 4
2024-05-23 22:01:16 - Shape of embeddings: (128, 384)
2024-05-23 22:01:16 - Time taken for the batch: 0.20504188537597656


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 22:01:16 - >>> Generated embeddings for batch 5
2024-05-23 22:01:16 - Shape of embeddings: (128, 384)
2024-05-23 22:01:16 - Time taken for the batch: 0.21490120887756348


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 22:01:16 - >>> Generated embeddings for batch 6
2024-05-23 22:01:16 - Shape of embeddings: (128, 384)
2024-05-23 22:01:16 - Time taken for the batch: 0.20191216468811035


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 22:01:16 - >>> Generated embeddings for batch 7
2024-05-23 22:01:16 - Shape of embeddings: (128, 384)
2024-05-23 22:01:16 - Time taken for the batch: 0.22829818725585938


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 22:01:17 - >>> Generated embeddings for batch 8
2024-05-23 22:01:17 - Shape of embeddings: (128, 384)
2024-05-23 22:01:17 - Time taken for the batch: 0.20894932746887207


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 22:01:17 - >>> Generated embeddings for batch 9
2024-05-23 22:01:17 - Shape of embeddings: (128, 384)
2024-05-23 22:01:17 - Time taken for the batch: 0.22536659240722656


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 22:01:17 - >>> Generated embeddings for batch 10
2024-05-23 22:01:17 - Shape of embeddings: (128, 384)
2024-05-23 22:01:17 - Time taken for the batch: 0.21143317222595215


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 22:01:17 - >>> Generated embeddings for batch 11
2024-05-23 22:01:17 - Shape of embeddings: (128, 384)
2024-05-23 22:01:17 - Time taken for the batch: 0.21217083930969238


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 22:01:17 - >>> Generated embeddings for batch 12
2024-05-23 22:01:17 - Shape of embeddings: (128, 384)
2024-05-23 22:01:17 - Time taken for the batch: 0.2180471420288086


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 22:01:18 - >>> Generated embeddings for batch 13
2024-05-23 22:01:18 - Shape of embeddings: (128, 384)
2024-05-23 22:01:18 - Time taken for the batch: 0.20787858963012695


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 22:01:18 - >>> Generated embeddings for batch 14
2024-05-23 22:01:18 - Shape of embeddings: (128, 384)
2024-05-23 22:01:18 - Time taken for the batch: 0.21001791954040527


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 22:01:18 - >>> Generated embeddings for batch 15
2024-05-23 22:01:18 - Shape of embeddings: (128, 384)
2024-05-23 22:01:18 - Time taken for the batch: 0.2259664535522461


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 22:01:18 - >>> Generated embeddings for batch 16
2024-05-23 22:01:18 - Shape of embeddings: (128, 384)
2024-05-23 22:01:18 - Time taken for the batch: 0.21765756607055664


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 22:01:18 - >>> Generated embeddings for batch 17
2024-05-23 22:01:18 - Shape of embeddings: (128, 384)
2024-05-23 22:01:18 - Time taken for the batch: 0.20393896102905273


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 22:01:19 - >>> Generated embeddings for batch 18
2024-05-23 22:01:19 - Shape of embeddings: (128, 384)
2024-05-23 22:01:19 - Time taken for the batch: 0.21553468704223633


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 22:01:19 - >>> Generated embeddings for batch 19
2024-05-23 22:01:19 - Shape of embeddings: (128, 384)
2024-05-23 22:01:19 - Time taken for the batch: 0.2169950008392334


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 22:01:19 - >>> Generated embeddings for batch 20
2024-05-23 22:01:19 - Shape of embeddings: (128, 384)
2024-05-23 22:01:19 - Time taken for the batch: 0.22944974899291992


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 22:01:19 - >>> Generated embeddings for batch 21
2024-05-23 22:01:19 - Shape of embeddings: (128, 384)
2024-05-23 22:01:19 - Time taken for the batch: 0.2830185890197754


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 22:01:20 - >>> Generated embeddings for batch 22
2024-05-23 22:01:20 - Shape of embeddings: (128, 384)
2024-05-23 22:01:20 - Time taken for the batch: 0.2088320255279541


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 22:01:20 - >>> Generated embeddings for batch 23
2024-05-23 22:01:20 - Shape of embeddings: (128, 384)
2024-05-23 22:01:20 - Time taken for the batch: 0.21971893310546875


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 22:01:20 - >>> Generated embeddings for batch 24
2024-05-23 22:01:20 - Shape of embeddings: (128, 384)
2024-05-23 22:01:20 - Time taken for the batch: 0.21257710456848145


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 22:01:20 - >>> Generated embeddings for batch 25
2024-05-23 22:01:20 - Shape of embeddings: (128, 384)
2024-05-23 22:01:20 - Time taken for the batch: 0.21938347816467285


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 22:01:21 - >>> Generated embeddings for batch 26
2024-05-23 22:01:21 - Shape of embeddings: (128, 384)
2024-05-23 22:01:21 - Time taken for the batch: 0.2280430793762207


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 22:01:21 - >>> Generated embeddings for batch 27
2024-05-23 22:01:21 - Shape of embeddings: (128, 384)
2024-05-23 22:01:21 - Time taken for the batch: 0.20818686485290527


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 22:01:21 - >>> Generated embeddings for batch 28
2024-05-23 22:01:21 - Shape of embeddings: (128, 384)
2024-05-23 22:01:21 - Time taken for the batch: 0.21424460411071777


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 22:01:21 - >>> Generated embeddings for batch 29
2024-05-23 22:01:21 - Shape of embeddings: (128, 384)
2024-05-23 22:01:21 - Time taken for the batch: 0.23562121391296387


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 22:01:21 - >>> Generated embeddings for batch 30
2024-05-23 22:01:21 - Shape of embeddings: (128, 384)
2024-05-23 22:01:21 - Time taken for the batch: 0.2260875701904297


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 22:01:22 - >>> Generated embeddings for batch 31
2024-05-23 22:01:22 - Shape of embeddings: (128, 384)
2024-05-23 22:01:22 - Time taken for the batch: 0.22823524475097656


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 22:01:22 - >>> Generated embeddings for batch 32
2024-05-23 22:01:22 - Shape of embeddings: (128, 384)
2024-05-23 22:01:22 - Time taken for the batch: 0.21967077255249023


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 22:01:22 - >>> Generated embeddings for batch 33
2024-05-23 22:01:22 - Shape of embeddings: (128, 384)
2024-05-23 22:01:22 - Time taken for the batch: 0.21236348152160645


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 22:01:22 - >>> Generated embeddings for batch 34
2024-05-23 22:01:22 - Shape of embeddings: (128, 384)
2024-05-23 22:01:22 - Time taken for the batch: 0.2189185619354248
2024-05-23 22:01:22 - Processing remaining batch of size 117


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-23 22:01:23 - >>> Generated embeddings for sub-batch 35-0
2024-05-23 22:01:23 - Shape of embeddings: (117, 384)
2024-05-23 22:01:23 - Time taken for the sub-batch: 0.22496652603149414


In [28]:
embeddings_df.dropna(inplace=True)
embeddings_df.info()

embeddings_df.to_csv(os.path.join(data_path, "embeddings.csv"), index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4597 entries, 0 to 4596
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   text                4597 non-null   object
 1   embedding           4597 non-null   object
 2   doc_id              4597 non-null   object
 3   sentence_embedding  4597 non-null   object
dtypes: object(4)
memory usage: 143.8+ KB


## Query Embedding Generation

In [103]:
queries_df = pd.DataFrame({
    'query_id': list(queries.keys()),
    'query': list(queries.values())
})
queries_df

Unnamed: 0,query_id,query
0,PLAIN-2,Do Cholesterol Statin Drugs Cause Breast Cancer?
1,PLAIN-12,Exploiting Autophagy to Live Longer
2,PLAIN-23,How to Reduce Exposure to Alkylphenols Through...
3,PLAIN-33,What’s Driving America’s Obesity Problem?
4,PLAIN-44,Who Should be Careful About Curcumin?
...,...,...
318,PLAIN-3432,Healthy Chocolate Milkshakes
319,PLAIN-3442,The Healthiest Vegetables
320,PLAIN-3452,Bowel Movement Frequency
321,PLAIN-3462,Olive Oil and Artery Function


In [104]:
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [105]:
import torch
import logging
from tqdm import tqdm

retrieved_docs_list = []
retrieved_text_list = []
query_ids = []
batch_size = 128
k = 10
query_batch = []
query_embeddings_list = []
query_ids = []
batch_no = 0

for idx, query in tqdm(queries_df.iterrows(), total=len(queries_df), desc="Processing queries"):
    query_batch.append(ingest_input(query['query'])[0])

    while len(query_batch) >= batch_size:
        sub_query_batch = query_batch[:batch_size]

        start = time.time()
        query_embeddings = generate_embeddings(sub_query_batch, device)
        query_embeddings_list.extend(query_embeddings.tolist())
        # query_ids.extend([query['query_id']] * batch_size)

        logging.info(f">>> Generated embeddings for batch {batch_no}")
        logging.info(f"Shape of embeddings: {query_embeddings.shape}")
        logging.info(f"Time taken for the batch: {time.time() - start}")

        query_batch = query_batch[batch_size:]
        batch_no += 1

# Handle any remaining queries
if len(query_batch) > 0:
    torch.cuda.empty_cache()  # Clear CUDA cache before final batch
    logging.info(f"Processing remaining batch of size {len(query_batch)}")
    sub_batch_size = 128  # Same sub-batch size as the main batch
    for i in range(0, len(query_batch), sub_batch_size):
        sub_query_batch = query_batch[i:i + sub_batch_size]
        start = time.time()

        query_embeddings = generate_embeddings(sub_query_batch, device)
        query_embeddings_list.extend(query_embeddings.tolist())
        # query_ids.extend([query['query_id']] * len(sub_query_batch))

        logging.info(f">>> Generated embeddings for sub-batch {batch_no}-{i // sub_batch_size}")
        logging.info(f"Shape of embeddings: {query_embeddings.shape}")
        logging.info(f"Time taken for the sub-batch: {time.time() - start}")

        torch.cuda.empty_cache()  # Clear CUDA cache after each sub-batch

# The final embeddings list can now be used as required
# Example usage:
# queries_df["query_embedding"] = query_embeddings_list


Processing queries:   0%|          | 0/323 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Processing queries:  40%|███▉      | 128/323 [00:00<00:00, 572.18it/s]

2024-05-23 22:02:54 - >>> Generated embeddings for batch 0
2024-05-23 22:02:54 - Shape of embeddings: (128, 512)
2024-05-23 22:02:54 - Time taken for the batch: 0.20508790016174316


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Processing queries: 100%|██████████| 323/323 [00:00<00:00, 869.69it/s]

2024-05-23 22:02:54 - >>> Generated embeddings for batch 1
2024-05-23 22:02:54 - Shape of embeddings: (128, 512)
2024-05-23 22:02:54 - Time taken for the batch: 0.13589215278625488
2024-05-23 22:02:54 - Processing remaining batch of size 67





Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2024-05-23 22:02:54 - >>> Generated embeddings for sub-batch 2-0
2024-05-23 22:02:54 - Shape of embeddings: (67, 512)
2024-05-23 22:02:54 - Time taken for the sub-batch: 0.10659027099609375


In [106]:
queries_df = pd.DataFrame({
    'query_id': queries_df['query_id'],
    'query': queries_df['query'],
    'query_embedding': query_embeddings_list
})
queries_df.shape

(323, 3)

In [107]:
queries_df.to_csv(os.path.join(data_path, "queries.csv"), index=False)
queries_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 323 entries, 0 to 322
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   query_id         323 non-null    object
 1   query            323 non-null    object
 2   query_embedding  323 non-null    object
dtypes: object(3)
memory usage: 7.7+ KB


In [108]:
embeddings_df.dropna(inplace=True)
embeddings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4597 entries, 0 to 4596
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   text                4597 non-null   object
 1   embedding           4597 non-null   object
 2   doc_id              4597 non-null   object
 3   sentence_embedding  4597 non-null   object
dtypes: object(4)
memory usage: 143.8+ KB


## Document Keyword Corpus Generation

In [17]:
queries_df = pd.read_csv(os.path.join(data_path, "queries.csv"))
queries_df

Unnamed: 0,query_id,query,query_embedding
0,PLAIN-2,Do Cholesterol Statin Drugs Cause Breast Cancer?,"[0.3830738365650177, 0.5533143877983093, 0.667..."
1,PLAIN-12,Exploiting Autophagy to Live Longer,"[0.6694016456604004, 0.4906133711338043, 0.433..."
2,PLAIN-23,How to Reduce Exposure to Alkylphenols Through...,"[0.288175493478775, 0.6211127638816833, 0.5944..."
3,PLAIN-33,What’s Driving America’s Obesity Problem?,"[0.44784390926361084, 0.6733686327934265, 0.45..."
4,PLAIN-44,Who Should be Careful About Curcumin?,"[0.22403667867183685, 0.4641282260417938, 0.48..."
...,...,...,...
318,PLAIN-3432,Healthy Chocolate Milkshakes,"[0.45116451382637024, 0.604766845703125, 0.598..."
319,PLAIN-3442,The Healthiest Vegetables,"[0.3944050073623657, 0.5450353622436523, 0.499..."
320,PLAIN-3452,Bowel Movement Frequency,"[0.565150797367096, 0.6456841230392456, 0.4579..."
321,PLAIN-3462,Olive Oil and Artery Function,"[0.5049484372138977, 0.6557204127311707, 0.536..."


In [109]:
from rank_bm25 import BM25Okapi

TEXT_CORPUS = embeddings_df['text'].values.tolist()
TOKENIZED_CORPUS = [tokenize.word_tokenize(doc) for doc in TEXT_CORPUS]
BM25_TC = BM25Okapi(TOKENIZED_CORPUS)

In [110]:
embeddings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4597 entries, 0 to 4596
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   text                4597 non-null   object
 1   embedding           4597 non-null   object
 2   doc_id              4597 non-null   object
 3   sentence_embedding  4597 non-null   object
dtypes: object(4)
memory usage: 143.8+ KB


In [111]:
embeddings_df["sentence_embedding"] = embeddings_df["sentence_embedding"].apply(lambda x: np.array(x).astype(np.float32))
embeddings_df["embedding"] = embeddings_df["embedding"].apply(lambda x: np.array(x).astype(np.float32))

In [112]:
from sentence_transformers.util import cos_sim
import torch
# Convert embeddings to NumPy arrays of floats and stack them
DOC_EMBEDDINGS = torch.tensor(
    np.vstack(embeddings_df["embedding"]).astype(np.float32)
).to(device)

def find_similar_embeddings(
    df: pd.DataFrame,
    query: str, 
    query_embedding: List[float], 
    top_k: int = 10, 
    alpha: float = 0.7,
    similarity_threshold: float = 0.5,
    method: str = "cosine"
):
    query_embedding = torch.tensor(np.array(query_embedding).reshape(1, -1).astype(np.float32)).to(device)

    # Calculate cosine similarities using GPU
    similarities = cos_sim(query_embedding, DOC_EMBEDDINGS).flatten().cpu().numpy()

    if method == "hybrid":
        query_tokens = tokenize.word_tokenize(query)
        doc_scores = BM25_TC.get_scores(query_tokens)
        similarities = alpha * similarities + (1 - alpha) * doc_scores

    # Add similarities to DataFrame
    df["similarity"] = similarities

    # Filter based on similarity threshold and sort
    filtered_df = df[df["similarity"] > similarity_threshold]
    if filtered_df.empty:
        return [], [], []
    
    results_df = filtered_df.sort_values(by="similarity", ascending=False).head(top_k)

    if method == "keyword_rerank":
        tokenized_corpus = [tokenize.word_tokenize(doc) for doc in results_df["text"].tolist()]
        bm25 = BM25Okapi(tokenized_corpus)
        query_tokens = tokenize.word_tokenize(query)
        
        doc_scores = bm25.get_scores(query_tokens)
        doc_scores_idx = np.argsort(doc_scores)[::-1]
        
        results_df = results_df.iloc[doc_scores_idx].reset_index(drop=True)
        
    elif method == "cross_encoder_rerank":
        doc_sentence_embeddings = torch.tensor(np.vstack(results_df["sentence_embedding"].apply(np.array).values)).to(device)
        
        with torch.no_grad():
            query_sentence_embedding = model.encode(query, device=device).reshape(1, -1)
        # Move embeddings to GPU
        query_sentence_embedding = torch.tensor(query_sentence_embedding).to(device)
        doc_sentence_embeddings = doc_sentence_embeddings.to(device)
        
        cross_encoder_scores = cos_sim(query_sentence_embedding, doc_sentence_embeddings).flatten().cpu().numpy()
        doc_scores_idx = np.argsort(cross_encoder_scores)[::-1]
        
        results_df = results_df.iloc[doc_scores_idx].reset_index(drop=True)

    return (
        results_df["doc_id"].values.tolist(),
        results_df["similarity"].values.tolist(),
        results_df["text"].values.tolist(),
    )

## Retrieval of Relevant Documents

In [113]:
k = 50

# methods_list = ["hybrid", "cosine"]
methods_list = ["hybrid", "cosine", "cross_encoder_rerank", "keyword_rerank"]


for method in methods_list:
    logging.info(f">>> Retrieving documents using {method} similarity <<<")
    retrieved_docs_list = []
    retrieved_text_list = []
    retrieved_sim_list = []
    query_ids = []
    for idx, row in tqdm(
        queries_df.iterrows(), total=len(queries_df), desc="Finding similar embeddings"
    ):
        doc_ids, sim_scores, similar_texts = find_similar_embeddings(
            embeddings_df,
            row["query"],
            row["query_embedding"],
            top_k=k,
            alpha=0.5,
            similarity_threshold=0.5,
            method=method,
        )
        retrieved_docs_list.append(doc_ids)
        retrieved_sim_list.append(sim_scores)
        retrieved_text_list.append(similar_texts)

    retrieval_df = pd.DataFrame(
        {
            "query_id": queries_df["query_id"],
            "query": queries_df["query"],
            f"retrieved_docs_top{k}": retrieved_docs_list,
            f"retrieved_texts_top{k}": retrieved_text_list,
            f"retrieved_sim_top{k}": retrieved_sim_list,
        }
    )

    save_path = os.path.join(data_path, method)
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    retrieval_df.to_csv(os.path.join(save_path, "retrieval_results.csv"), index=False)

2024-05-23 22:03:20 - >>> Retrieving documents using hybrid similarity <<<


Finding similar embeddings: 100%|██████████| 323/323 [00:01<00:00, 186.49it/s]


2024-05-23 22:03:22 - >>> Retrieving documents using cosine similarity <<<


Finding similar embeddings: 100%|██████████| 323/323 [00:00<00:00, 453.08it/s]


2024-05-23 22:03:23 - >>> Retrieving documents using cross_encoder_rerank similarity <<<


Finding similar embeddings:   0%|          | 0/323 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:   1%|          | 3/323 [00:00<00:11, 28.96it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:   2%|▏         | 6/323 [00:00<00:10, 29.37it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:   3%|▎         | 10/323 [00:00<00:10, 30.35it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:   5%|▍         | 15/323 [00:00<00:08, 35.75it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:   7%|▋         | 21/323 [00:00<00:07, 41.51it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:   8%|▊         | 26/323 [00:00<00:06, 43.08it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:  10%|▉         | 31/323 [00:00<00:06, 43.98it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:  11%|█         | 36/323 [00:00<00:06, 43.42it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:  13%|█▎        | 41/323 [00:01<00:06, 43.84it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:  14%|█▍        | 46/323 [00:01<00:06, 44.29it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:  16%|█▌        | 51/323 [00:01<00:06, 41.63it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:  17%|█▋        | 56/323 [00:01<00:06, 40.59it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:  19%|█▉        | 61/323 [00:01<00:06, 41.52it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:  20%|██        | 66/323 [00:01<00:06, 41.86it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:  22%|██▏       | 71/323 [00:01<00:06, 40.42it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:  24%|██▎       | 76/323 [00:01<00:05, 41.44it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:  25%|██▌       | 81/323 [00:01<00:05, 40.88it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:  27%|██▋       | 86/323 [00:02<00:05, 41.44it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:  28%|██▊       | 91/323 [00:02<00:05, 41.66it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:  30%|██▉       | 96/323 [00:02<00:05, 41.78it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:  31%|███▏      | 101/323 [00:02<00:05, 41.03it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:  33%|███▎      | 106/323 [00:02<00:05, 42.55it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:  34%|███▍      | 111/323 [00:02<00:04, 43.44it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:  36%|███▌      | 116/323 [00:02<00:04, 42.00it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:  37%|███▋      | 121/323 [00:02<00:04, 42.98it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:  39%|███▉      | 126/323 [00:03<00:04, 42.94it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:  41%|████      | 131/323 [00:03<00:04, 44.62it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:  42%|████▏     | 136/323 [00:03<00:04, 43.68it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:  44%|████▎     | 141/323 [00:03<00:04, 43.70it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:  45%|████▌     | 146/323 [00:03<00:04, 43.40it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:  47%|████▋     | 151/323 [00:03<00:03, 44.01it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:  48%|████▊     | 156/323 [00:03<00:04, 40.87it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:  50%|████▉     | 161/323 [00:03<00:03, 40.59it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:  51%|█████▏    | 166/323 [00:03<00:03, 40.75it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:  53%|█████▎    | 171/323 [00:04<00:03, 40.32it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:  54%|█████▍    | 176/323 [00:04<00:03, 41.64it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:  56%|█████▌    | 181/323 [00:04<00:03, 41.47it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:  58%|█████▊    | 186/323 [00:04<00:03, 40.36it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:  59%|█████▉    | 191/323 [00:04<00:03, 41.12it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:  61%|██████    | 196/323 [00:04<00:03, 41.02it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:  62%|██████▏   | 201/323 [00:04<00:03, 40.64it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:  64%|██████▍   | 206/323 [00:04<00:02, 41.23it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:  65%|██████▌   | 211/323 [00:05<00:02, 42.06it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:  67%|██████▋   | 216/323 [00:05<00:02, 42.89it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:  68%|██████▊   | 221/323 [00:05<00:02, 41.82it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:  70%|██████▉   | 226/323 [00:05<00:02, 42.05it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:  72%|███████▏  | 231/323 [00:05<00:02, 40.49it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:  73%|███████▎  | 236/323 [00:05<00:02, 39.08it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:  75%|███████▍  | 241/323 [00:05<00:02, 39.90it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:  76%|███████▌  | 246/323 [00:05<00:01, 40.61it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:  78%|███████▊  | 251/323 [00:06<00:01, 40.50it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:  79%|███████▉  | 256/323 [00:06<00:01, 40.48it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:  81%|████████  | 261/323 [00:06<00:01, 41.61it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:  82%|████████▏ | 266/323 [00:06<00:01, 42.68it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:  84%|████████▍ | 271/323 [00:06<00:01, 43.28it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:  85%|████████▌ | 276/323 [00:06<00:01, 42.92it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:  87%|████████▋ | 281/323 [00:06<00:01, 39.71it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:  89%|████████▊ | 286/323 [00:06<00:00, 41.17it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:  90%|█████████ | 291/323 [00:07<00:00, 41.52it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:  92%|█████████▏| 296/323 [00:07<00:00, 42.36it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:  93%|█████████▎| 301/323 [00:07<00:00, 41.52it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:  95%|█████████▍| 306/323 [00:07<00:00, 42.92it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:  96%|█████████▋| 311/323 [00:07<00:00, 43.58it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:  98%|█████████▊| 316/323 [00:07<00:00, 44.20it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings:  99%|█████████▉| 321/323 [00:07<00:00, 42.71it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Finding similar embeddings: 100%|██████████| 323/323 [00:07<00:00, 41.60it/s]


2024-05-23 22:03:31 - >>> Retrieving documents using keyword_rerank similarity <<<


Finding similar embeddings: 100%|██████████| 323/323 [00:13<00:00, 23.42it/s]


# Evaluation

In [25]:
list(qrels.values())

[{'MED-2427': 2,
  'MED-10': 2,
  'MED-2429': 2,
  'MED-2430': 2,
  'MED-2431': 2,
  'MED-14': 2,
  'MED-2432': 2,
  'MED-2428': 1,
  'MED-2440': 1,
  'MED-2434': 1,
  'MED-2435': 1,
  'MED-2436': 1,
  'MED-2437': 1,
  'MED-2438': 1,
  'MED-2439': 1,
  'MED-3597': 1,
  'MED-3598': 1,
  'MED-3599': 1,
  'MED-4556': 1,
  'MED-4559': 1,
  'MED-4560': 1,
  'MED-4828': 1,
  'MED-4829': 1,
  'MED-4830': 1},
 {'MED-2513': 2,
  'MED-5237': 2,
  'MED-2517': 2,
  'MED-2518': 2,
  'MED-2519': 2,
  'MED-2520': 2,
  'MED-2521': 2,
  'MED-2514': 1,
  'MED-2943': 1,
  'MED-5322': 1,
  'MED-5323': 1,
  'MED-5324': 1,
  'MED-5325': 1,
  'MED-5326': 1,
  'MED-5327': 1,
  'MED-5328': 1,
  'MED-5329': 1,
  'MED-5330': 1,
  'MED-5331': 1,
  'MED-5332': 1,
  'MED-5333': 1,
  'MED-5334': 1,
  'MED-5335': 1,
  'MED-5363': 1,
  'MED-5337': 1,
  'MED-5338': 1,
  'MED-5339': 1,
  'MED-5340': 1,
  'MED-5341': 1,
  'MED-5342': 1},
 {'MED-2644': 2,
  'MED-2646': 2,
  'MED-2651': 2,
  'MED-118': 2,
  'MED-2652': 2,


In [114]:
qrels_df = pd.DataFrame({
    'query_id': list(qrels.keys()),
    'relevant_docs': [list(rel_docs.keys()) for rel_docs in list(qrels.values())]
})

In [115]:
retrieval_df = pd.read_csv(os.path.join(data_path, 'cosine', "retrieval_results.csv"))
retrieval_df.head()

Unnamed: 0,query_id,query,retrieved_docs_top50,retrieved_texts_top50,retrieved_sim_top50
0,PLAIN-2,Do Cholesterol Statin Drugs Cause Breast Cancer?,"['MED-2429', 'MED-14', 'MED-2431', 'MED-10', '...","[""Emerging evidence suggests that statins' may...","[0.9891527891159058, 0.9874821901321411, 0.986..."
1,PLAIN-12,Exploiting Autophagy to Live Longer,"['MED-2514', 'MED-1437', 'MED-1432', 'MED-1436...","[""Healthy life span is rapidly increasing and ...","[0.9835842847824097, 0.9829482436180115, 0.981..."
2,PLAIN-23,How to Reduce Exposure to Alkylphenols Through...,"['MED-4726', 'MED-999', 'MED-1004', 'MED-1961'...","[""The aim of these studies was to evaluate the...","[0.9849041700363159, 0.9845057129859924, 0.983..."
3,PLAIN-33,What’s Driving America’s Obesity Problem?,"['MED-2721', 'MED-2721', 'MED-2722', 'MED-2723...",['BACKGROUND: The major drivers of the obesity...,"[0.9824672937393188, 0.9813079833984375, 0.981..."
4,PLAIN-44,Who Should be Careful About Curcumin?,"['MED-2824', 'MED-1110', 'MED-1811', 'MED-2794...","['Cancer is primarily a disease of old age, an...","[0.979678750038147, 0.9796613454818726, 0.9795..."


In [116]:
DOCS_IN_CORPUS = list(corpus_df.index)
DOCS_IN_CORPUS

['MED-10',
 'MED-14',
 'MED-118',
 'MED-301',
 'MED-306',
 'MED-329',
 'MED-330',
 'MED-332',
 'MED-334',
 'MED-335',
 'MED-398',
 'MED-557',
 'MED-666',
 'MED-691',
 'MED-692',
 'MED-702',
 'MED-706',
 'MED-707',
 'MED-708',
 'MED-709',
 'MED-711',
 'MED-712',
 'MED-713',
 'MED-714',
 'MED-716',
 'MED-717',
 'MED-718',
 'MED-719',
 'MED-720',
 'MED-721',
 'MED-722',
 'MED-723',
 'MED-724',
 'MED-726',
 'MED-727',
 'MED-728',
 'MED-729',
 'MED-730',
 'MED-731',
 'MED-732',
 'MED-733',
 'MED-734',
 'MED-735',
 'MED-736',
 'MED-743',
 'MED-744',
 'MED-745',
 'MED-746',
 'MED-748',
 'MED-749',
 'MED-751',
 'MED-752',
 'MED-753',
 'MED-754',
 'MED-756',
 'MED-757',
 'MED-758',
 'MED-759',
 'MED-760',
 'MED-761',
 'MED-762',
 'MED-816',
 'MED-818',
 'MED-819',
 'MED-820',
 'MED-821',
 'MED-822',
 'MED-823',
 'MED-824',
 'MED-825',
 'MED-826',
 'MED-827',
 'MED-828',
 'MED-829',
 'MED-830',
 'MED-831',
 'MED-832',
 'MED-833',
 'MED-834',
 'MED-835',
 'MED-836',
 'MED-837',
 'MED-838',
 'MED-

In [117]:
corpus_df.to_csv(os.path.join(data_path, "corpus.csv"), index=False)
queries_df.to_csv(os.path.join(data_path, "queries.csv"), index=False)
qrels_df.to_csv(os.path.join(data_path, "qrels.csv"), index=False)

In [118]:
from sklearn.metrics import precision_score, recall_score


def compute_accuracy(retrieved_docs, relevant_docs, top_k=10):
    # Handle empty retrieved_docs
    if isinstance(retrieved_docs, str):
        if retrieved_docs.strip() == "":
            return 0, 0, 0, 0
        retrieved_docs = retrieved_docs[1:-1].replace("'", "").split(", ")
    retrieved_docs = retrieved_docs[-top_k:]
    
    # Handle empty relevant_docs
    if isinstance(relevant_docs, str):
        if relevant_docs.strip() == "":
            return 0, 0, 0, 0
        relevant_docs = relevant_docs[1:-1].replace("'", "").split(", ")
    relevant_docs = relevant_docs[-top_k:]
    
    retrieved_docs_set = set(retrieved_docs)
    relevant_docs_set = set(relevant_docs)
    
    logging.info(f"Retrieved documents: {retrieved_docs}")
    logging.info(f"Relevant documents: {relevant_docs}")
    
    common_docs_set = retrieved_docs_set.intersection(relevant_docs_set)
    logging.info(f"Common documents: {common_docs_set}")
    
    # Calculate intersect accuracy
    intersect_accuracy = len(common_docs_set) / len(relevant_docs_set) if relevant_docs_set else 0
    
    # Calculate precision and recall
    precision = len(common_docs_set) / len(retrieved_docs_set) if retrieved_docs_set else 0
    recall = len(common_docs_set) / len(relevant_docs_set) if relevant_docs_set else 0
    
    logging.info(f"Length of retrieved docs: {len(retrieved_docs_set)}")
    logging.info(f"Length of relevant docs: {len(relevant_docs_set)}")
    logging.info(f"Length of common docs: {len(common_docs_set)}")
    logging.info(f"Precision: {precision}")
    logging.info(f"Recall: {recall}")
    
    return intersect_accuracy, precision, recall




# # Example usage
# retrieved_docs = "['doc1', 'doc2', 'doc3']"
# relevant_docs = "['doc2', 'doc3', 'doc4']"
# DOCS_IN_CORPUS = ['doc1', 'doc2', 'doc3', 'doc4', 'doc5']

# intersect_accuracy, precision_inner, precision_ohe, recall_ohe = compute_accuracy(retrieved_docs, relevant_docs)
# print(intersect_accuracy, precision_inner, precision_ohe, recall_ohe)


In [119]:
retrieval_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 323 entries, 0 to 322
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   query_id               323 non-null    object
 1   query                  323 non-null    object
 2   retrieved_docs_top50   323 non-null    object
 3   retrieved_texts_top50  323 non-null    object
 4   retrieved_sim_top50    323 non-null    object
dtypes: object(5)
memory usage: 12.7+ KB


In [120]:
METHODS_LIST = ["cosine", "hybrid", "cross_encoder_rerank", "keyword_rerank"]
k_list = [1, 3, 5, 10, 20]

for item in os.listdir(data_path):
    if item in METHODS_LIST:
        retrieval_df = pd.read_csv(os.path.join(data_path, item, "retrieval_results.csv"))
        retrieval_df['query_id'] = retrieval_df['query_id'].astype(str)
        cols_to_keep = ["query_id", "query", "retrieved_docs_top50", "retrieved_texts_top50", "retrieved_sim_top50"]
        retrieval_df = retrieval_df[cols_to_keep]
        retrieval_df = retrieval_df.merge(qrels_df, on=["query_id"], how="inner")
        print(retrieval_df.columns)
        
        for k in k_list:
            retrieval_df[f"accuracy_top{k}"] = 0.0
            retrieval_df[f"precision_top{k}"] = 0.0
            retrieval_df[f"recall_top{k}"] = 0.0
            
            for idx, row in retrieval_df.iterrows():
                accuracy, precision, recall = compute_accuracy(row["retrieved_docs_top50"], row["relevant_docs"], top_k=k)
                retrieval_df.at[idx, f"accuracy_top{k}"] = accuracy
                retrieval_df.at[idx, f"precision_top{k}"] = precision
                retrieval_df.at[idx, f"recall_top{k}"] = recall
                
        retrieval_df.to_csv(os.path.join(data_path, item, f"retrieval_results.csv"), index=False)

Index(['query_id', 'query', 'retrieved_docs_top50', 'retrieved_texts_top50',
       'retrieved_sim_top50', 'relevant_docs'],
      dtype='object')
2024-05-23 22:04:02 - Retrieved documents: ['MED-4925']
2024-05-23 22:04:02 - Relevant documents: ['MED-4830']
2024-05-23 22:04:02 - Common documents: set()
2024-05-23 22:04:02 - Length of retrieved docs: 1
2024-05-23 22:04:02 - Length of relevant docs: 1
2024-05-23 22:04:02 - Length of common docs: 0
2024-05-23 22:04:02 - Precision: 0.0
2024-05-23 22:04:02 - Recall: 0.0
2024-05-23 22:04:02 - Retrieved documents: ['MED-2310']
2024-05-23 22:04:02 - Relevant documents: ['MED-5342']
2024-05-23 22:04:02 - Common documents: set()
2024-05-23 22:04:02 - Length of retrieved docs: 1
2024-05-23 22:04:02 - Length of relevant docs: 1
2024-05-23 22:04:02 - Length of common docs: 0
2024-05-23 22:04:02 - Precision: 0.0
2024-05-23 22:04:02 - Recall: 0.0
2024-05-23 22:04:02 - Retrieved documents: ['MED-2705']
2024-05-23 22:04:02 - Relevant documents: ['MED-5

In [121]:
retrieval_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 323 entries, 0 to 322
Data columns (total 21 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   query_id               323 non-null    object 
 1   query                  323 non-null    object 
 2   retrieved_docs_top50   323 non-null    object 
 3   retrieved_texts_top50  323 non-null    object 
 4   retrieved_sim_top50    323 non-null    object 
 5   relevant_docs          323 non-null    object 
 6   accuracy_top1          323 non-null    float64
 7   precision_top1         323 non-null    float64
 8   recall_top1            323 non-null    float64
 9   accuracy_top3          323 non-null    float64
 10  precision_top3         323 non-null    float64
 11  recall_top3            323 non-null    float64
 12  accuracy_top5          323 non-null    float64
 13  precision_top5         323 non-null    float64
 14  recall_top5            323 non-null    float64
 15  accura

In [122]:
def compute_metrics(retrieval_df, top_k=10, reranked=False, method="cosine"):
    # Compute the mean accuracy, precision, and recall
    accuracy_ft = f"accuracy_top{top_k}"
    precision_ohe_ft = f"precision_top{top_k}"
    recall_ohe_ft = f"recall_top{top_k}"
    
    accuracy = retrieval_df[accuracy_ft].mean()
    precision_ohe = retrieval_df[precision_ohe_ft].mean()
    recall_ohe = retrieval_df[recall_ohe_ft].mean()
    
    temp_df = pd.DataFrame({
        'method': [f"{method}_top{top_k}"],
        'accuracy': [accuracy],
        'precision': [precision_ohe],
        'recall': [recall_ohe]
    })
    
    return temp_df

In [132]:
METHODS_LIST = ["cosine", "hybrid", "cross_encoder_rerank", "keyword_rerank"]
k_list = [1, 3, 5, 10, 20]
metrics_df = pd.DataFrame({'method': [], 'accuracy': [], 'precision': [], 'recall': []})

for item in os.listdir(data_path):
    if item in METHODS_LIST:
        retrieval_df = pd.read_csv(os.path.join(data_path, item, "retrieval_results.csv"))
        
        for k in k_list:
            temp_df = compute_metrics(retrieval_df, top_k=k, method=item)
            metrics_df = pd.concat([metrics_df, temp_df], axis=0)
        metrics_df.to_csv(os.path.join(data_path, item, "metrics.csv"), index=False)

In [134]:
metrics_df

Unnamed: 0,method,accuracy,precision,recall
0,cosine_top1,6.19195,6.19195,6.19195
1,cosine_top3,4.127967,4.127967,4.127967
2,cosine_top5,5.882353,5.572755,5.882353
3,cosine_top10,14.773453,11.523908,14.773453
4,cosine_top20,26.381787,18.183876,26.381787
5,keyword_rerank_top1,9.287926,9.287926,9.287926
6,keyword_rerank_top3,12.383901,10.319917,12.383901
7,keyword_rerank_top5,17.853457,13.77709,17.853457
8,keyword_rerank_top10,22.568677,15.376677,22.568677
9,keyword_rerank_top20,40.476116,24.308305,40.476116


In [135]:
metrics_df.to_csv(os.path.join(data_path, "metrics.csv"), index=False)