## Arguana

In [1]:
from beir import util
from beir.datasets.data_loader import GenericDataLoader

  from tqdm.autonotebook import tqdm


In [2]:
def get_beir_dataset(dataset_name):
    """Download a BEIR dataset archive and load its test split.

    Args:
        dataset_name: Name substituted into the BEIR download URL
            (for example ``"arguana"``).

    Returns:
        The ``(corpus, queries, qrels)`` triple produced by
        ``GenericDataLoader.load(split="test")``.
    """
    url_template = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip"
    archive_url = url_template.format(dataset_name)
    # The archive is downloaded and unpacked under the local "datasets" folder.
    local_path = util.download_and_unzip(archive_url, "datasets")
    loader = GenericDataLoader(data_folder=local_path)
    corpus, queries, qrels = loader.load(split="test")
    return corpus, queries, qrels

In [3]:
def get_sentences_corpus_arguana(corpus):
    """Split a corpus mapping into texts to embed and their payloads.

    Args:
        corpus: Mapping of ``doc_id`` to a dict holding at least the
            ``'title'`` and ``'text'`` keys.

    Returns:
        ``(sentences, payloads)``: ``sentences`` is the list of document
        texts and ``payloads`` the list of ``{'doc_id', 'title', 'text'}``
        dicts, both in corpus iteration order.
    """
    # Materialise the items so the progress bar knows the total up front.
    entries = list(corpus.items())
    payloads = [
        {
            'doc_id': doc_id,
            'title': document['title'],
            'text': document['text'],
        }
        for doc_id, document in tqdm(entries, desc="Procesando corpus")
    ]
    sentences = [entry['text'] for entry in payloads]
    return sentences, payloads

## Qdrant 

In [4]:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct, SparseVectorParams, SparseVector
from tqdm import tqdm
#client = QdrantClient(url="http://localhost:6333")
#client.get_collections()

## Models

### Contriever

In [5]:
import torch
from transformers import AutoTokenizer, AutoModel
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer
from sentence_transformers import SentenceTransformer
from fastembed import SparseTextEmbedding, SparseEmbedding

In [6]:
def load_contriever():
    """Load the ``facebook/contriever`` tokenizer and encoder.

    Returns:
        ``(tokenizer, model, embedding_dimension)`` where the dimension is
        read from ``model.config.hidden_size``.
    """
    checkpoint = 'facebook/contriever'
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModel.from_pretrained(checkpoint)
    return tokenizer, model, model.config.hidden_size

def get_contriever_embeddings(tokenizer, model, sentences, batch_size=32):
    """Embed sentences with a Contriever-style encoder using masked mean pooling.

    The forward pass runs under ``torch.no_grad()`` and in mini-batches, so
    embedding a whole corpus neither builds an autograd graph nor pushes
    every document through the model in a single giant batch.

    Args:
        tokenizer: Callable returning a mapping with ``'attention_mask'``
            (plus whatever the model consumes) for a list of strings.
        model: Callable whose first output is the per-token embeddings.
        sentences: A string or an iterable of strings to embed.
        batch_size: Number of sentences encoded per forward pass.

    Returns:
        A tensor with one row per sentence, detached from autograd.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    # A bare string would otherwise be sliced character by character below.
    if isinstance(sentences, str):
        sentences = [sentences]
    sentences = list(sentences)
    if not sentences:
        return torch.empty((0, model.config.hidden_size))

    def mean_pooling(token_embeddings, mask):
        # Zero out padded positions so they do not contribute to the sum.
        token_embeddings = token_embeddings.masked_fill(~mask[..., None].bool(), 0.)
        return token_embeddings.sum(dim=1) / mask.sum(dim=1)[..., None]

    pooled_batches = []
    with torch.no_grad():  # inference only: no gradients needed
        for start in range(0, len(sentences), batch_size):
            batch = sentences[start:start + batch_size]
            inputs = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
            outputs = model(**inputs)
            pooled_batches.append(mean_pooling(outputs[0], inputs['attention_mask']))
    return torch.cat(pooled_batches, dim=0)

In [7]:
def load_question_dpr():
    """Load the DPR question encoder and its tokenizer.

    Returns:
        ``(tokenizer, model, embedding_dimension)`` where the dimension is
        read from ``model.config.hidden_size``.
    """
    checkpoint = "facebook/dpr-question_encoder-single-nq-base"
    tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(checkpoint)
    model = DPRQuestionEncoder.from_pretrained(checkpoint)
    return tokenizer, model, model.config.hidden_size

def load_context_dpr():
    """Load the DPR context (passage) encoder and its tokenizer.

    Returns:
        ``(tokenizer, model, embedding_dimension)`` where the dimension is
        read from ``model.config.hidden_size``.
    """
    checkpoint = "facebook/dpr-ctx_encoder-single-nq-base"
    tokenizer = DPRContextEncoderTokenizer.from_pretrained(checkpoint)
    model = DPRContextEncoder.from_pretrained(checkpoint)
    return tokenizer, model, model.config.hidden_size

def get_dpr_embeddings(tokenizer, model, sentences, batch_size=32):
    """Embed sentences with a DPR encoder, returning its ``pooler_output``.

    The forward pass runs under ``torch.no_grad()`` and in mini-batches, so
    embedding a whole corpus neither builds an autograd graph nor pushes
    every document through the model in a single giant batch.

    Args:
        tokenizer: Callable producing ``"input_ids"`` and
            ``"attention_mask"`` tensors for a list of strings.
        model: Encoder exposing ``config.max_position_embeddings`` and
            returning an object with a ``pooler_output`` attribute.
        sentences: A string or an iterable of strings to embed.
        batch_size: Number of sentences encoded per forward pass.

    Returns:
        A tensor with one row per sentence, detached from autograd.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    # A bare string would otherwise be sliced character by character below.
    if isinstance(sentences, str):
        sentences = [sentences]
    sentences = list(sentences)
    if not sentences:
        return torch.empty((0, model.config.hidden_size))

    # Truncate to the model's positional limit; fall back to 512 when the
    # config does not declare one.
    max_model_input_length = getattr(model.config, "max_position_embeddings", None)
    if max_model_input_length is None:
        max_model_input_length = 512

    pooled_batches = []
    with torch.no_grad():  # inference only: no gradients needed
        for start in range(0, len(sentences), batch_size):
            inputs = tokenizer(
                sentences[start:start + batch_size],
                padding=True,
                truncation=True,
                max_length=max_model_input_length,
                return_tensors='pt'
            )
            encoded = model(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
            )
            pooled_batches.append(encoded.pooler_output)
    return torch.cat(pooled_batches, dim=0)

In [8]:
def load_bge_large():
    """Load the ``BAAI/bge-large-en-v1.5`` sentence-transformer.

    Returns:
        ``(model, embedding_dimension)`` where the dimension comes from
        ``model.get_sentence_embedding_dimension()``.
    """
    encoder = SentenceTransformer('BAAI/bge-large-en-v1.5')
    return encoder, encoder.get_sentence_embedding_dimension()

def get_bge_large_embeddings(model, sentences):
    """Encode sentences with ``model.encode``, requesting normalised embeddings.

    Args:
        model: Object exposing ``encode(sentences, normalize_embeddings=...)``.
        sentences: Input passed straight through to ``model.encode``.

    Returns:
        Whatever ``model.encode`` returns for the given sentences.
    """
    return model.encode(sentences, normalize_embeddings=True)

In [9]:
def load_bm25():
    """Return a fastembed sparse embedder for the ``Qdrant/bm25`` model."""
    return SparseTextEmbedding(model_name="Qdrant/bm25")
    
def load_splade():
    """Return a fastembed sparse embedder for ``prithivida/Splade_PP_en_v1``."""
    return SparseTextEmbedding(model_name="prithivida/Splade_PP_en_v1")

def get_sparse_embeddings(model, sentences, batch_size=6):
    """Collect the sparse embeddings yielded by ``model.embed`` into a list.

    Args:
        model: Object exposing ``embed(sentences, batch_size=...)``.
        sentences: Texts to embed.
        batch_size: Batch size forwarded to ``model.embed``.

    Returns:
        A list with every item produced by ``model.embed``, in order.
    """
    embedding_stream = model.embed(sentences, batch_size=batch_size)
    # Exhaust the stream eagerly so callers can index into the result.
    return list(embedding_stream)

In [10]:
def load_models():
    """Load every retrieval model used by the notebook.

    A progress message is printed before each model is loaded.

    Returns:
        Dict keyed by ``'contriever'``, ``'question_dpr'``,
        ``'context_dpr'``, ``'bge_large'``, ``'bm25'`` and ``'splade'``,
        each holding whatever the matching ``load_*`` function returns.
    """
    # (message, dict key, loader) in the order the models are loaded.
    load_plan = (
        ('Loading contriever...', 'contriever', load_contriever),
        ('Loading question dpr...', 'question_dpr', load_question_dpr),
        ('Loading context dpr...', 'context_dpr', load_context_dpr),
        ('Loading bge large...', 'bge_large', load_bge_large),
        ('Loading bm25...', 'bm25', load_bm25),
        ('Loading splade...', 'splade', load_splade),
    )
    models = {}
    for message, key, loader in load_plan:
        print(message)
        models[key] = loader()
    return models

In [11]:
#models = load_models()

In [12]:
#corpus, queries, qrels = get_beir_dataset("arguana")

In [13]:
#sentences, payloads = get_sentences_corpus_arguana(corpus)

In [14]:
def get_docs_embeddings(models, sentences):
    """Embed the sentences with every document-side model.

    Args:
        models: Dict shaped like the result of ``load_models()``.
        sentences: Texts to embed.

    Returns:
        A list ``[contriever, dpr, bge_large, bm25, splade]`` of embeddings,
        in that fixed order.
    """
    contriever_tokenizer, contriever_model = models['contriever'][0], models['contriever'][1]
    all_embeddings = [
        get_contriever_embeddings(contriever_tokenizer, contriever_model, sentences)
    ]

    # Documents are encoded with the DPR *context* encoder.
    dpr_tokenizer, dpr_model = models['context_dpr'][0], models['context_dpr'][1]
    all_embeddings.append(get_dpr_embeddings(dpr_tokenizer, dpr_model, sentences))

    all_embeddings.append(get_bge_large_embeddings(models['bge_large'][0], sentences))

    for sparse_key in ('bm25', 'splade'):
        all_embeddings.append(get_sparse_embeddings(models[sparse_key], sentences))
    return all_embeddings

In [15]:
#embeddings = get_docs_embeddings(models, sentences[:2])

In [16]:
def create_collection(client, collection_name,  DIM_CONTRIEVER, DIM_DPR, DIM_BGE_L):
    """Create a Qdrant collection with three dense and two sparse named vectors.

    Args:
        client: Qdrant client exposing ``create_collection``.
        collection_name: Name of the collection to create.
        DIM_CONTRIEVER: Size of the ``"contriever"`` dense vector.
        DIM_DPR: Size of the ``"dpr"`` dense vector.
        DIM_BGE_L: Size of the ``"bge_large"`` dense vector.
    """
    dense_sizes = {
        "contriever": DIM_CONTRIEVER,
        "dpr": DIM_DPR,
        "bge_large": DIM_BGE_L,
    }
    # Every dense vector is compared with cosine distance.
    dense_config = {
        vector_name: VectorParams(size=dimension, distance=Distance.COSINE)
        for vector_name, dimension in dense_sizes.items()
    }
    sparse_config = {
        vector_name: SparseVectorParams()
        for vector_name in ("sparse_bm25", "sparse_splade")
    }
    client.create_collection(
        collection_name=collection_name,
        vectors_config=dense_config,
        sparse_vectors_config=sparse_config,
    )

In [17]:
def get_points(all_embeddings, payloads):
    """Build one Qdrant ``PointStruct`` per payload from the embeddings.

    Dense vectors and sparse indices/values are converted to plain Python
    lists before being handed to the Qdrant models, so tensors or arrays
    coming out of the encoders are serialised as ordinary numbers.

    Args:
        all_embeddings: Sequence ordered as
            ``[contriever, dpr, bge_large, bm25, splade]``; each entry is
            indexable by document position. The two sparse entries hold
            objects with ``indices`` and ``values`` attributes.
        payloads: Per-document payload dicts; the position in this
            sequence is used as the point id.

    Returns:
        List of ``PointStruct`` objects, one per payload.
    """
    def _as_list(values):
        # Tensors and arrays expose tolist(); anything else is copied as-is.
        return values.tolist() if hasattr(values, "tolist") else list(values)

    def _as_sparse(embedding):
        return SparseVector(
            indices=_as_list(embedding.indices),
            values=_as_list(embedding.values),
        )

    # enumerate() hides the length, so pass it explicitly for a real progress bar.
    total = len(payloads) if hasattr(payloads, "__len__") else None
    points = []
    for i, payload in tqdm(enumerate(payloads), total=total, desc= "creando points"):
        points.append(
            PointStruct(
                id=int(i),
                payload=payload,
                vector={
                    "contriever": _as_list(all_embeddings[0][i]),
                    "dpr": _as_list(all_embeddings[1][i]),
                    "bge_large": _as_list(all_embeddings[2][i]),
                    "sparse_bm25": _as_sparse(all_embeddings[3][i]),
                    "sparse_splade": _as_sparse(all_embeddings[4][i]),
                }
            )
        )
    return points

In [18]:
#pon = get_points(embeddings,payloads)

In [19]:
def insert_documents(collection_name, points, batch_size=100, client=None):
    """Upsert points into a Qdrant collection in fixed-size batches.

    Args:
        collection_name: Target collection for every batch.
        points: Sequence of points to upload.
        batch_size: Maximum number of points sent per ``upsert`` call.
        client: Qdrant client to use. When omitted, the notebook-level
            ``client`` global is used, as before.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        NameError: If no ``client`` is passed and no global ``client`` exists.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    if client is None:
        # Backward-compatible fallback to the notebook-level client.
        client = globals().get("client")
        if client is None:
            raise NameError(
                "no Qdrant client available: pass client=... or define a global 'client'"
            )
    # Stepping the range by batch_size removes the need for math.ceil.
    batch_starts = range(0, len(points), batch_size)
    for start in tqdm(batch_starts, desc="Insertando documentos en lotes"):
        batch = points[start:start + batch_size]
        client.upsert(collection_name=collection_name, points=batch)

In [20]:
def process_corpus(dataset_name):
    """Index a BEIR dataset into a fresh ``kdir_<dataset_name>`` Qdrant collection.

    Steps: load every model, create the collection sized from the models'
    embedding dimensions, download the dataset, embed the corpus, build the
    points and upload them.

    Args:
        dataset_name: BEIR dataset name; also used as the collection suffix.
    """
    client = QdrantClient(url="http://localhost:6333")
    models = load_models()
    # Dense vector sizes come from the loaded models themselves.
    DIM_CONTRIEVER = models['contriever'][2]
    DIM_DPR = models['context_dpr'][2]
    DIM_BGE_L = models['bge_large'][1]
    collection_name = f"kdir_{dataset_name}"
    create_collection(client, collection_name, DIM_CONTRIEVER, DIM_DPR, DIM_BGE_L)
    # Only the corpus is indexed here; queries and qrels are not used.
    corpus, _queries, _qrels = get_beir_dataset(dataset_name)
    sentences, payloads = get_sentences_corpus_arguana(corpus)
    all_embeddings = get_docs_embeddings(models, sentences)
    # Use the embeddings computed just above (not an unrelated global name).
    points = get_points(all_embeddings, payloads)
    # NOTE: no client is passed here, so insert_documents resolves the
    # Qdrant client on its own rather than using the local one above.
    insert_documents(collection_name, points, batch_size=100)
    print("Coleccion creada")

In [21]:
# Connect to the local Qdrant instance; this notebook-level `client` is the
# one insert_documents picks up.
client = QdrantClient(url="http://localhost:6333")
# Result is discarded (not the cell's last expression, so it is not displayed).
client.get_collections()
# Drop any existing "kdir_arguana" collection before process_corpus creates
# a collection with that name.
client.delete_collection(collection_name="kdir_arguana")

True

In [None]:
# Run the full indexing pipeline for ArguAna; the target collection is
# named "kdir_arguana".
dataset_name = "arguana"
process_corpus(dataset_name)

Loading contriever...
Loading question dpr...


Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRCon

Loading context dpr...


Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Loading bge large...
Loading bm25...
Loading splade...


  0%|          | 0/8674 [00:00<?, ?it/s]

Procesando corpus: 100%|█████████████████████████████████████| 8674/8674 [00:00<00:00, 2173580.65it/s]
