In [2]:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams
# Connect to the Qdrant server and list its collections as a quick sanity check.
qdrant_url = "http://localhost:6333"
client = QdrantClient(url=qdrant_url)
client.get_collections()

CollectionsResponse(collections=[])

In [3]:
import torch
from transformers import AutoTokenizer, AutoModel

# Load the Contriever tokenizer and encoder.
tokenizer = AutoTokenizer.from_pretrained('facebook/contriever')
model = AutoModel.from_pretrained('facebook/contriever')

# One question followed by two passages, used as a small smoke test.
sentences = [
    "Where was Marie Curie born?",
    "Maria Sklodowska, later known as Marie Curie, was born on November 7, 1867.",
    "Born in Paris on 15 May 1859, Pierre Curie was the son of Eugène Curie, a doctor of French Catholic origin from Alsace."
]

# Tokenize all sentences as a single padded batch of PyTorch tensors.
inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

# Compute token embeddings. This is inference only, so run the forward pass
# without autograd tracking: no graph is kept alive and the outputs do not
# require grad.
with torch.no_grad():
    outputs = model(**inputs)

# Mean pooling
def mean_pooling(token_embeddings, mask):
    """Average the token embeddings of each sequence, ignoring masked positions.

    Args:
        token_embeddings: Tensor that is summed over dim 1 (the token axis).
        mask: Tensor with one entry per token position; positions where it is
            zero are excluded from the average. Assumed to be a 0/1 attention
            mask as produced by a tokenizer -- confirm for other callers.

    Returns:
        One pooled vector per sequence. A sequence whose mask sums to zero
        yields a zero vector instead of NaN.
    """
    # Zero the embeddings at masked (padding) positions so they do not
    # contribute to the sum.
    token_embeddings = token_embeddings.masked_fill(~mask[..., None].bool(), 0.)
    token_counts = mask.sum(dim=1)[..., None]
    # Guard against 0/0 for fully masked rows: the numerator is already zero
    # there, so dividing by 1 produces a zero vector rather than NaN.
    token_counts = token_counts.masked_fill(token_counts == 0, 1)
    sentence_embeddings = token_embeddings.sum(dim=1) / token_counts
    return sentence_embeddings
# Pool the first model output (the per-token embeddings) into one vector per sentence.
token_embeddings = outputs[0]
embeddings = mean_pooling(token_embeddings, inputs['attention_mask'])


tokenizer_config.json:   0%|          | 0.00/321 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [6]:
# Length of the first pooled sentence vector, i.e. the embedding dimensionality.
len(embeddings[0])

768

In [7]:
from sentence_transformers import SentenceTransformer

# Load Contriever through sentence-transformers. Note that this rebinds `model`,
# which until now held the raw `transformers` AutoModel.
model = SentenceTransformer("facebook/contriever")

No sentence-transformers model found with name facebook/contriever. Creating a new one with mean pooling.


In [2]:
# Load the embedding model
# https://huggingface.co/spaces/mteb/leaderboard

# Requires transformers>=4.51.0
# Requires sentence-transformers>=2.7.0

from sentence_transformers import SentenceTransformer

# Load the model. This rebinds `model`; every later cell that calls
# `model.encode` uses this Qwen3 embedding model.
model = SentenceTransformer("Qwen/Qwen3-Embedding-4B")

# We recommend enabling flash_attention_2 for better acceleration and memory saving,
# together with setting `padding_side` to "left":
# model = SentenceTransformer(
#     "Qwen/Qwen3-Embedding-4B",
#     model_kwargs={"attn_implementation": "flash_attention_2", "device_map": "auto"},
#     tokenizer_kwargs={"padding_side": "left"},
# )

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/7.26k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

In [30]:
# Encode a sample query once to discover the embedding dimensionality.
query_embedding = model.encode("What is the capital of China?", prompt_name="query")

# Cosine distance is used here; Distance.DOT is the alternative that was considered.
vector_params = VectorParams(size=len(query_embedding), distance=Distance.COSINE)
client.create_collection(collection_name="scifact", vectors_config=vector_params)

True

In [29]:
#client.delete_collection(collection_name="scifact")

True

In [11]:
# Load Dataset
from beir import util
from beir.datasets.data_loader import GenericDataLoader

#### Download the SciFact dataset into the `datasets/scifact` folder
dataset = "scifact"
out_dir = "datasets"
url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/scifact.zip"

data_path = util.download_and_unzip(url, out_dir)

# Load the test split: corpus, queries and relevance judgements (qrels).
loader = GenericDataLoader(data_path)
corpus, queries, qrels = loader.load(split="test")

  0%|          | 0/5183 [00:00<?, ?it/s]

In [26]:
# Number of documents in the loaded corpus.
len(corpus)

5183

In [31]:
from tqdm import tqdm
from qdrant_client.models import PointStruct
import math

def insert_documents(json_data, batch_size=100, collection_name="scifact"):
    """Embed documents and upsert them into a Qdrant collection in batches.

    Relies on the notebook-level `model` (for encoding) and `client` (for the
    upserts).

    Args:
        json_data: Mapping of document id -> dict with 'title' and 'text'
            keys. Ids must be convertible with `int()` because they are used
            as Qdrant point ids.
        batch_size: Number of documents encoded and upserted per request.
        collection_name: Name of the target Qdrant collection.

    Raises:
        ValueError: If `batch_size` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    all_doc_items = list(json_data.items())
    total_batches = math.ceil(len(all_doc_items) / batch_size)

    for i in tqdm(range(total_batches), desc="Insertando documentos en lotes"):
        batch = all_doc_items[i * batch_size : (i + 1) * batch_size]

        # Encode the whole batch with a single call instead of one call per
        # document, so the model can process several abstracts per forward
        # pass. The inner progress bar is disabled to keep the outer one clean.
        # NOTE(review): `encode` applies its own internal batch size; lower it
        # via `model.encode(..., batch_size=...)` if memory becomes an issue.
        texts = [contenido['text'] for _, contenido in batch]
        vectors = model.encode(texts, show_progress_bar=False)

        points = [
            PointStruct(
                id=int(doc_id),
                # Plain Python floats serialize cleanly in the request body.
                vector=vector.tolist(),
                payload={
                    'doc_id': doc_id,
                    'title': contenido['title'],
                    'text': contenido['text']
                },
            )
            for (doc_id, contenido), vector in zip(batch, vectors)
        ]

        client.upsert(collection_name=collection_name, points=points)


In [32]:
# Embed and index the whole corpus. This is the slow step of the notebook:
# the recorded run below took roughly 22 minutes.
insert_documents(corpus)

Insertando documentos en lotes: 100%|████| 52/52 [21:51<00:00, 25.21s/it]


In [34]:
# Retrieval helper
def recuperar_documentos(query_text, top_k=10):
    """Query the "scifact" collection with an embedded text query.

    The text is encoded with the notebook-level `model` using its "query"
    prompt, and the search is sent through the notebook-level Qdrant `client`.

    Args:
        query_text: Text of the query to embed.
        top_k: Maximum number of points to request.

    Returns:
        The `.points` of the Qdrant query response, with payloads included.
    """
    query_vector = model.encode(query_text, prompt_name="query")
    response = client.query_points(
        collection_name="scifact",
        query=query_vector,
        limit=top_k,
        with_payload=True,
    )
    return response.points

"""
qrels = {
    "q1" : {"doc1": 1},
    "q2" : {"doc2": 1},
}
"""
def format_result(rel_docs):
    """Map each retrieved point to its score, keyed by the stringified id.

    Args:
        rel_docs: Iterable of objects exposing `.id` and `.score`.

    Returns:
        Dict of `str(id) -> score`. If an id repeats, the last score wins.
    """
    return {str(hit.id): hit.score for hit in rel_docs}

def get_final_format(queries):
    """Run retrieval for every query and collect the scores per query id.

    Args:
        queries: Mapping of query id -> query text.

    Returns:
        Dict of `query id -> {doc id: score}`, built by passing each query
        through `recuperar_documentos` and `format_result`.
    """
    progress = tqdm(queries.items(), desc="obteniendo resultados")
    return {
        qid: format_result(recuperar_documentos(query_text))
        for qid, query_text in progress
    }

In [35]:
# Retrieve results for every query; maps query id -> {doc id: score}.
resultados = get_final_format(queries)

obteniendo resultados: 100%|███████████| 300/300 [00:27<00:00, 10.82it/s]


In [36]:
import json
# Persist the retrieval results to disk as indented JSON.
with open("results.json", "w") as results_file:
    json.dump(resultados, results_file, indent=2)

In [33]:
# Evaluate

In [38]:
from beir import util
from beir.retrieval.evaluation import EvaluateRetrieval

# Load qrels and results (optional: the commented-out code below reloads them from disk)
import json

# with open("qrels/qrels.json") as f:
#     qrels = json.load(f)

# with open("results/results.json") as f:
#     results = json.load(f)

# Score the retrieved results against the relevance judgements at several cutoffs.
k_values = [1, 3, 5, 10]
retriever = EvaluateRetrieval()
metrics = retriever.evaluate(qrels, resultados, k_values=k_values)
print(metrics)


({'NDCG@1': 0.64667, 'NDCG@3': 0.7162, 'NDCG@5': 0.75226, 'NDCG@10': 0.77172}, {'MAP@1': 0.61606, 'MAP@3': 0.68817, 'MAP@5': 0.71354, 'MAP@10': 0.72414}, {'Recall@1': 0.61606, 'Recall@3': 0.76289, 'Recall@5': 0.85228, 'Recall@10': 0.90667}, {'P@1': 0.64667, 'P@3': 0.27889, 'P@5': 0.19133, 'P@10': 0.10333})


In [1]:
# Create a pseudo-query and test it

In [None]:
# Create multiple pseudo-documents and test them

In [None]:
# Retrieve documents and create a pseudo-document

In [None]:
# Test with multiple documents

In [39]:
# Number of test queries that were evaluated.
len(queries)

300