# 01_indexing_colab
Indexing pipeline for YouMed articles

In [1]:
!pip install transformers qdrant-client pinecone-client torch langchain rank-bm25




[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import sys
import os
import io

sys.path.append(os.path.abspath('../src'))

In [5]:
import json
from core.chunking import MarkdownChunker

chunker = MarkdownChunker()
chunks = []
with open("../data/processed/youmed_articles.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        doc = json.loads(line)
        chunks.extend(chunker.chunk_document(doc["content"], doc["metadata"]))

print(f"Total chunks: {len(chunks)}")

Total chunks: 55973


In [6]:
from transformers import AutoModel, AutoTokenizer
import torch

tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-m3")
model = AutoModel.from_pretrained("BAAI/bge-m3").cuda()

def embed_all(texts, batch_size=64):
    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to("cuda")
        with torch.no_grad():
            outputs = model(**inputs).last_hidden_state
            mask = inputs["attention_mask"].unsqueeze(-1)
            pooled = (outputs * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
            pooled = torch.nn.functional.normalize(pooled, p=2, dim=1)
        embeddings.append(pooled.cpu())
    return torch.cat(embeddings, dim=0)

texts = [c.enriched_content for c in chunks]
embeddings = embed_all(texts)
print(embeddings.shape)

  _torch_pytree._register_pytree_node(
The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]



tokenizer_config.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

  _torch_pytree._register_pytree_node(
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

AssertionError: Torch not compiled with CUDA enabled

In [None]:
from src.db.vector_store import QdrantStore

store = QdrantStore(url="http://localhost:6333")
store.create_collection(name="youmed_articles", dimension=embeddings.shape[1])
store.upsert(chunks, embeddings.numpy())

In [4]:
from core.retriever import BM25Retriever

bm25_retriever = BM25Retriever(chunks)
bm25_retriever.save("models/bm25_index.pkl")
print("BM25 index saved")

BM25 index saved
