# 01_indexing_colab
Indexing pipeline for YouMed articles

In [None]:
# Install third-party packages through an IPython shell escape.
# Only `transformers` and `torch` are imported directly by the cells in this
# notebook; the remaining packages are presumably required by the project's
# `src` modules — TODO confirm and drop any that are unused.
!pip install transformers qdrant-client pinecone-client torch langchain rank-bm25

In [None]:
import json
from src.core.chunking import MarkdownChunker

# Read the JSONL corpus (one JSON document per line) and split every
# document into chunks, accumulating them in a single flat list.
chunker = MarkdownChunker()
chunks = []
with open("data/processed/youmed_articles_test.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        # Blank lines (e.g. an extra newline at the end of the file) are not
        # valid JSON and would make json.loads raise JSONDecodeError.
        if not line.strip():
            continue
        doc = json.loads(line)
        # Each record must provide "content" and "metadata" keys.
        chunks.extend(chunker.chunk_document(doc["content"], doc["metadata"]))

print(f"Total chunks: {len(chunks)}")

In [None]:
from transformers import AutoModel, AutoTokenizer
import torch

# Load the BGE-M3 tokenizer and encoder weights.
tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-m3")
# Place the model on the GPU when one is available; an unconditional .cuda()
# raises on a runtime without CUDA, so fall back to the CPU in that case.
model = AutoModel.from_pretrained("BAAI/bge-m3").to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

def embed_all(texts, batch_size=64):
    """Embed ``texts`` with the module-level ``tokenizer`` and ``model``.

    Each batch is tokenized (padded, truncated), run through the model
    without gradients, mean-pooled over non-padding tokens and
    L2-normalised.

    Args:
        texts: Sequence of strings to embed; must support ``len()`` and
            slicing.
        batch_size: Number of texts per forward pass.

    Returns:
        A CPU ``torch.Tensor`` with one L2-normalised row per input text.
        For empty input the result has shape ``(0, hidden_size)``.
    """
    # NOTE(review): the BAAI/bge-m3 model card describes its dense embedding
    # as the normalised [CLS] hidden state, whereas this function mean-pools.
    # Mean pooling is kept so stored vectors stay comparable with whatever
    # encodes the queries — confirm which pooling is intended.
    if len(texts) == 0:
        # torch.cat rejects an empty list, so return an empty matrix whose
        # width still matches the model's embedding size.
        return torch.empty((0, model.config.hidden_size), dtype=model.dtype)

    # Follow the model's device instead of assuming "cuda", so the inputs
    # always land where the weights are.
    device = model.device
    embeddings = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(
            batch, padding=True, truncation=True, return_tensors="pt"
        ).to(device)
        with torch.no_grad():
            hidden = model(**inputs).last_hidden_state
            # Cast the integer mask to the hidden-state dtype so the
            # arithmetic below does not rely on implicit type promotion.
            mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
            # Masked mean: padding positions contribute nothing to the sum,
            # and the clamp guards against division by zero.
            pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
            pooled = torch.nn.functional.normalize(pooled, p=2, dim=1)
        # Move each batch off the device right away to free accelerator memory.
        embeddings.append(pooled.cpu())
    return torch.cat(embeddings, dim=0)

# Embed the enriched text of every chunk and print the shape of the
# resulting embedding matrix as a quick sanity check.
texts = [piece.enriched_content for piece in chunks]
embeddings = embed_all(texts=texts)
print(embeddings.shape)

In [None]:
from src.db.vector_store import QdrantStore

# Connect to the Qdrant instance, create the target collection sized to the
# embedding width, then write all chunks together with their vectors.
store = QdrantStore(url="http://localhost:6333")
store.create_collection(
    name="youmed_articles",
    dimension=embeddings.size(1),  # second axis = embedding width
)
store.upsert(chunks, embeddings.numpy())

In [None]:
from src.core.retriever import BM25Retriever
import pickle

import os

# Build the BM25 retriever over all chunks and persist it with pickle.
bm25_retriever = BM25Retriever(chunks)
# open(..., "wb") does not create missing parent directories; on a fresh
# runtime "models/" may not exist yet and the write would raise
# FileNotFoundError.
os.makedirs("models", exist_ok=True)
# NOTE: pickle files must only ever be loaded from a trusted source.
with open("models/bm25_index.pkl", "wb") as f:
    pickle.dump(bm25_retriever, f)
print("BM25 index saved")