# Chunking and Retrieval Evaluation

Compare multiple chunkers and retrieval configs on local data.

In [None]:
from pathlib import Path
from ai_interviewer_pm.ingestion.chunkers import RecursiveCharChunker, SentenceBoundaryChunker, TimestampChunker
from ai_interviewer_pm.retrieval.vectorstore import build_vectorstore, similarity_search
from ai_interviewer_pm.settings import settings

# Read every `*.vtt` file sitting directly inside the configured data directory.
data_dir = Path(settings.data_dir)
texts = [vtt_path.read_text(encoding="utf-8") for vtt_path in data_dir.glob('*.vtt')]

# Chunking strategies under comparison, keyed by the label used in the output.
chunkers = dict(
    recursive=RecursiveCharChunker(chunk_size=800, chunk_overlap=160),
    sentence=SentenceBoundaryChunker(sentences_per_chunk=6),
    timestamp=TimestampChunker(),
)

for name, ch in chunkers.items():
    # Split every transcript with the current chunker; each call receives its
    # own metadata dict carrying the chunker label.
    pieces = [
        piece
        for transcript in texts
        for piece in ch.split(transcript, metadata={'chunker': name})
    ]
    chunks = [piece.text for piece in pieces]
    metas = [piece.metadata for piece in pieces]

    # Build a fresh store for this chunker and print the top 3 results for a
    # fixed probe query next to the chunker label.
    store = build_vectorstore(chunks, metas)
    hits = similarity_search(store, 'behavioral interview question', k=3)
    print(name, hits)
