# Workflow 2: Paper Processing Pipeline - Local Testing

In [9]:
import sys
from pathlib import Path
from typing import List, Dict, Any
import hashlib

project_root = Path.cwd()
sys.path.insert(0, str(project_root.parent))

from notebook_setup import *


from src.services.pdf_parser.docling_utils import deserialize_docling_document
from src.services.chunking.chunker import ChunkingConfig, PaperChunker
from src.services.embeddings import MultiVectorEmbedder
from src.database import get_sync_session
from src.models.paper import Paper
from src.config import get_settings
from qdrant_client import QdrantClient
from qdrant_client.http import models as qmodels

settings = get_settings()
workflow_data = {}
print("✓ Setup complete")

✓ Setup complete


## Step 1: Load Papers for Embedding

In [10]:
MAX_PAPERS = 6

results: List[Dict[str, Any]] = []
with get_sync_session() as session:
    papers = (
        session.query(Paper)
        .filter(Paper.is_embedded == False)
        .filter(Paper.docling_document.isnot(None))
        .limit(MAX_PAPERS)
        .all()
    )
    
    print(f"Found {len(papers)} papers needing embeddings\n")
    
    for i, p in enumerate(papers, 1):
        print(f"[{i}] {p.arxiv_id}: {p.title[:60]}...")
        results.append({
            "arxiv_id": p.arxiv_id,
            "title": p.title,
            "authors": p.authors or [],
            "categories": p.categories or [],
            "primary_category": p.primary_category,
            "published_date": p.published_date,
            "docling_document": p.docling_document,
            "affiliations": p.affiliations,
        })

workflow_data['load_papers'] = results
print(f"\n✓ Loaded {len(results)} papers")

Found 6 papers needing embeddings

[1] 2510.26802v1: Are Video Models Ready as Zero-Shot Reasoners? An Empirical ...
[2] 2510.26788v1: Defeating the Training-Inference Mismatch via FP16...
[3] 2510.26790v1: Gistify! Codebase-Level Understanding via Runtime Execution...
[4] 2510.26787v1: Remote Labor Index: Measuring AI Automation of Remote Work...
[5] 2510.26784v1: LLMs Process Lists With General Filter Heads...
[6] 2510.26782v1: Clone Deterministic 3D Worlds with Geometrically-Regularized...

✓ Loaded 6 papers


## Step 2: Ensure Qdrant Collection

In [11]:
collection_name = settings.qdrant_collection
USE_MULTI_VECTOR = True

client = QdrantClient(
    host=settings.qdrant_host,
    port=settings.qdrant_port,
    prefer_grpc=False,
    timeout=30,
)

collections = client.get_collections().collections
existing = any(c.name == collection_name for c in collections)

if not existing:
    print(f"Creating collection '{collection_name}'...")
    
    if USE_MULTI_VECTOR:
        embedder = MultiVectorEmbedder()
        dims = embedder.get_embedding_dimensions()
        
        client.create_collection(
            collection_name=collection_name,
            vectors_config={
                "all-MiniLM-L6-v2": qmodels.VectorParams(
                    size=dims["dense_dim"],
                    distance=qmodels.Distance.COSINE,
                ),
            },
            sparse_vectors_config={
                "bm25": qmodels.SparseVectorParams(modifier=qmodels.Modifier.IDF)
            }
        )
        print(f"✓ Created with hybrid search (dense dim={dims['dense_dim']} + sparse BM25)")
    else:
        client.create_collection(
            collection_name=collection_name,
            vectors_config=qmodels.VectorParams(
                size=settings.embedding_dim,
                distance=qmodels.Distance.COSINE,
            ),
        )
        print(f"✓ Created with single vector (dim={settings.embedding_dim})")
else:
    print(f"✓ Collection '{collection_name}' already exists")

workflow_data['qdrant_collection'] = collection_name

✓ Collection 'arxiv_chunks' already exists


## Step 3: Chunk Documents

In [12]:
papers = workflow_data.get('load_papers', [])
MAX_TOKENS = 1000

cfg = ChunkingConfig(max_tokens=MAX_TOKENS)
chunker = PaperChunker(cfg)
all_chunks: List[Dict[str, Any]] = []

for i, p in enumerate(papers, 1):
    try:
        arxiv_id = p.get('arxiv_id', 'unknown')
        
        if not p.get("docling_document"):
            print(f"[{i}] {arxiv_id}: skipped (no document)")
            continue
        
        doc = deserialize_docling_document(p["docling_document"])
        chunks = chunker.chunk_paper(doc)
        
        for idx, chunk in enumerate(chunks):
            
            all_chunks.append({
                "arxiv_id": arxiv_id,
                "title": p.get("title", ""),
                "primary_category": p.get("primary_category", ""),
                "categories": p.get("categories", []),
                "published_date": p.get("published_date"),
                "authors": p.get("authors", []),
                "chunk_index": idx,
                "chunk_text": chunk.text if hasattr(chunk, 'text') else str(chunk),
            })
        
        print(f"[{i}] {arxiv_id}: {len(chunks)} chunks")
        
    except Exception as e:
        print(f"[{i}] {p.get('arxiv_id', 'unknown')}: ✗ {e}")

result = {"papers": papers, "chunks": all_chunks}
workflow_data['chunk_documents'] = result
print(f"\n✓ Total: {len(all_chunks)} chunks from {len(papers)} papers")

Token indices sequence length is longer than the specified maximum sequence length for this model (817 > 512). Running this sequence through the model will result in indexing errors


[1] 2510.26802v1: 59 chunks
[2] 2510.26788v1: 36 chunks
[3] 2510.26790v1: 47 chunks
[4] 2510.26787v1: 66 chunks
[5] 2510.26784v1: 63 chunks
[6] 2510.26782v1: 27 chunks

✓ Total: 298 chunks from 6 papers


In [13]:
result

{'papers': [{'arxiv_id': '2510.26802v1',
   'title': 'Are Video Models Ready as Zero-Shot Reasoners? An Empirical Study with the MME-CoF Benchmark',
   'authors': ['Ziyu Guo',
    'Xinyan Chen',
    'Renrui Zhang',
    'Ruichuan An',
    'Yu Qi',
    'Dongzhi Jiang',
    'Xiangtai Li',
    'Manyuan Zhang',
    'Hongsheng Li',
    'Pheng-Ann Heng'],
   'categories': ['cs.CV', 'cs.AI', 'cs.CL'],
   'primary_category': 'cs.CV',
   'published_date': datetime.datetime(2025, 10, 30, 17, 59, 55, tzinfo=datetime.timezone.utc),
   'docling_document': {'schema_name': 'DoclingDocument',
    'version': '1.7.0',
    'name': '2510.26802v1',
    'origin': {'mimetype': 'application/pdf',
     'binary_hash': 16293615884481114082,
     'filename': '2510.26802v1.pdf',
     'uri': None},
    'furniture': {'self_ref': '#/furniture',
     'parent': None,
     'children': [],
     'content_layer': 'furniture',
     'name': '_root_',
     'label': 'unspecified'},
    'body': {'self_ref': '#/body',
     'paren

In [None]:
import re

text = workflow_data["parse_pdfs"][4]["_temp_full_text"]

start = 0
end = len(text)

references = re.search(r'Reference|References', text, re.IGNORECASE | re.DOTALL)
appendix = re.search(r'\bAppendix\b', text, re.IGNORECASE | re.DOTALL)

if references:
    print("References found")
    start = references.start()
else:
    references = re.search(r'REFERENCE|REFERENCES', text, re.IGNORECASE | re.DOTALL)
    if references:
        print("References found")
        start = references.start()

if appendix:
    print("Appendix found")
    if appendix.start() > start:
        end = appendix.start()
else:
    appendix = re.search(r'APPENDIX', text, re.IGNORECASE | re.DOTALL)
    if appendix:
        print("Appendix found")
        if appendix.start() > start:
            end = appendix.start()

chunk = text[start:end]
chunk

# split chunks text with \n 
split_text_sections = chunk.split('\n')


## Step 4: Generate Embeddings

In [5]:
payload = workflow_data.get('chunk_documents', {})
papers = payload.get("papers", [])
chunks = payload.get("chunks", [])

valid_chunks = []
valid_texts = []
for chunk in chunks:
    text = chunk.get("chunk_text", "")
    if text and isinstance(text, str) and len(text.strip()) > 0:
        valid_chunks.append(chunk)
        valid_texts.append(text.strip())

print(f"Valid chunks: {len(valid_chunks)}/{len(chunks)}\n")

if valid_chunks and USE_MULTI_VECTOR:
    print("Generating hybrid embeddings (dense + sparse BM25)...")
    embedder = MultiVectorEmbedder()
    dense_embs, sparse_embs = embedder.embed_documents(valid_texts)
    
    for i, (dense, sparse) in enumerate(zip(dense_embs, sparse_embs)):
        valid_chunks[i]["vectors"] = {
            "dense": dense,
            "sparse": sparse.as_object(),
        }
        valid_chunks[i]["embedding_model"] = "hybrid (dense + BM25)"
    
    print(f"✓ Generated {len(dense_embs)} hybrid embeddings")
    print(f"  Dense dimension: {len(dense_embs[0]) if dense_embs else 0}")
else:
    print("No chunks to embed or multi-vector disabled")

result = {"papers": papers, "chunks": valid_chunks}
workflow_data['generate_embeddings'] = result
print(f"\n✓ Embedding generation complete")

Valid chunks: 334/334

Generating hybrid embeddings (dense + sparse BM25)...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Fetching 5 files: 100%|██████████| 5/5 [00:04<00:00,  1.02it/s]
Fetching 30 files: 100%|██████████| 30/30 [00:00<00:00, 30.87it/s]

✓ Generated 334 hybrid embeddings
  Dense dimension: 384

✓ Embedding generation complete





## Step 5: Upsert to Qdrant

In [None]:
payload = workflow_data.get('generate_embeddings', {})
chunks = payload.get("chunks", [])
BATCH_SIZE = 256

upserted = 0
batch_points: list[qmodels.PointStruct] = []

def flush_batch():
    global upserted, batch_points
    if not batch_points:
        return
    client.upsert(collection_name=collection_name, points=batch_points)
    upserted += len(batch_points)
    print(f"  Upserted batch: {len(batch_points)} points")
    batch_points = []

print(f"Upserting {len(chunks)} chunks to Qdrant...\n")

for idx, ch in enumerate(chunks):
    vectors = ch.get("vectors")
    if not vectors:
        continue
    
    content_id = f"{ch.get('arxiv_id','unknown')}_{ch.get('chunk_index',idx)}"
    pid = int(hashlib.sha256(content_id.encode('utf-8')).hexdigest(), 16) % (2**63 - 1)
    
    payload_data = {k: v for k, v in ch.items() if k not in {"vectors", "vector"}}
    
    point = qmodels.PointStruct(
        id=pid,
        vector={
            "all-MiniLM-L6-v2": vectors["dense"],
            "bm25": vectors["sparse"],
        },
        payload=payload_data
    )
    batch_points.append(point)
    
    if len(batch_points) >= BATCH_SIZE:
        flush_batch()

flush_batch()

workflow_data['upsert_qdrant'] = {"upserted": upserted}
print(f"\n✓ Upserted {upserted} vectors to Qdrant")

Upserting 334 chunks to Qdrant...

  Upserted batch: 256 points
  Upserted batch: 78 points

✓ Upserted 334 vectors to Qdrant


## Step 6: Mark Papers as Embedded

In [7]:
payload = workflow_data.get('generate_embeddings', {})
papers = payload.get("papers", [])
arxiv_ids = {p.get("arxiv_id") for p in papers if p.get("arxiv_id")}

print(f"Marking {len(arxiv_ids)} papers as embedded...\n")

updated = 0
with get_sync_session() as session:
    try:
        db_papers = session.query(Paper).filter(Paper.arxiv_id.in_(list(arxiv_ids))).all()
        for dp in db_papers:
            print(f"  {dp.arxiv_id}")
            dp.is_embedded = True
            dp.is_processed = True
        session.commit()
        updated = len(db_papers)
    except Exception as e:
        print(f"✗ Error: {e}")
        session.rollback()

workflow_data['mark_embedded'] = {"updated": updated}
print(f"\n✓ Marked {updated} papers as embedded")

Marking 5 papers as embedded...

  2510.25771v1
  2510.25770v1
  2510.25758v1
  2510.25766v1
  2510.25761v1

✓ Marked 5 papers as embedded


## Summary

In [8]:
print("=" * 60)
print("WORKFLOW 2 SUMMARY")
print("=" * 60)
print(f"Papers loaded:     {len(workflow_data.get('load_papers', []))}")
print(f"Chunks generated:  {len(workflow_data.get('chunk_documents', {}).get('chunks', []))}")
print(f"Chunks embedded:   {len(workflow_data.get('generate_embeddings', {}).get('chunks', []))}")
print(f"Vectors upserted:  {workflow_data.get('upsert_qdrant', {}).get('upserted', 0)}")
print(f"Papers marked:     {workflow_data.get('mark_embedded', {}).get('updated', 0)}")
print("=" * 60)

WORKFLOW 2 SUMMARY
Papers loaded:     5
Chunks generated:  334
Chunks embedded:   334
Vectors upserted:  334
Papers marked:     5
