In [1]:
#@title 1) System setup: system libs + pinned Python deps (safe for spaCy/NumPy)
!apt-get update -y -qq
!apt-get install -y -qq poppler-utils tesseract-ocr

# Clean conflicting NLP stack to avoid ABI issues
!pip uninstall -y -q spacy thinc scispacy numpy catalogue cymem murmurhash preshed srsly wasabi blis || true

# Install core Python packages (pin NumPy and NLP stack)
!pip install -q --no-cache-dir "numpy==2.0.2" "spacy==3.7.5" "thinc==8.2.5" "scispacy==0.5.4"

# SciSpacy model
!pip install -q "https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_ner_bionlp13cg_md-0.5.4.tar.gz"

# Core utilities
!pip install -q --no-deps pillow pdf2image pytesseract

# Vector + Graph
!pip install -q pymilvus==2.5.3 neo4j python-dotenv

# Models - FIXED: Install colpali-engine correctly
!pip install -q torch torchvision --index-url https://download.pytorch.org/whl/cu118
!pip install -q "transformers>=4.45.0" "accelerate" "safetensors" "einops"
!pip install -q git+https://github.com/illuin-tech/colpali.git

# Optional: clear any residual state
import os, gc, torch
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

print("✓ System + Python deps installed")
print("\n⚠️ IMPORTANT: RESTART RUNTIME NOW!")
print("   Runtime → Restart Runtime")
print("   Then run from Cell 2 onwards")

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Selecting previously unselected package poppler-utils.
(Reading database ... 126675 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.10_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.10) ...
Setting up poppler-utils (22.02.0-2ubuntu0.10) ...
Processing triggers for man-db (2.10.2-1) ...
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.9/60.9 kB[0m [31m86.8 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: Cannot install numpy==2.0.2, spacy==3.7.5 and thinc==8.2.5 because these package versions have conflicting dependencies.[0m[31m
[0m[31mERROR: ResolutionImpossible: for help visit https://pip.pypa.io/en/latest/topics/dependency-resolution/#dealing-with-dependency-conflicts[0m[31m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━

In [1]:
#@title 2) Imports and environment check
import os
import json
import gc
import getpass
import shutil
import traceback
from pathlib import Path
from typing import List, Dict, Tuple, Optional

import numpy as np
import torch
from PIL import Image
from pdf2image import convert_from_path
import pytesseract
import spacy

from neo4j import GraphDatabase
from pymilvus import connections, Collection, FieldSchema, CollectionSchema, DataType, utility

# Transformers and vision models
from transformers import AutoProcessor

# Larger images may trigger PIL safety checks; disable
Image.MAX_IMAGE_PIXELS = None

print(f"PyTorch: {torch.__version__} | CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device: {device}")

# Load SciSpacy NER model
print("→ Loading SciSpacy NER...")
nlp = spacy.load("en_ner_bionlp13cg_md")
print("✓ SciSpacy loaded")

PyTorch: 2.7.1+cu126 | CUDA available: True
GPU: Tesla T4
GPU Memory: 15.83 GB
Device: cuda
→ Loading SciSpacy NER...


  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


✓ SciSpacy loaded


In [2]:
#@title 3) Configuration (edit paths and options as needed)

# If using Colab, you'll probably place PDFs under /content/pdfs
DATASET_PATH = "/content/pdfs"  #@param {type:"string"}
WORKING_DIR = "/content/working"  #@param {type:"string"}

# In test mode, only process the first few PDFs and reduce load
TEST_MODE = True  #@param {type:"boolean"}
LIMIT = 30 if TEST_MODE else None
BATCH_SIZE = 1  #@param {type:"integer"}
DPI = 200  #@param {type:"integer"}

# Neo4j config - provide your Aura or self-hosted credentials
print("\n=== Neo4j Configuration ===")
NEO4J_URI = input("Enter NEO4J_URI (e.g., neo4j+s://xxxxx.databases.neo4j.io): ").strip()
NEO4J_USER = input("Enter NEO4J_USER (e.g., neo4j): ").strip()
NEO4J_PASSWORD = getpass.getpass("Enter NEO4J_PASSWORD: ").strip()

# Zilliz (Milvus Cloud) config
print("\n=== Zilliz Configuration ===")
ZILLIZ_URI = input("Enter Zilliz URI (https://xxx.cloud.zilliz.com): ").strip()
ZILLIZ_TOKEN = getpass.getpass("Enter Zilliz API token: ").strip()

# Make sure working dirs exist
Path(WORKING_DIR).mkdir(parents=True, exist_ok=True)
Path(DATASET_PATH).mkdir(parents=True, exist_ok=True)

print("\n✓ Configuration loaded")
print(f"  Test Mode: {TEST_MODE}")
print(f"  Dataset Path: {DATASET_PATH}")


=== Neo4j Configuration ===
Enter NEO4J_URI (e.g., neo4j+s://xxxxx.databases.neo4j.io): neo4j+s://239d94db.databases.neo4j.io
Enter NEO4J_USER (e.g., neo4j): neo4j
Enter NEO4J_PASSWORD: ··········

=== Zilliz Configuration ===
Enter Zilliz URI (https://xxx.cloud.zilliz.com): https://in03-0a05f9ae02837d6.serverless.aws-eu-central-1.cloud.zilliz.com
Enter Zilliz API token: ··········

✓ Configuration loaded
  Test Mode: True
  Dataset Path: /content/pdfs


In [3]:
#@title 4) Initialize connections (Neo4j + Zilliz)
print("\n=== INITIALIZING CONNECTIONS ===\n")

# Neo4j
try:
    neo4j_driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
    with neo4j_driver.session() as session:
        session.run("RETURN 1")
    print("✓ Neo4j connected")
except Exception as e:
    print(f"✗ Neo4j connection failed: {e}")
    raise

# Zilliz (Milvus Cloud)
try:
    connections.connect(alias="default", uri=ZILLIZ_URI, token=ZILLIZ_TOKEN)
    print("✓ Zilliz connected")
    print(f"  Server version: {utility.get_server_version()}")
except Exception as e:
    print(f"✗ Zilliz connection failed: {e}")
    raise


=== INITIALIZING CONNECTIONS ===

✓ Neo4j connected
✓ Zilliz connected
  Server version: Zilliz Cloud Vector Database(Compatible with Milvus 2.5)


In [4]:
#@title 5) Load vision embedding model (ColPali - FIXED with proper pooling)
print("\n=== LOADING VISION MODEL ===\n")
USING_CLIP_FALLBACK = False

colpali_model = None
colpali_processor = None

try:
    # FIXED: Correct import paths for ColPali v1.3
    print("→ Loading ColPali v1.3...")

    # Try import method 1: Direct imports
    try:
        from colpali_engine.models.paligemma.colpali.modeling_colpali import ColPali
        from colpali_engine.models.paligemma.colpali.processing_colpali import ColPaliProcessor
        print("  ✓ Using ColPali direct imports")
    except ImportError:
        # Try import method 2: Package-level imports
        try:
            from colpali_engine.models import ColPali
            from colpali_engine.models import ColPaliProcessor
            print("  ✓ Using ColPali package imports")
        except ImportError:
            # Try import method 3: Simpler path
            from colpali_engine import ColPali, ColPaliProcessor
            print("  ✓ Using ColPali simple imports")

    colpali_model_name = "vidore/colpali-v1.3-hf"

    # Load processor
    print("  → Loading processor...")
    try:
        colpali_processor = ColPaliProcessor.from_pretrained(
            colpali_model_name,
            use_fast=True  # Suppress the warning
        )
    except Exception as e:
        print(f"  ⚠ ColPaliProcessor failed: {e}, trying AutoProcessor")
        colpali_processor = AutoProcessor.from_pretrained(
            colpali_model_name,
            trust_remote_code=True
        )

    # Load model
    print("  → Loading model...")
    colpali_model = ColPali.from_pretrained(
        colpali_model_name,
        torch_dtype=torch.float16 if device == "cuda" else torch.float32,
        device_map="auto" if device == "cuda" else None,
    ).eval()

    print(f"✓ ColPali v1.3 loaded successfully on {device.upper()}")
    USING_CLIP_FALLBACK = False

except Exception as e:
    print(f"⚠ ColPali load failed: {e}")
    print(traceback.format_exc())
    print("\n→ Falling back to CLIP")

    try:
        from transformers import CLIPProcessor, CLIPModel
        clip_model_name = "openai/clip-vit-base-patch32"
        colpali_processor = CLIPProcessor.from_pretrained(clip_model_name)
        colpali_model = CLIPModel.from_pretrained(
            clip_model_name,
            torch_dtype=torch.float16 if device == "cuda" else torch.float32
        ).eval()
        if device == "cuda":
            colpali_model = colpali_model.to(device)
        USING_CLIP_FALLBACK = True
        print("✓ CLIP model loaded as fallback")
    except Exception as e2:
        print(f"✗ CLIP fallback also failed: {e2}")
        raise

# FIXED: Detect embedding dimension with proper pooling for ColPali
# FIXED: Detect embedding dimension with proper pooling for ColPali
def detect_embedding_dim():
    print("\n=== DETECTING EMBEDDING DIMENSION ===")
    test_img = Image.new('RGB', (224, 224), color='white')
    try:
        if USING_CLIP_FALLBACK:
            inputs = colpali_processor(images=[test_img], return_tensors="pt", padding=True)
        else:
            # ColPali expects <image> token in text
            inputs = colpali_processor(
                text=["<image>"],
                images=[test_img],
                return_tensors="pt",
                padding=True
            )

        if device == "cuda":
            inputs = {k: (v.to(device) if hasattr(v, "to") else v) for k, v in inputs.items()}

        with torch.no_grad():
            if USING_CLIP_FALLBACK:
                emb = colpali_model.get_image_features(**inputs)
                emb = emb.reshape(emb.shape[0], -1)
                dim = int(emb.shape[-1])
                num_patches = 1
            else:
                # ColPali-specific embedding extraction
                out = colpali_model(**inputs)

                if hasattr(out, "last_hidden_state"):
                    emb = out.last_hidden_state
                elif hasattr(out, "embeddings"):
                    emb = out.embeddings
                elif isinstance(out, torch.Tensor):
                    emb = out
                else:
                    emb = out[0] if isinstance(out, (tuple, list)) else out

                print(f"  Raw ColPali output shape: {emb.shape}")

                # CRITICAL: Keep multi-vector output [batch, num_patches, 128]
                if len(emb.shape) == 3:
                    num_patches = emb.shape[1]
                    dim = emb.shape[2]
                    print(f"  → Multi-vector output: {num_patches} patches × {dim} dimensions")
                else:
                    emb = emb.reshape(emb.shape[0], -1)
                    dim = int(emb.shape[-1])
                    num_patches = 1

        print(f"✓ Embedding dimension per patch: {dim}")
        print(f"✓ Number of patches per image: {num_patches}")
        print(f"  Model: {type(colpali_model).__name__}")

        # Cleanup
        del inputs, out, emb, test_img
        if device == "cuda":
            torch.cuda.empty_cache()

        return dim, num_patches

    except Exception as e:
        print(f"⚠ Dimension detection failed: {e}")
        print(traceback.format_exc())
        default_dim = 512 if USING_CLIP_FALLBACK else 128
        default_patches = 1 if USING_CLIP_FALLBACK else 1024
        print(f"→ Using defaults: {default_dim} dim, {default_patches} patches")
        return default_dim, default_patches

EMBEDDING_DIM, NUM_PATCHES = detect_embedding_dim()
print(f"\n✓ Final config: {EMBEDDING_DIM}D × {NUM_PATCHES} patches")
print(f"✓ Using CLIP fallback: {USING_CLIP_FALLBACK}")


=== LOADING VISION MODEL ===

→ Loading ColPali v1.3...
  ✓ Using ColPali direct imports
  → Loading processor...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/423 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/34.6M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/733 [00:00<?, ?B/s]

  → Loading model...


config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/862M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of ColPali were not initialized from the model checkpoint at vidore/colpali-v1.3-hf and are newly initialized: ['custom_text_proj.bias', 'custom_text_proj.weight', 'model.model.language_model.embed_tokens.weight', 'model.model.language_model.layers.0.input_layernorm.weight', 'model.model.language_model.layers.0.mlp.down_proj.weight', 'model.model.language_model.layers.0.mlp.gate_proj.weight', 'model.model.language_model.layers.0.mlp.up_proj.weight', 'model.model.language_model.layers.0.post_attention_layernorm.weight', 'model.model.language_model.layers.0.self_attn.k_proj.weight', 'model.model.language_model.layers.0.self_attn.o_proj.weight', 'model.model.language_model.layers.0.self_attn.q_proj.weight', 'model.model.language_model.layers.0.self_attn.v_proj.weight', 'model.model.language_model.layers.1.input_layernorm.weight', 'model.model.language_model.layers.1.mlp.down_proj.weight', 'model.model.language_model.layers.1.mlp.gate_proj.weight', 'model.model.language_model.

✓ ColPali v1.3 loaded successfully on CUDA

=== DETECTING EMBEDDING DIMENSION ===




  Raw ColPali output shape: torch.Size([1, 1026, 128])
  → Multi-vector output: 1026 patches × 128 dimensions
✓ Embedding dimension per patch: 128
✓ Number of patches per image: 1026
  Model: ColPali

✓ Final config: 128D × 1026 patches
✓ Using CLIP fallback: False


In [5]:
#@title 6) Neo4j schema + Zilliz collection setup
COLLECTION_NAME = "nasa_bioscience_papers"

def setup_neo4j_schema():
    with neo4j_driver.session() as session:
        queries = [
            "CREATE CONSTRAINT IF NOT EXISTS FOR (p:Publication) REQUIRE p.pub_id IS UNIQUE",
            "CREATE CONSTRAINT IF NOT EXISTS FOR (pg:Page) REQUIRE pg.page_id IS UNIQUE",
            "CREATE CONSTRAINT IF NOT EXISTS FOR (e:Entity) REQUIRE e.entity_id IS UNIQUE",
            "CREATE INDEX IF NOT EXISTS FOR (e:Entity) ON (e.name)",
            "CREATE INDEX IF NOT EXISTS FOR (e:Entity) ON (e.entity_type)",
            "CREATE INDEX IF NOT EXISTS FOR (pg:Page) ON (pg.pub_id)",
        ]
        for query in queries:
            try:
                session.run(query)
                print(f"  ✓ {query[:65]}...")
            except Exception as e:
                if "already exists" not in str(e).lower():
                    print(f"  ⚠ {query[:65]}... - {e}")
    print("✓ Neo4j schema created")

def create_zilliz_collection():
    """Create collection with multi-vector support (one row per patch)"""
    # Drop if exists to get a clean start
    if utility.has_collection(COLLECTION_NAME):
        utility.drop_collection(COLLECTION_NAME)
        print(f"Dropped existing collection: {COLLECTION_NAME}")

    print(f"Creating multi-vector collection (dim={EMBEDDING_DIM})")

    # CRITICAL: Store each patch as a separate vector
    fields = [
        FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
        FieldSchema(name="page_id", dtype=DataType.VARCHAR, max_length=200),
        FieldSchema(name="pub_id", dtype=DataType.VARCHAR, max_length=200),
        FieldSchema(name="page_num", dtype=DataType.INT64),
        FieldSchema(name="patch_num", dtype=DataType.INT64),  # NEW: track which patch this is
        FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=EMBEDDING_DIM),
        FieldSchema(name="image_path", dtype=DataType.VARCHAR, max_length=500),
    ]

    schema = CollectionSchema(fields, description="NASA Bioscience Papers (Multi-Vector)")
    collection = Collection(name=COLLECTION_NAME, schema=schema)

    index_params = {
        "metric_type": "IP",
        "index_type": "AUTOINDEX",
        "params": {}
    }
    collection.create_index(field_name="embedding", index_params=index_params)
    print(f"✓ Created collection: {COLLECTION_NAME}")
    print(f"  Dimension: {EMBEDDING_DIM}")
    print(f"  Storage mode: Multi-vector (1 row per patch)")
    return collection

print("\n=== SETTING UP NEO4J ===")
setup_neo4j_schema()

print("\n=== SETTING UP ZILLIZ ===")
zilliz_collection = create_zilliz_collection()


=== SETTING UP NEO4J ===
  ✓ CREATE CONSTRAINT IF NOT EXISTS FOR (p:Publication) REQUIRE p.pub...
  ✓ CREATE CONSTRAINT IF NOT EXISTS FOR (pg:Page) REQUIRE pg.page_id ...
  ✓ CREATE CONSTRAINT IF NOT EXISTS FOR (e:Entity) REQUIRE e.entity_i...
  ✓ CREATE INDEX IF NOT EXISTS FOR (e:Entity) ON (e.name)...
  ✓ CREATE INDEX IF NOT EXISTS FOR (e:Entity) ON (e.entity_type)...
  ✓ CREATE INDEX IF NOT EXISTS FOR (pg:Page) ON (pg.pub_id)...
✓ Neo4j schema created

=== SETTING UP ZILLIZ ===
Creating multi-vector collection (dim=128)
✓ Created collection: nasa_bioscience_papers
  Dimension: 128
  Storage mode: Multi-vector (1 row per patch)


In [6]:
#@title 7) Utilities: PDF→images, OCR, embeddings, NER, relations
def cleanup_temp_files(pub_id: str):
    temp_dir = Path(WORKING_DIR) / "pages" / pub_id
    if temp_dir.exists():
        try:
            shutil.rmtree(temp_dir)
            print(f"  ✓ Cleaned temp files for {pub_id}")
        except Exception as e:
            print(f"  ⚠ Could not clean temp files: {e}")

def pdf_to_images(pdf_path: str, dpi: int = 150) -> List[str]:
    out_dir = Path(WORKING_DIR) / "pages" / Path(pdf_path).stem
    out_dir.mkdir(parents=True, exist_ok=True)

    try:
        images = convert_from_path(pdf_path, dpi=dpi, thread_count=2)
        paths = []
        for i, img in enumerate(images, start=1):
            img_path = out_dir / f"page_{i:03d}.png"
            img.save(img_path, format="PNG", optimize=True)
            paths.append(str(img_path))
            img.close()
        del images
        gc.collect()
        return paths
    except Exception as e:
        print(f"  ⚠ Error converting PDF {pdf_path}: {e}")
        raise

def extract_text_ocr(image_path: str) -> str:
    try:
        img = Image.open(image_path)
        text = pytesseract.image_to_string(img, lang="eng").strip()
        img.close()
        return text
    except Exception as e:
        print(f"  ⚠ OCR error on {image_path}: {e}")
        return ""

def embed_images_colpali(image_paths: List[str], batch_size: int = 2) -> List[np.ndarray]:
    """
    Generate multi-vector embeddings for ColPali.
    Returns: List of arrays, each with shape [num_patches, embedding_dim]
    """
    all_embeddings = []

    for i in range(0, len(image_paths), batch_size):
        batch = image_paths[i:i+batch_size]
        images = []

        try:
            images = [Image.open(p).convert("RGB") for p in batch]

            # Process images with proper tokens
            if USING_CLIP_FALLBACK:
                inputs = colpali_processor(images=images, return_tensors="pt", padding=True)
            else:
                texts = ["<image>" for _ in images]
                inputs = colpali_processor(
                    text=texts,
                    images=images,
                    return_tensors="pt",
                    padding=True
                )

            if device == "cuda":
                inputs = {k: (v.to(device) if hasattr(v, "to") else v) for k, v in inputs.items()}

            with torch.no_grad():
                if USING_CLIP_FALLBACK:
                    emb = colpali_model.get_image_features(**inputs)
                    # CLIP: single vector per image [batch, dim]
                    emb = emb.detach().cpu().float().numpy()
                    # Reshape to [batch, 1, dim] for consistency
                    emb = emb[:, np.newaxis, :]
                else:
                    out = colpali_model(**inputs)

                    # Extract embeddings from ColPali output
                    if hasattr(out, "last_hidden_state"):
                        emb = out.last_hidden_state
                    elif hasattr(out, "embeddings"):
                        emb = out.embeddings
                    elif isinstance(out, torch.Tensor):
                        emb = out
                    else:
                        emb = out[0] if isinstance(out, (tuple, list)) else out

                    # Keep multi-vector format [batch, num_patches, dim]
                    emb = emb.detach().cpu().float().numpy()

                    if len(emb.shape) == 2:
                        # Single vector - reshape to [batch, 1, dim]
                        emb = emb[:, np.newaxis, :]
                    elif len(emb.shape) != 3:
                        raise ValueError(f"Unexpected embedding shape: {emb.shape}")

            # Add each image's multi-vector embedding
            for img_emb in emb:
                # img_emb has shape [num_patches, dim]
                all_embeddings.append(img_emb)

        except Exception as e:
            print(f"  ⚠ Embedding error for batch {i//batch_size}: {e}")
            print(traceback.format_exc())
            # Fallback: single zero vector per image
            for _ in batch:
                all_embeddings.append(np.zeros((1, EMBEDDING_DIM), dtype=np.float32))

        finally:
            # Cleanup
            for img in images:
                try:
                    img.close()
                except:
                    pass
            del images
            if 'inputs' in locals():
                del inputs
            if 'out' in locals():
                del out
            if 'emb' in locals():
                del emb
            gc.collect()
            if device == "cuda":
                torch.cuda.empty_cache()

    return all_embeddings

def extract_entities(text: str) -> List[Dict]:
    try:
        doc = nlp(text[:10000])  # limit for speed
        return [{"text": e.text, "label": e.label_, "start": e.start_char, "end": e.end_char}
                for e in doc.ents]
    except Exception as e:
        print(f"  ⚠ Entity extraction error: {e}")
        return []

def extract_relations(text: str, entities: List[Dict]) -> List[Dict]:
    relations = []
    patterns = {
        "UPREGULATES": ["upregulate", "increase", "enhance", "promote", "stimulate"],
        "DOWNREGULATES": ["downregulate", "decrease", "inhibit", "suppress", "reduce"],
        "INTERACTS_WITH": ["interact", "bind", "associate", "complex"],
        "METABOLIZES": ["metabolize", "convert", "transform", "process"],
    }
    try:
        sentences = text.split('.')[:30]
        for sent in sentences:
            sent_lower = sent.lower()
            ents = [e for e in entities if e["text"].lower() in sent_lower]
            if len(ents) >= 2:
                for rel_type, keywords in patterns.items():
                    if any(kw in sent_lower for kw in keywords):
                        relations.append({
                            "subject": ents[0]["text"],
                            "relation": rel_type,
                            "object": ents[1]["text"],
                            "evidence": sent[:200]
                        })
                        break
                if len(relations) >= 20:
                    break
    except Exception as e:
        print(f"  ⚠ Relation extraction error: {e}")
    return relations

In [7]:
#@title 8) Neo4j ingestion helpers
def ingest_publication(pub_id: str, metadata: Dict):
    try:
        with neo4j_driver.session() as session:
            session.run("""
                MERGE (p:Publication {pub_id: $pub_id})
                SET p.total_pages = $pages,
                    p.processed_at = datetime()
            """, {"pub_id": pub_id, "pages": metadata.get("total_pages", 0)})
    except Exception as e:
        print(f"  ⚠ Publication ingestion error: {e}")

def ingest_page(page_id: str, pub_id: str, page_num: int, text: str):
    try:
        with neo4j_driver.session() as session:
            truncated_text = text[:30000] if text else ""
            session.run("""
                MERGE (pg:Page {page_id: $page_id})
                SET pg.pub_id = $pub_id,
                    pg.page_num = $page_num,
                    pg.text = $text,
                    pg.text_length = $text_length
                WITH pg
                MATCH (p:Publication {pub_id: $pub_id})
                MERGE (pg)-[:PART_OF]->(p)
            """, {
                "page_id": page_id,
                "pub_id": pub_id,
                "page_num": page_num,
                "text": truncated_text,
                "text_length": len(text)
            })
    except Exception as e:
        print(f"  ⚠ Page ingestion error: {e}")

def ingest_entity(entity: Dict, page_id: str):
    try:
        entity_id = f"{entity['label']}_{abs(hash(entity['text'])) % 10**9}"
        with neo4j_driver.session() as session:
            session.run("""
                MERGE (e:Entity {entity_id: $eid})
                SET e.name = $name, e.entity_type = $type
                WITH e
                MATCH (pg:Page {page_id: $page_id})
                MERGE (e)-[:MENTIONED_IN]->(pg)
            """, {
                "eid": entity_id,
                "name": entity["text"],
                "type": entity["label"],
                "page_id": page_id
            })
    except Exception:
        pass  # tolerate per-entity failures

def ingest_relation(rel: Dict, page_id: str, pub_id: str):
    try:
        with neo4j_driver.session() as session:
            session.run("""
                MERGE (s:Entity {name: $subj})
                MERGE (o:Entity {name: $obj})
                MERGE (s)-[r:RELATES {relation_type: $rel, source_pub: $pub}]->(o)
                SET r.evidence = $evidence
            """, {
                "subj": rel["subject"],
                "obj": rel["object"],
                "rel": rel["relation"],
                "pub": pub_id,
                "evidence": rel["evidence"]
            })
    except Exception:
        pass

In [8]:
#@title 9) Zilliz insertion helper (with explicit fields)
def insert_to_zilliz(data: List[Dict]):
    """Insert multi-vector embeddings (one row per patch)"""
    if not data:
        return

    try:
        zilliz_collection.load()

        # Flatten: each patch becomes a separate row
        rows_to_insert = []

        for item in data:
            embedding = item["embedding"]  # shape: [num_patches, dim]
            num_patches = embedding.shape[0]

            for patch_idx in range(num_patches):
                rows_to_insert.append({
                    "page_id": item["page_id"],
                    "pub_id": item["pub_id"],
                    "page_num": item["page_num"],
                    "patch_num": patch_idx,
                    "embedding": embedding[patch_idx].tolist(),
                    "image_path": item["image_path"]
                })

        # Insert in batches
        batch_size = 100
        for i in range(0, len(rows_to_insert), batch_size):
            batch = rows_to_insert[i:i+batch_size]

            insert_data = [
                [d["page_id"] for d in batch],
                [d["pub_id"] for d in batch],
                [d["page_num"] for d in batch],
                [d["patch_num"] for d in batch],
                [d["embedding"] for d in batch],
                [d["image_path"] for d in batch],
            ]

            zilliz_collection.insert(insert_data)

        zilliz_collection.flush()
        print(f"  ✓ Inserted {len(rows_to_insert)} patch vectors ({len(data)} pages) to Zilliz")

    except Exception as e:
        print(f"  ⚠ Zilliz insertion error: {e}")
        print(traceback.format_exc())

In [9]:
#@title 10) PDF processing pipeline
def process_pdf(pdf_path: str) -> bool:
    pub_id = Path(pdf_path).stem
    print(f"\n{'='*60}\nProcessing: {pub_id}\n{'='*60}")
    try:
        print("→ Converting to images...")
        images = pdf_to_images(pdf_path, dpi=DPI)
        print(f"  {len(images)} pages")

        print("→ Generating embeddings...")
        embeddings = embed_images_colpali(images, batch_size=BATCH_SIZE)
        if len(embeddings) != len(images):
            print(f"  ⚠ Embedding count mismatch: {len(embeddings)} vs {len(images)}")
            return False

        ingest_publication(pub_id, {"total_pages": len(images)})

        zilliz_data = []
        for i, (img_path, emb) in enumerate(zip(images, embeddings), start=1):
            page_id = f"{pub_id}_p{i}"
            print(f"  Page {i}/{len(images)}", end="")

            text = extract_text_ocr(img_path)
            ingest_page(page_id, pub_id, i, text)

            if text:
                entities = extract_entities(text)
                print(f" - {len(entities)} entities", end="")
                for entity in entities[:30]:
                    ingest_entity(entity, page_id)
                if i % 3 == 1:
                    relations = extract_relations(text, entities)
                    print(f" - {len(relations)} relations", end="")
                    for rel in relations:
                        ingest_relation(rel, page_id, pub_id)

            print()

            zilliz_data.append({
                "page_id": page_id,
                "pub_id": pub_id,
                "page_num": i,
                "embedding": emb,
                "image_path": img_path
            })

            if i % 10 == 0:
                gc.collect()
                if device == "cuda":
                    torch.cuda.empty_cache()

        print("→ Inserting to Zilliz...")
        insert_to_zilliz(zilliz_data)

        cleanup_temp_files(pub_id)
        print(f"✓ Completed: {pub_id}")
        return True

    except Exception as e:
        print(f"\n✗ ERROR processing {pub_id}: {e}")
        traceback.print_exc()
        cleanup_temp_files(pub_id)
        return False

def process_all(pdf_dir: str, limit: Optional[int] = None):
    pdfs = sorted(list(Path(pdf_dir).glob("*.pdf")))
    if limit:
        pdfs = pdfs[:limit]

    print(f"\n{'='*60}\nPROCESSING {len(pdfs)} PDFs\nTest Mode: {TEST_MODE}\n{'='*60}")
    success_count = 0
    failed_pdfs = []

    for i, pdf in enumerate(pdfs, 1):
        print(f"\n[{i}/{len(pdfs)}] - {pdf.name}")
        try:
            if process_pdf(str(pdf)):
                success_count += 1
            else:
                failed_pdfs.append(pdf.name)
        except Exception as e:
            print(f"CRITICAL ERROR: {e}")
            failed_pdfs.append(pdf.name)

        gc.collect()
        if device == "cuda":
            torch.cuda.empty_cache()

    print(f"\n{'='*60}\nPROCESSING SUMMARY\n{'='*60}")
    print(f"Successful: {success_count}/{len(pdfs)}")
    if failed_pdfs:
        print("Failed PDFs:")
        for pdf_name in failed_pdfs:
            print(f"  - {pdf_name}")

In [10]:
#@title 11) Run the ingestion pipeline
print("\n" + "="*60)
print("STARTING INGESTION PIPELINE")
print("="*60)
print(f"Mode: {'TEST' if TEST_MODE else 'PRODUCTION'}")
print(f"Limit: {LIMIT if LIMIT else 'No limit'}")
print(f"Dataset: {DATASET_PATH}")

if not Path(DATASET_PATH).exists():
    print(f"\n✗ ERROR: Dataset path not found: {DATASET_PATH}")
    print("Please check the path and try again.")
else:
    pdf_count = len(list(Path(DATASET_PATH).glob("*.pdf")))
    print(f"Found {pdf_count} PDFs in dataset")
    process_all(DATASET_PATH, limit=LIMIT)
    print("\n" + "="*60)
    print("INGESTION COMPLETE")
    print("="*60)


STARTING INGESTION PIPELINE
Mode: TEST
Limit: 30
Dataset: /content/pdfs
Found 30 PDFs in dataset

PROCESSING 30 PDFs
Test Mode: True

[1/30] - 10528_2010_Article_9411.pdf

Processing: 10528_2010_Article_9411
→ Converting to images...
  23 pages
→ Generating embeddings...




  Page 1/23 - 28 entities - 1 relations
  Page 2/23 - 74 entities
  Page 3/23 - 39 entities
  Page 4/23 - 29 entities - 0 relations
  Page 5/23 - 17 entities
  Page 6/23 - 34 entities
  Page 7/23 - 13 entities - 1 relations
  Page 8/23 - 37 entities
  Page 9/23 - 55 entities
  Page 10/23 - 21 entities - 2 relations
  Page 11/23 - 31 entities
  Page 12/23 - 39 entities
  Page 13/23 - 37 entities - 7 relations
  Page 14/23 - 34 entities
  Page 15/23 - 27 entities
  Page 16/23 - 42 entities - 4 relations
  Page 17/23 - 16 entities
  Page 18/23 - 44 entities
  Page 19/23 - 46 entities - 6 relations
  Page 20/23 - 97 entities
  Page 21/23 - 94 entities
  Page 22/23 - 92 entities - 3 relations
  Page 23/23 - 5 entities
→ Inserting to Zilliz...
  ✓ Inserted 23598 patch vectors (23 pages) to Zilliz
  ✓ Cleaned temp files for 10528_2010_Article_9411
✓ Completed: 10528_2010_Article_9411

[2/30] - 10552_2010_Article_9684.pdf

Processing: 10552_2010_Article_9684
→ Converting to images...
  10 page



  Page 1/10 - 62 entities - 1 relations
  Page 2/10 - 93 entities
  Page 3/10 - 55 entities
  Page 4/10 - 50 entities - 7 relations
  Page 5/10 - 30 entities
  Page 6/10 - 37 entities
  Page 7/10 - 62 entities - 0 relations
  Page 8/10 - 113 entities
  Page 9/10 - 174 entities
  Page 10/10 - 64 entities - 1 relations
→ Inserting to Zilliz...
  ✓ Inserted 10260 patch vectors (10 pages) to Zilliz
  ✓ Cleaned temp files for 10552_2010_Article_9684
✓ Completed: 10552_2010_Article_9684

[3/30] - 10616_2015_Article_9843.pdf

Processing: 10616_2015_Article_9843
→ Converting to images...
  13 pages
→ Generating embeddings...




  Page 1/13 - 53 entities - 2 relations
  Page 2/13 - 76 entities
  Page 3/13 - 83 entities
  Page 4/13 - 57 entities - 1 relations
  Page 5/13 - 78 entities
  Page 6/13 - 35 entities
  Page 7/13 - 42 entities - 1 relations
  Page 8/13 - 61 entities
  Page 9/13 - 42 entities
  Page 10/13 - 66 entities - 4 relations
  Page 11/13 - 68 entities
  Page 12/13 - 153 entities
  Page 13/13 - 148 entities - 5 relations
→ Inserting to Zilliz...
  ✓ Inserted 13338 patch vectors (13 pages) to Zilliz
  ✓ Cleaned temp files for 10616_2015_Article_9843
✓ Completed: 10616_2015_Article_9843

[4/30] - 11095_2022_Article_3191.pdf

Processing: 11095_2022_Article_3191
→ Converting to images...
  11 pages
→ Generating embeddings...




  Page 1/11 - 49 entities - 0 relations
  Page 2/11 - 79 entities
  Page 3/11 - 50 entities
  Page 4/11 - 49 entities - 0 relations
  Page 5/11 - 56 entities
  Page 6/11 - 35 entities
  Page 7/11 - 51 entities - 0 relations
  Page 8/11 - 48 entities
  Page 9/11 - 28 entities
  Page 10/11 - 137 entities - 0 relations
  Page 11/11 - 15 entities
→ Inserting to Zilliz...
  ✓ Inserted 11286 patch vectors (11 pages) to Zilliz
  ✓ Cleaned temp files for 11095_2022_Article_3191
✓ Completed: 11095_2022_Article_3191

[5/30] - 12217_2017_Article_9588.pdf

Processing: 12217_2017_Article_9588
→ Converting to images...
  14 pages
→ Generating embeddings...




  Page 1/14 - 29 entities - 3 relations
  Page 2/14 - 50 entities
  Page 3/14 - 46 entities
  Page 4/14 - 19 entities - 1 relations
  Page 5/14 - 75 entities
  Page 6/14 - 32 entities
  Page 7/14 - 54 entities - 12 relations
  Page 8/14 - 28 entities
  Page 9/14 - 41 entities
  Page 10/14 - 37 entities - 0 relations
  Page 11/14 - 61 entities
  Page 12/14 - 55 entities
  Page 13/14 - 160 entities - 0 relations
  Page 14/14 - 99 entities
→ Inserting to Zilliz...
  ✓ Inserted 14364 patch vectors (14 pages) to Zilliz
  ✓ Cleaned temp files for 12217_2017_Article_9588
✓ Completed: 12217_2017_Article_9588

[6/30] - 12864_2018_Article_4948.pdf

Processing: 12864_2018_Article_4948
→ Converting to images...
  14 pages
→ Generating embeddings...




  Page 1/14 - 20 entities - 2 relations
  Page 2/14 - 26 entities
  Page 3/14 - 19 entities
  Page 4/14 - 29 entities - 5 relations
  Page 5/14 - 42 entities
  Page 6/14 - 34 entities
  Page 7/14 - 48 entities - 12 relations
  Page 8/14 - 51 entities
  Page 9/14 - 51 entities
  Page 10/14 - 23 entities - 0 relations
  Page 11/14 - 29 entities
  Page 12/14 - 35 entities
  Page 13/14 - 124 entities - 0 relations
  Page 14/14 - 29 entities
→ Inserting to Zilliz...
  ✓ Inserted 14364 patch vectors (14 pages) to Zilliz
  ✓ Cleaned temp files for 12864_2018_Article_4948
✓ Completed: 12864_2018_Article_4948

[7/30] - 12864_2025_Article_11426.pdf

Processing: 12864_2025_Article_11426
→ Converting to images...
  15 pages
→ Generating embeddings...




  Page 1/15 - 44 entities - 3 relations
  Page 2/15 - 59 entities
  Page 3/15 - 28 entities
  Page 4/15 - 78 entities - 4 relations
  Page 5/15 - 30 entities
  Page 6/15 - 69 entities
  Page 7/15 - 45 entities - 7 relations
  Page 8/15 - 22 entities
  Page 9/15 - 22 entities
  Page 10/15 - 109 entities - 7 relations
  Page 11/15 - 69 entities
  Page 12/15 - 46 entities
  Page 13/15 - 81 entities - 2 relations
  Page 14/15 - 165 entities
  Page 15/15 - 152 entities
→ Inserting to Zilliz...
  ✓ Inserted 15390 patch vectors (15 pages) to Zilliz
  ✓ Cleaned temp files for 12864_2025_Article_11426
✓ Completed: 12864_2025_Article_11426

[8/30] - 12866_2018_Article_1325.pdf

Processing: 12866_2018_Article_1325
→ Converting to images...
  13 pages
→ Generating embeddings...




  Page 1/13 - 19 entities - 2 relations
  Page 2/13 - 50 entities
  Page 3/13 - 52 entities
  Page 4/13 - 54 entities - 0 relations
  Page 5/13 - 31 entities
  Page 6/13 - 47 entities
  Page 7/13 - 55 entities - 2 relations
  Page 8/13 - 56 entities
  Page 9/13 - 45 entities
  Page 10/13 - 51 entities - 0 relations
  Page 11/13 - 44 entities
  Page 12/13 - 134 entities
  Page 13/13 - 70 entities - 0 relations
→ Inserting to Zilliz...
  ✓ Inserted 13338 patch vectors (13 pages) to Zilliz
  ✓ Cleaned temp files for 12866_2018_Article_1325
✓ Completed: 12866_2018_Article_1325

[9/30] - 12866_2022_Article_2614.pdf

Processing: 12866_2022_Article_2614
→ Converting to images...
  20 pages
→ Generating embeddings...




  Page 1/20 - 22 entities - 3 relations
  Page 2/20 - 59 entities
  Page 3/20 - 36 entities
  Page 4/20 - 76 entities - 5 relations
  Page 5/20 - 105 entities
  Page 6/20 - 47 entities
  Page 7/20 - 33 entities - 1 relations
  Page 8/20 - 31 entities
  Page 9/20 - 52 entities
  Page 10/20 - 14 entities - 0 relations
  Page 11/20 - 32 entities
  Page 12/20 - 28 entities
  Page 13/20 - 59 entities - 5 relations
  Page 14/20 - 69 entities
  Page 15/20 - 50 entities
  Page 16/20 - 31 entities - 1 relations
  Page 17/20 - 55 entities
  Page 18/20 - 103 entities
  Page 19/20 - 174 entities - 0 relations
  Page 20/20 - 35 entities
→ Inserting to Zilliz...
  ✓ Inserted 20520 patch vectors (20 pages) to Zilliz
  ✓ Cleaned temp files for 12866_2022_Article_2614
✓ Completed: 12866_2022_Article_2614

[10/30] - 12870_2017_Article_1024 (1).pdf

Processing: 12870_2017_Article_1024 (1)
→ Converting to images...
  12 pages
→ Generating embeddings...




  Page 1/12 - 47 entities - 9 relations
  Page 2/12 - 86 entities
  Page 3/12 - 57 entities
  Page 4/12 - 45 entities - 3 relations
  Page 5/12 - 62 entities
  Page 6/12 - 46 entities
  Page 7/12 - 45 entities - 3 relations
  Page 8/12 - 65 entities
  Page 9/12 - 58 entities
  Page 10/12 - 121 entities - 13 relations
  Page 11/12 - 134 entities
  Page 12/12 - 183 entities
→ Inserting to Zilliz...
  ✓ Inserted 12312 patch vectors (12 pages) to Zilliz
  ✓ Cleaned temp files for 12870_2017_Article_1024 (1)
✓ Completed: 12870_2017_Article_1024 (1)

[11/30] - 12870_2017_Article_1024.pdf

Processing: 12870_2017_Article_1024
→ Converting to images...
  12 pages
→ Generating embeddings...




  Page 1/12 - 47 entities - 9 relations
  Page 2/12 - 86 entities
  Page 3/12 - 57 entities
  Page 4/12 - 45 entities - 3 relations
  Page 5/12 - 62 entities
  Page 6/12 - 46 entities
  Page 7/12 - 45 entities - 3 relations
  Page 8/12 - 65 entities
  Page 9/12 - 58 entities
  Page 10/12 - 121 entities - 13 relations
  Page 11/12 - 134 entities
  Page 12/12 - 183 entities
→ Inserting to Zilliz...
  ✓ Inserted 12312 patch vectors (12 pages) to Zilliz
  ✓ Cleaned temp files for 12870_2017_Article_1024
✓ Completed: 12870_2017_Article_1024

[12/30] - 12870_2017_Article_975.pdf

Processing: 12870_2017_Article_975
→ Converting to images...
  16 pages
→ Generating embeddings...




  Page 1/16 - 23 entities - 3 relations
  Page 2/16 - 61 entities
  Page 3/16 - 51 entities
  Page 4/16 - 34 entities - 2 relations
  Page 5/16 - 93 entities
  Page 6/16 - 32 entities
  Page 7/16 - 32 entities - 1 relations
  Page 8/16 - 58 entities
  Page 9/16 - 65 entities
  Page 10/16 - 71 entities - 7 relations
  Page 11/16 - 46 entities
  Page 12/16 - 69 entities
  Page 13/16 - 50 entities - 0 relations
  Page 14/16 - 133 entities
  Page 15/16 - 198 entities
  Page 16/16 - 115 entities - 1 relations
→ Inserting to Zilliz...
  ✓ Inserted 16416 patch vectors (16 pages) to Zilliz
  ✓ Cleaned temp files for 12870_2017_Article_975
✓ Completed: 12870_2017_Article_975

[13/30] - 12870_2020_Article_2392.pdf

Processing: 12870_2020_Article_2392
→ Converting to images...
  16 pages
→ Generating embeddings...




  Page 1/16 - 20 entities - 0 relations
  Page 2/16 - 24 entities
  Page 3/16 - 28 entities
  Page 4/16 - 17 entities - 0 relations
  Page 5/16 - 97 entities
  Page 6/16 - 21 entities
  Page 7/16 - 57 entities - 2 relations
  Page 8/16 - 47 entities
  Page 9/16 - 63 entities
  Page 10/16 - 49 entities - 2 relations
  Page 11/16 - 41 entities
  Page 12/16 - 61 entities
  Page 13/16 - 34 entities - 1 relations
  Page 14/16 - 16 entities
  Page 15/16 - 126 entities
  Page 16/16 - 88 entities - 0 relations
→ Inserting to Zilliz...
  ✓ Inserted 16416 patch vectors (16 pages) to Zilliz
  ✓ Cleaned temp files for 12870_2020_Article_2392
✓ Completed: 12870_2020_Article_2392

[14/30] - 12985_2024_Article_2374.pdf

Processing: 12985_2024_Article_2374
→ Converting to images...
  15 pages
→ Generating embeddings...




  Page 1/15 - 31 entities - 5 relations
  Page 2/15 - 88 entities
  Page 3/15 - 65 entities
  Page 4/15 - 67 entities - 0 relations
  Page 5/15 - 28 entities
  Page 6/15 - 62 entities
  Page 7/15 - 29 entities - 3 relations
  Page 8/15 - 63 entities
  Page 9/15 - 62 entities
  Page 10/15 - 49 entities - 2 relations
  Page 11/15 - 76 entities
  Page 12/15 - 17 entities
  Page 13/15 - 59 entities - 8 relations
  Page 14/15 - 197 entities
  Page 15/15 - 6 entities
→ Inserting to Zilliz...
  ✓ Inserted 15390 patch vectors (15 pages) to Zilliz
  ✓ Cleaned temp files for 12985_2024_Article_2374
✓ Completed: 12985_2024_Article_2374

[15/30] - 13059_2022_Article_2824.pdf

Processing: 13059_2022_Article_2824
→ Converting to images...
  19 pages
→ Generating embeddings...




  Page 1/19 - 22 entities - 2 relations
  Page 2/19 - 31 entities
  Page 3/19 - 19 entities
  Page 4/19 - 19 entities - 2 relations
  Page 5/19 - 15 entities
  Page 6/19 - 24 entities
  Page 7/19 - 34 entities - 2 relations
  Page 8/19 - 21 entities
  Page 9/19 - 35 entities
  Page 10/19 - 41 entities - 0 relations
  Page 11/19 - 28 entities
  Page 12/19 - 41 entities
  Page 13/19 - 47 entities - 8 relations
  Page 14/19 - 34 entities
  Page 15/19 - 15 entities
  Page 16/19 - 74 entities - 1 relations
  Page 17/19 - 136 entities
  Page 18/19 - 117 entities
  Page 19/19 - 97 entities - 0 relations
→ Inserting to Zilliz...
  ✓ Inserted 19494 patch vectors (19 pages) to Zilliz
  ✓ Cleaned temp files for 13059_2022_Article_2824
✓ Completed: 13059_2022_Article_2824

[16/30] - 1582-10.pdf

Processing: 1582-10
→ Converting to images...
  10 pages
→ Generating embeddings...




  Page 1/10 - 46 entities - 1 relations
  Page 2/10 - 47 entities
  Page 3/10 - 32 entities
  Page 4/10 - 58 entities - 0 relations
  Page 5/10 - 46 entities
  Page 6/10 - 53 entities
  Page 7/10 - 75 entities - 1 relations
  Page 8/10 - 61 entities
  Page 9/10 - 58 entities
  Page 10/10 - 116 entities - 0 relations
→ Inserting to Zilliz...
  ✓ Inserted 10260 patch vectors (10 pages) to Zilliz
  ✓ Cleaned temp files for 1582-10
✓ Completed: 1582-10

[17/30] - 1707.pdf

Processing: 1707
→ Converting to images...
  11 pages
→ Generating embeddings...




  Page 1/11 - 97 entities - 5 relations
  Page 2/11 - 121 entities
  Page 3/11 - 70 entities
  Page 4/11 - 93 entities - 5 relations
  Page 5/11 - 107 entities
  Page 6/11 - 72 entities
  Page 7/11 - 120 entities - 4 relations
  Page 8/11 - 117 entities
  Page 9/11 - 122 entities
  Page 10/11 - 177 entities - 2 relations
  Page 11/11 - 129 entities
→ Inserting to Zilliz...
  ✓ Inserted 11286 patch vectors (11 pages) to Zilliz
  ✓ Cleaned temp files for 1707
✓ Completed: 1707

[18/30] - 1749-799X-6-8.pdf

Processing: 1749-799X-6-8
→ Converting to images...
  8 pages
→ Generating embeddings...




  Page 1/8 - 74 entities - 5 relations
  Page 2/8 - 77 entities
  Page 3/8 - 75 entities
  Page 4/8 - 64 entities - 10 relations
  Page 5/8 - 71 entities
  Page 6/8 - 66 entities
  Page 7/8 - 172 entities - 2 relations
  Page 8/8 - 14 entities
→ Inserting to Zilliz...
  ✓ Inserted 8208 patch vectors (8 pages) to Zilliz
  ✓ Cleaned temp files for 1749-799X-6-8
✓ Completed: 1749-799X-6-8

[19/30] - 2044-5040-4-13.pdf

Processing: 2044-5040-4-13
→ Converting to images...
  13 pages
→ Generating embeddings...




  Page 1/13 - 71 entities - 6 relations
  Page 2/13 - 121 entities
  Page 3/13 - 93 entities
  Page 4/13 - 106 entities - 4 relations
  Page 5/13 - 98 entities
  Page 6/13 - 77 entities
  Page 7/13 - 81 entities - 7 relations
  Page 8/13 - 71 entities
  Page 9/13 - 63 entities
  Page 10/13 - 112 entities - 12 relations
  Page 11/13 - 63 entities
  Page 12/13 - 260 entities
  Page 13/13 - 155 entities - 5 relations
→ Inserting to Zilliz...
  ✓ Inserted 13338 patch vectors (13 pages) to Zilliz
  ✓ Cleaned temp files for 2044-5040-4-13
✓ Completed: 2044-5040-4-13

[20/30] - 2103.pdf

Processing: 2103
→ Converting to images...
  16 pages
→ Generating embeddings...




  Page 1/16 - 30 entities - 1 relations
  Page 2/16 - 66 entities
  Page 3/16 - 34 entities
  Page 4/16 - 43 entities - 4 relations
  Page 5/16 - 30 entities
  Page 6/16 - 24 entities
  Page 7/16 - 47 entities - 2 relations
  Page 8/16 - 58 entities
  Page 9/16 - 74 entities
  Page 10/16 - 85 entities - 2 relations
  Page 11/16 - 59 entities
  Page 12/16 - 39 entities
  Page 13/16 - 84 entities - 1 relations
  Page 14/16 - 58 entities
  Page 15/16 - 106 entities
  Page 16/16 - 106 entities - 3 relations
→ Inserting to Zilliz...
  ✓ Inserted 16416 patch vectors (16 pages) to Zilliz
  ✓ Cleaned temp files for 2103
✓ Completed: 2103

[21/30] - 3065-09.pdf

Processing: 3065-09
→ Converting to images...
  7 pages
→ Generating embeddings...




  Page 1/7 - 39 entities - 3 relations
  Page 2/7 - 37 entities
  Page 3/7 - 29 entities
  Page 4/7 - 24 entities - 0 relations
  Page 5/7 - 37 entities
  Page 6/7 - 78 entities
  Page 7/7 - 145 entities - 0 relations
→ Inserting to Zilliz...
  ✓ Inserted 7182 patch vectors (7 pages) to Zilliz
  ✓ Cleaned temp files for 3065-09
✓ Completed: 3065-09

[22/30] - 40168_2015_Article_116.pdf

Processing: 40168_2015_Article_116
→ Converting to images...
  18 pages
→ Generating embeddings...




  Page 1/18 - 23 entities - 2 relations
  Page 2/18 - 26 entities
  Page 3/18 - 12 entities
  Page 4/18 - 30 entities - 0 relations
  Page 5/18 - 35 entities
  Page 6/18 - 22 entities
  Page 7/18 - 7 entities - 0 relations
  Page 8/18 - 9 entities
  Page 9/18 - 12 entities
  Page 10/18 - 11 entities - 1 relations
  Page 11/18 - 29 entities
  Page 12/18 - 28 entities
  Page 13/18 - 36 entities - 4 relations
  Page 14/18 - 51 entities
  Page 15/18 - 42 entities
  Page 16/18 - 26 entities - 2 relations
  Page 17/18 - 129 entities
  Page 18/18 - 78 entities
→ Inserting to Zilliz...
  ✓ Inserted 18468 patch vectors (18 pages) to Zilliz
  ✓ Cleaned temp files for 40168_2015_Article_116
✓ Completed: 40168_2015_Article_116

[23/30] - 40168_2017_Article_280.pdf

Processing: 40168_2017_Article_280
→ Converting to images...
  16 pages
→ Generating embeddings...




  Page 1/16 - 21 entities - 0 relations
  Page 2/16 - 48 entities
  Page 3/16 - 34 entities
  Page 4/16 - 39 entities - 0 relations
  Page 5/16 - 24 entities
  Page 6/16 - 31 entities
  Page 7/16 - 43 entities - 0 relations
  Page 8/16 - 15 entities
  Page 9/16 - 16 entities
  Page 10/16 - 19 entities - 1 relations
  Page 11/16 - 32 entities
  Page 12/16 - 11 entities
  Page 13/16 - 17 entities - 0 relations
  Page 14/16 - 62 entities
  Page 15/16 - 128 entities
  Page 16/16 - 140 entities - 1 relations
→ Inserting to Zilliz...
  ✓ Inserted 16416 patch vectors (16 pages) to Zilliz
  ✓ Cleaned temp files for 40168_2017_Article_280
✓ Completed: 40168_2017_Article_280

[24/30] - 40168_2017_Article_330.pdf

Processing: 40168_2017_Article_330
→ Converting to images...
  2 pages
→ Generating embeddings...




  Page 1/2 - 12 entities - 0 relations
  Page 2/2 - 16 entities
→ Inserting to Zilliz...
  ✓ Inserted 2052 patch vectors (2 pages) to Zilliz
  ✓ Cleaned temp files for 40168_2017_Article_330
✓ Completed: 40168_2017_Article_330

[25/30] - 40168_2018_Article_609.pdf

Processing: 40168_2018_Article_609
→ Converting to images...
  1 pages
→ Generating embeddings...




  Page 1/1 - 12 entities - 1 relations
→ Inserting to Zilliz...
  ✓ Inserted 1026 patch vectors (1 pages) to Zilliz
  ✓ Cleaned temp files for 40168_2018_Article_609
✓ Completed: 40168_2018_Article_609

[26/30] - 40168_2019_Article_666.pdf

Processing: 40168_2019_Article_666
→ Converting to images...
  21 pages
→ Generating embeddings...




  Page 1/21 - 14 entities - 1 relations
  Page 2/21 - 47 entities
  Page 3/21 - 22 entities
  Page 4/21 - 40 entities - 1 relations
  Page 5/21 - 18 entities
  Page 6/21 - 20 entities
  Page 7/21 - 40 entities - 1 relations
  Page 8/21 - 23 entities
  Page 9/21 - 17 entities
  Page 10/21 - 11 entities - 0 relations
  Page 11/21 - 31 entities
  Page 12/21 - 31 entities
  Page 13/21 - 33 entities - 2 relations
  Page 14/21 - 47 entities
  Page 15/21 - 48 entities
  Page 16/21 - 33 entities - 3 relations
  Page 17/21 - 25 entities
  Page 18/21 - 35 entities
  Page 19/21 - 162 entities - 0 relations
  Page 20/21 - 159 entities
  Page 21/21 - 57 entities
→ Inserting to Zilliz...
  ✓ Inserted 21546 patch vectors (21 pages) to Zilliz
  ✓ Cleaned temp files for 40168_2019_Article_666
✓ Completed: 40168_2019_Article_666

[27/30] - 40168_2019_Article_724.pdf

Processing: 40168_2019_Article_724
→ Converting to images...
  18 pages
→ Generating embeddings...




  Page 1/18 - 19 entities - 1 relations
  Page 2/18 - 24 entities
  Page 3/18 - 31 entities
  Page 4/18 - 31 entities - 5 relations
  Page 5/18 - 34 entities
  Page 6/18 - 38 entities
  Page 7/18 - 104 entities - 0 relations
  Page 8/18 - 40 entities
  Page 9/18 - 32 entities
  Page 10/18 - 36 entities - 6 relations
  Page 11/18 - 38 entities
  Page 12/18 - 41 entities
  Page 13/18 - 26 entities - 0 relations
  Page 14/18 - 25 entities
  Page 15/18 - 36 entities
  Page 16/18 - 34 entities - 1 relations
  Page 17/18 - 152 entities
  Page 18/18 - 58 entities
→ Inserting to Zilliz...
  ✓ Inserted 18468 patch vectors (18 pages) to Zilliz
  ✓ Cleaned temp files for 40168_2019_Article_724
✓ Completed: 40168_2019_Article_724

[28/30] - 40168_2020_Article_830.pdf

Processing: 40168_2020_Article_830
→ Converting to images...
  14 pages
→ Generating embeddings...




  Page 1/14 - 24 entities - 0 relations
  Page 2/14 - 47 entities
  Page 3/14 - 31 entities
  Page 4/14 - 19 entities - 0 relations
  Page 5/14 - 32 entities
  Page 6/14 - 10 entities
  Page 7/14 - 47 entities - 2 relations
  Page 8/14 - 46 entities
  Page 9/14 - 26 entities
  Page 10/14 - 33 entities - 0 relations
  Page 11/14 - 43 entities
  Page 12/14 - 27 entities
  Page 13/14 - 157 entities - 1 relations
  Page 14/14 - 126 entities
→ Inserting to Zilliz...
  ✓ Inserted 14364 patch vectors (14 pages) to Zilliz
  ✓ Cleaned temp files for 40168_2020_Article_830
✓ Completed: 40168_2020_Article_830

[29/30] - 40168_2022_Article_1279.pdf

Processing: 40168_2022_Article_1279
→ Converting to images...
  16 pages
→ Generating embeddings...




  Page 1/16 - 16 entities - 5 relations
  Page 2/16 - 25 entities
  Page 3/16 - 6 entities
  Page 4/16 - 16 entities - 7 relations
  Page 5/16 - 30 entities
  Page 6/16 - 46 entities
  Page 7/16 - 17 entities - 0 relations
  Page 8/16 - 18 entities
  Page 9/16 - 38 entities
  Page 10/16 - 15 entities - 1 relations
  Page 11/16 - 48 entities
  Page 12/16 - 16 entities
  Page 13/16 - 21 entities - 3 relations
  Page 14/16 - 21 entities
  Page 15/16 - 76 entities
  Page 16/16 - 82 entities - 1 relations
→ Inserting to Zilliz...
  ✓ Inserted 16416 patch vectors (16 pages) to Zilliz
  ✓ Cleaned temp files for 40168_2022_Article_1279
✓ Completed: 40168_2022_Article_1279

[30/30] - 40168_2022_Article_1293 (1).pdf

Processing: 40168_2022_Article_1293 (1)
→ Converting to images...
  19 pages
→ Generating embeddings...




  Page 1/19 - 23 entities - 4 relations
  Page 2/19 - 22 entities
  Page 3/19 - 25 entities
  Page 4/19 - 15 entities - 0 relations
  Page 5/19 - 10 entities
  Page 6/19 - 36 entities
  Page 7/19 - 23 entities - 2 relations
  Page 8/19 - 38 entities
  Page 9/19 - 12 entities
  Page 10/19 - 4 entities - 0 relations
  Page 11/19 - 19 entities
  Page 12/19 - 3 entities
  Page 13/19 - 44 entities - 1 relations
  Page 14/19 - 51 entities
  Page 15/19 - 44 entities
  Page 16/19 - 22 entities - 0 relations
  Page 17/19 - 40 entities
  Page 18/19 - 59 entities
  Page 19/19 - 114 entities - 1 relations
→ Inserting to Zilliz...
  ✓ Inserted 19494 patch vectors (19 pages) to Zilliz
  ✓ Cleaned temp files for 40168_2022_Article_1293 (1)
✓ Completed: 40168_2022_Article_1293 (1)

PROCESSING SUMMARY
Successful: 30/30

INGESTION COMPLETE


In [None]:
#@title 12) Stats and sanity checks
def print_stats():
    print("\n" + "="*60)
    print("FINAL STATISTICS")
    print("="*60 + "\n")

    # Neo4j stats
    try:
        with neo4j_driver.session() as session:
            stats_queries = {
                "Publications": "MATCH (p:Publication) RETURN count(p) as count",
                "Pages": "MATCH (pg:Page) RETURN count(pg) as count",
                "Entities": "MATCH (e:Entity) RETURN count(e) as count",
                "Unique Entity Types": "MATCH (e:Entity) RETURN count(DISTINCT e.entity_type) as count",
                "Relations": "MATCH ()-[r:RELATES]->() RETURN count(r) as count",
                "Entity Mentions": "MATCH ()-[m:MENTIONED_IN]->() RETURN count(m) as count",
            }
            print("Neo4j Knowledge Graph:")
            for name, query in stats_queries.items():
                result = session.run(query)
                count = result.single()["count"]
                print(f"  {name:.<35} {count:>10,}")

            print("\nSample Entity Types:")
            result = session.run("""
                MATCH (e:Entity)
                RETURN DISTINCT e.entity_type as type, count(e) as count
                ORDER BY count DESC
                LIMIT 10
            """)
            for record in result:
                print(f"  - {record['type']:.<30} {record['count']:>8,}")
    except Exception as e:
        print(f"Neo4j stats error: {e}")

    # Zilliz stats
    try:
        zilliz_collection.load()
        print(f"\nZilliz Vector Store:")
        print(f"  {'Embeddings':.<35} {zilliz_collection.num_entities:>10,}")
        print(f"  {'Embedding Dimension':.<35} {EMBEDDING_DIM:>10,}")
    except Exception as e:
        print(f"Zilliz stats error: {e}")

    print("\n" + "="*60)

print_stats()

if device == "cuda":
    print(f"\nGPU Memory Usage:")
    print(f"  Allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
    print(f"  Reserved:  {torch.cuda.memory_reserved() / 1e9:.2f} GB")


FINAL STATISTICS

Neo4j Knowledge Graph:
  Publications.......................          0
  Pages..............................          0
  Entities...........................          0
  Unique Entity Types................          0




  Relations..........................          0
  Entity Mentions....................          0

Sample Entity Types:

Zilliz Vector Store:
  Embeddings.........................          0
  Embedding Dimension................        512


GPU Memory Usage:
  Allocated: 0.32 GB
  Reserved:  0.34 GB


In [None]:
#@title 13) Optional: vector search test
def test_vector_search(query_text: str = "protein synthesis", top_k: int = 5):
    print(f"\n=== TESTING VECTOR SEARCH ===")
    print(f"Query: {query_text}")

    from PIL import ImageDraw
    img = Image.new('RGB', (400, 100), color='white')
    draw = ImageDraw.Draw(img)
    draw.text((10, 40), query_text, fill='black')

    query_img_path = "/content/query.png"
    img.save(query_img_path)

    # Get multi-vector embedding
    query_embeddings = embed_images_colpali([query_img_path])[0]  # shape: [num_patches, dim]

    print(f"Query has {query_embeddings.shape[0]} patch vectors")

    # Search with each patch and aggregate results
    all_results = {}
    search_params = {"metric_type": "IP", "params": {"nprobe": 10}}

    zilliz_collection.load()

    # Search with each query patch
    for patch_idx, patch_emb in enumerate(query_embeddings):
        results = zilliz_collection.search(
            data=[patch_emb.tolist()],
            anns_field="embedding",
            param=search_params,
            limit=top_k * 3,  # Get more to aggregate
            output_fields=["page_id", "pub_id", "page_num", "patch_num"]
        )

        # Aggregate scores by page_id
        for hit in results[0]:
            page_id = hit.entity.get('page_id')
            if page_id not in all_results:
                all_results[page_id] = {
                    'page_id': page_id,
                    'pub_id': hit.entity.get('pub_id'),
                    'page_num': hit.entity.get('page_num'),
                    'max_score': hit.score,
                    'total_score': 0,
                    'hits': 0
                }
            all_results[page_id]['total_score'] += hit.score
            all_results[page_id]['hits'] += 1
            all_results[page_id]['max_score'] = max(all_results[page_id]['max_score'], hit.score)

    # Sort by max score (ColPali's MaxSim approach)
    sorted_results = sorted(all_results.values(), key=lambda x: x['max_score'], reverse=True)[:top_k]

    print(f"\nTop {top_k} Results (aggregated by MaxSim):")
    for i, result in enumerate(sorted_results, 1):
        print(f"{i}. Page: {result['page_id']}, MaxScore: {result['max_score']:.4f}, Hits: {result['hits']}")

    img.close()
    return sorted_results

# Uncomment to run
test_vector_search()

In [None]:
#@title 14) Optional: knowledge graph test
def test_knowledge_graph():
    print(f"\n=== TESTING KNOWLEDGE GRAPH ===")
    with neo4j_driver.session() as session:
        result = session.run("""
            MATCH (e:Entity)-[r]-()
            RETURN e.name as entity, e.entity_type as type, count(r) as connections
            ORDER BY connections DESC
            LIMIT 10
        """)
        print("\nMost Connected Entities:")
        for record in result:
            print(f"  {record['entity'][:30]:.<30} ({record['type']}) - {record['connections']} connections")

# Uncomment to run
# test_knowledge_graph()

In [None]:
#@title 15) Cleanup
print("\n=== CLEANUP ===")
try:
    neo4j_driver.close()
    print("✓ Neo4j connection closed")
except:
    pass

try:
    connections.disconnect("default")
    print("✓ Zilliz connection closed")
except:
    pass

if device == "cuda":
    torch.cuda.empty_cache()
    print("✓ GPU cache cleared")

print("\n" + "="*60)
print("ALL OPERATIONS COMPLETE")
print("="*60)
print("""
SUCCESS! Your data is now stored in:
- Neo4j: Knowledge graph with entities, relations, and document structure
- Zilliz: Vector embeddings for visual similarity search
""")


=== CLEANUP ===
✓ Neo4j connection closed
✓ Zilliz connection closed
✓ GPU cache cleared

ALL OPERATIONS COMPLETE

SUCCESS! Your data is now stored in:
- Neo4j: Knowledge graph with entities, relations, and document structure
- Zilliz: Vector embeddings for visual similarity search

