In [1]:
from multimodal_rag.components.ingest import ingest_documents
from multimodal_rag.components.image_embed import ImageEmbedder
from multimodal_rag.config.config import settings
from multimodal_rag.logger import logger


def main():
    """Run the image-embedding pipeline end to end and log a summary.

    Steps:
      1. Call ``ingest_documents`` with the configured docs and images
         directories, then combine its ``"pdf_images"`` and ``"images"``
         entries into a single list of image items.
      2. Embed those items with ``ImageEmbedder.embed_images``, which
         returns an ``(embeddings, metadatas)`` pair.
      3. Log the embedding count, the embedding dimension, and up to
         three metadata records.

    Returns early with a warning when no images are found, or when the
    embedder produces no embeddings (so the dimension cannot be read).
    """
    logger.info("===== IMAGE EMBEDDING PIPELINE =====")

    # --------------------------------------------------
    # 1. INGEST (images only)
    # --------------------------------------------------
    data = ingest_documents(
        docs_dir=settings.paths.docs_dir,
        images_dir=settings.paths.images_dir,
    )

    image_items = data["pdf_images"] + data["images"]

    logger.info(f"Total images found: {len(image_items)}")

    if not image_items:
        logger.warning("No images found. Exiting.")
        return

    # --------------------------------------------------
    # 2. EMBED IMAGES
    # --------------------------------------------------
    embedder = ImageEmbedder()
    embeddings, metadatas = embedder.embed_images(image_items)

    logger.info("===== IMAGE EMBEDDING SUMMARY =====")
    logger.info(f"Total embeddings generated: {len(embeddings)}")

    # Guard before indexing: a non-empty input does not guarantee a
    # non-empty output. Use len() rather than truthiness so the check is
    # also valid if `embeddings` is an array type.
    if len(embeddings) == 0:
        logger.warning("No embeddings generated. Exiting.")
        return

    logger.info(f"Embedding dimension: {embeddings[0].shape[0]}")

    # --------------------------------------------------
    # 3. SAMPLE OUTPUT
    # --------------------------------------------------
    logger.info("===== SAMPLE METADATA =====")
    # Show at most the first three metadata records.
    for metadata in metadatas[:3]:
        logger.info(metadata)

    logger.info("===== IMAGE EMBEDDING DONE =====")


# Run the pipeline only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()


[2026-02-09 17:29:04,550: INFO: common]: yaml file: /home/logan78/anvXnjack/multimodal_rag/config/config.yaml loaded successfully
[2026-02-09 17:29:04,554: INFO: config]: Configuration loaded successfully
[2026-02-09 17:29:12,324: INFO: 218185323]: ===== IMAGE EMBEDDING PIPELINE =====
[2026-02-09 17:29:12,327: INFO: ingest]: Text splitter initialized | chunk_size=2000, overlap=200
[2026-02-09 17:29:12,932: INFO: ingest]: Ingested 6 text chunks | 21 PDF images | 0 normal images
[2026-02-09 17:29:12,934: INFO: 218185323]: Total images found: 21
[2026-02-09 17:29:12,936: INFO: image_embed]: Loading image embedding model | model=ViT-H-14 | device=cpu
[2026-02-09 17:29:12,939: INFO: factory]: Parsing model identifier. Schema: None, Identifier: ViT-H-14
[2026-02-09 17:29:12,941: INFO: factory]: Loaded built-in ViT-H-14 model config.


open_clip_model.safetensors:   0%|          | 0.00/3.94G [00:00<?, ?B/s]

[2026-02-09 17:32:27,952: INFO: factory]: Instantiating model architecture: CLIP
[2026-02-09 17:32:41,752: INFO: factory]: Loading full pretrained weights from: /home/logan78/.cache/huggingface/hub/models--laion--CLIP-ViT-H-14-laion2B-s32B-b79K/snapshots/1c2b8495b28150b8a4922ee1c8edee224c284c0c/open_clip_model.safetensors
[2026-02-09 17:32:45,856: INFO: factory]: Final image preprocessing configuration set: {'size': (224, 224), 'mode': 'RGB', 'mean': (0.48145466, 0.4578275, 0.40821073), 'std': (0.26862954, 0.26130258, 0.27577711), 'interpolation': 'bicubic', 'resize_mode': 'shortest', 'fill_color': 0}
[2026-02-09 17:32:45,861: INFO: factory]: Model ViT-H-14 creation process complete.
[2026-02-09 17:32:45,888: INFO: image_embed]: Image embedding model loaded successfully
[2026-02-09 17:33:59,675: INFO: image_embed]: Generated image embeddings | vectors=21 | dim=1024
[2026-02-09 17:33:59,678: INFO: 218185323]: ===== IMAGE EMBEDDING SUMMARY =====
[2026-02-09 17:33:59,680: INFO: 218185323]