In [None]:
# 01_exploration.ipynb

## Purpose
"""
This notebook is used for exploratory analysis of the DocVision pipeline.
It demonstrates how document images and PDFs are processed through OCR,
Vision LLM classification, and visual cue detection, and allows inspection
of intermediate outputs for debugging and understanding system behavior.

This notebook is supplementary and does not contain core application logic.
"""

In [4]:
import sys
import os

PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)


In [5]:
import os
import asyncio
from pprint import pprint


from src.vision import classify_image
from src.visual_cues import detect_logos_from_bytes
from src.pdfconverter import pdf_to_images


  from .autonotebook import tqdm as notebook_tqdm
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Device set to use cpu


In [27]:
import os

SAMPLE_FILE = os.path.join("uploadss", "DOC-20240426-WA0004_251230_141315.pdf")
assert os.path.exists(SAMPLE_FILE), f"File not found: {SAMPLE_FILE}"
print("Using sample file:", SAMPLE_FILE)

Using sample file: uploadss\DOC-20240426-WA0004_251230_141315.pdf


In [15]:
if SAMPLE_FILE.lower().endswith(".pdf"):
    pdf_name = pdf_to_images(SAMPLE_FILE)
    image_dir = os.path.join("uploads", "images", pdf_name)
    pages = sorted(os.listdir(image_dir))
    image_path = os.path.join(image_dir, pages[0])
else:
    image_path = SAMPLE_FILE

print("Image selected for visual analysis:", image_path)


Image selected for visual analysis: uploads\images\DOC-20240426-WA0004_251230_141315\page_1.png


In [16]:
with open(image_path, "rb") as f:
    image_bytes = f.read()

logos = detect_logos_from_bytes(image_bytes)
pprint(logos)


[{'confidence': 0.81,
  'image_base64': 'iVBORw0KGgoAAAANSUhEUgAAANwAAABXCAIAAADgT7zNAACWRElEQVR4nDT9adB2a3YehK217mEPz/O8wzee74w9qLvVLalbalnIkiwbC2MHDCaYwTH5gx0SIPCDVGLyAyoVUxVIEapIKuWKiR1MUrEJCThxZAgecIRjychtua0e1Gp1n9Nn/M43vsMz7L3vaa3UtY+o7jp16nzD+zx73/carnVd1+L80VG7RQ6DDuxOxflQXu7ree9JKBKnaklk6ySIzkTEenPgDdMmqJhmjbex1ha2jll1p1SGZJmaydxkE0krXRMz66ddVNZ91urFS322yMPAUqR2GgK1Qo3UO7ma7e6GN8ZmtFBVFlGqardF+qjn3qeqx2TitLOwOLPZEtuoQoFEVYiLciCjwIn0ZvFnGzpbGpO95/R+4xj5OLuodj7Up4tYlEeJrbfcUSkmQvtszfROHz21VInNmtIpM3dla7Goa5ascPNMUe6RUpRnSa3a4JwJiYpj28Y2VypkAzti2ld21C6izZWW7PqhLNlvesnVUrYdniS3xtdKAzNVMZnNvDjOauOQ/BLV6Fh83PBFq+zsVIxcyGTaeMvcS7le7Gpxj3pX8eRInB1PGgM5x9Xcxaa9uOEzJ9tBTe2oEgOlhZh48FJNRQ1/TLUkYoc3kotzGb+hZfMdhQE/S4RZ2ZqKd9VauyK39T21nMiUmSyY1YlkoHZG7X0XH9EgpLW0yVnP9n5uFsJ9doHK3OjK5I7YxuHrklgieWZ631iEgvHiai1Wp+apLAuNflDPMZr3eG3JNJipSOfFmb/ccuh97I2dkFFUrrN10sjsuuapumpxF8KrMbSmwRtV7tm93FM+GteSs+Yq1PnCFhyNOEdOfMPDVBJtVOs82wlv2oVsxNo5JTJVaqVFIu44OY6a77U0et51UnryASeDSZ22mU1n5mbMtb9tnxwvZzRVR4u

In [20]:
## Observation 

"""
This exploration demonstrates visual cue detection in the DocVision pipeline.
Detected logos or seals are returned as cropped image regions encoded in base64
along with confidence scores. This step complements OCR and Vision LLM
classification by leveraging visual identifiers present in documents.

"""


'\nThis exploration demonstrates visual cue detection in the DocVision pipeline.\nDetected logos or seals are returned as cropped image regions encoded in base64\nalong with confidence scores. This step complements OCR and Vision LLM\nclassification by leveraging visual identifiers present in documents.\n\n'