In [1]:
import sys, os

# get project root (one folder up from notebooks/)
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
import os
from modules.preprocessing import preprocessing_image
from modules.ocr_engine import run_best_ocr
from modules.vision_signals import extract_vision_signals
from modules.fusion import fuse_signals
from modules.embeddings import build_faiss_index, load_vectorstore
from modules.rag_pipeline import rag_query


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
img_path = r"C:\Users\Dell\OneDrive\Pictures\Mobilenetv2.png"
image_id = os.path.splitext(os.path.basename(img_path))[0]


In [4]:
versions = preprocessing_image(img_path)
ocr_result = run_best_ocr(img_path, preprocessed_img=versions["enhanced"])
print("OCR Engine:", ocr_result["engine"])
print("OCR Results:", ocr_result["results"][:3])  # preview first 3


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
[32mCreating model: ('PP-LCNet_x1_0_doc_ori', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\Dell\.paddlex\official_models\PP-LCNet_x1_0_doc_ori`.[0m
[32mCreating model: ('UVDoc', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\Dell\.paddlex\official_models\UVDoc`.[0m
[32mCreating model: ('PP-LCNet_x1_0_textline_ori', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\Dell\.paddlex\official_models\PP-LCNet_x1_0_textline_ori`.[0m
[32mCreating model: ('PP-OCRv5_server_det', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\Dell\.paddlex\official_models\PP-OCRv5_server_det`.[0m
[32m

OCR Engine: EasyOCR
OCR Results: [{'text': 'n = 1280', 'conf': 0.9979261928051756, 'bbox': [[np.int32(757), np.int32(25)], [np.int32(827), np.int32(25)], [np.int32(827), np.int32(45)], [np.int32(757), np.int32(45)]]}, {'text': 'n =', 'conf': 0.5405014157295227, 'bbox': [[np.int32(339), np.int32(87)], [np.int32(367), np.int32(87)], [np.int32(367), np.int32(101)], [np.int32(339), np.int32(101)]]}, {'text': '32', 'conf': 0.9999996628252286, 'bbox': [[np.int32(365), np.int32(83)], [np.int32(389), np.int32(83)], [np.int32(389), np.int32(103)], [np.int32(365), np.int32(103)]]}]


In [5]:
vision_result = extract_vision_signals(img_path)
print("Vision Result:", vision_result)



image 1/1 C:\Users\Dell\OneDrive\Pictures\Mobilenetv2.png: 288x640 (no detections), 139.0ms
Speed: 4.4ms preprocess, 139.0ms inference, 13.4ms postprocess per image at shape (1, 3, 288, 640)


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 15.20it/s]


Vision Result: {'objects': [], 'qr_codes': [], 'caption': 'a diagram of the cell membrane'}


In [6]:
fusion_text = fuse_signals(
    image_id=image_id,
    ocr_results=ocr_result["results"],
    vision_results=vision_result
)
print("Fusion Text:\n", fusion_text)


Fusion Text:
 {'fusion_text': 'This image contains:\n    - OCR text: n = 1280 n = 32 n = 96 n = 1280 128x128x3 128x128 64 x 64 32 x 32 Softmax Fully Connected Classifier MobileNetv2 Preprocessing 3x3 Conv, ReLU Max pool 2x2 [\n    - Detected objects: None\n    - QR codes: None\n    - Caption: a diagram of the cell membrane', 'fusion_json': {'id': 'Mobilenetv2', 'ocr_text': 'n = 1280 n = 32 n = 96 n = 1280 128x128x3 128x128 64 x 64 32 x 32 Softmax Fully Connected Classifier MobileNetv2 Preprocessing 3x3 Conv, ReLU Max pool 2x2 [', 'objects': [], 'qr_codes': [], 'caption': 'a diagram of the cell membrane'}}


In [7]:
build_faiss_index()


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Batches: 100%|██████████| 1/1 [00:00<00:00,  8.69it/s]

✅ Saved index to data/vectorstore





In [8]:
retriever = load_vectorstore()
question = "What does the diagram show?"
answer, retrieved = rag_query(question, retriever)

print("Q:", question)
print("A:", answer)
print("Retrieved Context:", retrieved)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Device set to use cpu


Q: What does the diagram show?
A: cell membrane [ Objects: QR: Caption: a diagram of the cell membrane Image ID: receipt_001 OCR: n = 1280 n = 32 n = 96 n = 1280 128x128x3 128x128 64 x 64 32 x 32 Softmax Fully Connected Classifier MobileNetv2 Preprocessing 3x3 Conv, ReLU Max pool 2x2 [ Objects: QR: Caption: a diagram of the cell membrane Image ID: receipt_001 OCR: n = 1280 n = 32 n = 96 n = 1280 128x128x3 128x128 64 x 64 32 x 32 Softmax Fully Connected Classifier MobileNetv2 Preprocessing 3x3 Conv, ReLU Max pool 2x2 [ Objects: QR: Caption: a
Retrieved Context: [{'score': 0.27438050508499146, 'chunk_id': 'receipt_001_c0', 'image_id': 'receipt_001', 'text': 'Image ID: receipt_001\n        OCR: n = 1280 n = 32 n = 96 n = 1280 128x128x3 128x128 64 x 64 32 x 32 Softmax Fully Connected Classifier MobileNetv2 Preprocessing 3x3 Conv, ReLU Max pool 2x2 [\n        Objects: \n        QR: \n        Caption: a diagram of the cell membrane'}, {'score': 0.25264376401901245, 'chunk_id': 'Mobilenetv2_c