# Survey Plan AI - Document Understanding

This notebook runs the Document Understanding pipeline using the Donut model. It expects a JSON file named `marketing_plan_assets.json`, containing the bounding-box detections produced by the YOLO inference step, to be available in your Google Drive.

In [None]:
# 1. Mount Google Drive (Data & Results)
# Attach the user's Drive under /content/drive; later cells look for the
# YOLO JSON output beneath /content/drive/MyDrive.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

In [None]:
# 2. Deploy Code (Git Clone/Pull)
import os
import shutil

REPO_URL = "https://github.com/14jadon14/survey-plan-AI"
REPO_NAME = REPO_URL.split("/")[-1].replace(".git", "")
CODE_DIR = f"/content/{REPO_NAME}"

if os.path.exists(CODE_DIR):
    print("[INFO] Updating code...")
    !cd "{CODE_DIR}" && git pull
else:
    print("[INFO] Cloning repository...")
    !git clone "{REPO_URL}"

print(f"[SUCCESS] Code deployed to: {CODE_DIR}")

In [None]:
# 3. Setup Environment
# Install the Python dependencies listed in the document_understanding
# module's requirements.txt. This reads CODE_DIR, which is defined by the
# code-deployment cell (step 2), so that cell must have been run first.
!pip install -r "{CODE_DIR}/document_understanding/requirements.txt"

In [None]:
# 4. Locate YOLO Inference Output (JSON)
# Sets `json_path` to the detections file found in Drive, or None if no
# candidate location contains it.
import glob
import os

# Candidate locations where the YOLO inference step may have saved the JSON,
# in priority order. Adjust these if your Drive uses a different layout.
DRIVE_ROOT = "/content/drive/MyDrive"
SEARCH_PATHS = [
    f"{DRIVE_ROOT}/marketing_plan_assets.json",
    f"{DRIVE_ROOT}/SurveyPlan AI/marketing_plan_assets.json",
    f"{DRIVE_ROOT}/SurveyPlan/marketing_plan_assets.json",
    f"{DRIVE_ROOT}/**/marketing_plan_assets.json",  # Recursive fallback (slow)
]


def find_first_match(patterns):
    """Return the first regular file matching ``patterns``, or None.

    Patterns are tried in the order given and the first one that yields at
    least one file wins. Within a pattern, directories are ignored and the
    remaining paths are sorted, so the choice is reproducible even though
    glob returns results in arbitrary filesystem order.
    """
    for pattern in patterns:
        candidates = sorted(
            path
            for path in glob.glob(pattern, recursive=True)
            if os.path.isfile(path)
        )
        if not candidates:
            continue
        if len(candidates) > 1:
            # Make the ambiguity visible instead of silently picking one copy.
            print(f"[WARN] {len(candidates)} files match '{pattern}'; using the first in sorted order.")
        return candidates[0]
    return None


print("[INFO] Searching for 'marketing_plan_assets.json' in Drive...")
json_path = find_first_match(SEARCH_PATHS)

if json_path:
    print(f"[SUCCESS] Found JSON file at: {json_path}")
else:
    print("[WARN] JSON file not found automatically. Please specify the path manually below.")
    # json_path = "/content/drive/MyDrive/path/to/marketing_plan_assets.json"

In [None]:
# 5. Run Document Parsing Pipeline

if json_path and os.path.exists(json_path):
    print(f"[INFO] Running document parser on {json_path}...")
    
    # The script expects the code to be in python path or relative imports to work
    # We run it as a module or script setting PYTHONPATH
    !export PYTHONPATH="{CODE_DIR}" && python "{CODE_DIR}/document_understanding/run_on_image.py" --json "{json_path}"
    
    print("[SUCCESS] Processing complete.")
else:
    print("[ERROR] Cannot run pipeline: JSON path is invalid or file does not exist.")