In [20]:
import os
import torch
import json
import shutil
import base64
import matplotlib.pyplot as plt
import io
from PIL import Image
from tqdm import tqdm # A library for creating smart progress bars

# --- Make sure your utility functions are importable ---
from util.utils import get_som_labeled_img, check_ocr_box, get_caption_model_processor, get_yolo_model

# ==============================================================================
# 1. SETUP - Load the detection (YOLO) and caption (Florence-2) models
# ==============================================================================
print("Setting up models...")
# Use the GPU when torch reports CUDA support, otherwise run on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Detection model (YOLO), built from the checkpoint path and moved to `device`.
detection_model_path = 'weights/icon_detect/model.pt'
som_model = get_yolo_model(detection_model_path)
som_model.to(device)

# Caption model (Florence-2), created directly on the same device.
caption_model_processor = get_caption_model_processor(
    device=device,
    model_name_or_path="weights/icon_caption_florence",
    model_name="florence2",
)
print(f"Models loaded successfully on '{device}'.")

# ==============================================================================
# 2. CONFIGURATION - Define your input/output folders and settings
# ==============================================================================
# Where the extracted video frames are read from.
INPUT_FRAMES_DIR = "input_frames"
# Where annotated images and the per-frame structured data are written.
OUTPUT_IMAGE_DIR = "outputs/images"
OUTPUT_JSON_DIR = "outputs/json"

# Make sure both output folders exist before any frame is processed.
for _out_dir in (OUTPUT_IMAGE_DIR, OUTPUT_JSON_DIR):
    os.makedirs(_out_dir, exist_ok=True)

# Model inference settings: forwarded to get_som_labeled_img as BOX_TRESHOLD.
BOX_TRESHOLD = 0.05
# Note: box_overlay_ratio is now calculated inside the loop for each image

# ==============================================================================
# 3. BATCH PROCESSING - The main loop
# ==============================================================================
print("Starting batch processing...")
# Collect every image file in the input directory. os.listdir() returns
# entries in arbitrary order, so sort them to make the processing order (and
# the point at which an interrupted run resumes) deterministic.
image_files = sorted(
    f for f in os.listdir(INPUT_FRAMES_DIR)
    if f.lower().endswith(('.png', '.jpg', '.jpeg'))
)

# Loop through each file with a tqdm progress bar
for filename in tqdm(image_files, desc="Processing Frames"):
    try:
        base_filename = os.path.splitext(filename)[0]
        input_image_path = os.path.join(INPUT_FRAMES_DIR, filename)

        # Define where the output files for this frame will go
        output_image_path = os.path.join(OUTPUT_IMAGE_DIR, f"{base_filename}_annotated.jpg")
        output_json_path = os.path.join(OUTPUT_JSON_DIR, f"{base_filename}_data.json")

        # --- RESUME LOGIC ---
        # The JSON file is written last (see below), so its presence means the
        # frame was fully processed on an earlier run and can be skipped.
        if os.path.exists(output_json_path):
            continue

        # --- DYNAMIC BBOX CONFIG ---
        # Scale the drawing parameters with the frame's longest side, relative
        # to a 3200-pixel reference; the file is only opened to read its size.
        with Image.open(input_image_path) as temp_img:
            box_overlay_ratio = max(temp_img.size) / 3200

        draw_bbox_config = {
            'text_scale': 0.8 * box_overlay_ratio,
            'text_thickness': max(int(2 * box_overlay_ratio), 1),
            'text_padding': max(int(3 * box_overlay_ratio), 1),
            'thickness': max(int(3 * box_overlay_ratio), 1),
        }

        # --- OCR PASS ---
        ocr_bbox_rslt, _ = check_ocr_box(
            input_image_path,
            display_img=False,
            output_bb_format='xyxy',
            use_paddleocr=True,
            easyocr_args={'paragraph': False, 'text_threshold': 0.9}
        )
        # Fall back to empty text/box lists when OCR returned nothing.
        text, ocr_bbox = ocr_bbox_rslt if ocr_bbox_rslt is not None else ([], [])

        # --- MAIN PARSING ---
        # Keep the raw return value so a None result can be detected before
        # it is unpacked.
        result = get_som_labeled_img(
            input_image_path,
            som_model,
            BOX_TRESHOLD=BOX_TRESHOLD,
            output_coord_in_ratio=True,
            ocr_bbox=ocr_bbox,
            draw_bbox_config=draw_bbox_config,
            caption_model_processor=caption_model_processor,
            ocr_text=text,
            use_local_semantics=True,
            iou_threshold=0.7,
            scale_img=False,
            batch_size=128
        )

        if result is None:
            print(f"Skipping {filename}: No elements found.")
            continue

        annotated_image, label_coordinates, parsed_content_list = result

        # --- SAVE THE RESULTS ---
        # annotated_image is base64-encoded image data: decode it in memory
        # and let PIL write the file itself. (Previously the path was also
        # opened in text mode around the save, leaving a second, unused handle
        # on the same file.) JPEG cannot store an alpha channel, so normalise
        # to RGB before saving to the .jpg path.
        with Image.open(io.BytesIO(base64.b64decode(annotated_image))) as image:
            image.convert('RGB').save(output_image_path)

        # Write the structured data to a temporary file and rename it into
        # place. An interrupted run therefore never leaves a truncated JSON
        # file that the resume check above would mistake for a finished frame.
        tmp_json_path = output_json_path + ".tmp"
        with open(tmp_json_path, 'w') as f:
            json.dump(parsed_content_list, f, indent=4)
        os.replace(tmp_json_path, output_json_path)

    except Exception as e:
        # Best effort: report the failure and move on to the next frame.
        print(f"Failed to process {filename}. Error: {e}")

print("Batch processing complete!")

Setting up models...
Models loaded successfully on 'cuda'.
Starting batch processing...


Processing Frames:   0%|          | 0/2850 [00:00<?, ?it/s]


0: 736x1280 111 icons, 14.2ms
Speed: 3.2ms preprocess, 14.2ms inference, 0.7ms postprocess per image at shape (1, 3, 736, 1280)


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


len(filtered_boxes): 119 68
time to get parsed content: 0.14050030708312988


Processing Frames:   0%|          | 1/2850 [00:05<4:03:11,  5.12s/it]


0: 736x1280 110 icons, 13.7ms
Speed: 3.2ms preprocess, 13.7ms inference, 0.8ms postprocess per image at shape (1, 3, 736, 1280)


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


len(filtered_boxes): 113 53


Processing Frames:   0%|          | 2/2850 [00:10<4:01:58,  5.10s/it]

time to get parsed content: 0.25359582901000977

0: 736x1280 76 icons, 13.7ms
Speed: 3.2ms preprocess, 13.7ms inference, 0.7ms postprocess per image at shape (1, 3, 736, 1280)


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


len(filtered_boxes): 81 43
time to get parsed content: 0.19688200950622559


Processing Frames:   0%|          | 3/2850 [00:12<2:56:47,  3.73s/it]


0: 736x1280 172 icons, 13.7ms
Speed: 3.3ms preprocess, 13.7ms inference, 0.8ms postprocess per image at shape (1, 3, 736, 1280)


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


len(filtered_boxes): 168 80
time to get parsed content: 0.2469627857208252


Processing Frames:   0%|          | 4/2850 [00:19<3:58:00,  5.02s/it]


0: 736x1280 188 icons, 13.7ms
Speed: 3.4ms preprocess, 13.7ms inference, 0.8ms postprocess per image at shape (1, 3, 736, 1280)


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


len(filtered_boxes): 188 72
time to get parsed content: 0.276660680770874


Processing Frames:   0%|          | 5/2850 [00:28<5:10:20,  6.54s/it]


0: 736x1280 154 icons, 13.7ms
Speed: 3.3ms preprocess, 13.7ms inference, 0.8ms postprocess per image at shape (1, 3, 736, 1280)


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


len(filtered_boxes): 163 78
time to get parsed content: 0.18999862670898438


Processing Frames:   0%|          | 6/2850 [00:33<4:51:35,  6.15s/it]


0: 736x1280 52 icons, 13.7ms
Speed: 3.3ms preprocess, 13.7ms inference, 0.7ms postprocess per image at shape (1, 3, 736, 1280)


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


len(filtered_boxes): 58 41
time to get parsed content: 0.10229063034057617


Processing Frames:   0%|          | 7/2850 [00:39<4:44:48,  6.01s/it]


0: 736x1280 117 icons, 13.7ms
Speed: 3.4ms preprocess, 13.7ms inference, 0.8ms postprocess per image at shape (1, 3, 736, 1280)


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


len(filtered_boxes): 134 83
time to get parsed content: 0.20371484756469727


Processing Frames:   0%|          | 8/2850 [00:47<5:06:28,  6.47s/it]


0: 736x1280 145 icons, 13.7ms
Speed: 26.4ms preprocess, 13.7ms inference, 0.8ms postprocess per image at shape (1, 3, 736, 1280)


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


len(filtered_boxes): 149 44
time to get parsed content: 0.2633230686187744


Processing Frames:   0%|          | 9/2850 [01:05<8:06:24, 10.27s/it]


0: 736x1280 150 icons, 13.7ms
Speed: 3.4ms preprocess, 13.7ms inference, 0.8ms postprocess per image at shape (1, 3, 736, 1280)


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


len(filtered_boxes): 157 51
time to get parsed content: 0.31902098655700684


Processing Frames:   0%|          | 10/2850 [01:19<8:52:12, 11.24s/it]


0: 736x1280 189 icons, 13.7ms
Speed: 3.3ms preprocess, 13.7ms inference, 0.8ms postprocess per image at shape (1, 3, 736, 1280)


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


len(filtered_boxes): 197 99
time to get parsed content: 0.25031065940856934


Processing Frames:   0%|          | 11/2850 [01:29<8:35:20, 10.89s/it]


0: 736x1280 196 icons, 13.7ms
Speed: 3.3ms preprocess, 13.7ms inference, 0.8ms postprocess per image at shape (1, 3, 736, 1280)


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


len(filtered_boxes): 206 80
time to get parsed content: 0.2833876609802246


Processing Frames:   0%|          | 12/2850 [01:36<7:38:24,  9.69s/it]


0: 736x1280 108 icons, 13.7ms
Speed: 3.3ms preprocess, 13.7ms inference, 0.7ms postprocess per image at shape (1, 3, 736, 1280)


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


len(filtered_boxes): 125 64
time to get parsed content: 0.17770123481750488


Processing Frames:   0%|          | 13/2850 [01:40<6:26:14,  8.17s/it]


0: 736x1280 121 icons, 13.7ms
Speed: 3.2ms preprocess, 13.7ms inference, 0.7ms postprocess per image at shape (1, 3, 736, 1280)


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


len(filtered_boxes): 122 59
time to get parsed content: 0.15809226036071777


Processing Frames:   0%|          | 14/2850 [01:44<5:14:38,  6.66s/it]

## Do not run below this line!
Old code below, only for reference.

Refer to 'demo.ipynb' for full original script.

In [None]:
import os
import json
import torch

from util.utils import get_som_labeled_img, check_ocr_box, get_caption_model_processor, get_yolo_model
from ultralytics import YOLO
from PIL import Image
from tqdm import tqdm

print('Setting up models...')
# Fall back to the CPU when CUDA is unavailable instead of assuming a GPU;
# this matches the device selection used in the first cell of the notebook.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

detection_model_path = 'weights/icon_detect/model.pt'  # YOLO
som_model = get_yolo_model(detection_model_path)
som_model.to(device)

print('model to {}'.format(device))

In [None]:
# Caption model (Florence-2), placed on the device chosen in the setup cell.
caption_model_processor = get_caption_model_processor(
    device=device,
    model_name_or_path="weights/icon_caption_florence",
    model_name='florence2',
)
print(f'Models loaded successfully on {device}')

In [None]:
# Input folder plus the two output folders used by the batch loop.
INPUT_FRAMES_DIR = 'input_frames'
OUTPUT_IMAGE_DIR = 'outputs/images'
OUTPUT_JSON_DIR = 'outputs/json'

# Create the output folders; an already existing folder is not an error.
for _out_dir in (OUTPUT_IMAGE_DIR, OUTPUT_JSON_DIR):
    os.makedirs(_out_dir, exist_ok=True)

In [None]:
# Model inference settings
BOX_TRESHOLD = 0.05

# Fixed drawing parameters for the annotated image (not scaled per frame).
draw_bbox_config = dict(
    text_scale=0.8,
    text_thickness=1,
    text_padding=1,
    thickness=1,
)

In [None]:
import base64
import importlib
import io
import utils
import shutil
# NOTE(review): the helpers used below are imported from `util.utils`, so
# reloading a top-level `utils` module does not rebind them — confirm whether
# this import/reload is still needed.
importlib.reload(utils)

print("Starting batch processing...")

# Every image file in the input directory.
image_files = [f for f in os.listdir(INPUT_FRAMES_DIR) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]

for filename in tqdm(image_files, desc="Processing Frames"):
    try:
        base_filename = os.path.splitext(filename)[0]
        input_image_path = os.path.join(INPUT_FRAMES_DIR, filename)

        output_image_path = os.path.join(OUTPUT_IMAGE_DIR, f"{base_filename}_annotated.png")
        output_json_path = os.path.join(OUTPUT_JSON_DIR, f"{base_filename}_data.json")

        # Resume support: a frame whose JSON already exists is skipped.
        if os.path.exists(output_json_path):
            continue

        # OCR is disabled in this version; empty lists are passed instead.
        # ocr_bbox_result, _ = check_ocr_box(
        #     input_image_path,
        #     display_img=False,
        #     output_bb_format='xyxy',
        #     use_paddleocr=True,
        # )
        ocr_bbox = [ ]
        text = [ ]

        annotated_image, label_coordinates, parsed_content_list = get_som_labeled_img(
            input_image_path,
            som_model,
            BOX_TRESHOLD=BOX_TRESHOLD,
            output_coord_in_ratio=True,
            ocr_bbox=ocr_bbox,
            draw_bbox_config=draw_bbox_config,
            caption_model_processor=caption_model_processor,
            ocr_text=text,
            use_local_semantics=True,
            iou_threshold=0.7,
            scale_img=False,
            batch_size=128,
        )

        # annotated_image is base64-encoded image data, not a file path (the
        # other cells of this notebook decode it with base64.b64decode), so
        # it cannot be moved with shutil.move. Decode it in memory and let PIL
        # write the PNG instead.
        with Image.open(io.BytesIO(base64.b64decode(annotated_image))) as save_image:
            save_image.save(output_image_path)

        with open(output_json_path, 'w') as f:
            json.dump(parsed_content_list, f, indent=4)

    except Exception as e:
        # Best effort: report the failure and continue with the next frame.
        print(f"Failed to process {filename}. Error: {e}")
        continue

print('Batch processing complete!')

In [None]:
# Caption model options: a fine-tuned blip2 or florence2 (florence2 is used here).
import importlib
# import util.utils
# importlib.reload(utils)
from util.utils import (
    get_som_labeled_img,
    check_ocr_box,
    get_caption_model_processor,
    get_yolo_model,
)

caption_model_processor = get_caption_model_processor(
    device=device,
    model_name_or_path="weights/icon_caption_florence",
    model_name="florence2",
)



In [None]:
# Show the detector's current device together with its concrete class.
(som_model.device, type(som_model))

In [None]:
# reload utils
import importlib
import utils
importlib.reload(utils)
import time
# from utils import get_som_labeled_img, check_ocr_box, get_caption_model_processor, get_yolo_model

# Candidate input images; exactly one assignment is left uncommented.
# image_path = 'imgs/google_page.png'
# image_path = 'imgs/windows_home.png'
# image_path = 'imgs/windows_multitab.png'
# image_path = 'imgs/omni3.jpg'
# image_path = 'imgs/ios.png'
# image_path = 'imgs/word.png'
# image_path = 'imgs/excel2.png'
# image_path = '/home/oberon/projects/OmniParser/imgs/frame_022500_game_1.jpg'
# image_path = '/home/oberon/projects/OmniParser/imgs/frame_023100_game_1.jpg'
# image_path = 'imgs/frame_025200_game_1.jpg'
image_path = 'imgs/frame_027300_game_1.jpg'

image = Image.open(image_path)
image_rgb = image.convert('RGB')
print('image size:', image.size)

# Drawing parameters scale with the longest image side relative to 3200 px;
# the integer-valued ones are clamped to a minimum of 1.
box_overlay_ratio = max(image.size) / 3200
draw_bbox_config = dict(
    text_scale=0.8 * box_overlay_ratio,
    text_thickness=max(int(2 * box_overlay_ratio), 1),
    text_padding=max(int(3 * box_overlay_ratio), 1),
    thickness=max(int(3 * box_overlay_ratio), 1),
)
BOX_TRESHOLD = 0.05

# Timestamps are taken before OCR, after OCR, and after parsing/captioning.
start = time.time()
ocr_bbox_rslt, is_goal_filtered = check_ocr_box(
    image_path,
    display_img=False,
    output_bb_format='xyxy',
    goal_filtering=None,
    easyocr_args={'paragraph': False, 'text_threshold': 0.9},
    use_paddleocr=True,
)
text, ocr_bbox = ocr_bbox_rslt
cur_time_ocr = time.time()

dino_labled_img, label_coordinates, parsed_content_list = get_som_labeled_img(
    image_path,
    som_model,
    BOX_TRESHOLD=BOX_TRESHOLD,
    output_coord_in_ratio=True,
    ocr_bbox=ocr_bbox,
    draw_bbox_config=draw_bbox_config,
    caption_model_processor=caption_model_processor,
    ocr_text=text,
    use_local_semantics=True,
    iou_threshold=0.7,
    scale_img=False,
    batch_size=128,
)
cur_time_caption = time.time()


In [None]:
# Display dino_labled_img, which holds base64-encoded image data.
import io
import base64
import matplotlib.pyplot as plt

plt.figure(figsize=(15, 15))

# base64 text -> raw bytes -> in-memory file -> PIL image
_decoded_bytes = base64.b64decode(dino_labled_img)
image = Image.open(io.BytesIO(_decoded_bytes))

plt.axis('off')
plt.imshow(image)
# print(len(parsed_content_list))


In [None]:
import pandas as pd

# One row per parsed element, plus a running 0-based ID as the last column.
df = pd.DataFrame(parsed_content_list).assign(
    ID=lambda frame: range(len(frame))
)

df

In [None]:
# Display the raw parsed content list returned by get_som_labeled_img.
parsed_content_list

In [None]:
# Write the element table to CSV; the DataFrame index is left out.
df.to_csv(path_or_buf='UE5_menu_viewport_2.csv', index=False)