In [2]:
import os
import json
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM
from decord import VideoReader, cpu
from tqdm import tqdm


In [3]:
model_id = 'microsoft/Florence-2-base-ft'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    revision='refs/pr/6'
).to(device)
processor = AutoProcessor.from_pretrained(
    model_id,
    trust_remote_code=True,
    revision='refs/pr/6'
)



In [15]:
vr = VideoReader("/home/ACG/data/raw/videos/p5UIk5bhaCc.mp4", ctx=cpu(0))

In [16]:
def run_ocr_batch(frames, task_prompt="<OCR>"):
    """
    Runs OCR on a batch of frames.
    """
    with torch.no_grad():  # Ensure no gradients are stored
        inputs = processor(
            text=[task_prompt] * len(frames),
            images=frames,
            return_tensors="pt",
            padding=True
        ).to(device)

        generated_ids = model.generate(
            input_ids=inputs["input_ids"],
            pixel_values=inputs["pixel_values"],
            max_new_tokens=1024,
            early_stopping=False,
            do_sample=False,
            num_beams=3,
        )

        # Move tensors back to CPU to free GPU memory
        generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)
        parsed_answers = [
            processor.post_process_generation(
                answer, task=task_prompt, image_size=(frames[i].width, frames[i].height)
            )
            for i, answer in enumerate(generated_text)
        ]

        # Clean up GPU memory
        del inputs, generated_ids
        torch.cuda.empty_cache()  # Force free memory
        return parsed_answers

In [17]:
frames=[]

for frame_idx in range(5):
    
    frame = vr[frame_idx].asnumpy()
    frame_image = Image.fromarray(frame)
    frames.append(frame_image)

In [18]:
res = run_ocr_batch(frames)

In [19]:
res

[{'<OCR>': "ECBCO.UKRADORADORAADORathesayThe UK's largests re"},
 {'<OCR>': 'ECBCO.UKRADORADORAADORathesayThe UKs largests re'},
 {'<OCR>': "ECBCO.UKRADORADORAADORathesayThe UK's largest<pad><pad>"},
 {'<OCR>': "ECBCO.UKRADORADORathesayThe UK's largest.ire<pad><pad><pad>"},
 {'<OCR>': "ECBCO.UKThe UK's largestS<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>"}]