In [44]:
import cv2
import requests
from PIL import Image, ImageDraw
from transformers import AutoModelForCausalLM, AutoProcessor
import numpy as np
import random
from skimage.metrics import structural_similarity as ssim
from time import time

# Create the local cache directory for the Florence-2 weights (notebook shell escape), then load the model below
!mkdir -p my_models/Florence_2

# Load the fine-tuned Florence-2 large checkpoint onto the GPU. Downloads are
# cached under /content/my_models/Florence_2. trust_remote_code=True is passed
# so that transformers may run the model code published with the checkpoint.
# NOTE(review): the absolute /content prefix presumably targets Colab — confirm
# it matches the directory created by the mkdir shell escape.
model = AutoModelForCausalLM.from_pretrained("microsoft/Florence-2-large-ft",
                                             cache_dir="/content/my_models/Florence_2",
                                             device_map="cuda",
                                             trust_remote_code=True)

# Matching processor for the same checkpoint; it builds the model inputs and
# decodes / post-processes the generated ids in process_frame_with_florence.
# NOTE(review): device_map is passed here as well; it is presumably not needed
# by the processor — confirm.
processor = AutoProcessor.from_pretrained("microsoft/Florence-2-large-ft",
                                             cache_dir="/content/my_models/Florence_2",
                                             device_map="cuda",
                                             trust_remote_code=True)

def extract_frames(video_path, target_fps=20):
    """Read a video file and return a subsample of its frames.

    Every ``frame_interval``-th frame is kept, where ``frame_interval`` is the
    source frame rate divided by ``target_fps``, truncated to an integer and
    never smaller than 1.

    Args:
        video_path: Path (or any source accepted by ``cv2.VideoCapture``) of
            the video to read.
        target_fps: Desired sampling rate in frames per second. Must be
            positive.

    Returns:
        A list of the kept frames, exactly as returned by ``cap.read()``.
        The list is empty when the video cannot be opened or has no frames.

    Raises:
        ValueError: If ``target_fps`` is not positive.
    """
    if target_fps <= 0:
        raise ValueError(f"target_fps must be positive, got {target_fps!r}")

    cap = cv2.VideoCapture(video_path)
    try:
        frame_rate = cap.get(cv2.CAP_PROP_FPS)
        # The truncated ratio is 0 whenever the source is slower than
        # target_fps, and some backends report 0 (or NaN) for the frame rate.
        # A zero interval would make the modulo below raise, so fall back to
        # keeping every frame in those cases.
        if frame_rate > 0:
            frame_interval = max(1, int(frame_rate / target_fps))
        else:
            frame_interval = 1

        frames = []
        frame_count = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            if frame_count % frame_interval == 0:
                frames.append(frame)
            frame_count += 1
    finally:
        # Release the capture even if reading fails part-way through.
        cap.release()
    return frames

def compute_similarity(frame1, frame2):
    """Return the SSIM score of two BGR frames, compared in grayscale."""
    first_gray, second_gray = (
        cv2.cvtColor(bgr_frame, cv2.COLOR_BGR2GRAY)
        for bgr_frame in (frame1, frame2)
    )
    score = ssim(first_gray, second_gray)
    return score

def process_frame_with_florence(frame):
    """Run Florence-2 region-aware OCR on a single BGR frame.

    Args:
        frame: Image array in OpenCV BGR channel order.

    Returns:
        A ``(ocr_result, image, processing_time)`` tuple: the parsed
        ``<OCR_WITH_REGION>`` entry of the post-processed answer, the frame as
        an RGB PIL image, and the wall-clock seconds spent in
        ``model.generate``.
    """
    prompt = '<OCR_WITH_REGION>'
    rgb_pixels = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    image = Image.fromarray(rgb_pixels)

    encoded = processor(text=prompt, images=image, return_tensors="pt")
    encoded = encoded.to("cuda:0")

    # Only the generation call is timed; preprocessing and decoding are not.
    generation_started = time()
    output_ids = model.generate(
        input_ids=encoded["input_ids"],
        pixel_values=encoded["pixel_values"],
        max_new_tokens=4096,
        num_beams=3
    )
    elapsed = time() - generation_started

    # Special tokens are kept because the post-processor parses them.
    decoded = processor.batch_decode(output_ids, skip_special_tokens=False)
    raw_text = decoded[0]
    answer = processor.post_process_generation(
        raw_text,
        task=prompt,
        image_size=(image.width, image.height),
    )
    return answer['<OCR_WITH_REGION>'], image, elapsed

def draw_ocr_bboxes(image, prediction):
    """Outline each OCR quadrilateral of ``prediction`` on ``image``.

    The image is drawn on in place and also returned. Each quadrilateral gets
    a randomly chosen outline colour. Labels are paired with the boxes but are
    not rendered.

    Args:
        image: PIL image to draw on.
        prediction: Mapping with ``'quad_boxes'`` and ``'labels'`` entries.

    Returns:
        The same ``image`` object, with the outlines drawn.
    """
    palette = ['blue', 'orange', 'green', 'purple', 'brown', 'pink', 'gray', 'olive', 'cyan', 'red',
               'lime', 'indigo', 'violet', 'aqua', 'magenta', 'coral', 'gold', 'tan', 'skyblue']
    factor = 1
    canvas = ImageDraw.Draw(image)
    quads = prediction['quad_boxes']
    texts = prediction['labels']
    # zip stops at the shorter list, so a box without a label is not drawn.
    for quad, _text in zip(quads, texts):
        outline_color = random.choice(palette)
        scaled_quad = (np.array(quad) * factor).tolist()
        canvas.polygon(scaled_quad, outline=outline_color, width=3)
    return image

def main(video_path):
    """OCR the sampled frames of a video, skipping near-duplicate frames.

    A frame is skipped when its SSIM against the last *processed* frame is
    above 0.95. Every other frame is run through Florence-2; its result and
    timing are printed and its annotated image is shown. Overall timing
    figures are printed at the end.

    Args:
        video_path: Path of the video passed on to ``extract_frames``.
    """
    frames = extract_frames(video_path)
    reference_frame = None
    generation_times = []
    run_started = time()

    for index, current in enumerate(frames):
        # Adjust threshold as needed
        is_near_duplicate = (
            reference_frame is not None
            and compute_similarity(reference_frame, current) > 0.95
        )
        if is_near_duplicate:
            print(f"Frame {index + 1} is similar to the last processed frame. Skipping...")
            continue

        ocr_result, pil_image, elapsed = process_frame_with_florence(current)
        generation_times.append(elapsed)

        print(f"Frame {index + 1} OCR Text: {ocr_result}")  # Print the parsed text
        print(f"Frame {index + 1} Processing Time: {elapsed:.2f} seconds")

        annotated = draw_ocr_bboxes(pil_image, ocr_result)
        annotated.show()  # Display the image
        # Later frames are compared against this one until another is processed.
        reference_frame = current

    # Overall metrics: mean generation time over processed frames, and
    # throughput over all sampled frames (skipped ones included).
    if generation_times:
        average_processing_time = sum(generation_times) / len(generation_times)
    else:
        average_processing_time = 0

    if frames:
        fps = len(frames) / (time() - run_started)
    else:
        fps = 0

    print(f"Overall Average Processing Time: {average_processing_time:.2f} seconds")
    print(f"Overall FPS: {fps:.2f}")

# Example usage: run the full pipeline on a sample clip as soon as this cell
# executes. The path is relative to the current working directory.
video_path = 'test1.mp4'
main(video_path)


Frame 1 OCR Text: {'quad_boxes': [[85.97900390625, 108.18000793457031, 158.6230010986328, 108.18000793457031, 158.6230010986328, 129.77999877929688, 85.97900390625, 129.77999877929688], [7.23900032043457, 139.5, 247.26901245117188, 139.13999938964844, 247.26901245117188, 154.260009765625, 7.23900032043457, 155.3400115966797], [3.1750001907348633, 161.82000732421875, 246.5070037841797, 161.82000732421875, 246.5070037841797, 176.22000122070312, 3.1750001907348633, 176.22000122070312], [3.1750001907348633, 182.3400115966797, 245.74501037597656, 182.3400115966797, 245.74501037597656, 197.10000610351562, 3.1750001907348633, 197.10000610351562], [3.1750001907348633, 203.5800018310547, 229.74301147460938, 203.5800018310547, 229.74301147460938, 220.1400146484375, 3.1750001907348633, 220.1400146484375], [3.1750001907348633, 225.1800079345703, 231.2670135498047, 225.1800079345703, 231.2670135498047, 241.3800048828125, 3.1750001907348633, 241.74000549316406], [3.1750001907348633, 247.140014648437