In [None]:
from transformers import AutoProcessor, AutoModelForCausalLM
from PIL import Image
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import requests
import copy
import torch
import utils
%matplotlib inline

In [None]:
# File name of the input picture (a PNG next to the notebook).
path = "Test2.png"
# Checkpoint identifier; `run_example` forwards its `model_id` argument to
# `from_pretrained`.
model_id = "microsoft/Florence-2-large"

In [None]:
def use_device():
    """Pick the torch device string to run on and report the choice.

    Returns:
        str: "cuda:0" when CUDA is available, otherwise "cpu".
    """
    # Query CUDA availability once so the returned value and the printed
    # message can never disagree.
    if torch.cuda.is_available():
        device = "cuda:0"
        print(f"Using GPU: {torch.cuda.get_device_name(0)}")
    else:
        device = "cpu"
        print("Using CPU")
    return device


# Resolve the device once for the whole notebook; later cells read `device`.
device = use_device()

Using CPU


In [None]:
# (model, processor) pairs keyed by (model_id, device), so that repeated calls
# do not reload the checkpoint every time.
_MODEL_CACHE = {}


def _load_model_and_processor(model_id, device):
    """Return the (model, processor) pair for `model_id`, loading it on first use."""
    key = (model_id, device)
    if key not in _MODEL_CACHE:
        model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True).eval().to(device)
        processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
        _MODEL_CACHE[key] = (model, processor)
    return _MODEL_CACHE[key]


def run_example(task_prompt, path, model_id, text_input=None):
    """Run one task prompt on one image and return the post-processed answer.

    Args:
        task_prompt: Task token string (e.g. the `.value` of a `utils.TaskType`
            member); also passed as `task` to the post-processing step.
        path: Path of the image file to open.
        model_id: Identifier passed to `from_pretrained` for both the model
            and the processor.
        text_input: Optional extra text; when given it is appended to
            `task_prompt` after a single space.

    Returns:
        Whatever `processor.post_process_generation` returns for the
        generated text.
    """
    # Resolve the device locally so the function does not depend on a
    # notebook-level `device` variable having been defined by another cell.
    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    # The processor must exist before it is used to build the inputs.
    model, processor = _load_model_and_processor(model_id, device)

    # `convert` returns a new, fully loaded image, so the file can be closed.
    with Image.open(path) as raw_image:
        image = raw_image.convert("RGB")

    prompt = task_prompt if text_input is None else f"{task_prompt} {text_input}"
    inputs = processor(text=prompt, images=image, return_tensors="pt")

    with torch.no_grad():  # Inference only: skip gradient bookkeeping.
        generated_ids = model.generate(
            input_ids=inputs["input_ids"].to(device),
            pixel_values=inputs["pixel_values"].to(device),
            max_new_tokens=1024,
            early_stopping=True,
            do_sample=False,
            num_beams=3,
        )

    # Special tokens are kept in the decoded text that is handed to the
    # post-processing step.
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    parsed_answer = processor.post_process_generation(
        generated_text,
        task=task_prompt,
        image_size=(image.width, image.height),
    )
    return parsed_answer

In [None]:
# Plain OCR task.
# `run_example` takes the image path and the model id (it opens the image
# itself), and each print must show the result that was just computed.
task_label = utils.TaskType.OCR
results_label = run_example(task_label.value, path, model_id)
print(f"Results for {task_label.value}: {results_label}")

# OCR-with-region task.
task_coordinates = utils.TaskType.OCR_WITH_REGION
results_coordinates = run_example(task_coordinates.value, path, model_id)
print(f"Results for {task_coordinates.value}: {results_coordinates}")

In [None]:
# Organize the OCR_WITH_REGION result as one entry per detected box.
ocr_with_region_data = {}

# Extract the quad boxes and their labels. Both live in the OCR_WITH_REGION
# result (`results_coordinates`); the plain OCR result has no such key.
if '<OCR_WITH_REGION>' in results_coordinates:
    quad_boxes = results_coordinates['<OCR_WITH_REGION>']['quad_boxes']
    labels = results_coordinates['<OCR_WITH_REGION>']['labels']

    for i, (box, label) in enumerate(zip(quad_boxes, labels)):
        # One dictionary per OCR entry with its coordinates and its label.
        ocr_with_region_data[f'Box_{i + 1}'] = {
            'coordinates': box,  # Quad-box coordinates
            'label': label       # Text recognized in that box
        }

# Collect each box as its own single-key dictionary and display it.
detections = []

print("Données OCR_WITH_REGION (chaque box dans un dictionnaire séparé) :")

for key, value in ocr_with_region_data.items():
    detections.append({key: value})
    print(f"{key}: {value}")

Données OCR_WITH_REGION (chaque box dans un dictionnaire séparé) :
Box_1: {'coordinates': [169.39651489257812, 31.687501907348633, 469.7185363769531, 31.687501907348633, 469.7185363769531, 43.51750183105469, 169.39651489257812, 43.51750183105469], 'label': '</s>DIONYSOJ/PACCHUS (IN PERIPHERIA OCCIDENTIAL) 324-354'}
Box_2: {'coordinates': [771.2815551757812, 29.15250015258789, 1071.603515625, 29.15250015258789, 1071.603515625, 40.98250198364258, 771.2815551757812, 40.98250198364258], 'label': 'DIONYSOS/PACCHRUS (UN PERIPERIRA OCCIDENTIAL) 352-507'}
Box_3: {'coordinates': [1185.7755126953125, 29.15250015258789, 1200.6676025390625, 29.15250015258789, 1200.6676025390625, 38.44750213623047, 1185.7755126953125, 38.44750213623047], 'label': '63'}
Box_4: {'coordinates': [261.2304992675781, 250.54251098632812, 381.6075134277344, 251.38751220703125, 381.6075134277344, 263.2174987792969, 261.2304992675781, 261.5274963378906], 'label': 'Bacchus (in per ocel) 3274'}
Box_5: {'coordinates': [984.7335

In [None]:
# Object-detection task.
# NOTE(review): assumes `utils.TaskType` is an Enum (consistent with the
# `.value` access used throughout), so a single member cannot be iterated;
# the task is therefore run directly instead of inside a `for` loop.
tasks_img_detection = utils.TaskType.OBJECT_DETECTION

# `run_example` takes the image path and the model id.
results_img_detection = run_example(tasks_img_detection.value, path, model_id)
print(f"{tasks_img_detection.value}: {results_img_detection}")


<OD>: {'<OD>': {'bboxes': [[704.2675170898438, 445.7375183105469, 894.1405639648438, 665.4375], [88.73150634765625, 542.9125366210938, 243.85650634765625, 738.1075439453125], [430.00653076171875, 133.08750915527344, 557.8295288085938, 283.49749755859375], [1028.1685791015625, 657.83251953125, 1158.4735107421875, 787.9625244140625], [1023.2045288085938, 119.5675048828125, 1138.6175537109375, 253.92251586914062], [78.80350494384766, 128.8625030517578, 191.73451232910156, 261.5274963378906], [1030.6505126953125, 378.13751220703125, 1144.822509765625, 504.88751220703125], [667.0375366210938, 140.6925048828125, 760.112548828125, 248.00750732421875], [348.10052490234375, 541.2225341796875, 418.8375244140625, 644.3125], [679.447509765625, 385.7425231933594, 928.8885498046875, 668.8175048828125], [51.501502990722656, 71.40250396728516, 221.5185089111328, 269.13250732421875], [320.79852294921875, 461.7925109863281, 580.1675415039062, 820.072509765625], [675.7245483398438, 384.0525207519531, 931

In [None]:
# Organize the object-detection result as one entry per detected box.
object_detection_data = {}

# Extract the bounding boxes and their labels from the object-detection
# result computed in the previous cell.
if '<OD>' in results_img_detection:
    bboxes = results_img_detection['<OD>']['bboxes']
    labels = results_img_detection['<OD>']['labels']

    for i, (bbox, label) in enumerate(zip(bboxes, labels)):
        # One dictionary per detection with its label and its coordinates.
        object_detection_data[f'Box_{i + 1}'] = {
            'label': label,        # Label of the detected object
            'coordinates': bbox  # Bounding-box coordinates
        }
# Display the collected entries.
print("Données OBJECT_DETECTION (chaque box dans un dictionnaire séparé) :")

for key, value in object_detection_data.items():
    print(f"{key}: {value}")

Données OBJECT_DETECTION (chaque box dans un dictionnaire séparé) :
Box_1: {'coordinates': [704.2675170898438, 445.7375183105469, 894.1405639648438, 665.4375], 'label': 'human face'}
Box_2: {'coordinates': [88.73150634765625, 542.9125366210938, 243.85650634765625, 738.1075439453125], 'label': 'human face'}
Box_3: {'coordinates': [430.00653076171875, 133.08750915527344, 557.8295288085938, 283.49749755859375], 'label': 'human face'}
Box_4: {'coordinates': [1028.1685791015625, 657.83251953125, 1158.4735107421875, 787.9625244140625], 'label': 'human face'}
Box_5: {'coordinates': [1023.2045288085938, 119.5675048828125, 1138.6175537109375, 253.92251586914062], 'label': 'human face'}
Box_6: {'coordinates': [78.80350494384766, 128.8625030517578, 191.73451232910156, 261.5274963378906], 'label': 'human face'}
Box_7: {'coordinates': [1030.6505126953125, 378.13751220703125, 1144.822509765625, 504.88751220703125], 'label': 'human face'}
Box_8: {'coordinates': [667.0375366210938, 140.6925048828125, 