In [4]:
# !pip install transformers pillow
# !pip install einops timm flash_attn

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
from transformers import AutoProcessor, AutoModelForCausalLM
from PIL import Image
import requests
import numpy as np
import copy
import os

# Initialize the model and processor
model_id = 'microsoft/Florence-2-large'
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True).eval()
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


## Single Image

In [2]:
def preprocess_image(image_path):
    image = Image.open(image_path).convert("RGB")
    image_array = np.array(image)
    return image_array

def run_example(task_prompt, image_path, text_input=None):
    if text_input is None:
        prompt = task_prompt
    else:
        prompt = task_prompt + text_input

    # Ensure the image is in the correct format
    image_array = preprocess_image(image_path)
    inputs = processor(text=prompt, images=image_array, return_tensors="pt")

    generated_ids = model.generate(
        input_ids=inputs['input_ids'],
        pixel_values=inputs['pixel_values'],
        max_new_tokens=1024,
        early_stopping=False,
        num_beams=3,
        do_sample=False,
    )

    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    parsed_answer = processor.post_process_generation(
        generated_text,
        task=task_prompt,
        image_size=(image_array.shape[1], image_array.shape[0])  # width, height
    )

    return parsed_answer

def save_results(image_path, results, output_folder):
    image_name = os.path.splitext(os.path.basename(image_path))[0]
    txt_filename = os.path.join(output_folder, image_name + '.txt')

    # Preprocess image to get dimensions
    image_array = preprocess_image(image_path)

    # Extract bounding box information
    bboxes = results['<DENSE_REGION_CAPTION>']['bboxes']
    labels = results['<DENSE_REGION_CAPTION>']['labels']

    # Open the text file for writing
    with open(txt_filename, 'w') as f:
        for label, bbox in zip(labels, bboxes):
            x_min, y_min, x_max, y_max = bbox
            width = x_max - x_min
            height = y_max - y_min
            x_center = x_min + width / 2
            y_center = y_min + height / 2

            # Normalize coordinates
            x_center /= image_array.shape[1]
            y_center /= image_array.shape[0]
            width /= image_array.shape[1]
            height /= image_array.shape[0]

            # Write the label and normalized bounding box to the file
            f.write(f"{label} {x_center:.6f} {y_center:.6f} {width:.6f} {height:.6f}\n")

task_prompt = '<DENSE_REGION_CAPTION>'

image_path = '/content/drive/MyDrive/autolabel/car.jpg'
output_folder = '/content/drive/MyDrive/autolabel'
os.makedirs(output_folder, exist_ok=True)

results = run_example(task_prompt, image_path)
save_results(image_path, results, output_folder)


NameError: name 'os' is not defined

## Multiple Images in a Folder

In [5]:
from transformers import AutoProcessor, AutoModelForCausalLM
from PIL import Image
import numpy as np
import os

# Initialize the model and processor
model_id = 'microsoft/Florence-2-large'
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True).eval()
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

def preprocess_image(image_path):
    image = Image.open(image_path).convert("RGB")
    image_array = np.array(image)
    return image_array

def run_example(task_prompt, image_path, text_input=None):
    if text_input is None:
        prompt = task_prompt
    else:
        prompt = task_prompt + text_input

    # Ensure the image is in the correct format
    image_array = preprocess_image(image_path)
    inputs = processor(text=prompt, images=image_array, return_tensors="pt")

    generated_ids = model.generate(
        input_ids=inputs['input_ids'],
        pixel_values=inputs['pixel_values'],
        max_new_tokens=1024,
        early_stopping=False,
        num_beams=3,
        do_sample=False,
    )

    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    parsed_answer = processor.post_process_generation(
        generated_text,
        task=task_prompt,
        image_size=(image_array.shape[1], image_array.shape[0])  # width, height
    )

    return parsed_answer

def save_results(image_path, results, output_folder):
    image_name = os.path.splitext(os.path.basename(image_path))[0]
    txt_filename = os.path.join(output_folder, image_name + '.txt')

    # Preprocess image to get dimensions
    image_array = preprocess_image(image_path)

    # Extract bounding box information
    bboxes = results['<DENSE_REGION_CAPTION>']['bboxes']
    labels = results['<DENSE_REGION_CAPTION>']['labels']

    # Open the text file for writing
    with open(txt_filename, 'w') as f:
        for label, bbox in zip(labels, bboxes):
            x_min, y_min, x_max, y_max = bbox
            width = x_max - x_min
            height = y_max - y_min
            x_center = x_min + width / 2
            y_center = y_min + height / 2

            # Normalize coordinates
            x_center /= image_array.shape[1]
            y_center /= image_array.shape[0]
            width /= image_array.shape[1]
            height /= image_array.shape[0]

            # Write the label and normalized bounding box to the file
            f.write(f"{label} {x_center:.6f} {y_center:.6f} {width:.6f} {height:.6f}\n")

def process_images_in_folder(input_folder, output_folder, task_prompt):
    # Create the output folder if it does not exist
    os.makedirs(output_folder, exist_ok=True)

    # Iterate over all files in the input folder
    for filename in os.listdir(input_folder):
        if filename.lower().endswith(('.png', '.jpg', '.jpeg')):
            image_path = os.path.join(input_folder, filename)
            results = run_example(task_prompt, image_path)
            save_results(image_path, results, output_folder)

task_prompt = '<DENSE_REGION_CAPTION>'

input_folder = '/content/drive/MyDrive/autolabel/images'
output_folder = '/content/drive/MyDrive/autolabel/labels'

process_images_in_folder(input_folder, output_folder, task_prompt)


KeyboardInterrupt: 

## Multiple Images in a Folder with the Labels in a List

In [7]:
def preprocess_image(image_path):
    image = Image.open(image_path).convert("RGB")
    image_array = np.array(image)
    return image_array

def run_example(task_prompt, image_path, text_input=None):
    if text_input is None:
        prompt = task_prompt
    else:
        prompt = task_prompt + text_input

    # Ensure the image is in the correct format
    image_array = preprocess_image(image_path)
    inputs = processor(text=prompt, images=image_array, return_tensors="pt")

    generated_ids = model.generate(
        input_ids=inputs['input_ids'],
        pixel_values=inputs['pixel_values'],
        max_new_tokens=1024,
        early_stopping=False,
        num_beams=3,
        do_sample=False,
    )

    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    parsed_answer = processor.post_process_generation(
        generated_text,
        task=task_prompt,
        image_size=(image_array.shape[1], image_array.shape[0])  # width, height
    )

    return parsed_answer

def save_results(image_path, results, output_folder, all_labels):
    image_name = os.path.splitext(os.path.basename(image_path))[0]
    txt_filename = os.path.join(output_folder, image_name + '.txt')

    # Preprocess image to get dimensions
    image_array = preprocess_image(image_path)

    # Extract bounding box information
    bboxes = results['<DENSE_REGION_CAPTION>']['bboxes']
    labels = results['<DENSE_REGION_CAPTION>']['labels']

    # Update all_labels list with the labels from this image
    all_labels.extend(labels)

    # Open the text file for writing
    with open(txt_filename, 'w') as f:
        for label, bbox in zip(labels, bboxes):
            x_min, y_min, x_max, y_max = bbox
            width = x_max - x_min
            height = y_max - y_min
            x_center = x_min + width / 2
            y_center = y_min + height / 2

            # Normalize coordinates
            x_center /= image_array.shape[1]
            y_center /= image_array.shape[0]
            width /= image_array.shape[1]
            height /= image_array.shape[0]

            # Write the label and normalized bounding box to the file
            f.write(f"{label} {x_center:.6f} {y_center:.6f} {width:.6f} {height:.6f}\n")

def process_images_in_folder(input_folder, output_folder, task_prompt):
    # Create the output folder if it does not exist
    os.makedirs(output_folder, exist_ok=True)

    all_labels = []

    # Iterate over all files in the input folder
    for filename in os.listdir(input_folder):
        if filename.lower().endswith(('.png', '.jpg', '.jpeg')):
            image_path = os.path.join(input_folder, filename)
            results = run_example(task_prompt, image_path)
            save_results(image_path, results, output_folder, all_labels)

    # Print all unique labels detected
    unique_labels = list(set(all_labels))
    print("Unique labels detected:", unique_labels)


task_prompt = '<DENSE_REGION_CAPTION>'

input_folder = '/content/drive/MyDrive/autolabel/images'
output_folder = '/content/drive/MyDrive/autolabel/labels'

process_images_in_folder(input_folder, output_folder, task_prompt)


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/autolabel/images/14.jpeg'