In [1]:
# %pip install ultralytics  # uncomment on first run; %pip installs into the active kernel's environment

In [2]:
# Script lines to enrich, one dict per line. Each dict carries a single key
# ('animatediff' or 'sadtalker') mapped to the description text for that line.
# NOTE(review): the keys presumably name the downstream generator that will
# consume the description — confirm against the rest of the pipeline.
formatted_llm_lines = [
    {
        'animatediff': 'Imagine a dark and gritty street in the heart of a city, the sounds of traffic and people bustling by. The scene is lit by the neon lights of nearby businesses.',
    },
    {
        'sadtalker': 'John Wick, a legendary assassin, stands in the middle of the street, his eyes cold and his face impassive. He is wearing a black suit and holding a gun in his hand. He says, <dialogue>"I\'m going to kill them all"</dialogue> (male).',
    },
    {
        'animatediff': 'John Wick is a deadly assassin, known for his prowess in taking down his targets with precision and efficiency. He is a man on a mission, and he will stop at nothing to get what he wants. With his gun at the ready, he stalks the streets, searching for his next target.',
    },
]

# Folder holding the reference images; line i (0-based) is matched with "<i + 1>.jpg".
reference_images_directory = 'reference_images/John Wick in the streets'


In [3]:
from ultralytics import YOLO

# Weights file handed to the YOLO constructor; change it here to swap detectors.
YOLO_WEIGHTS = 'yolov8n.pt'

# Build the detector once at module level; input_transformation reads this
# global `model` when it runs inference on the reference images.
model = YOLO(YOLO_WEIGHTS)

def input_transformation(formatted_llm_lines, reference_image_directory,
                         conf_threshold=0.3, detector=None):
    """
    Append the names of objects detected in each line's reference image to that line's description.

    Line ``i`` (0-based) is paired with the image
    ``{reference_image_directory}/{i + 1}.jpg``. Every distinct class name whose
    detection confidence is at least ``conf_threshold`` is appended to the
    description, comma-separated, in the order the detector reports the boxes.

    Only the first matching key of ``'animatediff'`` / ``'sadtalker'`` is updated.
    Lines that contain neither key are left untouched and no inference is run
    for them (their image index is still consumed, so numbering stays aligned).

    Args:
    formatted_llm_lines (list[dict]): A list of dictionaries containing formatted LLM lines.
    reference_image_directory (str): The directory containing reference images.
    conf_threshold (float): Minimum raw confidence (inclusive) for a detection
        to be added. Defaults to 0.3.
    detector (callable | None): Called as ``detector(image_path)``; must return
        a sequence whose first element exposes ``.boxes`` (each box having
        ``.cls[0].item()`` and ``.conf[0].item()``) and ``.names`` (class id ->
        class name). Defaults to the module-level ``model``.

    Returns:
    list[dict]: The same list object that was passed in; its dictionaries are
    updated in place.

    Raises:
    Whatever the detector raises (for example when an image cannot be read)
    propagates to the caller unchanged.
    """
    # Resolve the detector at call time so the module-level model is only
    # needed when the caller does not supply one.
    if detector is None:
        detector = model

    for i, line in enumerate(formatted_llm_lines):
        # 'animatediff' wins when a line happens to carry both keys.
        key = next((k for k in ('animatediff', 'sadtalker') if k in line), None)
        if key is None:
            # Nothing to update for this line, so skip the inference entirely.
            continue

        image_path = f'{reference_image_directory}/{i + 1}.jpg'
        result = detector(image_path)[0]

        # Distinct class names in first-seen order.
        labels = []
        for box in result.boxes:
            # Compare the raw confidence: rounding first would let values just
            # below the threshold (e.g. 0.296 vs 0.3) slip through.
            if box.conf[0].item() < conf_threshold:
                continue
            # NOTE(review): cls values appear to come back as floats — the cast
            # makes the lookup use the integer class id explicitly; confirm.
            label = result.names[int(box.cls[0].item())]
            if label not in labels:
                labels.append(label)

        if labels:
            description = line[key]
            # An empty description gets no leading separator.
            parts = ([description] if description else []) + labels
            line[key] = ', '.join(parts)

    return formatted_llm_lines

# Enrich every script line with the objects detected in its reference image.
# NOTE: the dictionaries are modified in place and the result is bound to the
# same name, so running this cell again appends the detected labels a second
# time — re-run the cell that defines formatted_llm_lines first.
formatted_llm_lines = input_transformation(
    formatted_llm_lines=formatted_llm_lines,
    reference_image_directory=reference_images_directory,
)



image 1/1 c:\Users\akshi\Desktop\MIT\pcv project\code\reference_images\John Wick in the streets\1.jpg: 640x640 4 persons, 1 car, 1 traffic light, 4.0ms
Speed: 6.5ms preprocess, 4.0ms inference, 200.8ms postprocess per image at shape (1, 3, 640, 640)

image 1/1 c:\Users\akshi\Desktop\MIT\pcv project\code\reference_images\John Wick in the streets\2.jpg: 640x640 2 persons, 2 cars, 1 umbrella, 1 tie, 5.0ms
Speed: 2.5ms preprocess, 5.0ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

image 1/1 c:\Users\akshi\Desktop\MIT\pcv project\code\reference_images\John Wick in the streets\3.jpg: 640x640 1 person, 1 tie, 1 cell phone, 4.0ms
Speed: 3.5ms preprocess, 4.0ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)


In [4]:
# Show the lines after enrichment: detected object names are appended to the end of each description.
formatted_llm_lines

[{'animatediff': 'Imagine a dark and gritty street in the heart of a city, the sounds of traffic and people bustling by. The scene is lit by the neon lights of nearby businesses., car, person, traffic light'},
 {'sadtalker': 'John Wick, a legendary assassin, stands in the middle of the street, his eyes cold and his face impassive. He is wearing a black suit and holding a gun in his hand. He says, <dialogue>"I\'m going to kill them all"</dialogue> (male)., person, car, tie, umbrella'},
 {'animatediff': 'John Wick is a deadly assassin, known for his prowess in taking down his targets with precision and efficiency. He is a man on a mission, and he will stop at nothing to get what he wants. With his gun at the ready, he stalks the streets, searching for his next target., person, cell phone, tie'}]