# Generating small dataset using Yolov8m

In [1]:
!pip install ultralytics
!pip install dataset
!pip install  webcolors==24.8.0


Collecting ultralytics
  Downloading ultralytics-8.3.50-py3-none-any.whl.metadata (35 kB)
Collecting ultralytics-thop>=2.0.0 (from ultralytics)
  Downloading ultralytics_thop-2.0.13-py3-none-any.whl.metadata (9.4 kB)
Downloading ultralytics-8.3.50-py3-none-any.whl (898 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m899.0/899.0 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hDownloading ultralytics_thop-2.0.13-py3-none-any.whl (26 kB)
Installing collected packages: ultralytics-thop, ultralytics
Successfully installed ultralytics-8.3.50 ultralytics-thop-2.0.13


# Utilities

In [3]:
def infer_time_of_day(objects, image):
    scene_time = "unknown"

    # Check for specific objects
    detected_classes = [obj['name'] for obj in objects]
    
    if "streetlight" in detected_classes or "lamp" in detected_classes:
        scene_time = "night"
    elif "person" in detected_classes:
        # If many people are detected, it's more likely daytime (people are active during the day)
        scene_time = "daytime"
    elif "car" in detected_classes and "bus" in detected_classes:
        scene_time = "daytime"  # Vehicles are more common during the day in urban areas
    
    # Additional check for lighting conditions (based on image colors)
    avg_color = np.mean(image, axis=(0, 1))  # Average color of the image
    if avg_color[2] > 120:  # Checking for the intensity of the red channel
        scene_time = "daytime"
    else:
        scene_time = "night"

    return scene_time

def infer_urban_area(objects):
    urban_indicators = ["car", "bus", "truck", "building", "streetlight"]
    rural_indicators = ["tree", "grass", "animal"]
    
    detected_classes = [obj['name'] for obj in objects]
    
    # Urban area logic
    if any(item in detected_classes for item in urban_indicators):
        return "urban area"
    elif any(item in detected_classes for item in rural_indicators):
        return "rural area"
    else:
        return "suburban area"

def generate_setting_of_image(objects, image):
    # Infer time of day
    time_of_day = infer_time_of_day(objects, image)
    
    # Infer urban/rural area
    area = infer_urban_area(objects)

    # Construct a caption
    setting = f"The image maybe was taken during the {time_of_day}. It appears to be in a {area}. "
    
    return setting


In [4]:
import cv2
import numpy as np
import webcolors
from scipy import stats
from collections import Counter
from sklearn.cluster import KMeans
import math

def closest_color(requested_color):
    """
    Find the closest color name to a given RGB value.
    
    :param requested_color: RGB tuple to find the closest color for.
    :return: Name of the closest color.
    """
    min_colors = {}
    
    for name in webcolors.names("css3"):
        #print(name)
        r_c, g_c, b_c = webcolors.hex_to_rgb( webcolors.name_to_hex(name) )
        distance = ((r_c - requested_color[0]) ** 2 +
                    (g_c - requested_color[1]) ** 2 +
                    (b_c - requested_color[2]) ** 2)
        min_colors[distance] =  name #webcolors.name_to_hex(name)
    return min_colors[min(min_colors.keys())]
    
# Function to get the closest color name from RGB value
def rgb_to_name(rgb):
    closest_color_name = None
    try:
        return webcolors.rgb_to_name(rgb)
    except ValueError:
        
        closest_color_name = closest_color(rgb)
        return closest_color_name

# get the top 2 dominant colors in an image
def get_top_2_colors(image):
    # Read the image
    
    print(image.shape)
    # Reshape the image to a list of pixels
    pixels = image.reshape(-1, 3)

    # Find the most frequent colors by using a histogram
    colors, count = np.unique(pixels, axis=0, return_counts=True)
    
    # Find the indices of the two most common colors
    most_common_indices = count.argsort()[-2:][::-1]
    
    top_2_colors_rgb = colors[most_common_indices]
    top_2_colors_counts = count[most_common_indices]

    total_pixels = pixels.shape[0] * 3
    

    top_2_colors_percentage = [ (count / total_pixels) * 100 for count in top_2_colors_counts]
    
    # Convert the RGB values to color names
    top_2_colors_names = [rgb_to_name(tuple(color)) for color in top_2_colors_rgb]
    # Return the top 2 colors with their names and percentages
    result = [{"color_name": name, "percentage": percentage} 
              for name, percentage in zip(top_2_colors_names, top_2_colors_percentage)]

    print(top_2_colors_percentage)
    return result

image_path = "/kaggle/input/flicker-8k-image-dataset-captionstxt/Images/1001773457_577c3a7d70.jpg"
image = cv2.imread(image_path)
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # Convert to RGB
top_2_colors = get_top_2_colors(image)
print(f"Top 2 colors: {top_2_colors}")


(375, 500, 3)
[0.15662222222222222, 0.07662222222222222]
Top 2 colors: [{'color_name': 'white', 'percentage': 0.15662222222222222}, {'color_name': 'black', 'percentage': 0.07662222222222222}]


In [5]:
def combine_duplicate_colors(color_list):
    # Dictionary to hold unique colors and their summed percentages
    color_dict = {}

    # Iterate through the color list
    for color in color_list:
        color_name = color['color_name']
        percentage = color['percentage']
        
        # If the color is already in the dictionary, add the percentage
        if color_name in color_dict:
            color_dict[color_name] += percentage
        else:
            # If it's a new color, add it to the dictionary
            color_dict[color_name] = percentage
    
    # Convert the dictionary back into a list of dictionaries
    result = [{'color_name': name, 'percentage': round(percentage, 2)} 
              for name, percentage in color_dict.items()]
    
    return result



combined_colors = combine_duplicate_colors(color_list)
print(combined_colors)


[{'color_name': 'black', 'percentage': 0.95}]


# Generating realtionships

In [23]:
def get_centroid(bbox):
    x_min, y_min, x_max, y_max = bbox
    return ((x_min + x_max) / 2, (y_min + y_max) / 2)

# Function to determine the spatial relationship between two bounding boxes
def get_relationship(obj1, obj2):
    x1, y1 = get_centroid(obj1['bbox'])
    x2, y2 = get_centroid(obj2['bbox'])
    
    # Check horizontal (left-right) and vertical (top-bottom) relationships
    if abs(x1 - x2) < 50 and abs(y1 - y2) < 50:
        return "near"
    elif x1 < x2:  # obj1 is to the left of obj2
        return "to the left of"
    elif x1 > x2:  # obj1 is to the right of obj2
        return "to the right of"
    elif y1 < y2:  # obj1 is above obj2
        return "above"
    elif y1 > y2:  # obj1 is below obj2
        return "below"
    
    return "near"  # Default case

# Gets relationship between object and each other object in a scene
def generate_caption(objects):
    caption_parts = []
    for index, obj1 in enumerate(objects):
        for obj2 in objects[index+1:]:
            relationship = get_relationship(obj1, obj2)
            # Form a sentence for the spatial relationship between objects
            
            caption_parts.append(f"{obj1['name']} is {relationship} the {obj2['name']}.")

    return " ".join(caption_parts)

In [None]:
# model.names

### Example output from one image

In [57]:
import cv2
from ultralytics import YOLO

# Load YOLOv8m model (pre-trained)
model = YOLO('yolov8m.pt')  # Use your model path here if needed

# Function to run inference and print objects with bounding boxes and scores
def detect_objects(image_path):
    # Read the input image
    image = cv2.imread(image_path)

    # Run inference
    results = model(image_path)

    # Parse the results (boxes, labels, scores)
    boxes = results[0].boxes.xyxy  # Bounding box coordinates (x1, y1, x2, y2)
    labels = results[0].boxes.cls  # Object class IDs
    scores = results[0].boxes.conf  # Confidence scores

    # Get object names from the model's labels (COCO class names)
    object_names = model.names  # This contains the COCO class names
    objects = []
    object_count = {}  # To track the count of each object type (e.g., "car_1", "car_2")

    # Print the detected objects with their bounding boxes and scores
    for box, label, score in zip(boxes, labels, scores):
        object_name = object_names[int(label)]  # Get the class name from the model
        print(f"Detected {object_name} with confidence {score:.2f}")
        
        # Create a unique name for each object type
        if object_name not in object_count:
            object_count[object_name] = 1
        else:
            object_count[object_name] += 1
        
        # Create unique name (e.g., "car_1", "car_2")
        unique_name = f"{object_name}_{object_count[object_name]}"
        
        x1, y1, x2, y2 = map(int, box)  # Convert to integers
        object_image = image[y1:y2, x1:x2]  # Crop the object from the image

        topColors = combine_duplicate_colors(get_top_2_colors(object_image))
        
        objects.append({
            'name': unique_name,  # Use the unique name here
            'bbox': [x1, y1, x2, y2],
            # 'score': score.item(),
             'top_2_colors':topColors,  
            # 'color': topColors[0]['color_name'],
        })

    relationships = generate_caption(objects)
    context = {'objects_info': objects,
              'relationships': relationships,
              'setting': generate_setting_of_image(objects, image),
              'objects_count': object_count}

    return context 

# Example usage
image_path = "/kaggle/input/imagecaption-notchtech/1.jpg"  # Provide the path to your image
context = detect_objects(image_path)



image 1/1 /kaggle/input/imagecaption-notchtech/1.jpg: 448x640 1 bench, 14.0ms
Speed: 3.2ms preprocess, 14.0ms inference, 1.8ms postprocess per image at shape (1, 3, 448, 640)
Detected bench with confidence 0.86
(566, 1168, 3)
[0.005495990448069445, 0.005495990448069445]


In [58]:
context

{'objects_info': [{'name': 'bench_1',
   'bbox': [2484, 1984, 3652, 2550],
   'score': 0.8641608357429504,
   'color': 'midnightblue'}],
 'relationships': '',
 'setting': 'The image was taken during the night. It appears to be in a suburban area. ',
 'objects_count': {'bench': 1}}

In [39]:
objects

[{'name': 'dog',
  'bbox': [0, 0, 186, 152],
  'score': 0.9661977291107178,
  'top_2_colors': [{'color_name': 'black', 'percentage': 0.95}],
  'color': 'black'},
 {'name': 'dog',
  'bbox': [236, 97, 419, 374],
  'score': 0.9538587927818298,
  'top_2_colors': [{'color_name': 'white', 'percentage': 0.72}],
  'color': 'white'}]

# Generating Data for the task

In [None]:
import pandas as pd 
data = pd.read_csv('/kaggle/input/flicker-8k-image-dataset-captionstxt/captions.txt')

data[data['image']=='1000268201_693b08cb0e.jpg'].loc[0]['caption']

In [35]:
import cv2
from datasets import Dataset  # Correct import from the 'datasets' library
# from transformers import Dataset
from ultralytics import YOLO
import random
import os 
# Load YOLOv8m model (pre-trained)
model = YOLO('yolov8m.pt')  # Use your model path here if needed

# Function to run inference and print objects with bounding boxes and scores
def detect_objects(image_path):
    # Read the input image
    image = cv2.imread(image_path)

    # Run inference
    results = model(image_path)

    # Parse the results (boxes, labels, scores)
    boxes = results[0].boxes.xyxy  # Bounding box coordinates (x1, y1, x2, y2)
    labels = results[0].boxes.cls  # Object class IDs
    scores = results[0].boxes.conf  # Confidence scores

    # Get object names from the model's labels (COCO class names)
    object_names = model.names  # This contains the COCO class names
    objects = []
    object_count = {}  # To track the count of each object type (e.g., "car_1", "car_2")

    # Print the detected objects with their bounding boxes and scores
    for box, label, score in zip(boxes, labels, scores):
        object_name = object_names[int(label)]  # Get the class name from the model
        
        # Create a unique name for each object type
        if object_name not in object_count:
            object_count[object_name] = 1
        else:
            object_count[object_name] += 1
        
        # Create unique name (e.g., "car_1", "car_2")
        unique_name = f"{object_name}_{object_count[object_name]}"
        
        x1, y1, x2, y2 = map(int, box)  # Convert to integers
        object_image = image[y1:y2, x1:x2]  # Crop the object from the image

        topColors = combine_duplicate_colors(get_top_2_colors(object_image))

        wdith,height = image.shape[0],image.shape[1]

        # x_center,y_center = 
        objects.append({
            'name': unique_name,  # Use the unique name here
            'bbox': [x1, y1, x2, y2],
            'position':[(x1+x2)/2,(y1+y2)/2],
            'score': score.item(),
            'color': topColors[0]['color_name'],
        })
        
    relationships = generate_caption(objects)
    
    context = {'objects_info': objects,
              'relationships': relationships,
              'setting': generate_setting_of_image(objects, image),
              'object_count':object_count,
              'image_shape':image.shape[:2]}

    return context


# Function to format the context into a better input string for LLaMA
def format_context_for_llama(context):
    objects_info = context['objects_info']
    relationships = context['relationships']
    setting = context['setting']
    
    # Construct a structured input string
    formatted_context = f"This image with shape {context['image_shape']} contains"+" ".join(context['object_count']) +",details of objects: "
    formatted_context += ", ".join([f"{obj['color']} {obj['name']} in position {obj['position']}  )" for obj in objects_info])
    
    # Add relationships
    if relationships:
        formatted_context += ". Relationships between objects: " +(relationships)
    
    # Add setting of the image (e.g., whether it's indoors, outdoors, etc.)
    formatted_context += ". settings: " + setting
    
    return formatted_context


# Function to create a dataset using Hugging Face
def create_dataset(image_paths,metadata):
    texts = []
    EOS_TOKEN = '<|end_of_text|>'
    DIR = '/kaggle/input/flicker-8k-image-dataset-captionstxt/Images/'
    instruction = "Generate a detailed caption based on the given information about the image with hypothesizing any missing information."

    alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""
    
    for image_path in image_paths:
        
        context = detect_objects(DIR+image_path)
        input_string = format_context_for_llama(context)
        
        # For now, we're assuming that the caption is somehow generated or provided
        # You can modify this to use an actual caption generation model or process
        caption = metadata[metadata['image']==image_path].iloc[0]['caption']
       
        
        # for instruction, input, output in zip(instructions, inputs, outputs):
        text = alpaca_prompt.format(instruction, input_string, caption) + EOS_TOKEN # without this token generation goes on forever!
        texts.append(text)
         
        
    
    # Create and return the Hugging Face Dataset
    return Dataset.from_dict({ "text" : texts, })


# Example usage: Prepare a dataset from a list of image paths
image_paths = os.listdir('/kaggle/input/flicker-8k-image-dataset-captionstxt/Images')[0:500]  # Replace with your actual image paths
metadata = pd.read_csv('/kaggle/input/flicker-8k-image-dataset-captionstxt/captions.txt')
dataset = create_dataset(image_paths,metadata)

# Print the first entry in the dataset to verify
print(dataset[0])



image 1/1 /kaggle/input/flicker-8k-image-dataset-captionstxt/Images/3226254560_2f8ac147ea.jpg: 448x640 1 dog, 28.1ms
Speed: 1.7ms preprocess, 28.1ms inference, 1.6ms postprocess per image at shape (1, 3, 448, 640)
(107, 151, 3)
[0.3527882651482329, 0.33834664025912403]

image 1/1 /kaggle/input/flicker-8k-image-dataset-captionstxt/Images/214543992_ce6c0d9f9b.jpg: 640x640 2 dogs, 1 sports ball, 1 potted plant, 38.0ms
Speed: 2.4ms preprocess, 38.0ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 640)
(404, 309, 3)
[0.16474948465720357, 0.15807405983316777]
(142, 131, 3)
[0.10034763287101746, 0.08063649069992473]
(26, 24, 3)
[0.10683760683760685, 0.10683760683760685]
(262, 128, 3)
[0.21469465648854963, 0.21370069974554706]

image 1/1 /kaggle/input/flicker-8k-image-dataset-captionstxt/Images/2366643786_9c9a830db8.jpg: 480x640 1 person, 1 car, 24.8ms
Speed: 1.8ms preprocess, 24.8ms inference, 1.2ms postprocess per image at shape (1, 3, 480, 640)
(276, 179, 3)
[0.625455428710226

In [32]:
print(dataset[2])

{'text': 'Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nGenerate a detailed caption based on the given information about the image with hypothesizing any missing information.\n\n### Input:\nThis image contains the following objects: person_1 (confidence: 0.95, color: white), car_1 (confidence: 0.38, color: white). Relationships between objects: person_1 is to the right of the car_1.. settings: The image maybe was taken during the night. It appears to be in a suburban area. \n\n### Response:\nA child in denim playing hopscotch .<|end_of_text|>'}


'A child in a pink dress is climbing up a set of stairs in an entry way .'



---



Next, we integrate LoRA adapters into our model, which allows us to efficiently update just a fraction of the model's parameters, enhancing training speed and reducing computational load.

In [40]:
!zip -r  "data.zip" /kaggle/working/captionedDataset 

  adding: kaggle/working/captionedDataset/ (stored 0%)
  adding: kaggle/working/captionedDataset/dataset_info.json (deflated 64%)
  adding: kaggle/working/captionedDataset/data-00000-of-00001.arrow (deflated 93%)
  adding: kaggle/working/captionedDataset/state.json (deflated 38%)


In [38]:
# dataset.
dataset.save_to_disk("captionedDataset")

Saving the dataset (0/1 shards):   0%|          | 0/500 [00:00<?, ? examples/s]