In [1]:
pip install opencv-python-headless matplotlib

Note: you may need to restart the kernel to use updated packages.


In [2]:
import cv2
import numpy as np
import xml.etree.ElementTree as ET
import matplotlib.pyplot as plt
import os
import re
import random
import sys
sys.path.append(os.path.abspath('src'))
import data_preparation as dp

In [3]:
# Dataset locations.
# NOTE: image_path is used as a *folder* of images (it is passed on as image_folder),
# so it must not be fed to cv2.imread -- imread cannot decode a directory and would
# just return None. The former module-level `image = cv2.imread(image_path)` was
# therefore always None and has been dropped.
image_path = './DATASET_938_img/flickr30k/test'

# Folder holding the XML annotation files
xml_path = './DATASET_938_img/flickr30k/flickr30k/Annotations/'

image_folder = image_path
sentence_folder = './DATASET_938_img/flickr30k/flickr30k/Sentences/'
annotation_folder = xml_path
output_base_folder = "DATASET_938_img/flickr30k/flickr30k_images"
insert_images_folder = "OBJECT_IMAGES/"
# Same location as output_base_folder; derived from it so the two cannot drift apart.
output_folder = output_base_folder


In [4]:
# Collect the bounding-box data that dp.process_images returns for the images in image_folder.
# NOTE(review): the original note says these are boxes of "person" objects -- confirm in
# data_preparation. The maximum-area box is only selected later, in process_images_with_inserts.
bounding_boxes_to_add = dp.process_images(image_folder, sentence_folder, annotation_folder, output_folder)

['75953', '75953', '75953', '75953', '75953']
[(6, 161, 120, 333)]
['234622', '234629', '234622', '234622', '234622']
[(312, 62, 498, 408), (81, 13, 331, 407), (1, 55, 42, 223), (94, 16, 210, 224)]
['31116', '31116', '31116', '31116', '31116']
[(147, 25, 382, 374)]
['248820', '248820', '248820', '248819', '248820']
[(85, 26, 338, 475), (22, 330, 51, 425), (51, 319, 78, 386)]
['83638', '83638', '83638', '83638', '83638']
[(39, 87, 357, 491)]
['232001', '232002', '232003', '232003', '232001', '232003', '232002']
[(276, 66, 397, 315), (381, 70, 479, 322)]
['249531', '249531', '249531', '249537', '249531']
[(66, 32, 259, 498), (1, 251, 61, 458)]
['266245', '266245', '266245', '266245', '266245']
[(50, 25, 253, 281), (226, 67, 306, 189)]
['63296', '63294', '63294', '63294', '63294', '63301', '63294']
[(154, 95, 293, 268), (187, 246, 359, 500), (163, 124, 198, 168)]
['214231', '214231', '214231', '214231', '214231', '214231']
[(30, 172, 331, 499)]
['38768', '38768', '38768', '38768', '38774'

In [5]:
# remove black background in the images of objects
def remove_black_background(image):
    """Return a 4-channel copy of ``image`` in which (near-)black pixels are transparent.

    Args:
        image: 3-channel BGR image as loaded by ``cv2.imread`` (exactly three
            channels are unpacked below).

    Returns:
        A 4-channel image (B, G, R, alpha). Alpha is 255 where the grayscale
        value of the pixel is greater than 1 and 0 elsewhere; the colour
        channels of the transparent pixels are set to 0.
    """
    # Grayscale version used only to decide which pixels count as background
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Binary mask: 255 for foreground (gray > 1), 0 for the black background
    _, mask = cv2.threshold(gray, 1, 255, cv2.THRESH_BINARY)

    # Replicate the mask over three channels so it can be AND-ed with the colour image
    mask_rgb = cv2.cvtColor(mask, cv2.COLOR_GRAY2BGR)

    # Zero out every background pixel, keep foreground pixels untouched
    image_with_bg_removed = cv2.bitwise_and(image, mask_rgb)

    # Append the mask as alpha channel. cv2.merge only takes the channel list;
    # the extra positional `4` that used to be passed here landed in the `dst`
    # slot and had no effect on the result.
    b, g, r = cv2.split(image_with_bg_removed)
    return cv2.merge([b, g, r, mask])

In [6]:
# function to insert the image of an object inside the bounding box
def insert_image(image, insert_image_path, bbox, percentage, start_margin):
    """Paste an object image into part of a bounding box of ``image``.

    The object image is read from ``insert_image_path``, its black background is
    made transparent with ``remove_black_background`` and it is stretched to
    cover ``percentage`` percent of the bounding box, measured from the
    ``start_margin`` side. ``image`` is modified in place and also returned.

    Args:
        image: target image array (modified in place).
        insert_image_path: path of the object image to paste.
        bbox: ``(xmin, ymin, xmax, ymax)`` in pixel coordinates
            (assumed to be integers -- they are used as slice bounds).
        percentage: share of the box height (``'top'``/``'bottom'``) or box
            width (``'left'``/``'right'``) to cover, in percent.
        start_margin: one of ``'top'``, ``'bottom'``, ``'left'``, ``'right'``.

    Returns:
        ``image``. It is returned unchanged when the object image cannot be
        read, when the target area is empty, or when it lies completely
        outside the image.

    Raises:
        ValueError: if ``start_margin`` is not one of the four supported sides
            (previously this surfaced as an ``UnboundLocalError``).
    """
    insert_img = cv2.imread(insert_image_path)
    if insert_img is None:
        print(f"Error: Image {insert_image_path} not found.")
        return image

    if start_margin not in ('top', 'bottom', 'left', 'right'):
        raise ValueError(
            f"start_margin must be 'top', 'bottom', 'left' or 'right', got {start_margin!r}"
        )

    # Remove the black background
    insert_img = remove_black_background(insert_img)

    xmin, ymin, xmax, ymax = bbox
    bbox_width = xmax - xmin
    bbox_height = ymax - ymin

    # The insert spans the full box along one axis and `percentage` of it along the other
    if start_margin in ('top', 'bottom'):
        insert_width = bbox_width
        insert_height = int(bbox_height * (percentage / 100.0))
    else:
        insert_width = int(bbox_width * (percentage / 100.0))
        insert_height = bbox_height

    # cv2.resize rejects empty target sizes (degenerate box or tiny percentage)
    if insert_width <= 0 or insert_height <= 0:
        print(f"Warning: empty insert area for bbox {bbox}; nothing inserted.")
        return image

    # Resize insert_img to fit the specified dimensions
    insert_img_resized = cv2.resize(insert_img, (insert_width, insert_height))

    # Top-left corner of the insert: anchored to the far edge for 'bottom'/'right',
    # to the box origin otherwise
    insert_x_start = xmax - insert_width if start_margin == 'right' else xmin
    insert_y_start = ymax - insert_height if start_margin == 'bottom' else ymin

    # Clip the target area to the image so that boxes reaching outside of it neither
    # wrap around (negative indices) nor cause a shape mismatch on assignment
    img_height, img_width = image.shape[:2]
    x0 = max(insert_x_start, 0)
    y0 = max(insert_y_start, 0)
    x1 = min(insert_x_start + insert_width, img_width)
    y1 = min(insert_y_start + insert_height, img_height)
    if x0 >= x1 or y0 >= y1:
        print(f"Warning: bbox {bbox} lies outside the image; nothing inserted.")
        return image

    # Matching crop of the insert and a writable view into the target image
    overlay = insert_img_resized[y0 - insert_y_start:y1 - insert_y_start,
                                 x0 - insert_x_start:x1 - insert_x_start]
    region = image[y0:y1, x0:x1]

    # Handle alpha channel if present
    if overlay.shape[2] == 4:
        # Alpha-blend all three colour channels at once; alpha keeps a trailing
        # axis so it broadcasts over the channels
        alpha_s = overlay[:, :, 3:4] / 255.0
        alpha_l = 1.0 - alpha_s
        region[:, :, :3] = alpha_s * overlay[:, :, :3] + alpha_l * region[:, :, :3]
    else:
        region[:] = overlay

    return image

In [7]:
def process_images_with_inserts(image_folder, sentence_folder, annotation_folder, insert_images_folder, percentage, start_margin, output_subfolder):
    """Paste a random object image into the largest bounding box of every image.

    For each ``(image_path, bboxes, image_id)`` entry produced by
    ``dp.process_images`` a random ``.png`` from ``insert_images_folder`` is
    inserted (see ``insert_image``) into the box with the largest
    ``dp.calculate_area`` and the result is written to
    ``<output_subfolder>/<image_id>.jpg``.

    Args:
        image_folder, sentence_folder, annotation_folder: forwarded to
            ``dp.process_images``.
        insert_images_folder: folder containing the ``.png`` object images.
        percentage: forwarded to ``insert_image``.
        start_margin: forwarded to ``insert_image``.
        output_subfolder: folder the results are written to; also forwarded to
            ``dp.process_images``.

    Returns:
        List of the modified images, in processing order. Empty when no object
        images are available.
    """
    bounding_boxes_data = dp.process_images(image_folder, sentence_folder, annotation_folder, output_subfolder)

    # The set of object images does not depend on the current image, so the folder
    # is scanned once. Sorting makes the random pick reproducible for a fixed seed,
    # since os.listdir returns entries in arbitrary order.
    insert_images = sorted(
        os.path.join(insert_images_folder, f)
        for f in os.listdir(insert_images_folder)
        if f.endswith('.png')
    )
    if not insert_images:
        print(f"No insert images found in folder: {insert_images_folder}")
        return []

    processed_images = []
    for image_path, bboxes, image_id in bounding_boxes_data:
        # max() raises on an empty sequence, so images without boxes are skipped
        if not bboxes:
            print(f"No bounding boxes for {image_id}; skipping.")
            continue

        insert_image_path = random.choice(insert_images)

        image = cv2.imread(image_path)
        if image is None:
            print(f"Error: Image {image_path} not found.")
            continue

        max_bbox = max(bboxes, key=dp.calculate_area)
        image_with_insert = insert_image(image, insert_image_path, max_bbox, percentage, start_margin)

        output_filename = os.path.join(output_subfolder, f'{image_id}.jpg')
        # cv2.imwrite signals failure through its return value, so only report
        # success when the file was actually written
        if cv2.imwrite(output_filename, image_with_insert):
            print(f"Processed {image_id} and saved as {output_filename}")
        else:
            print(f"Error: could not write {output_filename}")

        processed_images.append(image_with_insert)

    return processed_images

In [8]:
# Function to plot images
def plot_images(images, title_prefix):
    """Show every image of ``images`` in its own 10x5 figure without axes.

    Args:
        images: iterable of BGR images (they are converted to RGB for display).
        title_prefix: text placed before " - Image <n>" in each figure title,
            where <n> counts from 1.
    """
    for number, bgr_image in enumerate(images, start=1):
        fig, ax = plt.subplots(figsize=(10, 5))
        ax.set_title(f"{title_prefix} - Image {number}")
        # matplotlib expects RGB channel order, OpenCV images are BGR
        rgb_image = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)
        ax.imshow(rgb_image)
        ax.axis('off')
        plt.show()

In [9]:
# Share of the bounding box to cover (in percent) and the side it is measured from
percentage = 50
start_margin = 'top'

# One output folder per (margin, percentage) configuration
output_subfolder = os.path.join(output_base_folder, f'{start_margin}_{percentage}')
# exist_ok=True replaces the check-then-create pattern and keeps the cell safe to re-run
os.makedirs(output_subfolder, exist_ok=True)
processed_images = process_images_with_inserts(image_folder, sentence_folder, annotation_folder, insert_images_folder, percentage, start_margin, output_subfolder)
#plot_images(processed_images, f"{start_margin.capitalize()} {percentage}%")

['75953', '75953', '75953', '75953', '75953']
[(6, 161, 120, 333)]
['234622', '234629', '234622', '234622', '234622']
[(312, 62, 498, 408), (81, 13, 331, 407), (1, 55, 42, 223), (94, 16, 210, 224)]
['31116', '31116', '31116', '31116', '31116']
[(147, 25, 382, 374)]
['248820', '248820', '248820', '248819', '248820']
[(85, 26, 338, 475), (22, 330, 51, 425), (51, 319, 78, 386)]
['83638', '83638', '83638', '83638', '83638']
[(39, 87, 357, 491)]
['232001', '232002', '232003', '232003', '232001', '232003', '232002']
[(276, 66, 397, 315), (381, 70, 479, 322)]
['249531', '249531', '249531', '249537', '249531']
[(66, 32, 259, 498), (1, 251, 61, 458)]
['266245', '266245', '266245', '266245', '266245']
[(50, 25, 253, 281), (226, 67, 306, 189)]
['63296', '63294', '63294', '63294', '63294', '63301', '63294']
[(154, 95, 293, 268), (187, 246, 359, 500), (163, 124, 198, 168)]
['214231', '214231', '214231', '214231', '214231', '214231']
[(30, 172, 331, 499)]
['38768', '38768', '38768', '38768', '38774'