In [1]:
import csv
import math
import os
import random
import re
import shutil
import time
from itertools import product
from typing import Dict, List, Tuple

import cv2
import numpy as np
from IPython.display import Image, display
from PIL import Image as PILImage
from pydantic import BaseModel
from ultralytics import YOLO

# Gamma Correction
These functions will get executed during the image slicing.

In [2]:
def gamma_correction(image, gamma=1.0):
    # Ensure gamma is a positive number
    if gamma <= 0:
        raise ValueError("Gamma should be greater than 0")

    # Build a lookup table mapping pixel values [0, 255] to their gamma-corrected values
    inv_gamma = 1.0 / gamma
    table = np.array([
        ((i / 255.0) ** inv_gamma) * 255
        for i in range(256)
    ]).astype("uint8")

    # Apply gamma correction using the lookup table
    return cv2.LUT(image, table)


def auto_gamma(image, target_brightness=0.5):
    """
    Automatically adjusts gamma to normalize image brightness.

    Gamma > 1 brightens the image, Gamma < 1 darkens the image.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    brightness = np.mean(gray) / 255.0

    # Prevent division by zero or extremely low brightness
    if brightness < 1e-3:
        gamma = 2.5  # Force strong brightening
    else:
        gamma = target_brightness / brightness

    gamma = np.clip(gamma, 0.3, 3.0)  # Clamp to reasonable range
    corrected = gamma_correction(image, gamma)
    return corrected, gamma

# Image Slicing
Here we divide the images into slices of at most 1280x1280. It assumes that the input is in YOLO format, at `./data/yolo_data`. Note that we use the
words slicing and
tiling interchangeably.

In [3]:
VARIANTS = ["train", "test", "val"]
TILING_TARGET_RESOLUTION = (1280, 1280)
TILING_INPUT_FOLDER = os.path.join(os.getcwd(), "data", "yolo_data")
TILING_OUTPUT_FOLDER = os.path.join(os.getcwd(), "data", "tiling_output")

In [4]:
# Create the output folders
for variant in VARIANTS:
    if not os.path.exists(os.path.join(TILING_OUTPUT_FOLDER, variant)):
        os.makedirs(os.path.join(TILING_OUTPUT_FOLDER, variant, "images"), exist_ok=True)
        os.makedirs(os.path.join(TILING_OUTPUT_FOLDER, variant, "labels"), exist_ok=True)

files = []
for variant in VARIANTS:
    for file in os.listdir(os.path.join(TILING_INPUT_FOLDER, variant, "images")):
        files.append(str(os.path.join(TILING_INPUT_FOLDER, variant, "images", file)))


In [5]:

def slice_image(image: str) -> list:
    img_tiles = []
    img = cv2.imread(image)

    h, w, channels = img.shape

    number_of_images_w = math.ceil(w / TILING_TARGET_RESOLUTION[0])
    number_of_images_h = math.ceil(h / TILING_TARGET_RESOLUTION[1])

    for y in range(number_of_images_h):
        for x in range(number_of_images_w):
            x_min = int(x * TILING_TARGET_RESOLUTION[0])
            x_max = int(min(x_min + TILING_TARGET_RESOLUTION[0], w))
            y_min = int(y * TILING_TARGET_RESOLUTION[1])
            y_max = int(min(y_min + TILING_TARGET_RESOLUTION[1], h))

            tile = img[y_min:y_max, x_min:x_max]
            tile_corrected, gamma = auto_gamma(tile)
            img_tiles.append({"w": w, "h": h, "x_min": x_min, "x_max": x_max, "y_min": y_min, "y_max": y_max, "tile": tile_corrected})

    return img_tiles

In [6]:
class Label:
    """Denotes the bounding box in pixels, by having a minimum and maximum x & y."""
    def __init__(self, x_min, x_max, y_min, y_max):
        self.x_min = x_min
        self.x_max = x_max
        self.y_min = y_min
        self.y_max = y_max


def parse_labels(labels: List[str], original_w: int, original_h: int) -> List[Label]:
    parsed_labels = []
    for label in labels:
        coords = label.split(" ")[1:]

        width = float(coords[2])
        height = float(coords[3])

        x_min = float(coords[0]) - (width/2)
        y_min = float(coords[1]) - (height/2)
        x_max = x_min + width
        y_max = y_min + height

        parsed_labels.append(
            Label(
                x_min * original_w,
                x_max * original_w,
                y_min * original_h,
                y_max * original_h
            )
        )
    return parsed_labels

def process_tile_label(original_image_path: str, tiles: list, store_on_disk: bool = True) -> None:
    label_file = os.path.join(os.path.dirname(original_image_path), "..", "labels", os.path.basename(original_image_path))
    label_file = ".".join(label_file.split(".")[:-1]) + ".txt"

    with open(label_file, "r") as f:
        parsed_labels = parse_labels(f.readlines(), tiles[0]["w"], tiles[0]["h"])

    # Get the variant of the original image
    last_folder_name = TILING_INPUT_FOLDER.split("/")[-1].split("\\")[-1]
    search = re.search(rf"{last_folder_name}(/|\\)(.*?)(/|\\)images(/|\\)", original_image_path)
    variant = search.group(2)

    all_labels = []
    tile_files = []
    for i, tile in enumerate(tiles):
        # Figure out which labels of the labels of the original image are in the tile, and adapt the coordinates accordingly
        tile_labels = []
        for label_i, label in enumerate(parsed_labels):
            if (tile["x_min"] < label.x_min and tile["x_max"] > label.x_max and
                tile["y_min"] < label.y_min and tile["y_max"] > label.y_max):

                x_min = label.x_min - tile["x_min"]
                y_min = label.y_min - tile["y_min"]
                x_max = label.x_max - tile["x_min"]
                y_max = label.y_max - tile["y_min"]

                tile_width = tile["x_max"] - tile["x_min"]
                tile_height = tile["y_max"] - tile["y_min"]

                label_relative_width = (x_max - x_min) / tile_width
                label_relative_height = (y_max - y_min) / tile_height

                tile_labels.append(f"0 {x_min / tile_width + (label_relative_width / 2)} {y_min / tile_height + (label_relative_height / 2)} "
                                   f"{label_relative_width} "
                                   f"{label_relative_height}")

        if store_on_disk:
            # Store tile image
            tile_img_file_name = ".".join(original_image_path.split("/")[-1].split("\\")[-1].split(".")[:-1]) + "_tile-" + str(i) + ".png"
            tile_img_path = os.path.join(TILING_OUTPUT_FOLDER, variant, "images", tile_img_file_name)
            cv2.imwrite(tile_img_path, tile["tile"])

            # Store label
            tile_label_file_name = tile_img_file_name.replace(".png", ".txt")
            tile_label_path = os.path.join(TILING_OUTPUT_FOLDER, variant, "labels", tile_label_file_name)
            with open(tile_label_path, "w") as f:
                f.write("\n".join(tile_labels))

            tile_files.append((tile_img_path, tile_label_path))

        all_labels.append(tile_labels)

    if store_on_disk:
        return tile_files
    return all_labels

## Executing the slicing
We also time the results. It is noticeable how much time the disk read/write takes. This can surely be optimized when running it on the drone.

In [100]:
times = []
for file in files:
    t = time.time()

    tiles = slice_image(file)

    t1 = time.time()

    process_tile_label(file, tiles, True)

    t2 = time.time()

    times.append((t1 - t, t2 - t1))
    print(f"This took {t1 - t} seconds to slice and {t2 - t1} seconds to process. Total: {t2-t}")

This took 0.15807414054870605 seconds to slice and 0.2807657718658447 seconds to process. Total: 0.4388399124145508
This took 0.15058207511901855 seconds to slice and 0.31879186630249023 seconds to process. Total: 0.4693739414215088
This took 0.17541909217834473 seconds to slice and 0.3342711925506592 seconds to process. Total: 0.5096902847290039
This took 0.15952825546264648 seconds to slice and 0.32883405685424805 seconds to process. Total: 0.48836231231689453
This took 0.19880914688110352 seconds to slice and 0.2794930934906006 seconds to process. Total: 0.4783022403717041
This took 0.15489745140075684 seconds to slice and 0.2710402011871338 seconds to process. Total: 0.4259376525878906
This took 0.14499402046203613 seconds to slice and 0.27611589431762695 seconds to process. Total: 0.4211099147796631
This took 0.1387336254119873 seconds to slice and 0.2708473205566406 seconds to process. Total: 0.40958094596862793
This took 0.1475231647491455 seconds to slice and 0.2674715518951416

KeyboardInterrupt: 

# Bird Generation
This next part extracts birds from [a dataset](https://universe.roboflow.com/yaelym-hong/harmful-birds-detection/dataset/2) and places them into the
previously generated image slices/tiles. Because this dataset contains bounding boxes rather than outlines, we'll have to remove the background first, using
Apple shortcuts.

## Preprocess locally using Apple shortcuts
First, create the folders `./data/bird_dataset/crows` and `./data/bird_dataset/pigeons` . You need to use apple shortcuts to remove the background of these
pictures and add them to a folder. Do this separately for the pigeons and crows folders. Important to create a "pigeons" and "crows" folder in this new
folder. Use this new folder to add the pictures without background to the existing AllImages object.

Link to Apple shortcut (Self made)
https://www.icloud.com/shortcuts/cb718c0257a8443fb68a9d5243597a47

A problem is that some of the images have instead of 4 yolo coordinates have 6 and many classifications, even though there is only a single pigeon in the bird.
Therefore, these are removed. These pictures are annotated with an outline around the bird, but that is not the correct format for our project

In [None]:
BIRD_DATASET_LOCATION = os.path.join("data", "bird_dataset")

In [None]:
class ImageData(BaseModel):
    image_name: str
    image_paths: List[str] = list()
    label_text: str
    bird_class: int = None # 0 = crow, 1 = , 2 = , 3 = pigeon, 4 = other
    cleaned_file: str = ""

    def model_post_init(self, context):
        self.bird_class = int(self.label_text[0])

        return super().model_post_init(context)

    def get_random_image_path(self):
        random_image = random.choice(self.image_paths)
        random_image = self.image_paths[0]

        return self.image_name, random_image, self.label_text, self.bird_class

    def get_cleaned_image(self):
        display(Image(filename=self.cleaned_file))

    def get_cleaned_scaled_image(self, new_width, new_height):
        img = PILImage.open(self.cleaned_file)
        wpercent = (new_width / float(img.size[0]))
        hsize = int((float(img.size[1]) * float(wpercent)))
        img_resized = img.resize((new_width, hsize), PILImage.Resampling.LANCZOS)
        display(img_resized)

    def get_cropped_images(self, new_width):
        img = PILImage.open(self.cleaned_file)
        img_width, img_height = img.size

        bounding_boxes = self.label_text.split("\n")
        # You only want to take one of the bounding boxes to display because we only want to add one picture into another picture
        # So we take the largest one, which has the highest probability to be one that is the most complete bird
        sorted_bounding_boxes = sorted(bounding_boxes, reverse=True, key= lambda x: x[3])
        for largest_bounding_box in sorted_bounding_boxes:
            # when the data is in incorrect format
            if len(largest_bounding_box.split(" ")) != 5:
                return False
            bird_class, x_center_rel, y_center_rel, width_rel, height_rel = map(float, largest_bounding_box.split(" "))
            x_center = x_center_rel * img_width
            y_center = y_center_rel * img_height
            width = width_rel * img_width
            height = height_rel * img_height

            x_short = x_center - (0.5 * width)
            x_long = x_center + (0.5 * width)
            y_short = y_center - (0.5 * height)
            y_long = y_center + (0.5 * height)
            cropped_img = img.crop((x_short, y_short, x_long, y_long))

            # now we are scaling the cropped image to the correct size
            wpercent = (new_width / float(img_width))
            hsize = int((float(img_height) * wpercent))
            img_resized = cropped_img.resize((new_width, hsize), PILImage.Resampling.LANCZOS)
            if cropped_img.mode != "RGBA":
                print("image is in mode: ", cropped_img.mode, "converting to RGBA")
                cropped_img = cropped_img.convert("RGBA")
            # Extract alpha channel (opacity)
            alpha = img_resized.getchannel("A")

            # Convert to numpy array for efficient computation
            alpha_np = np.array(alpha, dtype=np.float32) / 255.0  # Normalize to [0,1]

            # Calculate average opacity
            avg_opacity = np.mean(alpha_np)

            # Skip image if average opacity is less than 0.05
            if avg_opacity < 0.05:
                print("the opacity is too little for the largest bounding box")
                continue

            return img_resized
        return False

In [None]:
class AllImages(BaseModel):
    images_dict: Dict[str, ImageData] = dict()

    def get_image_list_index(self, bird_classes: Tuple[int] = (0, 1, 2, 3, 4), cleaned_file=False):
        """gets a list of bird images that satisify the requirement of input"""
        if cleaned_file:
            found_image_dict = {index: image_name
                        for index, (image_name, image)
                          in enumerate(self.images_dict.items())
                          if image.bird_class in bird_classes and image.cleaned_file != ""}
        else:
            found_image_dict = {index: image_name
                            for index, (image_name, image)
                            in enumerate(self.images_dict.items())
                            if image.bird_class in bird_classes}
        return found_image_dict

    def get_random_instance(self, bird_classes: Tuple[int] = (0, 1, 2, 3, 4), cleaned_file=False):
        found_image_dict = self.get_image_list_index(bird_classes, cleaned_file)
        random_key = random.choice(list(found_image_dict.keys()))
        found_image = found_image_dict[random_key]
        return self.images_dict[found_image]

    def get_random_picture(self, bird_classes: Tuple[int] = (0, 1, 2, 3, 4)):
        found_image = self.get_random_instance(bird_classes)
        image_name, found_image_path, label_text, bird_class = found_image.get_random_image_path()
        display(Image(filename=found_image_path))
        return found_image_path

    def get_random_clean_image(self, new_width, new_height):
        image = self.get_random_instance((0, 3), True)
        print(image.image_name)
        print(image.cleaned_file)
        image.get_cleaned_scaled_image(new_width, new_height)

    def get_random_cropped_images(self, new_width):
        # Sometimes the cropped image is in the wrong format. So we recursively call this function to retry another one
        for i in range(10):
            image = self.get_random_instance((0, 3), True)
            cropped_image = image.get_cropped_images(new_width)
            if cropped_image:
                print("found image= ", image.image_name)
                return cropped_image

        print("No valid cropped image found after 10 attempts.")
        return None

    def get_list_of_paths_crows_pigeons(self):
        """returns all of the information of the files as a list of lists.
        only includes pigeons and crows"""
        found_images_objects = self.get_image_list_index((0, 3))
        image_paths = [self.images_dict[image].get_random_image_path() for image in list(found_images_objects.values())]
        return image_paths

    def copy_crows_pigeons(self, destination_folder: str):
        crows_pigeon_paths = self.get_list_of_paths_crows_pigeons()
        crows_path = f"{destination_folder}/crows"
        pigeons_path = f"{destination_folder}/pigeons"
        if not os.path.exists(destination_folder):
            os.mkdir(destination_folder)
            os.mkdir(crows_path)
            os.mkdir(f"{crows_path}/labels")
            os.mkdir(pigeons_path)
            os.mkdir(f"{pigeons_path}/labels")
        else:
            raise Exception("folder already exists")


        for index, (image_name, image_path, label_text, bird_class) in enumerate(crows_pigeon_paths):
            bird_cat = "c" if bird_class == 0 else "p"
            image_name = f"{bird_cat}_{index}"
            if bird_class == 0:
                #shutil.copy(image_path, f"{crows_path}/{image_name}.jpg")
                shutil.copy(image_path, crows_path)
                with open(f"{crows_path}/labels/{image_name}.txt", "w") as f:
                    f.write(label_text)

            elif bird_class == 3:
                #shutil.copy(image_path, f"{pigeons_path}/{image_name}.jpg")
                shutil.copy(image_path, pigeons_path)

                with open(f"{pigeons_path}/labels/{image_name}.txt", "w") as f:
                    f.write(label_text)

    def load_removed_background_pictures(self, path: str):
        """give the folder of where the pictures are that have removed the background
        the path folder should contain two folders "pigeons" and "crows"
        """
        folders = os.listdir(path)
        if not ("pigeons" in folders and "crows" in folders):
            raise Exception("pigeons and crows doesn't exist in folder")

        for file in os.listdir(f"{path}/pigeons"):
            if ".DS_Store" in file:
                continue
            first_file_name = file.split(".")[0]
            self.images_dict[first_file_name].cleaned_file = f"{path}/pigeons/{file}"

        for file in os.listdir(f"{path}/crows"):
            if ".DS_Store" in file:
                continue
            first_file_name = file.split(".")[0]
            self.images_dict[first_file_name].cleaned_file = f"{path}/crows/{file}"

    def get_files_in_data_folder(self, path: str):
        images_paths = [f"{path}/images/{file_path}" for file_path in os.listdir(f"{path}/images")]
        label_file_names = [file_path for file_path in os.listdir(f"{path}/labels")]

        for label_file in label_file_names:
            if ".DS_Store" in label_file:
                continue
            # the same picture has the first part the same but might have had different augmentation
            first_file_name = label_file.split(".")[0]
            if first_file_name in self.images_dict:
                image = self.images_dict[first_file_name]
            else:
                with open(f"{path}/labels/{label_file}") as f:
                    label_text = f.read()
                image = ImageData(image_name=first_file_name,
                                  label_text=label_text)
            label_file_no_ext = os.path.splitext(label_file)[0]
            found_image = [file_name for file_name in images_paths if label_file_no_ext in file_name][0]
            image.image_paths.append(found_image)
            self.images_dict[first_file_name] = image

In [None]:
def get_all_images_objects():
    all_images = AllImages()
    all_images.load_removed_background_pictures(BIRD_DATASET_LOCATION)
    return all_images

all_images_objects = get_all_images_objects()


def get_average_bounding_box(label_path: str):
    """label_path is the location to the yoloflow.txt file of the image. takes all the bounding boxes of the image and calculate the average"""
    with open(label_path) as f:
        bounding_boxes = f.readlines()

    bounding_boxes = [bounding_box.strip().split(" ") for bounding_box in bounding_boxes]
    #bird_class, x_center_rel, y_center_rel, width_rel, height_rel = map(float, largest_bounding_box.split(" "))

    try:
        average_rel_width = sum([float(bounding_box[3]) for bounding_box in bounding_boxes])/len(bounding_boxes)
        average_rel_height = sum([float(bounding_box[4]) for bounding_box in bounding_boxes])/len(bounding_boxes)
    except ZeroDivisionError:
        average_rel_width = 0
        average_rel_height = 0
    return average_rel_width, average_rel_height


def add_picture_to_picture(image_path: str, label_path: str, average_rel_width, average_rel_height):
    """file_name is the name of the file to be augmented onto
    folder_path is where the filename is in
    average_rel_width and average_rel_height should come from the function get_average_bounding_box(label_path)
    """
    img = cv2.imread(image_path, cv2.IMREAD_UNCHANGED)

    h, w, channels = img.shape
    height_pixels = int(h*average_rel_height)
    width_pixels = int(w*average_rel_width)

    cropped_pil = all_images_objects.get_random_cropped_images(width_pixels)
    if cropped_pil is None:
        print("cropped image above is None")
        cropped_pil = all_images_objects.get_random_cropped_images(width_pixels)
        if cropped_pil is None:
            print("it is still None, abort")
            return

    # Convert PIL to OpenCV format
    cropped_np = np.array(cropped_pil.convert("RGBA"))
    cropped_cv = cv2.cvtColor(cropped_np, cv2.COLOR_RGBA2BGRA)  # Preserve alpha channel

    # Choose a position to paste. At least 3 pixels from the border and half of the picture size to be added
    x_offset = random.randint(
        int(3 + width_pixels * 0.5),
        int(w - 3 - width_pixels * 0.5)
    )

    y_offset = random.randint(
        int(3 + height_pixels * 0.5),
        int(h - 3 - height_pixels * 0.5)
    )

    # Get overlay dimensions
    overlay_h, overlay_w = cropped_cv.shape[:2]

    # Make sure the overlay fits within the image bounds
    if y_offset + overlay_h > h:
        overlay_h = h - y_offset
        cropped_cv = cropped_cv[:overlay_h, :, :]

    if x_offset + overlay_w > w:
        overlay_w = w - x_offset
        cropped_cv = cropped_cv[:, :overlay_w, :]

    # Get the ROI from the original image
    roi = img[y_offset:y_offset+overlay_h, x_offset:x_offset+overlay_w]

    # Check channels
    ch = cropped_cv.shape[2]

    # Proper alpha blending
    if ch == 4:  # If we have an alpha channel
        # Extract the alpha channel and normalize to [0, 1]
        alpha = cropped_cv[:, :, 3] / 255.0

        # Create a 3-channel alpha mask
        alpha_3d = np.dstack((alpha, alpha, alpha))

        # Extract BGR channels from overlay
        overlay_bgr = cropped_cv[:, :, :3]

        # Calculate blended image
        blended = (1.0 - alpha_3d) * roi + alpha_3d * overlay_bgr

        # Replace the ROI with the blended image
        img[y_offset:y_offset+overlay_h, x_offset:x_offset+overlay_w] = blended.astype(np.uint8)
    else:
        # Just copy if no alpha
        img[y_offset:y_offset+overlay_h, x_offset:x_offset+overlay_w] = cropped_cv

    # below we are adding the newly generated image that is augmented with one extra bird
    # we add the label as well to the dataset
    x_offset_rel = x_offset/w
    y_offset_rel = y_offset/h
    x_rel = overlay_w/w
    y_rel = overlay_h/h
    new_yolo_label_str = f"0 {x_offset_rel+0.5*x_rel} {y_offset_rel+0.5*y_rel} {x_rel} {y_rel}"
    with open(label_path, "a") as f:
        f.write(f"\n{new_yolo_label_str}")
    cv2.imwrite(image_path, img)

## Executing the bird generation

In [105]:
GENERATED_BIRD_OUTPUT_FOLDER = os.path.join(os.getcwd(), "data", "generated_bird_output")

# Create the output folders, only for the training data.
os.makedirs(os.path.join(GENERATED_BIRD_OUTPUT_FOLDER, "train", "images"), exist_ok=True)
os.makedirs(os.path.join(GENERATED_BIRD_OUTPUT_FOLDER, "train", "labels"), exist_ok=True)

files = []
for file in os.listdir(os.path.join(GENERATED_BIRD_OUTPUT_FOLDER, "train", "images")):
    files.append(str(os.path.join(GENERATED_BIRD_OUTPUT_FOLDER, "train", "images", file)))

# The birds are only added to the training data, so we'll copy the validation & test data to the output folder without augmenting them.
shutil.copytree(os.path.join(TILING_OUTPUT_FOLDER, "val"), os.path.join(GENERATED_BIRD_OUTPUT_FOLDER, "val"))
shutil.copytree(os.path.join(TILING_OUTPUT_FOLDER, "test"), os.path.join(GENERATED_BIRD_OUTPUT_FOLDER, "test"))

# Also add a small data.yaml file, for the YOLO model.
with open(os.path.join(GENERATED_BIRD_OUTPUT_FOLDER, "data.yaml"), "w") as f:
    f.write("""
names:
  0: bird
nc: 1
train: train/images
test: test/images
val: val/images
    """)


# tiles_per_image is a variable that was created in the image slicing phase
for original_path, tiles in tiles_per_image.items():
    if "train" not in original_path:
        continue

    label_file = os.path.join(os.path.dirname(original_path), "..", "labels", os.path.basename(original_path))
    label_file = ".".join(label_file.split(".")[:-1]) + ".txt"

    avg_bb_width, avg_bb_height = get_average_bounding_box(label_file)
    if avg_bb_width == 0 or avg_bb_height == 0:
        continue

    # Here we scale the avg bounding boxes of the original image to a same sized bounding box in a tile/slice.
    target_width = avg_bb_width * tiles[0]["w"] / TILING_TARGET_RESOLUTION[0]
    target_height = avg_bb_height * tiles[0]["h"] / TILING_TARGET_RESOLUTION[1]

    for i in range(len(tiles)):
        tile_img_file_name = ".".join(original_path.split("\\")[-1].split(".")[:-1]) + "_tile-" + str(i) + ".png"
        tile_img_path = os.path.join(GENERATED_BIRD_OUTPUT_FOLDER, "train", "images", tile_img_file_name)

        tile_label_file_name = tile_img_file_name.replace(".png", ".txt")
        tile_label_path = os.path.join(GENERATED_BIRD_OUTPUT_FOLDER, "train", "labels", tile_label_file_name)

        # Here we finally add the birds to the tile.
        add_picture_to_picture(tile_img_path, tile_label_path, target_width, target_height)




C:\studie\Semester 2\content\Deep Learning\group_assignment\DL---detection-of-birds-in-drone-images\data\yolo_data\train\images\..\labels\20240901115456_0265_D_frame_1170 - kopie.txt
C:\studie\Semester 2\content\Deep Learning\group_assignment\DL---detection-of-birds-in-drone-images\data\yolo_data\train\images\..\labels\20240901115456_0265_D_frame_1230 - kopie.txt
C:\studie\Semester 2\content\Deep Learning\group_assignment\DL---detection-of-birds-in-drone-images\data\yolo_data\train\images\..\labels\20240901115456_0265_D_frame_4560 - kopie.txt
C:\studie\Semester 2\content\Deep Learning\group_assignment\DL---detection-of-birds-in-drone-images\data\yolo_data\train\images\..\labels\20240901115456_0265_D_frame_4590 - kopie.txt
C:\studie\Semester 2\content\Deep Learning\group_assignment\DL---detection-of-birds-in-drone-images\data\yolo_data\train\images\..\labels\20240901120230_0267_D_frame_630 - kopie.txt
C:\studie\Semester 2\content\Deep Learning\group_assignment\DL---detection-of-birds-in

# Hyperparameter tuning

In [None]:
# Hyperparameter options
learning_rates = [0.001, 0.005, 0.01]
mosaic_values = [0.0, 0.5, 1.0]
scale_values = [0.3, 0.5]

# Prepare CSV to store results
results_file = "grid_search_results.csv"
with open(results_file, "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["Run", "lr0", "mosaic", "scale", "mAP50", "mAP50-95"])

# Run grid search
runs = list(product(learning_rates, mosaic_values, scale_values))
for i, (lr0, mosaic, scale) in enumerate(runs, start=1):
    print(f"\n🔁 Running configuration {i}/12: lr0={lr0}, mosaic={mosaic}, scale={scale}")

    model = YOLO("yolov8m.pt")  # Change to yolov8s.pt or other if needed
    if f"run_{i}_lr{lr0}_mos{mosaic}_sc{scale}" in os.listdir("grid_search_yolo"):
        print("already process this parameter")
        continue
    # Train
    results = model.train(
        data=DATA_YOLO_FORMAT_LOCATION,
        epochs=50,
        imgsz=1280,
        batch=-1,
        lr0=lr0,
        mosaic=mosaic,
        scale=scale,
        patience=10,
        project="grid_search_yolo",
        name=f"run_{i}_lr{lr0}_mos{mosaic}_sc{scale}",
        exist_ok=True,
        seed=42,
        verbose=False,
    )

    # Get metrics
    metrics = model.val()
    mAP50 = metrics.box.map50
    mAP50_95 = metrics.box.map

    # Save results
    with open(results_file, "a", newline="") as f:
        writer = csv.writer(f)
        writer.writerow([i, lr0, mosaic, scale, mAP50, mAP50_95])

print("\n✅ Grid search complete. Results saved to:", results_file)

# Best Model

The best parameters we found were:
- Learning rate: 0.01
- Mosiac value: 1.0
- Scale: 0.5

In [111]:
model = YOLO("FINAL_NOTEBOOK/best_model.pt")
# results = model.train(
#     data=GENERATED_BIRD_OUTPUT_FOLDER,
#     epochs=50,
#     imgsz=1280,
#     batch=-1,
#     lr0=0.01,
#     mosaic=1.0,
#     scale=0.5,
#     patience=10,
#     project="grid_search_yolo",
#     name=f"Best model",
#     exist_ok=True,
#     seed=42,
#     verbose=False,
# )

test_set_results = model.val(data=os.path.join(GENERATED_BIRD_OUTPUT_FOLDER, "data.yaml"), imgsz=1280, save=True, split='test')
print("\n📊 YOLO Evaluation Metrics on Test Set")
print("======================================")
print(f"Precision      : {test_set_results['metrics/precision']:.4f}")
print(f"Recall         : {test_set_results['metrics/recall']:.4f}")
print(f"mAP@0.5        : {test_set_results['metrics/mAP_0.5']:.4f}")
print(f"mAP@0.5:0.95   : {test_set_results['metrics/mAP_0.5:0.95']:.4f}")
print("======================================")

model.save("best_model.pt")

Ultralytics 8.3.132  Python-3.12.0 torch-2.7.0+cpu CPU (13th Gen Intel Core(TM) i7-13620H)
YOLO11m summary (fused): 125 layers, 20,030,803 parameters, 0 gradients, 67.6 GFLOPs
[34m[1mval: [0mFast image access  (ping: 0.00.0 ms, read: 1065.065.1 MB/s, size: 2441.8 KB)


[34m[1mval: [0mScanning C:\studie\Semester 2\content\Deep Learning\group_assignment\DL---detection-of-birds-in-drone-images\data\generated_bird_output\test\labels... 176 images, 123 backgrounds, 0 corrupt: 100%|██████████| 176/176 [00:01<00:00, 159.35it/s]

[34m[1mval: [0mNew cache created: C:\studie\Semester 2\content\Deep Learning\group_assignment\DL---detection-of-birds-in-drone-images\data\generated_bird_output\test\labels.cache



                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95):  36%|███▋      | 4/11 [01:16<02:14, 19.20s/it]


KeyboardInterrupt: 

# Inference
We first have to tile and apply gamma correction, then we can infer.


In [7]:
def tile_coords_to_image_coords(bird_xyxy: tuple, tile):
    x_min = int(bird_xyxy[0] + tile["x_min"])
    y_min = int(bird_xyxy[1] + tile["y_min"])
    x_max = int(bird_xyxy[2] + tile["x_min"])
    y_max = int(bird_xyxy[3] + tile["y_min"])
    return x_min, y_min, x_max, y_max

def infer(model_location: str, image_to_infer_location: str) -> List[Tuple[int, int, int, int]]:
    model = YOLO(model_location)
    tiles = slice_image(image_to_infer_location)
    bird_boxes = []
    for i, tile in enumerate(tiles):
        tile_file_name = f"to_infer_tmp.png"
        cv2.imwrite(tile_file_name, tile["tile"])

        results = model(tile_file_name)
        for box in results[0].boxes.xyxy:
            bird_boxes.append(tile_coords_to_image_coords(box, tile))

        os.remove(tile_file_name)
        if i == 1:
            break

    return bird_boxes

In [11]:
BEST_MODEL = "best_model.pt"
IMAGE_TO_INFER = "test_for_inference.png"

# Do the inference
found_birds = infer(BEST_MODEL, IMAGE_TO_INFER)

# Display the results
image = cv2.imread(IMAGE_TO_INFER)

for (x_min, y_min, x_max, y_max) in found_birds:
    cv2.rectangle(image, (x_min, y_min), (x_max, y_max), color=(0, 0, 255), thickness=2)

image = cv2.resize(image, (1244, 700))
cv2.imshow('Image with Bounding Boxes', image)
cv2.waitKey(0)
cv2.destroyAllWindows()


image 1/1 c:\studie\Semester 2\content\Deep Learning\group_assignment\DL---detection-of-birds-in-drone-images\FINAL_NOTEBOOK\to_infer_tmp.png: 1280x1280 (no detections), 944.4ms
Speed: 8.0ms preprocess, 944.4ms inference, 0.9ms postprocess per image at shape (1, 3, 1280, 1280)

image 1/1 c:\studie\Semester 2\content\Deep Learning\group_assignment\DL---detection-of-birds-in-drone-images\FINAL_NOTEBOOK\to_infer_tmp.png: 1280x1280 6 items, 929.9ms
Speed: 8.0ms preprocess, 929.9ms inference, 1.6ms postprocess per image at shape (1, 3, 1280, 1280)
