# Deep Learning group assignment
Group name: Angry Birds

Group members:
- Nienke Reijnen: 2117034
- Andrea Ciavatti: 2115635
- Niels Boonstra: 1451294
- Yannick Lankhorst: 2052754
- Thom Zoomer:2059225
- Anne Barnasconi: 2053988

## Setting up the environment

Before running, make sure to also have installed the following packages (according to lab 8 instructions):
- pip install imageio
- pip install future
- pip install tensorboard

In [71]:
import os
import json
import shutil
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import Dataset
import json
from PIL import Image
from tqdm import tqdm
import random
from torchvision.transforms import functional
from torch.utils.data import ConcatDataset



In [None]:
device, device_name = (torch.device("cuda"), torch.cuda.get_device_name(0)) if torch.cuda.is_available() else (torch.device("cpu"), "CPU")
print(f"Device: {device}, {device_name}")

## Data loading & preprocessing

- DONE Correct implementation of data loaders for images and annotations for your specific object detection model
- DONE Use of data augmentation techniques
- DONE Appropriate shuffling  and batching of data
- TO DO: Conduct an online search for relevant open-source datasets, and if you can find them, use them in your application as additional training data (to improve generalization)

### The code below needs to be run on your own laptop to convert the names of the scare_crow dataset

In [None]:
# Define base directory and dataset splits
base_dir = "scarecrow_dataset"
splits = ["train", "val", "test"]

# Allowed image extensions (all lowercased for matching)
image_extensions = (".png", ".jpg", ".jpeg")

for split in splits:
    print(f"\n Processing {split} split...")
    
    image_dir = os.path.join(base_dir, split, "images")
    annotation_path = os.path.join(base_dir, split, "annotations.json")

    # Get image files (case-insensitive extension check)
    image_files = sorted([
        f for f in os.listdir(image_dir)
        if os.path.splitext(f)[1].lower() in image_extensions
    ])

    print(f" Found {len(image_files)} image files in '{split}/images'")

    # Load annotations
    with open(annotation_path, "r") as f:
        annotations = json.load(f)

    print(f" Found {len(annotations)} annotation entries in annotations.json")

    # Ensure lengths match
    if len(image_files) != len(annotations):
        print(f"  WARNING: Number of images ≠ number of annotations in {split}")
        min_len = min(len(image_files), len(annotations))
        print(f" Truncating to {min_len} entries to avoid mismatch.")
        image_files = image_files[:min_len]
        annotations = annotations[:min_len]
    else:
        min_len = len(image_files)

    # Detect if already renamed
    expected_prefix = f"{split}_original_"
    already_renamed = all(f.startswith(expected_prefix) for f in image_files)

    if already_renamed:
        print(f" Skipping {split} — already renamed.")
        continue

    # Perform renaming
    for idx, old_name in enumerate(image_files, 1):
        ext = os.path.splitext(old_name)[1]
        new_name = f"{split}_original_{idx}{ext.lower()}"
        old_path = os.path.join(image_dir, old_name)
        new_path = os.path.join(image_dir, new_name)
        os.rename(old_path, new_path)
        image_files[idx - 1] = new_name  # Update name in list

    # Update annotations
    for idx, entry in enumerate(annotations):
        entry["OriginalFileName"] = image_files[idx]

    # Save updated annotations
    with open(annotation_path, "w") as f:
        json.dump(annotations, f, indent=4)

    print(f" {split} renamed and annotations updated.")


### Data loading & augmentation

In [None]:
######################################
### Defining a CustomDataset class ###
######################################


class CustomDataset(Dataset):
    def __init__(self, data_path, transform = None):
        """
        Initialize the custom dataset.
        Works for both the train data and the test data.
        """
        self.images_dir = os.path.join(data_path, "images")
        self.transform = transform
        annotations_file = data_path + "/annotations.json"
        with open(annotations_file, 'r') as f:
            annotations_list = json.load(f)
       
        # We need to extract the bounding boxes of the annotations from the JSON file and store them as [x_min, y_min, x_max, y_max] tensors
        self.data = []
        for entry in annotations_list:
            image_name = entry['OriginalFileName']
            annotation_data = entry['AnnotationData']
            bird_boxes = self.extract_bird_boxes(annotation_data)
            self.data.append({'imagename': image_name, 'bird_boxes_tensor': bird_boxes})

        # Note: we should not load all the images into a tensor here, as it would take too much memory. We load images into a tensor in the __getitem__ method.


    def extract_bird_boxes(self, annotation_data):
        """
        Extract the coordinates of the birds from the annotation data in the JSON file and return it as a tensor.
        """
        bird_boxes = []
        for entry in annotation_data:
            if entry['Label'] == 'Bird':
                coordinates_list = entry['Coordinates']
                x_coordinates = [point['X'] for point in coordinates_list]
                y_coordinates = [point['Y'] for point in coordinates_list]
                x_min, x_max = min(x_coordinates), max(x_coordinates)
                y_min, y_max = min(y_coordinates), max(y_coordinates)
                bird_boxes.append([x_min, y_min, x_max, y_max])

        return torch.tensor(bird_boxes, dtype=torch.float32) # Shape: (num_birds, 4)


    def __len__(self):
        """
        Return the size of the dataset, i.e. the number of images.
        """
        return len(self.data)


    def __getitem__(self, index):
        """
        Load an image and its corresponding annotations.
        Returns the image and a target dictionary with bounding boxes and labels (we need this for compatiblity with object detection models like Faster R-CNN)
        """
        item = self.data[index]
        image_path = os.path.join(self.images_dir, item['imagename'])
        image = Image.open(image_path).convert("RGB")
        
        bird_boxes = item['bird_boxes_tensor']
        labels = torch.ones((bird_boxes.shape[0],), dtype=torch.int64) # Assuming all the labels are 'Bird' --> we assign this to class 1
        target = {'boxes': bird_boxes, 'labels': labels} # should contain the bounding boxes and the labels

        # Apply data augmentations
        if self.transform:
            image, target = self.transform(image, target)

        return image, target


In [None]:
###############################################
### Finding the mean and std of the dataset ###
###############################################

def calculate_mean_and_std(dataset):
    # Initialize sums for mean and variance
    mean = torch.zeros(3)
    std = torch.zeros(3)
    num_pixels = 0

    # Use tqdm to add a progress bar
    for image, _ in tqdm(dataset, desc="Calculating Mean and Std", unit="image"):
        # Convert image to tensor if it is in PIL format
        image = transforms.ToTensor()(image)  # shape: (C, H, W)
        
        # Calculate the sum and squared sum of pixels for each channel
        mean += image.mean([1, 2])  # mean per channel (C,)
        std += image.std([1, 2])    # std per channel (C,)
        num_pixels += 1
    
    # Average the sums to get the mean and std
    mean /= num_pixels
    std /= num_pixels
    
    return mean, std

# For now, do no transformations:
train_data_original = CustomDataset("scarecrow_dataset/train", transform=None)
train_data_extra = CustomDataset("bird-detection-farm/train", transform=None)

train_data = torch.utils.data.ConcatDataset([train_data_original, train_data_extra])

mean, std = calculate_mean_and_std(train_data)
print(f"Dataset Mean: {mean}")
print(f"Dataset Std: {std}")


Dataset Mean: tensor([0.5390, 0.5306, 0.4421])

Dataset Std: tensor([0.1624, 0.1527, 0.1647])

### With the extra data

Dataset Mean: tensor([0.5409, 0.5505, 0.3894])

Dataset Std: tensor([0.1674, 0.1557, 0.1689])

In [None]:
#############################################
### Defining a CustomTransformation class ###
##############################################

random.seed(7)
torch.manual_seed(7)
mean = [0.5409, 0.5505, 0.3894]
std = [0.1674, 0.1557, 0.1689]

class CustomTransformation:
    def __init__(self):
        self.transforms = transforms.Compose([transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
                                              transforms.ToTensor(),
                                              transforms.Normalize(mean, std)])
        

    def perform_horizontal_flip(self, image, target, prob = 0.25):
        """
        Horizontally flips the image with a given probability, default is 0.25
        """
        flip_prob = prob
        if random.random() < flip_prob:
            # Flip the image
            image = functional.hflip(image)

            # Flip the bounding boxes
            boxes = target['boxes']
            width, _ = image.size
            x_min = boxes[:, 0].clone()
            x_max = boxes[:, 2].clone()
            boxes[:, 0] = width - x_max
            boxes[:, 2] = width - x_min
        
        return image, target
    

    def perform_vertical_flip(self, image, target, prob = 0.25):
        """
        Vertically flips the image with a given probability, default is 0.25
        """
        flip_prob = prob
        if random.random() < flip_prob:
            # Flip the image
            image = functional.vflip(image)

            # Flip the bounding boxes
            boxes = target['boxes']
            _, height = image.size
            y_min = boxes[:, 1].clone()
            y_max = boxes[:, 3].clone()
            boxes[:, 1] = height - y_max
            boxes[:, 3] = height - y_min
            target['boxes'] = boxes

        return image, target
    

    def perform_random_rotation(self, image, target, prob = 0.25):
        """
        Perform a random rotation in multiples of 90 degrees with a given probability, default is 0.25.
        """
        if random.random() < prob:
            angle = random.choice([90, 180, 270])

            image = functional.rotate(image, angle)

            boxes = target['boxes']
            width, height = image.size

            if angle == 90:
                boxes = boxes[:, [1, 0, 3, 2]] # swap x and y coordinates
                boxes[:, [0, 2]] = height - boxes[:, [2, 0]] # adjut x-coordinates because the rotation changes the origin
            elif angle == 180:
                boxes[:, [0, 2]] = width - boxes[:, [2, 0]] # adjust x-coordinates
                boxes[:, [1, 3]] = height - boxes[:, [3, 1]] # adjust y-coordinates
            elif angle == 270:
                boxes = boxes[:, [1, 0, 3, 2]] # swap x and y coordinates
                boxes[:, [1, 3]] = width - boxes[:, [3, 1]] # adjust y-coordinates because the rotation changes to origin
            target['boxes'] = boxes
        
        return image, target


    def perform_random_resize(self, image, target, scale_range=(0.75, 1.25)):
        """
        Perform a random reize within the specified scale range, default scale range is (0,75, 1.25)
        """
        scale = random.uniform(*scale_range)

        # Resize the image
        width, height = image.size
        new_height, new_width = int(height * scale), int(width * scale)
        image = functional.resize(image, [new_height, new_width])
        
        # Resize the boxes
        boxes = target['boxes']
        boxes = boxes * scale
        target['boxes'] = boxes
        
        return image, target


    def __call__(self, image, target):
        """
        Apply the transformations to an image
        """
        image, target = self.perform_horizontal_flip(image, target)
        image, target = self.perform_vertical_flip(image, target)
        image, target = self.perform_random_rotation(image, target)
        image, target = self.perform_random_resize(image, target)
        image = self.transforms(image)
        return image, target

In [None]:
batch_size = 32

# Define paths
train_data_original_path = "scarecrow_dataset/train"
test_data_original_path = "scarecrow_dataset/test"
val_data_original_path = "scarecrow_dataset/val"
train_data_extra_path = "bird-detection-farm/train"
valid_data_extra_path = "bird-detection-farm/valid"
test_data_extra_path = "bird-detection-farm/test"

# Loading the datasets with the transformations
transform = CustomTransformation()

train_data_original = CustomDataset(train_data_original_path, transform)
valid_data_original = CustomDataset(val_data_original_path, transform)
test_data_original = CustomDataset(test_data_original_path, transform)

train_data_extra = CustomDataset(train_data_extra_path, transform)
valid_data_extra = CustomDataset(valid_data_extra_path, transform)
test_data_extra = CustomDataset(test_data_extra_path, transform)

# Split old training set into train/val
#train_data_original, valid_data_original = torch.utils.data.random_split(train_data_original, [0.8, 0.2])

# Combine datasets
train_data = torch.utils.data.ConcatDataset([train_data_original, train_data_extra])
valid_data = torch.utils.data.ConcatDataset([valid_data_original, valid_data_extra])
test_data = torch.utils.data.ConcatDataset([test_data_original, test_data_extra])

# Loaders
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
# No shuffling for validation and test data because we want consistnt order for reproducibility:
val_loader = DataLoader(valid_data, batch_size=batch_size)
test_loader = DataLoader(test_data, batch_size=batch_size)
