# DATASET PREPARATION

In [1]:
import xml.etree.ElementTree as ET
import numpy as np
import tensorflow as tf
import shutil
import os
import albumentations as A
import cv2

In [3]:
trainImages = "Dataset/VOC2012/JPEGImages/"
trainMaps = "Dataset/VOC2012/Annotations/"

classes = ['aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', 'chair', 'cow', 
            'diningtable', 'dog', 'horse', 'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 
            'train', 'tvmonitor']

B = 1
NUM_CLASSES = 20
OBJ_IND = NUM_CLASSES + 5*B
H, W = 224, 224
SPLIT_SIZE = H//32
NUM_EPOCHS = 135
BATCH_SIZE = 32

## Annotation Parsing Function

In [4]:
def processXML(filename):
    tree = ET.parse(filename)
    root = tree.getroot()
    height = float(root.find('size').find('height').text)
    width = float(root.find('size').find('width').text)

    boxes = []
    for obj in root.iter('object'):
        cls = obj.find('name').text
        cls_id = classes.index(cls)
        xmlbox = obj.find('bndbox')
        xMIN = float(xmlbox.find('xmin').text)
        yMIN = float(xmlbox.find('ymin').text)
        xMAX = float(xmlbox.find('xmax').text)
        yMAX = float(xmlbox.find('ymax').text)
        xCentre = (xMIN + xMAX) / (2*width)
        yCentre = (yMIN + yMAX) / (2*height)
        boxWidth = (xMAX - xMIN) / width
        boxHeight = (yMAX - yMIN) / height
        b = [xCentre, yCentre, boxWidth, boxHeight]
        boxes.append(list(b) + [cls_id])

    return tf.convert_to_tensor(boxes)

In [5]:
processXML("Dataset/VOC2012/Annotations/2007_000032.xml")

2025-02-07 10:53:53.760230: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M3 Pro
2025-02-07 10:53:53.760402: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 18.00 GB
2025-02-07 10:53:53.760410: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 6.00 GB
2025-02-07 10:53:53.760691: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-02-07 10:53:53.760717: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


<tf.Tensor: shape=(4, 5), dtype=float32, numpy=
array([[ 0.479     ,  0.4644128 ,  0.542     ,  0.37366548,  0.        ],
       [ 0.33      ,  0.37544483,  0.128     ,  0.12455516,  0.        ],
       [ 0.408     ,  0.727758  ,  0.036     ,  0.17437722, 14.        ],
       [ 0.07      ,  0.7597865 ,  0.036     ,  0.17437722, 14.        ]],
      dtype=float32)>

## Processing Bounding Boxes To Generate Required Output

In [6]:
def generateOutput(boundingBoxes):
    outputLabel = np.zeros((SPLIT_SIZE, SPLIT_SIZE, OBJ_IND))
    for b in range(boundingBoxes.shape[0]):
        xCentre, yCentre, boxWidth, boxHeight, cls_id = boundingBoxes[b]
        gridX = SPLIT_SIZE * xCentre
        gridY = SPLIT_SIZE * yCentre
        i = int(gridX)
        j = int(gridY)
        if outputLabel[i, j, 0] == 0:
            outputLabel[i, j, 0:5] = [1., gridX%1, gridY%1, boxWidth, boxHeight]
            outputLabel[i, j, 5 + int(cls_id)] = 1
    return tf.convert_to_tensor(outputLabel, dtype=tf.float32)

In [7]:
boundingBoxes = processXML("Dataset/VOC2012/Annotations/2007_000032.xml")
outputLabel = generateOutput(boundingBoxes)
print(outputLabel[0][5])

tf.Tensor(
[1.         0.49       0.3185053  0.036      0.17437722 0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         1.         0.         0.         0.         0.
 0.        ], shape=(25,), dtype=float32)


In [8]:
val_list=['2007_000027.jpg','2007_000032.jpg','2007_000033.jpg','2007_000039.jpg','2007_000042.jpg','2007_000061.jpg',
          '2007_000063.jpg','2007_000068.jpg','2007_000121.jpg','2007_000123.jpg','2007_000129.jpg','2007_000170.jpg',
          '2007_000175.jpg','2007_000187.jpg','2007_000241.jpg','2007_000243.jpg','2007_000250.jpg','2007_000256.jpg',
          '2007_000272.jpg','2007_000323.jpg','2007_000332.jpg','2007_000333.jpg','2007_000346.jpg','2007_000363.jpg',
          '2007_000364.jpg','2007_000392.jpg','2007_000423.jpg','2007_000452.jpg','2007_000464.jpg','2007_000480.jpg',
          '2007_000491.jpg','2007_000504.jpg','2007_000515.jpg','2007_000528.jpg','2007_000529.jpg','2007_000549.jpg',
          '2007_000559.jpg','2007_000572.jpg','2007_000584.jpg','2007_000629.jpg','2007_000636.jpg','2007_000645.jpg',
          '2007_000648.jpg','2007_000661.jpg','2007_000663.jpg','2007_000664.jpg','2007_000676.jpg','2007_000713.jpg',
          '2007_000720.jpg','2007_000727.jpg','2007_000733.jpg','2007_000738.jpg','2007_000762.jpg','2007_000768.jpg',
          '2007_000783.jpg','2007_000793.jpg','2007_000799.jpg','2007_000804.jpg','2007_000807.jpg','2007_000822.jpg',
          '2007_001299.jpg','2007_001311.jpg','2007_001321.jpg','2007_001340.jpg']

In [9]:
!mkdir Dataset/VOC2012/ValJPEGImages/
!mkdir Dataset/VOC2012/ValAnnotations/

In [10]:
valImages = "Dataset/VOC2012/ValJPEGImages/"
valMaps = "Dataset/VOC2012/ValAnnotations/"

In [11]:
for name in val_list:
    shutil.move(trainMaps+name[:-3]+"xml", valMaps+name[:-3]+"xml")
for name in val_list:
    shutil.move(trainImages+name, valImages+name)

In [12]:
imPaths = []
xmlPaths = []
valImPaths = []
valXmlPaths = []

for i in os.listdir(trainMaps):
    imPaths.append(trainImages+i[:-3]+"jpg")
    xmlPaths.append(trainMaps+i)

for i in os.listdir(valMaps):
    valImPaths.append(valImages+i[:-3]+"jpg")
    valXmlPaths.append(valMaps+i)

print(len(imPaths), len(xmlPaths))
print(len(valImPaths), len(valXmlPaths))

17061 17061
64 64


In [13]:
trainDataset = tf.data.Dataset.from_tensor_slices((imPaths, xmlPaths))
valDataset = tf.data.Dataset.from_tensor_slices((valImPaths, valXmlPaths))

In [14]:
for i in trainDataset.take(1):
    print(i)

(<tf.Tensor: shape=(), dtype=string, numpy=b'Dataset/VOC2012/JPEGImages/2011_006135.jpg'>, <tf.Tensor: shape=(), dtype=string, numpy=b'Dataset/VOC2012/Annotations/2011_006135.xml'>)


2025-02-07 10:53:58.275031: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [15]:
def getImageBBoxes(imPath, xmlPath):
    image = tf.io.decode_jpeg(tf.io.read_file(imPath))
    image = tf.cast(tf.image.resize(image, [H, W]), tf.float32)

    bBoxes = tf.numpy_function(processXML, [xmlPath], tf.float32)
    return image, bBoxes  

In [16]:
trainDataset = trainDataset.map(getImageBBoxes)
valDataset = valDataset.map(getImageBBoxes)

In [17]:
for i, j in trainDataset.skip(10):
    print(i.shape)
    print(j)
    break

(224, 224, 3)
tf.Tensor([[0.488      0.5693333  0.396      0.81333333 2.        ]], shape=(1, 5), dtype=float32)


## Data Augmentation

In [18]:
transforms = A.Compose([
    A.Resize(H,W),
    A.RandomCrop(
         width=np.random.randint(int(0.9*W),W),
         height=np.random.randint(int(0.9*H),H), p=0.5),
    A.RandomScale(scale_limit=0.1, interpolation=cv2.INTER_LANCZOS4,p=0.5),
    A.HorizontalFlip(p=0.5,),
    A.Resize(H,W),

], bbox_params=A.BboxParams(format='yolo', ))

In [19]:
def augAlbumentations(image, bBoxes):
    transformed = transforms(image=image, bboxes=bBoxes)
    image = transformed['image']
    bBoxes = transformed['bboxes']
    return [tf.convert_to_tensor(image, dtype=tf.float32), tf.convert_to_tensor(bBoxes, dtype=tf.float32)]

In [20]:
def processTrainData(image, bBoxes):
    # Albumentations
    aug = tf.numpy_function(augAlbumentations, [image, bBoxes], [tf.float32, tf.float32])
    image = aug[0]
    bBoxes = aug[1]
    # Tensorflow Augmentation
    image = tf.image.random_brightness(image, max_delta=50.)
    image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
    image = tf.image.random_contrast(image, lower=0.5, upper=1.5)
    # image = tf.image.random_hue(image, max_delta=0.5)
    image = tf.clip_by_value(image, 0., 255.)
    outputLabel = tf.numpy_function(generateOutput, [bBoxes], tf.float32)
    return image, outputLabel

In [21]:
def processValData(image, bBoxes):
    outputLabel = tf.numpy_function(generateOutput, [bBoxes], tf.float32)
    return image, outputLabel

In [22]:
trainDataset = trainDataset.map(processTrainData)
valDataset = valDataset.map(processValData)

In [28]:
for i, j in trainDataset.take(1):
    print(i.dtype, i.shape)
    print(j.dtype, j.shape)
    break

<dtype: 'float32'> (224, 224, 3)
<dtype: 'float32'> (7, 7, 25)


In [29]:
for i, j in valDataset.take(1):
    print(i.dtype, i.shape)
    print(j.dtype, j.shape)
    break

<dtype: 'float32'> (224, 224, 3)
<dtype: 'float32'> (7, 7, 25)


In [30]:
trainDataset.save("SavedDataset/trainDataset")
valDataset.save("SavedDataset/valDataset")

In [32]:
def getFolderSize(folderPath):
    totalSize = 0
    for dirpath, dirnames, filenames in os.walk(folderPath):
        for filename in filenames:
            filePath = os.path.join(dirpath, filename)
            totalSize += os.path.getsize(filePath)
    return totalSize  # Size in bytes
trainSizeInBytes = getFolderSize("SavedDataset/trainDataset")
valSizeInBytes = getFolderSize("SavedDataset/valDataset")
print(f"Train Dataset Size: {trainSizeInBytes / (1024 * 1024):.2f} MB")
print(f"Validation Dataset Size: {valSizeInBytes / (1024 * 1024):.2f} MB")

Train Dataset Size: 9877.66 MB
Validation Dataset Size: 37.05 MB
