In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset
import os
import cv2
import numpy as np
import xml.etree.ElementTree as ET
import math

In [2]:
# Root of the extracted PASCAL VOC2007 folder (the one that contains
# JPEGImages/, Annotations/ and ImageSets/). Set the VOC2007_ROOT environment
# variable to point the notebook at a different checkout; the default below
# is the original author's local path.
VOC2007_ROOT = os.environ.get(
    'VOC2007_ROOT',
    r'C:\Users\Cordu\Desktop\Projects\Artificial-Intelligence\CNN\Untitled Folder\VOCdevkit\VOC2007',
)

# Directory holding the .jpg images.
IMG_PATH = os.path.join(VOC2007_ROOT, 'JPEGImages')
# Text file listing one image id (file name without extension) per line.
IMG_TRAIN_TXT_PATH = os.path.join(VOC2007_ROOT, 'ImageSets', 'Main', 'train.txt')
# Directory holding the per-image .xml annotation files.
ANNOT_TRAIN_PATH = os.path.join(VOC2007_ROOT, 'Annotations')

#TEST_PATH = 'C:\Users\Cordu\Desktop\Projects\Artificial-Intelligence\CNN\Untitled Folder\VOCdevkit\VOC2007\ImageSets\Segmentation\'

In [3]:
# The 20 PASCAL VOC object categories, in alphabetical order.
# The order matters: a category's position in this list is the numeric class
# id obtained through `classes.index(name)`.
classes = [
    'aeroplane', 'bicycle', 'bird', 'boat', 'bottle',
    'bus', 'car', 'cat', 'chair', 'cow',
    'diningtable', 'dog', 'horse', 'motorbike', 'person',
    'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor',
]

In [65]:
class VOCDataset(Dataset):
    """PASCAL VOC detection dataset yielding square, padded images and box labels.

    Each item is a pair ``(input_img, filled_labels)``:

    * ``input_img`` -- float tensor of shape ``(3, img_size, img_size)`` with
      values in ``[0, 1]``. The image is padded with gray (128) on its shorter
      side to become square, then resized. Channel order is whatever
      ``cv2.imread`` returns (OpenCV's default is BGR); it is not converted.
    * ``filled_labels`` -- tensor of shape ``(max_objects, 5)``. Row ``i`` is
      ``[class_id, x_center, y_center, width, height]`` for the i-th annotated
      object, with coordinates normalised by the padded image size. Unused
      rows are all zeros.

    Args:
        img_path: Directory containing the ``.jpg`` images.
        file_path: Text file listing one image id (file name without
            extension) per line.
        annot_path: Directory containing one ``<image id>.xml`` annotation
            file per image.
        img_size: Side length of the square network input.
        max_objects: Number of label rows per image; objects beyond this
            count are dropped.
        class_names: Sequence of category names whose positions define the
            class ids. When ``None``, the module-level ``classes`` list is
            looked up at item-access time.
    """

    def __init__(self, img_path, file_path, annot_path, img_size=416,
                 max_objects=75, class_names=None):
        with open(file_path, 'r') as file:
            # strip() removes the newline and any stray '\r' / spaces.
            image_ids = [line.strip() for line in file]

        # Skip blank lines (e.g. a trailing newline) so they do not turn into
        # bogus image entries.
        self.img_files = [os.path.join(img_path, image_id)
                          for image_id in image_ids if image_id]

        self.annot_path = annot_path
        self.img_shape = (img_size, img_size)
        self.max_objects = max_objects
        self.class_names = class_names

    def __len__(self):
        """Return the number of images listed in the id file."""
        return len(self.img_files)

    def __getitem__(self, index):
        """Load image ``index`` and its labels.

        Returns:
            Tuple ``(input_img, filled_labels)`` as described on the class.

        Raises:
            FileNotFoundError: If the image file cannot be read.
            ValueError: If an annotated object name is not a known class.
        """
        img_file = self.img_files[index]
        input_img, pad, padded_h, padded_w = self._load_image(img_file + '.jpg')

        # basename() handles whichever path separator the platform uses.
        image_id = os.path.basename(img_file)
        annot_file = os.path.join(self.annot_path, image_id + '.xml')
        filled_labels = self._load_labels(annot_file, pad, padded_h, padded_w)

        return input_img, filled_labels

    def _load_image(self, img_file):
        """Read an image, pad it to a square, resize it and convert to a tensor.

        Returns:
            Tuple ``(input_img, pad, padded_h, padded_w)`` where ``pad`` is the
            ``np.pad`` specification that was applied and ``padded_h`` /
            ``padded_w`` are the dimensions after padding (before resizing).
        """
        img = cv2.imread(img_file)
        if img is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError(
                "Could not read image file: {}".format(img_file))
        img_h, img_w, _ = img.shape

        # Split the height/width difference between the two sides of the
        # shorter dimension; the second side gets the extra pixel when the
        # difference is odd.
        dim_diff = abs(img_h - img_w)
        pad1 = dim_diff // 2
        pad2 = dim_diff - pad1

        # Pad rows (top/bottom) for landscape images, columns (left/right)
        # for portrait ones.
        if img_h <= img_w:
            pad = ((pad1, pad2), (0, 0), (0, 0))
        else:
            pad = ((0, 0), (pad1, pad2), (0, 0))
        pad_img = np.pad(img, pad, 'constant', constant_values=128)
        padded_h, padded_w, _ = pad_img.shape

        # Resize the square image to the network input dimension.
        pad_img = cv2.resize(pad_img, self.img_shape)

        # Channels-first: move the channel axis with a transpose. A reshape
        # would only reinterpret the HWC buffer and scramble the pixels.
        chw_img = np.ascontiguousarray(pad_img.transpose(2, 0, 1))
        input_img = torch.from_numpy(chw_img).float().div(255.0)

        return input_img, pad, padded_h, padded_w

    def _load_labels(self, annot_file, pad, padded_h, padded_w):
        """Parse a VOC XML file into a ``(max_objects, 5)`` label tensor."""
        names = self.class_names if self.class_names is not None else classes

        root = ET.parse(annot_file).getroot()
        filled_labels = np.zeros((self.max_objects, 5))

        # Offsets introduced by the padding on the left and on the top.
        x_offset = pad[1][0]
        y_offset = pad[0][0]

        for i, obj in enumerate(root.iter('object')):
            if i >= self.max_objects:
                # The label tensor has a fixed number of rows; drop the rest.
                break

            object_name = obj.find('name').text
            bndbox = obj.find('bndbox')

            # Box corners in original pixels, shifted by the added padding.
            x_min = int(bndbox.find('xmin').text) + x_offset
            y_min = int(bndbox.find('ymin').text) + y_offset
            x_max = int(bndbox.find('xmax').text) + x_offset
            y_max = int(bndbox.find('ymax').text) + y_offset

            # Centre and size, normalised by the padded image dimensions.
            x_center = (x_min + x_max) / 2. / padded_w
            y_center = (y_min + y_max) / 2. / padded_h
            box_w = (x_max - x_min) / float(padded_w)
            box_h = (y_max - y_min) / float(padded_h)

            filled_labels[i] = np.array(
                [names.index(object_name), x_center, y_center, box_w, box_h])

        return torch.from_numpy(filled_labels)


In [8]:
#dataset = VOCDataset(IMG_PATH, IMG_TRAIN_TXT_PATH, ANNOT_TRAIN_PATH)
#print("Number of loaded files is {}".format(len(dataset.img_files)))

Number of loaded files is 16551


In [12]:
#inp, target = next(iter(dataset))
#print(inp.shape)
#print(target.shape)

torch.Size([3, 416, 416])
torch.Size([1, 5])
