In [29]:
import torch
from torch.utils.data import Dataset
import torchvision.transforms as transforms

import os
import random
import numpy as np
from PIL import Image

class VOCDataset(Dataset):

    def __init__(self, is_train, image_dir, label_txt, image_size=448, grid_size=7, num_bboxes=2, num_classes=20):
        self.is_train = is_train
        self.image_size = image_size

        self.S = grid_size
        self.B = num_bboxes
        self.C = num_classes

        # if isinstance(label_txt, list) or isinstance(label_txt, tuple):
        #     # cat multiple list files together.
        #     # This is useful for VOC2007/VOC2012 combination.
        #     tmp_file = '/content/label.txt'
        #     os.system('cat %s > %s' % (' '.join(label_txt), tmp_file))
        #     label_txt = tmp_file
            

        self.paths, self.boxes, self.labels = [], [], []

        with open('/content/VOC.txt') as f:
            lines = f.readlines()
            
        for line in lines:
            splitted = line.strip().split()

            fname = splitted[0]
            path = os.path.join(image_dir, fname)

            self.paths.append(path)

            num_boxes = (len(splitted) - 1) // 5
            box, label = [], []
            for i in range(num_boxes):
                x1 = float(splitted[5*i + 1])
                y1 = float(splitted[5*i + 2])
                x2 = float(splitted[5*i + 3])
                y2 = float(splitted[5*i + 4])
                c  =   int(splitted[5*i + 5])
                box.append([x1, y1, x2, y2])
                label.append(c)
            self.boxes.append(torch.Tensor(box))
            self.labels.append(torch.LongTensor(label))

        self.num_samples = len(self.paths)

    def __getitem__(self, idx):
        path = self.paths[idx]
        img = Image.open(path).convert('RGB')
        boxes = self.boxes[idx].clone() # [n, 4]
        labels = self.labels[idx].clone() # [n,]
        
        h, w = img.size
        boxes /= torch.Tensor([[w, h, w, h]]).expand_as(boxes) # normalize (x1, y1, x2, y2) w.r.t. image width/height.
        
        target = self.encode(boxes, labels) # [S, S, 5 x B + C]

        img = transforms.Resize((448,448))(img)
        img = transforms.ToTensor()(img) #(img - self.mean) / 255.0 # normalize from -1.0 to 1.0.

        return img, target

    def __len__(self):
        return self.num_samples

    def encode(self, boxes, labels):
        """ Encode box coordinates and class labels as one target tensor.
        Args:
            boxes: (tensor) [[x1, y1, x2, y2]_obj1, ...], normalized from 0.0 to 1.0 w.r.t. image width/height.
            labels: (tensor) [c_obj1, c_obj2, ...]
        Returns:
            An encoded tensor sized [S, S, 5 x B + C], 5=(x, y, w, h, conf)
        """

        S, B, C = self.S, self.B, self.C
        N = 5 * B + C

        target = torch.zeros(S, S, N)
        cell_size = 1.0 / float(S)
        boxes_wh = boxes[:, 2:] - boxes[:, :2] # width and height for each box, [n, 2]
        boxes_xy = (boxes[:, 2:] + boxes[:, :2]) / 2.0 # center x & y for each box, [n, 2]

        for b in range(boxes.size(0)):
            xy, wh, label = boxes_xy[b], boxes_wh[b], int(labels[b])
            print(xy, wh, label, sep="\n")

            ij = (xy / cell_size).ceil() - 1.0
            print(ij)
            i, j = int(ij[0]), int(ij[1]) # y & x index which represents its location on the grid.
            x0y0 = ij * cell_size # x & y of the cell left-top corner.
            xy_normalized = (xy - x0y0) / cell_size # x & y of the box on the cell, normalized from 0.0 to 1.0.

            # TBM, remove redundant dimensions from target tensor.
            # To remove these, loss implementation also has to be modified.
            for k in range(B):
                s = 5 * k
                target[j, i, s  :s+2] = xy_normalized
                target[j, i, s+2:s+4] = wh
                target[j, i, s+4    ] = 1.0
            target[j, i, 5*B + label] = 1.0

        return target




In [22]:
(torch.Tensor([0.3050,0.3770])/cell_size).ceil()

tensor([1., 2.])

In [30]:
from torch.utils.data import DataLoader

image_dir = '/content/'
label_txt = ['/content/VOC.txt']

dataset = VOCDataset(True, image_dir, label_txt, grid_size=3,  num_bboxes=1, num_classes=15)
data_loader = DataLoader(dataset, batch_size=1, shuffle=False, num_workers=0)

for k, v in data_loader:
    img, target = k, v
    # print(img.size(), target.size())


tensor([0.3050, 0.3770])
tensor([0.5995, 0.7420])
14
tensor([0., 1.])
tensor([0.7149, 0.4250])
tensor([1.2228, 0.6500])
11
tensor([2., 1.])
tensor([0.5851, 0.7440])
tensor([0.3149, 0.5282])
7
tensor([1., 2.])
tensor([0.1337, 0.9531])
tensor([0.2515, 0.4316])
7
tensor([0., 2.])


In [6]:
import matplotlib.pyplot as plt

fig = plt.figure(figsize=(1, 1))

img_arr = img.squeeze(0).permute(1,2,0).numpy() * 255.0
img1 = Image.fromarray(img_arr.astype(np.uint8))
rgbimg = Image.new("RGBA", img1.size)
rgbimg.paste(img1)

# test_bbox_img1 = target[0][2,1,:4]
test_bbox_img1 = torch.Tensor([[0.0079, 0.7373, 0.2594, 1.1689]])
#bbox14_abs = relative2absolute_(test_bbox_img1)

bbox14_abs = test_bbox_img1*1
xmin, ymin, xmax, ymax = bbox14_abs.permute(1,0)
left, right, top, bottom = xmin, xmax, ymin, ymax

draw = PIL.ImageDraw.Draw(rgbimg)
draw.line([(left, top), (left, bottom), (right, bottom), (right, top), (left, top)], width=2, fill='red')
# draw.point((cx_abs,cy_abs), fill="red")

gleft = 0
gright = 0
for it in range(3):
    g1 = 149.33*it
    g2 = 149.33*it
    
    grid_x = [(0,g1), (448,g2)]
    grid_y = [(g1,0), (g2,448)]
    draw.line(grid_x)
    draw.line(grid_y)

fig.set_size_inches(5, 5)
plt.imshow(rgbimg)
plt.axis('off');

NameError: ignored

<Figure size 72x72 with 0 Axes>