In [1]:
import os
import numpy as np
import torch
from PIL import Image
import pathlib
import utils
import json

In [2]:
class PubLayNetDataset(torch.utils.data.Dataset):
    def __init__(self, root, transforms=None):
        self.root = root
        self.transforms = transforms
        # load all image files, sorting them to ensure that they are aligned
        self.imgs = list(sorted(os.listdir(os.path.join(root, "Images"))))
        self.json = os.path.join(root, "samples.json")
        with open(self.json) as f:
            self.templates = json.load(f)
            f.close()
        self.images = {}
        for image in self.templates['images']:
            self.images[image['id']] = {'file_name': image['file_name'], 'annotations': []}
        for ann in self.templates['annotations']:
            self.images[ann['image_id']]['annotations'].append(ann)
        self.keys = list(self.images.keys())
        
     
    def __getitem__(self, idx):
        # Load images
        img_path = os.path.join(self.root, "Images", self.imgs[idx])
        img = Image.open(img_path).convert("RGB")        
        # Get objects
        objects_key = 0
        for key in self.keys:
            if self.images[key]['file_name'] == self.imgs[idx]:
                objects_key = key
                break               
        objects = self.images[objects_key]['annotations']
        boxes = []
        labels = []
        for object_ in objects:
            # Get the label and boxes
            label = object_['category_id']
            labels.append(label)
            bndbox = object_['bbox']
            bndbox_max_min = [min(bndbox[0], bndbox[2]), min(bndbox[1], bndbox[3]),
                              max(bndbox[0], bndbox[2]), max(bndbox[1], bndbox[3])]
            boxes.append(bndbox_max_min)        
 
        boxes = torch.as_tensor(boxes, dtype=torch.float32)
        labels = torch.as_tensor(labels, dtype=torch.int64)        
 
        image_id = torch.as_tensor([objects_key])
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        # All instances are not crowd
        iscrowd = torch.zeros((len(objects),), dtype=torch.int64)
 
        target = {}
        target["boxes"] = boxes
        target["labels"] = labels
        target["image_id"] = image_id
        target["area"] = area
        target["iscrowd"] = iscrowd
 
        if self.transforms is not None:
            img, target = self.transforms(img, target)
 
        return img, target
 
    def __len__(self):
        return len(self.imgs)

In [3]:
root = '/media/kirb/ADATA HD680/examples/'

In [4]:
publaynet = PubLayNetDataset(root)

In [5]:
publaynet[0]

(<PIL.Image.Image image mode=RGB size=601x792 at 0x7F0DA7B6F850>,
 {'boxes': tensor([[ 50.5800,  24.8100, 240.1200, 316.7500],
          [ 50.5800, 105.3700, 240.1300, 339.4800],
          [ 50.5800,  47.5500, 240.1100, 442.7900],
          [ 50.5800, 149.8500, 240.1600, 488.2600],
          [240.1100,  36.1700, 308.6100, 316.7500],
          [ 50.5800, 106.3800, 240.1400, 636.0400],
          [240.1800, 161.2200, 308.6100, 386.6400],
          [240.1500, 197.3200, 308.6100, 545.7900],
          [ 50.5800,  23.3200, 498.1300,  71.2000],
          [ 51.9300,  10.9600,  82.4200, 278.7700],
          [ 50.5800, 101.4200, 498.1400, 176.5700],
          [ 71.0700,  12.8800, 308.6100, 367.5700]]),
  'labels': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 2]),
  'image_id': tensor([407967]),
  'area': tensor([ 55334.3047,  44375.5547,  74909.8438,  64155.7695,  19219.7266,
          100402.3438,  15425.4902,  23856.2520,  21428.6914,   8165.5264,
           33634.1367,  84253.0547]),
  'iscrowd': 

In [6]:
publaynet[1]

(<PIL.Image.Image image mode=RGB size=601x792 at 0x7F0C8B6A76D0>,
 {'boxes': tensor([[ 50.5800,  36.1700, 240.1200, 328.7300],
          [ 50.5800, 115.7500, 240.1700, 362.8300],
          [ 50.5800, 115.7500, 240.1400, 476.5100],
          [ 50.5800,  81.6400, 240.1500, 590.1900],
          [ 50.5800,  70.2800, 240.1700, 671.7500],
          [240.1700, 138.4800, 308.6100, 328.7400],
          [240.1400, 149.8500, 308.6100, 465.1600],
          [240.1800,  47.5400, 308.6100, 612.9300],
          [225.3900,  13.4400, 308.6100, 696.0400],
          [ 50.5800,  33.0200, 498.1600, 276.6900],
          [ 50.8300,  70.6800, 495.4100, 200.4100],
          [222.8300,  24.8100, 325.9400, 718.3000],
          [ 74.4700,  12.8800, 308.6100, 676.9600]]),
  'labels': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 3, 2]),
  'image_id': tensor([379698]),
  'area': tensor([ 55451.8203,  46843.8945,  68385.6641,  96405.8203, 114032.6875,
           13021.3916,  21589.2715,  38689.6328,  56805.9609, 109061.82