In [12]:
# importing libraries to load and process the locally downloaded COCO dataset
import json, os, random, math
from collections import defaultdict

import torch
from torch.utils.data import Dataset
import torchvision.transforms as T

import numpy as np
import PIL
from skimage.transform import resize as imresize
import pycocotools.mask as mask_utils

In [13]:
# Per-channel statistics handed to T.Normalize by imagenet_preprocess().
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

# Element-wise negated means and reciprocal stds of the values above.
INV_IMAGENET_MEAN = [-channel_mean for channel_mean in IMAGENET_MEAN]
INV_IMAGENET_STD = [1.0 / channel_std for channel_std in IMAGENET_STD]

def imagenet_preprocess():
  """Build a T.Normalize transform from IMAGENET_MEAN and IMAGENET_STD."""
  normalizer = T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
  return normalizer

# To resize the pictures to the same resolution (in our case we set the resolution 64x64)
class Resize(object):
  """
  Callable transform that resizes an image to a fixed resolution.

  Inputs:
  - size: Either a single number (square size x size output) or a 2-element
    tuple or list given as (H, W).
  - interp: Resampling filter forwarded to the image's resize() method;
    defaults to PIL.Image.BILINEAR.
  """
  def __init__(self, size, interp=PIL.Image.BILINEAR):
    # Accept lists as well as tuples: sizes loaded from JSON/CLI configs are
    # often lists, and treating one as a scalar would yield a bogus
    # (list, list) size.
    if isinstance(size, (tuple, list)):
      H, W = size
      # Stored swapped because PIL's Image.resize takes (width, height).
      self.size = (W, H)
    else:
      self.size = (size, size)
    self.interp = interp

  def __call__(self, img):
    """Return img.resize(...) using the stored size and resampling filter."""
    return img.resize(self.size, self.interp)


In [49]:
# The CocoSceneGraphDataset class gets and prepares the downloaded dataset for training

class CocoSceneGraphDataset(Dataset):
  """
  Dataset of COCO images paired with synthetic scene graphs that are built on
  the fly from each image's object annotations.
  """

  def __init__(self, image_dir, instances_json,
               image_size=(64, 64), mask_size=16,
               normalize_images=True,
               min_object_size=0.02,
               min_objects_per_image=1, max_objects_per_image=4,):
    """
    Loading images and annotations then converting them to scene graphs on the fly.

    Inputs:
    - image_dir: Path to a directory where images are held
    - instances_json: Path to a JSON file giving COCO annotations
    - image_size: Size (H, W) at which to load images. Default (64, 64).
    - mask_size: Size M for object segmentation masks; default 16.
    - normalize_images: If True then the transform built by set_image_size
      ends with imagenet_preprocess().
    - min_object_size: Ignore objects whose bounding box does not take up more
      than this fraction of the image.
    - min_objects_per_image: Ignore images which have fewer than this many
      object annotations.
    - max_objects_per_image: Ignore images which have more than this many
      object annotations.
    """
    # Zero-argument super() resolves relative to this class; the previous
    # super(Dataset, self) skipped Dataset itself in the MRO.
    super().__init__()

    self.image_dir = image_dir
    self.mask_size = mask_size
    self.max_samples = None
    self.normalize_images = normalize_images
    self.set_image_size(image_size)

    with open(instances_json, 'r') as f:
      instances_data = json.load(f)

    self.image_ids = []
    self.image_id_to_filename = {}
    self.image_id_to_size = {}
    for image_data in instances_data['images']:
      image_id = image_data['id']
      self.image_ids.append(image_id)
      self.image_id_to_filename[image_id] = image_data['file_name']
      self.image_id_to_size[image_id] = (image_data['width'], image_data['height'])

    # in vocab we store the objects and relationships name
    # and the belonging IDs
    self.vocab = {
      'object_name_to_idx': {},
      'pred_name_to_idx': {},
    }

    object_idx_to_name = {}
    all_instance_categories = []
    for category_data in instances_data['categories']:
      category_id = category_data['id']
      category_name = category_data['name']
      all_instance_categories.append(category_name)
      object_idx_to_name[category_id] = category_name
      self.vocab['object_name_to_idx'][category_name] = category_id

    # No stuff categories are loaded here, so the whitelist is simply every
    # instance category found in the annotation file.
    all_stuff_categories = []
    category_whitelist = set(all_instance_categories) | set(all_stuff_categories)

    # Add object data from instances
    self.image_id_to_objects = defaultdict(list)
    for object_data in instances_data['annotations']:
      image_id = object_data['image_id']
      _, _, w, h = object_data['bbox']
      W, H = self.image_id_to_size[image_id]
      box_area = (w * h) / (W * H)
      box_ok = box_area > min_object_size
      object_name = object_idx_to_name[object_data['category_id']]
      category_ok = object_name in category_whitelist
      other_ok = object_name != 'other'
      if box_ok and category_ok and other_ok:
        self.image_id_to_objects[image_id].append(object_data)

    # COCO category labels start at 1, so use 0 for __image__
    self.vocab['object_name_to_idx']['__image__'] = 0

    # Build object_idx_to_name; indices without a category keep 'NONE'
    name_to_idx = self.vocab['object_name_to_idx']
    assert len(name_to_idx) == len(set(name_to_idx.values()))
    max_object_idx = max(name_to_idx.values())
    idx_to_name = ['NONE'] * (1 + max_object_idx)
    for name, idx in name_to_idx.items():
      idx_to_name[idx] = name
    self.vocab['object_idx_to_name'] = idx_to_name

    # Prune images that have too few or too many objects
    self.image_ids = [
      image_id for image_id in self.image_ids
      if (min_objects_per_image
          <= len(self.image_id_to_objects[image_id])
          <= max_objects_per_image)
    ]

    self.vocab['pred_idx_to_name'] = [
      '__in_image__',
      'left of',
      'right of',
      'above',
      'below',
      'inside',
      'surrounding',
    ]
    self.vocab['pred_name_to_idx'] = {
      name: idx for idx, name in enumerate(self.vocab['pred_idx_to_name'])
    }

  def set_image_size(self, image_size):
    """Rebuild self.transform (resize, to-tensor, optional normalize) for image_size."""
    print('called set_image_size', image_size)
    transform = [Resize(image_size), T.ToTensor()]
    if self.normalize_images:
      transform.append(imagenet_preprocess())
    self.transform = T.Compose(transform)
    self.image_size = image_size

  def total_objects(self):
    """Count kept object annotations over the images that __len__ reports."""
    total_objs = 0
    for i, image_id in enumerate(self.image_ids):
      if self.max_samples and i >= self.max_samples:
        break
      total_objs += len(self.image_id_to_objects[image_id])
    return total_objs

  def __len__(self):
    """Number of images, capped at self.max_samples when that is set."""
    if self.max_samples is None:
      return len(self.image_ids)
    return min(len(self.image_ids), self.max_samples)

  def getimageID(self, index):
    """Return the COCO image id stored at position `index`."""
    return self.image_ids[index]

  @staticmethod
  def _relationship_name(subj_box, obj_box, theta):
    """
    Pick the predicate name for a (subject, object) pair.

    Inputs:
    - subj_box, obj_box: (x0, y0, x1, y1) boxes of the subject and the object
    - theta: atan2 angle of (subject center - object center)
    """
    sx0, sy0, sx1, sy1 = subj_box
    ox0, oy0, ox1, oy1 = obj_box
    # Strict containment of one box in the other wins over direction.
    if sx0 < ox0 and sx1 > ox1 and sy0 < oy0 and sy1 > oy1:
      return 'surrounding'
    if sx0 > ox0 and sx1 < ox1 and sy0 > oy0 and sy1 < oy1:
      return 'inside'
    if theta >= 3 * math.pi / 4 or theta <= -3 * math.pi / 4:
      return 'left of'
    if -3 * math.pi / 4 <= theta < -math.pi / 4:
      return 'above'
    if -math.pi / 4 <= theta < math.pi / 4:
      return 'right of'
    # Remaining quadrant [pi/4, 3*pi/4). Written as a fall-through so a
    # predicate is always produced instead of reusing a stale value.
    return 'below'

  def __getitem__(self, index):
    """
    Get the pixels of an image, and a random synthetic scene graph for that
    image constructed on-the-fly from its COCO object annotations. We assume
    that the image will have height H, width W, C channels; there will be O
    object annotations, each of which will have both a bounding box and a
    segmentation mask of shape (M, M). There will be T triples in the scene
    graph.

    Returns a tuple of:
    - image: FloatTensor of shape (C, H, W)
    - objs: LongTensor of shape (O,)
    - boxes: FloatTensor of shape (O, 4) giving boxes for objects in
      (x0, y0, x1, y1) format, in a [0, 1] coordinate system
    - masks: LongTensor of shape (O, M, M) giving segmentation masks for
      objects, where 0 is background and 1 is object.
    - triples: LongTensor of shape (T, 3) where triples[t] = [i, p, j]
      means that (objs[i], p, objs[j]) is a triple.
    """
    image_id = self.image_ids[index]

    filename = self.image_id_to_filename[image_id]
    image_path = os.path.join(self.image_dir, filename)
    with open(image_path, 'rb') as f:
      with PIL.Image.open(f) as pil_image:
        WW, HH = pil_image.size
        image = self.transform(pil_image.convert('RGB'))

    objs, boxes, masks = [], [], []
    for object_data in self.image_id_to_objects[image_id]:
      objs.append(object_data['category_id'])
      x, y, w, h = object_data['bbox']
      boxes.append(torch.FloatTensor(
        [x / WW, y / HH, (x + w) / WW, (y + h) / HH]))

      # This will give a numpy array of shape (HH, WW)
      mask = seg_to_mask(object_data['segmentation'], WW, HH)

      # Crop the mask according to the bounding box, being careful to
      # ensure that we don't crop a zero-area region
      mx0, mx1 = int(round(x)), int(round(x + w))
      my0, my1 = int(round(y)), int(round(y + h))
      # Keep the crop origin inside the mask: a negative start would slice
      # from the far edge and a start past the edge would give an empty crop.
      mx0 = min(max(mx0, 0), WW - 1)
      my0 = min(max(my0, 0), HH - 1)
      mx1 = max(mx0 + 1, mx1)
      my1 = max(my0 + 1, my1)
      mask = mask[my0:my1, mx0:mx1]
      mask = imresize(255.0 * mask, (self.mask_size, self.mask_size),
                      mode='constant')
      mask = torch.from_numpy((mask > 128).astype(np.int64))
      masks.append(mask)

    # Add dummy __image__ object
    image_obj_idx = self.vocab['object_name_to_idx']['__image__']
    objs.append(image_obj_idx)
    boxes.append(torch.FloatTensor([0, 0, 1, 1]))
    masks.append(torch.ones(self.mask_size, self.mask_size).long())

    objs = torch.LongTensor(objs)
    boxes = torch.stack(boxes, dim=0)
    masks = torch.stack(masks, dim=0)
    num_objs = objs.size(0)

    # Compute centers of all objects: the mean position of the mask's
    # foreground cells, falling back to the box center for an empty mask.
    obj_centers = []
    _, MH, MW = masks.size()
    for i in range(num_objs):
      # Plain floats keep linspace and the arithmetic below off 0-d tensors.
      x0, y0, x1, y1 = boxes[i].tolist()
      mask = (masks[i] == 1)
      xs = torch.linspace(x0, x1, MW).view(1, MW).expand(MH, MW)
      ys = torch.linspace(y0, y1, MH).view(MH, 1).expand(MH, MW)
      if mask.sum() == 0:
        mean_x = 0.5 * (x0 + x1)
        mean_y = 0.5 * (y0 + y1)
      else:
        mean_x = xs[mask].mean()
        mean_y = ys[mask].mean()
      obj_centers.append([float(mean_x), float(mean_y)])
    obj_centers = torch.FloatTensor(obj_centers)

    # Add triples: each real object is paired with one random other real
    # object, in a random subject/object order.
    triples = []
    real_objs = []
    if num_objs > 1:
      # Python ints rather than 0-d tensors, so they can be compared and
      # stored directly.
      real_objs = (objs != image_obj_idx).nonzero().squeeze(1).tolist()
    for cur in real_objs:
      choices = [obj for obj in real_objs if obj != cur]
      if len(choices) == 0:
        break
      other = random.choice(choices)
      if random.random() > 0.5:
        s, o = cur, other
      else:
        s, o = other, cur

      d = obj_centers[s] - obj_centers[o]
      theta = math.atan2(float(d[1]), float(d[0]))
      p = self._relationship_name(boxes[s].tolist(), boxes[o].tolist(), theta)
      triples.append([s, self.vocab['pred_name_to_idx'][p], o])

    # Add __in_image__ triples linking every real object to the dummy
    # __image__ object, which is always last.
    in_image = self.vocab['pred_name_to_idx']['__in_image__']
    for i in range(num_objs - 1):
      triples.append([i, in_image, num_objs - 1])

    triples = torch.LongTensor(triples)
    return image, objs, boxes, masks, triples
    
# For decoding segmentation masks using the pycocotools API:
def seg_to_mask(seg, width=1.0, height=1.0):
  """
  Decode a segmentation annotation with pycocotools.

  A list is converted with frPyObjects and its parts merged into one encoding
  (presumably COCO polygon lists - confirm against the annotation file). A
  dict whose 'counts' entry is a list is converted with frPyObjects. Anything
  else is handed to decode unchanged.

  Returns whatever mask_utils.decode produces for the resulting encoding.
  """
  if type(seg) is not list:
    needs_encoding = type(seg['counts']) is list
    encoded = mask_utils.frPyObjects(seg, height, width) if needs_encoding else seg
    return mask_utils.decode(encoded)

  part_encodings = mask_utils.frPyObjects(seg, height, width)
  return mask_utils.decode(mask_utils.merge(part_encodings))





In [50]:
"""
Because COCO is a much bigger dataset than we need so we only downloaded the training dataset 
We only exported 10000 images from it that we are going to use for training, validating and testing.

links for the dataset:
    The images:       http://images.cocodataset.org/zips/train2017.zip
    The annotations:  http://images.cocodataset.org/annotations/annotations_trainval2017.zip
"""
# Paths of the locally stored images and the instances annotation file.
dset_kwargs = {
    'image_dir': "../train2017/train2017",
    'instances_json': "../annotations_trainval2017/annotations/instances_train2017.json",
}

train_dset = CocoSceneGraphDataset(**dset_kwargs)
num_objs = train_dset.total_objects()
num_imgs = len(train_dset)
print('Training dataset has %d images and %d objects' % (num_imgs, num_objs))
# Skip the average when pruning left no images, to avoid a ZeroDivisionError.
if num_imgs > 0:
    print('(%.2f objects per image)' % (float(num_objs) / num_imgs))

# Plain-JSON deep copy of the vocabulary for cells that look names up through
# the global `vocab`. It used to exist only in commented-out code, so those
# cells failed with NameError on a fresh kernel.
vocab = json.loads(json.dumps(train_dset.vocab))

called set_image_size (64, 64)
Training dataset has 86375 images and 185516 objects
(2.15 objects per image)


In [52]:
# Only for testing and inspecting what exactly a processed item looks like
train_dset[32]

(tensor([[[-0.4739, -0.3712,  1.2899,  ..., -1.0219, -1.3987, -1.3644],
          [-0.4739, -0.4226,  0.9817,  ..., -0.9534, -1.3130, -1.2788],
          [-0.4739, -0.4397,  0.6049,  ..., -1.0733, -1.2959, -1.2445],
          ...,
          [-0.8849, -0.8507, -0.8164,  ..., -1.6898, -1.9295, -2.0665],
          [-0.9020, -0.8678, -0.8335,  ..., -1.8610, -2.0665, -2.1008],
          [-0.9192, -0.8849, -0.8507,  ..., -2.0494, -2.1008, -2.1008]],
 
         [[-0.3725, -0.2850,  1.4832,  ..., -1.2129, -1.3529, -1.2304],
          [-0.3725, -0.3375,  1.1506,  ..., -0.9503, -1.1779, -1.1429],
          [-0.3725, -0.3375,  0.7479,  ..., -0.7227, -1.0553, -1.1078],
          ...,
          [-0.7927, -0.7577, -0.7227,  ..., -1.7031, -1.8606, -2.0007],
          [-0.8102, -0.7752, -0.7402,  ..., -1.8081, -2.0007, -2.0182],
          [-0.8277, -0.7927, -0.7577,  ..., -1.9832, -2.0182, -2.0182]],
 
         [[-0.1835, -0.0615,  1.8208,  ..., -1.0550, -1.1247, -0.9678],
          [-0.2184, -0.1312,

In [55]:
# The GenerateSceneGraph function exports the processed data to JSON files.
# Every exported record carries the image ID so the records can be connected later.
def GenerateSceneGraph(num, dataset=None, out_dir='.'):
    """
    Export the first `num` processed items of a dataset to JSON files.

    Four files are (re)written inside `out_dir`, each holding one JSON object
    per line, and every object carries the image "ID" so records can be joined:
    - sceneGraphs.json: "relationships", a list of [subject, predicate, object]
      name triples
    - images.json: "image", the processed image tensor as nested lists
    - masks.json: "maskcords" (the object boxes) and "masks"
    - imagelist.json: the image ID only

    Inputs:
    - num: Number of items to export, starting at index 0.
    - dataset: Object providing __getitem__, getimageID and vocab; defaults to
      the global train_dset.
    - out_dir: Directory the files are written to; defaults to the current one.
    """
    if dataset is None:
        dataset = train_dset
    # Read names from the dataset's own vocabulary instead of a global
    # `vocab`, which may not exist on a fresh kernel.
    object_names = dataset.vocab['object_idx_to_name']
    pred_names = dataset.vocab['pred_idx_to_name']

    # Open every file once in 'w' mode so a re-run replaces the export instead
    # of appending duplicates, and end each record with a newline so the file
    # can be parsed line by line (back-to-back objects are not valid JSON).
    with open(os.path.join(out_dir, 'sceneGraphs.json'), 'w') as graphs_file, \
         open(os.path.join(out_dir, 'images.json'), 'w') as images_file, \
         open(os.path.join(out_dir, 'masks.json'), 'w') as masks_file, \
         open(os.path.join(out_dir, 'imagelist.json'), 'w') as ids_file:
        for n in range(num):
            image, objectids, maskCoords, masks, relations = dataset[n]
            image_id = dataset.getimageID(n)
            # Plain Python ints for indexing the name lists.
            objectids = objectids.tolist()

            # First we get and collect the triples (relationships between objects)
            relationArray = [
                [object_names[objectids[s]], pred_names[p], object_names[objectids[o]]]
                for s, p, o in relations.tolist()
            ]
            graphs_file.write(json.dumps(
                {"ID": image_id, "relationships": relationArray}) + '\n')

            # Also we have to export the cropped and resized images
            images_file.write(json.dumps(
                {"ID": image_id, "image": image.tolist()}) + '\n')

            # we are exporting the coordinates of rectangles that
            # contains the objects to know where they are on the images
            # and also the masks that shows where's the object exactly
            masks_file.write(json.dumps(
                {"ID": image_id,
                 "maskcords": maskCoords.tolist(),
                 "masks": masks.tolist()}) + '\n')

            # And finally we collect the images ID's too
            ids_file.write(json.dumps({"ID": image_id}) + '\n')
            


In [56]:
GenerateSceneGraph(2)