
```python
{'split': 'val', 'license': 'Creative Commons Attribution (CC BY 4.0)', 'version': '1.0', 'date': '2/14/2017'}
15000
dict_keys(['image_index', 'objects', 'relationships', 'image_filename', 'split', 'directions'])
<scene> {
  "split": <string: "train", "val", or "test">,
  "image_index": <integer>,
  "image_filename": <string, e.g. "CLEVR_train_000000.png">,
  "directions": {
    "left": [list of 3 numbers x, y, z],
    "right": [list of 3 numbers x, y, z],
    "front": [list of 3 numbers x, y, z],
    "behind": [list of 3 numbers x, y, z],
    "below": [list of 3 numbers x, y, z],
    "above": [list of 3 numbers x, y, z]
  },
  "objects": [<object>],
  "relationships": {
    "left": <adjacency list>,
    "right": <adjacency list>,
    "front": <adjacency list>,
    "behind": <adjacency list>
  }
}

Relationships are stored as adjacency lists, which are lists of lists of
integers. If s is a <scene> object, then s['relationships']['left'][i] is a
list of indices for objects which are left of s['objects'][i].

In other words, s['objects'][j] is left of s['objects'][i] if and only if
j is in s['relationships']['left'][i].

<object> {
  "3d_coords": [list of 3 numbers x, y, z],
  "pixel_coords": [list of 3 numbers x, y, z],
  "rotation": <number, in degrees>,
  "size": <string: "small" or "large">,
  "color": <string: "gray", "blue", "brown", "yellow", "red", "green", "purple", or "cyan">,
  "material": <string: "rubber" or "metal">,
  "shape": <string: "cube", "sphere", or "cylinder">
}
```

In [None]:
from PIL import Image
from IPython.display import display
import json, os, tqdm

def process_one_scene(scene):
    """Build the relative image path and a text caption for one scene.

    The caption has the form
    ``"Total N objects: a <size> <color> <material> <shape>, ... ."`` with
    objects listed in ascending order of ``pixel_coords[0]`` (presumably the
    horizontal image coordinate, i.e. left to right -- confirm against the
    dataset documentation).

    Args:
        scene: Scene dict providing ``split``, ``image_filename`` and
            ``objects``. Every object must provide ``pixel_coords``,
            ``size``, ``color``, ``material`` and ``shape``.

    Returns:
        A ``(img_path, image_caption)`` tuple. ``img_path`` is the relative
        path ``images/<split>/<image_filename>``; the caller is responsible
        for prefixing it with the dataset root.

    Raises:
        KeyError: If the scene or one of its objects lacks a required key.
    """
    img_path = os.path.join('images', scene['split'], scene['image_filename'])
    # Notebook debugging aid: uncomment to preview the image inline.
    # image = Image.open(img_path)
    # display(image)

    # sorted() returns a new list, so the caller's scene['objects'] keeps
    # its original order.
    ordered = sorted(scene['objects'], key=lambda obj: obj['pixel_coords'][0])
    object_captions = [
        f"a {obj['size']} {obj['color']} {obj['material']} {obj['shape']}"
        for obj in ordered
    ]

    image_caption = f'Total {len(ordered)} objects: '
    image_caption += ', '.join(object_captions) + '.'

    return img_path, image_caption


# Root of the CLEVR release; the code below reads scenes/ under it and
# builds image paths under images/.
dataset_root = '/datasets01/CLEVR_v1.0/060817'
# Directory that receives one <split>.json caption file per split.
output_dir = '/private/home/delong/workspace/data/clevr-caption'

os.makedirs(output_dir, exist_ok=True)
for split in ['train', 'val']:
    # Use context managers so file handles are closed deterministically
    # (and the output is fully flushed) instead of relying on garbage
    # collection.
    with open(f'{dataset_root}/scenes/CLEVR_{split}_scenes.json', 'r') as scenes_file:
        scene_graphs = json.load(scenes_file)

    samples = []
    for scene_graph in tqdm.tqdm(scene_graphs['scenes']):
        img_path, caption = process_one_scene(scene_graph)
        # process_one_scene returns a path relative to the dataset root.
        img_path = os.path.join(dataset_root, img_path)
        samples.append({
            'img_path': img_path,
            'caption': caption
        })

    with open(f'{output_dir}/{split}.json', 'w') as out_file:
        json.dump(samples, out_file, indent=4)