In [1]:
import quilt3
import ujson as json

In [4]:
# Root directory of the local COCO 2017 download. Adjust for the machine the
# notebook runs on (a previous run used "/Users/armandmcqueen/data/coco").
base_path = "/home/ubuntu/coco"
# Kept with its trailing slash; the file paths below therefore must not add
# another separator, otherwise they contain a doubled "//".
annotations_base_path = f"{base_path}/annotations/"
train_annotation_path = f"{annotations_base_path}instances_train2017.json"
val_annotation_path = f"{annotations_base_path}instances_val2017.json"

In [5]:
# Load the instance annotations for both splits into memory.
# JSON is UTF-8 by specification, so the encoding is pinned explicitly rather
# than relying on the platform's default text encoding.
with open(train_annotation_path, 'r', encoding='utf-8') as train_file:
    train_annotations = json.load(train_file)

with open(val_annotation_path, 'r', encoding='utf-8') as val_file:
    val_annotations = json.load(val_file)

In [None]:
# Show the top-level keys of the loaded training annotations.
train_annotations.keys()

In [None]:
# Peek at the first entry of the "images" list.
train_annotations["images"][0]

In [None]:
# Peek at the first entry of the "categories" list.
train_annotations["categories"][0]

In [None]:
# Peek at the first entry of the "annotations" list.
train_annotations["annotations"][0]

## Rebuild Annotations JSON for Metadata

In [6]:
def transform_to_metadata_dict(annotation_json):
    """
    Re-key COCO formatted annotations by image id.

    Args:
        annotation_json: dict with the lists 'images', 'categories' and
            'annotations'. Each image and category entry must have an 'id';
            each category must have 'name' and 'supercategory'; each
            annotation must have 'image_id' and 'category_id'.

    Returns:
        dict mapping image id -> {"image_info": <image entry>,
        "annotations": [<annotation entries for that image>]}. Every
        annotation in the result is a shallow copy of the input annotation
        with 'category_name' and 'supercategory' added. Images without any
        annotation map to an empty "annotations" list.

    Raises:
        KeyError: if an annotation references a category id or image id that
            is not present in 'categories' / 'images'.
    """
    # Index images by id, each starting with an empty annotation list.
    metadata_dict = {
        image_info['id']: {"image_info": image_info, "annotations": []}
        for image_info in annotation_json['images']
    }

    # Index categories by id for constant-time lookup below.
    category_dict = {
        category_info['id']: category_info
        for category_info in annotation_json['categories']
    }

    # Attach each annotation to its image, enriched with the category names.
    for annotation_info in annotation_json["annotations"]:
        category_info = category_dict[annotation_info["category_id"]]
        # Build a copy instead of writing into the caller's annotation dict,
        # so the loaded annotation_json is left untouched.
        enriched_annotation = {
            **annotation_info,
            "category_name": category_info["name"],
            "supercategory": category_info["supercategory"],
        }
        metadata_dict[annotation_info["image_id"]]["annotations"].append(enriched_annotation)

    return metadata_dict

In [7]:
# Re-key both splits by image id with the same transformation (train first, then val).
train_metadata_dict, val_metadata_dict = map(
    transform_to_metadata_dict, (train_annotations, val_annotations)
)

In [8]:
# Show the first image's metadata record without copying every value into a list.
next(iter(train_metadata_dict.values()))

{'image_info': {'license': 3,
  'file_name': '000000391895.jpg',
  'coco_url': 'http://images.cocodataset.org/train2017/000000391895.jpg',
  'height': 360,
  'width': 640,
  'date_captured': '2013-11-14 11:18:45',
  'flickr_url': 'http://farm9.staticflickr.com/8186/8119368305_4e622c8349_z.jpg',
  'id': 391895},
 'annotations': [{'segmentation': [[376.97,
     176.91,
     398.81,
     176.91,
     396.38,
     147.78,
     447.35,
     146.17,
     448.16,
     172.05,
     448.16,
     178.53,
     464.34,
     186.62,
     464.34,
     192.28,
     448.97,
     195.51,
     447.35,
     235.96,
     441.69,
     258.62,
     454.63,
     268.32,
     462.72,
     276.41,
     471.62,
     290.98,
     456.25,
     298.26,
     439.26,
     292.59,
     431.98,
     308.77,
     442.49,
     313.63,
     436.02,
     316.86,
     429.55,
     322.53,
     419.84,
     354.89,
     402.04,
     359.74,
     401.24,
     312.82,
     370.49,
     303.92,
     391.53,
     299.87,
     3

In [9]:
# Create the quilt3 package object that the following cells populate and push.
pkg = quilt3.Package()

In [10]:
# Register the contents of the local annotations directory under the logical
# prefix "annotations" (the cell output lists the files that were picked up).
pkg.set_dir("annotations", annotations_base_path)

(local Package)
 └─annotations/
   └─captions_train2017.json
   └─captions_val2017.json
   └─instances_train2017.json
   └─instances_val2017.json
   └─person_keypoints_train2017.json
   └─person_keypoints_val2017.json

In [11]:
# Register every image of both splits in the package. The logical key and the
# local file path share the "<split>/<file_name>" layout.
for split_name, split_metadata in (
    ("train2017", train_metadata_dict),
    ("val2017", val_metadata_dict),
):
    for metadata in split_metadata.values():
        file_name = metadata["image_info"]["file_name"]
        logical_key = f"{split_name}/{file_name}"
        physical_key = f"{base_path}/{split_name}/{file_name}"
        # The third argument attaches the per-image metadata to the entry;
        # it is read back later through entry.meta.
        pkg.set(logical_key, physical_key, metadata)

In [12]:
# Count the entries in the package by walking it once.
sum(1 for _ in pkg.walk())

123293

### Confirm that the metadata matches the logical key and the physical_key

In [25]:
# Sanity check: the file_name stored in each entry's metadata must appear in
# both the entry's logical key and its physical key.
for lk, entry in pkg.walk():
    # Entries with empty metadata are not checked (presumably the annotation
    # files added via set_dir, which carry no per-image metadata).
    if entry.meta != {}:
        metadata_file_name = entry.meta["image_info"]["file_name"]
        assert metadata_file_name in lk, f"The logical key is {lk}, but the file_name in the metadata is {metadata_file_name}"
        assert metadata_file_name in entry.get(), f"The physical key is {entry.get()}, but the file_name in the metadata is {metadata_file_name}"
    
    
    

In [26]:
# Push the package as "cv/coco2017" to the S3 registry; files are copied under `dest`.
# NOTE(review): a disabled `selector_fn=lambda lk, e: False` argument was left
# here, presumably to push without copying any files; confirm against the
# quilt3 documentation before re-enabling it.
pkg.push(
    "cv/coco2017",
    registry="s3://quilt-ml-data/",
    dest="s3://quilt-ml-data/data/raw",
)

Hashing: 100%|██████████| 21.7G/21.7G [05:02<00:00, 71.8MB/s]  
Copying: 100%|██████████| 21.7G/21.7G [07:16<00:00, 49.7MB/s]


(remote Package)
 └─annotations/
   └─captions_train2017.json
   └─captions_val2017.json
   └─instances_train2017.json
   └─instances_val2017.json
   └─person_keypoints_train2017.json
   └─person_keypoints_val2017.json
 └─train2017/
   └─000000000009.jpg
   └─000000000025.jpg
   └─000000000030.jpg
   └─000000000034.jpg
   └─000000000036.jpg
   └─000000000042.jpg
   └─000000000049.jpg
   └─000000000061.jpg
   └─000000000064.jpg
   └─000000000071.jpg
   └─000000000072.jpg
 └─val2017/
 ...