# Import Relevant Libraries

In [1]:
import pycocotools.coco as coco
import json
import numpy as np

We will use the COCO API to read in the JSON files that contain the targets and all the annotations. It helps us work with the correspondence between images and annotations: every image has a unique image id, and every annotation has its own unique annotation id as well as a field holding the id of the image it belongs to.

In [2]:
# annotation JSON files for the training / validation splits
anot_path_train, anot_path_val = "instances_train2017.json", "instances_val2017.json"
# image locations for the training / validation splits
img_path_train, img_path_val = "../train/data", "../validation/data"

In [3]:
# build one COCO helper per split (training first, then validation) so that
# images and their annotations can be looked up by id
coco_train, coco_val = (
    coco.COCO(annotation_file)
    for annotation_file in (anot_path_train, anot_path_val)
)

loading annotations into memory...
Done (t=19.02s)
creating index...
index created!
loading annotations into memory...
Done (t=0.68s)
creating index...
index created!


In [4]:
# category ids to keep -- the same selection that was used in the EDA
cat_ids = [
    1, 2, 3, 4, 6, 7, 8,
    10, 11, 13, 14, 17, 18,
]
# image ids known to the training COCO object
imgs_train = coco_train.getImgIds()

In [5]:
# Choose 30,000 *distinct* random images from the larger training dataset.
# `np.random.choice` samples WITH replacement by default, which selects some
# images several times and leaves fewer than 30,000 unique images in the
# subset. Draw without replacement from a seeded generator instead, so the
# subset has the intended size and is identical after "Restart & Run All".
num_train_samples = 30000
rng = np.random.default_rng(42)  # fixed seed -> reproducible subset
indices = rng.choice(
    len(imgs_train),
    # cannot draw more distinct images than the dataset contains
    size=min(num_train_samples, len(imgs_train)),
    replace=False,
)

In [6]:
# Create a new annotation file that contains only the sampled images and only
# the annotations whose category is one of the categories that we need.

# The annotation JSON was already parsed when `coco_train` was created, so
# reuse that parsed copy instead of reading the large file from disk again.
train_data = coco_train.dataset
new_train_data = {
    'info': train_data['info'].copy(),
    'categories': train_data['categories'].copy(),
    'images': [],
    'annotations': [],
}

wanted_cats = set(cat_ids)  # set membership instead of scanning a list per annotation

# Drop repeated positions (keeping first-seen order) so that no image or
# annotation id is written twice even if `indices` contains duplicates.
for index in dict.fromkeys(int(i) for i in indices):
    img_id = imgs_train[index]
    # Fetch the image record by id rather than by its position in the raw JSON
    # list, so the image entry and its annotations are guaranteed to match.
    new_train_data['images'].append(coco_train.loadImgs([img_id])[0].copy())
    annotations = coco_train.loadAnns(coco_train.getAnnIds(imgIds=[img_id]))
    new_train_data['annotations'].extend(
        ann.copy() for ann in annotations if ann['category_id'] in wanted_cats
    )

# `with` closes the file on exit; no explicit close() is required.
with open('sub_samples_instances_train2017.json', 'w') as file:
    json.dump(new_train_data, file)

In [7]:
# image ids known to the validation COCO object
imgs_val = coco_val.getImgIds()

In [None]:
# Choose 5,000 *distinct* random images from the validation dataset.
# `np.random.choice` samples WITH replacement by default, which selects some
# images several times and leaves fewer than 5,000 unique images in the
# subset. Draw without replacement from a seeded generator instead.
# If the validation file holds no more than 5,000 images, this simply selects
# all of them in random order.
num_val_samples = 5000
rng = np.random.default_rng(42)  # fixed seed -> reproducible subset
indices = rng.choice(
    len(imgs_val),
    # cannot draw more distinct images than the dataset contains
    size=min(num_val_samples, len(imgs_val)),
    replace=False,
)

In [11]:
# Create a new annotation file that contains only the sampled images and only
# the annotations whose category is one of the categories that we need.

# `indices` is also used for the training subset; fail with a clear message if
# it still holds positions that do not exist in the validation set (i.e. the
# validation sampling cell was not run before this one).
if len(indices) and max(indices) >= len(imgs_val):
    raise IndexError(
        "`indices` does not match the validation set; "
        "run the validation sampling cell before this one"
    )

# The annotation JSON was already parsed when `coco_val` was created, so reuse
# that parsed copy instead of reading the file from disk again.
val_data = coco_val.dataset
new_val_data = {
    'info': val_data['info'].copy(),
    'categories': val_data['categories'].copy(),
    'images': [],
    'annotations': [],
}

wanted_cats = set(cat_ids)  # set membership instead of scanning a list per annotation

# Drop repeated positions (keeping first-seen order) so that no image or
# annotation id is written twice even if `indices` contains duplicates.
for index in dict.fromkeys(int(i) for i in indices):
    img_id = imgs_val[index]
    # Fetch the image record by id rather than by its position in the raw JSON
    # list, so the image entry and its annotations are guaranteed to match.
    new_val_data['images'].append(coco_val.loadImgs([img_id])[0].copy())
    annotations = coco_val.loadAnns(coco_val.getAnnIds(imgIds=[img_id]))
    new_val_data['annotations'].extend(
        ann.copy() for ann in annotations if ann['category_id'] in wanted_cats
    )

# `with` closes the file on exit; no explicit close() is required.
with open('sub_samples_instances_val2017.json', 'w') as file:
    json.dump(new_val_data, file)