# Data preparation from COCO dataset

Before extracting data, we should first import some necessary dependencies.

In [1]:
import os

import numpy as np
import requests
from pycocotools.coco import COCO
from tqdm import tqdm

Then we can go into the dataset directory and start making some data.

In [None]:
cd dataset

First, we should download the zip files of the COCO dataset and unzip them.

In [None]:
!wget http://images.cocodataset.org/zips/train2017.zip
!wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip
!unzip train2017.zip
!unzip annotations_trainval2017.zip

After downloading and unzipping files, we can use the API provided by the COCO dataset to extract specific images containing the type of object we want to classify.

If we want to classify more than one class, we should make sure that each class has a similar amount of data, so that the model can learn better. Therefore, we extract the image IDs of each object class separately and add them in equal shares to the final set of image IDs.

Of course, if you request more data than the COCO dataset has, the extraction below cannot deliver the requested amount. In addition, some data augmentation should be done to avoid excessive differences in the amount of data for each class.

## Single class

Here, we will demonstrate a single class extraction process:

In [3]:
# Use the COCO API to load the information stored in the annotations JSON file
coco = COCO('annotations/instances_train2017.json')

# Get the category IDs of the classes we want to extract
catIds = coco.getCatIds(catNms=['person'])

# Extract the image IDs of each class separately and store them in a 2D list
# (one inner list per category, in the same order as catIds)
imgIds = [coco.getImgIds(catIds=cat_id) for cat_id in catIds]

# Define the total number of images we want to extract
num_data = 100

# Add the image IDs of each class in equal shares to the final list of image IDs.
# The classes are visited round-robin; an image that was already selected for
# another class is skipped so that it is only taken once.
num_class = len(catIds)
helpset = set()                             # image IDs that were already selected
imgIds_final = []
candidates = [iter(ids) for ids in imgIds]  # one read cursor per class
exhausted = set()                           # indices of classes with no unused image left
turn = 0
while len(imgIds_final) < num_data and len(exhausted) < num_class:
    cls = turn % num_class
    turn += 1
    if cls in exhausted:
        continue
    # Advance this class's cursor to its next image that has not been used yet
    curr_id = next((img_id for img_id in candidates[cls] if img_id not in helpset), None)
    if curr_id is None:
        # This class ran out of images: keep filling from the remaining classes
        exhausted.add(cls)
        continue
    helpset.add(curr_id)
    imgIds_final.append(curr_id)
count = len(imgIds_final)

# Requesting more images than are available no longer raises an IndexError;
# report the shortfall instead
if count < num_data:
    print('Warning: only', count, 'of the requested', num_data, 'images could be selected.')

# Load the image info of the selected IDs into the list 'images'
images = coco.loadImgs(imgIds_final)

loading annotations into memory...
Done (t=22.69s)
creating index...
index created!


Then we need to create a lookup table that converts COCO's class labels into our own class labels.

In [4]:
# Lookup table from COCO category IDs (keys) to our own class labels (values)
reference = {
    1: 1,  # person: COCO ID 1 -> custom label 1
}

Finally, we can use the API provided by the COCO dataset to get the images and necessary annotations, and save them in the corresponding directory

In [5]:
# Make sure the output directories exist before anything is written into them
os.makedirs('image_train', exist_ok=True)
os.makedirs('annotation_train', exist_ok=True)

# For each iteration, an image and its corresponding annotation are saved
# under the same running index i
for i, im in enumerate(tqdm(images)):

    # Request the image from coco_url. The timeout (in seconds) stops a stalled
    # connection from hanging the loop, and raise_for_status() prevents an HTTP
    # error page from being written into the .jpg file.
    response = requests.get(im['coco_url'], timeout=30)
    response.raise_for_status()
    with open('image_train/' + str(i) + '.jpg', 'wb') as handler:
        handler.write(response.content)

    # Define a dictionary to store the annotations of this image
    img_inf = {
        'boxes': [],
        'labels': [],
        'masks': []
    }

    # Get the annotations of this image, queried one category at a time so that
    # they stay grouped in the order of catIds
    annIds = []
    for catId in catIds:
        annIds += coco.getAnnIds(imgIds=im['id'], catIds=catId, iscrowd=None)
    anns = coco.loadAnns(annIds)

    # Store the annotations in the format required by the model
    for ann in anns:
        # Convert the box from [x, y, width, height] to [x1, y1, x2, y2]
        x, y, width, height = ann['bbox']
        img_inf["boxes"].append([x, y, x + width, y + height])
        # Translate the COCO category ID into our own class label
        img_inf["labels"].append(reference[ann['category_id']])
        # Binary mask of the annotated object
        img_inf["masks"].append(coco.annToMask(ann))

    # Convert the annotation lists into np.array
    for key in ("boxes", "labels", "masks"):
        img_inf[key] = np.array(img_inf[key])

    # Save the annotation in a NPY file. The dictionary is stored as a pickled
    # object, so it has to be read back with
    # np.load(path, allow_pickle=True).item()
    np.save('annotation_train/' + str(i) + '.npy', img_inf)

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [01:55<00:00,  1.16s/it]


## Multiple classes

The same extraction process also works for multiple classes:

In [6]:
# Use the COCO API to load the information stored in the annotations JSON file
coco = COCO('annotations/instances_train2017.json')

# Get the category IDs of the classes we want to extract
catIds = coco.getCatIds(catNms=['person','cat','bird'])

# Extract the image IDs of each class separately and store them in a 2D list
# (one inner list per category, in the same order as catIds)
imgIds = [coco.getImgIds(catIds=cat_id) for cat_id in catIds]

# Define the total number of images we want to extract
num_data = 20

# Add the image IDs of each class in equal shares to the final list of image IDs.
# The classes are visited round-robin; an image that was already selected for
# another class is skipped so that it is only taken once.
num_class = len(catIds)
helpset = set()                             # image IDs that were already selected
imgIds_final = []
candidates = [iter(ids) for ids in imgIds]  # one read cursor per class
exhausted = set()                           # indices of classes with no unused image left
turn = 0
while len(imgIds_final) < num_data and len(exhausted) < num_class:
    cls = turn % num_class
    turn += 1
    if cls in exhausted:
        continue
    # Advance this class's cursor to its next image that has not been used yet
    curr_id = next((img_id for img_id in candidates[cls] if img_id not in helpset), None)
    if curr_id is None:
        # This class ran out of images: keep filling from the remaining classes
        exhausted.add(cls)
        continue
    helpset.add(curr_id)
    imgIds_final.append(curr_id)
count = len(imgIds_final)

# Requesting more images than are available no longer raises an IndexError;
# report the shortfall instead
if count < num_data:
    print('Warning: only', count, 'of the requested', num_data, 'images could be selected.')

# Load the image info of the selected IDs into the list 'images'
images = coco.loadImgs(imgIds_final)

loading annotations into memory...
Done (t=24.13s)
creating index...
index created!


In [7]:
# Lookup table from COCO category IDs (keys) to our own class labels (values)
reference = {
    1: 1,   # person: COCO ID 1  -> custom label 1
    17: 2,  # cat:    COCO ID 17 -> custom label 2
    16: 3,  # bird:   COCO ID 16 -> custom label 3
}

In [8]:
# Create the output directories for the multi-class data set.
# exist_ok=True makes the cell safe to re-run, and os.makedirs does not depend
# on the shell that is available on the host.
for out_dir in ('image_train_multi', 'annotation_train_multi'):
    os.makedirs(out_dir, exist_ok=True)

In [9]:
# For each iteration, an image and its corresponding annotation are saved
# under the same running index i (the output directories are created by the
# preceding cell)
for i, im in enumerate(tqdm(images)):

    # Request the image from coco_url. The timeout (in seconds) stops a stalled
    # connection from hanging the loop, and raise_for_status() prevents an HTTP
    # error page from being written into the .jpg file.
    response = requests.get(im['coco_url'], timeout=30)
    response.raise_for_status()
    with open('image_train_multi/' + str(i) + '.jpg', 'wb') as handler:
        handler.write(response.content)

    # Define a dictionary to store the annotations of this image
    img_inf = {
        'boxes': [],
        'labels': [],
        'masks': []
    }

    # Get the annotations of this image, queried one category at a time so that
    # they stay grouped in the order of catIds
    annIds = []
    for catId in catIds:
        annIds += coco.getAnnIds(imgIds=im['id'], catIds=catId, iscrowd=None)
    anns = coco.loadAnns(annIds)

    # Store the annotations in the format required by the model
    for ann in anns:
        # Convert the box from [x, y, width, height] to [x1, y1, x2, y2]
        x, y, width, height = ann['bbox']
        img_inf["boxes"].append([x, y, x + width, y + height])
        # Translate the COCO category ID into our own class label
        img_inf["labels"].append(reference[ann['category_id']])
        # Binary mask of the annotated object
        img_inf["masks"].append(coco.annToMask(ann))

    # Convert the annotation lists into np.array
    for key in ("boxes", "labels", "masks"):
        img_inf[key] = np.array(img_inf[key])

    # Save the annotation in a NPY file. The dictionary is stored as a pickled
    # object, so it has to be read back with
    # np.load(path, allow_pickle=True).item()
    np.save('annotation_train_multi/' + str(i) + '.npy', img_inf)

100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:21<00:00,  1.05s/it]
