In [1]:
import glob
import json
import numpy as np
import os
import shutil
from tqdm import tqdm
from pycocotools.coco import COCO  # https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocoDemo.ipynb

In [2]:
# Percentage encoded in the "csl%d" suffix of the source folder name that is read below.
prev_percent_kept = 30
# Percentile used for the per-category score thresholds in this pass; it is also
# encoded in the "_pfb_csl%d" suffix of the output folder name.
percent_kept = 30

In [3]:
# Source dataset: annotation file plus image folder of the not-yet-filtered data.
unfiltered_data_folder = "/media/data/dad/cnet/experiments/coco10novel/mix_n2000_o1_s1_p640_pfa_csl%d"%prev_percent_kept
unfiltered_annotation_path = os.path.join(unfiltered_data_folder, "annotation.json")
unfiltered_image_folder = os.path.join(unfiltered_data_folder, "images")

# Destination dataset: a sibling folder whose name records this filtering pass.
target_folder = unfiltered_data_folder + "_pfb_csl%d"%percent_kept
target_image_folder = os.path.join(target_folder, "images")
target_annotation_path = os.path.join(target_folder, "annotation.json")

# One makedirs call creates target_folder and its images/ subfolder.
# exist_ok=True makes the cell safe to re-run and avoids the race between a
# separate existence check and the directory creation.
os.makedirs(target_image_folder, exist_ok=True)

In [4]:
# Parse the source annotation file into a plain dict.
with open(unfiltered_annotation_path) as annotation_file:
    unfiltered_annotation = json.load(annotation_file)

loading annotations into memory...
Done (t=0.01s)
creating index...
index created!


In [5]:
# Quick look at the annotation structure: top-level keys, then for each section
# its first record, that record's keys, and finally the number of records.
_sections = ("images", "annotations", "categories")

print(unfiltered_annotation.keys())
for _section in _sections:
    print(unfiltered_annotation[_section][0])
for _section in _sections:
    print(unfiltered_annotation[_section][0].keys())
for _section in _sections:
    print(len(unfiltered_annotation[_section]))

dict_keys(['images', 'annotations', 'categories'])
{'license': 2, 'file_name': '000000568790.jpg', 'coco_url': 'http://images.cocodataset.org/train2017/000000568790.jpg', 'height': 426, 'width': 640, 'date_captured': '2013-11-21 00:25:07', 'flickr_url': 'http://farm2.staticflickr.com/1082/5110147496_81f18ceee0_z.jpg', 'id': 568790, 'csl_person': 16.453125}
{'segmentation': [[290.06, 195.29, 297.72, 190.5, 304.42, 188.59, 315.91, 184.76, 323.57, 184.76, 329.31, 180.93, 336.97, 175.19, 341.76, 172.31, 345.59, 172.31, 349.42, 187.63, 355.16, 200.08, 369.52, 213.48, 378.13, 216.35, 384.84, 218.27, 389.62, 218.27, 399.2, 216.35, 406.85, 200.08, 411.64, 175.19, 411.64, 164.66, 409.73, 153.17, 403.02, 144.55, 392.49, 135.94, 388.67, 134.02, 384.84, 134.02, 381.96, 128.28, 375.26, 118.71, 371.43, 116.79, 362.82, 116.79, 355.16, 120.62, 352.29, 120.62, 341.76, 115.83, 330.27, 115.83, 312.08, 110.09, 300.59, 108.18, 301.55, 115.83, 314.95, 119.66, 329.31, 123.49, 347.5, 133.07, 357.07, 136.89, 3

In [6]:
from collections import defaultdict

# Category names as listed in the annotation file (printed for reference).
cat_names = [category['name'] for category in unfiltered_annotation["categories"]]
print(cat_names)

# Group every per-image score stored under a "csl_<cat_name>" key by <cat_name>.
clip_scores = defaultdict(list)
for img_obj in unfiltered_annotation["images"]:
    for key, score in img_obj.items():
        if key[:3] != "csl":
            continue
        clip_scores[key[4:]].append(score)

# Per-category threshold: the `percent_kept`-th percentile of that category's scores.
csl_thres = {
    name: np.percentile(values, percent_kept)
    for name, values in clip_scores.items()
}
print(len(clip_scores))
print(csl_thres)

['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'boat', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'bottle', 'chair', 'couch', 'potted plant', 'dining table', 'tv']
20
{'person': 16.353125, 'dog': 17.303124999999998, 'sheep': 16.434375, 'motorcycle': 15.5046875, 'couch': 17.6125, 'bicycle': 15.496875, 'train': 20.56875, 'cat': 17.109375, 'car': 17.796875, 'chair': 18.1546875, 'horse': 15.8765625, 'cow': 15.4703125, 'airplane': 17.65625, 'bus': 16.3625, 'potted plant': 21.575, 'boat': 17.871875, 'bird': 17.759375, 'dining table': 17.796875, 'bottle': 17.38125, 'tv': 20.1640625}


In [7]:
old_images = unfiltered_annotation["images"]
old_annos = unfiltered_annotation["annotations"]


def _keep_image(img_obj, thresholds):
    """Return True if `img_obj` should stay in the filtered dataset.

    Images carrying a 'coco_url' entry (GT images) are always kept. Any other
    image is dropped as soon as one of its "csl_<cat_name>" scores exceeds the
    threshold for that category, i.e. its background is too similar to <cat_name>.
    """
    if 'coco_url' in img_obj:  # keep GT images
        return True
    return not any(
        score > thresholds[key[4:]]  # csl_<cat_name> -> <cat_name>
        for key, score in img_obj.items()
        if key[:3] == "csl"
    )


new_images = [img_obj for img_obj in old_images if _keep_image(img_obj, csl_thres)]
valid_image_ids = [img_obj['id'] for img_obj in new_images]

# Select annotations directly by their image_id instead of going through a
# pycocotools index: no `coco` object is created anywhere in this notebook, and
# COCO.getAnnIds(imgIds=[]) would return *every* annotation when no image
# survives. The set makes each membership test O(1).
_valid_image_id_set = set(valid_image_ids)
new_annos = [anno for anno in old_annos if anno['image_id'] in _valid_image_id_set]
valid_ann_ids = [anno['id'] for anno in new_annos]

# NOTE: from here on `unfiltered_annotation` holds the *filtered* images and
# annotations; the following cells read them from this dict.
unfiltered_annotation["images"] = new_images
unfiltered_annotation["annotations"] = new_annos

In [8]:
# Sizes after filtering (images, annotations) followed by the sizes before it.
print(len(new_images), len(new_annos), len(old_images), len(old_annos), sep="\n")

248
331
724
807


In [9]:
# Serialize the (now filtered) annotation dict into the target folder.
with open(target_annotation_path, "w+") as annotation_out:
    json.dump(unfiltered_annotation, annotation_out)

In [10]:
# Copy every image still listed in the annotation from the source image folder
# to the target image folder, keeping the file name.
img_names = [image_record['file_name'] for image_record in unfiltered_annotation["images"]]
for img_name in img_names:
    source_path = os.path.join(unfiltered_image_folder, img_name)
    destination_path = os.path.join(target_image_folder, img_name)
    shutil.copy(source_path, destination_path)