In [71]:
import json
import os
from tqdm import tqdm

# Load the references and the instances files.
# Context managers close each file handle as soon as its JSON has been parsed.
with open(os.getcwd() + "/data/refcocog/refs(google).json") as refs_file:
    refs_list = json.load(refs_file)
with open(os.getcwd() + "/data/refcocog/instances.json") as instances_file:
    instances_dict = json.load(instances_file)

# refcocog/refs(google).json does not have any references with a file_name that corresponds to the images in /test2014,
# therefore we will need to split train2014 into a new train/val/test
images_list = os.listdir(os.getcwd() + "/data/images/mscoco/images/train2014")

# The directory listed above is train2014, so the first label says train2014.
print("Total Number of Images in /train2014: ", len(images_list))
print("Total Number of Images accounted for in instances.json: ", len(instances_dict["images"]))
print("Total Number of Annotations accounted for in instances.json: ", len(instances_dict["annotations"]))
print("Total Number of References accounted for in refs.json: ", len(refs_list))


Total Number of Images in /test2014:  82783
Total Number of Images accounted for in instances.json:  25799
Total Number of Annotations accounted for in instances.json:  208960
Total Number of References accounted for in refs.json:  49822


In [72]:
# determine which images are called by both refs.json and instances.json
# Membership is tested against sets (constant-time lookups) while the result
# lists preserve first-seen order, so the outputs match the list-only version.
available_images = set(images_list)

images_in_refs = []
seen_in_refs = set()
for ref in tqdm(refs_list):
    # Drop the last "_"-separated component of the ref's file_name and
    # append ".jpg" to get the name that is looked up in the image listing.
    file_name = '_'.join(ref["file_name"].split("_")[:-1]) + ".jpg"
    if file_name in available_images and file_name not in seen_in_refs:
        seen_in_refs.add(file_name)
        images_in_refs.append(file_name)

images_in_refs_and_instances = []
seen_in_instances = set()
for image in tqdm(instances_dict["images"]):
    image_file_name = image["file_name"]
    if image_file_name in seen_in_refs and image_file_name not in seen_in_instances:
        seen_in_instances.add(image_file_name)
        images_in_refs_and_instances.append(image_file_name)

print("Num images after refs filter: ", len(images_in_refs))
print("Num images after instances filter: ", len(images_in_refs_and_instances))

100%|██████████| 49822/49822 [00:22<00:00, 2211.29it/s]
100%|██████████| 25799/25799 [00:06<00:00, 4254.44it/s]

Num images after refs filter:  25799
Num images after instances filter:  25799





In [73]:
# From this cell on, images_list no longer holds the raw train2014 directory
# listing: it is rebound to the filtered list of images that appear in both
# refs and instances. Both names now refer to the same list object.
images_list = images_in_refs_and_instances
        


In [75]:
# make the test_RefCOCOg dataset
from datetime import datetime

# Timestamp makes each generated dataset directory unique; ":" is replaced
# with "_" in the directory name.
current_time_stamp = str(datetime.now()).replace(":","_")

# Both paths are relative to the working directory and start with "/", so they
# are appended to os.getcwd() by plain string concatenation.
DATASET_PATH = f"/generated_datasets/test_RefCOCOg_{current_time_stamp}"
DATASET_IMAGES_PATH = DATASET_PATH + "/images/"

# makedirs creates the images directory together with any missing parents
# (including generated_datasets/ on a fresh checkout), where two os.mkdir calls
# would fail with FileNotFoundError.
os.makedirs(os.getcwd() + DATASET_IMAGES_PATH)


In [76]:
import shutil

# Upper bound on the number of images kept in the generated dataset.
MAX_TEST_IMAGES = 500

# Subset of instances_dict restricted to the selected images; "info" and
# "categories" are carried over unchanged.
test_instances_dict = {
    "info": instances_dict["info"],
    "images": [],
    "annotations": [],
    "categories": instances_dict["categories"],
}

images_list = images_list[0:MAX_TEST_IMAGES]    # shrink the dataset
selected_images = set(images_list)              # constant-time membership tests in the loops below

# Path prefixes do not change between iterations, so build them once.
src_dir = os.getcwd() + "/data/images/mscoco/images/train2014/"
dst_dir = os.getcwd() + DATASET_IMAGES_PATH

# keep the image records of the selected images and copy their files into the dataset
for image in tqdm(instances_dict["images"]):
    if image["file_name"] in selected_images:
        test_instances_dict["images"].append(image)

        src_path = src_dir + image["file_name"]
        dst_path = dst_dir + image["file_name"]
        shutil.copy(src_path, dst_path)

# keep the annotations whose image_id maps to a selected image file name
for anno in tqdm(instances_dict["annotations"]):
    # File name is rebuilt from the id: fixed prefix + id zero-padded to 12 digits.
    image_name = "COCO_train2014_" + str(anno['image_id']).zfill(12) + ".jpg"
    if image_name in selected_images:
        test_instances_dict["annotations"].append(anno)

print("num images: ",len(test_instances_dict["images"]))
print("num annos: ",len(test_instances_dict["annotations"]))

100%|██████████| 25799/25799 [00:00<00:00, 67894.81it/s]
100%|██████████| 208960/208960 [00:00<00:00, 226044.41it/s]

num images:  500
num annos:  3865





In [77]:
# Collect every reference whose image is part of the selected image list, and
# record which selected images actually received at least one reference.
test_refs_list = []
selected_images = set(images_list)    # constant-time membership tests
image_names_with_refs = set()

for ref in tqdm(refs_list):
    # Drop the last "_"-separated component of the ref's file_name and
    # append ".jpg" to get the image file name.
    image_name = '_'.join(ref["file_name"].split("_")[:-1])+".jpg"
    if image_name in selected_images:
        test_refs_list.append(ref)
        image_names_with_refs.add(image_name)

# Selected images that no reference points to. This list used to stay empty,
# which made the "found" count below always equal len(images_list).
image_names_not_found = [name for name in images_list if name not in image_names_with_refs]

print(f"Number of Images found: {len(images_list)-len(image_names_not_found)}")
print(f"Number of References: {len(test_refs_list)}")

100%|██████████| 49822/49822 [00:00<00:00, 209068.13it/s]

Number of Images found: 500
Number of References: 943





# generate instances.json and refs.json

In [78]:
# Write the two JSON files of the generated dataset: the instances subset
# first, then the list of matching references.
output_dir = os.getcwd() + DATASET_PATH
for json_name, payload in (
    ("/instances.json", test_instances_dict),
    ("/refs.json", test_refs_list),
):
    with open(output_dir + json_name, 'w') as f:
        json.dump(payload, f)