In [3]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import json
from collections import Counter

In [4]:



# Run configuration: vocabulary size plus the input and output folders.
N_class = 3000  # number of most frequent synsets kept as classes
raw_data_dir = 'D:\\WORK\\M.SC\\MSC_Project\\GitHub\\Dataset\\VG\\'
output_dir = 'D:\\WORK\\M.SC\\MSC_Project\\GitHub\\Image2Description\\'

# ---------------------------------------------------------------------------- #
# Load raw VG annotations and collect top-frequent synsets
# ---------------------------------------------------------------------------- #

# File paths are built by plain string concatenation, so the directory
# constants above must keep their trailing separator.
with open(raw_data_dir + 'image_data.json') as f:
    raw_img_data = json.load(f)
with open(raw_data_dir + 'objects.json') as f:
    raw_obj_data = json.load(f)

# Flatten the synset labels of every object of every image into one list.
# An object that carries several synsets contributes one entry per synset.
all_synsets = []
for img_entry in raw_obj_data:
    for obj_entry in img_entry['objects']:
        all_synsets.extend(obj_entry['synsets'])

# Rank the labels by frequency and keep only the names of the N_class most
# common ones (the counts themselves are discarded).
synset_counter = Counter(all_synsets)
top_synsets = [name for name, _count in synset_counter.most_common(N_class)]

# ---------------------------------------------------------------------------- #
# build raw "categories"
# ---------------------------------------------------------------------------- #

# Category ids are 1-based and follow the order of `top_synsets`.
# At this stage 'name' is still the full synset string.
categories = []
for rank, synset_name in enumerate(top_synsets):
    categories.append({'id': rank + 1, 'name': synset_name})

# Reverse lookup: synset string -> category id.
synset2cid = {}
for category in categories:
    synset2cid[category['name']] = category['id']

# ---------------------------------------------------------------------------- #
# build "image"
# ---------------------------------------------------------------------------- #

# One record per entry of the raw image metadata. The file name is derived
# from the image id ("<image_id>.jpg"); 'coco_id' is copied through as-is.
images = []
for record in raw_img_data:
    images.append({
        'id': record['image_id'],
        'width': record['width'],
        'height': record['height'],
        'file_name': str(record['image_id']) + '.jpg',
        'coco_id': record['coco_id'],
    })

# ---------------------------------------------------------------------------- #
# build "annotations"
# ---------------------------------------------------------------------------- #

annotations = []
# Counters for dropped objects:
#   skip_count_1 - object has no synset
#   skip_count_2 - object has more than one synset
#   skip_count_3 - the single synset is not among the kept categories
skip_count_1, skip_count_2, skip_count_3 = 0, 0, 0
for vg_image in raw_obj_data:
    for vg_object in vg_image['objects']:
        synsets = vg_object['synsets']
        n_synsets = len(synsets)
        if n_synsets == 0:
            skip_count_1 += 1
            continue
        if n_synsets > 1:
            skip_count_2 += 1
            continue
        synset = synsets[0]
        if synset not in synset2cid:
            skip_count_3 += 1
            continue

        # Only objects with exactly one synset that maps to a kept category
        # reach this point.
        left, top = vg_object['x'], vg_object['y']
        box_w, box_h = vg_object['w'], vg_object['h']
        annotations.append({
            'id': vg_object['object_id'],
            'image_id': vg_image['image_id'],
            'category_id': synset2cid[synset],
            'segmentation': [],
            'area': box_w * box_h,
            # NOTE(review): the box is stored as corner coordinates
            # [x1, y1, x2, y2]. The COCO annotation format defines 'bbox' as
            # [x, y, width, height] - confirm that downstream readers of this
            # file expect the corner form.
            'bbox': [left, top, left + box_w, top + box_h],
            'iscrowd': 0,
        })

# ---------------------------------------------------------------------------- #
# Save to json file
# ---------------------------------------------------------------------------- #
# Rebuild the category list for export: same 1-based ids, but the name is
# shortened to the text before the first '.' of the synset string.
categories = []
for rank, synset_name in enumerate(top_synsets):
    categories.append({'id': rank + 1, 'name': synset_name.partition('.')[0]})

# Write images, annotations and categories together as one JSON document.
vg3k_raw_payload = {
    'images': images,
    'annotations': annotations,
    'categories': categories,
}
with open(output_dir + 'instances_vg3k_raw.json', 'w') as f:
    json.dump(vg3k_raw_payload, f)

In [5]:
import json
import random

vg3k_raw_json_file = 'D:\\WORK\\M.SC\\MSC_Project\\GitHub\\Image2Description\\instances_vg3k_raw.json'
output_dir = 'D:\\WORK\\M.SC\\MSC_Project\\GitHub\\Image2Description\\'

# Load the VG-3k annotation file (a dict with 'images', 'annotations' and
# 'categories' entries).
with open(vg3k_raw_json_file) as f:
    dataset_vg3k = json.load(f)

vg3k_cls_names = dataset_vg3k['categories']

# Fractions of the images assigned to each split.
train_split = 0.80
val_split = 0.1
test_split = 0.1
# The test split receives whatever is left after train and val, so the three
# fractions are only meaningful if they add up to one.
assert abs(train_split + val_split + test_split - 1.0) < 1e-9, \
    'train/val/test fractions must sum to 1'

# Fixed seed so that re-running the notebook reproduces the same partition.
# A dedicated Random instance is used to leave the global random state alone.
split_seed = 42

# Shuffle image positions (not the records themselves) and slice the
# shuffled order into three consecutive, non-overlapping ranges.
total_images = len(dataset_vg3k['images'])
indices = list(range(total_images))
random.Random(split_seed).shuffle(indices)

# Train and val sizes are rounded down; test absorbs the rounding remainder.
num_train = int(train_split * total_images)
num_val = int(val_split * total_images)
num_test = total_images - num_train - num_val

# Slice with explicit start offsets. A negative-length slice such as
# indices[-num_test:] would return *every* index when num_test is 0.
val_end = num_train + num_val
images_train = [dataset_vg3k['images'][i] for i in indices[:num_train]]
images_val = [dataset_vg3k['images'][i] for i in indices[num_train:val_end]]
images_test = [dataset_vg3k['images'][i] for i in indices[val_end:]]

# Image ids of each split, kept as sets so membership tests are cheap.
imgids_train = set(img['id'] for img in images_train)
imgids_val = set(img['id'] for img in images_val)
imgids_test = set(img['id'] for img in images_test)

# Route every annotation to the split(s) that own its image, preserving the
# original annotation order. The three membership tests are independent, so
# an annotation whose image id is in no set is simply dropped.
annotations_train = []
annotations_val = []
annotations_test = []
for ann in dataset_vg3k['annotations']:
    owner_id = ann['image_id']
    if owner_id in imgids_train:
        annotations_train.append(ann)
    if owner_id in imgids_val:
        annotations_val.append(ann)
    if owner_id in imgids_test:
        annotations_test.append(ann)

# Assemble one dataset dict per split. All three reference the same
# category list, so the class vocabulary is identical across splits.
dataset_vg3k_train = dict(
    images=images_train,
    annotations=annotations_train,
    categories=vg3k_cls_names,
)
dataset_vg3k_val = dict(
    images=images_val,
    annotations=annotations_val,
    categories=vg3k_cls_names,
)
dataset_vg3k_test = dict(
    images=images_test,
    annotations=annotations_test,
    categories=vg3k_cls_names,
)

# Write each split to its own JSON file inside output_dir.
split_outputs = (
    ('instances_vg3k_cocoaligned_train.json', dataset_vg3k_train),
    ('instances_vg3k_cocoaligned_val.json', dataset_vg3k_val),
    ('instances_vg3k_cocoaligned_test.json', dataset_vg3k_test),
)
for split_file_name, split_dataset in split_outputs:
    with open(output_dir + split_file_name, 'w') as f:
        json.dump(split_dataset, f)

print('Dataset split completed.')


Dataset split completed.
