This notebook downloads captions corresponding to the 73k images. It queries them by coco id, and then stores them in an hdf5 array.

In [1]:
import json

TRAIN_CAPTIONS_DIR = "/srv/eeg_reconstruction/shared/coco/captions_train2017.json"
VAL_CAPTIONS_DIR = "/srv/eeg_reconstruction/shared/coco/captions_val2017.json"

In [2]:
with open(TRAIN_CAPTIONS_DIR, 'r') as file:
    captions_train2017 = json.load(file)['annotations']

with open(VAL_CAPTIONS_DIR, 'r') as file:
    captions_val2017 = json.load(file)['annotations']

In [6]:
def find_captions(image_id, coco_split):
    if coco_split == 'train2017':
        caption_list = captions_train2017
    elif coco_split == 'val2017':
        caption_list = captions_val2017
    else:
        return None

    captions = [caption['caption'] for caption in caption_list if caption['image_id'] == int(image_id)]
    return captions

In [7]:
find_captions(203564, "train2017")

['A bicycle replica with a clock as the front wheel.',
 'The bike has a clock as a tire.',
 'A black metal bicycle with a clock inside the front wheel.',
 'A bicycle figurine in which the front wheel is replaced with a clock\n',
 'A clock with the appearance of the wheel of a bicycle ']

In [8]:
import pandas as pd

NSD_COCO_MAP_DIR = "/srv/eeg_reconstruction/shared/natural_scenes_dataset/nsd_stim_info_merged.pkl"

data = pd.read_pickle(NSD_COCO_MAP_DIR)


In [15]:
from tqdm import tqdm

captions = []

for index, row in tqdm(data.iterrows(), total=data.shape[0]):
    captions.append(find_captions(row["cocoId"], row["cocoSplit"]))


100%|██████████| 73000/73000 [34:08<00:00, 35.64it/s]  


In [16]:
import pickle

OUTPUT_PATH = "/srv/eeg_reconstruction/shared/biosemi-dataset/captions.pkl"
with open(OUTPUT_PATH, 'wb') as file:
    pickle.dump(captions, file)

In [19]:
# Verify
with open(OUTPUT_PATH, 'rb') as file:
    test_captions = pickle.load(file)

print(test_captions[72996])

['Potted plants lined up including a toilet bowl full of flowers', 'Someone has planted some plants in a toilet.', 'Geraniums and petunias in containers, including a toilet.', 'A white toilet sitting in the middle of potted flowers.', 'some potted plants and a white toilet is also used as one']
