In [1]:
import os.path as op
import pandas as pd
import json
import concurrent.futures
from tqdm import tqdm

# Define data directories and files
data_dir = '/root/autodl-tmp/.autodl/Projects/fMRI2TextAligner/data'
nsddata_folder = op.join(data_dir, 'nsd')
responses_file = op.join(nsddata_folder, 'responses.tsv')
stim_info_file = op.join(nsddata_folder, 'nsd_stim_info_merged.csv')
caption_files = {
    'val2017': op.join(data_dir, 'coco/annotations/captions_val2017.json'),
    'train2017': op.join(data_dir, 'coco/annotations/captions_train2017.json')
}


In [2]:
'''
cocoSplit:
    0: train2017
    1: val2017
shared1000:
    1: True
    0: False
'''

# Read responses.tsv file
responses_df = pd.read_csv(responses_file, sep='\t', usecols=['SUBJECT', 'SESSION', '73KID'])
responses_df['73KID'] = responses_df['73KID'] - 1
responses_df['trial'] = (responses_df.groupby('SESSION').cumcount() + 1) % 750
responses_df['trial'] = responses_df['trial'].apply(lambda x: x if x != 0 else 750)
responses_df['fMRI_id'] = range(len(responses_df))
nsd_dataset_file = 'nsd_dataset.csv'
responses_df.to_csv(nsd_dataset_file, index=False)

# Read stim_info_file
stim_info_df = pd.read_csv(stim_info_file, usecols=['cocoId', 'cocoSplit', 'nsdId', 'shared1000'])
stim_info_df['shared1000'] = stim_info_df['shared1000'].replace({True: 1, False: 0})

# Merge data
nsd_dataset_df = pd.read_csv(nsd_dataset_file)
merged_df = pd.merge(nsd_dataset_df, stim_info_df, left_on='73KID', right_on='nsdId')
merged_df = merged_df.drop(columns=['nsdId'])

# Check for missing 73KIDs
missing_73KIDs = set(nsd_dataset_df['73KID']) - set(stim_info_df['nsdId'])
if missing_73KIDs:
    print(f"Missing 73KIDs: {missing_73KIDs}")

merged_df

  stim_info_df['shared1000'] = stim_info_df['shared1000'].replace({True: 1, False: 0})


Unnamed: 0,SUBJECT,SESSION,73KID,trial,fMRI_id,cocoId,cocoSplit,shared1000
0,1,1,46002,1,0,412922,train2017,1
1,1,1,61882,2,1,474858,train2017,0
2,1,1,828,3,2,320696,val2017,0
3,1,1,67573,4,3,234676,train2017,0
4,1,1,16020,5,4,301595,train2017,0
...,...,...,...,...,...,...,...,...
29995,1,40,13773,746,29995,32606,train2017,0
29996,1,40,66767,747,29996,388123,train2017,0
29997,1,40,53167,748,29997,179070,train2017,0
29998,1,40,1943,749,29998,13597,val2017,0


In [3]:
# Split according to the value of the shared1000 column
df_train = merged_df[merged_df['shared1000'] == 0]
df_val = merged_df[merged_df['shared1000'] == 1]

# make train dataset

In [5]:
df_train

Unnamed: 0,SUBJECT,SESSION,73KID,trial,fMRI_id,cocoId,cocoSplit,shared1000
1,1,1,61882,2,1,474858,train2017,0
2,1,1,828,3,2,320696,val2017,0
3,1,1,67573,4,3,234676,train2017,0
4,1,1,16020,5,4,301595,train2017,0
5,1,1,40422,6,5,129059,train2017,0
...,...,...,...,...,...,...,...,...
29995,1,40,13773,746,29995,32606,train2017,0
29996,1,40,66767,747,29996,388123,train2017,0
29997,1,40,53167,748,29997,179070,train2017,0
29998,1,40,1943,749,29998,13597,val2017,0


In [6]:
# Preload captions data
def load_captions(caption_file):
    with open(caption_file, 'r') as f:
        captions_data = json.load(f)
    captions_dict = {}
    for ann in captions_data['annotations']:
        if ann['image_id'] not in captions_dict:
            captions_dict[ann['image_id']] = []
        captions_dict[ann['image_id']].append((ann['id'], ann['caption']))
    return captions_dict

captions_dict = {split: load_captions(file) for split, file in caption_files.items()}

# Get captions using the caption json file of the coco dataset
def get_caption(row):
    coco_split = row['cocoSplit']
    coco_id = row['cocoId']
    if coco_split in captions_dict and coco_id in captions_dict[coco_split]:
        return captions_dict[coco_split][coco_id]
    return []

# Function to process each row and get all captions
def process_row(row):
    captions = get_caption(row)
    results = []
    for caption_id, caption in captions:
        new_row = row.copy()
        new_row['caption_id'] = caption_id
        new_row['caption'] = caption
        results.append(new_row)
    return results

# Use ThreadPoolExecutor to process rows in parallel
all_results = []
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [executor.submit(process_row, row) for row in df_train.to_dict('records')]
    for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Processing rows"):
        all_results.extend(future.result())

# Convert results to DataFrame
result_train_df = pd.DataFrame(all_results)

result_train_df

Processing rows: 100%|██████████| 27000/27000 [00:00<00:00, 250733.31it/s]


Unnamed: 0,SUBJECT,SESSION,73KID,trial,fMRI_id,cocoId,cocoSplit,shared1000,caption_id,caption
0,1,7,37535,440,4939,117765,train2017,0,721617,An airplane flying over a city near the ocean.
1,1,7,37535,440,4939,117765,train2017,0,724929,An aerial view of a shipping container yard.
2,1,7,37535,440,4939,117765,train2017,0,725247,An arial view looking down shows a man made is...
3,1,7,37535,440,4939,117765,train2017,0,725715,the view of the ground from a plane in the air
4,1,7,37535,440,4939,117765,train2017,0,725781,aerial view of water and land from a airplane
...,...,...,...,...,...,...,...,...,...,...
135061,1,25,10813,671,18670,286171,train2017,0,464338,A woman sitting next to Ronald McDonald talkin...
135062,1,25,10813,671,18670,286171,train2017,0,466009,Girl on phone looking up a statue of Ronald Mc...
135063,1,25,10813,671,18670,286171,train2017,0,466504,A woman on the cell phone sitting next to a re...
135064,1,25,10813,671,18670,286171,train2017,0,467680,A woman on her cell phone is sitting next to a...


In [7]:
df = result_train_df
fMRIid = df['fMRI_id'] 
captions = df['caption'] 
caption_id = df['caption_id'] 
new_df = pd.DataFrame({'filepath': fMRIid, 'title': captions, 'caption_id': caption_id})

new_df

Unnamed: 0,filepath,title,caption_id
0,4939,An airplane flying over a city near the ocean.,721617
1,4939,An aerial view of a shipping container yard.,724929
2,4939,An arial view looking down shows a man made is...,725247
3,4939,the view of the ground from a plane in the air,725715
4,4939,aerial view of water and land from a airplane,725781
...,...,...,...
135061,18670,A woman sitting next to Ronald McDonald talkin...,464338
135062,18670,Girl on phone looking up a statue of Ronald Mc...,466009
135063,18670,A woman on the cell phone sitting next to a re...,466504
135064,18670,A woman on her cell phone is sitting next to a...,467680


In [8]:
df_shuffled = new_df.sample(frac=1).reset_index(drop=True)
df_shuffled

Unnamed: 0,filepath,title,caption_id
0,29365,An elephant that is walking through some water.,136154
1,9274,A tractor truck painted in camouflage sitting ...,688290
2,10579,A piece of cheesecake and 2 pieces of an Engli...,145394
3,24233,A street sign next to a public road in a city.,28523
4,19691,A small brown and white dog lying by a window ...,496097
...,...,...,...
135061,25250,Man in a full wet suit holding a surfboard ove...,122331
135062,8396,A man is on the couch next a dog laying down.,368997
135063,15518,A pickup truck bed with chairs sitting in it.,484941
135064,7959,a group of elephants sitting in some dead grass,616605


In [9]:
train_csv_file_path = './train.csv'
df_shuffled.to_csv(train_csv_file_path, index=False)

# make val dataset

In [10]:
# Use ThreadPoolExecutor to process rows in parallel
all_results = []
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [executor.submit(process_row, row) for row in df_val.to_dict('records')]
    for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Processing rows"):
        all_results.extend(future.result())

# Convert results to DataFrame
result_val_df = pd.DataFrame(all_results)
result_val_df

Processing rows: 100%|██████████| 3000/3000 [00:00<00:00, 271253.60it/s]


Unnamed: 0,SUBJECT,SESSION,73KID,trial,fMRI_id,cocoId,cocoSplit,shared1000,caption_id,caption
0,1,37,6132,19,27018,1625,train2017,1,438502,A bicycle with a book bag chained to a bike st...
1,1,37,6132,19,27018,1625,train2017,1,439840,A bike is shown hooked up to a rack.
2,1,37,6132,19,27018,1625,train2017,1,443494,a white and blue bus and a red bicycle
3,1,37,6132,19,27018,1625,train2017,1,446293,A bus driving down a street next to a parked b...
4,1,37,6132,19,27018,1625,train2017,1,448081,A bicycle parked outside on a bicycle ramp.
...,...,...,...,...,...,...,...,...,...,...
15004,1,38,4768,539,28288,267647,train2017,1,366963,A cake with a large piece cut out of it
15005,1,38,4768,539,28288,267647,train2017,1,369513,A cake with a piece cut out is shown in a box.
15006,1,38,4768,539,28288,267647,train2017,1,370683,A birthday cake on a doily is cut to show it's...
15007,1,38,4768,539,28288,267647,train2017,1,371745,A birthday cake with a piece cut out sitting o...


In [11]:
df = result_val_df
fMRIid = df['fMRI_id'] 
captions = df['caption'] 
caption_id = df['caption_id'] 
new_df = pd.DataFrame({'filepath': fMRIid, 'title': captions, 'caption_id': caption_id})

new_df

Unnamed: 0,filepath,title,caption_id
0,27018,A bicycle with a book bag chained to a bike st...,438502
1,27018,A bike is shown hooked up to a rack.,439840
2,27018,a white and blue bus and a red bicycle,443494
3,27018,A bus driving down a street next to a parked b...,446293
4,27018,A bicycle parked outside on a bicycle ramp.,448081
...,...,...,...
15004,28288,A cake with a large piece cut out of it,366963
15005,28288,A cake with a piece cut out is shown in a box.,369513
15006,28288,A birthday cake on a doily is cut to show it's...,370683
15007,28288,A birthday cake with a piece cut out sitting o...,371745


In [12]:
df_shuffled = new_df.sample(frac=1).reset_index(drop=True)
df_shuffled

Unnamed: 0,filepath,title,caption_id
0,6273,The boy rides the ocean waves on a wake board.,4091
1,13404,A boy flying a kite with a dog during the day.,332687
2,17382,A train with several carts sitting on a track.,788628
3,21108,Traffic signs against a residential building m...,479268
4,24781,"Two cell phones on a table, one is charging.",196380
...,...,...,...
15004,4500,The animals are extremely hairy and are walkin...,786370
15005,27710,Two people in chef's outfits cooking inside a ...,738654
15006,9791,A man riding a skateboard down a step.,566168
15007,4783,An adult elephant with a baby elephant standin...,267054


In [13]:
val_csv_file_path = './val.csv'
df_shuffled.to_csv(val_csv_file_path, index=False)