In [2]:
import concurrent.futures
import json
import os
import os.path as op
import random
import time
from io import BytesIO

import h5py
import matplotlib.pyplot as plt
import pandas as pd
from PIL import Image
from pycocotools.coco import COCO
from tqdm import tqdm

In [14]:
# Notebook-relative data root; the NSD files are expected in its `nsd` subfolder.
data_dir = '../data'
nsddata_folder = op.join(data_dir, 'nsd')

# Echo the resolved folder as the cell output.
nsddata_folder

'../data/nsd'

In [22]:
# Subject whose behavioural responses are processed.
subj = 'subj01'

# If you download the data from the NSD website directly, the files live at:
# responses_file = f'./nsd/nsddata/ppdata/{subj}/behav/responses.tsv'
# stim_info_file = './nsd/nsddata/experiments/nsd/nsd_stim_info_merged.csv'

# Data root. The default is the original author's machine-specific location;
# set the NSD_DATA_DIR environment variable to point somewhere else without
# editing the notebook.
data_dir = os.environ.get(
    'NSD_DATA_DIR',
    '/root/autodl-tmp/.autodl/Projects/fMRI2TextAligner/data',
)

nsddata_folder = op.join(data_dir, 'nsd')

# Per-trial behavioural responses (tab-separated).
responses_file = op.join(nsddata_folder, 'responses.tsv')

# Stimulus metadata table (nsd_stim_info_merged.csv).
stim_info_file = op.join(nsddata_folder, 'nsd_stim_info_merged.csv')

# COCO caption annotation files, keyed by split name.
caption_files = {
    'val2017': op.join(data_dir, 'coco/annotations/captions_val2017.json'),
    'train2017': op.join(data_dir, 'coco/annotations/captions_train2017.json')
}

In [24]:
# Read responses.tsv, number the trials within each session, and attach the
# COCO stimulus metadata to every trial.
start_time = time.time()  # kept for timing; not read in the visible cells

# The trial counter wraps after this many trials.
TRIALS_PER_SESSION = 750

responses_df = pd.read_csv(responses_file, sep='\t', usecols=['SUBJECT', 'SESSION', '73KID'])

# Shift the IDs by one so they line up with `nsdId` in the stimulus table
# (presumably 1-based -> 0-based; confirm against the NSD data manual).
responses_df['73KID'] = responses_df['73KID'] - 1

# Add trial column: 1..TRIALS_PER_SESSION within each session, in file order.
# `cumcount() % N + 1` equals the original `(cumcount() + 1) % N` with 0
# mapped to N, but stays vectorised instead of using a per-row lambda.
responses_df['trial'] = responses_df.groupby('SESSION').cumcount() % TRIALS_PER_SESSION + 1

# Save the intermediate table to nsd_dataset.csv
nsd_dataset_file = 'nsd_dataset.csv'
responses_df.to_csv(nsd_dataset_file, index=False)

stim_info_df = pd.read_csv(stim_info_file, usecols=['cocoId', 'cocoSplit', 'nsdId', 'shared1000'])

# Use the in-memory frame directly; re-reading the CSV just written yields the
# same table at the cost of a disk round-trip.
nsd_dataset_df = responses_df.copy()

# Check for IDs with no stimulus metadata *before* merging: the inner join
# below silently drops those trials.
missing_73KIDs = set(nsd_dataset_df['73KID']) - set(stim_info_df['nsdId'])
if missing_73KIDs:
    print(f"Missing 73KIDs: {missing_73KIDs}")

# Merge data
merged_df = pd.merge(nsd_dataset_df, stim_info_df, left_on='73KID', right_on='nsdId')
# Preload captions data
def load_captions(caption_file):
    """Load a COCO captions file into an ``image_id -> caption`` mapping.

    Parameters
    ----------
    caption_file : str
        Path to a JSON file containing an ``annotations`` list whose entries
        have ``image_id`` and ``caption`` keys.

    Returns
    -------
    dict
        One caption per image id. When an image id appears in several
        annotations, the caption of the last one in file order is kept.

    Raises
    ------
    OSError
        If the file cannot be opened.
    KeyError
        If ``annotations``, ``image_id`` or ``caption`` is missing.
    """
    # JSON is UTF-8 by specification; do not depend on the platform's default
    # locale encoding (which is not UTF-8 on every system).
    with open(caption_file, 'r', encoding='utf-8') as f:
        captions_data = json.load(f)
    # Later annotations overwrite earlier ones for the same image_id.
    captions_dict = {ann['image_id']: ann['caption'] for ann in captions_data['annotations']}
    return captions_dict

# Per-split lookup: split name -> {image_id: caption}.
captions_dict = dict(
    (split_name, load_captions(annotation_path))
    for split_name, annotation_path in caption_files.items()
)

# Get captions using the caption json file of the coco dataset
def get_caption(row):
    """Return the caption for one trial record, or '' when none is found.

    ``row`` must provide the keys ``cocoSplit`` and ``cocoId``. The caption is
    looked up in the module-level ``captions_dict``; an unknown split or an
    unknown image id both yield the empty string.
    """
    split_name, image_id = row['cocoSplit'], row['cocoId']
    if split_name not in captions_dict:
        return ''
    return captions_dict[split_name].get(image_id, '')

# Look up one caption per trial, in row order. Each lookup is an in-memory
# dict access, so a thread pool only adds scheduling overhead (the GIL
# serialises the work anyway); a plain loop produces the same list.
captions = [
    get_caption(row)
    for row in tqdm(merged_df.to_dict('records'), total=merged_df.shape[0], desc="get captions")
]

# Surface trials that ended up without a caption instead of letting empty
# strings slip silently into the saved dataset.
n_missing_captions = captions.count('')
if n_missing_captions:
    print(f"Trials without a caption: {n_missing_captions}")

# Add captions column
merged_df['caption'] = captions

# Save the final nsd_dataset.csv
merged_df.to_csv(nsd_dataset_file, index=False)

get captions: 100%|██████████| 30000/30000 [00:00<00:00, 396167.45it/s]


Need the COCO image dataset

In [None]:
# Preview a few random trials: show the stimulus image stored in
# nsd_stimuli.hdf5 together with the trial's metadata and caption.
nsd_stimuli_file = './nsd/nsddata_stimuli/stimuli/nsd/nsd_stimuli.hdf5'
with h5py.File(nsd_stimuli_file, 'r') as f:
    imgBrick = f['imgBrick']

    # Four distinct row positions drawn from the merged table.
    selected_trials = random.sample(range(len(merged_df)), 4)

    for trial in selected_trials:
        row = merged_df.iloc[trial]

        # The image brick is indexed by the trial's 73KID value.
        img = Image.fromarray(imgBrick[row['73KID']])

        # One figure per trial, titled with the trial metadata and caption.
        fig, ax = plt.subplots()
        ax.imshow(img)
        ax.set_title(f"Subject: {row['SUBJECT']}, Session: {row['SESSION']}, Trial: {row['trial']}\nCaption: {row['caption']}")
        ax.axis('off')
        plt.show()

print(f"Final dataset saved to {nsd_dataset_file}")

# Divide the dataset

In [3]:

input_file = 'nsd_dataset.csv'
df = pd.read_csv(input_file)

# Number of leading rows that form the training split; every remaining row
# goes to validation. The split is positional (no shuffling).
N_TRAIN = 27000

# Extract the caption column and generate the id column
# (the id is simply the row position in nsd_dataset.csv).
captions = df['caption']
ids = range(len(captions))

# create new DataFrame
new_df = pd.DataFrame({'filepath': ids, 'title': captions})

# Split data into train and val
train_df = new_df.iloc[:N_TRAIN]
val_df = new_df.iloc[N_TRAIN:]

# A dataset with at most N_TRAIN rows would otherwise write an empty val.csv
# without any indication that something is off.
if val_df.empty:
    print(f"Warning: validation split is empty ({len(new_df)} rows, N_TRAIN={N_TRAIN})")

train_df.to_csv('train.csv', index=False)
val_df.to_csv('val.csv', index=False)

print("Data segmentation and saving completed")

Data segmentation and saving completed
