# Dataset Setup

This notebook demonstrates how to use this toolbox to analyze data from the HCP dataset.

In [None]:
import ast
import getpass
import glob
import os
import pathlib

import boto3
import numpy as np
import pandas as pd

Download the LANGUAGE task data from a sample of 30 subjects in the HCP dataset:

In [None]:
def list_s3_objects(s3_client, bucket_name, prefix):
    """Return the keys of every object in ``bucket_name`` under ``prefix``.

    A single ``list_objects_v2`` call returns at most 1000 keys, so the
    listing is driven through a paginator to collect all of them.

    Args:
        s3_client: boto3 S3 client (or any object exposing the same
            ``get_paginator`` interface).
        bucket_name: Name of the bucket to list.
        prefix: Key prefix the listing is restricted to.

    Returns:
        list[str]: Object keys in the order the service returns them;
        an empty list when nothing matches.
    """
    paginator = s3_client.get_paginator('list_objects_v2')
    keys = []
    for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
        # A page without matches has no 'Contents' entry at all.
        keys.extend(obj['Key'] for obj in page.get('Contents', []))
    return keys


def download_file(s3_client, bucket_name, s3_key, local_path):
    """Download one S3 object to ``local_path``, creating parent folders.

    Failures are reported on stdout instead of being raised, so a single
    missing object does not abort a bulk download.

    Args:
        s3_client: boto3 S3 client (or any object exposing ``download_file``).
        bucket_name: Name of the bucket holding the object.
        s3_key: Key of the object to fetch.
        local_path: Destination file path on the local filesystem.

    Returns:
        bool: True when the download call completed, False when it failed.
    """
    try:
        parent = os.path.dirname(local_path)
        # os.makedirs('') raises, so only create a folder when the path has one.
        if parent:
            os.makedirs(parent, exist_ok=True)
        s3_client.download_file(bucket_name, s3_key, local_path)
    except Exception as e:
        # Include the cause so access errors can be told apart from missing keys.
        print(f"Missing or failed: {s3_key} ({type(e).__name__}: {e})")
        return False
    return True


def download_selected(s3_client, subjects, task,
                      bucket_name='hcp-openaccess', local_root='data'):
    """Download both phase-encoding runs of one task for the given subjects.

    For every subject, the objects under
    ``HCP_1200/<subject>/MNINonLinear/Results/tfMRI_<task>_LR`` and ``..._RL``
    are listed and each one is saved to ``<local_root>/<S3 key>``, so the
    bucket's folder layout (including the leading ``HCP_1200/``) is mirrored
    locally.

    Args:
        s3_client: boto3 S3 client used for listing and downloading.
        subjects: Iterable of subject IDs.
        task: Task name substituted into the run folder names.
        bucket_name: Bucket to read from.
        local_root: Local folder under which the keys are mirrored.
    """
    run_templates = (
        "MNINonLinear/Results/tfMRI_{task}_LR",
        "MNINonLinear/Results/tfMRI_{task}_RL",
    )
    # The run folder names depend only on the task, so resolve them once.
    run_folders = [template.format(task=task) for template in run_templates]

    for subject in subjects:
        for run_folder in run_folders:
            s3_prefix = f"HCP_1200/{subject}/{run_folder}"
            for s3_key in list_s3_objects(s3_client, bucket_name, s3_prefix):
                download_file(s3_client, bucket_name, s3_key,
                              os.path.join(local_root, s3_key))


# Prompt for the credentials used to read the 'hcp-openaccess' bucket.
# The secret key is read with getpass so that it is not echoed into the
# notebook's saved output.
aws_access_key_id = input("Enter your AWS access key ID: ")
aws_secret_access_key = getpass.getpass("Enter your AWS secret access key: ")

s3_client = boto3.client(
    's3',
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key
)

# Sample of 30 subject IDs whose task runs are downloaded.
subjects = [
    "211417", "164030", "480141", "248238", "214221", "381038", "117021", "671855", 
    "352738", "180836", "677968", "200917", "715647", "107018", "937160", "349244", 
    "214625", "286347", "715041", "749058", "614439", "250932", "145834", "872158", 
    "164636", "932554", "118528", "737960", "187547", "110613"
]
task = 'LANGUAGE'

download_selected(s3_client, subjects, task)

The code below reorganizes the downloaded HCP data into BIDS format. **TODO:** test this step.

In [None]:
# The download step saves each object to 'data/' + <S3 key>, and every key
# starts with 'HCP_1200/', so the subject folders are found under
# data/HCP_1200/<subject>.
data_dir = os.path.join('data', 'HCP_1200')
bids_dir = 'bids'

subject_task_info = pd.read_csv('./HCP_summary/inclusion_results.csv')

# The 'tasks' column is parsed with ast.literal_eval; subjects whose parsed
# value is empty are left out.
subjs = [
    str(subject)
    for subject, tasks in zip(subject_task_info['subject'], subject_task_info['tasks'])
    if len(ast.literal_eval(tasks)) > 0
]

def get_events(ev_folder_path, task):
    """Build the events table of one run from its EV text files.

    Each ``<condition>.txt`` file in ``ev_folder_path`` is read as a
    tab-separated, header-less table with the columns onset, duration and
    amplitude, and the file stem becomes the ``trial_type``. Only the
    'story' and 'math' conditions are kept; all other files (including the
    Sync file) are skipped without being read.

    Args:
        ev_folder_path: Folder containing the EV ``.txt`` files.
        task: Task name. Currently unused; kept so existing calls still work.

    Returns:
        pandas.DataFrame: Columns ``trial_type``, ``onset``, ``duration``,
        sorted by onset with a fresh 0..n-1 index. Empty (but with those
        columns) when no matching, non-empty EV file is found.
    """
    kept_conditions = ('story', 'math')
    output_columns = ['trial_type', 'onset', 'duration']

    frames = []
    # Sorted so the result does not depend on filesystem listing order.
    for ev_file in sorted(glob.glob(os.path.join(ev_folder_path, '*.txt'))):
        condition = os.path.splitext(os.path.basename(ev_file))[0]
        if condition not in kept_conditions:
            continue
        try:
            ev_data = pd.read_csv(ev_file, sep="\t", header=None,
                                  names=["onset", "duration", "amplitude"])
        except pd.errors.EmptyDataError:
            # A zero-byte EV file means the condition has no events in this run.
            continue
        ev_data["trial_type"] = condition
        frames.append(ev_data)

    if not frames:
        return pd.DataFrame(columns=output_columns)

    # Concatenate once instead of growing a frame inside the loop.
    events_df = pd.concat(frames, ignore_index=True)
    events_df = events_df.sort_values("onset", kind="mergesort").reset_index(drop=True)
    return events_df[output_columns]

def _force_symlink(target, link_path):
    """Create ``link_path`` as a symlink to the absolute path of ``target``.

    A relative symlink target is resolved against the folder that contains
    the link, not against the working directory, so the target is made
    absolute first. A symlink left over from an earlier run is replaced so
    the cell can be re-executed; a regular file at ``link_path`` is left
    alone and still makes ``os.symlink`` raise ``FileExistsError``.
    """
    if os.path.islink(link_path):
        os.remove(link_path)
    os.symlink(os.path.abspath(target), link_path)


for sub in subjs:
    print(f"Processing {sub}")
    results_dir = os.path.join(data_dir, sub, 'MNINonLinear', 'Results')
    runs = os.listdir(results_dir)
    for run in [run for run in runs if 'tfMRI' in run]:
        # Run folders are named tfMRI_<task>_<suffix>.
        run_task = run.split('_')[1]
        run_suffix = run.split('_')[-1]
        folder = os.path.join(results_dir, run)

        tab_file = glob.glob(os.path.join(folder, 'LANGUAGE*_TAB.txt'))
        if len(tab_file) != 1:
            raise ValueError(f"Expected 1 tab file, found {len(tab_file)}")
        tab_file = tab_file[0]
        # The run index is the trailing 'run<N>' part of the TAB file label.
        label = os.path.basename(tab_file).split('_TAB')[0]
        run_i = label.split('_')[-1].replace('run', '')

        bids_folder = os.path.join(bids_dir, f'sub-{sub}', 'func')
        pathlib.Path(bids_folder).mkdir(parents=True, exist_ok=True)
        func_dir = bids_folder
        # Filename prefix shared by every output of this run.
        bids_prefix = f'sub-{sub}_task-{run_task}_run-{run_i}'

        # JSON file
        # NOTE(review): the link target is a sidecar at the top of bids_dir that
        # this cell does not create - confirm it is produced elsewhere.
        dest = os.path.join(bids_folder, f'{bids_prefix}_space-MNINonLinear_desc-preproc_bold.json')
        _force_symlink(os.path.join(bids_dir, f'task-{run_task}_acq-{run_suffix}_bold.json'), dest)

        # Confounds file
        tsv_no_header = os.path.join(folder, 'Movement_Regressors.txt')
        columns = [
            'trans_x', 'trans_y', 'trans_z', 'rot_x', 'rot_y', 'rot_z',
            'trans_dx', 'trans_dy', 'trans_dz', 'rot_dx', 'rot_dy', 'rot_dz']
        with open(tsv_no_header, 'r') as f:
            # Whitespace-separated floats, one row per line; blank lines are skipped.
            data = [[float(x) for x in line.split()] for line in f if line.strip()]
        tbl = pd.DataFrame(data, columns=columns)
        tbl.to_csv(os.path.join(func_dir, f'{bids_prefix}_desc-confounds_timeseries.tsv'), sep='\t', index=False)

        filename = run.split('_', 1)[1]

        # Brain mask file
        brain_mask_dest = os.path.join(func_dir, f'{bids_prefix}_space-MNINonLinear_desc-brain_mask.nii.gz')
        _force_symlink(os.path.join(folder, 'brainmask_fs.2.nii.gz'), brain_mask_dest)

        # Preprocessed BOLD file
        preproc_bold_dest = os.path.join(func_dir, f'{bids_prefix}_space-MNINonLinear_desc-preproc_bold.nii.gz')
        _force_symlink(os.path.join(folder, f'tfMRI_{filename}.nii.gz'), preproc_bold_dest)

        # Events file
        evs_folder = os.path.join(folder, 'EVs')
        events_df = get_events(evs_folder, run_task)
        events_df.to_csv(os.path.join(func_dir, f'{bids_prefix}_events.tsv'), sep='\t', index=False)

# First Level Modeling

# Generate Parcels for the Language Network

# Inspect Individual fROIs

# Analysis: Estimate Effect Sizes

# Analysis: Estimate Spatial Correlation Across Conditions

# Analysis: Estimate Overlap Between fROIs