In [1]:
import os
import shutil
import zipfile
import bidsme
import datetime
import pandas as pd
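# NOTE: reorganize() calls s2s.get_instance_number() and s2s.get_seqid();
# re-enable the import below before running the reorganization cells on a fresh kernel.
# from GluckLab.utils import subjectid_to_seqid as s2s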
from multiprocessing import Pool

In [4]:
# Resources directory (holds the bidsmap and the exercise subject ID table read below)
resources_dir = "resources"

# Data directory (every pipeline stage lives in a sub-directory of it)
data_dir = "data"

# Stage directory names.
# Change paths/names to your liking.
# For paths use os.path.join("path", "to", "your", "directory").
# This will generate a path like path/to/your/directory in the appropriate format for your OS.
compressed_dir = os.path.join(data_dir, "compressed")
extract_dir = os.path.join(data_dir, "extracted")
reorganize_dir = os.path.join(data_dir, "reorganized")
prepared_dir = os.path.join(data_dir, "prepared")
bidsified_dir = os.path.join(data_dir, "bidsified")

# bidsmap path
bidsmap_path = os.path.join(resources_dir, "bidsmap.yaml")

# Logs: one CSV per pipeline stage
logs_dir = "logs"
extraction_log_name = "extraction_log.csv"
extraction_log_path = os.path.join(logs_dir, extraction_log_name)

reorganization_log_name = "reorganization_log.csv"
reorganization_log_path = os.path.join(logs_dir, reorganization_log_name)

bidsify_bval_bvec_log_name = "bidsify_bval_bvec_log.csv"
bidsify_bval_bvec_log_path = os.path.join(logs_dir, bidsify_bval_bvec_log_name)

# Multiprocessing
# Number of worker processes used by the Pool-based cells: 4 per CPU core.
# os.cpu_count() returns None when the core count cannot be determined,
# so fall back to a single core instead of failing on `None * 4`.
number_of_subprocesses = (os.cpu_count() or 1) * 4


# Exercise Subject IDs
# Rows without an "alias_exercise" value are dropped; the remaining rows are
# grouped per alias and the first()/last() entry of each group is kept.
# NOTE(review): presumably rows are ordered so that first()/last() correspond to
# the first/second exercise session -- confirm against the CSV.
exercise_subjectids_path = os.path.join(resources_dir, "exercise_subjectids.csv")
exercise_subjectids = pd.read_csv(exercise_subjectids_path).dropna(subset=["alias_exercise"], how="any")
exercise_first_session = exercise_subjectids.groupby("alias_exercise").first().reset_index()
exercise_second_session = exercise_subjectids.groupby("alias_exercise").last().reset_index()

In [5]:
# Create every output directory up front (compressed_dir is the input and is not created here).
# The loop variable is deliberately not called `dir`, which would shadow the builtin
# for the rest of the kernel session.
dirs_to_create = [extract_dir, reorganize_dir, prepared_dir, bidsified_dir, logs_dir]
for dir_path in dirs_to_create:
    os.makedirs(dir_path, exist_ok=True)

In [6]:
# Compressed NIfTI archives found in compressed_dir.
# os.listdir() returns entries in arbitrary order; sorting keeps the processing
# order (and therefore the extraction log) reproducible between runs.
zipfiles = sorted(os.listdir(compressed_dir))
zipfiles_paths = [os.path.join(compressed_dir, zip_name) for zip_name in zipfiles]

In [7]:
def extract(filepath: str, extract_dir: str) -> dict:
    """
    Extracts a zip file (`filepath`) to `extract_dir` and returns a log entry specifying the success or failure of the extraction.

    Any exception raised while opening or extracting the archive is caught and
    reported in the log entry instead of being propagated.

    Args:
        - filepath: path to a zip file.
        - extract_dir: directory to extract the zip file to.

    Returns:
        - log_entry: dictionary with the keys "filename", "extraction_date",
          "extraction_time", "extraction_status" ("success" or "fail") and
          "error" (None on success, the error message as a string on failure).

    """
    filename = os.path.basename(filepath)

    # Timestamp is taken when the extraction starts, not when it finishes.
    now = datetime.datetime.now()
    log_entry = {
        "filename": filename,
        "extraction_date": now.date(),
        "extraction_time": now.time(),
    }

    try:
        with zipfile.ZipFile(filepath, "r") as zip_ref:
            zip_ref.extractall(extract_dir)
        log_entry["extraction_status"] = "success"
        log_entry["error"] = None

    except Exception as error:
        log_entry["extraction_status"] = "fail"
        # Store the message rather than the exception object: the entry is sent
        # back from a Pool worker (so it must be picklable) and ends up in a CSV.
        log_entry["error"] = str(error)

    return log_entry

In [8]:
prev_extraction_log = None
extraction_log = []
files_to_extract = []

if os.path.isfile(extraction_log_path):
    # A log from an earlier run exists: load it and skip every archive it lists.
    # NOTE(review): archives logged as "fail" are listed too, so they are not
    # retried on later runs -- confirm this is intended.
    prev_extraction_log = pd.read_csv(extraction_log_path)
    extracted_files = set(prev_extraction_log["filename"])
    files_to_extract = [
        zip_path
        for zip_path in zipfiles_paths
        if os.path.basename(zip_path) not in extracted_files
    ]
else:
    # First run: every archive has to be extracted
    files_to_extract = zipfiles_paths

# Extract the archives in parallel; each worker returns one log entry
with Pool(number_of_subprocesses) as p:
    extract_args = [(zip_path, extract_dir) for zip_path in files_to_extract]
    extraction_log = p.starmap(extract, extract_args)

# Append the new entries to the previous log when there is one
extraction_log = (
    pd.DataFrame(extraction_log)
    if prev_extraction_log is None
    else pd.concat([prev_extraction_log, pd.DataFrame(extraction_log)], ignore_index=True)
)

extraction_log.to_csv(extraction_log_path, index=False)
extraction_log

Unnamed: 0,filename,extraction_date,extraction_time,extraction_status,error
0,AAL_039.zip,2025-01-13,17:01:10.819065,fail,Bad magic number for file header
1,COV249.zip,2025-01-13,17:01:10.819158,success,
2,COV101.zip,2025-01-13,17:01:10.819191,fail,File is not a zip file
3,AA_248.zip,2025-01-13,17:01:10.819274,fail,Bad magic number for file header
4,AA_R319.zip,2025-01-13,17:01:10.819368,success,
...,...,...,...,...,...
426,AA_R261_L.zip,2025-01-13,17:05:46.067285,success,
427,AA_287.zip,2025-01-13,17:05:47.037325,success,
428,AA_R461_0.zip,2025-01-13,17:05:48.750397,success,
429,AA_R345.zip,2025-01-13,17:05:48.782784,success,


In [9]:
def dir_to_subjectid(dirname: str):
    '''
    Takes a dirname and attempts to translate it to a Subject ID as best as possible.

    The name is upper-cased and a series of known naming mistakes is corrected.
    Names starting with "EX" are looked up in the module-level
    `exercise_first_session` / `exercise_second_session` tables; when no row
    matches, the (partially cleaned) name is returned as is.

    Args:
        - dirname: name of an extracted subject directory.

    Returns:
        - the translated Subject ID.
    '''
    # Subject ID must be the name of its directory, with all characters capitalized
    subjectid = dirname.upper()

    # Delete "_O", "_Q", "_FMRI" and/or "FMRI" wherever they appear in the Subject ID
    for fragment in ("_O", "_Q", "_FMRI", "FMRI"):
        subjectid = subjectid.replace(fragment, "")

    # Correct common naming mistake COVR### instead of COV_R###
    subjectid = subjectid.replace("COVR", "COV_R")

    # Drop a trailing "_0" only. Removing every "_0" would also eat the leading
    # zero of IDs such as AA_085_0 (-> AA85) and mess up IDs like AA_0##_L.
    if subjectid.endswith("_0"):
        subjectid = subjectid[:-len("_0")]

    # Correct invalid subject IDs in the form AA_###, without affecting valid Subject IDs
    # in the forms AA_R### or AA_###_L. The length check keeps a bare "AA_" from
    # raising IndexError on subjectid[3].
    if (
        subjectid.startswith("AA_")
        and len(subjectid) > 3
        and subjectid[3] != "R"
        and subjectid[-1] != "L"
    ):
        subjectid = subjectid.replace("_", "")

    # Strip a stray leading double underscore
    if subjectid.startswith(("__COV", "__A")):
        subjectid = subjectid[2:]

    # Exercise ID translation
    if subjectid.startswith("EX"):
        if "_" in subjectid:
            # Dirname is like EX###_[SUBJECTID], but the subjectid might be misspelled,
            # so take it from the table instead
            subjectid = subjectid.split("_")[0]
            lookup = exercise_second_session
        else:
            # Dirname is just EX###
            lookup = exercise_first_session

        row = lookup.loc[lookup["alias_exercise"] == subjectid, "subjectid"]
        if not row.empty:
            subjectid = row.values[0]

    return subjectid
    

def reorganize(dirpath: str, reorganize_dir: str) -> dict:
    '''
    Reorganizes a single extracted subject directory from a structure like:

    <extract_dir>
    ├── <directory named after the SubjectID>
    |   ├── example-1.json
    |   └── example-1.nii.gz

    To a structure like:

    <reorganize_dir>
    ├── sub-<SeqID>
    |   ├── ses-0<session number>
    |   |   ├── example-1.json
    |   |   └── example-1.nii.gz

    The SubjectID is derived from the directory name with `dir_to_subjectid`;
    the SeqID and session number come from `s2s`.

    Args:
        - dirpath: path to subject directory to reorganize.
        - reorganize_dir: directory where to put the reorganized subject directory.

    Returns:
        - log_entry: dictionary specifying the success or failure of the reorganization.
          "error" is None on success and an error message (string) on failure.
    '''
    dirname = os.path.basename(dirpath)
    subjectid = dir_to_subjectid(dirname)

    # NOTE(review): `s2s` is expected to be available at module level; its import
    # is commented out at the top of the notebook and must be re-enabled.
    session = s2s.get_instance_number(subjectid)
    seqid = s2s.get_seqid(subjectid)

    now = datetime.datetime.now()
    log_entry = {
        "directory": dirname,
        "subjectid": subjectid,
        "seqid": seqid,
        "session": session,
        "reorganization_date": now.date(),
        "reorganization_time": now.time(),
    }

    if seqid is None or session is None:
        log_entry["reorganization_status"] = "fail"
        log_entry["error"] = "Directory name translated to an invalid SubjectID"
        return log_entry

    try:
        # Path to target reorganized directory <reorganize_dir>/sub-<seqid>/ses-0<session number>
        target_dir = os.path.join(reorganize_dir, f"sub-{seqid}", f"ses-0{session}")

        # Make sure the sub-<seqid> parent exists so the move does not depend on
        # shutil.move's copy-and-delete fallback. exist_ok covers several sessions
        # of the same subject being handled by different workers.
        os.makedirs(os.path.dirname(target_dir), exist_ok=True)

        # Move the whole extracted directory to the reorganized location.
        # NOTE(review): if target_dir already exists, shutil.move places the source
        # directory *inside* it rather than merging -- confirm that two source
        # directories can never map to the same subject/session.
        shutil.move(dirpath, target_dir)

        log_entry["reorganization_status"] = "success"
        log_entry["error"] = None

    except Exception as error:
        log_entry["reorganization_status"] = "fail"
        # Keep the message only: the entry travels back from a Pool worker and into a CSV
        log_entry["error"] = str(error)

    return log_entry


In [10]:
prev_reorganization_log = None
reorganization_log = []
dirs_to_reorganize = []
subjects = []

if not os.path.isfile(reorganization_log_path):

    # reorganize all the directories extracted
    dirs_to_reorganize = [os.path.join(extract_dir, dirname) for dirname in os.listdir(extract_dir)]
else:

    # read the previous log
    prev_reorganization_log = pd.read_csv(reorganization_log_path)

    # get the directories that have been handled already (successfully or not)
    reorganized_files = set(prev_reorganization_log["directory"])
    # reorganize only the directories that are not in the previous log
    dirs_to_reorganize = [
        os.path.join(extract_dir, dirname)
        for dirname in os.listdir(extract_dir)
        if dirname not in reorganized_files
    ]

# Never treat the extraction log as a subject directory. Each path is compared
# individually; comparing the whole list against the log path is always unequal
# and filters nothing.
dirs_to_reorganize = [path for path in dirs_to_reorganize if path != extraction_log_path]

# Reorganize the directories in parallel; each worker returns one log entry
with Pool(number_of_subprocesses) as p:
    reorganize_args = [(path, reorganize_dir) for path in dirs_to_reorganize]
    reorganization_log = p.starmap(reorganize, reorganize_args)

# Append the new entries to the previous log when there is one
if isinstance(prev_reorganization_log, pd.DataFrame):
    reorganization_log = pd.concat([prev_reorganization_log, pd.DataFrame(reorganization_log)], ignore_index=True)
else:
    reorganization_log = pd.DataFrame(reorganization_log)

reorganization_log.to_csv(reorganization_log_path, index=False)
reorganization_log

Unnamed: 0,directory,subjectid,seqid,session,reorganization_date,reorganization_time,reorganization_status,error
0,EX022,AA_3R317_L,A317,4.0,2025-01-13,17:09:38.517573,success,
1,COV210,COV210,C210,1.0,2025-01-13,17:09:38.515645,success,
2,AA445,AA445,A445,1.0,2025-01-13,17:09:38.515394,success,
3,Cov144,COV144,C144,1.0,2025-01-13,17:09:38.515963,success,
4,AA416,AA416,A416,1.0,2025-01-13,17:09:38.515820,success,
...,...,...,...,...,...,...,...,...
441,AA_413_Q,AA413,A413,1.0,2025-01-13,17:11:33.865116,success,
442,AA_R461_0,AA_R461,A461,2.0,2025-01-13,17:11:33.868260,success,
443,Cov094,COV094,C094,1.0,2025-01-13,17:11:33.868334,success,
444,AA_255,AA255,A255,1.0,2025-01-13,17:11:33.868410,success,


In [11]:
# Initialise bidsme and keep the object it returns
logger = bidsme.init()

# Make sure the destination exists, then run bidsme's prepare step
# from reorganize_dir into prepared_dir
os.makedirs(prepared_dir, exist_ok=True)
bidsme.prepare(reorganize_dir, prepared_dir)

[34mmain(81)[0m - [1;30mINFO[0m 
[34mmain(82)[0m - [1;30mINFO[0m -------------- START bidsme ----------------
[34mmain(83)[0m - [1;30mINFO[0m Mon Jan 13 17:15:36 2025
[34mmain(84)[0m - [1;30mINFO[0m version: 1.8.1
[34mbidsme.schema.BIDSschema(670)[0m - [1;30mINFO[0m Loaded BIDS schema version 1.10.0
[34mbidsme.prepare(192)[0m - [1;30mINFO[0m -------------- Prepearing data -------------
[34mbidsme.prepare(193)[0m - [1;30mINFO[0m Source directory: reorganized
[34mbidsme.prepare(194)[0m - [1;30mINFO[0m Destination directory: prepared
[34mbidsme.bidsMeta.BidsTable(141)[0m - [1;30mINFO[0m Created empty participants.tsv table
[34mbidsme.prepare(295)[0m - [1;30mINFO[0m Scanning folder reorganized/sub-A041/ses-01
[34mbidsme.prepare(58)[0m - [1;30mINFO[0m Processing: sub 'sub-A041', ses 'ses-01' (13 files)
[34mbidsme.prepare(295)[0m - [1;30mINFO[0m Scanning folder reorganized/sub-A044/ses-03
[34mbidsme.prepare(58)[0m - [1;30mINFO[0m Processing

In [12]:
# Run bidsme's bidsify step with prepared_dir as source and bidsified_dir as
# destination, passing the bidsmap file configured in bidsmap_path.
bidsme.bidsify(prepared_dir, bidsified_dir, bidsmapfile=bidsmap_path)

[34mbidsme.bidsify(188)[0m - [1;30mINFO[0m -------------- Prepearing data -------------
[34mbidsme.bidsify(189)[0m - [1;30mINFO[0m Source directory: prepared
[34mbidsme.bidsify(190)[0m - [1;30mINFO[0m Destination directory: bidsified
[34mbidsme.bidsify(233)[0m - [1;30mINFO[0m loading bidsmap bidsified/code/bidsme/bidsmap.yaml
[34mbidsme.bidsMeta.BidsTable(134)[0m - [1;30mINFO[0m Loaded participants.tsv table with 242 entries
[34mbidsme.bidsMeta.BidsTable(141)[0m - [1;30mINFO[0m Created empty participants.tsv table
[34mbidsme.bidsify(344)[0m - [1;30mINFO[0m sub-A041 (1/242): Scanning folder prepared/sub-A041/ses-01
[34mbidsme.bidsify(70)[0m - [1;30mINFO[0m Processing: sub 'sub-A041', ses 'ses-01', 001-localizer/0 (3 files)
[34mbidsme.bidsify(97)[0m - [1;30mINFO[0m 001-localizer/0: ignored modality
[34mbidsme.bidsify(97)[0m - [1;30mINFO[0m 001-localizer/1: ignored modality
[34mbidsme.bidsify(97)[0m - [1;30mINFO[0m 001-localizer/2: ignored modal

In [13]:
def bidsify_bval_bvec(subject, session, reorganize_dir, bidsified_dir):

    reorganized_session_path = os.path.join(reorganize_dir, subject, session)

    for file in os.listdir(reorganized_session_path):
        rename = None
        if file.endswith("bval") or file.endswith("bvec"):
            if file.endswith("AP.bval"):
                rename = f"{subject}_{session}_dir-AP_dwi.bval"
            elif file.endswith("PA.bval"):
                rename = f"{subject}_{session}_dir-PA_dwi.bval"
            elif file.endswith("AP.bvec"):
                rename = f"{subject}_{session}_dir-AP_dwi.bvec"
            elif file.endswith("PA.bvec"):
                rename = f"{subject}_{session}_dir-PA_dwi.bvec"
            elif file.endswith("iso.bval"):
                rename = f"{subject}_{session}_acq-iso_dwi.bval"
            elif file.endswith("iso.bvec"):
                rename = f"{subject}_{session}_acq-iso_dwi.bvec"
            elif file.endswith("isoa.bval"):
                rename = f"{subject}_{session}_acq-iso_dwi.bval"
            elif file.endswith("isoa.bvec"):
                rename = f"{subject}_{session}_acq-iso_dwi.bvec"

            source_path = os.path.join(reorganized_session_path, file)
            destination_path = os.path.join(bidsified_dir, subject, session, "dwi", rename)
            now = datetime.datetime.now()
            log_entry = {"subject": subject, 
                        "session": session, 
                        "filename": file,
                        "rename": rename,
                        "source_path": source_path,
                        "destination_path": destination_path,
                        "bidsification_date": now.date(),
                        "bidsification_time": now.time(),
                        }
            
            try:
                shutil.move(source_path, destination_path)
                log_entry["status"] = "success"
                log_entry["error"] = None

            except Exception as error:
                log_entry["status"] = "fail"
                log_entry["error"] = error

            return log_entry


In [14]:
bidsify_bval_bvec_args = []
for dirpath, dirnames, filenames in os.walk(bidsified_dir):
    if dirpath.endswith("dwi"):
        split_path = dirpath.split("/")
        session = split_path[-2]
        subject = split_path[-3]
        bidsify_bval_bvec_args.append((subject, session, reorganize_dir, bidsified_dir))


bidsify_bval_bvec_log = []
with Pool(number_of_subprocesses) as p:
    bidsify_bval_bvec_log = p.starmap(bidsify_bval_bvec, bidsify_bval_bvec_args)

bidsify_bval_bvec_log = pd.DataFrame(bidsify_bval_bvec_log)
bidsify_bval_bvec_log

Unnamed: 0,subject,session,filename,rename,source_path,destination_path,bidsification_date,bidsification_time,status,error
0,sub-C278,ses-01,EX045_COV278_11_AXMB_DTI_PA.bval,sub-C278_ses-01_dir-PA_dwi.bval,reorganized/sub-C278/ses-01/EX045_COV278_11_AX...,bidsified/sub-C278/ses-01/dwi/sub-C278_ses-01_...,2025-01-13,17:37:38.961158,success,
1,sub-C248,ses-01,COV248_8_AXMB_DTI_AP.bvec,sub-C248_ses-01_dir-AP_dwi.bvec,reorganized/sub-C248/ses-01/COV248_8_AXMB_DTI_...,bidsified/sub-C248/ses-01/dwi/sub-C248_ses-01_...,2025-01-13,17:37:38.961249,success,
2,sub-C327,ses-01,COV327_11_AXMB_DTI_PA.bval,sub-C327_ses-01_dir-PA_dwi.bval,reorganized/sub-C327/ses-01/COV327_11_AXMB_DTI...,bidsified/sub-C327/ses-01/dwi/sub-C327_ses-01_...,2025-01-13,17:37:38.961349,success,
3,sub-A085,ses-02,AA_R085_10_ep2d_diff_30_iso.bval,sub-A085_ses-02_acq-iso_dwi.bval,reorganized/sub-A085/ses-02/AA_R085_10_ep2d_di...,bidsified/sub-A085/ses-02/dwi/sub-A085_ses-02_...,2025-01-13,17:37:38.961453,success,
4,sub-A085,ses-01,AA_085_10_ep2d_diff_30_iso.bvec,sub-A085_ses-01_acq-iso_dwi.bvec,reorganized/sub-A085/ses-01/AA_085_10_ep2d_dif...,bidsified/sub-A085/ses-01/dwi/sub-A085_ses-01_...,2025-01-13,17:37:38.961568,success,
...,...,...,...,...,...,...,...,...,...,...
244,sub-C177,ses-01,COV177_8_AXMB_DTI_AP.bvec,sub-C177_ses-01_dir-AP_dwi.bvec,reorganized/sub-C177/ses-01/COV177_8_AXMB_DTI_...,bidsified/sub-C177/ses-01/dwi/sub-C177_ses-01_...,2025-01-13,17:37:38.973740,success,
245,sub-C117,ses-01,Cov117_8_ep2d_diff_30_iso.bval,sub-C117_ses-01_acq-iso_dwi.bval,reorganized/sub-C117/ses-01/Cov117_8_ep2d_diff...,bidsified/sub-C117/ses-01/dwi/sub-C117_ses-01_...,2025-01-13,17:37:38.973960,success,
246,sub-A382,ses-01,AA_382_Q_10_ep2d_diff_30_iso.bval,sub-A382_ses-01_acq-iso_dwi.bval,reorganized/sub-A382/ses-01/AA_382_Q_10_ep2d_d...,bidsified/sub-A382/ses-01/dwi/sub-A382_ses-01_...,2025-01-13,17:37:38.973826,success,
247,sub-C213,ses-01,COV213_8_AXMB_DTI_AP.bval,sub-C213_ses-01_dir-AP_dwi.bval,reorganized/sub-C213/ses-01/COV213_8_AXMB_DTI_...,bidsified/sub-C213/ses-01/dwi/sub-C213_ses-01_...,2025-01-13,17:37:38.973851,success,


In [15]:
# Write the bval/bvec move log to bidsify_bval_bvec_log_path as CSV, without the index column.
bidsify_bval_bvec_log.to_csv(bidsify_bval_bvec_log_path, index=False)