# ADHD200 — Подготовка данных (Athena .1D → TSV/NPY + манифесты)

Функции этого ноутбука:
1) Сплющивание `Site/ID` → `sub-<lower(site)><ID>/func/`.
2) Конвертация `*.1D` → Z-score TSV + фиксированное окно NPY.
3) Поиск `*_bold.json` в BIDS и извлечение `RepetitionTime` (TR).
4) Валидация TR и длительности.
5) Агрегирование нескольких прогонов per-subject → `aggregate_manifest.csv`.


In [2]:
import os, re, json, shutil
import numpy as np, pandas as pd
from pathlib import Path
from IPython.display import display

In [8]:
# === Параметры подготовки ===

INPUT_ROOT = "./ADHD200_CC400_TCs_filtfix"    # структура Site/ID/*.1D
DEST_ROOT  = "./Athena_flat"                  # сюда ляжет sub-*/func/, TSV/NPY, манифесты

INPUT_ROOT = Path(INPUT_ROOT)
DEST_ROOT = Path(DEST_ROOT)

FLATTEN = True

PICK_VERSION = "filtered"    # 'filtered' | 'unfiltered' | 'any'  (Athena: 'sf*'~filtered, 'sn*'~unfiltered)
ZSCORE = True
WIN_TR = 120                 # длина окна на run; 0 — без обрезки/паддинга

BIDS_ROOT = "../RawDataBIDS"     # для *_bold.json и TR
COPY_BOLD_JSON = True

MIN_VOLS_TR = 60

WIN_TR_AGG = 240            # длина окна после агрегирования; 0 — без изменения
AGGREGATE_MODE = 'concat'   # 'concat' | 'mean'

## Вспомогательные функции


In [16]:
def slug_site(s: str) -> str:
    return re.sub(r"[^a-z0-9]+", "", s.lower())

def ensure_dir(p: Path):
    p.mkdir(parents=True, exist_ok=True)

def is_filtered_filename(name: str) -> bool:
    base = os.path.basename(name)
    return base.startswith("sf")

def is_unfiltered_filename(name: str) -> bool:
    base = os.path.basename(name)
    return base.startswith("sn")

def pick_1d_files(files, pick_version: str):
    if pick_version == "filtered":
        return [f for f in files if is_filtered_filename(f.name)]
    if pick_version == "unfiltered":
        return [f for f in files if is_unfiltered_filename(f.name)]
    return files

def flatten_site_id(input_root: str, dest_root: str) -> pd.DataFrame:
    src = Path(input_root)
    dst = Path(dest_root)
    ensure_dir(dst)
    
    rows = []
    for site_dir in sorted([p for p in src.iterdir() if p.is_dir()]):
        site = site_dir.name
        for id_dir in sorted([p for p in site_dir.iterdir() if p.is_dir()]):
            sid = id_dir.name
            sub_label = f"sub-{slug_site(site)}{re.sub(r'[^0-9a-zA-Z]+','',sid)}"
            sub_root = dst / sub_label; sub_func = sub_root / "func"; ensure_dir(sub_func)
            one_ds = sorted(list(id_dir.glob("*.1D")))
            linked = []
            for f in one_ds:
                target = sub_func / f.name
                if not target.exists():
                    try:
                        os.symlink(os.path.abspath(f), target)
                    except OSError:
                        shutil.copy2(f, target)
                linked.append(target)
            if linked:
                rows.append({"participant_id": sub_label, "site": site, "id": sid, "one_d_paths": [str(p) for p in linked]})
    return pd.DataFrame(rows)

def load_1d(path: Path) -> np.ndarray:
    try:
        X = np.loadtxt(str(path), comments=['#','@'])
    except Exception:
        X = pd.read_csv(path, sep=r"\s+", header=None, comment="#", engine="python").values
    if X.ndim == 1:
        X = X[:, None]
    return X

def zscore_cols(X: np.ndarray) -> np.ndarray:
    mu = X.mean(axis=0, keepdims=True)
    sd = X.std(axis=0, ddof=1, keepdims=True) + 1e-8
    return (X - mu) / sd

def window_pad(X: np.ndarray, win: int) -> np.ndarray:
    if win <= 0: return X
    T, R = X.shape
    if T >= win: return X[:win]
    Y = np.zeros((win, R), dtype=X.dtype); Y[:T] = X
    return Y

def find_bold_json(bids_root: str, participant_id: str):
    if not bids_root: return None
    sub = Path(bids_root) / participant_id / "func"
    if sub.exists():
        cand = sorted(sub.glob("*_bold.json"))
        return cand[0] if cand else None
    for p in Path(bids_root).rglob(f"{participant_id}/func/*_bold.json"):
        return p
    return None

def extract_tr_from_json(json_path: Path):
    try:
        data = json.loads(json_path.read_text())
        tr = data.get("RepetitionTime")
        return float(tr) if tr is not None else None
    except Exception:
        return None

def convert_subject(sub_row, pick_version: str, out_root: Path, bids_root: str, copy_json: bool,
                    zscore=True, win_tr=120):
    participant_id = sub_row["participant_id"]
    out_sub = out_root / participant_id / "func"; ensure_dir(out_sub)
    files = [Path(p) for p in sub_row["one_d_paths"]]
    files = pick_1d_files(files, pick_version)
    results = []
    tr, bold_json_dst = None, None
    json_path = find_bold_json(bids_root, participant_id) if bids_root else None
    if json_path and Path(json_path).exists():
        tr = extract_tr_from_json(Path(json_path))
        if copy_json:
            bold_json_dst = out_sub / Path(json_path).name
            if not bold_json_dst.exists(): shutil.copy2(str(json_path), str(bold_json_dst))
    for f in files:
        X = load_1d(f); T, R = int(X.shape[0]), int(X.shape[1])
        print(f"Processing {f} for {participant_id}: T={T}, R={R}, TR={tr}")
        # Убедимся, что значения в X числовые
        if not np.issubdtype(X.dtype, np.number):
            # Преобразуем в числовой тип, заменяя нечисловые значения на NaN
            X = pd.DataFrame(X).apply(pd.to_numeric, errors='coerce').values
            # Удалим 2 первых столбца, если они NaN
            if R > 2 and np.isnan(X[:, 0]).all() and np.isnan(X[:, 1]).all():
                X = X[:, 2:]
                R = X.shape[1]
            # Удалим строки с NaN
            X = X[~np.isnan(X).any(axis=1)]
            T = X.shape[0]
        # Пропускаем файлы с недостаточным количеством временных точек
        if tr and (MIN_VOLS_TR > 0) and (T * tr < MIN_VOLS_TR):
            print(f"Skipping {f} for {participant_id}: T={T}, TR={tr} => total time {T*tr:.1f}s < {MIN_VOLS_TR}s")
            continue
        # # Агрегируем по времени, если нужно
        # if WIN_TR_AGG > 0 and AGGREGATE_MODE == 'concat' and tr:
        #     target_T = int(WIN_TR_AGG / tr)
        #     if T > target_T:
        #         n_segments = T // target_T
        #         X = X[:n_segments * target_T].reshape(n_segments, target_T, R).mean(axis=0)
        #         T = X.shape[0]
        Xz = zscore_cols(X) if zscore else X
        Xw = window_pad(Xz, win_tr) if win_tr else Xz
        stem = f.stem
        tsv_path = out_sub / f"{stem}_znorm.tsv"
        npy_path = out_sub / (f"{stem}_znorm_{win_tr}tr.npy" if win_tr else f"{stem}_znorm.npy")
        pd.DataFrame(Xz, columns=[f"roi_{i+1}" for i in range(R)]).to_csv(tsv_path, sep="\t", index=False)
        np.save(npy_path, Xw)
        results.append({
            "participant_id": participant_id,
            "file": str(f),
            "tsv_path": str(tsv_path),
            "npy_path": str(npy_path),
            "T": T, "R": R,
            "filter_version": ("filtered" if is_filtered_filename(f.name) else ("unfiltered" if is_unfiltered_filename(f.name) else "unknown")),
            "TR": tr,
            "bold_json": str(bold_json_dst) if bold_json_dst else (str(json_path) if json_path else "")
        })
    return results


## 1) Сплющивание и сбор списка файлов


In [None]:
# Удаляем старые данные
shutil.rmtree(DEST_ROOT)
# Создаем корневую директорию для выходных данных
DEST_ROOT.mkdir(parents=True)

if FLATTEN:
    subs_df = flatten_site_id(str(INPUT_ROOT), str(DEST_ROOT))
else:
    rows = []
    for sub in sorted(DEST_ROOT.glob("sub-*/func")):
        one_ds = sorted(sub.glob("*.1D"))
        if one_ds:
            rows.append({"participant_id": sub.parent.name, "site": "", "id": "", "one_d_paths": [str(p) for p in one_ds]})
    subs_df = pd.DataFrame(rows)
print("Найдено субъектов:", len(subs_df))
display(subs_df.head())


Найдено субъектов: 776


Unnamed: 0,participant_id,site,id,one_d_paths
0,sub-kki1018959,KKI,1018959,[Athena_flat/sub-kki1018959/func/sfnwmrda10189...
1,sub-kki1019436,KKI,1019436,[Athena_flat/sub-kki1019436/func/sfnwmrda10194...
2,sub-kki1043241,KKI,1043241,[Athena_flat/sub-kki1043241/func/sfnwmrda10432...
3,sub-kki1266183,KKI,1266183,[Athena_flat/sub-kki1266183/func/sfnwmrda12661...
4,sub-kki1535233,KKI,1535233,[Athena_flat/sub-kki1535233/func/sfnwmrda15352...


In [52]:
# Преобразуем один тестовый файл

# --- IGNORE ---
# Создаем корневую директорию для выходных данных
test_dest = DEST_ROOT / "test_subject"
shutil.rmtree(test_dest, ignore_errors=True)
ensure_dir(test_dest)

test_row = subs_df.iloc[0]
test_results = convert_subject(
    test_row,
    PICK_VERSION,
    test_dest,
    BIDS_ROOT,
    COPY_BOLD_JSON,
    zscore=ZSCORE,
    win_tr=WIN_TR
)
# --- IGNORE ---
print("Тестовое преобразование одного субъекта:")
display(pd.DataFrame(test_results))

Processing Athena_flat/sub-kki1018959/func/sfnwmrda1018959_session_1_rest_1_cc400_TCs.1D for sub-kki1018959: T=149, R=353, TR=None
Тестовое преобразование одного субъекта:


Unnamed: 0,participant_id,file,tsv_path,npy_path,T,R,filter_version,TR,bold_json
0,sub-kki1018959,Athena_flat/sub-kki1018959/func/sfnwmrda101895...,Athena_flat/test_subject/sub-kki1018959/func/s...,Athena_flat/test_subject/sub-kki1018959/func/s...,148,351,filtered,,


## 2) Конвертация .1D → NPY/TSV и сборка manifest.csv


In [53]:
out_root = DEST_ROOT if isinstance(DEST_ROOT, Path) else Path(str(DEST_ROOT))
print("Вывод в:", out_root)

# Сбор конвертаций
all_rows = []
for _, row in subs_df.iterrows():
    rows = convert_subject(
        row,
        PICK_VERSION,
        out_root,
        BIDS_ROOT,
        COPY_BOLD_JSON,
        zscore=ZSCORE,
        win_tr=WIN_TR
    )
    all_rows.extend(rows)

# Формирование и сохранение manifest
manifest = pd.DataFrame(all_rows)
man_path = out_root / "manifest.csv"
manifest.to_csv(man_path, index=False)
print("Сохранён manifest:", man_path)
display(manifest.head())
print("Всего рядов:", len(manifest))


Вывод в: Athena_flat
Processing Athena_flat/sub-kki1018959/func/sfnwmrda1018959_session_1_rest_1_cc400_TCs.1D for sub-kki1018959: T=149, R=353, TR=None
Processing Athena_flat/sub-kki1019436/func/sfnwmrda1019436_session_1_rest_1_cc400_TCs.1D for sub-kki1019436: T=149, R=353, TR=None
Processing Athena_flat/sub-kki1043241/func/sfnwmrda1043241_session_1_rest_1_cc400_TCs.1D for sub-kki1043241: T=149, R=353, TR=None
Processing Athena_flat/sub-kki1266183/func/sfnwmrda1266183_session_1_rest_1_cc400_TCs.1D for sub-kki1266183: T=121, R=353, TR=None
Processing Athena_flat/sub-kki1535233/func/sfnwmrda1535233_session_1_rest_1_cc400_TCs.1D for sub-kki1535233: T=149, R=353, TR=None
Processing Athena_flat/sub-kki1541812/func/sfnwmrda1541812_session_1_rest_1_cc400_TCs.1D for sub-kki1541812: T=121, R=353, TR=None
Processing Athena_flat/sub-kki1577042/func/sfnwmrda1577042_session_1_rest_1_cc400_TCs.1D for sub-kki1577042: T=121, R=353, TR=None
Processing Athena_flat/sub-kki1594156/func/sfnwmrda1594156_ses

Unnamed: 0,participant_id,file,tsv_path,npy_path,T,R,filter_version,TR,bold_json
0,sub-kki1018959,Athena_flat/sub-kki1018959/func/sfnwmrda101895...,Athena_flat/sub-kki1018959/func/sfnwmrda101895...,Athena_flat/sub-kki1018959/func/sfnwmrda101895...,148,351,filtered,,
1,sub-kki1019436,Athena_flat/sub-kki1019436/func/sfnwmrda101943...,Athena_flat/sub-kki1019436/func/sfnwmrda101943...,Athena_flat/sub-kki1019436/func/sfnwmrda101943...,148,351,filtered,,
2,sub-kki1043241,Athena_flat/sub-kki1043241/func/sfnwmrda104324...,Athena_flat/sub-kki1043241/func/sfnwmrda104324...,Athena_flat/sub-kki1043241/func/sfnwmrda104324...,148,351,filtered,,
3,sub-kki1266183,Athena_flat/sub-kki1266183/func/sfnwmrda126618...,Athena_flat/sub-kki1266183/func/sfnwmrda126618...,Athena_flat/sub-kki1266183/func/sfnwmrda126618...,120,351,filtered,,
4,sub-kki1535233,Athena_flat/sub-kki1535233/func/sfnwmrda153523...,Athena_flat/sub-kki1535233/func/sfnwmrda153523...,Athena_flat/sub-kki1535233/func/sfnwmrda153523...,148,351,filtered,,


Всего рядов: 1198


## 2.1 Валидация TR и длительностей


In [54]:
manifest = pd.read_csv(DEST_ROOT/"manifest.csv")
manifest['has_TR'] = manifest['TR'].notna()
manifest['duration_sec'] = np.where(manifest['has_TR'], manifest['T']*manifest['TR'], np.nan)
manifest['too_short'] = manifest['T'] < MIN_VOLS_TR
display(manifest[['participant_id','file','T','R','TR','duration_sec','filter_version','too_short']].head(20))
print('TR coverage:', manifest['has_TR'].mean())
print('Коротких:', manifest['too_short'].sum(), 'из', len(manifest))
manifest.to_csv(DEST_ROOT/"manifest.csv", index=False)


Unnamed: 0,participant_id,file,T,R,TR,duration_sec,filter_version,too_short
0,sub-kki1018959,Athena_flat/sub-kki1018959/func/sfnwmrda101895...,148,351,,,filtered,False
1,sub-kki1019436,Athena_flat/sub-kki1019436/func/sfnwmrda101943...,148,351,,,filtered,False
2,sub-kki1043241,Athena_flat/sub-kki1043241/func/sfnwmrda104324...,148,351,,,filtered,False
3,sub-kki1266183,Athena_flat/sub-kki1266183/func/sfnwmrda126618...,120,351,,,filtered,False
4,sub-kki1535233,Athena_flat/sub-kki1535233/func/sfnwmrda153523...,148,351,,,filtered,False
5,sub-kki1541812,Athena_flat/sub-kki1541812/func/sfnwmrda154181...,120,351,,,filtered,False
6,sub-kki1577042,Athena_flat/sub-kki1577042/func/sfnwmrda157704...,120,351,,,filtered,False
7,sub-kki1594156,Athena_flat/sub-kki1594156/func/sfnwmrda159415...,120,351,,,filtered,False
8,sub-kki1623716,Athena_flat/sub-kki1623716/func/sfnwmrda162371...,120,351,,,filtered,False
9,sub-kki1638334,Athena_flat/sub-kki1638334/func/sfnwmrda163833...,120,351,,,filtered,False


TR coverage: 0.0
Коротких: 1 из 1198


## 2.2 Агрегирование прогонов (per-subject)

Если у субъекта несколько прогонов, агрегируем их в один файл NPY и TSV, а также обновляем манифест.

In [55]:
man = pd.read_csv(DEST_ROOT/"manifest.csv")
groups = man.groupby('participant_id')
agg_rows = []; agg_dir = DEST_ROOT/"aggregated"; agg_dir.mkdir(parents=True, exist_ok=True)

def pad_to(X, Tfinal):
    T,R = X.shape
    if T >= Tfinal: return X[:Tfinal]
    Y = np.zeros((Tfinal,R), dtype=X.dtype); Y[:T]=X; return Y

for pid, df in groups:
    npys = df['npy_path'].tolist()
    if not npys: continue
    mats = [np.load(p) for p in npys]
    if AGGREGATE_MODE=='concat':
        cat = np.concatenate(mats, axis=0)
        Tfinal = WIN_TR_AGG if WIN_TR_AGG>0 else cat.shape[0]
        Xagg = pad_to(cat, Tfinal)
    elif AGGREGATE_MODE=='mean':
        Tmax = max(m.shape[0] for m in mats)
        mats_eq = [pad_to(m, Tmax) for m in mats]
        Xagg = np.mean(mats_eq, axis=0)
        if WIN_TR_AGG>0: Xagg = pad_to(Xagg, WIN_TR_AGG)
    else:
        Xagg = mats[0]
        if WIN_TR_AGG>0: Xagg = pad_to(Xagg, WIN_TR_AGG)
    out_path = agg_dir / f"{pid}_agg_{AGGREGATE_MODE}_{WIN_TR_AGG if WIN_TR_AGG>0 else 'var'}tr.npy"
    np.save(out_path, Xagg)
    TRs = df['TR'].dropna().values
    tr_val = float(np.mean(TRs)) if TRs.size>0 else np.nan
    agg_rows.append({"participant_id": pid, "n_runs": len(npys), "R": int(Xagg.shape[1]), "TR": tr_val, "npy_path": str(out_path)})

agg_manifest = pd.DataFrame(agg_rows)
agg_manifest.to_csv(DEST_ROOT/"aggregate_manifest.csv", index=False)
print('Сохранён aggregate_manifest:', DEST_ROOT/"aggregate_manifest.csv")
display(agg_manifest.head())


Сохранён aggregate_manifest: Athena_flat/aggregate_manifest.csv


Unnamed: 0,participant_id,n_runs,R,TR,npy_path
0,sub-kki1018959,1,351,,Athena_flat/aggregated/sub-kki1018959_agg_conc...
1,sub-kki1019436,1,351,,Athena_flat/aggregated/sub-kki1019436_agg_conc...
2,sub-kki1043241,1,351,,Athena_flat/aggregated/sub-kki1043241_agg_conc...
3,sub-kki1266183,1,351,,Athena_flat/aggregated/sub-kki1266183_agg_conc...
4,sub-kki1535233,1,351,,Athena_flat/aggregated/sub-kki1535233_agg_conc...


## 2.3 Фильтрация испытуемых на основании .csv списка

Здесь мы можем отфильтровать испытуемых на основании внешнего списка по возрастным когортам.

In [34]:
DESIRED_TRIALS_LIST_FILE_TEENS = "./cohort_teen_participants.txt"
FILTERED_DEST_ROOT_TEENS = "./Athena_flat_filtered/cohort_teen_participants"

DESIRED_TRIALS_LIST_FILE_ADULTS = "./cohort_adult_participants.txt"
FILTERED_DEST_ROOT_ADULTS = "./Athena_flat_filtered/cohort_adult_participants"

# Создаем директорию для отфильтрованных данных подростков
shutil.rmtree(FILTERED_DEST_ROOT_TEENS, ignore_errors=True)
os.makedirs(FILTERED_DEST_ROOT_TEENS, exist_ok=True)

# Читаем желаемый список субъектов из .txt файла построчно
with open(DESIRED_TRIALS_LIST_FILE_TEENS, 'r') as f:
    desired_teen_ids = set(line.strip() for line in f if line.strip())
# Добавим префикс 'sub-' к каждому идентификатору, если его нет
desired_teen_ids = set(pid if pid.startswith('sub-') else f'sub-{pid}' for pid in desired_teen_ids)
print("Желаемых субъектов (teen participants):", len(desired_teen_ids))

# Создаем директорию для отфильтрованных данных взрослых
shutil.rmtree(FILTERED_DEST_ROOT_ADULTS, ignore_errors=True)
os.makedirs(FILTERED_DEST_ROOT_ADULTS, exist_ok=True)

# Читаем желаемый список субъектов из .txt файла построчно
with open(DESIRED_TRIALS_LIST_FILE_ADULTS, 'r') as f:
    desired_adult_ids = set(line.strip() for line in f if line.strip())
# Добавим префикс 'sub-' к каждому идентификатору, если его нет
desired_adult_ids = set(pid if pid.startswith('sub-') else f'sub-{pid}' for pid in desired_adult_ids)
print("Желаемых субъектов (adult participants):", len(desired_adult_ids))

Желаемых субъектов (teen participants): 274
Желаемых субъектов (adult participants): 117


In [35]:
original_manifest = pd.read_csv(DEST_ROOT/"manifest.csv")

# Скопируем соответствующие данные подростков по списку
filtered_manifest_teen = original_manifest[original_manifest['participant_id'].isin(desired_teen_ids)]
filtered_manifest_teen.to_csv(Path(FILTERED_DEST_ROOT_TEENS) / "manifest.csv", index=False)
print("Сохранён отфильтрованный manifest для подростков:", Path(FILTERED_DEST_ROOT_TEENS) / "manifest.csv")

# Скопируем соответствующие данные взрослых по списку
filtered_manifest_adult = original_manifest[original_manifest['participant_id'].isin(desired_adult_ids)]
filtered_manifest_adult.to_csv(Path(FILTERED_DEST_ROOT_ADULTS) / "manifest.csv", index=False)
print("Сохранён отфильтрованный manifest для взрослых:", Path(FILTERED_DEST_ROOT_ADULTS) / "manifest.csv")

Сохранён отфильтрованный manifest для подростков: Athena_flat_filtered/cohort_teen_participants/manifest.csv
Сохранён отфильтрованный manifest для взрослых: Athena_flat_filtered/cohort_adult_participants/manifest.csv


In [36]:
# скопируем файлы .npy для подростков
for _, row in filtered_manifest_teen.iterrows():
    npy_src = Path(row['npy_path'])
    participant_id = row['participant_id']
    out_sub = Path(FILTERED_DEST_ROOT_TEENS) / participant_id
    ensure_dir(out_sub)
    npy_dst = out_sub / npy_src.name
    if not npy_dst.exists():
        shutil.copy2(npy_src, npy_dst)

In [37]:
# Скопируем файлы .npy для взрослых
for _, row in filtered_manifest_adult.iterrows():
    npy_src = Path(row['npy_path'])
    participant_id = row['participant_id']
    out_sub = Path(FILTERED_DEST_ROOT_ADULTS) / participant_id
    ensure_dir(out_sub)
    npy_dst = out_sub / npy_src.name
    if not npy_dst.exists():
        shutil.copy2(npy_src, npy_dst)