# ADHD-200 • Athena-filtered preparation notebook (enhanced)

Готовит **Athena (filtered) time-series** к обучению:
• per-run z-score, выравнивание до `T_FIX` (окна/паддинг+маска)
• `manifest_windows.csv` / `aggregate_manifest.csv`
• (опц.) dFC: оконные FC → KMeans → агрегаты
• валидатор
• авто-детект CC200/CC400
• фильтр по возрастным когортам (12–15, 16–21)
• экспорт `.tsv.gz` рядом с `.npy`

In [None]:
import shutil
import re
import os
from pathlib import Path
from io import StringIO
import json
import yaml

from typing import List, Tuple

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

# Отключим предупреждения для чистоты логирования
import warnings
warnings.filterwarnings('ignore')

In [None]:
T_FIX = 200  # unify with parameter cell
WINDOWS_PER_RUN = 4
SAVE_WINDOW_TSV = False
DO_DFC_FEATURES = False
DO_DFC_FEATURES = True
K_STATES = 4
SEED = 42

# ВХОД:
ATHENA_ROOT = 'ADHD200_CC400_TCs_filtfix'                 # корень с .1D
BIDS_BOLD_JSON_ROOT = '../SortedRawDataBIDS'              # если есть BIDS с *_bold.json (для TR)
TR_MAP_CSV = None                                         # csv: participant_id,TR (fallback)
PHENO_CSV_TEEN = '../cohort_teen_participants.csv'        # csv фенотипов подростков, см. PHENO_AGE_COL
PHENO_CSV_ADULT = '../cohort_adult_participants.csv'      # csv фенотипов взрослых, см. PHENO_AGE_COL
PHENO_AGE_COL = 'Age'
OUT_ROOT = './Athena_prepared_filtered'                   # директория, куда сохранять результаты (будет создана, если отсутствует)
APPLY_AGE_FILTER = True
AGE_COHORTS = [(12,15.99),(16,21)]

CLEAN_PREVIOUS = True

# Текущий каталог с .ipynb:
CURRENT_DIR = './'

SITE_MAPPING = {
    # unify site naming if needed
    'KKI': 'kki', 'NYU': 'nyu', 'OHSU': 'ohsu', 'Peking_1': 'peking1', 'Peking_2': 'peking2', 'Peking_3': 'peking3',
    'Pittsburgh': 'pittsburgh', 'Peking': 'peking', 'NeuroIMAGE': 'neuroimage', 'Brown': 'brown', 'WashU': 'washu'
}

GLOBAL_NUMERIC_PAD = None  # will be set after scanning Athena root

KNOWN_SITE_NAMES = set(v for v in SITE_MAPPING.values())

In [None]:
# === NEW DEFINITIONS: build_run_table + GLOBAL_NUMERIC_PAD ===

np.random.seed(SEED)

_run_file_re = re.compile(r"session_(\d+)_rest_(\d+)_cc(\d+)_TCs\.1D$", re.IGNORECASE)
_atlas_re = re.compile(r"cc(200|400)", re.IGNORECASE)

def _extract_run_id(fname: str) -> str:
    m = _run_file_re.search(fname)
    if not m:
        # fallback: just return 'ses1_rest1'
        return 'ses1_rest1'
    ses, rest, atlas_num = m.groups()
    return f"ses{ses}_rest{rest}"

def _infer_atlas(fname: str) -> str:
    m = _atlas_re.search(fname)
    if m:
        num = m.group(1)
        return f"CC{num}"
    return 'UNKNOWN'

def build_run_table(athena_root: str | Path,
                    bids_root: str | Path | None = None,
                    tr_map_csv: str | Path | None = None,
                    pheno_csv: str | Path | None = None,
                    apply_age_filter: bool = False,
                    age_cohorts=None,
                    age_col: str = 'Age') -> pd.DataFrame:
    """Scan Athena filtered time-series tree and build a run table.
    Expected layout: ATHENA_ROOT/<Site>/<NumericID>/...*_TCs.1D
    Returns columns:
      participant_id, pid_raw, run_id, site, site_original, atlas, TR, path_1D, numeric_id, numeric_id_padded
    """
    athena_root = Path(athena_root)
    if not athena_root.exists():
        raise FileNotFoundError(f"Athena root not found: {athena_root}")
    rows = []
    numeric_lengths = []
    for path in athena_root.rglob("*_TCs.1D"):
        try:
            site_original = path.parts[-3]  # e.g. WashU
            numeric_id = path.parts[-2]
        except IndexError:
            continue
        site = SITE_MAPPING.get(site_original, site_original.lower())
        # sanitize numeric id: keep digits only for padding logic
        numeric_digits = ''.join(ch for ch in numeric_id if ch.isdigit()) or numeric_id
        numeric_lengths.append(len(numeric_digits))
        run_id = _extract_run_id(path.name)
        atlas = _infer_atlas(path.name)
        participant_id = f"sub-{site}{numeric_digits}"  # BIDS-like synthetic id
        rows.append({
            'participant_id': participant_id,
            'pid_raw': numeric_id,
            'run_id': run_id,
            'site': site,
            'site_original': site_original,
            'atlas': atlas,
            'TR': np.nan,  # TR population deferred (could be filled from BIDS JSON later)
            'path_1D': str(path),
            'numeric_id': numeric_digits,
        })
    if not rows:
        return pd.DataFrame(columns=['participant_id','pid_raw','run_id','site','site_original','atlas','TR','path_1D','numeric_id','numeric_id_padded'])
    # determine global pad
    pad_len = max(numeric_lengths) if numeric_lengths else 0
    global GLOBAL_NUMERIC_PAD
    GLOBAL_NUMERIC_PAD = pad_len
    for r in rows:
        r['numeric_id_padded'] = r['numeric_id'].zfill(pad_len)
    df = pd.DataFrame(rows)
    # optional age filter could be added later using pheno_csv
    return df

print('[INIT] build_run_table ready; GLOBAL_NUMERIC_PAD will be set on first call.')

[INIT] build_run_table ready; GLOBAL_NUMERIC_PAD will be set on first call.


In [98]:
# Utility functions (updated zscore_time, choose_windows, upper_tri_fc for NaN robustness)
def participant_dir_name(site: str, pid_bids: str, site_original: str) -> str:
    return f"sub-{pid_bids}_{site}" if site_original == site else f"sub-{pid_bids}_{site_original}-{site}"  # example logic


def zscore_time(ts: np.ndarray) -> np.ndarray:
    """NaN-robust z-score across time for each ROI column.
    Steps:
    1. Compute nanmean/nanstd per column.
    2. Replace NaN means with 0; std <1e-8 or NaN -> 1.
    3. Fill NaNs in original time-series with column mean.
    4. Standardize; replace any residual NaN/Inf with 0.
    Returns float32 array of same shape.
    """
    ts = np.asarray(ts)
    if ts.ndim != 2:
        ts = np.atleast_2d(ts)
    m = np.nanmean(ts, axis=0)
    s = np.nanstd(ts, axis=0)
    m = np.where(np.isnan(m), 0.0, m)
    s = np.where(np.isnan(s) | (s < 1e-8), 1.0, s)
    ts_filled = ts.copy()
    nan_mask = np.isnan(ts_filled)
    if nan_mask.any():
        cols = np.where(nan_mask)[1]
        ts_filled[nan_mask] = m[cols]
    z = (ts_filled - m) / s
    z[~np.isfinite(z)] = 0.0
    return z.astype(np.float32)


def choose_windows(ts: np.ndarray, window_len: int, windows_per_run: int = 4) -> Tuple[List[np.ndarray], List[np.ndarray], List[Tuple[int, int]]]:
    """Select fixed-length windows uniformly across the time series.
    Produces:
      - windows: list of (T, R) arrays
      - masks: list of (T,) float arrays indicating finite-data ratio at each time point (1.0 == all ROIs finite)
      - segments: list of (start, end) indices
    """
    T = ts.shape[0]
    if T < window_len:
        pad_rows = window_len - T
        col_mean = np.nanmean(ts, axis=0)
        col_mean = np.where(np.isnan(col_mean), 0.0, col_mean)
        pad_block = np.repeat(col_mean[None, :], pad_rows, axis=0)
        ts_padded = np.concatenate([ts, pad_block], axis=0)
        finite_ratio = np.isfinite(ts_padded).sum(axis=1) / ts_padded.shape[1]
        return [ts_padded], [finite_ratio.astype(np.float32)], [(0, window_len)]
    starts = np.linspace(0, T - window_len, num=windows_per_run, dtype=int)
    windows, masks, segments = [] , [], []
    for s in starts:
        e = s + window_len
        seg_ts = ts[s:e]
        if seg_ts.shape[0] < window_len:
            pad_rows = window_len - seg_ts.shape[0]
            col_mean = np.nanmean(ts, axis=0)
            col_mean = np.where(np.isnan(col_mean), 0.0, col_mean)
            pad_block = np.repeat(col_mean[None, :], pad_rows, axis=0)
            seg_ts = np.concatenate([seg_ts, pad_block], axis=0)
        finite_ratio_seg = np.isfinite(seg_ts).sum(axis=1) / seg_ts.shape[1]
        windows.append(seg_ts)
        masks.append(finite_ratio_seg.astype(np.float32))
        segments.append((int(s), int(e)))
    return windows, masks, segments


def upper_tri_fc(window_ts: np.ndarray) -> np.ndarray:
    """Compute flattened upper-triangular correlation (Fisher z) matrix from window time-series.
    NaN-robust: columns with near-zero variance produce zeros; any NaNs become 0.
    Returns 1D float32 array.
    """
    X = np.asarray(window_ts)
    if X.ndim != 2:
        X = np.atleast_2d(X)
    m = np.nanmean(X, axis=0)
    s = np.nanstd(X, axis=0)
    s = np.where((s < 1e-8) | np.isnan(s), 1.0, s)
    Xc = X - m
    Xz = Xc / s
    Xz[~np.isfinite(Xz)] = 0.0
    corr = np.corrcoef(Xz, rowvar=False)
    corr[~np.isfinite(corr)] = 0.0
    with np.errstate(invalid='ignore'):
        zcorr = np.arctanh(np.clip(corr, -0.999999, 0.999999))
    zcorr[~np.isfinite(zcorr)] = 0.0
    R = zcorr.shape[0]
    iu = np.triu_indices(R, k=1)
    vec = zcorr[iu]
    return vec.astype(np.float32)


## Подготовка окон (+ экспорт TSV)

In [99]:
def participant_dir_name(site: str, pid: str, site_original: str | None = None) -> str:
    """Return output directory name using BIDS participant_id, ensure no legacy 'sup-' prefix.
    pid must be of form sub-<site><zero_padded_numeric>.
    """
    return pid

In [100]:
# Функция подготовки (фильтрованная версия) с BIDS participant_id (fixed imports + missing path logging)

def load_athena_1D(path: Path) -> np.ndarray:
    numeric_lines = []
    with open(path, 'r') as f:
        for line in f:
            s = line.strip()
            if not s: continue
            if s[0] not in '0123456789.-Nn':  # allow lines starting with N for NaN tokens
                continue
            s_clean = s.replace(',', ' ')
            toks = s_clean.split()
            ok = True
            for t in toks:
                try: float(t)
                except Exception: ok=False; break
            if not ok: continue
            numeric_lines.append(s_clean)
    if not numeric_lines:
        try:
            df = pd.read_csv(path, header=None, sep=r"\s+|,", engine='python', comment='#')
            arr = df.astype(float).values
        except Exception:
            arr = np.genfromtxt(path, comments=None)
    else:
        buf = StringIO('\n'.join(numeric_lines))
        arr = np.loadtxt(buf)
    if arr.size == 0:
        return np.zeros((0,1), dtype=np.float64)
    if arr.ndim == 1:
        arr = arr[:, None]
    col_mean = np.nanmean(arr, axis=0)
    col_mean = np.where(np.isnan(col_mean), 0.0, col_mean)
    nan_mask = np.isnan(arr)
    if nan_mask.any():
        arr[nan_mask] = np.take(col_mean, np.where(nan_mask)[1])
    return arr.astype(np.float64)


def athena_prepare_runs_filtered(run_table: pd.DataFrame, out_root: Path | str, debug: bool = True):
    out_root = Path(out_root)
    out_root.mkdir(parents=True, exist_ok=True)
    manif_rows = []
    agg_rows = []
    missing_paths = []  # collect missing or failed loads
    unique_atlases = sorted(run_table['atlas'].unique()) if 'atlas' in run_table.columns else []
    if debug:
        print(f"[INFO] atlases in run_table: {unique_atlases}")
        print(f"[INFO] rows to process: {len(run_table)}")
    for ridx, row in run_table.iterrows():
        pid_bids = str(row['participant_id'])
        pid_raw = str(row.get('pid_raw', ''))
        site = str(row['site']) if 'site' in row else 'unknown'
        site_original = row.get('site_original', site)
        run_id = str(row['run_id'])
        atlas = row.get('atlas', 'UNKNOWN')
        tr_val = row.get('TR', np.nan)
        path_1D = Path(row['path_1D'])
        if not path_1D.exists():
            missing_paths.append(str(path_1D))
            if debug: print(f"[SKIP] missing file: {path_1D}")
            continue
        pdir = out_root / participant_dir_name(site, pid_bids, site_original)
        pdir.mkdir(parents=True, exist_ok=True)
        try:
            ts = load_athena_1D(path_1D)
        except Exception as e:
            missing_paths.append(str(path_1D))
            if debug: print(f"[SKIP] cannot load {path_1D}: {e}")
            continue
        if ts.ndim == 1: ts = ts[:, None]
        nan_ratio = float(np.isnan(ts).mean()) if ts.size else 0.0
        if debug:
            print(f"[ROW {ridx}] pid_bids={pid_bids} raw={pid_raw} run_id={run_id} ts_shape={ts.shape} nan_ratio={nan_ratio:.3f}")
        if ts.shape[0] == 0:
            continue
        ts_z = zscore_time(ts)
        wins, masks, segments = choose_windows(ts_z, T_FIX, windows_per_run=WINDOWS_PER_RUN)
        if debug: print(f"    windows={len(wins)} segments={segments}")
        for w_i, (w_arr, m_arr, seg) in enumerate(zip(wins, masks, segments)):
            if np.isnan(w_arr).any():
                w_arr = np.nan_to_num(w_arr, nan=0.0, posinf=0.0, neginf=0.0)
            base = f"{pid_bids}_{run_id}_w{w_i}_{atlas}".replace('__', '_')
            npy_path = pdir / f"{base}.npy"
            mask_path = pdir / f"{base}_mask.npy"
            tsv_path = pdir / f"{base}.tsv.gz" if SAVE_WINDOW_TSV else ''
            np.save(npy_path, w_arr.astype(np.float32))
            np.save(mask_path, m_arr.astype(np.float32))
            if SAVE_WINDOW_TSV:
                try: pd.DataFrame(w_arr).to_csv(tsv_path, sep='\t', index=False, compression='gzip')
                except Exception as e:
                    if debug: print(f"[WARN] TSV save failed for {tsv_path}: {e}")
                    tsv_path = ''
            if debug:
                print(f"        win {w_i}: npy={npy_path.exists()} mask={mask_path.exists()} has_nan={np.isnan(w_arr).any()}")
            manif_rows.append({
                'participant_id': pid_bids,
                'pid_raw': pid_raw,
                'site': site,
                'site_original': site_original,
                'run_id': run_id,
                'win_index': w_i,
                'segment_start': seg[0],
                'segment_end': seg[1],
                'npy_path': str(npy_path),
                'mask_path': str(mask_path),
                'tsv_path': str(tsv_path),
                'atlas': atlas,
                'TR': tr_val,
                'nan_ratio_run': nan_ratio
            })
            if DO_DFC_FEATURES:
                fc_vec = upper_tri_fc(w_arr)
                if fc_vec.size > 0:
                    try:
                        km = KMeans(n_clusters=min(K_STATES, 1), random_state=SEED)
                        km.fit(fc_vec.reshape(-1, 1))
                        state_label = int(km.labels_[0])
                    except Exception: state_label = -1
                else: state_label = -1
                agg_rows.append({
                    'participant_id': pid_bids,
                    'pid_raw': pid_raw,
                    'site': site,
                    'site_original': site_original,
                    'run_id': run_id,
                    'win_index': w_i,
                    'state_label': state_label,
                    'fc_dim': int(fc_vec.size),
                    'atlas': atlas,
                    'TR': tr_val
                })
    man_df = pd.DataFrame(manif_rows)
    man_csv = out_root / 'manifest_windows.csv'
    man_df.to_csv(man_csv, index=False)
    if debug: print(f"[WRITE] {man_csv} ({len(man_df)} rows)")
    if missing_paths:
        miss_file = out_root / 'missing_paths.txt'
        miss_file.write_text('\n'.join(missing_paths))
        print(f"[MISS] {len(missing_paths)} paths logged -> {miss_file}")
    if DO_DFC_FEATURES:
        agg_df = pd.DataFrame(agg_rows)
        agg_csv = out_root / 'aggregate_manifest.csv'
        agg_df.to_csv(agg_csv, index=False)
        if debug: print(f"[WRITE] {agg_csv} ({len(agg_df)} rows)")
    else:
        if debug: print("[INFO] dFC features disabled")
    return man_df

## Валидация

In [101]:
def validate_prepared(out_root):
    out=Path(out_root); man=out/'manifest_windows.csv'
    if not man.exists(): print('[ERROR] нет manifest_windows.csv в', out); return
    df=pd.read_csv(man); problems=0
    missing_npy=df[~df['npy_path'].astype(str).apply(lambda p: Path(p).exists())]
    missing_msk=df[~df['mask_path'].astype(str).apply(lambda p: Path(p).exists())]
    if len(missing_npy): problems+=len(missing_npy); print('[WARN] отсутствуют npy:', len(missing_npy))
    if len(missing_msk): problems+=len(missing_msk); print('[WARN] отсутствуют mask:', len(missing_msk))
    if 'tsv_path' in df.columns:
        # ensure all paths are strings
        tsv_series = df['tsv_path'].fillna('').astype(str)
        missing_tsv = df[(tsv_series != '') & (~tsv_series.apply(lambda p: Path(p).exists()))]
        if len(missing_tsv): problems+=len(missing_tsv); print('[WARN] отсутствуют tsv:', len(missing_tsv))
    sample=None
    for p in df['npy_path'].head(5).astype(str):
        if Path(p).exists():
            arr=np.load(p); sample=arr.shape
            if np.isnan(arr).any():
                problems+=1; print(f'[WARN] NaN обнаружен в {p}')
            if arr.shape[0]!=T_FIX: print(f'[WARN] {p}: T != T_FIX ({arr.shape[0]} != {T_FIX})')
    if sample: print('Пример формы ряда:', sample)
    print('\nСводка по сайтам:')
    print(df.groupby('site')['participant_id'].nunique().sort_values(ascending=False))
    print('\nСводка по окнам на участника:')
    print(df.groupby('participant_id')['win_index'].count().describe())
    dups=df[df.duplicated(['participant_id','run_id','win_index'], keep=False)]
    if len(dups): problems+=len(dups); print('[WARN] повторяющиеся окна:', len(dups))
    print('OK' if problems==0 else f'Завершено с предупреждениями: {problems}')

# NEW: verification helper
import random

def quick_window_integrity_check(manifest_csv: str, n=10):
    if not Path(manifest_csv).exists():
        print('[CHECK] manifest отсутствует:', manifest_csv); return
    df = pd.read_csv(manifest_csv)
    if len(df)==0:
        print('[CHECK] пустой manifest'); return
    picks = df.sample(min(n, len(df)), random_state=SEED)
    issues = 0
    for _, row in picks.iterrows():
        p = Path(str(row['npy_path']))
        if not p.exists():
            print('[MISS]', p); issues += 1; continue
        arr = np.load(p)
        if np.isnan(arr).any():
            print('[NaN]', p); issues += 1
        if not np.isfinite(arr).all():
            print('[Inf]', p); issues += 1
        if arr.shape[0] != T_FIX:
            print('[LEN]', p, arr.shape)
            issues += 1
    print(f'[CHECK] завершено; проблемных примеров: {issues}')

## Подготовка когорт с новым именованием каталогов

In [None]:
# CLEAN PREVIOUS OUTPUTS (optional): remove old cohort dirs with mixed naming to regenerate clean structure

if CLEAN_PREVIOUS:
    out_root = Path(OUT_ROOT)
    for cohort_dir in ['cohort_teen_participants', 'cohort_adult_participants']:
        target = out_root / cohort_dir
        if target.exists():
            print(f'[CLEAN] removing {target}')
            shutil.rmtree(target)
        # also remove diag if exists
        diag = out_root / f'{cohort_dir}_diag'
        if diag.exists():
            print(f'[CLEAN] removing {diag}')
            shutil.rmtree(diag)
else:
    print('[CLEAN] Skipped removal; set CLEAN_PREVIOUS=True to purge.')

[CLEAN] removing Athena_prepared_filtered/cohort_teen_participants


[CLEAN] removing Athena_prepared_filtered/cohort_adult_participants


In [103]:
# Убедимся, что путь к корню BIDS верный:
print(BIDS_BOLD_JSON_ROOT)

# Напечатаем все корневые каталоги:
root = Path(BIDS_BOLD_JSON_ROOT)
# Каталоги с возрастными когортыми:
catalogs = [f"cohort_{age_range}_participants" for age_range in ['teen', 'adult']]
for cat in catalogs:
    cat_path = root / cat
    if not cat_path.exists():
        print(f"[WARN] Каталог не найден: {cat_path}")
        
print("Корневые каталоги:")
for item in catalogs:
    print(f"- {BIDS_BOLD_JSON_ROOT}/{item}")

# Сколько вообще TR встречается?
trs = []
for catalog in catalogs:
    cat_path = root / catalog
    if not cat_path.exists():
        continue
    for j in cat_path.rglob("sub-*/ses-*/func/*_bold.json"):
        try:
            d = json.loads(j.read_text())
            if 'RepetitionTime' in d:
                trs.append(float(d['RepetitionTime']))
        except: 
            pass


print("Найдено JSON:", len(trs), "уникальных TR:", np.unique(trs))

../SortedRawDataBIDS
Корневые каталоги:
- ../SortedRawDataBIDS/cohort_teen_participants
- ../SortedRawDataBIDS/cohort_adult_participants
Найдено JSON: 523 уникальных TR: [1.5  1.96 2.   2.5  3.  ]


In [104]:
COHORTS = {
    'cohort_teen_participants': PHENO_CSV_TEEN,
    'cohort_adult_participants': PHENO_CSV_ADULT,
}

# Извлечём допустимые NUMERIC id (только цифры) из фенотипов,
# затем фильтруем run_table по колонке numeric_id_padded (стабильная, с глобальным нулевым дополнением)
run_tables = {}
for cohort_name, pheno_path in COHORTS.items():
    if not pheno_path or not Path(pheno_path).exists():
        print(f'[SKIP] cohort {cohort_name}: файл фенотипов отсутствует -> {pheno_path}')
        continue
    pheno_df = pd.read_csv(pheno_path, delimiter='\t' if pheno_path.endswith('.tsv') else ',')
    if 'participant_id' not in pheno_df.columns:
        alt_cols = [c for c in pheno_df.columns if c.lower() in ('participant_id','subject','sub','id','scandir id')]
        if alt_cols:
            pheno_df = pheno_df.rename(columns={alt_cols[0]: 'participant_id'})
        else:
            raise ValueError(f'Нет participant_id в {pheno_path}')

    # Допустимые NUMERIC ids из фенотипов (убираем всё, кроме цифр)
    def only_digits(x):
        s = str(x)
        d = ''.join(ch for ch in s if ch.isdigit())
        return d
    allowed_numeric = [only_digits(x) for x in pheno_df['participant_id']]
    allowed_numeric = [d for d in allowed_numeric if len(d) > 0]

    # Собираем run_table из файлов Athena и вычисляем глобальную длину дополнения
    rt_all = build_run_table(
        ATHENA_ROOT,
        bids_root=BIDS_BOLD_JSON_ROOT,
        tr_map_csv=TR_MAP_CSV,
        pheno_csv=None,
        apply_age_filter=False,
        age_cohorts=AGE_COHORTS,
        age_col=PHENO_AGE_COL
    )

    pad_len = GLOBAL_NUMERIC_PAD if GLOBAL_NUMERIC_PAD is not None else (max(len(n) for n in allowed_numeric) if allowed_numeric else 0)
    allowed_numeric_padded = set(n.zfill(pad_len) for n in allowed_numeric)

    # Фильтруем по numeric_id_padded: это устраняет проблемы префиксов (sfnwmrda) и ведущих нулей
    rt_filtered = rt_all[rt_all['numeric_id_padded'].astype(str).isin(allowed_numeric_padded)].copy()

    # Дедупликация: на случай, если один и тот же запуск попал дважды (например, из разных путей)
    rt_filtered = rt_filtered.sort_values('path_1D')
    rt_filtered = rt_filtered.drop_duplicates(['participant_id','run_id','atlas'], keep='first').reset_index(drop=True)

    print(f'[INFO] cohort {cohort_name}: после NUMERIC-фильтра осталось {len(rt_filtered)} запусков из {len(rt_all)}')

    # Проверим формат нового participant_id
    bad_fmt = rt_filtered[~rt_filtered['participant_id'].str.match(r'^sub-[a-z]+\d+$')]
    if len(bad_fmt):
        print(f'[WARN] {len(bad_fmt)} строк имеют нестандартный participant_id формат')
    run_tables[cohort_name] = rt_filtered
    display(rt_filtered.head(5))

[INFO] cohort cohort_teen_participants: после NUMERIC-фильтра осталось 333 запусков из 2404


Unnamed: 0,participant_id,pid_raw,run_id,site,site_original,atlas,TR,path_1D,numeric_id,numeric_id_padded
0,sub-kki1018959,1018959,ses1_rest1,kki,KKI,CC400,,ADHD200_CC400_TCs_filtfix/KKI/1018959/sfnwmrda...,1018959,1018959
1,sub-kki1019436,1019436,ses1_rest1,kki,KKI,CC400,,ADHD200_CC400_TCs_filtfix/KKI/1019436/sfnwmrda...,1019436,1019436
2,sub-kki1594156,1594156,ses1_rest1,kki,KKI,CC400,,ADHD200_CC400_TCs_filtfix/KKI/1594156/sfnwmrda...,1594156,1594156
3,sub-kki1623716,1623716,ses1_rest1,kki,KKI,CC400,,ADHD200_CC400_TCs_filtfix/KKI/1623716/sfnwmrda...,1623716,1623716
4,sub-kki2026113,2026113,ses1_rest1,kki,KKI,CC400,,ADHD200_CC400_TCs_filtfix/KKI/2026113/sfnwmrda...,2026113,2026113


[INFO] cohort cohort_adult_participants: после NUMERIC-фильтра осталось 127 запусков из 2404


Unnamed: 0,participant_id,pid_raw,run_id,site,site_original,atlas,TR,path_1D,numeric_id,numeric_id_padded
0,sub-nyu0010050,10050,ses1_rest1,nyu,NYU,CC400,,ADHD200_CC400_TCs_filtfix/NYU/0010050/sfnwmrda...,10050,10050
1,sub-nyu0010050,10050,ses1_rest2,nyu,NYU,CC400,,ADHD200_CC400_TCs_filtfix/NYU/0010050/sfnwmrda...,10050,10050
2,sub-nyu0010052,10052,ses1_rest1,nyu,NYU,CC400,,ADHD200_CC400_TCs_filtfix/NYU/0010052/sfnwmrda...,10052,10052
3,sub-nyu0010052,10052,ses1_rest2,nyu,NYU,CC400,,ADHD200_CC400_TCs_filtfix/NYU/0010052/sfnwmrda...,10052,10052
4,sub-nyu0010054,10054,ses1_rest1,nyu,NYU,CC400,,ADHD200_CC400_TCs_filtfix/NYU/0010054/sfnwmrda...,10054,10054


In [105]:
# Запуск подготовки по когортам с BIDS participant_id
for cohort_name, rt in run_tables.items():
    cohort_out = Path(OUT_ROOT) / cohort_name
    print(f'\n[PROCESS] cohort {cohort_name} -> {cohort_out}')
    athena_prepare_runs_filtered(rt, cohort_out, debug=False)
    validate_prepared(cohort_out)

print('\n[SUMMARY] cohorts prepared:', ', '.join(run_tables.keys()))


[PROCESS] cohort cohort_teen_participants -> Athena_prepared_filtered/cohort_teen_participants


  col_mean = np.nanmean(arr, axis=0)
  c /= stddev[:, None]
  c /= stddev[None, :]
  col_mean = np.nanmean(arr, axis=0)
  c /= stddev[:, None]
  c /= stddev[None, :]
  col_mean = np.nanmean(arr, axis=0)
  c /= stddev[:, None]
  c /= stddev[None, :]
  col_mean = np.nanmean(arr, axis=0)
  c /= stddev[:, None]
  c /= stddev[None, :]
  col_mean = np.nanmean(arr, axis=0)
  c /= stddev[:, None]
  c /= stddev[None, :]
  col_mean = np.nanmean(arr, axis=0)
  c /= stddev[:, None]
  c /= stddev[None, :]
  col_mean = np.nanmean(arr, axis=0)
  c /= stddev[:, None]
  c /= stddev[None, :]
  col_mean = np.nanmean(arr, axis=0)
  c /= stddev[:, None]
  c /= stddev[None, :]
  col_mean = np.nanmean(arr, axis=0)
  c /= stddev[:, None]
  c /= stddev[None, :]
  col_mean = np.nanmean(arr, axis=0)
  c /= stddev[:, None]
  c /= stddev[None, :]
  col_mean = np.nanmean(arr, axis=0)
  c /= stddev[:, None]
  c /= stddev[None, :]
  col_mean = np.nanmean(arr, axis=0)
  c /= stddev[:, None]
  c /= stddev[None, :]
  co

Пример формы ряда: (200, 353)

Сводка по сайтам:
site
nyu           65
peking2       38
pittsburgh    36
peking3       34
peking1       23
washu         18
neuroimage    14
kki            9
Name: participant_id, dtype: int64

Сводка по окнам на участника:
count    237.000000
mean       2.784810
std        1.228243
min        1.000000
25%        2.000000
50%        3.000000
75%        4.000000
max        4.000000
Name: win_index, dtype: float64
OK

[PROCESS] cohort cohort_adult_participants -> Athena_prepared_filtered/cohort_adult_participants


  col_mean = np.nanmean(arr, axis=0)
  c /= stddev[:, None]
  c /= stddev[None, :]
  col_mean = np.nanmean(arr, axis=0)
  c /= stddev[:, None]
  c /= stddev[None, :]
  col_mean = np.nanmean(arr, axis=0)
  c /= stddev[:, None]
  c /= stddev[None, :]
  col_mean = np.nanmean(arr, axis=0)
  c /= stddev[:, None]
  c /= stddev[None, :]
  col_mean = np.nanmean(arr, axis=0)
  c /= stddev[:, None]
  c /= stddev[None, :]
  col_mean = np.nanmean(arr, axis=0)
  c /= stddev[:, None]
  c /= stddev[None, :]
  col_mean = np.nanmean(arr, axis=0)
  c /= stddev[:, None]
  c /= stddev[None, :]
  col_mean = np.nanmean(arr, axis=0)
  c /= stddev[:, None]
  c /= stddev[None, :]
  col_mean = np.nanmean(arr, axis=0)
  c /= stddev[:, None]
  c /= stddev[None, :]
  col_mean = np.nanmean(arr, axis=0)
  c /= stddev[:, None]
  c /= stddev[None, :]
  col_mean = np.nanmean(arr, axis=0)
  c /= stddev[:, None]
  c /= stddev[None, :]
  col_mean = np.nanmean(arr, axis=0)
  c /= stddev[:, None]
  c /= stddev[None, :]
  co

Пример формы ряда: (200, 353)

Сводка по сайтам:
site
pittsburgh    36
neuroimage    31
nyu           23
washu          5
peking1        1
peking3        1
Name: participant_id, dtype: int64

Сводка по окнам на участника:
count    97.000000
mean      2.329897
std       1.288747
min       1.000000
25%       1.000000
50%       2.000000
75%       4.000000
max       4.000000
Name: win_index, dtype: float64
OK

[SUMMARY] cohorts prepared: cohort_teen_participants, cohort_adult_participants


  col_mean = np.nanmean(arr, axis=0)
  c /= stddev[:, None]
  c /= stddev[None, :]
  col_mean = np.nanmean(arr, axis=0)
  c /= stddev[:, None]
  c /= stddev[None, :]


In [106]:
# Диагностика: прогон первых 2 строк подростковой когорты
if 'cohort_teen_participants' in run_tables:
    rt_sample = run_tables['cohort_teen_participants'].head(2)
    diag_out = Path(OUT_ROOT) / 'cohort_teen_participants_diag'
    print(f"[DIAG] Preparing sample to {diag_out}")
    man_df = athena_prepare_runs_filtered(rt_sample, diag_out, debug=True)
    print(f"[DIAG] manifest rows: {len(man_df)}")
    print("[DIAG] diag_out contents:")
    for p in diag_out.rglob('*'):
        print(" -", p)
else:
    print("[DIAG] No run_tables for cohort_teen_participants available.")

[DIAG] Preparing sample to Athena_prepared_filtered/cohort_teen_participants_diag
[INFO] atlases in run_table: ['CC400']
[INFO] rows to process: 2
[ROW 0] pid_bids=sub-kki1018959 raw=1018959 run_id=ses1_rest1 ts_shape=(149, 353) nan_ratio=0.000
    windows=1 segments=[(0, 200)]
        win 0: npy=True mask=True has_nan=False
[ROW 1] pid_bids=sub-kki1019436 raw=1019436 run_id=ses1_rest1 ts_shape=(149, 353) nan_ratio=0.000
    windows=1 segments=[(0, 200)]
        win 0: npy=True mask=True has_nan=False
[WRITE] Athena_prepared_filtered/cohort_teen_participants_diag/manifest_windows.csv (2 rows)
[WRITE] Athena_prepared_filtered/cohort_teen_participants_diag/aggregate_manifest.csv (2 rows)
[DIAG] manifest rows: 2
[DIAG] diag_out contents:
 - Athena_prepared_filtered/cohort_teen_participants_diag/sub-kki1019436
 - Athena_prepared_filtered/cohort_teen_participants_diag/manifest_windows.csv
 - Athena_prepared_filtered/cohort_teen_participants_diag/aggregate_manifest.csv
 - Athena_prepared_fil

  col_mean = np.nanmean(arr, axis=0)
  c /= stddev[:, None]
  c /= stddev[None, :]
  col_mean = np.nanmean(arr, axis=0)
  c /= stddev[:, None]
  c /= stddev[None, :]


In [107]:
# Просмотр первых строк одного файла Athena для диагностики парсинга
sample_path = Path('ADHD200_CC400_TCs_filtfix/WashU/0015042/sfnwmrda0015042_session_1_rest_1_cc400_TCs.1D')
print('[HEAD] of', sample_path)
if sample_path.exists():
    with open(sample_path, 'r') as f:
        for i in range(12):
            line = f.readline()
            if not line:
                break
            print(f"[{i}]", line.rstrip())
else:
    print('[HEAD] sample path does not exist')

[HEAD] of ADHD200_CC400_TCs_filtfix/WashU/0015042/sfnwmrda0015042_session_1_rest_1_cc400_TCs.1D
[0] File	Sub-brick	Mean_1  	Mean_3  	Mean_4  	Mean_6  	Mean_7  	Mean_8  	Mean_9  	Mean_11  	Mean_12  	Mean_13  	Mean_14  	Mean_15  	Mean_16  	Mean_17  	Mean_18  	Mean_19  	Mean_20  	Mean_21  	Mean_23  	Mean_24  	Mean_25  	Mean_26  	Mean_27  	Mean_28  	Mean_29  	Mean_30  	Mean_31  	Mean_32  	Mean_33  	Mean_34  	Mean_35  	Mean_36  	Mean_37  	Mean_38  	Mean_39  	Mean_41  	Mean_42  	Mean_43  	Mean_44  	Mean_45  	Mean_46  	Mean_47  	Mean_48  	Mean_50  	Mean_51  	Mean_52  	Mean_53  	Mean_54  	Mean_55  	Mean_56  	Mean_57  	Mean_58  	Mean_59  	Mean_60  	Mean_61  	Mean_62  	Mean_63  	Mean_64  	Mean_65  	Mean_67  	Mean_68  	Mean_69  	Mean_70  	Mean_71  	Mean_72  	Mean_73  	Mean_74  	Mean_75  	Mean_77  	Mean_78  	Mean_79  	Mean_80  	Mean_81  	Mean_82  	Mean_83  	Mean_84  	Mean_85  	Mean_86  	Mean_87  	Mean_88  	Mean_89  	Mean_90  	Mean_91  	Mean_93  	Mean_94  	Mean_95  	Mean_96  	Mean_97  	Mean_98  	Me

In [108]:
# Быстрая интеграционная проверка окон после полной генерации
from pathlib import Path

for cohort_name in ['cohort_teen_participants','cohort_adult_participants']:
    man = Path(OUT_ROOT) / cohort_name / 'manifest_windows.csv'
    print(f'\n[INTEGRITY] sampling {cohort_name} manifest -> {man}')
    quick_window_integrity_check(str(man), n=15)

# Показать первые строки файла пропущенных путей (если есть)
for cohort_name in ['cohort_teen_participants','cohort_adult_participants']:
    miss = Path(OUT_ROOT) / cohort_name / 'missing_paths.txt'
    if miss.exists():
        lines = miss.read_text().splitlines()
        print(f'[MISS LIST] {cohort_name}: {len(lines)} пропущено; первые 5:')
        for l in lines[:5]:
            print('  -', l)
    else:
        print(f'[MISS LIST] {cohort_name}: нет пропущенных путей')


[INTEGRITY] sampling cohort_teen_participants manifest -> Athena_prepared_filtered/cohort_teen_participants/manifest_windows.csv
[CHECK] завершено; проблемных примеров: 0

[INTEGRITY] sampling cohort_adult_participants manifest -> Athena_prepared_filtered/cohort_adult_participants/manifest_windows.csv
[CHECK] завершено; проблемных примеров: 0
[MISS LIST] cohort_teen_participants: нет пропущенных путей
[MISS LIST] cohort_adult_participants: нет пропущенных путей
