In [17]:
# 根据文件名的模式整理分组列表

from glob import glob
import os
import re


# Input: the `val` directory produced by the earlier train/val split. It is expected to
# contain the symlinks to the validation .h5 files.
src = "/home/hulabdl/CMRxRecon2025/val/"
assert os.path.isdir(src), '目录不存在'

files = glob(os.path.join(src, '*.h5'))

print(f'{len(files)} files was found in {src}')

# A filename is seven '@'-separated fields followed by ".h5"; the i-th captured
# field is filed under the i-th name below.
pattern = r"^(.*?)@(.*?)@(.*?)@(.*?)@(.*?)@(.*?)@(.*?)\.h5$"
field_names = (
    "acquisition",
    "setname",
    "datatype",
    "center",
    "device",
    "patientid",
    "basename",
)

# groups[field][label] -> list of file paths whose filename has `label` in `field`.
groups = {name: {} for name in field_names}

for path in files:
    filename = os.path.basename(path)
    parsed = re.match(pattern, filename)

    # Fail loudly on the first file that does not follow the naming scheme.
    if parsed is None:
        raise ValueError(f"Filename '{filename}' does not match the expected pattern.")

    for name, label in zip(field_names, parsed.groups()):
        groups[name].setdefault(label, []).append(path)


# Summary: for every field, each label seen and how many files carry it.
print("每个字段包含的标签以及对应的样本数:")
for groupname, labels in groups.items():
    counts = "".join(f" {label}:{len(paths)}, " for label, paths in labels.items())
    print(f"[{groupname}]" + counts)


277 files was found in /home/hulabdl/CMRxRecon2025/val/
每个字段包含的标签以及对应的样本数:
[acquisition] Mapping:68,  Flow2d:4,  T1w:8,  Cine:108,  Perfusion:5,  LGE:63,  T2w:14,  T1rho:5,  BlackBlood:2, 
[setname] TrainingSet:277, 
[datatype] FullSample:277, 
[center] Center002:38,  Center006:48,  Center003:39,  Center005:58,  Center001:71,  Center007:23, 
[device] Siemens_30T_CIMA.X:20,  Siemens_30T_Prisma:44,  UIH_30T_umr880:57,  Siemens_30T_Vida:15,  UIH_30T_umr780:71,  UIH_15T_umr670:45,  UIH_30T_umr790:14,  Siemens_15T_Sola:10,  Siemens_15T_Avanto:1, 
[patientid] P003:10,  P020:14,  P005:12,  P017:9,  P004:9,  P032:3,  P054:2,  P024:10,  P022:2,  P014:11,  P018:6,  P040:2,  P034:4,  P007:11,  P026:1,  P015:8,  P019:9,  P039:1,  P008:10,  P056:2,  P025:3,  P002:20,  P009:16,  P011:9,  P031:3,  P016:3,  P013:10,  P012:8,  P049:2,  P028:3,  P006:10,  P023:6,  P035:3,  P021:3,  P001:10,  P043:2,  P030:1,  P027:3,  P057:3,  P010:8,  P061:2,  P058:2,  P038:1,  P051:1,  P055:2,  P060:1,  P033:2,  P046:

In [None]:
# 分组
import os

# Root directory under which one `val{i}` directory is created per group condition.
tgt = "/home/hulabdl/CMRxRecon2025/"
assert os.path.isdir(tgt), '目录不存在'

# Upper bound on the number of files linked into each validation subset.
MAX_FILES_PER_GROUP = 5

# Each entry maps a field name (a key of `groups`) to the list of accepted labels.
# Within one field the labels are OR-ed (union); across fields they are AND-ed
# (intersection).
groupconditions = []

# Example
groupconditions.append({
    "acquisition": ['LGE', 'Cine', 'Flow2d'],  # group 1
})

groupconditions.append({
    "acquisition": ['Mapping', 'T1w', 'T2w', 'T1rho', 'BlackBlood'],  # group 2
})

# Refuse to run if an output directory already exists. The check must use the same
# `val{i}` name that is created below, otherwise it never guards anything.
for valid in range(len(groupconditions)):
    assert not os.path.isdir(os.path.join(tgt, f'val{valid}')), f'组{valid}的目录已经存在'

# Pass 1: resolve and validate every selection before touching the filesystem, so a
# bad condition cannot leave a half-built set of directories behind.
selections = []
for valid, conditions in enumerate(groupconditions):
    assert conditions, f"group condition {valid} is empty"

    per_field = []
    for key, condition in conditions.items():
        assert key in groups.keys(), f"groups has no key {key}"
        # Union of all files matching any accepted label of this field.
        matched = set()
        for clazz in condition:
            assert clazz in groups[key].keys(), f"groups{key} has no clazz {clazz}"
            matched.update(groups[key][clazz])
        per_field.append(matched)

    # Intersection across fields. Sort before truncating: set iteration order of
    # strings varies between interpreter sessions, which would make the chosen
    # subset irreproducible.
    valset = sorted(set.intersection(*per_field))[:MAX_FILES_PER_GROUP]
    selections.append(valset)

# Pass 2: create the directories and link the selected files into them.
for valid, valset in enumerate(selections):
    outdir = os.path.join(tgt, f'val{valid}')
    os.makedirs(outdir)

    print(f'验证集{valid}中有{len(valset)}个文件')
    for file in valset:
        # Link to the resolved target so the new link does not chain through the
        # symlink in the source directory.
        realpath = os.path.realpath(file)
        basename = os.path.basename(file)
        os.symlink(realpath, os.path.join(outdir, basename))

print("分组完成")

验证集0中有5个文件
验证集1中有5个文件
分组完成
