In [10]:
# 根据文件名的模式整理分组列表

from glob import glob
import os
import re


# Input directory: the `val` half of a dataset that has already been split
# into train and val. It is expected to hold the validation files (symlinks)
# as *.h5 entries directly underneath it.
src = "/nas-data/datasets/CMRxRecon2025/val/"
assert os.path.isdir(src), '目录不存在'

# glob() returns entries in filesystem-dependent order; sorting keeps the file
# list, and every grouping derived from it, reproducible from run to run.
files = sorted(glob(os.path.join(src, '*.h5')))

print(f'{len(files)} files was found in {src}')

# groups[field][label] -> list of file paths whose filename carries `label`
# in that field. Key order mirrors the order of the fields in the filename.
groups = {
    field: {}
    for field in (
        "acquisition",
        "setname",
        "datatype",
        "center",
        "device",
        "patientid",
        "basename",
    )
}

# A filename is seven '@'-separated fields followed by the ".h5" extension.
pattern = r"^(.*?)@(.*?)@(.*?)@(.*?)@(.*?)@(.*?)@(.*?)\.h5$"

for file in files:
    filename = os.path.basename(file)
    match = re.match(pattern, filename)

    # Abort on the first filename that does not follow the naming scheme.
    if match is None:
        raise ValueError(f"Filename '{filename}' does not match the expected pattern.")

    # The regex captures come out in the same order as the keys of `groups`,
    # so pairing them up files the path under its label for every field.
    for field, label in zip(groups, match.groups()):
        groups[field].setdefault(label, []).append(file)


# Summary: one line per field, listing each label with its sample count.
print("每个字段包含的标签以及对应的样本数:")
for groupname, group in groups.items():
    # Build the whole "label:count, " tail first, then emit the line at once.
    counts = "".join(f" {clazzname}:{len(clazz)}, " for clazzname, clazz in group.items())
    print(f"[{groupname}]" + counts)


55 files was found in /nas-data/datasets/CMRxRecon2025/val/
每个字段包含的标签以及对应的样本数:
[acquisition] LGE:15,  Cine:21,  Flow2d:1,  Mapping:10,  T1w:3,  T2w:2,  T1rho:2,  BlackBlood:1, 
[setname] TrainingSet:55, 
[datatype] FullSample:55, 
[center] Center006:11,  Center007:3,  Center001:18,  Center002:7,  Center005:11,  Center003:5, 
[device] UIH_30T_umr790:4,  Siemens_30T_Vida:3,  Siemens_30T_Prisma:8,  UIH_30T_umr780:18,  UIH_30T_umr880:9,  UIH_15T_umr670:9,  Siemens_30T_CIMA.X:3,  Siemens_15T_Sola:1, 
[patientid] P002:8,  P038:1,  P021:1,  P024:2,  P009:6,  P013:1,  P014:4,  P054:1,  P011:3,  P020:4,  P015:2,  P017:1,  P046:2,  P010:2,  P004:1,  P040:1,  P006:1,  P032:1,  P023:1,  P018:1,  P056:2,  P008:1,  P019:1,  P049:1,  P012:1,  P035:1,  P001:2,  P061:1,  P033:1, 
[basename] lge_lax_4ch:3,  cine_lax_4ch:5,  cine_sax:10,  cine_lax_3ch:4,  flow2d:1,  lge_lax_2ch:7,  T1mappost:2,  lge_sax:3,  T2map:3,  T1w:3,  lge_lax:1,  T1map:4,  T2w:2,  T1rho:2,  T2smap:1,  cine_lax_2ch:1,  lge_lax_3ch:

In [16]:
# 分组
import os

# Parent directory under which the per-group validation directories are created.
tgt = "/nas-data/datasets/CMRxRecon2025/"
assert os.path.isdir(tgt), '目录不存在'

# One dict per validation group. Each key names a field of `groups` and maps
# to the labels accepted for that field: files of the listed labels are
# unioned per key, and the per-key results are intersected across keys.
groupconditions = [
    # Example: the first group collects every file whose acquisition is one
    # of the labels listed here.
    {"acquisition": ['LGE', 'Cine', 'Flow2d', 'Mapping', 'T1w', 'T2w', 'T1rho']},
    # The second group contains BlackBlood only.
    {"acquisition": ['BlackBlood']},
]

# Pre-flight check: refuse to start if any output directory already exists, so
# the run cannot fail half-way with some groups written and others not.
# The directories created further down are named `val{valid}`, so that is the
# name that has to be checked here.
for valid in range(len(groupconditions)):
    assert not os.path.isdir(os.path.join(tgt, f'val{valid}')), f'组{valid}的目录已经存在'

# Materialise every validation group as a directory of symlinks.
for valid, conditions in enumerate(groupconditions):
    outdir = os.path.join(tgt, f'val{valid}')
    os.makedirs(outdir)

    # Per key: union of the files belonging to any of the listed labels.
    m = {}
    for key, condition in conditions.items():
        assert key in groups, f"groups has no key {key}"
        for clazz in condition:
            assert clazz in groups[key], f"groups{key} has no clazz {clazz}"
        m[key] = [path for clazz in condition for path in groups[key][clazz]]

    # Across keys: intersection, i.e. a file must satisfy every key's condition.
    valset = None
    for paths in m.values():
        valset = set(paths) if valset is None else valset & set(paths)

    print(f'验证集{valid}中有{len(valset)}个文件')

    # Link to the resolved target rather than to the entry in the source
    # directory, keeping the original file name.
    for file in valset:
        os.symlink(os.path.realpath(file), os.path.join(outdir, os.path.basename(file)))

print("分组完成")

验证集0中有54个文件
验证集1中有1个文件
分组完成
