In [1]:
import h5py


In [2]:
# Location of the concatenated .h5ad file inspected by this notebook.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
h5ad_path = "/work/home/cryoem666/czx/dataset/STATE/arcinstitute-State-Replogle-Filtered-Dec-6-2025/replogle_concat.h5ad"

# Open read-only; the handle is kept open and reused by the cells below.
data = h5py.File(h5ad_path, mode="r")

In [None]:
# Repr of the open file handle.
print(data)

# Basic information about the `X` dataset, followed by its first row.
# Only the last expression of a cell is auto-displayed, so every value that
# should appear in the output is printed explicitly.
print(data['X'])
print(data['X'][0])

# Inspect the file structure
def print_h5_structure(item, prefix=''):
    """Recursively print the layout of an HDF5 object tree.

    A group is reported with a ``Group:`` line and each of its members is then
    visited with ``prefix`` extended by two spaces. Anything that is not a
    group is reported as a dataset together with its shape and dtype.

    Args:
        item: The object to describe (an ``h5py.Group`` or a dataset-like
            object exposing ``name``, ``shape`` and ``dtype``).
        prefix: String printed in front of the line; grows with depth.
    """
    # Leaf case first: nothing to descend into.
    if not isinstance(item, h5py.Group):
        print(f"{prefix}Dataset: {item.name}, shape: {item.shape}, dtype: {item.dtype}")
        return

    print(f"{prefix}Group: {item.name}")
    child_prefix = prefix + '  '
    for member in item.keys():
        print_h5_structure(item[member], child_prefix)

# Dump the whole hierarchy, starting from the open file object itself.
print_h5_structure(data)

<HDF5 file "replogle_concat.h5ad" (mode r)>
<HDF5 dataset "X": shape (643413, 6546), type "<f4">
[0.         0.         0.64062476 ... 2.9419792  1.8541076  4.790733  ]
Group: /
  Dataset: /X, shape: (643413, 6546), dtype: float32
  Group: /layers
  Group: /obs
    Dataset: /obs/UMI_count, shape: (643413,), dtype: float32
    Dataset: /obs/cell_barcode, shape: (643413,), dtype: object
    Group: /obs/cell_line
      Dataset: /obs/cell_line/categories, shape: (4,), dtype: object
      Dataset: /obs/cell_line/codes, shape: (643413,), dtype: int8
    Dataset: /obs/gem_group, shape: (643413,), dtype: int64
    Group: /obs/gene
      Dataset: /obs/gene/categories, shape: (2024,), dtype: object
      Dataset: /obs/gene/codes, shape: (643413,), dtype: int16
    Group: /obs/gene_id
      Dataset: /obs/gene_id/categories, shape: (2024,), dtype: object
      Dataset: /obs/gene_id/codes, shape: (643413,), dtype: int16
    Group: /obs/gene_transcript
      Dataset: /obs/gene_transcript/categories,

In [None]:
def print_spaced(string: str, h):
    """Print ``string`` indented by ``h`` levels of two spaces each."""
    indent = '  ' * h
    print(indent + string)

def show_h5_dataset(obj, h):
    """Print shape, dtype, size and a data preview for one dataset.

    Args:
        obj: Dataset-like object exposing ``shape``, ``dtype``, ``size`` and
            NumPy-style indexing (an ``h5py.Dataset`` in this notebook).
        h: Indentation level forwarded to ``print_spaced``.
    """
    print_spaced(f"形状: {obj.shape}", h)
    print_spaced(f"数据类型: {obj.dtype}", h)
    print_spaced(f"大小: {obj.size}", h)

    # h5py reports ``size`` as None for a dataset with a null dataspace.
    # Comparing None with an int raises TypeError and there is nothing to
    # preview, so report the dataset as empty and stop.
    if obj.size is None:
        print_spaced("数据: 空数据集", h)
        return

    if obj.size < 100:
        # Small datasets (this includes scalar ones, whose size is 1) are read whole.
        print_spaced(f"数据: {obj[()]}", h)
    else:
        # Preview only; for multi-dimensional data the slice is taken along the first axis.
        print_spaced(f"数据过大，只显示前10个元素: {obj[:10]}", h)

def show_h5_obj(father_obj, h):
    """Describe every member of a group, descending into sub-groups.

    For each ``(name, obj)`` pair of ``father_obj.items()`` the name and Python
    type are printed. Datasets are summarised by ``show_h5_dataset``; groups
    list their member names and are then walked recursively one indentation
    level deeper. A dashed rule closes each member.

    Args:
        father_obj: Mapping-like parent (an ``h5py.Group`` in this notebook).
        h: Indentation level of the parent's members.
    """
    inner = h + 1
    rule = "-" * 30

    for name, obj in father_obj.items():
        print_spaced(f"对象: {name}", h)
        print_spaced(f"类型: {type(obj)}", inner)

        if isinstance(obj, h5py.Dataset):
            # Leaf: shape / dtype / size / preview.
            show_h5_dataset(obj, inner)
        elif isinstance(obj, h5py.Group):
            # Sub-group: list its members, then recurse into it.
            print_spaced(f"子组成员: {list(obj.keys())}", inner)
            print_spaced(rule, inner)
            show_h5_obj(obj, inner)

        print_spaced(rule, h)

def print_h5_part(part):
    """Print a short header for a group, then a recursive listing of its members.

    The header shows the group's path, its member count and member names,
    followed by a line of ``=``. The listing itself is produced by
    ``show_h5_obj`` starting at indentation level 0.

    Args:
        part: The group to describe (an ``h5py.Group`` in this notebook).
    """
    separator = "=" * 50
    print(f"组路径: {part.name}")
    print(f"组成员数量: {len(part)}")
    print(f"组成员: {list(part.keys())}")
    print(separator)

    show_h5_obj(part, 0)

## HVG

In [None]:
# Read the per-gene "highly_variable" flags and the gene names from /var fully
# into memory. Both paths resolve from the file root; the leading slash on the
# first one is optional when indexing the file object.
hvg_mask = data["/var/highly_variable"][:]
all_gene_names = data["var/gene_name_index"][:]
# Keep only the flagged genes. Presumably a boolean mask: the recorded output
# goes from 6546 to 2000 names — TODO confirm the dtype.
hvg_names = all_gene_names[hvg_mask]

# The recorded output shows the names as bytes objects (b'...'), not str.
print(all_gene_names.shape)
print(all_gene_names)
print(hvg_names.shape)
print(hvg_names)

(6546,)
[b'NOC2L' b'HES4' b'ISG15' ... b'MT-ND5' b'MT-ND6' b'MT-CYB']
(2000,)
[b'HES4' b'ISG15' b'MIB2' ... b'MT-ND4L' b'MT-ND5' b'MT-ND6']


## VAR

In [None]:
# Walk the /var group and print every member with a data preview.
print_h5_part(data['var'])
# Shorter alternative kept for reference (prints only the group object itself):
# print(data['var'])

组路径: /var
组成员数量: 11
组成员: ['class', 'dispersions', 'dispersions_norm', 'end', 'gene_name_index', 'highly_variable', 'in_matrix', 'length', 'means', 'start', 'strand']
对象: class
  类型: <class 'h5py._hl.group.Group'>
  子组成员: ['categories', 'codes']
  ------------------------------
  对象: categories
    类型: <class 'h5py._hl.dataset.Dataset'>
    形状: (26,)
    数据类型: object
    大小: 26
    数据: [b'gene_version1' b'gene_version2' b'gene_version3' b'gene_version4'
 b'gene_version5' b'gene_version6' b'gene_version7' b'gene_version8'
 b'gene_version9' b'gene_version10' b'gene_version11' b'gene_version12'
 b'gene_version13' b'gene_version14' b'gene_version15' b'gene_version16'
 b'gene_version17' b'gene_version18' b'gene_version19' b'gene_version20'
 b'gene_version21' b'gene_version22' b'gene_version23' b'gene_version24'
 b'gene_version25' b'gene_version27']
  ------------------------------
  对象: codes
    类型: <class 'h5py._hl.dataset.Dataset'>
    形状: (6546,)
    数据类型: int8
    大小: 6546
    数据过大，只显示前

## UNS

In [None]:
# Walk the /uns group and print every member with a data preview.
print_h5_part(data['uns'])

组路径: /uns
组成员数量: 1
组成员: ['hvg']
对象: hvg
  类型: <class 'h5py._hl.group.Group'>
  子组成员: ['flavor']
  ------------------------------
  对象: flavor
    类型: <class 'h5py._hl.dataset.Dataset'>
    形状: ()
    数据类型: object
    大小: 1
    数据: b'seurat'
  ------------------------------
------------------------------


## OBS

In [None]:
# Walk the /obs group and print every member with a data preview.
print_h5_part(data['obs'])

组路径: /obs
组成员数量: 11
组成员: ['UMI_count', 'cell_barcode', 'cell_line', 'gem_group', 'gene', 'gene_id', 'gene_transcript', 'mitopercent', 'sgID_AB', 'transcript', 'z_gemgroup_UMI']
对象: UMI_count
  类型: <class 'h5py._hl.dataset.Dataset'>
  形状: (643413,)
  数据类型: float32
  大小: 643413
  数据过大，只显示前10个元素: [11234. 45146. 20190. 23912.  8282. 33209. 15895. 24743. 13288. 18707.]
------------------------------
对象: cell_barcode
  类型: <class 'h5py._hl.dataset.Dataset'>
  形状: (643413,)
  数据类型: object
  大小: 643413
  数据过大，只显示前10个元素: [b'AAACCCAAGAATAGTC-3-hepg2' b'AAACCCAAGAGGTATT-55-hepg2'
 b'AAACCCAAGAGTGACC-39-hepg2' b'AAACCCAAGATGGCAC-43-hepg2'
 b'AAACCCAAGCAACAAT-16-hepg2' b'AAACCCAAGCACACAG-14-hepg2'
 b'AAACCCAAGCGACTGA-47-hepg2' b'AAACCCAAGCGCCATC-14-hepg2'
 b'AAACCCAAGCTCCACG-12-hepg2' b'AAACCCAAGCTGCGAA-29-hepg2']
------------------------------
对象: cell_line
  类型: <class 'h5py._hl.group.Group'>
  子组成员: ['categories', 'codes']
  ------------------------------
  对象: categories
    类型: <class 'h5py._h

## OBSM

In [None]:
# Walk the /obsm group; large matrices are previewed through their first 10 rows only.
print_h5_part(data['obsm'])

组路径: /obsm
组成员数量: 3
组成员: ['X_hvg', 'X_state', 'X_vci']
对象: X_hvg
  类型: <class 'h5py._hl.dataset.Dataset'>
  形状: (643413, 2000)
  数据类型: float32
  大小: 1286826000
  数据过大，只显示前10个元素: [[0.         0.64062476 0.64062476 ... 1.5240247  2.9419792  1.8541076 ]
 [1.1030915  0.3697946  0.         ... 2.0713184  3.7362523  2.3414388 ]
 [0.4065011  1.3886238  0.4065011  ... 1.2549816  2.998682   0.4065011 ]
 ...
 [0.         0.3418372  0.         ... 1.4493277  3.2206478  1.7733195 ]
 [0.         0.9242541  0.         ... 1.5686096  3.6237998  1.957267  ]
 [1.1491904  0.         0.         ... 1.3068577  3.473727   1.1491904 ]]
------------------------------
对象: X_state
  类型: <class 'h5py._hl.dataset.Dataset'>
  形状: (643413, 2058)
  数据类型: float32
  大小: 1324143954
  数据过大，只显示前10个元素: [[-0.01779838 -0.00328709 -0.03078638 ...  0.6015625   0.05102539
  -0.0480957 ]
 [-0.03701852 -0.02942895 -0.00108422 ...  0.6328125   0.05957031
   0.03320312]
 [-0.00576355 -0.03582744 -0.01308481 ...  0.5         0.012

## VARM

In [None]:
# Walk the /varm group (the recorded output shows it has no members in this file).
print_h5_part(data['varm'])

组路径: /varm
组成员数量: 0
组成员: []
