In [None]:
# 处理CITEseq数据集
# 1. 从train_cite_inputs.h5, train_cite_targets.h5, test_cite_inputs.h5中读取数据
# 2. 从train_cite_inputs.h5中筛选低变基因，表达细胞较少的基因，以及表达基因较少的细胞
# 3. 从train_cite_targets.h5中删除表达基因较少的细胞，从test_cite_inputs.h5中删除表达细胞较少的基因
# 4. 将数据转换为稀疏矩阵格式,存储为h5ad文件

import pandas as pd
import numpy as np
import anndata as ad
from scipy.sparse import csr_matrix
import os

# -------------------- 0. 参数与路径设置 (Parameters and Paths) --------------------
# --- 输入文件路径 ---
PATH_TRAIN_INP = "train_cite_inputs.h5"
PATH_TRAIN_TGT = "train_cite_targets.h5"
PATH_TEST_INP  = "test_cite_inputs.h5"

# --- 过滤参数 ---
# 基因方差过滤：过滤掉方差最低的百分比
LOW_VARIANCE_GENE_PERCENTILE = 30 
# 基因表达广度过滤：过滤掉在最少细胞中表达的基因百分比
LOW_COUNT_GENE_PERCENTILE = 10
# 细胞表达广度过滤：过滤掉表达最少基因的细胞百分比
LOW_COUNT_CELL_PERCENTILE = 5 # 按照要求，targets用5%的阈值，我们将此统一应用

# -------------------- 1. 读取数据 (Read Data) --------------------
print(">> Step 1: Reading HDF5 files into pandas DataFrames...")

try:
    train_inputs_df = pd.read_hdf(PATH_TRAIN_INP)
    train_targets_df = pd.read_hdf(PATH_TRAIN_TGT)
    test_inputs_df = pd.read_hdf(PATH_TEST_INP)
    print("All HDF5 files loaded successfully.")
    print(f"Initial train_inputs_df shape: {train_inputs_df.shape}")
    print(f"Initial train_targets_df shape: {train_targets_df.shape}")
    print(f"Initial test_inputs_df shape: {test_inputs_df.shape}")
except Exception as e:
    print(f"An error occurred while reading HDF5 files: {e}")
    # 如果出错，则退出脚本
    exit()


>> Step 1: Reading HDF5 files into pandas DataFrames...
All HDF5 files loaded successfully.
Initial train_inputs_df shape: (70988, 22050)
Initial train_targets_df shape: (70988, 140)
Initial test_inputs_df shape: (48663, 22050)


In [3]:
# -------------------- 2. 核心清洗: train_cite_inputs (Clean train_cite_inputs) --------------------
print("\n>> Step 2: Applying filters to 'train_cite_inputs'...")

# --- 2.1 细胞过滤：过滤掉表达基因较少的细胞 (5%) ---
print(f"Filtering cells with fewest expressed genes (bottom {LOW_COUNT_CELL_PERCENTILE}%)...")
n_cells_before = train_inputs_df.shape[0]

# 计算每个细胞表达了多少个基因 (行非零元素的数量)
n_genes_per_cell = (train_inputs_df > 0).sum(axis=1)
# 计算阈值
cell_filter_threshold = np.percentile(n_genes_per_cell, LOW_COUNT_CELL_PERCENTILE)
# 创建要保留的细胞的掩码
cells_to_keep_mask = n_genes_per_cell >= cell_filter_threshold

# 应用细胞过滤器
train_inputs_df = train_inputs_df[cells_to_keep_mask]
print(f"Cells before: {n_cells_before}, Cells after: {train_inputs_df.shape[0]}")

# 保存保留下来的细胞ID，用于后续同步过滤 train_targets
cells_kept_index = train_inputs_df.index


# --- 2.2 基因过滤：过滤低变基因和低表达广度基因 ---
n_genes_before = train_inputs_df.shape[1]

# a) 过滤低变基因 (方差后30%)
print(f"Filtering genes with lowest variance (bottom {LOW_VARIANCE_GENE_PERCENTILE}%)...")
gene_variances = train_inputs_df.var(axis=0)
variance_threshold = np.percentile(gene_variances, LOW_VARIANCE_GENE_PERCENTILE)
variance_mask = gene_variances >= variance_threshold

# b) 过滤表达细胞较少的基因 (后10%)
# 【修正】: 使用已定义的变量 LOW_COUNT_GENE_PERCENTILE
print(f"Filtering genes expressed in fewest cells (bottom {LOW_COUNT_GENE_PERCENTILE}%)...")
n_cells_per_gene = (train_inputs_df > 0).sum(axis=0)
# 【修正】: 使用已定义的变量 LOW_COUNT_GENE_PERCENTILE
gene_cell_count_threshold = np.percentile(n_cells_per_gene, LOW_COUNT_GENE_PERCENTILE)
cell_count_mask = n_cells_per_gene >= gene_cell_count_threshold

# c) 合并基因过滤器并应用
genes_to_keep_mask = variance_mask & cell_count_mask
train_inputs_df = train_inputs_df.loc[:, genes_to_keep_mask]
print(f"Genes before: {n_genes_before}, Genes after: {train_inputs_df.shape[1]}")

# 保存保留下来的基因ID，用于后续同步过滤 test_inputs
genes_kept_columns = train_inputs_df.columns



>> Step 2: Applying filters to 'train_cite_inputs'...
Filtering cells with fewest expressed genes (bottom 5%)...
Cells before: 67446, Cells after: 64074
Filtering genes with lowest variance (bottom 30%)...
Filtering genes expressed in fewest cells (bottom 10%)...
Genes before: 22050, Genes after: 15435


In [4]:
# -------------------- 3. 同步清洗其他文件 (Apply Filters to Other Files) --------------------
print("\n>> Step 3: Applying consistent filters to other dataframes...")

# --- 3.1 过滤 train_cite_targets.h5 ---
# 重要：必须使用在训练集上计算出的细胞列表来过滤靶标，以确保行一一对应。
print("Filtering cells in 'train_cite_targets'...")
n_cells_before = train_targets_df.shape[0]
train_targets_df = train_targets_df.loc[cells_kept_index]
print(f"Cells before: {n_cells_before}, Cells after: {train_targets_df.shape[0]}")


# --- 3.2 过滤 test_cite_inputs.h5 ---
# 重要：测试集的基因必须与训练集的基因完全一致。我们使用训练集筛选后的基因列表来过滤测试集。
print("Filtering genes in 'test_cite_inputs'...")
n_genes_before = test_inputs_df.shape[1]
# 确保所有在训练集中保留的基因都存在于测试集中，对于不存在的基因则无法保留
common_genes = genes_kept_columns.intersection(test_inputs_df.columns)
test_inputs_df = test_inputs_df[common_genes]
# 确保最终训练和测试的基因顺序和数量完全一致
train_inputs_df = train_inputs_df[common_genes]
print(f"Genes before: {n_genes_before}, Genes after: {test_inputs_df.shape[1]}")



>> Step 3: Applying consistent filters to other dataframes...
Filtering cells in 'train_cite_targets'...
Cells before: 70988, Cells after: 64074
Filtering genes in 'test_cite_inputs'...
Genes before: 22050, Genes after: 15435


In [6]:
# -------------------- 4. 转换并存储为h5ad (Convert and Save to .h5ad) --------------------
# --- 输出文件路径 ---
PATH_PROCESSED_TRAIN_INP = "train_cite_inputs.h5ad"
PATH_PROCESSED_TRAIN_TGT = "train_cite_targets.h5ad"
PATH_PROCESSED_TEST_INP  = "test_cite_inputs.h5ad"
print("\n>> Step 4: Converting DataFrames to sparse matrices and saving as .h5ad files...")

# a) 处理 train_inputs
print(f"Processing '{PATH_PROCESSED_TRAIN_INP}'...")
train_inputs_sparse = csr_matrix(train_inputs_df.values)
adata_train_inp = ad.AnnData(train_inputs_sparse)
adata_train_inp.obs_names = train_inputs_df.index
adata_train_inp.var_names = train_inputs_df.columns
adata_train_inp.write_h5ad(PATH_PROCESSED_TRAIN_INP, compression="gzip")

# b) 处理 train_targets
print(f"Processing '{PATH_PROCESSED_TRAIN_TGT}'...")
train_targets_sparse = csr_matrix(train_targets_df.values)
adata_train_tgt = ad.AnnData(train_targets_sparse)
adata_train_tgt.obs_names = train_targets_df.index
adata_train_tgt.var_names = train_targets_df.columns
adata_train_tgt.write_h5ad(PATH_PROCESSED_TRAIN_TGT, compression="gzip")

# c) 处理 test_inputs
print(f"Processing '{PATH_PROCESSED_TEST_INP}'...")
test_inputs_sparse = csr_matrix(test_inputs_df.values)
adata_test_inp = ad.AnnData(test_inputs_sparse)
adata_test_inp.obs_names = test_inputs_df.index
adata_test_inp.var_names = test_inputs_df.columns
adata_test_inp.write_h5ad(PATH_PROCESSED_TEST_INP, compression="gzip")

print("\n--- Preprocessing complete. All files saved in .h5ad format. ---")
print(f"Final train inputs shape: {adata_train_inp.shape}")
print(f"Final train targets shape: {adata_train_tgt.shape}")
print(f"Final test inputs shape: {adata_test_inp.shape}")



>> Step 4: Converting DataFrames to sparse matrices and saving as .h5ad files...
Processing 'train_cite_inputs.h5ad'...
Processing 'train_cite_targets.h5ad'...
Processing 'test_cite_inputs.h5ad'...

--- Preprocessing complete. All files saved in .h5ad format. ---
Final train inputs shape: (64074, 15435)
Final train targets shape: (64074, 140)
Final test inputs shape: (48663, 15435)


In [4]:
import pandas as pd
import numpy as np
import anndata as ad
from scipy.sparse import csr_matrix
import os

# -------------------- 0. 参数与路径设置 (Parameters and Paths) --------------------
# --- 输入文件路径 (Input File Paths) ---
PATH_TRAIN_INP = "train_multi_inputs.h5"
PATH_TRAIN_TGT = "train_multi_targets.h5"
PATH_TEST_INP  = "test_multi_inputs.h5" # 假设您指的是 test_multi_inputs.h5

# --- 过滤参数 (Filtering Parameters) ---
# 特征方差过滤：过滤掉方差最低的百分比 (ATAC-seq peak)
LOW_VARIANCE_FEAT_PERCENTILE = 30 
# 特征表达广度过滤：过滤掉在最少细胞中出现的特征百分比
LOW_COUNT_FEAT_PERCENTILE = 10
# 细胞表达广度过滤：过滤掉表达最少特征的细胞百分比
LOW_COUNT_CELL_PERCENTILE = 5

In [None]:
# -------------------- 1. 读取数据 (Read Data) --------------------
print(">> Step 1: Reading Multiome HDF5 files into pandas DataFrames...")

try:
    train_inputs_df = pd.read_hdf(PATH_TRAIN_INP)
    train_targets_df = pd.read_hdf(PATH_TRAIN_TGT)
    test_inputs_df = pd.read_hdf(PATH_TEST_INP)
    print("All Multiome HDF5 files loaded successfully.")
    print(f"Initial train_inputs_df shape: {train_inputs_df.shape}")
    print(f"Initial train_targets_df shape: {train_targets_df.shape}")
    print(f"Initial test_inputs_df shape: {test_inputs_df.shape}")
except Exception as e:
    print(f"An error occurred while reading HDF5 files: {e}")
    # 如果出错，则退出脚本
    exit()

In [None]:
# -------------------- 2. 核心清洗: train_multi_inputs (Clean train_multi_inputs) --------------------
print("\n>> Step 2: Applying filters to 'train_multi_inputs'...")

# --- 2.1 细胞过滤：过滤掉表达特征数较少的细胞 (后5%) ---
print(f"Filtering cells with fewest expressed features (bottom {LOW_COUNT_CELL_PERCENTILE}%)...")
n_cells_before = train_inputs_df.shape[0]

# 计算每个细胞表达了多少个特征 (ATAC peaks)
n_feats_per_cell = (train_inputs_df > 0).sum(axis=1)
# 计算过滤阈值
cell_filter_threshold = np.percentile(n_feats_per_cell, LOW_COUNT_CELL_PERCENTILE)
# 创建要保留的细胞的掩码
cells_to_keep_mask = n_feats_per_cell >= cell_filter_threshold

# 应用细胞过滤器
train_inputs_df = train_inputs_df[cells_to_keep_mask]
print(f"Cells before: {n_cells_before}, Cells after: {train_inputs_df.shape[0]}")

# 保存保留下来的细胞ID，用于后续同步过滤 train_multi_targets
cells_kept_index = train_inputs_df.index


# --- 2.2 特征过滤：过滤低变和低表达广度的特征 ---
n_feats_before = train_inputs_df.shape[1]

# a) 过滤低变特征 (方差后30%)
print(f"Filtering features with lowest variance (bottom {LOW_VARIANCE_FEAT_PERCENTILE}%)...")
feat_variances = train_inputs_df.var(axis=0)
variance_threshold = np.percentile(feat_variances, LOW_VARIANCE_FEAT_PERCENTILE)
variance_mask = feat_variances >= variance_threshold

# b) 过滤在少数细胞中出现的特征 (后10%)
print(f"Filtering features expressed in fewest cells (bottom {LOW_COUNT_FEAT_PERCENTILE}%)...")
n_cells_per_feat = (train_inputs_df > 0).sum(axis=0)
feat_cell_count_threshold = np.percentile(n_cells_per_feat, LOW_COUNT_FEAT_PERCENTILE)
cell_count_mask = n_cells_per_feat >= feat_cell_count_threshold

# c) 合并特征过滤器并应用
feats_to_keep_mask = variance_mask & cell_count_mask
train_inputs_df = train_inputs_df.loc[:, feats_to_keep_mask]
print(f"Features before: {n_feats_before}, Features after: {train_inputs_df.shape[1]}")

# 保存保留下来的特征ID，用于后续同步过滤 test_multi_inputs
feats_kept_columns = train_inputs_df.columns

In [None]:
# -------------------- 3. 同步清洗其他文件 (Apply Filters to Other Files) --------------------
print("\n>> Step 3: Applying consistent filters to other dataframes...")

# --- 3.1 过滤 train_multi_targets.h5 ---
# 重要：必须使用在训练输入端计算出的细胞列表来过滤训练目标端，以确保行一一对应。
print("Filtering cells in 'train_multi_targets'...")
n_cells_before = train_targets_df.shape[0]
train_targets_df = train_targets_df.loc[cells_kept_index]
print(f"Cells before: {n_cells_before}, Cells after: {train_targets_df.shape[0]}")


# --- 3.2 过滤 test_multi_inputs.h5 ---
# 重要：测试集的特征必须与训练集的特征完全一致。我们使用训练集筛选后的特征列表来过滤测试集。
print("Filtering features in 'test_multi_inputs'...")
n_feats_before = test_inputs_df.shape[1]
# 找到训练集和测试集共有的特征，并以此为准进行筛选
common_feats = feats_kept_columns.intersection(test_inputs_df.columns)
test_inputs_df = test_inputs_df[common_feats]
# 同时，也确保训练集的特征列与测试集完全一致（顺序和数量）
train_inputs_df = train_inputs_df[common_feats]
print(f"Features before: {n_feats_before}, Features after: {test_inputs_df.shape[1]}")

In [None]:
# -------------------- 4. 转换并存储为h5ad (Convert and Save to .h5ad) --------------------
# --- 输出文件路径 ---
PATH_PROCESSED_TRAIN_INP = "processed_train_multi_inputs.h5ad"
PATH_PROCESSED_TRAIN_TGT = "processed_train_multi_targets.h5ad"
PATH_PROCESSED_TEST_INP  = "processed_test_multi_inputs.h5ad"

print("\n>> Step 4: Converting DataFrames to sparse matrices and saving as .h5ad files...")

# a) 处理 train_inputs
print(f"Processing '{PATH_PROCESSED_TRAIN_INP}'...")
train_inputs_sparse = csr_matrix(train_inputs_df.values)
adata_train_inp = ad.AnnData(train_inputs_sparse)
adata_train_inp.obs_names = train_inputs_df.index
adata_train_inp.var_names = train_inputs_df.columns
adata_train_inp.write_h5ad(PATH_PROCESSED_TRAIN_INP, compression="gzip")

# b) 处理 train_targets
print(f"Processing '{PATH_PROCESSED_TRAIN_TGT}'...")
train_targets_sparse = csr_matrix(train_targets_df.values)
adata_train_tgt = ad.AnnData(train_targets_sparse)
adata_train_tgt.obs_names = train_targets_df.index
adata_train_tgt.var_names = train_targets_df.columns
adata_train_tgt.write_h5ad(PATH_PROCESSED_TRAIN_TGT, compression="gzip")

# c) 处理 test_inputs
print(f"Processing '{PATH_PROCESSED_TEST_INP}'...")
test_inputs_sparse = csr_matrix(test_inputs_df.values)
adata_test_inp = ad.AnnData(test_inputs_sparse)
adata_test_inp.obs_names = test_inputs_df.index
adata_test_inp.var_names = test_inputs_df.columns
adata_test_inp.write_h5ad(PATH_PROCESSED_TEST_INP, compression="gzip")

print("\n--- Multiome preprocessing complete. All files saved in .h5ad format. ---")
print(f"Final train inputs shape: {adata_train_inp.shape}")
print(f"Final train targets shape: {adata_train_tgt.shape}")
print(f"Final test inputs shape: {adata_test_inp.shape}")

In [2]:
# 由于Multi组数据普遍较大，直接加载会触发OOM，因此采用分块加载的模式
# ------------------ 1. 读取数据 -------------------------
from __future__ import annotations
import os
import h5py
import numpy as np
import pandas as pd
import anndata as ad
import scipy.sparse as sp
from typing import Optional, Tuple

# -------------------- 参数与路径 (与你的 notebook 命名保持一致) --------------------
PATH_TRAIN_INP = "train_multi_inputs.h5"     # 或 train_multi_inputs.h5
PATH_TRAIN_TGT = "train_multi_targets.h5"    # 或 train_multi_targets.h5
PATH_TEST_INP  = "test_multi_inputs.h5"      # 或 test_multi_inputs.h5

# pandas HDFStore -> 行块大小（根据列数自动调整，默认 100 行对 20~30 万列也稳妥）
DEFAULT_CHUNK_ROWS = 100

# -------------------- 工具函数 --------------------
def _maybe_decode(a) -> list[str]:
    """把 bytes/bytearray 转为 str 列表；若已是 str/数字则转为 str。"""
    out = []
    for x in a:
        if isinstance(x, (bytes, bytearray)):
            out.append(x.decode("utf-8"))
        else:
            out.append(str(x))
    return out

def _read_10x_matrix_to_adata(path: str) -> ad.AnnData:
    """
    读取 10x/OPSCB 风格 HDF5：要求根下有 'matrix' 组，含 data/indices/indptr/shape。
    同时尝试读取 barcodes 与 features/name|id。
    """
    with h5py.File(path, "r") as f:
        if "matrix" not in f:
            raise ValueError("Not 10x-style: missing 'matrix' group.")
        g = f["matrix"]

        # 基本稀疏矩阵
        shape = tuple(g["shape"][...].tolist())
        data = g["data"][...]
        indices = g["indices"][...].astype(np.int32, copy=False)
        indptr = g["indptr"][...].astype(np.int32, copy=False)
        X = sp.csr_matrix((data, indices, indptr), shape=shape, dtype=np.float32)

        # 细胞/特征名
        if "barcodes" in g:
            obs_names = _maybe_decode(g["barcodes"][...])
        elif "cell_ids" in g:
            obs_names = _maybe_decode(g["cell_ids"][...])
        else:
            obs_names = [f"cell_{i}" for i in range(shape[0])]

        var_names = None
        if "features" in g:
            feats = g["features"]
            if "name" in feats:
                var_names = _maybe_decode(feats["name"][...])
            elif "id" in feats:
                var_names = _maybe_decode(feats["id"][...])
        if var_names is None:
            # 兼容一些变体
            for k in ("gene_names", "genes", "var_names"):
                if k in g:
                    var_names = _maybe_decode(g[k][...])
                    break
        if var_names is None:
            var_names = [f"feat_{j}" for j in range(shape[1])]

    adata = ad.AnnData(X)
    adata.obs_names = pd.Index(obs_names, name=None)
    adata.var_names = pd.Index(var_names, name=None)
    return adata

def _probe_pandas_hdf(path: str):
    """
    探测 pandas HDFStore：返回 (key, nrows_or_None, columns_of_sample)
    fixed 格式下 nrows 可能为 None，不要转 int。
    """
    with pd.HDFStore(path, mode="r") as store:
        keys = store.keys()
        if not keys:
            raise ValueError("Empty HDF5 store.")
        key = keys[0]
        storer = store.get_storer(key)
        nrows = getattr(storer, "nrows", None)   # fixed 可能为 None
    sample = pd.read_hdf(path, key=key, start=0, stop=1)
    return key, nrows, sample.columns

def _auto_chunk_rows(n_cols: int, target_gb: float = 0.18) -> int:
    """
    粗略估算每块行数：float32 4B；对超宽矩阵（~20-30万列）建议 ≤0.18 GB/块。
    """
    bytes_per_row = n_cols * 4
    rows = max(1, int((target_gb * (1024**3)) // bytes_per_row))
    # 至少 25 行，最多 200 行，避免太碎或太大
    return int(np.clip(rows, 25, 200))

def _read_pandas_hdf_to_adata(path: str, chunk_rows: Optional[int] = None) -> ad.AnnData:
    """
    行分块读取为 AnnData(CSR)。当 nrows 为 None（fixed 格式）时，使用 while 循环直到读到空块。
    """
    key, nrows, cols = _probe_pandas_hdf(path)
    n_cols = len(cols)
    if chunk_rows is None:
        chunk_rows = _auto_chunk_rows(n_cols)

    print(f"[read_hdf] key={key}, nrows={nrows if nrows is not None else 'unknown (fixed)'}, "
          f"ncols={n_cols}, chunk_rows={chunk_rows}")

    blocks, obs_names = [], []
    start = 0
    while True:
        stop = start + chunk_rows
        df = pd.read_hdf(path, key=key, start=start, stop=stop)
        if df is None or df.empty:
            break
        if not df.columns.equals(cols):
            df = df.reindex(columns=cols)

        X_blk = sp.csr_matrix(df.to_numpy(dtype=np.float32, copy=False))
        blocks.append(X_blk)
        obs_names.extend(df.index.astype(str).tolist())

        start += len(df)
        if nrows is not None:
            print(f"  read rows: {start}/{nrows}")
        else:
            print(f"  read rows: {start} ...")

    if not blocks:
        raise RuntimeError("No data read from HDF5 (check key or file).")

    X = sp.vstack(blocks, format="csr")
    adata = ad.AnnData(X)
    adata.obs_names = pd.Index(obs_names, name=None)
    adata.var_names = pd.Index(cols.astype(str), name=None)
    return adata

def load_any_h5_to_adata(path: str, chunk_rows: Optional[int] = None) -> ad.AnnData:
    """
    尝试以 10x -> pandas HDFStore 的顺序读取为 AnnData。
    """
    # 先试 10x/OPSCB 风格
    try:
        with h5py.File(path, "r") as f:
            if "matrix" in f:
                print(f"[detect] {os.path.basename(path)} looks like 10x/OPSCB (has 'matrix')")
                return _read_10x_matrix_to_adata(path)
    except Exception as e:
        print(f"[warn] 10x-style read failed: {e}")

    # 再试 pandas HDFStore
    try:
        print(f"[detect] {os.path.basename(path)} will be read via pandas HDFStore (row-chunk).")
        return _read_pandas_hdf_to_adata(path, chunk_rows=chunk_rows)
    except Exception as e:
        raise RuntimeError(f"Unsupported HDF5 structure for {path}: {e}")

# -------------------- 主流程：读取三个文件为 AnnData --------------------
def load_triplet_to_adata(
    path_train_inp: str,
    path_train_tgt: str,
    path_test_inp : str,
    chunk_rows_inputs: Optional[int] = None,
    chunk_rows_targets: Optional[int] = None,
    chunk_rows_test   : Optional[int] = None,
):
    print(">> Loading TRAIN_INPUTS ...")
    adata_train_inp = load_any_h5_to_adata(path_train_inp, chunk_rows=chunk_rows_inputs)
    print(f"   TRAIN_INPUTS shape: {adata_train_inp.shape}")

    print(">> Loading TRAIN_TARGETS ...")
    adata_train_tgt = load_any_h5_to_adata(path_train_tgt, chunk_rows=chunk_rows_targets)
    print(f"   TRAIN_TARGETS shape: {adata_train_tgt.shape}")

    print(">> Loading TEST_INPUTS ...")
    adata_test_inp = load_any_h5_to_adata(path_test_inp, chunk_rows=chunk_rows_test)
    print(f"   TEST_INPUTS shape: {adata_test_inp.shape}")

    return adata_train_inp, adata_train_tgt, adata_test_inp


adata_train_inp, adata_train_tgt, adata_test_inp = load_triplet_to_adata(
    PATH_TRAIN_INP, PATH_TRAIN_TGT, PATH_TEST_INP,
    chunk_rows_inputs=100,
    chunk_rows_targets=100,
    chunk_rows_test=100,
)

>> Loading TRAIN_INPUTS ...
[detect] train_multi_inputs.h5 will be read via pandas HDFStore (row-chunk).
[read_hdf] key=/train_multi_inputs, nrows=unknown (fixed), ncols=228942, chunk_rows=100
  read rows: 100 ...
  read rows: 200 ...
  read rows: 300 ...
  read rows: 400 ...
  read rows: 500 ...
  read rows: 600 ...
  read rows: 700 ...
  read rows: 800 ...
  read rows: 900 ...
  read rows: 1000 ...
  read rows: 1100 ...
  read rows: 1200 ...
  read rows: 1300 ...
  read rows: 1400 ...
  read rows: 1500 ...
  read rows: 1600 ...
  read rows: 1700 ...
  read rows: 1800 ...
  read rows: 1900 ...
  read rows: 2000 ...
  read rows: 2100 ...
  read rows: 2200 ...
  read rows: 2300 ...
  read rows: 2400 ...
  read rows: 2500 ...
  read rows: 2600 ...
  read rows: 2700 ...
  read rows: 2800 ...
  read rows: 2900 ...
  read rows: 3000 ...
  read rows: 3100 ...
  read rows: 3200 ...
  read rows: 3300 ...
  read rows: 3400 ...
  read rows: 3500 ...
  read rows: 3600 ...
  read rows: 3700 ...
  

In [5]:
# -------------------- 2. 核心清洗: train_multi_inputs (Clean train_multi_inputs) --------------------
print("\n>> Step 2: Applying filters to 'train_multi_inputs'...")

# --- 2.1 细胞过滤：过滤掉表达特征数较少的细胞 (后5%) ---
print(f"Filtering cells with fewest expressed features (bottom {LOW_COUNT_CELL_PERCENTILE}%)...")
n_cells_before = adata_train_inp.n_obs

# 计算每个细胞表达了多少个特征 (ATAC peaks)
# (adata.X > 0) 会创建一个布尔稀疏矩阵，.sum(axis=1) 高效计算
n_feats_per_cell = np.asarray((adata_train_inp.X > 0).sum(axis=1)).ravel()
# 计算过滤阈值
cell_filter_threshold = np.percentile(n_feats_per_cell, LOW_COUNT_CELL_PERCENTILE)
# AnnData对象支持直接使用布尔列表进行切片
adata_train_inp = adata_train_inp[n_feats_per_cell >= cell_filter_threshold, :]
print(f"Cells before: {n_cells_before}, Cells after: {adata_train_inp.n_obs}")

# 保存保留下来的细胞ID，用于后续同步过滤
cells_kept_index = adata_train_inp.obs_names


# --- 2.2 特征过滤：过滤低变和低表达广度的特征 ---
n_feats_before = adata_train_inp.n_vars

# a) 过滤低变特征 (方差后30%)
print(f"Filtering features with lowest variance (bottom {LOW_VARIANCE_FEAT_PERCENTILE}%)...")
# 高效计算稀疏矩阵的方差: Var(X) = E[X^2] - (E[X])^2
mean_sq = np.asarray(adata_train_inp.X.power(2).mean(axis=0)).ravel()
mean = np.asarray(adata_train_inp.X.mean(axis=0)).ravel()
feat_variances = mean_sq - mean**2
variance_threshold = np.percentile(feat_variances, LOW_VARIANCE_FEAT_PERCENTILE)
variance_mask = feat_variances >= variance_threshold

# b) 过滤在少数细胞中出现的特征 (后10%)
print(f"Filtering features expressed in fewest cells (bottom {LOW_COUNT_FEAT_PERCENTILE}%)...")
n_cells_per_feat = np.asarray((adata_train_inp.X > 0).sum(axis=0)).ravel()
feat_cell_count_threshold = np.percentile(n_cells_per_feat, LOW_COUNT_FEAT_PERCENTILE)
cell_count_mask = n_cells_per_feat >= feat_cell_count_threshold

# c) 合并特征过滤器并应用
feats_to_keep_mask = variance_mask & cell_count_mask
adata_train_inp = adata_train_inp[:, feats_to_keep_mask]
print(f"Features before: {n_feats_before}, Features after: {adata_train_inp.n_vars}")

# 保存保留下来的特征ID
feats_kept_columns = adata_train_inp.var_names


>> Step 2: Applying filters to 'train_multi_inputs'...
Filtering cells with fewest expressed features (bottom 5%)...
Cells before: 105942, Cells after: 100646
Filtering features with lowest variance (bottom 30%)...
Filtering features expressed in fewest cells (bottom 10%)...
Features before: 228942, Features after: 160259


In [6]:
# -------------------- 3. 同步清洗其他文件 (Apply Filters to Other Files) --------------------
print("\n>> Step 3: Applying consistent filters to other anndata objects...")

# --- 3.1 过滤 train_multi_targets ---
print("Filtering cells in 'train_multi_targets'...")
n_cells_before = adata_train_tgt.n_obs
# 直接使用索引进行切片
adata_train_tgt = adata_train_tgt[cells_kept_index, :]
print(f"Cells before: {n_cells_before}, Cells after: {adata_train_tgt.n_obs}")

# --- 3.2 过滤 test_multi_inputs ---
print("Filtering features in 'test_multi_inputs'...")
n_feats_before = adata_test_inp.n_vars
# 找到共同特征
common_feats = feats_kept_columns.intersection(adata_test_inp.var_names)
# 应用切片
adata_test_inp = adata_test_inp[:, common_feats]
adata_train_inp = adata_train_inp[:, common_feats] # 再次对齐训练集，确保顺序一致
print(f"Features before: {n_feats_before}, Features after: {adata_test_inp.n_vars}")


>> Step 3: Applying consistent filters to other anndata objects...
Filtering cells in 'train_multi_targets'...
Cells before: 105942, Cells after: 100646
Filtering features in 'test_multi_inputs'...
Features before: 228942, Features after: 160259


In [7]:
# -------------------- 4. 存储为.h5ad (Save to .h5ad) --------------------
# --- 输出文件路径 ---
PATH_PROCESSED_TRAIN_INP = "train_multi_inputs.h5ad"
PATH_PROCESSED_TRAIN_TGT = "train_multi_targets.h5ad"
PATH_PROCESSED_TEST_INP  = "test_multi_inputs.h5ad"

print("\n>> Step 4: Saving final AnnData objects to .h5ad files...")

adata_train_inp.write_h5ad(PATH_PROCESSED_TRAIN_INP, compression="gzip")
print(f"Saved: {PATH_PROCESSED_TRAIN_INP}")

adata_train_tgt.write_h5ad(PATH_PROCESSED_TRAIN_TGT, compression="gzip")
print(f"Saved: {PATH_PROCESSED_TRAIN_TGT}")

adata_test_inp.write_h5ad(PATH_PROCESSED_TEST_INP, compression="gzip")
print(f"Saved: {PATH_PROCESSED_TEST_INP}")

print("\n--- Multiome preprocessing complete. ---")
print(f"Final train inputs shape: {adata_train_inp.shape}")
print(f"Final train targets shape: {adata_train_tgt.shape}")
print(f"Final test inputs shape: {adata_test_inp.shape}")


>> Step 4: Saving final AnnData objects to .h5ad files...
Saved: processed_train_multi_inputs.h5ad
Saved: processed_train_multi_targets.h5ad
Saved: processed_test_multi_inputs.h5ad

--- Multiome preprocessing complete. ---
Final train inputs shape: (100646, 160259)
Final train targets shape: (100646, 23418)
Final test inputs shape: (55935, 160259)
