In [None]:
# Write the prediction results of both the cite and multi datasets into one submission file
import gc
import numpy as np
import pandas as pd
import anndata as ad

# ----------------------------- Config -----------------------------
N_CHUNKS = 4                      # number of cell chunks per dataset; adjust based on RAM
SUBMISSION_PATH = "submission.csv"

PATH_EVAL_IDS = "evaluation_ids.csv"

# CITE paths
PATH_TEST_CITE_INP  = "test_cite_inputs.h5ad"
PATH_TRAIN_CITE_TGT = "train_cite_targets.h5ad"
PATH_SUB_PRED_CITE  = "sub_preds_cite.npy"   # shape: [n_cite_cells, n_proteins]

# MULTI paths
PATH_TEST_MULTI_INP  = "test_multi_inputs.h5ad"
PATH_TRAIN_MULTI_TGT = "train_multi_targets.h5ad"
PATH_SUB_PRED_MULTI  = "sub_preds_multi.npy" # shape: [n_multi_cells, n_genes]

# ----------------------- Load global evaluation map ----------------
print("--- Step 1: Loading necessary IDs and test predictions ---")
try:
    # Read both ID columns as strings so they are never coerced to a numeric
    # dtype and compare cleanly with the string obs/var names loaded below.
    evaluation_ids = pd.read_csv(
        PATH_EVAL_IDS, dtype={"cell_id": str, "gene_id": str}
    )
except FileNotFoundError as err:
    # Re-raise with a clearer message while keeping the original cause attached.
    raise FileNotFoundError(
        f"Error: '{PATH_EVAL_IDS}' not found. This file is required."
    ) from err

# ---------------------------- Load CITE ----------------------------
print("\n[ CITE ] Loading IDs ...")
# Only the observation names (cell ids) are needed from the test inputs;
# the AnnData object is dropped right away to release memory.
adata_cite_inp = ad.read_h5ad(PATH_TEST_CITE_INP)
cite_cell_ids = pd.Index(adata_cite_inp.obs_names.astype(str))
del adata_cite_inp; gc.collect()

# Only the variable names (target column order) are needed from the train targets.
adata_cite_tgt = ad.read_h5ad(PATH_TRAIN_CITE_TGT)
cite_target_ids = pd.Index(adata_cite_tgt.var_names.astype(str))  # proteins
del adata_cite_tgt; gc.collect()

print("[ CITE ] Loading predictions (memmap) ...")
# mmap_mode='r' keeps the array on disk and pages data in on access.
sub_preds_cite = np.load(PATH_SUB_PRED_CITE, mmap_mode='r')  # (n_cells, n_targets)

# --------------------------- Load MULTI ----------------------------
print("\n[ MULTI ] Loading IDs ...")
adata_multi_inp = ad.read_h5ad(PATH_TEST_MULTI_INP)
multi_cell_ids = pd.Index(adata_multi_inp.obs_names.astype(str))
del adata_multi_inp; gc.collect()

adata_multi_tgt = ad.read_h5ad(PATH_TRAIN_MULTI_TGT)
multi_target_ids = pd.Index(adata_multi_tgt.var_names.astype(str))  # genes
del adata_multi_tgt; gc.collect()

print("[ MULTI ] Loading predictions (memmap) ...")
sub_preds_multi = np.load(PATH_SUB_PRED_MULTI, mmap_mode='r')  # (n_cells, n_targets)

# ---------------------- Quick sanity assertions --------------------
# The prediction matrices must line up with the id vectors: rows with test
# cells, columns with training targets.
assert sub_preds_cite.shape[0]  == len(cite_cell_ids),  \
    f"CITE cells mismatch: preds {sub_preds_cite.shape[0]} vs ids {len(cite_cell_ids)}"
assert sub_preds_cite.shape[1]  == len(cite_target_ids), \
    f"CITE targets mismatch: preds {sub_preds_cite.shape[1]} vs ids {len(cite_target_ids)}"
assert sub_preds_multi.shape[0] == len(multi_cell_ids), \
    f"MULTI cells mismatch: preds {sub_preds_multi.shape[0]} vs ids {len(multi_cell_ids)}"
assert sub_preds_multi.shape[1] == len(multi_target_ids), \
    f"MULTI targets mismatch: preds {sub_preds_multi.shape[1]} vs ids {len(multi_target_ids)}"

# ---------------------- Helper: chunked writer ---------------------
def write_dataset_in_chunks(
    name: str,
    preds_memmap: np.memmap,
    cell_ids: pd.Index,
    target_ids: pd.Index,
    eval_ids: pd.DataFrame,
    submission_path: str,
    n_chunks: int = 4,
    write_header: bool = False,
):
    """
    Write predictions for one dataset (CITE or MULTI) into submission_path in chunks.

    Only rows of ``eval_ids`` whose ``cell_id`` is in ``cell_ids`` and whose
    ``gene_id`` is in ``target_ids`` are written, as ``row_id,target`` lines.
    Within a chunk, rows keep the order they have in ``eval_ids``.

    Each evaluation row is mapped straight to a (row, column) position in the
    prediction matrix, so no cells x targets long-format frame is built and
    the result does not depend on the ``name`` attribute of ``cell_ids``.

    Args:
        name: Label used in the progress messages.
        preds_memmap: 2-D predictions, rows aligned with ``cell_ids`` and
            columns aligned with ``target_ids``.
        cell_ids: Unique cell identifiers, one per prediction row.
        target_ids: Unique target identifiers, one per prediction column.
        eval_ids: Frame with the columns ``row_id``, ``cell_id`` and ``gene_id``.
        submission_path: CSV file to create or append to.
        n_chunks: Number of contiguous cell ranges to process; values below 1
            are treated as 1. The last chunk takes the remainder.
        write_header: If True, the first chunk overwrites ``submission_path``
            and writes the header row; otherwise every chunk is appended.

    Raises:
        ValueError: If ``cell_ids`` or ``target_ids`` contains duplicates, since
            a positional lookup would then be ambiguous.
    """
    print(f"\n--- Step 2 ({name}) : Processing test set predictions in chunks ---")

    cell_idx = pd.Index(cell_ids)
    target_idx = pd.Index(target_ids)
    if not (cell_idx.is_unique and target_idx.is_unique):
        raise ValueError(f"[{name}] cell_ids and target_ids must not contain duplicates.")

    # Row position of every evaluation row in the prediction matrix (-1 = the
    # cell belongs to another dataset). Filter on it first so the column
    # lookup only runs over this dataset's rows.
    row_pos = cell_idx.get_indexer(eval_ids["cell_id"])
    in_dataset = row_pos >= 0
    row_pos = row_pos[in_dataset]
    col_pos = target_idx.get_indexer(eval_ids.loc[in_dataset, "gene_id"].astype(str))
    row_ids = eval_ids["row_id"].to_numpy()[in_dataset]

    known_target = col_pos >= 0
    if not known_target.any():
        print(f"[{name}] No matching (cell_id, gene_id) in evaluation_ids. Skipping.")
        if write_header:
            # Still start a fresh file so later appends do not land in a stale
            # or header-less file.
            pd.DataFrame(columns=["row_id", "target"]).to_csv(submission_path, index=False)
        return

    # Drop evaluation rows whose target does not exist in this dataset
    row_pos = row_pos[known_target]
    col_pos = col_pos[known_target]
    row_ids = row_ids[known_target]

    print(f"[{name}] Cells: {len(cell_ids)} | Targets (all): {len(target_ids)} | "
          f"Targets (evaluated): {len(np.unique(col_pos))}")

    # Chunking
    n = len(cell_idx)
    n_chunks = max(1, int(n_chunks))
    base = n // n_chunks
    edges = [(i * base, (i + 1) * base) for i in range(n_chunks - 1)]
    edges.append(((n_chunks - 1) * base, n))  # last chunk takes remainder

    for i, (start_idx, end_idx) in enumerate(edges, start=1):
        print(f"\n[{name}] --- Processing Chunk {i}/{n_chunks} ---")
        print(f"[{name}] Processing cells from index {start_idx} to {end_idx - 1} ...")

        # Evaluation rows whose cell falls inside the current cell range
        in_chunk = (row_pos >= start_idx) & (row_pos < end_idx)

        # Paired integer indexing copies exactly the requested elements out of
        # the (possibly memory-mapped) prediction matrix.
        chunk_out = pd.DataFrame({
            "row_id": row_ids[in_chunk],
            "target": np.asarray(preds_memmap[row_pos[in_chunk], col_pos[in_chunk]]),
        })

        # Write/append: only the first chunk of a header-writing call truncates
        is_first_write = bool(write_header and i == 1)
        chunk_out.to_csv(
            submission_path,
            index=False,
            mode="w" if is_first_write else "a",
            header=is_first_write,
        )

        # Release the chunk before building the next one
        del chunk_out, in_chunk

    print(f"\n[{name}] Finished writing to '{submission_path}'.")

# ------------------- Write CITE first, then MULTI ------------------
# Each job is (label, predictions, cell ids, target ids, starts_new_file).
# CITE runs first and creates the file with its header; MULTI is appended.
_dataset_jobs = (
    ("CITE", sub_preds_cite, cite_cell_ids, cite_target_ids, True),
    ("MULTI", sub_preds_multi, multi_cell_ids, multi_target_ids, False),
)

for _label, _preds, _cells, _targets, _starts_new_file in _dataset_jobs:
    write_dataset_in_chunks(
        name=_label,
        preds_memmap=_preds,
        cell_ids=_cells,
        target_ids=_targets,
        eval_ids=evaluation_ids,
        submission_path=SUBMISSION_PATH,
        n_chunks=N_CHUNKS,
        write_header=_starts_new_file,
    )

print(f"\n--- Done. Submission file written to '{SUBMISSION_PATH}' ---")

--- Step 1: Loading necessary IDs and test predictions ---

[ CITE ] Loading IDs ...
[ CITE ] Loading predictions (memmap) ...

[ MULTI ] Loading IDs ...
[ MULTI ] Loading predictions (memmap) ...

--- Step 2 (CITE) : Processing test set predictions in chunks ---
[CITE] Cells: 48663 | Targets (all): 140 | Targets (evaluated): 140

[CITE] --- Processing Chunk 1/4 ---
[CITE] Processing cells from index 0 to 12164 ...

[CITE] --- Processing Chunk 2/4 ---
[CITE] Processing cells from index 12165 to 24329 ...

[CITE] --- Processing Chunk 3/4 ---
[CITE] Processing cells from index 24330 to 36494 ...

[CITE] --- Processing Chunk 4/4 ---
[CITE] Processing cells from index 36495 to 48662 ...

[CITE] Finished writing to 'submission.csv'.

--- Step 2 (MULTI) : Processing test set predictions in chunks ---
[MULTI] Cells: 55935 | Targets (all): 23418 | Targets (evaluated): 23418

[MULTI] --- Processing Chunk 1/4 ---
[MULTI] Processing cells from index 0 to 13982 ...

[MULTI] --- Processing Chunk 2/

In [4]:
# Write the prediction results of only the cite or only the multi dataset
import gc
import numpy as np
import pandas as pd
import anndata as ad

def write_submission_part(name: str, n_chunks: int = 4):
    """
    Write the evaluated predictions of one dataset to ``submission_{name}.csv``.

    Reads ``sub_preds_{name}.npy`` as a memory map, looks up every
    (cell_id, gene_id) pair of ``evaluation_ids.csv`` that belongs to this
    dataset and writes ``row_id,target`` lines. The first chunk overwrites the
    output file (mode 'w', with header); later chunks are appended (mode 'a').

    Each evaluation row is mapped straight to a (row, column) position in the
    prediction matrix, so no cells x targets long-format frame is built and
    the result does not depend on the name of the obs index.

    Args:
        name: 'cite' or 'multi'.
        n_chunks: Number of contiguous cell ranges to process; values below 1
            are treated as 1. The last chunk takes the remainder.

    Raises:
        ValueError: If the cell ids or target ids contain duplicates, since a
            positional lookup would then be ambiguous.
    """
    assert name in {"cite", "multi"}

    # Path mapping
    PATH_EVAL_IDS = "evaluation_ids.csv"
    TEST_INP = {
        "cite": "test_cite_inputs.h5ad",
        "multi": "test_multi_inputs.h5ad",
    }
    TRAIN_TGT = {
        "cite": "train_cite_targets.h5ad",
        "multi": "train_multi_targets.h5ad",
    }
    SUB_PRED = {
        "cite": "sub_preds_cite.npy",
        "multi": "sub_preds_multi.npy",
    }
    OUT_PATH = f"submission_{name}.csv"

    # Load the evaluation map (ids forced to str so they match obs/var names)
    evaluation_ids = pd.read_csv(PATH_EVAL_IDS, dtype={"cell_id": str, "gene_id": str})

    # Load the test-set cell ids
    adata_inp = ad.read_h5ad(TEST_INP[name])
    cell_ids = pd.Index(adata_inp.obs_names.astype(str))
    del adata_inp; gc.collect()

    # Load the targets (column-name order)
    adata_tgt = ad.read_h5ad(TRAIN_TGT[name])
    target_ids = pd.Index(adata_tgt.var_names.astype(str))
    del adata_tgt; gc.collect()

    # Load the predictions (memmap)
    preds = np.load(SUB_PRED[name], mmap_mode="r")
    # Basic alignment checks (comment out if not needed)
    assert preds.shape[0] == len(cell_ids), f"{name}: cell数不匹配"
    assert preds.shape[1] == len(target_ids), f"{name}: target数不匹配"

    if not (cell_ids.is_unique and target_ids.is_unique):
        raise ValueError(f"{name}: cell ids and target ids must not contain duplicates")

    # Row position of every evaluation row in the prediction matrix (-1 = the
    # cell belongs to the other dataset). Filter on it first so the column
    # lookup only runs over this dataset's rows.
    row_pos = cell_ids.get_indexer(evaluation_ids["cell_id"])
    in_dataset = row_pos >= 0
    row_pos = row_pos[in_dataset]
    col_pos = target_ids.get_indexer(evaluation_ids.loc[in_dataset, "gene_id"])
    row_ids = evaluation_ids["row_id"].to_numpy()[in_dataset]
    del evaluation_ids; gc.collect()

    # Drop evaluation rows whose target does not exist in this dataset
    known_target = col_pos >= 0
    row_pos = row_pos[known_target]
    col_pos = col_pos[known_target]
    row_ids = row_ids[known_target]

    # Chunking
    n = len(cell_ids)
    n_chunks = max(1, int(n_chunks))
    base = n // n_chunks
    edges = [(i * base, (i + 1) * base) for i in range(n_chunks - 1)]
    edges.append(((n_chunks - 1) * base, n))

    for i, (s, e) in enumerate(edges, start=1):
        # Evaluation rows whose cell falls inside the current cell range
        in_chunk = (row_pos >= s) & (row_pos < e)

        # Paired integer indexing copies exactly the requested elements out of
        # the memory-mapped prediction matrix.
        chunk_out = pd.DataFrame({
            "row_id": row_ids[in_chunk],
            "target": np.asarray(preds[row_pos[in_chunk], col_pos[in_chunk]]),
        })

        mode = "w" if i == 1 else "a"     # first chunk overwrites, the rest append
        header = (i == 1)
        chunk_out.to_csv(OUT_PATH, index=False, mode=mode, header=header)

        del chunk_out, in_chunk
        gc.collect()

    print(f"[{name}] wrote -> {OUT_PATH}")

# Produce one submission file per dataset, cite first and then multi.
for _part in ("cite", "multi"):
    write_submission_part(_part)

[cite] wrote -> submission_cite.csv
[multi] wrote -> submission_multi.csv


In [6]:
# Merge the prediction results of the cite and multi datasets
def merge_submissions():
    """
    Concatenate 'submission_cite.csv' and 'submission_multi.csv' into
    'submission.csv', cite rows first, without performing any validation.
    """
    part_paths = ("submission_cite.csv", "submission_multi.csv")
    parts = [pd.read_csv(path) for path in part_paths]
    combined = pd.concat(parts, ignore_index=True)
    combined.to_csv("submission.csv", index=False)
    print("merged -> submission.csv")

# Combine submission_cite.csv and submission_multi.csv into submission.csv.
merge_submissions()

merged -> submission.csv
