In [1]:
import os
os.environ["OMP_NUM_THREADS"]="4"
os.environ["MKL_NUM_THREADS"]="4"
os.environ["VECLIB_MAXIMUM_THREADS"]="4"
os.environ["NUMEXPR_NUM_THREADS"]="4"
os.environ["TF_CPP_MIN_LOG_LEVEL"]="2"

In [2]:
import sys
import scanpy as sc

sys.path.append("../..")
from scripts.runner_models import cv_dca_5fold

In [None]:
from pathlib import Path
import h5py
import numpy as np

def _b2s(x):
    return x.decode("utf-8") if isinstance(x, (bytes, np.bytes_)) else x

def _bytes_to_str_arr(a):
    return np.array([_b2s(v) for v in a], dtype=object)

def _flatten_categoricals(table_group):
    """Turn pandas-categorical groups into plain utf-8 string datasets."""
    to_convert = []
    for k, v in list(table_group.items()):
        if isinstance(v, h5py.Group) and "codes" in v and "categories" in v:
            to_convert.append(k)
    for k in to_convert:
        g = table_group[k]
        codes = g["codes"][()]
        cats  = _bytes_to_str_arr(g["categories"][()])
        out   = np.empty(codes.shape[0], dtype=object)
        mask  = codes >= 0
        out[mask]  = cats[codes[mask]]
        out[~mask] = ""
        del table_group[k]
        dt = h5py.string_dtype(encoding="utf-8")
        table_group.create_dataset(k, data=out.astype(dt), dtype=dt)

def _purge_dict_groups(h5, path="/"):
    """Recursively delete any group with attrs['encoding-type']=='dict'."""
    grp = h5[path]
    # copy keys to avoid changing during iteration
    for name in list(grp.keys()):
        obj = grp[name]
        if isinstance(obj, h5py.Group):
            enc = _b2s(obj.attrs.get("encoding-type", None))
            if enc == "dict":
                del grp[name]
            else:
                _purge_dict_groups(h5, obj.name)

def fix_h5ad_for_anndata07(base_path: Path):
    print(f"[fix] Patching {base_path}")
    with h5py.File(base_path, "a") as f:
        # Drop common modern containers outright (not needed for DCA)
        for g in ("layers", "obsp", "varp", "obsm", "varm"):
            if g in f:
                print(f" - delete /{g}")
                del f[g]
        if "raw" in f and isinstance(f["raw"], h5py.Group) and "layers" in f["raw"]:
            print(" - delete /raw/layers")
            del f["raw"]["layers"]

        # Nuke any leftover dict-encoded groups anywhere (e.g., under /uns, nested)
        _purge_dict_groups(f, "/")

        # Flatten categoricals in obs/var
        if "obs" in f: _flatten_categoricals(f["obs"])
        if "var" in f: _flatten_categoricals(f["var"])
    print("[fix] Done.")

# Apply to both converted bases
for p in [
    Path("../data/raw_count/converted/GSE169569/base_legacy.h5ad"),
    Path("../data/raw_count/converted/GSE228841/base_legacy.h5ad"),
]:
    fix_h5ad_for_anndata07(p)


[fix] Patching ../data/raw_count/converted/GSE169569/base_legacy.h5ad


OSError: Unable to create file (unable to open file: name = '../data/raw_count/converted/GSE169569/base_legacy.h5ad', errno = 2, error message = 'No such file or directory', flags = 15, o_flags = a02)

In [5]:
import anndata as ad
import json
from scipy import sparse
from pathlib import Path

def load_converted_base_then_layers(root_dir: str):
    root = Path(root_dir)
    adata = ad.read_h5ad(root / "base_legacy.h5ad")  # should work now

    with open(root / "layers_manifest.json") as f:
        mf = json.load(f)

    for layer in mf["layers"]:
        lname = layer["name"]
        M = sparse.load_npz(root / layer["path"]).tocsr()
        if M.shape != adata.shape:
            raise ValueError(f"Layer {lname} shape {M.shape} != {adata.shape}")
        adata.layers[lname] = M
    return adata

adata_neu = load_converted_base_then_layers("../data/raw_count/converted/GSE169569")
adata_cov = load_converted_base_then_layers("../data/raw_count/converted/GSE228841")

print(adata_neu)
print("layers(neu):", list(adata_neu.layers.keys()))


AnnData object with n_obs × n_vars = 447 × 51777
    obs: 'total_counts_before_preprocessing', 'total_counts_after_trimming', 'total_counts_after_preprocessing', 'total_count_ratio__after_to_before', 'QC_mapping_ratio_bacterial', 'QC_mapping_ratio_viral', 'QC_mapping_ratio_miRNA', 'QC_mapping_ratio_sncRNA', 'Bases', 'Bytes', 'Avg_spot_length', 'BioProject', 'BioSample', 'Experiment', 'GEO_Accession_exp', 'SRA_study', 'Sample_name', 'Consent', 'Center_name', 'Organism', 'source_name', 'cohort', 'sex', 'age', 'treatment', 'diagnosis', 'Assay_type', 'Sequencer', 'Sample_type', 'Lab_library_layout', 'Lab_library_selection', 'Lab_library_source', 'Lab_RNA_extraction_protocol', 'Lab_Blocking_State', 'Lab_library_preparation_kit', 'Release_date', 'create_date', 'version', 'total_count'
    var: 'length', 'g_fraction', 'a_fraction', 't_fraction', 'c_fraction', 'gc_fraction', 'MFE_37', 'spikein', 'qiaseq_spikein', 'hbdx_spikein', 'hbdx_spikein_prefix_length', 'hbdx_spikein_suffix_length', 'hbdx

In [6]:
import numpy as np

X_counts = adata_neu.layers["counts"]
norm_layer = adata_neu.layers["log2_1p_CPM_original"]
if sparse.issparse(X_counts):
    X_counts = X_counts.A  # same as .toarray()
if sparse.issparse(norm_layer):
    norm_layer = norm_layer.A
gene_names = np.array(adata_neu.var_names, dtype=str)
batches = np.array(adata_neu.obs["BioProject"], dtype=str)
norm_layer = adata_neu.layers["log2_1p_CPM_original"]

In [None]:
import optuna

def objective(trial: optuna.Trial):
    hidden_size = [
        trial.suggest_categorical("h1", [32, 64, 128]),
        trial.suggest_categorical("h2", [16, 32, 64]),
        trial.suggest_categorical("h3", [32, 64, 128]),
    ]
    epochs = trial.suggest_categorical("epochs", [50, 100, 150])
    batch_size = trial.suggest_categorical("batch_size", [64, 128])
    n_hvg = trial.suggest_categorical("n_hvg", [800, 1000, 1500, 2000])

    summary, _ = cv_dca_5fold(
        X_counts, k=3,
        dca_params=dict(hidden_size=hidden_size, epochs=epochs, batch_size=batch_size),
        n_hvg=int(n_hvg), R=1,
        mask_frac=0.10, thinning_p=0.10, random_state=123,
        hvg_mode="seurat_v3",
        gene_names=gene_names, batches=batches,
        norm_layer=norm_layer, batch_key="BioProject",
        seurat_layer_name="log2_1p_CPM_original",
        save_dir=None
    )
    # choose your target metric
    return float(summary["NB_ll_zero"].mean())

study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=123))
study.optimize(objective, n_trials=20, gc_after_trial=True)

print("Best params:", study.best_params)

# Final retrain with “fuller” settings
best = study.best_params
final_summary, final_details = cv_dca_5fold(
    X_counts, k=5,
    dca_params=dict(hidden_size=[best["h1"], best["h2"], best["h3"]],
                    epochs=300, batch_size=best["batch_size"]),
    n_hvg=int(best["n_hvg"]), R=3,
    mask_frac=0.10, thinning_p=0.10, random_state=123,
    hvg_mode="seurat_v3",
    gene_names=gene_names, batches=batches,
    norm_layer=norm_layer, batch_key="BioProject",
    seurat_layer_name="log2_1p_CPM_original",
    save_dir="results/dca_full"
)

  from .autonotebook import tqdm as notebook_tqdm
[I 2025-09-14 19:02:44,543] A new study created in memory with name: no-name-afb3a28f-7a5b-4f3b-9ef7-4ae787ecc2f9


2025-09-14 19:02:44 | [cv_dca_5fold] start k=3 n_hvg=1000 mode=seurat_v3 R=1 mask_frac=0.1 thinning_p=0.1
2025-09-14 19:02:44 | X_counts: dense  (447, 51777) dtype=int64
2025-09-14 19:02:44 | X_counts: zeros=66.23%, approx_int=True
2025-09-14 19:02:44 | [time] [DCA] fold 1/3 start
2025-09-14 19:02:45 | X_tr: dense  (298, 51777) dtype=int64
2025-09-14 19:02:45 | X_va: dense  (149, 51777) dtype=int64




2025-09-14 19:02:45 | [DCA] fold 1 | HVG=1000
2025-09-14 19:02:45 | [DCA] fit hidden=[32, 32, 32] epochs=150 bs=64
2025-09-14 19:02:45 | X_train: dense  (298, 1000) dtype=int64
2025-09-14 19:02:45 | [time] [DCA] fold 1 predict mu_va start


In [None]:
# # (A) Simple: variance HVG
# summary_dca, details_dca = cv_dca_5fold(
#     X_counts,
#     k=5,
#     dca_params=dict(hidden_size=[64,32,64], epochs=300, batch_size=128),
#     n_hvg=2000,
#     R=3,                     # repeats per masking protocol
#     mask_frac=0.10,          # Nonzero Zeroing: 10% of nonzeros masked per gene
#     thinning_p=0.10,         # Binomial Thinning: 10% held out
#     random_state=123,
#     hvg_mode="variance",
#     labels=celltype_labels,  # optional
#     save_dir="results/dca"   # CSVs will be written here
# )





# (B) Batch-aware HVG (Seurat v3)
summary_dca, details_dca = cv_dca_5fold(
    X_counts,
    k=5,
    dca_params=dict(hidden_size=[64,32,64], epochs=300, batch_size=128),
    n_hvg=2000,
    R=3,
    mask_frac=0.10,
    thinning_p=0.10,
    random_state=123,
    hvg_mode="seurat_v3",
    gene_names=gene_names,
    batches=batches,
    norm_layer=norm_layer,
    batch_key="BioProject",
    seurat_layer_name="log2_1p_CPM_original",
    # labels=celltype_labels,        # optional
    save_dir="results/dca"         # CSVs will be written here
)


