In [1]:
import sys
import scanpy as sc

# Extend the import path two directories up from the kernel's working directory so the
# local `scripts` package can be imported below. The path is relative, so this only
# works when the kernel is started from this notebook's folder.
sys.path.append("../..")
from scripts.runner_models import cv_dca_5fold

In [2]:
# --- Run this cell BEFORE anything imports dca ---
# Purpose: make `keras.optimizers` hand out the TF "legacy" optimizer classes, because the
# DCA run in this notebook failed with
#   AttributeError: 'RMSprop' object has no attribute 'get_updates'
# when it was given the current optimizer implementation.
import sys, types
import warnings
import tensorflow as tf
import tensorflow.keras.optimizers.legacy as legacy_opt

# Optimizer names that are redirected to their legacy implementation (when it exists).
_LEGACY_OPTIMIZER_NAMES = (
    "SGD", "RMSprop", "Adam", "Adagrad", "Adadelta", "Adamax", "Nadam", "Ftrl", "Optimizer",
)

# Whatever is registered right now: the real module on a first run, our own shim on a
# re-run. Unwrap a previous shim so re-running the cell never chains shim -> shim.
_registered = sys.modules.get("keras.optimizers")
_real_optimizers = getattr(_registered, "_shim_fallback_module", _registered)

# Build a tiny shim module so `from keras.optimizers import RMSprop` etc. pick legacy classes
shim = types.ModuleType("keras.optimizers")
shim._shim_fallback_module = _real_optimizers
for name in _LEGACY_OPTIMIZER_NAMES:
    if hasattr(legacy_opt, name):
        setattr(shim, name, getattr(legacy_opt, name))

if _real_optimizers is not None:
    # Module-level __getattr__ is only consulted for names the shim does not define, so
    # everything we did not override (e.g. helpers, submodules) still resolves to the
    # real module instead of raising AttributeError.
    def _fallback_getattr(attr, _target=_real_optimizers):
        return getattr(_target, attr)

    shim.__getattr__ = _fallback_getattr

# Ensure our shim is used for future imports
sys.modules["keras.optimizers"] = shim

# `import keras.optimizers as opt` and `keras.optimizers.X` are resolved through the
# attribute on the already-imported parent package before sys.modules is consulted, so
# the parent attribute has to point at the shim as well.
# NOTE(review): this assumes DCA reaches the optimizers through the `keras` package;
# confirm against the installed dca sources.
_keras_pkg = sys.modules.get("keras")
if _keras_pkg is not None:
    _keras_pkg.optimizers = shim

# (Optional) make sure GPU mem growth is on. Best effort: a failure must not stop the
# notebook, but it is reported instead of being silently discarded.
try:
    for g in tf.config.list_physical_devices("GPU"):
        tf.config.experimental.set_memory_growth(g, True)
except Exception as exc:
    warnings.warn(f"Could not enable GPU memory growth: {exc!r}", RuntimeWarning)


2025-09-16 22:47:39.495275: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-09-16 22:47:40.734945: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-09-16 22:47:40.737872: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-09-16 22:47:40.938878: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-09-16 22:47:41.416534: I tensorflow/core/platform/cpu_feature_guar

In [3]:
# Read the two raw-count AnnData files; each path is kept next to the object it loads.
IN_PATH_neu = "../../data/raw_count/GSE169569_raw_counts.h5ad"
adata_neu = sc.read_h5ad(IN_PATH_neu)

IN_PATH_cov = "../../data/raw_count/GSE228841_raw_counts.h5ad"
adata_cov = sc.read_h5ad(IN_PATH_cov)

In [None]:
# --- PATCH PyYAML load() so callers that omit `Loader` keep working ---
# The DCA run in this notebook failed with
#   TypeError: load() missing 1 required positional argument: 'Loader'
# while executing `from dca.api import dca`, so this cell has to run BEFORE the first
# cv_dca_5fold(...) call. It is safe to re-run: an already patched yaml.load is left alone.
import yaml


def _with_default_loader(load_fn, default_loader):
    """Wrap ``load_fn`` so a missing ``Loader`` argument falls back to ``default_loader``.

    The original function is captured in the closure rather than looked up as a module
    global at call time; re-running the cell therefore cannot make the wrapper call
    itself recursively.
    """
    def _patched_load(stream, Loader=None, *args, **kwargs):
        if Loader is None:
            Loader = default_loader
        return load_fn(stream, Loader, *args, **kwargs)

    # Marker used to recognise an already patched function on re-execution.
    _patched_load._has_default_loader = True
    return _patched_load


if not getattr(yaml.load, "_has_default_loader", False):
    # Prefer FullLoader and fall back to SafeLoader when this PyYAML has no FullLoader.
    # SECURITY: FullLoader is not intended for untrusted input; if any YAML read through
    # this patch can come from outside the project, switch the default to SafeLoader.
    yaml.load = _with_default_loader(
        yaml.load,
        getattr(yaml, "FullLoader", None) or yaml.SafeLoader,
    )

# Sanity check: should NOT raise TypeError now
assert yaml.load("a: 1") == {"a": 1}


In [4]:
import numpy as np

# Pull the inputs for cross-validation out of `adata_neu`.
_neu_layers = adata_neu.layers

# Expression layers: raw counts, plus the layer stored as "log2_1p_CPM_original".
X_counts = _neu_layers["counts"]
norm_layer = _neu_layers["log2_1p_CPM_original"]

# Per-gene names and per-observation "BioProject" labels, both as string arrays.
gene_names = np.array(adata_neu.var_names, dtype=str)
batches = np.array(adata_neu.obs["BioProject"], dtype=str)

In [5]:
import optuna

def objective(trial: optuna.Trial):
    """Optuna objective: run a 3-fold DCA cross-validation for one sampled configuration.

    Samples three hidden-layer widths, the number of epochs, the batch size and the number
    of highly variable genes from fixed categorical grids, then calls ``cv_dca_5fold`` on
    the notebook-level ``X_counts`` / ``gene_names`` / ``batches`` / ``norm_layer``.

    Returns:
        The mean of the ``"NB_ll_zero"`` column of the returned summary, as a float.
    """
    # The suggest_* calls stay in this exact order (h1, h2, h3, epochs, batch_size, n_hvg)
    # so the sequence of draws requested from the sampler is unchanged.
    layer_grids = (
        ("h1", [32, 64, 128]),
        ("h2", [16, 32, 64]),
        ("h3", [32, 64, 128]),
    )
    hidden_size = [trial.suggest_categorical(key, grid) for key, grid in layer_grids]
    epochs = trial.suggest_categorical("epochs", [50, 100, 150])
    batch_size = trial.suggest_categorical("batch_size", [64, 128])
    n_hvg = trial.suggest_categorical("n_hvg", [800, 1000, 1500, 2000])

    trial_dca_params = dict(hidden_size=hidden_size, epochs=epochs, batch_size=batch_size)

    # Only the summary is needed; the second return value is discarded.
    cv_result = cv_dca_5fold(
        X_counts,
        k=3,
        dca_params=trial_dca_params,
        n_hvg=int(n_hvg),
        R=1,
        mask_frac=0.10,
        thinning_p=0.10,
        random_state=123,
        hvg_mode="seurat_v3",
        gene_names=gene_names,
        batches=batches,
        norm_layer=norm_layer,
        batch_key="BioProject",
        seurat_layer_name="log2_1p_CPM_original",
        save_dir=None,
    )
    summary = cv_result[0]

    # Target metric that the study maximises.
    score = summary["NB_ll_zero"].mean()
    return float(score)

# Hyperparameter search: maximise `objective` over 20 trials with a seeded TPE sampler.
tpe_sampler = optuna.samplers.TPESampler(seed=123)
study = optuna.create_study(direction="maximize", sampler=tpe_sampler)
study.optimize(objective, n_trials=20, gc_after_trial=True)

best = study.best_params
print("Best params:", best)

# Final retrain with "fuller" settings: 5 folds, R=3 and a fixed 300 epochs. The tuned
# `epochs` value is not reused here; only the layer widths, batch size and n_hvg are.
final_dca_params = {
    "hidden_size": [best[key] for key in ("h1", "h2", "h3")],
    "epochs": 300,
    "batch_size": best["batch_size"],
}
final_summary, final_details = cv_dca_5fold(
    X_counts,
    k=5,
    dca_params=final_dca_params,
    n_hvg=int(best["n_hvg"]),
    R=3,
    mask_frac=0.10,
    thinning_p=0.10,
    random_state=123,
    hvg_mode="seurat_v3",
    gene_names=gene_names,
    batches=batches,
    norm_layer=norm_layer,
    batch_key="BioProject",
    seurat_layer_name="log2_1p_CPM_original",
    save_dir="results/dca_full",
)

  from .autonotebook import tqdm as notebook_tqdm
[I 2025-09-16 22:57:26,157] A new study created in memory with name: no-name-557a9b0d-bfad-4b25-b395-50121eca3201


2025-09-16 22:57:26 | [cv_dca_5fold] start k=3 n_hvg=1000 mode=seurat_v3 R=1 mask_frac=0.1 thinning_p=0.1
2025-09-16 22:57:26 | X_counts: dense  (447, 51777) dtype=int64
2025-09-16 23:00:10 | X_counts: zeros=66.23%, approx_int=True
2025-09-16 23:00:10 | [time] [DCA] fold 1/3 start
2025-09-16 23:00:10 | X_tr: dense  (298, 51777) dtype=int64
2025-09-16 23:00:10 | X_va: dense  (149, 51777) dtype=int64




2025-09-16 23:03:45 | [DCA] fold 1 | HVG=1000
2025-09-16 23:03:45 | [DCA] fit hidden=[32, 32, 32] epochs=150 bs=64
2025-09-16 23:03:45 | X_train: dense  (298, 1000) dtype=int64
2025-09-16 23:03:45 | [time] [DCA] fold 1 predict mu_va start
2025-09-16 23:03:45 | [time] [DCA] fold 1 predict mu_va end in 0.80s
2025-09-16 23:03:45 | [time] [DCA] fold 1/3 end in 215.82s


  import pkg_resources
[W 2025-09-16 23:03:45,943] Trial 0 failed with parameters: {'h1': 32, 'h2': 32, 'h3': 32, 'epochs': 150, 'batch_size': 64, 'n_hvg': 1000} because of the following error: TypeError("load() missing 1 required positional argument: 'Loader'").
Traceback (most recent call last):
  File "/home/ma/ma_ma/ma_minjlee/.conda/envs/dca_gpu/lib/python3.10/site-packages/optuna/study/_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
  File "/scratch/ipykernel_139197/2331375940.py", line 13, in objective
    summary, _ = cv_dca_5fold(
  File "/pfs/data6/home/ma/ma_ma/ma_minjlee/denoising_rna_sequencing/Thesis_project/notebooks/dropout_imputation/../../scripts/runner_models.py", line 381, in cv_dca_5fold
    mu_va = dca.predict_mean(X_va_G)
  File "/pfs/data6/home/ma/ma_ma/ma_minjlee/denoising_rna_sequencing/Thesis_project/notebooks/dropout_imputation/../../scripts/runner_models.py", line 151, in predict_mean
    from dca.api import dca
  File "/home/ma/ma_

TypeError: load() missing 1 required positional argument: 'Loader'

In [19]:
# # (A) Simple: variance HVG
# summary_dca, details_dca = cv_dca_5fold(
#     X_counts,
#     k=5,
#     dca_params=dict(hidden_size=[64,32,64], epochs=300, batch_size=128),
#     n_hvg=2000,
#     R=3,                     # repeats per masking protocol
#     mask_frac=0.10,          # Nonzero Zeroing: 10% of nonzeros masked per gene
#     thinning_p=0.10,         # Binomial Thinning: 10% held out
#     random_state=123,
#     hvg_mode="variance",
#     labels=celltype_labels,  # optional
#     save_dir="results/dca"   # CSVs will be written here
# )





# (B) Batch-aware HVG (Seurat v3) with a fixed, hand-picked DCA configuration.
fixed_dca_params = {"hidden_size": [64, 32, 64], "epochs": 300, "batch_size": 128}

summary_dca, details_dca = cv_dca_5fold(
    X_counts, k=5, dca_params=fixed_dca_params,
    n_hvg=2000, R=3, mask_frac=0.10, thinning_p=0.10, random_state=123,
    hvg_mode="seurat_v3", gene_names=gene_names, batches=batches,
    norm_layer=norm_layer, batch_key="BioProject",
    seurat_layer_name="log2_1p_CPM_original",
    # Optional and currently unused: labels=celltype_labels
    save_dir="results/dca",  # CSVs will be written here
)


2025-09-16 19:54:08 | [cv_dca_5fold] start k=5 n_hvg=2000 mode=seurat_v3 R=3 mask_frac=0.1 thinning_p=0.1
2025-09-16 19:54:08 | X_counts: dense  (447, 51777) dtype=int64
2025-09-16 19:55:47 | X_counts: zeros=66.23%, approx_int=True
2025-09-16 19:55:47 | [time] [DCA] fold 1/5 start
2025-09-16 19:55:47 | X_tr: dense  (357, 51777) dtype=int64
2025-09-16 19:55:47 | X_va: dense  (90, 51777) dtype=int64




2025-09-16 19:56:23 | [DCA] fold 1 | HVG=2000
2025-09-16 19:56:23 | [DCA] fit hidden=[64, 32, 64] epochs=300 bs=128
2025-09-16 19:56:23 | X_train: dense  (357, 2000) dtype=int64
2025-09-16 19:56:23 | [time] [DCA] fold 1 predict mu_va start
2025-09-16 19:56:23 | [DCA] predict_mean
2025-09-16 19:56:23 | [time] [DCA] train(dca.api.dca) start
dca: Successfully preprocessed 2000 genes and 447 cells.
2025-09-16 19:56:23 | [time] [DCA] train(dca.api.dca) end in 0.59s
2025-09-16 19:56:23 | [time] [DCA] fold 1 predict mu_va end in 0.60s
2025-09-16 19:56:23 | [time] [DCA] fold 1/5 end in 36.59s


2025-09-16 19:56:23.462827: W tensorflow/c/c_api.cc:305] Operation '{name:'dispersion_4/bias/Assign' id:1762 op device:{requested: '', assigned: ''} def:{{{node dispersion_4/bias/Assign}} = AssignVariableOp[_has_manual_control_dependencies=true, dtype=DT_FLOAT, validate_shape=false](dispersion_4/bias, dispersion_4/bias/Initializer/zeros)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.


AttributeError: 'RMSprop' object has no attribute 'get_updates'