In [9]:
import pandas as pd
import numpy as np
import scanpy as sc
import celloracle as co
import muon as mu
import scipy
import os
import argparse

In [20]:
# --- Paths ---
# MuData file to read, and where the preprocessed MuData is written at the end.
path_input = "/cellar/users/aklie/data/datasets/neurips2021_small/annotation/2023_12_05/mdata.h5mu"
path_out = "/cellar/users/aklie/data/datasets/neurips2021_small/analysis/celloracle/2024_05_01/celloracle.pre.h5mu"

# --- Tunables ---
# Embedding key; when set to None a PCA embedding is computed further down.
dim_reduction_key = "X_pca"
# Neighbour count handed to the KNN imputation step.
k = 10

In [21]:
# Load the full MuData container from path_input (the RNA modality is extracted from it afterwards).
mdata = mu.read(path_input)
# Bare expression: shows the object's repr as the cell output.
mdata

In [22]:
# Take a copy of the 'rna' modality; the cells below modify this copy (X, obs) rather than mdata's own object.
adata = mdata.mod['rna'].copy()

In [23]:
# Attach the cell-type labels kept on the MuData-level obs table.
adata.obs["celltype"] = mdata.obs["celltype"]
# Make X hold the raw counts (copied so the 'counts' layer itself stays untouched).
adata.X = adata.layers["counts"].copy()

In [24]:
# Fallback: when no embedding key was configured, derive a PCA embedding with scanpy.
if dim_reduction_key is None:
    # The results of these calls are not assigned, so they are relied on to
    # transform adata in place; keep them in this order.
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)
    sc.pp.scale(adata, max_value=10)
    # NOTE(review): scanpy is expected to store this under obsm["X_pca"], which is
    # the embedding name the Oracle import cell reads -- confirm against scanpy docs.
    sc.pp.pca(adata, n_comps=50)
    # Put raw counts back into X after the preprocessing above, because the
    # Oracle object is loaded via import_anndata_as_raw_count.
    adata.X = adata.layers['counts'].copy()

In [25]:
# Instantiate Oracle object
# Honour the configured embedding key instead of a hard-coded name. When
# dim_reduction_key is None, the embedding comes from the sc.pp.pca fallback,
# which scanpy stores under its default key "X_pca".
embedding_name = "X_pca" if dim_reduction_key is None else dim_reduction_key

# Fail early with a readable message rather than deep inside celloracle.
if embedding_name not in adata.obsm:
    raise KeyError(
        f"Embedding '{embedding_name}' not found in adata.obsm "
        f"(available: {list(adata.obsm.keys())})"
    )

oracle = co.Oracle()
oracle.import_anndata_as_raw_count(
    adata=adata,
    cluster_column_name="celltype",
    embedding_name=embedding_name,
)

512 genes were found in the adata. Note that Celloracle is intended to use around 1000-3000 genes, so the behavior with this number of genes may differ from what is expected.


In [26]:
# Run the Oracle's PCA, then pick how many components to keep.
oracle.perform_PCA()

# Step-by-step version of the selection rule:
#   1. cumulative explained-variance ratio across components
#   2. increment of that curve between consecutive components
#   3. boolean mask: is the increment above the 0.002 threshold?
#   4. np.diff on the boolean mask flags positions where the mask changes value
cumulative_ratio = np.cumsum(oracle.pca.explained_variance_ratio_)
ratio_increment = np.diff(cumulative_ratio)
above_threshold = ratio_increment > 0.002
flip_positions = np.where(np.diff(above_threshold))[0]

# First position where the mask flips, capped at 50 components.
n_comps = min(flip_positions[0], 50)

In [27]:
# Display the neighbour count that the imputation call passes as k.
k

10

In [28]:
# KNN imputation on the Oracle object.
# b_sight and b_maxl are both derived from k (8x and 4x respectively).
balanced_sight = k * 8
balanced_max_links = k * 4
worker_count = os.cpu_count()

oracle.knn_imputation(
    n_pca_dims=n_comps,
    k=k,
    balanced=True,
    b_sight=balanced_sight,
    b_maxl=balanced_max_links,
    n_jobs=worker_count,
)

In [29]:
# Update object with imputed counts: the RNA modality's X is overwritten with
# the 'imputed_count' layer held on the Oracle's AnnData.
mdata['rna'].X = oracle.adata.layers['imputed_count']

In [31]:
# Inspect the top-left 5x5 corner of the RNA matrix now stored on mdata.
mdata['rna'].X[:5, :5]

array([[0.3150669 , 0.        , 1.3958067 , 0.98690159, 0.        ],
       [0.25205352, 0.        , 1.68915616, 0.89146321, 0.06301338],
       [0.        , 0.67770182, 0.        , 0.        , 0.12602676],
       [0.        , 0.82844983, 0.        , 0.        , 0.61799459],
       [0.        , 1.19392588, 0.        , 0.06301338, 1.17338074]])

In [37]:
# Write the updated MuData to path_out.
# Create the parent directory first so the write does not fail when the
# output folder does not exist yet. The guard skips this for a bare
# filename, where dirname is "" and makedirs would raise.
out_dir = os.path.dirname(path_out)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)
mdata.write(path_out)

# Command to run the equivalent script

```bash
python /cellar/users/aklie/opt/gene_program_evaluation/src/inference/grn_models/celloracle/workflow/scripts/pre.py \
-i /cellar/users/aklie/data/datasets/neurips2021_small/annotation/2023_12_05/mdata.h5mu \
-k 10 \
-d X_pca \
-o /cellar/users/aklie/data/datasets/neurips2021_small/analysis/celloracle/2024_05_01/celloracle.pre.h5mu
```

# DONE!

---