In [51]:
import pandas as pd
import numpy as np
import scanpy as sc
import celloracle as co
import muon as mu
import scipy
import os
import argparse

In [88]:
# --- Run configuration -------------------------------------------------------
# Input / output locations (MuData .h5mu files)
path_input = "/cellar/users/aklie/data/datasets/neurips2021_small/annotation/2023_12_05/mdata.h5mu"
path_out = "/cellar/users/aklie/data/datasets/neurips2021_small/analysis/celloracle/2024_05_01/celloracle.pre.h5mu"

# Key of a precomputed embedding to use; None means a PCA is computed further down
dim_reduction_key = None

# Neighbourhood size handed to the KNN imputation step
k = 10

In [89]:
# Load the multimodal container from the input .h5mu file
# (the RNA modality is pulled out of it in the next step)
mdata = mu.read(path_input)
# Bare expression: shows the loaded object's summary as the cell output
mdata

In [90]:
# Take an independent copy of the RNA modality so the edits made below
# do not touch the object held inside `mdata`
adata = mdata["rna"].copy()

In [82]:
# Bring the cell-type labels over from the MuData-level obs table
adata.obs["celltype"] = mdata.obs["celltype"]
# Make X hold the contents of the 'counts' layer (copied, so X and the layer stay independent)
adata.X = adata.layers["counts"].copy()

In [83]:
# Peek at the top-left 5x5 corner of X as a dense array to confirm it holds counts.
# `.toarray()` is used rather than the `.A` shorthand: `.A` only exists on the
# scipy sparse *matrix* classes, while `.toarray()` works on sparse matrices and
# sparse arrays alike.
adata.X[:5, :5].toarray()

array([[1., 0., 2., 5., 0.],
       [0., 0., 6., 5., 1.],
       [0., 3., 0., 0., 0.],
       [0., 5., 0., 0., 1.],
       [0., 5., 0., 1., 4.]], dtype=float32)

In [84]:
# Compute a PCA embedding only when no existing embedding key was configured.
if dim_reduction_key is None:
    # Preprocess a scratch copy so that adata.X itself is left unchanged.
    adata_pp = adata.copy()
    sc.pp.normalize_total(adata_pp, target_sum=1e4)
    sc.pp.log1p(adata_pp)
    sc.pp.scale(adata_pp, max_value=10)
    # Ask for at most 50 components, but stay strictly below the smaller data
    # dimension: PCA cannot return more components than that, and a fixed 50
    # fails on very small inputs (<= 50 cells or genes).
    n_pcs = min(50, min(adata_pp.shape) - 1)
    sc.pp.pca(adata_pp, n_comps=n_pcs)
    # Only the embedding is carried back to the working object.
    adata.obsm["X_pca"] = adata_pp.obsm["X_pca"]

In [85]:
# Re-check the same 5x5 corner: the normalisation/scaling above ran on a copy,
# so adata.X should still show the untouched counts.
# `.toarray()` is used rather than the `.A` shorthand: `.A` only exists on the
# scipy sparse *matrix* classes, while `.toarray()` works on sparse matrices and
# sparse arrays alike.
adata.X[:5, :5].toarray()

array([[1., 0., 2., 5., 0.],
       [0., 0., 6., 5., 1.],
       [0., 3., 0., 0., 0.],
       [0., 5., 0., 0., 1.],
       [0., 5., 0., 1., 4.]], dtype=float32)

In [86]:
# Display the AnnData summary to inspect its obs / var / obsm / layers contents
adata

AnnData object with n_obs × n_vars = 123 × 512
    obs: 'pseudotime_order', 'celltype'
    var: 'gene_id'
    obsm: 'X_pca', 'X_umap'
    layers: 'counts'

In [91]:
# Instantiate Oracle object
oracle = co.Oracle()

# Honour a configured embedding key instead of always assuming "X_pca".
# When no key is configured, "X_pca" is the embedding computed earlier in this
# notebook, so that remains the fallback.
embedding_key = dim_reduction_key if dim_reduction_key is not None else "X_pca"

oracle.import_anndata_as_raw_count(
    adata=adata,
    cluster_column_name="celltype",
    # As far as I can tell, the embedding is only used for simulation,
    # which isn't part of this pipeline.
    embedding_name=embedding_key,
)

512 genes were found in the adata. Note that Celloracle is intended to use around 1000-3000 genes, so the behavior with this number of genes may differ from what is expected.


KeyError: None

In [26]:
# Compute PCA and select top pcs
oracle.perform_PCA()

# Cumulative explained variance across the principal components.
cumulative_variance = np.cumsum(oracle.pca.explained_variance_ratio_)
# Positions where the per-component variance gain crosses the 0.002 threshold
# (the boolean "gain > 0.002" flips between neighbouring components).
elbow_candidates = np.where(np.diff(np.diff(cumulative_variance) > 0.002))[0]

if elbow_candidates.size > 0:
    n_comps = int(elbow_candidates[0])
else:
    # No crossing found (e.g. very few components): indexing [0] would raise
    # IndexError, so fall back to every available component instead.
    n_comps = len(cumulative_variance)

# Never use more than 50 components for the imputation.
n_comps = min(n_comps, 50)

In [28]:
# KNN-based imputation on the selected principal components.
# Both balancing parameters are derived from the neighbourhood size k.
oracle.knn_imputation(
    k=k,
    n_pca_dims=n_comps,
    b_maxl=4 * k,
    b_sight=8 * k,
    balanced=True,
    n_jobs=os.cpu_count(),  # one job per CPU reported by the OS
)

In [29]:
# Replace the RNA modality's X with the imputed counts held on the Oracle object
mdata.mod["rna"].X = oracle.adata.layers["imputed_count"]

In [31]:
# Inspect the top-left 5x5 corner of the RNA matrix after the update
mdata['rna'].X[:5, :5]

array([[0.3150669 , 0.        , 1.3958067 , 0.98690159, 0.        ],
       [0.25205352, 0.        , 1.68915616, 0.89146321, 0.06301338],
       [0.        , 0.67770182, 0.        , 0.        , 0.12602676],
       [0.        , 0.82844983, 0.        , 0.        , 0.61799459],
       [0.        , 1.19392588, 0.        , 0.06301338, 1.17338074]])

In [37]:
# Write the updated MuData object to disk.
# Create the destination directory first: the write fails if the output
# folder does not exist yet.
out_dir = os.path.dirname(path_out)
if out_dir:  # empty when path_out is a bare file name in the working directory
    os.makedirs(out_dir, exist_ok=True)
mdata.write(path_out)

# Command to run this preprocessing step as a script

```bash
python /cellar/users/aklie/opt/gene_program_evaluation/src/inference/grn_models/celloracle/workflow/scripts/pre.py \
-i /cellar/users/aklie/data/datasets/neurips2021_small/annotation/2023_12_05/mdata.h5mu \
-k 10 \
-d X_pca \
-o /cellar/users/aklie/data/datasets/neurips2021_small/analysis/celloracle/2024_05_01/celloracle.pre.h5mu
```

# DONE!

---