In [83]:
import numpy as np
import pandas as pd
import muon as mu
import scanpy as sc
import celloracle as co
import os
import argparse

In [90]:
# Run configuration. Every value below is consumed by a later cell of this notebook.
# Inputs: MuData file, region-to-gene table (read into `r2g`) and TF-to-region table (read into `tfb`).
# NOTE(review): the command example at the end of this notebook passes r2g.csv / tf2r.csv
# for the two CSV inputs, while the paths below point at other file names — confirm which are intended.
path_mdata = "/cellar/users/aklie/data/datasets/neurips2021_small/annotation/2023_12_05/mdata.h5mu"
path_r2g = "/cellar/users/aklie/data/datasets/neurips2021_small/analysis/celloracle/2024_05_01/processed_peak_file.csv"
path_tf2r = "/cellar/users/aklie/data/datasets/neurips2021_small/analysis/celloracle/2024_05_01/celloracle.tfb.csv"
cluster_key = "celltype"  # column of mdata.obs with the cluster labels; one GRN is fit per label
alpha = 10  # forwarded as `alpha` to net.fit_All_genes
bagging_number = 20  # forwarded as `bagging_number` to net.fit_All_genes
layer = None  # name of the adata layer to regress on; None -> log-normalize adata.layers["counts"]
path_out = "/cellar/users/aklie/data/datasets/neurips2021_small/analysis/celloracle/2024_05_01/grn.csv"  # destination CSV of the final GRN

In [91]:
# Process base GRN
# Combine the region-to-gene links (columns `cre`, `gene`) with the TF-to-region
# table (columns `cre`, `tf`) into one wide frame: one row per (peak, gene) link
# and one 0/1 column per TF.
r2g = pd.read_csv(path_r2g)
tfb = pd.read_csv(path_tf2r)
if (r2g.shape[0] == 0) or (tfb.shape[0] == 0):
    # Nothing to infer from: write an empty GRN with the output schema and stop.
    grn = pd.DataFrame(columns=['source', 'target', 'score', 'pval'])
    grn.to_csv(path_out, index=False)
    # Raise SystemExit directly instead of calling `exit()`: `exit` is an
    # interactive helper injected by `site` (and in IPython it shuts the kernel
    # down), whereas SystemExit is a builtin that is always available.
    raise SystemExit(0)
# Every listed (cre, tf) pair counts as a binding event.
tfb['score'] = 1
r2g = r2g[['cre', 'gene']]
# Pivot only the `score` column on de-duplicated pairs: any extra column in the
# TF table would otherwise produce repeated TF columns after flattening, and
# repeated (cre, tf) rows would make `pivot` raise.
tf_hits = tfb[['cre', 'tf', 'score']].drop_duplicates(subset=['cre', 'tf'])
tf_by_cre = (
    tf_hits
    .pivot(index='cre', columns='tf', values='score')
    .fillna(0)
    .reset_index()
)
# Join explicitly on the peak identifier so a TF column can never become a merge key.
base_grn = pd.merge(r2g, tf_by_cre, on='cre')
base_grn = base_grn.rename(columns={'cre': 'peak_id', 'gene': 'gene_short_name'})
# Literal (non-regex) replacement of every '-' in the peak identifiers by '_'.
base_grn['peak_id'] = base_grn['peak_id'].str.replace('-', '_', regex=False)

In [92]:
# Load the MuData file and work on an independent copy of its "rna" modality,
# so later edits to `adata` do not touch `mdata`.
mdata = mu.read(path_mdata)
adata = mdata.mod["rna"].copy()

In [93]:
# Bring the cluster labels from the MuData-level obs table onto the RNA AnnData
# (pandas aligns the assigned Series to adata.obs by index).
adata.obs[cluster_key] = mdata.obs[cluster_key].copy()

In [94]:
# Decide which matrix ends up in adata.X for the regression step.
if layer is None:
    # No layer requested: start from the stored counts, scale every cell to a
    # total of 1e4 and apply log1p.
    print("Log normalizing data for regression.")
    adata.X = adata.layers["counts"].copy()
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)
else:
    # A layer was requested: use it as-is, without further normalization.
    print(f"Using data in layer {layer} for regression.")
    adata.X = adata.layers[layer].copy()

Log normalizing data for regression.


In [95]:
# Fit one CellOracle network per cluster; results are keyed by cluster label.
cluster_grns = {}
# Cast to categorical so a plain string label column works too, and drop
# categories that have no cells in `adata`: those would yield an empty
# expression matrix for the fit. Category order is unchanged for data that is
# already categorical.
cluster_labels = adata.obs[cluster_key].astype("category").cat.remove_unused_categories()
for cluster in cluster_labels.cat.categories:
    print(f"Building GRN for {cluster}")
    # Cells of this cluster only; copy so the subset is not a view of `adata`.
    adata_sub = adata[cluster_labels == cluster].copy()
    net = co.Net(
        gene_expression_matrix=adata_sub.to_df(), # Input gene expression matrix as data frame
        TFinfo_matrix=base_grn, # Input base GRN
        verbose=True
    )
    net.fit_All_genes(
        bagging_number=bagging_number,
        alpha=alpha,
        verbose=True
    )
    net.updateLinkList(verbose=True)
    # Copy the link list so the stored result does not alias the Net object's state.
    inference_result = net.linkList.copy()
    cluster_grns[cluster] = inference_result
    print(f"Finished building GRN for {cluster}")

Building GRN for Erythroblast
initiating Net object ...
gem_shape: (26, 512)
TF info shape: (115, 27)
initiation completed.


  0%|          | 0/58 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Finished building GRN for Erythroblast
Building GRN for HSC
initiating Net object ...
gem_shape: (26, 512)
TF info shape: (115, 27)
initiation completed.


  0%|          | 0/58 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Finished building GRN for HSC
Building GRN for MK/E prog
initiating Net object ...
gem_shape: (24, 512)
TF info shape: (115, 27)
initiation completed.


  0%|          | 0/58 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Finished building GRN for MK/E prog
Building GRN for Normoblast
initiating Net object ...
gem_shape: (21, 512)
TF info shape: (115, 27)
initiation completed.


  0%|          | 0/58 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Finished building GRN for Normoblast
Building GRN for Proerythroblast
initiating Net object ...
gem_shape: (26, 512)
TF info shape: (115, 27)
initiation completed.


  0%|          | 0/58 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Finished building GRN for Proerythroblast


In [98]:
# Stack the per-cluster link lists into one frame, tagging every row with the
# cluster it was inferred for.
tagged_link_lists = []
for cluster_name, link_list in cluster_grns.items():
    tagged_link_lists.append(link_list.assign(cluster=cluster_name))
grn = pd.concat(tagged_link_lists)
grn

Unnamed: 0,source,target,coef_mean,coef_abs,p,-logp,cluster
0,E2F2,ARID5B,-0.080554,0.080554,0.006691,2.174495,Erythroblast
1,MYEF2,ARID5B,0.000000,0.000000,,-0.000000,Erythroblast
2,TCF7L2,ARID5B,-0.006850,0.006850,0.356905,0.447447,Erythroblast
3,KLF6,ARID5B,-0.043443,0.043443,0.014623,1.834968,Erythroblast
4,SP4,ARID5B,0.169180,0.169180,0.001314,2.881531,Erythroblast
...,...,...,...,...,...,...,...
350,TCF7L2,ZSWIM5,-0.031369,0.031369,0.005940,2.226237,Proerythroblast
351,BCL11B,ZSWIM5,-0.096584,0.096584,0.001089,2.962793,Proerythroblast
352,MAFG,ZSWIM5,-0.023792,0.023792,0.259280,0.586231,Proerythroblast
353,TAL1,ZSWIM5,0.223483,0.223483,0.000116,3.934154,Proerythroblast


In [100]:
# Reduce the link list to the output schema (source, target, score, pval) and
# drop links that have a missing value in any of these columns.
# Renaming before selecting keeps this cell safe to re-run: on a second run the
# rename is a no-op, whereas selecting 'coef_mean' / 'p' would raise a KeyError.
# NOTE(review): the `cluster` column is not kept, so rows coming from different
# clusters cannot be told apart in the output — confirm this is intended.
grn = (
    grn
    .rename(columns={'coef_mean': 'score', 'p': 'pval'})
    [['source', 'target', 'score', 'pval']]
    .dropna()
)

In [102]:
# Order rows by source (ascending), then by score (descending) within each source.
grn = grn.sort_values(
    by=['source', 'score'],
    ascending=[True, False],
)

In [104]:
# Save the final GRN table as CSV, leaving out the row index.
grn.to_csv(path_or_buf=path_out, index=False)

# Equivalent command to run as a script

```bash
python /cellar/users/aklie/opt/gene_program_evaluation/src/inference/grn_models/celloracle/workflow/scripts/grn.py \
-d /cellar/users/aklie/data/datasets/neurips2021_small/annotation/2023_12_05/mdata.h5mu \
-r /cellar/users/aklie/data/datasets/neurips2021_small/analysis/celloracle/2024_05_01/r2g.csv \
-t /cellar/users/aklie/data/datasets/neurips2021_small/analysis/celloracle/2024_05_01/tf2r.csv \
-c celltype \
-a 10 \
-b 20 \
-o /cellar/users/aklie/data/datasets/neurips2021_small/analysis/celloracle/2024_05_01/grn.csv
```

# DONE!

---