env: scGPT

# import

In [1]:
import scanpy as sc
import pandas as pd
import numpy as np
from glob import glob
import os
import matplotlib.pyplot as plt
from tqdm import tqdm
import torch
from torch_geometric.data import Data
from torch_geometric.data import DataLoader

import pickle
import sys
import requests

from types import MethodType
import importlib
from scperturb import *

import anndata as ad

In [2]:
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns

import celloracle as co
# Display the installed CellOracle version so the environment is recorded with the notebook output.
co.__version__

'0.18.0'

In [3]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, precision_recall_curve
from tqdm import tqdm

In [4]:
# Put the project directory on sys.path so that the local `v1` package can be imported.
# NOTE(review): this is a hard-coded absolute path on one machine; it has to be
# adjusted (or made configurable) to run the notebook elsewhere.
sys.path.append("/data1/lichen/code/single_cell_perturbation/scPerturb/Byte_Pert_Data/")

In [5]:
import v1
from v1.utils import *
from v1.dataloader import *

In [6]:
# Reload the local package and its submodules so that edits made on disk are
# picked up without restarting the kernel.
# NOTE: names that were already bound via `from v1.utils import *` /
# `from v1.dataloader import *` keep pointing at the old objects; re-run those
# star-imports after reloading if the refreshed definitions are needed.
importlib.reload(v1)
importlib.reload(v1.utils)
importlib.reload(v1.dataloader)

<module 'v1.dataloader' from '/data1/lichen/code/single_cell_perturbation/scPerturb/Byte_Pert_Data/v1/dataloader.py'>

# L1000运行CellOracle-非并行版

In [7]:
# Mapping from the L1000 (bulk) cell-line identifier to the name used for the
# matching single-cell data set.
common_cell_line = dict([
    ('A549', 'A549'),
    ('HEPG2', 'HepG2'),
    ('HT29', 'HT29'),
    ('MCF7', 'MCF7'),
    # ('SKBR3', 'SK-BR-3'),  # entry kept for reference, currently disabled
    ('SW480', 'SW480'),
    ('PC3', 'PC3'),
    ('A375', 'A375'),
])

# Load the already-processed L1000 gene-perturbation AnnData and display its summary.
l1000_h5ad_path = '/nfs/public/lichen/results/single_cell_perturbation/perturbation_benchmark/benchmark_data/L1000/GSE92742/adata_gene_pert.h5ad'
adata_L1000 = sc.read(l1000_h5ad_path)
adata_L1000

AnnData object with n_obs × n_vars = 36720 × 978
    obs: 'sig_id', 'pert_id', 'pert_iname', 'pert_type', 'cell_id', 'pert_dose', 'pert_dose_unit', 'pert_idose', 'pert_time', 'pert_time_unit', 'pert_itime', 'distil_id'
    var: 'pr_gene_id', 'pr_gene_symbol', 'pr_gene_title', 'pr_is_lm', 'pr_is_bing'

In [9]:
from tqdm import tqdm
from sklearn.metrics import precision_recall_curve, auc

import json

# Serial CellOracle knock-out simulation over the L1000 / single-cell cell lines.
# For each cell line the loop
#   1. loads the control single-cell AnnData,
#   2. loads the cached CellOracle oracle/links (or fits and caches them),
#   3. simulates a knock-out (expression set to 0) for every candidate gene,
#   4. ranks genes with a Wilcoxon test (simulated vs. control cells),
#   5. writes the per-perturbation rankings to a JSON file.
#
# NOTE(review): the two `break` statements (end of the perturbation loop and end
# of the cell-line loop) turn this cell into a smoke test: only the first
# perturbation that passes the filters, of the first cell line, is processed.
# The JSON is still written to the same result path that `process_cell_line`
# (parallel version) uses, so running this cell afterwards overwrites the full
# result of that cell line with a single-entry dictionary.
for cell_line_bulk, cell_line_single in common_cell_line.items():
    print('='*20, f'cell line is {cell_line_single}')

    #####################################################
    # - prepare data: the single-cell data lives in one of two data-set folders
    if cell_line_bulk in ['PC3', 'A375']:
        save_dir_adata = '/nfs/public/lichen/results/single_cell_perturbation/perturbation_benchmark/benchmark_data/L1000/single_cell_data/SCP542'
    else:
        save_dir_adata = '/nfs/public/lichen/results/single_cell_perturbation/perturbation_benchmark/benchmark_data/L1000/single_cell_data/CNP0003658'
    adata_rna = sc.read(os.path.join(save_dir_adata, cell_line_bulk, f'adata_{cell_line_bulk}.h5ad'))

    # - make X dense so everything downstream can treat it as a numpy array
    if not isinstance(adata_rna.X, np.ndarray):
        adata_rna.X = adata_rna.X.toarray()

    # - perturbations measured in L1000 for this cell line
    adata_L1000_sub = adata_L1000[adata_L1000.obs['cell_id']==cell_line_bulk]
    L1000_total_perts = np.unique(adata_L1000_sub.obs['pert_iname'])

    n_cells_downsample = 10000  # cap on the number of cells used to fit the GRN
    threshold_number = 10000    # number of links kept by links.filter_links

    ##########################################################
    # - working copy used to fit the GRN
    adata = adata_rna.copy()
    adata.obs['celltype'] = cell_line_bulk
    print('adata.shape is: ', adata.shape)

    # -- prebuilt human promoter base GRN shipped with CellOracle
    base_GRN = co.data.load_human_promoter_base_GRN()
    print('base_GRN.shape: ', base_GRN.shape)

    # -- per-cell-line cache directory for the fitted oracle / links
    tmp_dir = '/nfs/public/lichen/results/single_cell_perturbation/perturbation_benchmark/L1000'
    save_prefix = f'CellOracle/{cell_line_bulk}'
    save_dir = os.path.join(tmp_dir, save_prefix)
    os.makedirs(save_dir, exist_ok=True)
    oracle_path = os.path.join(save_dir, "ctrl.celloracle.oracle")
    links_path = os.path.join(save_dir, "ctrl.celloracle.links")

    oracle_cached = os.path.exists(oracle_path)
    if oracle_cached:
        print('file exists')
        oracle = co.load_hdf5(oracle_path)
    else:
        # - fit the CellOracle object on the whole control population

        # -- X is treated as natural-log(1 + count); invert it to recover counts
        adata.raw = adata
        raw_X = adata.raw.X
        if not isinstance(raw_X, np.ndarray):
            raw_X = raw_X.toarray()
        adata.layers["raw_count"] = np.exp(raw_X) - 1

        # -- embedding: scale -> PCA -> kNN graph -> UMAP
        sc.pp.scale(adata)
        sc.tl.pca(adata, svd_solver='arpack', random_state=2022)
        sc.pp.neighbors(adata, n_neighbors=4, n_pcs=20, random_state=2022)
        sc.tl.umap(adata,random_state=2022)

        # -- random downsampling when there are more than n_cells_downsample cells
        if adata.shape[0] > n_cells_downsample:
            sc.pp.subsample(adata, n_obs=n_cells_downsample, random_state=123)
        print(f"Cell number is :{adata.shape[0]}")

        oracle = co.Oracle()

        # -- check data in anndata
        print("Metadata columns :", list(adata.obs.columns))
        print("Dimensional reduction: ", list(adata.obsm.keys()))

        # -- the Oracle object takes the unscaled counts as input
        adata.X = adata.layers["raw_count"].copy()
        oracle.import_anndata_as_raw_count(adata=adata,
                                        cluster_column_name="celltype",
                                        embedding_name="X_umap")
        oracle.import_TF_data(TF_info_matrix=base_GRN)

        # -- kNN imputation (only needed for the GRN fit on the whole control)
        oracle.perform_PCA()

        # pick the number of PCs at the elbow of the cumulative explained variance
        cum_var = np.cumsum(oracle.pca.explained_variance_ratio_)
        plt.plot(cum_var[:100])
        elbow = np.where(np.diff(np.diff(cum_var)>0.002))[0]
        # without a detectable elbow the original indexing raised IndexError;
        # fall back to the 50-PC cap that is applied below anyway
        n_comps = elbow[0] if len(elbow) > 0 else 50
        plt.axvline(n_comps, c="k")
        plt.show()
        print(n_comps)
        n_comps = min(n_comps, 50)

        n_cell = oracle.adata.shape[0]
        print(f"cell number is :{n_cell}")

        k = int(0.025*n_cell)
        print(f"Auto-selected k is :{k}")

        oracle.knn_imputation(n_pca_dims=n_comps, k=k, balanced=True, b_sight=k*8,
                            b_maxl=k*4, n_jobs=4)

        # -- cache the oracle
        oracle.to_hdf5(oracle_path)

    # The links are cached separately: a run interrupted after the oracle was
    # saved leaves no links file, so check for it on its own instead of assuming
    # it exists whenever the oracle does. A freshly fitted oracle always gets
    # freshly computed links.
    if oracle_cached and os.path.exists(links_path):
        links = co.load_hdf5(file_path=links_path)
    else:
        # GRN inference for the single "celltype" cluster; this can be slow
        links = oracle.get_links(cluster_name_for_GRN_unit="celltype", alpha=10,
                                verbose_level=10)
        links.to_hdf5(file_path=links_path)

    # -- filter the links and fit the simulation coefficients
    links.filter_links(threshold_number=threshold_number,
                        p=0.001,
                        weight='coef_abs')
    oracle.get_cluster_specific_TFdict_from_Links(links_object=links)
    oracle.fit_GRN_for_simulation(alpha=10,
                                use_cluster_specific_TFdict=True)

    ###################################################
    # - all TFs in the base GRN
    # NOTE(review): `import_TF_data` is not defined in this notebook (presumably
    # provided by a star-import); the loop below treats its result as a mapping
    # target gene -> iterable of TFs and inverts it into TF -> [targets].
    TFdict = import_TF_data(TF_info_matrix=base_GRN)
    tf_target_dict = {}
    for target, gene_set in TFdict.items():
        for tf in gene_set:
            tf_target_dict.setdefault(tf, []).append(target)
    total_tf_list = list(tf_target_dict.keys())

    #####################################################
    # - candidate perturbations: TFs that are measured in the single-cell data
    #   and were perturbed in L1000 for this cell line
    var_names = list(adata.var_names)
    single_total_perts = np.intersect1d(total_tf_list, adata.var_names)
    common_perts = np.intersect1d(single_total_perts, L1000_total_perts)
    print('L1000_total_perts num: ', len(L1000_total_perts))
    print('common_perts num: ', len(common_perts))
    print('common var to L1000 data is: ', len(np.intersect1d(var_names, adata_L1000.var_names)))

    ###########################################
    celltype = adata.obs['celltype'].unique()[0]

    # - tf_GRN_dict: row label -> {column label: non-zero coefficient}; rows of
    #   the fitted coefficient matrix that are entirely zero are dropped. It is
    #   used below to skip genes without any fitted regulatory connection.
    gene_GRN_mtx = oracle.coef_matrix_per_cluster[celltype].copy()
    tf_GRN_mtx = gene_GRN_mtx[~(gene_GRN_mtx == 0).all(axis=1)]
    tf_GRN_dict = {
        tf: coefs[coefs != 0].to_dict()
        for tf, coefs in tf_GRN_mtx.iterrows()
    }

    ###########################################
    # - oracle_ctrl: a second Oracle built on ALL control cells (no downsampling,
    #   no kNN imputation) that reuses the coefficients fitted above
    adata_rna.obs['celltype'] = cell_line_bulk
    adata_ctrl = adata_rna.copy()
    adata_ctrl.raw = adata_ctrl

    # recover counts from the log-transformed values
    ctrl_raw_X = adata_ctrl.raw.X
    if not isinstance(ctrl_raw_X, np.ndarray):
        ctrl_raw_X = ctrl_raw_X.toarray()
    adata_ctrl.layers["raw_count"] = np.exp(ctrl_raw_X) - 1

    # embedding: scale -> PCA -> kNN graph -> UMAP
    sc.pp.scale(adata_ctrl)
    sc.tl.pca(adata_ctrl, svd_solver='arpack', random_state=2022)
    sc.pp.neighbors(adata_ctrl, n_neighbors=4, n_pcs=20, random_state=2022)
    sc.tl.umap(adata_ctrl,random_state=2022)

    oracle_ctrl = co.Oracle()

    # the Oracle object takes the unscaled counts as input
    adata_ctrl.X = adata_ctrl.layers["raw_count"].copy()
    oracle_ctrl.import_anndata_as_raw_count(adata=adata_ctrl,
                                    cluster_column_name="celltype",
                                    embedding_name="X_umap")
    oracle_ctrl.import_TF_data(TF_info_matrix=base_GRN)

    # no imputation here: the normalized counts stand in for the imputed counts
    oracle_ctrl.adata.layers["imputed_count"] = oracle_ctrl.adata.layers["normalized_count"].copy()

    # reuse the coefficients fitted on the (possibly downsampled) control oracle
    oracle_ctrl.coef_matrix_per_cluster = oracle.coef_matrix_per_cluster

    # - control side of the differential test; it does not depend on the
    #   perturbation, so it is built once instead of once per iteration
    adata_ctrl = adata_rna.copy()
    adata_ctrl.obs['batch'] = 'ctrl'

    pert_gene_rank_dict = {}
    for pert in tqdm(common_perts):

        # - skip genes that cannot be simulated
        if np.mean(np.asarray(adata_rna[:, pert].X)) == 0:
            print(f'{pert} ctrl expression is 0')
            print(f'{pert} is filtered')
            continue
        if pert not in tf_GRN_dict:
            print(f'{pert} is not in the tf_GRN_dict, no targets')
            print(f'{pert} is filtered')
            continue

        # - all L1000 perturbations are knock-downs: set the gene to 0
        goi_dict = {pert: 0}

        # - simulate signal propagation after the perturbation
        oracle_ctrl.simulate_shift(perturb_condition=goi_dict,
                            n_propagation=3)
        # - get the prediction; delta_X = simulated_count - imputed_count
        delta_X, simulated_count = oracle_ctrl.adata.layers["delta_X"], oracle_ctrl.adata.layers["simulated_count"]

        # - create adata_pert: control cells carrying the simulated expression,
        #   with negative simulated values clamped to 0
        adata_pert = adata_rna.copy()
        adata_pert.X = simulated_count
        adata_pert.X[adata_pert.X < 0] = 0
        adata_pert.obs_names = [f'{name}_{pert}' for name in adata_pert.obs_names]
        adata_pert.obs['batch'] = 'pert'

        adata_concat = ad.concat([adata_ctrl, adata_pert])
        adata_concat.obs['batch'] = adata_concat.obs['batch'].astype('category')
        adata_concat.obs['celltype'] = adata_concat.obs['celltype'].astype('category')

        # - rank all genes, simulated vs. control (signed scores)
        rankby_abs = False

        sc.tl.rank_genes_groups(
            adata_concat,
            groupby='batch',
            reference='ctrl',
            rankby_abs=rankby_abs,
            n_genes=len(adata_concat.var),
            use_raw=False,
            method = 'wilcoxon'
        )
        rank_result = adata_concat.uns['rank_genes_groups']
        de_genes = pd.DataFrame(rank_result['names'])
        pvals = pd.DataFrame(rank_result['pvals'])
        pvals_adj = pd.DataFrame(rank_result['pvals_adj'])
        scores = pd.DataFrame(rank_result['scores'])
        logfoldchanges = pd.DataFrame(rank_result['logfoldchanges'])

        # - gene/score table, kept for interactive inspection of the debug run
        gene_score = pd.DataFrame({'gene':list(de_genes['pert']),
                                    'z-score':list(scores['pert'])})

        pert_gene_rank_dict[pert] = (list(de_genes['pert']), list(scores['pert']))

        # debug: stop after the first successfully processed perturbation
        break

    # - write {perturbation: (ranked gene names, scores)} for this cell line
    save_dir = '/nfs/public/lichen/results/single_cell_perturbation/perturbation_benchmark_202410/zero_shot/result'
    save_prefix = f'CellOracle/{cell_line_bulk}'
    os.makedirs(os.path.join(save_dir, save_prefix), exist_ok=True)

    with open(os.path.join(save_dir, save_prefix, 'pert_gene_rank_dict.json'), 'w') as f:
        json.dump(pert_gene_rank_dict, f)

    # debug: stop after the first cell line
    break

adata.shape is:  (500, 5155)
Loading prebuilt promoter base-GRN. Version: hg19_gimmemotifsv5_fpr2
base_GRN.shape:  (37003, 1096)
file exists


  0%|          | 0/1 [00:00<?, ?it/s]

L1000_total_perts num:  3620
common_perts num:  306
common var to L1000 data is:  933
5155 genes were found in the adata. Note that Celloracle is intended to use around 1000-3000 genes, so the behavior with this number of genes may differ from what is expected.


... storing 'celltype' as categorical
  0%|          | 0/306 [00:00<?, ?it/s]... storing 'celltype' as categorical
... storing 'batch' as categorical
  0%|          | 0/306 [01:14<?, ?it/s]


# L1000运行CellOracle-并行版

In [14]:
import os
import numpy as np
import pandas as pd
import scanpy as sc
from tqdm import tqdm
from sklearn.metrics import precision_recall_curve, auc
from scipy.spatial.distance import cdist
import concurrent.futures
import json

# Per-cell-line worker for the parallel CellOracle run, plus its private helpers.

def _load_or_fit_grn(adata, base_GRN, save_dir, n_cells_downsample):
    """Return ``(oracle, links)`` for one cell line, using the HDF5 cache in ``save_dir``.

    The oracle and the links are cached in two separate files. The oracle is
    loaded when its file exists, otherwise it is fitted on ``adata`` and saved.
    The links are loaded only when the oracle came from the cache AND the links
    file exists; otherwise they are computed from the oracle and saved. This
    lets a run that was interrupted between the two saves recover instead of
    failing while loading a links file that was never written.

    Parameters
    ----------
    adata : AnnData whose ``X`` is treated as natural-log(1 + count) and whose
        ``obs['celltype']`` is set. It is modified in place when the oracle has
        to be fitted (scaling, embedding, optional subsampling, X replaced by
        the recovered counts).
    base_GRN : base-GRN table passed to ``Oracle.import_TF_data``.
    save_dir : directory holding ``ctrl.celloracle.oracle`` / ``ctrl.celloracle.links``.
    n_cells_downsample : maximum number of cells used for the fit.
    """
    oracle_path = os.path.join(save_dir, "ctrl.celloracle.oracle")
    links_path = os.path.join(save_dir, "ctrl.celloracle.links")

    oracle_cached = os.path.exists(oracle_path)
    if oracle_cached:
        print('file exists')
        oracle = co.load_hdf5(oracle_path)
    else:
        # -- recover counts from the log-transformed values
        adata.raw = adata
        raw_X = adata.raw.X
        if not isinstance(raw_X, np.ndarray):
            raw_X = raw_X.toarray()
        adata.layers["raw_count"] = np.exp(raw_X) - 1

        # -- embedding: scale -> PCA -> kNN graph -> UMAP
        sc.pp.scale(adata)
        sc.tl.pca(adata, svd_solver='arpack', random_state=2022)
        sc.pp.neighbors(adata, n_neighbors=4, n_pcs=20, random_state=2022)
        sc.tl.umap(adata, random_state=2022)

        # -- random downsampling when there are more than n_cells_downsample cells
        if adata.shape[0] > n_cells_downsample:
            sc.pp.subsample(adata, n_obs=n_cells_downsample, random_state=123)
        print(f"Cell number is :{adata.shape[0]}")

        oracle = co.Oracle()

        print("Metadata columns :", list(adata.obs.columns))
        print("Dimensional reduction: ", list(adata.obsm.keys()))

        # -- the Oracle object takes the unscaled counts as input
        adata.X = adata.layers["raw_count"].copy()
        oracle.import_anndata_as_raw_count(adata=adata,
                                        cluster_column_name="celltype",
                                        embedding_name="X_umap")
        oracle.import_TF_data(TF_info_matrix=base_GRN)

        # -- kNN imputation (only needed for the GRN fit)
        oracle.perform_PCA()

        # pick the number of PCs at the elbow of the cumulative explained variance
        cum_var = np.cumsum(oracle.pca.explained_variance_ratio_)
        plt.plot(cum_var[:100])
        elbow = np.where(np.diff(np.diff(cum_var)>0.002))[0]
        # without a detectable elbow the original indexing raised IndexError;
        # fall back to the 50-PC cap that is applied below anyway
        n_comps = elbow[0] if len(elbow) > 0 else 50
        plt.axvline(n_comps, c="k")
        plt.show()
        print(n_comps)
        n_comps = min(n_comps, 50)

        n_cell = oracle.adata.shape[0]
        print(f"cell number is :{n_cell}")

        k = int(0.025*n_cell)
        print(f"Auto-selected k is :{k}")

        oracle.knn_imputation(n_pca_dims=n_comps, k=k, balanced=True, b_sight=k*8,
                            b_maxl=k*4, n_jobs=4)

        os.makedirs(save_dir, exist_ok=True)
        oracle.to_hdf5(oracle_path)

    if oracle_cached and os.path.exists(links_path):
        links = co.load_hdf5(file_path=links_path)
    else:
        # GRN inference for the single "celltype" cluster; this can be slow
        links = oracle.get_links(cluster_name_for_GRN_unit="celltype", alpha=10,
                                verbose_level=10)
        links.to_hdf5(file_path=links_path)

    return oracle, links


def _build_simulation_oracle(adata_rna, base_GRN, coef_matrix_per_cluster):
    """Build an Oracle on all cells of ``adata_rna`` that reuses fitted coefficients.

    ``adata_rna`` itself is not modified (a copy is used). No kNN imputation is
    run: the ``normalized_count`` layer is copied into ``imputed_count`` and
    ``coef_matrix_per_cluster`` is attached to the returned oracle.
    """
    adata_ctrl = adata_rna.copy()
    adata_ctrl.raw = adata_ctrl

    # recover counts from the log-transformed values
    raw_X = adata_ctrl.raw.X
    if not isinstance(raw_X, np.ndarray):
        raw_X = raw_X.toarray()
    adata_ctrl.layers["raw_count"] = np.exp(raw_X) - 1

    # embedding: scale -> PCA -> kNN graph -> UMAP
    sc.pp.scale(adata_ctrl)
    sc.tl.pca(adata_ctrl, svd_solver='arpack', random_state=2022)
    sc.pp.neighbors(adata_ctrl, n_neighbors=4, n_pcs=20, random_state=2022)
    sc.tl.umap(adata_ctrl, random_state=2022)

    oracle_ctrl = co.Oracle()

    # the Oracle object takes the unscaled counts as input
    adata_ctrl.X = adata_ctrl.layers["raw_count"].copy()
    oracle_ctrl.import_anndata_as_raw_count(adata=adata_ctrl,
                                    cluster_column_name="celltype",
                                    embedding_name="X_umap")
    oracle_ctrl.import_TF_data(TF_info_matrix=base_GRN)

    # no imputation here: the normalized counts stand in for the imputed counts
    oracle_ctrl.adata.layers["imputed_count"] = oracle_ctrl.adata.layers["normalized_count"].copy()
    oracle_ctrl.coef_matrix_per_cluster = coef_matrix_per_cluster
    return oracle_ctrl


def _rank_genes_after_knockout(oracle_ctrl, adata_rna, adata_ref, pert):
    """Simulate setting ``pert`` to 0 and rank genes, simulated vs. control cells.

    Returns ``(gene_names, scores)``: two lists taken from the ``names`` and
    ``scores`` fields of scanpy's Wilcoxon ``rank_genes_groups`` result for the
    group ``'pert'`` against the reference ``'ctrl'``.
    """
    oracle_ctrl.simulate_shift(perturb_condition={pert: 0},
                        n_propagation=3)
    simulated_count = oracle_ctrl.adata.layers["simulated_count"]

    # control cells carrying the simulated expression; negatives clamped to 0
    adata_pert = adata_rna.copy()
    adata_pert.X = simulated_count
    adata_pert.X[adata_pert.X < 0] = 0
    adata_pert.obs_names = [f'{name}_{pert}' for name in adata_pert.obs_names]
    adata_pert.obs['batch'] = 'pert'

    adata_concat = ad.concat([adata_ref, adata_pert])
    adata_concat.obs['batch'] = adata_concat.obs['batch'].astype('category')
    adata_concat.obs['celltype'] = adata_concat.obs['celltype'].astype('category')

    sc.tl.rank_genes_groups(
        adata_concat,
        groupby='batch',
        reference='ctrl',
        rankby_abs=False,  # keep signed scores
        n_genes=len(adata_concat.var),
        use_raw=False,
        method = 'wilcoxon'
    )
    rank_result = adata_concat.uns['rank_genes_groups']
    de_genes = pd.DataFrame(rank_result['names'])
    scores = pd.DataFrame(rank_result['scores'])
    return list(de_genes['pert']), list(scores['pert'])


def process_cell_line(cell_line_bulk, cell_line_single, common_cell_line, adata_L1000):
    """Run the CellOracle knock-out simulation for one cell line and save the rankings.

    Parameters
    ----------
    cell_line_bulk : L1000 cell-line identifier; also selects the single-cell
        input file and the cache / result sub-directories.
    cell_line_single : single-cell name of the cell line (only printed).
    common_cell_line : full bulk -> single-cell name mapping (not used here;
        kept so existing callers keep working).
    adata_L1000 : L1000 AnnData with ``obs['cell_id']`` and ``obs['pert_iname']``.

    Returns
    -------
    None. Writes ``pert_gene_rank_dict.json`` for this cell line, mapping each
    simulated perturbation to ``[ranked gene names, scores]``.
    """
    print('=' * 20, f'cell line is {cell_line_single}')

    # - the single-cell data lives in one of two data-set folders
    if cell_line_bulk in ['PC3', 'A375']:
        save_dir_adata = '/nfs/public/lichen/results/single_cell_perturbation/perturbation_benchmark/benchmark_data/L1000/single_cell_data/SCP542'
    else:
        save_dir_adata = '/nfs/public/lichen/results/single_cell_perturbation/perturbation_benchmark/benchmark_data/L1000/single_cell_data/CNP0003658'
    adata_rna = sc.read(os.path.join(save_dir_adata, cell_line_bulk, f'adata_{cell_line_bulk}.h5ad'))

    # - make X dense so everything downstream can treat it as a numpy array
    if not isinstance(adata_rna.X, np.ndarray):
        adata_rna.X = adata_rna.X.toarray()

    # - perturbations measured in L1000 for this cell line
    adata_L1000_sub = adata_L1000[adata_L1000.obs['cell_id']==cell_line_bulk]
    L1000_total_perts = np.unique(adata_L1000_sub.obs['pert_iname'])

    n_cells_downsample = 10000  # cap on the number of cells used to fit the GRN
    threshold_number = 10000    # number of links kept by links.filter_links

    # - working copy used to fit the GRN
    adata = adata_rna.copy()
    adata.obs['celltype'] = cell_line_bulk
    print('adata.shape is: ', adata.shape)

    # - prebuilt human promoter base GRN shipped with CellOracle
    base_GRN = co.data.load_human_promoter_base_GRN()
    print('base_GRN.shape: ', base_GRN.shape)

    # - load the cached oracle / links for this cell line, or fit and cache them
    tmp_dir = '/nfs/public/lichen/results/single_cell_perturbation/perturbation_benchmark/L1000'
    save_prefix = f'CellOracle/{cell_line_bulk}'
    save_dir = os.path.join(tmp_dir, save_prefix)
    os.makedirs(save_dir, exist_ok=True)
    oracle, links = _load_or_fit_grn(adata, base_GRN, save_dir, n_cells_downsample)

    # - filter the links and fit the simulation coefficients
    links.filter_links(threshold_number=threshold_number,
                        p=0.001,
                        weight='coef_abs')
    oracle.get_cluster_specific_TFdict_from_Links(links_object=links)
    oracle.fit_GRN_for_simulation(alpha=10,
                                use_cluster_specific_TFdict=True)

    # - all TFs in the base GRN
    # NOTE(review): `import_TF_data` is not defined in this cell (presumably
    # provided by a star-import); its result is treated as a mapping
    # target gene -> iterable of TFs.
    TFdict = import_TF_data(TF_info_matrix=base_GRN)
    total_tf_list = list({tf: None for gene_set in TFdict.values() for tf in gene_set})

    # - candidate perturbations: TFs that are measured in the single-cell data
    #   and were perturbed in L1000 for this cell line
    single_total_perts = np.intersect1d(total_tf_list, adata.var_names)
    common_perts = np.intersect1d(single_total_perts, L1000_total_perts)
    print('L1000_total_perts num: ', len(L1000_total_perts))
    print('common_perts num: ', len(common_perts))
    print('common var to L1000 data is: ', len(np.intersect1d(list(adata.var_names), adata_L1000.var_names)))

    # - tf_GRN_dict: row label -> {column label: non-zero coefficient}; rows of
    #   the fitted coefficient matrix that are entirely zero are dropped. It is
    #   used below to skip genes without any fitted regulatory connection.
    celltype = adata.obs['celltype'].unique()[0]
    gene_GRN_mtx = oracle.coef_matrix_per_cluster[celltype].copy()
    tf_GRN_mtx = gene_GRN_mtx[~(gene_GRN_mtx == 0).all(axis=1)]
    tf_GRN_dict = {
        tf: coefs[coefs != 0].to_dict()
        for tf, coefs in tf_GRN_mtx.iterrows()
    }

    # - simulation oracle on ALL control cells, reusing the fitted coefficients
    adata_rna.obs['celltype'] = cell_line_bulk
    oracle_ctrl = _build_simulation_oracle(adata_rna, base_GRN, oracle.coef_matrix_per_cluster)

    # - control side of the differential test; it does not depend on the
    #   perturbation, so it is built once instead of once per iteration
    adata_ref = adata_rna.copy()
    adata_ref.obs['batch'] = 'ctrl'

    pert_gene_rank_dict = {}
    for pert in tqdm(common_perts):
        # - skip genes that cannot be simulated
        if np.mean(np.asarray(adata_rna[:, pert].X)) == 0:
            print(f'{pert} ctrl expression is 0')
            print(f'{pert} is filtered')
            continue
        if pert not in tf_GRN_dict:
            print(f'{pert} is not in the tf_GRN_dict, no targets')
            print(f'{pert} is filtered')
            continue

        # - all L1000 perturbations are knock-downs: simulate the gene set to 0
        pert_gene_rank_dict[pert] = _rank_genes_after_knockout(
            oracle_ctrl, adata_rna, adata_ref, pert)

    # - write {perturbation: (ranked gene names, scores)} for this cell line
    save_dir = '/nfs/public/lichen/results/single_cell_perturbation/perturbation_benchmark_202410/zero_shot/result'
    save_prefix = f'CellOracle/{cell_line_bulk}'
    os.makedirs(os.path.join(save_dir, save_prefix), exist_ok=True)

    with open(os.path.join(save_dir, save_prefix, 'pert_gene_rank_dict.json'), 'w') as f:
        json.dump(pert_gene_rank_dict, f)
        
# Entry point: run process_cell_line for every cell line in a process pool.
if __name__ == "__main__":
    import traceback

    # - L1000 (bulk) cell-line id -> single-cell cell-line name
    common_cell_line = \
    {   'A549': 'A549',
        'HEPG2': 'HepG2',
        'HT29': 'HT29',
        'MCF7': 'MCF7',
        # 'SKBR3': 'SK-BR-3',
        'SW480': 'SW480',
        'PC3': 'PC3',
        'A375': 'A375',
    }

    # - read adata_L1000, this is processed data
    adata_L1000 = sc.read('/nfs/public/lichen/results/single_cell_perturbation/perturbation_benchmark/benchmark_data/L1000/GSE92742/adata_gene_pert.h5ad')

    # - one job per cell line, executed in parallel worker processes
    with concurrent.futures.ProcessPoolExecutor() as executor:
        # remember which cell line each future belongs to, so that a failure
        # can be attributed to it
        future_to_cell_line = {
            executor.submit(process_cell_line, bulk_name, single_name, common_cell_line, adata_L1000): bulk_name
            for bulk_name, single_name in common_cell_line.items()
        }

        # - wait for all jobs; result() re-raises any exception from the worker
        for future in concurrent.futures.as_completed(future_to_cell_line):
            failed_job = future_to_cell_line[future]
            try:
                future.result()
            except Exception as e:
                # keep the other cell lines running, but report which job
                # failed and print the traceback instead of only the message
                print(f"Error: cell line {failed_job} failed: {e!r}")
                traceback.print_exc()


adata.shape is:  (500, 5155)
Loading prebuilt promoter base-GRN. Version: hg19_gimmemotifsv5_fpr2
base_GRN.shape:  (37003, 1096)
file exists

Loading prebuilt promoter base-GRN. Version: hg19_gimmemotifsv5_fpr2
base_GRN.shape:  (37003, 1096)
file exists
adata.shape is:  (500, 5141)
Loading prebuilt promoter base-GRN. Version: hg19_gimmemotifsv5_fpr2
base_GRN.shape:  (37003, 1096)
file exists
adata.shape is:  (409, 5450)
Loading prebuilt promoter base-GRN. Version: hg19_gimmemotifsv5_fpr2

base_GRN.shape:  (37003, 1096)
adata.shape is:  (169, 5417)
Loading prebuilt promoter base-GRN. Version: hg19_gimmemotifsv5_fpr2
adata.shape is:  (500, 3687)
Loading prebuilt promoter base-GRN. Version: hg19_gimmemotifsv5_fpr2
base_GRN.shape:  (37003, 1096)
file exists
base_GRN.shape:  (37003, 1096)
file exists
adata.shape is:  (500, 5379)
Loading prebuilt promoter base-GRN. Version: hg19_gimmemotifsv5_fpr2
base_GRN.shape:  (37003, 1096)
file exists


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

L1000_total_perts num:  3620
common_perts num:  306
common var to L1000 data is:  933
L1000_total_perts num:  232
common_perts num:  35
common var to L1000 data is:  932
L1000_total_perts num:  3341
common_perts num:  252
common var to L1000 data is:  915
5155 genes were found in the adata. Note that Celloracle is intended to use around 1000-3000 genes, so the behavior with this number of genes may differ from what is expected.


... storing 'celltype' as categorical




... storing 'celltype' as categorical


L1000_total_perts num:  3780
common_perts num:  278
common var to L1000 data is:  932
L1000_total_perts num:  3669
common_perts num:  291
common var to L1000 data is:  945
5032 genes were found in the adata. Note that Celloracle is intended to use around 1000-3000 genes, so the behavior with this number of genes may differ from what is expected.


... storing 'celltype' as categorical


L1000_total_perts num:  3302
common_perts num:  259
common var to L1000 data is:  931
5417 genes were found in the adata. Note that Celloracle is intended to use around 1000-3000 genes, so the behavior with this number of genes may differ from what is expected.


... storing 'celltype' as categorical


5379 genes were found in the adata. Note that Celloracle is intended to use around 1000-3000 genes, so the behavior with this number of genes may differ from what is expected.


... storing 'celltype' as categorical


L1000_total_perts num:  3649
common_perts num:  280
common var to L1000 data is:  930
5141 genes were found in the adata. Note that Celloracle is intended to use around 1000-3000 genes, so the behavior with this number of genes may differ from what is expected.


... storing 'celltype' as categorical


5450 genes were found in the adata. Note that Celloracle is intended to use around 1000-3000 genes, so the behavior with this number of genes may differ from what is expected.


... storing 'celltype' as categorical
  0%|          | 0/280 [00:00<?, ?it/s]... storing 'x' as categorical
  3%|▎         | 1/35 [01:03<35:43, 63.05s/it]it]... storing 'x' as categorical
  1%|          | 2/278 [01:07<2:32:51, 33.23s/it]

ARHGEF12 is not in the tf_GRN_dict, no targets
ARHGEF12 is filtered


  0%|          | 1/259 [01:18<5:38:40, 78.76s/it]... storing 'x' as categorical
  0%|          | 1/291 [01:22<6:38:05, 82.37s/it]... storing 'x' as categorical
  1%|▏         | 4/278 [01:38<1:40:45, 22.06s/it]

ARID5B is not in the tf_GRN_dict, no targets
ARID5B is filtered


  0%|          | 1/280 [01:40<7:49:00, 100.86s/it]

# L1000运行CellOracle-并行版 - v2 使用delta直接加上去

In [7]:
# Name of the method variant selected for this section of the notebook.
save_prefix_method = 'CellOracle_v2'

# CellOracle: use simulated_count
# CellOracle_v2: use the delta count (per the section heading, the delta is added on directly)

In [8]:
import os
import numpy as np
import pandas as pd
import scanpy as sc
from tqdm import tqdm
from sklearn.metrics import precision_recall_curve, auc
from scipy.spatial.distance import cdist
import concurrent.futures
import json

# Perturbation read-outs understood by process_cell_line:
#   'CellOracle'    -> perturbed expression is CellOracle's simulated_count
#   'CellOracle_v2' -> perturbed expression is control expression + delta_X
_SUPPORTED_METHODS = ('CellOracle', 'CellOracle_v2')


def _to_dense(matrix):
    """Return `matrix` as a dense numpy array (scipy sparse or array-like input)."""
    if hasattr(matrix, 'toarray'):
        return matrix.toarray()
    return np.asarray(matrix)


def _add_raw_count_and_umap(adata):
    """Prepare `adata` in place for `co.Oracle.import_anndata_as_raw_count`.

    Stores the current matrix in `adata.raw`, writes `exp(X) - 1` to
    `adata.layers['raw_count']` (i.e. X is treated as log1p-transformed --
    NOTE(review): confirm against the upstream preprocessing), then scales X
    and computes PCA, neighbours and a UMAP embedding with fixed seeds.
    """
    adata.raw = adata
    adata.layers["raw_count"] = np.exp(_to_dense(adata.raw.X)) - 1

    sc.pp.scale(adata)
    sc.tl.pca(adata, svd_solver='arpack', random_state=2022)
    sc.pp.neighbors(adata, n_neighbors=4, n_pcs=20, random_state=2022)
    sc.tl.umap(adata, random_state=2022)


def _load_or_build_ctrl_grn(adata, base_GRN, save_dir, n_cells_downsample):
    """Return `(oracle, links)` for the control cells, using the on-disk cache when present.

    The oracle and the links are cached as two separate files in `save_dir`.
    They are checked independently so that a run interrupted between the two
    writes does not leave the cache in a state that crashes on load. Links are
    only re-used when the oracle itself came from the cache; a freshly built
    oracle always gets freshly computed links.

    `adata` is modified in place when the oracle has to be built.
    """
    oracle_path = os.path.join(save_dir, "ctrl.celloracle.oracle")
    links_path = os.path.join(save_dir, "ctrl.celloracle.links")

    oracle_was_cached = os.path.exists(oracle_path)
    if oracle_was_cached:
        print('file exists')
        oracle = co.load_hdf5(oracle_path)
    else:
        # -- keep raw counts, then embed the control cells
        _add_raw_count_and_umap(adata)

        # -- random downsampling when there are more cells than n_cells_downsample
        if adata.shape[0] > n_cells_downsample:
            sc.pp.subsample(adata, n_obs=n_cells_downsample, random_state=123)
        print(f"Cell number is :{adata.shape[0]}")

        oracle = co.Oracle()

        # -- check data in anndata
        print("Metadata columns :", list(adata.obs.columns))
        print("Dimensional reduction: ", list(adata.obsm.keys()))

        # -- the Oracle object is fed the unscaled counts
        adata.X = adata.layers["raw_count"].copy()
        oracle.import_anndata_as_raw_count(adata=adata,
                                           cluster_column_name="celltype",
                                           embedding_name="X_umap")
        oracle.import_TF_data(TF_info_matrix=base_GRN)

        # -- PCA + choice of the number of components used for KNN imputation
        oracle.perform_PCA()
        plt.plot(np.cumsum(oracle.pca.explained_variance_ratio_)[:100])
        n_comps = np.where(np.diff(np.diff(np.cumsum(oracle.pca.explained_variance_ratio_))>0.002))[0][0]
        plt.axvline(n_comps, c="k")
        plt.show()
        print(n_comps)
        n_comps = min(n_comps, 50)

        n_cell = oracle.adata.shape[0]
        print(f"cell number is :{n_cell}")

        k = int(0.025*n_cell)
        print(f"Auto-selected k is :{k}")

        oracle.knn_imputation(n_pca_dims=n_comps, k=k, balanced=True, b_sight=k*8,
                              b_maxl=k*4, n_jobs=4)

        os.makedirs(save_dir, exist_ok=True)
        oracle.to_hdf5(oracle_path)

    if oracle_was_cached and os.path.exists(links_path):
        links = co.load_hdf5(file_path=links_path)
    else:
        # GRN inference for the single "celltype" cluster; this step is slow.
        links = oracle.get_links(cluster_name_for_GRN_unit="celltype", alpha=10,
                                 verbose_level=10)
        links.to_hdf5(file_path=links_path)

    return oracle, links


def process_cell_line(cell_line_bulk, cell_line_single, common_cell_line, adata_L1000, method=None):
    """Simulate knock-downs with CellOracle for one cell line and save gene rankings.

    For every perturbation gene that is (a) a TF in the base GRN, (b) a gene of
    the single-cell data and (c) a perturbation of this cell line in
    `adata_L1000`, the gene is set to 0 with `Oracle.simulate_shift`, the
    simulated cells are compared with the control cells by a Wilcoxon
    rank-sum test, and the ranked genes plus their scores are collected.
    The result is written to `<result dir>/<method>/<cell_line_bulk>/pert_gene_rank_dict.json`
    as `{pert: [gene_names, scores]}`.

    Parameters
    ----------
    cell_line_bulk : str
        Cell line name as used in `adata_L1000.obs['cell_id']`; also used to
        locate the single-cell h5ad file and the cache/result directories.
    cell_line_single : str
        Cell line name in the single-cell data; only used in the log line.
    common_cell_line : dict
        Mapping bulk name -> single-cell name (not read by this function).
    adata_L1000 : AnnData
        L1000 data; `obs['cell_id']`, `obs['pert_iname']` and `var_names` are read.
    method : str, optional
        'CellOracle' or 'CellOracle_v2'. Defaults to the module-level
        `save_prefix_method`, which is what the function used before this
        parameter existed.

    Raises
    ------
    ValueError
        If the resolved method is not one of the supported values (previously
        an unknown value silently produced "perturbed" data equal to control).
    """
    if method is None:
        method = save_prefix_method
    if method not in _SUPPORTED_METHODS:
        raise ValueError(
            f'unknown method {method!r}; expected one of {_SUPPORTED_METHODS}')

    print('=' * 20, f'cell line is {cell_line_single}')

    #=================== prepare data
    if cell_line_bulk in ['PC3', 'A375']:
        save_dir_adata = '/nfs/public/lichen/results/single_cell_perturbation/perturbation_benchmark/benchmark_data/L1000/single_cell_data/SCP542'
    else:
        save_dir_adata = '/nfs/public/lichen/results/single_cell_perturbation/perturbation_benchmark/benchmark_data/L1000/single_cell_data/CNP0003658'
    adata_rna = sc.read(os.path.join(save_dir_adata, cell_line_bulk, f'adata_{cell_line_bulk}.h5ad'))

    # - work on a dense expression matrix from here on
    if not isinstance(adata_rna.X, np.ndarray):
        adata_rna.X = _to_dense(adata_rna.X)

    # - perturbations available in L1000 for this cell line
    adata_L1000_sub = adata_L1000[adata_L1000.obs['cell_id']==cell_line_bulk]
    L1000_total_perts = np.unique(adata_L1000_sub.obs['pert_iname'])

    n_cells_downsample = 10000
    threshold_number = 10000

    ##########################################################
    # - control adata used to infer the GRN
    adata = adata_rna.copy()
    adata.obs['celltype'] = cell_line_bulk
    print('adata.shape is: ', adata.shape)

    # -- base GRN (CellOracle's prebuilt human promoter base GRN)
    base_GRN = co.data.load_human_promoter_base_GRN()
    print('base_GRN.shape: ', base_GRN.shape)

    # -- the fitted oracle/links are cached per cell line under CellOracle/<cell line>
    tmp_dir = '/nfs/public/lichen/results/single_cell_perturbation/perturbation_benchmark/L1000'
    grn_cache_dir = os.path.join(tmp_dir, f'CellOracle/{cell_line_bulk}')
    os.makedirs(grn_cache_dir, exist_ok=True)

    oracle, links = _load_or_build_ctrl_grn(adata, base_GRN, grn_cache_dir, n_cells_downsample)

    # -- filter the links and fit the coefficient matrix used for simulation
    links.filter_links(threshold_number=threshold_number,
                       p=0.001,
                       weight='coef_abs')
    oracle.get_cluster_specific_TFdict_from_Links(links_object=links)
    oracle.fit_GRN_for_simulation(alpha=10,
                                  use_cluster_specific_TFdict=True)

    ###################################################
    # - all TFs of the base GRN. import_TF_data is expected to be in scope from
    #   an earlier cell; its result is iterated as {target: collection of TFs}.
    TFdict = import_TF_data(TF_info_matrix=base_GRN)
    total_tf_list = sorted({tf for tfs in TFdict.values() for tf in tfs})

    #####################################################
    var_names = list(adata.var_names)
    single_total_perts = np.intersect1d(total_tf_list, adata.var_names)
    common_perts = np.intersect1d(single_total_perts, L1000_total_perts)
    print('L1000_total_perts num: ', len(L1000_total_perts))
    print('common_perts num: ', len(common_perts))
    print('common var to L1000 data is: ', len(np.intersect1d(var_names, adata_L1000.var_names)))

    ###########################################
    celltype = adata.obs['celltype'].unique()[0]

    # - rows of the coefficient matrix with at least one non-zero entry; a
    #   perturbed gene outside this set has no fitted regulatory relation and
    #   is skipped below
    gene_GRN_mtx = oracle.coef_matrix_per_cluster[celltype]
    tfs_with_targets = set(gene_GRN_mtx.index[~(gene_GRN_mtx == 0).all(axis=1)])

    ###########################################
    # - oracle_ctrl: Oracle over ALL control cells (no downsampling, no KNN
    #   imputation) that re-uses the coefficients fitted above
    adata_rna.obs['celltype'] = cell_line_bulk
    adata_ctrl = adata_rna.copy()
    _add_raw_count_and_umap(adata_ctrl)

    oracle_ctrl = co.Oracle()
    adata_ctrl.X = adata_ctrl.layers["raw_count"].copy()
    oracle_ctrl.import_anndata_as_raw_count(adata=adata_ctrl,
                                            cluster_column_name="celltype",
                                            embedding_name="X_umap")
    oracle_ctrl.import_TF_data(TF_info_matrix=base_GRN)

    # no imputation here: the normalized counts stand in for the imputed counts
    oracle_ctrl.adata.layers["imputed_count"] = oracle_ctrl.adata.layers["normalized_count"].copy()
    oracle_ctrl.coef_matrix_per_cluster = oracle.coef_matrix_per_cluster

    # - reference (control) cells for the DE test; identical for every perturbation
    adata_ref = adata_rna.copy()
    adata_ref.obs['batch'] = 'ctrl'

    pert_gene_rank_dict = {}
    for pert in tqdm(common_perts):

        # - every perturbation is simulated as a knock-down (expression set to 0)
        goi_dict = {}
        if np.mean(_to_dense(adata_rna[:, pert].X)) == 0:
            # nothing to knock down if the gene is not expressed in control
            print(f'{pert} ctrl expression is 0')
        elif pert not in tfs_with_targets:
            print(f'{pert} is not in the tf_GRN_dict, no targets')
        else:
            goi_dict[pert] = 0
        if len(goi_dict) == 0:
            print(f'{pert} is filtered')
            continue

        # - simulate signal propagation after the perturbation
        oracle_ctrl.simulate_shift(perturb_condition=goi_dict,
                                   n_propagation=3)
        delta_X = oracle_ctrl.adata.layers["delta_X"]
        simulated_count = oracle_ctrl.adata.layers["simulated_count"]

        # - build the perturbed cells
        adata_pert = adata_rna.copy()
        if method == 'CellOracle':
            adata_pert.X = simulated_count
        else:  # 'CellOracle_v2'
            adata_pert.X += delta_X
        adata_pert.X[adata_pert.X < 0] = 0
        adata_pert.obs_names = [i+f'_{pert}' for i in adata_pert.obs_names]
        adata_pert.obs['batch'] = 'pert'

        adata_concat = ad.concat([adata_ref, adata_pert])
        adata_concat.obs['batch'] = adata_concat.obs['batch'].astype('category')
        adata_concat.obs['celltype'] = adata_concat.obs['celltype'].astype('category')

        # - rank all genes, perturbed vs control
        sc.tl.rank_genes_groups(
            adata_concat,
            groupby='batch',
            reference='ctrl',
            rankby_abs=False,
            n_genes=len(adata_concat.var),
            use_raw=False,
            method='wilcoxon'
        )
        ranking = adata_concat.uns['rank_genes_groups']
        ranked_genes = pd.DataFrame(ranking['names'])['pert'].tolist()
        ranked_scores = pd.DataFrame(ranking['scores'])['pert'].tolist()

        pert_gene_rank_dict[str(pert)] = (ranked_genes, ranked_scores)

    # - save {pert: [genes, scores]} under <method>/<cell line>
    save_dir = '/nfs/public/lichen/results/single_cell_perturbation/perturbation_benchmark_202410/zero_shot/result'
    result_dir = os.path.join(save_dir, f'{method}/{cell_line_bulk}')
    os.makedirs(result_dir, exist_ok=True)

    with open(os.path.join(result_dir, 'pert_gene_rank_dict.json'), 'w') as f:
        json.dump(pert_gene_rank_dict, f)
        
# Entry point: process every cell line in its own worker process.
if __name__ == "__main__":
    import traceback

    # - get cell line name
    common_cell_line = \
    {   'A549': 'A549',
        'HEPG2': 'HepG2',
        'HT29': 'HT29',
        'MCF7': 'MCF7',
        # 'SKBR3': 'SK-BR-3',
        'SW480': 'SW480',
        'PC3': 'PC3',
        'A375': 'A375',
    } # L1000 cell line : single-cell cell line

    # - read adata_L1000, this is processed data
    adata_L1000 = sc.read('/nfs/public/lichen/results/single_cell_perturbation/perturbation_benchmark/benchmark_data/L1000/GSE92742/adata_gene_pert.h5ad')

    # Run the cell lines in parallel; remember which future belongs to which
    # cell line so that a failure can be attributed.
    with concurrent.futures.ProcessPoolExecutor() as executor:
        future_to_cell_line = {
            executor.submit(process_cell_line, cell_line_bulk, cell_line_single, common_cell_line, adata_L1000): cell_line_bulk
            for cell_line_bulk, cell_line_single in common_cell_line.items()
        }

        # Wait for all tasks. A failing cell line is reported (with the
        # traceback raised in the worker) but does not stop the others.
        for future in concurrent.futures.as_completed(future_to_cell_line):
            cell_line_bulk = future_to_cell_line[future]
            try:
                future.result()  # re-raises any exception from the worker
            except Exception as e:
                print(f"Error: cell line {cell_line_bulk} failed: {e}")
                traceback.print_exception(type(e), e, e.__traceback__)


adata.shape is:  (500, 5155)
Loading prebuilt promoter base-GRN. Version: hg19_gimmemotifsv5_fpr2
base_GRN.shape:  (37003, 1096)
file exists
adata.shape is:  (500, 5032)
Loading prebuilt promoter base-GRN. Version: hg19_gimmemotifsv5_fpr2
base_GRN.shape:  (37003, 1096)
file exists
adata.shape is:  (500, 5141)
Loading prebuilt promoter base-GRN. Version: hg19_gimmemotifsv5_fpr2
base_GRN.shape:  (37003, 1096)
file exists
adata.shape is:  (409, 5450)
Loading prebuilt promoter base-GRN. Version: hg19_gimmemotifsv5_fpr2
base_GRN.shape:  (37003, 1096)
file exists
adata.shape is:  (169, 5417)
Loading prebuilt promoter base-GRN. Version: hg19_gimmemotifsv5_fpr2
base_GRN.shape:  (37003, 1096)
file exists
adata.shape is:  (500, 3687)
Loading prebuilt promoter base-GRN. Version: hg19_gimmemotifsv5_fpr2
base_GRN.shape:  (37003, 1096)
file exists
adata.shape is:  (500, 5379)
Loading prebuilt promoter base-GRN. Version: hg19_gimmemotifsv5_fpr2
base_GRN.shape:  (37003, 1096)
file exists


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

L1000_total_perts num:  3780
common_perts num:  278
common var to L1000 data is:  932
L1000_total_perts num:  232
common_perts num:  35
common var to L1000 data is:  932
5417 genes were found in the adata. Note that Celloracle is intended to use around 1000-3000 genes, so the behavior with this number of genes may differ from what is expected.


... storing 'celltype' as categorical


L1000_total_perts num:  3620
common_perts num:  306
common var to L1000 data is:  933
L1000_total_perts num:  3669
common_perts num:  291
common var to L1000 data is:  945
L1000_total_perts num:  3341
common_perts num:  252
common var to L1000 data is:  915
L1000_total_perts num:  3649
common_perts num:  280
common var to L1000 data is:  930
L1000_total_perts num:  3302
common_perts num:  259
common var to L1000 data is:  931


  0%|          | 0/278 [00:00<?, ?it/s]



... storing 'celltype' as categorical


5155 genes were found in the adata. Note that Celloracle is intended to use around 1000-3000 genes, so the behavior with this number of genes may differ from what is expected.


... storing 'celltype' as categorical


5379 genes were found in the adata. Note that Celloracle is intended to use around 1000-3000 genes, so the behavior with this number of genes may differ from what is expected.


... storing 'celltype' as categorical


5032 genes were found in the adata. Note that Celloracle is intended to use around 1000-3000 genes, so the behavior with this number of genes may differ from what is expected.


... storing 'celltype' as categorical


5450 genes were found in the adata. Note that Celloracle is intended to use around 1000-3000 genes, so the behavior with this number of genes may differ from what is expected.


... storing 'celltype' as categorical


5141 genes were found in the adata. Note that Celloracle is intended to use around 1000-3000 genes, so the behavior with this number of genes may differ from what is expected.


... storing 'celltype' as categorical
  0%|          | 0/259 [00:00<?, ?it/s]... storing 'x' as categorical
  3%|▎         | 1/35 [00:31<17:42, 31.24s/it]it]... storing 'x' as categorical
  1%|          | 2/278 [00:47<1:49:27, 23.79s/it]

ARHGEF12 is not in the tf_GRN_dict, no targets
ARHGEF12 is filtered


  0%|          | 1/252 [00:55<3:51:32, 55.35s/it]... storing 'x' as categorical
  1%|▏         | 4/278 [01:09<1:12:11, 15.81s/it]

ARID5B is not in the tf_GRN_dict, no targets
ARID5B is filtered


  6%|▌         | 2/35 [01:00<16:36, 30.20s/it]it]... storing 'x' as categorical
  0%|          | 1/291 [01:02<5:03:47, 62.85s/it]... storing 'x' as categorical
  9%|▊         | 3/35 [01:41<18:48, 35.26s/it]it]... storing 'x' as categorical
  1%|          | 2/306 [01:54<4:48:25, 56.92s/it]... storing 'x' as categorical
  1%|          | 2/291 [02:03<4:55:43, 61.39s/it]... storing 'x' as categorical
 11%|█▏        | 4/35 [02:11<16:57, 32.81s/it]it]... storing 'x' as categorical
  1%|          | 3/306 [02:50<4:45:55, 56.62s/it]... storing 'x' as categorical
 14%|█▍        | 5/35 [02:57<18:47, 37.59s/it]/it]... storing 'x' as categorical
  2%|▏         | 4/259 [03:36<3:48:26, 53.75s/it]]... storing 'x' as categorical
... storing 'x' as categorical
 20%|██        | 7/35 [03:56<15:29, 33.20s/it]it]]... storing 'x' as categorical
  2%|▏         | 5/280 [04:19<3:57:06, 51.73s/it]]... storing 'x' as categorical
  2%|▏         | 5/259 [04:29<3:46:28, 53.50s/it]]... storing 'x' as categorical
  2%

CBFB is not in the tf_GRN_dict, no targets
CBFB is filtered


 40%|████      | 14/35 [07:39<11:16, 32.21s/it]t]... storing 'x' as categorical
 43%|████▎     | 15/35 [08:08<10:26, 31.32s/it]t]]... storing 'x' as categorical
  3%|▎         | 9/306 [08:25<4:36:03, 55.77s/it]]... storing 'x' as categorical
 46%|████▌     | 16/35 [08:37<09:41, 30.63s/it]it]... storing 'x' as categorical
  4%|▍         | 10/259 [08:56<3:41:31, 53.38s/it]... storing 'x' as categorical
  4%|▎         | 10/280 [09:19<4:41:52, 62.64s/it]

ARX is not in the tf_GRN_dict, no targets
ARX is filtered


... storing 'x' as categorical
 49%|████▊     | 17/35 [09:23<10:33, 35.21s/it]it]... storing 'x' as categorical
 51%|█████▏    | 18/35 [09:52<09:27, 33.36s/it]it]... storing 'x' as categorical
  4%|▎         | 11/306 [10:17<4:34:19, 55.79s/it]... storing 'x' as categorical
  3%|▎         | 8/291 [10:26<6:41:28, 85.12s/it]... storing 'x' as categorical
  5%|▍         | 12/259 [10:42<3:39:24, 53.30s/it]... storing 'x' as categorical
 57%|█████▋    | 20/35 [11:08<08:41, 34.78s/it]it]... storing 'x' as categorical
  5%|▍         | 13/280 [11:23<3:44:48, 50.52s/it]... storing 'x' as categorical
  5%|▌         | 13/259 [11:36<3:38:47, 53.36s/it]... storing 'x' as categorical
  4%|▍         | 12/306 [11:45<5:21:40, 65.65s/it]

ARX is not in the tf_GRN_dict, no targets
ARX is filtered


  5%|▌         | 13/252 [11:53<3:39:47, 55.18s/it]... storing 'x' as categorical
  5%|▌         | 14/280 [12:15<3:45:11, 50.80s/it]... storing 'x' as categorical
 13%|█▎        | 36/278 [12:38<1:32:32, 22.94s/it]... storing 'x' as categorical
  5%|▌         | 14/259 [12:29<3:37:31, 53.27s/it]

ATF6 is not in the tf_GRN_dict, no targets
ATF6 is filtered


  6%|▌         | 14/252 [12:46<3:36:31, 54.58s/it]... storing 'x' as categorical
 13%|█▎        | 37/278 [13:01<1:32:13, 22.96s/it]

DLX2 is not in the tf_GRN_dict, no targets
DLX2 is filtered


  5%|▌         | 15/280 [13:06<3:44:45, 50.89s/it]... storing 'x' as categorical
  6%|▌         | 16/259 [13:22<2:45:48, 40.94s/it]... storing 'x' as categorical
 71%|███████▏  | 25/35 [13:34<05:02, 30.22s/it]it]... storing 'x' as categorical
  6%|▌         | 15/252 [13:40<3:33:59, 54.18s/it]... storing 'x' as categorical
  6%|▌         | 16/280 [14:18<4:10:57, 57.03s/it]... storing 'x' as categorical
 15%|█▌        | 42/278 [14:33<1:22:10, 20.89s/it]... storing 'x' as categorical
  6%|▋         | 16/252 [14:33<3:31:52, 53.86s/it]... storing 'x' as categorical
 80%|████████  | 28/35 [15:02<03:26, 29.56s/it]it]... storing 'x' as categorical
  7%|▋         | 17/252 [15:26<3:30:29, 53.74s/it]... storing 'x' as categorical
  6%|▌         | 17/280 [15:32<4:30:12, 61.65s/it]... storing 'x' as categorical
  7%|▋         | 19/259 [16:02<3:13:44, 48.44s/it]... storing 'x' as categorical
  4%|▍         | 13/291 [16:06<5:49:24, 75.41s/it]... storing 'x' as categorical
 89%|████████▊ | 31/35 [16:3

CBFB is not in the tf_GRN_dict, no targets
CBFB is filtered
CDX2 is not in the tf_GRN_dict, no targets
CDX2 is filtered


... storing 'x' as categorical
  8%|▊         | 26/306 [23:49<4:18:01, 55.29s/it]... storing 'x' as categorical
 24%|██▍       | 67/278 [24:03<1:19:55, 22.73s/it]... storing 'x' as categorical
 24%|██▍       | 68/278 [24:26<1:20:02, 22.87s/it]... storing 'x' as categorical
 11%|█         | 28/259 [24:29<3:28:28, 54.15s/it]... storing 'x' as categorical
  9%|▉         | 27/306 [24:44<4:16:32, 55.17s/it]

CBFB is not in the tf_GRN_dict, no targets
CBFB is filtered


... storing 'x' as categorical
 11%|█         | 28/252 [25:09<3:17:09, 52.81s/it]... storing 'x' as categorical
  7%|▋         | 21/291 [25:17<4:58:09, 66.26s/it]... storing 'x' as categorical
 10%|█         | 29/280 [25:30<3:02:09, 43.54s/it]

CEBPD is not in the tf_GRN_dict, no targets
CEBPD is filtered


  9%|▉         | 29/306 [25:40<3:16:11, 42.49s/it]... storing 'x' as categorical
 12%|█▏        | 29/252 [26:02<3:16:17, 52.81s/it]... storing 'x' as categorical
 26%|██▋       | 73/278 [26:21<1:18:16, 22.91s/it]... storing 'x' as categorical
 11%|█         | 31/280 [26:21<2:32:29, 36.75s/it]... storing 'x' as categorical
 10%|▉         | 30/306 [26:35<3:30:00, 45.65s/it]... storing 'x' as categorical
 12%|█▏        | 31/259 [27:10<3:23:52, 53.65s/it]... storing 'x' as categorical
  8%|▊         | 23/291 [27:17<4:41:24, 63.00s/it]... storing 'x' as categorical
 11%|█▏        | 32/280 [27:33<3:03:13, 44.33s/it]... storing 'x' as categorical
 12%|█▏        | 31/252 [27:47<3:14:32, 52.82s/it]... storing 'x' as categorical
 12%|█▏        | 32/259 [28:03<3:22:47, 53.60s/it]... storing 'x' as categorical
 10%|█         | 32/306 [28:26<3:49:02, 50.15s/it]... storing 'x' as categorical
 12%|█▏        | 33/280 [28:46<3:29:35, 50.91s/it]... storing 'x' as categorical
 13%|█▎        | 33/259 [28:

CREB3L1 is not in the tf_GRN_dict, no targets
CREB3L1 is filtered


... storing 'x' as categorical
  9%|▉         | 27/291 [31:18<4:28:16, 60.97s/it]

CEBPD is not in the tf_GRN_dict, no targets
CEBPD is filtered


 14%|█▍        | 35/252 [31:19<3:11:03, 52.83s/it]... storing 'x' as categorical
 13%|█▎        | 36/280 [31:41<3:44:33, 55.22s/it]... storing 'x' as categorical
 14%|█▍        | 36/252 [32:12<3:10:31, 52.92s/it]... storing 'x' as categorical
 32%|███▏      | 89/278 [32:28<1:12:22, 22.98s/it]... storing 'x' as categorical
 13%|█▎        | 37/280 [32:32<3:39:07, 54.10s/it]... storing 'x' as categorical
 32%|███▏      | 90/278 [32:51<1:12:01, 22.99s/it]... storing 'x' as categorical
 12%|█▏        | 38/306 [33:04<3:22:22, 45.31s/it]... storing 'x' as categorical
 15%|█▍        | 38/259 [33:23<3:16:20, 53.30s/it]... storing 'x' as categorical
 33%|███▎      | 92/278 [33:37<1:11:21, 23.02s/it]

HOXA10 is not in the tf_GRN_dict, no targets
HOXA10 is filtered


 14%|█▎        | 38/280 [33:45<3:59:47, 59.45s/it]... storing 'x' as categorical
 13%|█▎        | 39/306 [34:00<3:33:21, 47.95s/it]... storing 'x' as categorical
 15%|█▌        | 38/252 [34:16<3:22:08, 56.68s/it]... storing 'x' as categorical
 11%|█         | 31/291 [34:19<3:48:59, 52.84s/it]... storing 'x' as categorical
 13%|█▎        | 40/306 [34:55<3:41:36, 49.99s/it]... storing 'x' as categorical
 15%|█▌        | 39/252 [35:09<3:17:39, 55.68s/it]... storing 'x' as categorical
 11%|█         | 32/291 [35:19<3:56:56, 54.89s/it]... storing 'x' as categorical
 14%|█▍        | 40/280 [35:28<3:41:41, 55.42s/it]

DLX2 is not in the tf_GRN_dict, no targets
DLX2 is filtered


 15%|█▌        | 40/259 [35:40<3:37:59, 59.72s/it]... storing 'x' as categorical
 13%|█▎        | 41/306 [35:51<3:47:37, 51.54s/it]

DLX2 is not in the tf_GRN_dict, no targets
DLX2 is filtered


 16%|█▌        | 40/252 [36:02<3:13:41, 54.82s/it]... storing 'x' as categorical
 36%|███▌      | 100/278 [36:18<1:05:58, 22.24s/it]... storing 'x' as categorical
 11%|█▏        | 33/291 [36:20<4:02:37, 56.42s/it]... storing 'x' as categorical
 14%|█▍        | 43/306 [36:47<2:59:57, 41.05s/it]]... storing 'x' as categorical
 16%|█▋        | 41/252 [36:55<3:10:58, 54.30s/it]]... storing 'x' as categorical
 37%|███▋      | 103/278 [37:27<1:06:26, 22.78s/it]... storing 'x' as categorical
 16%|█▌        | 42/259 [37:26<3:24:08, 56.44s/it]... storing 'x' as categorical
 15%|█▌        | 43/280 [37:53<3:29:48, 53.12s/it]]... storing 'x' as categorical
 38%|███▊      | 105/278 [38:13<1:06:02, 22.90s/it]... storing 'x' as categorical
 17%|█▋        | 43/259 [38:20<3:20:04, 55.58s/it]... storing 'x' as categorical
 16%|█▌        | 44/280 [38:45<3:27:07, 52.66s/it]]... storing 'x' as categorical
 38%|███▊      | 107/278 [39:00<1:05:30, 22.99s/it]... storing 'x' as categorical
 17%|█▋        | 44/

ELK3 is not in the tf_GRN_dict, no targets
ELK3 is filtered


... storing 'x' as categorical
 40%|███▉      | 111/278 [40:32<1:04:15, 23.08s/it]... storing 'x' as categorical
 15%|█▌        | 47/306 [40:30<3:41:52, 51.40s/it]... storing 'x' as categorical
 18%|█▊        | 47/259 [41:00<2:27:08, 41.64s/it]]... storing 'x' as categorical
 18%|█▊        | 46/252 [41:20<3:02:36, 53.19s/it]]... storing 'x' as categorical
 16%|█▌        | 48/306 [41:26<3:46:38, 52.71s/it]... storing 'x' as categorical
 41%|████      | 114/278 [41:40<1:02:14, 22.77s/it]... storing 'x' as categorical
 41%|████▏     | 115/278 [42:03<1:02:07, 22.87s/it]

ISL1 is not in the tf_GRN_dict, no targets
ISL1 is filtered


 19%|█▊        | 47/252 [42:13<3:02:02, 53.28s/it]... storing 'x' as categorical
 16%|█▌        | 49/306 [42:22<3:49:13, 53.52s/it] ... storing 'x' as categorical
 13%|█▎        | 39/291 [42:22<4:11:31, 59.89s/it]

DLX2 is not in the tf_GRN_dict, no targets
DLX2 is filtered


... storing 'x' as categorical
 17%|█▋        | 48/280 [42:52<3:39:23, 56.74s/it]... storing 'x' as categorical
 16%|█▋        | 50/306 [43:17<3:50:51, 54.11s/it]... storing 'x' as categorical
 14%|█▍        | 41/291 [43:22<3:12:55, 46.30s/it]... storing 'x' as categorical
 18%|█▊        | 49/280 [43:44<3:32:20, 55.15s/it]... storing 'x' as categorical
 19%|█▉        | 49/252 [43:59<2:59:44, 53.12s/it]... storing 'x' as categorical
 17%|█▋        | 51/306 [44:13<3:52:24, 54.68s/it]... storing 'x' as categorical
 14%|█▍        | 42/291 [44:23<3:26:28, 49.75s/it]... storing 'x' as categorical
 20%|█▉        | 50/252 [44:52<2:58:58, 53.16s/it]... storing 'x' as categorical
 17%|█▋        | 52/306 [45:09<3:52:43, 54.97s/it]... storing 'x' as categorical
 45%|████▍     | 125/278 [45:31<57:41, 22.62s/it]... storing 'x' as categorical
 20%|██        | 52/259 [45:28<2:55:53, 50.98s/it]... storing 'x' as categorical
 17%|█▋        | 53/306 [46:03<3:51:14, 54.84s/it]

ELF4 is not in the tf_GRN_dict, no targets
ELF4 is filtered


... storing 'x' as categorical
 46%|████▌     | 127/278 [46:16<56:37, 22.50s/it]... storing 'x' as categorical
 20%|██        | 53/259 [46:21<2:57:28, 51.69s/it]... storing 'x' as categorical
 19%|█▊        | 52/280 [46:37<3:29:42, 55.19s/it]... storing 'x' as categorical
 21%|██        | 52/252 [47:06<3:24:50, 61.45s/it]... storing 'x' as categorical
 21%|██        | 54/259 [47:15<2:58:27, 52.23s/it]... storing 'x' as categorical
 19%|█▉        | 53/280 [47:27<3:23:13, 53.72s/it]... storing 'x' as categorical
 47%|████▋     | 131/278 [47:47<55:33, 22.68s/it]... storing 'x' as categorical
 21%|██        | 55/259 [48:08<2:58:30, 52.50s/it]... storing 'x' as categorical
 48%|████▊     | 133/278 [48:32<54:37, 22.61s/it]... storing 'x' as categorical
 19%|█▉        | 54/280 [48:38<3:41:57, 58.93s/it]... storing 'x' as categorical
 22%|██▏       | 56/259 [49:01<2:58:29, 52.76s/it]... storing 'x' as categorical
 19%|█▊        | 57/306 [49:19<3:41:53, 53.47s/it]... storing 'x' as categorical


PRDM4 is not in the tf_GRN_dict, no targets
PRDM4 is filtered


... storing 'x' as categorical
 29%|██▉       | 74/252 [1:06:12<2:34:20, 52.03s/it]... storing 'x' as categorical
 25%|██▍       | 76/306 [1:06:37<3:29:48, 54.73s/it]... storing 'x' as categorical
 26%|██▋       | 74/280 [1:06:46<3:17:50, 57.62s/it]... storing 'x' as categorical
 30%|██▉       | 75/252 [1:07:03<2:33:11, 51.93s/it]... storing 'x' as categorical
 22%|██▏       | 65/291 [1:07:05<3:42:24, 59.04s/it]... storing 'x' as categorical
 29%|██▉       | 76/259 [1:07:46<3:13:37, 63.48s/it]... storing 'x' as categorical
 67%|██████▋   | 185/278 [1:08:01<32:43, 21.12s/it]... storing 'x' as categorical
 23%|██▎       | 66/291 [1:08:04<3:41:05, 58.96s/it]... storing 'x' as categorical
 27%|██▋       | 76/280 [1:08:26<3:03:00, 53.83s/it]... storing 'x' as categorical
 30%|██▉       | 77/259 [1:08:39<3:03:35, 60.52s/it]... storing 'x' as categorical
 68%|██████▊   | 188/278 [1:09:10<33:08, 22.10s/it]... storing 'x' as categorical
 28%|██▊       | 77/280 [1:09:17<2:58:23, 52.73s/it]

FOXO4 is not in the tf_GRN_dict, no targets
FOXO4 is filtered


... storing 'x' as categorical
 30%|███       | 78/259 [1:09:33<2:56:23, 58.47s/it]... storing 'x' as categorical
 68%|██████▊   | 190/278 [1:09:56<33:08, 22.60s/it]... storing 'x' as categorical
 23%|██▎       | 68/291 [1:10:02<3:39:22, 59.02s/it]... storing 'x' as categorical
 31%|███       | 79/259 [1:10:26<2:50:49, 56.94s/it]... storing 'x' as categorical
 69%|██████▉   | 192/278 [1:10:42<32:45, 22.86s/it]... storing 'x' as categorical
 29%|██▊       | 80/280 [1:10:57<2:21:39, 42.50s/it]... storing 'x' as categorical
 26%|██▋       | 81/306 [1:11:10<3:24:51, 54.63s/it]... storing 'x' as categorical
 31%|███       | 80/259 [1:11:20<2:46:49, 55.92s/it]... storing 'x' as categorical
 32%|███▏      | 80/252 [1:11:53<2:35:07, 54.11s/it]... storing 'x' as categorical
 24%|██▍       | 70/291 [1:12:00<3:37:54, 59.16s/it]... storing 'x' as categorical
 31%|███▏      | 81/259 [1:12:13<2:43:51, 55.23s/it]

HIC2 is not in the tf_GRN_dict, no targets
HIC2 is filtered


... storing 'x' as categorical
 29%|██▉       | 82/280 [1:12:38<2:32:15, 46.14s/it]

GLI3 is not in the tf_GRN_dict, no targets
GLI3 is filtered


 32%|███▏      | 81/252 [1:12:45<2:32:29, 53.51s/it]... storing 'x' as categorical
 71%|███████   | 198/278 [1:12:59<30:32, 22.91s/it]... storing 'x' as categorical
 32%|███▏      | 83/259 [1:13:07<2:03:31, 42.11s/it]... storing 'x' as categorical
 30%|███       | 84/280 [1:13:28<2:00:38, 36.93s/it]... storing 'x' as categorical
 27%|██▋       | 84/306 [1:13:55<3:22:37, 54.76s/it]

GLI3 is not in the tf_GRN_dict, no targets
GLI3 is filtered


... storing 'x' as categorical
 72%|███████▏  | 201/278 [1:14:08<29:24, 22.92s/it]... storing 'x' as categorical
 32%|███▏      | 84/259 [1:14:00<2:10:46, 44.84s/it]... storing 'x' as categorical
 30%|███       | 85/280 [1:14:39<2:26:33, 45.09s/it]... storing 'x' as categorical
 33%|███▎      | 85/259 [1:14:53<2:16:30, 47.07s/it]... storing 'x' as categorical
 25%|██▌       | 73/291 [1:14:58<3:35:10, 59.22s/it]... storing 'x' as categorical
 33%|███▎      | 84/252 [1:15:21<2:27:01, 52.51s/it]... storing 'x' as categorical
 28%|██▊       | 87/306 [1:15:44<2:45:18, 45.29s/it]... storing 'x' as categorical
 74%|███████▍  | 206/278 [1:16:01<27:15, 22.71s/it]... storing 'x' as categorical
 25%|██▌       | 74/291 [1:15:57<3:33:55, 59.15s/it]... storing 'x' as categorical
 33%|███▎      | 86/259 [1:16:17<2:44:43, 57.13s/it]... storing 'x' as categorical
 31%|███       | 87/280 [1:16:40<2:50:17, 52.94s/it]... storing 'x' as categorical
 26%|██▌       | 75/291 [1:16:56<3:32:50, 59.12s/it]... st

HERPUD1 is not in the tf_GRN_dict, no targets
HERPUD1 is filtered


 29%|██▉       | 89/306 [1:17:34<2:59:32, 49.64s/it]... storing 'x' as categorical
 76%|███████▌  | 211/278 [1:17:56<25:39, 22.98s/it]... storing 'x' as categorical
 34%|███▍      | 88/259 [1:18:04<2:37:27, 55.25s/it]... storing 'x' as categorical
 29%|██▉       | 90/306 [1:18:29<3:04:07, 51.15s/it]... storing 'x' as categorical
 77%|███████▋  | 213/278 [1:18:42<24:55, 23.00s/it]... storing 'x' as categorical
 77%|███████▋  | 214/278 [1:19:05<24:32, 23.00s/it]... storing 'x' as categorical
 32%|███▎      | 91/280 [1:19:11<2:14:22, 42.66s/it]... storing 'x' as categorical
 30%|██▉       | 91/306 [1:19:24<3:07:09, 52.23s/it]

HERPUD1 is not in the tf_GRN_dict, no targets
HERPUD1 is filtered


 34%|███▍      | 89/259 [1:19:28<3:00:00, 63.53s/it]... storing 'x' as categorical
 78%|███████▊  | 216/278 [1:19:50<23:27, 22.71s/it]... storing 'x' as categorical
 35%|███▌      | 89/252 [1:19:58<2:28:30, 54.66s/it]... storing 'x' as categorical
 33%|███▎      | 92/280 [1:20:22<2:36:29, 49.94s/it]... storing 'x' as categorical
 78%|███████▊  | 218/278 [1:20:37<22:53, 22.90s/it]... storing 'x' as categorical
 36%|███▌      | 90/252 [1:20:50<2:25:19, 53.83s/it]... storing 'x' as categorical
 27%|██▋       | 79/291 [1:20:53<3:29:17, 59.23s/it]... storing 'x' as categorical
 35%|███▌      | 91/259 [1:21:15<2:43:49, 58.51s/it]... storing 'x' as categorical
 36%|███▌      | 91/252 [1:21:42<2:23:01, 53.30s/it]... storing 'x' as categorical
 27%|██▋       | 80/291 [1:21:53<3:28:23, 59.26s/it]... storing 'x' as categorical
 34%|███▎      | 94/280 [1:22:03<2:35:19, 50.10s/it]

HMGA2 is not in the tf_GRN_dict, no targets
HMGA2 is filtered
HOMEZ is not in the tf_GRN_dict, no targets
HOMEZ is filtered
HOXA10 is not in the tf_GRN_dict, no targets
HOXA10 is filtered
HOXA5 is not in the tf_GRN_dict, no targets
HOXA5 is filtered


 36%|███▌      | 92/259 [1:22:08<2:38:38, 57.00s/it]... storing 'x' as categorical
 37%|███▋      | 92/252 [1:22:34<2:21:05, 52.91s/it]

HOXA2 is not in the tf_GRN_dict, no targets
HOXA2 is filtered


... storing 'x' as categorical
 81%|████████  | 224/278 [1:22:55<20:48, 23.11s/it]... storing 'x' as categorical
 36%|███▌      | 93/259 [1:23:02<2:34:43, 55.92s/it]

HOXA5 is not in the tf_GRN_dict, no targets
HOXA5 is filtered
HOXA6 is not in the tf_GRN_dict, no targets
HOXA6 is filtered


... storing 'x' as categorical
 37%|███▋      | 94/252 [1:23:26<1:46:43, 40.53s/it]... storing 'x' as categorical
 36%|███▌      | 100/280 [1:23:43<1:23:22, 27.79s/it]... storing 'x' as categorical
 28%|██▊       | 82/291 [1:23:51<3:26:17, 59.22s/it]... storing 'x' as categorical
 32%|███▏      | 97/306 [1:23:57<2:56:23, 50.64s/it]... storing 'x' as categorical
 38%|███▊      | 95/252 [1:24:18<1:53:35, 43.41s/it]... storing 'x' as categorical
 37%|███▋      | 97/259 [1:24:49<1:44:05, 38.55s/it]... storing 'x' as categorical
 36%|███▌      | 101/280 [1:24:55<1:46:56, 35.85s/it]... storing 'x' as categorical
 38%|███▊      | 96/252 [1:25:10<1:58:45, 45.68s/it]... storing 'x' as categorical
 38%|███▊      | 98/259 [1:25:42<1:52:31, 41.93s/it]... storing 'x' as categorical
 36%|███▋      | 102/280 [1:25:45<1:55:25, 38.91s/it]

HOXB7 is not in the tf_GRN_dict, no targets
HOXB7 is filtered


 32%|███▏      | 99/306 [1:25:47<3:01:33, 52.63s/it]... storing 'x' as categorical
 38%|███▊      | 97/252 [1:26:03<2:02:39, 47.48s/it]... storing 'x' as categorical
 84%|████████▍ | 233/278 [1:26:22<17:12, 22.94s/it]... storing 'x' as categorical
 33%|███▎      | 100/306 [1:26:42<3:02:55, 53.28s/it]... storing 'x' as categorical
 29%|██▉       | 85/291 [1:26:49<3:23:29, 59.27s/it]... storing 'x' as categorical
 39%|███▉      | 98/252 [1:26:55<2:05:17, 48.82s/it]... storing 'x' as categorical
 33%|███▎      | 101/306 [1:27:36<3:03:37, 53.75s/it]... storing 'x' as categorical
 39%|███▉      | 99/252 [1:27:47<2:06:43, 49.70s/it]... storing 'x' as categorical
 30%|██▉       | 86/291 [1:27:48<3:22:49, 59.37s/it]... storing 'x' as categorical
 39%|███▉      | 101/259 [1:28:22<2:08:26, 48.78s/it]... storing 'x' as categorical
 40%|███▉      | 100/252 [1:28:39<2:07:40, 50.40s/it]... storing 'x' as categorical
 30%|██▉       | 87/291 [1:28:48<3:21:42, 59.33s/it]... storing 'x' as categorical
 

HOXA2 is not in the tf_GRN_dict, no targets
HOXA2 is filtered


... storing 'x' as categorical
 41%|████      | 105/259 [1:32:26<2:25:31, 56.70s/it]... storing 'x' as categorical
 90%|████████▉ | 250/278 [1:32:51<10:41, 22.91s/it]... storing 'x' as categorical
 42%|████▏     | 105/252 [1:33:00<2:06:59, 51.83s/it]... storing 'x' as categorical
 41%|████      | 106/259 [1:33:20<2:22:05, 55.72s/it]... storing 'x' as categorical
 40%|███▉      | 111/280 [1:33:30<2:47:11, 59.36s/it]... storing 'x' as categorical
 32%|███▏      | 92/291 [1:33:45<3:16:48, 59.34s/it]... storing 'x' as categorical
 36%|███▌      | 109/306 [1:34:00<2:28:39, 45.28s/it]... storing 'x' as categorical
 40%|████      | 112/280 [1:34:20<2:38:45, 56.70s/it]... storing 'x' as categorical
 92%|█████████▏| 255/278 [1:34:44<08:43, 22.77s/it]... storing 'x' as categorical
 36%|███▌      | 110/306 [1:34:55<2:36:04, 47.78s/it]... storing 'x' as categorical
 92%|█████████▏| 256/278 [1:35:07<08:23, 22.87s/it]

ZNF136 is not in the tf_GRN_dict, no targets
ZNF136 is filtered


 40%|████      | 113/280 [1:35:11<2:32:52, 54.92s/it]... storing 'x' as categorical
 43%|████▎     | 108/252 [1:35:37<2:05:30, 52.29s/it]... storing 'x' as categorical
 93%|█████████▎| 259/278 [1:35:53<06:00, 18.97s/it]... storing 'x' as categorical
 41%|████      | 114/280 [1:36:02<2:28:42, 53.75s/it]

IRF5 is not in the tf_GRN_dict, no targets
IRF5 is filtered


... storing 'x' as categorical
 36%|███▋      | 111/306 [1:36:23<3:10:34, 58.64s/it]... storing 'x' as categorical
 42%|████▏     | 109/259 [1:36:31<2:38:24, 63.36s/it]... storing 'x' as categorical
 33%|███▎      | 95/291 [1:36:45<3:15:41, 59.91s/it]... storing 'x' as categorical
 41%|████▏     | 116/280 [1:36:53<1:51:38, 40.84s/it]

IRF7 is not in the tf_GRN_dict, no targets
IRF7 is filtered


... storing 'x' as categorical
 42%|████▏     | 110/259 [1:37:24<2:29:44, 60.30s/it]... storing 'x' as categorical
 95%|█████████▍| 264/278 [1:37:49<05:11, 22.24s/it]... storing 'x' as categorical
 33%|███▎      | 96/291 [1:37:45<3:15:00, 60.00s/it]]... storing 'x' as categorical
 43%|████▎     | 111/259 [1:38:17<2:23:39, 58.24s/it]... storing 'x' as categorical
 42%|████▎     | 119/280 [1:38:35<1:42:53, 38.35s/it]... storing 'x' as categorical
 33%|███▎      | 97/291 [1:38:46<3:14:29, 60.15s/it]

HOXA2 is not in the tf_GRN_dict, no targets
HOXA2 is filtered


... storing 'x' as categorical
 44%|████▍     | 112/252 [1:39:08<2:02:57, 52.69s/it]... storing 'x' as categorical
 43%|████▎     | 120/280 [1:39:26<1:50:24, 41.40s/it]... storing 'x' as categorical
 97%|█████████▋| 269/278 [1:39:44<03:26, 22.96s/it]... storing 'x' as categorical
 34%|███▍      | 99/291 [1:39:46<2:28:23, 46.37s/it]... storing 'x' as categorical
 44%|████▎     | 113/259 [1:40:04<2:15:41, 55.76s/it]... storing 'x' as categorical
 97%|█████████▋| 271/278 [1:40:30<02:39, 22.85s/it]

ZNF490 is not in the tf_GRN_dict, no targets
ZNF490 is filtered


 43%|████▎     | 121/280 [1:40:38<2:10:29, 49.24s/it]... storing 'x' as categorical
 98%|█████████▊| 273/278 [1:40:53<01:28, 17.63s/it]... storing 'x' as categorical
 38%|███▊      | 116/306 [1:41:01<2:57:54, 56.18s/it]... storing 'x' as categorical
 99%|█████████▊| 274/278 [1:41:16<01:15, 18.97s/it]

ZNF586 is not in the tf_GRN_dict, no targets
ZNF586 is filtered


... storing 'x' as categorical
 44%|████▎     | 122/280 [1:41:29<2:10:53, 49.71s/it]... storing 'x' as categorical
 46%|████▌     | 115/252 [1:41:47<2:00:32, 52.80s/it]... storing 'x' as categorical
 38%|███▊      | 117/306 [1:41:56<2:56:04, 55.90s/it]... storing 'x' as categorical
100%|██████████| 278/278 [1:42:23<00:00, 22.10s/it]
 46%|████▌     | 116/252 [1:42:39<1:59:31, 52.73s/it]... storing 'x' as categorical
 44%|████▍     | 124/280 [1:43:33<2:26:26, 56.33s/it]... storing 'x' as categorical
 39%|███▉      | 120/306 [1:44:43<2:52:35, 55.67s/it]... storing 'x' as categorical
 47%|████▋     | 119/252 [1:45:18<1:57:05, 52.82s/it]

LEF1 is not in the tf_GRN_dict, no targets
LEF1 is filtered


 40%|███▉      | 121/306 [1:45:38<2:51:39, 55.67s/it]... storing 'x' as categorical
 45%|████▌     | 127/280 [1:46:26<2:25:46, 57.17s/it]

LEF1 is not in the tf_GRN_dict, no targets
LEF1 is filtered


 40%|███▉      | 122/306 [1:46:34<2:50:49, 55.70s/it]... storing 'x' as categorical
 46%|████▌     | 129/280 [1:47:39<1:59:40, 47.55s/it]... storing 'x' as categorical
 41%|████      | 124/306 [1:48:25<2:48:49, 55.66s/it]... storing 'x' as categorical
 47%|████▋     | 123/259 [1:49:45<2:05:07, 55.20s/it]... storing 'x' as categorical
 48%|████▊     | 124/259 [1:50:38<2:02:28, 54.43s/it]... storing 'x' as categorical
 50%|█████     | 127/252 [1:51:26<1:45:49, 50.80s/it]... storing 'x' as categorical
 48%|████▊     | 133/280 [1:52:26<2:38:36, 64.74s/it]... storing 'x' as categorical
 49%|████▉     | 127/259 [1:53:45<2:06:34, 57.54s/it]... storing 'x' as categorical
 52%|█████▏    | 130/252 [1:54:11<1:49:47, 54.00s/it]... storing 'x' as categorical
 43%|████▎     | 131/306 [1:55:27<3:10:43, 65.39s/it]... storing 'x' as categorical
 43%|████▎     | 132/306 [1:56:24<3:02:39, 62.98s/it]... storing 'x' as categorical
 51%|█████     | 131/259 [1:57:45<2:00:58, 56.71s/it]... storing 'x' as cate

NFIL3 is not in the tf_GRN_dict, no targets
NFIL3 is filtered


 55%|█████▍    | 153/280 [2:09:47<1:30:06, 42.57s/it]... storing 'x' as categorical
 55%|█████▌    | 154/280 [2:10:46<1:37:47, 46.57s/it]... storing 'x' as categorical
 55%|█████▌    | 155/280 [2:12:04<1:54:16, 54.85s/it]

NKX2-1 is not in the tf_GRN_dict, no targets
NKX2-1 is filtered


... storing 'x' as categorical
 60%|█████▉    | 150/252 [2:12:46<1:43:41, 60.99s/it]

NR2F2 is not in the tf_GRN_dict, no targets
NR2F2 is filtered


 56%|█████▌    | 157/280 [2:12:55<1:26:41, 42.29s/it]... storing 'x' as categorical
 57%|█████▋    | 147/259 [2:14:03<1:50:19, 59.10s/it]... storing 'x' as categorical
 57%|█████▋    | 148/259 [2:14:55<1:45:41, 57.13s/it]... storing 'x' as categorical
 58%|█████▊    | 149/259 [2:15:48<1:42:20, 55.82s/it]... storing 'x' as categorical
 50%|█████     | 154/306 [2:17:05<2:21:19, 55.78s/it]... storing 'x' as categorical
 58%|█████▊    | 151/259 [2:18:03<1:53:30, 63.06s/it]... storing 'x' as categorical
 59%|█████▊    | 152/259 [2:18:55<1:46:56, 59.97s/it]... storing 'x' as categorical
 59%|█████▉    | 165/280 [2:20:05<1:39:20, 51.83s/it]... storing 'x' as categorical
 59%|█████▉    | 166/280 [2:20:56<1:38:03, 51.61s/it]... storing 'x' as categorical
 60%|█████▉    | 155/259 [2:22:04<1:51:20, 64.24s/it]... storing 'x' as categorical
 60%|██████    | 168/280 [2:22:59<1:43:59, 55.71s/it]... storing 'x' as categorical
 61%|██████    | 157/259 [2:23:49<1:39:20, 58.44s/it]... storing 'x' as cate

PBX4 is not in the tf_GRN_dict, no targets
PBX4 is filtered


 55%|█████▍    | 167/306 [2:29:01<2:07:29, 55.04s/it]... storing 'x' as categorical
 63%|██████▎   | 176/280 [2:29:58<1:20:28, 46.42s/it]... storing 'x' as categorical
 55%|█████▌    | 169/306 [2:30:52<2:06:14, 55.29s/it]

NR1D2 is not in the tf_GRN_dict, no targets
NR1D2 is filtered


 63%|██████▎   | 164/259 [2:31:00<1:29:26, 56.49s/it]... storing 'x' as categorical
 64%|██████▎   | 178/280 [2:32:01<1:29:22, 52.57s/it]

PPARD is not in the tf_GRN_dict, no targets
PPARD is filtered
PPARG is not in the tf_GRN_dict, no targets
PPARG is filtered
PRDM4 is not in the tf_GRN_dict, no targets
PRDM4 is filtered


 69%|██████▊   | 173/252 [2:32:04<1:09:10, 52.54s/it]... storing 'x' as categorical
 69%|██████▉   | 174/252 [2:32:57<1:08:18, 52.55s/it]... storing 'x' as categorical
 69%|██████▉   | 175/252 [2:33:50<1:07:33, 52.64s/it]... storing 'x' as categorical
 66%|██████▌   | 184/280 [2:34:55<1:01:15, 38.29s/it]... storing 'x' as categorical
 66%|██████▌   | 185/280 [2:35:46<1:05:04, 41.10s/it]... storing 'x' as categorical
 71%|███████   | 178/252 [2:36:28<1:04:56, 52.66s/it]

SALL4 is not in the tf_GRN_dict, no targets
SALL4 is filtered


 66%|██████▋   | 186/280 [2:36:37<1:08:11, 43.52s/it]... storing 'x' as categorical
 66%|██████▋   | 172/259 [2:38:01<1:16:35, 52.82s/it]... storing 'x' as categorical
 67%|██████▋   | 173/259 [2:38:53<1:15:22, 52.58s/it]... storing 'x' as categorical
 59%|█████▉    | 180/306 [2:40:05<1:54:44, 54.64s/it]... storing 'x' as categorical
 59%|█████▉    | 181/306 [2:41:00<1:54:14, 54.84s/it]

OSR1 is not in the tf_GRN_dict, no targets
OSR1 is filtered


 68%|██████▊   | 175/259 [2:41:02<1:24:03, 60.04s/it]... storing 'x' as categorical
 68%|██████▊   | 191/280 [2:41:57<1:32:07, 62.11s/it]... storing 'x' as categorical
 69%|██████▊   | 192/280 [2:42:48<1:26:18, 58.85s/it]

RXRB is not in the tf_GRN_dict, no targets
RXRB is filtered


 60%|██████    | 184/306 [2:42:51<1:32:35, 45.54s/it]... storing 'x' as categorical
 60%|██████    | 185/306 [2:43:46<1:36:53, 48.05s/it]... storing 'x' as categorical
 70%|██████▉   | 195/280 [2:44:51<1:11:39, 50.58s/it]... storing 'x' as categorical
 61%|██████    | 187/306 [2:45:36<1:41:57, 51.41s/it]

PBX4 is not in the tf_GRN_dict, no targets
PBX4 is filtered


 70%|███████   | 196/280 [2:46:04<1:18:58, 56.41s/it]... storing 'x' as categorical
 76%|███████▌  | 191/252 [2:47:01<53:19, 52.46s/it]t]... storing 'x' as categorical
 71%|███████   | 183/259 [2:48:02<1:07:03, 52.94s/it]

SALL4 is not in the tf_GRN_dict, no targets
SALL4 is filtered


... storing 'x' as categorical
 71%|███████▏  | 185/259 [2:48:55<50:11, 40.69s/it]  ... storing 'x' as categorical
 77%|███████▋  | 194/252 [2:49:41<51:07, 52.88s/it]t]

SP140 is not in the tf_GRN_dict, no targets
SP140 is filtered


 72%|███████▏  | 186/259 [2:49:47<53:02, 43.60s/it]... storing 'x' as categorical
 72%|███████▏  | 187/259 [2:50:39<55:05, 45.91s/it]t]... storing 'x' as categorical
 64%|██████▎   | 195/306 [2:52:04<1:38:32, 53.27s/it]... storing 'x' as categorical
 64%|██████▍   | 196/306 [2:53:00<1:38:48, 53.90s/it]

PRDM4 is not in the tf_GRN_dict, no targets
PRDM4 is filtered


... storing 'x' as categorical
 73%|███████▎  | 205/280 [2:54:05<1:12:01, 57.62s/it]... storing 'x' as categorical
 74%|███████▎  | 206/280 [2:54:56<1:08:34, 55.59s/it]... storing 'x' as categorical
 60%|█████▉    | 174/291 [2:55:08<1:57:04, 60.04s/it]

PBX4 is not in the tf_GRN_dict, no targets
PBX4 is filtered


 74%|███████▍  | 207/280 [2:55:47<1:05:56, 54.21s/it]... storing 'x' as categorical
 81%|████████  | 203/252 [2:57:01<43:10, 52.87s/it]t]... storing 'x' as categorical
 75%|███████▍  | 209/280 [2:57:29<1:02:14, 52.60s/it]

SNAI2 is not in the tf_GRN_dict, no targets
SNAI2 is filtered


 81%|████████  | 204/252 [2:57:54<42:14, 52.80s/it]... storing 'x' as categorical
 76%|███████▌  | 196/259 [2:59:06<55:52, 53.22s/it]t]... storing 'x' as categorical
 76%|███████▌  | 212/280 [2:59:32<54:28, 48.07s/it]t]

SOX3 is not in the tf_GRN_dict, no targets
SOX3 is filtered


 76%|███████▌  | 197/259 [2:59:58<54:47, 53.02s/it]t]... storing 'x' as categorical
 67%|██████▋   | 205/306 [3:00:55<1:32:12, 54.78s/it]... storing 'x' as categorical
 77%|███████▋  | 216/280 [3:02:05<46:42, 43.79s/it]t]... storing 'x' as categorical
 63%|██████▎   | 182/291 [3:02:06<1:44:48, 57.70s/it]

PRDM4 is not in the tf_GRN_dict, no targets
PRDM4 is filtered
PROX1 is not in the tf_GRN_dict, no targets
PROX1 is filtered


 78%|███████▊  | 217/280 [3:02:56<47:50, 45.57s/it]t]... storing 'x' as categorical
 84%|████████▎ | 211/252 [3:04:01<35:44, 52.30s/it]t]... storing 'x' as categorical
 84%|████████▍ | 212/252 [3:04:53<34:46, 52.17s/it]t]... storing 'x' as categorical
 69%|██████▊   | 210/306 [3:06:01<1:31:12, 57.01s/it]... storing 'x' as categorical
 79%|███████▉  | 221/280 [3:06:58<53:03, 53.95s/it]t]... storing 'x' as categorical
 80%|███████▉  | 206/259 [3:07:49<46:21, 52.47s/it]t]... storing 'x' as categorical
 70%|██████▉   | 213/306 [3:08:45<1:26:04, 55.53s/it]

SIM2 is not in the tf_GRN_dict, no targets
SIM2 is filtered


... storing 'x' as categorical
 86%|████████▌ | 217/252 [3:09:13<30:21, 52.03s/it]t]

TFE3 is not in the tf_GRN_dict, no targets
TFE3 is filtered


 70%|███████   | 215/306 [3:09:39<1:04:23, 42.45s/it]

SIRT6 is not in the tf_GRN_dict, no targets
SIRT6 is filtered


 80%|████████  | 224/280 [3:09:50<53:55, 57.78s/it]... storing 'x' as categorical
 81%|████████  | 209/259 [3:10:26<43:32, 52.26s/it]t]

STAT5A is not in the tf_GRN_dict, no targets
STAT5A is filtered


 87%|████████▋ | 220/252 [3:10:57<22:53, 42.92s/it]  ... storing 'x' as categorical
 88%|████████▊ | 221/252 [3:11:48<23:20, 45.18s/it]t]... storing 'x' as categorical
 82%|████████▏ | 212/259 [3:12:10<33:48, 43.15s/it]t]

TBX3 is not in the tf_GRN_dict, no targets
TBX3 is filtered


 81%|████████  | 227/280 [3:12:41<52:13, 59.12s/it]t]... storing 'x' as categorical
 88%|████████▊ | 223/252 [3:13:32<23:23, 48.40s/it]t]... storing 'x' as categorical
 83%|████████▎ | 215/259 [3:14:25<34:15, 46.72s/it]t]... storing 'x' as categorical
 82%|████████▏ | 230/280 [3:15:32<49:27, 59.36s/it]t]... storing 'x' as categorical
 84%|████████▍ | 217/259 [3:16:10<34:28, 49.25s/it]t]

TCF7L1 is not in the tf_GRN_dict, no targets
TCF7L1 is filtered


 82%|████████▎ | 231/280 [3:16:22<46:10, 56.55s/it]... storing 'x' as categorical
 74%|███████▎  | 225/306 [3:17:50<1:11:24, 52.89s/it]... storing 'x' as categorical
 90%|█████████ | 228/252 [3:17:51<20:25, 51.08s/it]t]

ZBTB49 is not in the tf_GRN_dict, no targets
ZBTB49 is filtered


 85%|████████▌ | 221/259 [3:18:48<28:29, 44.99s/it]t]... storing 'x' as categorical
 84%|████████▍ | 235/280 [3:19:42<38:40, 51.56s/it]t]... storing 'x' as categorical
 92%|█████████▏| 232/252 [3:20:43<16:00, 48.02s/it]t]

ZNF134 is not in the tf_GRN_dict, no targets
ZNF134 is filtered
ZNF136 is not in the tf_GRN_dict, no targets
ZNF136 is filtered


... storing 'x' as categorical
 85%|████████▍ | 237/280 [3:21:43<39:33, 55.19s/it]t]... storing 'x' as categorical
 94%|█████████▎| 236/252 [3:22:27<09:33, 35.86s/it]t]

ZNF232 is not in the tf_GRN_dict, no targets
ZNF232 is filtered


 85%|████████▌ | 238/280 [3:22:34<37:37, 53.76s/it]... storing 'x' as categorical
 94%|█████████▍| 238/252 [3:23:19<07:32, 32.30s/it]t]... storing 'x' as categorical
 86%|████████▌ | 240/280 [3:24:35<37:38, 56.46s/it]t]... storing 'x' as categorical
 86%|████████▌ | 241/280 [3:25:26<35:31, 54.65s/it]t]

TFE3 is not in the tf_GRN_dict, no targets
TFE3 is filtered


... storing 'x' as categorical
 89%|████████▉ | 230/259 [3:26:39<25:04, 51.86s/it]t]... storing 'x' as categorical
 96%|█████████▋| 243/252 [3:27:39<07:03, 47.05s/it]t]... storing 'x' as categorical
 88%|████████▊ | 245/280 [3:28:38<31:01, 53.20s/it]t]... storing 'x' as categorical
 97%|█████████▋| 245/252 [3:29:22<05:44, 49.28s/it]t]

ZNF490 is not in the tf_GRN_dict, no targets
ZNF490 is filtered


 78%|███████▊  | 238/306 [3:29:38<1:01:44, 54.48s/it]... storing 'x' as categorical
 98%|█████████▊| 247/252 [3:30:13<03:13, 38.70s/it]t]

ZNF586 is not in the tf_GRN_dict, no targets
ZNF586 is filtered


 90%|█████████ | 234/259 [3:30:38<23:00, 55.23s/it]t]... storing 'x' as categorical
 91%|█████████ | 235/259 [3:31:30<21:46, 54.44s/it]  ... storing 'x' as categorical
 99%|█████████▉| 250/252 [3:31:57<01:15, 37.70s/it]t]

ZNF8 is not in the tf_GRN_dict, no targets
ZNF8 is filtered


 91%|█████████ | 236/259 [3:32:23<20:39, 53.90s/it]... storing 'x' as categorical
100%|██████████| 252/252 [3:32:49<00:00, 50.67s/it]t]
 92%|█████████▏| 237/259 [3:33:15<19:35, 53.43s/it]

ZBTB49 is not in the tf_GRN_dict, no targets
ZBTB49 is filtered


... storing 'x' as categorical
 90%|████████▉ | 251/280 [3:34:20<28:20, 58.64s/it]t]

ZBTB48 is not in the tf_GRN_dict, no targets
ZBTB48 is filtered
ZBTB49 is not in the tf_GRN_dict, no targets
ZBTB49 is filtered


... storing 'x' as categorical
 91%|█████████ | 254/280 [3:35:30<16:56, 39.08s/it]t]... storing 'x' as categorical
 93%|█████████▎| 241/259 [3:35:52<13:47, 45.99s/it]t]

ZNF232 is not in the tf_GRN_dict, no targets
ZNF232 is filtered


 91%|█████████ | 255/280 [3:36:20<17:13, 41.36s/it]... storing 'x' as categorical
 91%|█████████▏| 256/280 [3:37:10<17:19, 43.33s/it]t]... storing 'x' as categorical
 92%|█████████▏| 257/280 [3:38:21<19:12, 50.10s/it]t]... storing 'x' as categorical
 95%|█████████▍| 246/259 [3:39:22<09:59, 46.09s/it]t]... storing 'x' as categorical
 92%|█████████▎| 259/280 [3:40:22<18:53, 53.99s/it]t]... storing 'x' as categorical
 82%|████████▏ | 251/306 [3:41:25<49:48, 54.33s/it]t]... storing 'x' as categorical
 82%|████████▏ | 252/306 [3:42:19<48:56, 54.38s/it]t]... storing 'x' as categorical
 83%|████████▎ | 253/306 [3:43:13<47:55, 54.25s/it]t]... storing 'x' as categorical
 97%|█████████▋| 251/259 [3:43:43<06:49, 51.19s/it]t]

ZNF490 is not in the tf_GRN_dict, no targets
ZNF490 is filtered
ZNF502 is not in the tf_GRN_dict, no targets
ZNF502 is filtered


 83%|████████▎ | 254/306 [3:44:08<46:59, 54.23s/it]... storing 'x' as categorical
 94%|█████████▍| 263/280 [3:44:44<17:37, 62.19s/it]t]

ZNF317 is not in the tf_GRN_dict, no targets
ZNF317 is filtered


 98%|█████████▊| 254/259 [3:45:06<03:09, 37.99s/it]

ZNF554 is not in the tf_GRN_dict, no targets
ZNF554 is filtered
ZNF563 is not in the tf_GRN_dict, no targets
ZNF563 is filtered


... storing 'x' as categorical
 99%|█████████▉| 257/259 [3:45:59<00:57, 28.97s/it]t]... storing 'x' as categorical
100%|█████████▉| 258/259 [3:46:51<00:33, 33.10s/it]t]... storing 'x' as categorical
100%|██████████| 259/259 [3:47:44<00:00, 52.76s/it]  
 95%|█████████▌| 267/280 [3:47:55<11:58, 55.29s/it]... storing 'x' as categorical
 96%|█████████▌| 268/280 [3:49:06<11:53, 59.45s/it]... storing 'x' as categorical
 80%|███████▉  | 232/291 [3:49:17<57:47, 58.77s/it]

TBX3 is not in the tf_GRN_dict, no targets
TBX3 is filtered


 96%|█████████▌| 269/280 [3:49:56<10:25, 56.84s/it]

ZNF436 is not in the tf_GRN_dict, no targets
ZNF436 is filtered
ZNF449 is not in the tf_GRN_dict, no targets
ZNF449 is filtered
ZNF490 is not in the tf_GRN_dict, no targets
ZNF490 is filtered


... storing 'x' as categorical
 98%|█████████▊| 273/280 [3:50:46<03:27, 29.61s/it]

ZNF554 is not in the tf_GRN_dict, no targets
ZNF554 is filtered


... storing 'x' as categorical
 98%|█████████▊| 275/280 [3:51:57<02:36, 31.39s/it]

ZNF586 is not in the tf_GRN_dict, no targets
ZNF586 is filtered


... storing 'x' as categorical
 86%|████████▋ | 264/306 [3:53:09<37:52, 54.11s/it]... storing 'x' as categorical
 87%|████████▋ | 265/306 [3:54:04<37:01, 54.18s/it]... storing 'x' as categorical
 87%|████████▋ | 266/306 [3:54:58<36:07, 54.18s/it]... storing 'x' as categorical
100%|██████████| 280/280 [3:55:38<00:00, 50.49s/it]
 87%|████████▋ | 267/306 [3:55:52<35:14, 54.23s/it]... storing 'x' as categorical
 88%|████████▊ | 268/306 [3:56:46<34:15, 54.09s/it]... storing 'x' as categorical
 88%|████████▊ | 269/306 [3:57:40<33:18, 54.02s/it]... storing 'x' as categorical
 88%|████████▊ | 270/306 [3:58:34<32:21, 53.94s/it]... storing 'x' as categorical
 89%|████████▊ | 271/306 [3:59:28<31:27, 53.93s/it]... storing 'x' as categorical
 89%|████████▉ | 272/306 [4:00:22<30:34, 53.96s/it]... storing 'x' as categorical
 89%|████████▉ | 273/306 [4:01:15<29:38, 53.91s/it]... storing 'x' as categorical
 90%|████████▉ | 274/306 [4:02:09<28:46, 53.95s/it]... storing 'x' as categorical
 90%|████████▉ 

ZBTB49 is not in the tf_GRN_dict, no targets
ZBTB49 is filtered


 91%|█████████ | 277/306 [4:03:57<20:03, 41.49s/it]... storing 'x' as categorical
 85%|████████▌ | 248/291 [4:03:58<41:55, 58.50s/it]... storing 'x' as categorical
 91%|█████████ | 278/306 [4:05:22<24:24, 52.32s/it]... storing 'x' as categorical
 91%|█████████ | 279/306 [4:06:16<23:42, 52.68s/it]... storing 'x' as categorical
 92%|█████████▏| 280/306 [4:07:10<22:57, 52.97s/it]... storing 'x' as categorical
 92%|█████████▏| 281/306 [4:08:03<22:08, 53.13s/it]... storing 'x' as categorical
 92%|█████████▏| 282/306 [4:08:57<21:19, 53.32s/it]

ZNF136 is not in the tf_GRN_dict, no targets
ZNF136 is filtered


... storing 'x' as categorical
 93%|█████████▎| 285/306 [4:10:45<15:33, 44.47s/it]... storing 'x' as categorical
 93%|█████████▎| 286/306 [4:11:39<15:38, 46.95s/it]... storing 'x' as categorical
 94%|█████████▍| 287/306 [4:12:33<15:28, 48.85s/it]... storing 'x' as categorical
 94%|█████████▍| 288/306 [4:13:27<15:04, 50.24s/it]... storing 'x' as categorical
 94%|█████████▍| 289/306 [4:14:21<14:31, 51.26s/it]... storing 'x' as categorical
 95%|█████████▍| 290/306 [4:15:15<13:52, 52.02s/it]... storing 'x' as categorical
 89%|████████▉ | 260/291 [4:15:39<30:09, 58.37s/it]

ZBTB48 is not in the tf_GRN_dict, no targets
ZBTB48 is filtered
ZBTB49 is not in the tf_GRN_dict, no targets
ZBTB49 is filtered


 95%|█████████▌| 291/306 [4:16:09<13:08, 52.58s/it]... storing 'x' as categorical
 95%|█████████▌| 292/306 [4:17:02<12:20, 52.87s/it]... storing 'x' as categorical
 96%|█████████▌| 293/306 [4:17:56<11:31, 53.16s/it]... storing 'x' as categorical
 96%|█████████▌| 294/306 [4:18:50<10:40, 53.37s/it]... storing 'x' as categorical
 91%|█████████▏| 266/291 [4:19:32<20:09, 48.38s/it]

ZNF140 is not in the tf_GRN_dict, no targets
ZNF140 is filtered


 96%|█████████▋| 295/306 [4:19:43<09:47, 53.38s/it]... storing 'x' as categorical
 97%|█████████▋| 296/306 [4:20:37<08:54, 53.49s/it]... storing 'x' as categorical
 97%|█████████▋| 298/306 [4:22:24<07:08, 53.59s/it]

ZNF490 is not in the tf_GRN_dict, no targets
ZNF490 is filtered
ZNF502 is not in the tf_GRN_dict, no targets
ZNF502 is filtered


... storing 'x' as categorical
 98%|█████████▊| 301/306 [4:23:18<02:47, 33.57s/it]... storing 'x' as categorical
 99%|█████████▊| 302/306 [4:24:12<02:31, 37.87s/it]... storing 'x' as categorical
 99%|█████████▉| 303/306 [4:25:06<02:04, 41.59s/it]... storing 'x' as categorical
 99%|█████████▉| 304/306 [4:26:00<01:29, 44.60s/it]... storing 'x' as categorical
100%|█████████▉| 305/306 [4:26:54<00:47, 47.07s/it]... storing 'x' as categorical
100%|██████████| 306/306 [4:27:48<00:00, 52.51s/it]
... storing 'x' as categorical
 95%|█████████▍| 276/291 [4:28:16<14:11, 56.75s/it]... storing 'x' as categorical
 95%|█████████▌| 277/291 [4:29:14<13:18, 57.05s/it]... storing 'x' as categorical
 96%|█████████▌| 278/291 [4:30:11<12:23, 57.22s/it]... storing 'x' as categorical
 96%|█████████▌| 279/291 [4:31:09<11:28, 57.39s/it]

ZNF436 is not in the tf_GRN_dict, no targets
ZNF436 is filtered


... storing 'x' as categorical
 97%|█████████▋| 281/291 [4:32:07<07:22, 44.30s/it]

ZNF490 is not in the tf_GRN_dict, no targets
ZNF490 is filtered


... storing 'x' as categorical
 97%|█████████▋| 283/291 [4:33:05<05:05, 38.19s/it]... storing 'x' as categorical
 98%|█████████▊| 284/291 [4:34:02<04:57, 42.45s/it]... storing 'x' as categorical
 98%|█████████▊| 285/291 [4:35:00<04:36, 46.11s/it]... storing 'x' as categorical
 98%|█████████▊| 286/291 [4:35:58<04:05, 49.08s/it]

ZNF582 is not in the tf_GRN_dict, no targets
ZNF582 is filtered
ZNF586 is not in the tf_GRN_dict, no targets
ZNF586 is filtered


... storing 'x' as categorical
 99%|█████████▉| 289/291 [4:36:56<01:07, 33.52s/it]... storing 'x' as categorical
100%|█████████▉| 290/291 [4:37:53<00:38, 38.37s/it]... storing 'x' as categorical
100%|██████████| 291/291 [4:38:51<00:00, 57.50s/it]


# debug

In [12]:
# Debug run of the perturbation loop: for every perturbation in common_perts,
# simulate a knockdown of that gene with oracle_ctrl, build a ctrl-vs-simulated
# AnnData, rank all genes with a Wilcoxon test and store (genes, scores) in
# pert_gene_rank_dict[pert]. Perturbations that cannot be simulated are skipped.

# Rank by signed score rather than absolute value. Loop-invariant, so it is set
# once here; it also stays defined when every perturbation is filtered.
rankby_abs = False

for pert in tqdm(common_perts):

    # - one gene of interest per perturbation
    gois = [pert]
    goi_dict = {}

    # - all data in L1000 is knockdown, so every usable gene is set to 0
    for goi in gois:
        # -- if original value is zero
        if np.mean(adata_rna[:, goi].X.toarray()) == 0:
            print(f'{goi} ctrl expression is 0')
            continue
        # -- if the TF has no targets
        if goi not in tf_GRN_dict:
            print(f'{goi} is not in the tf_GRN_dict, no targets')
            continue
        goi_dict[goi] = 0
    if not goi_dict:
        print(f'{pert} is filtered')
        continue

    # Enter perturbation conditions to simulate signal propagation after the perturbation.
    oracle_ctrl.simulate_shift(perturb_condition=goi_dict,
                               n_propagation=3)
    # - get the prediction; delta_X = simulated_count - imputed_count
    # delta_X is not used below; the name is kept so it stays available in the kernel.
    delta_X = oracle_ctrl.adata.layers["delta_X"]
    # Work on a private copy: clipping the layer object itself would also
    # overwrite oracle_ctrl.adata.layers["simulated_count"].
    # NOTE(review): this assumes anndata stores an array assigned to .X by
    # reference (no implicit copy) - confirm against the installed version.
    simulated_count = oracle_ctrl.adata.layers["simulated_count"].copy()
    simulated_count[simulated_count < 0] = 0

    # - create adata_pert (negative simulated values already clipped to 0)
    adata_pert = adata_rna.copy()
    adata_pert.X = simulated_count
    adata_pert.obs_names = [f'{cell_name}_{pert}' for cell_name in adata_pert.obs_names]

    # - adata_ctrl
    adata_ctrl = adata_rna.copy()

    adata_ctrl.obs['batch'] = 'ctrl'
    adata_pert.obs['batch'] = 'pert'

    adata_concat = ad.concat([adata_ctrl, adata_pert])
    adata_concat.obs['batch'] = adata_concat.obs['batch'].astype('category')
    adata_concat.obs['celltype'] = adata_concat.obs['celltype'].astype('category')

    # - cal de genes: simulated ('pert') cells against the 'ctrl' cells, all genes kept
    sc.tl.rank_genes_groups(
        adata_concat,
        groupby='batch',
        reference='ctrl',
        rankby_abs=rankby_abs,
        n_genes=adata_concat.n_vars,
        use_raw=False,
        method='wilcoxon'
    )
    rank_result = adata_concat.uns['rank_genes_groups']
    de_genes = pd.DataFrame(rank_result['names'])
    scores = pd.DataFrame(rank_result['scores'])
    # The three tables below are not used further in this cell; they are kept
    # so the same names remain in the kernel after the loop.
    pvals = pd.DataFrame(rank_result['pvals'])
    pvals_adj = pd.DataFrame(rank_result['pvals_adj'])
    logfoldchanges = pd.DataFrame(rank_result['logfoldchanges'])

    # - get gene_score
    ranked_genes = list(de_genes['pert'])
    ranked_scores = list(scores['pert'])
    gene_score = pd.DataFrame({'gene': ranked_genes,
                               'z-score': ranked_scores})

    pert_gene_rank_dict[pert] = (ranked_genes, ranked_scores)

  0%|          | 0/306 [00:00<?, ?it/s]

  1%|          | 2/306 [02:51<7:14:43, 85.80s/it]


KeyboardInterrupt: 

In [None]:
# Stand-alone re-run of the Wilcoxon ranking on the adata_concat left behind by
# the debug loop in the previous cell (rankby_abs is defined there as well).
sc.tl.rank_genes_groups(
    adata_concat,
    groupby='batch', reference='ctrl',   # 'pert' cells are compared against 'ctrl'
    method='wilcoxon',
    use_raw=False,
    rankby_abs=rankby_abs,
    n_genes=adata_concat.n_vars,         # report every gene
)