In [1]:
import scanpy as sc
import numpy as np
from flecs.utils import get_project_root
from flecs.data.utils import load_interaction_data
import mygene
from tqdm import tqdm
import os

## Load original data

In [2]:
adata = sc.read_h5ad("/Users/paul/PycharmProjects/FLeCS/datasets/Sciplex3/SrivatsanTrapnell2020_sciplex3.h5ad")

In [3]:
adata

AnnData object with n_obs × n_vars = 799317 × 110984
    obs: 'ncounts', 'well', 'plate', 'cell_line', 'replicate', 'time', 'dose_value', 'pathway_level_1', 'pathway_level_2', 'perturbation', 'target', 'pathway', 'dose_unit', 'celltype', 'disease', 'cancer', 'tissue_type', 'organism', 'perturbation_type', 'ngenes', 'percent_mito', 'percent_ribo', 'nperts', 'chembl-ID'
    var: 'ensembl_id', 'ncounts', 'ncells'

## Filter rows and columns

In [4]:
adata = adata[adata.obs["cell_line"] == 'A549', 
              adata.var["ensembl_id"].apply(lambda x: x.startswith("ENSG"))].copy()

adata = adata[adata.obs["replicate"] == 'rep1'].copy()
adata = adata[adata.obs["dose_value"].apply(lambda d: d==10000. or d==0.)].copy()

In [5]:
adata

AnnData object with n_obs × n_vars = 25581 × 58347
    obs: 'ncounts', 'well', 'plate', 'cell_line', 'replicate', 'time', 'dose_value', 'pathway_level_1', 'pathway_level_2', 'perturbation', 'target', 'pathway', 'dose_unit', 'celltype', 'disease', 'cancer', 'tissue_type', 'organism', 'perturbation_type', 'ngenes', 'percent_mito', 'percent_ribo', 'nperts', 'chembl-ID'
    var: 'ensembl_id', 'ncounts', 'ncells'

In [6]:
mg = mygene.MyGeneInfo()
geneSyms = mg.querymany(list(adata.var["ensembl_id"]) , scopes='ensembl.gene', fields='symbol', species='human')

querying 1-1000...done.
querying 1001-2000...done.
querying 2001-3000...done.
querying 3001-4000...done.
querying 4001-5000...done.
querying 5001-6000...done.
querying 6001-7000...done.
querying 7001-8000...done.
querying 8001-9000...done.
querying 9001-10000...done.
querying 10001-11000...done.
querying 11001-12000...done.
querying 12001-13000...done.
querying 13001-14000...done.
querying 14001-15000...done.
querying 15001-16000...done.
querying 16001-17000...done.
querying 17001-18000...done.
querying 18001-19000...done.
querying 19001-20000...done.
querying 20001-21000...done.
querying 21001-22000...done.
querying 22001-23000...done.
querying 23001-24000...done.
querying 24001-25000...done.
querying 25001-26000...done.
querying 26001-27000...done.
querying 27001-28000...done.
querying 28001-29000...done.
querying 29001-30000...done.
querying 30001-31000...done.
querying 31001-32000...done.
querying 32001-33000...done.
querying 33001-34000...done.
querying 34001-35000...done.
queryin

In [7]:
len(['symbol' in d.keys() for d in geneSyms])

58402

In [8]:
sum(['symbol' in d.keys() for d in geneSyms])

44365

In [9]:
ensembl_to_symbol_dict = {d['query']: d['symbol'] for d in geneSyms if 'symbol' in d.keys()}

In [10]:
len(ensembl_to_symbol_dict)

44271

In [11]:
len(np.unique(list(ensembl_to_symbol_dict.values())))

43229

In [12]:
adata.var["symbol"] = adata.var["ensembl_id"].apply(lambda x: ensembl_to_symbol_dict[x] 
                                                    if x in ensembl_to_symbol_dict.keys() else "")

In [13]:
adata

AnnData object with n_obs × n_vars = 25581 × 58347
    obs: 'ncounts', 'well', 'plate', 'cell_line', 'replicate', 'time', 'dose_value', 'pathway_level_1', 'pathway_level_2', 'perturbation', 'target', 'pathway', 'dose_unit', 'celltype', 'disease', 'cancer', 'tissue_type', 'organism', 'perturbation_type', 'ngenes', 'percent_mito', 'percent_ribo', 'nperts', 'chembl-ID'
    var: 'ensembl_id', 'ncounts', 'ncells', 'symbol'

In [14]:
adata = adata[:, (~adata.var["symbol"].duplicated()) & (adata.var["symbol"] != "")].copy()

In [15]:
adata.var.set_index("symbol", inplace=True)

## Load A549 GRN

In [16]:
grn = load_interaction_data(interaction_type="fantom5", realnet_tissue_type_file="17_adenocarcinoma.txt.gz")

In [17]:
grn

InteractionData. 14373 nodes and 1542735 edges.
2 different types of nodes: ['TF_gene', 'gene'].
gene       13730
TF_gene      643
2 different types of edges: [('TF_gene', '', 'gene'), ('TF_gene', '', 'TF_gene')].
    1542735

In [18]:
node_names = [v['name'] for v in grn.node_data().values()]

In [19]:
len(node_names)

14373

In [20]:
len(np.unique(node_names))

14373

## Map GRN gene names to the variable names in the adata

In [21]:
intersection_names = list(set(adata.var.index).intersection(node_names))

In [22]:
len(intersection_names)

13283

In [23]:
# Take the subgraph restricted to nodes in the intersection
kept_nodes = [k for k, v in grn.node_data().items() if v['name'] in intersection_names]

In [24]:
sub_grn = grn.to_digraph().subgraph(kept_nodes)

In [25]:
sub_grn.number_of_nodes()

13283

In [26]:
edges = list(sub_grn.edges())

## Subset adata and build GRN adjacency matrix

In [27]:
adata_subset = adata[:, intersection_names]

In [28]:
adata_subset

View of AnnData object with n_obs × n_vars = 25581 × 13283
    obs: 'ncounts', 'well', 'plate', 'cell_line', 'replicate', 'time', 'dose_value', 'pathway_level_1', 'pathway_level_2', 'perturbation', 'target', 'pathway', 'dose_unit', 'celltype', 'disease', 'cancer', 'tissue_type', 'organism', 'perturbation_type', 'ngenes', 'percent_mito', 'percent_ribo', 'nperts', 'chembl-ID'
    var: 'ensembl_id', 'ncounts', 'ncells'

In [29]:
grn_adj_mat = np.zeros((adata_subset.shape[1], adata_subset.shape[1]))

In [30]:
names_to_new_index = {n: i for i, n in enumerate(intersection_names)}

In [31]:
old_index_to_name = {k: v["name"] for k, v in grn.node_data().items()}

In [32]:
named_edges = [(old_index_to_name[x[0]], old_index_to_name[x[1]]) for x in edges]
reindexed_edges = [(names_to_new_index[x[0]], names_to_new_index[x[1]]) for x in named_edges]

In [33]:
len(reindexed_edges)

1288885

In [34]:
# Build adjacency matrix of the GRN, in the order consistent with adata_subset
for edge in reindexed_edges:
    grn_adj_mat[edge[0], edge[1]] = 1

In [35]:
adata_subset.varp["grn_adj_mat"] = grn_adj_mat

In [37]:
np.sum(grn_adj_mat) / (grn_adj_mat.shape[0]**2)

0.007305026977479903

## Save

In [36]:
adata_subset.write_h5ad(os.path.join(get_project_root(), 
                                     "datasets/Sciplex3/processed/SrivatsanTrapnell2020_sciplex3_with_grn.h5ad"))