Computing stats for single cell RNA bubbleplot

In [2]:
import scanpy as sc
import numpy as np
import pandas as pd
import anndata as ad
import synapseclient

In [10]:
#loading h5ad file after qc, described in 'krishna_data_prep.ipynb'
syn = synapseclient.Synapse()
syn.login()

#Load data
#entity = syn.get('syn61363126')
adata = sc.read_h5ad("krishna_iatlas_from_h5adfile.h5ad")

adata

Welcome, heimann!



AnnData object with n_obs × n_vars = 167283 × 15979
    obs: 'nGene', 'nUMI', 'author_type', 'author_sample', 'author_cluster', 'author_cluster_name', 'hca_data_portal_cellsuspension_uuid', 'hca_data_portal_donor_uuid', 'donor_id', 'suspension_type', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id', 'tissue_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid', 'cell_type_iatlas'
    var: 'index', 'feature_is_filtered', 'feature_reference', 'feature_biotype', 'feature_length'
    uns: 'citation', 'default_embedding', 'log1p', 'schema_reference', 'schema_version', 'title'
    obsm: 'X_pca', 'X_tsne', 'X_umap'
    layers: 'counts', 'normalized'

In [11]:
#getting the list of immunomodulators from CRI iAtlas
entity = syn.get('syn59016496')
immunomodulators = pd.read_csv(entity.path)

immunomodulators

Unnamed: 0,entrez,hgnc,friendly_name,description,gene_family,gene_function,immune_checkpoint,super_category,publications
0,135,ADORA2A,ADORA2A,It is a popular target in immuno-oncology due ...,Receptor,,Inhibitory,Receptor,
1,383,ARG1,ARG1,"In tumor microenvironment, arginase degrades a...",Enzyme,Immune suppression,Inhibitory,Other,
2,151888,BTLA,BTLA,B and T lymphocyte atttenuator negatively regu...,Immunoglobulin,,Inhibitory,Receptor,
3,11119,BTN3A1,BTN3A1,,Butyrophilins,Activation of γδ T cells,Stimulatory,Co-inhibitor,
4,11118,BTN3A2,BTN3A2,,Butyrophilins,Higher expression leads to increased CD4+ infi...,Stimulatory,Co-inhibitor,
...,...,...,...,...,...,...,...,...,...
73,8744,TNFSF9,4-1BB-L,,TNF,,Stimulatory,Ligand,
74,7422,VEGFA,VEGFA,VEGFA exerts it primary functions through the ...,Growth factor,Immune suppressor,Inhibitory,Ligand,
75,7423,VEGFB,VEGFB,,Growth factor,Immune suppressor,Inhibitory,Ligand,
76,64115,VSIR,VISTA,VISTA is an immune checkpoint molecule. It is ...,Immunoglobulin,,Inhibitory,Co-inhibitor,


In [12]:
#get a list of the genes that are present in the dataset & are immunomodulators
genes = pd.Series(adata.var[np.isin(adata.var.index, immunomodulators["hgnc"])].index)
genes

0     TNFRSF18
1      TNFRSF4
2     TNFRSF14
3      TNFRSF9
4        VTCN1
        ...   
66     KIR2DL3
67     KIR2DL1
68     ADORA2A
69      ICOSLG
70       ITGB2
Name: feature_name, Length: 71, dtype: category
Categories (15979, object): ['A1BG', 'A1BG-AS1', 'A1CF', 'A2M', ..., 'ZYG11B', 'ZYX', 'ZZEF1', 'ZZZ3']

In [16]:
#compute how many counts map to each cell type
freq_cell_types = adata.obs["cell_type_iatlas"].value_counts()
freq_cell_types

cell_type_iatlas
T cell                73806
macrophage            20547
endothelium           14062
monocyte              14011
NK                    12538
Ambiguous             10924
Dendritic cell         6665
tumor                  5715
fibroblast             3076
TAM/TCR (Ambiguos)     2797
B cell                 2722
mast cell               242
megakaryocyte           117
Ambiguous/Dead           61
Name: count, dtype: int64

In [17]:
#create grid with all cell x gene combinations
lp1, lp2 = pd.core.reshape.util.cartesian_product([freq_cell_types.index, genes])
cell_gene = pd.DataFrame(dict(cell=lp1, gene=lp2))
cell_gene

Unnamed: 0,cell,gene
0,T cell,TNFRSF18
1,T cell,TNFRSF4
2,T cell,TNFRSF14
3,T cell,TNFRSF9
4,T cell,VTCN1
...,...,...
66,Ambiguous/Dead,KIR2DL3
67,Ambiguous/Dead,KIR2DL1
68,Ambiguous/Dead,ADORA2A
69,Ambiguous/Dead,ICOSLG


We need to compute:
- % of cell of a given type that expresses a gene (count expr by type/freq of cell type)
- mean expr value

In [22]:
def get_expr_by_cell(cell_type, gene):
    expr = adata[adata.obs.cell_type_iatlas == cell_type].to_df(layer="normalized")[gene]
    counts = sum(expr != 0)
    if(counts>0): avg = (expr[expr != 0]).mean()
    else: avg = 0 #technically this is wrong, but plotting libraries crash with NAs or characters
    return counts, avg

In [30]:
# Function to apply to each row
def apply_function(row):
    counts, avg = get_expr_by_cell(row['cell'], row['gene'])  
    return pd.Series({'counts': counts, 'avg': avg})

# Apply the function to each row and concatenate the results
result = pd.concat([cell_gene, cell_gene.apply(apply_function, axis=1)], axis=1)


In [31]:
result.iloc[0:10]

Unnamed: 0,cell,gene,counts,avg
0,T cell,TNFRSF18,6197.0,2.186137
1,T cell,TNFRSF4,7440.0,2.19297
2,T cell,TNFRSF14,26395.0,2.184081
3,T cell,TNFRSF9,17317.0,2.273305
4,T cell,VTCN1,2.0,2.252233
5,T cell,SLAMF7,7246.0,2.133539
6,T cell,SELP,232.0,2.023073
7,T cell,TNFSF4,4257.0,2.104519
8,T cell,IL10,829.0,2.179649
9,T cell,IL1A,40.0,1.653637


In [32]:
#Now we compute the % of cells from a given cell type that have expression for a gene
result = pd.merge(result, freq_cell_types, left_on='cell', right_index=True)
result['perc_expr'] = result['counts'] / result['count']
result.iloc[0:10]

Unnamed: 0,cell,gene,counts,avg,count,perc_expr
0,T cell,TNFRSF18,6197.0,2.186137,73806,0.083963
1,T cell,TNFRSF4,7440.0,2.19297,73806,0.100805
2,T cell,TNFRSF14,26395.0,2.184081,73806,0.357627
3,T cell,TNFRSF9,17317.0,2.273305,73806,0.234629
4,T cell,VTCN1,2.0,2.252233,73806,2.7e-05
5,T cell,SLAMF7,7246.0,2.133539,73806,0.098176
6,T cell,SELP,232.0,2.023073,73806,0.003143
7,T cell,TNFSF4,4257.0,2.104519,73806,0.057678
8,T cell,IL10,829.0,2.179649,73806,0.011232
9,T cell,IL1A,40.0,1.653637,73806,0.000542


In [33]:
result.columns = ["cell", "gene", "counts", "avg", "Freq", "perc_expr"]
result.iloc[0:10]

Unnamed: 0,cell,gene,counts,avg,Freq,perc_expr
0,T cell,TNFRSF18,6197.0,2.186137,73806,0.083963
1,T cell,TNFRSF4,7440.0,2.19297,73806,0.100805
2,T cell,TNFRSF14,26395.0,2.184081,73806,0.357627
3,T cell,TNFRSF9,17317.0,2.273305,73806,0.234629
4,T cell,VTCN1,2.0,2.252233,73806,2.7e-05
5,T cell,SLAMF7,7246.0,2.133539,73806,0.098176
6,T cell,SELP,232.0,2.023073,73806,0.003143
7,T cell,TNFSF4,4257.0,2.104519,73806,0.057678
8,T cell,IL10,829.0,2.179649,73806,0.011232
9,T cell,IL1A,40.0,1.653637,73806,0.000542


In [34]:
#Add dataset info and save data into file
result['dataset'] = "Krishna_2021"
result.to_csv('Krishna_2021_bubble_plot_df.tsv', sep='\t', index=False)
file_entity = synapseclient.File('Krishna_2021_bubble_plot_df.tsv', 'syn59202660')
file_entity = syn.store(file_entity)

Uploading to Synapse storage: 100%|██████████| 75.1k/75.1k [00:00<00:00, 90.6kB/s, Krishna_2021_bubble_plot_df.tsv]
