Computing stats for single cell RNA bubbleplot

In [1]:
import scanpy as sc
import numpy as np
import pandas as pd
import anndata as ad
import synapseclient

In [3]:
# Load the post-QC h5ad file; the QC steps are described in 'bi_data_prep.ipynb'.
# syn.login() is called without arguments, so no credentials are hardcoded in this notebook.
syn = synapseclient.Synapse()
syn.login()

# Fetch Synapse entity syn60521378 and read the file at entity.path with scanpy
entity = syn.get('syn60521378')
adata = sc.read_h5ad(entity.path)

# Display the summary of the loaded object (obs/var columns, layers)
adata

Welcome, heimann!




UPGRADE AVAILABLE

A more recent version of the Synapse Client (4.3.0) is available. Your version (2.7.2) can be upgraded by typing:
    pip install --upgrade synapseclient

Python Synapse Client version 4.3.0 release notes

https://python-docs.synapse.org/news/




AnnData object with n_obs × n_vars = 34326 × 32636
    obs: 'NAME', 'biosample_id', 'donor_id', 'species', 'species__ontology_label', 'disease', 'disease__ontology_label', 'organ', 'organ__ontology_label', 'library_preparation_protocol', 'library_preparation_protocol__ontology_label', 'ICB_Exposed', 'ICB_Response', 'TKI_Exposed', 'Lineage', 'InferCNV', 'FinalCellType', 'sex', 'cell_type_iatlas'
    var: 'symbol'
    uns: 'log1p'
    layers: 'counts', 'normalized'

In [None]:
#export the obs table for future use in clinical data mapping
# The table to export is adata.obs (the per-cell annotations). `result` is only created
# further down in the notebook, so referencing it here fails on a fresh kernel.
# NOTE(review): index=False drops the obs index; confirm the 'NAME' column carries the
# cell identifiers needed for the clinical data mapping.
adata.obs.to_csv('Bi_2021_obs.tsv', sep='\t', index=False)
file_entity = synapseclient.File("Bi_2021_obs.tsv", parent = "syn59966587")
syn.store(file_entity)

In [4]:
#now, we will explore the available cell type annotations
# Number of cells carrying each 'cell_type_iatlas' label
adata.obs["cell_type_iatlas"].value_counts()

T cell               13090
tumor                 8040
macrophage            5053
NK                    3056
myeloid cell          1407
monocyte              1157
B cell                 962
plasma cell            463
Dendritic cell         419
Misc/Undetermined      278
endothelium            271
fibroblast              91
mast cell               39
Name: cell_type_iatlas, dtype: int64

In [5]:
# Inspect the per-gene annotation table; its index is what gene names are matched against below
adata.var

Unnamed: 0_level_0,symbol
symbol,Unnamed: 1_level_1
WASH7P,WASH7P
AL627309.1,AL627309.1
AL627309.3,AL627309.3
CICP27,CICP27
AL627309.1.1,AL627309.1.1
...,...
CU638689.5,CU638689.5
CU634019.2,CU634019.2
CU634019.6,CU634019.6
CU638689.1,CU638689.1


In [6]:
# Get the list of immunomodulators from CRI iAtlas: fetch Synapse entity syn59016496
# and read the file at entity.path as CSV. Note that `entity` is rebound here.
entity = syn.get('syn59016496')
immunomodulators = pd.read_csv(entity.path)

# Display the table; the 'hgnc' column is used below to select genes from the dataset
immunomodulators

Unnamed: 0,entrez,hgnc,friendly_name,description,gene_family,gene_function,immune_checkpoint,super_category,publications
0,135,ADORA2A,ADORA2A,It is a popular target in immuno-oncology due ...,Receptor,,Inhibitory,Receptor,
1,383,ARG1,ARG1,"In tumor microenvironment, arginase degrades a...",Enzyme,Immune suppression,Inhibitory,Other,
2,151888,BTLA,BTLA,B and T lymphocyte atttenuator negatively regu...,Immunoglobulin,,Inhibitory,Receptor,
3,11119,BTN3A1,BTN3A1,,Butyrophilins,Activation of γδ T cells,Stimulatory,Co-inhibitor,
4,11118,BTN3A2,BTN3A2,,Butyrophilins,Higher expression leads to increased CD4+ infi...,Stimulatory,Co-inhibitor,
...,...,...,...,...,...,...,...,...,...
73,8744,TNFSF9,4-1BB-L,,TNF,,Stimulatory,Ligand,
74,7422,VEGFA,VEGFA,VEGFA exerts it primary functions through the ...,Growth factor,Immune suppressor,Inhibitory,Ligand,
75,7423,VEGFB,VEGFB,,Growth factor,Immune suppressor,Inhibitory,Ligand,
76,64115,VSIR,VISTA,VISTA is an immune checkpoint molecule. It is ...,Immunoglobulin,,Inhibitory,Co-inhibitor,


In [7]:
# Select the genes that are both present in the dataset (adata.var index) and listed
# as immunomodulators (matched on the 'hgnc' column)
in_immunomodulators = np.isin(adata.var.index, immunomodulators["hgnc"])
genes = pd.Series(adata.var.index[in_immunomodulators])
genes

0     TNFRSF18
1      TNFRSF4
2     TNFRSF14
3      TNFRSF9
4        VTCN1
        ...   
67        CD40
68      ICOSLG
69       ITGB2
70     ADORA2A
71      CD40LG
Name: symbol, Length: 72, dtype: object

In [8]:
# Count how many cells carry each 'cell_type_iatlas' label; these totals are used later
# as the denominator when computing the fraction of cells expressing a gene
freq_cell_types = adata.obs["cell_type_iatlas"].value_counts()
freq_cell_types

T cell               13090
tumor                 8040
macrophage            5053
NK                    3056
myeloid cell          1407
monocyte              1157
B cell                 962
plasma cell            463
Dendritic cell         419
Misc/Undetermined      278
endothelium            271
fibroblast              91
mast cell               39
Name: cell_type_iatlas, dtype: int64

In [9]:
#create grid with all cell x gene combinations
# MultiIndex.from_product is the public pandas API for a cartesian product; it replaces the
# private pd.core.reshape.util.cartesian_product helper, which is not part of the supported API.
# The first level (cell type) varies slowest, so rows stay grouped by cell type.
cell_gene = pd.MultiIndex.from_product(
    [freq_cell_types.index, genes], names=["cell", "gene"]
).to_frame(index=False)
cell_gene

Unnamed: 0,cell,gene
0,T cell,TNFRSF18
1,T cell,TNFRSF4
2,T cell,TNFRSF14
3,T cell,TNFRSF9
4,T cell,VTCN1
...,...,...
67,mast cell,CD40
68,mast cell,ICOSLG
69,mast cell,ITGB2
70,mast cell,ADORA2A


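We need to compute, for each cell type × gene combination:
- the fraction of cells of a given type that express the gene (number of cells of that type with non-zero expression / total number of cells of that type)
- the mean normalized expression value among the cells of that type that express the gene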

In [10]:
def get_expr_by_cell(cell_type, gene):
    """Summarise the expression of one gene within one cell type.

    Reads the module-level ``adata`` object and uses its "normalized" layer.

    Parameters
    ----------
    cell_type : value compared against ``adata.obs.cell_type_iatlas``
    gene : column label of the gene in the expression table

    Returns
    -------
    (counts, avg) : tuple
        ``counts`` is the number of cells of ``cell_type`` with non-zero
        expression of ``gene``; ``avg`` is the mean expression over those
        expressing cells, or 0 when no cell expresses the gene.
    """
    is_cell_type = adata.obs.cell_type_iatlas == cell_type
    # Slice down to the single gene *before* converting to a DataFrame, so only one
    # column is densified instead of every gene in the dataset.
    expr = adata[is_cell_type, gene].to_df(layer="normalized")[gene]
    expressed = expr[expr != 0]
    counts = int((expr != 0).sum())
    if counts > 0:
        avg = expressed.mean()
    else:
        # technically this is wrong, but plotting libraries crash with NAs or characters
        avg = 0.0
    return counts, avg

In [11]:
# Row-wise adapter around get_expr_by_cell
def apply_function(row):
    """Return the expression statistics for one row of the cell x gene grid.

    ``row`` must provide the keys 'cell' and 'gene'. The result is a pandas
    Series labelled 'counts' and 'avg', holding the two values returned by
    get_expr_by_cell for that combination.
    """
    n_expressing, mean_expr = get_expr_by_cell(row['cell'], row['gene'])
    return pd.Series([n_expressing, mean_expr], index=['counts', 'avg'])

# The function is applied to every row of cell_gene in the next cell.


In [12]:
# Compute the statistics for every cell x gene row, then attach them as extra columns
expr_stats = cell_gene.apply(apply_function, axis=1)
result = pd.concat([cell_gene, expr_stats], axis=1)
result.head(10)

Unnamed: 0,cell,gene,counts,avg
0,T cell,TNFRSF18,1195.0,2.025234
1,T cell,TNFRSF4,1177.0,1.931231
2,T cell,TNFRSF14,2750.0,1.813556
3,T cell,TNFRSF9,2002.0,1.962041
4,T cell,VTCN1,0.0,0.0
5,T cell,SLAMF7,1064.0,1.804381
6,T cell,SELP,19.0,1.553889
7,T cell,TNFSF4,842.0,1.760308
8,T cell,IL10,212.0,1.968722
9,T cell,IL1A,0.0,0.0


In [15]:
#Now we compute the % of cells from a given cell type that have expression for a gene
# Name the per-cell-type totals explicitly: pandas >= 2.0 names the value_counts() result
# "count" instead of the source column, which would make the lookup below raise KeyError.
cell_type_totals = freq_cell_types.rename("cell_type_iatlas")
result = pd.merge(result, cell_type_totals, left_on='cell', right_index=True)
# Fraction (0-1) of cells of each type with non-zero expression of the gene
result['perc_expr'] = result['counts'] / result['cell_type_iatlas']
result.iloc[0:10]

Unnamed: 0,cell,gene,counts,avg,cell_type_iatlas,perc_expr
0,T cell,TNFRSF18,1195.0,2.025234,13090,0.091291
1,T cell,TNFRSF4,1177.0,1.931231,13090,0.089916
2,T cell,TNFRSF14,2750.0,1.813556,13090,0.210084
3,T cell,TNFRSF9,2002.0,1.962041,13090,0.152941
4,T cell,VTCN1,0.0,0.0,13090,0.0
5,T cell,SLAMF7,1064.0,1.804381,13090,0.081283
6,T cell,SELP,19.0,1.553889,13090,0.001451
7,T cell,TNFSF4,842.0,1.760308,13090,0.064324
8,T cell,IL10,212.0,1.968722,13090,0.016196
9,T cell,IL1A,0.0,0.0,13090,0.0


In [16]:
# Rename the cell-type total column by name instead of overwriting every column label
# positionally, so this cell does not depend on the number or order of columns.
result = result.rename(columns={"cell_type_iatlas": "Freq"})
result.iloc[0:10]

Unnamed: 0,cell,gene,counts,avg,Freq,perc_expr
0,T cell,TNFRSF18,1195.0,2.025234,13090,0.091291
1,T cell,TNFRSF4,1177.0,1.931231,13090,0.089916
2,T cell,TNFRSF14,2750.0,1.813556,13090,0.210084
3,T cell,TNFRSF9,2002.0,1.962041,13090,0.152941
4,T cell,VTCN1,0.0,0.0,13090,0.0
5,T cell,SLAMF7,1064.0,1.804381,13090,0.081283
6,T cell,SELP,19.0,1.553889,13090,0.001451
7,T cell,TNFSF4,842.0,1.760308,13090,0.064324
8,T cell,IL10,212.0,1.968722,13090,0.016196
9,T cell,IL1A,0.0,0.0,13090,0.0


In [17]:
# Tag every row with the dataset name, write the table as a tab-separated file,
# and upload that file to the Synapse folder syn59966587
result['dataset'] = "Bi_2021"
bubble_plot_path = 'Bi_2021_bubble_plot_df.tsv'
result.to_csv(bubble_plot_path, sep='\t', index=False)
file_entity = synapseclient.File(bubble_plot_path, parent="syn59966587")
syn.store(file_entity)


##################################################
 Uploading file to Synapse storage 
##################################################



File(modifiedBy='3398555', isLatestVersion=True, _file_handle={'id': '142327197', 'etag': '1ac5c587-051c-4cea-9d61-cf442e137f89', 'createdBy': '3398555', 'createdOn': '2024-06-05T22:26:59.000Z', 'modifiedOn': '2024-06-05T22:26:59.000Z', 'concreteType': 'org.sagebionetworks.repo.model.file.S3FileHandle', 'contentType': 'text/tab-separated-values', 'contentMd5': 'd53349857cf12e6a5d2391fd9cc93519', 'fileName': 'Bi_2021_bubble_plot_df.tsv', 'storageLocationId': 1, 'contentSize': 63649, 'status': 'AVAILABLE', 'bucketName': 'proddata.sagebase.org', 'key': '3398555/1eba71c6-a24a-4f40-a25f-07a66ad2c4cd/Bi_2021_bubble_plot_df.tsv', 'isPreview': False, 'externalURL': None}, dataFileHandleId='142327197', etag='395046f8-3dcb-4211-8fa5-2c9199280f41', concreteType='org.sagebionetworks.repo.model.FileEntity', cacheDir='', createdBy='3398555', versionLabel='1', synapseStore=True, modifiedOn='2024-06-05T22:26:59.761Z', files=['Bi_2021_bubble_plot_df.tsv'], name='Bi_2021_bubble_plot_df.tsv', versionNumb