Computing stats for single cell RNA bubbleplot

In [10]:
import scanpy as sc
import numpy as np
import pandas as pd
import anndata as ad
import synapseclient

In [11]:
#loading h5ad file after qc, described in 'li_data_prep.ipynb'
syn = synapseclient.Synapse()
syn.login()

#Load data
entity = syn.get('syn59809731')
adata = sc.read_h5ad(entity.path)

adata

Welcome, heimann!



AnnData object with n_obs × n_vars = 270855 × 19736
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'patient', 'percent.mt', 'summaryDescription', 'annotation', 'region', 'broad_type', 'cell_type_iatlas'
    var: 'name'
    uns: 'annotation_colors', 'broad_type_colors', 'log1p', 'region_colors', 'summaryDescription_colors'
    obsm: 'X_pca', 'X_umap'
    layers: 'counts', 'normalized'

In [28]:
adata.obs["summaryDescription"].value_counts()

summaryDescription
Tumour            157636
Blood              60084
Tumour-normal      23482
Normal kidney      11838
Normal adrenal      6642
Metastasis          4657
Thrombus            4395
Fat                 2121
Name: count, dtype: int64

In [12]:
adata.var

Unnamed: 0,name
FO538757.3,FO538757.3
FO538757.2,FO538757.2
AP006222.2,AP006222.2
RP5-857K21.4,RP5-857K21.4
RP11-206L10.9,RP11-206L10.9
...,...
BACH1-IT2,BACH1-IT2
AP000282.2,AP000282.2
LINC01436,LINC01436
CRYAA,CRYAA


In [13]:
#getting the list of immunomodulators from CRI iAtlas
entity = syn.get('syn59016496')
immunomodulators = pd.read_csv(entity.path)

immunomodulators

Unnamed: 0,entrez,hgnc,friendly_name,description,gene_family,gene_function,immune_checkpoint,super_category,publications
0,135,ADORA2A,ADORA2A,It is a popular target in immuno-oncology due ...,Receptor,,Inhibitory,Receptor,
1,383,ARG1,ARG1,"In tumor microenvironment, arginase degrades a...",Enzyme,Immune suppression,Inhibitory,Other,
2,151888,BTLA,BTLA,B and T lymphocyte atttenuator negatively regu...,Immunoglobulin,,Inhibitory,Receptor,
3,11119,BTN3A1,BTN3A1,,Butyrophilins,Activation of γδ T cells,Stimulatory,Co-inhibitor,
4,11118,BTN3A2,BTN3A2,,Butyrophilins,Higher expression leads to increased CD4+ infi...,Stimulatory,Co-inhibitor,
...,...,...,...,...,...,...,...,...,...
73,8744,TNFSF9,4-1BB-L,,TNF,,Stimulatory,Ligand,
74,7422,VEGFA,VEGFA,VEGFA exerts it primary functions through the ...,Growth factor,Immune suppressor,Inhibitory,Ligand,
75,7423,VEGFB,VEGFB,,Growth factor,Immune suppressor,Inhibitory,Ligand,
76,64115,VSIR,VISTA,VISTA is an immune checkpoint molecule. It is ...,Immunoglobulin,,Inhibitory,Co-inhibitor,


In [16]:
#get a list of the genes that are present in the dataset & are immunomodulators
genes = pd.Series(adata.var[np.isin(adata.var.index, immunomodulators["hgnc"])].index)
genes

0     TNFRSF18
1      TNFRSF4
2     TNFRSF14
3      TNFRSF9
4       SLAMF7
        ...   
67     ADORA2A
68      ICOSLG
69       ITGB2
70        IL13
71       VTCN1
Length: 72, dtype: object

In [14]:
#compute how many counts map to each cell type
freq_cell_types = adata.obs["cell_type_iatlas"].value_counts()
freq_cell_types

cell_type_iatlas
T cell             147172
myeloid cell        49329
NK                  35541
tumor               14399
B cell               7841
endothelium          6048
fibroblast           3671
epithelium           2378
proximal tubule      1950
plasma cell          1254
Dendritic cell        756
mast cell             516
Name: count, dtype: int64

In [17]:
#create grid with all cell x gene combinations
lp1, lp2 = pd.core.reshape.util.cartesian_product([freq_cell_types.index, genes])
cell_gene = pd.DataFrame(dict(cell=lp1, gene=lp2))
cell_gene

Unnamed: 0,cell,gene
0,T cell,TNFRSF18
1,T cell,TNFRSF4
2,T cell,TNFRSF14
3,T cell,TNFRSF9
4,T cell,SLAMF7
...,...,...
67,mast cell,ADORA2A
68,mast cell,ICOSLG
69,mast cell,ITGB2
70,mast cell,IL13


We need to compute:
- % of cell of a given type that expresses a gene (count expr by type/freq of cell type)
- mean expr value

In [21]:
def get_expr_by_cell(cell_type, gene):
    expr = adata[adata.obs.cell_type_iatlas == cell_type].to_df(layer="normalized")[gene]
    counts = sum(expr != 0)
    if(counts>0): avg = (expr[expr != 0]).mean()
    else: avg = 0 #technically this is wrong, but plotting libraries crash with NAs or characters
    return counts, avg

In [19]:
# Function to apply to each row
def apply_function(row):
    counts, avg = get_expr_by_cell(row['cell'], row['gene'])  
    return pd.Series({'counts': counts, 'avg': avg})

# Apply the function to each row and concatenate the results
#result = pd.concat([cell_gene, cell_gene.apply(apply_function, axis=1)], axis=1)


In [22]:
result = pd.concat([cell_gene, cell_gene.apply(apply_function, axis=1)], axis=1)
result.iloc[0:10]

Unnamed: 0,cell,gene,counts,avg
0,T cell,TNFRSF18,7525.0,1.792267
1,T cell,TNFRSF4,9596.0,1.730815
2,T cell,TNFRSF14,49350.0,1.639123
3,T cell,TNFRSF9,38576.0,1.902407
4,T cell,SLAMF7,13815.0,1.545854
5,T cell,SELP,188.0,1.502002
6,T cell,TNFSF4,4393.0,1.580039
7,T cell,IL10,4292.0,2.038293
8,T cell,IL1A,16.0,1.418233
9,T cell,IL1B,94.0,1.615522


In [25]:
#Now we compute the % of cells from a given cell type that have expression for a gene
result = pd.merge(result, freq_cell_types, left_on='cell', right_index=True)
result['perc_expr'] = result['counts'] / result['count']
result.iloc[0:10]

Unnamed: 0,cell,gene,counts,avg,count,perc_expr
0,T cell,TNFRSF18,7525.0,1.792267,147172,0.051131
1,T cell,TNFRSF4,9596.0,1.730815,147172,0.065203
2,T cell,TNFRSF14,49350.0,1.639123,147172,0.335322
3,T cell,TNFRSF9,38576.0,1.902407,147172,0.262115
4,T cell,SLAMF7,13815.0,1.545854,147172,0.09387
5,T cell,SELP,188.0,1.502002,147172,0.001277
6,T cell,TNFSF4,4393.0,1.580039,147172,0.029849
7,T cell,IL10,4292.0,2.038293,147172,0.029163
8,T cell,IL1A,16.0,1.418233,147172,0.000109
9,T cell,IL1B,94.0,1.615522,147172,0.000639


In [26]:
result.columns = ["cell", "gene", "counts", "avg", "Freq", "perc_expr"]
result.iloc[0:10]

Unnamed: 0,cell,gene,counts,avg,Freq,perc_expr
0,T cell,TNFRSF18,7525.0,1.792267,147172,0.051131
1,T cell,TNFRSF4,9596.0,1.730815,147172,0.065203
2,T cell,TNFRSF14,49350.0,1.639123,147172,0.335322
3,T cell,TNFRSF9,38576.0,1.902407,147172,0.262115
4,T cell,SLAMF7,13815.0,1.545854,147172,0.09387
5,T cell,SELP,188.0,1.502002,147172,0.001277
6,T cell,TNFSF4,4393.0,1.580039,147172,0.029849
7,T cell,IL10,4292.0,2.038293,147172,0.029163
8,T cell,IL1A,16.0,1.418233,147172,0.000109
9,T cell,IL1B,94.0,1.615522,147172,0.000639


In [27]:
#Add dataset info and save data into file
result['dataset'] = "Li_2022"
result.to_csv('Li_2022_bubble_plot_df.tsv', sep='\t', index=False)
file_entity = synapseclient.File("Li_2022_bubble_plot_df.tsv", parent = "syn59809728")
syn.store(file_entity)

Uploading to Synapse storage: 100%|█| 62.0k/62.0k [00:00<00:00, 92.8kB/s, Li_202


File(name='Li_2022_bubble_plot_df.tsv', _file_handle={'id': '141419749', 'etag': '0fce8827-0785-4180-934e-685e6fc6ab7d', 'createdBy': '3398555', 'createdOn': '2024-05-24T00:06:25.000Z', 'modifiedOn': '2024-05-24T00:06:25.000Z', 'concreteType': 'org.sagebionetworks.repo.model.file.S3FileHandle', 'contentType': 'text/tab-separated-values', 'contentMd5': '8f5dc0ea65c67fb95594a951d6991e1f', 'fileName': 'Li_2022_bubble_plot_df.tsv', 'storageLocationId': 1, 'contentSize': 62038, 'status': 'AVAILABLE', 'bucketName': 'proddata.sagebase.org', 'key': '3398555/529d5e21-5e95-4223-bb0d-9001b6b6fb98/Li_2022_bubble_plot_df.tsv', 'isPreview': False, 'externalURL': None}, id='syn59881857', files=['Li_2022_bubble_plot_df.tsv'], isLatestVersion=True, versionLabel='1', path='Li_2022_bubble_plot_df.tsv', concreteType='org.sagebionetworks.repo.model.FileEntity', modifiedBy='3398555', synapseStore=True, modifiedOn='2024-05-24T00:06:25.523Z', etag='9a34df44-7ed6-4f27-9334-4c5000654f5b', createdBy='3398555', p