Computing stats for single cell RNA bubbleplot

In [2]:
import scanpy as sc
import numpy as np
import pandas as pd
import anndata as ad
import synapseclient

In [9]:
# Log in to Synapse; the `syn` client is reused by later cells to download the
# immunomodulator list and to upload results.
# The h5ad file read below is the post-QC dataset described in 'krishna_data_prep.ipynb'.
syn = synapseclient.Synapse()
syn.login()

# Load the data from a local h5ad file.
# Disabled alternative: entity = syn.get('syn59473615')
# NOTE(review): presumably that Synapse entity is the same h5ad file — confirm.
adata = sc.read_h5ad("krishna_iatlas_from_h5adfile.h5ad")

adata


UPGRADE AVAILABLE

A more recent version of the Synapse Client (4.2.0) is available. Your version (2.7.2) can be upgraded by typing:
    pip install --upgrade synapseclient

Python Synapse Client version 4.2.0 release notes

https://python-docs.synapse.org/news/



Welcome, heimann!



In [4]:
# Inspect the variable (gene) annotation table of the loaded object.
adata.var

A1BG
A1BG-AS1
A1CF
A2M
A2M-AS1
...
ZXDC
ZYG11B
ZYX
ZZEF1
ZZZ3


In [5]:
# Grab the 'normalized' layer of the AnnData object; the following cells flatten
# it into a long table and write that table to disk.
# NOTE(review): csr_matrix and find are imported here but not used in the cells shown.
from scipy.sparse import csr_matrix, find
A = adata.layers['normalized']

In [6]:
# Get the non-zero elements of the sparse matrix.
# Row indices, column indices and values are all taken from the same COO view
# and filtered with the same mask, so they are guaranteed to line up.
# (A.nonzero() skips explicitly stored zeros while A.data keeps them, which can
# make the three arrays differ in length or become misaligned.)
coo = A.tocoo()
is_nonzero = coo.data != 0
rows = coo.row[is_nonzero]
cols = coo.col[is_nonzero]
values = coo.data[is_nonzero]

# Organize in a long-format dataframe: one line per (cell, gene, value).
# Index lookups are vectorised instead of looping over every element in Python.
df = pd.DataFrame({
    'Row': adata.obs.index[rows],
    'Column': adata.var.index[cols],
    'Value': values
})

In [7]:
# Write the long-format expression table as a tab-separated file, without the DataFrame index.
df.to_csv('Krishna_2021_norm_expr.tsv', sep='\t', index=False)

In [10]:
# Upload the TSV written above to Synapse, under the parent entity syn59202660.
file_entity = synapseclient.File("Krishna_2021_norm_expr.tsv", parent = "syn59202660")
syn.store(file_entity)


##################################################
 Uploading file to Synapse storage 
##################################################



File(path='Krishna_2021_norm_expr.tsv', versionNumber=1, synapseStore=True, etag='a78c7fb7-e1dc-4a3b-8124-38a0659a3995', cacheDir='', dataFileHandleId='140888030', versionLabel='1', parentId='syn59202660', isLatestVersion=True, concreteType='org.sagebionetworks.repo.model.FileEntity', _file_handle={'id': '140888030', 'etag': 'b37b174b-31d3-426f-8193-304c915279f2', 'createdBy': '3398555', 'createdOn': '2024-05-13T22:52:24.000Z', 'modifiedOn': '2024-05-13T22:52:24.000Z', 'concreteType': 'org.sagebionetworks.repo.model.file.S3FileHandle', 'contentType': 'text/tab-separated-values', 'contentMd5': 'c7ea91a646f4de13cfcc3d63969200fc', 'fileName': 'Krishna_2021_norm_expr.tsv', 'storageLocationId': 1, 'contentSize': 7085225417, 'status': 'AVAILABLE', 'bucketName': 'proddata.sagebase.org', 'key': '3398555/4eba4171-ff01-4d50-a639-3025754baada/Krishna_2021_norm_expr.tsv', 'isPreview': False, 'externalURL': None}, files=['Krishna_2021_norm_expr.tsv'], id='syn59473653', createdOn='2024-05-13T22:52:2

In [11]:
# Get the list of immunomodulators from CRI iAtlas: fetch Synapse entity
# syn59016496 and read the downloaded file as CSV.
# The 'hgnc' column of this table is what the next cell matches against the
# gene names in the dataset.
entity = syn.get('syn59016496')
immunomodulators = pd.read_csv(entity.path)

immunomodulators

Unnamed: 0,entrez,hgnc,friendly_name,description,gene_family,gene_function,immune_checkpoint,super_category,publications
0,135,ADORA2A,ADORA2A,It is a popular target in immuno-oncology due ...,Receptor,,Inhibitory,Receptor,
1,383,ARG1,ARG1,"In tumor microenvironment, arginase degrades a...",Enzyme,Immune suppression,Inhibitory,Other,
2,151888,BTLA,BTLA,B and T lymphocyte atttenuator negatively regu...,Immunoglobulin,,Inhibitory,Receptor,
3,11119,BTN3A1,BTN3A1,,Butyrophilins,Activation of γδ T cells,Stimulatory,Co-inhibitor,
4,11118,BTN3A2,BTN3A2,,Butyrophilins,Higher expression leads to increased CD4+ infi...,Stimulatory,Co-inhibitor,
...,...,...,...,...,...,...,...,...,...
73,8744,TNFSF9,4-1BB-L,,TNF,,Stimulatory,Ligand,
74,7422,VEGFA,VEGFA,VEGFA exerts it primary functions through the ...,Growth factor,Immune suppressor,Inhibitory,Ligand,
75,7423,VEGFB,VEGFB,,Growth factor,Immune suppressor,Inhibitory,Ligand,
76,64115,VSIR,VISTA,VISTA is an immune checkpoint molecule. It is ...,Immunoglobulin,,Inhibitory,Co-inhibitor,


In [12]:
# Keep only the genes of the dataset whose name appears in the 'hgnc' column
# of the immunomodulator table.
var_names = adata.var.index
is_immunomodulator = np.isin(var_names, immunomodulators["hgnc"])
genes = pd.Series(var_names[is_immunomodulator])
genes

0     ADORA2A
1        BTLA
2      BTN3A1
3      BTN3A2
4        CD27
       ...   
66     TNFSF9
67      VEGFA
68      VEGFB
69       VSIR
70      VTCN1
Name: gene, Length: 71, dtype: object

In [13]:
# Count how many cells (rows of adata.obs) are annotated with each cell type.
# These totals are the denominator of the per-cell-type expression fraction
# computed further down.
freq_cell_types = adata.obs["cell_type"].value_counts()
freq_cell_types

T cell            29986
macrophage         7968
monocyte           6395
NK                 5685
tumor cell         5223
dendritic cell     4059
endothelium        3077
fibroblast         1939
epithelium         1391
B cell             1031
mast cell           173
megakaryocyte         9
Name: cell_type, dtype: int64

In [14]:
# Create a grid with all cell type x gene combinations (one row per pair,
# cell types varying slowest, in the order of freq_cell_types).
# Built with the public MultiIndex.from_product API rather than the private
# pd.core.reshape.util.cartesian_product helper, which is not part of pandas'
# supported interface.
cell_gene = pd.MultiIndex.from_product(
    [freq_cell_types.index, genes], names=["cell", "gene"]
).to_frame(index=False)
cell_gene

Unnamed: 0,cell,gene
0,T cell,ADORA2A
1,T cell,BTLA
2,T cell,BTN3A1
3,T cell,BTN3A2
4,T cell,CD27
...,...,...
66,megakaryocyte,TNFSF9
67,megakaryocyte,VEGFA
68,megakaryocyte,VEGFB
69,megakaryocyte,VSIR


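For each cell type × gene combination, we need to compute:
- the fraction of cells of that type that express the gene (number of cells of the type with non-zero expression / total number of cells of the type)
- the mean expression value among the cells of that type that express the gene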

In [18]:
def get_expr_by_cell(cell_type, gene):
    """Summarise the expression of one gene within one cell type.

    Reads the 'normalized' layer of the notebook-level `adata` object.

    Parameters
    ----------
    cell_type : value compared against `adata.obs.cell_type` to select cells.
    gene : name of the gene (an entry of the variable index of `adata`).

    Returns
    -------
    (counts, avg)
        counts : int, number of selected cells with a non-zero value for `gene`.
        avg : mean of the non-zero values, or 0 when no selected cell
            expresses the gene.
    """
    in_cell_type = adata.obs.cell_type == cell_type
    # Slice down to the single gene *before* calling to_df, so we do not build
    # a DataFrame holding every gene for this cell type on each call.
    expr = adata[in_cell_type, gene].to_df(layer="normalized")[gene]
    expressed = expr != 0
    # Vectorised count instead of the builtin sum() iterating over the Series.
    counts = int(expressed.sum())
    if counts > 0:
        avg = expr[expressed].mean()
    else:
        # Technically the mean is undefined here, but plotting libraries crash
        # with NAs or characters, so 0 is used as a placeholder.
        avg = 0
    return counts, avg

In [19]:
# Row-wise adapter around get_expr_by_cell, meant for DataFrame.apply(axis=1)
def apply_function(row):
    """Return a Series with the 'counts' and 'avg' statistics for one row.

    `row` must provide a 'cell' entry (cell type) and a 'gene' entry (gene name);
    both are forwarded to get_expr_by_cell.
    """
    stats = get_expr_by_cell(row['cell'], row['gene'])
    return pd.Series(list(stats), index=['counts', 'avg'])

# Apply the function to each row and concatenate the results
#result = pd.concat([cell_gene, cell_gene.apply(apply_function, axis=1)], axis=1)


In [28]:
# Compute the statistics for every cell type x gene pair, then attach them as
# extra columns next to the pair they belong to.
cell_gene_stats = cell_gene.apply(apply_function, axis=1)
result = pd.concat([cell_gene, cell_gene_stats], axis=1)
result.head(10)

Unnamed: 0,cell,gene,counts,avg
0,T cell,ADORA2A,603.0,1.462132
1,T cell,BTLA,1119.0,1.530739
2,T cell,BTN3A1,6741.0,1.428604
3,T cell,BTN3A2,12385.0,1.583535
4,T cell,CD27,9111.0,2.394756
5,T cell,CD274,440.0,1.188999
6,T cell,CD276,1995.0,1.018305
7,T cell,CD28,2792.0,1.659164
8,T cell,CD40,4003.0,1.195334
9,T cell,CD40LG,2509.0,1.881143


In [29]:
# Now we compute the fraction of cells from a given cell type that have
# expression for a gene (a value between 0 and 1, not multiplied by 100).
# The per-cell-type totals are renamed explicitly before merging: the name that
# value_counts() gives its result depends on the pandas version ('cell_type'
# before 2.0, 'count' from 2.0 on), and the column must be called 'cell_type'
# for the division below.
cells_per_type = freq_cell_types.rename('cell_type')
result = pd.merge(result, cells_per_type, left_on='cell', right_index=True)
result['perc_expr'] = result['counts'] / result['cell_type']
result.iloc[0:10]

Unnamed: 0,cell,gene,counts,avg,cell_type,perc_expr
0,T cell,ADORA2A,603.0,1.462132,29986,0.020109
1,T cell,BTLA,1119.0,1.530739,29986,0.037317
2,T cell,BTN3A1,6741.0,1.428604,29986,0.224805
3,T cell,BTN3A2,12385.0,1.583535,29986,0.413026
4,T cell,CD27,9111.0,2.394756,29986,0.303842
5,T cell,CD274,440.0,1.188999,29986,0.014674
6,T cell,CD276,1995.0,1.018305,29986,0.066531
7,T cell,CD28,2792.0,1.659164,29986,0.09311
8,T cell,CD40,4003.0,1.195334,29986,0.133496
9,T cell,CD40LG,2509.0,1.881143,29986,0.083672


In [30]:
# Rename the per-cell-type total from 'cell_type' to 'Freq'.
# Renaming by name (instead of assigning a full positional list of labels)
# cannot mislabel other columns if their order or number changes upstream.
result = result.rename(columns={"cell_type": "Freq"})
result.iloc[0:10]

Unnamed: 0,cell,gene,counts,avg,Freq,perc_expr
0,T cell,ADORA2A,603.0,1.462132,29986,0.020109
1,T cell,BTLA,1119.0,1.530739,29986,0.037317
2,T cell,BTN3A1,6741.0,1.428604,29986,0.224805
3,T cell,BTN3A2,12385.0,1.583535,29986,0.413026
4,T cell,CD27,9111.0,2.394756,29986,0.303842
5,T cell,CD274,440.0,1.188999,29986,0.014674
6,T cell,CD276,1995.0,1.018305,29986,0.066531
7,T cell,CD28,2792.0,1.659164,29986,0.09311
8,T cell,CD40,4003.0,1.195334,29986,0.133496
9,T cell,CD40LG,2509.0,1.881143,29986,0.083672


In [31]:
# Tag every row with the dataset name, then write the table as a tab-separated
# file without the DataFrame index.
result['dataset'] = "Krishna_2021"
result.to_csv('Krishna_2021_bubble_plot_df.tsv', sep='\t', index=False)