Computing stats for single cell RNA bubbleplot

In [1]:
import scanpy as sc
import numpy as np
import pandas as pd
import anndata as ad
import synapseclient

In [2]:
# Load the post-QC h5ad file (QC is described in 'msk_data_prep.ipynb').
# Open a Synapse session; `syn` is reused by later cells to fetch and store files.
syn = synapseclient.Synapse()
syn.login()

# Fetch Synapse entity 'syn55258687' and read the file at its local path with scanpy.
entity = syn.get('syn55258687')
adata = sc.read_h5ad(entity.path)

# Display the AnnData summary (dimensions, obs/var columns, layers).
adata


UPGRADE AVAILABLE

A more recent version of the Synapse Client (4.2.0) is available. Your version (2.7.2) can be upgraded by typing:
    pip install --upgrade synapseclient

Python Synapse Client version 4.2.0 release notes

https://python-docs.synapse.org/news/



Welcome, heimann!



AnnData object with n_obs × n_vars = 342749 × 36601
    obs: 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'scrublet', 'batch', 'CD45_enrich', 'batch_num', 'cohort', 'pCR', 'RCB', 'cleared_nodes', 'treatment', 'patient_treatment', 'hormone_receptor', 'combined_tcr', 'umap1', 'umap2', 'leiden_50nbr_res1.2', 'celltype', 'global_clusters', 'bcell_leiden_nbr100_res0.6', 'tcell_leiden_nbr100_res0.6', 'myeloid_leiden_nbr30_res0.8', 'subcluster'
    var: 'gene_ids', 'feature_types'
    uns: 'bcell_leiden_nbr100_res0.6_colors', 'celltype_colors', 'global_clusters_colors', 'leiden_50nbr_res1.2_colors', 'log1p', 'myeloid_leiden_nbr30_res0.8_colors', 'subcluster_colors', 'tcell_leiden_nbr100_res0.6_colors'
    obsm: 'X_bcell_umap', 'X_myeloid_umap', 'X_tcell_umap', 'X_umap'
    layers: 'counts

In [26]:
# Inspect the per-gene annotation table (columns 'gene_ids' and 'feature_types').
adata.var

Unnamed: 0,gene_ids,feature_types
MIR1302-2HG,ENSG00000243485,Gene Expression
FAM138A,ENSG00000237613,Gene Expression
OR4F5,ENSG00000186092,Gene Expression
AL627309.1,ENSG00000238009,Gene Expression
AL627309.3,ENSG00000239945,Gene Expression
...,...,...
AC141272.1,ENSG00000277836,Gene Expression
AC023491.2,ENSG00000278633,Gene Expression
AC007325.1,ENSG00000276017,Gene Expression
AC007325.4,ENSG00000278817,Gene Expression


In [12]:
# Select the 'normalized' layer as the expression matrix to export (nothing is written yet).
# NOTE(review): csr_matrix and find are not used in the cells shown here — confirm before removing.
from scipy.sparse import csr_matrix, find
# presumably a scipy sparse matrix of shape (cells, genes) — TODO confirm
A = adata.layers['normalized']

In [27]:
#get non-zero elements
# Take row indices, column indices and values from the SAME COO view so the
# three arrays always line up. (A.nonzero() filters out explicitly stored
# zeros while A.data keeps them, so pairing those two can yield arrays of
# different lengths.) Requires A to be a scipy sparse matrix.
coo = A.tocoo()
stored_nonzero = coo.data != 0
rows = coo.row[stored_nonzero]
cols = coo.col[stored_nonzero]
values = coo.data[stored_nonzero]

#organize in dataframe
# Vectorised label lookup: index the label arrays with the integer positions
# instead of looping in Python over every non-zero entry.
df = pd.DataFrame({
    'Row': adata.obs.index.to_numpy()[rows],
    'Column': adata.var.index.to_numpy()[cols],
    'Value': values
})

In [29]:
# Write the long-format (Row, Column, Value) table as a tab-separated file, without the DataFrame index.
df.to_csv('Shiao_2024_norm_expr.tsv', sep='\t', index=False)

In [30]:
# Upload the TSV written above to Synapse under parent 'syn55271493'.
# Uses the Synapse session `syn` opened in the data-loading cell.
file_entity = synapseclient.File("Shiao_2024_norm_expr.tsv", parent = "syn55271493")
syn.store(file_entity)


##################################################
 Uploading file to Synapse storage 
##################################################



File(modifiedOn='2024-05-06T22:35:09.504Z', createdOn='2024-05-06T22:35:09.358Z', path='Shiao_2024_norm_expr.tsv', files=['Shiao_2024_norm_expr.tsv'], modifiedBy='3398555', versionLabel='1', synapseStore=True, isLatestVersion=True, dataFileHandleId='140385388', parentId='syn55271493', etag='689a90cf-5cca-4c31-9e51-91d9f4574373', createdBy='3398555', concreteType='org.sagebionetworks.repo.model.FileEntity', cacheDir='', name='Shiao_2024_norm_expr.tsv', versionNumber=1, _file_handle={'id': '140385388', 'etag': 'b51cc4ed-3995-4f16-82d6-0d22e6f1fa4b', 'createdBy': '3398555', 'createdOn': '2024-05-06T22:35:09.000Z', 'modifiedOn': '2024-05-06T22:35:09.000Z', 'concreteType': 'org.sagebionetworks.repo.model.file.S3FileHandle', 'contentType': 'text/tab-separated-values', 'contentMd5': '50609196f5b35213d46c652f8012b61f', 'fileName': 'Shiao_2024_norm_expr.tsv', 'storageLocationId': 1, 'contentSize': 19921242607, 'status': 'AVAILABLE', 'bucketName': 'proddata.sagebase.org', 'key': '3398555/7642cd0

In [1]:
#getting the list of immunomodulators from CRI iAtlas
# Requires the Synapse session `syn` opened in the data-loading cell; running this
# cell on a fresh kernel without that cell fails because `syn` is undefined.
entity = syn.get('syn59016496')
# Parsed with pandas' CSV defaults; the "hgnc" column of this table is used later to match genes.
immunomodulators = pd.read_csv(entity.path)

immunomodulators

NameError: name 'syn' is not defined

In [19]:
# Genes that are both measured in the dataset (adata.var index) and listed
# in the immunomodulator table's "hgnc" column
var_names = adata.var.index
is_immunomodulator = var_names.isin(immunomodulators["hgnc"])
genes = pd.Series(var_names[is_immunomodulator])
genes

0     TNFRSF18
1      TNFRSF4
2     TNFRSF14
3      TNFRSF9
4        VTCN1
        ...   
70        CD40
71      ICOSLG
72       ITGB2
73     ADORA2A
74      CD40LG
Length: 75, dtype: object

In [20]:
# Number of cells (rows of adata.obs) carrying each 'celltype' label.
# Used later as the denominator for the fraction of cells expressing a gene.
freq_cell_types = adata.obs["celltype"].value_counts()
freq_cell_types

Tcell       188098
myeloid     110168
Bcell        42841
mastcell      1642
Name: celltype, dtype: int64

In [21]:
#create grid with all cell x gene combinations
# Built with a nested comprehension (cell type varies slowest, gene fastest)
# rather than pd.core.reshape.util.cartesian_product: that helper lives in
# pandas' private `core` namespace and carries no stability guarantee.
cell_gene = pd.DataFrame(
    [(cell, gene) for cell in freq_cell_types.index for gene in genes],
    columns=["cell", "gene"],
)
cell_gene

Unnamed: 0,cell,gene
0,Tcell,TNFRSF18
1,Tcell,TNFRSF4
2,Tcell,TNFRSF14
3,Tcell,TNFRSF9
4,Tcell,VTCN1
...,...,...
70,mastcell,CD40
71,mastcell,ICOSLG
72,mastcell,ITGB2
73,mastcell,ADORA2A


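We need to compute, for each cell type × gene pair:
- the fraction of cells of a given type that express the gene (number of cells of that type with non-zero expression / total number of cells of that type)
- the mean expression value among the cells that express the gene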

In [23]:
def get_expr_by_cell(cell_type, gene, data=None, layer="normalized"):
    """Summarise the expression of one gene within one cell type.

    Parameters
    ----------
    cell_type
        Label compared against ``data.obs["celltype"]`` to select cells.
    gene
        Gene name, looked up in ``data.var_names``.
    data : AnnData-like, optional
        Object exposing ``obs``, ``var_names`` and ``layers``. Defaults to the
        notebook-level ``adata``.
    layer : str, optional
        Key in ``data.layers`` holding the expression values.

    Returns
    -------
    tuple
        ``(counts, avg)``: ``counts`` is the number of selected cells with a
        non-zero value for ``gene``; ``avg`` is the mean of those non-zero
        values, or 0 when no selected cell expresses the gene.
    """
    if data is None:
        data = adata  # fall back to the AnnData loaded earlier in the notebook

    in_type = (data.obs["celltype"] == cell_type).to_numpy()

    # Slice the single gene column before densifying, so only one vector of
    # length n_cells is materialised instead of a dense cells x all-genes table.
    gene_col = data.layers[layer][:, data.var_names.get_loc(gene)]
    if hasattr(gene_col, "toarray"):  # scipy sparse -> dense ndarray
        gene_col = gene_col.toarray()
    expr = np.asarray(gene_col).ravel()[in_type]

    expressed = expr[expr != 0]
    counts = int(expressed.size)
    # 0 is a placeholder, not a true mean: with no expressing cells the mean is
    # undefined, but plotting libraries crash on NaN or string values.
    avg = float(expressed.mean(dtype=np.float64)) if counts > 0 else 0
    return counts, avg

In [24]:
# Row-wise adapter used with DataFrame.apply(axis=1)
def apply_function(row):
    """Look up expression stats for the 'cell' / 'gene' pair stored in `row`.

    Returns a Series labelled 'counts' and 'avg' holding the two values
    returned by get_expr_by_cell.
    """
    n_expressing, mean_expr = get_expr_by_cell(row['cell'], row['gene'])
    return pd.Series([n_expressing, mean_expr], index=['counts', 'avg'])


In [25]:
# Compute counts/avg for every (cell, gene) row and place them next to the grid columns
result = pd.concat(
    [cell_gene, cell_gene.apply(apply_function, axis="columns")],
    axis="columns",
)
result.head(10)

Unnamed: 0,cell,gene,counts,avg
0,Tcell,TNFRSF18,51316.0,1.998775
1,Tcell,TNFRSF4,40241.0,1.986621
2,Tcell,TNFRSF14,57782.0,1.612135
3,Tcell,TNFRSF9,20166.0,1.55538
4,Tcell,VTCN1,293.0,1.691238
5,Tcell,SLAMF7,11676.0,1.415022
6,Tcell,SELP,798.0,1.298185
7,Tcell,TNFSF4,6237.0,1.475875
8,Tcell,IL10,3136.0,1.695357
9,Tcell,IL1A,242.0,1.373145


In [28]:
#Now we compute the % of cells from a given cell type that have expression for a gene
# Attach the number of cells per type as a 'celltype' column so the division
# below works on a fresh kernel. The Series is renamed explicitly because
# value_counts() names its result 'count' on pandas >= 2.0. Columns left over
# from a previous run are dropped first so this cell can be re-executed.
result = pd.merge(
    result.drop(columns=['celltype', 'Freq', 'perc_expr'], errors='ignore'),
    freq_cell_types.rename('celltype'),
    left_on='cell',
    right_index=True,
    how='left',  # keep every (cell, gene) row in its original order
)
# Stored as a fraction in [0, 1], not multiplied by 100
result['perc_expr'] = result['counts'] / result['celltype']
result.iloc[0:10]

Unnamed: 0,cell,gene,counts,avg,celltype,perc_expr
0,Tcell,TNFRSF18,51316.0,1.998775,188098,0.272815
1,Tcell,TNFRSF4,40241.0,1.986621,188098,0.213936
2,Tcell,TNFRSF14,57782.0,1.612135,188098,0.307191
3,Tcell,TNFRSF9,20166.0,1.55538,188098,0.10721
4,Tcell,VTCN1,293.0,1.691238,188098,0.001558
5,Tcell,SLAMF7,11676.0,1.415022,188098,0.062074
6,Tcell,SELP,798.0,1.298185,188098,0.004242
7,Tcell,TNFSF4,6237.0,1.475875,188098,0.033158
8,Tcell,IL10,3136.0,1.695357,188098,0.016672
9,Tcell,IL1A,242.0,1.373145,188098,0.001287


In [29]:
# Rename the per-type cell count column by name. Overwriting every label
# positionally would silently mislabel data if the column order or count
# ever changed upstream.
result = result.rename(columns={"celltype": "Freq"})
result.iloc[0:10]

Unnamed: 0,cell,gene,counts,avg,Freq,perc_expr
0,Tcell,TNFRSF18,51316.0,1.998775,188098,0.272815
1,Tcell,TNFRSF4,40241.0,1.986621,188098,0.213936
2,Tcell,TNFRSF14,57782.0,1.612135,188098,0.307191
3,Tcell,TNFRSF9,20166.0,1.55538,188098,0.10721
4,Tcell,VTCN1,293.0,1.691238,188098,0.001558
5,Tcell,SLAMF7,11676.0,1.415022,188098,0.062074
6,Tcell,SELP,798.0,1.298185,188098,0.004242
7,Tcell,TNFSF4,6237.0,1.475875,188098,0.033158
8,Tcell,IL10,3136.0,1.695357,188098,0.016672
9,Tcell,IL1A,242.0,1.373145,188098,0.001287


In [30]:
# Translate the Shiao cell type labels into the names used in iAtlas
iatlas_cell_names = {
    'Tcell': 'T cell',
    'myeloid': 'myeloid cell',
    'Bcell': 'B cell',
    'mastcell': 'mast cell',
}
result['cell'] = result['cell'].replace(iatlas_cell_names)

In [31]:
# Tag every row with its source dataset, then write the table as a TSV without the index
result = result.assign(dataset="Shiao_2024")
result.to_csv('Shiao_2024_bubble_plot_df.tsv', sep='\t', index=False)