Computing stats for single cell RNA bubbleplot

In [2]:
import scanpy as sc
import numpy as np
import pandas as pd
import anndata as ad
import synapseclient

In [3]:
syn = synapseclient.Synapse()
syn.login()

Welcome, heimann!



In [27]:
#loading h5ad file after qc, described in 'vanderbilt_data_prep.ipynb'
entity = syn.get('syn61518721')
adata = sc.read_h5ad(entity.path)
adata

AnnData object with n_obs × n_vars = 10696 × 30234
    obs: 'HTAN Parent Data File ID', 'HTAN Specimen ID', 'Cell_Type', 'Tumor_Type', 'Sample_Classification', 'development_stage_ontology_term_id', 'donor_id', 'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id', 'cell_type_ontology_term_id', 'disease_ontology_term_id', 'assay_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'suspension_type', 'tissue_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid'
    var: 'mt'
    uns: 'log1p'
    obsm: 'X_pca', 'X_umap'
    layers: 'counts', 'normalized'

In [5]:
#we'll need to use the R package that queries the iAtlas database, the libraries below are necessary to call R code

#getting the list of immunomodulators from CRI iAtlas
from rpy2.robjects.packages import importr
import rpy2.robjects as ro

iatlasGraphQLClient = importr('iatlasGraphQLClient')
immunomodulators = ro.conversion.rpy2py(iatlasGraphQLClient.query_immunomodulators())
immunomodulators

entrez,hgnc,friendly_name,...,immune_checkpoint,super_category,publications
135,'ADORA2A','ADORA2A',...,'Inhibito...,'Receptor',ListSexpV...
383,'ARG1','ARG1',,'Inhibito...,'Other',ListSexpV...
151888,'BTLA','BTLA',,'Inhibito...,'Receptor',ListSexpV...
11119,'BTN3A1','BTN3A1',,'Stimulat...,'Co-inhib...,ListSexpV...
...,...,...,,...,...,...
7422,'VEGFA','VEGFA',,'Inhibito...,'Ligand',ListSexpV...
7423,'VEGFB','VEGFB',,'Inhibito...,'Ligand',ListSexpV...
64115,'VSIR','VISTA',,'Inhibito...,'Co-inhib...,ListSexpV...
79679,'VTCN1','VTCN1',,'Inhibito...,'Co-inhib...,ListSexpV...


In [18]:
#get a list of the genes that are present in the dataset & are immunomodulators
genes = pd.Series(adata.var[np.isin(adata.var["feature_name"], immunomodulators[2])]["feature_name"]).reset_index(drop=True)
genes

0      ADORA2A
1         ARG1
2         BTLA
3       BTN3A1
4       BTN3A2
5         CD27
6        CD276
7         CD28
8         CD40
9       CD40LG
10        CD70
11        CD80
12       CTLA4
13      CX3CL1
14       CXCL9
15       EDNRB
16      ENTPD1
17        GZMA
18       HLA-A
19       HLA-B
20       HLA-C
21    HLA-DPA1
22    HLA-DPB1
23    HLA-DQA1
24    HLA-DQA2
25    HLA-DQB1
26    HLA-DQB2
27     HLA-DRA
28    HLA-DRB1
29    HLA-DRB5
30       HMGB1
31       ICAM1
32        ICOS
33      ICOSLG
34        IDO1
35       IFNA1
36        IFNG
37        IL10
38        IL1A
39        IL1B
40         IL2
41       IL2RA
42         IL4
43       ITGB2
44     KIR2DL1
45     KIR2DL3
46        LAG3
47        MICA
48        MICB
49        PRF1
50        SELP
51      SLAMF7
52       TGFB1
53       TIGIT
54        TLR4
55         TNF
56    TNFRSF18
57       VEGFA
58       VEGFB
59       VTCN1
Name: feature_name, dtype: category
Categories (29577, object): ['A1BG', 'A1CF', 'A2M', 'A2M-AS1', .

In [15]:
#compute how many counts map to each cell type
freq_cell_types = adata.obs["cell_type"].value_counts()
freq_cell_types

cell_type
T cell              4410
plasma cell         2236
B cell              1228
myeloid cell        1209
fibroblast           713
mast cell            506
endothelial cell     394
Name: count, dtype: int64

In [19]:
#create grid with all cell x gene combinations
lp1, lp2 = pd.core.reshape.util.cartesian_product([freq_cell_types.index, genes])
cell_gene = pd.DataFrame(dict(cell=lp1, gene=lp2))
cell_gene

Unnamed: 0,cell,gene
0,T cell,ADORA2A
1,T cell,ARG1
2,T cell,BTLA
3,T cell,BTN3A1
4,T cell,BTN3A2
...,...,...
55,endothelial cell,TNF
56,endothelial cell,TNFRSF18
57,endothelial cell,VEGFA
58,endothelial cell,VEGFB


We need to compute:
- % of cell of a given type that expresses a gene (count expr by type/freq of cell type)
- mean expr value

In [22]:
def get_expr_by_cell(cell_type, gene):
    expr = adata[adata.obs.cell_type == cell_type].to_df(layer="counts")[gene]
    counts = sum(expr != 0)
    if(counts>0): avg = (expr[expr != 0]).mean()
    else: avg = 0 #technically this is wrong, but plotting libraries crash with NAs or characters
    return counts, avg

In [28]:
# Function to apply to each row
def apply_function(row):
    counts, avg = get_expr_by_cell(row['cell'], row['gene'])  
    return pd.Series({'counts': counts, 'avg': avg})

# Apply the function to each row and concatenate the results
result = pd.concat([cell_gene, cell_gene.apply(apply_function, axis=1)], axis=1)


In [30]:
result.iloc[0:10]

Unnamed: 0,cell,gene,counts,avg
0,T cell,ADORA2A,72.0,1.774461
1,T cell,ARG1,0.0,0.0
2,T cell,BTLA,90.0,1.650936
3,T cell,BTN3A1,632.0,1.793955
4,T cell,BTN3A2,1092.0,1.867897
5,T cell,CD27,2.0,1.615647
6,T cell,CD276,36.0,1.511705
7,T cell,CD28,272.0,1.913442
8,T cell,CD40,30.0,1.581251
9,T cell,CD40LG,170.0,1.771789


In [32]:
#Now we compute the % of cells from a given cell type that have expression for a gene
result = pd.merge(result, freq_cell_types, left_on='cell', right_index=True)
result['perc_expr'] = result['counts'] / result['count']
result.iloc[0:10]

Unnamed: 0,cell,gene,counts,avg,count,perc_expr
0,T cell,ADORA2A,72.0,1.774461,4410,0.016327
1,T cell,ARG1,0.0,0.0,4410,0.0
2,T cell,BTLA,90.0,1.650936,4410,0.020408
3,T cell,BTN3A1,632.0,1.793955,4410,0.143311
4,T cell,BTN3A2,1092.0,1.867897,4410,0.247619
5,T cell,CD27,2.0,1.615647,4410,0.000454
6,T cell,CD276,36.0,1.511705,4410,0.008163
7,T cell,CD28,272.0,1.913442,4410,0.061678
8,T cell,CD40,30.0,1.581251,4410,0.006803
9,T cell,CD40LG,170.0,1.771789,4410,0.038549


In [33]:
#change column names
result.columns = ["cell", "gene", "counts", "avg", "Freq", "perc_expr"]
result.iloc[0:10]

Unnamed: 0,cell,gene,counts,avg,Freq,perc_expr
0,T cell,ADORA2A,72.0,1.774461,4410,0.016327
1,T cell,ARG1,0.0,0.0,4410,0.0
2,T cell,BTLA,90.0,1.650936,4410,0.020408
3,T cell,BTN3A1,632.0,1.793955,4410,0.143311
4,T cell,BTN3A2,1092.0,1.867897,4410,0.247619
5,T cell,CD27,2.0,1.615647,4410,0.000454
6,T cell,CD276,36.0,1.511705,4410,0.008163
7,T cell,CD28,272.0,1.913442,4410,0.061678
8,T cell,CD40,30.0,1.581251,4410,0.006803
9,T cell,CD40LG,170.0,1.771789,4410,0.038549


In [34]:
#Add dataset info and save data into file
result['dataset'] = "Vanderbilt"
result.to_csv('vanderbilt_bubble_plot_df.tsv', sep='\t', index=False)
file_entity = synapseclient.File('vanderbilt_bubble_plot_df.tsv', 'syn61518110')
file_entity = syn.store(file_entity)

Uploading to Synapse storage: 100%|██████████| 30.2k/30.2k [00:00<00:00, 46.2kB/s, vanderbilt_bubble_plot_df.tsv]
