In [1]:
# Computes cell type programs; works for both the Healthy and the Lesion datasets
import os
import sys
from collections import Counter

import numpy as np
import pandas as pd
import scanpy as sc

In [2]:
def write_celltypeprogram_matrix(adata, filedir, filename, celltypelabel):
    """Write the differential-expression results of one label as gene x group CSV tables.

    Reads ``adata.uns[celltypelabel + '_DE']`` and uses its structured arrays
    ``names``, ``pvals_adj``, ``logfoldchanges`` and ``scores`` (one field per
    cell group). Three files are written into ``filedir``:
    ``<filename>_pval.csv``, ``<filename>_logfold.csv`` and
    ``<filename>_score.csv``. Each has one row per unique entry of
    ``adata.var_names`` (in ``var_names`` order) and one column per cell group
    (in the field order of the structured arrays).

    Parameters
    ----------
    adata
        Object exposing ``var_names``, ``obs.columns`` and ``uns``.
    filedir
        Output directory; created if it does not exist yet.
    filename
        Prefix of the three CSV file names.
    celltypelabel
        Label whose ``<celltypelabel>_DE`` entry in ``adata.uns`` is exported.

    Raises
    ------
    KeyError
        If ``adata.uns`` has no ``<celltypelabel>_DE`` entry or one of the
        four arrays is missing from it.
    """
    # Keep the var_names order (first occurrence wins for duplicates) so the
    # row order of the CSVs is reproducible; a set() would make it arbitrary.
    genes = list(dict.fromkeys(adata.var_names))
    gene2idx = {gene: i for i, gene in enumerate(genes)}

    pvalmtxs, logfoldmtxs, scoremtxs = [], [], []

    ctlabels = [celltypelabel]
    print(adata.obs.columns)
    print(ctlabels)
    for ctlabel in ctlabels:
        de = adata.uns[ctlabel + '_DE']
        # dtype.names is the ordered tuple of field (= cell group) names.
        cellsubsets = list(de['names'].dtype.names)

        # Genes that never appear in the ranking of a group keep the value 0
        # in all three matrices.
        # NOTE(review): this means an untested gene gets an adjusted p-value
        # of 0 -- confirm that downstream consumers expect this.
        shape = (len(genes), len(cellsubsets))
        pvalmtx = np.zeros(shape)
        logfoldmtx = np.zeros(shape)
        scoremtx = np.zeros(shape)

        # Fill one column (cell group) at a time.
        for col, cell_subset in enumerate(cellsubsets):
            # Row index of every ranked gene; -1 marks genes not in var_names.
            rows = np.fromiter(
                (gene2idx.get(gene, -1) for gene in de['names'][cell_subset]),
                dtype=np.intp,
            )
            known = rows >= 0
            target = rows[known]
            pvalmtx[target, col] = np.asarray(de['pvals_adj'][cell_subset])[known]
            logfoldmtx[target, col] = np.asarray(de['logfoldchanges'][cell_subset])[known]
            scoremtx[target, col] = np.asarray(de['scores'][cell_subset])[known]

        # transform matrix to dataframe
        pvalmtxs.append(pd.DataFrame(pvalmtx, index=genes, columns=cellsubsets))
        logfoldmtxs.append(pd.DataFrame(logfoldmtx, index=genes, columns=cellsubsets))
        scoremtxs.append(pd.DataFrame(scoremtx, index=genes, columns=cellsubsets))
    pvalmtxs = pd.concat(pvalmtxs, axis=1)
    logfoldmtxs = pd.concat(logfoldmtxs, axis=1)
    scoremtxs = pd.concat(scoremtxs, axis=1)

    # write matrix to file (create the output directory first so the export
    # does not fail on a missing folder)
    if filedir:
        os.makedirs(filedir, exist_ok=True)
    pvalmtxs.to_csv(os.path.join(filedir, "%s_pval.csv" % filename))
    logfoldmtxs.to_csv(os.path.join(filedir, "%s_logfold.csv" % filename))
    scoremtxs.to_csv(os.path.join(filedir, "%s_score.csv" % filename))

In [3]:
def compute_celltype_programs(filename, tissue, sampleid, celltypelabel,
                              outdir=None, min_cells=10):
    """Rank genes per cell group (Wilcoxon) and export the results as CSV tables.

    Loads ``filename`` with ``sc.read``, keeps only cells whose
    ``celltypelabel`` group contains more than ``min_cells`` cells, runs
    ``sc.tl.rank_genes_groups`` for all genes on the kept cells and passes the
    result to ``write_celltypeprogram_matrix``.

    Parameters
    ----------
    filename
        Path of the input file handed to ``sc.read``.
    tissue
        Prefix of the CSV file names written by
        ``write_celltypeprogram_matrix``.
    sampleid
        Not used by this function; kept so existing calls keep working.
    celltypelabel
        Name of the ``obs`` column that defines the cell groups.
    outdir
        Output directory. When omitted, the module-level variable ``filedir``
        is used (the original behaviour of this notebook).
    min_cells
        Groups must contain strictly more than this many cells to be ranked.

    Raises
    ------
    NameError
        If ``outdir`` is not given and no global ``filedir`` is defined.
    """
    if outdir is None:
        # Resolve the legacy global up front, so a missing output directory is
        # reported before the expensive ranking rather than after it.
        try:
            outdir = filedir
        except NameError:
            raise NameError(
                "no output directory: pass outdir=... or define the global 'filedir'"
            ) from None

    tissueadata = sc.read(filename)
    print(celltypelabel)

    counts_key = celltypelabel + '_counts'
    de_key = celltypelabel + '_DE'

    # Annotate every cell with the size of its group, then drop small groups.
    counts = Counter(tissueadata.obs[celltypelabel])
    tissueadata.obs[counts_key] = [counts[ct] for ct in tissueadata.obs[celltypelabel]]
    adata = tissueadata[tissueadata.obs[counts_key] > min_cells].copy()

    # n_genes = number of variables, so every gene is ranked in every group.
    # use_raw=False asks scanpy to test on adata.X instead of adata.raw.
    sc.tl.rank_genes_groups(adata,
                            celltypelabel,
                            key_added=de_key,
                            use_raw=False,
                            method='wilcoxon',
                            n_genes=adata.shape[1])

    # Attach the result to the unfiltered object, which is what gets exported.
    tissueadata.uns[de_key] = adata.uns[de_key]
    write_celltypeprogram_matrix(tissueadata,
                                 outdir,
                                 tissue,
                                 celltypelabel)

In [5]:
# Check the input data:
# 1. Make sure cell_type and the other values are strings, not integers
# 2. Check adata.X and adata.raw.X. Make sure adata.X is the log-normalized data

In [4]:
# Load the AD lesion file and display its per-cell annotation table (obs)
# so the cell-type label columns can be inspected.
adata_test = sc.read("/projects/abv/users/tangfx1/scRNAseq_data/hca_skin_portal/HCA_AD_Lesion_level1.h5ad")
adata_test.obs 

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,sample_id,mad_prd,Status,Site,Tissue,Enrichment,Location,Sex,Age,stage,full_clustering,patient_ID,celltype_level1,celltype_level2
AAACCTGAGAGGGCTT-1-SKN8090528,SeuratProject,2210.0,967,SKN8090528,0,Eczema,lesion,Epidermis,CD45P,Lower_back,Female,62.0,adult,Tc,E2,T_cells,Tc
AAACGGGAGAAGGACA-1-SKN8090528,SeuratProject,2637.0,1080,SKN8090528,0,Eczema,lesion,Epidermis,CD45P,Lower_back,Female,62.0,adult,Tc,E2,T_cells,Tc
AAACGGGTCAATCACG-1-SKN8090528,SeuratProject,1731.0,787,SKN8090528,0,Eczema,lesion,Epidermis,CD45P,Lower_back,Female,62.0,adult,Tc,E2,T_cells,Tc
AAAGATGCACATGGGA-1-SKN8090528,SeuratProject,2885.0,1026,SKN8090528,0,Eczema,lesion,Epidermis,CD45P,Lower_back,Female,62.0,adult,moDC_1,E2,Dendritic_cells,moDC_1
AAAGATGCACCGAATT-1-SKN8090528,SeuratProject,1563.0,752,SKN8090528,0,Eczema,lesion,Epidermis,CD45P,Lower_back,Female,62.0,adult,Tc,E2,T_cells,Tc
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGCGCTCCCTAATT-1-SKN8090565,SeuratProject,4918.0,1438,SKN8090565,0,Eczema,lesion,Dermis,Live_single,Lower_back,Male,20.0,adult,F3,E1,Fibroblasts,Fibroblast_3
TTTGGTTGTAAATACG-1-SKN8090565,SeuratProject,3026.0,1160,SKN8090565,0,Eczema,lesion,Dermis,Live_single,Lower_back,Male,20.0,adult,Schwann_1,E1,Schwann_cells,Schwann_1
TTTGTCAAGGGTCGAT-1-SKN8090565,SeuratProject,1941.0,680,SKN8090565,0,Eczema,lesion,Dermis,Live_single,Lower_back,Male,20.0,adult,Undifferentiated_KC,E1,Keratinocytes,Undifferentiated_KC
TTTGTCACAATGGTCT-1-SKN8090565,SeuratProject,3713.0,1455,SKN8090565,0,Eczema,lesion,Dermis,Live_single,Lower_back,Male,20.0,adult,VE2,E1,Vascular_endothelium,Vascular_endothelium_2


In [5]:
# Print the main matrix (adata_test.X) to inspect the values it stores.
print(adata_test.X)

  (0, 27)	2.3075501223766746
  (0, 55)	1.709262772592952
  (0, 89)	1.709262772592952
  (0, 154)	2.679284447813446
  (0, 201)	1.709262772592952
  (0, 361)	1.709262772592952
  (0, 493)	3.6162844336343807
  (0, 503)	1.709262772592952
  (0, 526)	2.3075501223766746
  (0, 531)	2.3075501223766746
  (0, 546)	1.709262772592952
  (0, 559)	2.3075501223766746
  (0, 618)	1.709262772592952
  (0, 682)	1.709262772592952
  (0, 683)	1.709262772592952
  (0, 733)	1.709262772592952
  (0, 741)	1.709262772592952
  (0, 752)	1.709262772592952
  (0, 761)	1.709262772592952
  (0, 771)	1.709262772592952
  (0, 918)	3.3375232429972668
  (0, 935)	2.3075501223766746
  (0, 953)	2.3075501223766746
  (0, 977)	2.3075501223766746
  (0, 1019)	1.709262772592952
  :	:
  (63511, 33070)	1.3692884059981618
  (63511, 33079)	1.3692884059981618
  (63511, 33131)	1.3692884059981618
  (63511, 33207)	1.926450990207394
  (63511, 33243)	1.3692884059981618
  (63511, 33253)	1.3692884059981618
  (63511, 33257)	2.2821429650732394
  (63511, 3

In [6]:
# Print the raw matrix (adata_test.raw.X) for comparison with adata_test.X.
print(adata_test.raw.X)

  (0, 27)	2.0
  (0, 55)	1.0
  (0, 89)	1.0
  (0, 154)	3.0
  (0, 201)	1.0
  (0, 361)	1.0
  (0, 493)	8.0
  (0, 503)	1.0
  (0, 526)	2.0
  (0, 531)	2.0
  (0, 546)	1.0
  (0, 559)	2.0
  (0, 618)	1.0
  (0, 682)	1.0
  (0, 683)	1.0
  (0, 733)	1.0
  (0, 741)	1.0
  (0, 752)	1.0
  (0, 761)	1.0
  (0, 771)	1.0
  (0, 918)	6.0
  (0, 935)	2.0
  (0, 953)	2.0
  (0, 977)	2.0
  (0, 1019)	1.0
  :	:
  (63511, 33070)	1.0
  (63511, 33079)	1.0
  (63511, 33131)	1.0
  (63511, 33207)	2.0
  (63511, 33243)	1.0
  (63511, 33253)	1.0
  (63511, 33257)	3.0
  (63511, 33282)	4.0
  (63511, 33326)	1.0
  (63511, 33381)	1.0
  (63511, 33396)	1.0
  (63511, 33446)	1.0
  (63511, 33474)	4.0
  (63511, 33479)	1.0
  (63511, 33496)	1.0
  (63511, 33497)	5.0
  (63511, 33498)	11.0
  (63511, 33499)	5.0
  (63511, 33500)	2.0
  (63511, 33501)	2.0
  (63511, 33502)	3.0
  (63511, 33503)	6.0
  (63511, 33504)	1.0
  (63511, 33505)	2.0
  (63511, 33508)	3.0


In [7]:
# Display the AnnData summary (dimensions and annotation fields).
adata_test

AnnData object with n_obs × n_vars = 63512 × 33538
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'sample_id', 'mad_prd', 'Status', 'Site', 'Tissue', 'Enrichment', 'Location', 'Sex', 'Age', 'stage', 'full_clustering', 'patient_ID', 'celltype_level1', 'celltype_level2'
    var: 'features'

In [8]:
# Compute the cell type programs: AD Lesion, 14 clusters.
# Call signature: compute_celltype_programs(filename, tissue, sampleid, celltypelabel)

# Output directory (compute_celltype_programs reads this global).
filedir = '/projects/abv/users/tangfx1/scRNAseq_data/hca_skin_portal/AD_HCA_scanpy_output/14_clusters_AD_Lesion_CT/'

# Input file name.
filename = 'HCA_AD_Lesion_level1.h5ad'

compute_celltype_programs(
    filename=filename,
    tissue='skin',
    sampleid='sampleid',
    celltypelabel='celltype_level1',  # level1 -> 14 clusters, level2 -> 41 clusters
)


celltype_level1


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'orig.ident' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sample_id' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Status' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Site' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Tissue' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Enrichment' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Location' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Sex' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'stage' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'full_clustering' as categorical
  c.

Index(['orig.ident', 'nCount_RNA', 'nFeature_RNA', 'sample_id', 'mad_prd',
       'Status', 'Site', 'Tissue', 'Enrichment', 'Location', 'Sex', 'Age',
       'stage', 'full_clustering', 'patient_ID', 'celltype_level1',
       'celltype_level2', 'celltype_level1_counts'],
      dtype='object')
['celltype_level1']


In [9]:
# Compute the cell type programs: HC, 14 clusters.

# Output directory (compute_celltype_programs reads this global).
filedir = '/projects/abv/users/tangfx1/scRNAseq_data/hca_skin_portal/AD_HCA_scanpy_output/14_clusters_HC_CT/'

# Input file name.
filename = 'HCA_Healthy_level1.h5ad'

compute_celltype_programs(
    filename=filename,
    tissue='skin',
    sampleid='sampleid',
    celltypelabel='celltype_level1',  # level1 -> 14 clusters, level2 -> 41 clusters
)


celltype_level1


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'orig.ident' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sample_id' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Status' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Site' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Tissue' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Enrichment' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Location' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Sex' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'stage' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'full_clustering' as categorical
  c.

Index(['orig.ident', 'nCount_RNA', 'nFeature_RNA', 'sample_id', 'mad_prd',
       'Status', 'Site', 'Tissue', 'Enrichment', 'Location', 'Sex', 'Age',
       'stage', 'full_clustering', 'patient_ID', 'celltype_level1',
       'celltype_level2', 'celltype_level1_counts'],
      dtype='object')
['celltype_level1']


In [10]:
# Compute the cell type programs: AD Lesion, 41 clusters.

# Output directory (compute_celltype_programs reads this global).
filedir = '/projects/abv/users/tangfx1/scRNAseq_data/hca_skin_portal/AD_HCA_scanpy_output/41_clusters_AD_Lesion_CT/'

# Input file name.
filename = 'HCA_AD_Lesion_level1.h5ad'

compute_celltype_programs(
    filename=filename,
    tissue='skin',
    sampleid='sampleid',
    celltypelabel='celltype_level2',  # level1 -> 14 clusters, level2 -> 41 clusters
)


celltype_level2


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'orig.ident' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sample_id' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Status' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Site' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Tissue' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Enrichment' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Location' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Sex' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'stage' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'full_clustering' as categorical
  c.

Index(['orig.ident', 'nCount_RNA', 'nFeature_RNA', 'sample_id', 'mad_prd',
       'Status', 'Site', 'Tissue', 'Enrichment', 'Location', 'Sex', 'Age',
       'stage', 'full_clustering', 'patient_ID', 'celltype_level1',
       'celltype_level2', 'celltype_level2_counts'],
      dtype='object')
['celltype_level2']


In [11]:
# Compute the cell type programs: HC, 41 clusters.

# Output directory (compute_celltype_programs reads this global).
filedir = '/projects/abv/users/tangfx1/scRNAseq_data/hca_skin_portal/AD_HCA_scanpy_output/41_clusters_HC_CT/'

# Input file name.
filename = 'HCA_Healthy_level1.h5ad'

compute_celltype_programs(
    filename=filename,
    tissue='skin',
    sampleid='sampleid',
    celltypelabel='celltype_level2',  # level1 -> 14 clusters, level2 -> 41 clusters
)


celltype_level2


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'orig.ident' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sample_id' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Status' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Site' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Tissue' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Enrichment' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Location' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Sex' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'stage' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'full_clustering' as categorical
  c.

Index(['orig.ident', 'nCount_RNA', 'nFeature_RNA', 'sample_id', 'mad_prd',
       'Status', 'Site', 'Tissue', 'Enrichment', 'Location', 'Sex', 'Age',
       'stage', 'full_clustering', 'patient_ID', 'celltype_level1',
       'celltype_level2', 'celltype_level2_counts'],
      dtype='object')
['celltype_level2']
