In [1]:
import scanpy as sc
import numpy as np
import pandas as pd
import synapseclient
import mygene
from scipy.sparse import csr_matrix

# Krishna 2021 dataset
Source: https://trace.ncbi.nlm.nih.gov/Traces/index.html?view=analysis&acc=SRZ190804

Stored at: https://www.synapse.org/#!Synapse:syn54861463

In [2]:
syn = synapseclient.Synapse()
syn.login()

Welcome, heimann!



## Loading metadata

In [3]:
#Load metadata

#metadata extracted from Seurat file
entity = syn.get('syn56438105')
print(entity)
metadata = pd.read_csv(entity.path)

metadata.columns.values

File: krishna2021_metadata.csv (syn56438105)
  md5=52ffadd67e319039566e07ceeef9580a
  fileSize=30794830
  contentType=text/csv
  externalURL=None
  cacheDir=/Users/heimann/.synapseCache/11/137425011
  files=['krishna2021_metadata.csv']
  path=/Users/heimann/.synapseCache/11/137425011/krishna2021_metadata.csv
  synapseStore=True
properties:
  concreteType=org.sagebionetworks.repo.model.FileEntity
  createdBy=3398555
  createdOn=2024-04-11T17:07:40.501Z
  dataFileHandleId=137425011
  etag=595d4d92-b577-4a57-b8c8-43fc3291ead7
  id=syn56438105
  isLatestVersion=True
  modifiedBy=3398555
  modifiedOn=2024-04-11T17:07:40.501Z
  name=krishna2021_metadata.csv
  parentId=syn54858333
  versionLabel=1
  versionNumber=1
annotations:



array(['Unnamed: 0', 'orig.ident', 'res.0.2', 'res.0.3', 'res.0.4',
       'res.0.5', 'res.0.6', 'res.0.7', 'res.0.8', 'res.0.9', 'cell',
       'region', 'type', 'Sample', 'Sample2', 'b1', 'b2', 'b3', 'b4',
       'b5', 'b6', 'cluster', 'nCount_RNA', 'nFeature_RNA'], dtype=object)

In [4]:
# Some annotation in obs seem to be clinical annotation
print("region", metadata["region"].unique())
print("type", metadata["type"].unique())
print("Sample", metadata["Sample"].unique())
print("Sample2", metadata["Sample2"].unique())
print("cluster", metadata["cluster"].unique())

region ['Center' 'Upper' 'Lower' 'Far' 'Near' 'Normal' 'SupraLateral'
 'LowerLateral' 'LowerMedial' 'PBMC' 'LymphNode' 'Lateral' 'Medial']
type ['Tumor' 'Normal' 'PBMC' 'LymphNode']
Sample ['UT1' 'UT2' 't1' 't2' 't3' 't4']
Sample2 ['UT1_Center' 'UT1_Upper' 'UT1_Lower' 'UT2_Center' 'UT2_Far' 'UT2_Near'
 'UT2_Normal' 'UT1_Normal' 't1_SupraLateral' 't1_LowerLateral'
 't1_LowerMedial' 't1_Normal' 't1_PBMC' 't2_Center' 't2_Far' 't2_Near'
 't2_LymphNode' 't2_Normal' 't2_PBMC' 't3_Center' 't3_Far' 't3_Near'
 't3_Normal' 't3_PBMC' 't4_Center' 't4_Lateral' 't4_Medial' 't4_Normal'
 't4_PBMC']
cluster [ 4 15 19 10  2 11 17  7 18  9  1  0 14  5 20  3 13 12 24  8 21 32 23 16
 31 30 22  6 25 27 33 28 34 26 29]


In [7]:
#The authors also made available a file with cell annotations
entity = syn.get('syn59205039')
cell_annot = pd.read_csv(entity.path, sep='\t')
cell_annot.head()

Unnamed: 0,cell,type,region,Sample,Sample2,cluster,cluster_name,UMAP1,UMAP2,Sample_name
0,AAACCTGAGCGTTGCC-1_UT1_Center,Tumor,Center,UT1,UT1_Center,4,CD8A+ NK-like,5.896925,-4.373115,Untreated 1
1,AAACCTGAGCTGCCCA-1_UT1_Center,Tumor,Center,UT1,UT1_Center,15,CD8A+ Proliferating,3.004102,1.83449,Untreated 1
2,AAACCTGAGTGGACGT-1_UT1_Center,Tumor,Center,UT1,UT1_Center,19,CD8A+ Exhausted IEG,1.148642,1.801541,Untreated 1
3,AAACCTGCAGGTCGTC-1_UT1_Center,Tumor,Center,UT1,UT1_Center,10,Conventional NK,3.64934,-8.709116,Untreated 1
4,AAACCTGCAGGTCTCG-1_UT1_Center,Tumor,Center,UT1,UT1_Center,2,CD8A+ Tissue-resident,4.961056,-0.395439,Untreated 1


In [8]:
#Inspect more closely unique values for each column
pd.unique(cell_annot["cluster_name"])

array(['CD8A+ NK-like', 'CD8A+ Proliferating', 'CD8A+ Exhausted IEG',
       'Conventional NK', 'CD8A+ Tissue-resident',
       'CD45- Vascular Endothelium', 'NK HSP+', 'CD4+ Activated IEG',
       'TAM/TCR (Ambiguos)', 'CD45- PAX8+ renal epithelium', 'TAM HLAhi',
       'CD8A+ Exhausted', 'CD14+ Monocyte', 'CD14+/CD16+ Monocyte',
       'CD45- Myofibroblast', 'CD4+ Naive', 'cDC2', 'TAM HLAint',
       'Ambiguous', 'Mast', 'CD4+ Proliferating', 'CD45- ccRCC CA9+',
       'CD4+ Effector', 'TAM ISGint', 'B cell', 'CD4+ Treg', 'TAM ISGhi',
       'cDC1', 'Megakaryocyte', 'pDC', 'Ambiguous/Dead'], dtype=object)

# QC

According to the study manuscript, the following filtering was performed:

*the count matrix of cell barcodes by genes used for downstream analysis was generated using the Cell Ranger count function with parameter –expect-cells = 3000. The raw count matrix for each of the 29 samples was obtained from the Cell Ranger count filter_matrix output. Briefly, for each sample, Cell Ranger plots the total UMI count against the cell barcode rank in decreasing order of total counts and filters cell barcodes out of the resulting count matrix based on the inflection point of the plot. This step minimizes the number of empty droplets that are included in downstream analyses. We created an initial count matrix combining all 29 samples from all patients. Following count matrix generation, cells with >20% of transcripts derived from mitochondrial genes were considered apoptotic and were thus excluded. Following this step, all mitochondrial genes were filtered out of the count matrix. Ribosomal genes and the noncoding RNAs NEAT1 and MALAT1 were excluded due to prior reports of strong influence on downstream clustering (Freytag et al., 2018). Genes with mean raw count <3.0 were removed from
the analysis, resulting in a count matrix of 167283 cells and 16,323 genes for downstream analysis.*

However, there is no record of these metrics in the object. We only know that the file wennt through preprocessing because the number of cells and genes match the described in the manuscript. We will not make any additional filtering at this moment.

# Building the counts matrix

We have the counts stored in two files. We will need to:

- load
- change the transcripts ids to gene ids
- exclude genes not required in the GSEA step (optional)
- convert to matrix format
- store it as an anndata file, with patient ids and cell type clusters in obs

In [9]:
#loading files
entity = syn.get('syn56559704')
entity
k1 = pd.read_feather(entity.path)

entity = syn.get('syn56875708')
entity
k2 = pd.read_feather(entity.path)

KeyboardInterrupt: 

In [21]:
k2.iloc[0:10]

Unnamed: 0,row,col,x
0,ENSG00000071073,CCGGGATCACCAGTTA-1_t2_Center,1.0
1,ENSG00000196872,CCGGGATCACCAGTTA-1_t2_Center,1.0
2,ENSG00000158417,CCGGGATCACCAGTTA-1_t2_Center,2.0
3,ENSG00000115539,CCGGGATCACCAGTTA-1_t2_Center,1.0
4,ENSG00000163162,CCGGGATCACCAGTTA-1_t2_Center,7.0
5,ENSG00000135972,CCGGGATCACCAGTTA-1_t2_Center,1.0
6,ENSG00000169756,CCGGGATCACCAGTTA-1_t2_Center,3.0
7,ENSG00000153201,CCGGGATCACCAGTTA-1_t2_Center,1.0
8,ENSG00000224959,CCGGGATCACCAGTTA-1_t2_Center,1.0
9,ENSG00000188177,CCGGGATCACCAGTTA-1_t2_Center,2.0


In [7]:
#getting unique transcript ids
transcripts = pd.concat([k1["row"], k2["row"]]).unique()
transcripts.shape

(16323,)

In [8]:
#getting gene symbols from transcript ids
mg = mygene.MyGeneInfo()
geneSyms = mg.querymany(transcripts, scopes='ensembl.gene', fields='symbol', species='human', returnall = False)

1 input query terms found dup hits:	[('ENSG00000249738', 2)]
350 input query terms found no hit:	['ENSG00000231360', 'ENSG00000117289', 'ENSG00000131795', 'ENSG00000136653', 'ENSG00000269028', 'ENS


In [9]:
#Saving data in a dictionary
ensembl_to_genesymbol = {} 
ensembl_without_genesymbol = {} 
for line in geneSyms:
    try: 
        line["symbol"]
    except:
        ensembl_without_genesymbol[line["query"]] = line["query"]
    else: 
        ensembl_to_genesymbol[line["query"]] = line["symbol"]                

In [18]:
#removing from the counts tables the transcripts without gene ids (they are mostly RNA genes)
fk1 = k1[~k1["row"].isin(ensembl_without_genesymbol.keys())]
fk2 = k2[~k2["row"].isin(ensembl_without_genesymbol.keys())]

In [85]:
print(k1.shape)
print(fk1.shape)
print(k2.shape)
print(fk2.shape)

(120000000, 3)
(118559029, 3)
(123448230, 3)
(121979334, 3)


In [19]:
#we're also excluding cells that are labeled as type: 'Ambiguous/Dead', 'TAM/TCR (Ambiguos)', 'Ambiguous'
cell_exclude = cell_annot[cell_annot["cluster_name"].isin(['Ambiguous/Dead', 'TAM/TCR (Ambiguos)', 'Ambiguous'])]["cell"]

fk1 = fk1[~fk1["col"].isin(cell_exclude)]
fk2 = fk2[~fk2["col"].isin(cell_exclude)]

In [90]:
print(fk1.shape)
print(fk2.shape)

(112108495, 3)
(113461358, 3)


In [15]:
#replacing ensembl ids with gene symbol
fk1 = fk1.iloc[0:100].replace(to_replace = ensembl_to_genesymbol)

AttributeError: 'NoneType' object has no attribute 'iloc'

In [17]:
fk1.iloc[0:10]

AttributeError: 'NoneType' object has no attribute 'iloc'

In [None]:
fk2 = fk2.replace(to_replace = ensembl_to_genesymbol)

# Formatting to run pseudobulk

In [33]:
# #change gene id from ensembl to gene symbol (stored in var)
adata.var = adata.var.reset_index().rename(columns={'gene_ids': 'ensembl_gene_id'}
                                          ).set_index('feature_name')

# save raw counts as a layer
adata.layers['counts'] = adata.X

# Normalize and log-transform
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
adata.layers['normalized'] = adata.X




AnnData expects .var.index to contain strings, but got values like:
    ['A1BG', 'A1CF', 'A2M', 'A2ML1', 'A3GALT2']

    Inferred to be: categorical

  value_idx = self._prep_dim_index(value.index, attr)


In [35]:
adata

AnnData object with n_obs × n_vars = 16475 × 25090
    obs: 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'mito_frac', 'RBP_frac', 'batch', 'donor_id', 'treatment', 'procedure', 'author_cell_type', 'cell_type_broad', 'clusters', 'treatment_categorized', 'subtype', 'H_treatment', 'H_subtype', 'cell_type_ontology_term_id', 'tissue_ontology_term_id', 'disease_ontology_term_id', 'organism_ontology_term_id', 'is_primary_data', 'development_stage_ontology_term_id', 'assay_ontology_term_id', 'sex_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'suspension_type', 'HTAN_Biospecimen_ID', 'HTAN_Participant_ID', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage'
    var: 'ensembl_gene_id', 'feature_is_filtered', 'feature_reference', 'feature_biotype'
    uns: 'neighbors', 'schema_version', 'title', 'log1p'
    obsm: 'X_pca', 'X_umap'
    layers: 'log2(X+0.1)', 'normalized', 'counts'
   

In [36]:
adata.write('data/msk_SCLCimmunecells_iatlas.h5ad')

In [29]:
adata.to_df(layer = "counts").to_csv("msk_counts_data.csv")