In [1]:
import pandas as pd
import numpy as np
import os
import sys
import scanpy as sc

import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.colors as mcolors
from mpl_toolkits.axes_grid1.inset_locator import inset_axes

In [2]:
# Location of the Tabula Sapiens .h5ad file on the shared project storage.
fpath = "/nfs/turbo/umms-indikar/shared/projects/DGC/data/tabula_sapiens/TabulaSapiens.h5ad"
# Read the file into an AnnData object; `data` is the unfiltered source
# that the subsetting cell below selects from.
data = sc.read_h5ad(fpath)
# Bare expression: show the AnnData summary (dimensions, obs/var columns, layers).
data

AnnData object with n_obs × n_vars = 483152 × 58870
    obs: 'organ_tissue', 'method', 'donor', 'anatomical_information', 'n_counts_UMIs', 'n_genes', 'cell_ontology_class', 'free_annotation', 'manually_annotated', 'compartment', 'gender'
    var: 'gene_symbol', 'feature_type', 'ensemblid', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: '_scvi', '_training_mode', 'compartment_colors', 'dendrogram_cell_type_tissue', 'dendrogram_computational_compartment_assignment', 'dendrogram_consensus_prediction', 'dendrogram_tissue_cell_type', 'donor_colors', 'donor_method_colors', 'hvg', 'method_colors', 'neighbors', 'organ_tissue_colors', 'sex_colors', 'tissue_colors', 'umap'
    obsm: 'X_pca', 'X_scvi', 'X_scvi_umap', 'X_umap'
    layers: 'decontXcounts', 'raw_counts'
    obsp: 'connectivities', 'distances'

In [3]:
# Preview the first rows of the per-cell annotation table (`obs`), which holds
# the `cell_ontology_class` and `organ_tissue` columns used for filtering below.
data.obs.head()

Unnamed: 0,organ_tissue,method,donor,anatomical_information,n_counts_UMIs,n_genes,cell_ontology_class,free_annotation,manually_annotated,compartment,gender
AAACCCACACTCCTGT_TSP6_Liver_NA_10X_1_1,Liver,10X,TSP6,,7633.0,2259,macrophage,Monocyte/Macrophage,True,immune,male
AAACGAAGTACCAGAG_TSP6_Liver_NA_10X_1_1,Liver,10X,TSP6,,2858.0,1152,monocyte,Monocyte,True,immune,male
AAACGCTCAACGGCTC_TSP6_Liver_NA_10X_1_1,Liver,10X,TSP6,,7787.0,2983,endothelial cell of hepatic sinusoid,Endothelial,True,endothelial,male
AAAGAACAGCCTCTTC_TSP6_Liver_NA_10X_1_1,Liver,10X,TSP6,,10395.0,2598,macrophage,Monocyte/Macrophage,True,immune,male
AAAGAACGTAGCACAG_TSP6_Liver_NA_10X_1_1,Liver,10X,TSP6,,6610.0,2125,liver dendritic cell,Dendritic cell,True,immune,male


In [4]:
# Cell ontology classes to keep, mapped to the short labels stored in `celltype`.
cellmap = {
    'fibroblast': 'FB',
    'hematopoietic stem cell': 'HSC',
}

# The selection list is the key set of the label map, in the same order.
celltypes = list(cellmap)

# Tissues excluded from the subset.
tissue_blacklist = [
    'Uterus',
    'Salivary_Gland',
    'Pancreas',
    'Lymph_Node',
    'Lung',
    'Liver',
    'Eye',
]

# Build the row selection in two named steps: the cell has one of the wanted
# ontology classes AND does not come from a blacklisted tissue.
is_selected_celltype = data.obs['cell_ontology_class'].isin(celltypes)
is_blacklisted_tissue = data.obs['organ_tissue'].isin(tissue_blacklist)
mask = is_selected_celltype & ~is_blacklisted_tissue

# Materialise the subset as an independent AnnData (not a view of `data`),
# then attach the short cell-type label as a new obs column.
pdf = data[mask].copy()
pdf.obs['celltype'] = pdf.obs['cell_ontology_class'].map(cellmap)

pdf

AnnData object with n_obs × n_vars = 27346 × 58870
    obs: 'organ_tissue', 'method', 'donor', 'anatomical_information', 'n_counts_UMIs', 'n_genes', 'cell_ontology_class', 'free_annotation', 'manually_annotated', 'compartment', 'gender', 'celltype'
    var: 'gene_symbol', 'feature_type', 'ensemblid', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: '_scvi', '_training_mode', 'dendrogram_cell_type_tissue', 'dendrogram_computational_compartment_assignment', 'dendrogram_consensus_prediction', 'dendrogram_tissue_cell_type', 'donor_colors', 'donor_method_colors', 'hvg', 'method_colors', 'neighbors', 'organ_tissue_colors', 'sex_colors', 'tissue_colors', 'umap'
    obsm: 'X_pca', 'X_scvi', 'X_scvi_umap', 'X_umap'
    layers: 'decontXcounts', 'raw_counts'
    obsp: 'connectivities', 'distances'

In [5]:
# Add a cell ID: a 1-based running index (`record_id`) over the rows of the
# subset, plus a readable `cell_id` of the form "<celltype>_<record_id>"
# (e.g. "FB_1").

pdf.obs['record_id'] = np.arange(1, pdf.n_obs + 1, dtype="int64")

celltype_text = pdf.obs['celltype'].astype(str)
record_text = pdf.obs['record_id'].astype(str)
pdf.obs['cell_id'] = celltype_text.str.cat(record_text, sep="_")

pdf.obs.head()

Unnamed: 0,organ_tissue,method,donor,anatomical_information,n_counts_UMIs,n_genes,cell_ontology_class,free_annotation,manually_annotated,compartment,gender,celltype,record_id,cell_id
AAAGAACTCCTACCGT_TSP6_Trachea_NA_10X_1_1,Trachea,10X,TSP6,,17016.0,4353,fibroblast,fibroblast,True,stromal,male,FB,1,FB_1
AAAGTGACATCATGAC_TSP6_Trachea_NA_10X_1_1,Trachea,10X,TSP6,,8717.0,2673,fibroblast,fibroblast,True,stromal,male,FB,2,FB_2
AACCACACACTCCGGA_TSP6_Trachea_NA_10X_1_1,Trachea,10X,TSP6,,14208.0,3610,fibroblast,fibroblast,True,stromal,male,FB,3,FB_3
AACGGGACAGAGATGC_TSP6_Trachea_NA_10X_1_1,Trachea,10X,TSP6,,5770.0,2117,fibroblast,fibroblast,True,stromal,male,FB,4,FB_4
AACGTCACACGCGCAT_TSP6_Trachea_NA_10X_1_1,Trachea,10X,TSP6,,16684.0,3954,fibroblast,fibroblast,True,stromal,male,FB,5,FB_5


# preprocess

In [6]:
# Per-cell total that every cell is scaled to before the log transform.
TARGET_SUM = 1e4

# Scale each cell to TARGET_SUM total counts, then apply log(1 + x).
# Neither call's return value is used; `pdf` itself is what gets written out later.
# NOTE(review): the loaded object also carries 'raw_counts' / 'decontXcounts'
# layers and 'mean' / 'std' var columns -- confirm that `pdf.X` holds raw
# counts at this point, otherwise this step normalises already-processed values.
sc.pp.normalize_total(pdf, target_sum=TARGET_SUM)
sc.pp.log1p(pdf)

In [7]:
# Persist the filtered, normalised subset as an .h5ad file for the
# adaptive_sampling project, then print a completion marker.
outpath = "/nfs/turbo/umms-indikar/shared/projects/adaptive_sampling/data/tabula_sapiens_filtered.h5ad"
pdf.write_h5ad(outpath)
print('done')

done


In [8]:
# download the pbmc3k dataset