<a href="https://colab.research.google.com/github/GBakalkinOAI/DDLS2024/blob/main/CellxGene_Census_scVI_Monocytes_04.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Installation commands are taken from: https://docs.scvi-tools.org/en/latest/tutorials/notebooks/hub/cellxgene_census_model.html
# See also: https://chanzuckerberg.github.io/cellxgene-census/cellxgene_census_docsite_installation.html
!pip install --quiet scvi-colab
!pip install --quiet cellxgene-census
!pip install --quiet pybiomart
from scvi_colab import install

install()

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.8/54.8 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.5/17.5 MB[0m [31m49.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.0/129.0 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.4/77.4 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.5/49.5 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m50.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.8/16.8 MB[0m [31m72.6 MB/s[0m eta [36m0:00:00[0

In [None]:
# Import necessary libraries
import cellxgene_census
import scanpy as sc
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import scvi

In [None]:
# --- Parameters controlling which slice of the CELLxGENE Census we work with ---

# Census release and organism
dataset_version = "2024-07-01"  # Census version to open
organism = "homo_sapiens"

# Embeddings to fetch alongside the expression data
emb_names = ["scvi"]

# Cell type of interest
#ToDo# replace with the cell types that show detectable LOY in adult (or older) males
cell_type_query = "monocyte"

# Minimum group sizes used when filtering the cell metadata
cell_type_counts_min = 5  # a cell type needs at least this many cells within a donor to be kept
cell_type_donors_min = 5  # a cell type needs to be present in at least this many donors to be kept

In [None]:
# Collect Census-wide reference tables. Later they let us check how
# representative our LOY-related slice is compared with the whole Census.
with cellxgene_census.open_soma(census_version=dataset_version) as census:
    # Summary of the opened release: tells us exactly which schema version we are using.
    census_summary = census["census_info"]["summary"].read().concat().to_pandas()

    # One row per contributing dataset, indexed by `dataset_id`.
    # To add citations and human-readable names to .get_obs() results, use
    # .merge(census_datasets, on="dataset_id").
    census_datasets = (
        census["census_info"]["datasets"]
        .read(
            column_names=[
                # "citation",  # will be needed for the publication
                # "collection_name", "collection_doi",
                "dataset_title",  # human-readable name
                # "dataset_h5ad_path",  # downloadable from AWS using these file names
                "dataset_total_cell_count",  # cells contributed (including duplicated cells)
                "dataset_id",  # key matching .get_obs() results
            ]
        )
        .concat()
        .to_pandas()
        .set_index("dataset_id")
    )

    # Cell counts per `category`/`label` pair, restricted to human.
    # `label` values are specific to each `category`, allowing further zoom-in.
    # Number of labels per `category`:
    #     1  all                      (label is 'na')
    #     1  suspension_type          (label is 'cell')
    #     3  sex                      (53.7% 'male', 41.2% 'female', 5.1% 'unknown')
    #    24  assay
    #    31  self_reported_ethnicity
    #    55  tissue_general
    #   109  disease
    #   267  tissue
    #   698  cell_type
    census_summary_cell_counts = (
        census["census_info"]["summary_cell_counts"]
        .read()
        .concat()
        .to_pandas()
        .query("organism == 'Homo sapiens'")  # spelled differently from the `organism` parameter
    )

print(census_summary)
# General information for all schemas: https://raw.githubusercontent.com/chanzuckerberg/cellxgene-census/refs/heads/main/docs/cellxgene_census_schema.md
# Specific details for our schema v.5.0.0: https://raw.githubusercontent.com/chanzuckerberg/single-cell-curation/refs/heads/main/schema/5.0.0/schema.md

# census_datasets  # 812 datasets, nothing interesting to see yet

Unnamed: 0_level_0,dataset_title,dataset_total_cell_count
dataset_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0895c838-e550-48a3-a777-dbcd35d30272,Healthy human liver: B cells,146
00ff600e-6e2e-4d76-846f-0eec4f0ae417,Human tonsil nonlymphoid cells scRNA,363
bdacc907-7c26-419f-8808-969eab3ca2e8,Molecular characterization of selectively vuln...,3799
a5d95a42-0137-496f-8a60-101e17f263c8,Steady-state B cells - scRNA-seq,1324
d3566d6a-a455-4a15-980f-45eb29114cab,blood and bone marrow from a healthy young donor,15502
...,...,...
0bce33ed-455c-4e12-93f8-b7b04a2de4a1,Whole dataset: Normalized subset 2,2863559
c2876b1b-06d8-4d96-a56b-5304f815b99a,Whole Taxonomy - MTG: Seattle Alzheimer's Dise...,1226855
6f7fd0f1-a2ed-4ff1-80d3-33dde731cbc3,Whole Taxonomy - DLPFC: Seattle Alzheimer's Di...,1309414
dcfa2614-7ca7-4d82-814c-350626eccb26,Major cell cluster: Mesoderm,3267338


In [None]:
# Find every Census cell type whose label contains the configured pattern
# (`cell_type_query`, matched literally and case-insensitively), so that all
# monocyte-related cell types are captured.
is_cell_type_row = census_summary_cell_counts["category"] == "cell_type"
label_matches_query = census_summary_cell_counts["label"].str.contains(
    cell_type_query, case=False, na=False, regex=False
)
cell_types_monocyte = census_summary_cell_counts[is_cell_type_row & label_matches_query]
monocyte_labels = cell_types_monocyte["label"].tolist()

# An empty list would produce an empty "()" clause in the obs filter built from
# `monocyte_filter`; fail here with a clear message instead.
if not monocyte_labels:
    raise ValueError(f"No Census cell type label contains {cell_type_query!r}")

# One "cell_type == '<label>'" clause per label, joined with "or".
# repr() yields the same single-quoted text as before for ordinary labels and
# safely quotes a label that itself contains an apostrophe
# (assumes the SOMA filter accepts Python-style string literals — TODO confirm).
monocyte_filter = " or ".join(f"cell_type == {str(label)!r}" for label in monocyte_labels)

cell_types_monocyte[["label", "ontology_term_id", "unique_cell_count"]]

Unnamed: 0,label,ontology_term_id,unique_cell_count
158,granulocyte monocyte progenitor cell,CL:0000557,5893
160,promonocyte,CL:0000559,10669
165,monocyte,CL:0000576,314205
262,classical monocyte,CL:0000860,1059249
265,non-classical monocyte,CL:0000875,178362
322,CD14-positive monocyte,CL:0001054,580910
350,"CD14-positive, CD16-negative classical monocyte",CL:0002057,261782
433,intermediate monocyte,CL:0002393,6148
435,"CD14-low, CD16-positive monocyte",CL:0002396,87379
436,"CD14-positive, CD16-positive monocyte",CL:0002397,20045


In [None]:
# Let's examine the metadata of male cells (39.9M cells x 26 columns) and decide
# which columns we really need. Every possible metadata column is named below and
# tagged as follows:
#   #drop#        commented out because we do not need it
#   #ToDo#        needs further investigation
#   #batch#ToDo#  may cause batch effects; ToDo: check whether the effect is significant
#   #batch#       needed to stratify batch effects, or confounded with LOY
with cellxgene_census.open_soma(census_version=dataset_version) as census:
    cell_metadata = cellxgene_census.get_obs(
        census,
        organism,
        value_filter=f"sex == 'male' and is_primary_data == True and ({monocyte_filter})",
        column_names=[
            # 'soma_joinid',  #drop# special SOMADataFrame column that is used for join operations
            # 'sex', 'sex_ontology_term_id',  # filtered to "male" to study LOY
            # 'is_primary_data',  # 56% True; filtered to True to look only at non-duplicate data
            'dataset_id',  #batch#ToDo# 11.3% + 4% + 3.6% + 3.1% + 2.8% + ...
            # 'assay', 'assay_ontology_term_id', 'suspension_type',  #batch#ToDo# single cell/nuclei technology
            'cell_type_ontology_term_id',
            'cell_type',  #batch# later we will filter for LOY-enriched cell types
            'development_stage_ontology_term_id',
            'development_stage',  #batch# age, filter >=20 years old, 7% '50-year-old human stage'
            'disease_ontology_term_id',
            'disease',  #batch# 70% healthy + 11% covid
            'observation_joinid',  # unique identifier of each cell; after filtering we use it to download the right cells
            'self_reported_ethnicity_ontology_term_id',  # 'self_reported_ethnicity',  #batch#ToDo# 52% 'unknown', 40% 'European'
            #batch#ToDo# will filtering on cell types select the tissues automatically?
            'tissue_ontology_term_id',
            'tissue',
            'tissue_type',
            'tissue_general',
            'tissue_general_ontology_term_id',
            'raw_sum',  #ToDo# is this like Seurat's nReads?
            # Is nnz like Seurat's nUMIs? Mean/variance over what?
            'nnz',
            'raw_mean_nnz',
            'raw_variance_nnz',
            'n_measured_vars',  # is this like Seurat's nFeatures?
            'donor_id',  #batch# we study LOY within each donor, then look at inter-donor variability of DEGs
        ],
    )

In [None]:
# Keep a pristine snapshot of the freshly downloaded metadata so the filtering
# cells below can be re-run without querying the Census again.
# The snapshot is taken only the first time this cell runs (previously the
# snapshot line was commented out, so a fresh kernel failed with NameError).
# After re-running the download cell, `del cell_metadata_old` to refresh the snapshot.
if "cell_metadata_old" not in globals():
    cell_metadata_old = cell_metadata.copy()
cell_metadata = cell_metadata_old.copy()

In [None]:
# Count missing values per column and report them only when some exist, then
# discard rows lacking any of the fields used for grouping in the cells below.
missing_data = cell_metadata.isna().sum()
missing_columns = missing_data.loc[missing_data > 0]
if len(missing_columns) > 0:
    print("Warning: Missing data found in the following columns:")
    print(missing_columns)
cell_metadata = cell_metadata.dropna(
    axis=0,
    how="any",
    subset=['cell_type', 'donor_id', 'development_stage', 'disease'],
)

In [None]:
# Some donors were sampled more than once (follow-up experiments), so the same
# 'donor_id' can appear with different 'disease' and/or 'development_stage'.
# Build one key per donor / disease / development stage combination.
cell_metadata = cell_metadata.assign(
    donor_follow_up=lambda frame: (
        frame['donor_id'].astype(str)
        + '_'
        + frame['disease'].astype(str)
        + '_'
        + frame['development_stage'].astype(str)
    )
)

# Disabled diagnostic: list donors that have more than one disease or development stage.
#cell_metadata['n_disease_per_donor'] = cell_metadata.groupby(['donor_id'], observed=False)['disease'].transform('nunique')
#cell_metadata['n_development_stage_per_donor'] = cell_metadata.groupby(['donor_id'], observed=False)['development_stage'].transform('nunique')
#filtered_data = cell_metadata[(cell_metadata['n_disease_per_donor'] > 1) | (cell_metadata['n_development_stage_per_donor'] > 1)]
#filtered_data = filtered_data.sort_values(by=['donor_id', 'disease', 'development_stage'])
#print(filtered_data[['donor_id', 'development_stage', 'disease']].value_counts(sort=False))

In [None]:
# First, drop cells whose cell type has too few cells within the same donor_follow_up.
# `cell_type_size` is the number of cells sharing a (cell_type, donor_follow_up) pair.
# `.assign` adds the column on a new frame and `.copy()` detaches the filtered result,
# so neither this cell nor later column assignments trigger pandas' SettingWithCopyWarning.
cell_metadata = cell_metadata.assign(
    cell_type_size=cell_metadata.groupby(['cell_type', 'donor_follow_up'], observed=False)['cell_type'].transform('size')
)
cell_metadata = cell_metadata[cell_metadata['cell_type_size'] >= cell_type_counts_min].copy()
# To inspect: cell_metadata[['cell_type', 'donor_follow_up', 'cell_type_size']].sort_values(by=['cell_type', 'donor_follow_up'])

Unnamed: 0,cell_type,donor_follow_up,cell_type_size
476345,"CD14-low, CD16-positive monocyte",1002_1003_normal_73-year-old human stage,63
476350,"CD14-low, CD16-positive monocyte",1002_1003_normal_73-year-old human stage,63
476352,"CD14-low, CD16-positive monocyte",1002_1003_normal_73-year-old human stage,63
476355,"CD14-low, CD16-positive monocyte",1002_1003_normal_73-year-old human stage,63
476357,"CD14-low, CD16-positive monocyte",1002_1003_normal_73-year-old human stage,63
...,...,...,...
20275,promonocyte,pooled_normal_fifth LMP month human stage,471
20278,promonocyte,pooled_normal_fifth LMP month human stage,471
20283,promonocyte,pooled_normal_fifth LMP month human stage,471
20285,promonocyte,pooled_normal_fifth LMP month human stage,471


In [None]:
# Next, drop cells whose cell type is still present in too few donors.
# `cell_type_donors` is the number of distinct donor_follow_up keys per cell type.
# `.assign` avoids writing a column onto the filtered slice produced by the previous
# step (SettingWithCopyWarning), and `.copy()` leaves an independent frame for later cells.
cell_metadata = cell_metadata.assign(
    cell_type_donors=cell_metadata.groupby('cell_type', observed=False)['donor_follow_up'].transform('nunique')
)
cell_metadata = cell_metadata[cell_metadata['cell_type_donors'] >= cell_type_donors_min].copy()
# To inspect: cell_metadata[['cell_type', 'donor_follow_up', 'cell_type_size', 'cell_type_donors']].sort_values(by=['cell_type', 'donor_follow_up'])

Unnamed: 0,cell_type,donor_follow_up,cell_type_size,cell_type_donors
476345,"CD14-low, CD16-positive monocyte",1002_1003_normal_73-year-old human stage,63,550
476350,"CD14-low, CD16-positive monocyte",1002_1003_normal_73-year-old human stage,63,550
476352,"CD14-low, CD16-positive monocyte",1002_1003_normal_73-year-old human stage,63,550
476355,"CD14-low, CD16-positive monocyte",1002_1003_normal_73-year-old human stage,63,550
476357,"CD14-low, CD16-positive monocyte",1002_1003_normal_73-year-old human stage,63,550
...,...,...,...,...
20275,promonocyte,pooled_normal_fifth LMP month human stage,471,15
20278,promonocyte,pooled_normal_fifth LMP month human stage,471,15
20283,promonocyte,pooled_normal_fifth LMP month human stage,471,15
20285,promonocyte,pooled_normal_fifth LMP month human stage,471,15


In [None]:
# Number of cells per (cell type, donor, development stage, disease) combination, smallest groups first
print(cell_metadata[['cell_type', 'donor_id', 'development_stage', 'disease']].value_counts().sort_values())

cell_type                                        donor_id                 development_stage          disease                      
non-classical monocyte                           HGR0000092               45-year-old human stage    COVID-19                              5
monocyte                                         P-M056                   16-year-old human stage    COVID-19                              5
classical monocyte                               Chen_Zhang_2020_NSCLC-4  unknown                    non-small cell lung carcinoma         5
CD14-low, CD16-positive monocyte                 649_650                  80-year-old human stage    normal                                5
                                                 197_198                  60-year-old human stage    normal                                5
                                                                                                                                       ...  
monocyte               

In [None]:
# Share of cells contributed by the three largest datasets
#ToDo# check batch effects of contributing datasets
print(cell_metadata[['dataset_id']].value_counts(normalize=True).head(3))

dataset_id                          
9dbab10c-118d-496b-966a-67f1763a6b7d    0.148544
2c820d53-cbd7-4e0a-be7a-a0ad1989a98f    0.112576
ebc2e1ff-c8f9-466a-acf4-9d291afaf8b3    0.110708
Name: proportion, dtype: float64


In [None]:
# Three most abundant cell types (as a fraction of the remaining cells)
print(cell_metadata[['cell_type_ontology_term_id', 'cell_type']].value_counts(normalize=True).head(3))

cell_type_ontology_term_id  cell_type                                      
CL:0000860                  classical monocyte                                 0.366664
CL:0001054                  CD14-positive monocyte                             0.265152
CL:0002057                  CD14-positive, CD16-negative classical monocyte    0.132731
Name: proportion, dtype: float64


In [None]:
# Three most frequent tissue annotations (as a fraction of the remaining cells)
print(
    cell_metadata[
        ['tissue_ontology_term_id', 'tissue', 'tissue_type', 'tissue_general', 'tissue_general_ontology_term_id']
    ]
    .value_counts(normalize=True)
    .head(3)
)

tissue_ontology_term_id  tissue       tissue_type  tissue_general  tissue_general_ontology_term_id
UBERON:0000178           blood        tissue       blood           UBERON:0000178                     0.816243
UBERON:0002048           lung         tissue       lung            UBERON:0002048                     0.047753
UBERON:0002371           bone marrow  tissue       bone marrow     UBERON:0002371                     0.026239
Name: proportion, dtype: float64


In [None]:
# Fraction of the remaining cells per development stage
print(cell_metadata[['development_stage']].value_counts(normalize=True))

development_stage           
20-year-old human stage         0.113315
sixth decade human stage        0.036485
7-year-old human stage          0.035691
5-year-old human stage          0.031982
fourth decade human stage       0.026198
                                  ...   
child stage                     0.000000
eighth LMP month human stage    0.000000
8-month-old human stage         0.000000
embryonic human stage           0.000000
1-month-old human stage         0.000000
Name: proportion, Length: 176, dtype: float64


In [None]:
# Open the Census and fetch an AnnData object (RNA measurement) with the
# requested embeddings (`emb_names`) for cells whose `cell_type` equals
# `cell_type_query` exactly.
# NOTE(review): this filter does not reuse the male / primary-data / donor
# filtering applied to `cell_metadata` above — confirm that this is intended.
with cellxgene_census.open_soma(census_version=dataset_version) as census:
    adata = cellxgene_census.get_anndata(
        census,
        organism=organism,
        measurement_name="RNA",
        obs_value_filter=f"cell_type == '{cell_type_query}'",  # selecting monocytes only
        obs_embeddings=emb_names,
    )

'Monocyte'

In [None]:
# Display the AnnData summary: dimensions plus the available obs / var / obsm fields
adata

AnnData object with n_obs × n_vars = 0 × 60530
    obs: 'soma_joinid', 'dataset_id', 'assay', 'assay_ontology_term_id', 'cell_type', 'cell_type_ontology_term_id', 'development_stage', 'development_stage_ontology_term_id', 'disease', 'disease_ontology_term_id', 'donor_id', 'is_primary_data', 'observation_joinid', 'self_reported_ethnicity', 'self_reported_ethnicity_ontology_term_id', 'sex', 'sex_ontology_term_id', 'suspension_type', 'tissue', 'tissue_ontology_term_id', 'tissue_type', 'tissue_general', 'tissue_general_ontology_term_id', 'raw_sum', 'nnz', 'raw_mean_nnz', 'raw_variance_nnz', 'n_measured_vars'
    var: 'soma_joinid', 'feature_id', 'feature_name', 'feature_length', 'nnz', 'n_measured_obs'
    obsm: 'scvi'

In [None]:
# Release the Census handle left over from the `with` blocks above and drop the name.
# The guard makes this cell safe to re-run: previously a second run raised
# NameError because `census` had already been deleted.
if "census" in globals():
    census.close()
    del census