<a href="https://colab.research.google.com/github/GBakalkinOAI/DDLS2024/blob/main/CellxGene_Census_scVI_Monocytes_03.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Installation commands are taken from: https://docs.scvi-tools.org/en/latest/tutorials/notebooks/hub/cellxgene_census_model.html
# See also: https://chanzuckerberg.github.io/cellxgene-census/cellxgene_census_docsite_installation.html
!pip install --quiet scvi-colab
!pip install --quiet cellxgene-census
!pip install --quiet pybiomart
from scvi_colab import install

install()

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.8/54.8 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.5/17.5 MB[0m [31m61.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.0/129.0 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.4/77.4 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.5/49.5 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m36.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.8/16.8 MB[0m [31m57.2 MB/s[0m eta [36m0:00:00[0

In [2]:
# Import necessary libraries
import cellxgene_census
import scanpy as sc
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import scvi

In [3]:
# Define parameters to access the dataset
emb_names = ["scvi"]  # specify the embedding you are interested in
dataset_version = "2024-07-01"  # the version of the dataset you want to use
organism = "homo_sapiens"
cell_type_query = "monocyte"  #ToDo# here we will place cell types with detectable LOY in adult (or older) males
cell_type_counts_min = 5 # drop cells if the corresponding cell type has too few cells in a particular donor
cell_type_donors_min = 5 # drop cells if too few donors have the corresponding cell type

In [4]:
# Later we will use this overview of the CELLxGENE Census to check how representative our LOY-related slice is.
with cellxgene_census.open_soma(census_version=dataset_version) as census:
  # Which version of schema do we use exactly?
  census_summary = census["census_info"]["summary"].read().concat().to_pandas()

  # To add citations and human readable names to `dataset_id` we can augment
  # .get_obs() results with .merge(census_datasets, on="dataset_id")
  census_datasets = (
      census["census_info"]["datasets"]
      .read(column_names=[
          # "citation", # will need it for the publication
          # "collection_name", "collection_doi",
          "dataset_title", # human readable name
          # "dataset_h5ad_path", # downloable from AWS using these file names
          "dataset_total_cell_count", # how many cells are contributed (including duplicated cells)
          "dataset_id" # key for .get_obs() results
          ])
      .concat().to_pandas()
      .set_index("dataset_id")
  )

  # Census summary with counts per cell type
  # `label` is specific for each `category`, allowing further zoom-in
  # number of labels per each `category` is as follows:
  # 1 all  (label is 'na')
  # 1 suspension_type (label is 'cell')
  # 3 sex (label is 53.7% 'male', 41.2% 'female' and 5.1% 'unknown')
  # 24 assay
  # 31 self_reported_ethnicity
  # 55 tissue_general
  # 109 disease
  # 267 tissue
  # 698 cell_type
  census_summary_cell_counts = (
      census["census_info"]["summary_cell_counts"]
      .read().concat().to_pandas()
      .query("organism == 'Homo sapiens'") # Not like in `organism`, different spelling
  )

print( census_summary )
# General information for all schemes: https://raw.githubusercontent.com/chanzuckerberg/cellxgene-census/refs/heads/main/docs/cellxgene_census_schema.md
# Specific details for our schema v.5.0.0: https://raw.githubusercontent.com/chanzuckerberg/single-cell-curation/refs/heads/main/schema/5.0.0/schema.md

# census_datasets # has 812 datasets, nothing interesting to see yet

   soma_joinid                       label       value
0            0       census_schema_version       2.0.1
1            1           census_build_date  2024-05-20
2            2      dataset_schema_version       5.0.0
3            3            total_cell_count   115556140
4            4           unique_cell_count    60597966
5            5  number_donors_homo_sapiens       17651
6            6  number_donors_mus_musculus        4216


In [5]:
# Search all cell types containing pattern "*monocyte*" so capture all cell types related to Monocytes
cell_types_monocyte = census_summary_cell_counts.query("category == 'cell_type' and label.str.contains('monocyte', case=False, na=False)")
monocyte_labels = cell_types_monocyte['label'].tolist()
monocyte_filter = " or ".join(["cell_type == '{}'".format(label) for label in monocyte_labels])
cell_types_monocyte[['label', 'ontology_term_id', 'unique_cell_count']]

Unnamed: 0,label,ontology_term_id,unique_cell_count
158,granulocyte monocyte progenitor cell,CL:0000557,5893
160,promonocyte,CL:0000559,10669
165,monocyte,CL:0000576,314205
262,classical monocyte,CL:0000860,1059249
265,non-classical monocyte,CL:0000875,178362
322,CD14-positive monocyte,CL:0001054,580910
350,"CD14-positive, CD16-negative classical monocyte",CL:0002057,261782
433,intermediate monocyte,CL:0002393,6148
435,"CD14-low, CD16-positive monocyte",CL:0002396,87379
436,"CD14-positive, CD16-positive monocyte",CL:0002397,20045


In [6]:
# Lets examine male cells metadata (39.9M cells x 26 columns) and decide which columns do we really need.
# In the command we name every possible metadata column, and then
# - label it #drop# and comment out if we do not need it
# - label it #ToDo# if we need to investigate it further
# - label it #batch#ToDo# if this column may cause batch effects, ToDo: check if it cause significant batch effect
# - label it #batch# if we needed this column to stratify batch effect or it is confounded with LOY
with cellxgene_census.open_soma(census_version=dataset_version) as census:
    cell_metadata = cellxgene_census.get_obs(
        census,
        organism,
        value_filter = "sex == 'male' and is_primary_data == True and (" + monocyte_filter + ")",
        column_names = [
            # 'soma_joinid', #drop# special SOMADataFrame column that is used for join operations
            # 'sex', 'sex_ontology_term_id', # filter "male" to study LOY
            # 'is_primary_data', # 56% True, filter True to look only at non-duplicate data
            'dataset_id', #batch#ToDo# 11.3% + 4% + 3.6% + 3.1% + 2.8% + ...
            # 'assay', 'assay_ontology_term_id', 'suspension_type', #batch#ToDo# single cell/nuclei technology
            'cell_type_ontology_term_id', 'cell_type', #batch# later we will filter for LOY-enriched cell types
            'development_stage_ontology_term_id', 'development_stage', #batch# age, filter >=20 years old, 7% '50-year-old human stage'
            'disease_ontology_term_id', 'disease', #batch# 70% healthy + 11% covid
            'observation_joinid', # unique observation identifier for each cell, after filtering we use it to download the right cells
            'self_reported_ethnicity_ontology_term_id', # 'self_reported_ethnicity',  #batch#ToDo# 52% 'unknown', 40% 'European'
            'tissue_ontology_term_id', 'tissue', 'tissue_type', 'tissue_general', 'tissue_general_ontology_term_id', #batch#ToDo# cell types will filter tissues automatically?
            'raw_sum', #ToDo# Is this like Seurat's nReads?
            'nnz', 'raw_mean_nnz', 'raw_variance_nnz', # Is nnz like Seurat's nUMIs? Mean/variance over what?
            'n_measured_vars', # Is this like Seurat's nFeatures ?
            'donor_id' #batch# We study LOY withing each donor, then look at inter-donor variability of DEGs
            ]
    )

In [42]:
#cell_metadata_old = cell_metadata.copy()
cell_metadata = cell_metadata_old.copy()

In [43]:
# Only warn if missing data is actually present, print columns with missing data, and remove missing values
missing_data = cell_metadata.isna().sum()
missing_columns = missing_data[missing_data > 0]
if not missing_columns.empty:
    print("Warning: Missing data found in the following columns:")
    print(missing_columns)
cell_metadata = cell_metadata.dropna(subset=['cell_type', 'donor_id', 'development_stage', 'disease'])

In [45]:
filtered_data = cell_metadata[(cell_metadata['cell_type'] == 'classical monocyte') & (cell_metadata['donor_id'] == 'PD44966')]
print(filtered_data.value_counts(subset=['cell_type', 'donor_id', 'development_stage', 'disease']))

cell_type           donor_id  development_stage         disease                          
classical monocyte  PD44966   sixth decade human stage  normal                               2559
                                                        nonpapillary renal cell carcinoma       1
Name: count, dtype: int64


In [26]:
cell_metadata['n_disease_per_donor'] = cell_metadata.groupby(['disease', 'donor_id'], observed=False)['disease'].transform('nunique')
filtered_data = cell_metadata[cell_metadata['n_disease_per_donor'] > 1]
print(filtered_data.value_counts(subset=['donor_id', 'development_stage', 'disease']))

Series([], Name: count, dtype: int64)


In [27]:
# First, drop cells if the corresponding cell type has too few cells in the corresponding donor
cell_metadata['n_cells_per_cell_type'] = cell_metadata.groupby(['cell_type', 'donor_id'], observed=False)['cell_type'].transform('size')
cell_metadata = cell_metadata[cell_metadata['n_cells_per_cell_type'] >= cell_type_counts_min]
cell_metadata[['cell_type', 'donor_id', 'n_cells_per_cell_type']].sort_values(by=['cell_type', 'donor_id'])

Unnamed: 0,cell_type,donor_id,n_cells_per_cell_type
476345,"CD14-low, CD16-positive monocyte",1002_1003,63
476350,"CD14-low, CD16-positive monocyte",1002_1003,63
476352,"CD14-low, CD16-positive monocyte",1002_1003,63
476355,"CD14-low, CD16-positive monocyte",1002_1003,63
476357,"CD14-low, CD16-positive monocyte",1002_1003,63
...,...,...,...
32475,promonocyte,pooled,490
32476,promonocyte,pooled,490
32479,promonocyte,pooled,490
32480,promonocyte,pooled,490


In [28]:
# Next, drop cells if too few donors still have the corresponding cell type
cell_metadata['n_donors_per_cell_type'] = cell_metadata.groupby('cell_type', observed=False)['donor_id'].transform('nunique')
cell_metadata = cell_metadata[cell_metadata['n_donors_per_cell_type'] >= cell_type_donors_min]
cell_metadata[['cell_type', 'donor_id', 'n_cells_per_cell_type', 'n_donors_per_cell_type']].sort_values(by=['cell_type', 'donor_id'])

Unnamed: 0,cell_type,donor_id,n_cells_per_cell_type,n_donors_per_cell_type
476345,"CD14-low, CD16-positive monocyte",1002_1003,63,550
476350,"CD14-low, CD16-positive monocyte",1002_1003,63,550
476352,"CD14-low, CD16-positive monocyte",1002_1003,63,550
476355,"CD14-low, CD16-positive monocyte",1002_1003,63,550
476357,"CD14-low, CD16-positive monocyte",1002_1003,63,550
...,...,...,...,...
32475,promonocyte,pooled,490,14
32476,promonocyte,pooled,490,14
32479,promonocyte,pooled,490,14
32480,promonocyte,pooled,490,14


In [29]:
print( cell_metadata.value_counts(subset=['cell_type', 'donor_id']).sort_values() )

cell_type                                        donor_id          
CD14-low, CD16-positive monocyte                 989_990                    5
CD14-positive, CD16-positive monocyte            P-S040                     5
CD14-positive monocyte                           CV0025                     5
                                                 25_25                      5
granulocyte monocyte progenitor cell             HTA4_8                     5
                                                                        ...  
monocyte                                         CV-003                 10429
CD14-positive, CD16-negative classical monocyte  P-S085                 10617
monocyte                                         H06                    16088
CD14-low, CD16-positive monocyte                 allcells:889004399     17340
CD14-positive monocyte                           allcells:889004399    126506
Name: count, Length: 3174, dtype: int64


In [30]:
print( cell_metadata.value_counts(subset=['cell_type', 'donor_id', 'development_stage', 'disease']).sort_values() )

cell_type                                        donor_id            development_stage          disease                          
classical monocyte                               PD44966             sixth decade human stage   nonpapillary renal cell carcinoma         1
non-classical monocyte                           PD44967             eighth decade human stage  nonpapillary renal cell carcinoma         1
monocyte                                         Patient07           45-year-old human stage    normal                                    1
MHC-II-positive classical monocyte               PD43824             fifth decade human stage   normal                                    1
monocyte                                         Patient17           74-year-old human stage    normal                                    2
                                                                                                                                      ...  
                              

In [31]:
filtered_data = cell_metadata[(cell_metadata['cell_type'] == 'classical monocyte') & (cell_metadata['donor_id'] == 'PD44966')]
print(filtered_data.value_counts(subset=['cell_type', 'donor_id', 'development_stage', 'disease']))

cell_type           donor_id  development_stage         disease                          
classical monocyte  PD44966   sixth decade human stage  normal                               2559
                                                        nonpapillary renal cell carcinoma       1
Name: count, dtype: int64


In [None]:
print(cell_metadata.value_counts(subset=['dataset_id'], normalize=True).head(3)) #ToDo# check batch effects of contributing datasets

dataset_id                          
f7c1c579-2dc0-47e2-ba19-8165c5a0e353    0.112806
9dbab10c-118d-496b-966a-67f1763a6b7d    0.039711
d6505c89-c43d-4c28-8c4f-7351a5fd5528    0.036205
Name: proportion, dtype: float64


In [None]:
print(cell_metadata.value_counts(subset=['cell_type_ontology_term_id', 'cell_type'], normalize=True).head(3)) # most abundant cell types

cell_type_ontology_term_id  cell_type           
CL:0000540                  neuron                  0.117337
CL:0000128                  oligodendrocyte         0.044505
CL:0000679                  glutamatergic neuron    0.042780
Name: proportion, dtype: float64


In [None]:
print(cell_metadata.value_counts(subset=['tissue_ontology_term_id', 'tissue', 'tissue_type', 'tissue_general', 'tissue_general_ontology_term_id'], normalize=True).head(3))

tissue_ontology_term_id  tissue         tissue_type  tissue_general  tissue_general_ontology_term_id
UBERON:0000178           blood          tissue       blood           UBERON:0000178                     0.218233
UBERON:0001893           telencephalon  tissue       brain           UBERON:0000955                     0.059609
UBERON:0002048           lung           tissue       lung            UBERON:0002048                     0.052309
Name: proportion, dtype: float64


In [None]:
print( cell_metadata.value_counts(subset=['development_stage'], normalize=True) )

development_stage                       
50-year-old human stage                     0.073846
42-year-old human stage                     0.060202
29-year-old human stage                     0.057824
15th week post-fertilization human stage    0.037218
12th week post-fertilization human stage    0.035109
                                              ...   
Carnegie stage 12                           0.000000
Carnegie stage 13                           0.000000
Carnegie stage 14                           0.000000
Carnegie stage 16                           0.000000
Carnegie stage 09                           0.000000
Name: proportion, Length: 176, dtype: float64


In [None]:
# Open the dataset and query for Monocytes with the scVI embeddings
with cellxgene_census.open_soma(census_version=dataset_version) as census:
    adata = cellxgene_census.get_anndata(
        census,
        organism=organism,
        measurement_name="RNA",
        obs_value_filter="cell_type == '{}'".format(cell_type_query),  # Selecting Monocytes only
        obs_embeddings=emb_names
    )

'Monocyte'

In [None]:
adata

AnnData object with n_obs × n_vars = 0 × 60530
    obs: 'soma_joinid', 'dataset_id', 'assay', 'assay_ontology_term_id', 'cell_type', 'cell_type_ontology_term_id', 'development_stage', 'development_stage_ontology_term_id', 'disease', 'disease_ontology_term_id', 'donor_id', 'is_primary_data', 'observation_joinid', 'self_reported_ethnicity', 'self_reported_ethnicity_ontology_term_id', 'sex', 'sex_ontology_term_id', 'suspension_type', 'tissue', 'tissue_ontology_term_id', 'tissue_type', 'tissue_general', 'tissue_general_ontology_term_id', 'raw_sum', 'nnz', 'raw_mean_nnz', 'raw_variance_nnz', 'n_measured_vars'
    var: 'soma_joinid', 'feature_id', 'feature_name', 'feature_length', 'nnz', 'n_measured_obs'
    obsm: 'scvi'

In [None]:
census.close()
del census