ENV: census_env

In [1]:
"""
From:
https://chanzuckerberg.github.io/cellxgene-census/notebooks/api_demo/census_datasets.html
https://chanzuckerberg.github.io/cellxgene-census/notebooks/api_demo/census_query_extract.html
"""

import cellxgene_census
import pandas as pd
import os
from pathlib import Path
from habanero import cn
import json
from tqdm import tqdm


# Open connection
census = cellxgene_census.open_soma()

# Where to save datsets
SAVE_BASE_FOLDER = "/work/upcourtine/clock-classifier/gabriele-results/census_results/datasets"


The "stable" release is currently 2024-07-01. Specify 'census_version="2024-07-01"' in future calls to open_soma() to ensure data consistency.


# Extract the `dataset_id` of selected datasets

In [2]:
# Reads SOMADataFrame as a slice
    # ATTENTION: select only human 
human_datasets = census["census_data"]["homo_sapiens"]["obs"]


# Display column names
print("Cell metadata columns (obs):", list(human_datasets.keys()))

Cell metadata columns (obs): ['soma_joinid', 'dataset_id', 'assay', 'assay_ontology_term_id', 'cell_type', 'cell_type_ontology_term_id', 'development_stage', 'development_stage_ontology_term_id', 'disease', 'disease_ontology_term_id', 'donor_id', 'is_primary_data', 'observation_joinid', 'self_reported_ethnicity', 'self_reported_ethnicity_ontology_term_id', 'sex', 'sex_ontology_term_id', 'suspension_type', 'tissue', 'tissue_ontology_term_id', 'tissue_type', 'tissue_general', 'tissue_general_ontology_term_id', 'raw_sum', 'nnz', 'raw_mean_nnz', 'raw_variance_nnz', 'n_measured_vars']


In [3]:
# Read METADATA OF EACH DATASET and convert it to a DataFrame panda
    # ATTNETION: information with cells duplicated
human_metadata_df_with_dup = human_datasets.read(
                            #value_filter = "is_primary_data==True", #cell_type in ['microglial cell', 'neuron']",
                            column_names=["dataset_id", 
                                          "assay", 
                                          "title",
                                          "cell_type", 
                                          "tissue",
                                          "suspension_type", 
                                          "disease",
                                          "donor_id", 
                                          "is_primary_data"
                                          ]
                            ).concat().to_pandas()

display(human_metadata_df_with_dup)



Unnamed: 0,dataset_id,assay,cell_type,tissue,suspension_type,disease,donor_id,is_primary_data
0,0895c838-e550-48a3-a777-dbcd35d30272,10x 3' v2,plasma cell,caudate lobe of liver,cell,normal,C41,False
1,0895c838-e550-48a3-a777-dbcd35d30272,10x 3' v2,mature B cell,caudate lobe of liver,cell,normal,C41,False
2,0895c838-e550-48a3-a777-dbcd35d30272,10x 3' v2,plasma cell,caudate lobe of liver,cell,normal,C41,False
3,0895c838-e550-48a3-a777-dbcd35d30272,10x 3' v2,mature B cell,caudate lobe of liver,cell,normal,C41,False
4,0895c838-e550-48a3-a777-dbcd35d30272,10x 3' v2,mature B cell,caudate lobe of liver,cell,normal,C41,False
...,...,...,...,...,...,...,...,...
74322505,6f7fd0f1-a2ed-4ff1-80d3-33dde731cbc3,10x 3' v3,pvalb GABAergic cortical interneuron,dorsolateral prefrontal cortex,nucleus,dementia,H21.33.044,True
74322506,6f7fd0f1-a2ed-4ff1-80d3-33dde731cbc3,10x 3' v3,vip GABAergic cortical interneuron,dorsolateral prefrontal cortex,nucleus,dementia,H20.33.018,True
74322507,6f7fd0f1-a2ed-4ff1-80d3-33dde731cbc3,10x 3' v3,L2/3-6 intratelencephalic projecting glutamate...,dorsolateral prefrontal cortex,nucleus,normal,H21.33.003,True
74322508,6f7fd0f1-a2ed-4ff1-80d3-33dde731cbc3,10x 3' v3,astrocyte of the cerebral cortex,dorsolateral prefrontal cortex,nucleus,normal,H21.33.019,True


In [4]:
# Read METADATA OF EACH DATASET and convert it to a DataFrame panda
    # ATTNETION:!NO! information with cells duplicated
human_metadata_df = human_datasets.read(
                            value_filter = "is_primary_data==True", #cell_type in ['microglial cell', 'neuron']",
                            column_names=["dataset_id", 
                                          "assay", 
                                          "title",
                                          "cell_type", 
                                          "tissue",
                                          "suspension_type", 
                                          "disease",
                                          "donor_id", 
                                          "is_primary_data"]
                            ).concat().to_pandas()

display(human_metadata_df)




Unnamed: 0,dataset_id,assay,cell_type,tissue,suspension_type,disease,donor_id,is_primary_data
0,a5d95a42-0137-496f-8a60-101e17f263c8,Smart-seq2,naive B cell,blood,cell,normal,CONTROL,True
1,a5d95a42-0137-496f-8a60-101e17f263c8,Smart-seq2,naive B cell,blood,cell,normal,CONTROL,True
2,a5d95a42-0137-496f-8a60-101e17f263c8,Smart-seq2,naive B cell,blood,cell,normal,CONTROL,True
3,a5d95a42-0137-496f-8a60-101e17f263c8,Smart-seq2,naive B cell,blood,cell,normal,CONTROL,True
4,a5d95a42-0137-496f-8a60-101e17f263c8,Smart-seq2,naive B cell,blood,cell,normal,CONTROL,True
...,...,...,...,...,...,...,...,...
44265927,6f7fd0f1-a2ed-4ff1-80d3-33dde731cbc3,10x 3' v3,pvalb GABAergic cortical interneuron,dorsolateral prefrontal cortex,nucleus,dementia,H21.33.044,True
44265928,6f7fd0f1-a2ed-4ff1-80d3-33dde731cbc3,10x 3' v3,vip GABAergic cortical interneuron,dorsolateral prefrontal cortex,nucleus,dementia,H20.33.018,True
44265929,6f7fd0f1-a2ed-4ff1-80d3-33dde731cbc3,10x 3' v3,L2/3-6 intratelencephalic projecting glutamate...,dorsolateral prefrontal cortex,nucleus,normal,H21.33.003,True
44265930,6f7fd0f1-a2ed-4ff1-80d3-33dde731cbc3,10x 3' v3,astrocyte of the cerebral cortex,dorsolateral prefrontal cortex,nucleus,normal,H21.33.019,True


In [5]:
# How many cells in each dataset
human_metadata_df.groupby("dataset_id").size()

  human_metadata_df.groupby("dataset_id").size()


dataset_id
0041b9c3-6a49-4bf7-8514-9bc7190067a7         0
00476f9f-ebc1-4b72-b541-32f912ce36ea     10099
00e5dedd-b9b7-43be-8c28-b0e5c6414a62         0
00ff600e-6e2e-4d76-846f-0eec4f0ae417         0
01209dce-3575-4bed-b1df-129f57fbc031     51876
                                         ...  
fe1a73ab-a203-45fd-84e9-0f7fd19efcbd     35285
fe4b89d5-461e-440c-a5a8-621b37b122c0    154136
fe52003e-1460-4a65-a213-2bb1a508332f         0
ff45e623-7f5f-46e3-b47d-56be0341f66b         0
ff7d15fa-f4b6-4a0e-992e-fd0c9d088ded     28051
Length: 678, dtype: int64

In [6]:
# How many dataset with more than 0 cells
counts = human_metadata_df.groupby("dataset_id").size()
counts = counts.sort_values()
print(f"Datasets with more than 0 cells: {counts[counts > 0]}")

  counts = human_metadata_df.groupby("dataset_id").size()


Datasets with more than 0 cells: dataset_id
d567b692-c374-4628-a508-8008f6778f22          1
0ba16f4b-cb87-4fa3-9363-19fc51eec6e7          4
810ac45f-8969-4698-b42c-652f802f75c2         10
f64e1be1-de15-4d27-8da4-82225cd4c035         56
f9846bb4-784d-4582-92c1-3f279e4c6f0c        176
                                         ...   
6f7fd0f1-a2ed-4ff1-80d3-33dde731cbc3    1309414
d6505c89-c43d-4c28-8c4f-7351a5fd5528    1378557
9dbab10c-118d-496b-966a-67f1763a6b7d    1462702
9f222629-9e39-47d0-b83f-e08d610c7479    1959503
f7c1c579-2dc0-47e2-ba19-8165c5a0e353    4062980
Length: 369, dtype: int64


In [7]:
# Let's find ouit if these datset have few cells
    # because they are like this
    # or because they are made of duplicated cells

# Count hoe many cells in each dataset, in case replicated cells are present and  not
counts = human_metadata_df.groupby("dataset_id").size()
counts_dup = human_metadata_df_with_dup.groupby("dataset_id").size()

df_diff = pd.DataFrame({"n_cells_unique": counts, "n_cells_tot": counts_dup})
df_diff["n_cells_duplicated"] = df_diff["n_cells_tot"] - df_diff["n_cells_unique"]

df_diff

# Filter df_diff to only include rows where the index is in counts[counts > 0].index
#filtered_df = df_diff[df_diff.index.isin(counts[counts > 0].index)]
#filtered_df.sort_values("tot")

  counts = human_metadata_df.groupby("dataset_id").size()


  counts_dup = human_metadata_df_with_dup.groupby("dataset_id").size()


Unnamed: 0_level_0,n_cells_unique,n_cells_tot,n_cells_duplicated
dataset_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0041b9c3-6a49-4bf7-8514-9bc7190067a7,0,9424,9424
00476f9f-ebc1-4b72-b541-32f912ce36ea,10099,10099,0
00e5dedd-b9b7-43be-8c28-b0e5c6414a62,0,21422,21422
00ff600e-6e2e-4d76-846f-0eec4f0ae417,0,363,363
01209dce-3575-4bed-b1df-129f57fbc031,51876,51876,0
...,...,...,...
fe1a73ab-a203-45fd-84e9-0f7fd19efcbd,35285,35285,0
fe4b89d5-461e-440c-a5a8-621b37b122c0,154136,154136,0
fe52003e-1460-4a65-a213-2bb1a508332f,0,51552,51552
ff45e623-7f5f-46e3-b47d-56be0341f66b,0,13497,13497


In [8]:
# Selct datasets that have at least thr cell (without counting replicate)
    #Attention: in this way later, when i will download the entire datset, i will also download the duplicated cells
thr = 1000
df_diff_filtered = df_diff[df_diff["n_cells_unique"] >= thr]
display(df_diff_filtered)

# Select datasets wiht ONLY unique cells (not even a single dup)
df_diff_filtered = df_diff[df_diff["n_cells_duplicated"] == 0]
display(df_diff_filtered)

Unnamed: 0_level_0,n_cells_unique,n_cells_tot,n_cells_duplicated
dataset_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
00476f9f-ebc1-4b72-b541-32f912ce36ea,10099,10099,0
01209dce-3575-4bed-b1df-129f57fbc031,51876,51876,0
0129dbd9-a7d3-4f6b-96b9-1da155a93748,244474,244474,0
019c7af2-c827-4454-9970-44d5e39ce068,12590,12590,0
01ad3cd7-3929-4654-84c0-6db05bd5fd59,600929,600929,0
...,...,...,...
fbf173f9-f809-4d84-9b65-ae205d35b523,17660,17660,0
fd072bc3-2dfb-46f8-b4e3-467cb3223182,610392,908046,297654
fe1a73ab-a203-45fd-84e9-0f7fd19efcbd,35285,35285,0
fe4b89d5-461e-440c-a5a8-621b37b122c0,154136,154136,0


Unnamed: 0_level_0,n_cells_unique,n_cells_tot,n_cells_duplicated
dataset_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
00476f9f-ebc1-4b72-b541-32f912ce36ea,10099,10099,0
01209dce-3575-4bed-b1df-129f57fbc031,51876,51876,0
0129dbd9-a7d3-4f6b-96b9-1da155a93748,244474,244474,0
019c7af2-c827-4454-9970-44d5e39ce068,12590,12590,0
01ad3cd7-3929-4654-84c0-6db05bd5fd59,600929,600929,0
...,...,...,...
fa8605cf-f27e-44af-ac2a-476bee4410d3,59506,59506,0
fbf173f9-f809-4d84-9b65-ae205d35b523,17660,17660,0
fe1a73ab-a203-45fd-84e9-0f7fd19efcbd,35285,35285,0
fe4b89d5-461e-440c-a5a8-621b37b122c0,154136,154136,0


In [9]:
# Extract the dataset ids of the dataset that we want to use
# In this case:
    # only human datasets
    # with 0 replicated cells

dataset_ids = df_diff_filtered.index

print(f"Number of dataset to use: {len(dataset_ids)}, {dataset_ids[:5]}...")

Number of dataset to use: 319, CategoricalIndex(['00476f9f-ebc1-4b72-b541-32f912ce36ea',
                  '01209dce-3575-4bed-b1df-129f57fbc031',
                  '0129dbd9-a7d3-4f6b-96b9-1da155a93748',
                  '019c7af2-c827-4454-9970-44d5e39ce068',
                  '01ad3cd7-3929-4654-84c0-6db05bd5fd59'],
                 categories=['0041b9c3-6a49-4bf7-8514-9bc7190067a7', '00476f9f-ebc1-4b72-b541-32f912ce36ea', '00e5dedd-b9b7-43be-8c28-b0e5c6414a62', '00ff600e-6e2e-4d76-846f-0eec4f0ae417', ..., 'fe4b89d5-461e-440c-a5a8-621b37b122c0', 'fe52003e-1460-4a65-a213-2bb1a508332f', 'ff45e623-7f5f-46e3-b47d-56be0341f66b', 'ff7d15fa-f4b6-4a0e-992e-fd0c9d088ded'], ordered=False, dtype='category', name='dataset_id')...


# Select Info of Human Datasets

In [10]:
# Read df with datset info
df_info = census["census_info"]["datasets"].read().concat().to_pandas()


In [11]:
# Select info of dtaset fodun before (filter by previous found datset_id)
df_info = df_info[df_info["dataset_id"].isin(dataset_ids)]
print(df_info.shape)
display(df_info.head(3))

(319, 10)


Unnamed: 0,soma_joinid,citation,collection_id,collection_name,collection_doi,dataset_id,dataset_version_id,dataset_title,dataset_h5ad_path,dataset_total_cell_count
3,3,Publication: https://doi.org/10.1038/s41467-02...,bf325905-5e8e-42e3-933d-9a9053e9af80,Single-cell Atlas of common variable immunodef...,10.1038/s41467-022-29450-x,a5d95a42-0137-496f-8a60-101e17f263c8,40832710-d7b1-43fb-b2c2-1cd2255bc3ac,Steady-state B cells - scRNA-seq,a5d95a42-0137-496f-8a60-101e17f263c8.h5ad,1324
4,4,Publication: https://doi.org/10.1038/s41590-02...,93eebe82-d8c3-41bc-a906-63b5b5f24a9d,Single-cell proteo-genomic reference maps of t...,10.1038/s41590-021-01059-0,d3566d6a-a455-4a15-980f-45eb29114cab,eb6c070c-ff67-4c1f-8d4d-65f9fe2119ee,blood and bone marrow from a healthy young donor,d3566d6a-a455-4a15-980f-45eb29114cab.h5ad,15502
16,16,Publication: https://doi.org/10.1016/j.immuni....,29f92179-ca10-4309-a32b-d383d80347c1,Longitudinal profiling of respiratory and syst...,10.1016/j.immuni.2021.03.005,eec804b9-2ae5-44f0-a1b5-d721e21257de,761a676a-d686-4e42-8c21-a25d32bd819f,74 years old female - Airway Wash (5 days post...,eec804b9-2ae5-44f0-a1b5-d721e21257de.h5ad,1324


In [12]:
# This info are not enough
# Lets add
    # tissue
    # cell types
    # disease / label
    # essay
# In this case i need to use also the metasata of each single cell ("human_metadata_df")

# df with cell level information, only not duplicated cells
    # ATTENTION: as we are unsing only datasets with no replicated cells, we are not adding any duplicated cells
human_metadata_df

dataset_summary = human_metadata_df.groupby("dataset_id").agg({
    "cell_type": "unique",
    "tissue": "unique",
    "disease": "unique",
    "assay": "unique",
    "donor_id": "unique",
})

# Clean up and convert the unique values to readable format
dataset_summary["cell_type"] = dataset_summary["cell_type"].apply(lambda x: "|".join(x))
dataset_summary["tissue"] = dataset_summary["tissue"].apply(lambda x: "|".join(x))
dataset_summary["disease"] = dataset_summary["disease"].apply(lambda x: "|".join(x))
dataset_summary["assay"] = dataset_summary["assay"].apply(lambda x: "|".join(x))
dataset_summary["donor_id"] = dataset_summary["donor_id"].apply(lambda x: "|".join(x))

dataset_summary["size"] = human_metadata_df.groupby("dataset_id").size()

# reset index and create "dataset_id" col
dataset_summary = dataset_summary.reset_index(drop=False)

# Filter only the df that we decided to use
dataset_summary_filtered = dataset_summary[dataset_summary["dataset_id"].isin(dataset_ids)].copy()

print(f"Datasets selected: {len(dataset_summary_filtered.dataset_id.unique())}")

dataset_summary_filtered


  dataset_summary = human_metadata_df.groupby("dataset_id").agg({
  dataset_summary["size"] = human_metadata_df.groupby("dataset_id").size()


Datasets selected: 319


Unnamed: 0,dataset_id,cell_type,tissue,disease,assay,donor_id,size
1,00476f9f-ebc1-4b72-b541-32f912ce36ea,neuron|endothelial cell|vascular associated sm...,hypothalamus,normal,10x 3' v3,H19.30.001,10099
4,01209dce-3575-4bed-b1df-129f57fbc031,"CD8-positive, alpha-beta thymocyte|CD4-positiv...",lower lobe of left lung|bone marrow|bronchopul...,normal,10x 3' v2,Donor1|Donor2|DonorA|DonorB,51876
5,0129dbd9-a7d3-4f6b-96b9-1da155a93748,retinal rod cell|retinal cone cell|glial cell|...,macula lutea proper|fovea centralis|peripheral...,normal,10x 3' v3,19D014|19D013|19D015|19D016|D001-12|17D013,244474
7,019c7af2-c827-4454-9970-44d5e39ce068,enterocyte of epithelium of small intestine|in...,jejunum|ascending colon|ileum|transverse colon...,normal,10x 3' v3,Donor 2|Donor 1|Donor 3,12590
8,01ad3cd7-3929-4654-84c0-6db05bd5fd59,"CD4-positive, alpha-beta T cell|classical mono...",blood,COVID-19|normal|respiratory system disorder,10x 5' transcription profiling,Rep_C_1025|ICC_C_0004|Rep_C_1007|Rep_C_1001|Re...,600929
...,...,...,...,...,...,...,...
667,fa8605cf-f27e-44af-ac2a-476bee4410d3,"CD4-positive, alpha-beta T cell|monocyte|natur...",blood,COVID-19|normal,10x 5' v1,336|367|184|259|218|370|90|345|144|219|366|3|1...,59506
669,fbf173f9-f809-4d84-9b65-ae205d35b523,leukocyte|central nervous system macrophage|fi...,thalamic complex,normal,10x 3' v3,H18.30.002,17660
673,fe1a73ab-a203-45fd-84e9-0f7fd19efcbd,neuron|oligodendrocyte|oligodendrocyte precurs...,cerebral nuclei,normal,10x 3' v3,H19.30.001|H18.30.002|H19.30.002,35285
674,fe4b89d5-461e-440c-a5a8-621b37b122c0,stem cell|enterocyte|goblet cell|epithelial ce...,lamina propria of small intestine|epithelium o...,Crohn disease|normal,10x 3' v3|10x 3' v2,105446|127643|178961|158108|180749|110216|1991...,154136


In [13]:
# join new info with old one
df_info = pd.merge(
    left = df_info, 
    right = dataset_summary_filtered, 
    on = "dataset_id",
    how = "left"
)

df_info

Unnamed: 0,soma_joinid,citation,collection_id,collection_name,collection_doi,dataset_id,dataset_version_id,dataset_title,dataset_h5ad_path,dataset_total_cell_count,cell_type,tissue,disease,assay,donor_id,size
0,3,Publication: https://doi.org/10.1038/s41467-02...,bf325905-5e8e-42e3-933d-9a9053e9af80,Single-cell Atlas of common variable immunodef...,10.1038/s41467-022-29450-x,a5d95a42-0137-496f-8a60-101e17f263c8,40832710-d7b1-43fb-b2c2-1cd2255bc3ac,Steady-state B cells - scRNA-seq,a5d95a42-0137-496f-8a60-101e17f263c8.h5ad,1324,naive B cell|unswitched memory B cell|class sw...,blood,normal|common variable immunodeficiency,Smart-seq2,CONTROL|CVID,1324
1,4,Publication: https://doi.org/10.1038/s41590-02...,93eebe82-d8c3-41bc-a906-63b5b5f24a9d,Single-cell proteo-genomic reference maps of t...,10.1038/s41590-021-01059-0,d3566d6a-a455-4a15-980f-45eb29114cab,eb6c070c-ff67-4c1f-8d4d-65f9fe2119ee,blood and bone marrow from a healthy young donor,d3566d6a-a455-4a15-980f-45eb29114cab.h5ad,15502,plasma cell|erythroid progenitor cell|non-clas...,bone marrow|blood,normal,BD Rhapsody Targeted mRNA,198,15502
2,16,Publication: https://doi.org/10.1016/j.immuni....,29f92179-ca10-4309-a32b-d383d80347c1,Longitudinal profiling of respiratory and syst...,10.1016/j.immuni.2021.03.005,eec804b9-2ae5-44f0-a1b5-d721e21257de,761a676a-d686-4e42-8c21-a25d32bd819f,74 years old female - Airway Wash (5 days post...,eec804b9-2ae5-44f0-a1b5-d721e21257de.h5ad,1324,mature NK T cell|unknown|myeloid cell|epitheli...,respiratory airway,COVID-19,10x 3' v3,COV026,1324
3,21,Publication: https://doi.org/10.1016/j.immuni....,29f92179-ca10-4309-a32b-d383d80347c1,Longitudinal profiling of respiratory and syst...,10.1016/j.immuni.2021.03.005,b25f3834-69b3-4d87-a272-3938432d1f30,944d87b4-443d-494a-926e-e3c12e60ede4,82 years old female - Airway Wash (1 day post-...,b25f3834-69b3-4d87-a272-3938432d1f30.h5ad,1074,epithelial cell|mature NK T cell|myeloid cell|...,respiratory airway,COVID-19,10x 3' v3,COV027,1074
4,23,Publication: https://doi.org/10.1038/s41598-02...,1d1c7275-476a-49e2-9022-ad1b1c793594,Cell Atlas of The Human Fovea and Peripheral R...,10.1038/s41598-020-66092-9,2f6a20f1-173d-4b8d-860b-c47ffea120fa,1f88c4b8-7993-46ec-a817-dd85c315932f,Horizontal cells of the human fovea and periph...,2f6a20f1-173d-4b8d-860b-c47ffea120fa.h5ad,2868,retina horizontal cell,fovea centralis|peripheral region of retina,normal,10x 3' v2|10x 3' v3,sanes_Pt2|H3|H4|H5|H9|H11|H1,2868
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
314,790,Publication: https://doi.org/10.1126/science.a...,436154da-bcf1-4130-9c8b-120ff9a888f2,Single-cell RNA-seq reveals the cell-type-spec...,10.1126/science.abf1970,218acb0f-9f2f-4f76-b90b-15a4b7c7f629,cece7dd6-cad7-450d-8b1c-841d49552cea,multiplexed scRNA-seq of 1.2 million PBMCs fro...,218acb0f-9f2f-4f76-b90b-15a4b7c7f629.h5ad,1263676,"CD4-positive, alpha-beta T cell|classical mono...",blood,normal|systemic lupus erythematosus,10x 3' v2,HC-546|1132|FLARE006|1110|1479|1334|1333|IGTB6...,1263676
315,793,Publication: https://doi.org/10.1016/j.cell.20...,0a839c4b-10d0-4d64-9272-684c49a2c8ba,COVID-19 immune features revealed by a large-s...,10.1016/j.cell.2021.01.053,9dbab10c-118d-496b-966a-67f1763a6b7d,b50b6b99-fd44-4a6d-9ca8-b5b3479eabbd,Large-scale single-cell analysis reveals criti...,9dbab10c-118d-496b-966a-67f1763a6b7d.h5ad,1462702,"CD14-positive, CD16-negative classical monocyt...",blood|lung|saliva,COVID-19|normal,10x 3' v3|10x 5' v2,P-S070|P-S069|P-S071|P-S072|P-M056|P-M057|P-M0...,1462702
316,794,Publication: https://doi.org/10.1016/j.xgen.20...,af893e86-8e9f-41f1-a474-ef05359b1fb7,Single-cell transcriptomic atlas for adult hum...,10.1016/j.xgen.2023.100298,0129dbd9-a7d3-4f6b-96b9-1da155a93748,f8b6b116-b432-4b2c-9cf4-683ddab403c1,All major cell types in adult human retina,0129dbd9-a7d3-4f6b-96b9-1da155a93748.h5ad,244474,retinal rod cell|retinal cone cell|glial cell|...,macula lutea proper|fovea centralis|peripheral...,normal,10x 3' v3,19D014|19D013|19D015|19D016|D001-12|17D013,244474
317,799,Publication: https://doi.org/10.1126/science.a...,c114c20f-1ef4-49a5-9c2e-d965787fb90c,A human cell atlas of fetal gene expression,10.1126/science.aba7721,f7c1c579-2dc0-47e2-ba19-8165c5a0e353,0fd7ede0-7d5d-4f1a-852e-f4facbc9a46f,Survey of human embryonic development,f7c1c579-2dc0-47e2-ba19-8165c5a0e353.h5ad,4062980,Mueller cell|amacrine cell|photoreceptor cell|...,eye|adrenal gland|cerebellum|telencephalon|hea...,normal|trisomy 18,sci-RNA-seq,H27552|H27458|H27634|H27472|H27620|H26350|H274...,4062980


# Add Authors Info

As some datasets come from same collection and thus same article/first authur, in order to differenciate between them we decided to just add _N.

In [14]:
display(df_info.head(1))
df_info.iloc[0, 2]

Unnamed: 0,soma_joinid,citation,collection_id,collection_name,collection_doi,dataset_id,dataset_version_id,dataset_title,dataset_h5ad_path,dataset_total_cell_count,cell_type,tissue,disease,assay,donor_id,size
0,3,Publication: https://doi.org/10.1038/s41467-02...,bf325905-5e8e-42e3-933d-9a9053e9af80,Single-cell Atlas of common variable immunodef...,10.1038/s41467-022-29450-x,a5d95a42-0137-496f-8a60-101e17f263c8,40832710-d7b1-43fb-b2c2-1cd2255bc3ac,Steady-state B cells - scRNA-seq,a5d95a42-0137-496f-8a60-101e17f263c8.h5ad,1324,naive B cell|unswitched memory B cell|class sw...,blood,normal|common variable immunodeficiency,Smart-seq2,CONTROL|CVID,1324


'bf325905-5e8e-42e3-933d-9a9053e9af80'

In [15]:
# Create the title for each dataset
save_titles = []
missing_titles = [] # soma_joinid

for i, row in tqdm(df_info.iterrows(), total=df_info.shape[0], desc="Processing rows"):

    # Extract the DOI from the "citation" column
    doi = row["citation"].split(" ")[1]  # e.g., "https://doi.org/10.1038/s41467-022-29450-x"
    doi = doi.split("https://doi.org/")[-1]

    rep = 1

    # Fetch metadata in JSON format
    try:
        metadata = cn.content_negotiation(ids=doi, format = "citeproc-json")

        # Extract and display author information
        metadata_dict = json.loads(metadata)
        #for key, value in metadata_dict.items():
        #    print(f"Key: {key}, Value: {value}")

        # Extract author information
        authors = metadata_dict.get("author", [])
        years = metadata_dict.get("published", [])
        ISNNs = metadata_dict.get("ISSN", [])

        first_author = authors[0]["family"]
        year =  years['date-parts'][0][0]

    except:
        missing_titles.append(row['soma_joinid']) # qunieu identifier of the dataset
        first_author = "missing"
        year =  "missing"
        print()

    finally:        
        save_title = f"{first_author}_{year}_{rep}"

        # Manage case of datasets in same article
        while (save_title in save_titles):
            rep +=1
            save_title = f"{first_author}_{year}_{rep}"

        
        save_titles.append(save_title)


df_info["save_title"] = save_titles


Processing rows:   0%|          | 0/319 [00:00<?, ?it/s]

Processing rows:  28%|██▊       | 90/319 [00:22<00:56,  4.08it/s]




Processing rows:  51%|█████     | 162/319 [00:40<00:38,  4.08it/s]




Processing rows:  54%|█████▍    | 172/319 [00:43<00:38,  3.81it/s]




Processing rows:  69%|██████▊   | 219/319 [00:55<00:24,  4.13it/s]




Processing rows:  71%|███████▏  | 228/319 [00:57<00:22,  3.98it/s]




Processing rows:  87%|████████▋ | 276/319 [01:09<00:10,  4.11it/s]




Processing rows:  88%|████████▊ | 281/319 [01:11<00:09,  4.12it/s]




Processing rows:  89%|████████▉ | 285/319 [01:12<00:08,  3.81it/s]




Processing rows:  92%|█████████▏| 295/319 [01:14<00:05,  4.06it/s]




Processing rows:  93%|█████████▎| 297/319 [01:15<00:05,  4.17it/s]




Processing rows:  97%|█████████▋| 310/319 [01:18<00:02,  3.99it/s]




Processing rows:  98%|█████████▊| 314/319 [01:19<00:01,  4.04it/s]




Processing rows: 100%|██████████| 319/319 [01:20<00:00,  3.95it/s]


In [16]:
# Show the dataset where we were not able to find a title
pd.set_option('display.max_colwidth', None)  # Allows columns to expand as needed
df_info[df_info.soma_joinid.isin(missing_titles)][["soma_joinid", "save_title", "citation"]]

Unnamed: 0,soma_joinid,save_title,citation
89,254,missing_missing_1,Dataset Version: https://datasets.cellxgene.cziscience.com/c407eff2-0315-416b-a3da-acf5b7a271e9.h5ad curated and distributed by CZ CELLxGENE Discover in Collection: https://cellxgene.cziscience.com/collections/10bf5c50-8d85-4c5f-94b4-22c1363d9f31
161,408,missing_missing_2,Dataset Version: https://datasets.cellxgene.cziscience.com/64cb5e87-ba99-4180-8114-252a653394c5.h5ad curated and distributed by CZ CELLxGENE Discover in Collection: https://cellxgene.cziscience.com/collections/cae8bad0-39e9-4771-85a7-822b0e06de9f
171,419,missing_missing_3,Publication: https://doi.org/10.1101/2020.11.20.20227355 Dataset Version: https://datasets.cellxgene.cziscience.com/cd0bb8f2-087d-4c1b-a037-8f14551693a3.h5ad curated and distributed by CZ CELLxGENE Discover in Collection: https://cellxgene.cziscience.com/collections/0434a9d4-85fd-4554-b8e3-cf6c582bb2fa
218,519,missing_missing_4,Dataset Version: https://datasets.cellxgene.cziscience.com/a99e616d-3455-44b0-9028-87e6c3007972.h5ad curated and distributed by CZ CELLxGENE Discover in Collection: https://cellxgene.cziscience.com/collections/a98b828a-622a-483a-80e0-15703678befd
227,549,missing_missing_5,Publication: https://doi.org/10.1101/2020.11.20.20227355 Dataset Version: https://datasets.cellxgene.cziscience.com/5a26b6b4-c656-42db-85a1-d2e531205317.h5ad curated and distributed by CZ CELLxGENE Discover in Collection: https://cellxgene.cziscience.com/collections/eb735cc9-d0a7-48fa-b255-db726bf365af
275,658,missing_missing_6,Dataset Version: https://datasets.cellxgene.cziscience.com/442baf4a-f707-41d7-abb8-777e6851b26e.h5ad curated and distributed by CZ CELLxGENE Discover in Collection: https://cellxgene.cziscience.com/collections/6b701826-37bb-4356-9792-ff41fc4c3161
280,673,missing_missing_7,Dataset Version: https://datasets.cellxgene.cziscience.com/51101a5a-7f7e-4559-b67a-5f5fd484569c.h5ad curated and distributed by CZ CELLxGENE Discover in Collection: https://cellxgene.cziscience.com/collections/0f528c8a-a25c-4840-8fa3-d156fa11086f
284,691,missing_missing_8,Dataset Version: https://datasets.cellxgene.cziscience.com/4631cbcf-d16d-4866-bc64-b9fcaab01087.h5ad curated and distributed by CZ CELLxGENE Discover in Collection: https://cellxgene.cziscience.com/collections/cae8bad0-39e9-4771-85a7-822b0e06de9f
294,723,missing_missing_9,Dataset Version: https://datasets.cellxgene.cziscience.com/b2f1886e-a3aa-4d48-80c9-dcdd44e89e02.h5ad curated and distributed by CZ CELLxGENE Discover in Collection: https://cellxgene.cziscience.com/collections/0f528c8a-a25c-4840-8fa3-d156fa11086f
296,730,missing_missing_10,Dataset Version: https://datasets.cellxgene.cziscience.com/4b0fe297-fd25-4fee-bb1d-93dd554f4f90.h5ad curated and distributed by CZ CELLxGENE Discover in Collection: https://cellxgene.cziscience.com/collections/4a9fd4d7-d870-4265-89a5-ad51ab811d89


In [17]:
# Manually write info of missing articles
df_missing_titles = pd.DataFrame(
    {
        #85: "Bhattacharya_2021_1",
        #241: "Fenton_2023_1",
        #251: "Fenton_2023_2",
        254: "Hernández_2022_1",
        #360: "Fenton_2023_3",
        #365: "Zheng_2023_1",
        #388: "Fenton_2023_4",
        408: "Nascimento_2023_1",
        419: "Ballestar_2020_1",
        519: "Bitzer_2022_1",  # noPublication
        #544: "Easter_2023_1",
        549: "Ballestar_2023_2",
        #606: "Ulrich_2021_1",
        658: "missing_missing_missing",  # NoInformations
        673: "Lake_2023_1",
        691: "Nascimento_2023_2",
        723: "Lake_2023_2",
        730: "missing_missing_missing",  # NoInformations
        #731: "Hoo_2023_1",
        774: "Stephenson_2021_1",
        786: "Kock_2024_1",
        #809: "Gabitto_2024_1",
    },
    index=["save_title"]
).transpose()

df_missing_titles = df_missing_titles.reset_index(names="soma_joinid")

df_missing_titles

Unnamed: 0,soma_joinid,save_title
0,254,Hernández_2022_1
1,408,Nascimento_2023_1
2,419,Ballestar_2020_1
3,519,Bitzer_2022_1
4,549,Ballestar_2023_2
5,658,missing_missing_missing
6,673,Lake_2023_1
7,691,Nascimento_2023_2
8,723,Lake_2023_2
9,730,missing_missing_missing


In [18]:
# check if osm eof the save_titles given manually are alredy inside the df_inof
common_elements = [item for item in df_missing_titles.save_title if item in df_info.save_title]
common_elements


[]

In [19]:
# Map missing titles

for i, row in df_info.iterrows():
    soma_joinid = row["soma_joinid"]
    if soma_joinid in missing_titles:
        save_title = df_missing_titles.loc[df_missing_titles.soma_joinid == soma_joinid, "save_title"].iloc[0]
        df_info.loc[df_info.soma_joinid == soma_joinid, "save_title"] = save_title
        print(f"{soma_joinid} title substituted --> {save_title}")

254 title substituted --> Hernández_2022_1
408 title substituted --> Nascimento_2023_1
419 title substituted --> Ballestar_2020_1
519 title substituted --> Bitzer_2022_1
549 title substituted --> Ballestar_2023_2
658 title substituted --> missing_missing_missing
673 title substituted --> Lake_2023_1
691 title substituted --> Nascimento_2023_2
723 title substituted --> Lake_2023_2
730 title substituted --> missing_missing_missing
774 title substituted --> Stephenson_2021_1
786 title substituted --> Kock_2024_1


In [20]:
# check after remapping
df_info[df_info.soma_joinid.isin(missing_titles)][["soma_joinid", "save_title", "citation"]]

Unnamed: 0,soma_joinid,save_title,citation
89,254,Hernández_2022_1,Dataset Version: https://datasets.cellxgene.cziscience.com/c407eff2-0315-416b-a3da-acf5b7a271e9.h5ad curated and distributed by CZ CELLxGENE Discover in Collection: https://cellxgene.cziscience.com/collections/10bf5c50-8d85-4c5f-94b4-22c1363d9f31
161,408,Nascimento_2023_1,Dataset Version: https://datasets.cellxgene.cziscience.com/64cb5e87-ba99-4180-8114-252a653394c5.h5ad curated and distributed by CZ CELLxGENE Discover in Collection: https://cellxgene.cziscience.com/collections/cae8bad0-39e9-4771-85a7-822b0e06de9f
171,419,Ballestar_2020_1,Publication: https://doi.org/10.1101/2020.11.20.20227355 Dataset Version: https://datasets.cellxgene.cziscience.com/cd0bb8f2-087d-4c1b-a037-8f14551693a3.h5ad curated and distributed by CZ CELLxGENE Discover in Collection: https://cellxgene.cziscience.com/collections/0434a9d4-85fd-4554-b8e3-cf6c582bb2fa
218,519,Bitzer_2022_1,Dataset Version: https://datasets.cellxgene.cziscience.com/a99e616d-3455-44b0-9028-87e6c3007972.h5ad curated and distributed by CZ CELLxGENE Discover in Collection: https://cellxgene.cziscience.com/collections/a98b828a-622a-483a-80e0-15703678befd
227,549,Ballestar_2023_2,Publication: https://doi.org/10.1101/2020.11.20.20227355 Dataset Version: https://datasets.cellxgene.cziscience.com/5a26b6b4-c656-42db-85a1-d2e531205317.h5ad curated and distributed by CZ CELLxGENE Discover in Collection: https://cellxgene.cziscience.com/collections/eb735cc9-d0a7-48fa-b255-db726bf365af
275,658,missing_missing_missing,Dataset Version: https://datasets.cellxgene.cziscience.com/442baf4a-f707-41d7-abb8-777e6851b26e.h5ad curated and distributed by CZ CELLxGENE Discover in Collection: https://cellxgene.cziscience.com/collections/6b701826-37bb-4356-9792-ff41fc4c3161
280,673,Lake_2023_1,Dataset Version: https://datasets.cellxgene.cziscience.com/51101a5a-7f7e-4559-b67a-5f5fd484569c.h5ad curated and distributed by CZ CELLxGENE Discover in Collection: https://cellxgene.cziscience.com/collections/0f528c8a-a25c-4840-8fa3-d156fa11086f
284,691,Nascimento_2023_2,Dataset Version: https://datasets.cellxgene.cziscience.com/4631cbcf-d16d-4866-bc64-b9fcaab01087.h5ad curated and distributed by CZ CELLxGENE Discover in Collection: https://cellxgene.cziscience.com/collections/cae8bad0-39e9-4771-85a7-822b0e06de9f
294,723,Lake_2023_2,Dataset Version: https://datasets.cellxgene.cziscience.com/b2f1886e-a3aa-4d48-80c9-dcdd44e89e02.h5ad curated and distributed by CZ CELLxGENE Discover in Collection: https://cellxgene.cziscience.com/collections/0f528c8a-a25c-4840-8fa3-d156fa11086f
296,730,missing_missing_missing,Dataset Version: https://datasets.cellxgene.cziscience.com/4b0fe297-fd25-4fee-bb1d-93dd554f4f90.h5ad curated and distributed by CZ CELLxGENE Discover in Collection: https://cellxgene.cziscience.com/collections/4a9fd4d7-d870-4265-89a5-ad51ab811d89


In [21]:
# Remove articles that still have no info
pd.set_option('display.max_colwidth', 20)  # Allows columns to expand as needed

print("Remove article due to missing info:")
display(df_info[df_info.save_title == "missing_missing_missing"])

df_info = df_info[~(df_info.save_title == "missing_missing_missing")]
print(f"Final number of datsets: {df_info.shape}")

Remove article due to missing info:


Unnamed: 0,soma_joinid,citation,collection_id,collection_name,collection_doi,dataset_id,dataset_version_id,dataset_title,dataset_h5ad_path,dataset_total_cell_count,cell_type,tissue,disease,assay,donor_id,size,save_title
275,658,Dataset Version:...,6b701826-37bb-43...,Abdominal White ...,,9d8e5dca-03a3-45...,442baf4a-f707-41...,22 integrated sa...,9d8e5dca-03a3-45...,72335,myeloid cell|adi...,subcutaneous abd...,normal,10x 3' v3,donor-GOLD|donor...,72335,missing_missing_...
296,730,Dataset Version:...,4a9fd4d7-d870-42...,ScaleBio Single ...,,2c820d53-cbd7-4e...,4b0fe297-fd25-4f...,ScaleBio Single ...,2c820d53-cbd7-4e...,685024,naive B cell|CD1...,blood,normal,ScaleBio single ...,allcells:889004399,685024,missing_missing_...


Final number of datsets: (317, 17)


# Remove Cencer Datasets

In [23]:
df_info["disease"].values

array(['normal|common variable immunodeficiency', 'normal', 'COVID-19',
       'COVID-19', 'normal', 'COVID-19',
       'acute myeloid leukemia|acute promyelocytic leukemia', 'COVID-19',
       'COVID-19', 'COVID-19', 'normal', 'COVID-19', 'COVID-19',
       'COVID-19', 'COVID-19', 'glioblastoma', 'COVID-19', 'normal',
       'normal', 'chromophobe renal cell carcinoma', 'normal',
       'normal|multiple sclerosis', 'COVID-19', 'anencephaly', 'COVID-19',
       'COVID-19', 'normal', 'COVID-19', 'COVID-19', 'normal', 'normal',
       'COVID-19', 'COVID-19', 'COVID-19', 'adenocarcinoma', 'normal',
       'normal', 'COVID-19', 'adenocarcinoma',
       'neuroendocrine carcinoma|adenocarcinoma', 'COVID-19', 'COVID-19',
       'normal', 'Wilms tumor', 'COVID-19', 'normal', 'COVID-19',
       'COVID-19', 'normal', 'Alzheimer disease|temporal lobe epilepsy',
       'normal', 'normal', 'normal', 'normal', 'normal', 'normal',
       'normal', 'normal', 'normal|type 2 diabetes mellitus', 'normal'

In [26]:
# Use chatGPT to obtain a good list
cancer_related_diseases = [
    'acute myeloid leukemia', 'acute promyelocytic leukemia',
    'adenocarcinoma', 'B-cell non-Hodgkin lymphoma', 'follicular lymphoma',
    'breast carcinoma', 'breast cancer', 'blastoma',
    'chromophobe renal cell carcinoma', 'B-cell acute lymphoblastic leukemia',
    'colorectal cancer', 'colorectal neoplasm', 'benign prostatic hyperplasia',
    'colon sessile serrated adenoma/polyp', 'hyperplastic polyp', 'tubulovillous adenoma',
    'glioblastoma', 'B-cell acute lymphoblastic leukemia', 
    'kidney benign neoplasm', 'kidney oncocytoma', 'nonpapillary renal cell carcinoma',
    'lung adenocarcinoma', 'malignant pancreatic neoplasm',
    'luminal A breast carcinoma', 'luminal B breast carcinoma', 'small cell lung carcinoma',
    'neuroendocrine carcinoma', 'plasma cell myeloma', 'tubular adenoma', 'triple-negative breast carcinoma',
    'Wilms tumor', 'clear cell renal carcinoma', 'pilocytic astrocytoma', 'premalignant hematological system disease'
]

# Check for specific disease
#df_info[df_info['disease'].apply(lambda x: 'colorectal cancer' in x.split('|'))]

# filter
df_info = df_info[ ~ df_info['disease'].apply(lambda x: any(disease in x.split('|') for disease in cancer_related_diseases))]

# After filtering what remains
print(df_info.shape)
print(df_info.disease.unique())


(290, 17)
['normal|common variable immunodeficiency' 'normal' 'COVID-19'
 'normal|multiple sclerosis' 'anencephaly'
 'Alzheimer disease|temporal lobe epilepsy'
 'normal|type 2 diabetes mellitus' 'Crohn ileitis'
 'normal|Alzheimer disease' 'common variable immunodeficiency|normal'
 'myocardial infarction' 'normal|cataract' 'Crohn disease|normal'
 'COVID-19|normal' 'Down syndrome' 'Alzheimer disease|normal'
 'normal|pulmonary emphysema'
 'normal|COVID-19|respiratory failure|long COVID-19'
 'normal|macular degeneration' 'COVID-19|normal|influenza'
 'digestive system disorder' 'normal|COVID-19'
 'normal|periodontitis|gingivitis' 'epilepsy'
 'normal|type 1 diabetes mellitus'
 'normal|congenital heart disease|acute myocardial infarction|heart failure'
 'normal|opiate dependence' 'normal|hydrosalpinx'
 'normal|frontotemporal dementia'
 'normal|acute kidney failure|chronic kidney disease'
 'normal|myocardial infarction'
 'amyotrophic lateral sclerosis|amyotrophic lateral sclerosis 26 with or w

# Save

In [32]:
# Save df_info 
    # Later i can see the realtive info of the datset downlaode
    # e.g. mapping between datset_id dataset_name

df_info.to_csv(os.path.join(SAVE_BASE_FOLDER, "datasets_info.csv"))

df_info.head(2)

Unnamed: 0,soma_joinid,citation,collection_id,collection_name,collection_doi,dataset_id,dataset_version_id,dataset_title,dataset_h5ad_path,dataset_total_cell_count,cell_type,tissue,disease,assay,donor_id,size,save_title
0,3,Publication: htt...,bf325905-5e8e-42...,Single-cell Atla...,10.1038/s41467-0...,a5d95a42-0137-49...,40832710-d7b1-43...,Steady-state B c...,a5d95a42-0137-49...,1324,naive B cell|uns...,blood,normal|common va...,Smart-seq2,CONTROL|CVID,1324,Rodríguez-Ubreva...
1,4,Publication: htt...,93eebe82-d8c3-41...,Single-cell prot...,10.1038/s41590-0...,d3566d6a-a455-4a...,eb6c070c-ff67-4c...,blood and bone m...,d3566d6a-a455-4a...,15502,plasma cell|eryt...,bone marrow|blood,normal,BD Rhapsody Targ...,198,15502,Triana_2021_1


In [40]:
# Number of unique datasets
num_datasets = df_info["dataset_id"].nunique()
print(f"Number of unique datasets: {num_datasets}")

# Number of unique collections
num_collections = df_info["collection_id"].nunique()
print(f"Number of unique collections: {num_collections}")

# Number of unique cell types
all_cell_types = df_info['cell_type'].apply(lambda x: x.split('|')).explode().unique()
num_cell_types = len(all_cell_types)
print(f"Number of unique cell types: {num_cell_types}")

# Number of unique diseases
all_disease = df_info['disease'].apply(lambda x: x.split('|')).explode().unique()
num_diseases = len(all_disease)
print(f"Number of unique diseases: {num_diseases}")

# List of unique diseases
print("Unique diseases:", all_disease)


Number of unique datasets: 290
Number of unique collections: 112
Number of unique cell types: 593
Number of unique diseases: 40
Unique diseases: ['normal' 'common variable immunodeficiency' 'COVID-19'
 'multiple sclerosis' 'anencephaly' 'Alzheimer disease'
 'temporal lobe epilepsy' 'type 2 diabetes mellitus' 'Crohn ileitis'
 'myocardial infarction' 'cataract' 'Crohn disease' 'Down syndrome'
 'pulmonary emphysema' 'respiratory failure' 'long COVID-19'
 'macular degeneration' 'influenza' 'digestive system disorder'
 'periodontitis' 'gingivitis' 'epilepsy' 'type 1 diabetes mellitus'
 'congenital heart disease' 'acute myocardial infarction' 'heart failure'
 'opiate dependence' 'hydrosalpinx' 'frontotemporal dementia'
 'acute kidney failure' 'chronic kidney disease'
 'amyotrophic lateral sclerosis'
 'amyotrophic lateral sclerosis 26 with or without frontotemporal dementia'
 'toxoplasmosis' 'Plasmodium malariae malaria' 'listeriosis'
 'respiratory system disorder' 'systemic lupus erythematos

# Download datasets

In [29]:
max = 30 # Max number of df to downloas
collection_id = [] #To download datsets form differetn collections/pubblication --> ensure balance

i = 0
for index, row in df_info.iterrows():

    print()
    print(f"Daset {index+1}/{df_info.shape[0]}")

    if i == max:
        break
    if row['collection_id'] in collection_id:
        print("Dataset present in an alredy downloaded collection/publication.")
        continue

    print(f"Downloading dataset {row['save_title']} | {row['dataset_title']} | id:{row['dataset_id']}")

    save_path = os.path.join(
            SAVE_BASE_FOLDER, 
            f"{row['save_title']}.h5ad"
            )
    
    if not Path(save_path).exists(): #download_source_h5ad() raises an exceptrion if the file alredy exixts
        cellxgene_census.download_source_h5ad(
            dataset_id=row["dataset_id"],
            to_path=save_path,
            progress_bar=True
        )
    else:
        print("\tAlredy downloaded.")
    
    collection_id.append(row['collection_id'])
    i+=1

collection_id


Daset 1/290
Downloading dataset Rodríguez-Ubreva_2022_1 | Steady-state B cells - scRNA-seq | id:a5d95a42-0137-496f-8a60-101e17f263c8


Downloading: 100%|██████████| 9.39M/9.39M [00:02<00:00, 4.75MB/s]



Daset 2/290
Downloading dataset Triana_2021_1 | blood and bone marrow from a healthy young donor | id:d3566d6a-a455-4a15-980f-45eb29114cab


Downloading: 100%|██████████| 9.50M/9.50M [00:02<00:00, 4.50MB/s]



Daset 3/290
Downloading dataset Szabo_2021_1 | 74 years old female - Airway Wash (5 days post-intubation) | id:eec804b9-2ae5-44f0-a1b5-d721e21257de


Downloading: 100%|██████████| 15.7M/15.7M [00:02<00:00, 5.71MB/s]



Daset 4/290
Dataset present in an alredy downloaded collection/publication.

Daset 5/290
Downloading dataset Yan_2020_1 | Horizontal cells of the human fovea and peripheral retina | id:2f6a20f1-173d-4b8d-860b-c47ffea120fa


Downloading: 100%|██████████| 19.5M/19.5M [00:02<00:00, 7.16MB/s]



Daset 6/290
Dataset present in an alredy downloaded collection/publication.

Daset 8/290
Dataset present in an alredy downloaded collection/publication.

Daset 9/290
Dataset present in an alredy downloaded collection/publication.

Daset 10/290
Dataset present in an alredy downloaded collection/publication.

Daset 11/290
Downloading dataset Enge_2017_1 | Single cell transcriptome analysis of human pancreas reveals transcriptional signatures of aging and somatic mutation patterns | id:66d15835-5dc8-4e96-b0eb-f48971cb65e8


Downloading: 100%|██████████| 27.3M/27.3M [00:04<00:00, 6.29MB/s]



Daset 12/290
Dataset present in an alredy downloaded collection/publication.

Daset 13/290
Dataset present in an alredy downloaded collection/publication.

Daset 14/290
Dataset present in an alredy downloaded collection/publication.

Daset 15/290
Dataset present in an alredy downloaded collection/publication.

Daset 17/290
Dataset present in an alredy downloaded collection/publication.

Daset 18/290
Downloading dataset Yu_2021_1 | Adult duodenum | id:ee195b7d-184d-4dfa-9b1c-51a7e601ac11


Downloading: 100%|██████████| 34.0M/34.0M [00:03<00:00, 8.96MB/s]



Daset 19/290
Downloading dataset Siletti_2023_1 | Dissection: Midbrain (RN) - Red Nucleus - RN | id:470565f2-5afc-456a-b617-18e4496c04fd


Downloading: 100%|██████████| 38.2M/38.2M [00:57<00:00, 700kB/s] 



Daset 21/290
Downloading dataset Muraro_2016_1 | A Single-Cell Transcriptome Atlas of the Human Pancreas | id:b07e5164-baf6-43d2-bdba-5a249d0da879


Downloading: 100%|██████████| 39.7M/39.7M [00:04<00:00, 9.89MB/s]



Daset 22/290
Downloading dataset Jäkel_2019_1 | Oligodendrocytes in MS | id:dc30c3ec-46d6-4cd8-8ec1-b544a3d0f503


Downloading: 100%|██████████| 44.5M/44.5M [00:05<00:00, 9.00MB/s]



Daset 23/290
Dataset present in an alredy downloaded collection/publication.

Daset 24/290
Downloading dataset Bhattacharya_2020_1 | Single cell transcriptomic profiling identifies molecular phenotypes of newborn human lung cells | id:6e00ccf7-0749-46ef-a999-dba785630d52


Downloading: 100%|██████████| 46.5M/46.5M [00:05<00:00, 8.43MB/s]



Daset 25/290
Dataset present in an alredy downloaded collection/publication.

Daset 26/290
Dataset present in an alredy downloaded collection/publication.

Daset 27/290
Dataset present in an alredy downloaded collection/publication.

Daset 28/290
Dataset present in an alredy downloaded collection/publication.

Daset 29/290
Dataset present in an alredy downloaded collection/publication.

Daset 30/290
Downloading dataset Menon_2019_1 | Retina | id:856c1b98-5727-49da-bf0f-151bdb8cb056


Downloading: 100%|██████████| 53.3M/53.3M [00:34<00:00, 1.60MB/s]



Daset 31/290
Dataset present in an alredy downloaded collection/publication.

Daset 32/290
Dataset present in an alredy downloaded collection/publication.

Daset 33/290
Dataset present in an alredy downloaded collection/publication.

Daset 34/290
Dataset present in an alredy downloaded collection/publication.

Daset 36/290
Downloading dataset Zhang_2021_2 | normal - Single-cell analyses of renal cell cancers reveal insights into tumor microenvironment, cell of origin, and therapy response | id:f801b7a9-80a6-4d09-9161-71474deb58ae


Downloading: 100%|██████████| 56.8M/56.8M [00:05<00:00, 10.5MB/s]



Daset 37/290
Dataset present in an alredy downloaded collection/publication.

Daset 38/290
Dataset present in an alredy downloaded collection/publication.

Daset 41/290
Dataset present in an alredy downloaded collection/publication.

Daset 42/290
Dataset present in an alredy downloaded collection/publication.

Daset 43/290
Dataset present in an alredy downloaded collection/publication.

Daset 45/290
Dataset present in an alredy downloaded collection/publication.

Daset 46/290
Dataset present in an alredy downloaded collection/publication.

Daset 47/290
Dataset present in an alredy downloaded collection/publication.

Daset 48/290
Dataset present in an alredy downloaded collection/publication.

Daset 49/290
Downloading dataset Mimpen_2024_1 | Single nucleus transcriptomic profiling of human healthy hamstring tendon | id:06ef6b36-6c9b-4e10-8a94-d0baf274276e


Downloading: 100%|██████████| 88.5M/88.5M [00:25<00:00, 3.64MB/s]



Daset 50/290
Downloading dataset Olah_2020_1 | Olah et al (2020) Single-cell Human Microglia | id:a1b9c51e-a408-4f7f-bccb-abefe20ae2a5


Downloading: 100%|██████████| 92.2M/92.2M [00:10<00:00, 9.38MB/s]



Daset 51/290
Dataset present in an alredy downloaded collection/publication.

Daset 52/290
Downloading dataset Solé-Boldo_2020_1 | Single-cell transcriptomes of the human skin reveal age-related loss of fibroblast priming | id:124744b8-4681-474a-9894-683896122708


Downloading: 100%|██████████| 95.4M/95.4M [00:11<00:00, 8.64MB/s]



Daset 53/290
Downloading dataset Fan_2019_1 | Single-cell reconstruction of follicular remodeling in the human adult ovary | id:1f1c5c14-5949-4c81-b28e-b272e271b672


Downloading: 100%|██████████| 101M/101M [00:23<00:00, 4.57MB/s] 



Daset 54/290
Dataset present in an alredy downloaded collection/publication.

Daset 55/290
Dataset present in an alredy downloaded collection/publication.

Daset 56/290
Dataset present in an alredy downloaded collection/publication.

Daset 57/290
Dataset present in an alredy downloaded collection/publication.

Daset 58/290
Dataset present in an alredy downloaded collection/publication.

Daset 59/290
Downloading dataset Calandrelli_2020_1 | scRNA-seq data analysis of endothelium-enriched mesenteric arterial tissues from human donors | id:42b6a476-c51d-4f8b-b68b-44941b3a11bf


Downloading: 100%|██████████| 118M/118M [00:10<00:00, 11.7MB/s] 



Daset 60/290
Dataset present in an alredy downloaded collection/publication.

Daset 61/290
Dataset present in an alredy downloaded collection/publication.

Daset 62/290
Downloading dataset Martin_2019_1 | Ileum | id:36c867a7-be10-4e69-9b39-5de12b0af6da


Downloading: 100%|██████████| 121M/121M [00:09<00:00, 12.8MB/s] 



Daset 63/290
Dataset present in an alredy downloaded collection/publication.

Daset 64/290
Dataset present in an alredy downloaded collection/publication.

Daset 65/290
Downloading dataset Leng_2021_1 | Molecular characterization of selectively vulnerable neurons in Alzheimer’s Disease: Entorhinal Cortex | id:2727d83a-0af0-443a-bff8-58dc7028289a


Downloading: 100%|██████████| 124M/124M [00:34<00:00, 3.80MB/s] 



Daset 66/290
Dataset present in an alredy downloaded collection/publication.

Daset 67/290
Dataset present in an alredy downloaded collection/publication.

Daset 68/290
Dataset present in an alredy downloaded collection/publication.

Daset 69/290
Downloading dataset Lukassen_2020_1 | Airway | id:8fcf0ccc-67b2-43a3-90be-075f85169bef


Downloading: 100%|██████████| 132M/132M [00:11<00:00, 12.4MB/s] 



Daset 70/290
Downloading dataset Smith_2021_1 | Infant human neocortex cells | id:82f6af6d-5313-439a-9936-5e844be49a70


Downloading: 100%|██████████| 132M/132M [00:10<00:00, 12.8MB/s] 



Daset 71/290
Dataset present in an alredy downloaded collection/publication.

Daset 72/290
Dataset present in an alredy downloaded collection/publication.

Daset 73/290
Dataset present in an alredy downloaded collection/publication.

Daset 74/290
Downloading dataset Wiedemann_2023_1 | UMAP visualization of all 12 datasets | id:de94c504-4b58-4f42-b68d-74a8e4892f0e


Downloading: 100%|██████████| 134M/134M [00:12<00:00, 11.4MB/s] 



Daset 76/290
Dataset present in an alredy downloaded collection/publication.

Daset 77/290
Dataset present in an alredy downloaded collection/publication.

Daset 78/290
Dataset present in an alredy downloaded collection/publication.

Daset 79/290
Downloading dataset Muto_2021_1 | Single cell transcriptional and chromatin accessibility profiling redefine cellular heterogeneity in the adult human kidney - RNAseq | id:9df60c57-fdf3-4e93-828e-fe9303f20438


Downloading: 100%|██████████| 158M/158M [00:12<00:00, 13.7MB/s] 



Daset 80/290
Downloading dataset Kuppe_2022_1 | Ischemia-snRNA-Spatial multi-omic map of human myocardial infarction | id:f15e263b-6544-46cb-a46e-e33ab7ce8347


Downloading: 100%|██████████| 158M/158M [00:11<00:00, 14.5MB/s] 



Daset 81/290
Dataset present in an alredy downloaded collection/publication.

Daset 82/290
Dataset present in an alredy downloaded collection/publication.

Daset 83/290
Dataset present in an alredy downloaded collection/publication.

Daset 84/290
Dataset present in an alredy downloaded collection/publication.

Daset 85/290
Downloading dataset Fenton_2021_1 | Lamina propria Epcam-CD235ab-CD45-CD31- | id:33911db3-f461-464b-8083-a397ab616a09


Downloading: 100%|██████████| 164M/164M [00:25<00:00, 6.72MB/s] 



Daset 86/290
Downloading dataset Jardine_2021_1 | Human Fetal Bone Marrow (CITE-seq) | id:343ff97c-85df-494b-8400-beb937618611


Downloading: 100%|██████████| 165M/165M [00:11<00:00, 15.0MB/s] 



Daset 87/290
Downloading dataset King_2021_1 | Human tonsil memory B cells scRNA | id:7970bd6b-f752-47a9-8643-2af16855ec49


Downloading: 100%|██████████| 169M/169M [01:08<00:00, 2.60MB/s] 



Daset 88/290
Dataset present in an alredy downloaded collection/publication.

Daset 89/290
Dataset present in an alredy downloaded collection/publication.

Daset 91/290
Dataset present in an alredy downloaded collection/publication.

Daset 92/290
Dataset present in an alredy downloaded collection/publication.

Daset 93/290
Downloading dataset Travaglini_2020_1 | Krasnow Lab Human Lung Cell Atlas, Smart-seq2 | id:e04daea4-4412-45b5-989e-76a9be070a89


Downloading: 100%|██████████| 179M/179M [00:18<00:00, 10.3MB/s] 



Daset 94/290
Downloading dataset van Zyl_2022_1 | Lens | id:489318a0-24c3-4f5c-b105-f084ed0ea026


Downloading: 100%|██████████| 179M/179M [00:25<00:00, 7.30MB/s] 



Daset 95/290
Dataset present in an alredy downloaded collection/publication.

Daset 96/290
Downloading dataset Lukowski_2019_1 | Retina | id:d5c67a4e-a8d9-456d-a273-fa01adb1b308


Downloading: 100%|██████████| 183M/183M [00:18<00:00, 10.6MB/s] 


Daset 97/290





['bf325905-5e8e-42e3-933d-9a9053e9af80',
 '93eebe82-d8c3-41bc-a906-63b5b5f24a9d',
 '29f92179-ca10-4309-a32b-d383d80347c1',
 '1d1c7275-476a-49e2-9022-ad1b1c793594',
 'a238e9fa-2bdf-41df-8522-69046f99baff',
 'dfc09a93-bce0-4c77-893d-e153d1b7f9fa',
 '283d65eb-dd53-496d-adb7-7570c7caa443',
 '6e8c5415-302c-492a-a5f9-f29c57ff18fb',
 '16c1e722-96ae-4bf6-b408-cd7f8918484f',
 '28e9d721-6816-48a2-8d0b-43bf0b0c0ebc',
 '1a486c4c-c115-4721-8c9f-f9f096e10857',
 '1df8c90d-d299-4b2e-a54d-a5a80f36e780',
 'd2684035-a36e-458e-96af-8e37930bfdf6',
 'fcb3d1c1-03d2-41ac-8229-458e072b7a1c',
 'c353707f-09a4-4f12-92a0-cb741e57e5f0',
 '2902f08c-f83c-470e-a541-e463e25e5058',
 'db468083-041c-41ca-8f6f-bf991a070adf',
 '2b02dff7-e427-4cdc-96fb-c0f354c099aa',
 '180bff9c-c8a5-4539-b13b-ddbc00d643e6',
 '6ff3401b-d72c-4940-a00c-3f0792397082',
 'e02201d7-f49f-401f-baf0-1eb1406546c0',
 '6d203948-a779-4b69-9b3f-1ee1dadc3980',
 '9b02383a-9358-4f0f-9795-a891ec523bcc',
 '8191c283-0816-424b-9b61-c3e1d6258a77',
 '0c3f148e-02ff-

In [None]:
census.close()