In [1]:
import scanpy as sc
import cellxgene_census
import cellxgene_census.experimental as czi_exp
import tiledb
import os
import time

In [2]:
# Verify that the proxy settings are correctly set
http_proxy = os.getenv('http_proxy')
https_proxy = os.getenv('https_proxy')

print("HTTP_PROXY: {}".format(http_proxy))
print("HTTPS_PROXY: {}".format(https_proxy))

um_proxy = "proxy1.arc-ts.umich.edu" ##### this is the setup used below in the open_soma call
um_proxy_port = "3128"

config= {
    "vfs.s3.proxy_host": um_proxy, 
    "vfs.s3.proxy_port": um_proxy_port,
    "vfs.s3.request_timeout_ms": 1000000, 
}

HTTP_PROXY: http://proxy1.arc-ts.umich.edu:3128/
HTTPS_PROXY: http://proxy1.arc-ts.umich.edu:3128/


In [3]:
cellxgene_census.get_census_version_description('stable')

{'release_date': None,
 'release_build': '2024-07-01',
 'soma': {'uri': 's3://cellxgene-census-public-us-west-2/cell-census/2024-07-01/soma/',
  'relative_uri': '/cell-census/2024-07-01/soma/',
  's3_region': 'us-west-2'},
 'h5ads': {'uri': 's3://cellxgene-census-public-us-west-2/cell-census/2024-07-01/h5ads/',
  'relative_uri': '/cell-census/2024-07-01/h5ads/',
  's3_region': 'us-west-2'},
 'flags': {'lts': True}}

# Get all cell metadata (obs)

In [4]:
 ##### these are the magic words

# organism = "mus_musculus"
organism = "homo_sapiens"

with cellxgene_census.open_soma(census_version="2024-07-01", tiledb_config=config) as census: 

    # Reads SOMADataFrame as a slice
    cell_metadata = census["census_data"][organism].obs.read()

    # Concatenates results to pyarrow.Table
    cell_metadata = cell_metadata.concat()
    
    # Converts to pandas.DataFrame
    cell_metadata = cell_metadata.to_pandas()
    census.close()

print(f"{cell_metadata.shape=}")
cell_metadata.head()

cell_metadata.shape=(74322510, 28)


Unnamed: 0,soma_joinid,dataset_id,assay,assay_ontology_term_id,cell_type,cell_type_ontology_term_id,development_stage,development_stage_ontology_term_id,disease,disease_ontology_term_id,...,tissue,tissue_ontology_term_id,tissue_type,tissue_general,tissue_general_ontology_term_id,raw_sum,nnz,raw_mean_nnz,raw_variance_nnz,n_measured_vars
0,0,0895c838-e550-48a3-a777-dbcd35d30272,10x 3' v2,EFO:0009899,plasma cell,CL:0000786,human adult stage,HsapDv:0000087,normal,PATO:0000461,...,caudate lobe of liver,UBERON:0001117,tissue,liver,UBERON:0002107,1742.0,221,7.882353,6661.549733,13696
1,1,0895c838-e550-48a3-a777-dbcd35d30272,10x 3' v2,EFO:0009899,mature B cell,CL:0000785,human adult stage,HsapDv:0000087,normal,PATO:0000461,...,caudate lobe of liver,UBERON:0001117,tissue,liver,UBERON:0002107,278.0,143,1.944056,3.616567,13696
2,2,0895c838-e550-48a3-a777-dbcd35d30272,10x 3' v2,EFO:0009899,plasma cell,CL:0000786,human adult stage,HsapDv:0000087,normal,PATO:0000461,...,caudate lobe of liver,UBERON:0001117,tissue,liver,UBERON:0002107,3723.0,709,5.251058,2959.510327,13696
3,3,0895c838-e550-48a3-a777-dbcd35d30272,10x 3' v2,EFO:0009899,mature B cell,CL:0000785,human adult stage,HsapDv:0000087,normal,PATO:0000461,...,caudate lobe of liver,UBERON:0001117,tissue,liver,UBERON:0002107,351.0,179,1.960894,8.487226,13696
4,4,0895c838-e550-48a3-a777-dbcd35d30272,10x 3' v2,EFO:0009899,mature B cell,CL:0000785,human adult stage,HsapDv:0000087,normal,PATO:0000461,...,caudate lobe of liver,UBERON:0001117,tissue,liver,UBERON:0002107,605.0,275,2.2,18.978102,13696


In [5]:
cell_metadata.columns

Index(['soma_joinid', 'dataset_id', 'assay', 'assay_ontology_term_id',
       'cell_type', 'cell_type_ontology_term_id', 'development_stage',
       'development_stage_ontology_term_id', 'disease',
       'disease_ontology_term_id', 'donor_id', 'is_primary_data',
       'observation_joinid', 'self_reported_ethnicity',
       'self_reported_ethnicity_ontology_term_id', 'sex',
       'sex_ontology_term_id', 'suspension_type', 'tissue',
       'tissue_ontology_term_id', 'tissue_type', 'tissue_general',
       'tissue_general_ontology_term_id', 'raw_sum', 'nnz', 'raw_mean_nnz',
       'raw_variance_nnz', 'n_measured_vars'],
      dtype='object')

In [6]:
df = cell_metadata[(cell_metadata['is_primary_data']) & (cell_metadata['disease'] == 'normal')].copy()
print(f"{df.shape=}")

column = 'cell_type'
print(df[column].nunique())
df[column].value_counts()

df.shape=(30197419, 28)
664


cell_type
neuron                                                               2692914
glutamatergic neuron                                                 1566247
L2/3-6 intratelencephalic projecting glutamatergic neuron            1104521
oligodendrocyte                                                       901217
fibroblast                                                            715861
                                                                      ...   
nephron tubule epithelial cell                                             0
oocyte                                                                     0
pyramidal neuron                                                           0
smooth muscle cell of large intestine                                      0
CD56-positive, CD161-positive immature natural killer cell, human          0
Name: count, Length: 698, dtype: int64

In [7]:
[x for x in cell_metadata['cell_type'].unique() if 'stem' in x.lower()]

['stem cell',
 'central nervous system macrophage',
 'mesenchymal stem cell',
 'hematopoietic stem cell',
 'stem cell of epidermis',
 'intestinal crypt stem cell',
 'intestinal crypt stem cell of large intestine',
 'intestinal crypt stem cell of small intestine',
 'epithelial fate stem cell',
 'peripheral nervous system neuron',
 'brainstem motor neuron',
 'skeletal muscle satellite stem cell',
 'intestinal crypt stem cell of colon',
 'cord blood hematopoietic stem cell',
 'embryonic stem cell',
 'CD34-positive, CD38-negative hematopoietic stem cell']

In [9]:
cell_metadata['cell_type'].nunique()

698

In [12]:
cell_metadata['cell_type'].unique()

filepath = "/home/cstansbu/git_repositories/ONT-single-cell/resources/czi_cell_types.txt"
with open(filepath, 'w') as f:
  for item in cell_metadata['cell_type'].unique():
    f.write(str(item) + '\n')
  print('done')

done


In [8]:
break

SyntaxError: 'break' outside loop (668683560.py, line 1)

In [None]:
cell_metadata.columns

In [None]:
cell_metadata['disease'].value_counts()

In [None]:
[x for x in cell_metadata['cell_type'].unique() if 'chondro' in x.lower()]

In [None]:
# break

# Variables

In [None]:
with cellxgene_census.open_soma(census_version="2024-07-01", tiledb_config=config) as census: 
    var = cellxgene_census.get_var(
        census = census,
        organism = "Homo sapiens",
    )
    census.close()

var.head()

In [None]:
CENSUS_VERSION = "2024-07-01"

for e in czi_exp.get_all_available_embeddings(CENSUS_VERSION):
    print(f"{e['embedding_name']:15} {e['experiment_name']:15} {e['data_type']:15}")

# Get actual cell data

In [None]:
start_time = time.time()

census = cellxgene_census.open_soma(
    census_version="2024-07-01",  
    tiledb_config=config,
)

end_time = time.time()
print(f"Time to open census: {end_time - start_time:.2f} seconds")

start_time = time.time()

adata = cellxgene_census.get_anndata(
    census = census,
    organism = "Homo sapiens",
    obs_value_filter = "tissue == 'tongue' and is_primary_data==True",
)

end_time = time.time()
print(f"Time to get anndata: {end_time - start_time:.2f} seconds")

sc.logging.print_memory_usage()
adata

# Get HSCs

In [None]:
start_time = time.time()

census = cellxgene_census.open_soma(
    census_version="2024-07-01",  
    tiledb_config=config,
)

end_time = time.time()
print(f"Time to open census: {end_time - start_time:.2f} seconds")

start_time = time.time()

# Construct the obs_value_filter string
cell_types = [
    # 'hematopoietic multipotent progenitor cell',
    # 'hematopoietic stem cell',
    # 'hematopoietic cell',
    # 'hematopoietic precursor cell',
    # 'cord blood hematopoietic stem cell',
    # 'CD34-positive, CD38-negative hematopoietic stem cell'
    'chondrocyte',
]
cell_type_filter = ' or '.join([f"cell_type == '{cell_type}'" for cell_type in cell_types])

obs_value_filter = f"is_primary_data == True and disease == 'normal' and ({cell_type_filter})"

adata = cellxgene_census.get_anndata(
    census = census,
    organism = "Homo sapiens",
    obs_value_filter = obs_value_filter,
)

end_time = time.time()
print(f"Time to get anndata: {end_time - start_time:.2f} seconds")

sc.logging.print_memory_usage()
adata

In [None]:
# write the data
outpath = "/nfs/turbo/umms-indikar/shared/projects/chondro_data/czi_data/chondrocytes_raw.h5ad"
adata.write(outpath)
adata

In [None]:
adata.obs.columns

In [None]:
adata.obs['cell|tissue|sex'] = adata.obs['cell_type'].str.replace(" ", "-") + "|" + adata.obs['tissue_general'].str.replace(" ", "-") + "|" + adata.obs['sex'].str.replace(" ", "-")
adata.obs['cell|tissue|sex'].value_counts()

In [None]:
adata.obs['dataset_id'].unique()

In [None]:
adata.obs['dataset_id'].nunique()

In [None]:
break

# aggregate

In [None]:
aggdata = sc.get.aggregate(
    adata,
    by='cell|tissue|sex',
    func=['count_nonzero', 'mean', 'sum', 'var'],
    axis='obs',
)

outpath = "/nfs/turbo/umms-indikar/shared/projects/HSC/data/CZI/all_HSC_aggregated.h5ad"
aggdata.write(outpath)
aggdata

In [None]:
break