In [1]:
import pandas as pd
import scanpy as sc

In [2]:
# Names of the per-cell annotation columns (`.obs`) used throughout this notebook.
_project_id = "project_id"
_sample_id = "sample_id"
_cancer_type = "cancer_type"
# Key under which the kind of count matrix is recorded in `.uns`.
_count_type = "count_type"
_technology = "technology"
_suspension_type = "suspension_type"
_primary_side = "primary_side"
_sample_side = "sample_side"

In [3]:
def clean_adata(adata: sc.AnnData) -> sc.AnnData:
    """Remove every entry from ``adata.obsm`` and ``adata.obsp`` in place.

    Parameters
    ----------
    adata
        Object whose ``obsm`` and ``obsp`` mappings are emptied.

    Returns
    -------
    The same ``adata`` object that was passed in (it is modified, not copied).
    """
    for slot in ("obsm", "obsp"):
        # Snapshot the keys first so entries can be deleted while looping, and
        # look the mapping up again for every deletion rather than holding on
        # to one mapping object across deletions.
        for key in list(getattr(adata, slot).keys()):
            del getattr(adata, slot)[key]

    return adata

In [6]:
# Load the study overview table; the first CSV column becomes the row index.
metadata = pd.read_csv("study_overview.csv", index_col=0)

## GBM Extended Atlas


In [7]:
# Keep only the studies that come from the Ruiz CellxGene collection and are
# flagged for inclusion.
is_selected = (metadata["Source"] == "CellxGene(Ruiz)") & (metadata["Include"] == "Yes")
metadata = metadata[is_selected]

# A study is identified by "<author_last_name><year>"; the same label is used
# as the row index of `metadata` and as the entries of `studies`.
study_labels = metadata["author_last_name"] + metadata["year"].astype(str)
metadata.index = study_labels
studies = study_labels.to_list()

In [8]:
# Open the atlas in backed read-only mode; `backup_url` is where scanpy
# retrieves the file from when it is not present on disk.
adata = sc.read(
    "data/cellxgene/ruiz.h5ad",
    backed="r",
    backup_url="https://datasets.cellxgene.cziscience.com/d36a23ec-65f6-446e-8e69-c72ccbd44d9e.h5ad",
)

# Restrict to malignant cells that belong to one of the selected studies.
is_malignant = adata.obs["cell_type"] == "malignant cell"
from_selected_study = adata.obs["author"].isin(studies)
adata = adata[is_malignant & from_selected_study]

# Continue with the matrix stored in `.raw`.
adata = adata.raw.to_adata()


In [9]:
# Drop the UMAP embedding if it is present. The membership check keeps this
# cell safe to re-run after the key has already been removed (an unconditional
# `del` raises KeyError the second time).
if "X_umap" in adata.obsm:
    del adata.obsm["X_umap"]
# Index genes by the string form of the "feature_name" column of `var`.
adata.var_names = adata.var["feature_name"].astype(str)
# Discard all unstructured annotations.
adata.uns = {}

In [None]:
sample_counts = {}
for study in studies:
    # Work on an independent copy of the cells contributed by this study.
    bdata = adata[adata.obs["author"] == study].copy()

    # Studies with fewer than 100 cells are reported by name and skipped.
    if bdata.n_obs < 100:
        print(study)
        continue

    # Strip every annotation column from `var`; only its index is kept.
    bdata.var = bdata.var.drop(columns=bdata.var.columns)

    # Build the reduced per-cell table: donor and assay are renamed, the
    # remaining columns are constants for this collection or looked up from
    # the study overview table.
    obs = bdata.obs[["donor_id", "assay"]].rename(
        columns={"donor_id": _sample_id, "assay": _technology}
    )
    obs[_primary_side] = "brain"
    obs[_sample_side] = "brain"
    obs[_suspension_type] = "cell"
    obs[_project_id] = metadata.loc[study, ["Project ID"]].item()
    bdata.obs = obs

    bdata.uns = {_count_type: "raw_counts"}
    bdata.write_h5ad(f"data/cellxgene/ruiz/{study}.h5ad")

Goswami2019
Sankowski2019
Mathewson2021
Friedrich2021
Xie2021


## Breast Cancer CellxGene dataset


In [None]:
# One CellxGene file per suspension type.
klughammer_urls = {
    "cell": "https://datasets.cellxgene.cziscience.com/0a563530-3000-458e-a4d2-d549e026f326.h5ad",
    "nucleus": "https://datasets.cellxgene.cziscience.com/ccdb972d-6655-43ae-9ad8-f895bd893d8a.h5ad",
}

adatas = []
for s_type, url in klughammer_urls.items():
    # Backed read-only open; `backup_url` is used when the local file is missing.
    adata = sc.read(f"data/cellxgene/klughammer_{s_type}.h5ad", backed="r", backup_url=url)

    # Keep malignant cells only and continue with the matrix stored in `.raw`.
    adata = adata[adata.obs["cell_type"] == "malignant cell"]
    adata = adata.raw.to_adata()
    adata.var_names = adata.var["feature_name"].astype(str)
    adata = clean_adata(adata)

    adata.obs[_suspension_type] = s_type
    # The suspension type is appended to the donor id, so a donor that appears
    # in both files gets two distinct sample ids.
    adata.obs[_sample_id] = adata.obs["donor_id"].astype(str) + "_" + s_type
    adatas.append(adata)

# `sc.concat` is called with its defaults (an inner join according to the
# anndata documentation), so only variables present in both objects are kept.
adata = sc.concat(adatas)
del adatas

# Strip every annotation column from `var`; only its index is kept.
adata.var = adata.var.drop(adata.var.columns, axis=1)
adata.uns = {_count_type: "raw_counts"}

# Shared per-cell annotations. These were previously assigned twice in a row
# with identical values; a single pass is sufficient.
adata.obs[_primary_side] = "breast"
adata.obs[_sample_side] = adata.obs["site_biopsy"]
adata.obs[_project_id] = "CG-28"
adata.obs[_technology] = adata.obs["assay"]
adata.obs = adata.obs[[_sample_id, _project_id, _sample_side, _primary_side, _technology, _suspension_type]]
adata.write_h5ad("data/cellxgene/klughammer/klughammer.h5ad")

(85033, 27361)
['HTAPP-983', 'HTAPP-382', 'HTAPP-982', 'HTAPP-321', 'HTAPP-285', ..., 'HTAPP-649', 'HTAPP-908', 'HTAPP-586', 'HTAPP-759', 'HTAPP-812']
Length: 26
Categories (26, object): ['HTAPP-262', 'HTAPP-285', 'HTAPP-313', 'HTAPP-321', ..., 'HTAPP-908', 'HTAPP-917', 'HTAPP-982', 'HTAPP-983']
(201003, 30450)
['HTAPP-225', 'HTAPP-589', 'HTAPP-231', 'HTAPP-862', 'HTAPP-232', ..., 'HTAPP-226', 'HTAPP-890', 'HTAPP-262', 'HTAPP-861', 'HTAPP-997']
Length: 30
Categories (30, object): ['HTAPP-211', 'HTAPP-213', 'HTAPP-223', 'HTAPP-225', ..., 'HTAPP-944', 'HTAPP-947', 'HTAPP-963', 'HTAPP-997']


## Ovarian Cancer (HGSOC) CellxGene dataset

In [67]:
# Open the file in backed read-only mode; `backup_url` is where scanpy
# retrieves it from when it is not present on disk.
adata = sc.read(
    "data/cellxgene/garcia.h5ad",
    backed="r",
    backup_url="https://datasets.cellxgene.cziscience.com/c613f4a1-abfe-4a2c-98d6-59748a8a234c.h5ad",
)

In [72]:
# Keep cells whose cluster label contains "cancer" (case-insensitive, plain
# substring match) and whose `doublet` value equals the string "False".
label_mentions_cancer = adata.obs["cluster_label"].str.lower().str.contains("cancer", regex=False)
doublet_is_false = adata.obs["doublet"] == "False"
adata = adata[label_mentions_cancer & doublet_is_false, :]

# Continue with the matrix stored in `.raw`.
adata = adata.raw.to_adata()

In [73]:
# Empty obsm/obsp and index genes by the string form of "feature_name".
adata = clean_adata(adata)
adata.var_names = adata.var["feature_name"].astype(str)

# Per-cell annotations derived from existing columns.
adata.obs[_sample_id] = adata.obs["patient_id"].astype(str)
adata.obs[_technology] = adata.obs["assay"]

# Per-cell annotations that are constant for this dataset.
# NOTE(review): `_cancer_type` is assigned here but is not part of
# `kept_obs_columns` below, so it does not reach the final `obs` table —
# confirm whether it should be retained.
constant_columns = {
    _cancer_type: "HGSOC",
    _project_id: "CG-29",
    _suspension_type: "cell",
    _primary_side: "Ovarian",
    _sample_side: "Ovarian",
}
for column, value in constant_columns.items():
    adata.obs[column] = value

# Reduce `obs` to the listed columns, in this order.
kept_obs_columns = [_sample_id, _project_id, _sample_side, _primary_side, _technology, _suspension_type]
adata.obs = adata.obs[kept_obs_columns]

# Strip every annotation column from `var` and record the count type in `uns`.
adata.var = adata.var.drop(columns=adata.var.columns)
adata.uns = {_count_type: "raw_counts"}

In [76]:
# Write the processed object to disk as an .h5ad file.
adata.write_h5ad("data/cellxgene/garcia/garcia.h5ad")

## The single-cell lung cancer atlas (LuCA) -- extended atlas

In [77]:
# Open the file in backed read-only mode; `backup_url` is where scanpy
# retrieves it from when it is not present on disk.
adata = sc.read(
    "data/cellxgene/salcher.h5ad",
    backed="r",
    backup_url="https://datasets.cellxgene.cziscience.com/c366b76c-b5cf-499b-ad99-84bd57f71f0f.h5ad",
)

  0%|          | 0.00/16.2G [00:00<?, ?B/s]

: 