In [1]:
# Enable IPython's autoreload extension. Per the IPython documentation, mode 2
# re-imports all modules before each cell runs, so edits to imported project
# code are picked up without restarting the kernel.
# (Keep comments on their own lines: text after a line magic is passed to it.)
%load_ext autoreload
%autoreload 2

In [2]:
import os
# Capture the kernel's current working directory. In the visible cells this
# value is only displayed, never used to build other paths.
current_path = os.getcwd()

In [3]:
# Display the working directory captured in `current_path`.
current_path

'/home/ec2-user/SageMaker'

In [4]:
# Accession id of the dataset; interpolated into the path below.
geo_id = 'GSE172127'

# Display-only expression: renders the raw-data tar path for this accession.
# The resulting string is not assigned to any variable.
f"/home/ec2-user/SageMaker/test_data/dissociated/raw_data_{geo_id}.tar"


'/home/ec2-user/SageMaker/test_data/dissociated/raw_data_GSE172127.tar'

In [5]:
import anndata
import os 
import sys
import pandas as pd
import scanpy as sc
from tqdm import tqdm


In [6]:
from nicheformer.data.validate import validate
from nicheformer.data.constants import DefaultPaths, ObsConstants, UnsConstants, VarConstants, AssayOntologyTermId, SexOntologyTermId, OrganismOntologyTermId, TissueOntologyTermId, SuspensionTypeId
from nicheformer.data.tools import qc_filter
from nicheformer.data.validate import validate
from nicheformer.data.download import download_tar
from nicheformer.data.constants import GeneExpressionOmnibus, DefaultPaths


In [7]:
# Root directory of the input data. The loading cell appends
# "/dissociated/<sample>_*.gz" to this value, so it is kept unchanged
# (including the trailing slash).
raw_path = "/home/ec2-user/SageMaker/test_data/"
# Destination directory for preprocessed .h5ad output. Derived from raw_path so
# both live under the same tree; the previous literal pointed at ".../test-data/"
# (hyphen), a different directory from the ".../test_data/" used everywhere else.
preprocessed_path = os.path.join(raw_path, "preprocessed")


In [8]:
# Dataset identifiers: `geo_id` labels the progress bar and log messages,
# `doi` is stored in adata.uns under UnsConstants.TITLE after concatenation.
geo_id = 'GSE172127'
doi = "10.1038/s41421-021-00266-1"

In [9]:

## manual entries that are equal across the dataset
# Each value below is written to every cell of every sample in the loading cell.

# Ontology term ids: taken from the project enums and stored as plain strings.
organism = str(OrganismOntologyTermId.MOUSE.value)
tissue = str(TissueOntologyTermId.LIVER.value)
sex = str(SexOntologyTermId.FEMALE.value)
assay = str(AssayOntologyTermId.TENX_3V2.value)
# NOTE(review): a SMART_SEQ_* suspension member is paired with a TENX_* assay
# above - confirm this combination is intended.
suspension_type = str(SuspensionTypeId.SMART_SEQ_CELL.value)

# Plain-string fields.
organism_validator = "mouse"  # passed as `organism=` to validate()
tissue_type = "tissue" # or alternatively "organoid"
condition_id = "wild type"


In [10]:
# Samples of this accession; each one is loaded into its own AnnData object
# and collected in `adatas` for concatenation in the next cell.
sample_ids = ["GSM5242402_E14.5FL"]
adatas = []
with tqdm(total=len(sample_ids), desc=geo_id) as pbar:
    for sample in sample_ids:
        # Count matrix, transposed right after loading.
        adata = sc.read_mtx(f"{raw_path}/dissociated/{sample}_matrix.mtx.gz").T

        # Gene table: first file column becomes the index, the other two are named.
        features = pd.read_table(f"{raw_path}/dissociated/{sample}_features.tsv.gz", index_col=0, header=None)
        features.columns = ["gene_name", "feature_types"]
        features.index.name = 'gene_ids'

        # Barcode table: first file column becomes an unnamed index.
        barcodes = pd.read_table(f"{raw_path}/dissociated/{sample}_barcodes.tsv.gz", index_col=0, header=None)
        barcodes.index.name = None

        adata.var = features
        adata.obs = barcodes

        # Per-sample constant annotations, in the order the columns are created.
        obs_constants = {
            # Ontology terms defined for AnnData object
            ObsConstants.ASSAY_ONTOLOGY_TERM_ID: assay,
            ObsConstants.SEX_ONTOLOGY_TERM_ID: sex,
            ObsConstants.ORGANISM_ONTOLOGY_TERM_ID: organism,
            ObsConstants.TISSUE_ONTOLOGY_TERM_ID: tissue,
            ObsConstants.SUSPENSION_TYPE: suspension_type,
            # NicheFormer data schema
            ObsConstants.CONDITION_ID: condition_id,
            ObsConstants.DONOR_ID: sample,
            ObsConstants.TISSUE_TYPE: tissue_type,
        }
        n_cells = len(adata)
        for obs_key, obs_value in obs_constants.items():
            # One categorical entry per cell, all holding the same value.
            adata.obs[obs_key] = pd.Categorical([obs_value] * n_cells)

        adatas.append(adata)
        pbar.update(1)


GSE172127: 100%|██████████| 1/1 [00:06<00:00,  6.51s/it]


In [11]:

# Combine the per-sample objects into one AnnData, then attach dataset-level
# annotations: the DOI under UnsConstants.TITLE and a constant False flag
# for every gene under VarConstants.FEATURE_IS_FILTERED.
adata = anndata.concat(adatas, index_unique='_')
adata.var[VarConstants.FEATURE_IS_FILTERED] = False
adata.uns[UnsConstants.TITLE] = doi

# after concatenation these are dtype=object, but need to be category
for obs_key in (ObsConstants.CONDITION_ID, ObsConstants.DONOR_ID):
    adata.obs[obs_key] = adata.obs[obs_key].astype('category')

# run basic filtering with default values
# NOTE: `adata` is rebound to the filtered result, so re-running this cell
# without re-running the concatenation would filter an already-filtered object.
print(f"\nPerforming basic quality control for {geo_id}.")
adata = qc_filter(adata=adata)



Performing basic quality control for GSE172127.
AnnData object before filtering has 9448 cells and 28692 genes.
AnnData object after cell filtering: 9448 cells, 28692 genes.
AnnData object after gene filtering: 9448 cells, 15454 genes.


In [12]:

# run validator
# validate() returns the (possibly relabelled) object, a validity flag, the
# collected errors and a Seurat-convertibility flag; only the first two are
# used below.
print(f"\nValidating {geo_id}.")
adata_output, valid, errors, is_seurat_convertible = validate(adata, organism=organism_validator)

# Persisting the result is opt-in. Previously the "WRITING PREPROCESSED FILE"
# message was printed while the write call itself was commented out, so the log
# reported a file that was never produced. Message and write are now tied to
# the same switch; set it to True to actually write the .h5ad file.
write_preprocessed = False

if valid:
    print(f"DONE: Successfully preprocessed {geo_id}, validation completed with status is_valid={valid}.")
    if write_preprocessed:
        print(f"\nWRITING PREPROCESSED FILE TO: {geo_id}.h5ad")
        # Make sure the target directory exists before writing into it.
        os.makedirs(preprocessed_path, exist_ok=True)
        adata_output.write(f"{preprocessed_path}/{geo_id}.h5ad")
else:
    print(f"ERROR: Preprocessing of {geo_id} failed, validation completed with status is_valid={valid}.")




Validating GSE172127.


Starting validation...
Validation complete in 0:00:01.849675 with status is_valid=True
Writing labels
enforce canonical format in X
H5AD label writing complete in 0:00:31.688287


DONE: Successfully preprocessed GSE172127, validation completed with status is_valid=True.

WRITING PREPROCESSED FILE TO: GSE172127.h5ad
