In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import anndata
import os 
import sys
import pandas as pd
import scanpy as sc
from tqdm import tqdm

from nicheformer.data.constants import DefaultPaths, ObsConstants, UnsConstants, VarConstants, AssayOntologyTermId, SexOntologyTermId, OrganismOntologyTermId, TissueOntologyTermId, SuspensionTypeId
from nicheformer.data.tools import qc_filter
from nicheformer.data.validate import validate
from nicheformer.data.download import download_tar
from nicheformer.data.constants import GeneExpressionOmnibus, DefaultPaths
import numpy as np
from pyensembl import EnsemblRelease

In [3]:
raw_path = f"/home/ec2-user/SageMaker/our_data_LC_NE/"
preprocessed_path = f"/home/ec2-user/SageMaker/our_data_LC_NE/preprocessed"

In [4]:
str(AssayOntologyTermId.MERFISH_SPATIAL.value)

'EFO:0008992'

In [26]:
assay = str(AssayOntologyTermId.TENX_3.value)
organism = str(OrganismOntologyTermId.MOUSE.value)
organism_validator = "mouse"
tissue_type = "tissue" # or alternatively "organoid"
condition_id = "wild type"

In [7]:
str(TissueOntologyTermId.BRAIN.value)

'UBERON:0000955'

In [8]:
str(SuspensionTypeId.TENX_NUCLEUS.value)

'nucleus'

In [None]:
SuspensionTypeId.TENX_NUCLEUS

In [9]:
tissue = str(TissueOntologyTermId.BRAIN.value)
suspension_type = str(SuspensionTypeId.TENX_NUCLEUS.value)
sex_f = str(SexOntologyTermId.FEMALE.value)
sex_m = str(SexOntologyTermId.MALE.value)

In [10]:
filename = raw_path + 'adata_sc.h5ad'
filename

'/home/ec2-user/SageMaker/our_data_LC_NE/adata_sc.h5ad'

# load data!!!!

In [87]:
filename = raw_path + 'adata_sc.h5ad'
adata = sc.read_h5ad(filename)

genderinfo = pd.Categorical(adata.obs['sex'].replace({'M': sex_m, 'F': sex_f}))

adata.obs = adata.obs.iloc[:,:3]

In [88]:
def ensem_for_adata(adata):         
    data = EnsemblRelease(104, species='mus_musculus')    
    data.index()
    new_names = []
    counter = 0
    for i,m in enumerate(adata.var_names):
        try:
            out=data.genes_by_name(m)[0].id
            new_names.append(out)
        except:# Exception as e:  
            # # print(e)
            # try:
            #     m1 = m.split(' ')[0]
            #     out=data.genes_by_name(m1)[0].id
            #     new_names.append(out)
            # except:# Exception as e:
                # print(e)
            counter += 1
            new_names.append(f'NA_{counter}')        
    print(f'total invalid {counter}, from {len(adata.var_names)} {100*counter/len(adata.var_names)} %')    
    adata_1 = adata.copy()    
    adata_1.var.index = (new_names)    
    adata_1 = adata_1[:,adata_1.var.index != 'NA']
    
    print(f'new data size {adata_1.shape}')
    return(adata_1)

In [89]:
# import sys
# sys.path.append("/home/ec2-user/SageMaker/functions")
# from preprocessing import *
adata_orig = adata.copy()
adata_foo = ensem_for_adata(adata) 

INFO:pyensembl.sequence_data:Loaded sequence dictionary from /home/ec2-user/.cache/pyensembl/GRCm39/ensembl104/Mus_musculus.GRCm39.cdna.all.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from /home/ec2-user/.cache/pyensembl/GRCm39/ensembl104/Mus_musculus.GRCm39.ncrna.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from /home/ec2-user/.cache/pyensembl/GRCm39/ensembl104/Mus_musculus.GRCm39.pep.all.fa.gz.pickle


total invalid 459, from 29617 1.549785596110342 %
new data size (5040, 29617)


In [90]:
adata_orig.var.index[:100]

Index(['Xkr4', 'Gm1992', 'Gm19938', 'Gm37381', 'Rp1', 'Sox17', 'Gm37587',
       'Gm37323', 'Mrpl15', 'Lypla1', 'Tcea1', 'Rgs20', 'Gm16041', 'Atp6v1h',
       'Oprk1', 'Npbwr1', 'Rb1cc1', '4732440D04Rik', 'Alkal1', 'St18',
       'Pcmtd1', 'Gm26901', 'Gm30414', 'Sntg1', 'Rrs1', 'Adhfe1',
       '2610203C22Rik', 'Vxn', 'Mybl1', 'Vcpip1', '1700034P13Rik', 'Sgk3',
       'Mcmdc2', 'Snhg6', 'Tcf24', 'Ppp1r42', 'Gm15818', 'Cops5', 'Cspp1',
       'Arfgef1', 'Cpa6', 'Prex2', 'A830018L16Rik', 'Gm17644', 'Gm29663',
       'Sulf1', 'Slco5a1', 'Gm29283', 'Prdm14', 'Ncoa2', 'Gm29570', 'Tram1',
       'Lactb2', 'Xkr9', 'Eya1', 'Gm9947', 'Msc', 'Trpa1', 'Kcnb2', 'Terf1',
       'Sbspon', '4930444P10Rik', 'Rpl7', 'Rdh10', 'Gm28095', 'Stau2',
       'Gm7568', 'Ube2w', 'Eloc', 'D030040B21Rik', 'Tmem70', 'Ly96', 'Gm28376',
       'Jph1', 'Gm28783', 'Gdap1', 'Gm28784', 'Pi15', 'Gm28154', 'Gm16070',
       'Crispld1', 'Gm28153', 'Gm28756', 'Crisp4', 'Defb41', 'Gm15825',
       'Tfap2d', 'Tfap2b', 'Gm2834

In [91]:
'Pkhd1 ENSEMBMUSZZZZ'

'Pkhd1 ENSEMBMUSZZZZ'

In [92]:
adata = adata_foo.copy()

In [93]:
adata.var.keys()

Index(['vf_vst_counts_mean', 'vf_vst_counts_variance',
       'vf_vst_counts_variance.expected',
       'vf_vst_counts_variance.standardized', 'vf_vst_counts_variable',
       'vf_vst_counts_rank', 'var.features', 'var.features.rank'],
      dtype='object')

In [94]:
adata.obs[ObsConstants.ASSAY_ONTOLOGY_TERM_ID] = pd.Categorical([assay for i in range(len(adata))])
adata.obs[ObsConstants.SEX_ONTOLOGY_TERM_ID] = genderinfo
adata.obs[ObsConstants.ORGANISM_ONTOLOGY_TERM_ID] = pd.Categorical([organism for i in range(len(adata))])
adata.obs[ObsConstants.TISSUE_ONTOLOGY_TERM_ID] = pd.Categorical([tissue for i in range(len(adata))])
adata.obs[ObsConstants.SUSPENSION_TYPE] = pd.Categorical([suspension_type for i in range(len(adata))])

# NicheFormer data schema
adata.obs[ObsConstants.CONDITION_ID] = pd.Categorical([condition_id for i in range(len(adata))])
adata.obs[ObsConstants.DONOR_ID] = adata.obs['sample_id']
adata.obs[ObsConstants.TISSUE_TYPE] = pd.Categorical([tissue_type for i in range(len(adata))])




In [95]:
adata.var[VarConstants.FEATURE_IS_FILTERED] = False

# after concatenation these are dtype=object, but need to be category
adata.obs[ObsConstants.CONDITION_ID] = adata.obs[ObsConstants.CONDITION_ID].astype('category')
adata.obs[ObsConstants.DONOR_ID] = adata.obs[ObsConstants.DONOR_ID].astype('category')

# run basic filtering with default values
print(f"\nPerforming basic quality control for this data.")
adata = qc_filter(adata=adata)



Performing basic quality control for this data.
AnnData object before filtering has 5040 cells and 29617 genes.
AnnData object after cell filtering: 5040 cells, 29617 genes.
AnnData object after gene filtering: 5040 cells, 24669 genes.


In [96]:
# adata = anndata.concat(adatas, index_unique='_')
adata.uns[UnsConstants.TITLE] = 'foo'
adata.var[VarConstants.FEATURE_IS_FILTERED] = False



In [97]:

# run validator
print(f"\nValidating....")
adata_output, valid, errors, is_seurat_convertible = validate(adata, organism=organism_validator)

if valid:
    print(f"DONE: Successfully preprocessed this data, validation completed with status is_valid={valid}.")
    print(f"\nWRITING PREPROCESSED FILE TO: LC_NE.h5ad")
    adata_output.write(f"{preprocessed_path}/LC_NE.h5ad")
else:
    print(f"ERROR: Preprocessing of yoru LC-NE failed, validation completed with status is_valid={valid}.")




Validating....


INFO:cellxgene_schema.validate:Starting validation...
INFO:cellxgene_schema.validate:Validation complete in 0:00:03.254093 with status is_valid=True
INFO:cellxgene_schema.validate:Writing labels
INFO:cellxgene_schema.utils:enforce canonical format in X
INFO:cellxgene_schema.validate:H5AD label writing complete in 0:01:22.043738


DONE: Successfully preprocessed this data, validation completed with status is_valid=True.

WRITING PREPROCESSED FILE TO: LC_NE.h5ad


In [66]:
organism_validator

'mouse'

In [80]:
adata.var.index.shape

(24669,)

In [82]:
np.unique(adata.var.index).shape

(24653,)

In [None]:

adata.var.index.shape