Purpose: export data from the neuron and non-neuron files of the human brain atlas (Siletti et al. 2023) for identifying marker genes. The cerebellum had to be processed using an obscene amount of memory (128 GB).

"Read counts are normalized using a log transformation of pseudocounts per 10,000 reads, ln(CPTT+1)"- cellxgene website


# HDF5 file

In [1]:
import os
import h5py
import scanpy as sc
import pandas as pd
import numpy as np
import json
import math
import nexusformat.nexus as nx
import anndata as ad
import matplotlib.pyplot as plt

In [2]:
# Work from the project root so the relative 'scRNA_seq/...' paths used below resolve.
os.chdir('/tscc/projects/ps-palmer/brittany/SUD_cross_species/')

In [3]:
# Paths (relative to the working directory) of the two input .h5ad files.
file_path_neuron='scRNA_seq/Neurons.h5ad'
file_path_nonneuron='scRNA_seq/Nonneurons.h5ad'

## check file

In [4]:
# Check that each input file can be opened as HDF5 (i.e. is not corrupted).
# On failure the underlying exception is printed so the cause is visible,
# instead of a message that ends in a bare colon.
try:
    with h5py.File(file_path_neuron, "r"):
        print("Neuron file is a valid HDF5 file.")
except Exception as e:
    print(f"Neuron file is not a valid HDF5 file: {e}")

try:
    with h5py.File(file_path_nonneuron, "r"):
        print("Neonnuron file is a valid HDF5 file.")
except Exception as e:
    print(f"Nonneuron file is not a valid HDF5 file: {e}")

Neuron file is a valid HDF5 file.
Neonnuron file is a valid HDF5 file.


In [5]:
# Open the neuron file in backed read mode and add two label columns to obs:
#   group     -> "<ROIGroup>_Neuron"
#   dis_group -> "<dissection>_<supercluster_term>"
# Spaces in both labels are replaced by hyphens.
adata_neuron = sc.read_h5ad(file_path_neuron, backed='r')

neuron_group_label = adata_neuron.obs['ROIGroup'].astype(str) + '_Neuron'
adata_neuron.obs['group'] = neuron_group_label.str.replace(' ', '-', regex=False)

neuron_dissection_label = (
    adata_neuron.obs['dissection'].astype(str)
    + '_'
    + adata_neuron.obs['supercluster_term'].astype(str)
)
adata_neuron.obs['dis_group'] = neuron_dissection_label.str.replace(' ', '-', regex=False)

In [6]:
# Open the non-neuron file in backed read mode and add two label columns to obs:
#   group     -> "<ROIGroup>_<supercluster_term>"
#   dis_group -> "<dissection>_<supercluster_term>"
# Spaces in both labels are replaced by hyphens.
adata_nonneuron = sc.read_h5ad(file_path_nonneuron, backed='r')

nonneuron_supercluster = adata_nonneuron.obs['supercluster_term'].astype(str)

nonneuron_group_label = adata_nonneuron.obs['ROIGroup'].astype(str) + '_' + nonneuron_supercluster
adata_nonneuron.obs['group'] = nonneuron_group_label.str.replace(' ', '-', regex=False)

nonneuron_dissection_label = adata_nonneuron.obs['dissection'].astype(str) + '_' + nonneuron_supercluster
adata_nonneuron.obs['dis_group'] = nonneuron_dissection_label.str.replace(' ', '-', regex=False)

In [6]:
#adata_nonneuron.var.to_csv('scRNA_seq/gene_meta_data_brain_atlas.csv')

In [15]:
# Unique (ROIGroup, ROIGroupFine) pairs from the non-neuron obs table, sorted by ROIGroup.
t=adata_nonneuron.obs[['ROIGroup','ROIGroupFine']].drop_duplicates().sort_values('ROIGroup')
#t.to_csv('scRNA_seq/ROIGroup_ROIGroupFine_mapping.csv',index=False)
# Unique (ROIGroup, dissection) pairs; this rebinds `t`, so the table above is no longer referenced.
t=adata_nonneuron.obs[['ROIGroup','dissection']].drop_duplicates()
#t.to_csv('scRNA_seq/ROIGroup_dissection_mapping.csv',index=False)

# subset into tissue groups

In [77]:
# ROIGroup values to process; each one is compared against obs['ROIGroup'] below.
roi_list = [
    'Epithalamus',
    'Hindbrain',
    'Hippocampus',
    'Hypothalamus',
    'Midbrain',
    'Paleocortex',
    'Spinal cord',
    'Thalamus',
    'Cerebral nuclei',
    'Cerebral cortex',
]

In [78]:
roi= 'Hindbrain'

In [None]:
# Select, from each atlas, the cells whose ROIGroup equals `roi` (all genes kept).
adata_nonneuron_subset = adata_nonneuron[adata_nonneuron.obs['ROIGroup'] == roi, :]
adata_neuron_subset = adata_neuron[adata_neuron.obs['ROIGroup'] == roi, :] 

In [None]:
# Rebind `adata_nonneuron` to its `roi` subset.
# NOTE(review): after this the name no longer refers to the full atlas; cells that
# filter on a different ROI need the file re-read first.
adata_nonneuron = adata_nonneuron[adata_nonneuron.obs['ROIGroup'] == roi, :]

In [None]:
# Rebind both atlas names to their `roi` subsets, concatenate neurons followed by
# non-neurons, then delete the two input names so only `ad_concat` stays referenced.
# NOTE(review): this cell is not re-runnable as-is; `adata_neuron` and
# `adata_nonneuron` must be re-read before any later cell uses them.
adata_nonneuron = adata_nonneuron[adata_nonneuron.obs['ROIGroup'] == roi, :]
adata_neuron = adata_neuron[adata_neuron.obs['ROIGroup'] == roi, :] 
ad_concat = ad.concat([adata_neuron,adata_nonneuron], merge="same")
del(adata_neuron)
del(adata_nonneuron)

In [None]:
ad_concat

In [None]:
# Stack the neuron and non-neuron subsets of the current ROI and save them as one file.
combined_path = f"scRNA_seq/{roi.replace(' ','-')}_combined.h5ad"
ad_concat = ad.concat([adata_neuron_subset, adata_nonneuron_subset], merge="same")
ad_concat.write(combined_path)
print(f"{combined_path} written to file")

In [36]:
del(adata_nonneuron)

In [None]:
# Write one combined (neuron + non-neuron) file per ROI; ROIs whose file is
# already on disk are skipped.
for roi in roi_list:
    print(roi)
    combined_path = f"scRNA_seq/{roi.replace(' ','-')}_combined.h5ad"
    if os.path.exists(combined_path):
        print('file already exists')
        continue

    print('exporting file')
    nonneuron_in_roi = adata_nonneuron.obs['ROIGroup'] == roi
    adata_nonneuron_subset = adata_nonneuron[nonneuron_in_roi, :]
    neuron_in_roi = adata_neuron.obs['ROIGroup'] == roi
    adata_neuron_subset = adata_neuron[neuron_in_roi, :]
    ad_concat = ad.concat([adata_neuron_subset, adata_nonneuron_subset], merge="same")
    ad_concat.write(combined_path)
    print(f"{combined_path} written to file")

Epithalamus
file already exists
Hindbrain
file already exists
Hippocampus
file already exists
Hypothalamus
exporting file
scRNA_seq/Hypothalamus_combined.h5ad written to file
Midbrain
file already exists
Paleocortex
file already exists
Spinal cord
file already exists
Thalamus
file already exists
Cerebral nuclei
exporting file
scRNA_seq/Cerebral-nuclei_combined.h5ad written to file
Cerebral cortex
exporting file


In [18]:
roi='Hindbrain'

Hindbrain, Hippocampus

In [19]:
adata_nonneuron.obs['ROIGroup'] == roi

CellID
10X362_3:TCAGTGAGTATTGACC    False
10X362_5:TCCGTGTGTGAAAGTT    False
10X362_5:CACGGGTAGAGCAGAA    False
10X362_5:GATTCTTGTATGTCAC    False
10X362_6:AGGACTTGTATCCTTT    False
                             ...  
10X194_8:GAAATGAGTTCGGCTG    False
10X350_4:TTTACCATCGCACGAC     True
10X225_1:AGAAGCGTCCATATGG    False
10X221_5:TTGAACGCAGCCTTCT    False
10X385_3:CTACCCAGTGGCGCTT    False
Name: ROIGroup, Length: 888263, dtype: bool

In [21]:
# Select, from each atlas, the cells whose ROIGroup equals `roi` (all genes kept).
adata_nonneuron_subset = adata_nonneuron[adata_nonneuron.obs['ROIGroup'] == roi, :]
adata_neuron_subset = adata_neuron[adata_neuron.obs['ROIGroup'] == roi, :]

In [22]:
adata_nonneuron_subset

View of AnnData object with n_obs × n_vars = 184265 × 59480
    obs: 'ROIGroup', 'ROIGroupCoarse', 'ROIGroupFine', 'roi', 'organism_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'assay_ontology_term_id', 'sex_ontology_term_id', 'development_stage_ontology_term_id', 'donor_id', 'dissection', 'cell_cycle_score', 'sample_id', 'cluster_id', 'subcluster_id', 'supercluster_term', 'cell_type_ontology_term_id', 'tissue_ontology_term_id', 'group'
    var: 'Biotype', 'Chromosome', 'End', 'Gene', 'Start'
    uns: 'batch_condition', 'schema_version', 'title'
    obsm: 'X_UMAP', 'X_tSNE'

In [23]:
adata_neuron_subset

View of AnnData object with n_obs × n_vars = 257337 × 59480
    obs: 'ROIGroup', 'ROIGroupCoarse', 'ROIGroupFine', 'roi', 'organism_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'assay_ontology_term_id', 'sex_ontology_term_id', 'development_stage_ontology_term_id', 'donor_id', 'dissection', 'cell_cycle_score', 'sample_id', 'cluster_id', 'subcluster_id', 'supercluster_term', 'cell_type_ontology_term_id', 'tissue_ontology_term_id', 'group'
    var: 'Biotype', 'Chromosome', 'End', 'Gene', 'Start'
    uns: 'batch_condition', 'schema_version', 'title'
    obsm: 'X_UMAP', 'X_tSNE'

In [24]:
ad_concat = ad.concat([adata_neuron_subset,adata_nonneuron_subset], merge="same")

In [28]:
ad_concat.obs

Unnamed: 0_level_0,ROIGroup,ROIGroupCoarse,ROIGroupFine,roi,organism_ontology_term_id,disease_ontology_term_id,self_reported_ethnicity_ontology_term_id,assay_ontology_term_id,sex_ontology_term_id,development_stage_ontology_term_id,donor_id,dissection,cell_cycle_score,sample_id,cluster_id,subcluster_id,supercluster_term,cell_type_ontology_term_id,tissue_ontology_term_id,group
CellID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
10X386_2:CATGGATTCTCGACGG,Hindbrain,Myelencephalon,Myelencephalon,Human MoAN,NCBITaxon:9606,PATO:0000461,unknown,EFO:0009922,PATO:0000384,HsapDv:0000136,H19.30.001,Myelencephalon (medulla oblongata) (Mo) - affe...,0.002146,10X386_2,312,20,Upper rhombic lip,CL:0000540,UBERON_0005290,Hindbrain_Neuron
10X383_5:TCTTGCGGTGAATTGA,Hindbrain,Myelencephalon,Myelencephalon,Human MoSR,NCBITaxon:9606,PATO:0000461,unknown,EFO:0009922,PATO:0000384,HsapDv:0000123,H19.30.002,Myelencephalon (medulla oblongata) (Mo) - sens...,0.001790,10X383_5,312,20,Upper rhombic lip,CL:0000540,UBERON_0005290,Hindbrain_Neuron
10X386_2:CTCATCGGTCGAGCAA,Hindbrain,Myelencephalon,Myelencephalon,Human MoAN,NCBITaxon:9606,PATO:0000461,unknown,EFO:0009922,PATO:0000384,HsapDv:0000136,H19.30.001,Myelencephalon (medulla oblongata) (Mo) - affe...,0.000000,10X386_2,312,17,Upper rhombic lip,CL:0000540,UBERON_0005290,Hindbrain_Neuron
10X378_8:TTGGATGAGACAAGCC,Hindbrain,Pons,Pons,Human PnAN,NCBITaxon:9606,PATO:0000461,unknown,EFO:0009922,PATO:0000384,HsapDv:0000123,H19.30.002,Pons (Pn) - afferent nuclei of cranial nerves ...,0.002279,10X378_8,312,18,Upper rhombic lip,CL:0000540,UBERON:0000988,Hindbrain_Neuron
10X387_7:TGAACGTAGTATTCCG,Hindbrain,Myelencephalon,Myelencephalon,Human MoAN,NCBITaxon:9606,PATO:0000461,unknown,EFO:0009922,PATO:0000384,HsapDv:0000136,H19.30.001,Myelencephalon (medulla oblongata) (Mo) - affe...,0.002276,10X387_7,312,16,Upper rhombic lip,CL:0000540,UBERON_0005290,Hindbrain_Neuron
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10X176_5:TCAGGGCGTCTTGCGG,Hindbrain,Cerebellum,Cerebellum,Human CbDN,NCBITaxon:9606,PATO:0000461,unknown,EFO:0009922,PATO:0000384,HsapDv:0000144,H18.30.002,Cerebellum (CB) - Cerebellar deep nuclei - CbDN,0.001098,10X176_5,12,3264,Microglia,CL:0000878,UBERON:0002037,Hindbrain_Microglia
10X382_7:TCTTCCTAGGGCAACT,Hindbrain,Myelencephalon,Myelencephalon,Human IO,NCBITaxon:9606,PATO:0000461,unknown,EFO:0009922,PATO:0000384,HsapDv:0000123,H19.30.002,Myelencephalon (medulla oblongata) (Mo) - prec...,0.000728,10X382_7,12,3265,Microglia,CL:0000878,UBERON_0005290,Hindbrain_Microglia
10X176_5:GCTTTCGAGTAAACAC,Hindbrain,Cerebellum,Cerebellum,Human CbDN,NCBITaxon:9606,PATO:0000461,unknown,EFO:0009922,PATO:0000384,HsapDv:0000144,H18.30.002,Cerebellum (CB) - Cerebellar deep nuclei - CbDN,0.001031,10X176_5,12,3264,Microglia,CL:0000878,UBERON:0002037,Hindbrain_Microglia
10X176_5:TCATGCCGTGGTCTGC,Hindbrain,Cerebellum,Cerebellum,Human CbDN,NCBITaxon:9606,PATO:0000461,unknown,EFO:0009922,PATO:0000384,HsapDv:0000144,H18.30.002,Cerebellum (CB) - Cerebellar deep nuclei - CbDN,0.002639,10X176_5,12,3264,Microglia,CL:0000878,UBERON:0002037,Hindbrain_Microglia


In [25]:
257337+184265

441602

In [35]:
f"scRNA_seq/{roi.replace(' ','-')}_combined.h5ad"

'scRNA_seq/Hindbrain_combined.h5ad'

In [37]:
ad_concat.write(f"scRNA_seq/{roi.replace(' ','-')}_combined.h5ad")

# export sub tissue tables

## check variables and observations

In [None]:
# Export one neuron-only .h5ad per ROIGroup, skipping groups already on disk.
for roi_group in set(adata_neuron.obs['ROIGroup']):
    print(roi_group)
    neuron_path = f"scRNA_seq/{roi_group.replace(' ','-')}_neuron.h5ad"
    if os.path.exists(neuron_path):
        print('file already exists')
        continue

    in_group = adata_neuron.obs['ROIGroup'] == roi_group
    adata_neuron_subset = adata_neuron[in_group, :]
    adata_neuron_subset.write(neuron_path)


In [9]:
# Restrict the neuron data to the cerebral cortex.
# NOTE(review): this rebinds `adata_neuron` to the subset, so later cells that use
# `adata_neuron` see only cortex cells until the full file is re-read.
roi='Cerebral cortex'
adata_neuron = adata_neuron[adata_neuron.obs['ROIGroup'] == roi, :]

In [17]:
# Export one non-neuron .h5ad per ROIGroup, skipping groups already on disk.
for roi_group in set(adata_nonneuron.obs['ROIGroup']):
    print(roi_group)
    nonneuron_path = f"scRNA_seq/{roi_group.replace(' ','-')}_nonneuron.h5ad"
    if os.path.exists(nonneuron_path):
        print('file already exists')
        continue

    print('exporting file')
    in_group = adata_nonneuron.obs['ROIGroup'] == roi_group
    adata_nonneuron_subset = adata_nonneuron[in_group, :]
    adata_nonneuron_subset.write(nonneuron_path)


Paleocortex
exporting file


  df[key] = c


Thalamus
exporting file
Hippocampus
exporting file
Epithalamus
exporting file
Cerebral cortex
file already exists
Hypothalamus
exporting file
Cerebral nuclei
exporting file
Spinal cord
exporting file
Hindbrain
file already exists
Midbrain
exporting file


In [22]:
adata_neuron_subset.obs.head()

Unnamed: 0_level_0,ROIGroup,ROIGroupCoarse,ROIGroupFine,roi,organism_ontology_term_id,disease_ontology_term_id,self_reported_ethnicity_ontology_term_id,assay_ontology_term_id,sex_ontology_term_id,development_stage_ontology_term_id,donor_id,dissection,cell_cycle_score,sample_id,cluster_id,subcluster_id,supercluster_term,cell_type_ontology_term_id,tissue_ontology_term_id,group
CellID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
10X264_2:GATGAGGGTGTTAGCT,Cerebral cortex,Cerebral cortex,Cerebral cortex,Human MEC,NCBITaxon:9606,PATO:0000461,unknown,EFO:0009922,PATO:0000384,HsapDv:0000123,H19.30.002,Cerebral cortex (Cx) - Anterior parahippocampa...,0.001388,10X264_2,311,13,Upper rhombic lip,CL:0000540,UBERON:0000956,Cerebral-cortex_Neuron
10X384_6:GCTTCACCAACCGTAT,Cerebral cortex,Cerebral cortex,Cerebral cortex,Human TF,NCBITaxon:9606,PATO:0000461,unknown,EFO:0009922,PATO:0000384,HsapDv:0000136,H19.30.001,Cerebral cortex (Cx) - Occipitotemporal (fusif...,0.001908,10X384_6,311,10,Upper rhombic lip,CL:0000540,UBERON:0000956,Cerebral-cortex_Neuron
10X216_6:CAAGCTATCGGCGATC,Cerebral cortex,Cerebral cortex,Cerebral cortex,Human A29-A30,NCBITaxon:9606,PATO:0000461,unknown,EFO:0009922,PATO:0000384,HsapDv:0000144,H18.30.002,"Cerebral cortex (Cx) - Cingulate gyrus, retros...",0.000674,10X216_6,311,11,Upper rhombic lip,CL:0000540,UBERON:0000956,Cerebral-cortex_Neuron
10X264_1:TTCATGTCACTTTATC,Cerebral cortex,Cerebral cortex,Cerebral cortex,Human MEC,NCBITaxon:9606,PATO:0000461,unknown,EFO:0009922,PATO:0000384,HsapDv:0000123,H19.30.002,Cerebral cortex (Cx) - Anterior parahippocampa...,0.000802,10X264_1,311,12,Upper rhombic lip,CL:0000540,UBERON:0000956,Cerebral-cortex_Neuron
10X384_6:CAAGACTCATCAGTCA,Cerebral cortex,Cerebral cortex,Cerebral cortex,Human TF,NCBITaxon:9606,PATO:0000461,unknown,EFO:0009922,PATO:0000384,HsapDv:0000136,H19.30.001,Cerebral cortex (Cx) - Occipitotemporal (fusif...,0.001765,10X384_6,311,10,Upper rhombic lip,CL:0000540,UBERON:0000956,Cerebral-cortex_Neuron


In [None]:
adata_neuron_subset.X

In [None]:
# Peek at the first five rows (cells) of the expression matrix.
# `.X` is a matrix, not a DataFrame, so it has no `.head()`; slice rows instead.
adata_nonneuron_subset.X[:5]

In [10]:
adata.obs.columns

Index(['ROIGroup', 'ROIGroupCoarse', 'ROIGroupFine', 'roi',
       'organism_ontology_term_id', 'disease_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id', 'assay_ontology_term_id',
       'sex_ontology_term_id', 'development_stage_ontology_term_id',
       'donor_id', 'dissection', 'cell_cycle_score', 'sample_id', 'cluster_id',
       'subcluster_id', 'supercluster_term', 'cell_type_ontology_term_id',
       'tissue_ontology_term_id'],
      dtype='object')

## combine sub-tissue

In [8]:
import os
import h5py
import scanpy as sc
import pandas as pd
import numpy as np
import json
import math
import anndata as ad

In [2]:
# Work from the project root so the relative 'scRNA_seq/...' paths used below resolve.
os.chdir('/tscc/projects/ps-palmer/brittany/SUD_cross_species/')

In [4]:
# ROIGroup values to combine; spaces are replaced by hyphens when building file names.
roi_list = [
    'Epithalamus',
    'Hindbrain',
    'Hippocampus',
    'Hypothalamus',
    'Midbrain',
    'Paleocortex',
    'Spinal cord',
    'Thalamus',
    'Cerebral cortex',
    'Cerebral nuclei',
]

In [None]:
# Combine the neuron and non-neuron exports of the current `roi` into one file.
#
# Both inputs are read from disk, so the cell no longer depends on an
# `adata_neuron` left over from an earlier cell, nor on `csr_matrix`, which this
# notebook never imports. `ad.concat` joins the two objects directly instead of
# converting both expression matrices to dense DataFrames first.
roi=roi.replace(' ','-')
print('combining files')
file_path_neuron=f'scRNA_seq/{roi}_neuron.h5ad'
file_path_nonneuron=f'scRNA_seq/{roi}_nonneuron.h5ad'
adata_neuron = sc.read_h5ad(file_path_neuron)
adata_nonneuron = sc.read_h5ad(file_path_nonneuron)

# Every neuron of this ROI gets the same group label.
adata_neuron.obs['group']=f'{roi}_Neuron'

# Non-neuronal cells first, then neurons (same row order as the previous version).
# merge="same" keeps the var columns that are identical in both inputs.
adata = ad.concat([adata_nonneuron, adata_neuron], merge="same")

adata.write(f"scRNA_seq/{roi.replace(' ','-')}_combined.h5ad",compression='gzip')

combining files


In [10]:
#export neuronal tissue groups
for roi in roi_list:
    print(roi)
    if os.path.exists(f"scRNA_seq/{roi.replace(' ','-')}_combined.h5ad"):
        print(f"scRNA_seq/{roi.replace(' ','-')}_combined.h5ad already exists")
    else:
        roi=roi.replace(' ','-')
        print('combining files')
        #file_path_neuron=f'scRNA_seq/{roi}_neuron.h5ad'
        file_path_nonneuron=f'scRNA_seq/{roi}_nonneuron.h5ad'
        #adata_neuron = sc.read_h5ad(file_path_neuron)
        adata_neuron.X = csr_matrix(adata_neuron.X)
        adata_nonneuron = sc.read_h5ad(file_path_nonneuron)

        X_non=adata_nonneuron.to_df()
        X_n=adata_neuron.to_df()
        X=pd.concat([X_non,X_n])
        
        adata_neuron.obs['group']=f'{roi}_Neuron'
        obs=pd.concat([adata_nonneuron.obs,adata_neuron.obs])
        
        adata = ad.AnnData(X,obs,adata_nonneuron.var)
        
        adata.write(f"scRNA_seq/{roi.replace(' ','-')}_combined.h5ad")

Epithalamus
scRNA_seq/Epithalamus_combined.h5ad already exists
Hindbrain
scRNA_seq/Hindbrain_combined.h5ad already exists
Hippocampus
scRNA_seq/Hippocampus_combined.h5ad already exists
Hypothalamus
scRNA_seq/Hypothalamus_combined.h5ad already exists
Midbrain
scRNA_seq/Midbrain_combined.h5ad already exists
Paleocortex
scRNA_seq/Paleocortex_combined.h5ad already exists
Spinal cord
scRNA_seq/Spinal-cord_combined.h5ad already exists
Thalamus
scRNA_seq/Thalamus_combined.h5ad already exists
Cerebral cortex
combining files


BlockingIOError: [Errno 11] Unable to synchronously open file (unable to lock file, errno = 11, error message = 'Resource temporarily unavailable')

In [20]:
# Read the neuron and non-neuron exports for `roi` (no backed mode is requested here).
# NOTE(review): unlike other cells, no space-to-hyphen replacement is applied to
# `roi` when building these paths; confirm `roi` is already hyphenated at this point.
file_path_neuron=f'scRNA_seq/{roi}_neuron.h5ad'
file_path_nonneuron=f'scRNA_seq/{roi}_nonneuron.h5ad'
adata_neuron = sc.read_h5ad(file_path_neuron)
adata_nonneuron = sc.read_h5ad(file_path_nonneuron)

In [28]:
X_non=adata_nonneuron.to_df()

In [29]:
X_n=adata_neuron.to_df()

In [31]:
X=pd.concat([X_non,X_n])

In [40]:
adata_neuron.obs['group']=f'{roi}_Neuron'

In [41]:
obs=pd.concat([adata_nonneuron.obs,adata_neuron.obs])

In [57]:
adata.X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int16)

In [50]:
adata = ad.AnnData(X,obs,adata_nonneuron.var)

In [55]:
adata.write(f"scRNA_seq/{roi.replace(' ','-')}_combined.h5ad")

# analyze reads using scanpy

In [3]:
import os
import h5py
import scanpy as sc
import pandas as pd
import numpy as np
import json
import math
import nexusformat.nexus as nx
import anndata as ad

In [4]:
# Work from the project root so the relative 'scRNA_seq/...' paths used below resolve.
os.chdir('/tscc/projects/ps-palmer/brittany/SUD_cross_species/')

In [5]:
# ROIGroup values to summarise; one set of pseudobulk tables is written per entry.
roi_list = [
    'Epithalamus',
    'Hindbrain',
    'Hippocampus',
    'Hypothalamus',
    'Midbrain',
    'Paleocortex',
    'Spinal cord',
    'Thalamus',
    'Cerebral nuclei',
    'Cerebral cortex',
]

In [6]:
# Per-group summary tables for the cerebral cortex (genes as rows, groups as columns).
# NOTE(review): this cell applies log1p only, whereas the loop over `roi_list` also
# runs filter_cells / filter_genes / normalize_total first. Confirm whether the
# cortex tables are meant to be comparable with the other ROIs.
roi='Cerebral cortex'
adata=sc.read_h5ad(f"scRNA_seq/{roi.replace(' ','-')}_combined.h5ad")
sc.pp.log1p(adata)

# Build the dense cells-by-genes frame and the groupby once and reuse them;
# previously both were rebuilt for each of the five tables.
expr = adata.to_df()
by_group = expr.groupby(adata.obs['group'])
out_prefix = f'scRNA_seq/processed/{roi}_pseudobulk'

by_group.mean().T.to_csv(f'{out_prefix}_mean.csv')
by_group.sum().T.to_csv(f'{out_prefix}_sum.csv')
by_group.std().T.to_csv(f'{out_prefix}_stdev.csv')
# Number of cells per group with a value above zero, per gene.
by_group.apply(lambda df: (df > 0).sum()).T.to_csv(f'{out_prefix}_count_nonzero.csv')
# Number of non-missing values per group, per gene.
by_group.count().T.to_csv(f'{out_prefix}_count.csv')


In [87]:
roi='Hippocampus'

In [88]:
adata=sc.read_h5ad(f"scRNA_seq/{roi.replace(' ','-')}_combined.h5ad")

In [89]:
sc.pp.log1p(adata)

In [None]:
adata.to_df().head()

In [12]:
adata.to_df().groupby(adata.obs['group']).count().T

group,Epithalamus_Astrocyte,Epithalamus_Committed-oligodendrocyte-precursor,Epithalamus_Ependymal,Epithalamus_Fibroblast,Epithalamus_Microglia,Epithalamus_Neuron,Epithalamus_Oligodendrocyte,Epithalamus_Oligodendrocyte-precursor,Epithalamus_Vascular
Accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Cas9,239,4,58,60,184,22279,1218,242,43
EGFP,239,4,58,60,184,22279,1218,242,43
ENSG00000000003.15,239,4,58,60,184,22279,1218,242,43
ENSG00000000005.6,239,4,58,60,184,22279,1218,242,43
ENSG00000000419.13,239,4,58,60,184,22279,1218,242,43
...,...,...,...,...,...,...,...,...,...
pCS-Cherry-DEST_101-850,239,4,58,60,184,22279,1218,242,43
pCS-Cre2_51-1150,239,4,58,60,184,22279,1218,242,43
pET-mOrange,239,4,58,60,184,22279,1218,242,43
pcDNA3-CFP_951-1700,239,4,58,60,184,22279,1218,242,43


In [28]:
# When True, the pseudobulk loop below recomputes and overwrites the tables even if they already exist.
rerun=True

In [None]:
# For each ROI: filter and normalize the combined counts, then write per-group
# summary tables (genes as rows, groups as columns).
for roi in roi_list:
    print(roi)
    out_prefix = f'scRNA_seq/processed/{roi}_pseudobulk'
    # Recompute when forced via `rerun` or when the count_nonzero table is missing.
    if rerun or not os.path.exists(f'{out_prefix}_count_nonzero.csv'):
        print('reading in and normalizing data')
        adata=sc.read_h5ad(f"scRNA_seq/{roi.replace(' ','-')}_combined.h5ad")
        sc.pp.filter_cells(adata, min_genes=500)
        sc.pp.filter_genes(adata, min_counts=10)
        sc.pp.normalize_total(adata, target_sum=1e4, exclude_highly_expressed=False)
        sc.pp.log1p(adata)
        print('writing roi data')
        # Build the dense cells-by-genes frame and the groupby once per ROI;
        # previously both were rebuilt for each of the five tables.
        expr = adata.to_df()
        by_group = expr.groupby(adata.obs['group'])
        by_group.mean().T.to_csv(f'{out_prefix}_mean.csv')
        by_group.sum().T.to_csv(f'{out_prefix}_sum.csv')
        by_group.std().T.to_csv(f'{out_prefix}_stdev.csv')
        # Number of cells per group with a value above zero, per gene.
        by_group.apply(lambda df: (df > 0).sum()).T.to_csv(f'{out_prefix}_count_nonzero.csv')
        # Number of non-missing values per group, per gene.
        by_group.count().T.to_csv(f'{out_prefix}_count.csv')
        # Drop the dense frame before the next ROI is read.
        del expr, by_group
    else:
        print('files exists- skipping')

Epithalamus
reading in and normalizing data
writing roi data
Hindbrain
reading in and normalizing data
writing roi data
Hippocampus
reading in and normalizing data
writing roi data
Hypothalamus
reading in and normalizing data
writing roi data
Midbrain
reading in and normalizing data
writing roi data
Paleocortex
reading in and normalizing data
writing roi data
Spinal cord
reading in and normalizing data
writing roi data
Thalamus
reading in and normalizing data
writing roi data
Cerebral nuclei
reading in and normalizing data
writing roi data


In [None]:
# Same steps as one iteration of the pseudobulk loop, run for the current `roi`:
# filter, normalize, log-transform, then write the per-group summary tables.
adata=sc.read_h5ad(f"scRNA_seq/{roi.replace(' ','-')}_combined.h5ad")
sc.pp.filter_cells(adata, min_genes=500)
sc.pp.filter_genes(adata, min_counts=10)
sc.pp.normalize_total(adata, target_sum=1e4, exclude_highly_expressed=False)
sc.pp.log1p(adata)
print('writing roi data')

# Build the dense cells-by-genes frame and the groupby once and reuse them;
# previously both were rebuilt for each of the five tables.
expr = adata.to_df()
by_group = expr.groupby(adata.obs['group'])
out_prefix = f'scRNA_seq/processed/{roi}_pseudobulk'

by_group.mean().T.to_csv(f'{out_prefix}_mean.csv')
by_group.sum().T.to_csv(f'{out_prefix}_sum.csv')
by_group.std().T.to_csv(f'{out_prefix}_stdev.csv')
# Number of cells per group with a value above zero, per gene.
by_group.apply(lambda df: (df > 0).sum()).T.to_csv(f'{out_prefix}_count_nonzero.csv')
# Number of non-missing values per group, per gene.
by_group.count().T.to_csv(f'{out_prefix}_count.csv')


In [None]:
# Write the per-group summary tables for the `adata` currently in memory
# (genes as rows, groups as columns). The dense frame and the groupby are built
# once and reused; previously both were rebuilt for each of the five tables.
expr = adata.to_df()
by_group = expr.groupby(adata.obs['group'])
out_prefix = f'scRNA_seq/processed/{roi}_pseudobulk'

by_group.mean().T.to_csv(f'{out_prefix}_mean.csv')
by_group.sum().T.to_csv(f'{out_prefix}_sum.csv')
by_group.std().T.to_csv(f'{out_prefix}_stdev.csv')
# Number of cells per group with a value above zero, per gene.
by_group.apply(lambda df: (df > 0).sum()).T.to_csv(f'{out_prefix}_count_nonzero.csv')
# Number of non-missing values per group, per gene.
by_group.count().T.to_csv(f'{out_prefix}_count.csv')
