purpose: use scRNA-seq data to test for enrichment in different brain tissues

In [8]:
import loompy
import os
import h5py
import scanpy as sc
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind
import json
import math
import cellxgene_census
import nexusformat.nexus as nx

  def twobit_to_dna(twobit: int, size: int) -> str:
  def dna_to_twobit(dna: str) -> int:
  def twobit_1hamming(twobit: int, size: int) -> List[int]:


In [2]:
os.chdir('/tscc/projects/ps-palmer/brittany/SUD_cross_species/')

# data downloaded from cellxgene

Condensed dataset: The condensed dataset reports the same measurements for all possible combinations of organism, tissue, cell type, and gene.

In [None]:
exp=pd.read_csv('scRNA_seq/expression-summary-condensed-03-11-24.csv.gz',compression='gzip')

In [43]:
exp=exp[(exp['tissue_name']=='brain')&(exp['organism_name']=='Homo sapiens')]

In [44]:
set(exp['cell_type_name'])

{'B cell',
 'Bergmann glial cell',
 'CD1c-positive myeloid dendritic cell',
 'CD4-positive, alpha-beta T cell',
 'CD8-positive, alpha-beta T cell',
 'Cajal-Retzius cell',
 'GABAergic neuron',
 'L2/3-6 intratelencephalic projecting glutamatergic cortical neuron',
 'L5 extratelencephalic projecting glutamatergic cortical neuron',
 'L5/6 near-projecting glutamatergic neuron of the primary motor cortex',
 'L6 corticothalamic-projecting glutamatergic cortical neuron',
 'L6 intratelencephalic projecting glutamatergic neuron of the primary motor cortex',
 'L6b glutamatergic cortical neuron',
 'Purkinje cell',
 'T cell',
 'alveolar macrophage',
 'astrocyte',
 'astrocyte of the cerebral cortex',
 'blood vessel endothelial cell',
 'brain vascular cell',
 'bronchus fibroblast of lung',
 'capillary endothelial cell',
 'caudal ganglionic eminence derived GABAergic cortical interneuron',
 'cell',
 'central nervous system macrophage',
 'cerebellar granule cell',
 'cerebellar granule cell precursor',


The full dataset contains the summed expression, number of cells with nonzero expression, and total number of cells for all possible combinations of metadata displayed in Gene Expression (organism, tissue, gene, cell type, dataset, disease, self-reported ethnicity, publication, and sex).

#pc=n/n_cells_cell_type
#tpc=n/n_cells_tissue

In [None]:
exp_full=pd.read_csv('scRNA_seq/express_full_hmBrain.csv.gz',header=None,low_memory=False)
exp_full.columns=[0,'tissue_ontology_term_id','organism_ontology_term_id','tissue_original_ontology_term_id','dataset_id','disease_ontology_term_id','self_reported_ethnicity_ontology_term_id','sex_ontology_term_id',
'publication_citation','gene_ontology_term_id','cell_type_ontology_term_id',
'n_cells_cell_type','n_cells_tissue','n','me','pc','tpc','tissue_name','organism_name','gene_name',
                  'cell_type_name','disease_name','self_reported_ethnicity_name','sex_name','tissue_original_name']


In [None]:
exp_full=exp_full[exp_full['disease_name']=='normal']

In [None]:
ensr_map=pd.read_csv('scRNA_seq/ensr_mapping.tsv',sep='\t')

In [None]:
ensr_map=ensr_map[['Ensembl gene ID','Approved symbol']]

In [None]:
ensr_map.columns

In [None]:
t=exp_full.merge(ensr_map, left_on='gene_ontology_term_id',right_on='Ensembl gene ID',how='left')

In [None]:
t=t[(t.publication_citation.str.contains('Siletti'))]

In [None]:
set(t.cell_type_name)

In [None]:
set(t.tissue_original_name)

In [None]:
#tmini=t[(t.cell_type_name=='astrocyte')&(t.tissue_original_name=='cerebellum')][['n_cells_cell_type','n_cells_tissue','n','me','pc','tpc','Approved symbol']]

In [None]:
# Function to calculate effect size (Cohen's d)
def cohen_d(x1, x2):
    n1, n2 = len(x1), len(x2)
    print(f'{n1}, {n2}')
    #n needs to change for 
    s1, s2 = np.var(x1, ddof=1), np.var(x2, ddof=1)
    pooled_sd = np.sqrt((s1 / n1) + (s2 / n2))
    return (np.mean(x1) - np.mean(x2)) / pooled_sd

In [None]:
tissue='cerebellum'
df=t[(t.tissue_original_name==tissue)][['cell_type_name','n_cells_cell_type','n_cells_tissue','n','me','pc','tpc','Approved symbol']]
#tmini=t[(t.tissue_original_name==tissue)]
df=df[~df['Approved symbol'].isna()]
df['CPTT']=df['me'].apply(math.exp)-1
selected_cell_type = "oligodendrocyte precursor cell"

In [None]:
df

In [None]:
gene_data = df[df["Approved symbol"] == gene]
gene_data

In [101]:
selected_data=gene_data[gene_data["cell_type_name"] == selected_cell_type]
selected_data

Unnamed: 0,cell_type_name,n_cells_cell_type,n_cells_tissue,n,me,pc,tpc,Approved symbol,CPTT
16152112,oligodendrocyte precursor cell,1729,15171416,1001,1.877284,0.578947,6.6e-05,KMT2C,5.535731
16594197,oligodendrocyte precursor cell,3862,15171416,2101,1.899819,0.544019,0.000138,KMT2C,5.684687
16891748,oligodendrocyte precursor cell,564,15171416,337,1.877975,0.597518,2.2e-05,KMT2C,5.540247


In [114]:
np.log(np.mean(selected_data['CPTT'])+1)

1.8850810685725548

In [116]:
other_data=gene_data[gene_data["cell_type_name"] == other_cell_type]
other_data

Unnamed: 0,cell_type_name,n_cells_cell_type,n_cells_tissue,n,me,pc,tpc,Approved symbol,CPTT
16152107,Bergmann glial cell,19,15171416,12,1.905766,0.631579,7.909611e-07,KMT2C,5.724557
16594192,Bergmann glial cell,6614,15171416,3442,1.894853,0.520411,0.000226874,KMT2C,5.65157
16891743,Bergmann glial cell,1070,15171416,615,1.858607,0.574766,4.053676e-05,KMT2C,5.414793


In [117]:
np.log(np.mean(other_data['CPTT'])+1)

1.8866109432361151

In [None]:
effect_size_dict={}
marker_score_dict={}
print(selected_cell_type)
# Loop through each gene
for gene in df[(df['cell_type_name']==selected_cell_type)]["Approved symbol"].unique():
    gene_data = df[df["Approved symbol"] == gene]

    # Data for the selected cell type
    selected_data = gene_data[gene_data["cell_type_name"] == selected_cell_type]["me"]

    effect_sizes = []
    for other_cell_type in gene_data["cell_type_name"].unique():
        if other_cell_type == selected_cell_type:
            continue

        # Data for the other cell type
        other_data = gene_data[gene_data["cell_type_name"] == other_cell_type]["me"]

        # Perform Welch's t-test
        t_stat, p_value = ttest_ind(selected_data, other_data, equal_var=False,nan_policy='omit')
        # Calculate effect size (Cohen's d)
        effect_size = cohen_d(selected_data, other_data)
        effect_sizes.append(effect_size)

    # Store the effect sizes and marker scores
    effect_size_dict[gene] = effect_sizes
    marker_score_dict[gene] = np.nanpercentile(effect_sizes, 10) if len(effect_sizes) > 0 else np.nan
    print(gene)
    print(t_stat)
    #print(effect_size_dict[gene])
    #print(marker_score_dict[gene])
#write to file
#with open(f"scRNA_seq/effect_size_{tissue}_{selected_cell_type.replace(' ','-')}.json", "w") as f:
#    json.dump(effect_size_dict, f)
#with open(f"scRNA_seq/marker_score_{tissue}_{selected_cell_type.replace(' ','-')}.json", "w") as f:
#    json.dump(marker_score_dict, f)

oligodendrocyte precursor cell
TSPAN6
nan
DPM1
-2.40941511170784
SCYL3
-0.8700986264481885
FIRRM
nan
FUCA2
-2.555677532803712


  t_stat, p_value = ttest_ind(selected_data, other_data, equal_var=False,nan_policy='omit')


GCLC
-0.6986876413153956
NFYA
0.597950979246014
STPG1
-1.247167046247622
NIPAL3
nan
LAS1L
-0.426780216186498
ENPP4
3.784512657627534
CFTR
nan
ANKIB1
-1.9360894486349347
CYP51A1
-0.3619100525844927
KRIT1
nan
RAD52
0.33320329400346715
BAD
0.5336871110017712
LAP3
-0.5256599915666037
HS3ST1
nan
AOC1
nan
HECW1
nan
MAD1L1
0.04710282413117115
LASP1
-0.5877063475427037
SNX11
nan
TMEM176A
0.7013730449626965
M6PR
-0.4958359385962111
KLHL13
nan
CYP26B1
nan
ICA1
nan
DBNDD1
nan
ALS2
0.40593377250643264
CFLAR
-3.6949231960415565
NDUFAF7
0.40460112153480365
RBM5
-1.5267814233561365
MTMR7
0.514192783210864
SLC7A2
0.6763210491682354
ARF5
-0.18261953238885828
SARM1
-8.1866822384638
POLDIP2
1.3605558063298404
AK2
nan
CD38
nan
FKBP4
-1.340679185395109
KDM1A
-0.10200442442797283
RBM6
-3.859710035233859
RECQL
-0.5959680493180852
VPS50
1.0981883172584854
ARHGAP33
nan
NDUFAB1
0.131563261149859
PDK4
-0.5933570193898107
SLC25A13
-2.8390394383725472
ST7
-0.5075088157643982
CDC27
-1.0139851087327925
HCCS
nan
DVL2

In [24]:
for selected_cell_type in set(df.cell_type_name):
    effect_size_dict={}
    marker_score_dict={}
    print(selected_cell_type)
    # Loop through each gene
    for gene in df[(df['cell_type_name']==selected_cell_type)]["Approved symbol"].unique():
        #subset for gene of interest
        gene_data = df[df["Approved symbol"] == gene]
    
        # subset for cell type of interest
        #selected_data = gene_data[gene_data["cell_type_name"] == selected_cell_type]["me"]
        selected_me=np.log(np.mean(selected_data['CPTT'])+1)
        effect_sizes = []
        #loop through all other cell types
        for other_cell_type in gene_data["cell_type_name"].unique():
            if other_cell_type == selected_cell_type:
                continue
            # Data for the other cell type
            #other_data = gene_data[gene_data["cell_type_name"] == other_cell_type]["me"]
            other_me=np.log(np.mean(other_me['CPTT'])+1)
            # Perform Welch's t-test
            t_stat, p_value = ttest_ind(selected_data, other_data, equal_var=False)
    
            # Calculate effect size (Cohen's d)
            effect_size = cohen_d(selected_data, other_data)
            effect_sizes.append(effect_size)
    
        # Store the effect sizes and marker scores
        effect_size_dict[gene] = effect_sizes
        marker_score_dict[gene] = np.nanpercentile(effect_sizes, 10) if len(effect_sizes) > 0 else np.nan
        print(gene)
        #print(effect_size_dict[gene])
        #print(marker_score_dict[gene])
        # Update the main DataFrame
        #df.loc[df["Approved symbol"] == gene, ["effect_size", "marker_score"]] = gene_data[["effect_size", "marker_score"]]
    with open(f"scRNA_seq/effect_size_{tissue}_{selected_cell_type.replace(' ','-')}.json", "w") as f:
        json.dump(effect_size_dict, f)
    with open(f"scRNA_seq/marker_score_{tissue}_{selected_cell_type.replace(' ','-')}.json", "w") as f:
        json.dump(marker_score_dict, f)

fibroblast
3, 3
3, 3
3, 2
3, 2
3, 2
3, 1
3, 1
3, 3
3, 2
3, 2
3, 3
3, 3
3, 1
3, 2
3, 1
TSPAN6
3, 3
3, 3
3, 3
3, 3
3, 3
3, 3
3, 3
3, 3
3, 2
3, 3
3, 3
3, 3
3, 2
DPM1
2, 3
2, 3
2, 3
2, 3
2, 2
2, 3
2, 3
2, 2
2, 2
2, 3
2, 3
2, 3
2, 2
SCYL3
2, 3
2, 3
2, 3
2, 3
2, 3
2, 3
2, 3
2, 3
2, 3
2, 3
2, 1
2, 2
2, 1
2, 1
FIRRM
3, 3
3, 3
3, 3
3, 1
3, 3
3, 3
3, 3
3, 3
3, 3
3, 2
3, 3
3, 3
3, 1
3, 2
3, 1
CFH
3, 3
3, 3
3, 3
3, 2
3, 3


  t_stat, p_value = ttest_ind(selected_data, other_data, equal_var=False)


3, 2
3, 2
3, 3
3, 2
3, 2
3, 3
3, 3
3, 3
3, 1
3, 2
FUCA2
3, 3
3, 1
3, 1
3, 3
3, 3
3, 3
3, 3
3, 3
3, 3
3, 3
3, 3
3, 3
3, 3
3, 3
3, 3
GCLC
3, 3
3, 3
3, 3
3, 3
3, 2
3, 3
3, 2
3, 2
3, 3
3, 3
3, 2
3, 3
3, 3
3, 3
3, 2
NFYA
3, 3
3, 1
3, 3
3, 3
3, 3
3, 3
3, 3
3, 3
3, 3
3, 3
3, 3
3, 3
3, 1
3, 2
3, 1
3, 1
NIPAL3
2, 3
2, 2
2, 3
2, 3
2, 3
2, 3
2, 2
2, 3
2, 3
2, 3
2, 2
2, 2
2, 2
LAS1L
3, 3
3, 3
3, 2
3, 3
3, 2
3, 3
3, 3
3, 3
3, 3
3, 3
3, 3
3, 2
ENPP4
3, 3
3, 2
3, 2
3, 3
3, 3
3, 3
3, 3
3, 3
3, 3
3, 3
3, 3
3, 3
3, 3
3, 1
3, 3
3, 3
3, 3
3, 3
ANKIB1
3, 3
3, 1
3, 1
3, 3
3, 2
3, 3
3, 3
3, 3
3, 3
3, 3
3, 3
3, 3
3, 3
3, 3
3, 2
CYP51A1
2, 2
2, 2
2, 2
2, 1
2, 3
2, 3
2, 1
2, 2
2, 3
2, 2
2, 1
2, 1
2, 1
KRIT1
3, 3
3, 1
3, 1
3, 3
3, 3
3, 3
3, 3
3, 3
3, 3
3, 3
3, 2
3, 3
3, 3
3, 3
3, 2
RAD52
3, 3
3, 1
3, 1
3, 2
3, 2
3, 3
3, 2
3, 3
3, 1
3, 1
3, 3
3, 3
3, 1
3, 3
3, 3
3, 2
3, 2
BAD
3, 3
3, 3
3, 3
3, 3
3, 3
3, 3
3, 1
3, 2
3, 3
3, 2
3, 2
3, 3
3, 3
3, 3
3, 2
LAP3
2, 3
2, 1
2, 2
2, 1
2, 3
2, 2
2, 2
2, 3
2, 3
2, 1
2, 1
2, 1

KeyboardInterrupt: 

In [76]:
t_comp=exp.merge(ensr_map, left_on='gene_ontology_term_id',right_on='Ensembl gene ID',how='left')

In [77]:
set(t_comp['cell_type_name'])

{'B cell',
 'Bergmann glial cell',
 'CD1c-positive myeloid dendritic cell',
 'CD4-positive, alpha-beta T cell',
 'CD8-positive, alpha-beta T cell',
 'Cajal-Retzius cell',
 'GABAergic neuron',
 'L2/3-6 intratelencephalic projecting glutamatergic cortical neuron',
 'L5 extratelencephalic projecting glutamatergic cortical neuron',
 'L5/6 near-projecting glutamatergic neuron of the primary motor cortex',
 'L6 corticothalamic-projecting glutamatergic cortical neuron',
 'L6 intratelencephalic projecting glutamatergic neuron of the primary motor cortex',
 'L6b glutamatergic cortical neuron',
 'Purkinje cell',
 'T cell',
 'alveolar macrophage',
 'astrocyte',
 'astrocyte of the cerebral cortex',
 'blood vessel endothelial cell',
 'brain vascular cell',
 'bronchus fibroblast of lung',
 'capillary endothelial cell',
 'caudal ganglionic eminence derived GABAergic cortical interneuron',
 'cell',
 'central nervous system macrophage',
 'cerebellar granule cell',
 'cerebellar granule cell precursor',


In [None]:
exp_full

# HDF5 file

In [1]:
import os
import h5py
import scanpy as sc
import pandas as pd
import numpy as np
import json
import math
import nexusformat.nexus as nx
import anndata as ad

In [2]:
os.chdir('/tscc/projects/ps-palmer/brittany/SUD_cross_species/')

In [4]:
file_path_neuron='scRNA_seq/Neurons.h5ad'
file_path_nonneuron='scRNA_seq/Nonneurons.h5ad'

## check file

In [12]:
#check if corrupted
try:
    with h5py.File(file_path_neuron, "r") as f:
        print("Neuron file is a valid HDF5 file.")
except Exception as e:
    print("Neruon file is not a valid HDF5 file:")

try:
    with h5py.File(file_path_nonneuron, "r") as f:
        print("Neonnuron file is a valid HDF5 file.")
except Exception as e:
    print("Nonneruon file is not a valid HDF5 file:")

Neuron file is a valid HDF5 file.
Neonnuron file is a valid HDF5 file.


In [6]:
adata_neuron = sc.read_h5ad(file_path_neuron)

In [None]:
adata_neuron.obs['group']=(adata_neuron.obs['ROIGroup'].astype(str)+'_Neuron').apply(lambda x: x.replace(' ','-'))

In [5]:
adata_nonneuron=sc.read_h5ad(file_path_nonneuron)

In [14]:
adata_nonneuron.obs['group']=(adata_nonneuron.obs['ROIGroup'].astype(str)+'_'+adata_nonneuron.obs['supercluster_term'].astype(str)).apply(lambda x: x.replace(' ','-'))

## check variables and observations

In [None]:
#export neuronal tissue groups
for roi_group in set(adata_neuron.obs['ROIGroup']):
    print(roi_group)
    if os.path.exists(f"scRNA_seq/{roi_group.replace(' ','-')}_neuron.h5ad"):
        print('file already exists')
    else:
        adata_neuron_subset = adata_neuron[adata_neuron.obs['ROIGroup'] == roi_group, :]
        #adata_nonneuron_subset = adata_nonneuron[adata_nonneuron.obs['ROIGroup'] == roi_group, :]
        #adata_nonneuron_subset.write(f"scRNA_seq/{roi_group.replace(' ','-')}_nonneuron.h5ad")
        adata_neuron_subset.write(f"scRNA_seq/{roi_group.replace(' ','-')}_neuron.h5ad")


In [15]:
adata_nonneuron.obs['group']

CellID
10X362_3:TCAGTGAGTATTGACC    Hypothalamus_Oligodendrocyte
10X362_5:TCCGTGTGTGAAAGTT    Hypothalamus_Oligodendrocyte
10X362_5:CACGGGTAGAGCAGAA    Hypothalamus_Oligodendrocyte
10X362_5:GATTCTTGTATGTCAC    Hypothalamus_Oligodendrocyte
10X362_6:AGGACTTGTATCCTTT    Hypothalamus_Oligodendrocyte
                                         ...             
10X194_8:GAAATGAGTTCGGCTG              Midbrain_Microglia
10X350_4:TTTACCATCGCACGAC             Hindbrain_Microglia
10X225_1:AGAAGCGTCCATATGG              Midbrain_Microglia
10X221_5:TTGAACGCAGCCTTCT       Cerebral-cortex_Microglia
10X385_3:CTACCCAGTGGCGCTT       Cerebral-nuclei_Microglia
Name: group, Length: 888263, dtype: object

In [17]:
#export neuronal tissue groups
for roi_group in set(adata_nonneuron.obs['ROIGroup']):
    print(roi_group)
    if os.path.exists(f"scRNA_seq/{roi_group.replace(' ','-')}_nonneuron.h5ad"):
        print('file already exists')
    else:
        print('exporting file')
        adata_nonneuron_subset = adata_nonneuron[adata_nonneuron.obs['ROIGroup'] == roi_group, :]
        #adata_nonneuron_subset = adata_nonneuron[adata_nonneuron.obs['ROIGroup'] == roi_group, :]
        #adata_nonneuron_subset.write(f"scRNA_seq/{roi_group.replace(' ','-')}_nonneuron.h5ad")
        adata_nonneuron_subset.write(f"scRNA_seq/{roi_group.replace(' ','-')}_nonneuron.h5ad")


Paleocortex
exporting file


  df[key] = c


Thalamus
exporting file
Hippocampus
exporting file
Epithalamus
exporting file
Cerebral cortex
file already exists
Hypothalamus
exporting file
Cerebral nuclei
exporting file
Spinal cord
exporting file
Hindbrain
file already exists
Midbrain
exporting file


In [None]:
sc.pp.filter_cells(adata_neuron_subset, min_counts=10)
sc.pp.filter_genes(adata_neuron_subset, min_genes=5)
sc.pp.filter_cells(adata_nonneuron_subset, min_counts=10)
sc.pp.filter_genes(adata_nonneuron_subset, min_genes=5)

In [22]:
adata_neuron_subset.obs.head()

Unnamed: 0_level_0,ROIGroup,ROIGroupCoarse,ROIGroupFine,roi,organism_ontology_term_id,disease_ontology_term_id,self_reported_ethnicity_ontology_term_id,assay_ontology_term_id,sex_ontology_term_id,development_stage_ontology_term_id,donor_id,dissection,cell_cycle_score,sample_id,cluster_id,subcluster_id,supercluster_term,cell_type_ontology_term_id,tissue_ontology_term_id,group
CellID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
10X264_2:GATGAGGGTGTTAGCT,Cerebral cortex,Cerebral cortex,Cerebral cortex,Human MEC,NCBITaxon:9606,PATO:0000461,unknown,EFO:0009922,PATO:0000384,HsapDv:0000123,H19.30.002,Cerebral cortex (Cx) - Anterior parahippocampa...,0.001388,10X264_2,311,13,Upper rhombic lip,CL:0000540,UBERON:0000956,Cerebral-cortex_Neuron
10X384_6:GCTTCACCAACCGTAT,Cerebral cortex,Cerebral cortex,Cerebral cortex,Human TF,NCBITaxon:9606,PATO:0000461,unknown,EFO:0009922,PATO:0000384,HsapDv:0000136,H19.30.001,Cerebral cortex (Cx) - Occipitotemporal (fusif...,0.001908,10X384_6,311,10,Upper rhombic lip,CL:0000540,UBERON:0000956,Cerebral-cortex_Neuron
10X216_6:CAAGCTATCGGCGATC,Cerebral cortex,Cerebral cortex,Cerebral cortex,Human A29-A30,NCBITaxon:9606,PATO:0000461,unknown,EFO:0009922,PATO:0000384,HsapDv:0000144,H18.30.002,"Cerebral cortex (Cx) - Cingulate gyrus, retros...",0.000674,10X216_6,311,11,Upper rhombic lip,CL:0000540,UBERON:0000956,Cerebral-cortex_Neuron
10X264_1:TTCATGTCACTTTATC,Cerebral cortex,Cerebral cortex,Cerebral cortex,Human MEC,NCBITaxon:9606,PATO:0000461,unknown,EFO:0009922,PATO:0000384,HsapDv:0000123,H19.30.002,Cerebral cortex (Cx) - Anterior parahippocampa...,0.000802,10X264_1,311,12,Upper rhombic lip,CL:0000540,UBERON:0000956,Cerebral-cortex_Neuron
10X384_6:CAAGACTCATCAGTCA,Cerebral cortex,Cerebral cortex,Cerebral cortex,Human TF,NCBITaxon:9606,PATO:0000461,unknown,EFO:0009922,PATO:0000384,HsapDv:0000136,H19.30.001,Cerebral cortex (Cx) - Occipitotemporal (fusif...,0.001765,10X384_6,311,10,Upper rhombic lip,CL:0000540,UBERON:0000956,Cerebral-cortex_Neuron


In [None]:
adata_neuron_subset.X

In [None]:
adata_nonneuron_subset.X.head()

In [10]:
adata.obs.columns

Index(['ROIGroup', 'ROIGroupCoarse', 'ROIGroupFine', 'roi',
       'organism_ontology_term_id', 'disease_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id', 'assay_ontology_term_id',
       'sex_ontology_term_id', 'development_stage_ontology_term_id',
       'donor_id', 'dissection', 'cell_cycle_score', 'sample_id', 'cluster_id',
       'subcluster_id', 'supercluster_term', 'cell_type_ontology_term_id',
       'tissue_ontology_term_id'],
      dtype='object')

# combine sub-tissue

In [9]:
roi_list=['Cerebral cortex',
 'Cerebral nuclei',
 'Epithalamus',
 'Hindbrain',
 'Hippocampus',
 'Hypothalamus',
 'Midbrain',
 'Paleocortex',
 'Spinal cord',
 'Thalamus']

In [19]:
#export neuronal tissue groups
for roi_group in roi_list:
    print(roi_group)
    if os.path.exists(f"scRNA_seq/{roi.replace(' ','-')}_combined.h5ad")):
        print('file already exists')
    else:
        print('combining files')
        file_path_neuron=f'scRNA_seq/{roi}_neuron.h5ad'
        file_path_nonneuron=f'scRNA_seq/{roi}_nonneuron.h5ad'
        adata_neuron = sc.read_h5ad(file_path_neuron)
        adata_nonneuron = sc.read_h5ad(file_path_nonneuron)

In [20]:
file_path_neuron=f'scRNA_seq/{roi}_neuron.h5ad'
file_path_nonneuron=f'scRNA_seq/{roi}_nonneuron.h5ad'
adata_neuron = sc.read_h5ad(file_path_neuron)
adata_nonneuron = sc.read_h5ad(file_path_nonneuron)

In [28]:
X_non=adata_nonneuron.to_df()

In [29]:
X_n=adata_neuron.to_df()

In [31]:
X=pd.concat([X_non,X_n])

In [40]:
adata_neuron.obs['group']=f'{roi}_Neuron'

In [41]:
obs=pd.concat([adata_nonneuron.obs,adata_neuron.obs])

In [57]:
adata.X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int16)

In [50]:
adata = ad.AnnData(X,obs,adata_nonneuron.var)

In [55]:
adata.write(f"scRNA_seq/{roi.replace(' ','-')}_combined.h5ad")

# analyze reads using scanpy

In [14]:
roi='Hippocampus'

In [15]:
adata=sc.read_h5ad(f"scRNA_seq/{roi.replace(' ','-')}_combined.h5ad")

In [None]:
ad=sc.pp.normalize_total(adata, *, target_sum=1e6, exclude_highly_expressed=False,copy=True)

In [None]:
anndata=sc.tl.rank_genes_groups(
    adata, 
    groupby='group',  # Replace 'leiden' with your clustering key
    method='logreg'
)

In [None]:
t=sc.tl.rank_genes_groups(
    adata_nonneuron, 
    groupby='group',  # Replace 'leiden' with your clustering key
    method='logreg', # Use methods like 't-test', 'logreg' for large data
    use_raw=True       # Use raw counts if normalization was skipped
)

In [20]:
annData=sc.tl.rank_genes_groups(adata_nonneuron_subset,'group',use_raw=True)

ValueError: Received `use_raw=True`, but `adata.raw` is empty.

# export for deseq2

In [None]:
if (ds!='neuron')|(ds!='neurons'):
#choose how to cluster them
    adata.obs['group']=(adata.obs['ROIGroup'].astype(str)+'_'+adata.obs['supercluster_term'].astype(str)).apply(lambda x: x.replace(' ','-'))
    pseudo_bulk = adata.to_df().groupby(adata.obs['group']).sum().T
    pseudo_bulk
    meta_data = pd.DataFrame({"group": pseudo_bulk.columns})
    meta_data['roi'] = meta_data['group'].apply(lambda x: x.split("_")[0])
    meta_data['cell_type'] = meta_data['group'].apply(lambda x: x.split("_")[1])
    #meta_data
    pseudo_bulk.to_csv("scRNA_seq/processed/non_neuron_byROI_counts.csv")
    meta_data.to_csv("scRNA_seq/processed/non_neuron_byROI_metadata.csv", index=False)

In [9]:
trait_list=set(adata.obs['ROIGroup'])

In [10]:
trait_list

{'Cerebral cortex',
 'Cerebral nuclei',
 'Epithalamus',
 'Hindbrain',
 'Hippocampus',
 'Hypothalamus',
 'Midbrain',
 'Paleocortex',
 'Spinal cord',
 'Thalamus'}

In [None]:
if (ds=='neuron')|(ds=='neurons'):
    for roi_group in trait_list:
        # Subset adata for the current ROIGroup
        adata_subset = adata[adata.obs['ROIGroup'] == roi_group, :]
        
        # Group by cluster (ROIGroup + supercluster_term)
        adata_subset.obs['group'] = (
            adata_subset.obs['ROIGroup'].astype(str) + '_' +
            adata_subset.obs['supercluster_term'].astype(str)
        ).apply(lambda x: x.replace(' ', '-'))
        pseudo_bulk = adata_subset.to_df().groupby(adata_subset.obs['group']).sum().T
        
        meta_data = pd.DataFrame({"group": pseudo_bulk.columns})
        meta_data['roi'] = meta_data['group'].apply(lambda x: x.split("_")[0])
        meta_data['cell_type'] = meta_data['group'].apply(lambda x: x.split("_")[1])

        pseudo_bulk_path = f"scRNA_seq/processed/{roi_group}_counts.csv"
        metadata_path = f"scRNA_seq/processed/{roi_group}_metadata.csv"
        pseudo_bulk.to_csv(pseudo_bulk_path)
        meta_data.to_csv(metadata_path, index=False)
          

In [9]:
roi_group='Cerebral cortex'

In [10]:
adata_subset = adata[adata.obs['ROIGroup'] == roi_group, :]

In [11]:
adata_subset.obs['group'] = (
    adata_subset.obs['ROIGroup'].astype(str) + '_' +
    adata_subset.obs['supercluster_term'].astype(str)
).apply(lambda x: x.replace(' ', '-'))

  adata_subset.obs['group'] = (


In [12]:
pseudo_bulk = adata_subset.to_df().groupby(adata_subset.obs['group']).stdev().T

In [13]:
meta_data = pd.DataFrame({"group": pseudo_bulk.columns})
meta_data['roi'] = meta_data['group'].apply(lambda x: x.split("_")[0])
meta_data['cell_type'] = meta_data['group'].apply(lambda x: x.split("_")[1])

In [15]:
pseudo_bulk_path = f"scRNA_seq/processed/{roi_group}_counts.csv"
metadata_path = f"scRNA_seq/processed/{roi_group}_metadata.csv"
pseudo_bulk.to_csv(pseudo_bulk_path)
meta_data.to_csv(metadata_path, index=False)

In [137]:
if (ds=='neuron')|(ds=='neurons'):
    pseudo_bulk.to_csv("scRNA_seq/processed/neuron_byROI_counts.csv")
    meta_data.to_csv("scRNA_seq/processed/neuron_byROI_metadata.csv", index=False)
else:


## combine all neuron counts tables into one table with single column

In [10]:
file_list=os.listdir('scRNA_seq/processed/')
file_list=[i for i in file_list if (('counts' in i) and ('non_neuron' not in i)) ]

In [18]:
meta_data = pd.DataFrame(columns=['group','roi','cell_type'])

In [25]:
net_count={}
for f in file_list:
    tb=pd.read_csv(f'scRNA_seq/processed/{f}')
    f=f[0:len(f)-11]
    net_count[f'{f}_Neuron']=tb.sum(axis=1)
    meta_data.loc[len(meta_data)]=[f'{f}_Neuron',f,'Neuron']

  net_count[f'{f}_Neuron']=tb.sum(axis=1)
  net_count[f'{f}_Neuron']=tb.sum(axis=1)
  net_count[f'{f}_Neuron']=tb.sum(axis=1)
  net_count[f'{f}_Neuron']=tb.sum(axis=1)
  net_count[f'{f}_Neuron']=tb.sum(axis=1)
  net_count[f'{f}_Neuron']=tb.sum(axis=1)
  net_count[f'{f}_Neuron']=tb.sum(axis=1)
  net_count[f'{f}_Neuron']=tb.sum(axis=1)
  net_count[f'{f}_Neuron']=tb.sum(axis=1)
  net_count[f'{f}_Neuron']=tb.sum(axis=1)
  net_count[f'{f}_Neuron']=tb.sum(axis=1)


In [29]:
net_count=pd.DataFrame(net_count)
net_count.insert(0,'Accession',tb.Accession)

In [31]:
net_count.to_csv('scRNA_seq/processed/neuron_netcounts.csv',index=False)

In [27]:
meta_data.to_csv('scRNA_seq/processed/neuron_netmetadata.csv',index=False)

# R-base

In [1]:
library(DESeq2)
library(ggplot2)
library(dplyr)

Loading required package: S4Vectors

Loading required package: stats4

Loading required package: BiocGenerics


Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, aperm, append, as.data.frame, basename, cbind,
    colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
    get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
    match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
    Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
    table, tapply, union, unique, unsplit, which.max, which.min



Attaching package: ‘S4Vectors’


The following object is masked from ‘package:utils’:

    findMatches


The following objects are masked from ‘package:base’:

    expand.grid, I, unname


Loading required package: IRanges

Loading required package: GenomicRanges

Loading required package: GenomeInfoDb

Loa

In [2]:
setwd('/tscc/projects/ps-palmer/brittany/SUD_cross_species/scRNA_seq/processed')

In [48]:
#read in non_neuron values
counts=read.csv("non_neuron_byROI_counts.csv", row.names = 1)
meta=read.csv("non_neuron_byROI_metadata.csv")

#use to filter for single tissue
#counts=counts %>% dplyr:: select(starts_with("Cerebral.cortex"))
#meta=meta[meta$roi=='Cerebral-cortex',]

In [83]:
#use to read in single tissue all neuron types broken own
#counts_neuron=read.csv('Cerebral cortex_counts.csv', row.names = 1)
#meta_neuron=read.csv('Cerebral cortex_metadata.csv')
#counts_neuron$Cerebral.cortex_Neuron.all=rowSums(counts_neuron[,])

In [49]:
counts_neuron=read.csv('neuron_netcounts.csv', row.names = 1)
meta_neuron=read.csv('neuron_netmetadata.csv')

In [55]:
counts_all=cbind(counts_neuron,counts)
meta_all=rbind(meta_neuron,meta)

In [68]:
counts_all <- round(counts_all)                # Round decimals to integers
counts_all <- apply(counts_all, 2, as.integer)  # Convert to numeric
counts_all[is.na(counts_all)] <- 0             # Replace NAs with 0
rownames(counts_all)=rownames(counts)

In [77]:
meta_all$roi <- as.factor(meta_all$roi)
meta_all$cell_type <- as.factor(meta_all$cell_type)
meta_all$group <- as.factor(meta_all$group)

In [88]:
colnames(counts_all) <- gsub("\\.", "-", colnames(counts_all))

In [94]:
dds <- DESeqDataSetFromMatrix(countData = counts_all, colData = meta_all, design = ~ group)
keep <- rowSums(counts(dds)) >= 10
dds <- dds[keep,]
vsd <- vst(dds, blind=TRUE)
vsd_df=as.data.frame(assay(vsd))
count_df=counts(dds)

converting counts to integer mode

  Note: levels of factors in the design contain characters other than
  letters, numbers, '_' and '.'. It is recommended (but not required) to use
  only letters, numbers, and delimiters '_' or '.', as these are safe characters



In [109]:
write.table(counts_all,'counts_all.csv',quote=FALSE,sep=',')
write.table(meta_all,'metadata_all.csv',quote=FALSE,sep=',')

In [102]:
getwd()

In [104]:
write.table(vsd_df,'vst_netcounts.csv',quote=FALSE,sep=',')

In [106]:
write.table(count_df,'deseq2_netcounts.csv',quote=FALSE,sep=',')

In [99]:
grouped_data <- vsd_df %>%
  group_by(Region, Tissue_Type, Cell_Type)

ERROR: [1m[33mError[39m in `group_by()`:[22m
[1m[22m[33m![39m Must group by variables found in `.data`.
Column `Region` is not found.
Column `Tissue_Type` is not found.
Column `Cell_Type` is not found.


In [142]:
normalized_counts <- counts(dds, normalized = FALSE)

# HDF5 single dissection

In [7]:
file_path='scRNA_seq/midbrain(rn)-red_nucleus.h5ad'

In [8]:
try:
    with h5py.File(file_path, "r") as f:
        print("File is a valid HDF5 file.")
except Exception as e:
    print("File is not a valid HDF5 file:", e)

File is a valid HDF5 file.


In [9]:
f = nx.nxload(file_path)
print(f.tree)

root:NXroot
  @encoding-type = 'anndata'
  @encoding-version = '0.1.0'
  X:NXgroup
    @encoding-type = 'csr_matrix'
    @encoding-version = '0.1.0'
    @shape = [4714, 59236]
    data = float32(13231512)
    indices = int32(13231512)
    indptr = int32(4715)
  layers:NXgroup
    @encoding-type = 'dict'
    @encoding-version = '0.1.0'
  obs:NXgroup
    @_index = 'CellID'
    @column-order = ['roi', 'organism_ontology_term_id', 'disease_...
    @encoding-type = 'dataframe'
    @encoding-version = '0.2.0'
    CellID = 
      @encoding-type = 'string-array'
      @encoding-version = '0.2.0'
    assay:NXgroup
      @encoding-type = 'categorical'
      @encoding-version = '0.2.0'
      @ordered = False
      categories = '10x 3' v3'
        @encoding-type = 'string-array'
        @encoding-version = '0.2.0'
      codes = int8(4714)
        @encoding-type = 'array'
        @encoding-version = '0.2.0'
    assay_ontology_term_id:NXgroup
      @encoding-type = 'categorical'
      @encoding-vers

In [10]:
bdata = sc.read_h5ad(file_path)

In [11]:
bdata.obs.head()

Unnamed: 0_level_0,roi,organism_ontology_term_id,disease_ontology_term_id,self_reported_ethnicity_ontology_term_id,assay_ontology_term_id,sex_ontology_term_id,development_stage_ontology_term_id,donor_id,suspension_type,dissection,...,tissue_type,cell_type,assay,disease,organism,sex,tissue,self_reported_ethnicity,development_stage,observation_joinid
CellID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10X377_8:GCCAGGTGTACAACGG,Human RN,NCBITaxon:9606,PATO:0000461,HANCESTRO:0005,EFO:0009922,PATO:0000384,HsapDv:0000123,H19.30.002,nucleus,Midbrain (RN) - Red Nucleus - RN,...,tissue,neuron,10x 3' v3,normal,Homo sapiens,male,midbrain,European,29-year-old stage,#HfsH?Vo1v
10X377_7:GTGTAACAGAAGTCTA,Human RN,NCBITaxon:9606,PATO:0000461,HANCESTRO:0005,EFO:0009922,PATO:0000384,HsapDv:0000123,H19.30.002,nucleus,Midbrain (RN) - Red Nucleus - RN,...,tissue,neuron,10x 3' v3,normal,Homo sapiens,male,midbrain,European,29-year-old stage,<pAfB)^tK5
10X377_7:CTCAAGATCGCGCCAA,Human RN,NCBITaxon:9606,PATO:0000461,HANCESTRO:0005,EFO:0009922,PATO:0000384,HsapDv:0000123,H19.30.002,nucleus,Midbrain (RN) - Red Nucleus - RN,...,tissue,neuron,10x 3' v3,normal,Homo sapiens,male,midbrain,European,29-year-old stage,BEft{HJPXJ
10X377_7:AAGTCGTAGTTGTAAG,Human RN,NCBITaxon:9606,PATO:0000461,HANCESTRO:0005,EFO:0009922,PATO:0000384,HsapDv:0000123,H19.30.002,nucleus,Midbrain (RN) - Red Nucleus - RN,...,tissue,neuron,10x 3' v3,normal,Homo sapiens,male,midbrain,European,29-year-old stage,c1v0~-SrbE
10X377_8:TTTCAGTTCAGAATAG,Human RN,NCBITaxon:9606,PATO:0000461,HANCESTRO:0005,EFO:0009922,PATO:0000384,HsapDv:0000123,H19.30.002,nucleus,Midbrain (RN) - Red Nucleus - RN,...,tissue,neuron,10x 3' v3,normal,Homo sapiens,male,midbrain,European,29-year-old stage,?Nvn<l70W#


In [53]:
for x in bdata.obs.columns:
    print(f'{x}:\n {set(bdata.obs[x])}')

roi:
 {'Human RN'}
organism_ontology_term_id:
 {'NCBITaxon:9606'}
disease_ontology_term_id:
 {'PATO:0000461'}
self_reported_ethnicity_ontology_term_id:
 {'HANCESTRO:0005'}
assay_ontology_term_id:
 {'EFO:0009922'}
sex_ontology_term_id:
 {'PATO:0000384'}
development_stage_ontology_term_id:
 {'HsapDv:0000123'}
donor_id:
 {'H19.30.002'}
suspension_type:
 {'nucleus'}
dissection:
 {'Midbrain (RN) - Red Nucleus - RN'}
fraction_mitochondrial:
 {0.0, 0.001953125, 0.00146484375, 0.0004610419548178884, 0.0003369045212586753, 0.0002643404705260375, 0.000488162069807176, 0.0005188067444876783, 0.00048828125, 0.001145475372279496, 0.002353679585752393, 0.0003803727653100038, 0.00046388336646785953, 0.0016597510373443983, 0.003482197266475146, 0.0015082956259426848, 0.0009337068160597573, 0.001212856276531231, 0.0011065006915629322, 0.0025940337224383916, 0.00020990764063811922, 0.011419836345652369, 0.0004418912947414936, 0.00046572280178837556, 0.004669260700389105, 0.0008790436005625879, 0.0029550

In [57]:
bdata.X

<4714x59236 sparse matrix of type '<class 'numpy.float32'>'
	with 13231512 stored elements in Compressed Sparse Row format>

In [49]:
# Access metadata for rows (cells)
print(bdata.obs.head())

# Access metadata for columns (genes)
print(bdata.var.head())

# Access the expression matrix shape
print(bdata.X.shape)  # (number of rows, number of columns)

                                roi organism_ontology_term_id  \
CellID                                                          
10X377_8:GCCAGGTGTACAACGG  Human RN            NCBITaxon:9606   
10X377_7:GTGTAACAGAAGTCTA  Human RN            NCBITaxon:9606   
10X377_7:CTCAAGATCGCGCCAA  Human RN            NCBITaxon:9606   
10X377_7:AAGTCGTAGTTGTAAG  Human RN            NCBITaxon:9606   
10X377_8:TTTCAGTTCAGAATAG  Human RN            NCBITaxon:9606   

                          disease_ontology_term_id  \
CellID                                               
10X377_8:GCCAGGTGTACAACGG             PATO:0000461   
10X377_7:GTGTAACAGAAGTCTA             PATO:0000461   
10X377_7:CTCAAGATCGCGCCAA             PATO:0000461   
10X377_7:AAGTCGTAGTTGTAAG             PATO:0000461   
10X377_8:TTTCAGTTCAGAATAG             PATO:0000461   

                          self_reported_ethnicity_ontology_term_id  \
CellID                                                               
10X377_8:GCCAGGTGTACAACGG

In [50]:
# Extract column names (gene names) from the expression matrix
columns = bdata.var_names.to_list()
print(columns)

['ENSG00000286004', 'ENSG00000283117', 'ENSG00000254303', 'ENSG00000249854', 'ENSG00000286679', 'ENSG00000232046', 'ENSG00000248238', 'ENSG00000227827', 'ENSG00000171729', 'ENSG00000226239', 'ENSG00000225208', 'ENSG00000230866', 'ENSG00000176054', 'ENSG00000187955', 'ENSG00000228510', 'ENSG00000263655', 'ENSG00000276702', 'ENSG00000287146', 'ENSG00000230453', 'ENSG00000183671', 'ENSG00000285741', 'ENSG00000224557', 'ENSG00000006128', 'ENSG00000249150', 'ENSG00000236858', 'ENSG00000115290', 'ENSG00000215869', 'ENSG00000196083', 'ENSG00000286329', 'ENSG00000271952', 'ENSG00000171126', 'ENSG00000284633', 'ENSG00000099250', 'ENSG00000287383', 'ENSG00000222041', 'ENSG00000171522', 'ENSG00000226149', 'ENSG00000250974', 'ENSG00000224595', 'ENSG00000133636', 'ENSG00000164756', 'ENSG00000205978', 'ENSG00000286637', 'ENSG00000271850', 'ENSG00000168081', 'ENSG00000287293', 'ENSG00000279437', 'ENSG00000150630', 'ENSG00000285780', 'ENSG00000177106', 'ENSG00000105976', 'ENSG00000251504', 'ENSG000002

In [51]:
bdata.var_names

Index(['ENSG00000286004', 'ENSG00000283117', 'ENSG00000254303',
       'ENSG00000249854', 'ENSG00000286679', 'ENSG00000232046',
       'ENSG00000248238', 'ENSG00000227827', 'ENSG00000171729',
       'ENSG00000226239',
       ...
       'ENSG00000240137', 'ENSG00000108669', 'ENSG00000241860',
       'ENSG00000111679', 'ENSG00000136250', 'ENSG00000167851',
       'ENSG00000135048', 'ENSG00000075234', 'ENSG00000196810',
       'ENSG00000176659'],
      dtype='object', length=59236)

In [52]:
for key in adata.var:
   print(key)

Biotype
Chromosome
End
Gene
Start
feature_is_filtered
feature_name
feature_reference
feature_biotype
feature_length
feature_type


# loom file

In [35]:
loom_file_path='scRNA_seq/human_adult_GRCh38-3.0.0.loom'
#loom_file_path='scRNA_seq/adult_human_20221007.agg.loom'

In [36]:
with loompy.connect(loom_file_path) as ds:
    print("\nGlobal attributes:")
    for key in ds.attrs.keys():
        print(f"{key}: {ds.attrs[key]}")  # Print each global attribute


Global attributes:
CreationDate: 20230525T091713.478089Z
LOOM_SPEC_VERSION: 3.0.0


In [37]:
with loompy.connect(loom_file_path) as ds:
    # List all row attributes
    print("Row attributes:")
    for attr in ds.ra:
        print(attr)
    
    # List all column attributes
    print("\nColumn attributes:")
    for attr in ds.ca:
        print(attr)
    
    # List all layers
    print("\nLayers:")
    for layer in ds.layers.keys():
        print(layer)

Row attributes:
Accession
Chromosome
End
Gene
Start
Strand

Column attributes:
Age
CellID
Chemistry
Clusters
Donor
Roi
SampleID
Sex
Tissue
TotalUMIs

Layers:



In [38]:
with loompy.connect(loom_file_path) as ds:
    # Display the shape of the main matrix
    print(f"Shape of the main matrix: {ds.shape} (rows, columns)")
    
    # View the first few rows and columns of the main matrix
    print("\nHead of the main matrix:")
    print(ds[:, :10])  # Replace 5 with the number of columns you want to see

    # View row attributes for the first few rows
    print("\nRow attributes for the first few rows:")
    for attr in ds.ra:
        print(f"{attr}: {ds.ra[attr][:5]}")  # Replace 5 with the number of rows you want to see

    # View column attributes for the first few columns
    print("\nColumn attributes for the first few columns:")
    for attr in ds.ca:
        print(f"{attr}: {ds.ca[attr][:20]}")  # Replace 5 with the number of columns you want to see

Shape of the main matrix: (33538, 3369219) (rows, columns)

Head of the main matrix:
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]

Row attributes for the first few rows:
Accession: ['ENSG00000237613' 'ENSG00000238009' 'ENSG00000239945' 'ENSG00000239906'
 'ENSG00000284733']
Chromosome: ['1' '1' '1' '1' '1']
End: [ 36081 133723  91105 140339 451697]
Gene: ['FAM138A' 'AL627309.1' 'AL627309.3' 'AL627309.2' 'OR4F29']
Start: [ 34554  89295  89551 139790 450703]
Strand: ['-' '-' '-' '-' '-']

Column attributes for the first few columns:
Age: [42. 42. 29. 29. 50. 29. 29. 29. 29. 29. 29. 29. 50. 29. 29. 29. 29. 29.
 42. 29.]
CellID: ['10X376_7:GAACGTTGTATCGCTA' '10X376_8:GAATCGTTCGATACGT'
 '10X393_5:GCGTTTCCATAGATGA' '10X377_4:CTCAACCTCATTTCCA'
 '10X220_8:CGGGACTTCGGTGAAG' '10X230_5:GTGTAACGTACCTTCC'
 '10X264_3:CAGTGCGAGCCTTCTC' '10X418_1:TGGATGTGTATCGATC'
 '10X393_4:TGAGGGAGTGTCATTG' '10X319_5:CTCCCAATCCAACACA'
 '10X361

In [98]:
with loompy.connect(loom_file_path) as ds:
    # Identify columns with "Tissue" in their attribute name
    print(ds.shape)
    tissue_columns = [i for i, name in enumerate(ds.ca.keys()) if "Tissue" in name]
    print (tissue_columns)
    if not tissue_columns:
        print("No columns with 'Tissue' found.")
    else:
        print(f"Columns with 'Tissue': {[ds.ca.keys()[i] for i in tissue_columns]}")

        # Extract the gene row for these columns
        # Assuming the "Gene" row attribute corresponds to the index you need
        gene_row_index = list(ds.ra.keys()).index("Gene")
        
        # Retrieve the data for these columns
        gene_data = ds[tissue_columns,:]  # Matrix subset for the desired columns
        
        # If you want row attribute details as well, extract them
        gene_names = ds.ra["Gene"]  # Extract all gene names
        
        # Example: print first 5 genes with their corresponding data
        print("\nFirst 5 genes and their values:")
        for i in range(min(5, gene_data.shape[0])):
            print(f"{gene_names[i]}: {gene_data[i, :]}")

(59480, 461)
[612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670, 671, 672, 673, 674, 675, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, 715, 716, 717, 718, 719, 720, 721, 722, 723, 724]
Columns with 'Tissue': ['Tissue_Amygdaloid complex (AMY) - Basolateral nuclear group (BLN) - lateral nucleus - La', 'Tissue_Amygdaloid complex (AMY) - Central nuclear group - CEN', 'Tissue_Amygdaloid complex (AMY) - Corticomedial nuclear group (CMN) - anterior cortical nucleus - CoA', 'Tissue_Amygdaloid complex (AMY) - basolateral nuclear group (BLN) - basolateral nucleus (basal nucleus) - BL', 'Tissue_Amygdaloid comp

In [8]:
gene_data_pd=pd.DataFrame(gene_data)

In [16]:
gene_data

array([[0.        , 0.        , 0.        , ..., 0.00093408, 0.00199689,
        0.00154864],
       [0.        , 0.0004042 , 0.        , ..., 0.00253536, 0.11271356,
        0.10756019],
       [0.        , 0.00080841, 0.        , ..., 0.01654657, 0.04881296,
        0.01534563],
       ...,
       [0.        , 0.        , 0.        , ..., 0.00440352, 0.01264699,
        0.03266226],
       [0.        , 0.00161681, 0.        , ..., 0.3761676 , 0.0639006 ,
        0.16387442],
       [0.        , 0.        , 0.        , ..., 0.03856418, 0.00465942,
        0.00520907]])

In [63]:
with loompy.connect(loom_file_path) as ds:
    enrichment_data = ds.layers["nonzeros"][:,:]
    colnames = list(ds.ca[t])  # Replace 'Gene' with the appropriate row attribute
    rownames = list(ds.ra['Gene'])  # Replace 'Gene' with the appropriate row attribute
    enr_nz=pd.DataFrame(enrichment_data, columns=colnames, index=rownames)
    enrichment_data = ds.layers["enrichment"][:,:]
    enr=pd.DataFrame(enrichment_data, columns=colnames, index=rownames)


In [94]:
t='Tissue_Amygdaloid complex (AMY) - corticomedial nuclear group - CMN'

with loompy.connect(loom_file_path) as ds:
    # Check that required attributes and layers exist
    if "enrichment" not in ds.layers:
        raise KeyError("Layer 'enrichment' does not exist in the file.")
    if "Gene" not in ds.ra:
        raise KeyError("Row attribute 'gene' does not exist in the file.")
    if t not in ds.ca:
        raise KeyError("Column attribute 't' does not exist in the file.")    
    # Extract row (gene) and column (t) attributes
    gene_names = ds.ra["Gene"]
    column_t = ds.ca[t]

    # Extract the data for the 'enrichment' layer
    enrichment_data = ds.layers["enrichment"][:, :]  # Full enrichment layer

    # Create a pandas DataFrame
    enr = pd.DataFrame(enrichment_data, index=gene_names, columns=column_t)


In [96]:
column_t

array([   0,    2,    3,    2,    3,    1,   10,   34,   44,   26,   50,
        214,   11,    0,   11,   29,   16,    1,    3,    2,    5,    4,
          4,    0,    0,    2,    5,   20,   35,    0,    0,    1,    0,
         19,  453,  183,    7,    9,    3,    7,   68,   22,    0,    3,
        177,  461,   78,  664,    0,   43,    0,    0,    7,    5,  815,
          5,   32,   14,   99,    0,   23,  125,   15,   31,    2,    1,
          0,    0,    0,   16,    0,    0,   31,   17,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   18,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    5,  101,
         43,  450,   25,  132,   48,   69,    5,    0,    0,   45,    0,
          0,   89,   11,    0,    0,    0,    0,    0,    0,   77,   21,
          5,    0,    0,    0,    1,    1,    0,    0,    0,    3,    0,
          0,    0,    0,    0,    0,    2,    0,    0,    0,    0,    0,
          4,    0,  214,  398,  112,    5,    2,   

In [93]:
pd.DataFrame(enrichment_data)

Unnamed: 0,0
0,0.344860
1,0.587929
2,0.425287
3,0.057703
4,0.463422
...,...
59475,59.758093
59476,2.027394
59477,5.156342
59478,84.602524


In [61]:
with loompy.connect(loom_file_path) as ds:
    enrichment_data = ds.layers["nonzeros"][:,:]
    colnames = list(ds.ca['Tissue_Amygdaloid complex (AMY) - corticomedial nuclear group - CMN'])  # Replace 'Gene' with the appropriate row attribute
    rownames = list(ds.ra['Gene'])  # Replace 'Gene' with the appropriate row attribute
    enr=pd.DataFrame(enrichment_data, columns=colnames, index=rownames)

In [47]:
with loompy.connect(loom_file_path) as ds:
    print(f"Number of columns in data matrix: {ds.shape[1]}")

Number of columns in data matrix: 461


In [50]:
with loompy.connect(loom_file_path) as ds:
    for attr in ds.ca.keys():
        print(f"{attr}: {len(ds.ca[attr])} entries")

Age_29.0: 461 entries
Age_42.0: 461 entries
Age_50.0: 461 entries
Age_60.0: 461 entries
Clusters: 461 entries
NCells: 461 entries
SampleID_10X145_1: 461 entries
SampleID_10X145_2: 461 entries
SampleID_10X145_3: 461 entries
SampleID_10X145_4: 461 entries
SampleID_10X145_5: 461 entries
SampleID_10X145_6: 461 entries
SampleID_10X145_7: 461 entries
SampleID_10X145_8: 461 entries
SampleID_10X146_2: 461 entries
SampleID_10X146_3: 461 entries
SampleID_10X146_4: 461 entries
SampleID_10X146_5: 461 entries
SampleID_10X146_6: 461 entries
SampleID_10X146_7: 461 entries
SampleID_10X159_1: 461 entries
SampleID_10X159_2: 461 entries
SampleID_10X159_3: 461 entries
SampleID_10X159_4: 461 entries
SampleID_10X159_5: 461 entries
SampleID_10X159_6: 461 entries
SampleID_10X159_7: 461 entries
SampleID_10X160_1: 461 entries
SampleID_10X160_2: 461 entries
SampleID_10X160_3: 461 entries
SampleID_10X160_5: 461 entries
SampleID_10X160_6: 461 entries
SampleID_10X160_7: 461 entries
SampleID_10X160_8: 461 entries
Sa

In [26]:
# Open the loom file
with loompy.connect(loom_file_path) as ds:
    # Find columns with attributes containing the string "Tissue"
    tissue_columns = [i for i, name in enumerate(ds.ca.keys()) if "Tissue" in name]

    if not tissue_columns:
        print("No columns with attributes containing 'Tissue'.")
    else:
        print(f"Found columns with attributes containing 'Tissue': {[list(ds.ca.keys())[i] for i in tissue_columns]}")

        # Extract the "enrichment" layer for the selected columns
        enrichment_data = ds.layers["enrichment"][:, :]

Found columns with attributes containing 'Tissue': ['Tissue_Amygdaloid complex (AMY) - Basolateral nuclear group (BLN) - lateral nucleus - La', 'Tissue_Amygdaloid complex (AMY) - Central nuclear group - CEN', 'Tissue_Amygdaloid complex (AMY) - Corticomedial nuclear group (CMN) - anterior cortical nucleus - CoA', 'Tissue_Amygdaloid complex (AMY) - basolateral nuclear group (BLN) - basolateral nucleus (basal nucleus) - BL', 'Tissue_Amygdaloid complex (AMY) - basolateral nuclear group (BLN) - basomedial nucleus (accessory basal nucleus) - BM', 'Tissue_Amygdaloid complex (AMY) - corticomedial nuclear group - CMN', 'Tissue_Basal forebrain (BF) - septal nuclei - SEP', 'Tissue_Basal forebrain (BF) - substantia innominata and nearby nuclei - SI', 'Tissue_Basal nuclei (BN) - Body of the Caudate - CaB', 'Tissue_Basal nuclei (BN) - Globus pallidus (GP) - External segment of globus pallidus - GPe', 'Tissue_Basal nuclei (BN) - Globus pallidus (GP) - Internal segment of globus pallidus - GPi', 'Tiss

In [28]:
pd.DataFrame(enrichment_data)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,451,452,453,454,455,456,457,458,459,460
0,0.344860,0.391184,0.344818,0.344830,0.344812,0.344810,0.360002,0.354256,0.354099,0.359219,...,1.788440,1.098701,1.079339,1.686650,3.479521,0.664248,0.616911,1.200425,0.977380,3.646978
1,0.587929,0.299409,0.274908,0.274919,0.274903,0.314277,0.286997,0.287369,0.294040,0.298178,...,0.283628,0.274614,0.290979,0.279240,0.339949,0.335396,0.274933,0.294732,0.321558,0.750652
2,0.425287,0.167897,0.185895,0.150346,0.141335,0.158588,0.144971,0.143858,0.165679,0.149405,...,0.214232,0.155631,0.207007,0.222149,0.279346,0.150017,0.122976,0.349625,1.694776,0.256841
3,0.057703,0.065427,0.057691,0.057695,0.066311,0.074405,0.068009,0.063263,0.064131,0.061292,...,0.366652,0.602429,0.379398,0.382443,0.100175,0.107084,0.057699,0.145519,0.747533,3.807201
4,0.463422,0.525733,0.540379,0.463390,0.463371,0.463369,0.483822,0.469448,0.482617,0.472907,...,0.520824,0.503748,0.527955,0.531915,0.686744,0.463390,0.640919,0.551947,0.613992,0.592599
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59475,59.758093,8.360314,51.630179,33.027960,3.076698,6.304062,23.567938,7.413633,8.178642,11.029427,...,0.884947,0.744416,0.955812,0.668471,1.323802,0.474367,0.656101,0.956725,2.641244,0.780528
59476,2.027394,0.948060,0.948086,0.948089,0.948085,0.948085,0.948059,0.947580,0.947982,0.948020,...,1.177956,0.948022,1.356602,1.134798,0.948089,0.948088,0.948092,1.103472,1.474401,1.249909
59477,5.156342,0.647794,0.647921,1.055004,0.647915,0.647913,0.676588,0.672105,0.693718,0.675317,...,3.300378,1.402341,2.462563,2.583675,1.999038,1.578328,4.271600,3.741832,1.970847,1.557361
59478,84.602524,1.185057,2.644737,0.510783,0.510764,0.735344,0.556185,0.582929,0.634370,0.565298,...,1.761387,1.038020,1.141318,1.406349,1.730820,0.983944,0.510800,1.578726,1.504829,1.744987
