# MERFISH whole brain spatial transcriptomics (part 2a)

In part 1, we explored two examples looking at the expression of canonical neurotransmitter transporter genes and the gene Tac2 in a single coronal section. In this notebook, we will prepare the data so that we can repeat those examples for all cells spanning the whole brain. This notebook takes ~20 seconds to run.

In [12]:
import os
import pandas as pd
import numpy as np
import anndata
import time

In [13]:
# Root of the data hand-off and the dated MERFISH dataframes folder under it.
input_base = '/allen/programs/celltypes/workgroups/rnaseqanalysis/lydian/ABC_handoff'
input_directory = os.path.join(
    input_base,
    'dataframes',
    'MERFISH-C57BL6J-638850',
    '20230630',
)

# Derived tables ("views") are written below input_directory, but only when
# cache_views is switched on.
view_directory = os.path.join(input_directory, 'views')
cache_views = False
if cache_views:
    os.makedirs(view_directory, exist_ok=True)

In [14]:
# Load the per-cell metadata table, keeping cell labels as strings, and key
# the table by cell label.
file = os.path.join(input_directory, 'cell_metadata.csv')
cell = pd.read_csv(file, dtype={'cell_label': str}).set_index('cell_label')
print(cell.shape[0])

4330907


In [15]:
# Count cells per expression-matrix prefix; the resulting index lists every
# matrix prefix that appears in the cell metadata.
cells_by_matrix = cell.groupby('matrix_prefix')
matrices = cells_by_matrix[['brain_section_label']].count()
matrices.head()

Unnamed: 0_level_0,brain_section_label
matrix_prefix,Unnamed: 1_level_1
C57BL6J-638850.01,26217
C57BL6J-638850.02,29286
C57BL6J-638850.03,36028
C57BL6J-638850.04,47445
C57BL6J-638850.05,50990


In [18]:
# Components of an expression-matrix path:
#   <expression_directory>/<dataset_label>/<release>/<matrix_prefix>-<normalization>.<ext>
expression_directory = os.path.join(input_base, 'expression_matrices')
dataset_label = 'MERFISH-C57BL6J-638850'
release = '20230630'
normalization = 'log2'
ext = 'h5ad'

# Use the first matrix prefix as a representative file.
matrix_prefix = matrices.index[0]

file = os.path.join(
    expression_directory,
    dataset_label,
    release,
    f'{matrix_prefix}-{normalization}.{ext}',
)
print(file)

/allen/programs/celltypes/workgroups/rnaseqanalysis/lydian/ABC_handoff/expression_matrices/MERFISH-C57BL6J-638850/20230630/C57BL6J-638850.01-log2.h5ad


In [19]:
# Open the representative matrix in backed (read-only) mode just to obtain the
# gene annotation table, then release the underlying HDF5 file handle so it is
# not left open for the rest of the session.
ad = anndata.read_h5ad(file,backed='r')
gene = ad.var
ad.file.close()

In [20]:
# Genes of interest: neurotransmitter transporter genes plus the example gene.
ntgenes = ['Slc17a7','Slc17a6','Slc17a8','Slc32a1','Slc6a5','Slc18a3','Slc6a3','Slc6a4','Slc6a2']
exgenes = ['Tac2']
gnames = ntgenes + exgenes

# Boolean mask over the gene table: True where the symbol is one we asked for.
pred = gene.gene_symbol.isin(gnames)
gene_filtered = gene[pred]
gene_filtered

Unnamed: 0_level_0,gene_symbol,transcript_identifier
gene_identifier,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSMUSG00000030500,Slc17a6,ENSMUST00000032710
ENSMUSG00000037771,Slc32a1,ENSMUST00000045738
ENSMUSG00000025400,Tac2,ENSMUST00000026466
ENSMUSG00000039728,Slc6a5,ENSMUST00000056442
ENSMUSG00000070570,Slc17a7,ENSMUST00000085374
ENSMUSG00000019935,Slc17a8,ENSMUST00000020102
ENSMUSG00000021609,Slc6a3,ENSMUST00000022100
ENSMUSG00000020838,Slc6a4,ENSMUST00000021195


In [22]:
# Create the combined gene expression dataframe up front: one row per cell in
# the metadata table, one column per selected gene identifier. dtype=float
# yields NaN-filled numeric columns rather than object columns, so the values
# assigned below are stored as plain floats.
gdata = pd.DataFrame(index=cell.index, columns=gene_filtered.index, dtype=float)

# NOTE: time.process_time() reports CPU time of this process, not wall-clock
# time, so time spent waiting on disk is not included in the figures printed.
total_start = time.process_time()

for mp in matrices.index :

    print(mp)

    file = os.path.join( expression_directory, dataset_label, release, '%s-%s.%s'% (mp,normalization,ext) )

    start = time.process_time()
    ad = anndata.read_h5ad(file,backed='r')
    try:
        # Pull only the selected gene columns into memory.
        exp = ad[:,gene_filtered.index].to_df()
    finally:
        # Backed mode keeps the HDF5 file open; release it before moving on so
        # handles do not accumulate across matrices.
        ad.file.close()

    # Write this matrix's cells into their rows of the combined table.
    gdata.loc[ exp.index, gene_filtered.index ] = exp
    print(" - time taken: ", time.process_time() - start)

print("total time taken: ", time.process_time() - total_start)
    

C57BL6J-638850.01
 - time taken:  1.9116383910000039
C57BL6J-638850.02
 - time taken:  0.10446433599999949
C57BL6J-638850.03
 - time taken:  0.12465494799999988
C57BL6J-638850.04
 - time taken:  0.15820483799999607
C57BL6J-638850.05
 - time taken:  0.16723013200000025
C57BL6J-638850.06
 - time taken:  0.17020694699999694
C57BL6J-638850.08
 - time taken:  0.17107610200000067
C57BL6J-638850.09
 - time taken:  0.2299707139999967
C57BL6J-638850.10
 - time taken:  0.1566791099999989
C57BL6J-638850.11
 - time taken:  0.16926130200000244
C57BL6J-638850.12
 - time taken:  0.18050294500000064
C57BL6J-638850.13
 - time taken:  0.23930026899999746
C57BL6J-638850.14
 - time taken:  0.2978103129999994
C57BL6J-638850.15
 - time taken:  0.2744895530000022
C57BL6J-638850.16
 - time taken:  0.29903905300000133
C57BL6J-638850.17
 - time taken:  0.22760415900000197
C57BL6J-638850.18
 - time taken:  0.1723526640000017
C57BL6J-638850.19
 - time taken:  0.24059457599999945
C57BL6J-638850.24
 - time taken:  

In [23]:
# Relabel the columns from gene identifiers to gene symbols.
gdata.columns = gene_filtered.gene_symbol

# Keep only the rows whose first gene column holds a value, i.e. rows that
# were filled in from an expression matrix, and detach the result as a copy.
first_gene = gdata.columns[0]
pred = gdata[first_gene].notna()
gdata = gdata.loc[pred].copy()
print(len(gdata))

4330907


In [24]:
# Optionally persist the per-cell expression table for the example genes so
# that it can be reloaded later without re-reading the expression matrices.
if cache_views:
    file = os.path.join(view_directory, 'example_genes_all_cells_expression.csv')
    gdata.to_csv(file)