In [1]:
import os
import pandas as pd
import numpy as np
import anndata
import time

In [2]:
input_base = '/allen/programs/celltypes/workgroups/rnaseqanalysis/lydian/ABC_handoff'
input_directory = os.path.join( input_base, 'metadata', '20230630', 'MERFISH-C57BL6J-638850' )

view_directory = os.path.join( input_directory, 'views')
cache_views = True
if cache_views :
    os.makedirs( view_directory, exist_ok=True )

In [3]:
file = os.path.join( input_directory,'cell_metadata.csv')
cell = pd.read_csv(file, dtype={'cell_label':str})
cell.set_index('cell_label',inplace=True)
print(len(cell))

4330907


In [4]:
matrices = cell.groupby('matrix_prefix')[['brain_section_label']].count()
matrices.head(5)

Unnamed: 0_level_0,brain_section_label
matrix_prefix,Unnamed: 1_level_1
C57BL6J-638850.01,26217
C57BL6J-638850.02,29286
C57BL6J-638850.03,36028
C57BL6J-638850.04,47445
C57BL6J-638850.05,50990


In [5]:
expression_directory = os.path.join(input_base, 'MERFISH')
ext = 'h5ad'
normalization = 'log2'

In [6]:
matrix_prefix = matrices.index[0]
file = os.path.join( expression_directory, '%s-%s.%s'% (matrix_prefix,normalization,ext) )
print(file)

/allen/programs/celltypes/workgroups/rnaseqanalysis/lydian/ABC_handoff/MERFISH/C57BL6J-638850.01-log2.h5ad


In [7]:
ad = anndata.read_h5ad(file,backed='r')
gene = ad.var

In [8]:
ntgenes = ['Slc17a7','Slc17a6','Slc17a8','Slc32a1','Slc6a5','Slc18a3','Slc6a3','Slc6a4','Slc6a2']
exgenes = ['Tac2']
gnames = ntgenes + exgenes
pred = [x in gnames for x in gene.gene_symbol]
gene_filtered = gene[pred]
gene_filtered

Unnamed: 0_level_0,gene_symbol,transcript_identifier
gene_identifier,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSMUSG00000030500,Slc17a6,ENSMUST00000032710
ENSMUSG00000037771,Slc32a1,ENSMUST00000045738
ENSMUSG00000025400,Tac2,ENSMUST00000026466
ENSMUSG00000039728,Slc6a5,ENSMUST00000056442
ENSMUSG00000070570,Slc17a7,ENSMUST00000085374
ENSMUSG00000019935,Slc17a8,ENSMUST00000020102
ENSMUSG00000021609,Slc6a3,ENSMUST00000022100
ENSMUSG00000020838,Slc6a4,ENSMUST00000021195


In [9]:
# create empty gene expression dataframe
gdata = pd.DataFrame(index=cell.index,columns=gene_filtered.index)
count = 0
total_start = time.process_time()

for mp in matrices.index :
    
    print(mp)
    
    file = os.path.join( expression_directory, '%s-%s.%s'% (mp,normalization,ext) )
    
    start = time.process_time()
    ad = anndata.read_h5ad(file,backed='r')
    exp = ad[:,gene_filtered.index].to_df()
    gdata.loc[ exp.index, gene_filtered.index ] = exp
    print(" - time taken: ", time.process_time() - start)
    
    count += 1
    
    #if count > 2 :
    #    break
        
print("total time taken: ", time.process_time() - total_start)
    

C57BL6J-638850.01
 - time taken:  1.8779552830000004
C57BL6J-638850.02
 - time taken:  0.10309131500000035
C57BL6J-638850.03
 - time taken:  0.1291523950000002
C57BL6J-638850.04
 - time taken:  0.18213723200000054
C57BL6J-638850.05
 - time taken:  0.18313839600000037
C57BL6J-638850.06
 - time taken:  0.1842877629999986
C57BL6J-638850.08
 - time taken:  0.17990856700000002
C57BL6J-638850.09
 - time taken:  0.24475073700000038
C57BL6J-638850.10
 - time taken:  0.16943006900000057
C57BL6J-638850.11
 - time taken:  0.17858449499999907
C57BL6J-638850.12
 - time taken:  0.19992414100000033
C57BL6J-638850.13
 - time taken:  0.2709365100000003
C57BL6J-638850.14
 - time taken:  0.3558575829999988
C57BL6J-638850.15
 - time taken:  0.3187146130000009
C57BL6J-638850.16
 - time taken:  0.36180216600000037
C57BL6J-638850.17
 - time taken:  0.266925874
C57BL6J-638850.18
 - time taken:  0.1927083550000006
C57BL6J-638850.19
 - time taken:  0.2932950870000006
C57BL6J-638850.24
 - time taken:  0.28326902

In [10]:
# change columns from index to gene symbol
gdata.columns = gene_filtered.gene_symbol
pred = pd.notna(gdata[gdata.columns[0]])
gdata = gdata[pred].copy(deep=True)
print(len(gdata))

4330907


In [11]:
if cache_views :
    file = os.path.join( view_directory, 'example_genes_all_cells_expression.csv')
    gdata.to_csv( file )