In [1]:
import os
import pandas as pd
import numpy as np
import anndata
import time

In [2]:
input_base = '/allen/programs/celltypes/workgroups/rnaseqanalysis/lydian/ABC_handoff'
input_directory = os.path.join( input_base, 'metadata', '20230630', '10x-scRNA-seq' )

view_directory = os.path.join( input_directory, 'views')
cache_views = False
if cache_views :
    os.makedirs( view_directory, exist_ok=True )

In [3]:
file = os.path.join( input_directory,'cell_metadata.csv')
cell = pd.read_csv(file)
cell.set_index('cell_label',inplace=True)

In [12]:
matrices = cell.groupby('matrix_prefix')[['library_label']].count()
matrices

Unnamed: 0_level_0,library_label
matrix_prefix,Unnamed: 1_level_1
WMB-10Xv2-CTXsp,44310
WMB-10Xv2-HPF,208299
WMB-10Xv2-HY,100562
WMB-10Xv2-Isocortex,1000160
WMB-10Xv2-MB,29891
WMB-10Xv2-OLF,193723
WMB-10Xv2-TH,131212
WMB-10Xv3-CB,182026
WMB-10Xv3-CTXsp,78464
WMB-10Xv3-HPF,181653


In [5]:
expression_directory = os.path.join(input_base, '10x')
ext = 'h5ad'
normalization = 'log2'

In [6]:
matrix_prefix = matrices.index[0]
file = os.path.join( expression_directory, '%s-%s.%s'% (matrix_prefix,normalization,ext) )
print(file)

/allen/programs/celltypes/workgroups/rnaseqanalysis/lydian/ABC_handoff/10x/WMB-10Xv2-CTXsp-log2.h5ad


In [7]:
ad = anndata.read_h5ad(file,backed='r')
gene = ad.var

In [8]:
ntgenes = ['Slc17a7','Slc17a6','Slc17a8','Slc32a1','Slc6a5','Slc18a3','Slc6a3','Slc6a4','Slc6a2']
exgenes = ['Tac2']
gnames = ntgenes + exgenes
pred = [x in gnames for x in gene.gene_symbol]
gene_filtered = gene[pred]
gene_filtered

Unnamed: 0_level_0,gene_symbol
gene_identifier,Unnamed: 1_level_1
ENSMUSG00000037771,Slc32a1
ENSMUSG00000070570,Slc17a7
ENSMUSG00000039728,Slc6a5
ENSMUSG00000030500,Slc17a6
ENSMUSG00000055368,Slc6a2
ENSMUSG00000019935,Slc17a8
ENSMUSG00000025400,Tac2
ENSMUSG00000020838,Slc6a4
ENSMUSG00000021609,Slc6a3
ENSMUSG00000100241,Slc18a3


In [9]:
# create empty gene expression dataframe
gdata = pd.DataFrame(index=cell.index,columns=gene_filtered.index)
count = 0
total_start = time.process_time()

for mp in matrices.index :
    
    print(mp)
    
    file = os.path.join( expression_directory, '%s-%s.%s'% (mp,normalization,ext) )
    
    start = time.process_time()
    ad = anndata.read_h5ad(file,backed='r')
    exp = ad[:,gene_filtered.index].to_df()
    gdata.loc[ exp.index, gene_filtered.index ] = exp
    print(" - time taken: ", time.process_time() - start)
    
    count += 1
    
    #if count > 2 :
    #    break
        
print("total time taken: ", time.process_time() - total_start)
    

WMB-10Xv2-CTXsp
 - time taken:  3.3886422189999994
WMB-10Xv2-HPF
 - time taken:  6.172880113999998
WMB-10Xv2-HY
 - time taken:  3.0396994699999986
WMB-10Xv2-Isocortex
 - time taken:  70.232477417
WMB-10Xv2-MB
 - time taken:  0.8921422459999917
WMB-10Xv2-OLF
 - time taken:  5.028425851000009
WMB-10Xv2-TH
 - time taken:  3.949250598000006
WMB-10Xv3-CB
 - time taken:  5.415416426999997
WMB-10Xv3-CTXsp
 - time taken:  3.077125742000007
WMB-10Xv3-HPF
 - time taken:  6.825266414999987
WMB-10Xv3-HY
 - time taken:  6.654827050000009
WMB-10Xv3-Isocortex
 - time taken:  36.753458218000006
WMB-10Xv3-MB
 - time taken:  12.666130941999995
WMB-10Xv3-MY
 - time taken:  6.877677086000006
WMB-10Xv3-OLF
 - time taken:  3.010932175999983
WMB-10Xv3-P
 - time taken:  5.684849082999989
WMB-10Xv3-PAL
 - time taken:  4.4091879450000135
WMB-10Xv3-STR
 - time taken:  14.248551574000004
WMB-10Xv3-TH
 - time taken:  5.419645736999996
total time taken:  203.751599182


In [10]:
# change columns from index to gene symbol
gdata.columns = gene_filtered.gene_symbol
pred = pd.notna(gdata[gdata.columns[0]])
gdata = gdata[pred].copy(deep=True)
print(len(gdata))

4057701


In [11]:
if cache_views :
    file = os.path.join( view_directory, 'example_genes_all_cells_expression.csv')
    gdata.to_csv( file )