In [1]:
import os
import pandas as pd
from pathlib import Path
import numpy as np
import anndata
import time
import matplotlib.pyplot as plt
import json

from abc_atlas_access.abc_atlas_cache.abc_project_cache import AbcProjectCache

In [2]:
# Release to build the taxonomy pages for; also used as the output sub-directory name.
version = '20231215'

# Local directory handed to the cache as its download root.
download_base = Path('../../../abc_download_root')
abc_cache = AbcProjectCache.from_s3_cache(download_base)

# Pin the cache to the manifest of the chosen release.
manifest_path = 'releases/{}/manifest.json'.format(version)
abc_cache.load_manifest(manifest_path)


The manifest file you are loading is not the most up to date manifest file available for this dataset. The most up to data manifest file available for this dataset is 

releases/20241115/manifest.json

To see the differences between these manifests,run

type.compare_manifests('releases/20241115/manifest.json', 'releases/20231215/manifest.json')

To see all of the manifest files currently downloaded onto your local system, run

self.list_all_downloaded_manifests()

If you just want to load the latest manifest, run

self.load_latest_manifest()




Read in cluster annotation term set dataframe

In [3]:
# Term-set table, keyed by the term-set label.
term_sets = abc_cache.get_metadata_dataframe(
    directory='WMB-taxonomy',
    file_name='cluster_annotation_term_set',
)
term_sets = term_sets.set_index('label')
term_sets

Unnamed: 0_level_0,name,description,order
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CCN20230722_NEUR,neurotransmitter,Clusters are assigned based on the average exp...,0
CCN20230722_CLAS,class,The top level of cell type definition in the m...,1
CCN20230722_SUBC,subclass,The coarse level of cell type definition in th...,2
CCN20230722_SUPT,supertype,The second finest level of cell type definitio...,3
CCN20230722_CLUS,cluster,The finest level of cell type definition in th...,4


Read in cluster annotation term dataframe. Spike in a row to handle unassigned neurotransmitter clusters.

In [4]:
term = abc_cache.get_metadata_dataframe(directory='WMB-taxonomy', file_name='cluster_annotation_term')

# Extra neurotransmitter term named 'Other'. Only the fields listed below are
# populated; every other column of the spiked-in row stays empty.
other_fields = {
    'name': 'Other',
    'cluster_annotation_term_set_name': 'neurotransmitter',
    'color_hex_triplet': '#ebebeb',
    'term_set_order': 0,
    'term_order': 9,
}
other_term = pd.DataFrame(columns=term.columns)
for column, value in other_fields.items():
    other_term.loc[0, column] = value

# Append the extra row, then order terms by term set and by position within the set.
term = pd.concat([term, other_term], ignore_index=True)
term = term.sort_values(['term_set_order', 'term_order'])

Find and store first child for each term

In [5]:
# Show the terms that belong to the neurotransmitter term set.
term.loc[term['cluster_annotation_term_set_name'].eq('neurotransmitter')]

Unnamed: 0,label,name,cluster_annotation_term_set_label,parent_term_label,parent_term_set_label,term_set_order,term_order,cluster_annotation_term_set_name,color_hex_triplet
0,CS20230722_NEUR_Glut,Glut,CCN20230722_NEUR,,,0,0,neurotransmitter,#2B93DF
1,CS20230722_NEUR_NA,,CCN20230722_NEUR,,,0,1,neurotransmitter,#666666
2,CS20230722_NEUR_GABA,GABA,CCN20230722_NEUR,,,0,2,neurotransmitter,#FF3358
3,CS20230722_NEUR_Dopa,Dopa,CCN20230722_NEUR,,,0,3,neurotransmitter,#fcf04b
4,CS20230722_NEUR_Glut-GABA,Glut-GABA,CCN20230722_NEUR,,,0,4,neurotransmitter,#0a9964
5,CS20230722_NEUR_Chol,Chol,CCN20230722_NEUR,,,0,5,neurotransmitter,#73E785
6,CS20230722_NEUR_Hist,Hist,CCN20230722_NEUR,,,0,6,neurotransmitter,#ff7621
7,CS20230722_NEUR_GABA-Glyc,GABA-Glyc,CCN20230722_NEUR,,,0,7,neurotransmitter,#820e57
8,CS20230722_NEUR_Sero,Sero,CCN20230722_NEUR,,,0,8,neurotransmitter,#533691
9,CS20230722_NEUR_Nora,Nora,CCN20230722_NEUR,,,0,9,neurotransmitter,#03EDFF


In [6]:
# Child terms only: rows that name a parent term.
filtered = term[term['parent_term_label'].notna()]

# Keep the whole first row seen for each parent, in the current row order of `term`.
# GroupBy.first() was avoided on purpose: it returns the first non-null value
# of each column independently, so a missing value in the first child row would
# silently be filled from a different child.
child_columns = ['label', 'name', 'term_order', 'cluster_annotation_term_set_name']
first_child = (
    filtered
    .drop_duplicates(subset='parent_term_label', keep='first')
    .set_index('parent_term_label')[child_columns]
    .sort_index()  # same parent-label ordering a groupby would produce
)
first_child

Unnamed: 0_level_0,label,name,term_order,cluster_annotation_term_set_name
parent_term_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CS20230722_CLAS_01,CS20230722_SUBC_001,001 CLA-EPd-CTX Car3 Glut,0,subclass
CS20230722_CLAS_02,CS20230722_SUBC_027,027 L6b EPd Glut,26,subclass
CS20230722_CLAS_03,CS20230722_SUBC_035,035 OB Eomes Ms4a15 Glut,34,subclass
CS20230722_CLAS_04,CS20230722_SUBC_037,037 DG Glut,36,subclass
CS20230722_CLAS_05,CS20230722_SUBC_039,039 OB Meis2 Thsd7b Gaba,38,subclass
...,...,...,...,...
CS20230722_SUPT_1197,CS20230722_CLUS_5316,5316 DC NN_1,5315,cluster
CS20230722_SUPT_1198,CS20230722_CLUS_5319,5319 B cells NN_1,5318,cluster
CS20230722_SUPT_1199,CS20230722_CLUS_5320,5320 ILC NN_2,5319,cluster
CS20230722_SUPT_1200,CS20230722_CLUS_5321,5321 NK cells NN_3,5320,cluster


In [7]:
# Index by term label so the per-parent values align with their parent rows.
term = term.set_index('label')
parent_labels = first_child.index
term.loc[parent_labels, 'first_child_label'] = first_child['label']
term.loc[parent_labels, 'first_child_term_set_name'] = first_child['cluster_annotation_term_set_name']
# Restore 'label' as an ordinary column.
term = term.reset_index()

In [8]:
# Preview a few terms that received a first-child link.
term[term['first_child_label'].notna()].head(5)

Unnamed: 0,label,name,cluster_annotation_term_set_label,parent_term_label,parent_term_set_label,term_set_order,term_order,cluster_annotation_term_set_name,color_hex_triplet,first_child_label,first_child_term_set_name
11,CS20230722_CLAS_01,01 IT-ET Glut,CCN20230722_CLAS,,,1,0,class,#FA0087,CS20230722_SUBC_001,subclass
12,CS20230722_CLAS_02,02 NP-CT-L6b Glut,CCN20230722_CLAS,,,1,1,class,#61e2a4,CS20230722_SUBC_027,subclass
13,CS20230722_CLAS_03,03 OB-CR Glut,CCN20230722_CLAS,,,1,2,class,#D00000,CS20230722_SUBC_035,subclass
14,CS20230722_CLAS_04,04 DG-IMN Glut,CCN20230722_CLAS,,,1,3,class,#16f2f2,CS20230722_SUBC_037,subclass
15,CS20230722_CLAS_05,05 OB-IMN GABA,CCN20230722_CLAS,,,1,4,class,#1b4332,CS20230722_SUBC_039,subclass


Read in the cluster annotation term name pivot table and sort it

In [9]:
# Pivoted membership table; every missing value is replaced by the string 'Other'.
df = abc_cache.get_metadata_dataframe(
    directory='WMB-taxonomy',
    file_name='cluster_to_cluster_annotation_membership_pivoted',
).fillna('Other')

# Order rows from the coarsest to the finest taxonomy level.
df_sorted = df.sort_values(by=['class', 'subclass', 'supertype', 'cluster'])
df_sorted

cluster_to_cluster_annotation_membership_pivoted.csv: 100%|██████████████████████████████████████| 531k/531k [00:00<00:00, 5.64MMB/s]


Unnamed: 0,cluster_alias,neurotransmitter,class,subclass,supertype,cluster
125,128,Glut,01 IT-ET Glut,001 CLA-EPd-CTX Car3 Glut,0001 CLA-EPd-CTX Car3 Glut_1,0001 CLA-EPd-CTX Car3 Glut_1
126,129,Glut,01 IT-ET Glut,001 CLA-EPd-CTX Car3 Glut,0001 CLA-EPd-CTX Car3 Glut_1,0002 CLA-EPd-CTX Car3 Glut_1
127,130,Glut,01 IT-ET Glut,001 CLA-EPd-CTX Car3 Glut,0001 CLA-EPd-CTX Car3 Glut_1,0003 CLA-EPd-CTX Car3 Glut_1
140,143,Glut,01 IT-ET Glut,001 CLA-EPd-CTX Car3 Glut,0001 CLA-EPd-CTX Car3 Glut_1,0004 CLA-EPd-CTX Car3 Glut_1
128,131,Glut,01 IT-ET Glut,001 CLA-EPd-CTX Car3 Glut,0002 CLA-EPd-CTX Car3 Glut_2,0005 CLA-EPd-CTX Car3 Glut_2
...,...,...,...,...,...,...
5028,5279,Other,34 Immune,337 DC NN,1197 DC NN_1,5318 DC NN_1
5024,5275,Other,34 Immune,338 Lymphoid NN,1198 B cells NN_1,5319 B cells NN_1
5021,5272,Other,34 Immune,338 Lymphoid NN,1199 ILC NN_2,5320 ILC NN_2
5023,5274,Other,34 Immune,338 Lymphoid NN,1200 NK cells NN_3,5321 NK cells NN_3


Create a name-indexed lookup dataframe for each term set

In [10]:
# Maps each term-set name to the rows of `term` in that set, indexed by term name.
lookup = {}
for tag in term_sets['name']:
    in_term_set = term['cluster_annotation_term_set_name'].eq(tag)
    lookup[tag] = term.loc[in_term_set].set_index('name')

Helper functions to look up a term attribute and format a cell in the HTML table

In [11]:
def get_value(c, n, v) :
    """Return attribute `v` of the term named `n` in term set `c`.

    Reads the module-level `lookup` dict, which holds one name-indexed
    dataframe per term set.
    """
    term_table = lookup[c]
    term_row = term_table.loc[n]
    return term_row[v]

def format_cell(df, c, add_id=False, add_plus=False, add_minus=False) :
    """Build the HTML content for every cell of column `c` in `df`.

    Each cell is the concatenation, in this order, of: an optional empty
    anchor div whose id is the term label, a colored circle div, the term
    name, an optional "[+]" link to the term's first child and an optional
    "[-]" link to the term itself on its own term-set page.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding term names in column `c`.
    c : str
        Column name; also the term-set name used for attribute lookups.
    add_id, add_plus, add_minus : bool
        Toggle the anchor div, the "[+]" link and the "[-]" link.

    Returns
    -------
    pandas.Series
        One HTML string per row, indexed like `df`.
    """
    term_names = list(df[c])

    def attribute(field):
        # Attribute `field` of each term named in the column, in row order.
        return [get_value(c, term_name, field) for term_name in term_names]

    circle_html = ['<div class="circle" style="background-color:%s"></div>' % color
                   for color in attribute('color_hex_triplet')]
    name_html = ['<div class="celltext">%s</div>' % term_name for term_name in term_names]

    # Optional fragments fall back to an empty string for every row.
    id_html = ''
    if add_id:
        id_html = ['<div id="%s"></div>' % label for label in attribute('label')]

    plus_html = ''
    if add_plus:
        child_pages = attribute('first_child_term_set_name')
        child_labels = attribute('first_child_label')
        plus_html = ['<div class="celltext"><a href="%s.html#%s">[+]</a></div>' % target
                     for target in zip(child_pages, child_labels)]

    minus_html = ''
    if add_minus:
        own_pages = attribute('cluster_annotation_term_set_name')
        own_labels = attribute('label')
        minus_html = ['<div class="celltext"><a href="%s.html#%s">[-]</a></div>' % target
                      for target in zip(own_pages, own_labels)]

    # Column order here is the order in which the fragments are concatenated.
    divs = pd.DataFrame(
        {
            'id': id_html,
            'circle': circle_html,
            'name': name_html,
            'plus': plus_html,
            'minus': minus_html,
        },
        index=df.index,
    )
    return divs.apply(lambda row: ''.join(row.values.astype(str)), axis=1)


In [12]:
def create_html(df, ts, file, title) :
    """Render `df` as a styled HTML table and write the page to `file`.

    Every column of `df` whose name is listed in the module-level
    `term_sets['name']` is replaced by the HTML produced by `format_cell`.
    The column equal to `ts` gets anchor ids and, unless it is one of
    'cluster', 'neurotransmitter' or 'division', "[+]" links. Every other
    term-set column except 'neurotransmitter' gets "[-]" links.

    Parameters
    ----------
    df : pandas.DataFrame
        Table of term names, one column per term set to show.
    ts : str
        Name of the term set this page is about.
    file : str or path-like
        Destination of the HTML file; it is overwritten if it exists.
    title : str
        Page title; it is HTML-escaped before being inserted.
    """
    import html  # local import: only needed to escape the page title

    # Term sets whose cells never receive a "[+]" link.
    no_plus = ('cluster', 'neurotransmitter', 'division')

    df_formatted = df.copy()
    for tag in term_sets['name']:
        if tag not in df_formatted.columns:
            continue

        is_page_level = (tag == ts)
        add_plus = is_page_level and tag not in no_plus
        # Compare by equality: `tag not in ('neurotransmitter')` tests against a
        # plain string (the parentheses do not make a tuple), i.e. a substring check.
        add_minus = (not is_page_level) and tag != 'neurotransmitter'

        # Cells are always formatted from the untouched input frame.
        df_formatted[tag] = format_cell(df, tag, is_page_level, add_plus, add_minus)

    table = df_formatted.to_html(index=False, na_rep='',
                        render_links=True, escape=False,
                        classes="mystyle")

    # Fill the template in a single pass so that braces or percent signs inside
    # the title or the table can never be interpreted as placeholders.
    page = '''
    <html>
    <head>
    <meta charset="utf-8"/>
    <title>{title}</title>
    <link rel="stylesheet" type="text/css" href="../../simple_style.css"/>
    </head>
    <body>
    {table}
    </body>
    </html>
    '''.format(title=html.escape(title), table=table)

    # Explicit encoding keeps the output independent of the platform default.
    with open(file, 'w', encoding='utf-8') as f:
        f.write(page)

Helper function to create html document

In [13]:
# The pages go under the _static directory of abc_atlas_access so that links
# resolve properly in the jupyter-book/sphinx page.
static_root = '../../_static'
output_directory = os.path.join(static_root, 'WMB-taxonomy', version)
os.makedirs(output_directory, exist_ok=True)

In [14]:
# One row per class. The variable is called `df_supertype` in every export
# cell, whatever the taxonomy level; the name is kept unchanged here.
df_supertype = df_sorted[['class']].drop_duplicates()

file = os.path.join(output_directory, 'class.html')
title = 'WMB-taxonomy: cell type classes'  # fixed typo: was 'WMB-taxonmy'
create_html(df_supertype, 'class', file, title)
print(len(df_supertype))
df_supertype

34


Unnamed: 0,class
125,01 IT-ET Glut
17,02 NP-CT-L6b Glut
2814,03 OB-CR Glut
1378,04 DG-IMN Glut
1238,05 OB-IMN GABA
636,06 CTX-CGE GABA
708,07 CTX-MGE GABA
1197,08 CNU-MGE GABA
1158,09 CNU-LGE GABA
978,10 LSX GABA


In [15]:
# One row per (class, subclass) pair. The variable is called `df_supertype`
# in every export cell, whatever the taxonomy level; the name is kept unchanged here.
df_supertype = df_sorted[['class', 'subclass']].drop_duplicates()

file = os.path.join(output_directory, 'subclass.html')
title = 'WMB-taxonomy: cell type subclasses'  # fixed typo: was 'WMB-taxonmy'
create_html(df_supertype, 'subclass', file, title)
print(len(df_supertype))
df_supertype

338


Unnamed: 0,class,subclass
125,01 IT-ET Glut,001 CLA-EPd-CTX Car3 Glut
113,01 IT-ET Glut,002 IT EP-CLA Glut
94,01 IT-ET Glut,003 L5/6 IT TPE-ENT Glut
98,01 IT-ET Glut,004 L6 IT CTX Glut
95,01 IT-ET Glut,005 L5 IT CTX Glut
...,...,...
5029,34 Immune,334 Microglia NN
5019,34 Immune,335 BAM NN
5025,34 Immune,336 Monocytes NN
5026,34 Immune,337 DC NN


In [16]:
# One row per (class, subclass, supertype) combination.
df_supertype = df_sorted[['class', 'subclass', 'supertype']].drop_duplicates()

file = os.path.join(output_directory, 'supertype.html')
title = 'WMB-taxonomy: cell type supertypes'  # fixed typo: was 'WMB-taxonmy'
create_html(df_supertype, 'supertype', file, title)
print(len(df_supertype))
df_supertype

1201


Unnamed: 0,class,subclass,supertype
125,01 IT-ET Glut,001 CLA-EPd-CTX Car3 Glut,0001 CLA-EPd-CTX Car3 Glut_1
128,01 IT-ET Glut,001 CLA-EPd-CTX Car3 Glut,0002 CLA-EPd-CTX Car3 Glut_2
113,01 IT-ET Glut,002 IT EP-CLA Glut,0003 IT EP-CLA Glut_1
114,01 IT-ET Glut,002 IT EP-CLA Glut,0004 IT EP-CLA Glut_2
154,01 IT-ET Glut,002 IT EP-CLA Glut,0005 IT EP-CLA Glut_3
...,...,...,...
5026,34 Immune,337 DC NN,1197 DC NN_1
5024,34 Immune,338 Lymphoid NN,1198 B cells NN_1
5021,34 Immune,338 Lymphoid NN,1199 ILC NN_2
5023,34 Immune,338 Lymphoid NN,1200 NK cells NN_3


In [17]:
# Columns shown on the cluster page; the '20230630' release additionally
# gets a leading 'division' column.
cluster_columns = ['class', 'subclass', 'supertype', 'cluster', 'neurotransmitter']
if version == '20230630':
    cluster_columns = ['division'] + cluster_columns

# The variable is called `df_supertype` in every export cell, whatever the
# taxonomy level; the name is kept unchanged here.
df_supertype = df_sorted[cluster_columns].drop_duplicates()

file = os.path.join(output_directory, 'cluster.html')
title = 'WMB-taxonomy: cell type clusters'  # fixed typo: was 'WMB-taxonmy'
create_html(df_supertype, 'cluster', file, title)
# Report the number of exported rows, as the other export cells do
# (this cell used to print len(df_sorted) instead).
print(len(df_supertype))
df_supertype

5322


Unnamed: 0,class,subclass,supertype,cluster,neurotransmitter
125,01 IT-ET Glut,001 CLA-EPd-CTX Car3 Glut,0001 CLA-EPd-CTX Car3 Glut_1,0001 CLA-EPd-CTX Car3 Glut_1,Glut
126,01 IT-ET Glut,001 CLA-EPd-CTX Car3 Glut,0001 CLA-EPd-CTX Car3 Glut_1,0002 CLA-EPd-CTX Car3 Glut_1,Glut
127,01 IT-ET Glut,001 CLA-EPd-CTX Car3 Glut,0001 CLA-EPd-CTX Car3 Glut_1,0003 CLA-EPd-CTX Car3 Glut_1,Glut
140,01 IT-ET Glut,001 CLA-EPd-CTX Car3 Glut,0001 CLA-EPd-CTX Car3 Glut_1,0004 CLA-EPd-CTX Car3 Glut_1,Glut
128,01 IT-ET Glut,001 CLA-EPd-CTX Car3 Glut,0002 CLA-EPd-CTX Car3 Glut_2,0005 CLA-EPd-CTX Car3 Glut_2,Glut
...,...,...,...,...,...
5028,34 Immune,337 DC NN,1197 DC NN_1,5318 DC NN_1,Other
5024,34 Immune,338 Lymphoid NN,1198 B cells NN_1,5319 B cells NN_1,Other
5021,34 Immune,338 Lymphoid NN,1199 ILC NN_2,5320 ILC NN_2,Other
5023,34 Immune,338 Lymphoid NN,1200 NK cells NN_3,5321 NK cells NN_3,Other


In [18]:
# The division page is only produced for the '20230630' release.
if version == '20230630' :
    # The variable is called `df_supertype` in every export cell, whatever the
    # taxonomy level; the name is kept unchanged here.
    df_supertype = df_sorted[['division']].drop_duplicates()

    file = os.path.join(output_directory, 'division.html')
    title = 'WMB-taxonomy: cell type division'  # fixed typo: was 'WMB-taxonmy'
    create_html(df_supertype, 'division', file, title)
    print(len(df_supertype))
    # A bare expression nested inside an `if` block is not rendered by the
    # notebook, so the table is displayed explicitly (`display` is provided
    # by the IPython kernel without an import).
    display(df_supertype)

In [19]:
# One row per neurotransmitter type. The variable is called `df_supertype`
# in every export cell, whatever the taxonomy level; the name is kept unchanged here.
df_supertype = df_sorted[['neurotransmitter']].drop_duplicates()

file = os.path.join(output_directory, 'neurotransmitter.html')
title = 'WMB-taxonomy: neurotransmitter types'  # fixed typo: was 'WMB-taxonmy'
create_html(df_supertype, 'neurotransmitter', file, title)
print(len(df_supertype))
df_supertype

10


Unnamed: 0,neurotransmitter
125,Glut
1351,Other
1238,GABA
1375,Dopa
1358,Glut-GABA
559,Chol
5035,Hist
2863,GABA-Glyc
2781,Sero
3813,Nora
