In [1]:
import os
import pandas as pd
from pathlib import Path
import numpy as np
import anndata
import time
import matplotlib.pyplot as plt
import json

from abc_atlas_access.abc_atlas_cache.abc_project_cache import AbcProjectCache

In [2]:
# Data release whose manifest is loaded below; also used as the output sub-directory name.
version = '20240330'
# Path handed to AbcProjectCache.from_s3_cache as the download location.
download_base = Path('../../../abc_download_root')
abc_cache = AbcProjectCache.from_s3_cache(download_base)
# NOTE(review): this overwrites a private attribute of the cache with the list of
# manifests that are already downloaded — presumably so the pinned release below is
# resolved against local manifests only. Confirm against the abc_atlas_access API.
abc_cache.cache._manifest_file_names = abc_cache.cache.list_all_downloaded_manifests()
# Pin the manifest to the release named in `version`.
abc_cache.load_manifest(f'releases/{version}/manifest.json')
abc_cache.current_manifest

releases/20231215/manifest.json
which is newer than the most recent manifest file you have previously been working with
releases/20240330/manifest.json
It is possible that some data files have changed between these two data releases, which will force you to re-download those data files (currently downloaded files will not be overwritten). To continue using releases/20240330/manifest.json, run
type.load_manifest('releases/20240330/manifest.json')


'releases/20240330/manifest.json'

In [3]:
# List the metadata tables available under 'WHB-taxonomy' in the loaded manifest.
abc_cache.list_metadata_files('WHB-taxonomy')

['cluster',
 'cluster_annotation_term',
 'cluster_annotation_term_set',
 'cluster_to_cluster_annotation_membership']

In [4]:
# Load the 'cluster' metadata table of 'WHB-taxonomy' as a dataframe.
cluster = abc_cache.get_metadata_dataframe('WHB-taxonomy', 'cluster')

In [5]:
# Display the cluster table (one row per cluster alias).
cluster

Unnamed: 0,cluster_alias,number_of_cells,label
0,0,34,CS202210140_494
1,1,220,CS202210140_495
2,2,187,CS202210140_496
3,3,246,CS202210140_497
4,4,188,CS202210140_498
...,...,...,...
3308,3308,140,CS202210140_3802
3309,3309,138,CS202210140_3803
3310,3310,85,CS202210140_3804
3311,3311,93,CS202210140_3805


Read in cluster annotation term set dataframe

In [6]:
# Load the term-set table and key it by the term-set label.
term_sets = (
    abc_cache
    .get_metadata_dataframe(
        directory='WHB-taxonomy',
        file_name='cluster_annotation_term_set',
    )
    .set_index('label')
)
term_sets

Unnamed: 0_level_0,name,description,order
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CCN202210140_SUBC,subcluster,The finest level of cell type definition in th...,2
CCN202210140_CLUS,cluster,An intermediate level of cell type definitions...,1
CCN202210140_SUPC,supercluster,The top level of cell type definition in the h...,0
CCN202210140_NEUR,neurotransmitter,Neurotransmitter terms are assigned to cluster...,3


Read in cluster annotation term dataframe. Spike in a row to handle unassigned neurotransmitter clusters.

In [7]:
term = abc_cache.get_metadata_dataframe(directory='WHB-taxonomy', file_name='cluster_annotation_term')

# Clusters without a neurotransmitter assignment are filled in as 'Other' when the
# pivot table is built, so the term table needs a matching 'Other' row to supply a
# colour for them.
neurotransmitter_terms = term[term['cluster_annotation_term_set_name'] == 'neurotransmitter']

other_term = pd.DataFrame(columns=term.columns)
other_term.loc[0,'name'] = 'Other'
other_term.loc[0,'cluster_annotation_term_set_name'] = 'neurotransmitter'
other_term.loc[0,'color_hex_triplet'] = '#ebebeb'
# Take the sort keys from the real neurotransmitter terms: same term-set order, and a
# term order one past the last real term. Hard-coding 0 / 9 made 'Other' sort among
# the terms of a different term set and reuse an existing term order.
other_term.loc[0,'term_set_order'] = neurotransmitter_terms['term_set_order'].iloc[0]
other_term.loc[0,'term_order'] = neurotransmitter_terms['term_order'].max() + 1
term = pd.concat([term,other_term], ignore_index=True)

# Sort by term set, then by term within the set (re-bound rather than sorted in place
# so re-running the cell never mutates a frame another name may still refer to).
term = term.sort_values(['term_set_order','term_order'])

Find and store first child for each term

In [8]:
# Display the sorted term table, including the spiked-in 'Other' row.
term

Unnamed: 0,label,name,cluster_annotation_term_set_label,parent_term_label,parent_term_set_label,term_set_order,term_order,cluster_annotation_term_set_name,color_hex_triplet,number_of_cells,description
0,CS202210140_476,Upper-layer intratelencephalic,CCN202210140_SUPC,,,0,0,supercluster,#FEA7BA,455006,Upper-layer intratelencephalic
1,CS202210140_477,Deep-layer intratelencephalic,CCN202210140_SUPC,,,0,1,supercluster,#426600,228467,Deep-layer intratelencephalic
2,CS202210140_473,Deep-layer near-projecting,CCN202210140_SUPC,,,0,2,supercluster,#EE1010,18856,Deep-layer near-projecting
3,CS202210140_474,Deep-layer corticothalamic and 6b,CCN202210140_SUPC,,,0,3,supercluster,#5EF0F1,78396,Deep-layer corticothalamic and 6b
4,CS202210140_484,MGE interneuron,CCN202210140_SUPC,,,0,4,supercluster,#DFFE66,222434,MGE interneuron
...,...,...,...,...,...,...,...,...,...,...,...
3820,CS202210140_3822,VGLUT1 VGLUT2 VGLUT3,CCN202210140_NEUR,,,3,15,neurotransmitter,#2B93DF,29786,"Glutamatergic 1, Glutamatergic 2, Glutamatergic 3"
3821,CS202210140_3823,VGLUT2,CCN202210140_NEUR,,,3,16,neurotransmitter,#196AA5,189776,Glutamatergic 2
3822,CS202210140_3824,VGLUT2 VGLUT3,CCN202210140_NEUR,,,3,17,neurotransmitter,#2252C2,10419,"Glutamatergic 2, Glutamatergic 3"
3823,CS202210140_3825,VGLUT3,CCN202210140_NEUR,,,3,18,neurotransmitter,#2B39DF,1608,Glutamatergic 3


In [9]:
# Keep only terms that have a parent, then take, for every parent label, the first
# value of each selected column in the table's current row order.
has_parent = term['parent_term_label'].notna()
filtered = term[has_parent]
child_columns = ['label','name','term_order','cluster_annotation_term_set_name']
first_child = filtered.groupby('parent_term_label')[child_columns].first()
first_child

Unnamed: 0_level_0,label,name,term_order,cluster_annotation_term_set_name
parent_term_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CS202210140_1,CS202210140_3787,Bcell_0_3293,0,subcluster
CS202210140_10,CS202210140_3773,Mgl_9_3279,0,subcluster
CS202210140_100,CS202210140_3310,DLCT6b_99_2816,0,subcluster
CS202210140_101,CS202210140_3005,DLCT6b_100_2511,0,subcluster
CS202210140_102,CS202210140_3340,DLCT6b_101_2846,0,subcluster
...,...,...,...,...
CS202210140_95,CS202210140_3446,DLNP_94_2952,0,subcluster
CS202210140_96,CS202210140_3461,DLNP_95_2967,0,subcluster
CS202210140_97,CS202210140_3426,DLNP_96_2932,0,subcluster
CS202210140_98,CS202210140_3416,DLCT6b_97_2922,0,subcluster


In [10]:
# Index the term table by label so the per-parent values in `first_child` (indexed by
# parent label) can be written onto the parent rows, then restore 'label' as a column.
term_by_label = term.set_index('label')
parent_labels = first_child.index
term_by_label.loc[parent_labels, 'first_child_label'] = first_child['label']
term_by_label.loc[parent_labels, 'first_child_term_set_name'] = first_child['cluster_annotation_term_set_name']
term = term_by_label.reset_index()

In [11]:
# Show only the terms that received a first child.
term[term['first_child_label'].notna()]

Unnamed: 0,label,name,cluster_annotation_term_set_label,parent_term_label,parent_term_set_label,term_set_order,term_order,cluster_annotation_term_set_name,color_hex_triplet,number_of_cells,description,first_child_label,first_child_term_set_name
0,CS202210140_476,Upper-layer intratelencephalic,CCN202210140_SUPC,,,0,0,supercluster,#FEA7BA,455006,Upper-layer intratelencephalic,CS202210140_121,cluster
1,CS202210140_477,Deep-layer intratelencephalic,CCN202210140_SUPC,,,0,1,supercluster,#426600,228467,Deep-layer intratelencephalic,CS202210140_137,cluster
2,CS202210140_473,Deep-layer near-projecting,CCN202210140_SUPC,,,0,2,supercluster,#EE1010,18856,Deep-layer near-projecting,CS202210140_84,cluster
3,CS202210140_474,Deep-layer corticothalamic and 6b,CCN202210140_SUPC,,,0,3,supercluster,#5EF0F1,78396,Deep-layer corticothalamic and 6b,CS202210140_85,cluster
4,CS202210140_484,MGE interneuron,CCN202210140_SUPC,,,0,4,supercluster,#DFFE66,222434,MGE interneuron,CS202210140_237,cluster
...,...,...,...,...,...,...,...,...,...,...,...,...,...
488,CS202210140_426,Splat_425,CCN202210140_CLUS,CS202210140_483,CCN202210140_SUPC,1,87,cluster,#89ACAB,4253,Splatter (cluster 425),CS202210140_680,subcluster
489,CS202210140_429,Splat_428,CCN202210140_CLUS,CS202210140_483,CCN202210140_SUPC,1,88,cluster,#83AD88,3182,Splatter (cluster 428),CS202210140_1091,subcluster
490,CS202210140_430,Splat_429,CCN202210140_CLUS,CS202210140_483,CCN202210140_SUPC,1,89,cluster,#3DD6D8,9792,Splatter (cluster 429),CS202210140_2001,subcluster
491,CS202210140_432,Splat_431,CCN202210140_CLUS,CS202210140_483,CCN202210140_SUPC,1,90,cluster,#EFC7A3,2691,Splatter (cluster 431),CS202210140_1306,subcluster


Read in the cluster-to-annotation-term membership table, pivot it into one row per cluster with one column of term names per term set, and sort it

In [12]:
membership = abc_cache.get_metadata_dataframe(
    directory='WHB-taxonomy',
    file_name='cluster_to_cluster_annotation_membership',
)

# One row per cluster alias, one column per term set, each cell holding a term name.
term_name_by_set = (
    membership
    .groupby(['cluster_alias', 'cluster_annotation_term_set_name'])['cluster_annotation_term_name']
    .first()
    .unstack()
)

pivot = (
    term_name_by_set[term_sets['name']]  # order columns
    .fillna('Other')
    .sort_values(['supercluster', 'cluster', 'subcluster'])
)

# Assigning a plain list replaces the column index with an unnamed one.
cols = pivot.columns.to_list()
pivot.columns = cols
pivot

Unnamed: 0_level_0,subcluster,cluster,supercluster,neurotransmitter
cluster_alias,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2461,Amex_153_2461,Amex_153,Amygdala excitatory,VGLUT1 VGLUT2
2462,Amex_153_2462,Amex_153,Amygdala excitatory,VGLUT1 VGLUT2
2463,Amex_153_2463,Amex_153,Amygdala excitatory,VGLUT1 VGLUT2
2464,Amex_153_2464,Amex_153,Amygdala excitatory,VGLUT1 VGLUT2
2465,Amex_153_2465,Amex_153,Amygdala excitatory,VGLUT1 VGLUT2
...,...,...,...,...
3223,Vsmc_20_3223,Vsmc_20,Vascular,Other
3224,Vsmc_20_3224,Vsmc_20,Vascular,Other
3225,Vsmc_20_3225,Vsmc_20,Vascular,Other
3226,Vsmc_20_3226,Vsmc_20,Vascular,Other


Create a lookup dataframe, indexed by term name, for each term set

In [13]:
# Map each term-set name to that set's rows of `term`, indexed by term name.
lookup = {
    tag: term[term['cluster_annotation_term_set_name'] == tag].set_index('name')
    for tag in term_sets['name']
}

Helper functions to look up a term attribute and format a cell in the HTML table

In [14]:
def get_value( c, n, v ) :
    """Return attribute ``v`` of the term named ``n`` in term set ``c``.

    Reads the module-level ``lookup`` dict, which maps a term-set name to a
    dataframe indexed by term name.
    """
    term_row = lookup[c].loc[n]
    return term_row[v]

def format_cell (df, c, add_id=False, add_plus=False, add_minus=False) :
    """Build the HTML snippet for every cell of column ``c`` in ``df``.

    Each snippet is the concatenation, in this order, of: an optional anchor
    ``div`` carrying the term label as its id, a coloured circle, the term
    name, an optional ``[+]`` link to the term's first child, and an optional
    ``[-]`` link to the term on its own term-set page.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose column ``c`` holds term names.
    c : str
        Column / term-set name; also the key passed to ``get_value``.
    add_id, add_plus, add_minus : bool
        Switch the anchor, ``[+]`` link and ``[-]`` link on.

    Returns
    -------
    pandas.Series
        One HTML string per row of ``df``, on ``df``'s index.
    """
    term_names = df[c]

    def attribute(field):
        # One looked-up term attribute per row, in row order.
        return [get_value(c, term_name, field) for term_name in term_names]

    parts = pd.DataFrame(index=df.index)

    # Anchor that links from other pages can jump to.
    if add_id:
        parts['id'] = ['<div id="%s"></div>' % label for label in attribute('label')]
    else:
        parts['id'] = ''

    parts['circle'] = [
        '<div class="circle" style="background-color:%s"></div>' % colour
        for colour in attribute('color_hex_triplet')
    ]
    parts['name'] = ['<div class="celltext">%s</div>' % term_name for term_name in term_names]

    # [+] goes down one level: to the first child's row on the child term-set page.
    if add_plus:
        child_targets = zip(attribute('first_child_term_set_name'), attribute('first_child_label'))
        parts['plus'] = [
            '<div class="celltext"><a href="%s.html#%s">[+]</a></div>' % (page, anchor)
            for page, anchor in child_targets
        ]
    else:
        parts['plus'] = ''

    # [-] goes to this term's own row on its term-set page.
    if add_minus:
        own_targets = zip(attribute('cluster_annotation_term_set_name'), attribute('label'))
        parts['minus'] = [
            '<div class="celltext"><a href="%s.html#%s">[-]</a></div>' % (page, anchor)
            for page, anchor in own_targets
        ]
    else:
        parts['minus'] = ''

    ordered = parts[['id', 'circle', 'name', 'plus', 'minus']]
    return ordered.apply(lambda row: ''.join(row.values.astype(str)), axis=1)


Helper function to create html document

In [15]:
def create_html (df, ts, file, title) :
    """Render a term table as a standalone HTML page and write it to ``file``.

    Every column of ``df`` whose name is a term-set name (taken from the
    module-level ``term_sets['name']``) is replaced by the HTML produced by
    ``format_cell``. The flags passed to ``format_cell`` depend on whether the
    column is the page's own term set ``ts``:

    * ``add_id``    - only for the ``ts`` column.
    * ``add_plus``  - only for the ``ts`` column, and not for 'subcluster' or
      'neurotransmitter'.
    * ``add_minus`` - only for columns other than ``ts``, and never for
      'neurotransmitter'.

    Parameters
    ----------
    df : pandas.DataFrame
        Table of term names, one column per term set to display.
    ts : str
        Name of the term set this page is about.
    file : str or path-like
        Destination path; opened in text write mode (an existing file is
        overwritten).
    title : str
        Text placed in the page's ``<title>`` element.
    """
    # apply formatter to each term set
    df_formatted = df.copy()

    for tag in term_sets['name']:
        if tag not in df_formatted.columns:
            continue

        is_page_term_set = (tag == ts)
        add_id = is_page_term_set
        add_plus = is_page_term_set and tag not in ('subcluster', 'neurotransmitter')
        # Plain equality: `tag not in ('neurotransmitter')` tested against a str,
        # i.e. a substring check, not membership in a one-element tuple.
        add_minus = (not is_page_term_set) and tag != 'neurotransmitter'

        df_formatted[tag] = format_cell(df, tag, add_id, add_plus, add_minus)

    output = df_formatted.to_html(index=False, na_rep='',
                        render_links=True,escape=False,
                        classes="mystyle")

    # Title and table are substituted in a single pass so that braces in the title
    # are never re-interpreted as format fields. No stray text follows </html>.
    html_string = '''
    <html>
    <head><title>{title}</title></head>
    <link rel="stylesheet" type="text/css" href="../../simple_style.css"/>
    <body>
    {table}
    </body>
    </html>
    '''

    # OUTPUT AN HTML FILE
    with open(file, 'w') as f:
        f.write(html_string.format(title=title, table=output))

In [16]:
# The pages are written under the _static directory of abc_atlas_access, in a
# per-release sub-directory, so that links work properly in the jupyter-book/sphinx page.
output_directory = os.path.join('../../_static', 'WHB-taxonomy', version)
# exist_ok=True makes the cell safe to re-run.
os.makedirs( output_directory, exist_ok=True)

In [17]:
# Supercluster page: one row per distinct supercluster.
df_supercluster = pivot[['supercluster']].drop_duplicates()

file = os.path.join(output_directory, 'supercluster.html')
title = 'WHB-taxonomy: cell type superclusters'
create_html(df_supercluster, 'supercluster', file, title)
print(len(df_supercluster))

31


In [18]:
# Cluster page: one row per distinct (supercluster, cluster) pair.
df_cluster = pivot[['supercluster', 'cluster']].drop_duplicates()

file = os.path.join(output_directory, 'cluster.html')
title = 'WHB-taxonomy: cell type clusters'
create_html(df_cluster, 'cluster', file, title)
print(len(df_cluster))

461


In [19]:
# Subcluster page: the full hierarchy plus the neurotransmitter assignment.
df_subcluster = pivot[['supercluster', 'cluster', 'subcluster', 'neurotransmitter']].drop_duplicates()

file = os.path.join(output_directory,'subcluster.html')
title = 'WHB-taxonomy: cell type subclusters'
create_html(df_subcluster, 'subcluster', file, title)
print(len(df_subcluster))

3313


In [20]:
# Neurotransmitter page: one row per distinct neurotransmitter type (including 'Other').
df_neurotransmitter = pivot[['neurotransmitter']].drop_duplicates()

file = os.path.join(output_directory, 'neurotransmitter.html')
title = 'WHB-taxonomy: neurotransmitter types'
create_html(df_neurotransmitter, 'neurotransmitter', file, title)
print(len(df_neurotransmitter))

20
