In [1]:
import os
import pandas as pd
import numpy as np
import anndata
import time
import matplotlib.pyplot as plt
import json
import requests

In [2]:
# Root directory under which the manifest's relative paths are resolved.
download_base = '../../../abc_download_root'

# Release manifest: lists, per dataset, the metadata files and their relative paths.
url = 'https://allen-brain-cell-atlas.s3-us-west-2.amazonaws.com/releases/20230630/manifest.json'
# A timeout keeps "Run All" from hanging forever on a stalled connection.
response = requests.get(url, timeout=60)
# Stop here with a clear HTTP error rather than a confusing JSON/KeyError below.
response.raise_for_status()
manifest = response.json()

taxonomy_metadata = manifest['file_listing']['WMB-taxonomy']['metadata']

Read in cluster annotation term set dataframe

In [3]:
# Locate the term-set CSV under the download root and load it indexed by label.
csv_entry = taxonomy_metadata['cluster_annotation_term_set']['files']['csv']
rpath = csv_entry['relative_path']
file = os.path.join(download_base, rpath)
term_sets = pd.read_csv(file).set_index('label')
term_sets

Read in cluster annotation term dataframe. Spike in a row to handle unassigned neurotransmitter clusters.

In [4]:
csv_entry = taxonomy_metadata['cluster_annotation_term']['files']['csv']
rpath = csv_entry['relative_path']
file = os.path.join(download_base, rpath)
term = pd.read_csv(file)

# Extra 'Other' term in the neurotransmitter term set; every column not listed
# here is left unset for this row.
other_values = {
    'name': 'Other',
    'cluster_annotation_term_set_name': 'neurotransmitter',
    'color_hex_triplet': '#ebebeb',
    'term_set_order': 0,
    'term_order': 9,
}
other_term = pd.DataFrame(columns=term.columns)
for column, value in other_values.items():
    other_term.loc[0, column] = value
term = pd.concat([term, other_term], ignore_index=True)

# Order terms by term set first, then by their position within the set.
term = term.sort_values(['term_set_order', 'term_order'])

Find and store first child for each term

In [5]:
# Preview the first five rows of the sorted term table.
term.head()

In [6]:
# Only terms that have a parent can be somebody's child.
filtered = term.dropna(subset=['parent_term_label'])
# Per parent, take the first entry of each column; because `term` is already
# sorted by term_set_order/term_order, that is the lowest-ordered child.
child_columns = ['label', 'name', 'term_order', 'cluster_annotation_term_set_name']
first_child = filtered.groupby('parent_term_label')[child_columns].first()
first_child

In [7]:
# Temporarily index by label so each parent row can be addressed directly,
# then copy its first child's label and term-set name onto it.
term = term.set_index('label')
parent_labels = first_child.index
column_pairs = (
    ('first_child_label', 'label'),
    ('first_child_term_set_name', 'cluster_annotation_term_set_name'),
)
for target, source in column_pairs:
    term.loc[parent_labels, target] = first_child[source]
term = term.reset_index()

In [8]:
# Preview the first five terms that received a first-child label.
term.dropna(subset=['first_child_label']).head()

Read in the cluster annotation term name pivot table and sort it

In [9]:
csv_entry = taxonomy_metadata['cluster_to_cluster_annotation_membership_pivoted']['files']['csv']
rpath = csv_entry['relative_path']
file = os.path.join(download_base, rpath)
# Missing values in any column are replaced by the literal 'Other'.
df = pd.read_csv(file).fillna('Other')
sort_keys = ['class', 'subclass', 'supertype', 'cluster']
df_sorted = df.sort_values(by=sort_keys)
df_sorted

Create a lookup dataframe, indexed by term name, for each term set

In [10]:
# Map each term-set name to the rows of `term` in that set, indexed by term name.
lookup = {}
for tag in term_sets['name']:
    in_set = term['cluster_annotation_term_set_name'] == tag
    lookup[tag] = term.loc[in_set].set_index('name')

Helper functions to look up a term attribute and format a cell in the HTML table

In [11]:
def get_value( c, n, v ) :
    """Return attribute ``v`` of the term named ``n`` in term set ``c``.

    Reads the notebook-level ``lookup`` dict, whose frames are indexed by
    term name. Raises ``KeyError`` if the term set, name or attribute is unknown.
    """
    term_table = lookup[c]
    term_row = term_table.loc[n]
    return term_row[v]

def format_cell (df,c,add_id=False,add_plus=False,add_minus=False) :
    """Build the HTML fragment for every cell of column ``c``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose column ``c`` holds term names.
    c : str
        Column / term-set name; also the key used for attribute lookups.
    add_id : bool
        Prepend an empty ``<div>`` whose id is the term's label.
    add_plus : bool
        Append a ``[+]`` link to ``<first child term set>.html#<first child label>``.
    add_minus : bool
        Append a ``[-]`` link to ``<term set>.html#<label>`` of the term itself.

    Returns
    -------
    pandas.Series
        One string per row of ``df``: id, colour circle, name, plus and minus
        fragments concatenated in that order.
    """
    names = df[c]

    def fill(template, *attributes):
        # One fragment per row, substituting the looked-up attributes of each term.
        return [template % tuple(get_value(c, term_name, attr) for attr in attributes)
                for term_name in names]

    # Colour swatch and visible name are always present.
    circle = fill('<div class="circle" style="background-color:%s"></div>',
                  'color_hex_triplet')
    text = ['<div class="celltext">%s</div>' % term_name for term_name in names]

    # Optional pieces default to an empty string for every row.
    anchor = fill('<div id="%s"></div>', 'label') if add_id else ''
    plus = fill('<div class="celltext"><a href="%s.html#%s">[+]</a></div>',
                'first_child_term_set_name', 'first_child_label') if add_plus else ''
    minus = fill('<div class="celltext"><a href="%s.html#%s">[-]</a></div>',
                 'cluster_annotation_term_set_name', 'label') if add_minus else ''

    # Column order here is the order in which the fragments are concatenated.
    divs = pd.DataFrame({'id': anchor, 'circle': circle, 'name': text,
                         'plus': plus, 'minus': minus},
                        index=df.index)
    return divs.apply(lambda row: ''.join(row.values.astype(str)), axis=1)



Helper function to create an HTML document

In [12]:
def create_html (df,ts,file,title) :
    """Render a term table as a standalone HTML page and write it to ``file``.

    Every column of ``df`` whose name appears in the notebook-level
    ``term_sets['name']`` is replaced by the markup from ``format_cell``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table of term names, one column per term set to display.
    ts : str
        Term set the page is about. Its column gets anchor ids and, unless it
        is 'cluster', 'neurotransmitter' or 'division', a [+] link. Every other
        column except 'neurotransmitter' gets a [-] link.
    file : str
        Path of the HTML file to write; an existing file is overwritten.
    title : str
        Text placed in the page's ``<title>`` element.
    """
    no_plus = ('cluster', 'neurotransmitter', 'division')
    # Trailing comma matters: without it this is a plain string and
    # `tag not in ...` silently becomes a substring test.
    no_minus = ('neurotransmitter',)

    # apply formatter to each term set
    df_formatted = df.copy()
    for tag in term_sets['name'] :
        if tag not in df_formatted.columns :
            continue
        is_page_set = (tag == ts)
        df_formatted[tag] = format_cell(
            df, tag,
            add_id=is_page_set,
            add_plus=is_page_set and tag not in no_plus,
            add_minus=(not is_page_set) and tag not in no_minus)

    output = df_formatted.to_html(index=False, na_rep='',
                        render_links=True,escape=False,
                        classes="mystyle")

    # Filled in a single pass so braces or percent signs in the title or the
    # table cannot be misread as placeholders.
    html_template = '''
    <html>
    <head>
    <meta charset="utf-8"/>
    <title>{title}</title>
    <link rel="stylesheet" type="text/css" href="../simple_style.css"/>
    </head>
    <body>
    {table}
    </body>
    </html>
    '''
    page = html_template.format(title=title, table=output)

    # OUTPUT AN HTML FILE
    # Explicit encoding so the result does not depend on the platform default
    # and matches the charset declared in the page.
    with open(file, 'w', encoding='utf-8') as f:
        f.write(page)

In [13]:
# Folder that receives the generated pages; reused as-is if it already exists.
output_directory = 'WMB-taxonomy'
os.makedirs(name=output_directory, exist_ok=True)

In [14]:
# One row per distinct class.
df_class = df_sorted[['class']].drop_duplicates()

file = os.path.join(output_directory,'class.html')
title = 'WMB-taxonomy: cell type classes'
create_html(df_class, 'class', file, title)
print(len(df_class))

In [15]:
# One row per distinct (class, subclass) combination.
df_subclass = df_sorted[['class','subclass']].drop_duplicates()

file = os.path.join(output_directory,'subclass.html')
title = 'WMB-taxonomy: cell type subclasses'
create_html(df_subclass, 'subclass', file, title)
print(len(df_subclass))

In [16]:
# One row per distinct (class, subclass, supertype) combination.
df_supertype = df_sorted[['class','subclass','supertype']].drop_duplicates()

file = os.path.join(output_directory,'supertype.html')
title = 'WMB-taxonomy: cell type supertypes'
create_html(df_supertype, 'supertype', file, title)
print(len(df_supertype))

In [17]:
# One row per distinct cluster, with its ancestors and neurotransmitter.
cluster_columns = ['class','subclass','supertype','cluster','neurotransmitter']
df_cluster = df_sorted[cluster_columns].drop_duplicates()

file = os.path.join(output_directory,'cluster.html')
title = 'WMB-taxonomy: cell type clusters'
create_html(df_cluster, 'cluster', file, title)
# Report the number of rows actually written, as the other page cells do.
print(len(df_cluster))

In [18]:
# One row per distinct division.
df_division = df_sorted[['division']].drop_duplicates()

file = os.path.join(output_directory,'division.html')
title = 'WMB-taxonomy: cell type divisions'
create_html(df_division, 'division', file, title)
print(len(df_division))

In [19]:
# One row per distinct neurotransmitter type.
df_neurotransmitter = df_sorted[['neurotransmitter']].drop_duplicates()

file = os.path.join(output_directory,'neurotransmitter.html')
title = 'WMB-taxonomy: neurotransmitter types'
create_html(df_neurotransmitter, 'neurotransmitter', file, title)
print(len(df_neurotransmitter))