# Table 1. Thalamus + Zona Incerta subset of the CCN20230722 whole mouse brain taxonomy.

In [29]:
import pandas as pd

from IPython.display import display, HTML

import sys
sys.path.append('/code/')
from thalamus_merfish_analysis import abc_load as abc

from matplotlib import rcParams
# fonttype 42 asks matplotlib to embed fonts as Type 42 (TrueType) in PS/PDF
# output -- presumably so figure text stays editable in vector-graphics tools
rcParams['ps.fonttype'] = 42
rcParams['pdf.fonttype'] = 42
# default font size applied to every figure created in this notebook
rcParams['font.size'] = 7

import matplotlib.pyplot as plt
import seaborn as sns

get_ipython().run_line_magic('matplotlib', 'inline') 

## Load .obs data

In [2]:
# define path for saving outputs (a relative path, so it resolves against the
# notebook's working directory); the CSV, HTML and PDF files written in later
# cells are all placed in this directory
results_dir = '../../results'

In [3]:
# Metadata for the full dataset; it carries the taxonomy color columns
obs_wmb = abc.get_combined_metadata(drop_unused=False)

# Metadata (.obs) for the standard thalamus subset
obs_th = abc.load_standard_thalamus(data_structure='obs')

# Color column for each taxonomy level
color_cols = [
    'neurotransmitter_color',
    'class_color',
    'subclass_color',
    'supertype_color',
    'cluster_color',
]

# Attach the color columns to the thalamus cells by matching on the index
obs_th_color = obs_th.join(obs_wmb.loc[:, color_cols], how='left')

## Convert .obs dataframe to taxonomy table

In [4]:
# subset df to just taxonomy (+color) columns, keeping one row per unique cluster
taxonomy_levels = ['neurotransmitter','class', 'subclass', 'supertype', 'cluster']
obs_tax = obs_th_color.drop_duplicates(subset='cluster')[taxonomy_levels+color_cols]

# count the number of cells per unique cluster & add as column to the taxonomy df
# (observed=True restricts the groups to clusters that actually occur in the data)
cells_per_cluster_counts = obs_th_color.groupby('cluster',observed=True).size().reset_index(name='num_cells_in_cluster')
obs_tax = obs_tax.merge(cells_per_cluster_counts, on='cluster')

# clean up the taxonomy columns for sorting
# NOTE(review): whether a full-column `.loc[:, col] = ...` assignment writes into
# the existing column or replaces it (and so which dtype/categories survive) may
# depend on the pandas version -- later cells call `.cat` on these columns, so
# confirm they are still categorical after this loop.
for col in taxonomy_levels:
    # only keep categories that are present in the TH+ZI dataset
    obs_tax.loc[:,col] = obs_tax.loc[:,col].cat.remove_unused_categories()
    # remove leading/trailing whitespace
    obs_tax.loc[:,col] = obs_tax.loc[:,col].str.strip()
    # extract the ID # b/c lexically sorting the str object doesn't yield correct order.
    # The pattern is a raw string so `\d` reaches the regex engine untouched (a
    # plain '\d' is an invalid string escape); it captures the first run of digits
    # in the label. Cast to float (not int) so a label without digits can be NaN.
    obs_tax.loc[:,col+'_id'] = obs_tax.loc[:,col].str.extract(r'(\d+)', expand=False).astype(float)

# sort by ID #s, from the coarsest taxonomy level down to cluster
num_cols = [x+'_id' for x in taxonomy_levels]
taxonomy_df = obs_tax.sort_values(by=num_cols, ascending=True, ignore_index=True)
# clean up the ID columns (they were only needed as sort keys)
taxonomy_df = taxonomy_df.drop(columns=num_cols)

display(taxonomy_df.head(5))

In [5]:
# Number of distinct supertypes found under each subclass
tax_result = (
    taxonomy_df
    .groupby('subclass')['supertype']
    .nunique()
    .reset_index(name='unique_supertype_count')
)
tax_result

In [6]:
# Write the sorted taxonomy table to the results directory, without the row index
csv_path = f'{results_dir}/thalamus_zi_taxonomy_CCN20230722.csv'
taxonomy_df.to_csv(csv_path, index=False)

## Format taxonomy df as HTML table with colored markers

In [7]:
def create_abc_atlas_html_table(df):
    ''' Create an HTML table with the same custom formatting & dynamic color
    markers as the ABC Atlas taxonomy resource html page.

    See: https://alleninstitute.github.io/abc_atlas_access/_static/WMB-taxonomy/20230830/cluster.html#CS20230722_CLUS_0001

    Parameters
    ----------
    df : pd.DataFrame
        Table to render. Every column `X` that has a sibling column `X_color`
        is rendered as a colored circle followed by the cell text; the
        `*_color` columns themselves are not shown. All other columns are
        rendered as plain values.

    Returns
    -------
    str
        A `<style>` block followed by the HTML `<table>` markup, so the
        styling travels with the table when it is written to a file.

    Notes
    -----
    Cell text and color values are HTML-escaped before being inserted into the
    markup, so labels containing `&`, `<` or `>` cannot break the table.
    '''
    # imported locally so the function does not rely on a module-level import
    from html import escape

    # define the CSS style used by the ABC Atlas resource page
    # includes alternating gray and white with on-hover color
    styles = """
    <style>
    .mystyle {
        font-size: 11pt; 
        font-family: Arial;
        border-collapse: collapse; 
        border: 1px solid silver;
    }
    .mystyle td {
        padding: 5px;
    }
    .mystyle th {
        text-align: left;
        padding: 5px;
    }
    .mystyle tr:nth-child(even) {
        background: #E0E0E0;
    }
    .circle {
        background-color: blue;
        height: 20px;
        width: 20px;
        border-radius: 50%;
        display: inline-block;
    }
    .celltext {
        padding-left: 5px;
        display: inline-block;
    }
    </style>
    """

    def format_cell(value, color):
        ''' Set the circle color and text value for each cell using the defined 
        ABC Atlas taxonomy style.
        '''
        # the color lands inside a quoted attribute, so quotes are escaped too
        safe_color = escape(str(color))
        safe_text = escape(str(value), quote=False)
        return f'<span class="circle" style="background-color: {safe_color};"></span><span class="celltext">{safe_text}</span>'

    def escape_text(value):
        ''' Escape string cells; leave numbers and other values untouched so
        pandas formats them as usual.
        '''
        return escape(value, quote=False) if isinstance(value, str) else value

    # apply custom HTML formatting to each displayed column of the df
    formatted_columns = {}
    for col in df.columns:
        if col.endswith('_color'):
            # color columns only feed the markers; they are not shown themselves
            continue
        color_col = f'{col}_color'
        if color_col in df.columns:
            # add colored circle to each column that has a corresponding color
            # column; pairing the two columns with zip (instead of a row-wise
            # apply) is faster and also works when df has no rows
            formatted_columns[col] = pd.Series(
                [format_cell(value, color)
                 for value, color in zip(df[col], df[color_col])],
                index=df.index,
                dtype=object,
            )
        else:
            # otherwise, just the cell value as text
            formatted_columns[col] = df[col].map(escape_text)
    formatted_df = pd.DataFrame(formatted_columns)

    # convert df to html table via pandas; escape=False keeps the <span> markup
    # built above intact (the cell contents were already escaped individually)
    html_table = formatted_df.to_html(escape=False, classes='mystyle')

    # append style to top of HTML table so it's stored with the html file
    # (alternatively, could be saved to a separate .css file and linked to the html file)
    full_html = styles + html_table
    return full_html

In [8]:
# Render the taxonomy table as ABC-Atlas-styled HTML
html_table = create_abc_atlas_html_table(taxonomy_df)

# Store the rendered table in the results directory
html_path = f'{results_dir}/thalamus_zi_taxonomy_CCN20230722_table.html'
with open(html_path, 'w') as html_file:
    html_file.write(html_table)

## Display full taxonomy table in HTML format

In [9]:
# show the HTML table inline: wrapping the markup string in IPython's HTML
# object makes the notebook render it rather than print the raw text
display(HTML(html_table))

In [10]:
# Restrict the taxonomy to clusters that contain at least 50 cells
has_enough_cells = taxonomy_df['num_cells_in_cluster'].ge(50)
taxonomy_df_fig1 = taxonomy_df.loc[has_enough_cells].copy()

# For every taxonomy level, drop the categories that no longer occur in this subset
for level in taxonomy_levels:
    pruned_level = taxonomy_df_fig1.loc[:, level].cat.remove_unused_categories()
    taxonomy_df_fig1.loc[:, level] = pruned_level

taxonomy_df_fig1

In [11]:
# Show the subclasses left in the >=50-cell subset, first as a plain list and
# then as returned by pandas
fig1_subclass_values = taxonomy_df_fig1['subclass'].unique()
display(list(fig1_subclass_values))
fig1_subclass_values

In [12]:
# Number of distinct supertypes per subclass within the >=50-cell subset
result = (
    taxonomy_df_fig1
    .groupby('subclass')['supertype']
    .nunique()
    .reset_index(name='unique_supertype_count')
)
result

In [13]:
# Subclasses present in the >=50-cell subset, in order of first appearance
fig1_subclasses = [*taxonomy_df_fig1['subclass'].unique()]
fig1_subclasses

In [14]:
# Take every row of the full taxonomy whose subclass is one of the fig. 1 subclasses
in_fig1_subclass = taxonomy_df['subclass'].isin(fig1_subclasses)
taxonomy_df_fig1_subclasses = taxonomy_df.loc[in_fig1_subclass].copy()

# For every taxonomy level, drop the categories that no longer occur in this subset
for level in taxonomy_levels:
    pruned_level = taxonomy_df_fig1_subclasses.loc[:, level].cat.remove_unused_categories()
    taxonomy_df_fig1_subclasses.loc[:, level] = pruned_level

In [34]:
# preview the first 20 rows of the fig. 1 subclass subset
taxonomy_df_fig1_subclasses.head(20)

In [36]:
# supertype -> supertype color lookup; a supertype listed on several rows keeps
# the color from its last row
dict(
    taxonomy_df_fig1_subclasses[['supertype', 'supertype_color']]
    .itertuples(index=False, name=None)
)

In [16]:
# Supertypes in the fig. 1 subclass subset, in order of first appearance
[*taxonomy_df_fig1_subclasses['supertype'].unique()]

In [17]:
# Number of distinct clusters that belong to each supertype
unique_cluster_df = (
    taxonomy_df_fig1_subclasses
    .groupby('supertype')['cluster']
    .nunique()
    .reset_index(name='unique_cluster_count')
)
unique_cluster_df

In [18]:
# Order in which the supertypes first appear in the sorted taxonomy table
supertype_order = [*taxonomy_df_fig1_subclasses['supertype'].unique()]

# Re-encode the supertype column as an ordered categorical following that order,
# so that sorting on it reproduces the taxonomy order rather than the label order
ordered_supertypes = pd.Categorical(
    unique_cluster_df['supertype'],
    categories=supertype_order,
    ordered=True,
)
unique_cluster_df['supertype'] = ordered_supertypes
unique_cluster_df = unique_cluster_df.sort_values('supertype', ignore_index=True)
unique_cluster_df

In [31]:
# Bar chart of the number of clusters per supertype, in taxonomy order
fig, ax = plt.subplots(figsize=(6.3, 1))
ax.bar(
    unique_cluster_df['supertype'],
    unique_cluster_df['unique_cluster_count'],
    color='darkgrey',
)
# turn the supertype labels vertical
plt.xticks(rotation=90)
# flip the y axis so the bars extend downward from the top of the axes
ax.invert_yaxis()

# Export as a PDF with a transparent background and a tight bounding box
fig_path = f'{results_dir}/fig1A_cluster_count_hist.pdf'
fig.savefig(fig_path, transparent=True, bbox_inches='tight')