In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import abc_load as abc

In [3]:
import anndata as ad
import numpy as np
import pandas as pd

In [4]:
# Read the Prong 1 annotation spreadsheet (path is relative to this notebook)
prong1_cluster_annotations_file = '../code/resources/Prong 1 Vitessce links by nucleus.csv'
prong1_df = pd.read_csv(filepath_or_buffer=prong1_cluster_annotations_file)

In [5]:
# Preview the first five rows of the freshly loaded table
prong1_df.head(n=5)

In [6]:
# Cluster-to-annotation membership tables for three WMB taxonomy releases.
# The same file name lives under one directory per release date.
file = 'cluster_to_cluster_annotation_membership.csv'

memb_path_20230630 = abc.ABC_ROOT/f'metadata/WMB-taxonomy/20230630/{file}'
memb_path_20230830 = abc.ABC_ROOT/f'metadata/WMB-taxonomy/20230830/{file}'
memb_path_20231215 = abc.ABC_ROOT/f'metadata/WMB-taxonomy/20231215/{file}'

cluster_memb_df_20230630 = pd.read_csv(memb_path_20230630)
cluster_memb_df_20230830 = pd.read_csv(memb_path_20230830)
cluster_memb_df_20231215 = pd.read_csv(memb_path_20231215)

# Taxonomy comparison

## How do the versions' cluster id #s compare?

In [7]:
# compare known mapping for anterior nuclei cluster ids
# 20230630:
#  - AD: 1095, 1096
#
# 20230830: 
#  - AD: 2613, 2614

In [8]:
# In the 20230830 release, the cluster-level row whose name contains '2613'
# is the one carrying the AD supertype name
cluster_memb_df_20230830.loc[
    lambda df: df['cluster_annotation_term_set_name'].eq('cluster')
    & df['cluster_annotation_term_name'].str.contains('2613')
]

In [9]:
# Same lookup against the 20230630 release: here '2613' does NOT land on
# the AD supertype
cluster_memb_df_20230630.loc[
    lambda df: df['cluster_annotation_term_set_name'].eq('cluster')
    & df['cluster_annotation_term_name'].str.contains('2613')
]

In [10]:
# However, looking the cluster up by cluster_alias (5020) instead of by id
# gives the same supertype name in v20230630 as in v20230830
cluster_memb_df_20230630.loc[
    lambda df: df['cluster_annotation_term_set_name'].eq('cluster')
    & df['cluster_alias'].eq(5020)
]

In [11]:
# The 20231215 release keeps cluster 2613 as it was in 20230830 - ids are stable
cluster_memb_df_20231215.loc[
    lambda df: df['cluster_annotation_term_set_name'].eq('cluster')
    & df['cluster_annotation_term_name'].str.contains('2613')
]

## What about supertypes?

In [12]:
# Supertype-level row for cluster_alias 5020 in the 20230630 release
# (supertype id #s also changed between 20230630 and 20230830)
cluster_memb_df_20230630.loc[
    lambda df: df['cluster_annotation_term_set_name'].eq('supertype')
    & df['cluster_alias'].eq(5020)
]

In [13]:
# ...and the supertype-level row for the same cluster_alias (5020) in the
# 20230830 release, for comparison of the supertype id #s
cluster_memb_df_20230830.loc[
    lambda df: df['cluster_annotation_term_set_name'].eq('supertype')
    & df['cluster_alias'].eq(5020)
]

# Update csv to include new taxonomy cluster ids + stable cluster aliases

## Rename columns to distinguish taxonomy versions

In [14]:
# Tag the original annotation columns with the taxonomy version they belong to
ait17_columns = dict([
    ('annotated subclasses', 'subclasses_AIT17'),
    ('annotated supertypes', 'supertypes_AIT17'),
    ('annotated clusters', 'cluster_ids_AIT17'),
])

prong1_df = prong1_df.rename(ait17_columns, axis='columns')

## Map taxonomy versions via stable cluster_alias

In [15]:
old_cl_df = cluster_memb_df_20230630
new_cl_df = cluster_memb_df_20230830
cl_annot_col = 'cluster_annotation_term_name'
cl_set_col = 'cluster_annotation_term_set_name'

# The same cluster_alias is listed under several term sets ('cluster',
# 'supertype', ...), so restrict both lookups to cluster-level rows. Without
# this, a prefix match or an alias match can land on a row from another term
# set and "take the first hit" silently depends on the file's row order.
old_clusters = old_cl_df[old_cl_df[cl_set_col] == 'cluster']
new_clusters = new_cl_df[new_cl_df[cl_set_col] == 'cluster']


def _map_cluster_ids(cluster_ids):
    """Translate one comma-separated string of old cluster ids.

    Parameters
    ----------
    cluster_ids : str or NaN
        Comma-separated cluster ids taken from the 'cluster_ids_AIT17'
        column; each id is matched as a prefix of a 20230630 cluster name.

    Returns
    -------
    (list, list)
        The stable cluster_alias for each id, and the first four characters
        of the matching 20230830 cluster name (its new cluster id), both in
        input order. Both lists are empty when the value is missing.

    Raises
    ------
    KeyError
        If an id has no cluster-level match in the old table, or its
        cluster_alias has no cluster-level row in the new table.
    """
    if pd.isna(cluster_ids):
        return [], []

    aliases, new_ids = [], []
    # split on ',' and strip, so both '1095, 1096' and '1095,1096' work
    for cl_id in (part.strip() for part in cluster_ids.split(',')):
        if not cl_id:
            # an empty prefix would match every row
            continue

        # old id -> stable cluster_alias (20230630 annotation set)
        old_match = old_clusters.loc[
            old_clusters[cl_annot_col].str.startswith(cl_id, na=False),
            'cluster_alias',
        ]
        if old_match.empty:
            raise KeyError(f'no 20230630 cluster name starts with {cl_id!r}')
        cl_alias = old_match.iloc[0]

        # stable cluster_alias -> new cluster name (20230830 annotation set)
        new_match = new_clusters.loc[
            new_clusters['cluster_alias'] == cl_alias, cl_annot_col
        ]
        if new_match.empty:
            raise KeyError(
                f'cluster_alias {cl_alias!r} (old id {cl_id!r}) has no '
                f'20230830 cluster row'
            )

        # store plain Python scalars so the lists serialise to CSV the same
        # way regardless of the installed numpy version
        aliases.append(cl_alias.item() if hasattr(cl_alias, 'item') else cl_alias)
        # the new cluster id is the leading 4 characters of the term name
        # (same slice as before; the text after it is the label)
        new_ids.append(new_match.iloc[0][0:4])

    return aliases, new_ids


mapped_ids = [_map_cluster_ids(ids) for ids in prong1_df['cluster_ids_AIT17']]

# Assign whole columns at once. Element-wise chained assignment
# (df[col][row] = ...) raises SettingWithCopyWarning and does not write
# through to the frame when pandas Copy-on-Write is enabled.
prong1_df['cluster_alias'] = pd.Series(
    [aliases for aliases, _ in mapped_ids],
    index=prong1_df.index, dtype=object,
)
prong1_df['cluster_ids_CNN20230720'] = pd.Series(
    [new_ids for _, new_ids in mapped_ids],
    index=prong1_df.index, dtype=object,
)
# NOTE: supertype labels for the new taxonomy are not exported yet; they
# would come from the remainder of the new term name (characters 5 onward).

In [16]:
# Check the first five rows now that the alias / new-id columns are filled in
prong1_df.head(n=5)

## Save out to new csv

In [17]:
# Write the augmented table next to the original resource file.
# to_csv keeps its defaults here, so the DataFrame index is written as the
# first (unnamed) column.
new_prong1_file = '../code/resources/prong1_cluster_annotations_by_nucleus.csv'
prong1_df.to_csv(path_or_buf=new_prong1_file)

## demonstration of var=tuple not working with str.startswith

In [18]:
# prong1_df.head(5)

In [19]:
# test_clusters = prong1_df['annotated clusters'][0].split(',')
# test_clusters

In [20]:
# # cluster_memb_df_20230630[cluster_memb_df_20230630['cluster_annotation_term_name'].str.startswith(tuple(test_clusters))]#['cluster_alias']
# test_clusters = tuple(test_clusters)
# cluster_memb_df_20230630.query("cluster_annotation_term_name.str.startswith(@test_clusters, na=False)")['cluster_alias'].values[0]
# # cluster_memb_df_20230630[cluster_memb_df_20230630['cluster_annotation_term_name'].str.startswith(tuple(test_clusters), na=None)]

In [21]:
# cluster_memb_df_20230630[cluster_memb_df_20230630['cluster_annotation_term_name'].str.startswith(('1095','1096'), na=None)]

In [22]:
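# Unresolved: `str.startswith` does not match as expected when the tuple of prefixes is passed in via a variable (see the `.query(...)` attempt above).
# Possible contributing factor (unverified): if the ids are separated by ', ', then `split(',')` leaves a leading space on every id after the first, so those prefixes can never match.
# https://stackoverflow.com/questions/75595060/pandas-dataframe-query-series-str-startswith-tuple-returns-empty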