Strategy:
1. change any 'auto:'s and NAs to empty strings
2. drop duplicate rows from the typing columns
3. identify rows where no types are repeated within their column unless the only repeats are where one column is '' (i.e. keep 1:1 matches)
4. identify which of these are differently named across datasets
5. map to FBbt using symbols
6. make rows into len > 1 sets of FBbt IDs
7. make a template to add bidirectional 'may be identical to' annotations referencing the BANC paper

In [None]:
import pandas as pd
import re

In [None]:
# files 4, 5, 6 have cross-ds typing; this cell loads one of them together
# with the symbol -> FBbt ID lookup table

# Read every column as text and keep empty / 'NA' cells as literal strings.
read_options = {'dtype': 'str', 'na_filter': False}

info = pd.read_csv('supplements/supplemental_data_6.csv', **read_options)
symbols = pd.read_csv('../../../ontology/neuron_symbols.tsv', sep='\t', **read_options)
symbol_dict = dict(zip(symbols['symbol'], symbols['FBbt_id']))

In [None]:
# 1. NAs and auto:s to empty string
def _blank_placeholder(value):
    """Return `value` with placeholder content removed.

    Everything from 'auto:' to the end of the line is stripped, and a cell
    that is then exactly 'NA' becomes ''.  'NA' is only matched as a whole
    cell: removing it as a substring would corrupt any value that merely
    contains those two letters.
    """
    cleaned = re.sub('auto[:].*', '', value)
    return '' if cleaned == 'NA' else cleaned


info = info.map(_blank_placeholder)

In [None]:
# 2. drop duplicate rows from the typing columns
typing_columns = ['cell_type', 'fafb_cell_type', 'manc_cell_type']
unique_type_combos = info.loc[:, typing_columns].drop_duplicates()

In [None]:
# 3. check for repeats (keep only 1:1 matches)
def unique_match_check(df, col):
    """Find values in `col` whose rows agree on the other columns of `df`.

    A value is kept when, across all rows holding it, every other column
    contains no more than one distinct non-'' value.  The empty string stands
    for "no type assigned" rather than a type of its own, so it is always kept
    when present: otherwise the unrelated rows that share a blank in `col`
    would fail the check, and filtering on the result would discard every
    row that has a blank in this column.

    Args:
        df: DataFrame of string type names, with '' for missing entries.
        col: Name of the column in `df` whose values are checked.

    Returns:
        List of kept values from `col`, in order of first appearance.
    """
    other_cols = df.columns.drop(col)
    keep_types = []
    for value in df[col].unique():
        if value == '':
            # A blank is not a type, so there is nothing to be ambiguous about.
            keep_types.append(value)
            continue
        subset = df[df[col] == value]
        is_unambiguous = all(
            len(subset.loc[subset[other] != '', other].unique()) <= 1
            for other in other_cols
        )
        if is_unambiguous:
            keep_types.append(value)
    return keep_types

In [None]:
# `cell_type` values that pass the 1:1 check against the other typing columns
keep_cell_types = unique_match_check(unique_type_combos, 'cell_type')

In [None]:
# `fafb_cell_type` values that pass the 1:1 check against the other typing columns
keep_fafb_cell_types = unique_match_check(unique_type_combos, 'fafb_cell_type')

In [None]:
# `manc_cell_type` values that pass the 1:1 check against the other typing columns
keep_manc_cell_types = unique_match_check(unique_type_combos, 'manc_cell_type')

In [None]:
# A row survives only if its value in each typing column passed that
# column's check.
cell_type_ok = unique_type_combos['cell_type'].isin(keep_cell_types)
fafb_type_ok = unique_type_combos['fafb_cell_type'].isin(keep_fafb_cell_types)
manc_type_ok = unique_type_combos['manc_cell_type'].isin(keep_manc_cell_types)

unique_matches = unique_type_combos[cell_type_ok & fafb_type_ok & manc_type_ok]

In [None]:
# 4. only keep rows where names are different
# (a row is discarded only when all three typing columns hold the same string)
cell_equals_fafb = unique_matches['cell_type'].eq(unique_matches['fafb_cell_type'])
fafb_equals_manc = unique_matches['fafb_cell_type'].eq(unique_matches['manc_cell_type'])
mask = cell_equals_fafb & fafb_equals_manc

unique_matches = unique_matches.loc[~mask].reset_index(drop=True)

In [None]:
# 5. map symbols to FBbt IDs with one dictionary lookup per cell.
# Names without an entry in `symbol_dict` are left unchanged here; nothing is
# dropped at this step. Blank cells are never looked up, so an empty-symbol
# row in the symbols table cannot turn "no type" into an FBbt ID.
unique_matches_fbbt = unique_matches.map(
    lambda name: symbol_dict.get(name, name) if name else name
)

In [None]:
# 6. consolidate rows to sets, excluding any non-FBbt and drop len 1 sets
def row_consolidator(values):
    """Collect the entries of `values` that start with 'FBbt' into a set.

    Every other entry (e.g. an unmapped name or an empty string) is left out.
    """
    fbbt_ids = set()
    for entry in values:
        if entry.startswith('FBbt'):
            fbbt_ids.add(entry)
    return fbbt_ids

In [None]:
# One set of FBbt IDs per row of `unique_matches_fbbt`
mapping_sets = unique_matches_fbbt.apply(
    lambda row: row_consolidator(row.tolist()),
    axis=1,
).rename('id_sets')

In [None]:
# Keep only the sets that hold more than one FBbt ID
set_sizes = mapping_sets.apply(len)
real_mapping_sets = mapping_sets.loc[set_sizes > 1].reset_index(drop=True)

In [None]:
# Start the template with its single header row; data rows are appended later.
# NOTE(review): the header strings look like ROBOT template directives - confirm.
template = pd.DataFrame(
    [['ID', "AI IAO:0006011 SPLIT=|"]],
    columns=['ID', 'may_be_identical_to'],
)

In [None]:
# 7. make template
def template_row_maker(mapping_set):
    """Build the template rows for one set of mutually matching IDs.

    Each ID in `mapping_set` gets one row whose 'may_be_identical_to' cell
    lists every other ID in the set, joined with '|', so the annotation is
    stated in both directions.

    Args:
        mapping_set: Set (or other iterable) of ID strings.

    Returns:
        DataFrame with columns 'ID' and 'may_be_identical_to', one row per ID.
    """
    # Sets have no stable iteration order between runs; sorting keeps both
    # the row order and the '|'-joined lists reproducible.
    mapping_list = sorted(mapping_set)
    return pd.DataFrame({
        'ID': mapping_list,
        'may_be_identical_to': [
            '|'.join(other for other in mapping_list if other != current)
            for current in mapping_list
        ],
    })

In [None]:
# Build the rows for every ID set and append them to the header in a single
# concat; concatenating inside a loop re-copies the growing frame each pass.
template = pd.concat(
    [template, *(template_row_maker(id_set) for id_set in real_mapping_sets)],
    ignore_index=True,
)

Save mapping from each file, then concatenate:

In [None]:
# Keep the current `template` under a per-file name.
# NOTE(review): presumably the cells above are re-run with
# supplemental_data_4.csv loaded before this cell is executed - confirm.
sup_4_template = template

In [None]:
# Keep the current `template` under a per-file name.
# NOTE(review): presumably the cells above are re-run with
# supplemental_data_5.csv loaded before this cell is executed - confirm.
sup_5_template = template

In [None]:
# Keep the current `template` under a per-file name.
# NOTE(review): presumably this is run while supplemental_data_6.csv is the
# loaded file - confirm.
sup_6_template = template

In [None]:
# Stack the per-file templates and collapse identical rows. Each template
# begins with the same header row, so de-duplication leaves one copy of it
# at index 0.
per_file_templates = [sup_4_template, sup_5_template, sup_6_template]
complete_template = (
    pd.concat(per_file_templates, ignore_index=True)
    .drop_duplicates()
    .reset_index(drop=True)
)

In [None]:
# Add a 'ref' column: every row gets the paper reference, then row 0 is
# overwritten with the header string for that column.
paper_ref = 'doi:10.1101/2025.07.31.667571'
ref_header = '>A oboInOwl:hasDbXref'

complete_template = complete_template.assign(ref=paper_ref)
complete_template.loc[0, 'ref'] = ref_header

In [None]:
# Write the combined template as a tab-separated file, without the index column
complete_template.to_csv('template.tsv', sep='\t', index=False)