Notebook for generation of ontology terms from FlyWire data

In [None]:
import numpy as np
import pandas as pd

# The only reason we need this package is for the FlyWire neuropils:
# `pip3 install fafbseg` will do the trick.
from fafbseg import flywire

In [None]:
# Read the neuron annotation table (tab-separated). The file is available from
# https://github.com/flyconnectome/flywire_annotations/tree/main/supplemental_files
info = pd.read_csv("Supplemental_file1_neuron_annotations.tsv", sep="\t", low_memory=False)
# Duplicate the root IDs under the explicit `root_783` column name
# (presumably the 783 materialisation the IDs belong to - confirm).
info = info.assign(root_783=info["root_id"])

In [None]:
# Load per-neuropil synapse counts.
# The feather files can't be uploaded here. The same information can be found
# in the 'Neuropil Synapse Table' from https://codex.flywire.ai/api/download,
# but it needs reformatting first: the columns need to be
# ['pre_pt_root_id','neuropil','count'] for the presynapse table and
# ['post_pt_root_id','neuropil','count'] for the postsynapse table.
pre_counts = pd.read_feather("per_neuron_neuropilv5_filtered_count_pre_783.feather")
post_counts = pd.read_feather("per_neuron_neuropilv5_filtered_count_post_783.feather")

In [None]:
# Keep only rows whose neuron appears in the annotation table (less data to
# process in the following cells). `.copy()` detaches the result from the
# unfiltered frame so the column assignments below do not warn.
pre_counts = pre_counts.loc[pre_counts["pre_pt_root_id"].isin(info["root_id"])].copy()
post_counts = post_counts.loc[post_counts["post_pt_root_id"].isin(info["root_id"])].copy()

# Store neuropil names as categoricals - this makes later steps faster
for _counts in (pre_counts, post_counts):
    _counts["neuropil"] = _counts["neuropil"].astype("category")

In [None]:
# Annotate every row with the soma side of its neuron
# -> we need this to determine whether the neuropil is ipsi- or contralateral
_side_by_root = info.set_index("root_id")["side"]
pre_counts["soma_side"] = pre_counts["pre_pt_root_id"].map(_side_by_root).astype("category")
post_counts["soma_side"] = post_counts["post_pt_root_id"].map(_side_by_root).astype("category")

#pre_counts.head()

In [None]:
# Label each row's neuropil as ipsi- or contralateral relative to the soma
# side of the neuron the row belongs to.
pre_counts["np_side"] = None  # neuropils like the GNG will be neither!
post_counts["np_side"] = None

# (soma side, neuropil side marker, resulting label)
_laterality_rules = [
    ("left", "L", "ipsilateral"),
    ("right", "R", "ipsilateral"),
    ("left", "R", "contralateral"),
    ("right", "L", "contralateral"),
]
for _soma_side, _marker, _label in _laterality_rules:
    for _counts in (pre_counts, post_counts):
        _hits = (_counts.soma_side == _soma_side) & (
            _counts.neuropil.str.contains(f"_{_marker}", na=False)
        )
        _counts.loc[_hits, "np_side"] = _label
#pre_counts.head()

In [None]:
# Translate short neuropil names into full names and FBbt IDs
np_map = pd.read_csv('neuropil_map.tsv', sep='\t', dtype='str')


def _strip_side_markers(name):
    """Remove the `_L` / `_R` markers from a neuropil name."""
    return name.replace("_L", "").replace("_R", "")


pre_counts["neuropil_short"] = pre_counts.neuropil.apply(_strip_side_markers).astype("category")
post_counts["neuropil_short"] = post_counts.neuropil.apply(_strip_side_markers).astype("category")

# Left join, so rows whose short name is absent from the map are kept
_np_lookup = np_map[['neuropil_short', 'neuropil_full', 'NP_id']]
pre_counts = pre_counts.merge(_np_lookup, how='left', on='neuropil_short')
post_counts = post_counts.merge(_np_lookup, how='left', on='neuropil_short')
#post_counts.head()

In [None]:
# Check for unmapped neuropils in BOTH synapse tables
# (previously the postsynapse table was concatenated with itself, so the
# presynapse table was never checked).
np_fbbt = pd.concat(
    [pre_counts[['neuropil_short', 'NP_id']], post_counts[['neuropil_short', 'NP_id']]],
    ignore_index=True,
)

unmapped_nps = np_fbbt[np_fbbt['NP_id'].isna()]
# De-duplicate the full selection rather than only its first five rows, so
# every distinct unmapped neuropil is shown once.
unmapped_nps.drop_duplicates()

In [None]:
# Add a column that combines the full neuropil name with ipsi/contra,
# i.e. "<full name>_ipsilateral" or "<full name>_contralateral".
# Rows without a side keep the plain full name (as a string).
for _counts in (post_counts, pre_counts):
    _counts["neuropil_ipsi_contra"] = _counts.neuropil_full.astype(str)

for _side in ("ipsilateral", "contralateral"):
    for _counts in (post_counts, pre_counts):
        _on_side = _counts.np_side == _side
        _counts.loc[_on_side, "neuropil_ipsi_contra"] = _counts.loc[
            _on_side, "neuropil_full"
        ].map(lambda name: f"{name}_{_side}")

#post_counts.head()

In [None]:
# Lookup tables keyed by the short neuropil name
full_names = {short: full for short, full in zip(np_map['neuropil_short'], np_map['neuropil_full'])}
neuropil_ids = {short: np_id for short, np_id in zip(np_map['neuropil_short'], np_map['NP_id'])}
cbr_ids = {short: cbr_id for short, cbr_id in zip(np_map['neuropil_short'], np_map['CBR_id'])}

In [None]:
# check that type exists and get sub-dataframe from annotations file (to use in future functions)

def get_type_annotations(t):
    """Return the annotation rows belonging to cell type `t`.

    A row belongs to `t` when either its `cell_type` or its `hemibrain_type`
    equals `t`.

    Raises
    ------
    ValueError
        If no row of `info` matches `t`.
    """
    is_this_type = (info.cell_type == t) | (info.hemibrain_type == t)
    rows = info[is_this_type]

    if not rows.empty:
        return rows
    raise ValueError(f"Unknown cell type: {t}")

In [None]:
from scipy.spatial import KDTree

# NOTE(review): `get_neuropil_volumes(None)` presumably returns the names of
# all available neuropils, which are then passed back in to load the actual
# volumes - confirm against the fafbseg documentation.
all_neuropils = flywire.get_neuropil_volumes(None)
neuropils = flywire.get_neuropil_volumes(all_neuropils)
# One row per neuropil volume, taken from each volume's `center` attribute
neuropil_centers = np.vstack([n.center for n in neuropils])

# Spatial index over the neuropil centers; queried by `describe_position`
np_tree = KDTree(neuropil_centers)


def describe_position(t):
    """Describe soma position(s) relative to the closest neuropil.

    Parameters
    ----------
    t :             str
                    cell type name.

    Returns
    -------
    desc :          str
                    Description of the position (dorsal, ventral, etc.) relative
                    to the closest neuropil, or "near" if no axis dominates.
    vol_name :      str
                    Full name of the closest neuropil (the short name if the
                    neuropil map has no full name for it).
    cbr_id :        str
                    FBbt ID of the matching cell body rind; falls back to
                    'FBbt:00003625' ('adult brain cell body rind').

    Raises
    ------
    ValueError
                    If the type is unknown or none of its neurons has a
                    recorded soma position.

    """
    default_cbr = 'FBbt:00003625'

    this_type = get_type_annotations(t)
    # Extract soma positions
    this_type = this_type[this_type.soma_x.notnull()]
    if this_type.empty:
        raise ValueError(f"No recorded soma positions for cell type: {t}")
    # Scale the three axes by 4/4/40 (presumably voxels -> nm; confirm)
    pos = this_type[["soma_x", "soma_y", "soma_z"]].values * [4, 4, 40]
    _, ix = np_tree.query(pos, k=1)
    # Get the neuropil most often to be the closest
    vol = neuropils[np.bincount(ix).argmax()]

    # From now on use only the neurons on the side of the neuropil we're
    # using as reference.
    # N.B. that neuropil names have not been L/R swapped yet - i.e. the
    # volume called "LH_L" is actually the RHS LH
    is_left_named = vol.name.endswith("_L")
    is_right_named = vol.name.endswith("_R")
    if is_left_named:
        same_side = (this_type.side == "right").to_numpy()
    elif is_right_named:
        same_side = (this_type.side == "left").to_numpy()
    else:
        same_side = None
    # Only restrict when at least one soma remains: averaging an empty
    # selection yields NaN and would silently produce a bogus "near".
    if same_side is not None and same_side.any():
        pos = pos[same_side]

    # Find out if average `pos` is behind, in front, above, below or to the left or
    # the right of `vol.center`
    # We will only be using axes that represent at least 40% of the total
    # distance between soma and neuropil center. I.e. if a soma is really far
    # posterior to a neuropil, we won't care about whether it's also slightly to
    # the left or right of the neuropil.
    d = (pos - vol.center).mean(axis=0)
    total = np.abs(d).sum()
    d_frac = np.abs(d) / total if total > 0 else np.zeros_like(d, dtype=float)

    # Describe the position; the terms are joined with "-"
    terms = []
    if d_frac[2] >= 0.4:
        terms.append("posterior" if d[2] > 0 else "anterior")

    if d_frac[1] >= 0.4:
        terms.append("ventral" if d[1] > 0 else "dorsal")

    if d_frac[0] >= 0.4:
        # Which x direction counts as lateral depends on the side of the
        # reference neuropil (same "_L" test as used for the side filter).
        if (d[0] > 0) == is_left_named:
            terms.append("lateral")
        else:
            terms.append("medial")

    desc = "-".join(terms) if terms else "near"

    # Drop _L and _R suffixes from the neuropil name
    vol_name = vol.name.replace("_R", "").replace("_L", "")
    # A missing value in the map (read as NaN) also falls back to the default
    cbr_id = cbr_ids.get(vol_name, default_cbr)
    if not isinstance(cbr_id, str) or not cbr_id:
        cbr_id = default_cbr
    # return full neuropil names and FBbt ids for cell body rinds
    return desc, full_names.get(vol_name, vol_name), cbr_id

# Test
#describe_position("IB066")

In [None]:
def get_cbr_id(t):
    """Return the FBbt ID of the region containing the soma of cell type `t`.

    Returns
    -------
    str
        'FBbt:00005892' for sensory neurons (soma in the PNS), an empty string
        for ascending neurons (no suitable ID), the cell body rind reported by
        `describe_position` when soma positions are recorded, and
        'FBbt:00003625' otherwise.

    Raises
    ------
    ValueError
        If `t` is not a known cell type.
    """
    this_type = get_type_annotations(t)
    super_class = this_type.super_class.values[0]

    if super_class == "sensory":
        # soma in PNS
        return 'FBbt:00005892'
    if super_class == "ascending":
        # Ascending neurons could be sensory or from the VNC - no ID for this
        return ''
    # `describe_position` works on the soma_* columns and raises when they are
    # all missing, so that is the column to test here (not pos_x).
    if this_type.soma_x.notnull().any():
        _, _, cbr_id = describe_position(t)
        return cbr_id
    # Everything else will have a soma in the brain
    return 'FBbt:00003625'

# Test
#get_cbr_id('cL17')

In [None]:
def get_presynapses(t):
    """Return the neuropils that hold the bulk of the presynapses of type `t`.

    Counts are summed per (neuropil_full, np_side, NP_id), sorted in
    descending order and cut after the first row whose cumulative fraction
    exceeds 0.8. The result has a fresh 0..n-1 index and a `frac` column.

    NOTE(review): with pandas' default group-key NA handling, rows whose
    `np_side` or `NP_id` is missing are not part of the result - confirm that
    leaving out unsided neuropils is intended.

    Raises
    ------
    ValueError
        If the type is unknown, or its super class is sensory or ascending.
    """
    this_type = get_type_annotations(t)
    if this_type.super_class.values[0] in ("sensory", "ascending",):
        raise ValueError("Cell is wrong type to get presynapses")

    own_rows = pre_counts[pre_counts.pre_pt_root_id.isin(this_type.root_id)]
    per_neuropil = own_rows.groupby(
        ["neuropil_full", "np_side", "NP_id"], as_index=False
    )["count"].sum()
    pre = per_neuropil.sort_values("count", ascending=False)

    # Get >=80% of synapses (i.e. cut off the tail)
    pre["frac"] = pre["count"] / pre["count"].sum()
    n_within = (pre["frac"].cumsum() <= 0.8).sum()
    return pre.iloc[: n_within + 1].reset_index(drop=True)

#get_presynapses('CB0016')

In [None]:
def get_postsynapses(t):
    """Return the neuropils that hold the bulk of the postsynapses of type `t`.

    Counts are summed per (neuropil_full, np_side, NP_id), sorted in
    descending order and cut after the first row whose cumulative fraction
    exceeds 0.8. The result has a fresh 0..n-1 index and a `frac` column.

    NOTE(review): with pandas' default group-key NA handling, rows whose
    `np_side` or `NP_id` is missing are not part of the result - confirm that
    leaving out unsided neuropils is intended.

    Raises
    ------
    ValueError
        If the type is unknown, or its super class is motor.
    """
    this_type = get_type_annotations(t)
    if this_type.super_class.values[0] in ("motor",):
        raise ValueError("Cell is wrong type to get postsynapses")

    own_rows = post_counts[post_counts.post_pt_root_id.isin(this_type.root_id)]
    per_neuropil = own_rows.groupby(
        ["neuropil_full", "np_side", "NP_id"], as_index=False
    )["count"].sum()
    post = per_neuropil.sort_values("count", ascending=False)

    # Get >=80% of synapses (i.e. cut off the tail)
    post["frac"] = post["count"] / post["count"].sum()
    n_within = (post["frac"].cumsum() <= 0.8).sum()
    return post.iloc[: n_within + 1].reset_index(drop=True)

#get_postsynapses('CB0016')

In [None]:
# neurotransmitters - may want to remove from here as may be redundant with incorporation from NT paper
# Maps the transmitter names found in `top_nt` to GO term IDs.
nt_dict = {
    'acetylcholine': 'GO:0014055',
    'glutamate': 'GO:0061535',
    'gaba': 'GO:0061534',
    'serotonin': 'GO:0060096',
    'dopamine': 'GO:0061527',
    'octopamine': 'GO:0061540',
}


def get_neurotransmitter(t):
    """Return `(name, GO id)` of the transmitter of type `t`, or False.

    A transmitter is only reported when all non-missing `top_nt` values of the
    type agree. The GO id is '' when the name is not in `nt_dict`.
    """
    predicted = get_type_annotations(t).top_nt.dropna().unique()
    if len(predicted) != 1:
        return False
    nt_name = predicted[0]
    return (nt_name, nt_dict.get(nt_name, ''))

# Test
#get_neurotransmitter('DNxe001')

In [None]:
# FBbt mappings at other (coarser) annotation levels; every column is read as
# a string. `get_parent_ids` joins this table to the annotations on
# flow / super_class / cell_class / cell_sub_class and reads its `FBbt_id`.
superclasses = pd.read_csv('superclasses.tsv', sep='\t', dtype='str')
#superclasses.head()

In [None]:
# map super_class to FBbt
def get_parent_ids(t):
    """Return the FBbt parent class IDs for cell type `t`.

    The annotations of `t` are left-joined to `superclasses` on the four
    classification columns; the distinct non-missing `FBbt_id` values are
    returned. When there are none, the result is ['FBbt:00047095'].
    """
    classification = ['flow', 'super_class', 'cell_class', 'cell_sub_class']
    annotated = get_type_annotations(t).merge(superclasses, how='left', on=classification)
    parent_ids = annotated.FBbt_id.dropna().unique()
    if len(parent_ids) > 0:
        return parent_ids
    return ['FBbt:00047095'] # 'adult neuron'
#get_parent_ids('CB0927')

In [None]:
# Lineage mapping table, every column read as a string. `get_lineage` reads
# its `ito_lee_hemilineage` and `NB_id` columns.
lineage_map = pd.read_csv('lineage_map.tsv', sep='\t', dtype='str')
#lineage_map.head()

In [None]:
def get_lineage(t):
    """Return `(hemilineage, NB id)` for cell type `t`, or False.

    False is returned unless the non-missing `ito_lee_hemilineage` values of
    the type agree on exactly one name. The second element is '' unless
    `lineage_map` gives exactly one distinct `NB_id` for that name.
    """
    hemilineages = get_type_annotations(t).ito_lee_hemilineage.dropna().unique()
    if len(hemilineages) != 1:
        return False

    name = hemilineages[0]
    mapped = lineage_map[lineage_map['ito_lee_hemilineage'] == name]
    nb_ids = mapped.NB_id.dropna().unique()
    nb_id = nb_ids[0] if len(nb_ids) == 1 else ""
    return (name, nb_id)
#get_lineage('CB0007')

In [None]:
def _synapse_sentence(kind, sites):
    """Return "It has <kind> in the ... and the .... " for `sites` ('' if empty)."""
    if len(sites) == 0:
        return ""
    places = [
        f"the {side} {name}"
        for side, name in zip(sites["np_side"], sites["neuropil_full"])
    ]
    if len(places) == 1:
        listing = places[0]
    else:
        listing = ", ".join(places[:-1]) + " and " + places[-1]
    return f"It has {kind} in {listing}. "


def describe_cell_type(t):
    """Generate descriptor for given type `t`.

    The text covers, in this order: coarse neuron class and soma location,
    lineage (if unambiguous), postsynapse and presynapse neuropils,
    predicted neurotransmitter (if unambiguous) and the number of cells.

    Raises
    ------
    ValueError
        If `t` is not a known cell type.
    """
    this_type = get_type_annotations(t)
    super_class = this_type.super_class.values[0]

    # Describe coarse soma location
    if super_class == "sensory":
        # Sensory neurons are assumed to have somas in periphery
        soma_loc = "periphery"
    elif super_class == "ascending":
        # Ascending neurons could be sensory or from the VNC
        soma_loc = "ventral nerve cord or periphery"
    else:
        # Everything else will have a soma in the brain
        soma_loc = "brain"
        # `describe_position` works on the soma_* columns and raises when they
        # are all missing, so that is the column to test here (not pos_x).
        if this_type.soma_x.notnull().any():
            pos, vol, _ = describe_position(t)
            soma_loc += f", {pos} to the {vol}"

    # Coarse neuron type
    if super_class == "central":
        # `central` neurons will be intrinsic to the brain
        neuron_type = "brain-intrinsic"
    elif super_class == "optic":
        # `optic` neurons will be intrinsic to the optic lobes
        neuron_type = "optic-lobe-intrinsic"
    else:
        # for all other neurons, use the super class - e.g. "visual centrifugal"
        neuron_type = super_class.replace("_", " ")

    # Start building the description
    desc = f"Adult {neuron_type} neuron with its soma in the {soma_loc}. "

    # Check if we know anything about the lineage of this neuron type
    lin = get_lineage(t)
    if lin:
        if lin[0] not in ("primary", "putative_primary"):
            desc += f"It belongs to the {lin[0]} hemilineage. "
        else:
            desc += "It is a putative embryonic-born neuron. "

    # Describe synapse distribution.
    # `get_postsynapses` raises ValueError for motor neurons and
    # `get_presynapses` for sensory and ascending neurons; the respective
    # sentence is then simply left out.
    try:
        desc += _synapse_sentence("postsynapses", get_postsynapses(t))
    except ValueError:
        pass

    try:
        desc += _synapse_sentence("presynapses", get_presynapses(t))
    except ValueError:
        pass

    # Add neurotransmitter predictions
    # Potential TODO: incorporate _known_ transmitters
    nt = get_neurotransmitter(t)
    if nt:
        desc += f"Its predicted neurotransmitter is {nt[0]}. "

    # Add cell number (count rows)
    if len(this_type) > 1:
        desc += f"There are approximately {len(this_type)} of these cells per organism."
    elif len(this_type) == 1:
        desc += "There is approximately one of these cells per organism."

    return desc.strip()


#describe_cell_type("CB0715")

In [None]:
# make nice labels for visual and descending types
import re

# Optic lobe neuropil codes used in cell type names
optic_neuropil_mapping = dict([
    ('M', 'medulla'),
    ('LP', 'lobula plate'),
    ('L', 'lobula'),
])
# Columnar / tangential code
CT_mapping = dict([
    ('C', 'columnar'),
    ('T', 'tangential'),
])
# Region letter of descending neuron names -> phrase used in the label
dn_mapping = dict([
    ('a', 'of the anterior dorsal brain'),
    ('b', 'of the anterior ventral brain'),
    ('c', 'of the pars intercerebralis'),
    ('d', 'of the outside anterior cluster'),
    ('g', 'of the gnathal ganglion'),
    ('p', 'of the posterior brain'),
    ('x', 'outside of the brain'),
])

In [None]:
# function for visual projection neurons
def vpn_label(cell_type):
    """Build a readable label for a visual projection neuron type name.

    The name must start with one to three optic neuropil codes, followed by
    C or T, the letter "e" and a number (optionally with one lower-case
    letter). Returns None when the name does not have that form.
    """
    pattern = r'(([LM]{1}[P]?)([LM]?[P]?)([LM]?[P]?))([CT]{1})(e)(\d+[a-z]?)'
    match = re.match(pattern, cell_type)
    if match is None:
        return None

    np_codes = match.group(2, 3, 4)
    shape_code = match.group(5)
    number = match.group(7)
    # Empty codes come from the optional second / third neuropil slots
    neuropil = '-'.join(optic_neuropil_mapping.get(code) for code in np_codes if code)
    shape = CT_mapping.get(shape_code)
    return f"adult {neuropil} {shape} neuron e{number}"

#vpn_label('MLCe01')

In [None]:
# function for visual centrifugal neurons
def vcn_label(cell_type):
    """Build a readable label for a visual centrifugal neuron type name.

    The name must start with "c", followed by one to three optic neuropil
    codes and a number (optionally with one lower-case letter). Returns None
    when the name does not have that form.
    """
    pattern = r'(c)(([LM]{1}[P]?)([LM]?[P]?)([LM]?[P]?))(\d+[a-z]?)'
    match = re.match(pattern, cell_type)
    if match is None:
        return None

    np_codes = match.group(3, 4, 5)
    number = match.group(6)
    # Empty codes come from the optional second / third neuropil slots
    neuropils = '-'.join(optic_neuropil_mapping.get(code) for code in np_codes if code)
    return f"adult {neuropils} visual centrifugal neuron {number}"

#vcn_label('M12')

In [None]:
# function for descending neurons
def dn_label(cell_type):
    """Build a label for a descending neuron type name.

    Returns `(label, region letter)` for names of the form "DN", one region
    letter out of a/b/c/d/g/p/x, an optional "e" and digits; None otherwise.
    """
    match = re.match(r'(DN)([abcdgpx])[e]?(\d+)', cell_type)
    if not match:
        return None

    region = match.group(2)
    region_long = dn_mapping.get(region)
    return (f"descending neuron {region_long} {cell_type}", region)

#dn_label('DNge014')

In [None]:
def comma_replace(cell_type):
    """Return `cell_type` with every comma (and the spaces right after it) turned into '-'."""
    pieces = re.split(', *', cell_type)
    return '-'.join(pieces)

#comma_replace('LHPVc3, LHPVc4')

In [None]:
# FBbt IDs of the DN parent classes, keyed by the region letter of the DN name
dn_parent_mapping = dict([
    ('a', 'FBbt:00047512'),
    ('b', 'FBbt:00047513'),
    ('c', 'FBbt:00047514'),
    ('d', 'FBbt:00047515'),
    ('g', 'FBbt:00047516'),
    ('p', 'FBbt:00047517'),
    ('x', 'FBbt:00047518'),
])

In [None]:
# Load the FBbt IDs assigned to the cell types (every column as a string).
# Later cells read its cell_type, FBbt_id, creation_date and synonym columns.

fw_type_ids = pd.read_csv('FBbt_ID-cell_type.tsv', sep='\t', dtype='str')
#fw_type_ids.head()

In [None]:
# List the cell types of the ID table that have no rows in the annotations
def _is_annotated(type_name):
    """True when `get_type_annotations` finds the type."""
    try:
        get_type_annotations(type_name)
    except ValueError:
        return False
    return True


missing_types = [
    type_name
    for type_name in fw_type_ids.cell_type.dropna().unique()
    if not _is_annotated(type_name)
]
missing_types

In [None]:
from collections import OrderedDict

# Template header: column name -> template string for that column.
# The header becomes the first data row of `template`.
_template_columns = [
    ('ID', 'ID'),
    ('TYPE', 'TYPE'),
    ('Label', 'LABEL'),
    ("obo_id", "A oboInOwl:id"),
    ("obo_namespace", "A oboInOwl:hasOBONamespace"),
    ('Definition', 'A IAO:0000115'),
    ('Def_xrefs', '>A oboInOwl:hasDbXref SPLIT=|'),
    ('Comment', 'A rdfs:comment'),
    ('RelatedSynonyms', 'A oboInOwl:hasRelatedSynonym SPLIT=|'),
    ('RelatedSynonyms_xrefs', '>A oboInOwl:hasDbXref SPLIT=|'),
    ('ExactSynonyms', 'A oboInOwl:hasExactSynonym SPLIT=|'),
    ('ExactSynonyms_xrefs', '>A oboInOwl:hasDbXref SPLIT=|'),
    ('Creators', 'AI dc:contributor SPLIT=|'),
    ('Date', 'AT dc:date^^xsd:dateTime'),
    ('Soma', 'SC RO:0002100 some %'),
    ('Parents', 'SC % SPLIT=|'),
    ('Lineage', 'SC RO:0002202 some %'),
    ('Bilateral', 'SC RO:0000053 some %'),
    ('Neurotransmitter', 'SC RO:0002215 some %'),
    ('Presynapses', 'SC RO:0013003 some % SPLIT=|'),
    ('Postsynapses', 'SC RO:0013002 some % SPLIT=|'),
]
template_head = OrderedDict(_template_columns)
template = pd.DataFrame([template_head])
#template

In [None]:
# Build one template row for each class in FW new types.
# Rows are collected in a list and appended to `template` in a single step
# (concatenating inside the loop copies the growing frame on every iteration).
_new_rows = []
for i in fw_type_ids.index:

    # create a dictionary to hold information for this class
    # (all values start as "" to avoid awkward NaNs later)
    row = OrderedDict((c, "") for c in template.columns)

    # populate dictionary
    cell_type = fw_type_ids.cell_type[i]

    # annotation axioms
    row['ID'] = fw_type_ids['FBbt_id'][i]
    row['obo_id'] = fw_type_ids['FBbt_id'][i]
    row['obo_namespace'] = 'fly_anatomy.ontology'
    row['TYPE'] = 'owl:Class'
    row['Label'] = f'adult {comma_replace(cell_type)} neuron'
    row['Definition'] = describe_cell_type(cell_type)
    # xrefs for Dorkenwald et al. (2023), Schlegel et al. (2023), Eckstein et al. (2020)
    row['Def_xrefs'] = 'doi:10.1101/2023.06.27.546656|doi:10.1101/2023.06.27.546055|doi:10.1101/2020.06.12.148775'
    row['Comment'] = ("Uncharacterized putative cell type from Schlegel et al. (2023), based on "
                      "FlyWire v783 (FAFB) data (Dorkenwald et al., 2023). "
                      "Soma locations are based on the closest annotated neuropil region. "
                      "Pre- or post-synapse locations are the fewest regions that collectively "
                      "contain at least 80 percent of all pre- or post-synapses of these neurons in FlyWire. "
                      "Neurotransmitter predictions are from Eckstein et al. (2023). "
                      "Other annotations are based on annotations in FlyWire and are available in "
                      "the supplemental material of Schlegel et al. (2023).")

    row['Creators'] = "https://orcid.org/0000-0002-1373-1705|https://orcid.org/0000-0002-5633-1314"
    row['Date'] = fw_type_ids['creation_date'][i]

    # establish lists for collecting multivalues
    Parents_list = list(get_parent_ids(cell_type))
    # a missing synonym is read as NaN (not a string) -> no synonyms
    _synonym = fw_type_ids['synonym'][i]
    ExactSynonyms_list = _synonym.split('|') if isinstance(_synonym, str) else []
    ExactSynonyms_xrefs_list = []

    # logical axioms
    row['Soma'] = get_cbr_id(cell_type)

    nt = get_neurotransmitter(cell_type)
    if nt:
        row['Neurotransmitter'] = nt[1]

    lin = get_lineage(cell_type)
    if lin:
        row['Lineage'] = lin[1]
        if lin[0] in ("primary", "putative_primary"):
            Parents_list.append('FBbt:00047097') # primary neuron

    if "contralateral" in row['Definition']:
        row['Bilateral'] = "PATO:0000618"

    # the synapse getters raise ValueError for classes they do not apply to;
    # the column then stays empty
    try:
        row['Presynapses'] = '|'.join(get_presynapses(cell_type)['NP_id'].unique())
    except ValueError:
        pass
    try:
        row['Postsynapses'] = '|'.join(get_postsynapses(cell_type)['NP_id'].unique())
    except ValueError:
        pass

    # special sets of axioms for visual and descending
    # parents might be redundant with other annotations, but doesn't hurt to add again
    vpn_label_out = vpn_label(cell_type)
    if vpn_label_out:
        row['Label'] = vpn_label_out
        Parents_list.append('FBbt:00048286') # 'adult visual projection neuron'
        ExactSynonyms_list.append(f'adult {cell_type} neuron')
        ExactSynonyms_xrefs_list.append('doi:10.1101/2023.06.27.546055')

    vcn_label_out = vcn_label(cell_type)
    if vcn_label_out:
        row['Label'] = vcn_label_out
        Parents_list.append('FBbt:00059244') # 'adult visual centrifugal neuron'
        ExactSynonyms_list.append(f'adult {cell_type} neuron')
        ExactSynonyms_xrefs_list.append('doi:10.1101/2023.06.27.546055')

    dn_label_out = dn_label(cell_type)
    if dn_label_out:
        row['Label'] = dn_label_out[0]
        Parents_list.append(f'{dn_parent_mapping.get(dn_label_out[1])}')

    # check for mutually-exclusive parents
    if ('FBbt:00007440' in Parents_list) and ('FBbt:00007441' in Parents_list):
        Parents_list.remove('FBbt:00007440') # uniglomerular PN
        Parents_list.remove('FBbt:00007441') # multiglomerular PN
        Parents_list.append('FBbt:00067123') # PN

    # lists to strings
    # All multi-value columns are declared with SPLIT=| in the header, so the
    # separator must be a bare '|' (' |' left a trailing space on synonyms).
    row['Parents'] = '|'.join(Parents_list)
    row['ExactSynonyms'] = '|'.join(ExactSynonyms_list)
    row['ExactSynonyms_xrefs'] = '|'.join(ExactSynonyms_xrefs_list)

    _new_rows.append(row)

# join all new rows to the template at once
if _new_rows:
    template = pd.concat(
        [template, pd.DataFrame(_new_rows, columns=template.columns)],
        ignore_index=True,
    )

#template.head()

In [None]:
# add rows to specify TYPE for other classes and OPs
# this is a workaround for a robot bug - https://github.com/ontodev/robot/issues/1105
import re

# function to get curies
def extract_uris(text):
    """Return every FBbt / GO / PATO / RO CURIE found in `text`, in order of appearance."""
    curie_pattern = r'\b(?:FBbt|GO|PATO|RO):\d+\b'
    return [hit.group(0) for hit in re.finditer(curie_pattern, text)]


In [None]:
# Collect every CURIE mentioned anywhere in the template outside the ID column.
_found_curies = set()
for column in template.columns:
    if column == 'ID':
        continue
    for _cell in template[column]:
        # cells are expected to be strings; anything else (e.g. a missing
        # value read as NaN) cannot contain a CURIE and is skipped
        if isinstance(_cell, str):
            _found_curies.update(extract_uris(_cell))

# Sort so the rows generated from this list come out in the same order on
# every run (iteration order of a set of strings is not stable between runs).
curies = sorted(_found_curies)

#curies

In [None]:
# Make one row per referenced CURIE that only declares its TYPE.
# Rows are collected first and appended to `template` in a single step
# (concatenating inside the loop copies the growing frame on every iteration).
_curie_rows = []
for x in curies:

    # all columns start as "" to avoid awkward NaNs later
    row = OrderedDict((c, "") for c in template.columns)

    # annotation axioms
    row['ID'] = x
    # RO terms are object properties, everything else is a class
    row['TYPE'] = 'owl:ObjectProperty' if x.startswith('RO:') else 'owl:Class'

    _curie_rows.append(row)

# join all new rows to the template at once
if _curie_rows:
    template = pd.concat(
        [template, pd.DataFrame(_curie_rows, columns=template.columns)],
        ignore_index=True,
    )

#template.head()

In [None]:
# Write the FlyWire template as a tab-separated file.
# NOTE(review): `index=None` is relied on to leave out the row index;
# `index=False` is the documented spelling.
template.to_csv("flywire_new_cells_template.tsv", sep='\t', index=None)

In [None]:
# Also make updated terms for hemibrain neurons where data is available in FlyWire.
# Everything is read as a string and `na_filter=False` keeps empty fields as ''
# instead of NaN.
all_hemibrain_types = pd.read_csv('../hemibrain_new_types/new_cell_types.tsv', sep='\t', 
                                  dtype = 'str', na_filter=False)
#all_hemibrain_types.head()

In [None]:
# Keep the hemibrain types whose `np_type` occurs in the FlyWire annotations,
# either as a cell type or as a hemibrain type. `reset_index()` renumbers the
# rows and keeps the old index as an `index` column.
updated_hemibrain_types = all_hemibrain_types.loc[
    all_hemibrain_types['np_type'].isin(info['cell_type'])
    | all_hemibrain_types['np_type'].isin(info['hemibrain_type'])
].reset_index()

#len(updated_hemibrain_types)

In [None]:
# Build one template row for each class in hemibrain types
### Run template header cell first ###
# Rows are collected in a list and appended to `template` in a single step
# (concatenating inside the loop copies the growing frame on every iteration).
_hemibrain_rows = []
for i in updated_hemibrain_types.index:

    # create a dictionary to hold information for this class
    # (all values start as "" to avoid awkward NaNs later)
    row = OrderedDict((c, "") for c in template.columns)

    # populate dictionary
    cell_type = updated_hemibrain_types.np_type[i]

    # annotation axioms
    row['ID'] = updated_hemibrain_types['FBbt_id'][i]
    row['obo_id'] = updated_hemibrain_types['FBbt_id'][i]
    row['obo_namespace'] = 'fly_anatomy.ontology'
    # declare the term as a class, as the FlyWire rows do
    row['TYPE'] = 'owl:Class'
    row['Label'] = updated_hemibrain_types['FBbt_name'][i]
    row['Definition'] = describe_cell_type(cell_type)
    # xrefs for Dorkenwald et al. (2023), Schlegel et al. (2023), Eckstein et al. (2023)
    row['Def_xrefs'] = ('FlyBase:FBrf0246888|doi:10.1101/2023.06.27.546656|'
                        'doi:10.1101/2023.06.27.546055|doi:10.1101/2020.06.12.148775')
    row['Comment'] = ("Uncharacterized putative cell type based on Hemibrain data (Scheffer et al., 2020) "
                      "and updated using FlyWire (FAFB) data (Dorkenwald et al., 2023). "
                      "Soma locations are based on the closest annotated neuropil region. "
                      "Pre- or post-synapse locations are the fewest regions that collectively "
                      "contain at least 80 percent of all pre- or post-synapses of these neurons in FlyWire. "
                      "Neurotransmitter predictions are from Eckstein et al. (2023). "
                      "Other annotations are based on annotations in FlyWire and are available in "
                      "the supplemental material of Schlegel et al. (2023).")

    row['Creators'] = "https://orcid.org/0000-0002-1373-1705"
    row['Date'] = updated_hemibrain_types['date'][i]
    row['RelatedSynonyms'] = updated_hemibrain_types['synonym'][i]
    row['RelatedSynonyms_xrefs'] = updated_hemibrain_types['synonym_ref'][i]

    # logical axioms
    Parents_list = list(get_parent_ids(cell_type))
    row['Soma'] = get_cbr_id(cell_type)

    nt = get_neurotransmitter(cell_type)
    if nt:
        row['Neurotransmitter'] = nt[1]

    lin = get_lineage(cell_type)
    if lin:
        row['Lineage'] = lin[1]
        if lin[0] in ("primary", "putative_primary"):
            Parents_list.append('FBbt:00047097') # primary neuron

    # the synapse getters raise ValueError for classes they do not apply to;
    # the column then stays empty
    try:
        row['Presynapses'] = '|'.join(get_presynapses(cell_type)['NP_id'].unique())
    except ValueError:
        pass
    try:
        row['Postsynapses'] = '|'.join(get_postsynapses(cell_type)['NP_id'].unique())
    except ValueError:
        pass

    # check for mutually-exclusive parents
    if ('FBbt:00007440' in Parents_list) and ('FBbt:00007441' in Parents_list):
        Parents_list.remove('FBbt:00007440') # uniglomerular PN
        Parents_list.remove('FBbt:00007441') # multiglomerular PN
        Parents_list.append('FBbt:00067123') # PN

    # add parent to row dict
    row['Parents'] = '|'.join(Parents_list)

    _hemibrain_rows.append(row)

# join all new rows to the template at once
if _hemibrain_rows:
    template = pd.concat(
        [template, pd.DataFrame(_hemibrain_rows, columns=template.columns)],
        ignore_index=True,
    )

#template

In [None]:
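# Before writing the file below, re-run the two cells above that add the TYPE
# rows for referenced terms (the CURIE collection cell, then the cell that
# appends one row per CURIE).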

In [None]:
# Write the hemibrain template as a tab-separated file.
# NOTE(review): `index=None` is relied on to leave out the row index;
# `index=False` is the documented spelling.
template.to_csv("flywire_hemibrain_cells_template.tsv", sep='\t', index=None)