In [None]:
import pandas as pd
import csv
import re
from num2words import num2words
from collections import OrderedDict

In [None]:
# Read both source tables as text: every cell is parsed as a string and
# missing cells become '' instead of NaN.
# './VNCtable2.tsv' is an updated version of table 2 from the preprint.
Feng_new_types, Ehrhardt_new_types = (
    pd.read_csv(table_path, sep='\t', dtype='str').fillna('')
    for table_path in ('./Feng.tsv', './VNCtable2.tsv')
)

# Turn the comma-separated segment and response cells into lists.
for list_column in ('Segments', 'response to MDN'):
    Feng_new_types[list_column] = Feng_new_types[list_column].apply(
        lambda cell: cell.split(', '))
#Ehrhardt_new_types[20:30]

In [None]:
# Hemilineage lookup table: all values read as strings, rows indexed by the
# 'hemilineage' column.
# 16A is mapped to a stage-neutral term for now (a secondary A lineage is not
# supposed to exist...).
hemilineages = pd.read_csv(
    "./hemilineage_map.tsv",
    sep='\t',
    dtype='str',
    index_col='hemilineage',
)

In [None]:
# Neuromere code -> FBbt ID.
neuromere_id_dict = dict(
    T1='FBbt:00111083',
    T2='FBbt:00111084',
    T3='FBbt:00111085',
    A='FBbt:00110173',
)
# Neuromere code -> adjective used when writing labels and definitions.
neuromere_dict = dict(
    T1='prothoracic',
    T2='mesothoracic',
    T3='metathoracic',
    A='abdominal',
)

In [None]:
# Label of a general neuron class -> its FBbt ID.
type_id_dict = {
    'adult ascending neuron': 'FBbt:00048301',
    'adult neuron': 'FBbt:00047095',
}

In [None]:
def name_parser(cell_type):
    """Split a cell type name into its coded parts.

    The name is matched from its start against four groups:
      1. a capital letter, optionally followed by one digit
      2. 'B' or 'U'
      3. one of 'I', 'L', 'A', 'M'
      4. one or more digits

    Returns the match object (index it with 1-4 to get the groups), or None
    when the name does not follow this scheme.
    """
    name_pattern = re.compile("([A-Z][0-9]?)([BU])([ILAM])([0-9]+)")
    return name_pattern.match(cell_type)
#name_parser('LUA18')[1]

In [None]:
def soma_parser(soma_loc):
    """Decode a soma location code such as 'Aavm' or '2pl'.

    The first character names the neuromere ('A', or '1'/'2'/'3' which become
    'T1'/'T2'/'T3'); the lowercase letters after it are position codes.

    Returns a (neuromere, position) tuple, where position is the position
    words joined with hyphens, e.g. ('A', 'anterior-ventral-medial').
    """
    direction_words = {'a': 'anterior', 'p': 'posterior', 'v': 'ventral',
                       'd': 'dorsal', 'm': 'medial', 'l': 'lateral'}
    parsed = re.match("([A123])([a-z]+)", soma_loc)
    neuromere_code, position_codes = parsed[1], parsed[2]
    neuromere = neuromere_code if neuromere_code == 'A' else 'T' + neuromere_code
    position = '-'.join(direction_words[letter] for letter in position_codes)
    return neuromere, position

#soma_parser('Aavm')

In [None]:
def word_joiner(word_list, and_or='and'):
    """Join words into an English list, e.g. ['a', 'b', 'c'] -> 'a, b and c'.

    Parameters:
        word_list: the words to join, in order. It is not modified.
        and_or: conjunction placed before the last word ('and' by default).

    Returns:
        The joined string: '' for no words, the word itself for a single
        word, 'a and b' for two, and 'a, b and c' for three or more.
    """
    # Work on a copy: callers pass lists that they go on using afterwards.
    words = list(word_list)
    if not words:
        return ""
    if len(words) == 1:
        return words[0]
    return ', '.join(words[:-1]) + f' {and_or} ' + words[-1]
    
#word_joiner(['a','c'])

In [None]:
# Neuropil code -> neuropil name as written in definitions.
arbor_name_map = {
    'N': 'neck neuropil',
    'W': 'wing neuropil',
    'H': 'haltere neuropil',
    'L': 'leg neuropil',
    'L1': 'prothoracic leg neuropil',
    'L2': 'mesothoracic leg neuropil',
    'L3': 'metathoracic leg neuropil',
    'A': 'abdominal neuromere',
    'X': 'accessory mesothoracic neuropil',
    'iT1': 'intermediate tectulum of the prothoracic neuromere',
    'lT1': 'lower tectulum of the prothoracic neuromere',
    'iT2': 'intermediate tectulum of the mesothoracic neuromere',
    'lT2': 'lower tectulum of the mesothoracic neuromere',
    'iT3': 'intermediate tectulum of the metathoracic neuromere',
    'C1': 'tectulum of the prothoracic neuromere',
    'C2': 'tectulum of the mesothoracic neuromere',
    'C3': 'tectulum of the metathoracic neuromere',
}

# Neuropil code -> FBbt ID, using the same codes as arbor_name_map.
# NOTE(review): two pairs of codes share an ID, which looks like a copy-paste
# slip: 'L2' and 'L3' are both FBbt:00047142, and 'C2' and 'C3' are both
# FBbt:00052876. The neighbouring IDs run sequentially, which would suggest
# FBbt:00047141 for 'L2' and FBbt:00052877 for 'C3' - verify against FBbt
# before changing anything.
arbor_FBbt_map = {'N':'FBbt:00047173','W':'FBbt:00047137','H':'FBbt:00047138', 'L':'FBbt:00047139',
                  'L1':'FBbt:00047140','L2':'FBbt:00047142',  # NOTE(review): same ID as 'L3' - confirm
                  'L3':'FBbt:00047142','A':'FBbt:00110173',
                  'X':'FBbt:00004091',
                  'iT1':'FBbt:00052868',
                  'lT1':'FBbt:00052871',
                  'iT2':'FBbt:00052869',
                  'lT2':'FBbt:00052872',
                  'iT3':'FBbt:00052870',
                  'C1':'FBbt:00052875',
                  'C2':'FBbt:00052876',
                  'C3':'FBbt:00052876'  # NOTE(review): same ID as 'C2' - confirm
                  }

def map_to_np_names(list_of_np_letters):
    """Return the neuropil name for each code, in the order given.

    Names come from arbor_name_map; an unknown code raises KeyError.
    """
    names = []
    for np_code in list_of_np_letters:
        names.append(arbor_name_map[np_code])
    return names

def map_to_np_ids(list_of_np_letters):
    """Return the FBbt ID for each code, in the order given.

    IDs come from arbor_FBbt_map; an unknown code raises KeyError.
    """
    ids = []
    for np_code in list_of_np_letters:
        ids.append(arbor_FBbt_map[np_code])
    return ids
# nb no column in data for lT3

In [None]:
class ArborDetail:
    """Arborization pattern of one cell type, read from its row of `Ehrhardt_new_types`.

    Typical use: construct with a cell type name, call `get_arbor_detail()` to
    fill the eight region lists, then call `def_snippet_writer()`,
    `dendritic_FBbts()` and `axonal_FBbts()`.

    Each region list (`ipsi_dendritic`, `contra_axonal`, ...) holds neuropil
    column codes such as 'W' or 'iT2', in table column order.
    """

    # Upper-case code letter -> kind of arbor.
    _KIND_BY_LETTER = {'D': 'dendritic', 'A': 'axonal', 'M': 'mixed', 'P': 'partitioned'}
    # Table columns that are scanned for arbor codes.
    _ARBOR_COLS = ['N','iT1','lT1','L1','X','W','iT2','lT2','L2','H','iT3','L3','A']

    def __init__(self, cell_type):
        self.cell_type=cell_type
        # Rows of the table for this cell type, re-indexed from 0 so that
        # `self.table_row[col][0]` reads the first matching row.
        self.table_row = Ehrhardt_new_types[Ehrhardt_new_types['cell_type']==cell_type].reset_index(drop=True)
        self._clear_regions()

    def _clear_regions(self):
        """Set all eight region lists to empty lists."""
        self.ipsi_dendritic, self.contra_dendritic = [], []
        self.ipsi_axonal, self.contra_axonal = [], []
        self.ipsi_mixed, self.contra_mixed = [], []
        self.ipsi_partitioned, self.contra_partitioned = [], []

    def get_arbor_detail(self):
        """Fill the region lists from the two-character codes in the table row.

        The first character of a code describes the ipsilateral side and the
        second the contralateral side. Only upper-case D/A/M/P are recorded;
        small/sparse (lower case) arbors and '-' are ignored. Cells that do
        not start with two characters from 'ADMPadmp-' are skipped.
        """
        # Start from empty lists so that calling this twice does not
        # record every region twice.
        self._clear_regions()
        for col in self._ARBOR_COLS:
            code = self.table_row[col][0]
            if not re.match("[ADMPadmp-]{2}", code):
                continue
            for side, letter in (('ipsi', code[0]), ('contra', code[1])):
                kind = self._KIND_BY_LETTER.get(letter)
                if kind is not None:
                    getattr(self, f'{side}_{kind}').append(col)

    @staticmethod
    def _region_phrase(ipsi, contra, midline):
        """Name the regions of one arbor kind; at least one list must be non-empty."""
        if midline:
            # Lineage 0: regions are listed once, for both hemispheres. The
            # ipsilateral list is used; the contralateral list only stands in
            # when the ipsilateral one is empty (that case used to crash).
            return word_joiner(map_to_np_names(ipsi or contra)) + " of both hemispheres"
        parts = []
        if len(ipsi) > 0:
            parts.append("ipsilateral " + word_joiner(map_to_np_names(ipsi)))
        if len(contra) > 0:
            parts.append("contralateral " + word_joiner(map_to_np_names(contra)))
        return " and the ".join(parts)

    def def_snippet_writer(self):
        """Return the arborization sentences for the definition.

        One sentence is written for each arbor kind (dendritic, axonal, mixed,
        partitioned) that has at least one region; each sentence starts with a
        space. Returns '' when there are no recorded arbors. The region lists
        are left unchanged.
        """
        # Cell types whose hemilineage starts with '0' get the
        # "of both hemispheres" wording instead of ipsilateral/contralateral.
        midline = self.table_row['hemilineage'][0].startswith('0')
        arbor_def = ""
        for kind, ipsi, contra in [
                ('dendritic', self.ipsi_dendritic, self.contra_dendritic),
                ('axonal', self.ipsi_axonal, self.contra_axonal),
                ('mixed', self.ipsi_mixed, self.contra_mixed),
                ('partitioned', self.ipsi_partitioned, self.contra_partitioned)]:
            if len(ipsi) + len(contra) == 0:
                continue
            arbor_def += f" It has {kind} arborization in the "
            # Codes are always converted to neuropil names (a fresh list)
            # before joining, so the region lists themselves are never
            # handed to word_joiner.
            arbor_def += self._region_phrase(ipsi, contra, midline)
            if kind == 'partitioned':
                arbor_def += (', where dendritic and axonal arbors occupy '
                              'separate regions of the neuropil (Ehrhardt et al., 2023).')
            else:
                arbor_def += ' (Ehrhardt et al., 2023).'
        return arbor_def

    @staticmethod
    def _fbbt_ids(*code_lists):
        """Map codes to FBbt IDs, dropping repeats and keeping first-seen order."""
        ids = (arbor_FBbt_map[code] for codes in code_lists for code in codes)
        # dict.fromkeys de-duplicates like a set but keeps a stable order,
        # so the output is the same on every run.
        return list(dict.fromkeys(ids))

    def dendritic_FBbts(self):
        """FBbt IDs of regions with dendritic, mixed or partitioned arbors (either side)."""
        return self._fbbt_ids(self.ipsi_dendritic, self.contra_dendritic,
                              self.ipsi_mixed, self.contra_mixed,
                              self.ipsi_partitioned, self.contra_partitioned)

    def axonal_FBbts(self):
        """FBbt IDs of regions with axonal, mixed or partitioned arbors (either side)."""
        return self._fbbt_ids(self.ipsi_axonal, self.contra_axonal,
                              self.ipsi_mixed, self.contra_mixed,
                              self.ipsi_partitioned, self.contra_partitioned)
    


In [None]:
#test_arbors = ArborDetail('WBL001')
#test_arbors.get_arbor_detail()
#test_arbors.def_snippet_writer()

In [None]:
def hemilineage_parser(hemilineage):
    """Split a hemilineage value into its space-separated parts.

    The special values 'emb' and 'abd' are returned unchanged as a string;
    any other value is returned as the list produced by splitting on single
    spaces.
    """
    # Membership test: `'emb' | 'abd'` is not valid for strings and raised
    # TypeError on every call.
    if hemilineage in ('emb', 'abd'):
        return hemilineage
    return hemilineage.split(' ')

In [None]:
def label_maker(cell_type, subtype=False, neuromere=None):
    """Build the label for a cell type.

    Without `subtype` the label is "adult <cell_type> neuron of ventral nerve
    cord". With `subtype=True` it is "adult <neuromere name> <cell_type>
    neuron", where the neuromere name is looked up in neuromere_dict; if
    `neuromere` is not a key of that dictionary a message is printed and None
    is returned.
    """
    if not subtype:
        return "adult %s neuron of ventral nerve cord" % cell_type

    try:
        neuromere_name = neuromere_dict[neuromere]
    except KeyError:
        print("Neuromere must be 'T1', 'T2' or 'T3' if subtype=True")
        return None
    return "adult %s %s neuron" % (neuromere_name, cell_type)

In [None]:
def definition_writer(neuromeres, cell_type, response, subtype=False, number='', hetero=''):
    """Write the soma, MDN-response and cell-number sentences of a definition.

    Parameters:
        neuromeres: list of 1-3 neuromere codes (keys of neuromere_dict).
        cell_type: cell type name, used in the subtype wording.
        response: list of response codes ('i', 'w' or 's'). For a subtype the
            entry at the position given by the number in neuromeres[0]
            ('T2' -> second entry) is used; otherwise a sentence is written
            only when all entries are the same.
        subtype: if True, write "Adult <cell_type> neuron with its soma in
            the <neuromere> neuromere"; if False, write " It has its soma in
            the ... neuromere" (with a leading space, to follow other text).
        number: cell count as a string, optionally prefixed with '~';
            '' means no cell-number sentence.
        hetero: '1' adds " and they are heterogeneous" to the number sentence.

    Returns:
        The sentences joined into one string, without trailing spaces.

    Raises:
        ValueError: if `neuromeres` does not have 1-3 entries.
        KeyError: if a neuromere code is not in neuromere_dict.
    """
    pub = " (Feng et al., 2020)"
    response_dict = {'i':'inhibited', 'w':'weakly excited', 's':'strongly excited'}
    mdn_sentence = "It is %s downstream of moonwalker descending neuron activity%s. "

    if not 1 <= len(neuromeres) <= 3:
        raise ValueError("'neuromeres' must be a list of length 1-3")
    # Built for both modes so that an unknown neuromere code is always reported.
    nm = "in the %s neuromere" % word_joiner([neuromere_dict[n] for n in neuromeres], and_or='or')

    if subtype:
        soma_sentence = ("Adult %s neuron with its soma in the %s neuromere%s. "
                         % (cell_type, neuromere_dict[neuromeres[0]], pub))
        try:
            code = response[int(neuromeres[0].lstrip('T')) - 1]
            MDN_response = mdn_sentence % (response_dict[code], pub)
        except (KeyError, IndexError, ValueError):
            # No usable response for this neuromere (blank or unknown code,
            # too few entries, or a non-numbered neuromere): leave it out.
            MDN_response = ""
    else:
        soma_sentence = " It has its soma %s%s. " % (nm, pub)
        # Only state a response when it is the same everywhere and is a known
        # code; a blank cell arrives here as [''] and must not raise KeyError.
        if len(set(response)) == 1 and response[0] in response_dict:
            MDN_response = mdn_sentence % (response_dict[response[0]], pub)
        else:
            MDN_response = ""

    if len(number) > 0:
        if '~' in number:
            approx = 'are approximately '
        elif number == '1':
            approx = 'is '
        else:
            approx = 'are '
        # Spell out every run of digits. (The earlier search().groups() loop
        # never ran because the pattern has no capture group.)
        num = re.sub('[0-9]+', lambda m: num2words(int(m.group())), number.lstrip('~'))
        het = ' and they are heterogeneous' if hetero == '1' else ''
        number = ("There %s%s of these cells per hemineuromere%s%s."
                  % (approx, num, het, pub))

    definition = soma_sentence + MDN_response + number
    # str.rstrip returns a new string, so its result has to be the return value.
    return definition.rstrip(' ')

In [None]:
# Template header: an ordered mapping of column header -> template
# specification. The specifications become the first row of the table.
template_seed = OrderedDict([
    # row identity and type
    ('ID', 'ID'),
    ('class_expression_type', 'CLASS_TYPE'),
    ('TYPE', 'TYPE'),

    # obo ID and namespace
    ("obo_id", "A oboInOwl:id"),
    ("obo_namespace", "A oboInOwl:hasOBONamespace"),

    # label, definition and its references, contributor and date
    ("Name", "A rdfs:label"),
    ("definition", "A IAO:0000115"),
    ("Xref_def", ">A oboInOwl:hasDbXref SPLIT=|"),
    ("created_by", "A dc:contributor"),
    ("creation_date", "A dc:date"),

    # synonyms with their references, and comment
    ("MFSynonym", "A oboInOwl:hasExactSynonym"),
    ("MFsyn_ref", ">A oboInOwl:hasDbXref"),
    ("MTSynonym", "A oboInOwl:hasNarrowSynonym"),
    ("MTsyn_ref", ">A oboInOwl:hasDbXref"),
    ("comment", "A rdfs:comment"),

    # relationships
    ("soma", "C 'has soma location' some %"),
    ("Neuroblast", "C 'develops from' some %"),
    ("synaptic_IO", "C 'has synaptic IO in region' some %"),
    ("dendritic_arbors", "C 'receives synaptic input in region' some % SPLIT=|"),
    ("axonal_arbors", "C 'sends synaptic output to region' some % SPLIT=|"),
    ("laterality", "C 'has characteristic' some %"),
    ("parents", "C % SPLIT=|"),
])

# One-row DataFrame holding the specifications; data rows are added below it.
template = pd.DataFrame.from_records([template_seed])

#template

In [None]:
# fill template for Feng neurons
#
# One template row is built for every row of Feng_new_types. Rows whose
# 'parent_ID' contains 'FBbt' are treated as subtypes of another new type
# (class expression 'equivalent'); all other rows are described from the
# parts of their cell type name (class expression 'subclass').

# Rows are collected here and appended to the template once, after the loop:
# concatenating inside the loop copies the whole table on every iteration.
feng_rows = []

for i in Feng_new_types.index:

    # new template row: one blank entry per template column
    row_od = OrderedDict((c, "") for c in template.columns)

    cell_type = Feng_new_types['cell_type'][i]

    #these are the same in each row
    row_od["Xref_def"] = "FlyBase:FBrf0247391|doi:10.1101/2023.05.31.542897"
    row_od["MFsyn_ref"] = "FlyBase:FBrf0247391"
    row_od["MTsyn_ref"] = "FlyBase:FBrf0247391"
    row_od["created_by"] = "http://orcid.org/0000-0002-1373-1705"
    row_od["creation_date"] = "2021-03-22T12:00:00Z"
    row_od["comment"] = ("Cell type described in Feng et al. (2020), "
                         "nomenclature scheme explained in Ehrhardt et al. (2023).")
    row_od['obo_namespace'] = "fly_anatomy.ontology"
    row_od["TYPE"] = 'owl:Class'

    row_od["ID"] = Feng_new_types['FBbt_ID'][i]
    row_od['obo_id'] = row_od["ID"]

    # cells only in one segment - add soma location and synonym
    if (len(Feng_new_types['Segments'][i]) == 1):
        row_od["soma"] = neuromere_id_dict[Feng_new_types['Segments'][i][0]]
        if(len(Feng_new_types['MT'][i]) > 0):
            row_od["MTSynonym"] = "adult %s neuron" % Feng_new_types['MT'][i]
    else:
        row_od["soma"] = 'FBbt:00048302' # CBR of thoracic nm

    # cell types that are subclasses of other new types
    if 'FBbt' in Feng_new_types['parent_ID'][i]:
        row_od["class_expression_type"] = 'equivalent'
        row_od["Name"] = label_maker(cell_type,
                                     subtype=True,
                                     neuromere=Feng_new_types['Segments'][i][0])
        row_od["parents"] = Feng_new_types['parent_ID'][i]
        # subtypes always take their soma location from the first listed segment
        row_od["soma"] = neuromere_id_dict[Feng_new_types['Segments'][i][0]]
        name_def = ""

    # cell types that are not subclasses of other new types
    else:
        row_od["class_expression_type"] = 'subclass'
        row_od["Name"] = label_maker(cell_type)
        row_od["MFSynonym"] = "adult MF%s neuron" % Feng_new_types['MF'][i].zfill(2)

        # collect parents in a list and make row later
        parents = []

        # info from cell type name (for non subtypes):
        # [1] primary arbor code, [2] laterality letter, [3] neuron class letter
        name_components = name_parser(cell_type)

        prim_arbor = arbor_name_map[name_components[1]]
        row_od['synaptic_IO'] = arbor_FBbt_map[name_components[1]]

        if name_components[2] == 'B':
            row_od['laterality'] = 'PATO:0000618'
            laterality = 'Bilateral'
        elif name_components[2] == 'U':
            row_od['laterality'] = 'PATO:0000634'
            laterality = 'Unilateral'

        # NOTE(review): these citations read 'Feng et al. 2020' (no comma),
        # while definition_writer writes 'Feng et al., 2020' - confirm which
        # form is wanted before normalising.
        if name_components[3] == 'I':
            parents.append('FBbt:00052046')
            type_detail = 'It is an intersegmental interneuron (Feng et al. 2020; Ehrhardt et al., 2023).'
        elif name_components[3] == 'L':
            parents.append('FBbt:00052046')
            type_detail = ("It is an interneuron that arborizes mainly "
                           "within one neuromere of the VNC (Feng et al. 2020; Ehrhardt et al., 2023).")
        elif name_components[3] == 'A':
            parents.append('FBbt:00048301')
            type_detail = ("It is an ascending neuron (Feng et al. 2020; Ehrhardt et al., 2023).")
        elif name_components[3] == 'M':
            parents.append('FBbt:00048235')
            type_detail = ("It is a motor neuron (Feng et al. 2020; Ehrhardt et al., 2023).")

        name_def = ("%s neuron of the %s group of the adult ventral nerve cord that primarily arborizes in "
                     "the %s (Feng et al. 2020; Ehrhardt et al., 2023). " % (laterality, cell_type, prim_arbor)) + type_detail

        row_od["parents"] = '|'.join(parents)


    # definition: name-derived text (empty for subtypes) followed by the
    # soma / MDN response / cell number sentences
    row_od["definition"] = name_def + definition_writer(neuromeres=Feng_new_types['Segments'][i],
                                         cell_type=cell_type,
                                         response=Feng_new_types['response to MDN'][i],
                                         subtype=('FBbt' in Feng_new_types['parent_ID'][i]),
                                         number=Feng_new_types['cell_number'][i],
                                         hetero=Feng_new_types['hetero'][i])

    feng_rows.append(row_od)

# add all new rows to the template in a single concatenation
if feng_rows:
    template = pd.concat([template, pd.DataFrame(feng_rows, columns=template.columns)],
                         ignore_index=True, sort=False)
        
#template.head()

In [None]:
#template.to_csv('template.tsv', sep='\t', index=None)

In [None]:
# add Ehrhardt neurons
#
# One template row is built for every row of Ehrhardt_new_types; its
# definition is assembled from the cell type name, the arborization pattern,
# the soma location, the lineage and the cell number.

# Rows are collected here and appended to the template once, after the loop:
# concatenating inside the loop copies the whole table on every iteration.
ehrhardt_rows = []

for i in Ehrhardt_new_types.index:

    # new template row: one blank entry per template column
    row_od = OrderedDict((c, "") for c in template.columns)

    cell_type = Ehrhardt_new_types['cell_type'][i]


    # these are the same in each row
    row_od["Xref_def"] = "doi:10.1101/2023.05.31.542897"
    row_od["created_by"] = "http://orcid.org/0000-0002-1373-1705"
    row_od["creation_date"] = "2024-03-01T12:00:00Z"
    row_od['obo_namespace'] = "fly_anatomy.ontology"
    row_od["class_expression_type"] = 'subclass'
    row_od["TYPE"] = 'owl:Class'


    # easy fields
    row_od["ID"] = Ehrhardt_new_types['FBbt_ID'][i]
    row_od["Name"] = label_maker(cell_type)
    row_od['obo_id'] = row_od["ID"]

    # collect parents in a list and make row later
    parents = []

    # lineage
    # `hl` and `lineage_def` are set for every row, so no row can pick up the
    # values left over from the previous one ('abd' rows get no lineage
    # sentence; 'emb' and 'abd' do not start with '0' in the check below).
    hl = Ehrhardt_new_types['hemilineage'][i]
    lineage_def = ""
    if hl == 'emb':
        lineage_def = " It is a primary neuron (Ehrhardt et al., 2023)."
        parents.append('FBbt:00047097')
    elif hl == 'abd':
        row_od['Neuroblast'] = 'FBbt:00003584'
    else:
        # assumes any other value looks like '<hemilineage> <neuromere>' and
        # is an index label of `hemilineages` - TODO confirm
        hl_parts = hl.split(' ')
        lineage_def = (f" It belongs to the {hl_parts[0]} hemilineage of "
                       f"the {hl_parts[1]} neuromere (Ehrhardt et al., 2023).")
        parents.append(hemilineages['hemilineage_FBbt'][hl])
        row_od['Neuroblast'] = hemilineages['neuroblast_FBbt'][hl]


    # soma location
    soma_info = soma_parser(Ehrhardt_new_types['soma_loc'][i])
    row_od["soma"] = neuromere_id_dict[soma_info[0]]
    soma_def = (f" It has its soma in a {soma_info[1]} position "
                f"in the {neuromere_dict[soma_info[0]]} neuromere (Ehrhardt et al., 2023).")

    # arborisation pattern
    arborization = ArborDetail(cell_type)
    arborization.get_arbor_detail()
    arbor_def = arborization.def_snippet_writer()
    row_od['dendritic_arbors'] = '|'.join(arborization.dendritic_FBbts())
    row_od['axonal_arbors'] = '|'.join(arborization.axonal_FBbts())


    # info from cell type name:
    # [1] primary arbor code, [2] laterality letter, [3] neuron class letter
    name_components = name_parser(cell_type)

    prim_arbor = arbor_name_map[name_components[1]]

    if name_components[2] == 'B':
        row_od['laterality'] = 'PATO:0000618'
        laterality = 'Bilateral'
    elif name_components[2] == 'U':
        row_od['laterality'] = 'PATO:0000634'
        laterality = 'Unilateral'

    if name_components[3] == 'I':
        parents.append('FBbt:00052046')
        type_detail = 'It is an intersegmental interneuron (Ehrhardt et al., 2023).'
    elif name_components[3] == 'L':
        parents.append('FBbt:00052046')
        type_detail = ("It is an interneuron that arborizes mainly "
                       "within one neuromere of the VNC (Ehrhardt et al., 2023).")
    elif name_components[3] == 'A':
        parents.append('FBbt:00048301')
        type_detail = ("It is an ascending neuron (Ehrhardt et al., 2023).")
    elif name_components[3] == 'M':
        parents.append('FBbt:00048235')
        type_detail = ("It is a motor neuron (Ehrhardt et al., 2023).")

    name_def = ("%s neuron of the %s group of the adult ventral nerve cord that primarily arborizes in "
                 "the %s (Ehrhardt et al., 2023). " % (laterality, cell_type, prim_arbor)) + type_detail


    # cell number
    cell_no = Ehrhardt_new_types['cell_number'][i]

    if hl.startswith('0'): # unpaired lineage 0
        per_statement = "per organism"
    else:
        per_statement = "per hemineuromere"

    # Compare the count as a number: compared as text, '10' sorts before '2'
    # and counts of 10-19 were silently dropped. Cells that are not plain
    # digits (including blanks) get no cell-number sentence.
    # NOTE(review): a count of exactly 2 produces no sentence, as before -
    # confirm that this is intended.
    cell_no_def = ''
    if cell_no.isdigit():
        cell_count = int(cell_no)
        if cell_count == 1:
            cell_no_def = f" There is one of these cells {per_statement} (Ehrhardt et al., 2023)."
        elif cell_count > 2:
            cell_no_def = (f" There are {num2words(cell_count)} of these cells "
                           f"{per_statement} (Ehrhardt et al., 2023).")


    # definition
    row_od["definition"] = name_def + arbor_def + soma_def + lineage_def + cell_no_def

    row_od['parents'] = '|'.join(parents)

    ehrhardt_rows.append(row_od)

# add all new rows to the template in a single concatenation
if ehrhardt_rows:
    template = pd.concat([template, pd.DataFrame(ehrhardt_rows, columns=template.columns)],
                         ignore_index=True, sort=False)
        
#template.head()

In [None]:
# add rows to specify TYPE for other classes and OPs
# this is a workaround for a robot bug - https://github.com/ontodev/robot/issues/1105
import re

# function to get curies
def extract_uris(text):
    """Return every FBbt, GO, PATO or RO CURIE in `text`, in order of appearance.

    A CURIE here is one of those prefixes, a colon and one or more digits,
    delimited by word boundaries. Repeats are kept; the result is a list of
    strings (empty when nothing matches).
    """
    curie_pattern = re.compile(r'\b(?:FBbt|GO|PATO|RO):\d+\b')
    return [found.group(0) for found in curie_pattern.finditer(text)]

In [None]:
# Collect every CURIE mentioned anywhere in the template except in the 'ID'
# column, without repeats.
found_curies = set()
for column in template.columns:
    if column == 'ID':
        continue
    for cell in template[column]:
        found_curies.update(extract_uris(cell))

# Sorted so that the rows generated from this list (and therefore the output
# file) come out in the same order on every run; the order of a set of
# strings can differ between interpreter sessions.
curies = sorted(found_curies)

#curies

In [None]:
# make a row for each
# Every collected CURIE gets a row that only declares its TYPE:
# 'owl:ObjectProperty' for RO terms, 'owl:Class' for everything else.
type_rows = []
for x in curies:

    # create a dictionary to hold information for this class;
    # setting all as "" now to avoid awkward NaNs later
    row = OrderedDict((c, "") for c in template.columns)

    # populate dictionary
    row['ID'] = x
    row['TYPE'] = 'owl:ObjectProperty' if x.startswith('RO:') else 'owl:Class'

    type_rows.append(row)

# Join all new rows to the template in a single concatenation, instead of
# copying the whole table once per CURIE.
if type_rows:
    template = pd.concat([template, pd.DataFrame(type_rows, columns=template.columns)],
                         ignore_index=True)

#template.head()

In [None]:
# Write the finished template to 'template.tsv' as a tab-separated file.
template.to_csv(
    'template.tsv',
    sep='\t',
    index=None,
)