In [None]:
"""
makes robot template to generate provisional cell types 
based on type:ID mapping in 'new_cell_types.tsv' (does not use 'FBbt_name' in file)
may need to copy across latest hemibrain ROI mapping if this gets updated
"""
import csv
import os
import re
from collections import OrderedDict

import neuprint
import numpy as np
import pandas as pd

# for connecting to neuPrint: export your token as NEUPRINT_APPLICATION_CREDENTIALS
# so that it is never saved/committed with the notebook (falls back to "" if unset)
token = os.environ.get("NEUPRINT_APPLICATION_CREDENTIALS", "")
np_client = neuprint.Client('https://neuprint.janelia.org', dataset='hemibrain:v1.2.1', token=token)

# load file with types mapped to FBbt:2... IDs, labels, refs
# (everything is read as text; na_filter=False keeps empty cells as "" rather than NaN)
cell_types = pd.read_csv('./new_cell_types.tsv', sep='\t', dtype='str', na_filter=False)

# copy across latest ROI mapping file from hemibrain_metadata and load
full_roi_mapping = pd.read_csv('./hemibrain_1-1_ROI_mapping.tsv', sep='\t')

# minimum no. synapses to add region connectivity
connectivity_threshold = 10
#len(cell_types)

In [None]:
# filter cell_types to remove any that are in FlyWire annotations file
# (these will be updated using ../flywire_neurons/flywire_neurons.ipynb)
# annotations file at https://github.com/flyconnectome/flywire_annotations/tree/main/supplemental_files
flywire_annotations = pd.read_csv("../flywire_neurons/Supplemental_file1_neuron_annotations.tsv",
                                  sep="\t", dtype="str")

# a type is dropped if its neuPrint name appears as either a FlyWire cell_type or hemibrain_type
in_flywire = (cell_types['np_type'].isin(flywire_annotations['cell_type'])
              | cell_types['np_type'].isin(flywire_annotations['hemibrain_type']))

# drop=True: do not keep the old index as an 'index' column (a plain reset_index()
# would add one, and would then fail with "cannot insert index" if this cell is re-run)
cell_types = cell_types[~in_flywire].reset_index(drop=True)
#len(cell_types)

In [None]:
# add entries for 'posterior slope' and 'clamp' (not in hemibrain ROIs?)
# one (ROI, FBbt_id, FBbt_name) record per hemisphere-specific ROI
extra_region_records = [
    ('PS(R)', 'FBbt:00040072', 'posterior slope'),
    ('PS(L)', 'FBbt:00040072', 'posterior slope'),
    ('CL(R)', 'FBbt:00040047', 'clamp'),
    ('CL(L)', 'FBbt:00040047', 'clamp'),
]
extra_regions = pd.DataFrame(extra_region_records, columns=['ROI', 'FBbt_id', 'FBbt_name'])

full_roi_mapping = pd.concat([full_roi_mapping, extra_regions], ignore_index=True)

# fix 'NO' and any other ROI names wrapped in leading/trailing single quotes
full_roi_mapping['ROI'] = [roi_name.strip("'") for roi_name in full_roi_mapping['ROI']]


In [None]:
# dictionary of raw ROI names to FBbt IDs (if a ROI name is repeated, the last row wins)
raw_ROI_dict = {
    roi_name: fbbt_id
    for roi_name, fbbt_id in zip(full_roi_mapping['ROI'], full_roi_mapping['FBbt_id'])
}


In [None]:
# remove left/right, keep only capitalised regions
# work on a copy: a plain assignment would alias full_roi_mapping, so stripping the
# hemisphere suffix below would also rewrite the raw ROI names in full_roi_mapping
tidy_roi_mapping = full_roi_mapping.copy()

# strip a "(L)"/"(R)" style hemisphere suffix from each ROI name
hemisphere_suffix = re.compile(r'\([LR]+\)')
tidy_roi_mapping['ROI'] = tidy_roi_mapping['ROI'].map(
    lambda x: hemisphere_suffix.sub('', x))

# keep only ROI names made up entirely of capital letters
# (na=False treats missing names as non-matching)
tidy_roi_mapping = tidy_roi_mapping[tidy_roi_mapping['ROI'].str.match('[A-Z]+$', na=False)]

# left and right rows are now identical, so collapse them
tidy_roi_mapping = tidy_roi_mapping.drop_duplicates().reset_index(drop=True)



In [None]:
# patterns for different types of neuprint label (used by functions)
# TI_pattern groups: (1) one or more capital letters, (2) exactly three digits at the end
# of the name; the functions below look group 1 up as an ROI abbreviation.
TI_pattern = re.compile("([A-Z]+)([0-9][0-9][0-9]$)")
# multiPN_pattern groups: (1) 'M_', (2) letters from 'lvad' with an optional trailing '2'
# (used as the index into `neuroblasts`), (3) 'PN', (4) optional digits followed by
# letters from 'mlt' (used as the index into `tracts`), (5) digits with an optional
# capital letter (used as the group/type number in labels and definitions).
# Note there is no end-of-string anchor, so trailing characters are tolerated.
multiPN_pattern = re.compile("(M_)([lvad]+[2]?)(PN)([0-9]*[mlt]+)([0-9]+[A-Z]?)")

In [None]:
# dataframe of neuroblasts, indexed by the short code used in neuprint type names
# each record is (FBbt ID, short code, name)
nb_data = np.array([
    ('FBbt:00067348', 'v', 'ALv1'),
    ('FBbt:00050035', 'v2', 'ALv2'),
    ('FBbt:00050038', 'lv', 'ALlv1'),
    ('FBbt:00067346', 'ad', 'ALad1'),
    ('FBbt:00067347', 'l', 'ALl1 (Notch OFF hemilineage)'),
    ('FBbt:00067347', 'l2', 'ALl1 (Notch ON hemilineage)'),
])
neuroblasts = pd.DataFrame(nb_data, columns=['ID', 'short', 'name']).set_index('short')
#neuroblasts

In [None]:
# dataframe of tracts, indexed by the short code used in neuprint type names
# each record is (FBbt ID, short code, name)
tract_data = np.array([
    ('FBbt:00003985', 'm', 'medial antennal lobe tract'),
    ('FBbt:00003983', 'l', 'lateral antennal lobe tract'),
    ('FBbt:00003984', 'ml', 'mediolateral antennal lobe tract'),
    ('FBbt:00049719', '10t', 'transverse antennal lobe t10ALT tract'),
])
tracts = pd.DataFrame(tract_data, columns=['ID', 'short', 'name']).set_index('short')
#tracts

In [None]:
# get all individuals of each type and their region connectivity
# the python list of type names is interpolated into the Cypher IN clause as-is
np_type_list = cell_types['np_type'].tolist()
query = ("MATCH (n:Neuron) WHERE n.type IN %s "
         "RETURN n.type, n.bodyId, apoc.convert.fromJsonMap(n.roiInfo) AS ROIs"
         % (np_type_list,))

# one row per body, indexed by (type, bodyId)
np_results = np_client.fetch_custom(query).set_index(['n.type', 'n.bodyId'])
#print(np_results.head())

In [None]:
# split each region into a column, then stack the ROI columns into the index
# (future_stack=True keeps the entries for ROIs that a body has no data for)
connecivity_by_ROI = (
    np_results['ROIs']
    .apply(pd.Series)
    .stack(future_stack=True)
)
#print(connecivity_by_ROI.head())

In [None]:
# split out connectivity type (pre, post etc) into columns (SLOW)
# and name the three index levels
connectivity_table_1 = (
    connecivity_by_ROI
    .apply(pd.Series)
    .rename_axis(index=['type', 'bodyID', 'ROI_np'])
)
#print(connectivity_table_1.head())

In [None]:
# map neuprint ROIs to FBbt and tidy up
connectivity_table = connectivity_table_1.reset_index(level='ROI_np', drop=False)

# ROIs without an entry in raw_ROI_dict become NaN here
connectivity_table['ROI'] = connectivity_table['ROI_np'].map(raw_ROI_dict)

# drop columns that are not needed below; errors='ignore' so the cell does not fail
# when one of them is absent (column 0 presumably only appears when some body has
# no entry for an ROI - TODO confirm)
connectivity_table = connectivity_table.drop(
    columns=['downstream', 'upstream', 0, 'ROI_np'], errors='ignore')

connectivity_table = connectivity_table.set_index('ROI', append=True)

# missing counts are treated as zero synapses
connectivity_table = connectivity_table.fillna(0)
#print(connectivity_table.head())

In [None]:
synapse_columns = ['post', 'pre']

# get max connectivity per region per bodyID (groups duplicate regions)
body_connectivity_table = (
    connectivity_table
    .groupby(['type', 'bodyID', 'ROI'])[synapse_columns]
    .max()
)

# get min connectivity per region per type (groups multiple bodies per type)
type_connectivity_table = (
    body_connectivity_table
    .groupby(['type', 'ROI'])[synapse_columns]
    .min()
)

# drop rows where pre and post are both 0
no_synapses = type_connectivity_table['post'].eq(0) & type_connectivity_table['pre'].eq(0)
type_connectivity_table = type_connectivity_table.drop(type_connectivity_table[no_synapses].index)


In [None]:
def type_checker(shortname):
    """
    Classifies a neuprint type name by the pattern it matches.

    Returns 'TI' if the name matches TI_pattern, otherwise 'multi' if it
    matches multiPN_pattern. Raises ValueError if it matches neither.
    """
    # TI is tested first, as in the original if/elif chain
    for kind, pattern in (('TI', TI_pattern), ('multi', multiPN_pattern)):
        if pattern.match(shortname):
            return kind
    raise ValueError("Invalid neuron name - " + shortname)

In [None]:
def name_lister(names):
    """
    Joins a sequence of names into an English list, e.g. "a, b and c".

    Returns False for an empty sequence and the single element itself for a
    sequence of length one.
    """
    n_names = len(names)
    if n_names < 1:
        return False
    if n_names == 1:
        return names[0]
    # everything except the last name is comma-separated, the last is joined with "and"
    return ", ".join(names[:-1]) + " and " + names[-1]

In [None]:
def neuropil_writer(roi):
    """
    Returns the neuropil name for a short roi name or an FBbt ID.

    If `roi` starts with 'FBbt' it is looked up in the 'FBbt_id' column of
    full_roi_mapping. Otherwise it is looked up in the 'ROI' column of
    tidy_roi_mapping and then of full_roi_mapping. The first matching
    'FBbt_name' is returned with any 'adult ' removed.

    Raises KeyError if `roi` is not found.
    """
    if re.match('FBbt', roi):
        lookups = [(full_roi_mapping, 'FBbt_id')]
    else:
        # tidy (hemisphere-free) names take priority over the full mapping
        lookups = [(tidy_roi_mapping, 'ROI'), (full_roi_mapping, 'ROI')]

    for mapping, column in lookups:
        names = mapping.loc[mapping[column] == roi, 'FBbt_name']
        if not names.empty:
            return str(names.iloc[0]).replace('adult ', '')

    # also reached for an FBbt ID that is missing from the mapping
    raise KeyError("Input to neuropil_writer must be a valid roi or FBbt ID! (got %r)" % (roi,))

In [None]:
# functions for splitting neuprint type names (terra incognita and multiglomerular PN types)

def shortname_splitter(shortname):
    """
    Splits a neuprint type name into the groups of the pattern it matches.

    For terra incognita ('TI') names the groups are (capital letters, three digits).
    For multiglomerular PN ('multi') names they are the five groups of
    multiPN_pattern.

    Returns a tuple of strings. Raises ValueError if the name matches neither
    pattern (via type_checker) or cannot be split.
    """
    patterns = {'TI': TI_pattern, 'multi': multiPN_pattern}
    # type_checker raises ValueError for names matching neither pattern
    pattern = patterns.get(type_checker(shortname))
    m = pattern.match(shortname) if pattern is not None else None

    if m is None:
        raise ValueError(shortname + " could not be split.")
    return m.groups()


In [None]:
def label_maker(shortname):
    """
    Builds the term label from a neuprint type name.

    Terra incognita names give "adult <neuropil> neuron <number>";
    multiglomerular PN names give
    "adult multiglomerular antennal lobe projection neuron type <number> <neuroblast code>PN".
    """
    kind = type_checker(shortname)

    if kind == 'TI':
        region, number = shortname_splitter(shortname)
        neuropil = neuropil_writer(region)
        return "adult %s neuron %s" % (neuropil, number)

    if kind == 'multi':
        parts = shortname_splitter(shortname)
        return "adult multiglomerular antennal lobe projection neuron type %s %sPN" % \
            (parts[4], parts[1])

    raise ValueError("Could not make label for " + shortname)


In [None]:
def definition_maker(shortname):
    """
    Autogenerates term definition based on neuprint type name.

    The first sentence(s) depend on the kind of name (terra incognita or
    multiglomerular PN). If the type has rows in type_connectivity_table, one
    sentence is added for regions with at least connectivity_threshold
    postsynaptic sites and one for regions with at least connectivity_threshold
    presynaptic sites.

    Raises ValueError for names that type_checker does not recognise, and
    KeyError if a neuroblast/tract code or a region cannot be looked up.
    """
    name_type = type_checker(shortname)
    parts = shortname_splitter(shortname)

    if name_type == 'TI':
        definition = ("Adult neuron belonging to group %s of the terra incognita neurons "
                      "with substantial synapsing in the %s (Scheffer et al., 2020)."
                      % (parts[1], neuropil_writer(parts[0])))
    elif name_type == 'multi':
        definition = ("Adult multiglomerular antennal lobe projection neuron belonging to group %s "
                      "(Scheffer et al., 2020). It develops from neuroblast %s and follows "
                      "the %s (Bates et al., 2020; Scheffer et al., 2020)."
                      % (parts[4], neuroblasts['name'][parts[1]], tracts['name'][parts[3]]))
    else:
        raise ValueError("Could not make definition for " + shortname)

    # connectivity
    # only the table lookup is guarded: a type with no connectivity rows is expected,
    # whereas a region that cannot be named should not be silently skipped
    try:
        type_connectivity = type_connectivity_table.loc[shortname]
    except KeyError:
        return definition

    for column, site_type in (('post', 'postsynaptic'), ('pre', 'presynaptic')):
        over_threshold = (type_connectivity[column] >= connectivity_threshold).to_numpy()
        region_names = [neuropil_writer(region)
                        for region in type_connectivity.index[over_threshold]]
        if region_names:
            definition += (" It has %s sites in the %s (Scheffer et al., 2020)."
                           % (site_type, name_lister(region_names)))

    return definition
    

In [None]:
# Make a dictionary with key - column header & value = template specification (first row of table).
# Each column is listed exactly once ('inputs' used to be given twice, with the first
# specification silently overwritten by the later one that has SPLIT=|).
template_seed = OrderedDict([
    ('ID', 'ID'),
    ('CLASS_TYPE', 'CLASS_TYPE'),
    ('TYPE', 'TYPE'),

    # fields for obo ID and namespace
    ("obo_id", "A oboInOwl:id"),
    ("obo_namespace", "A oboInOwl:hasOBONamespace"),

    # label, definition, creation:
    ("label", "A rdfs:label"),
    ("definition", "A IAO:0000115"),
    ("Xref_def", ">A oboInOwl:hasDbXref SPLIT=|"),
    ("created_by", "AI dc:contributor"),
    ("creation_date", "AT dc:date^^xsd:dateTime"),

    # synonyms, comment:
    ("synonym", "A oboInOwl:hasExactSynonym"),
    ("syn_ref", ">A oboInOwl:hasDbXref"),
    ("additional_synonym", "A oboInOwl:hasRelatedSynonym"),
    ("additional_synonym_ref", ">A oboInOwl:hasDbXref"),
    ("comment", "A rdfs:comment"),

    # Columns for relationships:
    ("synapses", "SC 'has synaptic IO in region' some %"),
    ("inputs", "SC 'receives synaptic input in region' some % SPLIT=|"),
    ("parent", "SC % SPLIT=|"),
    ("neuroblast", "SC 'develops from' some %"),
    ("tract", "SC 'fasciculates with' some %"),
    ("hemilineage", "SC %"),
    ("outputs", "SC 'sends synaptic output to region' some % SPLIT=|"),
])

# Create dataFrame for template (single row holding the template specification strings)
template = pd.DataFrame.from_records([template_seed])

#template

In [None]:
count = 0 # number of cell type rows generated
id_mapping = {} # dictionary of ids for types
new_rows = [] # rows are collected here and added to template in one go after the loop

for i in cell_types.index:

    # new template row: every template column present, blank by default
    row_od = OrderedDict((c, "") for c in template.columns)

    np_type = cell_types['np_type'][i]
    # classify and split the name once (both raise ValueError for unrecognised names)
    name_type = type_checker(np_type)
    name_parts = shortname_splitter(np_type)

    Parents_list = []
    if cell_types.asserted_parents[i]:
        Parents_list.extend(cell_types.asserted_parents[i].split('|'))

    #these are the same in each row
    row_od["CLASS_TYPE"] = "subclass"
    row_od["TYPE"] = "owl:Class"
    row_od["created_by"] = "http://orcid.org/0000-0002-1373-1705"
    row_od["creation_date"] = cell_types['date'][i]
    row_od["comment"] = ("Uncharacterized putative cell type (based on clustering analysis) "
                         "from Janelia hemibrain data (Scheffer et al., 2020).")
    row_od['obo_namespace'] = "fly_anatomy.ontology"

    #easy to generate data
    row_od["ID"] = cell_types['FBbt_id'][i]
    row_od['obo_id'] = cell_types['FBbt_id'][i]
    row_od["synonym"] = "adult %s neuron" % np_type
    row_od["syn_ref"] = cell_types['ref'][i]
    row_od["additional_synonym"] = cell_types['synonym'][i]
    row_od["additional_synonym_ref"] = cell_types['synonym_ref'][i]
    row_od["label"] = label_maker(np_type)
    row_od["definition"] = definition_maker(np_type)
    row_od["Xref_def"] = cell_types['ref'][i]

    # conditional
    if name_type == 'TI':
        Parents_list.append("FBbt:00047095") # adult neuron
        # region named in the first part of the type name
        row_od["synapses"] = str(list(
            tidy_roi_mapping[tidy_roi_mapping['ROI'] == name_parts[0]]['FBbt_id'])[0])

    if name_type == 'multi':
        Parents_list.append("FBbt:00007441") # adult multiglomerular antennal lobe projection neuron
        row_od["neuroblast"] = neuroblasts['ID'][name_parts[1]]
        row_od["tract"] = tracts['ID'][name_parts[3]]
        row_od["inputs"] = str(list(tidy_roi_mapping[tidy_roi_mapping['ROI'] == 'AL']['FBbt_id'])[0])
        row_od["Xref_def"] += "|FlyBase:FBrf0246460"

    # hemilineage is inferred from the neuroblast name written into the definition
    if 'Notch OFF' in row_od["definition"]:
        row_od["hemilineage"] = 'FBbt:00049540'
    elif 'Notch ON' in row_od["definition"]:
        row_od["hemilineage"] = 'FBbt:00049539'

    if np_type in type_connectivity_table.index:
        row_od["comment"] += (" Connectivity based on Hemibrain v1.2.1 data where each individual of this type "
                              "has at least %s synapses in a region." % connectivity_threshold)

    # connectivity
    # only the table lookup is guarded: types without connectivity rows are skipped
    try:
        type_connectivity = type_connectivity_table.loc[np_type]
    except KeyError:
        type_connectivity = None

    if type_connectivity is not None:
        regions = type_connectivity.index
        postsynapses = list(regions[(type_connectivity['post'] >= connectivity_threshold).to_numpy()])
        if postsynapses:
            # NOTE(review): this replaces the AL input set above for multiglomerular PNs
            # - confirm that is intended
            row_od["inputs"] = '|'.join(postsynapses)
        presynapses = list(regions[(type_connectivity['pre'] >= connectivity_threshold).to_numpy()])
        if presynapses:
            row_od["outputs"] = '|'.join(presynapses)

    row_od["parent"] = '|'.join(Parents_list)

    new_rows.append(row_od)
    count += 1

# add all new rows to template with a single concat
# (concatenating inside the loop copies the growing table on every iteration)
if new_rows:
    template = pd.concat([template, pd.DataFrame.from_records(new_rows)],
                         ignore_index=True, sort=False)
    
    
#template.head()

In [None]:
# add rows to specify TYPE for other classes and OPs
# this is a workaround for a robot bug - https://github.com/ontodev/robot/issues/1105
import re

# function to get curies
def extract_uris(text):
    """
    Returns a list of all FBbt, GO, PATO and RO CURIEs (prefix:digits) found in
    `text`, in order of appearance (repeats are kept).
    """
    curie_pattern = re.compile(r'\b(?:FBbt|GO|PATO|RO):\d+\b')
    return curie_pattern.findall(text)


In [None]:

# collect every CURIE mentioned in any column other than 'ID'
unique_curies = set()
for column in template.columns:
    if column == 'ID':
        continue
    for cell in template[column]:
        unique_curies.update(extract_uris(cell))

# sorted so that the rows added for these CURIEs come out in the same order on
# every run (iteration order of a set of strings is not stable between sessions)
curies = sorted(unique_curies)

#curies

In [None]:
# make a row for each CURIE, specifying only its ID and TYPE
type_rows = []
for x in curies:

    # all columns are set to "" to avoid awkward NaNs later
    row = OrderedDict((c, "") for c in template.columns)

    row['ID'] = x
    # RO terms are relations; everything else is declared as a class
    row['TYPE'] = 'owl:ObjectProperty' if x.startswith('RO:') else 'owl:Class'

    type_rows.append(row)

# join all new rows to template with a single concat rather than one concat per row
if type_rows:
    template = pd.concat([template, pd.DataFrame.from_records(type_rows)], ignore_index=True)

#template.head()

In [None]:
# write the finished template as a tab-separated file (column names kept, row index omitted)
template.to_csv("./template.tsv", sep="\t", index=False)