This tutorial provides code to load a NeuroArch database with the FlyWire dataset, Snapshot 630. Requirements before running the notebook:
- Install [NeuroArch](https://github.com/fruitflybrain/neuroarch), [OrientDB Community Version](https://www.orientdb.org/download), and [pyorient](https://github.com/fruitflybrain/pyorient). The [NeuroNLP Docker image](https://hub.docker.com/r/fruitflybrain/neuronlp) and [FlyBrainLab Docker image](https://hub.docker.com/r/fruitflybrain/fbl) both come with these software requirements preinstalled.
- Download the [FlyWire public data](https://codex.flywire.ai/api/download) and unzip it in the same directory as this notebook.
- Have more than 10 GB of free disk space (for the FlyWire data and the NeuroArch database).

A backup of the database created by this notebook can be downloaded [here](https://drive.google.com/file/d/1S_6qRe7lBIC8vzZvsFAM59ScaMVwx7as/view?usp=drive_link). To restore it in OrientDB, run
```
/path/to/orientdb/bin/console.sh "create database plocal:../databases/flywire admin admin; restore database /path/to/flywire630_na_v1.0.0_backup.zip"
```

In [None]:
import glob
import os
import subprocess
import csv
import json
import warnings
from requests import HTTPError

import numpy as np
import pandas as pd
from tqdm import tqdm
import h5py
import re
import neuroarch.na as na
import pymeshlab as ml


## Define Brain Region
First, define all brain regions in the FlyWire data and assign each of them as a subsystem, neuropil, or subregion.

In [None]:
# Brain-region table keyed by region name.  Each entry records:
#   'System'       - name of the subsystem the region belongs to
#   'Neuropil'     - neuropil name, or None when the entry is a pure subsystem
#   'Subregions'   - always None in this table
#   'FullName'     - human-readable name
#   'Abbreviation' - region name without the _R/_L side suffix
# Entries whose 'Neuropil' equals their 'System' (e.g. LH_R, AL_R, GNG) act as
# both a subsystem and a neuropil in the loading cells further down.
# Insertion order is preserved and determines the order in which regions are
# added to the database.  'PENP' is listed once (it has no side suffix).
all_brain_regions = {
    'OL_R': {'System': 'OL_R', 'Neuropil': None, 'Subregions': None, 'FullName': 'Optic Lobe', 'Abbreviation': 'OL'},
    'AME_R': {'System': 'OL_R', 'Neuropil': 'AME_R', 'Subregions': None, 'FullName': 'Accessory Medulla', 'Abbreviation': 'AME'},
    'LA_R': {'System': 'OL_R', 'Neuropil': 'LA_R', 'Subregions': None, 'FullName': 'Lamina', 'Abbreviation': 'LA'},
    'LO_R': {'System': 'OL_R', 'Neuropil': 'LO_R', 'Subregions': None, 'FullName': 'Lobula', 'Abbreviation': 'LO'},
    'LOP_R': {'System': 'OL_R', 'Neuropil': 'LOP_R', 'Subregions': None, 'FullName': 'Lobula Plate', 'Abbreviation': 'LOP'},
    'ME_R': {'System': 'OL_R', 'Neuropil': 'ME_R', 'Subregions': None, 'FullName': 'Medulla', 'Abbreviation': 'ME'},
    'LC_R': {'System': 'LC_R', 'Neuropil': None, 'Subregions': None, 'FullName': 'Lateral Complex', 'Abbreviation': 'LC'},
    'BU_R': {'System': 'LC_R', 'Neuropil': 'BU_R', 'Subregions': None, 'FullName': 'Bulb', 'Abbreviation': 'BU'},
    'LAL_R': {'System': 'LC_R', 'Neuropil': 'LAL_R', 'Subregions': None, 'FullName': 'Lateral Accessory Lobe', 'Abbreviation': 'LAL'},
    'GA_R': {'System': 'LC_R', 'Neuropil': 'GA_R', 'Subregions': None, 'FullName': 'Gall', 'Abbreviation': 'GA'},
    'LH_R': {'System': 'LH_R', 'Neuropil': 'LH_R', 'Subregions': None, 'FullName': 'Lateral Horn', 'Abbreviation': 'LH'},
    'PENP': {'System': 'PENP', 'Neuropil': None, 'Subregions': None, 'FullName': 'Periesophageal Neuropils', 'Abbreviation': 'PENP'},
    'CAN_R': {'System': 'PENP', 'Neuropil': 'CAN_R', 'Subregions': None, 'FullName': 'Cantle', 'Abbreviation': 'CAN'},
    'AMMC_R': {'System': 'PENP', 'Neuropil': 'AMMC_R', 'Subregions': None, 'FullName': 'Antennal Mechanosensory and Motor Center', 'Abbreviation': 'AMMC'},
    'FLA_R': {'System': 'PENP', 'Neuropil': 'FLA_R', 'Subregions': None, 'FullName': 'Flange', 'Abbreviation': 'FLA'},
    'INP_R': {'System': 'INP_R', 'Neuropil': None, 'Subregions': None, 'FullName': 'Inferior Neuropils', 'Abbreviation': 'INP'},
    'ICL_R': {'System': 'INP_R', 'Neuropil': 'ICL_R', 'Subregions': None, 'FullName': 'Inferior Clamp', 'Abbreviation': 'ICL'},
    'IB_R': {'System': 'INP_R', 'Neuropil': 'IB_R', 'Subregions': None, 'FullName': 'Inferior Bridge', 'Abbreviation': 'IB'},
    'ATL_R': {'System': 'INP_R', 'Neuropil': 'ATL_R', 'Subregions': None, 'FullName': 'Antler', 'Abbreviation': 'ATL'},
    'CRE_R': {'System': 'INP_R', 'Neuropil': 'CRE_R', 'Subregions': None, 'FullName': 'Crepine', 'Abbreviation': 'CRE'},
    'SCL_R': {'System': 'INP_R', 'Neuropil': 'SCL_R', 'Subregions': None, 'FullName': 'Superior Clamp', 'Abbreviation': 'SCL'},
    'VMNP_R': {'System': 'VMNP_R', 'Neuropil': None, 'Subregions': None, 'FullName': 'Ventromedial Neuropils', 'Abbreviation': 'VMNP'},
    'VES_R': {'System': 'VMNP_R', 'Neuropil': 'VES_R', 'Subregions': None, 'FullName': 'Vest', 'Abbreviation': 'VES'},
    'GOR_R': {'System': 'VMNP_R', 'Neuropil': 'GOR_R', 'Subregions': None, 'FullName': 'Gorget', 'Abbreviation': 'GOR'},
    'SPS_R': {'System': 'VMNP_R', 'Neuropil': 'SPS_R', 'Subregions': None, 'FullName': 'Superior Posterior Slope', 'Abbreviation': 'SPS'},
    'IPS_R': {'System': 'VMNP_R', 'Neuropil': 'IPS_R', 'Subregions': None, 'FullName': 'Inferior Posterior Slope', 'Abbreviation': 'IPS'},
    'EPA_R': {'System': 'VMNP_R', 'Neuropil': 'EPA_R', 'Subregions': None, 'FullName': 'Epaulette', 'Abbreviation': 'EPA'},
    'MB_R': {'System': 'MB_R', 'Neuropil': None, 'Subregions': None, 'FullName': 'Mushroom Body', 'Abbreviation': 'MB'},
    'PED_R': {'System': 'MB_R', 'Neuropil': 'PED_R', 'Subregions': None, 'FullName': 'Pedunculus', 'Abbreviation': 'PED'},
    'VL_R': {'System': 'MB_R', 'Neuropil': 'VL_R', 'Subregions': None, 'FullName': 'Vertical Lobe', 'Abbreviation': 'VL'},
    'ML_R': {'System': 'MB_R', 'Neuropil': 'ML_R', 'Subregions': None, 'FullName': 'Medial Lobe', 'Abbreviation': 'ML'},
    'CA_R': {'System': 'MB_R', 'Neuropil': 'CA_R', 'Subregions': None, 'FullName': 'Calyx', 'Abbreviation': 'CA'},
    'AL_R': {'System': 'AL_R', 'Neuropil': 'AL_R', 'Subregions': None, 'FullName': 'Antennal Lobe', 'Abbreviation': 'AL'},
    'SNP_R': {'System': 'SNP_R', 'Neuropil': None, 'Subregions': None, 'FullName': 'Superior Neuropils', 'Abbreviation': 'SNP'},
    'SLP_R': {'System': 'SNP_R', 'Neuropil': 'SLP_R', 'Subregions': None, 'FullName': 'Superior Lateral Protocerebrum', 'Abbreviation': 'SLP'},
    'SIP_R': {'System': 'SNP_R', 'Neuropil': 'SIP_R', 'Subregions': None, 'FullName': 'Superior Intermediate Protocerebrum', 'Abbreviation': 'SIP'},
    'SMP_R': {'System': 'SNP_R', 'Neuropil': 'SMP_R', 'Subregions': None, 'FullName': 'Superior Medial Protocerebrum', 'Abbreviation': 'SMP'},
    'VLNP_R': {'System': 'VLNP_R', 'Neuropil': None, 'Subregions': None, 'FullName': 'Ventrolateral Neuropils', 'Abbreviation': 'VLNP'},
    'AVLP_R': {'System': 'VLNP_R', 'Neuropil': 'AVLP_R', 'Subregions': None, 'FullName': 'Anterior Ventrolateral Protocerebrum', 'Abbreviation': 'AVLP'},
    'PVLP_R': {'System': 'VLNP_R', 'Neuropil': 'PVLP_R', 'Subregions': None, 'FullName': 'Posterior Ventrolateral Protocerebrum', 'Abbreviation': 'PVLP'},
    'WED_R': {'System': 'VLNP_R', 'Neuropil': 'WED_R', 'Subregions': None, 'FullName': 'Wedge', 'Abbreviation': 'WED'},
    'PLP_R': {'System': 'VLNP_R', 'Neuropil': 'PLP_R', 'Subregions': None, 'FullName': 'Posterior Lateral Protocerebrum', 'Abbreviation': 'PLP'},
    'AOTU_R': {'System': 'VLNP_R', 'Neuropil': 'AOTU_R', 'Subregions': None, 'FullName': 'Anterior Optic Tubercle', 'Abbreviation': 'AOTU'},
    'OL_L': {'System': 'OL_L', 'Neuropil': None, 'Subregions': None, 'FullName': 'Optic Lobe', 'Abbreviation': 'OL'},
    'AME_L': {'System': 'OL_L', 'Neuropil': 'AME_L', 'Subregions': None, 'FullName': 'Accessory Medulla', 'Abbreviation': 'AME'},
    'LA_L': {'System': 'OL_L', 'Neuropil': 'LA_L', 'Subregions': None, 'FullName': 'Lamina', 'Abbreviation': 'LA'},
    'LO_L': {'System': 'OL_L', 'Neuropil': 'LO_L', 'Subregions': None, 'FullName': 'Lobula', 'Abbreviation': 'LO'},
    'LOP_L': {'System': 'OL_L', 'Neuropil': 'LOP_L', 'Subregions': None, 'FullName': 'Lobula Plate', 'Abbreviation': 'LOP'},
    'ME_L': {'System': 'OL_L', 'Neuropil': 'ME_L', 'Subregions': None, 'FullName': 'Medulla', 'Abbreviation': 'ME'},
    'LC_L': {'System': 'LC_L', 'Neuropil': None, 'Subregions': None, 'FullName': 'Lateral Complex', 'Abbreviation': 'LC'},
    'BU_L': {'System': 'LC_L', 'Neuropil': 'BU_L', 'Subregions': None, 'FullName': 'Bulb', 'Abbreviation': 'BU'},
    'LAL_L': {'System': 'LC_L', 'Neuropil': 'LAL_L', 'Subregions': None, 'FullName': 'Lateral Accessory Lobe', 'Abbreviation': 'LAL'},
    'GA_L': {'System': 'LC_L', 'Neuropil': 'GA_L', 'Subregions': None, 'FullName': 'Gall', 'Abbreviation': 'GA'},
    'LH_L': {'System': 'LH_L', 'Neuropil': 'LH_L', 'Subregions': None, 'FullName': 'Lateral Horn', 'Abbreviation': 'LH'},
    'CAN_L': {'System': 'PENP', 'Neuropil': 'CAN_L', 'Subregions': None, 'FullName': 'Cantle', 'Abbreviation': 'CAN'},
    'AMMC_L': {'System': 'PENP', 'Neuropil': 'AMMC_L', 'Subregions': None, 'FullName': 'Antennal Mechanosensory and Motor Center', 'Abbreviation': 'AMMC'},
    'FLA_L': {'System': 'PENP', 'Neuropil': 'FLA_L', 'Subregions': None, 'FullName': 'Flange', 'Abbreviation': 'FLA'},
    'INP_L': {'System': 'INP_L', 'Neuropil': None, 'Subregions': None, 'FullName': 'Inferior Neuropils', 'Abbreviation': 'INP'},
    'ICL_L': {'System': 'INP_L', 'Neuropil': 'ICL_L', 'Subregions': None, 'FullName': 'Inferior Clamp', 'Abbreviation': 'ICL'},
    'IB_L': {'System': 'INP_L', 'Neuropil': 'IB_L', 'Subregions': None, 'FullName': 'Inferior Bridge', 'Abbreviation': 'IB'},
    'ATL_L': {'System': 'INP_L', 'Neuropil': 'ATL_L', 'Subregions': None, 'FullName': 'Antler', 'Abbreviation': 'ATL'},
    'CRE_L': {'System': 'INP_L', 'Neuropil': 'CRE_L', 'Subregions': None, 'FullName': 'Crepine', 'Abbreviation': 'CRE'},
    'SCL_L': {'System': 'INP_L', 'Neuropil': 'SCL_L', 'Subregions': None, 'FullName': 'Superior Clamp', 'Abbreviation': 'SCL'},
    'VMNP_L': {'System': 'VMNP_L', 'Neuropil': None, 'Subregions': None, 'FullName': 'Ventromedial Neuropils', 'Abbreviation': 'VMNP'},
    'VES_L': {'System': 'VMNP_L', 'Neuropil': 'VES_L', 'Subregions': None, 'FullName': 'Vest', 'Abbreviation': 'VES'},
    'GOR_L': {'System': 'VMNP_L', 'Neuropil': 'GOR_L', 'Subregions': None, 'FullName': 'Gorget', 'Abbreviation': 'GOR'},
    'SPS_L': {'System': 'VMNP_L', 'Neuropil': 'SPS_L', 'Subregions': None, 'FullName': 'Superior Posterior Slope', 'Abbreviation': 'SPS'},
    'IPS_L': {'System': 'VMNP_L', 'Neuropil': 'IPS_L', 'Subregions': None, 'FullName': 'Inferior Posterior Slope', 'Abbreviation': 'IPS'},
    'EPA_L': {'System': 'VMNP_L', 'Neuropil': 'EPA_L', 'Subregions': None, 'FullName': 'Epaulette', 'Abbreviation': 'EPA'},
    'MB_L': {'System': 'MB_L', 'Neuropil': None, 'Subregions': None, 'FullName': 'Mushroom Body', 'Abbreviation': 'MB'},
    'PED_L': {'System': 'MB_L', 'Neuropil': 'PED_L', 'Subregions': None, 'FullName': 'Pedunculus', 'Abbreviation': 'PED'},
    'VL_L': {'System': 'MB_L', 'Neuropil': 'VL_L', 'Subregions': None, 'FullName': 'Vertical Lobe', 'Abbreviation': 'VL'},
    'ML_L': {'System': 'MB_L', 'Neuropil': 'ML_L', 'Subregions': None, 'FullName': 'Medial Lobe', 'Abbreviation': 'ML'},
    'CA_L': {'System': 'MB_L', 'Neuropil': 'CA_L', 'Subregions': None, 'FullName': 'Calyx', 'Abbreviation': 'CA'},
    'AL_L': {'System': 'AL_L', 'Neuropil': 'AL_L', 'Subregions': None, 'FullName': 'Antennal Lobe', 'Abbreviation': 'AL'},
    'SNP_L': {'System': 'SNP_L', 'Neuropil': None, 'Subregions': None, 'FullName': 'Superior Neuropils', 'Abbreviation': 'SNP'},
    'SLP_L': {'System': 'SNP_L', 'Neuropil': 'SLP_L', 'Subregions': None, 'FullName': 'Superior Lateral Protocerebrum', 'Abbreviation': 'SLP'},
    'SIP_L': {'System': 'SNP_L', 'Neuropil': 'SIP_L', 'Subregions': None, 'FullName': 'Superior Intermediate Protocerebrum', 'Abbreviation': 'SIP'},
    'SMP_L': {'System': 'SNP_L', 'Neuropil': 'SMP_L', 'Subregions': None, 'FullName': 'Superior Medial Protocerebrum', 'Abbreviation': 'SMP'},
    'VLNP_L': {'System': 'VLNP_L', 'Neuropil': None, 'Subregions': None, 'FullName': 'Ventrolateral Neuropils', 'Abbreviation': 'VLNP'},
    'AVLP_L': {'System': 'VLNP_L', 'Neuropil': 'AVLP_L', 'Subregions': None, 'FullName': 'Anterior Ventrolateral Protocerebrum', 'Abbreviation': 'AVLP'},
    'PVLP_L': {'System': 'VLNP_L', 'Neuropil': 'PVLP_L', 'Subregions': None, 'FullName': 'Posterior Ventrolateral Protocerebrum', 'Abbreviation': 'PVLP'},
    'WED_L': {'System': 'VLNP_L', 'Neuropil': 'WED_L', 'Subregions': None, 'FullName': 'Wedge', 'Abbreviation': 'WED'},
    'PLP_L': {'System': 'VLNP_L', 'Neuropil': 'PLP_L', 'Subregions': None, 'FullName': 'Posterior Lateral Protocerebrum', 'Abbreviation': 'PLP'},
    'AOTU_L': {'System': 'VLNP_L', 'Neuropil': 'AOTU_L', 'Subregions': None, 'FullName': 'Anterior Optic Tubercle', 'Abbreviation': 'AOTU'},
    'CX': {'System': 'CX', 'Neuropil': None, 'Subregions': None, 'FullName': 'Central Complex', 'Abbreviation': 'CX'},
    'NO': {'System': 'CX', 'Neuropil': 'NO', 'Subregions': None, 'FullName': 'Noduli', 'Abbreviation': 'NO'},
    'PB': {'System': 'CX', 'Neuropil': 'PB', 'Subregions': None, 'FullName': 'Protocerebral Bridge', 'Abbreviation': 'PB'},
    'EB': {'System': 'CX', 'Neuropil': 'EB', 'Subregions': None, 'FullName': 'Ellipsoid Body', 'Abbreviation': 'EB'},
    'FB': {'System': 'CX', 'Neuropil': 'FB', 'Subregions': None, 'FullName': 'Fan-shaped Body', 'Abbreviation': 'FB'},
    'SAD': {'System': 'PENP', 'Neuropil': 'SAD', 'Subregions': None, 'FullName': 'Saddle', 'Abbreviation': 'SAD'},
    'PRW': {'System': 'PENP', 'Neuropil': 'PRW', 'Subregions': None, 'FullName': 'Prow', 'Abbreviation': 'PRW'},
    'GNG': {'System': 'GNG', 'Neuropil': 'GNG', 'Subregions': None, 'FullName': 'Gnathal Ganglia', 'Abbreviation': 'GNG'},
    'OCG': {'System': 'OCG', 'Neuropil': 'OCG', 'Subregions': None, 'FullName': 'Ocelli', 'Abbreviation': 'OCG'},
}


In [None]:
# Neuron table from the FlyWire download.  Later cells iterate over its rows
# and read the 'root_id', 'nt_type' and 'nt_type_score' columns.
df_neurons = pd.read_csv('neurons.csv')

In [None]:
# Synapse coordinate table.  Everything is read as text, blank cells are
# forward-filled from the row above, and the result is cast to int.
# NOTE(review): presumably the CSV only writes the pre/post root ids on the
# first row of each connection, which is why the forward fill is needed --
# confirm against the file.
# `.ffill()` replaces the deprecated `fillna(method='ffill')` spelling.
df_synapses = pd.read_csv('synapse_coordinates.csv', dtype=str).ffill().astype(int)

In [None]:
# Community labels indexed by root_id.  A root_id may appear on several rows;
# a later cell groups by the index and joins the 'label' values.
df_labels = pd.read_csv('labels.csv').set_index('root_id')

In [None]:
# Coordinates table indexed by root_id.  It is loaded here but not referenced
# by any of the other cells shown in this notebook.
df_coordinates = pd.read_csv('coordinates.csv').set_index('root_id')

In [None]:
# Per-neuron synapse counts by neuropil, indexed by root_id.
df_neuropil = pd.read_csv('neuropil_synapse_table.csv').set_index('root_id')
# Remove every literal 'MB_' from the column names.
# NOTE(review): presumably this turns mushroom-body columns such as
# '... in MB_CA_R' into '... in CA_R' so they match the region keys used
# elsewhere in this notebook -- confirm against the CSV header.
df_neuropil.columns = df_neuropil.columns.str.replace('MB_', '', regex=False)

In [None]:
# Classification table indexed by root_id.  Later cells read its 'cell_type',
# 'hemibrain_type', 'hemilineage' and 'side' columns and add a 'label' column.
df_classification = pd.read_csv('classification.csv').set_index('root_id')

In [None]:
# Collapse all community labels of a neuron into one ';'-separated string and
# store it in df_classification['label'].  Greek letters are transliterated
# (α→a, β→b, γ→g) and non-breaking spaces, tabs and newlines are removed.
label_cleanup = str.maketrans({'α': 'a', 'β': 'b', 'γ': 'g',
                               '\xa0': None, '\t': None, '\n': None})
for root_id, group in df_labels.groupby(level=0):
    joined = ';'.join(group['label'].tolist())
    df_classification.loc[root_id, 'label'] = joined.translate(label_cleanup)

Step missing here to combine label (which can have several entries) with classification

In [None]:
# Display the classification table, now including the merged 'label' column.
df_classification

In [None]:
# Keep only the columns that the name-resolution cells below rely on.
name_columns = ['cell_type', 'hemibrain_type', 'side', 'label']
data_extracted = df_classification[name_columns]

In [None]:
# Load the FBbt ontology graph.  The encoding is given explicitly because
# JSON is UTF-8 and the platform default encoding may not be.
with open('fbbt-full.json', 'r', encoding='utf-8') as file:
    fbbt_data = json.load(file)
    
def get_synonyms(node):
    """Return the synonym strings of an ontology node.

    Parameters
    ----------
    node : dict
        A node dict; synonyms are read from ``node['meta']['synonyms']``,
        each of which must carry a ``'val'`` entry.

    Returns
    -------
    list of str or None
        The ``'val'`` of every synonym, or ``None`` when the node has no
        ``'meta'`` section or its ``'meta'`` has no ``'synonyms'`` entry.
    """
    # A node without 'meta' is treated like a node without synonyms instead
    # of raising KeyError.
    meta = node.get('meta', {})
    if 'synonyms' not in meta:
        return None
    return [synonym['val'] for synonym in meta['synonyms']]

# Map each FBbt number (the part of the node id after the last underscore) to
# a list holding the node label followed by its synonyms, if any.
fbbt_info = {}
for node in fbbt_data['graphs'][0]['nodes']:
    if not node['id'].startswith('http://purl.obolibrary.org/obo/FBbt_'):
        continue
    if 'lbl' not in node:
        continue
    fbbt_number = node['id'].rsplit('_', 1)[-1]
    fbbt_info[fbbt_number] = [node['lbl']] + (get_synonyms(node) or [])

In [None]:
def extract_fbbt_from_label(label):
    """Return the first usable FBbt number mentioned in ``label``.

    An 8-digit number following ``FBbt`` and one of ``:``, ``_`` or a space
    (case-insensitive) is usable when it is a key of ``fbbt_info`` and its
    entry there holds more than one name.

    Returns ``None`` when ``label`` is not a string or no number qualifies.
    """
    if not isinstance(label, str):
        return None

    for number in re.findall(r'FBbt[:_ ](\d{8})', label, flags=re.IGNORECASE):
        known_names = fbbt_info.get(number)
        if known_names is not None and len(known_names) > 1:
            return number
    return None

# One-column frame ('FBbt') aligned with data_extracted: the FBbt number found
# in each neuron's label string, or None.
FBbt_df = data_extracted['label'].apply(extract_fbbt_from_label).to_frame('FBbt')
FBbt_df.head()

In [None]:
# Words removed from labels because they name a neurotransmitter.
_NEUROTRANSMITTER_WORDS = ('acetylcholine', 'dopamine', 'serotonin', 'glutamate', 'GABA',
                           'glycine', 'histamine', 'octopamine', 'tyramine')

# Patterns deleted, in this order, from every candidate label.
_LABEL_NOISE_PATTERNS = tuple(
    re.compile(pattern, flags)
    for pattern, flags in (
        # literature citations
        (r'\(\s*[A-Za-z\s]+,\s*[A-Za-z\s]+(\set\s*al\.)*,\s*\d{4}', 0),
        (r'[A-Za-z., ]+\(\d{4}\)', 0),
        (r'\b[A-Za-z\s]+,\s\d{4}\b', re.IGNORECASE),
        (r'\b\w+,\s*\w+\.\s*\(\d{4}\)\b', re.IGNORECASE),
        # side markers and FBbt ids
        (r'_L\b', re.IGNORECASE),
        (r'_R\b', re.IGNORECASE),
        (r'FBbt[:_ ]\d{8}', re.IGNORECASE),
        (r'-RHS|-LHS', re.IGNORECASE),
        (r'_left|_right', re.IGNORECASE),
    )
) + tuple(
    re.compile(r'\b{}\b'.format(word), re.IGNORECASE)
    for word in _NEUROTRANSMITTER_WORDS
) + tuple(
    re.compile(pattern)
    for pattern in (
        # second pass over citations, then unterminated "(e.g. ..." remarks
        r'[A-Za-z., ]+\(\d{4}\)',
        r'\b[A-Za-z\s]+(,|and|\s)*[A-Za-z\s]+(,|et al\.)*\s*\d{4}\b',
        r'\(e\.g\.\s*[^)]*',
    )
)

# Greek letters are transliterated; non-breaking spaces, tabs and newlines
# are dropped.
_LABEL_CHAR_MAP = str.maketrans({'α': 'a', 'β': 'b', 'γ': 'g',
                                 '\xa0': None, '\t': None, '\n': None})


def extract_labels(label):
    """Split a free-text label string into cleaned, unique candidate names.

    Parameters
    ----------
    label : str or any
        The ';'-joined community label text of one neuron.  Anything that is
        not a string (e.g. NaN) yields an empty list.

    Returns
    -------
    list of str
        Unique candidate names in order of first appearance.  The order is
        deterministic, so callers that break ties by position give the same
        result on every run.
    """
    if not isinstance(label, str):
        return []

    # Drop parenthesised and bracketed remarks before splitting.
    label = re.sub(r'\(.*?\)', '', label)
    label = re.sub(r'\[.*?\]', '', label)

    pieces = [piece.strip() for piece in re.split(r';| - ', label)]
    pieces = [
        piece for piece in pieces
        if piece
        and len(piece) < 100
        and not piece.lower().startswith('see')
        and not piece.startswith('http')
    ]

    new_labels = []
    for piece in pieces:
        for noise in _LABEL_NOISE_PATTERNS:
            piece = noise.sub('', piece)
        piece = piece.strip()
        if not piece:
            continue
        for part in piece.split(','):
            part = part.strip()
            if part:
                new_labels.append(part.translate(_LABEL_CHAR_MAP))

    # Discard bare side words and sentence-like or reference-like fragments.
    new_labels = [
        n for n in new_labels
        if n.lower() not in ['left', 'right', 'center']
        and ' is ' not in n
        and 'doi:' not in n
        and 'Catmaid ID' not in n
    ]

    # "key: value" and "a /b" style entries contribute each part separately.
    split_labels = [
        part.strip()
        for n in new_labels
        for part in re.split(r':| /', n)
        if part.strip()
    ]

    # dict.fromkeys removes duplicates while keeping first-seen order.
    return list(dict.fromkeys(split_labels))

# root_id -> list of cleaned candidate names parsed from that neuron's label.
labels_dict = data_extracted['label'].map(extract_labels).to_dict()


In [None]:
def _preferred_candidates(names):
    """Return the names ending in a digit, or all of ``names`` if none do."""
    numbered = [name for name in names if name[-1].isdigit()]
    return numbered if numbered else names


def _merge_candidates(candidates, hemibrain_names, final_name, other_names, can_be_used):
    """Fold one source of candidate names into the running naming state.

    ``other_names`` and ``can_be_used`` are extended in place; the possibly
    updated ``final_name`` is returned.
    """
    if not hemibrain_names:
        if final_name is None:
            final_name = min(candidates, key=len)
        other_names.extend(candidates)
        return final_name

    # Walk hemibrain_names in order (rather than iterating a set) so that the
    # shortest-name tie-break below is the same on every run.
    candidate_set = set(candidates)
    shared = [name for name in hemibrain_names if name in candidate_set]
    if shared:
        if final_name is None:
            final_name = min(shared, key=len)
        other_names.extend(hemibrain_names)
    elif final_name is None:
        # No agreement: remember the hemibrain names as a fallback.
        can_be_used.extend(hemibrain_names)
    else:
        other_names.extend(hemibrain_names)
    other_names.extend(candidates)
    return final_name


def get_name(cell_type, fbbt, hemibrain_name, labels):
    """Choose a primary name for a neuron and collect its alternative names.

    Sources are consulted in this order: ``cell_type``, the names stored in
    ``fbbt_info`` for ``fbbt``, the community ``labels``, and finally the
    hemibrain type(s).  Within a source, names ending in a digit are
    preferred, a name shared with the hemibrain types wins, and the shortest
    name is picked among equals.

    Parameters
    ----------
    cell_type : str or any
        Used directly as the name when it is a string.
    fbbt : str or any
        FBbt number; consulted only when it is a string key of ``fbbt_info``.
    hemibrain_name : str or any
        Comma-separated hemibrain types; ignored when not a string.
    labels : list of str
        Candidate names parsed from community labels.

    Returns
    -------
    tuple
        ``(final_name, other_names)`` where ``final_name`` is a string or
        ``None`` and ``other_names`` is a list that may contain duplicates.
    """
    hemibrain_names = hemibrain_name.split(',') if isinstance(hemibrain_name, str) else []
    final_name = cell_type if isinstance(cell_type, str) else None
    other_names = []
    can_be_used = []

    if isinstance(fbbt, str) and fbbt in fbbt_info:
        final_name = _merge_candidates(_preferred_candidates(fbbt_info[fbbt]),
                                       hemibrain_names, final_name,
                                       other_names, can_be_used)

    if len(labels):
        # Prefer labels without a backtick; fall back to all of them.
        names = [n for n in labels if '`' not in n] or labels
        final_name = _merge_candidates(_preferred_candidates(names),
                                       hemibrain_names, final_name,
                                       other_names, can_be_used)

    if len(hemibrain_names):
        if final_name is not None:
            other_names.extend(hemibrain_names)
        elif len(hemibrain_names) == 1:
            final_name = hemibrain_names[0]
            other_names.extend(hemibrain_names)
        else:
            can_be_used.extend(hemibrain_names)

    if final_name is None:
        if len(can_be_used):
            final_name = min(can_be_used, key=len)
            other_names.extend(can_be_used)
        elif len(other_names):
            without_backtick = [n for n in other_names if '`' not in n]
            if len(without_backtick):
                final_name = min(without_backtick, key=len)
            else:
                final_name = min(other_names, key=len)
    else:
        other_names.extend(can_be_used)

    return final_name, other_names


# Resolve a primary name and the alternative ("community") names for every
# classified neuron.
df_final = data_extracted.copy()
final_names = []
community_names = {}
for i, row in df_final.iterrows():
    final_name, other_names = get_name(row['cell_type'], FBbt_df.loc[i, 'FBbt'],
                                       row['hemibrain_type'], labels_dict[i])
    final_names.append(final_name)
    # De-duplicate while keeping first-seen order, so the stored list (and the
    # ';'-joined string built from it later) is the same on every run.
    community_names[i] = list(dict.fromkeys(other_names))

df_final['name'] = final_names

# Neurons whose resolved name is the generic 'α/β KC' take their hemibrain
# type as name instead.  Bracket indexing is used because attribute access
# (df_final.name) is ambiguous for a column called 'name'.
is_ab_kc = df_final['name'] == 'α/β KC'
df_final.loc[is_ab_kc, 'name'] = df_final.loc[is_ab_kc, 'hemibrain_type']

df_final.head()

In [None]:
# Save the resolved-name table (the extracted classification columns plus the
# chosen 'name') to disk.
df_final.to_csv('all_used_name.csv')

In [None]:
# Open the NeuroArch database named 'flywire2' on port 22424.
# NOTE(review): mode 'o' presumably (re)creates/overwrites the database --
# confirm against the NeuroArch documentation before running on existing data.
db = na.NeuroArch(
    'flywire2',
    port=22424,
    mode='o',
    version="1.0.0",
    maintainer_name="Yiyin Zhou",
    maintainer_email="yiyin.zhou@fordham.edu",
)

Create a species

In [None]:
# Species record that the data source below is attached to.
species = db.add_Species(
    'Drosophila melanogaster',
    stage='adult',
    sex='female',
    synonyms=['fruit fly', 'common fruit fly', 'vinegar fly'],
)

Create a datasource under the species

In [None]:
# Register the FlyWire snapshot as a data source of the species and make it
# the database's default data source.
version = 'Snapshot 630, April 2023'
datasource = db.add_DataSource(
    'FlyWire',
    version=version,
    url='https://www.flywire.ai',
    species=species,
)
db.default_DataSource = datasource

Create subsystems, tracts, neuropils and subregions under the datasource

In [None]:
# Add a Subsystem for every region that has no subregions and is either a
# pure subsystem (no neuropil) or a neuropil that is its own system.
for region, spec in all_brain_regions.items():
    if 'System' not in spec or spec['Subregions'] is not None:
        continue
    if spec['Neuropil'] is None or spec['Neuropil'] == spec['System']:
        db.add_Subsystem(region)

In [None]:
# Add a Neuropil, with its surface mesh, for every region that names a single
# neuropil and has no subregions; remember the names in all_neuropils.
all_neuropils = []
for region, spec in all_brain_regions.items():
    neuropil = spec['Neuropil']
    if neuropil is None or spec['Subregions'] is not None:
        continue
    if isinstance(neuropil, list):
        continue

    mesh_set = ml.MeshSet()
    mesh_set.load_new_mesh("neuropil_meshes/{}.obj".format(region))
    mesh = mesh_set.current_mesh()

    vertices = mesh.vertex_matrix()
    # Mirror the x axis about the constant 97920+891008.
    vertices[:, 0] = 97920+891008-vertices[:, 0]

    # Vertices are scaled by 0.001 (NOTE(review): presumably nm -> um, confirm).
    mesh_morphology = {'type': 'mesh',
                       "vertices": (vertices*0.001).flatten().tolist(),
                       "faces": mesh.face_matrix().flatten().tolist()}
    db.add_Neuropil(region, morphology=mesh_morphology, subsystem=spec['System'])
    all_neuropils.append(region)

In [None]:
# Load every neuron of df_neurons into the database: resolve its name and
# unique instance name, gather annotations, per-neuropil synapse counts,
# skeleton morphology and neurotransmitter, then call db.add_Neuron.
swc_dir = 'swcs'
uname_dict = {}  # instance-name stem -> number of neurons seen with that stem
added = 0

# Not used by the loop below; kept defined in case other cells refer to them.
segment_ids = set()
unadded = []
to_combine = []
pattern = r"\([^()]*\)"

# FlyWire neurotransmitter codes -> names stored in the database.
nt_types = {'ACH': 'acetylcholine',
            'DA': 'dopamine',
            'GABA': 'GABA',
            'GLUT': 'glutamate',
            'OCT': 'octopamine',
            'SER': 'serotonin'}

for i, row in tqdm(df_neurons.iterrows(), total=len(df_neurons)):
    root_id = row['root_id']
    info = {}
    if root_id in df_classification.index:
        fbbt = FBbt_df.loc[root_id, 'FBbt']
        if isinstance(fbbt, str):
            info['FBbt'] = fbbt

        # get type:
        name = df_final.loc[root_id, 'name']
        hemibrain_name = df_classification.loc[root_id, 'hemibrain_type']
        if isinstance(hemibrain_name, str):
            info['hemibrain type'] = hemibrain_name
        hemilineage = df_classification.loc[root_id, 'hemilineage']
        if isinstance(hemilineage, str):
            info['hemilineage'] = hemilineage
        side = df_classification.loc[root_id, 'side']

        if not isinstance(name, str):
            name = 'unknown'

        if name.startswith('hb'):
            # A name of the form 'hb<type>' is not used as a cell type; the
            # part after 'hb' is recorded as a hemibrain type instead.  It
            # must be captured before `name` is overwritten, and appended to
            # (not substituted for) a hemibrain type that is already known.
            hb_type = name[2:]
            name = 'unknown'
            if 'hemibrain type' in info:
                info['hemibrain type'] = "{},{}".format(info['hemibrain type'], hb_type)
            else:
                info['hemibrain type'] = hb_type

        cell_type = name

        if isinstance(side, str):
            if side == 'right':
                instance = "{}_R".format(name)
            elif side == 'left':
                instance = "{}_L".format(name)
            else:
                instance = name
        else:
            instance = name

        if name == 'unknown':
            instance = 'unknown_{}'.format(root_id)
        else:
            # Number the instances that share a stem: stem_1, stem_2, ...
            if instance not in uname_dict:
                uname_dict[instance] = 0
            uname_dict[instance] += 1
            instance = '{}_{}'.format(instance, uname_dict[instance])
    else:
        instance = 'unknown_{}'.format(root_id)
        cell_type = 'unknown'
    if root_id in community_names:
        info['community names'] = ';'.join(community_names[root_id])

    added += 1
    arborization = []

    if root_id in df_neuropil.index:
        # Non-zero input/output synapse counts per neuropil.
        tmp = df_neuropil.loc[root_id, ['input synapses in {}'.format(n) for n in all_neuropils]]
        dendrites = {k.replace('input synapses in ', ''): v for k, v in tmp[tmp!=0].to_dict().items()}
        tmp = df_neuropil.loc[root_id, ['output synapses in {}'.format(n) for n in all_neuropils]]
        axons = {k.replace('output synapses in ', ''): v for k, v in tmp[tmp!=0].to_dict().items()}
        arborization.append({'dendrites': dendrites, 'axons': axons, 'type': 'neuropil'})

    # A neuron without an SWC file is still added, just without morphology.
    morphology = None
    try:
        df = na.load_swc('{}/{}.swc'.format(swc_dir, root_id))
    except FileNotFoundError:
        pass
    else:
        # x is mirrored about 97.920+891.008 and radii are scaled by 0.1.
        morphology = {'x': (97.920+891.008-df['x']).tolist(),
                      'y': df['y'].tolist(),
                      'z': df['z'].tolist(),
                      'r': (df['r']*0.1).tolist(),
                      'parent': df['parent'].tolist(),
                      'identifier': df['identifier'].tolist(),
                      'sample': df['sample'].tolist(),
                      'type': 'swc'}

    neurotransmitter = []
    if row['nt_type_score'] > 0:
        if isinstance(row['nt_type'], str):
            neurotransmitter.append(nt_types[row['nt_type']])
    db.add_Neuron(instance, # uname
                  cell_type, # name
                  referenceId = str(root_id), #referenceId
                  info = info if len(info) else None,
                  morphology = morphology,
                  arborization = arborization,
                  neurotransmitters = neurotransmitter if len(neurotransmitter) else None)

In [None]:
# NOTE(review): presumably writes out the edges queued by the add_Neuron
# calls above -- confirm against the NeuroArch documentation.
db.flush_edges()

In [None]:
# Fetch every neuron/fragment node once, register each in the NeuroArch cache
# under its uname so later calls need no database access, and index the nodes
# by integer referenceId (the FlyWire root id).
neurons = db.sql_query('select from NeuronAndFragment').node_objs
neuron_ref_to_obj = {}
for neuron in neurons:
    db.set('NeuronAndFragment', neuron.uname, neuron, db.default_DataSource)
    neuron_ref_to_obj[int(neuron.referenceId)] = neuron

In [None]:
# Create one Synapse per (pre, post) neuron pair.  The coordinates of the
# individual contacts are collected in morph_dict and uploaded by a later
# cell.  Pairs with an unknown pre- or post-synaptic neuron are skipped and
# recorded.
tmp = {}
morph_dict = {}

bad_pair = []
pre_missing_pair = []
post_missing_pair = []
for pair, group in tqdm(df_synapses.groupby(['pre_root_id', 'post_root_id'])):
    pre_id, post_id = pair
    pre_known = pre_id in neuron_ref_to_obj
    post_known = post_id in neuron_ref_to_obj
    if not pre_known:
        pre_missing_pair.append(pair)
    if not post_known:
        post_missing_pair.append(pair)
    if not (pre_known and post_known):
        bad_pair.append(pair)
        continue

    n_sites = len(group)
    # x is mirrored about 97920+891008; all coordinates are scaled by 0.001.
    content = {
        'type': 'swc',
        'x': ((97920+891008-group['x'])*0.001).tolist(),
        'y': (group['y']*0.001).tolist(),
        'z': (group['z']*0.001).tolist(),
        'r': [0]*n_sites,
        'parent': [-1]*n_sites,
        'identifier': [7]*n_sites,
        'sample': list(range(1, n_sites + 1)),
    }

    synapse = db.add_Synapse(neuron_ref_to_obj[pre_id], neuron_ref_to_obj[post_id],
                             N=n_sites)
    morph_dict[synapse._id] = {'morphology': content}
print(bad_pair)


In [None]:
# NOTE(review): presumably writes out the edges queued by the add_Synapse
# calls above -- confirm against the NeuroArch documentation.
db.flush_edges()

In [None]:
# Attach the collected contact coordinates to each synapse record.
for rid, data in tqdm(morph_dict.items()):
    if 'morphology' not in data:
        continue
    db.add_morphology(rid, data['morphology'])