This tutorial provides code to load the FIB19 Optic Lobe Dataset v1.0 into a NeuroArch database. Requirements before running the notebook:
- Install [NeuroArch](https://github.com/fruitflybrain/neuroarch), [OrientDB Community Version](https://www.orientdb.org/download) version 3.1.x, and [pyorient](https://github.com/fruitflybrain/pyorient). The [NeuroNLP Docker image](https://hub.docker.com/r/fruitflybrain/neuronlp) and the [FlyBrainLab Docker image](https://hub.docker.com/r/fruitflybrain/fbl) both ship with these software requirements preinstalled.
- Have more than 3 GB free disk space (for NeuroArch database).

A backup of the database created by this notebook can be downloaded [here](https://drive.google.com/file/d/11TJlrASgf6HlhLNrnoAZ8trd8cbcToOM/view?usp=drive_link). To restore it in OrientDB, run
```
/path/to/orientdb/bin/console.sh "create database plocal:../databases/fib19 admin admin; restore database /path/to/fib19_1.0_na_v1.0.0_backup.zip"
```

In [None]:
import csv
import glob
import json
import os
import subprocess
import warnings

import h5py
import numpy as np
import pandas as pd
import pymeshlab as ml
from tqdm import tqdm

import neuroarch.na as na

## Create a new database
Create a new database called `fib19` and open it in overwrite mode, so that any old `fib19` folder under `orientdb/databases/` is removed and recreated. The default OrientDB binary port `2424` is used.

In [None]:
# Open the `fib19` database in overwrite mode ('o') on the OrientDB binary port.
# Fill in the maintainer fields before publishing the database.
fib19 = na.NeuroArch(
    'fib19',
    mode = 'o',
    version = "1.0.0",
    port = 2424,
    maintainer_name = "",
    maintainer_email = "",
)

## Specify the species and datasource
We add two data sources: one for the FIB19 connectome and one for the transcriptome data.

In [None]:
# Species record that both data sources below are attached to.
species = fib19.add_Species(
    'Drosophila melanogaster',
    stage = 'adult',
    sex = 'female',
    synonyms = ['fruit fly', 'common fruit fly', 'vinegar fly'],
)

# Connectome data source; it is also installed as the database's default data source.
version = '1.0'
datasource = fib19.add_DataSource(
    'fib19',
    version = version,
    url = 'https://emdata.janelia.org/#/repo/73581d2d46fc445d83cf98382b566b55',
    species = species,
)
fib19.default_DataSource = datasource

# Transcriptome data source (GEO accession GSE116969) with its literature citation.
transcriptome_datasource = fib19.add_DataSource(
    'GSE116969',
    version = '1.0',
    url = 'https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE116969',
    description = 'Fred P Davis, Aljoscha Nern, Serge Picard, Michael B Reiser, Gerald M Rubin, Sean R Eddy, Gilbert L Henry, A genetic, genomic, and computational resource for exploring neural circuit function. eLife 2020;9:e50901. DOI: 10.7554/eLife.50901',
    species = species,
)

## Add Regions
Add the subsystem and its neuropils.
Note that the neuropil meshes are provided in units of $\mu$m, so we don't need to scale them here.

In [None]:
# The right optic lobe is the only subsystem in this dataset.
fib19.add_Subsystem('OL(R)', synonyms = ['right optic lobe'])

# ROI name (also the stem of its mesh file under roi/) -> owning subsystem,
# neuropil name to create, and a human-readable synonym.
all_rois = {
    'LO': {'System': 'OL(R)', 'Neuropil': 'LO(R)', 'synonyms': 'right Lobula'},
    'ME': {'System': 'OL(R)', 'Neuropil': 'ME(R)', 'synonyms': 'right Medulla'},
    'LOP': {'System': 'OL(R)', 'Neuropil': 'LOP(R)', 'synonyms': 'right Lobula Plate'},
}

for roi_name, roi in all_rois.items():
    # Entries without a neuropil name are skipped.
    if roi['Neuropil'] is None:
        continue

    # Read the ROI surface mesh; vertices and faces are stored as flat lists.
    mesh_set = ml.MeshSet()
    mesh_set.load_new_mesh("roi/{}.obj".format(roi_name))
    roi_mesh = mesh_set.current_mesh()
    neuropil_morphology = {
        'type': 'mesh',
        "vertices": roi_mesh.vertex_matrix().flatten().tolist(),
        "faces": roi_mesh.face_matrix().flatten().tolist(),
    }

    fib19.add_Neuropil(
        roi['Neuropil'],
        morphology = neuropil_morphology,
        subsystem = roi['System'],
    )

## Load neuron data
Load the CSV file of neuron records fetched from neuPrint.
Load the transcriptome data.

In [None]:
# Neuron table; the first CSV column becomes the index.
neuron_list = pd.read_csv(
    'fetched_neurons.csv',
    index_col = 0,
)

# Tab-separated expression table; it is later indexed as nt_df[cell_type][gene].
nt_df = pd.read_csv(
    'GSE116969_dataTable7b.genes_x_cells_p_expression.modeled_genes.txt',
    sep = '\t',
    index_col = 0,
)

## Load Neurons
Load the named neurons; named bodies without a skeleton file are loaded as segments (neuron fragments).

In [None]:
# ---------------------------------------------------------------------------
# State shared with later cells
# ---------------------------------------------------------------------------
swc_dir = 'swc'          # sub-directory of ../fib19/ holding <bodyId>.swc files
uname_dict = {}          # base name -> number of times it has been handed out
added = 0                # rows that got past the name handling
to_combine = []          # never filled by this cell (kept for later inspection)
loaded_to_segment = []   # named bodies stored as fragments (no SWC file found)
columns = {}             # column label -> Circuit object created for it

# Not used by this cell; kept in case other cells read them.
segment_ids = set()
unadded = []
neurotransmitter_translation = {'acetylcholine': 'acetylcholine',
                                'gaba': 'GABA',
                                'glutamate': 'glutamate'}
x = 'x'
y = 'y'
z = 'z'

# Factor applied to SWC coordinates and radii.
# NOTE(review): presumably converts 8 nm voxels to micrometres -- confirm.
VOXEL_SCALE = 0.008
# A gene counts as expressed when its value in nt_df exceeds this threshold.
EXPRESSION_THRESHOLD = 0.9
# Cell types that share the 'R1_6' column of the expression table.
R1_6_TYPES = ['R{}'.format(k) for k in range(1, 7)]


def _numbered_uname(base):
    """Return `base` with a running per-base counter appended (base_1, base_2, ...)."""
    uname_dict[base] = uname_dict.get(base, 0) + 1
    return '{}_{}'.format(base, uname_dict[base])


def _neuropil_arborization(roi_info):
    """Build the neuropil arborization list from a row's `roiInfo` value.

    Returns an empty list when `roi_info` is not a string (e.g. a missing value).
    """
    if not isinstance(roi_info, str):
        return []
    # SECURITY: eval() executes whatever the CSV contains. Only run this on a
    # trusted export; ast.literal_eval would be the safe replacement if the
    # column only ever holds plain dict literals.
    per_roi = eval(roi_info)
    dendrites = {'{}(R)'.format(neuropil): v['upstream']
                 for neuropil, v in per_roi.items() if 'upstream' in v}
    axons = {'{}(R)'.format(neuropil): v['downstream']
             for neuropil, v in per_roi.items() if 'downstream' in v}
    return [{'dendrites': dendrites, 'axons': axons, 'type': 'neuropil'}]


def _load_scaled_swc(body_id):
    """Load the SWC skeleton of `body_id` and scale coordinates/radii by VOXEL_SCALE.

    Raises FileNotFoundError (from na.load_swc) when the skeleton file is missing.
    """
    df = na.load_swc('../fib19/{}/{}.swc'.format(swc_dir, body_id))
    return {'x': (df['x']*VOXEL_SCALE).tolist(),
            'y': (df['y']*VOXEL_SCALE).tolist(),
            'z': (df['z']*VOXEL_SCALE).tolist(),
            'r': (df['r']*VOXEL_SCALE).tolist(),
            'parent': df['parent'].tolist(),
            'identifier': [0]*(len(df['x'])),
            'sample': df['sample'].tolist(),
            'type': 'swc'}


def _predict_neurotransmitters(cell_type):
    """Return the transmitters whose marker genes are expressed for `cell_type`.

    Returns an empty list when the cell type has no column in `nt_df`.
    """
    # NOTE(review): the R1-R6 mapping below is only reachable if nt_df also has
    # a column literally named 'R1'..'R6' -- confirm this gate is intended.
    if not (cell_type in nt_df.columns or cell_type == 'R8'):
        return []

    if cell_type in R1_6_TYPES:
        gene_expression_type = 'R1_6'
    elif cell_type == 'R8':
        gene_expression_type = 'R8_Rh5'
    else:
        gene_expression_type = cell_type
    expression = nt_df[gene_expression_type]

    predicted = []
    if expression['Hdc'] > EXPRESSION_THRESHOLD:
        predicted.append('histamine')
    if expression['Gad1'] > EXPRESSION_THRESHOLD:
        predicted.append('GABA')
    if expression['VAChT'] > EXPRESSION_THRESHOLD:
        predicted.append('acetylcholine')
    if expression['VGlut'] > EXPRESSION_THRESHOLD:
        predicted.append('glutamate')
    # Dopamine requires all four marker genes to be expressed.
    if expression['ple'] > EXPRESSION_THRESHOLD \
            and expression['Ddc'] > EXPRESSION_THRESHOLD \
            and expression['Vmat'] > EXPRESSION_THRESHOLD \
            and expression['DAT'] > EXPRESSION_THRESHOLD:
        predicted.append('dopamine')
    return predicted


for i, row in tqdm(neuron_list.iterrows(), total = len(neuron_list)):
    # Bodies without an instance name are handled by the "additional segments" cell.
    if not isinstance(row['instance'], str):
        continue
    bodyID = row['bodyId']

    info = {}
    column = None

    # Fix a known misspelling in the instance names.
    name = row['instance'].replace('likle', 'like')
    if isinstance(row['type'], str):
        cell_type = row['type']
    else:
        cell_type = 'unknown'
        name = 'unknown_{}_{}'.format(name, bodyID)

    if cell_type != 'unknown':
        if ' home' in name:
            name = name.replace(' home', '-home')
            # NOTE(review): this branch stores 'home' while the branch below
            # upper-cases it to 'HOME', giving two differently named circuits;
            # behaviour kept as is -- confirm which label is intended.
            column = 'home'
        elif '-' in name and '-' not in cell_type:
            # The column label is the last dash-separated token, or the one
            # before it when the name ends in an 'ant'/'post' qualifier.
            column = name.split('-')[-1]
            if column in ['ant', 'post']:
                column = name.split('-')[-2]
            if (len(column) == 1 and column.isalpha()) or column == 'home':
                column = column.upper()
            else:
                # Suffix is not a column label: number the name instead.
                column = None
                name = _numbered_uname(name)
        else:
            name = _numbered_uname(name)

    added += 1
    arborization = _neuropil_arborization(row['roiInfo'])

    # A named body without a skeleton file is stored as a fragment.
    # (The original `if segment:` check here could never be true, so
    # `to_combine` was never appended to; that dead branch is removed.)
    segment = False
    try:
        morphology = _load_scaled_swc(bodyID)
    except FileNotFoundError:
        morphology = None
        segment = True
        loaded_to_segment.append(bodyID)

    if isinstance(row['statusLabel'], str):
        info['fib19 Trace Status'] = row['statusLabel']
    else:
        info['fib19 Trace Status'] = 'Untraced'

    if not segment:
        # One Circuit per column label, created the first time it is seen.
        circuit = None
        if column is not None:
            circuit = columns.get(column, None)
            if circuit is None:
                circuit = fib19.add_Circuit('Column {}'.format(column), 'Column', neuropil = 'ME(R)')
                columns[column] = circuit

        neurotransmitter = _predict_neurotransmitters(cell_type)

        fib19.add_Neuron(name, # uname
                         cell_type, # name
                         referenceId = str(bodyID), #referenceId
                         info = info if len(info) else None,
                         morphology = morphology,
                         arborization = arborization,
                         neurotransmitters = neurotransmitter if len(neurotransmitter) else None,
                         circuit = circuit)
    else:
        cell_type = 'segment'
        name = 'segment_{}'.format(bodyID)
        fib19.add_NeuronFragment(name,
                                 cell_type,
                                 referenceId = str(bodyID),
                                 info = info if len(info) else None,
                                 morphology = morphology,
                                 arborization = arborization)

Add additional segments: bodies that have no instance name but do have a skeleton (SWC) file.

In [None]:
# Body IDs that have a skeleton file on disk. `all_swc` was referenced here
# without ever being defined in the notebook, so it is built from the same
# directory the skeletons are loaded from below.
all_swc = set()
for swc_path in glob.glob('../fib19/{}/*.swc'.format(swc_dir)):
    stem = os.path.splitext(os.path.basename(swc_path))[0]
    if stem.isdigit():  # ignore files that are not named <bodyId>.swc
        all_swc.add(int(stem))

n_added = 0  # number of unnamed segments loaded by this cell
for i, row in tqdm(neuron_list.iterrows(), total = len(neuron_list)):
    bodyID = row['bodyId']
    # Named bodies were already handled by the previous cell.
    if isinstance(row['instance'], str):
        continue
    # Without a skeleton there is nothing to load for an unnamed body.
    if bodyID not in all_swc:
        continue

    cell_type = 'segment'
    name = 'segment_{}'.format(bodyID)

    info = {}

    added += 1

    c_neuropils = row['roiInfo']
    arborization = []
    if isinstance(c_neuropils, str):
        # SECURITY: eval() executes whatever the CSV contains; only run this on
        # a trusted export. Parsed once and reused for both dictionaries.
        roi_info = eval(c_neuropils)
        dendrites = {'{}(R)'.format(neuropil): v['upstream'] for neuropil, v in roi_info.items() if 'upstream' in v}
        axons = {'{}(R)'.format(neuropil): v['downstream'] for neuropil, v in roi_info.items() if 'downstream' in v}
        arborization.append({'dendrites': dendrites, 'axons': axons, 'type': 'neuropil'})

    # Coordinates and radii are scaled by 0.008.
    # NOTE(review): presumably 8 nm voxels -> micrometres -- confirm.
    df = na.load_swc('../fib19/{}/{}.swc'.format(swc_dir, bodyID))
    morphology = {'x': (df['x']*0.008).tolist(),
                  'y': (df['y']*0.008).tolist(),
                  'z': (df['z']*0.008).tolist(),
                  'r': (df['r']*0.008).tolist(),
                  'parent': df['parent'].tolist(),
                  'identifier': [0]*(len(df['x'])),
                  'sample': df['sample'].tolist(),
                  'type': 'swc'}

    if isinstance(row['statusLabel'], str):
        info['fib19 Trace Status'] = row['statusLabel']
    else:
        info['fib19 Trace Status'] = 'Untraced'

    fib19.add_NeuronFragment(name,
                             cell_type,
                             referenceId = str(bodyID),
                             info = info if len(info) else None,
                             morphology = morphology,
                             arborization = arborization)
    n_added += 1

Flush the cache of ownership edges. This is needed because creating these edges while creating each neuron record turns out to be very time-consuming.

In [None]:
# Write out the edges that were cached while the neuron and fragment records were created.
fib19.flush_edges()

Load neurons into cache so that they can be referred to directly when creating synapses, rather than querying the database many times.

In [None]:
# find all the neurons so they can be keyed by their referenceId.
neurons = fib19.sql_query('select from NeuronAndFragment').node_objs
# set the cache so there is no need for database access.
for neuron in neurons:
    fib19.set('NeuronAndFragment', neuron.uname, neuron, fib19.default_DataSource)
neuron_ref_to_obj = {int(neuron.referenceId): neuron for neuron in neurons}

## Create Synapse Records
Create synapses if both pre- and post-synaptic neuron/segments are loaded.

If not, then combine data of segments pre to a neuron, combine data of segments post to a neuron.

In [None]:
# tmp[neuron_id]['pre' | 'post'] accumulates the synapses a loaded neuron
# makes with partners that are NOT loaded ('pre': unloaded presynaptic
# partners, 'post': unloaded postsynaptic partners).
tmp = {}
# synapse record id -> {'morphology': ..., 'arborization': ...}; uploaded in a later cell.
morph_dict = {}

# Factor applied to all synapse coordinates.
# NOTE(review): presumably converts 8 nm voxels to micrometres -- confirm.
SYNAPSE_SCALE = 0.008
_COORD_KEYS = ('pre_x', 'pre_y', 'pre_z', 'post_x', 'post_y', 'post_z')

# SECURITY: the list-valued CSV columns are parsed with eval(), which executes
# whatever the file contains. Only run this on a trusted export.


def _accumulate_unloaded_partner(neuron_id, side, row, pre_conf, post_conf, NHP):
    """Append one CSV row to the `side` ('pre' or 'post') bucket of `neuron_id` in `tmp`."""
    sides = tmp.setdefault(neuron_id, {})
    if side not in sides:
        sides[side] = {'pre_x': [], 'pre_y': [], 'pre_z': [], 'post_x': [], 'post_y': [], 'post_z': [],
                       'pre_confidence': [], 'post_confidence': [],
                       'N': 0, 'NHP': 0}
    bucket = sides[side]
    for key in _COORD_KEYS:
        bucket[key].append(np.array(eval(row[key]))*SYNAPSE_SCALE)
    bucket['pre_confidence'].append(pre_conf)
    bucket['post_confidence'].append(post_conf)
    bucket['N'] += row['N']
    bucket['NHP'] += NHP


def _neuropil_synapse_counts(c_neuropils):
    """Parse 'ROI:count;ROI:count' into {'ROI(R)': count}, dropping 'None' and zero counts."""
    counts = {}
    for entry in c_neuropils.split(';'):
        fields = entry.split(':')
        if fields[0] == 'None':
            continue
        count = int(fields[1])
        if count > 0:
            counts["{}(R)".format(fields[0])] = count
    return counts


def _synapse_morphology(row, pre_conf, post_conf):
    """Build the SWC-style record of one connection: presynaptic sites first, then postsynaptic sites."""
    content = {'type': 'swc'}
    content['x'] = (np.array(eval(row['pre_x'])+eval(row['post_x']))*SYNAPSE_SCALE).tolist()
    content['y'] = (np.array(eval(row['pre_y'])+eval(row['post_y']))*SYNAPSE_SCALE).tolist()
    content['z'] = (np.array(eval(row['pre_z'])+eval(row['post_z']))*SYNAPSE_SCALE).tolist()
    n_points = len(content['x'])
    half = n_points//2
    content['r'] = [0]*n_points
    # Presynaptic points have no parent (-1); the k-th postsynaptic point
    # points at the k-th presynaptic sample.
    content['parent'] = [-1]*half + list(range(1, half + 1))
    # First half is tagged 7, second half 8.
    content['identifier'] = [7]*half + [8]*half
    content['sample'] = list(range(1, n_points + 1))
    content['confidence'] = pre_conf.tolist() + post_conf.tolist()
    return content


for chunk in tqdm(pd.read_csv('synapses_all.csv', chunksize=100000)):
    for i, row in chunk.iterrows():
        pre = int(row['pre_id'])
        post = int(row['post_id'])
        # -1 marks a partner that was not loaded into the database.
        if pre not in neuron_ref_to_obj:
            pre = -1
        if post not in neuron_ref_to_obj:
            post = -1
        if pre == -1 and post == -1:
            continue

        # The confidence columns are divided by 1e6 before thresholding.
        pre_conf = np.array(eval(row['pre_confidence']))/1e6
        post_conf = np.array(eval(row['post_confidence']))/1e6
        # NHP: number of sites where both confidences are at least 0.7.
        NHP = np.sum(np.logical_and(post_conf>=0.7, pre_conf>=0.7))

        if pre == -1:
            _accumulate_unloaded_partner(post, 'pre', row, pre_conf, post_conf, NHP)
        elif post == -1:
            _accumulate_unloaded_partner(pre, 'post', row, pre_conf, post_conf, NHP)
        else:
            # Both partners are loaded: create the synapse record now.
            pre_neuron = neuron_ref_to_obj[pre]
            post_neuron = neuron_ref_to_obj[post]
            c_neuropils = row['neuropils']
            arborization = []
            if isinstance(c_neuropils, str):
                arborization.append({'type': 'neuropil',
                                     'synapses': _neuropil_synapse_counts(c_neuropils)})
            content = _synapse_morphology(row, pre_conf, post_conf)

            synapse = fib19.add_Synapse(pre_neuron, post_neuron, N = row['N'], NHP = NHP)
            morph_dict[synapse._id] = {'morphology': content,
                                       'arborization': arborization}


In [None]:
# Write out the edges cached while the synapse records were created.
fib19.flush_edges()

Create a notional data source under which the combined untraced segments are created.

In [None]:
# Attach the notional data source to the first Species record in the database.
species_records = fib19.sql_query('select from Species').node_objs
species = species_records[0]

version = '1.0'
notional_datasource = fib19.add_DataSource(
    'notional',
    version = version,
    species = species,
)


Create dummy NeuronFragments and load their synapses.

In [None]:
def _combined_synapse_morphology(bucket):
    """Merge the per-row arrays accumulated in one `tmp` bucket into a single SWC-style record.

    `bucket` holds lists of arrays under 'pre_x'...'post_z' and
    'pre_confidence'/'post_confidence'. All presynaptic points come first,
    followed by all postsynaptic points.
    """
    content = {'type': 'swc'}
    for axis in ('x', 'y', 'z'):
        content[axis] = np.concatenate(bucket['pre_' + axis] + bucket['post_' + axis]).tolist()
    n_points = len(content['x'])
    half = n_points//2
    content['r'] = [0]*n_points
    # Presynaptic points have no parent (-1); the k-th postsynaptic point
    # points at the k-th presynaptic sample.
    content['parent'] = [-1]*half + list(range(1, half + 1))
    # First half is tagged 7, second half 8.
    content['identifier'] = [7]*half + [8]*half
    content['sample'] = list(range(1, n_points + 1))
    content['confidence'] = np.concatenate(bucket['pre_confidence'] + bucket['post_confidence']).tolist()
    return content


# For every loaded neuron with unloaded partners, create one placeholder
# fragment per direction (under the notional data source) and connect it to
# the neuron with a single aggregated synapse.
for n, v in tqdm(tmp.items()):
    if 'pre' in v:
        # Unloaded partners that are presynaptic to neuron `n`.
        post_neuron = neuron_ref_to_obj[n]
        pre_neuron = fib19.add_NeuronFragment(
            'segments_presynaptic_to_{}'.format(post_neuron.uname),
            'combined_untraced_segments',
            data_source = notional_datasource)
        content = _combined_synapse_morphology(v['pre'])
        synapse = fib19.add_Synapse(pre_neuron, post_neuron, N = v['pre']['N'], NHP = v['pre']['NHP'])
        morph_dict[synapse._id] = {'morphology': content}
    if 'post' in v:
        # Unloaded partners that are postsynaptic to neuron `n`.
        pre_neuron = neuron_ref_to_obj[n]
        post_neuron = fib19.add_NeuronFragment(
            'segments_postsynaptic_to_{}'.format(pre_neuron.uname),
            'combined_untraced_segments',
            data_source = notional_datasource)
        content = _combined_synapse_morphology(v['post'])
        synapse = fib19.add_Synapse(pre_neuron, post_neuron, N = v['post']['N'], NHP = v['post']['NHP'])
        morph_dict[synapse._id] = {'morphology': content}

In [None]:
# Write out the edges cached while the combined fragments and their synapses were created.
fib19.flush_edges()

Load all morphology and arborization data for the synapses.

In [None]:
# Upload the data collected in morph_dict, keyed by synapse record id.
# Entries for combined fragments carry only a 'morphology' key.
for synapse_rid, payload in tqdm(morph_dict.items()):
    uploaders = (
        ('morphology', fib19.add_morphology),
        ('arborization', fib19.add_synapse_arborization),
    )
    for field, upload in uploaders:
        if field in payload:
            upload(synapse_rid, payload[field])