# Loading NeuroArch Database with Hemibrain Dataset v1.0.1

This tutorial provides code to load the Hemibrain Dataset v1.0.1 into a NeuroArch database. Requirements before running the notebook:
- Installed [NeuroArch](https://github.com/fruitflybrain/neuroarch), [OrientDB Community Version](https://www.orientdb.org/download), and [pyorient](https://github.com/fruitflybrain/pyorient). The [NeuroNLP Docker image](https://hub.docker.com/r/fruitflybrain/neuronlp) and [FlyBrainLab Docker image](https://hub.docker.com/r/fruitflybrain/fbl) all have a copy of the software requirement ready.
- Installed [PyMeshLab](https://pypi.org/project/pymeshlab/).
- Installed [neuprint-python](https://github.com/connectome-neuprint/neuprint-python).
- Download the [Neuprint database dump for the Hemibrain dataset v1.0.1](https://storage.cloud.google.com/hemibrain-release/neuprint/hemibrain_v1.0.1_neo4j_inputs.zip).
- Have the [token](https://connectome-neuprint.github.io/neuprint-python/docs/client.html#neuprint.client.Client) for Neuprint HTTP access ready.
- Have more than 60 GB free disk space (for Neuprint dump and NeuroArch database).

A backup of the database created by this notebook can be downloaded [here](https://drive.google.com/file/d/1x6MQJB_4OaWJR6d6O3WFCSeJWG58FsPT/view?usp=sharing). To restore it in OrientDB, run
```
/path/to/orientdb/bin/console.sh "create database plocal:../databases/hemibrain admin admin; restore database /path/to/hemibrain1.0.1_na_v1.0_backup.zip"
```

In [None]:
import glob
import os
import subprocess
import csv
import json
import warnings
from requests import HTTPError

import numpy as np
import pandas as pd
from neuprint import Client
from tqdm import tqdm
import h5py
import pymeshlab as ml
import neuroarch.na as na


## Define Brain Region
First, define all brain regions in the Hemibrain data and classify each of them as a subsystem, tract, neuropil, or subregion.

In [None]:
# Lookup table for every brain region (ROI) name used in this notebook.
# Each key is a region name; each value is a dict with the keys:
#   'System' or 'Tract' : the subsystem the region is assigned to, or the
#                         tract name for tract entries
#   'Neuropil'          : None, a neuropil name, or a list of neuropil names
#   'Subregions'        : None, or the region's own name when the region is
#                         treated as a subregion of 'Neuropil'
# The cells below skip entries whose 'Neuropil' is a list (e.g. 'CB', 'NO')
# when building neuropil and subregion records.
all_brain_regions = \
{'OL(R)': {'System': 'OL(R)', 'Neuropil': None, 'Subregions': None},
'MB(R)': {'System': 'MB(+ACA)(R)', 'Neuropil': 'MB(R)', 'Subregions': None},
'MB(L)': {'System': 'MB(+ACA)(L)', 'Neuropil': 'MB(L)', 'Subregions': None},
'CX': {'System': 'CX', 'Neuropil': None, 'Subregions': None},
'LX(R)': {'System': 'LX(R)', 'Neuropil': None, 'Subregions': None},
'LX(L)': {'System': 'LX(L)', 'Neuropil': None, 'Subregions': None},
'VLNP(R)': {'System': 'VLNP(R)', 'Neuropil': None, 'Subregions': None},
'LH(R)': {'System': 'LH(R)', 'Neuropil': 'LH(R)', 'Subregions': None},
'SNP(R)': {'System': 'SNP(R)', 'Neuropil': None, 'Subregions': None},
'SNP(L)': {'System': 'SNP(L)', 'Neuropil': None, 'Subregions': None},
'INP': {'System': 'INP', 'Neuropil': None, 'Subregions': None},
'AL(R)': {'System': 'AL(R)', 'Neuropil': 'AL(R)', 'Subregions': None},
'AL(L)': {'System': 'AL(L)', 'Neuropil': 'AL(L)', 'Subregions': None},
'VMNP': {'System': 'VMNP', 'Neuropil': None, 'Subregions': None},
'PENP': {'System': 'PENP', 'Neuropil': None, 'Subregions': None},
'GNG': {'System': 'GNG', 'Neuropil': 'GNG', 'Subregions': None},
'AOT(R)': {'Tract': 'AOT(R)', 'Neuropil': None, 'Subregions': None},
'GC': {'Tract': 'GC', 'Neuropil': None, 'Subregions': None},
'GF(R)': {'Tract': 'GF(R)', 'Neuropil': None, 'Subregions': None},
'mALT(R)': {'Tract': 'mALT(R)', 'Neuropil': None, 'Subregions': None},
'mALT(L)': {'Tract': 'mALT(L)', 'Neuropil': None, 'Subregions': None},
'POC': {'Tract': 'POC', 'Neuropil': None, 'Subregions': None},
'ME(R)': {'System': 'OL(R)', 'Neuropil': 'ME(R)', 'Subregions': None},
'AME(R)': {'System': 'OL(R)', 'Neuropil': 'AME(R)', 'Subregions': None},
'CA(R)': {'System': 'MB(+ACA)(R)', 'Neuropil': 'MB(R)', 'Subregions': 'CA(R)'},
'CA(L)': {'System': 'MB(+ACA)(L)', 'Neuropil': 'MB(L)', 'Subregions': 'CA(L)'},
'dACA(R)': {'System': 'MB(+ACA)(R)', 'Neuropil': 'MB(R)', 'Subregions': 'dACA(R)'},
'lACA(R)': {'System': 'MB(+ACA)(R)', 'Neuropil': 'MB(R)', 'Subregions': 'lACA(R)'},
'vACA(R)': {'System': 'MB(+ACA)(R)', 'Neuropil': 'MB(R)', 'Subregions': 'vACA(R)'},
'PED(R)': {'System': 'MB(+ACA)(R)', 'Neuropil': 'MB(R)', 'Subregions': 'PED(R)'},
'CB': {'System': 'CX', 'Neuropil': ['FB', 'EB'], 'Subregions': None},
'PB': {'System': 'CX', 'Neuropil': 'PB', 'Subregions': None},
'NO': {'System': 'CX', 'Neuropil': ['NO(R)', 'NO(L)'], 'Subregions': None},
'BU(R)': {'System': 'LX(R)', 'Neuropil': 'BU(R)', 'Subregions': None},
'BU(L)': {'System': 'LX(L)', 'Neuropil': 'BU(L)', 'Subregions': None},
'LAL(R)': {'System': 'LX(R)', 'Neuropil': 'LAL(R)', 'Subregions': None},
'LAL(L)': {'System': 'LX(L)', 'Neuropil': 'LAL(L)', 'Subregions': None},
'AOTU(R)': {'System': 'VLNP(R)', 'Neuropil': 'AOTU(R)', 'Subregions': None},
'PLP(R)': {'System': 'VLNP(R)', 'Neuropil': 'PLP(R)', 'Subregions': None},
'WED(R)': {'System': 'VLNP(R)', 'Neuropil': 'WED(R)', 'Subregions': None},
'SLP(R)': {'System': 'SNP(R)', 'Neuropil': 'SLP(R)', 'Subregions': None},
'SIP(R)': {'System': 'SNP(R)', 'Neuropil': 'SIP(R)', 'Subregions': None},
'SIP(L)': {'System': 'SNP(L)', 'Neuropil': 'SIP(L)', 'Subregions': None},
'SMP(R)': {'System': 'SNP(R)', 'Neuropil': 'SMP(R)', 'Subregions': None},
'SMP(L)': {'System': 'SNP(L)', 'Neuropil': 'SMP(L)', 'Subregions': None},
'CRE(R)': {'System': 'INP', 'Neuropil': 'CRE(R)', 'Subregions': None},
'CRE(L)': {'System': 'INP', 'Neuropil': 'CRE(L)', 'Subregions': None},
'IB': {'System': 'INP', 'Neuropil': 'IB', 'Subregions': None},
'ATL(R)': {'System': 'INP', 'Neuropil': 'ATL(R)', 'Subregions': None},
'ATL(L)': {'System': 'INP', 'Neuropil': 'ATL(L)', 'Subregions': None},
'AL-DC3(R)': {'System': 'AL(R)', 'Neuropil': 'AL(R)', 'Subregions': 'AL-DC3(R)'},
'SAD': {'System': 'PENP', 'Neuropil': 'SAD', 'Subregions': None},
'FLA(R)': {'System': 'PENP', 'Neuropil': 'FLA(R)', 'Subregions': None},
'CAN(R)': {'System': 'PENP', 'Neuropil': 'CAN(R)', 'Subregions': None},
'PRW': {'System': 'PENP', 'Neuropil': 'PRW', 'Subregions': None},
'LO(R)': {'System': 'OL(R)', 'Neuropil': 'LO(R)', 'Subregions': None},
'LOP(R)': {'System': 'OL(R)', 'Neuropil': 'LOP(R)', 'Subregions': None},
"a'L(R)": {'System': 'MB(+ACA)(R)', 'Neuropil': 'MB(R)', 'Subregions': "a'L(R)"},
"a'1(R)": {'System': 'MB(+ACA)(R)', 'Neuropil': 'MB(R)', 'Subregions': "a'1(R)"},
"a'2(R)": {'System': 'MB(+ACA)(R)', 'Neuropil': 'MB(R)', 'Subregions': "a'2(R)"},
"a'3(R)": {'System': 'MB(+ACA)(R)', 'Neuropil': 'MB(R)', 'Subregions': "a'3(R)"},
"a'L(L)": {'System': 'MB(+ACA)(L)', 'Neuropil': 'MB(L)', 'Subregions': "a'L(L)"},
'aL(R)': {'System': 'MB(+ACA)(R)', 'Neuropil': 'MB(R)', 'Subregions': 'aL(R)'},
'a1(R)': {'System': 'MB(+ACA)(R)', 'Neuropil': 'MB(R)', 'Subregions': 'a1(R)'},
'a2(R)': {'System': 'MB(+ACA)(R)', 'Neuropil': 'MB(R)', 'Subregions': 'a2(R)'},
'a3(R)': {'System': 'MB(+ACA)(R)', 'Neuropil': 'MB(R)', 'Subregions': 'a3(R)'},
'aL(L)': {'System': 'MB(+ACA)(L)', 'Neuropil': 'MB(L)', 'Subregions': 'aL(L)'},
'gL(R)': {'System': 'MB(+ACA)(R)', 'Neuropil': 'MB(R)', 'Subregions': 'gL(R)'},
'g1(R)': {'System': 'MB(+ACA)(R)', 'Neuropil': 'MB(R)', 'Subregions': 'g1(R)'},
'g2(R)': {'System': 'MB(+ACA)(R)', 'Neuropil': 'MB(R)', 'Subregions': 'g2(R)'},
'g3(R)': {'System': 'MB(+ACA)(R)', 'Neuropil': 'MB(R)', 'Subregions': 'g3(R)'},
'g4(R)': {'System': 'MB(+ACA)(R)', 'Neuropil': 'MB(R)', 'Subregions': 'g4(R)'},
'g5(R)': {'System': 'MB(+ACA)(R)', 'Neuropil': 'MB(R)', 'Subregions': 'g5(R)'},
'gL(L)': {'System': 'MB(+ACA)(L)', 'Neuropil': 'MB(L)', 'Subregions': 'gL(L)'},
"b'L(R)": {'System': 'MB(+ACA)(R)', 'Neuropil': 'MB(R)', 'Subregions': "b'L(R)"},
"b'1(R)": {'System': 'MB(+ACA)(R)', 'Neuropil': 'MB(R)', 'Subregions': "b'1(R)"},
"b'2(R)": {'System': 'MB(+ACA)(R)', 'Neuropil': 'MB(R)', 'Subregions': "b'2(R)"},
"b'L(L)": {'System': 'MB(+ACA)(L)', 'Neuropil': 'MB(L)', 'Subregions': "b'L(L)"},
'bL(R)': {'System': 'MB(+ACA)(R)', 'Neuropil': 'MB(R)', 'Subregions': 'bL(R)'},
'b1(R)': {'System': 'MB(+ACA)(R)', 'Neuropil': 'MB(R)', 'Subregions': 'b1(R)'},
'b2(R)': {'System': 'MB(+ACA)(R)', 'Neuropil': 'MB(R)', 'Subregions': 'b2(R)'},
'bL(L)': {'System': 'MB(+ACA)(L)', 'Neuropil': 'MB(L)', 'Subregions': 'bL(L)'},
'FB': {'System': 'CX', 'Neuropil': 'FB', 'Subregions': None},
'FBl1': {'System': 'CX', 'Neuropil': 'FB', 'Subregions': 'FBl1'},
'FBl2': {'System': 'CX', 'Neuropil': 'FB', 'Subregions': 'FBl2'},
'FBl3': {'System': 'CX', 'Neuropil': 'FB', 'Subregions': 'FBl3'},
'FBl4': {'System': 'CX', 'Neuropil': 'FB', 'Subregions': 'FBl4'},
'FBl5': {'System': 'CX', 'Neuropil': 'FB', 'Subregions': 'FBl5'},
'FBl6': {'System': 'CX', 'Neuropil': 'FB', 'Subregions': 'FBl6'},
'FBl7': {'System': 'CX', 'Neuropil': 'FB', 'Subregions': 'FBl7'},
'FBl8': {'System': 'CX', 'Neuropil': 'FB', 'Subregions': 'FBl8'},
'FBl9': {'System': 'CX', 'Neuropil': 'FB', 'Subregions': 'FBl9'},
'EB': {'System': 'CX', 'Neuropil': 'EB', 'Subregions': None},
'EBr1': {'System': 'CX', 'Neuropil': 'EB', 'Subregions': 'EBr1'},
'EBr2r4': {'System': 'CX', 'Neuropil': 'EB', 'Subregions': 'EBr2r4'},
'EBr3am': {'System': 'CX', 'Neuropil': 'EB', 'Subregions': 'EBr3am'},
'EBr3d': {'System': 'CX', 'Neuropil': 'EB', 'Subregions': 'EBr3d'},
'EBr3pw': {'System': 'CX', 'Neuropil': 'EB', 'Subregions': 'EBr3pw'},
'EBr5': {'System': 'CX', 'Neuropil': 'EB', 'Subregions': 'EBr5'},
'EBr6': {'System': 'CX', 'Neuropil': 'EB', 'Subregions': 'EBr6'},
'PB(R1)': {'System': 'CX', 'Neuropil': 'PB', 'Subregions': 'PB(R1)'},
'PB(R2)': {'System': 'CX', 'Neuropil': 'PB', 'Subregions': 'PB(R2)'},
'PB(R3)': {'System': 'CX', 'Neuropil': 'PB', 'Subregions': 'PB(R3)'},
'PB(R4)': {'System': 'CX', 'Neuropil': 'PB', 'Subregions': 'PB(R4)'},
'PB(R5)': {'System': 'CX', 'Neuropil': 'PB', 'Subregions': 'PB(R5)'},
'PB(R6)': {'System': 'CX', 'Neuropil': 'PB', 'Subregions': 'PB(R6)'},
'PB(R7)': {'System': 'CX', 'Neuropil': 'PB', 'Subregions': 'PB(R7)'},
'PB(R8)': {'System': 'CX', 'Neuropil': 'PB', 'Subregions': 'PB(R8)'},
'PB(R9)': {'System': 'CX', 'Neuropil': 'PB', 'Subregions': 'PB(R9)'},
'PB(L1)': {'System': 'CX', 'Neuropil': 'PB', 'Subregions': 'PB(L1)'},
'PB(L2)': {'System': 'CX', 'Neuropil': 'PB', 'Subregions': 'PB(L2)'},
'PB(L3)': {'System': 'CX', 'Neuropil': 'PB', 'Subregions': 'PB(L3)'},
'PB(L4)': {'System': 'CX', 'Neuropil': 'PB', 'Subregions': 'PB(L4)'},
'PB(L5)': {'System': 'CX', 'Neuropil': 'PB', 'Subregions': 'PB(L5)'},
'PB(L6)': {'System': 'CX', 'Neuropil': 'PB', 'Subregions': 'PB(L6)'},
'PB(L7)': {'System': 'CX', 'Neuropil': 'PB', 'Subregions': 'PB(L7)'},
'PB(L8)': {'System': 'CX', 'Neuropil': 'PB', 'Subregions': 'PB(L8)'},
'PB(L9)': {'System': 'CX', 'Neuropil': 'PB', 'Subregions': 'PB(L9)'},
'NO(R)': {'System': 'CX', 'Neuropil': 'NO(R)', 'Subregions': None},
'NO(L)': {'System': 'CX', 'Neuropil': 'NO(L)', 'Subregions': None},
'GA(R)': {'System': 'LX(R)', 'Neuropil': 'LAL(R)', 'Subregions': 'GA(R)'},
'AVLP(R)': {'System': 'VLNP(R)', 'Neuropil': 'AVLP(R)', 'Subregions': None},
'PVLP(R)': {'System': 'VLNP(R)', 'Neuropil': 'PVLP(R)', 'Subregions': None},
'RUB(R)': {'System': 'INP', 'Neuropil': 'CRE(R)', 'Subregions': 'RUB(R)'},
'RUB(L)': {'System': 'INP', 'Neuropil': 'CRE(L)', 'Subregions': 'RUB(L)'},
'ROB(R)': {'System': 'INP', 'Neuropil': 'CRE(R)', 'Subregions': 'ROB(R)'},
'SCL(R)': {'System': 'INP', 'Neuropil': 'SCL(R)', 'Subregions': None},
'SCL(L)': {'System': 'INP', 'Neuropil': 'SCL(L)', 'Subregions': None},
'ICL(R)': {'System': 'INP', 'Neuropil': 'ICL(R)', 'Subregions': None},
'ICL(L)': {'System': 'INP', 'Neuropil': 'ICL(L)', 'Subregions': None},
'VES(R)': {'System': 'VMNP', 'Neuropil': 'VES(R)', 'Subregions': None},
'VES(L)': {'System': 'VMNP', 'Neuropil': 'VES(L)', 'Subregions': None},
'EPA(R)': {'System': 'VMNP', 'Neuropil': 'EPA(R)', 'Subregions': None},
'EPA(L)': {'System': 'VMNP', 'Neuropil': 'EPA(L)', 'Subregions': None},
'GOR(R)': {'System': 'VMNP', 'Neuropil': 'GOR(R)', 'Subregions': None},
'GOR(L)': {'System': 'VMNP', 'Neuropil': 'GOR(L)', 'Subregions': None},
'SPS(R)': {'System': 'VMNP', 'Neuropil': 'SPS(R)', 'Subregions': None},
'SPS(L)': {'System': 'VMNP', 'Neuropil': 'SPS(L)', 'Subregions': None},
'IPS(R)': {'System': 'VMNP', 'Neuropil': 'IPS(R)', 'Subregions': None},
'AMMC': {'System': 'PENP', 'Neuropil': 'SAD', 'Subregions': 'AMMC'},
'AB(R)': {'System': 'CX', 'Neuropil': 'AB(R)', 'Subregions': None},
'AB(L)': {'System': 'CX', 'Neuropil': 'AB(L)', 'Subregions': None},
'FB-column3': {'System': 'CX', 'Neuropil': 'FB', 'Subregions': 'FB-column3'},
'NO1(R)': {'System': 'CX', 'Neuropil': 'NO(R)', 'Subregions': 'NO1(R)'},
'NO1(L)': {'System': 'CX', 'Neuropil': 'NO(L)', 'Subregions': 'NO1(L)'},
'NO2(R)': {'System': 'CX', 'Neuropil': 'NO(R)', 'Subregions': 'NO2(R)'},
'NO2(L)': {'System': 'CX', 'Neuropil': 'NO(L)', 'Subregions': 'NO2(L)'},
'NO3(R)': {'System': 'CX', 'Neuropil': 'NO(R)', 'Subregions': 'NO3(R)'},
'NO3(L)': {'System': 'CX', 'Neuropil': 'NO(L)', 'Subregions': 'NO3(L)'},
'MB(+ACA)(R)': {'System': 'MB(+ACA)(R)', 'Neuropil': None, 'Subregions': None},
'MB(+ACA)(L)': {'System': 'MB(+ACA)(L)', 'Neuropil': None, 'Subregions': None},
'LAL(-GA)(R)': {'System': 'LX(R)', 'Neuropil': 'LAL(R)', 'Subregions': 'LAL(-GA)(R)'},
'SAD(-AMMC)': {'System': 'PENP', 'Neuropil': 'SAD', 'Subregions': 'SAD(-AMMC)'},
'CRE(-ROB,-RUB)(R)': {'System': 'INP', 'Neuropil': 'CRE(R)', 'Subregions': 'CRE(-ROB,-RUB)(R)'},
}


Fetch from the neuPrint server the mesh files that define the boundaries of these regions. The subsystems do not have meshes, so their names are printed as failed downloads. You will need to put your neuPrint token in the `token` variable below.

In [None]:
# Create the output folder for the ROI meshes; an existing folder is reused.
try:
    os.mkdir('roi')
except FileExistsError:
    warnings.warn('folder roi already exists.')

# Paste your neuPrint authentication token here before running this cell.
token = ''
c = Client('neuprint.janelia.org', dataset='hemibrain:v1.0.1', token=token)

# Download one .obj mesh per region name. Regions without a mesh on the server
# are reported and skipped so the remaining downloads still run.
for region in all_brain_regions:
    try:
        c.fetch_roi_mesh(region, 'roi/{}.obj'.format(region))
    except Exception as err:
        # `Exception` (rather than a bare `except:`) keeps Ctrl-C / kernel
        # interrupts working while the loop is running.
        print('{}: mesh not fetched ({!r})'.format(region, err))

## Extract Neuron Attributes
In the next two cells, we extract from the database dump all the *Traced* neurons and write them into 'neurons.csv' file.

In [None]:
def process(chunk):
    """Turn the 'Traced' rows of a neuron-table chunk into output records.

    Parameters
    ----------
    chunk : pandas.DataFrame
        Rows of the neuPrint neuron CSV; the columns read here are the ones
        indexed below (e.g. 'status:string', 'roiInfo:string').

    Returns
    -------
    list of list
        One record per row whose 'status:string' equals 'Traced', in the
        column order bodyId, pre, post, status, statusLabel, cropped,
        instance, type, cellBodyFiber, somaLocation, somaRadius, size,
        neuropils, subregions, tracts. The last three entries are
        ';'-joined 'name:pre:post' strings built from 'roiInfo:string'.
    """
    def describe(label, counts):
        # 'name:pre:post', with missing counts reported as 0.
        return '{}:{}:{}'.format(label, counts.get('pre', 0), counts.get('post', 0))

    traced_mask = [state == 'Traced' for state in chunk['status:string'].values]
    traced_rows = chunk.iloc[np.flatnonzero(traced_mask)]

    records = []
    for _, row in traced_rows.iterrows():
        neuropils, subregions, tracts = [], [], []
        for roi_name, counts in json.loads(row['roiInfo:string']).items():
            if roi_name == "None":
                continue
            region = all_brain_regions[roi_name]
            if region['Subregions'] is not None:
                subregions.append(describe(region['Subregions'], counts))
            elif region['Neuropil'] is None:
                # Only tracts are recorded here; bare subsystems are dropped.
                if 'Tract' in region:
                    tracts.append(describe(region['Tract'], counts))
            elif not isinstance(region['Neuropil'], list):
                # Regions mapped to a list of neuropils are dropped.
                neuropils.append(describe(region['Neuropil'], counts))

        # Keep a missing 'cropped' flag as NaN; otherwise store it as 0/1.
        cropped = row['cropped:boolean']
        if not np.isnan(cropped):
            cropped = int(cropped)

        records.append([
            row['bodyId:long'],
            row['pre:int'],
            row['post:int'],
            row['status:string'],
            row['statusLabel:string'],
            cropped,
            row['instance:string'],
            row['type:string'],
            row['cellBodyFiber:string'],
            row['somaLocation:point{srid:9157}'],
            row['somaRadius:float'],
            row['size:long'],
            ';'.join(neuropils),
            ';'.join(subregions),
            ';'.join(tracts),
        ])
    return records

In [None]:
# Number of rows of the neuron dump read into memory at a time.
chunksize = 100000

# Stream the neuron dump chunk by chunk and append the records produced by
# process() to neurons.csv.
# newline='' hands line-ending control to the csv module, as its documentation
# requires; without it extra blank rows appear on platforms that translate
# '\n' on write.
with open('neurons.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['bodyID','pre','post','status','statusLabel','cropped','instance','type','cellBodyFiber','somaLocation','somaRadius','size','neuropils','subregions','tracts'])
    for chunk in tqdm(pd.read_csv('hemibrain_v1.0.1_neo4j_inputs/Neuprint_Neurons_52a133.csv', chunksize=chunksize)):
        neurons = process(chunk)
        writer.writerows(neurons)

To download the SWC files for all the traced neurons into folder 'swc', we can run the code below uncommented, or download it from [here](https://drive.google.com/file/d/1nhVABRX0cwQc3sXOoAx2EvWKbIYsoFPm/view?usp=sharing).

In [None]:

# try: 
#     os.mkdir('swc')
# except FileExistsError:
#     warnings.warn('folder swc already exists.')
#     pass

# neurons = pd.read_csv('neurons.csv')

# for i, row in tqdm(neurons.iterrows()):
#     bodyID = int(row['bodyID'])
#     try:
#         s = c.fetch_skeleton(bodyID, format='pandas')
#         s.to_csv('swc/{}.swc'.format(bodyID), header=False, index=False, sep=' ')
#     except HTTPError:
#         print(bodyID)

## Extract Synapses
We only use neurons that are traced or roughly traced, or that have a name/instance or a type assigned to them. Only synapses between these neurons are extracted below.

In [None]:
def _known_sites(chunk, site_type, known_ids):
    """Collect the synapses of one site type whose two body ids are both known.

    Each ':START_ID' value is split on '_' into (first body id, second body
    id, site type); rows whose third field equals `site_type` are kept.

    Returns an (n, 3) int64 array with columns
    [':END_ID(Syn-ID)', first body id, second body id], in chunk order.
    The array has shape (0, 3) when nothing matches, so callers never have to
    special-case an empty chunk.
    """
    rows = [
        (syn_id, int(fields[0]), int(fields[1]))
        for syn_id, fields in zip(chunk[':END_ID(Syn-ID)'],
                                  (set_id.split('_') for set_id in chunk[':START_ID']))
        if fields[2] == site_type
    ]
    # reshape keeps the array 2-D even when `rows` is empty.
    sites = np.array(rows, dtype=np.int64).reshape(-1, 3)
    both_known = np.isin(sites[:, 1], known_ids) & np.isin(sites[:, 2], known_ids)
    return sites[both_known]


neurons = pd.read_csv('neurons.csv')

# Keep neurons labelled 'Traced' / 'Roughly traced', or that have an instance
# name or a type (non-string values such as NaN count as missing).
labelled = neurons['statusLabel'].isin(['Traced', 'Roughly traced']).to_numpy()
named = np.array([isinstance(instance, str) or isinstance(cell_type, str)
                  for instance, cell_type in zip(neurons['instance'], neurons['type'])],
                 dtype=bool)
used = np.flatnonzero(labelled | named)

traced_neuron_id = neurons['bodyID'].to_numpy()[used]

chunksize = 1000000
pre_parts = []
post_parts = []

pre_count = 0
post_count = 0
count = 0
for chunk in pd.read_csv('hemibrain_v1.0.1_neo4j_inputs/Neuprint_SynapseSet_to_Synapses_52a133.csv', chunksize=chunksize):
    pre_site = _known_sites(chunk, 'pre', traced_neuron_id)
    post_site = _known_sites(chunk, 'post', traced_neuron_id)
    pre_parts.append(pre_site)
    post_parts.append(post_site)
    pre_count += pre_site.shape[0]
    post_count += post_site.shape[0]
    # Progress report: rows read so far (upper bound) and sites kept so far.
    count += chunksize
    print(count, pre_count, post_count)

# Collecting per-chunk arrays avoids a fixed-size preallocated buffer.
pre_syn = np.concatenate(pre_parts) if pre_parts else np.empty((0, 3), np.int64)
post_syn = np.concatenate(post_parts) if post_parts else np.empty((0, 3), np.int64)
del pre_parts, post_parts

# Sort both tables by synapse id (column 0).
ind = np.argsort(pre_syn[:,0])
pre_syn_sorted = pre_syn[ind, :]
ind = np.argsort(post_syn[:,0])
post_syn_sorted = post_syn[ind, :]


In [None]:
# Select the synapse-to-synapse connections (pre-site -> post-site) whose
# postsynaptic site was kept in the previous cell. Filtering on the post site
# alone is enough, because one presynaptic site can have several postsynaptic
# sites.
df = pd.read_csv('hemibrain_v1.0.1_neo4j_inputs/Neuprint_Synapse_Connections_52a133.csv')

post_syn_index = post_syn_sorted[:,0].copy()
post_ids = df[':END_ID(Syn-ID)']
is_kept = post_ids.isin(post_syn_index).to_numpy()
used = np.flatnonzero(is_kept)

# Materialise the kept rows and order them by their second column.
connections = df.iloc[used].to_numpy()
ind = np.argsort(connections[:,1])
connections = connections[ind, :]


In [None]:
# extract synapse details
import ast

chunksize = 100000

# Sorted, de-duplicated ids of every synapse kept above (pre and post sites).
syn_index = np.unique(np.concatenate((pre_syn_sorted[:,0], post_syn_sorted[:,0])))

# One row per synapse: 6 info columns (id, type, confidence, x, y, z) followed
# by 230 region flags. 230 is hard-coded and must equal the number of columns
# from index 5 onward in the synapse CSV; the row assignment below fails on a
# mismatch.
synapse_array = np.empty((len(syn_index), 230+6), np.int64)

synapse_count = 0
count = 0

for chunk in pd.read_csv('hemibrain_v1.0.1_neo4j_inputs/Neuprint_Synapses_52a133.csv', chunksize=chunksize):
    ids = chunk[':ID(Syn-ID)']

    # Restrict the lookup to the slice of syn_index that falls inside this
    # chunk's id range. NOTE(review): this relies on ids being in ascending
    # order within the file - confirm against the dump.
    # Slicing clamps out-of-range bounds, so no special case is needed when
    # the chunk lies entirely past the last wanted id.
    pre_start = np.searchsorted(syn_index, ids.iloc[0], side='left')
    pre_end = np.searchsorted(syn_index, ids.iloc[-1], side='right')
    pre_index = syn_index[pre_start:pre_end]

    pre_used_synapse = chunk.loc[ids.isin(pre_index)]
    # Size the buffer by the rows actually found, so that no uninitialised
    # np.empty rows can reach synapse_array when a wanted id is absent.
    n_used = len(pre_used_synapse)
    li = np.empty((n_used, 230+6), np.int64)
    for i, (_, row) in enumerate(pre_used_synapse.iterrows()):
        # The location text is turned into a dict literal by quoting its
        # x/y/z keys; ast.literal_eval parses it without executing code
        # (unlike eval).
        location = ast.literal_eval(row['location:point{srid:9157}'].replace('x', "'x'").replace('y', "'y'").replace('z', "'z'"))
        li[i,:6] = [row[':ID(Syn-ID)'], # synapse id
                     0 if row['type:string'] == 'pre' else 1, # synapse type
                     int(row['confidence:float']*1000000), # confidence, scaled to an integer
                     location['x'], location['y'], location['z']]
        # Region flag is 1 where the region column holds a value, 0 where NaN.
        li[i,6:] = ~np.isnan(np.asarray(row.values[5:], np.double))
    synapse_array[synapse_count:synapse_count+n_used,:] = li
    synapse_count += n_used
    count += chunksize
    print(count, n_used)
synapse_array = synapse_array[:synapse_count,:]


In [None]:
# reorder synapses: group synapse pairs by (pre neuron, post neuron) and write
# one row per neuron pair to synapses.csv

synapse_connections = connections

# Lookup tables from synapse id to row position.
ids = synapse_array[:,0]
syn_id_dict = {j: i for i, j in enumerate(ids)} # map syn id to synapse_array
ids = post_syn_sorted[:,0]
post_syn_id_dict = {j: i for i, j in enumerate(ids)} # map syn id to post_syn_sorted

# synapse_dict[pre_neuron_id][post_neuron_id] collects the per-synapse details
# and the per-region synapse counts of that neuron pair.
synapse_dict = {}
for pair in tqdm(synapse_connections):
    pre_syn_id = pair[0]
    post_syn_id = pair[1]
    post_info = synapse_array[syn_id_dict[post_syn_id]]
    pre_info = synapse_array[syn_id_dict[pre_syn_id]]
    # Columns 1 and 2 of post_syn_sorted are read as (post neuron, pre neuron).
    post_neuron_id, pre_neuron_id = post_syn_sorted[post_syn_id_dict[post_syn_id], 1:]

    pre_dict = synapse_dict.setdefault(pre_neuron_id, {})
    if post_neuron_id not in pre_dict:
        pre_dict[post_neuron_id] =  {'pre_synapse_ids': [],
                                     'post_synapse_ids': [],
                                     'pre_confidence': [],
                                     'post_confidence': [],
                                     'pre_x': [],
                                     'pre_y': [],
                                     'pre_z': [],
                                     'post_x': [],
                                     'post_y': [],
                                     'post_z': [],
                                     'regions': np.zeros(230, np.int32)}
    info_dict = pre_dict[post_neuron_id]
    # Store plain Python ints: the lists are written below with str(list),
    # which would emit 'np.int64(...)' for NumPy scalars under NumPy >= 2.
    info_dict['pre_synapse_ids'].append(int(pre_syn_id))
    info_dict['post_synapse_ids'].append(int(post_syn_id))
    info_dict['pre_confidence'].append(int(pre_info[2]))
    info_dict['post_confidence'].append(int(post_info[2]))
    info_dict['pre_x'].append(int(pre_info[3]))
    info_dict['pre_y'].append(int(pre_info[4]))
    info_dict['pre_z'].append(int(pre_info[5]))
    info_dict['post_x'].append(int(post_info[3]))
    info_dict['post_y'].append(int(post_info[4]))
    info_dict['post_z'].append(int(post_info[5]))
    # Region flags of the post site, summed over all synapses of the pair.
    info_dict['regions'] += post_info[6:]

# Region names come from the synapse CSV header (columns from index 5 onward,
# with the ':type' suffix stripped); nrows=0 reads the header only.
header = pd.read_csv('hemibrain_v1.0.1_neo4j_inputs/Neuprint_Synapses_52a133.csv', nrows=0)
labels = [i.split(':')[0] for i in header.columns.to_list()]
regions = labels[5:]

# newline='' hands line-ending control to the csv module, as its documentation
# requires.
with open('synapses.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['pre_id','post_id','N','pre_confidence','post_confidence',\
                     'pre_x','pre_y','pre_z','post_x','post_y','post_z',\
                     'neuropils','subregions','tracts'])
    for pre, post_dict in tqdm(synapse_dict.items()):
        for post, v in post_dict.items():
            reg = {regions[i]: v['regions'][i] for i in np.nonzero(v['regions'])[0]}
            neuropil_list = []
            subregion_list = []
            tract_list = []
            for region_name, n in reg.items():
                region = all_brain_regions[region_name]
                if region['Subregions'] is None:
                    if region['Neuropil'] is None:
                        if 'Tract' in region:
                            tract_list.append('{}:{}'.format(
                                region['Tract'], n))
                        else:
                            continue
                    elif isinstance(region['Neuropil'], list):
                        continue
                    else:
                        neuropil_list.append('{}:{}'.format(
                            region['Neuropil'], n))
                else:
                    subregion_list.append('{}:{}'.format(
                        region['Subregions'], n))

            neuropil_list = ';'.join(neuropil_list)
            subregion_list = ';'.join(subregion_list)
            tract_list = ';'.join(tract_list)
            writer.writerow([pre, post, len(v['pre_x']), str(v['pre_confidence']), \
                             str(v['post_confidence']), str(v['pre_x']), str(v['pre_y']), str(v['pre_z']), \
                             str(v['post_x']), str(v['post_y']), str(v['post_z']), \
                             neuropil_list, subregion_list, tract_list])


## Loading NeuroArch Database

Create and connect to database. mode 'o' overwrites the entire database.

In [None]:
# Open the 'hemibrain' NeuroArch database. As noted in the text above, mode 'o'
# overwrites the entire database, so re-running this cell discards whatever
# was loaded before.
hemibrain = na.NeuroArch('hemibrain', mode = 'o')

Create a species

In [None]:
# Register the species record; the returned object is passed to
# add_DataSource in a later cell.
species = hemibrain.add_species('Drosophila melanogaster', stage = 'adult',
                                sex = 'female',
                                synonyms = ['fruit fly', 'common fruit fly', 'vinegar fly'])

Create a datasource under the species

In [None]:
# Register the Hemibrain v1.0.1 data source under the species created above
# and make it the default data source of this NeuroArch connection.
version = '1.0.1'
datasource = hemibrain.add_DataSource('Hemibrain', version = version,
                                      url = 'https://www.janelia.org/project-team/flyem/hemibrain',
                                      species = species)
hemibrain.default_DataSource = datasource

Create subsystems, tracts, neuropils and subregions under the datasource

In [None]:
# Add a Subsystem for every non-tract, non-subregion entry that either has no
# neuropil of its own or whose neuropil name equals its system name.
for region_name, spec in all_brain_regions.items():
    # Tract entries have no 'System' key; subregions are handled elsewhere.
    if 'System' not in spec or spec['Subregions'] is not None:
        continue
    if spec['Neuropil'] is None or spec['Neuropil'] == spec['System']:
        hemibrain.add_Subsystem(region_name)

The meshes are then downsampled using [MeshLab](https://www.meshlab.net/):

In [None]:
# MeshLab filter script applied to every ROI mesh in the cells below: a single
# "Quadric Edge Collapse Decimation" simplification step.
filter_script = """<!DOCTYPE FilterScript>
<FilterScript>
 <filter name="Simplification: Quadric Edge Collapse Decimation">
  <Param type="RichInt" value="60000" name="TargetFaceNum"/>
  <Param type="RichFloat" value="0.05" name="TargetPerc"/>
  <Param type="RichFloat" value="1" name="QualityThr"/>
  <Param type="RichBool" value="true" name="PreserveBoundary"/>
  <Param type="RichFloat" value="1" name="BoundaryWeight"/>
  <Param type="RichBool" value="true" name="OptimalPlacement"/>
  <Param type="RichBool" value="true" name="PreserveNormal"/>
  <Param type="RichBool" value="true" name="PlanarSimplification"/>
 </filter>
</FilterScript>"""

# Write the script to disk so it can be loaded by file name.
with open('filter_file_tmp.mlx', 'w') as f:
    f.write(filter_script)


In [None]:
# Add every tract entry together with its simplified ROI mesh.
for region_name, spec in all_brain_regions.items():
    is_tract = ('Tract' in spec
                and spec['Neuropil'] is None
                and spec['Subregions'] is None)
    if not is_tract:
        continue

    # Load the downloaded mesh and run the decimation filter script on it.
    mesh_set = ml.MeshSet()
    mesh_set.load_new_mesh("roi/{}.obj".format(region_name))
    mesh_set.load_filter_script('filter_file_tmp.mlx')
    mesh_set.apply_filter_script()
    mesh = mesh_set.current_mesh()

    # Vertices are rescaled by 0.008 before storage.
    # NOTE(review): presumably a unit conversion of the mesh coordinates - confirm.
    vertices = (mesh.vertex_matrix()*0.008).flatten().tolist()
    faces = mesh.face_matrix().flatten().tolist()
    hemibrain.add_Tract(region_name,
                        morphology = {'type': 'mesh',
                                      "vertices": vertices,
                                      "faces": faces})

In [None]:
# Add every neuropil entry (a single named neuropil, not a subregion) together
# with its simplified ROI mesh, attached to its subsystem.
for region_name, spec in all_brain_regions.items():
    if spec['Neuropil'] is None or spec['Subregions'] is not None:
        continue
    # Entries mapped to a list of neuropils are skipped.
    if isinstance(spec['Neuropil'], list):
        continue

    # Load the downloaded mesh and run the decimation filter script on it.
    mesh_set = ml.MeshSet()
    mesh_set.load_new_mesh("roi/{}.obj".format(region_name))
    mesh_set.load_filter_script('filter_file_tmp.mlx')
    mesh_set.apply_filter_script()
    mesh = mesh_set.current_mesh()

    # Vertices are rescaled by 0.008 before storage.
    # NOTE(review): presumably a unit conversion of the mesh coordinates - confirm.
    vertices = (mesh.vertex_matrix()*0.008).flatten().tolist()
    faces = mesh.face_matrix().flatten().tolist()
    hemibrain.add_Neuropil(region_name,
                           morphology = {'type': 'mesh',
                                         "vertices": vertices,
                                         "faces": faces},
                           subsystem = spec['System'])

In [None]:
# Add every subregion under its neuropil; the mesh is attached only when an
# ROI file was downloaded for it.
for region_name, spec in all_brain_regions.items():
    if spec['Subregions'] is None:
        continue
    # Entries mapped to a list of neuropils are skipped.
    if isinstance(spec['Neuropil'], list):
        continue

    mesh_path = "roi/{}.obj".format(region_name)
    if not os.path.exists(mesh_path):
        # No mesh available: register the subregion without morphology.
        hemibrain.add_Subregion(region_name,
                                neuropil = spec['Neuropil'])
        continue

    # Load the downloaded mesh and run the decimation filter script on it.
    mesh_set = ml.MeshSet()
    mesh_set.load_new_mesh(mesh_path)
    mesh_set.load_filter_script('filter_file_tmp.mlx')
    mesh_set.apply_filter_script()
    mesh = mesh_set.current_mesh()

    # Vertices are rescaled by 0.008 before storage.
    # NOTE(review): presumably a unit conversion of the mesh coordinates - confirm.
    vertices = (mesh.vertex_matrix()*0.008).flatten().tolist()
    faces = mesh.face_matrix().flatten().tolist()
    hemibrain.add_Subregion(region_name,
                            morphology = {'type': 'mesh',
                                          "vertices": vertices,
                                          "faces": faces},
                            neuropil = spec['Neuropil'])

Load Neurons

In [None]:
def load_swc(file_name):
    """Read a space-separated skeleton file into a DataFrame.

    Lines (or line remainders) starting with '#' are treated as comments.
    The file is read without a header row and its columns are labelled
    'sample', 'x', 'y', 'z', 'r' and 'parent', in that order.

    Parameters
    ----------
    file_name : str
        Path of the file to read.

    Returns
    -------
    pandas.DataFrame
        One row per sample in the file.
    """
    swc_columns = ['sample', 'x', 'y', 'z', 'r', 'parent']
    read_options = {
        'sep': ' ',
        'header': None,
        'comment': '#',
        'index_col': False,
        'names': swc_columns,
        # Tolerate extra spaces following a delimiter.
        'skipinitialspace': True,
    }
    return pd.read_csv(file_name, **read_options)

In [None]:
neuron_list = pd.read_csv('neurons.csv')
swc_dir = 'swc'
# Number of times each instance name has been seen so far; used to make
# the unique name (uname) of neurons that share an instance name distinct.
uname_dict = {}


def _parse_arborization(spec, region_type):
    """Turn a 'region:a:b;region:a:b;...' string into an arborization record.

    For each ';'-separated entry, the second field is stored under 'axons'
    and the third under 'dendrites', keyed by the first field. Only counts
    greater than zero are kept.

    Parameters
    ----------
    spec : str
        The raw string taken from the CSV column.
    region_type : str
        Value stored under the record's 'type' key.

    Returns
    -------
    dict
        {'dendrites': {...}, 'axons': {...}, 'type': region_type}
    """
    dendrites = {}
    axons = {}
    for entry in spec.split(';'):
        fields = entry.split(':')
        region = fields[0]
        n_axon = int(fields[1])
        n_dendrite = int(fields[2])
        if n_dendrite > 0:
            dendrites[region] = n_dendrite
        if n_axon > 0:
            axons[region] = n_axon
    return {'dendrites': dendrites, 'axons': axons, 'type': region_type}


# iterrows() has no length, so pass the total for a meaningful progress bar.
for _, row in tqdm(neuron_list.iterrows(), total = len(neuron_list)):
    # Missing CSV fields are read as NaN (a float), so "is a str" means
    # "the field is present".
    has_instance = isinstance(row['instance'], str)
    has_type = isinstance(row['type'], str)

    # Keep a row if it is (roughly) traced, or if it carries an instance
    # or a type string; skip everything else.
    is_traced = row['statusLabel'] in ['Traced', 'Roughly traced']
    if not (is_traced or has_instance or has_type):
        continue

    bodyID = row['bodyID']
    cell_type = row['type'] if has_type else 'unknown'

    if has_instance and has_type:
        # Fully annotated neuron: suffix a running per-instance counter.
        instance = row['instance']
        uname_dict[instance] = uname_dict.get(instance, 0) + 1
        name = '{}_{}'.format(instance, uname_dict[instance])
    elif has_instance:
        # Instance without a type: disambiguate with the body id.
        name = '{}_{}'.format(row['instance'], bodyID)
    else:
        # No instance: fall back to '<type>_<bodyID>', where the type is
        # 'unknown' when it is missing as well.
        name = '{}_{}'.format(cell_type, bodyID)

    # No extra metadata is collected at the moment; an empty dict is passed
    # on as None below.
    info = {}

    arborization = [
        _parse_arborization(row[column], region_type)
        for region_type, column in (('neuropil', 'neuropils'),
                                    ('subregion', 'subregions'),
                                    ('tract', 'tracts'))
        if isinstance(row[column], str)
    ]

    df = load_swc('{}/{}.swc'.format(swc_dir, bodyID))
    # NOTE(review): 0.008 presumably converts 8 nm voxel units to
    # micrometers -- confirm against the dataset resolution.
    scale = 0.008
    morphology = {axis: (df[axis]*scale).tolist() for axis in ('x', 'y', 'z', 'r')}
    morphology['parent'] = df['parent'].tolist()
    morphology['identifier'] = [0]*len(df['x'])
    morphology['sample'] = df['sample'].tolist()
    morphology['type'] = 'swc'

    hemibrain.add_Neuron(name, # uname
                         cell_type, # name
                         referenceId = str(bodyID), #referenceId
                         info = info if info else None,
                         morphology = morphology,
                         arborization = arborization)

In [None]:
# If restarting the kernel after loading neurons, start with this
# hemibrain = na.NeuroArch('hemibrain', mode = 'w')
# hemibrain.default_DataSource = hemibrain.find_objs('DataSource', name = 'Hemibrain')[0]

In [None]:
# Fetch every Neuron node as an object, then in a single pass
#   * register it in NeuroArch's cache (keyed by uname under the default
#     data source) so later lookups need no database access, and
#   * index it by its integer referenceId for the synapse-loading step.
neurons = hemibrain.sql_query('select from Neuron').nodes_as_objs

neuron_ref_to_obj = {}
for neuron in neurons:
    hemibrain.set('Neuron', neuron.uname, neuron, hemibrain.default_DataSource)
    neuron_ref_to_obj[int(neuron.referenceId)] = neuron

## Load Synapses

In [None]:
synapse_df = pd.read_csv('synapses.csv')


def _parse_synapse_counts(spec):
    """Turn a 'region:count;region:count;...' string into {region: count}.

    Only entries whose count is greater than zero are kept.
    """
    counts = {}
    for entry in spec.split(';'):
        fields = entry.split(':')
        n_synapses = int(fields[1])
        if n_synapses > 0:
            counts[fields[0]] = n_synapses
    return counts


def _scaled_coords(pre_values, post_values):
    """Concatenate pre- and post-site coordinates, scale and round them.

    Both arguments are strings holding Python list literals.

    NOTE(review): `eval` is kept because 'synapses.csv' is expected to be
    generated locally by this workflow; if the file can come from an
    untrusted source, switch to `ast.literal_eval`.
    """
    # NOTE(review): 0.008 presumably converts 8 nm voxel units to
    # micrometers -- confirm against the dataset resolution.
    coords = np.array(eval(pre_values) + eval(post_values))*0.008
    return [round(c, 3) for c in coords.tolist()]


# iterrows() has no length, so pass the total for a meaningful progress bar.
for _, row in tqdm(synapse_df.iterrows(), total = len(synapse_df)):
    pre_neuron = neuron_ref_to_obj[row['pre_id']]
    post_neuron = neuron_ref_to_obj[row['post_id']]

    # Confidences are divided by 1e6 before use (see the NOTE on `eval`
    # in _scaled_coords, which applies here too).
    pre_conf = np.array(eval(row['pre_confidence']))/1e6
    post_conf = np.array(eval(row['post_confidence']))/1e6
    # NHP: number of pre/post pairs whose confidences are both >= 0.7.
    NHP = np.sum(np.logical_and(post_conf>=0.7, pre_conf>=0.7))

    arborization = [
        {'type': region_type, 'synapses': _parse_synapse_counts(row[column])}
        for region_type, column in (('neuropil', 'neuropils'),
                                    ('subregion', 'subregions'),
                                    ('tract', 'tracts'))
        if isinstance(row[column], str)
    ]

    # The morphology lists hold all pre-synaptic sites first, followed by
    # the post-synaptic sites in the same order.
    content = {'type': 'swc'}
    content['x'] = _scaled_coords(row['pre_x'], row['post_x'])
    content['y'] = _scaled_coords(row['pre_y'], row['post_y'])
    content['z'] = _scaled_coords(row['pre_z'], row['post_z'])
    n_points = len(content['x'])
    n_sites = n_points//2
    content['r'] = [0]*n_points
    # Pre-synaptic sites are roots (-1); the k-th post-synaptic site points
    # at the sample number (k+1) of the k-th pre-synaptic site.
    content['parent'] = [-1]*n_sites + [k+1 for k in range(n_sites)]
    content['identifier'] = [7]*n_sites + [8]*n_sites
    content['sample'] = [k+1 for k in range(n_points)]
    content['confidence'] = ([round(c, 3) for c in pre_conf.tolist()]
                             + [round(c, 3) for c in post_conf.tolist()])

    hemibrain.add_Synapse(pre_neuron, post_neuron, N = row['N'], NHP = NHP,
                          morphology = content,
                          arborization = arborization)
