# Querying external database sources of interest

* Enable users to integrate data from external databases of interest within BBP KG
* While using the Nexus Forge interface and BMO vocabulary as much as possible
* While benefiting from out of the box (meta)data transformation to make them ready for BBP internal pipelines and applications
* Demo with Mouselight, NeuroElectro, UniProt

In [1]:
import json

from kgforge.core import KnowledgeGraphForge
from kgforge.specializations.resources import Dataset

In [2]:
# Endpoint and bucket handed to the forge session created below.
endpoint = "https://staging.nise.bbp.epfl.ch/nexus/v1"
BUCKET = "neurosciencegraph/datamodels"
# NOTE(review): the endpoint is a staging URL while the configuration file is
# named "prod-nexus-sources.yml" - confirm this pairing is intended.
forge = KnowledgeGraphForge("../../configurations/database-sources/prod-nexus-sources.yml", endpoint=endpoint, bucket=BUCKET)

# List of Data sources

In [3]:
# Print the names of the database sources available through this forge session.
forge.db_sources(pretty=True)

Available Database sources:
UniProt
NeuroElectro
MouseLight


In [4]:
# With pretty=False the sources are returned instead of printed; the result is
# indexed by source name further down (e.g. sources["MouseLight"]).
sources = forge.db_sources(pretty=False)

In [5]:

# Inline configuration for a demo database source: a named store and a model
# whose origin is a local directory.
data = dict(
    store=dict(name="DemoStore"),
    model=dict(
        name="DemoModel",
        origin="directory",
        source="../../../tests/data/demo-model/",
    ),
)


In [6]:
from kgforge.specializations.resources import DatabaseSource
# Create a database source named "DemoDB" from the inline `data` configuration.
# The next listing of sources in the recorded run includes DemoDB.
# NOTE(review): the exact meaning of from_forge=False is not visible here - confirm.
ds = DatabaseSource(forge, name="DemoDB", from_forge=False, **data)

In [7]:
# print(ds)

In [8]:
# List the sources again; the recorded output now also contains DemoDB.
forge.db_sources(pretty=True)

Available Database sources:
UniProt
NeuroElectro
MouseLight
DemoDB


# Data source metadata

In [9]:
# Pick the MouseLight source out of the sources fetched with pretty=False.
mouselight = sources["MouseLight"]

## Name, description, url, license, protocol => more can be added through configuration

In [10]:
# Print a few metadata fields of the MouseLight source, one per line.
for attribute in ("name", "protocol", "license"):
    print(getattr(mouselight, attribute))

MouseLight
https://www.janelia.org/project-team/mouselight/resources
{'id': 'https://creativecommons.org/licenses/by-nc/4.0', 'label': 'CC BY-NC 4.0'}


## Get data mappings (hold transformation logic) per data type

* Data mappings are used to transform results obtained from the external data sources so that they are ready for consumption by BBP tools
* Perform automatic ontology linking

In [11]:
# With pretty=False the mappings are returned as a value; in the recorded run
# this is a dict of entity type -> list of mapping type names.
forge.mappings("MouseLight", pretty=False)

{'NeuronMorphology': ['DictionaryMapping']}

In [12]:
# With pretty=True the mappings of the UniProt source are printed as an
# indented listing (entity type, then mapping types).
forge.mappings('UniProt', pretty=True)

Managed mappings for the data source per entity type and mapping type:
   - NeuronElectrophysiologicalFeature:
        * DictionaryMapping


In [13]:
# Same pretty listing for the NeuroElectro source, which exposes several entity types.
forge.mappings('NeuroElectro', pretty=True)

Managed mappings for the data source per entity type and mapping type:
   - ElectrophysiologicalFeatureAnnotation:
        * DictionaryMapping
   - ParameterAnnotation:
        * DictionaryMapping
   - ParameterBody:
        * DictionaryMapping
   - ScholarlyArticle:
        * DictionaryMapping
   - SeriesBody:
        * DictionaryMapping


In [14]:
from kgforge.specializations.mappings import DictionaryMapping
# Two ways to fetch the NeuronMorphology dictionary mapping for MouseLight:
# through the forge (naming the source) or directly from the source object.
# The two printouts that follow look the same in the recorded run.
mapping = forge.mapping("NeuronMorphology", "MouseLight", type=DictionaryMapping)
direct_mapping = mouselight.mapping("NeuronMorphology", type=DictionaryMapping)

In [15]:
# Show the mapping obtained through the forge.
print(mapping)

{
    id: forge.format("identifier", "neuronmorphologies/mouselight", x.neurons[0]["idString"])
    type:
    [
        Dataset
        NeuronMorphology
    ]
    brainLocation:
    {
        type: BrainLocation
        brainRegion:
        {
            id: f"http://api.brain-map.org/api/v2/data/Structure/{x.neurons[0]['soma']['allenId']}"
            label: x.neurons[0]["allenLabel"]
        }
        coordinatesInBrainAtlas:
        {
            valueX: x.neurons[0]["soma"]["x"]
            valueY: x.neurons[0]["soma"]["y"]
            valueZ: x.neurons[0]["soma"]["z"]
        }
    }
    contribution:
    {
        type: Contribution
        agent:
        {
            id: https://www.grid.ac/institutes/grid.443970.d
            type: Organization
            label: Janelia Research Campus
        }
    }
    dateCreated: x.neurons[0]["sample"]["date"]
    description: x.neurons[0]["annotationSpace"]["description"]
    distribution: forge.attach(f"./mouselight/{x.neurons[0]['idSt

In [16]:
# Show the mapping obtained directly from the MouseLight source object.
print(direct_mapping)

{
    id: forge.format("identifier", "neuronmorphologies/mouselight", x.neurons[0]["idString"])
    type:
    [
        Dataset
        NeuronMorphology
    ]
    brainLocation:
    {
        type: BrainLocation
        brainRegion:
        {
            id: f"http://api.brain-map.org/api/v2/data/Structure/{x.neurons[0]['soma']['allenId']}"
            label: x.neurons[0]["allenLabel"]
        }
        coordinatesInBrainAtlas:
        {
            valueX: x.neurons[0]["soma"]["x"]
            valueY: x.neurons[0]["soma"]["y"]
            valueZ: x.neurons[0]["soma"]["z"]
        }
    }
    contribution:
    {
        type: Contribution
        agent:
        {
            id: https://www.grid.ac/institutes/grid.443970.d
            type: Organization
            label: Janelia Research Campus
        }
    }
    dateCreated: x.neurons[0]["sample"]["date"]
    description: x.neurons[0]["annotationSpace"]["description"]
    distribution: forge.attach(f"./mouselight/{x.neurons[0]['idSt

In [17]:
# Restrict the listing by data type; only MouseLight is listed for
# NeuronMorphology in the recorded output.
forge.db_sources(with_datatype='NeuronMorphology', pretty=True)

Available Database sources:
MouseLight


In [18]:
# Keep a handle on the NeuroElectro source.
# NOTE(review): `ne` is not referenced by any of the cells shown here.
ne = sources['NeuroElectro']

# Search and Access data from data source

* Mappings are automatically applied to search results
* Takes about a minute for now => working on making it faster

In [19]:
# Possible filters: type, source or target brain region, ...
filters = {"type":"ScholarlyArticle"}
# NOTE(review): these look like keyword arguments that are currently disabled:
# map=True, use_cache=True, download=True
resources = forge.search(filters, db_source="NeuroElectro", limit=2)
# TODO: add a function for checking data source health => requires a health URL from the db


query in rewrite_sparql SELECT ?id ?_constrainedBy ?_createdAt ?_createdBy ?_deprecated ?_incoming ?_outgoing ?_project ?_rev ?_schemaProject ?_self ?_updatedAt ?_updatedBy WHERE { Graph ?g {?id type ScholarlyArticle;
 <https://bluebrain.github.io/nexus/vocabulary/constrainedBy> ?_constrainedBy;
 <https://bluebrain.github.io/nexus/vocabulary/createdAt> ?_createdAt;
 <https://bluebrain.github.io/nexus/vocabulary/createdBy> ?_createdBy;
 <https://bluebrain.github.io/nexus/vocabulary/deprecated> ?_deprecated;
 <https://bluebrain.github.io/nexus/vocabulary/incoming> ?_incoming;
 <https://bluebrain.github.io/nexus/vocabulary/outgoing> ?_outgoing;
 <https://bluebrain.github.io/nexus/vocabulary/project> ?_project;
 <https://bluebrain.github.io/nexus/vocabulary/rev> ?_rev;
 <https://bluebrain.github.io/nexus/vocabulary/schemaProject> ?_schemaProject;
 <https://bluebrain.github.io/nexus/vocabulary/self> ?_self;
 <https://bluebrain.github.io/nexus/vocabulary/updatedAt> ?_updatedAt;
 <https://blu

In [20]:
# Number of resources returned by the NeuroElectro search (limit was 2).
len(resources)

2

In [21]:
# Show the first returned resource.
print(resources[0])

{
    context: https://bbp.neuroshapes.org
    id: https://bbp.epfl.ch/neurosciencegraph/data/scholarlyarticles/91941
    type:
    [
        Entity
        ScholarlyArticle
    ]
    abstract: Neurons in the medial septal/diagonal band complex (MS/DB) in vivo exhibit rhythmic burst-firing activity that is phase-locked with the hippocampal theta rhythm. The aim was to assess the morphology of local axon collaterals of electrophysiologically identified MS/DB neurons using intracellular recording and biocytin injection in vitro. Cells were classified according to previous criteria into slow-firing, fast-spiking, regular-spiking, and burst-firing neurons; previous work has suggested that the slow-firing neurons are cholinergic and that the other types are GABAergic. A novel finding was the existence of two types of burst-firing neuron. Type I burst-firing neurons had significantly longer duration after hyperpolarisation potentials when held at -60 mV, and at -75 mV, type I neurons exhibit

In [22]:
# SPARQL query text selecting ?protein where it is a up:Protein with up:reviewed true.
# No LIMIT clause here; the limit is passed separately when the query is submitted.
uquery = """
PREFIX up: <http://purl.uniprot.org/core/>
SELECT ?protein
WHERE {
  ?protein a up:Protein ;
  up:reviewed true.
}
"""

In [23]:
# Run the raw SPARQL query against the UniProt source. With debug=True the
# recorded output shows the submitted query, with "LIMIT 10" appended.
uresources = forge.sparql(query=uquery, db_source='UniProt', limit=10, debug=True)

query in sparql 
PREFIX up: <http://purl.uniprot.org/core/>
SELECT ?protein
WHERE {
  ?protein a up:Protein ;
  up:reviewed true.
}

query in rewrite_sparql 
PREFIX up: <http://purl.uniprot.org/core/>
SELECT ?protein
WHERE {
  ?protein a up:Protein ;
  up:reviewed true.
}

Submitted query:
   
   PREFIX up: <http://purl.uniprot.org/core/>
   SELECT ?protein
   WHERE {
     ?protein a up:Protein ;
     up:reviewed true.
   }
     LIMIT 10

amount of results = 10


In [24]:
# Number of results returned by the SPARQL query (limit was 10).
len(uresources)

10

In [25]:
# uresources

In [26]:
from kgforge.core.wrappings.paths import Filter, FilterOperator

In [27]:
# Same kind of lookup expressed as a dict filter instead of raw SPARQL; the
# recorded output shows the SPARQL generated from it, with a boolean FILTER
# on up:reviewed.
proteins = forge.search({'type': 'Protein', 'up:reviewed': True}, db_source='UniProt', limit=10)

query in sparql SELECT ?id WHERE {?id type Protein;
 up:reviewed ?v1 . 
 FILTER(?v1 = 'true'^^xsd:boolean)
}
query in rewrite_sparql SELECT ?id WHERE {?id type Protein;
 up:reviewed ?v1 . 
 FILTER(?v1 = 'true'^^xsd:boolean)
}
amount of results = 10


In [28]:
# Pick the UniProt source out of the sources fetched with pretty=False.
uniprot = sources['UniProt']

In [29]:
# Display the prefix -> namespace table of the UniProt source's context.
# NOTE(review): this reaches into the private `_store` attribute; prefer a
# public accessor if the source exposes one.
uniprot._store.context.prefixes

{'up': 'http://purl.uniprot.org/core/',
 'owl': 'http://www.w3.org/2002/07/owl#',
 'owl2xml': 'http://www.w3.org/2006/12/owl2-xml#',
 'swrlb': 'http://www.w3.org/2003/11/swrlb#',
 'protege': 'http://protege.stanford.edu/plugins/owl/protege#',
 'swrl': 'http://www.w3.org/2003/11/swrl#',
 'xsd': 'http://www.w3.org/2001/XMLSchema#',
 'skos': 'http://www.w3.org/2004/02/skos/core#',
 'rdfs': 'http://www.w3.org/2000/01/rdf-schema#',
 'dc11': 'http://purl.org/dc/terms/',
 'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
 'foaf': 'http://xmlns.com/foaf/0.1/'}

# Save in BBP KG (Nexus)

In [30]:
# Uncomment to save the resources retrieved above in the BBP KG (Nexus):
# forge.register(resources)

## Access

### Set filters

In [31]:
# Entity type to search for, and the filter restricting results to that type.
_type = "NeuronMorphology"
filters = dict(type=_type)

### Run Query

In [32]:
# You can limit the number of results, pass `None` to fetch all the results
limit = 10

# No `db_source` is given here, unlike the database-source searches earlier on.
data = forge.search(filters, limit=limit)

# Report how many datasets of the requested type came back.
print(f"{len(data)} dataset(s) of type {_type} found")

query in rewrite_sparql SELECT ?id ?_constrainedBy ?_createdAt ?_createdBy ?_deprecated ?_incoming ?_outgoing ?_project ?_rev ?_schemaProject ?_self ?_updatedAt ?_updatedBy WHERE { Graph ?g {?id type NeuronMorphology;
 <https://bluebrain.github.io/nexus/vocabulary/constrainedBy> ?_constrainedBy;
 <https://bluebrain.github.io/nexus/vocabulary/createdAt> ?_createdAt;
 <https://bluebrain.github.io/nexus/vocabulary/createdBy> ?_createdBy;
 <https://bluebrain.github.io/nexus/vocabulary/deprecated> ?_deprecated;
 <https://bluebrain.github.io/nexus/vocabulary/incoming> ?_incoming;
 <https://bluebrain.github.io/nexus/vocabulary/outgoing> ?_outgoing;
 <https://bluebrain.github.io/nexus/vocabulary/project> ?_project;
 <https://bluebrain.github.io/nexus/vocabulary/rev> ?_rev;
 <https://bluebrain.github.io/nexus/vocabulary/schemaProject> ?_schemaProject;
 <https://bluebrain.github.io/nexus/vocabulary/self> ?_self;
 <https://bluebrain.github.io/nexus/vocabulary/updatedAt> ?_updatedAt;
 <https://blu

### Display the results as pandas dataframe

In [33]:
# Property paths to keep when reshaping the search results for tabular display.
# Each path is listed once: the original list repeated
# "brainLocation.layer.id" and "brainLocation.layer.label".
property_to_display = [
    "id",
    "name",
    "subject",
    "brainLocation.brainRegion.id",
    "brainLocation.brainRegion.label",
    "brainLocation.layer.id",
    "brainLocation.layer.label",
    "contribution",
    "distribution.name",
    "distribution.contentUrl",
    "distribution.encodingFormat",
]
# Keep only the selected properties of each resource.
reshaped_data = forge.reshape(data, keep=property_to_display)

# Last expression of the cell: rendered as the cell output.
forge.as_dataframe(reshaped_data)

### Download

In [34]:
# Target directory for the downloaded files.
dirpath = "./downloaded/"
# Follow `distribution.contentUrl` on each resource and download into `dirpath`.
# NOTE(review): the recorded run reports a DownloadingError because that path
# was not found in any provided resource - confirm the searched resources
# carry a distribution before relying on this cell.
forge.download(data, "distribution.contentUrl", dirpath)

<action> download
<error> DownloadingError: path to follow 'distribution.contentUrl' was not found in any provided resource.

