# Querying external database sources of interest

* Enable users to integrate data from external databases of interest within BBP KG
* While using the Nexus Forge interface and BMO vocabulary as much as possible as
* While benefiting from out of the box (meta)data transformation to make them ready for BBP internal pipelines and applications
* Demo with Mouselight, NeuroElectro, UniProt

In [1]:
import json

from kgforge.core import KnowledgeGraphForge
from kgforge.specializations.resources import Dataset

In [2]:
endpoint = "https://staging.nise.bbp.epfl.ch/nexus/v1"
BUCKET = "neurosciencegraph/datamodels"
forge = KnowledgeGraphForge("../../configurations/database-sources/prod-nexus-sources_progress.yml", endpoint=endpoint, bucket=BUCKET)

I am in BBNexus store
I am in BBNexus store
I am in BBNexus store
I am in BBNexus store
Configuration {'origin': 'store', 'source': 'SPARQLStore', 'endpoint': 'http://purl.uniprot.org/core/', 'searchendpoints': {'sparql': {'endpoint': 'https://sparql.uniprot.org/sparql'}}, 'model': {'origin': 'directory', 'source': '/Users/cgonzale/Documents/code/nexus-forge/examples/database_sources/UniProt', 'context': {'iri': 'https://bbp.epfl.ch/jsonldcontext/db/uniprot', 'bucket': 'jsonld_context.json'}}}
store config {'endpoint': 'http://purl.uniprot.org/core/', 'searchendpoints': {'sparql': {'endpoint': 'https://sparql.uniprot.org/sparql'}}, 'model_context': <kgforge.core.commons.context.Context object at 0x7fb137dcf3d0>}
inside SPARQL Store
initializing service
Configuration {'origin': 'store', 'source': 'BlueBrainNexus', 'bucket': 'bbp/neuroelectro', 'model': {'origin': 'directory', 'source': '/Users/cgonzale/Documents/code/nexus-forge/examples/database_sources/NeuroElectro', 'context': {'buck

# List of Data sources

In [3]:
forge.db_sources(pretty=True)

Available Database sources:
UniProt
NeuroElectro


In [4]:
sources = forge.db_sources()

In [5]:

data = {
       'origin': 'store',
       'source': 'DemoStore',
       'model': { 
          'name': 'DemoModel',
          'origin': 'directory',
          'source': "../../../tests/data/demo-model/" 
        }
}


In [6]:
from kgforge.specializations.databases import StoreDatabase
ds = StoreDatabase(forge, name="DemoDB", **data)

store config {'model_context': None}


In [7]:
# print(ds)

In [8]:
forge.db_sources(pretty=True)

Available Database sources:
UniProt
NeuroElectro


# Data source metadata

In [9]:
neuroelectro = sources['NeuroElectro']

## Get data mappings (hold transformations logic) per data type

* Data mappings are used to transform results obtained from the external data sources so that they are ready for consumption by BBP tools
* Perform automatic ontology linking

In [10]:
forge.mappings("NeuroElectro")

{'ElectrophysiologicalFeatureAnnotation': ['DictionaryMapping'],
 'ParameterAnnotation': ['DictionaryMapping'],
 'ParameterBody': ['DictionaryMapping'],
 'ScholarlyArticle': ['DictionaryMapping'],
 'SeriesBody': ['DictionaryMapping']}

In [11]:
forge.mappings('UniProt')

{'Gene': ['DictionaryMapping'], 'Protein': ['DictionaryMapping']}

In [12]:
from kgforge.specializations.mappings import DictionaryMapping
mapping = forge.mapping("ScholarlyArticle", "NeuroElectro")
direct_mapping = neuroelectro.mapping("ScholarlyArticle", type=DictionaryMapping)

In [13]:
print(mapping)

{
    id: forge.format("identifier", "scholarlyarticles", x.id)
    type:
    [
        Entity
        ScholarlyArticle
    ]
    abstract: x.abstract
    author: x.authors_shaped
    datePublished: x.date_issued
    identifier: x.identifiers
    isPartOf:
    {
        type: Periodical
        issn: x.issn
        name: x.journal
        publisher: x.publisher
    }
    name: f"article_{x.id}"
    sameAs: x.full_text_link
    title: x.title
    url: x.full_text_link
}


In [14]:
print(direct_mapping)

{
    id: forge.format("identifier", "scholarlyarticles", x.id)
    type:
    [
        Entity
        ScholarlyArticle
    ]
    abstract: x.abstract
    author: x.authors_shaped
    datePublished: x.date_issued
    identifier: x.identifiers
    isPartOf:
    {
        type: Periodical
        issn: x.issn
        name: x.journal
        publisher: x.publisher
    }
    name: f"article_{x.id}"
    sameAs: x.full_text_link
    title: x.title
    url: x.full_text_link
}


In [15]:
forge.db_sources(mappings='Gene')

{'UniProt': StoreDatabase(context=<kgforge.core.commons.context.Context object at 0x7fb137dcf3d0>, type='Database', _dirpath='/Users/cgonzale/Documents/code/nexus-forge/examples/database_sources/UniProt', _forge=<kgforge.core.forge.KnowledgeGraphForge object at 0x7fb136a9f610>, name='UniProt', service=SPARQLStore(context=None, bucket=None, endpoint='http://purl.uniprot.org/core/', file_mapping=None, metadata_context=None, model_context=<kgforge.core.commons.context.Context object at 0x7fb137dcf3d0>, service=<kgforge.specializations.stores.databases.service.Service object at 0x7fb134137d10>, token=None, versioned_id_template=None), source='SPARQLStore')}

# Search and Access data from data source

* Mapping are automatically applied to search results
* takes a mn for now => working on making it faster 

In [18]:
# Type, source or target brain region, 
filters = {"type":"ScholarlyArticle"}
#map=True, use_cache=True, # download=True
resources = forge.search(filters, db_source="NeuroElectro", limit=2) 
# ADd function for checking datsource health => reqsuire health url from db


<action> search
<error> ValueError: context model missing



In [19]:
len(resources)

TypeError: object of type 'NoneType' has no len()

In [None]:
print(resources[0])

{
    context: https://bbp.neuroshapes.org
    id: https://bbp.epfl.ch/neurosciencegraph/data/scholarlyarticles/14353
    type:
    [
        Entity
        ScholarlyArticle
    ]
    abstract: Cerebellar Purkinje cells (PCs) from spinocerebellar ataxia type 1 (SCA1) transgenic mice develop dendritic and somatic atrophy with age. Inositol 1,4,5-trisphosphate receptor type 1 and the sarco/endoplasmic reticulum Ca(2+) ATPase pump, which regulate [Ca(2+)](i), are expressed at lower levels in these cells compared with the levels in cells from wild-type (WT) mice. To examine PCs in SCA1 mice, we used whole-cell patch clamp recording combined with fluorometric [Ca(2+)](i) and [Na(+)](i) measurements in cerebellar slices. PCs in SCA1 mice had Na(+) spikes, Ca(2+) spikes, climbing fiber (CF) electrical responses, parallel fiber (PF) electrical responses, and metabotropic glutamate receptor (mGluR)-mediated, PF-evoked Ca(2+) release from intracellular stores that were qualitatively similar to t

In [None]:
uquery = """
PREFIX up: <http://purl.uniprot.org/core/>
SELECT ?protein
WHERE {
  ?protein a up:Protein ;
  up:reviewed true.
}
"""

In [None]:
uresources = forge.sparql(query=uquery, db_source='UniProt', limit=10, debug=True)

query in sparql 
PREFIX up: <http://purl.uniprot.org/core/>
SELECT ?protein
WHERE {
  ?protein a up:Protein ;
  up:reviewed true.
}

Submitted query:
   
   PREFIX up: <http://purl.uniprot.org/core/>
   SELECT ?protein
   WHERE {
     ?protein a up:Protein ;
     up:reviewed true.
   }
     LIMIT 10

amount of results = 10


In [None]:
len(uresources)

10

In [None]:
uresources[0]

Resource(_last_action=None, _validated=False, _synchronized=False, _store_metadata=None, _inner_sync=False, protein=Resource(_last_action=None, _validated=False, _synchronized=False, _store_metadata=None, id='http://purl.uniprot.org/uniprot/A0A131MCZ8', _inner_sync=False, annotationScore=5.0, comments=[{'texts': [{'evidences': [{'evidenceCode': 'ECO:0000269', 'source': 'PubMed', 'id': '27564576'}], 'value': 'Probable metal transporter. Probably acts redundantly with the other metal transport proteins cnnm-1, cnnm-2, cnnm-4 and cnnm-5 to regulate Mg(2+) homeostasis. Promotes postembryonic gonad development by regulating Mg(2+) levels, probably via AMPK signaling'}], 'commentType': 'FUNCTION'}, {'commentType': 'SUBCELLULAR LOCATION', 'subcellularLocations': [{'location': {'evidences': [{'evidenceCode': 'ECO:0000269', 'source': 'PubMed', 'id': '27564576'}], 'value': 'Basolateral cell membrane', 'id': 'SL-0026'}, 'topology': {'evidences': [{'evidenceCode': 'ECO:0000255'}], 'value': 'Multi-

In [None]:
# uresources

In [None]:
from kgforge.core.wrappings.paths import Filter, FilterOperator

In [None]:


proteins = forge.search({'type': 'Protein', 'up:reviewed': True}, db_source='UniProt', limit=10, debug=True)

query in sparql SELECT ?id WHERE {?id type Protein;
 up:reviewed ?v1 . 
 FILTER(?v1 = 'true'^^xsd:boolean)
}
Submitted query:
   PREFIX up: <http://purl.uniprot.org/core/>
   PREFIX owl: <http://www.w3.org/2002/07/owl#>
   PREFIX owl2xml: <http://www.w3.org/2006/12/owl2-xml#>
   PREFIX swrlb: <http://www.w3.org/2003/11/swrlb#>
   PREFIX protege: <http://protege.stanford.edu/plugins/owl/protege#>
   PREFIX swrl: <http://www.w3.org/2003/11/swrl#>
   PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
   PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
   PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
   PREFIX dc11: <http://purl.org/dc/terms/>
   PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
   PREFIX foaf: <http://xmlns.com/foaf/0.1/>
   SELECT ?id WHERE {?id rdf:type up:Protein;
    up:reviewed ?v1 . 
    FILTER(?v1 = 'true'^^xsd:boolean)
   }  LIMIT 10

amount of results = 10


In [None]:
uniprot = sources['UniProt']

In [None]:
uniprot._store.context.prefixes

{'up': 'http://purl.uniprot.org/core/',
 'owl': 'http://www.w3.org/2002/07/owl#',
 'owl2xml': 'http://www.w3.org/2006/12/owl2-xml#',
 'swrlb': 'http://www.w3.org/2003/11/swrlb#',
 'protege': 'http://protege.stanford.edu/plugins/owl/protege#',
 'swrl': 'http://www.w3.org/2003/11/swrl#',
 'xsd': 'http://www.w3.org/2001/XMLSchema#',
 'skos': 'http://www.w3.org/2004/02/skos/core#',
 'rdfs': 'http://www.w3.org/2000/01/rdf-schema#',
 'dc11': 'http://purl.org/dc/terms/',
 'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
 'foaf': 'http://xmlns.com/foaf/0.1/'}

In [None]:
genes = forge.search({'type': 'Gene'}, db_source='UniProt', limit=10)

query in sparql SELECT ?id WHERE {?id type Gene . 
 
}
amount of results = 10


In [None]:
genes[0]

Resource(_last_action=None, _validated=False, _synchronized=False, _store_metadata=None, id=Resource(_last_action=None, _validated=False, _synchronized=False, _store_metadata=None, id='http://purl.uniprot.org/uniprot/H5SQ95#gene-MD58301D33AF640374C84A4DA4CAF383BE6', _inner_sync=False, annotationScore=2.0, comments=[{'texts': [{'evidences': [{'evidenceCode': 'ECO:0000305', 'source': 'PubMed', 'id': '28087277'}], 'value': 'May function as a protein modifier covalently attached to lysine residues of substrate proteins. This may serve to target the modified proteins for degradation by proteasomes'}], 'commentType': 'FUNCTION'}, {'texts': [{'evidences': [{'evidenceCode': 'ECO:0000255', 'source': 'HAMAP-Rule', 'id': 'MF_02133'}], 'value': 'Belongs to the ubiquitin-like protein UBact family'}], 'commentType': 'SIMILARITY'}], entryAudit={'firstPublicDate': '2017-10-25', 'lastAnnotationUpdateDate': '2022-05-25', 'lastSequenceUpdateDate': '2012-04-18', 'entryVersion': 15, 'sequenceVersion': 1}, 

# Save in BBP KG (Nexus)

In [None]:
# forge.register(resources)

## Access

### Set filters

In [None]:
_type = "NeuronMorphology"
filters = {"type": _type}

### Run Query

In [None]:
limit = 10  # You can limit the number of results, pass `None` to fetch all the results

data = forge.search(filters, db_source='MouseLight', limit=limit)

print(f"{str(len(data))} dataset(s) of type {_type} found")

10 dataset(s) of type NeuronMorphology found


### Display the results as pandas dataframe

In [None]:
property_to_display = ["id","name","subject","brainLocation.brainRegion.id","brainLocation.brainRegion.label","brainLocation.layer.id","brainLocation.layer.label", "contribution","brainLocation.layer.id","brainLocation.layer.label","distribution.name","distribution.contentUrl","distribution.encodingFormat"]
reshaped_data = forge.reshape(data, keep=property_to_display)

forge.as_dataframe(reshaped_data)

Unnamed: 0,id,brainLocation.brainRegion.id,brainLocation.brainRegion.label,contribution.type,contribution.agent.id,contribution.agent.type,contribution.agent.label,distribution.contentUrl,distribution.encodingFormat,distribution.name,name,subject.type,subject.species.id,subject.species.label,subject.strain.label
0,https://bbp.epfl.ch/neurosciencegraph/data/neu...,http://api.brain-map.org/api/v2/data/Structure...,Primary motor area Layer 5,Contribution,https://www.grid.ac/institutes/grid.443970.d,Organization,Janelia Research Campus,https://staging.nise.bbp.epfl.ch/nexus/v1/file...,application/swc,AA1050.swc,AA1050,Subject,http://purl.obolibrary.org/obo/NCBITaxon_10090,Mus musculus,Sim1-Cre
1,https://bbp.epfl.ch/neurosciencegraph/data/neu...,http://api.brain-map.org/api/v2/data/Structure...,Primary somatosensory area mouth layer 5,Contribution,https://www.grid.ac/institutes/grid.443970.d,Organization,Janelia Research Campus,https://staging.nise.bbp.epfl.ch/nexus/v1/file...,application/swc,AA1049.swc,AA1049,Subject,http://purl.obolibrary.org/obo/NCBITaxon_10090,Mus musculus,Sim1-Cre
2,https://bbp.epfl.ch/neurosciencegraph/data/neu...,http://api.brain-map.org/api/v2/data/Structure...,Retrosplenial area ventral part layer 5,Contribution,https://www.grid.ac/institutes/grid.443970.d,Organization,Janelia Research Campus,https://staging.nise.bbp.epfl.ch/nexus/v1/file...,application/swc,AA1045.swc,AA1045,Subject,http://purl.obolibrary.org/obo/NCBITaxon_10090,Mus musculus,Sim1-Cre
3,https://bbp.epfl.ch/neurosciencegraph/data/neu...,http://api.brain-map.org/api/v2/data/Structure...,Parafascicular nucleus,Contribution,https://www.grid.ac/institutes/grid.443970.d,Organization,Janelia Research Campus,https://staging.nise.bbp.epfl.ch/nexus/v1/file...,application/swc,AA1046.swc,AA1046,Subject,http://purl.obolibrary.org/obo/NCBITaxon_10090,Mus musculus,Sim1-Cre
4,https://bbp.epfl.ch/neurosciencegraph/data/neu...,http://api.brain-map.org/api/v2/data/Structure...,Medial mammillary nucleus,Contribution,https://www.grid.ac/institutes/grid.443970.d,Organization,Janelia Research Campus,https://staging.nise.bbp.epfl.ch/nexus/v1/file...,application/swc,AA1048.swc,AA1048,Subject,http://purl.obolibrary.org/obo/NCBITaxon_10090,Mus musculus,Sim1-Cre
5,https://bbp.epfl.ch/neurosciencegraph/data/neu...,http://api.brain-map.org/api/v2/data/Structure...,Primary somatosensory area mouth layer 5,Contribution,https://www.grid.ac/institutes/grid.443970.d,Organization,Janelia Research Campus,https://staging.nise.bbp.epfl.ch/nexus/v1/file...,application/swc,AA1051.swc,AA1051,Subject,http://purl.obolibrary.org/obo/NCBITaxon_10090,Mus musculus,Sim1-Cre
6,https://bbp.epfl.ch/neurosciencegraph/data/neu...,http://api.brain-map.org/api/v2/data/Structure...,Entorhinal area lateral part,Contribution,https://www.grid.ac/institutes/grid.443970.d,Organization,Janelia Research Campus,https://staging.nise.bbp.epfl.ch/nexus/v1/file...,application/swc,AA1047.swc,AA1047,Subject,http://purl.obolibrary.org/obo/NCBITaxon_10090,Mus musculus,Sim1-Cre
7,https://bbp.epfl.ch/neurosciencegraph/data/neu...,http://api.brain-map.org/api/v2/data/Structure...,Primary motor area Layer 6a,Contribution,https://www.grid.ac/institutes/grid.443970.d,Organization,Janelia Research Campus,https://staging.nise.bbp.epfl.ch/nexus/v1/file...,application/swc,AA1043.swc,AA1043,Subject,http://purl.obolibrary.org/obo/NCBITaxon_10090,Mus musculus,Sim1-Cre
8,https://bbp.epfl.ch/neurosciencegraph/data/neu...,http://api.brain-map.org/api/v2/data/Structure...,Medial mammillary nucleus,Contribution,https://www.grid.ac/institutes/grid.443970.d,Organization,Janelia Research Campus,https://staging.nise.bbp.epfl.ch/nexus/v1/file...,application/swc,AA1041.swc,AA1041,Subject,http://purl.obolibrary.org/obo/NCBITaxon_10090,Mus musculus,Sim1-Cre
9,https://bbp.epfl.ch/neurosciencegraph/data/neu...,http://api.brain-map.org/api/v2/data/Structure...,Medial mammillary nucleus,Contribution,https://www.grid.ac/institutes/grid.443970.d,Organization,Janelia Research Campus,https://staging.nise.bbp.epfl.ch/nexus/v1/file...,application/swc,AA1039.swc,AA1039,Subject,http://purl.obolibrary.org/obo/NCBITaxon_10090,Mus musculus,Sim1-Cre


### Download

In [None]:
dirpath = "./downloaded/"
forge.download(data, "distribution.contentUrl", dirpath)

<action> _download
<error> DownloadingError: Downloading from neurosciencegraph/datamodels:404, message='Not Found', url=URL('https://staging.nise.bbp.epfl.ch/nexus/v1/files/neurosciencegraph/datamodels/665642bf-4d60-45be-853b-ace965f0057f')



### Try query

In [None]:
mquery = """
# PREFIXES
SELECT ?id WHERE {
    ?id a nsg:DetailedCircuit
} LIMIT 100
"""

forge.sparql(mquery, debug=True, rewrite=False)

Submitted query:
   
   # PREFIXES
   SELECT ?id WHERE {
       ?id a nsg:DetailedCircuit
   } LIMIT 100

<action> _sparql
<error> QueryingError: 400 Client Error: Bad Request for url: https://staging.nise.bbp.epfl.ch/nexus/v1/views/neurosciencegraph/datamodels/https%3A%2F%2Fbluebrain.github.io%2Fnexus%2Fvocabulary%2FdefaultSparqlIndex/sparql



In [None]:
forge.sparql(mquery, debug=True, rewrite=False)

Submitted query:
   
   # PREFIXES
   SELECT ?id WHERE {
       ?id a nsg:DetailedCircuit
   } LIMIT 100

<action> _sparql
<error> QueryingError: 400 Client Error: Bad Request for url: https://staging.nise.bbp.epfl.ch/nexus/v1/views/neurosciencegraph/datamodels/https%3A%2F%2Fbluebrain.github.io%2Fnexus%2Fvocabulary%2FdefaultSparqlIndex/sparql



In [None]:
example_region = forge.retrieve("http://api.brain-map.org/api/v2/data/Structure/614454282")

In [None]:
query = """
SELECT ?id ?label ?preflabel
WHERE {
  ?id subClassOf "nsg:BrainRegion" .
}
 """

In [None]:
bregions = forge.sparql(query, limit=100, debug=True)

Submitted query:
   PREFIX bmc: <https://bbp.epfl.ch/ontologies/core/bmc/>
   PREFIX bmo: <https://bbp.epfl.ch/ontologies/core/bmo/>
   PREFIX commonshapes: <https://neuroshapes.org/commons/>
   PREFIX datashapes: <https://neuroshapes.org/dash/>
   PREFIX dc: <http://purl.org/dc/elements/1.1/>
   PREFIX dcat: <http://www.w3.org/ns/dcat#>
   PREFIX dcterms: <http://purl.org/dc/terms/>
   PREFIX mba: <http://api.brain-map.org/api/v2/data/Structure/>
   PREFIX nsg: <https://neuroshapes.org/>
   PREFIX nxv: <https://bluebrain.github.io/nexus/vocabulary/>
   PREFIX oa: <http://www.w3.org/ns/oa#>
   PREFIX obo: <http://purl.obolibrary.org/obo/>
   PREFIX owl: <http://www.w3.org/2002/07/owl#>
   PREFIX prov: <http://www.w3.org/ns/prov#>
   PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
   PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
   PREFIX schema: <http://schema.org/>
   PREFIX sh: <http://www.w3.org/ns/shacl#>
   PREFIX shsh: <http://www.w3.org/ns/shacl-shacl#>
   PREFI

In [None]:
bregions

[]

In [None]:
example_region.__dict__

{'context': 'https://neuroshapes.org',
 'id': 'http://api.brain-map.org/api/v2/data/Structure/614454282',
 'type': 'Class',
 'atlas_id': 966,
 'color_hex_triplet': '1F9D5A',
 'graph_order': 20,
 'hemisphere_id': 3,
 'identifier': '614454282',
 'isDefinedBy': 'http://bbp.epfl.ch/neurosciencegraph/ontologies/core/brainregion',
 'isPartOf': 'mba:943',
 'notation': 'MOp2',
 'rdfs:label': 'Primary motor area, Layer 2',
 'skos:prefLabel': 'Primary motor area, Layer 2',
 'st_level': 11,
 'subClassOf': ['nsg:BrainRegion'],
 '_last_action': Action(error=None, message=None, operation='retrieve', succeeded=True),
 '_validated': False,
 '_inner_sync': True,
 '_synchronized': True,
 '_store_metadata': {'id': 'mba:614454282',
  '_constrainedBy': 'https://neuroshapes.org/dash/ontologyentity',
  '_createdAt': '2022-05-27T07:24:49.109Z',
  '_createdBy': 'https://staging.nise.bbp.epfl.ch/nexus/v1/realms/serviceaccounts/users/service-account-brain-modeling-ontology-ci-cd',
  '_deprecated': False,
  '_inc