# Embedder API example: node embedding

To run this notebook, set up and start the embedding service as described [here](https://github.com/BlueBrain/BlueGraph/blob/master/services/embedder/README.rst).

You may want to modify the following configs in `services/embedder/configs/app_config.py`:

- `DOWNLOAD_DIR = "downloads/"`: Directory into which embedding pipelines are downloaded and from which they are served
- `LOCAL = True`: Flag indicating whether to serve embedding pipelines stored in the local `DOWNLOAD_DIR` (`True`) or hosted in Nexus (`False`)

By default, the `services/embedder/downloads` folder is used and `LOCAL` is set to `True`. This folder contains two example models (`Cord-19-NCIT-linking` and `Attri2vec_test_model`) distributed along with the source code.

In [1]:
import requests
import json
import getpass

In [31]:
# Base URL of the running embedding service; every request in this notebook is sent to it.
ENDPOINT = "http://127.0.0.1:5000"

## Get all the models in the catalogue

In [3]:
# Ask the service for every model registered in its catalogue.
r = requests.get(url=f"{ENDPOINT}/models/")
print(r)  # HTTP status of the call
r.json()  # catalogue keyed by model id

<Response [200]>


{'models': {'Attri2vec_test_model': {'id': 'Attri2vec_test_model',
   'name': 'Attri2vec_test_model',
   'description': 'Attri2vec_test_model',
   'filename': 'downloads/Attri2vec_test_model.zip',
   'created': 'Tue May 11 17:43:42 2021',
   'modified': 'Tue May 11 17:35:03 2021'},
  'Cord-19-NCIT-linking': {'id': 'Cord-19-NCIT-linking',
   'name': 'Cord-19-NCIT-linking',
   'description': 'Cord-19-NCIT-linking',
   'filename': 'downloads/Cord-19-NCIT-linking.zip',
   'created': 'Tue Jun  1 12:36:56 2021',
   'modified': 'Tue Jun  1 12:36:56 2021'}}}

## Get a model by name

In [4]:
# Model queried by the remaining cells; one of the ids returned by the `/models/` request above.
MODEL_NAME = "Attri2vec_test_model"

In [5]:
# Fetch the catalogue entry of a single model, addressed by its name.
r = requests.get(url=f"{ENDPOINT}/model/{MODEL_NAME}")
print(r)  # HTTP status of the call
r.json()  # metadata of the selected model

<Response [200]>


{'id': 'Attri2vec_test_model',
 'name': 'Attri2vec_test_model',
 'description': 'Attri2vec_test_model',
 'filename': 'downloads/Attri2vec_test_model.zip',
 'created': 'Tue May 11 17:43:42 2021',
 'modified': 'Tue May 11 17:35:03 2021'}

## Get details on different model components

In [6]:
# Inspect the configuration of the model's preprocessor component.
r = requests.get(url=f"{ENDPOINT}/model/{MODEL_NAME}/details/preprocessor/")
print(r)  # HTTP status of the call
r.json()  # preprocessor settings as returned by the service

<Response [200]>


{'heterogeneous': 'False',
 'drop_types': 'False',
 'encode_types': 'False',
 'edge_features': 'False',
 'categorical_encoding': 'multibin',
 'text_encoding': 'tfidf',
 'text_encoding_max_dimension': '512',
 'missing_numeric': 'drop',
 'imputation_strategy': 'mean',
 'standardize_numeric': 'True',
 'node_properties': "['definition']",
 'edge_properties': 'None',
 'interface': 'ScikitLearnPGEncoder'}

In [7]:
# Inspect the configuration of the model's embedder component.
r = requests.get(url=f"{ENDPOINT}/model/{MODEL_NAME}/details/embedder/")
print(r)  # HTTP status of the call
r.json()  # embedder settings as returned by the service

<Response [200]>


{'interface': 'StellarGraphNodeEmbedder',
 'model_type': 'inductive',
 'trained': 'True',
 'model_name': 'attri2vec',
 'model_params': "{'length': 5, 'number_of_walks': 10, 'epochs': 5, 'embedding_dimension': 128, 'batch_size': 20, 'negative_samples': 10, 'num_samples': [10, 5], 'random_walk_p': 0.5, 'random_walk_q': 2.0, 'clusters': 2, 'clusters_q': 1}",
 'graph_configs': "{'directed': True, 'include_type': False, 'feature_props': None, 'feature_vector_prop': 'features', 'edge_weight': 'npmi'}"}

In [8]:
# Inspect the configuration of the model's similarity-processor component.
r = requests.get(url=f"{ENDPOINT}/model/{MODEL_NAME}/details/similarity-processor/")
print(r)  # HTTP status of the call
r.json()  # similarity-processor settings as returned by the service

<Response [200]>


{'similarity': 'cosine',
 'dimension': '128',
 'segmented': 'False',
 'interface': 'SimilarityProcessor'}

## Get resource embeddings

In [9]:
%%time
# Request the stored embedding vectors of several resources in one call.
# The last id is deliberately one that the model's index does not contain.
r = requests.get(
    url=f"{ENDPOINT}/model/{MODEL_NAME}/embedding/",
    params={
        "resource_ids": [
            "dna replication",
            "glucose",
            "covid-19 infection",
            "lalala not in the index",
        ],
    },
)
print(r)  # HTTP status of the call
r.json()  # embeddings keyed by resource id

<Response [200]>
CPU times: user 5.38 ms, sys: 2.5 ms, total: 7.88 ms
Wall time: 9 ms


{'embeddings': {'dna replication': [0.08470042794942856,
   0.09900690615177155,
   0.07597528398036957,
   0.08027490973472595,
   0.08072628825902939,
   0.07598402351140976,
   0.08475766330957413,
   0.07523154467344284,
   0.09625132381916046,
   0.08713880181312561,
   0.10648053139448166,
   0.06859174370765686,
   0.08339131623506546,
   0.13870905339717865,
   0.08356621116399765,
   0.06849964708089828,
   0.056576866656541824,
   0.07129482179880142,
   0.08577119559049606,
   0.05917414277791977,
   0.06380248814821243,
   0.0735551044344902,
   0.11149250715970993,
   0.08344874531030655,
   0.08202512562274933,
   0.05940954014658928,
   0.08951541036367416,
   0.0801745057106018,
   0.08328740298748016,
   0.15063944458961487,
   0.08265821635723114,
   0.08628937602043152,
   0.0857185646891594,
   0.07131116837263107,
   0.10163924098014832,
   0.0789499580860138,
   0.08741418272256851,
   0.08072108775377274,
   0.06931886076927185,
   0.07911945879459381,
   0.08722

## Get nearest neighbors

In [10]:
%%time
# Retrieve the k most similar indexed points for each requested resource.
# In the saved output, ids the model does not know come back as None.
r = requests.get(
    url=f"{ENDPOINT}/model/{MODEL_NAME}/similar-points/",
    params={
        "resource_ids": [
            "glucose",
            "covid-19 infection",
            "dna replication",
            "lalala not in the index",
        ],
        "k": 20,
    },
)
print(r)  # HTTP status of the call
r.json()  # neighbour lists keyed by resource id

<Response [200]>
CPU times: user 5.03 ms, sys: 2.44 ms, total: 7.47 ms
Wall time: 15.2 ms


{'similar_points': {'glucose': ['glucose',
   'fatigue',
   'molecule',
   'food',
   'interferon',
   'nitric oxide',
   'anxiety',
   'water',
   'depression',
   'endotracheal',
   'dizziness',
   'bicarbonate ion',
   'nasal',
   'congenital abnormality',
   'dog',
   'subarachnoid hemorrhage',
   'pruritus',
   'intracranial hemorrhage',
   'proximal',
   'anal injury'],
  'covid-19 infection': None,
  'dna replication': ['dna replication',
   'mutation abnormality',
   'electron',
   'human respiratory syncytial virus',
   'transcription factor',
   'human immunodeficiency virus 1',
   'microorganism',
   'phenol',
   'fungus',
   'nucleotide',
   'dacarbazine',
   'yeast',
   'acetylcysteine',
   'herpesvirus',
   'immune',
   'cyclophosphamide',
   'tacrolimus',
   'virus',
   'pharmacologic substance',
   'sus'],
  'lalala not in the index': None}}

In [11]:
%%time
# Same neighbour query as before, but with "values" switched on: in the saved output
# each neighbour is then returned together with a numeric score.
r = requests.get(
    url=f"{ENDPOINT}/model/{MODEL_NAME}/similar-points/",
    params={
        "resource_ids": [
            "glucose",
            "covid-19 infection",
            "dna replication",
            "lalala not in the index",
        ],
        "k": 20,
        "values": True,
    },
)
print(r)  # HTTP status of the call
r.json()  # neighbour -> score mappings keyed by resource id

<Response [200]>
CPU times: user 3.18 ms, sys: 1.33 ms, total: 4.51 ms
Wall time: 5.41 ms


{'similar_points': {'glucose': {'glucose': 1.0,
   'fatigue': 0.9895962476730347,
   'molecule': 0.9888150691986084,
   'food': 0.9882543087005615,
   'interferon': 0.9881671667098999,
   'nitric oxide': 0.9875469207763672,
   'anxiety': 0.987397313117981,
   'water': 0.987397313117981,
   'depression': 0.987397313117981,
   'endotracheal': 0.987397313117981,
   'dizziness': 0.987397313117981,
   'bicarbonate ion': 0.987397313117981,
   'nasal': 0.987397313117981,
   'congenital abnormality': 0.987397313117981,
   'dog': 0.987397313117981,
   'subarachnoid hemorrhage': 0.987397313117981,
   'pruritus': 0.987397313117981,
   'intracranial hemorrhage': 0.987397313117981,
   'proximal': 0.987397313117981,
   'anal injury': 0.987397313117981},
  'covid-19 infection': None,
  'dna replication': {'dna replication': 1.0,
   'mutation abnormality': 0.9902889728546143,
   'electron': 0.9891344308853149,
   'human respiratory syncytial virus': 0.9880732297897339,
   'transcription factor': 0.987

## Predict embeddings for unseen graph nodes in JSON

In [12]:
# Load the example co-occurrence graph from `examples/data` (path is relative to the
# kernel's working directory). The file is read explicitly as UTF-8 so that parsing
# does not depend on the platform's default text encoding.
with open("../../../../examples/data/test_cooccurrence_graph.json", "r", encoding="utf-8") as f:
    graph_json = json.load(f)

In [13]:
%%time
# Send the JSON-serialized graph to the service and ask the model to embed its nodes.
r = requests.post(
    f'{ENDPOINT}/model/{MODEL_NAME}/embedding/',
    json={
        "data": graph_json,
        "data_type": "json_pgframe"
    })
print(r)
# Stop with the HTTP status if the service rejected the request, instead of failing
# later with an unrelated KeyError / JSON decoding error on the line below.
r.raise_for_status()
vectors = r.json()["embeddings"]

<Response [200]>
CPU times: user 44.7 ms, sys: 6.64 ms, total: 51.3 ms
Wall time: 624 ms


In [14]:
%%time
# Look up the k most similar indexed points for each of the predicted vectors.
# The vectors travel in the JSON body; the query options stay in the URL parameters.
r = requests.post(
    url=f"{ENDPOINT}/model/{MODEL_NAME}/similar-points/",
    params={"k": 20, "values": True},
    json={"vectors": vectors},
)
print(r)  # HTTP status of the call
r.json()  # one neighbour -> score mapping per input vector

<Response [200]>
CPU times: user 51.1 ms, sys: 4.92 ms, total: 56 ms
Wall time: 111 ms


{'similar_points': [{'anal injury': 1.0,
   'water': 1.0,
   'cerebral hemorrhage': 1.0,
   'constipation': 1.0,
   'subarachnoid hemorrhage': 1.0,
   'endotracheal': 1.0,
   'pruritus': 1.0,
   'caddo language': 1.0,
   'anxiety': 1.0,
   'nasal': 1.0,
   'proximal': 1.0,
   'intracranial hemorrhage': 1.0,
   'brother': 1.0,
   'dizziness': 1.0,
   'cat': 1.0,
   'depression': 1.0,
   'deny': 1.0,
   'dog': 1.0,
   'bicarbonate ion': 1.0,
   'congenital abnormality': 1.0},
  {'urinary system': 0.996371865272522,
   'muscle': 0.9944179654121399,
   'hematuria': 0.9942547678947449,
   'proximal': 0.9934683442115784,
   'dizziness': 0.9934683442115784,
   'dog': 0.9934683442115784,
   'endotracheal': 0.9934683442115784,
   'water': 0.9934683442115784,
   'depression': 0.9934683442115784,
   'constipation': 0.9934683442115784,
   'bicarbonate ion': 0.9934683442115784,
   'subarachnoid hemorrhage': 0.9934683442115784,
   'anal injury': 0.9934683442115784,
   'anxiety': 0.9934683442115784,


## Predict embeddings for unseen graph nodes from a Nexus dataset

In [15]:
import getpass

In [32]:
# Set NEXUS_ENDPOINT, BUCKET and RESOURCE_ID to identify the Nexus resource that holds
# a JSON-serialized PGFrame. These values are passed to the embedding service in the
# "nexus_dataset" request below.
NEXUS_ENDPOINT = "https://staging.nexus.ocp.bbp.epfl.ch/v1"
BUCKET = "dke/embedder_catalog"
RESOURCE_ID = "https://staging.nexus.ocp.bbp.epfl.ch/v1/resources/dke/embedder_catalog/_/aa9fc8c3-e559-4c9a-be9c-2a0aa92da8cf"

In [23]:
# Prompt for the access token (the input is not echoed). It is sent as a Bearer token
# in the Authorization header of the "nexus_dataset" request below.
TOKEN = getpass.getpass()

········


In [25]:
# NOTE(review): this looks like a leftover inspection cell. The saved output (a Nexus
# "HttpMethodNotAllowed" error) does not match any request visible in this notebook,
# so it presumably came from a cell that was since removed. On a fresh
# Restart & Run All, `r` here is the response of the preceding similar-points request.
# Consider deleting this cell.
r.json()

{'@context': 'https://bluebrain.github.io/nexus/contexts/error.json',
 '@type': 'HttpMethodNotAllowed',
 'reason': 'HTTP method not allowed, supported methods: GET.'}

In [33]:
%%time
# Ask the service to embed the nodes of a graph stored in Nexus. Only the location of
# the dataset is sent; the token is passed along in the Authorization header.
r = requests.post(
    f'{ENDPOINT}/model/{MODEL_NAME}/embedding/',
    headers={'Authorization': f'Bearer {TOKEN}'},
    json={
        "data": {
            "endpoint": NEXUS_ENDPOINT,
            "bucket": BUCKET,
            "resource_id": RESOURCE_ID
        },
        "data_type": "nexus_dataset",
    })
print(r)
# Stop with the HTTP status if the request failed (e.g. a bad token or resource id),
# instead of failing later with an unrelated KeyError / JSON decoding error below.
r.raise_for_status()
vectors = r.json()["embeddings"]

<Response [200]>
CPU times: user 22.4 ms, sys: 3.25 ms, total: 25.7 ms
Wall time: 7.69 s


In [75]:
%%time
# Look up the k most similar indexed points for each vector held in `vectors`.
# The vectors travel in the JSON body; the query options stay in the URL parameters.
r = requests.post(
    url=f"{ENDPOINT}/model/{MODEL_NAME}/similar-points/",
    params={"k": 20, "values": True},
    json={"vectors": vectors},
)
print(r)  # HTTP status of the call
r.json()  # one neighbour -> score mapping per input vector

<Response [200]>
CPU times: user 56.4 ms, sys: 3.13 ms, total: 59.6 ms
Wall time: 122 ms


{'similar_points': [{'anal injury': 1.0,
   'water': 1.0,
   'cerebral hemorrhage': 1.0,
   'constipation': 1.0,
   'subarachnoid hemorrhage': 1.0,
   'endotracheal': 1.0,
   'pruritus': 1.0,
   'caddo language': 1.0,
   'anxiety': 1.0,
   'nasal': 1.0,
   'proximal': 1.0,
   'intracranial hemorrhage': 1.0,
   'brother': 1.0,
   'dizziness': 1.0,
   'cat': 1.0,
   'depression': 1.0,
   'deny': 1.0,
   'dog': 1.0,
   'bicarbonate ion': 1.0,
   'congenital abnormality': 1.0},
  {'urinary system': 0.996371865272522,
   'muscle': 0.9944179654121399,
   'hematuria': 0.9942547678947449,
   'proximal': 0.9934683442115784,
   'dizziness': 0.9934683442115784,
   'dog': 0.9934683442115784,
   'endotracheal': 0.9934683442115784,
   'water': 0.9934683442115784,
   'depression': 0.9934683442115784,
   'constipation': 0.9934683442115784,
   'bicarbonate ion': 0.9934683442115784,
   'subarachnoid hemorrhage': 0.9934683442115784,
   'anal injury': 0.9934683442115784,
   'anxiety': 0.9934683442115784,
