# Embedder API example: term embedding

In order to run this notebook, set up and start the embedding service as described [here](https://github.com/BlueBrain/BlueGraph/blob/master/services/embedder/README.rst).

You may want to modify the following configs in `services/embedder/configs/app_config.py`:

- `DOWNLOAD_DIR = "downloads/"`: Directory into which embedding pipelines are downloaded, or from which they are served
- `LOCAL = True`: Flag indicating whether you would like to serve embedding pipelines stored in the local `DOWNLOAD_DIR` (`True`) or hosted in Nexus (`False`)

By default, the `services/embedder/downloads` folder is used and `LOCAL` is set to `True`. This folder contains two example models (`Cord-19-NCIT-linking` and `Attri2vec_test_model`) distributed along with the source code.

In [32]:
import requests

In [33]:
# Base URL of the embedding service; every request below is built on top of it.
ENDPOINT = 'http://127.0.0.1:5000'

## Get all the models in the catalogue

In [37]:
# Ask the service for its catalogue of models.
r = requests.get(url=f"{ENDPOINT}/models/")
print(r)  # status line of the HTTP response
r.json()  # last expression: shown as the cell output

<Response [200]>


{'models': {'Attri2vec_test_model': {'id': 'Attri2vec_test_model',
   'name': 'Attri2vec_test_model',
   'description': 'Attri2vec_test_model',
   'filename': 'downloads/Attri2vec_test_model.zip',
   'created': 'Tue Jul  6 16:32:33 2021',
   'modified': 'Tue Jul  6 15:15:48 2021'},
  'Cord-19-NCIT-linking': {'id': 'Cord-19-NCIT-linking',
   'name': 'Cord-19-NCIT-linking',
   'description': 'Cord-19-NCIT-linking',
   'filename': 'downloads/Cord-19-NCIT-linking.zip',
   'created': 'Tue Jul  6 16:32:33 2021',
   'modified': 'Tue Jun  1 12:36:56 2021'}}}

## Get a model by name

In [38]:
# Name of the model that all of the following requests are addressed to.
MODEL_NAME = 'Cord-19-NCIT-linking'

In [39]:
# Look up the catalogue entry of the single model selected by MODEL_NAME.
r = requests.get(url=f"{ENDPOINT}/models/{MODEL_NAME}")
print(r)  # status line of the HTTP response
r.json()  # last expression: shown as the cell output

<Response [200]>


{'id': 'Cord-19-NCIT-linking',
 'name': 'Cord-19-NCIT-linking',
 'description': 'Cord-19-NCIT-linking',
 'filename': 'downloads/Cord-19-NCIT-linking.zip',
 'created': 'Tue Jul  6 16:32:33 2021',
 'modified': 'Tue Jun  1 12:36:56 2021'}

## Get details on different model components

In [40]:
# Query the details of the model's preprocessor component.
r = requests.get(url=f"{ENDPOINT}/models/{MODEL_NAME}/preprocessor/")
print(r)  # status line of the HTTP response
r.json()  # last expression: shown as the cell output

<Response [200]>


{'analyzer': 'char',
 'binary': 'False',
 'decode_error': 'strict',
 'dtype': "<class 'numpy.float32'>",
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': 'True',
 'max_df': '1.0',
 'max_features': '1024',
 'min_df': '0.0001',
 'ngram_range': '(3, 3)',
 'norm': 'l2',
 'preprocessor': 'None',
 'smooth_idf': 'True',
 'stop_words': 'None',
 'strip_accents': 'None',
 'sublinear_tf': 'False',
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': 'None',
 'use_idf': 'True',
 'vocabulary': 'None',
 'interface': 'TfIdfEncoder'}

In [41]:
# Query the details of the model's embedder component.
# In the recorded run the service answers 404 with an explanatory message,
# so the status is printed rather than raised on.
r = requests.get(url=f"{ENDPOINT}/models/{MODEL_NAME}/embedder/")
print(r)  # status line of the HTTP response
r.json()  # last expression: shown as the cell output

<Response [404]>


{'success': False, 'message': 'Model does not contain an embedder'}

In [42]:
# Query the details of the model's similarity-processor component.
r = requests.get(url=f"{ENDPOINT}/models/{MODEL_NAME}/similarity-processor/")
print(r)  # status line of the HTTP response
r.json()  # last expression: shown as the cell output

<Response [200]>


{'similarity': 'euclidean',
 'dimension': '1024',
 'segmented': 'True',
 'interface': 'SimilarityProcessor'}

## Get resource embeddings

In [43]:
%%time
# Fetch the embedding vectors of several resources in one request.
# The last id is, going by its text, one that is not present in the index.
r = requests.get(
    url=f"{ENDPOINT}/models/{MODEL_NAME}/embedding",
    params={
        "resource_ids": [
            "dna replication",
            "glucose",
            "covid-19 infection",
            "lalala not in the index",
        ],
    },
)
print(r)  # status line of the HTTP response
r.json()  # last expression: shown as the cell output

<Response [200]>
CPU times: user 8.03 ms, sys: 3.29 ms, total: 11.3 ms
Wall time: 45.8 ms


{'vectors': {'dna replication': [0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.3091674745082855,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   

In [44]:
%%time
# Same embedding request as in the previous cell, issued a second time.
# NOTE(review): presumably repeated to compare the timing of a second call - confirm.
r = requests.get(
    url=f"{ENDPOINT}/models/{MODEL_NAME}/embedding",
    params={
        "resource_ids": [
            "dna replication",
            "glucose",
            "covid-19 infection",
            "lalala not in the index",
        ],
    },
)
print(r)  # status line of the HTTP response
r.json()  # last expression: shown as the cell output

<Response [200]>
CPU times: user 5.85 ms, sys: 1.95 ms, total: 7.8 ms
Wall time: 9.7 ms


{'vectors': {'dna replication': [0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.3091674745082855,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   

## Get nearest neighbors

In [45]:
%%time
# Request the nearest neighbors of each listed resource, with k set to 20.
r = requests.get(
    url=f"{ENDPOINT}/models/{MODEL_NAME}/neighbors/",
    params={
        "resource_ids": [
            "glucose",
            "covid-19 infection",
            "dna replication",
            "lalala not in the index",
        ],
        "k": 20,
    },
)
print(r)  # status line of the HTTP response
r.json()  # last expression: shown as the cell output

<Response [200]>
CPU times: user 5.67 ms, sys: 2.65 ms, total: 8.33 ms
Wall time: 18.8 ms


{'neighbors': {'glucose': ['2-deoxy-d-glucose',
   'hexose',
   'glucose',
   'fucose',
   'stachyose',
   'acarbose',
   'purpose',
   'marmoset',
   'voglibose',
   'ribose',
   'rhamnose',
   'nose',
   'choose',
   'xylose',
   'amylose',
   'azosemide',
   'camiglibose',
   'cefoselis',
   'maltose',
   'raffinose'],
  'covid-19 infection': ['ebv infection',
   'hhv8 infection',
   'hiv infection',
   'htlv-1 infection',
   'hhv6 infection',
   'covid-19 infection',
   'hpv16 infection',
   'infection',
   'eye infection',
   'gum infection',
   'tinea infection',
   'lip infection',
   'rubella infection',
   'tooth infection',
   'cecal infection',
   'nail infection',
   'skin infection',
   'vulval infection',
   'fungal infection',
   'ear infection'],
  'dna replication': ['dna replication',
   'replication error',
   'publication',
   'dna replication fork',
   'dna replication damage',
   'replicate',
   'dna replication pathway',
   'multiplication',
   'dna replication t

In [46]:
%%time
# Same neighbor query as before, additionally passing values=True.
# In the recorded output each neighbor then comes paired with a number
# instead of being listed by name only.
r = requests.get(
    url=f"{ENDPOINT}/models/{MODEL_NAME}/neighbors/",
    params={
        "resource_ids": [
            "glucose",
            "covid-19 infection",
            "dna replication",
            "lalala not in the index",
        ],
        "k": 20,
        "values": True,
    },
)
print(r)  # status line of the HTTP response
r.json()  # last expression: shown as the cell output

<Response [200]>
CPU times: user 6.55 ms, sys: 2.2 ms, total: 8.76 ms
Wall time: 17.8 ms


{'neighbors': {'glucose': {'2-deoxy-d-glucose': 0.0,
   'hexose': 0.0,
   'glucose': 0.0,
   'fucose': 0.0,
   'stachyose': 0.32602882385253906,
   'acarbose': 0.34396257996559143,
   'purpose': 0.4933681786060333,
   'marmoset': 0.5248439908027649,
   'voglibose': 0.5394153594970703,
   'ribose': 0.5394153594970703,
   'rhamnose': 0.5634313821792603,
   'nose': 0.5634313821792603,
   'choose': 0.5669065713882446,
   'xylose': 0.5952121615409851,
   'amylose': 0.5952121615409851,
   'azosemide': 0.6985251903533936,
   'camiglibose': 0.7662659883499146,
   'cefoselis': 0.7786716818809509,
   'maltose': 0.7811556458473206,
   'raffinose': 0.8184324502944946},
  'covid-19 infection': {'ebv infection': 0.0,
   'hhv8 infection': 0.0,
   'hiv infection': 0.0,
   'htlv-1 infection': 0.0,
   'hhv6 infection': 0.0,
   'covid-19 infection': 0.0,
   'hpv16 infection': 0.0,
   'infection': 0.10975020378828049,
   'eye infection': 0.12972569465637207,
   'gum infection': 0.19268672168254852,
   'ti

## Predict embeddings for unseen points

In [47]:
%%time
# Send terms in the request body (rather than resource ids) and ask the
# model for their embedding vectors.
r = requests.post(
    f'{ENDPOINT}/models/{MODEL_NAME}/embedding/',
    json={
        "data": ["hello world", "protein", "coronavirus"],
    })
print(r)
# Fail fast on a 4xx/5xx answer: otherwise the lookup below would surface as an
# opaque KeyError / JSON decoding error and `vectors` would be left unset for
# the cell that consumes it.
r.raise_for_status()
vectors = r.json()["vectors"]

<Response [200]>
CPU times: user 4.75 ms, sys: 2.64 ms, total: 7.38 ms
Wall time: 10.5 ms


In [48]:
%%time
# Search nearest neighbors for the vectors obtained from the previous POST,
# sending the vectors in the JSON body and the search options as query parameters.
r = requests.post(
    url=f"{ENDPOINT}/models/{MODEL_NAME}/neighbors/",
    params={"k": 20, "values": True},
    json={"vectors": vectors},
)
print(r)  # status line of the HTTP response
r.json()  # last expression: shown as the cell output

<Response [200]>
CPU times: user 5.47 ms, sys: 2.17 ms, total: 7.64 ms
Wall time: 17.7 ms


{'neighbors': [{'shell': 0.8483226299285889,
   'shellfish': 0.8483226299285889,
   "st. john's wort": 0.8498218655586243,
   'how worried': 0.8998590707778931,
   'got worse': 0.9149569272994995,
   'frizzled-2': 1.0,
   '3p21.3-p21.1': 1.0,
   '9p13': 1.0,
   '1p13.3-p13.1': 1.0,
   '16p': 1.0,
   'unlikely': 1.0,
   '19: 56026591-56020357': 1.0,
   '21q21.3': 1.0,
   '3: 52419049-52410067': 1.0,
   '2q37': 1.0,
   'cdc73 np_078805.3:p.m1v': 1.0,
   '10: 135229746-135241501': 1.0,
   '1q22-q23': 1.0,
   '13q34': 1.0,
   '10: 115793796-115795518': 1.0},
  {'glycoprotein': 0.0,
   'protein': 0.0,
   'protein xrp2': 0.13581442832946777,
   'protein btg1': 0.13581442832946777,
   'protein nlrc5': 0.13581442832946777,
   'protein wnt-10b': 0.13581442832946777,
   'protein wnt-2b': 0.13581442832946777,
   'protein emsy': 0.13581442832946777,
   'protein wnt-16': 0.13581442832946777,
   'protein hira': 0.13581442832946777,
   'protein wnt-5a': 0.13581442832946777,
   'protein znf365': 0.135