In [None]:
%load_ext autoreload
%autoreload 2

# Add/update embedding vectors for cell types

##  (or any type of ontological classes)

When adding a new set of embedding vectors or updating them, we need to perform the following sequence of steps:

1. Given a model id, its revision, and a set of resources, ask the service (or some Python code) for embedding vectors
2. Create/update embedding resources according to [this mapping](https://bbpgitlab.epfl.ch/dke/users/eugeniashurko/dataset-embeddings/-/blob/master/mappings/seu-embedding.hjson) --> model revision needs to be added to the `generation.activity.used.id`
3. Push them to Nexus
4. Tag them with the model UUID and its revision (e.g. `e2b953b9-6724-4278-a1e5-3472bd63e374?rev=1`)

Related JIRA tickets: 
* https://bbpteam.epfl.ch/project/issues/browse/DKE-718
* https://bbpteam.epfl.ch/project/issues/browse/DKE-715

Prerequisites:

- The embedding model has been built
- Embedding service can read models from a dedicated Nexus project where all models are stored (here, at the moment, we can download models locally and get vectors directly from the models, without using the service)
- Model ID equals the Nexus resource id of the EmbeddingModel resource
- __Important__: local contexts in the projects with vectors should contain:

```
{
      "embedding": {
        "@id": "nsg:embedding",
        "@container": "@list"
      }
}
```

Questions:

* Do we really need to URL-encode tags?
* add missing types and properties to the context

---

## Setup

### Imports

In [None]:
import requests
import getpass

from kgforge.core import KnowledgeGraphForge
from kgforge.core.resource import Resource

from inference_tools.similarity.data_registration import (create_forge_session,
                                                          load_embedding_models,
                                                          push_embedding_vectors)

In [None]:
# Print the installed kgforge version so the run records which release was used.
from kgforge.version import __version__
print(__version__)

---

## User input

In [None]:
# Forge configuration file handed to every create_forge_session call in this notebook.
CONFIG_PATH = "../../../configs/ontology-forge-config.yaml"
# Nexus endpoint of the bucket holding the embedding models; swap in the
# commented-out line below to target the staging deployment instead.
ENDPOINT = "https://bbp.epfl.ch/nexus/v1"
# ENDPOINT = "https://staging.nexus.ocp.bbp.epfl.ch/v1"
# Directory passed to load_embedding_models as its download location.
DOWNLOAD_DIR = "../../../data"
# Ask for the access token interactively so it is never written into the notebook.
TOKEN = getpass.getpass()

Bucket where embedding models live

In [None]:
# Organisation and project of the model bucket; combined with ENDPOINT when the
# Forge session for embedding models is created.
MODEL_CATALOG_ORG = "dke"
MODEL_CATALOG_PROJECT = "embedding-pipelines"

__PROVIDE HERE THE IDs OF YOUR MODELs (OPTIONAL, REVISION)__

IDs of the embedding models to use. For each type of taxonomy (mtype, ttype) we can have a list of embedding models representing different similarity aspects.

In [None]:
# Maps each ontology id to the list of embedding-model ids to use for it.
MODEL_IDS = {
    "https://bbp.epfl.ch/ontologies/core/ttypes": [
        # expression profile + taxonomy
        "https://bbp.epfl.ch/nexus/v1/resources/dke/embedding-pipelines/_/d79a408f-3356-4e98-8998-df6720cac376",
    ],
    "http://bbp.epfl.ch/neurosciencegraph/ontologies/mtypes": [
        # morph features + taxonomy
        "https://bbp.epfl.ch/nexus/v1/resources/dke/embedding-pipelines/_/35681e34-5dea-45fa-82f1-511265dc238b",
    ],
}

In [None]:
# Optional pinning of model revisions; an empty dict pins nothing.
# NOTE(review): the comment below says the key is a model id, yet the model-loading
# cell queries this dict with an ontology id — confirm which key is expected.
MODEL_REVISIONS = {}  # Specify a revision (key model_id, value revision number), if necessary.
# If not specified the latest revision is used

Buckets where the input data lives (keys), each mapped to the bucket where the new embedding vectors should be registered (values).

In [None]:
# Key: bucket holding the input data; value: bucket receiving the embedding vectors.
# Each bucket is written as an (endpoint, organisation, project) triple.
DATA_BUCKETS = {
    ("https://bbp.epfl.ch/nexus/v1", "neurosciencegraph", "datamodels"): (
        "https://bbp.epfl.ch/nexus/v1",
        "neurosciencegraph",
        "datamodels",
    ),
}

If the embedding endpoint/bucket are not specified, we assume that embeddings should live in the same bucket as the input data.

---

## Create Forge sessions

### Session for embedding models

In [None]:
# Session bound to the bucket that stores the embedding models.
forge_models = create_forge_session(
    CONFIG_PATH, (ENDPOINT, MODEL_CATALOG_ORG, MODEL_CATALOG_PROJECT), TOKEN
)

### Sessions for different buckets for data and embedding vectors

In [None]:
# TODO: find a way to pass different tokens and different configs
# Open one Forge session per distinct bucket, whether it holds input data or
# receives embedding vectors; a bucket already present is not opened twice.
# NOTE(review): an embedding bucket given as None would be passed to
# create_forge_session unchanged — confirm how an "unspecified" bucket is expressed.
FORGE_SESSIONS = {}
for data_bucket, emb_bucket in DATA_BUCKETS.items():
    for bucket in (data_bucket, emb_bucket):
        if bucket in FORGE_SESSIONS:
            continue
        FORGE_SESSIONS[bucket] = create_forge_session(CONFIG_PATH, bucket, TOKEN)

---

## Load the embedding model

In [None]:
# Load the models listed for every ontology and store the three values returned
# by load_embedding_models under that ontology's id.
# NOTE(review): MODEL_REVISIONS is described as keyed by model id but is queried
# here with the ontology id — confirm which key is intended.
model_revisions, model_tags, pipelines = {}, {}, {}
for ontology_id, ontology_models in MODEL_IDS.items():
    revisions = MODEL_REVISIONS.get(ontology_id)
    loaded_revisions, loaded_tags, loaded_pipelines = load_embedding_models(
        forge_models,
        ontology_models,
        model_revisions=revisions,
        dowload_dir=DOWNLOAD_DIR,  # keyword spelling kept exactly as this call already uses it
    )
    model_revisions[ontology_id] = loaded_revisions
    model_tags[ontology_id] = loaded_tags
    pipelines[ontology_id] = loaded_pipelines

## Fetch resources from data buckets

Data type filter for generating embedding vectors

In [None]:
# Limit on number of resources we can retrieve with SPARQL queries.
HARD_RESOURCE_LIMIT = 10_000
# Type used to filter the resources for which embedding vectors are generated.
DATA_TYPE_FILTER = "Class"

If you want to register vectors only for existing classes, run the following:

In [None]:
# Alternative to the next code cell: uncomment to build resource_set from the
# non-deprecated resources of type DATA_TYPE_FILTER defined by each ontology.
# NOTE(review): `bucket_config not in resource_set` compares a bucket tuple with
# ontology-id keys, so the guard is always true; it probably should test
# `resource_set[ontology_id]` — confirm before enabling this cell.
# resource_set = {}
# for ontology_id in MODEL_IDS.keys():
#     resource_set[ontology_id] = {}
#     for bucket_config in DATA_BUCKETS.keys():
#         if bucket_config not in resource_set:
#             forge = FORGE_SESSIONS[bucket_config]
#             query = f"""
#                 SELECT ?id
#                 WHERE {{
#                     ?id a {DATA_TYPE_FILTER} ;
#                         <http://www.w3.org/2000/01/rdf-schema#isDefinedBy> <{ontology_id}>;
#                         <https://bluebrain.github.io/nexus/vocabulary/deprecated> false .
#                 }}
#             """ 
#             resources = forge.sparql(query, limit=HARD_RESOURCE_LIMIT)
#             resources = [forge.retrieve(r.id) for r in resources] 
#             resource_set[ontology_id][bucket_config] = resources

If you want to register vectors for all points from the embedding model, run the following:

In [None]:
# For every ontology and data bucket, wrap each point id known to the embedding
# pipeline in a bare Resource.
# NOTE(review): when an ontology lists several models, each model overwrites the
# entry written by the previous one, so only the last model's points remain.
resource_set = {}
for ontology_id, ontology_pipelines in pipelines.items():
    resource_set[ontology_id] = {}
    for model in MODEL_IDS[ontology_id]:
        for bucket in DATA_BUCKETS:
            pipeline = ontology_pipelines[model]
            point_ids = pipeline.get_point_ids().tolist()
            resource_set[ontology_id][bucket] = [Resource(id=el) for el in point_ids]

In [None]:
# Summarise how many resources were collected per ontology and per bucket.
for ontology_id, per_bucket in resource_set.items():
    print("Ontology: ", ontology_id)
    for bucket, resources in per_bucket.items():
        print("Bucket: ", bucket)
        # end="\n\n" emits the blank separator line after each count.
        print("\t", len(resources), "resources", end="\n\n")

## Compute embedding vectors for all the resources and push to Nexus

- TODO: add the NeuronMorphology revision once available
- TODO: add prediction of previously unseen points (currently, only the in-sample points are considered)

In [None]:
# Path of the mapping file handed to push_embedding_vectors as its last argument.
SEU_DICTIONARY_MAPPING = "../../../mappings/seu-embedding.hjson"

In [None]:
# Push the vectors ontology by ontology, passing the per-ontology slices of
# everything prepared in the previous cells.
for ontology_id, ontology_models in MODEL_IDS.items():
    push_embedding_vectors(
        FORGE_SESSIONS,
        DATA_BUCKETS,
        ontology_models,
        model_revisions[ontology_id],
        model_tags[ontology_id],
        pipelines[ontology_id],
        resource_set[ontology_id],
        SEU_DICTIONARY_MAPPING,
    )

The following tags should be used to create new ES views on the vectors.

In [None]:
model_tags