In [None]:
# Reload imported modules automatically before running each cell, so edits to
# the project's .py files are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

## TODO

- Make sure boosting factors are computed with __normalized__ scores (so we need to pass the new statistics)


## Forge issues to address

- `forge.search({"type": "Embedding"}, limit=200)` throws 'Server disconnected'. Current (very slow!) workaround:

```
query = f"""
    SELECT ?id
    WHERE {{
        ?id a {DATA_TYPE_FILTER} ;
            <https://bluebrain.github.io/nexus/vocabulary/deprecated> false .
    }}
""" 
resources = forge.sparql(query, limit=HARD_RESOURCE_LIMIT)
resources = [forge.retrieve(r.id) for r in resources] 

```


- `forge.elastic` prints the query it executes even without debug (with debug, it prints the query twice)
- `forge.elastic` expects the `limit` parameter (cannot ask for all the documents), current workaround: set `HARD_RESOURCE_LIMIT=10000`, some large number so that all the resources can be fetched.
- `forge.update` after retrieve adds a full context payload

## Context issues

Add new types:
- `EmbeddingModel`
- `Embedding`
- `SimilarityBoostingFactor`
- `ElasticSearchViewStatistics`
- `RecommenderConfiguration`
- any other properties to add:
    - from `EmbeddingModel`: `similarity`, `vectorDimension`
    - from `Embedding`: `embedding`
    - from `SimilarityBoostingFactor`: `scriptScore`, `vectorParameter`
    - from `ElasticSearchViewStatistics`: `boosted`, `scriptScore`, `vectorParameter` 
    - from `RecommenderConfiguration`: `embeddingModel`, `boostingViewmodel`, `similarityView`, `statisticsView`
    

# Add/update a set of embedding vectors


When adding/updating a set of embedding vectors, we need to perform the following sequence of steps

I. Create a new ES view for the new/updated vectors as follows

1. Get dimensions of the embedding vectors
2. Create a Nexus ES View resource with:
- `resourceTypes` being `Embedding`
- mapping that has `"embedding": "dense_vector"` with the right dimensions
- `resourceTag` field corresponds to the model UUID and its revision (e.g. `e2b953b9-6724-4278-a1e5-3472bd63e374?rev=1`)

 
II. Update an existing similarity aspect in the recommender config

__Pre-requisites:__ the `RecommenderConfiguration` resource exists and the aspect has already been added to it (see `Add new similarity aspects.ipynb`)

1. Create a new aggregated view including the new similarity view. This view will be the new master view. Make sure all the vectors have been indexed. 
2. Compute raw statistics (min/max/mean/std) of similarity values from the master view and push them as an `ElasticSearchViewStatistics` resource (created if it doesn't exist, updated if it exists), tagged with the new revision of the master view.
3. Compute boosting factors for all the data points (vectors) indexed by the master view and push them as separate resources into respective projects (create if don't exist, update if exist). Tag them by the new revision of the master view.
4. In the bucket with embedding data create a new ES view for boosting factors (tagged by the new master view id). Make sure that all the boosting factors have finished indexing.
5. Compute statistics (min/max/mean/std) of similarity values from the master view with boosting, push and tag them with the new revision of the master view.
6. Create a new ES view serving statistics (both raw and boosted) tagged with the new revision of the master view. Make sure that all the stats have finished indexing.
7. Create a new aggregated view for boosting factors targeting all the new boosting ES views.
8. Update the Recommender Configuration to point to the new revision of the master view, the new ES view with the stats and the new aggregated view with the boosting factors.
9. In each of the individual projects deprecate the old boosting ES view and the old ES view serving embedding vectors (if such exists).
10. If necessary, deprecate old stats view and the old aggregated view for boosting.

Related JIRA tickets: 
* https://bbpteam.epfl.ch/project/issues/browse/DKE-718
* https://bbpteam.epfl.ch/project/issues/browse/DKE-715

# Setup

## Imports

In [None]:
import copy
import getpass
import math
import requests
import urllib
import time

from collections import OrderedDict

import numpy as np
import nexussdk as nxs

from collections import namedtuple
from urllib.parse import quote_plus
from kgforge.core import KnowledgeGraphForge

from inference_tools.query.elastic_search import check_view_readiness, set_elastic_view, get_all_documents
from inference_tools.similarity.data_registration import (BucketConfiguration, create_forge_session,
                                                          add_views_with_replacement, register_stats,
                                                          register_boosting_data)
from inference_tools.similarity.utils import compute_statistics, compute_boosting_factors
from inference_tools.similarity.es_mappings import get_es_view_mappings, BOOSTING_VIEW_MAPPING, STATS_VIEW_MAPPING

In [None]:
# Print the installed kgforge version so the run records which release was used
from kgforge.version import __version__
print(__version__)

In [None]:
def get_current_config(config_resource, model_id):
    """Find the configuration entry that belongs to a given embedding model.

    Parameters
    ----------
    config_resource
        Object whose `configuration` attribute is either a single entry or a
        list of entries; every entry exposes `embeddingModel.id`.
    model_id
        Identifier compared against each entry's `embeddingModel.id`.

    Returns
    -------
    The matching entry, or None when no entry matches. When several entries
    of a list match, the last one is returned.
    """
    configuration = config_resource.configuration
    # Treat the single-entry form as a one-element list so one code path serves both
    entries = configuration if isinstance(configuration, list) else [configuration]
    matching = [entry for entry in entries if entry.embeddingModel.id == model_id]
    return matching[-1] if matching else None


def update_current_config(forge, config_resource, current_config):
    """Put `current_config` into the resource's configuration list and push the update.

    Parameters
    ----------
    forge
        Session object whose `update` method is called with the modified resource.
    config_resource
        Resource whose `configuration` attribute (single entry or list) is rewritten
        as a list. Its `context` attribute, if present, is removed before updating.
    current_config
        The configuration entry to store; it ends up last in the list.
    """
    existing = config_resource.configuration
    if isinstance(existing, list):
        # Identity check (not equality): only the very same object is moved to the
        # end; every other entry is kept in its original order.
        config_resource.configuration = [
            el for el in existing if el is not current_config
        ] + [current_config]
    else:
        # NOTE(review): a single (non-list) entry is replaced even when it is a
        # different record than `current_config` -- confirm this is intended.
        config_resource.configuration = [current_config]

    # Drop the context before updating -- presumably to avoid pushing the full
    # context payload that a retrieved resource carries; TODO confirm.
    try:
        del config_resource.context
    except AttributeError:
        # The resource has no `context` attribute: nothing to remove.
        pass
    forge.update(config_resource)
    
def deprecate_individual_views(agg_view):
    """Deprecate every view referenced under `agg_view["views"]`.

    `agg_view["views"]` may be a single entry or a list of entries; each entry
    provides a "project" value of the form "<org>/<project>" and a "viewId".
    Each view is fetched with nexussdk and then deprecated. A failing
    deprecation is printed and the loop continues; a failing fetch is not
    caught and propagates to the caller.
    """
    members = agg_view["views"]
    if not isinstance(members, list):
        members = [members]

    for member in members:
        project_parts = member["project"].split("/")
        org = project_parts[0]
        proj = project_parts[1]
        view_id = member["viewId"]
        fetched_view = nxs.views.fetch(org, proj, view_id)
        try:
            nxs.views.deprecate_es(fetched_view)
        except Exception as e:
            print(f"Deprecation failed with '{e}'")

## User input

In [None]:
# Nexus deployment used for the model-catalog forge session and the nexussdk client
ENDPOINT = "https://bbp.epfl.ch/nexus/v1"
# NOTE(review): not referenced by any cell visible in this part of the notebook
DOWNLOAD_DIR = "../../../data"
# Prompt for the access token interactively instead of storing it in the notebook
TOKEN = getpass.getpass()

TODO: Here we need to fix forge and allow to not specify the limit when doing ES queries, for now we put 'very large' number

In [None]:
# Stand-in for "no limit": forge ES queries require a `limit`, so use a large number
HARD_RESOURCE_LIMIT = 10000

Bucket where embedding models live

In [None]:
# Organization and project of the bucket from which the embedding model is retrieved
MODEL_CATALOG_ORG = "dke"
MODEL_CATALOG_PROJECT = "embedding-pipelines"

ID of the embedding model to use

__PROVIDE HERE THE ID OF YOUR MODEL (OPTIONAL, REVISION)__

In [None]:
# Full id of the embedding model; its last path segment plus the revision forms the view tag
MODEL_ID = "https://bbp.epfl.ch/nexus/v1/resources/dke/embedding-pipelines/_/d79a408f-3356-4e98-8998-df6720cac376"
MODEL_REVISION = None  # Specify a revision, if necessary. If None, the latest revision is used

Bucket where embedding vectors live

In [None]:
# Each entry is BucketConfiguration(endpoint, organization, project). One forge
# session and one set of ES views are created per entry further down.
EMBEDDING_BUCKETS = [
     BucketConfiguration(
        "https://bbp.epfl.ch/nexus/v1",
         "neurosciencegraph", "datamodels")
]

Later, we will assume that data and embeddings live in the same bucket

In [None]:
# Passed as `neighborhood_size` to compute_boosting_factors further down
NEIGHBORHOOD_SIZE = 20  # Number of nearest neighbors to consider for local boosting

---

## Forge sessions

In [None]:
# Forge configuration file shared by every forge session created in this notebook
CONFIG_PATH = "../../../configs/new-forge-config.yaml"

### Session for embedding models

In [None]:
# Forge session bound to the model-catalog bucket; used below to retrieve the embedding model
forge_models = create_forge_session(
    CONFIG_PATH,
    BucketConfiguration(ENDPOINT, MODEL_CATALOG_ORG, MODEL_CATALOG_PROJECT),
    TOKEN)

### Session for embedding resources

In [None]:
# One forge session per embedding bucket, keyed by the bucket configuration itself
FORGE_SESSIONS = dict(
    (bucket_config, create_forge_session(CONFIG_PATH, bucket_config, TOKEN))
    for bucket_config in EMBEDDING_BUCKETS
)

### Nexussdk session

In [None]:
# Configure the module-level nexussdk client with the deployment endpoint and access token
nxs.config.set_environment(ENDPOINT)
nxs.config.set_token(TOKEN)

---

# I. Create ElasticSearchView


TODO: Adapt the resource_types property to the proper `Embedding` type once it is added to the context

In [None]:
# Pin the revision in the address only when the user supplied one
model_resource = forge_models.retrieve(
    MODEL_ID if MODEL_REVISION is None else f"{MODEL_ID}?rev={MODEL_REVISION}")

# No revision supplied: record the revision of the resource that was just retrieved
if MODEL_REVISION is None:
    MODEL_REVISION = model_resource._store_metadata._rev

# Tag = last path segment of the model id, followed by the revision
MODEL_TAG = f"{MODEL_ID.rsplit('/', 1)[-1]}?rev={MODEL_REVISION}"

In [None]:
# Display the tag that the new similarity views will be created with
MODEL_TAG

In [None]:
# Vector size declared on the model resource; used to build the ES mapping of the new views
dimension = model_resource.vectorDimension

In [None]:
# Create one ES view per embedding bucket, then fetch it back so the stored
# view payload is what gets recorded (keyed by the bucket configuration).
# NOTE(review): re-running this cell issues another create call for every bucket.
SIMILARITY_VIEWS = {}
for bucket_config in EMBEDDING_BUCKETS:
    view = nxs.views.create_es(
        bucket_config.org,
        bucket_config.proj,
        mapping=get_es_view_mappings(dimension),
        tag=MODEL_TAG,
        resource_types=["https://neuroshapes.org/Embedding"],
        source_as_text=False,
        include_metadata=True,
        include_deprecated=False,
    )
    SIMILARITY_VIEWS[bucket_config] = nxs.views.fetch(
        bucket_config.org, bucket_config.proj, view_id=view["@id"])

In [None]:
# Keep only the view ids, still keyed by bucket configuration
SIMILARITY_VIEW_IDS = dict(
    (bucket_config, fetched_view["@id"])
    for bucket_config, fetched_view in SIMILARITY_VIEWS.items()
)

__IMPORTANT__: Before executing the next step, we need to make sure that indexing is over. The following cell polls the new views every 30 seconds and returns once all of them have finished indexing.

In [None]:
start = time.time()
while True:
    # Check every view on each round (no short-circuit), then decide
    all_ready = [
        check_view_readiness(k, v, TOKEN)
        for k, v in SIMILARITY_VIEW_IDS.items()
    ]
    if all(all_ready):
        print(f"Indexing has finished after: {time.time() - start}s")
        break
    # Not all views are ready yet: wait before polling again
    time.sleep(30)

In [None]:
# Display the ids of the newly created similarity views, per bucket
SIMILARITY_VIEW_IDS

---

## Compute raw (non-boosted) statistics

Compute raw statistics (min/max/mean/std) of similarity values from the master view and push them as an ElasticSearchViewStatistics resource (created if it doesn't exist, updated if it exists), tagged with the new revision of the master view (in bbp/atlas).

In [None]:
# NOTE(review): `n_values`, `stats` and `stats_resource` are overwritten on every
# iteration, so only the last bucket's values remain after the loop.
for bucket, forge in FORGE_SESSIONS.items():
    view_id = SIMILARITY_VIEW_IDS[bucket]
    set_elastic_view(forge, view_id)

    # boosting=None: statistics of the similarity values without boosting
    n_values, stats = compute_statistics(
        forge, view_id, model_resource.similarity, boosting=None)

    # The statistics are tagged with the last path segment of the view id
    TAG = view_id.rsplit("/", 1)[-1]
    stats_resource = register_stats(
        forge, view_id, n_values, stats, model_resource.similarity,
        TAG, boosted=False)

## Compute boosting factors and create necessary resources

Compute boosting factors for all the data points (vectors) indexed by the new master view and push them as separate resources into respective projects. Tag them by the new UUID of the master view.

In [None]:
# `forge_atlas` was never defined in this notebook, so this cell raised NameError
# on a fresh kernel. Use the session that registered `stats_resource`: the last
# session iterated by the raw-statistics loop above.
stats_forge = list(FORGE_SESSIONS.values())[-1]

# Flatten the statistics series into a {statistic name: value} mapping
stats_json = [stats_forge.as_json(el) for el in stats_resource.series]
stats_json = {el["statistic"]: el["value"] for el in stats_json}

In [None]:
# Display the neighborhood size used for the boosting factors computed next
NEIGHBORHOOD_SIZE

In [None]:
# Boosting factors of all buckets merged into one mapping (used for the boosted statistics)
ALL_BOOSTING_FACTORS = {}
for bucket, view_id in SIMILARITY_VIEW_IDS.items():
    print(f"(Re-)computing boosting factors in '{bucket}'...")
    deviations = {}  # NOTE(review): never read in this cell
    bucket_forge = FORGE_SESSIONS[bucket]
    # Resources are tagged with the last path segment of the view id
    TAG = view_id.rsplit("/", 1)[-1]

    boosting_factors = compute_boosting_factors(
        bucket_forge,
        view_id,
        stats,
        model_resource.similarity,
        neighborhood_size=NEIGHBORHOOD_SIZE,
    )
    ALL_BOOSTING_FACTORS.update(boosting_factors)

    print(f"Registering/updating boosting factors in '{bucket}'...")
    # Push the factors into the bucket they were computed from
    boosting_resources = register_boosting_data(
        bucket_forge, view_id, boosting_factors, model_resource.similarity, TAG)

In the individual embedding data buckets create a new ES view for boosting factors (tagged by the new master view).

In [None]:
# Payloads returned by the create calls, one per embedding bucket
new_boosting_views = []
for bucket, view_id in SIMILARITY_VIEW_IDS.items():
    print(f"Creating a new ES view on boosting factors in '{bucket}'...")

    # Tag the boosting view with the last path segment of the similarity view id
    TAG = view_id.rsplit("/", 1)[-1]

    boosting_view = nxs.views.create_es(
        bucket.org,
        bucket.proj,
        mapping=BOOSTING_VIEW_MAPPING,
        tag=TAG,
        resource_types=["https://neuroshapes.org/SimilarityBoostingFactor"],
        source_as_text=False,
        include_metadata=True,
        include_deprecated=False,
    )
    new_boosting_views.append(boosting_view)

In [None]:
# List the project and id of every boosting view that was just created
for el in new_boosting_views:
    print("Project: ", el["_project"])
    # Terminate with a blank line to separate consecutive views
    print("View: ", el["@id"], end="\n\n")

__IMPORTANT__: Before executing the next step, we need to make sure that the new boosting views have finished indexing. The following cell polls them every 30 seconds and returns once all of them are ready; after that, all the resources have been indexed and we can proceed with the rest of the notebook.

In [None]:
start = time.time()
while True:
    all_ready = []
    for el in new_boosting_views:
        # Rebuild the bucket configuration from the view's `_project` URL:
        # everything before the last three segments is the endpoint, the last
        # two segments are organization and project.
        project_parts = el["_project"].split("/")
        view_bucket = BucketConfiguration(
            "/".join(project_parts[:-3]),
            project_parts[-2],
            project_parts[-1])
        all_ready.append(check_view_readiness(view_bucket, el["@id"], TOKEN))
    if all(all_ready):
        print(f"Indexing has finished after: {time.time() - start}s")
        break
    # At least one view is still indexing: wait before polling again
    time.sleep(30)

## Compute boosted statistics

Compute statistics (min/max/mean/std) of similarity values after boosting and push them as an ElasticSearchViewStatistics resource (created if it doesn't exist, updated if it exists), tagged with the new revision of the master view.

In [None]:
# NOTE(review): `n_values`, `stats` and `stats_resource` are overwritten on every
# iteration, so only the last bucket's values remain after the loop.
for bucket, forge in FORGE_SESSIONS.items():
    view_id = SIMILARITY_VIEW_IDS[bucket]
    set_elastic_view(forge, view_id)

    # Same computation as the raw statistics, now with the boosting factors supplied
    n_values, stats = compute_statistics(
        forge, view_id, model_resource.similarity, boosting=ALL_BOOSTING_FACTORS)

    # The statistics are tagged with the last path segment of the view id
    TAG = view_id.rsplit("/", 1)[-1]
    stats_resource = register_stats(
        forge, view_id, n_values, stats, model_resource.similarity,
        TAG, boosted=True)