# Create and push embedding pipelines to the Nexus catalog

In [None]:
from bluegraph.core import PandasPGFrame

from bluegraph.preprocess.encoders import ScikitLearnPGEncoder
from bluegraph.backends.stellargraph import StellarGraphNodeEmbedder
from bluegraph.downstream.similarity import SimilarityProcessor
from bluegraph.downstream import EmbeddingPipeline

## Creating and saving a pipeline object

`bluegraph` allows you to create embedding pipelines (using the `EmbeddingPipeline` class), which serve as a convenient wrapper around the sequence of steps necessary to produce embeddings and compute point similarities. In the example below we create a pipeline for producing `attri2vec` node embeddings and computing their cosine similarity.

We first create an encoder object that will be used in our pipeline as a preprocessing step.

In [None]:
# Encoder over the "definition" node property (with
# `text_encoding_max_dimension` set to 512); it is passed to the pipeline
# as its preprocessing step.
definition_encoder = ScikitLearnPGEncoder(
    node_properties=["definition"], text_encoding_max_dimension=512)

We then create an embedder object.

In [None]:
# Embedding dimension; the same value is given to the similarity processor
# when the pipeline is assembled.
D = 128

# Keyword arguments forwarded as-is to the embedder constructor.
params = dict(
    length=5,
    number_of_walks=10,
    epochs=5,
    embedding_dimension=D,
)

# "attri2vec" embedder reading node features from the "features" property
# and edge weights from the "npmi" property.
attri2vec_embedder = StellarGraphNodeEmbedder(
    "attri2vec",
    feature_vector_prop="features",
    edge_weight="npmi",
    **params,
)

And finally we create a pipeline object. Note that in the code below we use the `SimilarityProcessor` interface and not `NodeSimilarityProcessor`, as we did previously. We use this lower-level interface because the `EmbeddingPipeline` is designed to work with any embedding model (not only node embedding models).

In [None]:
# Assemble the pipeline: encoder -> embedder -> cosine similarity processor.
# The similarity processor is created with the embedding dimension D.
attri2vec_pipeline = EmbeddingPipeline(
    preprocessor=definition_encoder,
    embedder=attri2vec_embedder,
    similarity_processor=SimilarityProcessor(similarity="cosine", dimension=D))

Let us read the training graph from the provided example dataset (first cell below).

We then run the fitting process (second cell below), which, given the input data:
1. fits the encoder
2. transforms the data
3. fits the embedder
4. produces the embedding table
5. fits the similarity processor index

In [None]:
# Load the training graph from the example co-occurrence JSON file.
graph = PandasPGFrame.load_json("../data/cooccurrence_graph.json")

In [None]:
# Fit the whole pipeline (encoder, embedder, similarity index) on the graph.
attri2vec_pipeline.run_fitting(graph)

Now we can save our pipeline to the file system.

In [None]:
# Persist the fitted pipeline with compression enabled.
# NOTE(review): the push step later in this notebook uploads
# "../data/attri2vec_test_model.zip", so this call presumably produces that
# archive - confirm against `EmbeddingPipeline.save`.
attri2vec_pipeline.save(
    "../data/attri2vec_test_model",
    compress=True)

## Pushing models to the catalog

In [None]:
import getpass
import jwt

from kgforge.core import KnowledgeGraphForge
from kgforge.specializations.resources import Dataset

In [None]:
# Path to the forge configuration used to open the session.
# NOTE(review): the file name says "prod" while the token instructions in
# this notebook point to the staging Nexus deployment - confirm that the
# token and the configuration target the same environment.
FORGE_CONFIG_FILE = "../services/embedder/configs/forge_configs/prod-forge-nexus.yml"

Get your token from https://staging.nexus.ocp.bbp.epfl.ch/

In [None]:
# Prompt for the access token; `getpass` does not echo the typed value.
TOKEN = getpass.getpass()

In [None]:
# Open a forge session from the configuration file, authenticated with the token.
forge = KnowledgeGraphForge(FORGE_CONFIG_FILE, token=TOKEN)

In [None]:
def get_agent(token):
    """Build a "Person" agent resource from the claims of an access token.

    The token payload is decoded WITHOUT verifying its signature: it is
    only used to read the user's identity claims, never to authorize
    anything. The function relies on the module-level `forge` session.

    Parameters
    ----------
    token : str
        Encoded JWT access token.

    Returns
    -------
    agent
        Resource keeping only the "name", "email", "sub" and
        "preferred_username" claims, with `id` set to the "sub" claim and
        `type` set to "Person".
    """
    # The `verify=False` keyword was dropped in PyJWT 2.0, where decoding
    # without `algorithms` raises DecodeError unless signature verification
    # is switched off through `options`.
    agent_data = jwt.decode(token, options={"verify_signature": False})
    agent = forge.reshape(
        forge.from_json(agent_data), keep=[
            "name", "email", "sub", "preferred_username"])
    # The subject claim serves as the agent identifier
    agent.id = agent.sub
    agent.type = "Person"
    return agent

In [None]:
def push_model(forge, agent, name, description, distribution):
    """Create a new embedding model resource and register it with the forge.

    The resource is typed as both "Dataset" and "EmbeddingModel", carries
    `distribution` as an "application/octet-stream" distribution and
    records `agent` as a contributor with the "Engineer" role.
    """
    resource = Dataset(forge, name=name, description=description)
    resource.type = ["Dataset", "EmbeddingModel"]

    # Add distribution
    resource.add_distribution(
        distribution, content_type="application/octet-stream")

    # Add contribution together with the contributor's role
    resource.add_contribution(agent, versioned=False)
    resource.contribution.hadRole = "Engineer"

    forge.register(resource)

In [None]:
# Derive the contributing agent from the claims of the current token.
agent = get_agent(TOKEN)

In [None]:
# Register the saved pipeline archive as a new model in the catalog.
push_model(
    forge, agent, "New test attri2vec model",
    "Node embedding model built on the Covid dataset", "../data/attri2vec_test_model.zip")

## Get existing models

In [None]:
def retrieve_all_model_resources(forge):
    """Fetch every non-deprecated `EmbeddingModel` resource from the catalog.

    A SPARQL query (limited to 1000 results) collects the matching ids;
    each id is then retrieved individually and the resources are returned
    as a list, in query order.
    """
    query = """
        SELECT ?id
        WHERE {
            ?id a <https://bbp.epfl.ch/nexus/v1/resources/dke/embedder_catalog/_/EmbeddingModel>;
                <https://bluebrain.github.io/nexus/vocabulary/deprecated> false.
        }
    """
    matches = forge.sparql(query, limit=1000)

    model_resources = []
    for match in matches:
        model_resources.append(forge.retrieve(match.id))
    return model_resources

In [None]:
# Fetch all non-deprecated models and print them one by one.
models = retrieve_all_model_resources(forge)
for m in models:
    print(m)

## Update an existing model

In [None]:
def retrieve_model_resource(forge, model_name):
    """Retrieve model resource by its name.

    Runs a SPARQL query (limited to one result) for a non-deprecated
    `EmbeddingModel` whose name equals `model_name`.

    Returns the retrieved resource, or None when the query yields nothing.
    """
    # The name is embedded in a double-quoted SPARQL literal: escape the
    # characters that may not appear verbatim there, so that names with
    # quotes or backslashes neither break nor alter the query. Backslashes
    # go first so the escapes added afterwards are not escaped again.
    escaped_name = (
        model_name
        .replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )
    query = f"""
        SELECT ?id
        WHERE {{
            ?id a <https://bbp.epfl.ch/nexus/v1/resources/dke/embedder_catalog/_/EmbeddingModel>;
                name "{escaped_name}";
                <https://bluebrain.github.io/nexus/vocabulary/deprecated> false.
        }}
    """
    resources = forge.sparql(query, limit=1)
    if not resources:
        return None
    return forge.retrieve(resources[0].id)

In [None]:
def update_model(forge, name, description=None, distribution=None):
    """Update the description and/or distribution of an existing model.

    The model is looked up by `name`. Only the arguments that are provided
    (truthy) are changed; the resource is then updated through the forge.

    Raises
    ------
    ValueError
        If no model with the given name is found.
    """
    # Try retrieving the model resource
    model_resource = retrieve_model_resource(forge, name)
    if not model_resource:
        raise ValueError(f"Model '{name}' not found")

    if description:
        model_resource.description = description
    if distribution:
        # Attach the new file in place of the current distribution
        model_resource.distribution = forge.attach(
            distribution, content_type="application/octet-stream")
    forge.update(model_resource)

In [None]:
# Change only the description of the previously pushed test model.
update_model(forge, "New test attri2vec model", description="Updated description of the test model")

## Delete a model

In [None]:
def deprecate_resource(forge, resource):
    """Deprecate a resource and the file behind its distribution.

    The file identifier is the last path segment of the distribution's
    `contentUrl`; it is resolved under the same base (everything before
    the last "/") as the resource's own id.
    """
    id_base = resource.id.rsplit('/', 1)[0]
    content_url = resource.distribution.contentUrl
    file_name = content_url.rsplit('/', 1)[1]

    # The file is retrieved before anything gets deprecated
    file_resource = forge.retrieve("{}/{}".format(id_base, file_name))

    forge.deprecate(resource)
    forge.deprecate(file_resource)

In [None]:
# Look up the test model by name (None if no such model exists).
model_resource = retrieve_model_resource(forge, "New test attri2vec model")

In [None]:
# Deprecate the model resource together with its distribution file.
deprecate_resource(forge, model_resource)