# Create and push embedding pipelines to the Nexus catalog

In [19]:
from bluegraph.core import PandasPGFrame

from bluegraph.preprocess.encoders import ScikitLearnPGEncoder
from bluegraph.backends.stellargraph import StellarGraphNodeEmbedder
from bluegraph.downstream.similarity import (SimilarityProcessor,
                                             FaissSimilarityIndex,
                                             ScikitLearnSimilarityIndex,
                                             SimilarityIndex)
from bluegraph.downstream import EmbeddingPipeline

## Creating and saving a pipeline object

`bluegraph` allows to create emebedding pipelines (using the `EmbeddingPipeline` class) that represent a useful wrapper around a sequence of steps necessary to produce embeddings and compute point similarities. In the example below we create a pipeline for producing `attri2vec` node embeddings and computing their cosine similarity.

We first create an encoder object that will be used in our pipeline as a preprocessing step.

In [2]:
definition_encoder = ScikitLearnPGEncoder(
    node_properties=["definition"], text_encoding_max_dimension=512)

We then create an embedder object.

In [3]:
D = 128
params = {
    "length": 5,
    "number_of_walks": 10,
    "epochs": 5,
    "embedding_dimension": D
}
attri2vec_embedder = StellarGraphNodeEmbedder(
    "attri2vec", feature_vector_prop="features", edge_weight="npmi", **params)

And finally we create a pipeline object. Note that in the code below we use the `SimilarityProcessor` interface and not `NodeSimilarityProcessor`, as we have done it previously. We use this lower abstraction level interface, because the `EmbeddingPipeline` is designed to work with any embedding models (not only node embedding models).

In [6]:
attri2vec_pipeline = EmbeddingPipeline(
    preprocessor=definition_encoder,
    embedder=attri2vec_embedder,
    similarity_processor=SimilarityProcessor(
        FaissSimilarityIndex(similarity="cosine", dimension=D))
)

Let us read the training graph from the provided example dataset

We run the fitting process, which given the input data:
1. fits the encoder
2. transforms the data
3. fits the embedder
4. produces the embedding table
5. fits the similarity processor index

In [7]:
graph = PandasPGFrame.load_json("../data/cooccurrence_graph.json")

In [8]:
attri2vec_pipeline.run_fitting(graph)

link_classification: using 'ip' method to combine node embeddings into edge embeddings


Now we can save our pipeline to the file system.

In [9]:
attri2vec_pipeline.save(
    "../data/attri2vec_test_model",
    compress=True)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: ../data/attri2vec_test_model/embedder/model/assets


In [12]:
graph

<bluegraph.core.io.PandasPGFrame at 0x7fa4e732ce50>

In [13]:
graph.to_json()

{'nodes': [{'@id': 'pulmonary',
   '@type': 'Entity',
   'entity_type': 'ORGAN',
   'definition': 'Relating to the lungs as the intended site of administration, where the pharmaceutical product is administered, usually by inhalation, for local action in the lower respiratory tract or for systemic action after absorption via the lower respiratory tract.',
   'frequency': 8295},
  {'@id': 'host',
   '@type': 'Entity',
   'entity_type': 'ORGANISM',
   'definition': 'An organism that nourishes and supports another but does not benefit by the association; recipient of transplanted tissue or organ from a donor.',
   'frequency': 2660},
  {'@id': 'surfactant protein d measurement',
   '@type': 'Entity',
   'entity_type': 'PROTEIN',
   'definition': 'The determination of the amount of surfactant protein D present in a sample.',
   'frequency': 268},
  {'@id': 'microorganism',
   '@type': 'Entity',
   'entity_type': 'ORGANISM',
   'definition': 'A microscopic organism. The term microorganism ma

Finally, prediction pipeline can be launched as follows:

In [14]:
attri2vec_pipeline.run_prediction(graph)

[[0.06584152579307556,
  0.05633807182312012,
  0.06426459550857544,
  0.07616272568702698,
  0.07659083604812622,
  0.057510316371917725,
  0.07146468758583069,
  0.06897985935211182,
  0.058695077896118164,
  0.08644208312034607,
  0.06694287061691284,
  0.07027018070220947,
  0.07117584347724915,
  0.09018227458000183,
  0.06760171055793762,
  0.05492877960205078,
  0.1118972897529602,
  0.0657186210155487,
  0.06802555918693542,
  0.058672577142715454,
  0.06623849272727966,
  0.06197741627693176,
  0.0591660737991333,
  0.06882309913635254,
  0.08360326290130615,
  0.09442004561424255,
  0.06327483057975769,
  0.06641030311584473,
  0.059668898582458496,
  0.07264953851699829,
  0.08649662137031555,
  0.07280147075653076,
  0.05603700876235962,
  0.060746192932128906,
  0.050737202167510986,
  0.049847185611724854,
  0.07647764682769775,
  0.06865286827087402,
  0.07162386178970337,
  0.0586104691028595,
  0.06583529710769653,
  0.06631159782409668,
  0.062080562114715576,
  0.077

In the previous example we used

In [21]:
try:
    sklearn_similarity_processor = SimilarityProcessor(
        ScikitLearnSimilarityIndex(
            similarity="poincare", dimension=D,
            index_type="ballktree", leaf_size=10)
    )
except SimilarityIndex.SimilarityException as e:
    print("Caught the following error: ")
    print(e)

Caught the following error: 
Initial vectors must be provied (scikit learn indices are not updatable) 


In [16]:
transformed_graph = definition_encoder.fit_transform(graph)
embedding = attri2vec_embedder.fit_model(transformed_graph)

link_classification: using 'ip' method to combine node embeddings into edge embeddings


In [26]:
sklearn_similarity_processor = SimilarityProcessor(
    ScikitLearnSimilarityIndex(
        similarity="poincare", dimension=D,
        initial_vectors=embedding["embedding"].tolist(),
        index_type="ballktree", leaf_size=10))

In [27]:
attri2vec_sklearn_pipeline = EmbeddingPipeline(
    preprocessor=definition_encoder,
    embedder=attri2vec_embedder,
    similarity_processor=sklearn_similarity_processor)

## Pushing models to the catalog

In [None]:
import getpass
import jwt

from kgforge.core import KnowledgeGraphForge
from kgforge.specializations.resources import Dataset

In [None]:
FORGE_CONFIG_FILE = "../services/embedder/configs/forge_configs/prod-forge-nexus.yml"

Get your token from https://staging.nexus.ocp.bbp.epfl.ch/

In [None]:
TOKEN = getpass.getpass()

In [None]:
forge = KnowledgeGraphForge(FORGE_CONFIG_FILE, token=TOKEN)

In [None]:
def get_agent(token):
    agent_data = jwt.decode(token, verify=False)
    agent = forge.reshape(
        forge.from_json(agent_data), keep=[
            "name", "email", "sub", "preferred_username"])
    agent.id = agent.sub
    agent.type = "Person"
    return agent

In [None]:
def push_model(forge, agent, name, description, distribution):
    # Create a new model resource
    model_resource = Dataset(
        forge,
        name=name,
        description=description)
    model_resource.type = ["Dataset", "EmbeddingModel"]
    # Add distrubution
    model_resource.add_distribution(
        distribution, content_type="application/octet-stream")
    # Add contribution
    model_resource.add_contribution(agent, versioned=False)
    model_resource.contribution.hadRole = "Engineer"

    forge.register(model_resource)

In [None]:
agent = get_agent(TOKEN)

In [None]:
push_model(
    forge, agent, "New test attri2vec model",
    "Node embedding model built on the Covid dataset", "../data/attri2vec_test_model.zip")

## Get existing models

In [None]:
def retrieve_all_model_resources(forge):
    """Retrieve all models from the catalog."""
    query = """
        SELECT ?id
        WHERE {
            ?id a <https://bbp.epfl.ch/nexus/v1/resources/dke/embedder_catalog/_/EmbeddingModel>;
                <https://bluebrain.github.io/nexus/vocabulary/deprecated> false.
        }
    """
    resources = forge.sparql(query, limit=1000)
    return [
        forge.retrieve(r.id) for r in resources
    ]

In [None]:
models = retrieve_all_model_resources(forge)
for m in models:
    print(m)

## Update an existing model

In [None]:
def retrieve_model_resource(forge, model_name):
    """Retrieve model resource by its name."""
    query = f"""
        SELECT ?id
        WHERE {{
            ?id a <https://bbp.epfl.ch/nexus/v1/resources/dke/embedder_catalog/_/EmbeddingModel>;
                name "{model_name}";
                <https://bluebrain.github.io/nexus/vocabulary/deprecated> false.
        }}
    """
    resources = forge.sparql(query, limit=1)
    if resources and len(resources) > 0:
        resource = forge.retrieve(resources[0].id)
        return resource

In [None]:
def update_model(forge, name, description=None, distribution=None):
    # Try retreiving model resource
    model_resource = retrieve_model_resource(forge, name)
    if model_resource:
        # Update an existing model
        if description:
            model_resource.description = description
        if distribution:
            model_resource.distribution = forge.attach(
                distribution, content_type="application/octet-stream")
        forge.update(model_resource)
    else:
        raise ValueError("Model node found")

In [None]:
update_model(forge, "New test attri2vec model", description="Updated description of the test model")

## Delete a model

In [None]:
def deprecate_resource(forge, resource):
    """Deprecate the resource together with its distribution."""
    base = resource.id.rsplit('/', 1)[0]
    file_id = resource.distribution.contentUrl.rsplit('/', 1)[1]
    file = forge.retrieve(f"{base}/{file_id}")

    forge.deprecate(resource)
    forge.deprecate(file)

In [None]:
model_resource = retrieve_model_resource(forge, "New test attri2vec model")

In [None]:
deprecate_resource(forge, model_resource)