- Exact match queries (get morphologies from a brain region X)
- Multiple embedding vectors produced by models tackling similarity according to different aspects (vectors representing a morphology in a latent co-projection graph space, get 10 most similar morphologies in that space)
- Ontology queries (get similar species, similar or adjacent brain region: get morphologies of this species/brain region)
- Full text search

- Rule-based approach
- Vector similarity-based approach


For embeddings, we need to build rank aggregation functions:
https://arxiv.org/pdf/1906.06011.pdf

"Rank aggregation functions allow retrieval models (or rankers) to be built on top of others. They combine results from different rankers and promote more effective retrieval results, without dealing with raw data or low-level descriptors. Besides, even heterogeneous models such as text-based or image-based can be gathered together. Rank aggregation techniques are important in many applications, such as meta-search, document filtering, recommendation systems, and social choice."

In [None]:
import ast
import pickle
import re
import getpass
import requests

import networkx as nx
import numpy as np
import pandas as pd

from bluegraph import PandasPGFrame
from bluegraph.backends.networkx import networkx_to_pgframe

from kgforge.core import KnowledgeGraphForge

from elasticsearch import Elasticsearch, helpers

# 1. Prepare sample data

In [None]:
# Load the sample dataset from a CSV file (path is relative to the working directory).
data = pd.read_csv("combined_data.csv")

In [None]:
# Preview the first four rows of the dataset.
data.head(4)

In [None]:
# List the distinct values of the "brain_region" column.
data["brain_region"].unique()

In [None]:
# Field names that are mapped as Elasticsearch `keyword` fields
# (exact-match, not analysed) in the morphology index mapping.
KEYWORDS = [
    "@id", "@type",
    "brain_region",
    "atlas_release",
    "object_of_study",
]

In [None]:
# Columns of `data` that hold vectors; each one is parsed with
# `ast.literal_eval` and gets its own Elasticsearch index with a
# `dense_vector` field.
EMBEDDING_VECTORS = [
    "coordinates", "neurite_features",
    "axon_coproj_embedding", "dendrite_coproj_embedding"
]

In [None]:
# The vector columns are stored as literal strings; turn every cell into
# the Python object it spells out.
for column in EMBEDDING_VECTORS:
    data[column] = data[column].apply(ast.literal_eval)

In [None]:
# Attach synthetic metadata to every row so that the exact-match queries
# have something to filter on. Values are drawn at random (unseeded), so
# they differ between runs.
#
# One vectorized draw of `len(data)` samples replaces the per-row Python
# loop; assigning the resulting array is positional, so it does not depend
# on `data` having a default 0..n-1 index (assigning a freshly built
# `pd.Series` aligns on index labels instead).
releases = ["release1", "release2", "release3"]
data["atlas_release"] = np.random.choice(
    releases, size=len(data), p=[0.4, 0.4, 0.2])

objects = ["Single Cell", "Whole Brain"]
data["object_of_study"] = np.random.choice(
    objects, size=len(data), p=[0.6, 0.4])

In [None]:
# Metadata-only view of the morphologies (no embedding vectors).
# Take an explicit copy: adding the "@type" column to a bare column
# selection of `data` triggers pandas' SettingWithCopyWarning and leaves it
# ambiguous which frame is being written to.
morphologies = data[
    ["@id", "brain_region", "atlas_release", "object_of_study"]
].copy()
morphologies["@type"] = "NeuronMorphology"

In [None]:
# Load a PandasPGFrame from the JSON file `brain_region_data.json`.
brain_region_frame = PandasPGFrame.load_json("brain_region_data.json")

In [None]:
# Display the frame's node table.
# NOTE(review): `_nodes` is underscore-prefixed, i.e. a non-public attribute —
# check whether the library exposes a public accessor for this.
brain_region_frame._nodes

# 2. Create Elasticsearch indices

In [None]:
# Base URL of the Elasticsearch instance used by all requests in this notebook.
ELASTIC_URI = "http://localhost:9200"

In [None]:
# Index mapping for morphology metadata: "@id", "@type" and every field
# listed in KEYWORDS are declared as exact-match `keyword` fields.
morph_mapping = {
    "mappings": {
        "properties": {
            **{name: {"type": "keyword"} for name in ("@id", "@type")},
            **{name: {"type": "keyword"} for name in KEYWORDS},
        }
    }
}

In [None]:
# One index mapping per embedding column. The `dense_vector` dimension is
# read from the first row of the corresponding column; `derivation` holds
# the identifier of the entity the vector was derived from.
emb_mapping = {
    name: {
        "mappings": {
            "properties": {
                "@id": {"type": "keyword"},
                "@type": {"type": "keyword"},
                "embedding": {
                    "dims": len(data[name].iloc[0]),
                    "type": "dense_vector",
                },
                "derivation": {"type": "keyword"},
            }
        }
    }
    for name in EMBEDDING_VECTORS
}

Create morphology index

In [None]:
# Start from a clean slate: drop the index if it exists. This is best
# effort — the response is deliberately ignored because a missing index is
# not an error here.
requests.delete(f"{ELASTIC_URI}/morphologies")

# Create the index with the explicit mapping.
r = requests.put(
    f'{ELASTIC_URI}/morphologies/',
    json=morph_mapping
)
# Fail loudly if creation did not succeed: otherwise the bulk indexing
# below would silently auto-create the index with a dynamic mapping.
r.raise_for_status()

In [None]:
# Client whose base URL already contains the index path.
# NOTE(review): the bulk actions built below carry no "_index" key, so they
# presumably rely on this URL prefix to land in `morphologies` — confirm
# against the installed elasticsearch-py version.
elastic = Elasticsearch(f'{ELASTIC_URI}/morphologies/')

In [None]:
# One bulk action per morphology: the document id is the morphology "@id",
# and the document body carries the metadata fields.
actions = [
    {
        "_id": record["@id"],
        **{
            field: record[field]
            for field in (
                "@id",
                "@type",
                "brain_region",
                "atlas_release",
                "object_of_study",
            )
        },
    }
    for record in morphologies.to_dict("records")
]

try:
    # Send all actions in one bulk request and report the outcome.
    response = helpers.bulk(elastic, actions)
    print ("\nRESPONSE:", response)
except Exception as e:
    # Best effort: report the failure without stopping the notebook.
    print("\nERROR:", e)

Create an index per embedding vector

In [None]:
# (Re)create one index per embedding column, named after the column.
for index_name, index_mapping in emb_mapping.items():
    # Best-effort cleanup: a missing index is not an error, so the
    # response is deliberately ignored.
    requests.delete(f"{ELASTIC_URI}/{index_name}")
    r = requests.put(
        f'{ELASTIC_URI}/{index_name}/',
        json=index_mapping
    )
    # Fail loudly if creation did not succeed: otherwise bulk indexing
    # would auto-create the index without the `dense_vector` mapping and
    # the similarity queries would break later.
    r.raise_for_status()

In [None]:
# Fill every embedding index with one document per morphology.
for embedding in emb_mapping:
    # Client whose base URL already contains the target index path; the
    # actions below carry no "_index" key.
    elastic = Elasticsearch(f'{ELASTIC_URI}/{embedding}/')

    actions = []
    for element in data[["@id", embedding]].to_dict("records"):
        # Greedy match: the second group is everything after the last "/"
        # of the https IRI. Written as a raw string without the needless
        # "\/" escapes, which are invalid string escapes (DeprecationWarning,
        # SyntaxWarning on Python 3.12+). Raises IndexError if the "@id" is
        # not an https IRI containing a path.
        morph_id = re.findall(r"https://(.*)/(.*)", element["@id"])[0][1]
        actions.append({
            "_id": embedding + "_" + morph_id,
            "@id": embedding + "_" + morph_id,
            # Full "@id" of the morphology this vector was derived from.
            "derivation": element["@id"],
            "@type": "Embedding",
            "embedding": element[embedding]
        })

    try:
        # make the bulk call, and get a response
        response = helpers.bulk(elastic, actions)
        print ("\nRESPONSE:", response)

    except Exception as e:
        # Best effort: report the failure and continue with the next index.
        print("\nERROR:", e)

In [None]:
# Print the response of the `_cat/indices` endpoint to check which indices exist.
r = requests.get(
    f'{ELASTIC_URI}/_cat/indices')
print(r.text)

In [None]:
# Experimental mapping: same "@id"/"@type"/"embedding" fields as the
# per-embedding mappings, but with a fixed 3-dimensional vector and with
# `derivation` and `generation` modelled as nested objects
# (derivation.entity.@id and generation.activity.used.@id are keywords).
test_mapping = {
    "mappings": {
        "properties": {
          "@id": {
            "type": "keyword"
          },
          "@type": {
            "type": "keyword"
          },
          "embedding": {
            "dims": 3,
            "type": "dense_vector"
          },
          "derivation": {
            "properties": {
              "entity": {
                "properties": {
                  "@id": {
                    "type": "keyword"
                  }
                },
                "type": "nested"
              }
            },
            "type": "nested"
          },
          "generation": {
            "properties": {
              "activity": {
                "properties": {
                  "used": {
                    "properties": {
                      "@id": {
                        "type": "keyword"
                      }
                    },
                    "type": "nested"
                  }
                },
                "type": "nested"
              }
            },
            "type": "nested"
          }

        }
    }
}

In [None]:
# Drop the test index (the delete response is ignored) and recreate it with
# `test_mapping`; `r` keeps the response of the PUT request.
requests.delete(f"{ELASTIC_URI}/test_embedding_der_gen")
r = requests.put(
    f'{ELASTIC_URI}/test_embedding_der_gen/',
    json=test_mapping
)

In [None]:
# Show the raw body of the last response stored in `r`.
r.text

In [None]:
# Print the response of the `_cat/indices` endpoint again.
r = requests.get(
    f'{ELASTIC_URI}/_cat/indices')
print(r.text)

# 3. Query individual indices

### Search a morphology

In [None]:
# Use the "@id" of the first morphology as the query example in the cells below.
selected_morphology = morphologies["@id"].iloc[0]

In [None]:
# Exact-match (term) lookup of the selected morphology by its "@id".
r = requests.post(
    url=f'{ELASTIC_URI}/morphologies/_search',
    json={"query": {"term": {"@id": {"value": selected_morphology}}}},
)
r.json()

### Find its coordinates in the atlas

In [None]:
# Fetch the coordinates document derived from the selected morphology
# (term query on `derivation`) and keep the vector of the first hit.
r = requests.post(
    url=f'{ELASTIC_URI}/coordinates/_search',
    json={"query": {"term": {"derivation": {"value": selected_morphology}}}},
)
selected_coord_vector = r.json()["hits"]["hits"][0]["_source"]["embedding"]

In [None]:
# Display the coordinates vector retrieved for the selected morphology.
selected_coord_vector

### Find 5 closest morphologies "geographically" (Euclidean distance)

In [None]:
# Nearest neighbours by Euclidean distance: every document is scored with
# 1 / (1 + L2 distance to the query vector), and the top 5 are returned.
r = requests.post(
    url=f'{ELASTIC_URI}/coordinates/_search',
    json={
        "size": 5,
        "query": {
            "script_score": {
                "query": {"match_all": {}},
                "script": {
                    "source": "1 / (1 + l2norm(params.query_vector, 'embedding'))",
                    "params": {"query_vector": selected_coord_vector},
                },
            },
        },
    },
)
r.json()

### Find its embedding built using the axon co-projection graph

In [None]:
# Fetch the axon co-projection embedding derived from the selected
# morphology (term query on `derivation`) and keep the vector of the first hit.
r = requests.post(
    url=f'{ELASTIC_URI}/axon_coproj_embedding/_search',
    json={"query": {"term": {"derivation": {"value": selected_morphology}}}},
)
selected_embedding_vector = r.json()["hits"]["hits"][0]["_source"]["embedding"]

In [None]:
# Display the embedding vector retrieved for the selected morphology.
selected_embedding_vector

### Find 10 closest morphologies in this space (Cosine similarity)

In [None]:
# Nearest neighbours by cosine similarity: the script adds 1.0 to the
# cosine similarity with the query vector, and the top 10 are returned.
r = requests.post(
    url=f'{ELASTIC_URI}/axon_coproj_embedding/_search',
    json={
        "size": 10,
        "query": {
            "script_score": {
                "query": {"match_all": {}},
                "script": {
                    "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
                    "params": {"query_vector": selected_embedding_vector},
                },
            },
        },
    },
)
r.json()

In [None]:
import numpy as np
import faiss

from collections import defaultdict

In [None]:
# FAISS flat L2 index over the coordinate vectors. The dimension is taken
# from the data (first row), consistently with the other indices in this
# notebook, instead of being hard-coded to 3.
coordiates_index = faiss.IndexFlatL2(len(data["coordinates"].iloc[0]))
coordiates_index.add(
    np.array(data["coordinates"].tolist(), dtype=np.float32)
)

In [None]:
# FAISS flat L2 index over the neurite feature vectors; the dimension is
# read from the first row, and the vectors are added as a float32 matrix.
neurite_feature_index = faiss.IndexFlatL2(
    len(data["neurite_features"].iloc[0])
)
neurite_feature_index.add(
    np.asarray(data["neurite_features"].tolist(), dtype=np.float32)
)

In [None]:
# FAISS flat L2 index over the axon co-projection embeddings; the dimension
# is read from the first row, and the vectors are added as a float32 matrix.
axon_index = faiss.IndexFlatL2(
    len(data["axon_coproj_embedding"].iloc[0])
)
axon_index.add(
    np.asarray(data["axon_coproj_embedding"].tolist(), dtype=np.float32)
)

In [None]:
# FAISS flat L2 index over the dendrite co-projection embeddings; the
# dimension is read from the first row, and the vectors are added as a
# float32 matrix.
dendrite_index = faiss.IndexFlatL2(
    len(data["dendrite_coproj_embedding"].iloc[0])
)
dendrite_index.add(
    np.asarray(data["dendrite_coproj_embedding"].tolist(), dtype=np.float32)
)

In [None]:
def get_position(responses, el, ascending=True):
    """Return the 1-based rank of `el` among `responses` ordered by score.

    Parameters
    ----------
    responses : dict
        Mapping from item identifier to its score.
    el : hashable
        Identifier whose rank is requested.
    ascending : bool, optional
        Sort scores in ascending order (default) or descending order.

    Returns
    -------
    int
        Position of `el` in the sorted listing, starting at 1. An
        identifier that is absent from `responses` yields 0.
    """
    ranking = pd.DataFrame(responses.items(), columns=["id", "score"])
    ranking = ranking.set_index("id")
    ranking = ranking.sort_values("score", ascending=ascending)
    # `get_indexer` reports -1 for a missing label, hence 0 after the shift.
    zero_based = ranking.index.get_indexer([el])[0]
    return zero_based + 1

In [None]:
def build_fusion_graph(indices, query, k):
    """Build a weighted fusion graph from the responses of several indices.

    The item at position `query` is searched in every index. Every item
    that appears in at least one of those response sets becomes a node;
    each node is then used as a query itself, and its responses produce
    directed edges between nodes.

    Parameters
    ----------
    indices : dict
        Mapping from index name to an index object exposing
        ``reconstruct(i)`` and ``search(vectors, k)``.
    query : int
        Position of the query item; the same position is used in every index.
    k : int
        Number of results requested from every search.

    Returns
    -------
    networkx.DiGraph
        Nodes carry a "weight" equal to the sum of the item's scores over
        all query response sets. An edge (s, t) carries a "weight" that
        accumulates, over every index whose query response contains `s` and
        every index in which `t` is returned for `s`, the score of `t`
        divided by the position of `s` in the query response.
    """
    # NOTE(review): `reconstruct(int(node))` assumes every returned id is a
    # valid position; an index that pads short result lists with a
    # placeholder id would break this — confirm k <= number of indexed items.

    # Response set of the query in each index: {index_name: {item: score}}.
    query_responses = {}
    for index_name, index in indices.items():
        query_vector = index.reconstruct(query)
        scores, points = index.search(np.array([query_vector]), k)
        query_responses[index_name] = dict(zip(points[0], scores[0]))

    # One node per unique item; its weight is the sum of its scores.
    nodes = {}
    for index_response in query_responses.values():
        for item, score in index_response.items():
            if item in nodes:
                nodes[item] += score
            else:
                nodes[item] = score

    # Response set of every node, used as a query, in each index:
    # {index_name: {node: {item: score}}}.
    node_list = list(nodes)
    node_responses = {}
    for index_name, index in indices.items():
        query_vectors = np.array(
            [index.reconstruct(int(node)) for node in node_list])
        all_scores, all_points = index.search(query_vectors, k)
        node_responses[index_name] = {
            node: dict(zip(node_points, node_scores))
            for node, node_scores, node_points in zip(
                node_list, all_scores, all_points)
        }

    # Edges between nodes, weighted by score / position of the source item.
    edges = {}
    for index_name, index_response in query_responses.items():
        for el in index_response:
            # The position depends only on (index, el): compute it once
            # instead of once per candidate edge.
            # NOTE(review): positions rank scores in DESCENDING order; if
            # the scores are distances (smaller = closer), position 1 is
            # the farthest item — confirm this is intended.
            position = get_position(index_response, el, False)
            for per_index_responses in node_responses.values():
                for target_el, score in per_index_responses[el].items():
                    if target_el == el or target_el not in nodes:
                        continue
                    weight = score / position
                    if (el, target_el) in edges:
                        edges[(el, target_el)] += weight
                    else:
                        edges[(el, target_el)] = weight

    # Each node/edge gets ITS OWN accumulated weight. (Previously the
    # comprehensions bound `weights` but used `weight`, i.e. the last edge
    # weight computed above, for every node and edge.)
    fusion_graph = nx.DiGraph()
    fusion_graph.add_nodes_from(
        (node, {"weight": node_weight})
        for node, node_weight in nodes.items()
    )
    fusion_graph.add_edges_from(
        (source, target, {"weight": edge_weight})
        for (source, target), edge_weight in edges.items()
    )
    return fusion_graph

In [None]:
# Fusion graph for the item at position 0, asking each of the four
# similarity indices for 10 results.
graph = build_fusion_graph(
    indices={
        "coordindates": coordiates_index,
        "neurite_features": neurite_feature_index,
        "axon_coproj": axon_index,
        "dendrite_coproj": dendrite_index,
    },
    query=0,
    k=10,
)