# Creating vector representations of SEU Neuron Morphologies

In [None]:
import getpass
import jwt

import numpy as np
import pandas as pd

from kgforge.core import KnowledgeGraphForge
from kgforge.specializations.resources import Dataset

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer
from sklearn.impute import SimpleImputer
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

from bluegraph import PandasPGFrame
from bluegraph.preprocess import ScikitLearnPGEncoder
from bluegraph.downstream.utils import transform_to_2d, plot_2d
from bluegraph.preprocess import CooccurrenceGenerator
from bluegraph.backends.stellargraph import StellarGraphNodeEmbedder
from bluegraph.backends.networkx import NXCommunityDetector
from bluegraph.downstream import EmbeddingPipeline
from bluegraph.downstream.similarity import SimilarityProcessor

In [None]:
def get_encoder_features(prop_name, encoder, last_index):
    """Map encoded-vector coordinates to readable labels for one property.

    Parameters
    ----------
    prop_name : str
        Name of the encoded property; used as the prefix of every label.
    encoder : object or None
        Encoder fitted for the property. ``None`` and ``StandardScaler``
        are given a single coordinate, ``TfidfVectorizer`` one coordinate
        per feature name and ``MultiLabelBinarizer`` one per class.
    last_index : int
        First coordinate index assigned to this property.

    Returns
    -------
    tuple of (dict, int)
        Mapping from coordinate index to label, and the first index that
        is free after this property. Encoders of any other type give
        ``({}, last_index)``.
    """
    if encoder is None or isinstance(encoder, StandardScaler):
        return {last_index: f"{prop_name}_IDENTITY"}, last_index + 1

    if isinstance(encoder, TfidfVectorizer):
        # `get_feature_names` was deprecated in scikit-learn 1.0 and removed
        # in 1.2; prefer its replacement and fall back for older releases.
        get_names = getattr(encoder, "get_feature_names_out", None)
        if get_names is None:
            get_names = encoder.get_feature_names
        # Fetch the vocabulary once instead of once per use.
        labels = [f"{prop_name}_WORD_{word}" for word in get_names()]
    elif isinstance(encoder, MultiLabelBinarizer):
        labels = [f"{prop_name}_CLASS_{c}" for c in encoder.classes_]
    else:
        # Unsupported encoder type: no coordinates are labelled.
        return {}, last_index

    return (
        {last_index + i: label for i, label in enumerate(labels)},
        last_index + len(labels),
    )

def explain_property_coordinates(encoder, graph):
    """Label every coordinate of the encoded node feature vector.

    Walks the node properties of ``graph`` in order and, for each one that
    has an entry in ``encoder._node_encoders``, asks
    ``get_encoder_features`` for its labels while advancing a running
    offset, so consecutive properties receive consecutive index ranges.

    Returns a dict mapping coordinate index to label.
    """
    coordinates = {}
    offset = 0
    for prop in graph.node_properties():
        if prop not in encoder._node_encoders:
            # Properties without a fitted encoder occupy no coordinates.
            continue
        labels, offset = get_encoder_features(
            prop, encoder._node_encoders[prop], offset)
        coordinates.update(labels)
    return coordinates

## Load morphologies from Nexus

In [None]:
# Prompt for the access token; getpass keeps the input out of the notebook.
TOKEN = getpass.getpass()

In [None]:
# Forge session on the `bbp-external/seu` bucket, authenticated with TOKEN.
forge = KnowledgeGraphForge("configs/new-forge-config.yaml",
                            token=TOKEN,
                            bucket="bbp-external/seu")

In [None]:
# Fetch the resources typed NeuronMorphology.
# NOTE(review): limit=None presumably lifts the result cap — confirm in kgforge.
morphologies = forge.search({"type": "NeuronMorphology"}, limit=None)

In [None]:
# Flatten the resources into a table and keep only the columns used below.
# The columns after "# to remove" are still read when `combined_data` is
# assembled at the end of the notebook, so dropping them here requires
# updating that cell as well.
morphologies_df = forge.as_dataframe(morphologies)[[
    "id",
    "brainLocation.brainRegion.id",
    "brainLocation.coordinatesInBrainAtlas.valueX.value",
    "brainLocation.coordinatesInBrainAtlas.valueY.value",
    "brainLocation.coordinatesInBrainAtlas.valueZ.value",
    "somaNumberOfPoints.value",
    "neuriteFeature",
    "subject.name",
    # to remove
    "atlasRelease.id",
    "objectOfStudy.label",
    "generation.activity.hadProtocol.id"
]]

In [None]:
# Rebind `forge` to the `neurosciencegraph/datamodels` bucket; the brain
# region resources below are retrieved through this session.
forge = KnowledgeGraphForge(
    "configs/new-forge-config.yaml",
    token=TOKEN,
    bucket="neurosciencegraph/datamodels")

In [None]:
# Retrieve each distinct brain region once. The column holds one id per
# morphology, so iterating it directly would repeat the same remote lookup
# for every row that shares a region. The result only feeds the id-keyed
# `brain_region_notation` dict, so dropping duplicates loses nothing.
brain_region_resources = [
    forge.retrieve(el)
    for el in morphologies_df["brainLocation.brainRegion.id"].unique()
]

In [None]:
# Lookup table: brain region id -> (notation, prefLabel).
# NOTE(review): assumes every retrieval above returned a resource; a None
# entry would fail on `r.id` — confirm.
brain_region_notation = {
    r.id: (r.notation, r.prefLabel)
    for r in brain_region_resources
}

In [None]:
brain_region_notation

In [None]:
def get_notation(region_id):
    """Return the notation of a brain region, caching remote lookups.

    Parameters
    ----------
    region_id : str
        Identifier of the brain region resource.

    Returns
    -------
    The ``notation`` attribute of the region resource.

    Raises
    ------
    ValueError
        If the region is not cached and ``forge.retrieve`` returns nothing.
    """
    try:
        return brain_region_notation[region_id][0]
    except KeyError:
        pass

    resource = forge.retrieve(region_id)
    if resource is None:
        raise ValueError(f"Could not retrieve brain region '{region_id}'")

    entry = (resource.notation, resource.prefLabel)
    # Cache under the id that was asked for: the id carried by the retrieved
    # resource is not guaranteed to be the same string, and keying only on
    # it made the final lookup fail with a KeyError in that case.
    brain_region_notation[region_id] = entry
    brain_region_notation[resource.id] = entry
    return entry[0]

In [None]:
def get_neurite_features(data):
    """Flatten the ``neuriteFeature`` annotations of one morphology.

    Parameters
    ----------
    data : list of dict, dict, None or NaN
        Neurite feature records. Each record is read for the keys ``type``,
        ``cumulatedLength``, ``longestBranchLength``,
        ``longestBranchNumberOfNodes``, ``numberOfProjections``,
        ``projectionBrainRegion`` and ``traversedBrainRegion``. A single
        record may be passed as a bare dict; ``None``/NaN means "no
        annotation".

    Returns
    -------
    dict
        Always contains the three ``has_<type>_feature`` flags (0/1). For
        every record present it also contains ``<type>_<feature>`` entries,
        where the two brain-region features are sets of region notations.
    """
    result = {
        "has_Axon_feature": 0,
        "has_BasalDendrite_feature": 0,
        "has_ApicalDendrite_feature": 0
    }

    # A missing value (None, or the float NaN pandas uses for absent cells)
    # carries no records; iterating it would raise a TypeError.
    if data is None or (isinstance(data, float) and data != data):
        return result
    # A lone record can arrive as a dict instead of a one-element list, the
    # same single-vs-list ambiguity handled for the region fields below.
    if isinstance(data, dict):
        data = [data]

    def _notations(regions):
        """Return the set of notations for one region or a list of regions."""
        if not isinstance(regions, list):
            regions = [regions]
        return {get_notation(region["id"]) for region in regions}

    for el in data:
        prefix = el["type"]
        result["has_" + prefix + "_feature"] = 1
        result[f"{prefix}_cumulatedLength"] = el["cumulatedLength"]["value"]
        result[f"{prefix}_longestBranchLength"] = el["longestBranchLength"]["value"]
        result[f"{prefix}_longestBranchNumberOfNodes"] = el["longestBranchNumberOfNodes"]
        result[f"{prefix}_numberOfProjections"] = el["numberOfProjections"]
        result[f"{prefix}_projectionBrainRegion"] = _notations(
            el["projectionBrainRegion"])
        result[f"{prefix}_traversedBrainRegion"] = _notations(
            el["traversedBrainRegion"])

    return result

In [None]:
# One flat feature dict per morphology, in row order.
neurite_features = morphologies_df["neuriteFeature"].apply(get_neurite_features).tolist()

In [None]:
# Table of neurite features; built from a list, so it gets a default 0..n-1 index.
neurite_feature_df = pd.DataFrame(neurite_features)

In [None]:
# Swap every brain region id for its notation (first item of the cached pair).
morphologies_df["brainLocation.brainRegion.id"] = (
    morphologies_df["brainLocation.brainRegion.id"]
    .map(lambda region_id: brain_region_notation[region_id][0])
)

In [None]:
morphologies_df.head(5)

## Create a property graph without edges

In [None]:
# Node table: morphology metadata next to the flattened neurite features,
# indexed by "@id". The raw `neuriteFeature` column is no longer needed.
nodes = (
    pd.concat([morphologies_df, neurite_feature_df], axis=1)
    .drop(columns=["neuriteFeature"])
    .rename(columns={"id": "@id"})
    .set_index("@id")
)

In [None]:
# Wrap the node table in a property-graph frame and declare property types.
frame = PandasPGFrame()
frame._nodes = nodes
categorical_props = [
    "brainLocation.brainRegion.id",
    "subject.name",
    "has_Axon_feature",
    "has_BasalDendrite_feature",
    "has_ApicalDendrite_feature",
    "BasalDendrite_projectionBrainRegion",
    "BasalDendrite_traversedBrainRegion",
    "ApicalDendrite_projectionBrainRegion",
    "ApicalDendrite_traversedBrainRegion",
    "Axon_projectionBrainRegion",
    "Axon_traversedBrainRegion",
]
for column in nodes.columns:
    if column == "@type":
        continue
    if column in categorical_props:
        frame.node_prop_as_category(column)
        continue
    try:
        frame.node_prop_as_numeric(column)
    except ValueError:
        # Best effort: a column whose numeric conversion raises is skipped.
        pass
# Property names are rewritten with "_" in place of ".".
frame.rename_node_properties(
    {p: p.replace(".", "_") for p in frame.node_properties()})

## Encode properties

In [None]:
# Encoder over all node properties: missing numeric values are mean-imputed
# and the node vectors are reduced to 64 components.
encoder = ScikitLearnPGEncoder(
    node_properties=frame.node_properties(),
    missing_numeric="impute",
    imputation_strategy="mean",
    reduce_node_dims=True,
    n_node_components=64)

In [None]:
# Fit the encoder on the frame and get the encoded frame (node vectors are
# read from its "features" column below).
encoded_frame = encoder.fit_transform(frame)

Fraction of the data variance explained by the retained components:

In [None]:
sum(encoder.node_reducer.explained_variance_ratio_)

Let's plot a 2D projection of the encoded feature vectors.

In [None]:
# Stack the per-node feature vectors into one array and project it to 2D.
X = np.array(encoded_frame._nodes["features"].tolist())
features_2d = transform_to_2d(X)

In [None]:
# Scatter plot of the 2D projection, labelled by the brain region property.
plot_2d(
    frame, vectors=features_2d, label_prop="brainLocation_brainRegion_id",
    title="Colored by Brain Region")

## Create co-projection PGFrames 

### Axon co-projection graph

In [None]:
# Generate co-occurrence edges between nodes from their
# `Axon_projectionBrainRegion` values, with a "frequency" statistic per edge.
gen = CooccurrenceGenerator(frame)
axon_edges = gen.generate_from_nodes(
    "Axon_projectionBrainRegion",
    compute_statistics=["frequency"])

In [None]:
# Keep only the edges whose frequency is greater than 3.
axon_edges = axon_edges.loc[axon_edges["frequency"].to_numpy() > 3]

In [None]:
axon_edges.shape

In [None]:
# Graph of encoded nodes plus the filtered axon edges; "frequency" is declared
# numeric because it is later passed to the embedder as the edge weight.
axon_coprojection_frame = PandasPGFrame.from_frames(
    nodes=encoded_frame._nodes, edges=axon_edges)
axon_coprojection_frame.edge_prop_as_numeric("frequency")

### Dendrite co-projection graph

In [None]:
# Same edge generation as for axons, based on the
# `BasalDendrite_projectionBrainRegion` property.
gen = CooccurrenceGenerator(frame)
dendrite_edges = gen.generate_from_nodes(
    "BasalDendrite_projectionBrainRegion",
    compute_statistics=["frequency"])

In [None]:
# Keep only the edges whose frequency is greater than 1.
dendrite_edges = dendrite_edges.loc[dendrite_edges["frequency"].to_numpy() > 1]

In [None]:
dendrite_edges.shape

In [None]:
# Graph of encoded nodes plus the filtered dendrite edges; "frequency" is
# declared numeric because it is later passed to the embedder as edge weight.
dendrite_coprojection_frame = PandasPGFrame.from_frames(
    nodes=encoded_frame._nodes, edges=dendrite_edges)
dendrite_coprojection_frame.edge_prop_as_numeric("frequency")

## Embed nodes

### Axon co-projection embedding

In [None]:
axon_D = 128

In [None]:
# Fit an attri2vec model on the axon co-projection graph. Node vectors come
# from the "features" property, "frequency" is passed as the edge weight and
# the embedding size is `axon_D`.
axon_attri2vec_embedder = StellarGraphNodeEmbedder(
    "attri2vec", feature_vector_prop="features",
    length=5, number_of_walks=10,
    epochs=10, embedding_dimension=axon_D, edge_weight="frequency")
axon_embedding = axon_attri2vec_embedder.fit_model(axon_coprojection_frame)

In [None]:
# Store the embedding on the graph as the node property "attri2vec".
axon_coprojection_frame.add_node_properties(
    axon_embedding.rename(columns={"embedding": "attri2vec"}))

In [None]:
# 2D projection of the axon embedding; labels are read from `frame`, which
# holds the brain region property.
embedding_2d = transform_to_2d(axon_coprojection_frame._nodes["attri2vec"].tolist())
plot_2d(frame, vectors=embedding_2d, label_prop="brainLocation_brainRegion_id")

### Dendrite co-projection embedding

In [None]:
dendrite_D = 100

In [None]:
# Fit an attri2vec model on the dendrite co-projection graph. Node vectors
# come from the "features" property, "frequency" is passed as the edge weight
# and the embedding size is `dendrite_D`.
dendrite_attri2vec_embedder = StellarGraphNodeEmbedder(
    "attri2vec", feature_vector_prop="features",
    length=6, number_of_walks=20,
    epochs=15, embedding_dimension=dendrite_D, edge_weight="frequency")
dendrite_embedding = dendrite_attri2vec_embedder.fit_model(dendrite_coprojection_frame)

In [None]:
# Store the embedding on the graph as the node property "attri2vec".
dendrite_coprojection_frame.add_node_properties(
    dendrite_embedding.rename(columns={"embedding": "attri2vec"}))

In [None]:
# 2D projection of the dendrite embedding; labels are read from `frame`,
# which holds the brain region property.
embedding_2d = transform_to_2d(dendrite_coprojection_frame._nodes["attri2vec"].tolist())
plot_2d(frame, vectors=embedding_2d, label_prop="brainLocation_brainRegion_id")

In [None]:
# with open ("meta.tsv", "w") as f:
#     f.write("id\tregion\n")
#     for el in dendrite_attri2vec_embedding["embedding"].index:
#         f.write("{}\t{}\n".format(el, frame._nodes.loc[el, "brainLocation_brainRegion_id"]))

In [None]:
# with open ("vecs.tsv", "w") as f:
#     for el in dendrite_attri2vec_embedding["embedding"].tolist():
#         f.write("\t".join([str(v) for v in el]) + "\n")

## Create and save the embedding pipelines

### Axon co-projection pipeline

In [None]:
# Register the axon embedding vectors, keyed by the embedding index, in a
# euclidean similarity processor, then bundle encoder, embedder and
# similarity processor into one pipeline.
sim_processor = SimilarityProcessor(similarity="euclidean", dimension=axon_D)
sim_processor.add(axon_embedding["embedding"].tolist(),
                  axon_embedding.index)
pipeline = EmbeddingPipeline(
    preprocessor=encoder,
    embedder=axon_attri2vec_embedder,
    similarity_processor=sim_processor)

In [None]:
# Persist the axon pipeline under this base name.
# NOTE(review): compress=True presumably writes `<name>.zip` — confirm.
pipeline.save("SEU_morph_axon_coproj_attri2vec_euclidean", compress=True)

### Dendrite co-projection pipeline

In [None]:
# Same bundle for the dendrite embedding. Note that `sim_processor` and
# `pipeline` are rebound here, replacing the axon objects.
sim_processor = SimilarityProcessor(similarity="euclidean", dimension=dendrite_D)
sim_processor.add(dendrite_embedding["embedding"].tolist(),
                  dendrite_embedding.index)
pipeline = EmbeddingPipeline(
    preprocessor=encoder,
    embedder=dendrite_attri2vec_embedder,
    similarity_processor=sim_processor)

In [None]:
# Persist the dendrite pipeline under this base name.
# NOTE(review): compress=True presumably writes `<name>.zip` — confirm.
pipeline.save("SEU_morph_dendrite_coproj_attri2vec_euclidean", compress=True)

### 5. TODO: Push the model into the model catalog

In [None]:
# Prompt for a token again; it authenticates the catalog session created in
# the next cell and is decoded by `get_agent`.
TOKEN = getpass.getpass()

In [None]:
# Rebind `forge` to the `dke/embedder_catalog` bucket on the staging endpoint;
# the models are registered through this session.
forge = KnowledgeGraphForge(
    "configs/new-forge-config.yaml",
    endpoint="https://staging.nexus.ocp.bbp.epfl.ch/v1",
    token=TOKEN,
    bucket="dke/embedder_catalog")

In [None]:
def get_agent(token):
    """Build a ``Person`` agent resource from the claims of a JWT.

    Parameters
    ----------
    token : str
        Encoded JSON Web Token of the current user.

    Returns
    -------
    Resource
        Resource keeping the ``name``, ``email``, ``sub`` and
        ``preferred_username`` claims, with ``id`` set to the ``sub`` claim
        and ``type`` set to ``"Person"``.
    """
    # The signature is deliberately not checked: the token is only read to
    # extract identity claims. The legacy `verify=False` keyword no longer
    # disables verification in PyJWT >= 2.0 (decoding then fails for lack of
    # `algorithms`), so verification is switched off through `options`.
    agent_data = jwt.decode(token, options={"verify_signature": False})
    agent = forge.reshape(
        forge.from_json(agent_data),
        keep=["name", "email", "sub", "preferred_username"])
    agent.id = agent.sub
    agent.type = "Person"
    return agent

In [None]:
def push_model(forge, agent, name, description, distribution):
    """Register an embedding model in the knowledge graph.

    Parameters
    ----------
    forge : KnowledgeGraphForge
        Session used to create and register the resource.
    agent : Resource
        Contributor attached to the model.
    name : str
        Name of the model resource.
    description : str
        Description of the model resource.
    distribution : str
        Path of the file attached as the model's distribution.
    """
    # Create a new model resource
    model_resource = Dataset(
        forge,
        name=name,
        description=description)
    model_resource.type = ["Dataset", "EmbeddingModel"]
    # Add distribution
    model_resource.add_distribution(
        distribution, content_type="application/octet-stream")
    # Add contribution
    model_resource.add_contribution(agent, versioned=False)
    model_resource.contribution.hadRole = "Engineer"

    forge.register(model_resource)

In [None]:
agent = get_agent(TOKEN)

In [None]:
# The archive name must match the name the axon pipeline was saved under
# ("SEU_morph_axon_coproj_attri2vec_euclidean"); the previous value lacked
# the "_coproj" part and pointed at a file this notebook never writes.
push_model(
    forge, agent, "SEU NeuronMorphology Axon Co-Projection Embedding",
    "Node embedding model built on an axon co-projection graph extracted from the SEU neuron morphology dataset resources",
    "SEU_morph_axon_coproj_attri2vec_euclidean.zip")

In [None]:
# Register the dendrite model; the archive base name is the one the dendrite
# pipeline was saved under.
push_model(
    forge, agent, "SEU NeuronMorphology Dendrite Co-Projection Embedding",
    "Node embedding model built on a dendrite co-projection graph extracted from the SEU neuron morphology dataset resources",
    "SEU_morph_dendrite_coproj_attri2vec_euclidean.zip")

## Visualize the graph in Gephi

In [None]:
# NOTE(review): `frame` is the edge-less property graph built earlier, while
# the edge mapping below refers to "frequency", which is only set on the
# co-projection frames — confirm which frame is meant to be exported.
frame.export_to_gephi(
    "seu_morphologies_co_proj",
    node_attr_mapping={
        "brainLocation_brainRegion_id": "Region"
    },
    edge_attr_mapping={
        "frequency": "weight"
    })

<img src="figures/neu_co_proj_illustration.png">

## Create a df with different representations (for testing pipelined recommendation/search)

### Create coordinate vectors

In [None]:
# Start the coordinate table from the morphology ids.
coordinate_df = pd.DataFrame(morphologies_df["id"])

In [None]:
# Pack the three atlas coordinates of every morphology into one list per row.
# NOTE(review): the new Series has a default integer index, so the assignment
# lines up with `coordinate_df` only if that frame is indexed 0..n-1 — confirm.
coordinate_df["coordinates"] = pd.Series(morphologies_df[[
    "brainLocation.coordinatesInBrainAtlas.valueX.value",
    "brainLocation.coordinatesInBrainAtlas.valueY.value",
    "brainLocation.coordinatesInBrainAtlas.valueZ.value"
]].values.tolist())
coordinate_df = coordinate_df.rename(columns={"id": "@id"})

In [None]:
# 2D projection of the raw atlas coordinates. Labels are read from `frame`:
# `axon_proj_frame` is only created further down, so using it here raised a
# NameError on a top-to-bottom run. `frame` holds the same nodes in the same
# order and carries the brain region property.
embedding_2d = transform_to_2d(coordinate_df["coordinates"].tolist())
plot_2d(frame, vectors=embedding_2d, label_prop="brainLocation_brainRegion_id")

### Create neurite feature vectors

In [None]:
projection_columns = [
    "Axon_traversedBrainRegion",
    "Axon_projectionBrainRegion",
    "BasalDendrite_traversedBrainRegion",
    "BasalDendrite_projectionBrainRegion",
    "ApicalDendrite_traversedBrainRegion",
    "ApicalDendrite_projectionBrainRegion"
]

In [None]:
# Neurite feature columns of the node table, minus the brain-region set columns.
# Note that this rebinds `neurite_features` (previously a list of dicts).
neurite_features = nodes[[c for c in neurite_feature_df.columns if c not in projection_columns]]

In [None]:
# Edge-less property graph holding only the neurite feature columns.
neurite_frame = PandasPGFrame.from_frames(
    nodes=neurite_features, edges=pd.DataFrame())

In [None]:
# Declare every remaining neurite feature column as numeric.
for c in neurite_frame._nodes.columns:
    neurite_frame.node_prop_as_numeric(c)

In [None]:
# Encode the neurite features with mean imputation (no dimension reduction
# is requested here).
# NOTE: this rebinds `encoder` and `encoded_frame`, replacing the objects
# created in the "Encode properties" section.
encoder = ScikitLearnPGEncoder(
    node_properties=neurite_frame.node_properties(),
    missing_numeric="impute",
    imputation_strategy="mean")
encoded_frame = encoder.fit_transform(neurite_frame)

In [None]:
# Encoded node table with its vector column renamed to "neurite_features".
neurite_features = encoded_frame._nodes.rename(columns={"features": "neurite_features"})

In [None]:
# 2D projection of the encoded neurite features. Labels are read from
# `frame`: `axon_proj_frame` is only created in the next section, so using
# it here raised a NameError on a top-to-bottom run. `frame` holds the same
# nodes in the same order and carries the brain region property.
embedding_2d = transform_to_2d(encoded_frame._nodes["features"].tolist())
plot_2d(frame, vectors=embedding_2d, label_prop="brainLocation_brainRegion_id")

### Create co-projection graph representations

In [None]:
# Two edge-less graphs over the same nodes, each keeping the brain region
# label and one projection property (axon / basal dendrite).
axon_proj_frame = PandasPGFrame.from_frames(
    nodes=frame._nodes[["brainLocation_brainRegion_id", "Axon_projectionBrainRegion"]], edges=pd.DataFrame())
dendrite_proj_frame = PandasPGFrame.from_frames(
    nodes=frame._nodes[["brainLocation_brainRegion_id", "BasalDendrite_projectionBrainRegion"]], edges=pd.DataFrame())

Create a co-projection graph based on axon projections (output)

In [None]:
# Generate axon co-projection edges, keep those with frequency > 3 and attach
# them to the frame with "frequency" declared numeric.
gen = CooccurrenceGenerator(axon_proj_frame)
edges = gen.generate_from_nodes(
    "Axon_projectionBrainRegion",
    compute_statistics=["frequency"])
edges = edges[edges["frequency"].values > 3]
axon_proj_frame._edges = edges
axon_proj_frame.edge_prop_as_numeric("frequency")

In [None]:
# Fit a 128-dimensional "watchyourstep" embedding on the axon co-projection graph.
watchyourstep_embedder = StellarGraphNodeEmbedder(
    "watchyourstep", directed=False, epochs=100, num_walks=80,
    embedding_dimension=128, num_powers=10)
watchyourstep_embedding = watchyourstep_embedder.fit_model(axon_proj_frame)

In [None]:
# Store the embedding on the graph as "axon_coproj_embedding".
axon_proj_frame.add_node_properties(
    watchyourstep_embedding.rename(columns={"embedding": "axon_coproj_embedding"}))

In [None]:
# 2D projection of the axon co-projection embedding, labelled by brain region.
embedding_2d = transform_to_2d(axon_proj_frame._nodes["axon_coproj_embedding"].tolist())
plot_2d(axon_proj_frame, vectors=embedding_2d, label_prop="brainLocation_brainRegion_id")

Create a co-projection graph based on dendrite projections (input)

In [None]:
# Generate dendrite co-projection edges, keep those with frequency > 3 and
# attach them to the frame with "frequency" declared numeric.
gen = CooccurrenceGenerator(dendrite_proj_frame)
edges = gen.generate_from_nodes(
    "BasalDendrite_projectionBrainRegion",
    compute_statistics=["frequency"])
edges = edges[edges["frequency"].values > 3]
dendrite_proj_frame._edges = edges
dendrite_proj_frame.edge_prop_as_numeric("frequency")

In [None]:
# Fit a 128-dimensional "watchyourstep" embedding on the dendrite graph.
# NOTE: rebinds `watchyourstep_embedder` and `watchyourstep_embedding`,
# replacing the axon objects of the same names.
watchyourstep_embedder = StellarGraphNodeEmbedder(
    "watchyourstep", directed=False, epochs=100, num_walks=80,
    embedding_dimension=128, num_powers=10)
watchyourstep_embedding = watchyourstep_embedder.fit_model(dendrite_proj_frame)

In [None]:
# Store the embedding on the graph as "dendrite_coproj_embedding".
dendrite_proj_frame.add_node_properties(
    watchyourstep_embedding.rename(columns={"embedding": "dendrite_coproj_embedding"}))

In [None]:
# 2D projection of the dendrite co-projection embedding, labelled by brain region.
embedding_2d = transform_to_2d(dendrite_proj_frame._nodes["dendrite_coproj_embedding"].tolist())
plot_2d(dendrite_proj_frame, vectors=embedding_2d, label_prop="brainLocation_brainRegion_id")

In [None]:
# Cast every coordinate to a plain float.
# NOTE(review): `combined_data` is created by the merge cell further down;
# this cell must be run after it — consider moving it below that cell.
combined_data["coordinates"] = combined_data["coordinates"].apply(lambda x: [float(el) for el in x])

In [None]:
# Convert each neurite feature vector to a list via its `.tolist()` method.
# NOTE(review): `combined_data` is created by the merge cell further down;
# this cell must be run after it — consider moving it below that cell.
combined_data["neurite_features"] = combined_data["neurite_features"].apply(
    lambda x: x.tolist())

In [None]:
# Join every representation of a morphology into one table keyed by "@id":
# coordinates, encoded neurite features, both co-projection embeddings and
# the metadata columns. Every merge names its key explicitly; the dendrite
# merge previously relied on pandas inferring the shared column.
# NOTE(review): this cell defines `combined_data`, yet the two cells placed
# before it in the notebook already read it — run this cell first.
combined_data = coordinate_df.merge(
    neurite_features.reset_index(), on="@id").merge(
        axon_proj_frame._nodes[["axon_coproj_embedding"]].reset_index(),
        on="@id").merge(
            dendrite_proj_frame._nodes[["dendrite_coproj_embedding"]].reset_index(),
            on="@id").merge(
                morphologies_df[[
                    "id",
                    "brainLocation.brainRegion.id",
                    "atlasRelease.id",
                    "objectOfStudy.label",
                    "generation.activity.hadProtocol.id"]].rename(columns={"id": "@id"}),
                on="@id")

In [None]:
# Write the combined table to CSV.
# NOTE(review): `index=False` is the documented way to omit the index column;
# confirm that None is treated the same way.
combined_data.to_csv("combined_data.csv", index=None)