In [1]:
import pandas as pd
import numpy as np

from sklearn import model_selection
from sklearn.svm import LinearSVC

from bluegraph.core import PandasPGFrame
from bluegraph.preprocess.generators import CooccurrenceGenerator
from bluegraph.preprocess.encoders import ScikitLearnPGEncoder

from bluegraph.backends.stellargraph import StellarGraphNodeEmbedder

from bluegraph.downstream.similarity import (SimilarityProcessor, NodeSimilarityProcessor)
from bluegraph.downstream.node_classification import NodeClassifier
from bluegraph.downstream.benchmark import get_classification_scores
from bluegraph.downstream.link_prediction import (generate_negative_edges,
                                                  EdgePredictor)

from bluegraph.downstream.utils import EmbeddingPipeline
from bluegraph.core.embed.embedders import GraphElementEmbedder

## Data preparation

First, we read the source dataset with mentions of entities in different paragraphs.

In [2]:
# Load the entity-mention table (one row per mention of an entity).
mentions = pd.read_csv("data/labeled_entity_occurrence.csv")

In [3]:
# The "occurrence" column holds paper/section/paragraph identifiers: expose it
# as "paragraph" and count how many distinct identifiers the corpus contains.
mentions = mentions.rename(columns={"occurrence": "paragraph"})
number_of_paragraphs = mentions["paragraph"].nunique(dropna=False)

In [4]:
mentions

Unnamed: 0,entity,paragraph
0,lithostathine-1-alpha,1
1,pulmonary,1
2,host,1
3,lithostathine-1-alpha,2
4,surfactant protein d measurement,2
...,...,...
2281346,covid-19,227822
2281347,covid-19,227822
2281348,viral infection,227823
2281349,lipid,227823


We will also load a dataset that contains definitions of entities and their types

In [5]:
# Load the table holding, for each entity, its type and textual definition.
entity_data = pd.read_csv("data/entity_types_defs.csv")

In [6]:
entity_data

Unnamed: 0,entity,entity_type,definition
0,(e3-independent) e2 ubiquitin-conjugating enzyme,PROTEIN,(E3-independent) E2 ubiquitin-conjugating enzy...
1,(h115d)vhl35 peptide,CHEMICAL,A peptide vaccine derived from the von Hippel-...
2,"1,1-dimethylhydrazine",DRUG,"A clear, colorless, flammable, hygroscopic liq..."
3,"1,2-dimethylhydrazine",CHEMICAL,A compound used experimentally to induce tumor...
4,"1,25-dihydroxyvitamin d(3) 24-hydroxylase, mit...",PROTEIN,"1,25-dihydroxyvitamin D(3) 24-hydroxylase, mit..."
...,...,...,...
28127,zygomycosis,DISEASE,Any infection due to a fungus of the Zygomycot...
28128,zygomycota,ORGANISM,A phylum of fungi that are characterized by ve...
28129,zygosity,ORGANISM,"The genetic condition of a zygote, especially ..."
28130,zygote,CELL_COMPARTMENT,"The cell formed by the union of two gametes, e..."


### Generation of a co-occurrence graph

We first create a graph whose nodes are entities

In [7]:
# Every distinct entity mentioned in the corpus becomes a node typed "Entity".
entity_nodes = mentions["entity"].unique()
graph = PandasPGFrame()
graph.add_nodes(entity_nodes)
graph.add_node_types(dict.fromkeys(entity_nodes, "Entity"))

# Index the entity metadata by "@id", then attach the entity type (categorical)
# and the definition (text) as node properties.
entity_props = entity_data.set_index("entity").rename_axis("@id")
graph.add_node_properties(
    entity_props["entity_type"], prop_type="category")
graph.add_node_properties(
    entity_props["definition"], prop_type="text")

In [8]:
# For each entity, gather the set of paragraph identifiers that mention it and
# store that set as the "paragraphs" node property.
paragraph_prop = (
    mentions.groupby("entity")["paragraph"]
    .aggregate(set)
    .to_frame(name="paragraphs")
)
graph.add_node_properties(paragraph_prop, prop_type="category")

In [9]:
graph.nodes(raw_frame=True)

Unnamed: 0_level_0,@type,entity_type,definition,paragraphs
@id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
lithostathine-1-alpha,Entity,PROTEIN,"Lithostathine-1-alpha (166 aa, ~19 kDa) is enc...","{1, 2, 3, 18178, 195589, 104454, 88967, 104455..."
pulmonary,Entity,ORGAN,Relating to the lungs as the intended site of ...,"{1, 196612, 196613, 196614, 196621, 196623, 16..."
host,Entity,ORGANISM,An organism that nourishes and supports anothe...,"{1, 114689, 3, 221193, 180243, 180247, 28, 180..."
surfactant protein d measurement,Entity,PROTEIN,The determination of the amount of surfactant ...,"{145537, 2, 3, 4, 5, 6, 51202, 103939, 103940,..."
communication response,Entity,PATHWAY,A statement (either spoken or written) that is...,"{46592, 64000, 2, 28162, 166912, 226304, 88585..."
...,...,...,...,...
drug binding site,Entity,PATHWAY,The reactive parts of a macromolecule that dir...,"{225082, 225079}"
carbaril,Entity,CHEMICAL,A synthetic carbamate acetylcholinesterase inh...,"{225408, 225409, 225415, 225419, 225397}"
ny-eso-1 positive tumor cells present,Entity,CELL_TYPE,An indication that Cancer/Testis Antigen 1 exp...,"{225544, 226996}"
mustelidae,Entity,ORGANISM,Taxonomic family which includes the Ferret.,"{225901, 225903}"


For each node we will add the `frequency` property that counts the total number of paragraphs where the entity was mentioned.

In [10]:
# The size of each node's paragraph set is the number of paragraphs mentioning
# the entity; it is stored under the "frequency" node property.
frequencies = graph._nodes["paragraphs"].apply(len).rename("frequency")
graph.add_node_properties(frequencies)

In [11]:
graph.nodes(raw_frame=True)

Unnamed: 0_level_0,@type,entity_type,definition,paragraphs,frequency
@id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
lithostathine-1-alpha,Entity,PROTEIN,"Lithostathine-1-alpha (166 aa, ~19 kDa) is enc...","{1, 2, 3, 18178, 195589, 104454, 88967, 104455...",80
pulmonary,Entity,ORGAN,Relating to the lungs as the intended site of ...,"{1, 196612, 196613, 196614, 196621, 196623, 16...",8295
host,Entity,ORGANISM,An organism that nourishes and supports anothe...,"{1, 114689, 3, 221193, 180243, 180247, 28, 180...",2660
surfactant protein d measurement,Entity,PROTEIN,The determination of the amount of surfactant ...,"{145537, 2, 3, 4, 5, 6, 51202, 103939, 103940,...",268
communication response,Entity,PATHWAY,A statement (either spoken or written) that is...,"{46592, 64000, 2, 28162, 166912, 226304, 88585...",160
...,...,...,...,...,...
drug binding site,Entity,PATHWAY,The reactive parts of a macromolecule that dir...,"{225082, 225079}",2
carbaril,Entity,CHEMICAL,A synthetic carbamate acetylcholinesterase inh...,"{225408, 225409, 225415, 225419, 225397}",5
ny-eso-1 positive tumor cells present,Entity,CELL_TYPE,An indication that Cancer/Testis Antigen 1 exp...,"{225544, 226996}",2
mustelidae,Entity,ORGANISM,Taxonomic family which includes the Ferret.,"{225901, 225903}",2


Now, to construct the co-occurrence network, we will select only the 1000 most frequent entities.

In [12]:
# Identifiers of the 1000 nodes with the largest "frequency" value; only these
# are passed to the co-occurrence generator below.
nodes_to_include = graph._nodes.nlargest(1000, "frequency").index

In [13]:
nodes_to_include

Index(['covid-19', 'blood', 'human', 'infectious disorder', 'heart',
       'diabetes mellitus', 'lung', 'sars-cov-2', 'mouse', 'pulmonary',
       ...
       'wheezing', 'chief complaint', 'azathioprine', 'ileum', 'hematology',
       'nonalcoholic steatohepatitis', 'nervous system disorder',
       'renal impairment', 'urticaria', 'rectum'],
      dtype='object', name='@id', length=1000)

The `CooccurrenceGenerator` class allows us to generate co-occurrence edges from overlaps in node property values or from edges (or edge properties). In this case we consider the `paragraphs` node property and construct co-occurrence edges from overlapping sets of paragraphs. In addition, we will compute some co-occurrence statistics: total co-occurrence frequency and normalized pointwise mutual information (NPMI).

In [14]:
%%time
# Restrict co-occurrence generation to the subgraph of the selected frequent entities.
generator = CooccurrenceGenerator(graph.subgraph(nodes=nodes_to_include))
# Edges are derived from the "paragraphs" node property. The number of distinct
# paragraphs is passed as the total number of factor instances, and the
# "frequency" and "npmi" statistics are requested for every generated edge.
# NOTE(review): cores=8 is hard-coded — confirm it suits the machine running this.
paragraph_cooccurrence_edges = generator.generate_from_nodes(
    "paragraphs", total_factor_instances=number_of_paragraphs,
    compute_statistics=["frequency", "npmi"],
    parallelize=True, cores=8)

Examining 499500 pairs of terms for co-occurrence...
CPU times: user 7.62 s, sys: 2.26 s, total: 9.88 s
Wall time: 1min 20s


In [15]:
# Use the mean NPMI over all generated edges as the threshold applied in the next cell.
cutoff = paragraph_cooccurrence_edges["npmi"].mean()

In [16]:
# Keep only the edges whose NPMI is strictly above the cutoff.
paragraph_cooccurrence_edges = paragraph_cooccurrence_edges.loc[
    paragraph_cooccurrence_edges["npmi"].gt(cutoff)
]

We add generated edges to the original graph

In [17]:
# Install the filtered co-occurrence table as the graph's edge frame.
# NOTE(review): this assigns a private attribute of PandasPGFrame directly —
# confirm whether a public method should be used instead.
graph._edges = paragraph_cooccurrence_edges
# Declare both edge statistics as numeric properties via edge_prop_as_numeric.
graph.edge_prop_as_numeric("frequency")
graph.edge_prop_as_numeric("npmi")

In [18]:
graph.edges(raw_frame=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,common_factors,frequency,npmi
@source_id,@target_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
surfactant protein d measurement,microorganism,"{2, 3, 7810, 58, 41, 7754, 7850, 26218, 7853, ...",19,0.235263
surfactant protein d measurement,lung,"{2, 103939, 51202, 5, 4, 103940, 15, 145438, 3...",93,0.221395
surfactant protein d measurement,alveolar,"{223872, 2, 51202, 100502, 7831, 149657, 19522...",25,0.336175
surfactant protein d measurement,epithelial cell,"{2, 4, 5, 222298, 7825, 7732, 7733, 169174, 7738}",9,0.175923
surfactant protein d measurement,molecule,"{2, 7750, 49991, 134504, 206448, 49, 52, 20645...",10,0.113611
...,...,...,...,...
ciliated bronchial epithelial cell,cystic fibrosis pulmonary exacerbation,{46779},1,0.088963
ciliated bronchial epithelial cell,caax prenyl protease 2,"{215748, 220047}",2,0.151639
ciliated bronchial epithelial cell,transmembrane protease serine 2,"{167360, 167358, 167301, 214566, 214567, 16138...",14,0.305697
n-terminal fragment brain natriuretic protein,cardiac valve injury,"{217473, 217476, 164110, 162703, 218236, 18358...",40,0.436540


Recall that we have generated edges only for the 1000 most frequent entities; the rest of the entities will be isolated (having no incident edges). Let us remove all the isolated nodes.

In [19]:
# Drop the nodes left without incident edges; the node count shown in the next
# cell is taken after this removal.
graph.remove_isolated_nodes()

In [20]:
graph.number_of_nodes()

1000

Next, we save the generated co-occurrence graph.

In [None]:
# Persist the co-occurrence graph as JSON; the next cell reloads it from this path.
graph.export_json("data/cooccurrence_graph.json")

In [2]:
# Reload the co-occurrence graph from the JSON file written by the export cell above.
graph = PandasPGFrame.load_json("data/cooccurrence_graph.json")

### Node feature extraction

We extract node features from entity definitions using the `tfidf` model.

In [8]:
# Encoder for the text node property "definition".
# NOTE(review): text_encoding_max_dimension=512 presumably caps the size of the
# text feature vector — confirm against ScikitLearnPGEncoder.
encoder = ScikitLearnPGEncoder(
    node_properties=["definition"],
    text_encoding_max_dimension=512)

In [9]:
%%time
# Fit the encoder on the graph and build the transformed graph; its nodes carry
# the "features" vector used by the embedders and classifiers below.
transformed_graph = encoder.fit_transform(graph)

CPU times: user 531 ms, sys: 19.6 ms, total: 550 ms
Wall time: 550 ms


We can have a glance at the vocabulary that the encoder constructed for the 'definition' property

In [10]:
# Peek at the first ten terms of the vocabulary fitted for the "definition" property.
vocabulary = encoder._node_encoders["definition"].vocabulary_
list(vocabulary)[:10]

['relating',
 'lungs',
 'site',
 'administration',
 'product',
 'usually',
 'action',
 'lower',
 'respiratory',
 'tract']

We will add additional properties to our transformed graph corresponding to the entity type labels. We will also add NPMI as an edge property to this transformed graph.

In [6]:
# Copy the entity type labels from the original graph onto the transformed graph
# (they are the classification target later on).
transformed_graph.add_node_properties(
    graph.get_node_property_values("entity_type"))
# Copy the NPMI edge statistic as a numeric edge property (used as edge weight
# by the embedders below).
transformed_graph.add_edge_properties(
    graph.get_edge_property_values("npmi"), prop_type="numeric")

In [7]:
transformed_graph.nodes(raw_frame=True)

Unnamed: 0_level_0,features,@type,entity_type
@id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
pulmonary,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",Entity,ORGAN
host,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",Entity,ORGANISM
surfactant protein d measurement,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",Entity,PROTEIN
microorganism,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",Entity,ORGANISM
lung,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",Entity,ORGAN
...,...,...,...
candida parapsilosis,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",Entity,ORGANISM
ciliated bronchial epithelial cell,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",Entity,CELL_TYPE
cystic fibrosis pulmonary exacerbation,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",Entity,DISEASE
caax prenyl protease 2,"[0.0, 0.0, 0.3198444339599345, 0.0, 0.0, 0.0, ...",Entity,PROTEIN


## Node embedding and downstream tasks

### Node embedding using StellarGraph

Using `StellarGraphNodeEmbedder` we construct three different embeddings of our transformed graph corresponding to different embedding techniques.

In [9]:
# node2vec embedding (64 dimensions) using the "npmi" edge property as edge weight.
# NOTE(review): length / number_of_walks presumably set the random-walk length
# and the number of walks — confirm against StellarGraphNodeEmbedder.
node2vec_embedder = StellarGraphNodeEmbedder(
    "node2vec", edge_weight="npmi", embedding_dimension=64, length=10, number_of_walks=20)
# Fit the model on the transformed graph; the result is the embedding table
# displayed further below.
node2vec_embedding = node2vec_embedder.fit_model(transformed_graph)

In [10]:
# attri2vec embedding (128 dimensions): reads node attributes from the
# "features" property and uses "npmi" as edge weight.
attri2vec_embedder = StellarGraphNodeEmbedder(
    "attri2vec", feature_vector_prop="features",
    length=5, number_of_walks=10,
    epochs=10, embedding_dimension=128, edge_weight="npmi")
# Fit on the transformed graph and keep the resulting embedding table.
attri2vec_embedding = attri2vec_embedder.fit_model(transformed_graph)

link_classification: using 'ip' method to combine node embeddings into edge embeddings


In [11]:
# GCN Deep Graph Infomax embedding (512 dimensions, 250 epochs) over the
# "features" node property; no edge weight is passed here.
gcn_dgi_embedder = StellarGraphNodeEmbedder(
    "gcn_dgi", feature_vector_prop="features", epochs=250, embedding_dimension=512)
# Fit on the transformed graph and keep the resulting embedding table.
gcn_dgi_embedding = gcn_dgi_embedder.fit_model(transformed_graph)

Using GCN (local pooling) filters...


The `fit_model` method produces a dataframe of the following shape

In [12]:
node2vec_embedding

Unnamed: 0,embedding
pulmonary,"[0.058199942111968994, 0.14610271155834198, -0..."
host,"[-0.06427627801895142, -0.30984464287757874, 0..."
surfactant protein d measurement,"[-0.11748885363340378, -0.11227574199438095, 0..."
microorganism,"[0.19230332970619202, -0.8941215872764587, 0.0..."
lung,"[-0.07322319597005844, 0.02298489399254322, 0...."
...,...
candida parapsilosis,"[0.119646355509758, -0.3787333369255066, 0.051..."
ciliated bronchial epithelial cell,"[-0.11820391565561295, -0.16134190559387207, 0..."
cystic fibrosis pulmonary exacerbation,"[0.06567505747079849, -0.2529926896095276, 0.0..."
caax prenyl protease 2,"[-0.07791422307491302, 0.053784679621458054, 0..."


Let us add the embedding vectors obtained using different models as node properties of our graph.

In [13]:
# Store the node2vec vectors on the graph under the "node2vec" node property.
transformed_graph.add_node_properties(
    node2vec_embedding.rename({"embedding": "node2vec"}, axis="columns"))

In [14]:
# Store the attri2vec vectors on the graph under the "attri2vec" node property.
transformed_graph.add_node_properties(
    attri2vec_embedding.rename({"embedding": "attri2vec"}, axis="columns"))

In [15]:
# Store the GCN-DGI vectors on the graph under the "gcn_dgi" node property.
transformed_graph.add_node_properties(
    gcn_dgi_embedding.rename({"embedding": "gcn_dgi"}, axis="columns"))

In [16]:
transformed_graph.nodes(raw_frame=True)

Unnamed: 0_level_0,features,@type,entity_type,node2vec,attri2vec,gcn_dgi
@id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
pulmonary,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",Entity,ORGAN,"[0.058199942111968994, 0.14610271155834198, -0...","[0.022058159112930298, 0.019829988479614258, 0...","[0.0, 0.018860364332795143, 0.0043135471642017..."
host,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",Entity,ORGANISM,"[-0.06427627801895142, -0.30984464287757874, 0...","[0.03038617968559265, 0.05566278100013733, 0.0...","[0.0, 0.02871589921414852, 0.00203686766326427..."
surfactant protein d measurement,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",Entity,PROTEIN,"[-0.11748885363340378, -0.11227574199438095, 0...","[0.035707831382751465, 0.04418954253196716, 0....","[0.0, 0.02296564169228077, 0.00784338731318712..."
microorganism,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",Entity,ORGANISM,"[0.19230332970619202, -0.8941215872764587, 0.0...","[0.04623153805732727, 0.05907624959945679, 0.0...","[0.0, 0.04383104294538498, 0.0, 0.039705105125..."
lung,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",Entity,ORGAN,"[-0.07322319597005844, 0.02298489399254322, 0....","[0.015269577503204346, 0.016422420740127563, 0...","[0.0, 0.02111690863966942, 0.00910695362836122..."
...,...,...,...,...,...,...
candida parapsilosis,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",Entity,ORGANISM,"[0.119646355509758, -0.3787333369255066, 0.051...","[0.032234519720077515, 0.032671838998794556, 0...","[0.0, 0.026741722598671913, 0.0, 0.03041464462..."
ciliated bronchial epithelial cell,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",Entity,CELL_TYPE,"[-0.11820391565561295, -0.16134190559387207, 0...","[0.009313046932220459, 0.008033335208892822, 0...","[0.0, 0.023387573659420013, 0.0086997915059328..."
cystic fibrosis pulmonary exacerbation,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",Entity,DISEASE,"[0.06567505747079849, -0.2529926896095276, 0.0...","[0.0471823513507843, 0.06246209144592285, 0.03...","[0.0, 0.023007649928331375, 0.0, 0.02054181694..."
caax prenyl protease 2,"[0.0, 0.0, 0.3198444339599345, 0.0, 0.0, 0.0, ...",Entity,PROTEIN,"[-0.07791422307491302, 0.053784679621458054, 0...","[0.026905536651611328, 0.021910756826400757, 0...","[0.0, 0.017998207360506058, 0.0130367819219827..."


### Node similarity

We would like to be able to search for similar nodes using the computed vector embeddings. For this we can use the `NodeSimilarityProcessor` interfaces provided as a part of `bluegraph`.

We construct similarity processors for different embeddings and query top 10 most similar nodes to the terms `glucose` and `covid-19`.

In [18]:
# Two similarity processors over the "node2vec" node property: one using the
# Euclidean metric, one using cosine similarity.
node2vec_l2 = NodeSimilarityProcessor(transformed_graph, "node2vec", similarity="euclidean")
node2vec_cosine = NodeSimilarityProcessor(
    transformed_graph, "node2vec", similarity="cosine")

In [19]:
node2vec_l2.get_similar_nodes(["glucose", "covid-19"], k=10)

{'glucose': {'glucose': 0.0,
  'obesity': 0.024657082,
  'insulin': 0.027283816,
  'vascular disorder': 0.032074038,
  'hyperglycemia': 0.034703307,
  'high density lipoprotein': 0.0394917,
  'metabolic disorder': 0.04257843,
  'nonalcoholic fatty liver disease': 0.045563146,
  'triglycerides': 0.046047315,
  'angiotensin-converting enzyme': 0.046518676},
 'covid-19': {'covid-19': 0.0,
  'coronavirus': 0.018573485,
  'catecholamine': 0.08507119,
  'cardiac valve injury': 0.08957823,
  'person': 0.09138094,
  'diabetic ketoacidosis': 0.096787885,
  'middle east respiratory syndrome': 0.10105667,
  'lung disorder': 0.10322104,
  'hypoglycemia': 0.10616878,
  'troponin i, cardiac muscle': 0.10721621}}

In [20]:
node2vec_cosine.get_similar_nodes(["glucose", "covid-19"], k=10)

{'glucose': {'glucose': 1.0,
  'obesity': 0.9949844,
  'insulin': 0.99460316,
  'vascular disorder': 0.9930061,
  'triglycerides': 0.9929766,
  'nonalcoholic fatty liver disease': 0.99250674,
  'hyperglycemia': 0.9924481,
  'metabolic disorder': 0.9922365,
  'high density lipoprotein': 0.99159557,
  'angiotensin-converting enzyme': 0.9899549},
 'covid-19': {'covid-19': 1.0000001,
  'coronavirus': 0.9966542,
  'cardiac valve injury': 0.9889957,
  'troponin i, cardiac muscle': 0.9875988,
  'c-reactive protein': 0.9869299,
  'fatal': 0.9852569,
  'person': 0.98402774,
  'catecholamine': 0.9831245,
  'diabetic ketoacidosis': 0.98299456,
  'n-terminal fragment brain natriuretic protein': 0.98261017}}

In [21]:
# Two similarity processors over the "attri2vec" node property. The Euclidean
# metric is named explicitly (as in the node2vec cell) instead of relying on
# the constructor default, so the `_l2` name and the metric cannot drift apart.
attri2vec_l2 = NodeSimilarityProcessor(
    transformed_graph, "attri2vec", similarity="euclidean")
attri2vec_cosine = NodeSimilarityProcessor(
    transformed_graph, "attri2vec", similarity="cosine")

In [22]:
attri2vec_l2.get_similar_nodes(["glucose", "covid-19"], k=10)

{'glucose': {'glucose': 0.0,
  'aggregation': 0.021461695,
  'dna': 0.023853727,
  'undifferentiated pleomorphic sarcoma, inflammatory variant': 0.025215067,
  'digestion': 0.025289385,
  'acid': 0.025814142,
  'adenosine': 0.025855444,
  'embryo': 0.025894884,
  'thyroid gland': 0.026511166,
  'excision': 0.026981559},
 'covid-19': {'covid-19': 0.0,
  'pleural effusion': 0.0006802912,
  'anemia': 0.0006830017,
  'autoimmune disease': 0.0007813163,
  'cystic fibrosis': 0.00087290554,
  'vasculitis': 0.0008763613,
  'liver failure': 0.0009016517,
  'systemic inflammatory response syndrome': 0.00094110286,
  'dopamine': 0.0009539187,
  'respiratory failure': 0.0010581961}}

In [23]:
attri2vec_cosine.get_similar_nodes(["glucose", "covid-19"], k=10)

{'glucose': {'glucose': 1.0,
  'metal': 0.97050047,
  'aggregation': 0.96850204,
  'degradation': 0.9671387,
  'hydrophobicity': 0.96706814,
  'formation': 0.9666637,
  'epithelial': 0.96641994,
  'acid': 0.9661392,
  'autosome': 0.9657872,
  'base': 0.96562},
 'covid-19': {'covid-19': 1.0,
  'middle east respiratory syndrome': 0.9787885,
  'severe acute respiratory syndrome': 0.9759722,
  'wheezing': 0.9720144,
  'viral respiratory tract infection': 0.9716125,
  'systemic inflammatory response syndrome': 0.9667535,
  'pulmonary': 0.96416664,
  'dyspnea': 0.9639136,
  'asthma': 0.96206,
  'respiratory failure, ctcae': 0.9616722}}

In [24]:
# Two similarity processors over the "gcn_dgi" node property. The Euclidean
# metric is named explicitly (as in the node2vec cell) instead of relying on
# the constructor default, so the `_l2` name and the metric cannot drift apart.
gcn_l2 = NodeSimilarityProcessor(
    transformed_graph, "gcn_dgi", similarity="euclidean")
gcn_cosine = NodeSimilarityProcessor(
    transformed_graph, "gcn_dgi", similarity="cosine")

In [25]:
gcn_l2.get_similar_nodes(["glucose", "covid-19"], k=10)

{'glucose': {'glucose': 0.0,
  'glucose tolerance test': 0.0033582333,
  'insulin': 0.0036404473,
  'high density lipoprotein': 0.0038155522,
  'triglycerides': 0.004127929,
  'organic phosphate': 0.004997046,
  'cholesterol': 0.0055623124,
  'citrate': 0.006085644,
  'uric acid': 0.006244778,
  'serum protein': 0.006598945},
 'covid-19': {'covid-19': 0.0,
  'coronavirus': 0.000899521,
  'acute respiratory distress syndrome': 0.0024894448,
  'fatal': 0.0025519214,
  'sars-cov-2': 0.0043987706,
  'severe acute respiratory syndrome': 0.004473505,
  'myocarditis': 0.004814123,
  'angiotensin ii receptor antagonist': 0.004987154,
  'middle east respiratory syndrome': 0.005641263,
  'cardiac valve injury': 0.005660265}}

In [26]:
gcn_cosine.get_similar_nodes(["glucose", "covid-19"], k=10)

{'glucose': {'glucose': 1.0,
  'insulin': 0.98022723,
  'cholesterol': 0.9793629,
  'triglycerides': 0.979231,
  'glucose tolerance test': 0.9781836,
  'high density lipoprotein': 0.97648215,
  'low density lipoprotein': 0.970876,
  'organic phosphate': 0.9690872,
  'calcium': 0.9683775,
  'plasma': 0.9681946},
 'covid-19': {'covid-19': 1.0000001,
  'coronavirus': 0.99556243,
  'acute respiratory distress syndrome': 0.9885864,
  'fatal': 0.9881703,
  'severe acute respiratory syndrome': 0.97803414,
  'sars-cov-2': 0.9779138,
  'myocarditis': 0.9763925,
  'angiotensin ii receptor antagonist': 0.9763065,
  'cardiac valve injury': 0.9716424,
  'middle east respiratory syndrome': 0.97150135}}

### Node classification

Another downstream task that we would like to perform is node classification. We would like to automatically assign entity types according to their node embeddings. For this we will build predictive models for entity type prediction based on:

- Only node features
- Node2vec embeddings (only structure)
- Attri2vec embeddings (structure and node features)
- GCN Deep Graph Infomax embeddings (structure and node features)

First of all, we split the graph nodes into the train and the test sets.

In [28]:
# Hold out 20% of the nodes for evaluation. A fixed random_state makes the
# split (and therefore the scores reported below) reproducible across runs.
train_nodes, test_nodes = model_selection.train_test_split(
    transformed_graph.nodes(), train_size=0.8, random_state=42)

Now we use the `NodeClassifier` interface to create our classification models. As the base model we will use the linear SVM classifier (`LinearSVC`) provided by `scikit-learn`.

In [29]:
# Baseline: linear SVM on the encoded "features" vectors, trained on the train
# nodes to predict "entity_type", then applied to the held-out test nodes.
features_classifier = NodeClassifier(LinearSVC(), feature_vector_prop="features")
features_classifier.fit(transformed_graph, train_elements=train_nodes, label_prop="entity_type")
features_pred = features_classifier.predict(transformed_graph, predict_elements=test_nodes)

In [30]:
# Same train/predict procedure, using the "node2vec" vectors as input.
node2vec_classifier = NodeClassifier(LinearSVC(), feature_vector_prop="node2vec")
node2vec_classifier.fit(transformed_graph, train_elements=train_nodes, label_prop="entity_type")
node2vec_pred = node2vec_classifier.predict(transformed_graph, predict_elements=test_nodes)

In [31]:
# Same train/predict procedure, using the "attri2vec" vectors as input.
attri2vec_classifier = NodeClassifier(LinearSVC(), feature_vector_prop="attri2vec")
attri2vec_classifier.fit(transformed_graph, train_elements=train_nodes, label_prop="entity_type")
attri2vec_pred = attri2vec_classifier.predict(transformed_graph, predict_elements=test_nodes)



In [32]:
# Same train/predict procedure, using the "gcn_dgi" vectors as input.
gcn_dgi_classifier = NodeClassifier(LinearSVC(), feature_vector_prop="gcn_dgi")
gcn_dgi_classifier.fit(transformed_graph, train_elements=train_nodes, label_prop="entity_type")
gcn_dgi_pred = gcn_dgi_classifier.predict(transformed_graph, predict_elements=test_nodes)

Let us have a look at the scores of different node classification models we have produced.

In [33]:
# Ground-truth entity types of the held-out nodes, in the order of `test_nodes`.
true_labels = transformed_graph._nodes["entity_type"].loc[test_nodes]

In [34]:
get_classification_scores(true_labels, features_pred, multiclass=True)

{'accuracy': 0.53,
 'precision': 0.53,
 'recall': 0.53,
 'f1_score': 0.53,
 'roc_auc_score': 0.7596807989462514}

In [35]:
get_classification_scores(true_labels, node2vec_pred, multiclass=True)

{'accuracy': 0.41,
 'precision': 0.41,
 'recall': 0.41,
 'f1_score': 0.41,
 'roc_auc_score': 0.7038196450707648}

In [36]:
get_classification_scores(true_labels, attri2vec_pred, multiclass=True)

{'accuracy': 0.445,
 'precision': 0.445,
 'recall': 0.445,
 'f1_score': 0.445,
 'roc_auc_score': 0.7206422024278101}

In [37]:
get_classification_scores(true_labels, gcn_dgi_pred, multiclass=True)

{'accuracy': 0.4,
 'precision': 0.4,
 'recall': 0.4,
 'f1_score': 0.4000000000000001,
 'roc_auc_score': 0.7005318509498862}

## Link prediction

Finally, we would like to use the produced node embeddings to predict the existence of edges. This downstream task is formulated as follows: given a pair of nodes and their embedding vectors, is there an edge between these nodes?

As the first step of the edge prediction task we will generate false edges for training (node pairs that don't have edges between them).

In [39]:
# Generate the negative examples for link prediction (described above as node
# pairs that have no edge between them).
false_edges = generate_negative_edges(transformed_graph)

We will now split both true and false edges into training and test sets.

In [40]:
# 80/20 split of the existing edges. A fixed random_state makes the split and
# the link-prediction scores reproducible across runs.
true_train_edges, true_test_edges = model_selection.train_test_split(
    transformed_graph.edges(), train_size=0.8, random_state=42)

In [41]:
# 80/20 split of the negative edges. A fixed random_state makes the split and
# the link-prediction scores reproducible across runs.
false_train_edges, false_test_edges = model_selection.train_test_split(
    false_edges, train_size=0.8, random_state=42)

And, finally, we will use the `EdgePredictor` interface to build our model (using `LinearSVC` as before and the Hadamard product as the binary operator between the embedding vectors for the source and the target nodes).

In [42]:
# Edge classifier: the "node2vec" vectors of an edge's endpoints are combined
# with the "hadamard" operator and passed to a linear SVM; directed=False is
# set because co-occurrence edges have no direction.
model = EdgePredictor(LinearSVC(), feature_vector_prop="node2vec",
                      operator="hadamard", directed=False)
# Train on the true training edges, with the negative training edges as counter-examples.
model.fit(transformed_graph, true_train_edges, negative_samples=false_train_edges)

In [43]:
# Expected labels for the test set: 1 for every true test edge, followed by 0
# for every negative test edge.
# NOTE(review): this rebinds `true_labels`, which previously held the node
# classification labels.
true_labels = np.concatenate((
    np.ones(len(true_test_edges)),
    np.zeros(len(false_test_edges)),
))

In [44]:
# Predict on the true test edges followed by the negative test edges — the same
# order in which the expected labels were assembled in the previous cell.
y_pred = model.predict(transformed_graph, true_test_edges + false_test_edges)

Let us have a look at the obtained scores.

In [45]:
get_classification_scores(true_labels, y_pred)

{'accuracy': 0.7367616371562571,
 'precision': 0.7367616371562571,
 'recall': 0.7367616371562571,
 'f1_score': 0.7367616371562571,
 'roc_auc_score': 0.6447396928370788}

## Creating and saving embedding pipelines

`bluegraph` allows us to create embedding pipelines (using the `EmbeddingPipeline` class) that represent a useful wrapper around a sequence of steps necessary to produce embeddings and compute point similarities. In the example below we create a pipeline for producing `attri2vec` node embeddings and computing their cosine similarity.

We first create an encoder object that will be used in our pipeline as a preprocessing step.

In [3]:
# Preprocessing step of the pipeline: an encoder for the "definition" text
# property, configured like the encoder used earlier in the notebook.
definition_encoder = ScikitLearnPGEncoder(
    node_properties=["definition"], text_encoding_max_dimension=512)

We then create an embedder object.

In [16]:
# Embedding size; the same value is given to the similarity processor when the
# pipeline is assembled.
D = 128
# Keyword arguments forwarded to the attri2vec embedder.
params = dict(
    length=5,
    number_of_walks=10,
    epochs=5,
    embedding_dimension=D,
)
attri2vec_embedder = StellarGraphNodeEmbedder(
    "attri2vec",
    feature_vector_prop="features",
    edge_weight="npmi",
    **params,
)

And finally we create a pipeline object. Note that in the code below we use the `SimilarityProcessor` interface and not `NodeSimilarityProcessor` as we have done previously. We use this lower-level interface because the `EmbeddingPipeline` is designed to work with any embedding model (not only node embedding models).

In [17]:
# Assemble the pipeline: definition encoder -> attri2vec embedder -> cosine
# similarity index. The index dimension D matches the embedding dimension.
attri2vec_pipeline = EmbeddingPipeline(
    preprocessor=definition_encoder,
    embedder=attri2vec_embedder,
    similarity_processor=SimilarityProcessor(similarity="cosine", dimension=D))

We run the fitting process, which given the input data:
1. fits the encoder
2. transforms the data
3. fits the embedder
4. produces the embedding table
5. fits the similarity processor index

In [6]:
# Run the whole fitting sequence listed above on the (untransformed)
# co-occurrence graph; the pipeline applies its own encoder.
attri2vec_pipeline.run_fitting(graph)

link_classification: using 'ip' method to combine node embeddings into edge embeddings


Now we can save our pipeline to the file system.

In [7]:
# Save the fitted pipeline with compress=True; the load cell below reads the
# same path with a ".zip" suffix.
# NOTE(review): the target path points outside this repository
# (../../BlueBrainEmbedder) — confirm it exists on the machine running this.
attri2vec_pipeline.save(
    "../../BlueBrainEmbedder/models/attri2vec_test_model",
    compress=True)

INFO:tensorflow:Assets written to: ../../BlueBrainEmbedder/models/attri2vec_test_model/embedder/model/assets


And we can load the pipeline back into memory:

In [18]:
# Load the pipeline back from the ".zip" archive at the path used by the save cell.
# NOTE(review): embedder_interface / embedder_ext presumably tell the loader
# which class restores the embedder and in what format it was stored —
# confirm against EmbeddingPipeline.load.
pipeline = EmbeddingPipeline.load(
    "../../BlueBrainEmbedder/models/attri2vec_test_model.zip",
    embedder_interface=GraphElementEmbedder,
    embedder_ext="zip")

We can use `retrieve_embeddings` and `get_similar_points` methods of the pipeline object to respectively get embedding vectors and top most similar nodes for the input nodes.

In [22]:
pipeline.retrieve_embeddings(["covid-19", "glucose"])

[[0.029623359441757202,
  0.023537278175354004,
  0.024319171905517578,
  0.024620652198791504,
  0.02817395329475403,
  0.02425360679626465,
  0.024932682514190674,
  0.019140541553497314,
  0.028304308652877808,
  0.02664545178413391,
  0.021070539951324463,
  0.02607104182243347,
  0.02721339464187622,
  0.02756527066230774,
  0.021632134914398193,
  0.0284273624420166,
  0.018631428480148315,
  0.02770298719406128,
  0.022368431091308594,
  0.026864826679229736,
  0.034454673528671265,
  0.02783811092376709,
  0.01671704649925232,
  0.021527260541915894,
  0.028066307306289673,
  0.02377992868423462,
  0.023289978504180908,
  0.032778024673461914,
  0.02355477213859558,
  0.02976474165916443,
  0.023018091917037964,
  0.021784305572509766,
  0.02232244610786438,
  0.022601783275604248,
  0.02480250597000122,
  0.02384042739868164,
  0.02109050750732422,
  0.02449852228164673,
  0.02510356903076172,
  0.02391582727432251,
  0.024091094732284546,
  0.02578127384185791,
  0.0258342921

In [23]:
pipeline.get_similar_points(["covid-19", "glucose"], k=5)

([Index(['covid-19', 'middle east respiratory syndrome',
         'severe acute respiratory syndrome', 'dyspnea', 'asthma'],
        dtype='object', name='@id'),
  Index(['glucose', 'fatigue', 'hydrophobicity', 'pruritus',
         'congenital abnormality'],
        dtype='object', name='@id')],
 array([[1.0000001 , 0.990842  , 0.9890994 , 0.9888878 , 0.98887044],
        [1.        , 0.99028957, 0.9881565 , 0.9877604 , 0.9877604 ]],
       dtype=float32))