In [1]:
import pandas as pd

from bluegraph.core import PandasPGFrame
from bluegraph.preprocess.generators import CooccurrenceGenerator
from bluegraph.preprocess.encoders import ScikitLearnPGEncoder

## Data preparation

First, we read the source dataset with mentions of entities in different paragraphs.

In [2]:
# Load the dataset of entity mentions in paragraphs.
mentions = pd.read_csv(filepath_or_buffer="data/labeled_entity_occurrence.csv")

In [3]:
# Extract unique paper/section/paragraph identifiers.
# The "occurrence" column is renamed to "paragraph" for the rest of the notebook.
mentions = mentions.rename(columns=dict(occurrence="paragraph"))
# Number of distinct paragraph identifiers (a missing value, if present, counts once).
number_of_paragraphs = mentions["paragraph"].nunique(dropna=False)

In [4]:
# Preview the mentions table after the column rename.
mentions

Unnamed: 0,entity,paragraph
0,lithostathine-1-alpha,1
1,pulmonary,1
2,host,1
3,lithostathine-1-alpha,2
4,surfactant protein d measurement,2
...,...,...
2281346,covid-19,227822
2281347,covid-19,227822
2281348,viral infection,227823
2281349,lipid,227823


We will also load a dataset that contains definitions of entities and their types

In [5]:
# Load the table of entity types and definitions.
entity_data = pd.read_csv(filepath_or_buffer="data/entity_types_defs.csv")

In [6]:
# Preview the entity table (columns: entity, entity_type, definition).
entity_data

Unnamed: 0,entity,entity_type,definition
0,(e3-independent) e2 ubiquitin-conjugating enzyme,PROTEIN,(E3-independent) E2 ubiquitin-conjugating enzy...
1,(h115d)vhl35 peptide,CHEMICAL,A peptide vaccine derived from the von Hippel-...
2,"1,1-dimethylhydrazine",DRUG,"A clear, colorless, flammable, hygroscopic liq..."
3,"1,2-dimethylhydrazine",CHEMICAL,A compound used experimentally to induce tumor...
4,"1,25-dihydroxyvitamin d(3) 24-hydroxylase, mit...",PROTEIN,"1,25-dihydroxyvitamin D(3) 24-hydroxylase, mit..."
...,...,...,...
28127,zygomycosis,DISEASE,Any infection due to a fungus of the Zygomycot...
28128,zygomycota,ORGANISM,A phylum of fungi that are characterized by ve...
28129,zygosity,ORGANISM,"The genetic condition of a zygote, especially ..."
28130,zygote,CELL_COMPARTMENT,"The cell formed by the union of two gametes, e..."


### Generation of a co-occurrence graph

We first create a graph whose nodes are entities

In [7]:
# Every distinct mentioned entity becomes a node of type "Entity".
entity_nodes = mentions["entity"].unique()
graph = PandasPGFrame()
graph.add_nodes(entity_nodes)
graph.add_node_types(dict.fromkeys(entity_nodes, "Entity"))

# Index the entity table by entity name, exposed under the "@id" index name,
# then attach the type (categorical) and the definition (text) as node properties.
entity_props = entity_data.set_index("entity").rename_axis(index="@id")
graph.add_node_properties(
    entity_props["entity_type"], prop_type="category")
graph.add_node_properties(
    entity_props["definition"], prop_type="text")

In [8]:
# For each entity, collect the set of paragraphs in which it is mentioned
# and attach it to the nodes as the "paragraphs" property.
paragraph_prop = (
    mentions.groupby("entity")["paragraph"]
    .aggregate(set)
    .to_frame(name="paragraphs")
)
graph.add_node_properties(paragraph_prop, prop_type="category")

For each node we will add the `frequency` property that counts the total number of paragraphs where the entity was mentioned.

In [9]:
# Frequency = size of each node's paragraph set.
# NOTE(review): this reads the private `_nodes` frame; `graph.nodes(raw_frame=True)`
# (used elsewhere in this notebook) appears to expose the same table — confirm before switching.
frequencies = graph._nodes["paragraphs"].apply(len).rename("frequency")
graph.add_node_properties(frequencies)

In [10]:
# Show the node table with all properties added so far.
graph.nodes(raw_frame=True)

Unnamed: 0_level_0,@type,entity_type,definition,paragraphs,frequency
@id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
lithostathine-1-alpha,Entity,PROTEIN,"Lithostathine-1-alpha (166 aa, ~19 kDa) is enc...","{1, 2, 3, 18178, 195589, 104454, 88967, 104455...",80
pulmonary,Entity,ORGAN,Relating to the lungs as the intended site of ...,"{1, 196612, 196613, 196614, 196621, 196623, 16...",8295
host,Entity,ORGANISM,An organism that nourishes and supports anothe...,"{1, 114689, 3, 221193, 180243, 180247, 28, 180...",2660
surfactant protein d measurement,Entity,PROTEIN,The determination of the amount of surfactant ...,"{145537, 2, 3, 4, 5, 6, 51202, 103939, 103940,...",268
communication response,Entity,PATHWAY,A statement (either spoken or written) that is...,"{46592, 64000, 2, 28162, 166912, 226304, 88585...",160
...,...,...,...,...,...
drug binding site,Entity,PATHWAY,The reactive parts of a macromolecule that dir...,"{225082, 225079}",2
carbaril,Entity,CHEMICAL,A synthetic carbamate acetylcholinesterase inh...,"{225408, 225409, 225415, 225419, 225397}",5
ny-eso-1 positive tumor cells present,Entity,CELL_TYPE,An indication that Cancer/Testis Antigen 1 exp...,"{225544, 226996}",2
mustelidae,Entity,ORGANISM,Taxonomic family which includes the Ferret.,"{225901, 225903}",2


Now, to construct the co-occurrence network, we select only the 1000 most frequent entities.

In [11]:
# Identifiers of the 1000 nodes with the highest "frequency" value.
nodes_to_include = graph._nodes.nlargest(n=1000, columns="frequency").index

In [12]:
# Inspect the selected node identifiers.
nodes_to_include

Index(['covid-19', 'blood', 'human', 'infectious disorder', 'heart',
       'diabetes mellitus', 'lung', 'sars-cov-2', 'mouse', 'pulmonary',
       ...
       'wheezing', 'chief complaint', 'azathioprine', 'ileum', 'hematology',
       'nonalcoholic steatohepatitis', 'nervous system disorder',
       'renal impairment', 'urticaria', 'rectum'],
      dtype='object', name='@id', length=1000)

The `CooccurrenceGenerator` class allows us to generate co-occurrence edges from overlaps in node property values or edges (or edge properties). In this case we consider the `paragraphs` node property and construct co-occurrence edges from overlapping sets of paragraphs. In addition, we will compute some co-occurrence statistics: total co-occurrence frequency and normalized pointwise mutual information (NPMI).

In [13]:
%%time
generator = CooccurrenceGenerator(graph.subgraph(nodes=nodes_to_include))
paragraph_cooccurrence_edges = generator.generate_from_nodes(
    "paragraphs", total_factor_instances=number_of_paragraphs,
    compute_statistics=["frequency", "npmi"],
    parallelize=True, cores=8)

Examining 499500 pairs of terms for co-occurrence...
CPU times: user 7.37 s, sys: 2.18 s, total: 9.54 s
Wall time: 1min 17s


In [14]:
# Use the mean NPMI over all generated edges as the pruning threshold.
cutoff = paragraph_cooccurrence_edges["npmi"].mean()

In [15]:
# Keep only the edges whose NPMI is strictly above the cutoff.
paragraph_cooccurrence_edges = paragraph_cooccurrence_edges.loc[
    lambda edges: edges["npmi"] > cutoff
]

We add generated edges to the original graph

In [16]:
# Install the filtered co-occurrence edges by assigning the graph's private
# `_edges` frame directly.
# NOTE(review): this bypasses any public edge-adding API — confirm this is the intended usage.
graph._edges = paragraph_cooccurrence_edges
# Register both statistics as numeric edge properties (per the method name).
graph.edge_prop_as_numeric("frequency")
graph.edge_prop_as_numeric("npmi")

In [17]:
# Show the edge table with the co-occurrence statistics.
graph.edges(raw_frame=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,common_factors,frequency,npmi
@source_id,@target_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
surfactant protein d measurement,microorganism,"{2, 3, 7810, 58, 41, 7754, 7850, 26218, 7853, ...",19,0.235263
surfactant protein d measurement,lung,"{2, 103939, 51202, 5, 4, 103940, 15, 145438, 3...",93,0.221395
surfactant protein d measurement,alveolar,"{223872, 2, 51202, 100502, 7831, 149657, 19522...",25,0.336175
surfactant protein d measurement,epithelial cell,"{2, 4, 5, 222298, 7825, 7732, 7733, 169174, 7738}",9,0.175923
surfactant protein d measurement,molecule,"{2, 7750, 49991, 134504, 206448, 49, 52, 20645...",10,0.113611
...,...,...,...,...
severe acute respiratory syndrome,caax prenyl protease 2,"{205345, 185829, 227486, 220124, 220126}",5,0.142611
severe acute respiratory syndrome,transmembrane protease serine 2,"{223746, 223747, 167301, 223752, 200971, 22375...",21,0.238160
chloroquine,severe acute respiratory syndrome,"{205345, 170212, 227238, 195047, 172167, 21501...",11,0.160524
chloroquine,caax prenyl protease 2,"{201001, 205345, 225292, 198655}",4,0.159462


Recall that we have generated edges only for the 1000 most frequent entities; the rest of the entities will be isolated (having no incident edges). Let us remove all the isolated nodes.

In [26]:
# Drop every node that has no incident edge after the co-occurrence step.
graph.remove_isolated_nodes()

Next, we can save the generated co-occurrence graph (the export and reload cells below are left commented out).

In [27]:
# Optional export of the node and edge tables to CSV (left disabled).
# graph.to_csv("data/graph_nodes.csv", "data/graph_edges.csv",)

In [28]:
# Optional reload of the graph from the CSV files named in the disabled export
# cell, declaring the type of every node and edge property (left disabled).
# NOTE(review): "ppmi" is listed although only the "frequency" and "npmi"
# statistics are computed in this notebook — confirm whether it is still needed.
# graph = PandasPGFrame.from_csv(
#     "data/graph_nodes.csv", "data/graph_edges.csv",
#     node_property_types={
#         "@type": "category",
#         "entity_type": "category",
#         "definition": "text",
#         "paragraphs": "category",
#         "frequency": "numeric"
#     },
#     edge_property_types={
#         "common_factors": "category",
#         "frequency": "numeric",
#         "ppmi": "numeric",
#         "npmi": "numeric"
#     })

### Node feature extraction

We extract node features from entity definitions using the `tfidf` model.

In [29]:
# Property encoder; text encodings are capped at 512 dimensions (per the argument name).
encoder = ScikitLearnPGEncoder(text_encoding_max_dimension=512)

In [30]:
%%time
# Fit the encoder on the graph and encode only the "definition" node property;
# no edge properties are encoded.
transformed_graph = encoder.fit_transform(
    graph, node_properties=["definition"], edge_properties=None)

CPU times: user 398 ms, sys: 9.82 ms, total: 408 ms
Wall time: 407 ms


In [31]:
# Peek at the first ten terms of the vocabulary learned for the "definition" property.
vocabulary = encoder._node_encoders["definition"].vocabulary_
list(vocabulary)[:10]

['relating',
 'lungs',
 'site',
 'administration',
 'product',
 'usually',
 'action',
 'lower',
 'respiratory',
 'tract']

We will add additional properties to our transformed graph corresponding to the entity type labels. We will also add NPMI as an edge property to this transformed graph.

In [32]:
# Copy the entity type labels and the NPMI edge values from the original graph
# onto the encoded graph.
# NOTE(review): the output recorded for this cell is a KeyError listing node ids
# as missing columns; which call raises it cannot be told from the notebook
# alone — re-run on a fresh kernel and check against the installed bluegraph version.
transformed_graph.add_node_properties(
    graph.get_node_property_values("entity_type"))
transformed_graph.add_edge_properties(
    graph.get_edge_property_values("npmi"), prop_type="numeric")

KeyError: "None of [Index(['pulmonary', 'host', 'surfactant protein d measurement',\n       'microorganism', 'lung', 'alveolar', 'epithelial cell', 'mucosa',\n       'gastrointestinal tract', 'molecule',\n       ...\n       'brain natriuretic peptide measurement', 'chloroquine',\n       'autoimmune encephalitis', 'oxacillin',\n       'severe acute respiratory syndrome', 'candida parapsilosis',\n       'ciliated bronchial epithelial cell',\n       'cystic fibrosis pulmonary exacerbation', 'caax prenyl protease 2',\n       'transmembrane protease serine 2'],\n      dtype='object', length=1000)] are in the [columns]"

In [None]:
# Show the node table of the encoded graph.
transformed_graph.nodes(raw_frame=True)

## Node embedding and downstream tasks

### Node embedding using StellarGraph

Using `StellarGraphNodeEmbedder` we construct three different embeddings of our transformed graph.

In [None]:
from bluegraph.backends.stellargraph import StellarGraphNodeEmbedder

In [None]:
# node2vec model using the "npmi" edge property as the edge weight.
embedder = StellarGraphNodeEmbedder(
    "node2vec",
    edge_weight="npmi",
    embedding_dimension=10,
    length=10,
    number_of_walks=20,
)
node2vec_embedding = embedder.fit_model(transformed_graph)

In [None]:
# attri2vec model fed with the "features" node vectors and "npmi" edge weights.
embedder = StellarGraphNodeEmbedder(
    "attri2vec",
    feature_vector_prop="features",
    length=5,
    number_of_walks=10,
    epochs=10,
    embedding_dimension=128,
    edge_weight="npmi",
)
attri2vec_embedding = embedder.fit_model(transformed_graph)

In [None]:
# GCN Deep Graph Infomax model fed with the "features" node vectors.
embedder = StellarGraphNodeEmbedder(
    "gcn_dgi",
    feature_vector_prop="features",
    epochs=250,
    embedding_dimension=512,
)
gcn_dgi_embedding = embedder.fit_model(transformed_graph)

In [None]:
# Store the node2vec vectors on the graph under the "node2vec" property.
transformed_graph.add_node_properties(
    node2vec_embedding.rename(columns=dict(embedding="node2vec")))

In [None]:
# Store the attri2vec vectors on the graph under the "attri2vec" property.
transformed_graph.add_node_properties(
    attri2vec_embedding.rename(columns=dict(embedding="attri2vec")))

In [None]:
# Store the GCN-DGI vectors on the graph under the "gcn_dgi" property.
transformed_graph.add_node_properties(
    gcn_dgi_embedding.rename(columns=dict(embedding="gcn_dgi")))

In [None]:
# Show the node table, now including the three embedding properties.
transformed_graph.nodes(raw_frame=True)

### Node similarity

In [None]:
import numpy as np

from bluegraph.downstream.similarity import NodeSimilarityProcessor

In [None]:
# Two similarity indexes over the "node2vec" vectors: one with the processor's
# default similarity (presumably L2 distance, per the variable name — confirm)
# and one with cosine similarity.
node2vec_l2 = NodeSimilarityProcessor(transformed_graph, "node2vec")
node2vec_cosine = NodeSimilarityProcessor(
    transformed_graph, "node2vec", similarity="cosine")

In [None]:
# Ten most similar nodes to "glucose" and to "covid-19" (node2vec, default similarity).
node2vec_l2.get_similar_nodes(["glucose", "covid-19"], k=10)

In [None]:
# Ten most similar nodes to "glucose" and to "covid-19" (node2vec, cosine similarity).
node2vec_cosine.get_similar_nodes(["glucose", "covid-19"], k=10)

In [None]:
# Two similarity indexes over the "attri2vec" vectors: one with the processor's
# default similarity (presumably L2 distance, per the variable name — confirm)
# and one with cosine similarity.
attri2vec_l2 = NodeSimilarityProcessor(transformed_graph, "attri2vec")
attri2vec_cosine = NodeSimilarityProcessor(
    transformed_graph, "attri2vec", similarity="cosine")

In [None]:
# Ten most similar nodes to "glucose" and to "covid-19" (attri2vec, default similarity).
attri2vec_l2.get_similar_nodes(["glucose", "covid-19"], k=10)

In [None]:
# Ten most similar nodes to "glucose" and to "covid-19" (attri2vec, cosine similarity).
attri2vec_cosine.get_similar_nodes(["glucose", "covid-19"], k=10)

In [None]:
# Two similarity indexes over the "gcn_dgi" vectors: one with the processor's
# default similarity (presumably L2 distance, per the variable name — confirm)
# and one with cosine similarity.
gcn_l2 = NodeSimilarityProcessor(transformed_graph, "gcn_dgi")
gcn_cosine = NodeSimilarityProcessor(
    transformed_graph, "gcn_dgi", similarity="cosine")

In [None]:
# Ten most similar nodes to "glucose" and to "covid-19" (GCN-DGI, default similarity).
gcn_l2.get_similar_nodes(["glucose", "covid-19"], k=10)

In [None]:
# Ten most similar nodes to "glucose" and to "covid-19" (GCN-DGI, cosine similarity).
gcn_cosine.get_similar_nodes(["glucose", "covid-19"], k=10)

### Node classification

We will build a predictive model for entity type prediction based on:

- Only node features
- Node2vec embeddings (only structure)
- Attri2vec embeddings (structure and node features)
- GCN Deep Graph Infomax embeddings (structure and node features)

### Splitting the graph into train/test sets

In [None]:
from bluegraph.downstream.node_classification import NodeClassifier

from sklearn import model_selection
from sklearn.svm import LinearSVC

Split the graph nodes into the train and test sets.

In [None]:
# Hold out 20% of the nodes for evaluation. Fixing `random_state` makes the
# split itself reproducible across kernel restarts.
train_nodes, test_nodes = model_selection.train_test_split(
    transformed_graph.nodes(), train_size=0.8, random_state=42)

In [None]:
# Entity type prediction from the "features" node vectors only.
features_classifier = NodeClassifier(
    LinearSVC(), feature_vector_prop="features")
features_classifier.fit(
    transformed_graph,
    train_elements=train_nodes,
    label_prop="entity_type",
)
pred_y = features_classifier.predict(
    transformed_graph, predict_elements=test_nodes)
# Accuracy: share of test nodes whose stored "entity_type" equals the prediction.
accuracy = (
    transformed_graph._nodes.loc[test_nodes, "entity_type"] == pred_y
).mean()
print(accuracy)

In [None]:
# Entity type prediction from the "node2vec" embedding vectors.
node2vec_classifier = NodeClassifier(
    LinearSVC(), feature_vector_prop="node2vec")
node2vec_classifier.fit(
    transformed_graph,
    train_elements=train_nodes,
    label_prop="entity_type",
)
pred_y = node2vec_classifier.predict(
    transformed_graph, predict_elements=test_nodes)
# Accuracy: share of test nodes whose stored "entity_type" equals the prediction.
accuracy = (
    transformed_graph._nodes.loc[test_nodes, "entity_type"] == pred_y
).mean()
print(accuracy)

In [None]:
# Entity type prediction from the "attri2vec" embedding vectors.
attri2vec_classifier = NodeClassifier(
    LinearSVC(), feature_vector_prop="attri2vec")
attri2vec_classifier.fit(
    transformed_graph,
    train_elements=train_nodes,
    label_prop="entity_type",
)
pred_y = attri2vec_classifier.predict(
    transformed_graph, predict_elements=test_nodes)
# Accuracy: share of test nodes whose stored "entity_type" equals the prediction.
accuracy = (
    transformed_graph._nodes.loc[test_nodes, "entity_type"] == pred_y
).mean()
print(accuracy)

In [None]:
# Entity type prediction from the "gcn_dgi" embedding vectors.
gcn_dgi_classifier = NodeClassifier(
    LinearSVC(), feature_vector_prop="gcn_dgi")
gcn_dgi_classifier.fit(
    transformed_graph,
    train_elements=train_nodes,
    label_prop="entity_type",
)
pred_y = gcn_dgi_classifier.predict(
    transformed_graph, predict_elements=test_nodes)
# Accuracy: share of test nodes whose stored "entity_type" equals the prediction.
accuracy = (
    transformed_graph._nodes.loc[test_nodes, "entity_type"] == pred_y
).mean()
print(accuracy)

## Link prediction

In [None]:
from bluegraph.downstream.link_prediction import (generate_negative_edges,
                                                  EdgePredictor)

In [None]:
# Generate negative examples (edges used as the "no link" class) for the graph.
# NOTE(review): presumably sampled at random — check whether a seed can be passed
# to make this step reproducible.
false_edges = generate_negative_edges(transformed_graph)

In [None]:
# Hold out 20% of the existing edges for evaluation. Fixing `random_state`
# makes the split itself reproducible across kernel restarts.
true_train_edges, true_test_edges = model_selection.train_test_split(
    transformed_graph.edges(), train_size=0.8, random_state=42)

In [None]:
# Hold out 20% of the negative edges for evaluation. Fixing `random_state`
# makes the split itself reproducible across kernel restarts.
false_train_edges, false_test_edges = model_selection.train_test_split(
    false_edges, train_size=0.8, random_state=42)

In [None]:
# Link predictor over "node2vec" vectors combined with the "hadamard" operator,
# trained on the existing training edges plus the negative training edges.
model = EdgePredictor(
    LinearSVC(),
    feature_vector_prop="node2vec",
    operator="hadamard",
    directed=False,
)
model.fit(
    transformed_graph,
    true_train_edges,
    negative_samples=false_train_edges,
)

In [None]:
y_pred = model.predict(transformed_graph, true_train_edges + false_train_edges)

In [None]:
true_labels = np.hstack([
    np.ones(len(true_train_edges)),
    np.zeros(len(false_train_edges))])

In [None]:
sum(y_pred == true_labels) / len(y_pred)