In [1]:
import logging
import os

import pandas as pd
import grape
from grape import Graph  # https://github.com/AnacletoLAB/grape, pip install grape

# Show INFO-level messages (the helper functions report progress through logging).
# basicConfig does nothing when the root logger already has handlers, so the
# level is also set on the root logger directly.
root_logger = logging.getLogger()
logging.basicConfig(level=logging.INFO)
root_logger.setLevel(logging.INFO)

In [None]:
# The working directory must be the parent folder of the helper_lib folder.
# In this case the helper_lib folder is at /home/ftorgano/Lab/Test/helper_lib so
# the default below is /home/ftorgano/Lab/Test.
# This should not be needed if the notebook is already executed from the folder
# that contains the helper_lib one.
# If for some reason this doesn't work, you can also simply copy and paste the functions
# you need in the code directly and use those.
#
# To run on another machine without editing the notebook, set the
# HELPER_LIB_PARENT environment variable; when it is unset the original
# hard-coded folder is used.
os.chdir(os.environ.get("HELPER_LIB_PARENT", "/home/ftorgano/Lab/Test"))

In [11]:
# Seed shared by the steps below that accept one
random_state = 42

# Change this to the folder you want to save the holdouts to
holdouts_path = "holdouts_trial/"

# Create the output folder up front; it is fine if it already exists
os.makedirs(holdouts_path, exist_ok=True)

# Loading graph

In [4]:
# nodes csv structure: name, type
df_nodes = pd.read_csv("RNA-KG/nodes.csv")
# edges csv structure: subject, predicate, object
# Only a random subset of 100_000 edges is kept. The sample is seeded with
# random_state so that re-running the notebook rebuilds the same graph
# (an unseeded sample would change every holdout on every run).
df_edges = pd.read_csv("RNA-KG/edges_reduced_weight_degree.csv").sample(
    100_000, random_state=random_state
)

# Build an undirected graph from the two dataframes. Node types are not loaded
# (node_type_column is left commented out).
RNAKG_reduced = Graph.from_pd(
    nodes_df=df_nodes,
    node_name_column="name",
    # node_type_column="type",
    edges_df=df_edges,
    edge_src_column="subject",
    edge_dst_column="object",
    directed=False,
    name="RNA-KG_Reduced",
)
# The dataframes are no longer needed once the graph is built
del df_nodes, df_edges

# Generating holdouts

In [5]:
from helper_lib import predict

## Generate the negative graph

In [6]:
# Always pass the full graph as positive_graph to avoid generating false negatives.
negative_graph = predict.generate_negative_graph(
    positive_graph=RNAKG_reduced,
    random_state=random_state,
    train_size=0.7,
    # the negative train set size will be 1x the size of positive training set
    training_unbalance_rate=1,
    # the negative test set size will be 3x the size of positive testing set
    testing_unbalance_rate=3,
    # The pair to predict; can be omitted if you don't want to predict a specific pair
    # pair_to_predict=("miRNA", "Disease"),
)

## Generate the holdouts

In [8]:
# How many independent train/test splits to produce
number_of_holdouts = 2

# NOTE(review): train_size and training_unbalance_rate presumably have to match
# the values given to generate_negative_graph — confirm against helper_lib.
holdouts = predict.generate_holdouts(
    number_of_holdouts=number_of_holdouts,
    positive_graph=RNAKG_reduced,
    negative_graph=negative_graph,
    seed=random_state,
    train_size=0.7,
    training_unbalance_rate=1,
    # pair_to_predict=("miRNA", "Disease"),
)

INFO:root:Generating holdout 1/2
INFO:root:Generating holdout 2/2


In [9]:
# Report the number of edges of every graph contained in each holdout
for i, holdout in enumerate(holdouts):
    print(f"Holdout {i}")
    edge_counts = {
        key: holdout[key].get_number_of_edges() for key in holdout.keys()
    }
    for key, edge_count in edge_counts.items():
        print(f"{key}: {edge_count}")

Holdout 0
embedding_graph: 70002
positive_train_graph: 70002
positive_test_graph: 29998
negative_train_graph: 70002
negative_test_graph: 90002
Holdout 1
embedding_graph: 70002
positive_train_graph: 70002
positive_test_graph: 29998
negative_train_graph: 70000
negative_test_graph: 90004


# Exporting the embedding graph to generate embeddings

In [12]:
for i, holdout in enumerate(holdouts):
    # These files can get big, if the original graph is also big, so watch out for the file size
    # The node names should not contain the separator, otherwise the csv will be corrupted
    # this is why I'm using a different separator in this example
    # By default this creates a csv with a subject and object column separated by the separator
    #
    # os.path.join is used so the export also works when holdouts_path is set
    # without a trailing path separator.
    embedding_edges_file = os.path.join(holdouts_path, f"embedding_edges_{i}.csv")
    holdout["embedding_graph"].dump_edges(embedding_edges_file, separator="\t")

In [None]:
# holdout["embedding_graph"].dump_nodes(
#     "nodes.csv", node_types_column_number=1, node_type_column="node_type"
# )

# Generate embeddings externally

In [None]:
### THIS IS PLACEHOLDER CODE TO TEST THAT LOADING EXTERNAL EMBEDDINGS WORKS
### This should be either removed or replaced with the code to generate the embeddings

from grape.embedders import FirstOrderLINEEnsmallen

embedder = FirstOrderLINEEnsmallen(embedding_size=10, random_state=random_state)

# The embeddings are written to ./embeddings/, which nothing else in the
# notebook creates: make sure it exists before the first write.
os.makedirs("./embeddings", exist_ok=True)

embeddings = []
for i, holdout in enumerate(holdouts):
    # Only the embedding graph of the holdout is used to compute the embedding
    embedding = predict.generate_embedding(
        holdout["embedding_graph"],
        embedder=embedder,
        cache_embedding_externally=False,  # not implemented yet
    )
    # NOTE(review): the same object is later queried with
    # get_all_node_embedding(); confirm that it also exposes to_csv.
    embedding.to_csv(f"./embeddings/node_embeddings_{i}.csv")
    embeddings.append(embedding)

INFO:root:Training embedding on the train graph
INFO:root:Embedding time:0.82
INFO:root:Training embedding on the train graph
INFO:root:Embedding time:0.69


In [21]:
# Display the first entry returned by get_all_node_embedding() for holdout 0
first_holdout_embedding = embeddings[0]
first_holdout_embedding.get_all_node_embedding()[0]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
http://www.ncbi.nlm.nih.gov/gene/80017,0.530862,-0.560381,-0.546650,-0.268646,0.347209,-0.194220,0.029438,-0.683387,-0.014384,-0.530488
http://bigdata.ibp.ac.cn/piRBase/pirna.php?name=piR-hsa-1947535,-0.208578,0.039666,0.543555,-0.591321,-0.241295,0.455584,0.253224,-0.251362,-0.007537,0.357383
http://purl.obolibrary.org/obo/VO_0005298,-0.570191,-0.396584,-0.287045,-0.361203,0.427845,0.223473,-0.382828,-0.241861,0.647575,-0.398197
http://purl.obolibrary.org/obo/CLO_0011234,0.630660,-0.541423,0.701572,-0.604703,-0.691136,-0.691984,0.474265,0.401452,0.250928,-0.331634
http://purl.obolibrary.org/obo/CHEBI_32751,0.175113,-0.561145,0.354800,-0.624407,0.722151,0.465386,0.562974,-0.515261,-0.527366,-0.352689
...,...,...,...,...,...,...,...,...,...,...
http://purl.obolibrary.org/obo/CHEBI_149642,0.371546,0.029397,0.036439,0.329943,0.510688,0.124528,-0.342944,0.097791,0.454574,0.435790
http://purl.obolibrary.org/obo/PR_000069022,0.390182,-0.324938,0.395822,0.120324,0.588265,-0.008517,-0.521332,0.170826,-0.396744,0.566556
http://bigdata.ibp.ac.cn/SmProt/SmProt.php?ID=SPROHSA106243,-0.647232,0.165922,0.537871,-0.036306,-0.425600,-0.179171,-0.659531,0.459182,-0.521874,-0.272385
http://purl.obolibrary.org/obo/PR_P55196-3,0.579996,-0.708331,0.384084,-0.537364,-0.565143,0.301206,0.124172,-0.080553,-0.340590,0.642017


# Load external embeddings

In [14]:
def load_node_embeddings(folder: str, filename: str):
    """Load node embeddings stored as ``<folder>/<filename>.csv``.

    Parameters
    ----------
    folder:
        Folder containing the csv file. A trailing path separator is optional.
    filename:
        Name of the csv file without the ``.csv`` extension. It is also used
        as the embedding method name of the returned object.

    Returns
    -------
    A ``grape.EmbeddingResult`` wrapping the loaded dataframe as its only
    node embedding.
    """
    # index_col=0 sets the index of the df to the node names; this is
    # important, otherwise it errors out
    csv_path = os.path.join(folder, f"{filename}.csv")
    df_embedding = pd.read_csv(csv_path, index_col=0)

    embedding = grape.EmbeddingResult(
        embedding_method_name=filename, node_embeddings=[df_embedding]
    )

    return embedding

In [15]:
# One EmbeddingResult per holdout, read back from the csv files in ./embeddings/
imported_node_embeddings = [
    load_node_embeddings("./embeddings/", f"node_embeddings_{holdout_index}")
    for holdout_index in range(number_of_holdouts)
]

# Combine grape graph and node embeddings to export train and test sets

In [16]:
def export_examples_to_parquet(
    examples,
    examples_output_path: str,
    examples_output_filename: str,
    parquet_compression="brotli",
):
    """Export edge embeddings to a parquet file so they can be used as model input.

    The exported rows correspond to the x values of the training/testing set.
    The y values (simply 0 for negative examples and 1 for positive examples)
    are not exported, to save space; generate them as needed or add them here
    if you need them.

    Parameters
    ----------
    examples:
        The examples to export; anything accepted by ``pd.DataFrame``.
    examples_output_path:
        Folder the file is written to. It is created if missing. A trailing
        path separator is optional; an empty string means the current folder.
    examples_output_filename:
        Name of the parquet file, extension included.
    parquet_compression:
        Compression passed to ``DataFrame.to_parquet``.
    """
    # We usually convert them to a dataframe and export them to parquet to save space,
    # but you can also save them as whatever format you want
    df_examples = pd.DataFrame(examples)
    # os.makedirs("") raises, so only create the folder when one is given
    if examples_output_path:
        os.makedirs(examples_output_path, exist_ok=True)
    df_examples.to_parquet(
        os.path.join(examples_output_path, examples_output_filename),
        compression=parquet_compression,
    )

In [17]:
# The currently implemented methods are Concatenate and Hadamard
node_embeddings_concatenation_method = "Concatenate"

# The embedding graph is left out on purpose: we shouldn't train our models on it
keys_to_export = [
    "positive_train_graph",
    "positive_test_graph",
    "negative_train_graph",
    "negative_test_graph",
]

for i in range(number_of_holdouts):
    node_embeddings = imported_node_embeddings[i]
    holdout = holdouts[i]

    for key in keys_to_export:
        # Edges are considered bidirectional if the graph is undirected, adjust as needed.
        # Since we are using concatenation, treat_edges_as_bidirectional should be set to True
        edge_embeddings = predict.graph_to_edge_embeddings(
            graph=holdout[key],
            embedding=node_embeddings,
            treat_edges_as_bidirectional=True,
            node_embeddings_concatenation_method=node_embeddings_concatenation_method,
        )

        # One parquet file (basically fancy compressed csv) per graph and holdout,
        # with shape (#edgesInGraph) x (2*#features)
        # (2*#features since we are concatenating node embeddings to obtain edge embeddings)
        output_filename = f"{key}_edge_embeddings_{i}.parquet"
        export_examples_to_parquet(
            examples=edge_embeddings,
            examples_output_path=holdouts_path,
            examples_output_filename=output_filename,
        )

INFO:root:Graph is undirected, extracting directed edge embeddings


INFO:root:Number of edges: 139990
INFO:root:Embedding extraction time: 0.79
INFO:root:Graph is undirected, extracting directed edge embeddings
INFO:root:Number of edges: 59996
INFO:root:Embedding extraction time: 0.19
INFO:root:Graph is undirected, extracting directed edge embeddings
INFO:root:Number of edges: 139990
INFO:root:Embedding extraction time: 0.40
INFO:root:Graph is undirected, extracting directed edge embeddings
INFO:root:Number of edges: 179988
INFO:root:Embedding extraction time: 0.75
INFO:root:Graph is undirected, extracting directed edge embeddings
INFO:root:Number of edges: 139990
INFO:root:Embedding extraction time: 0.52
INFO:root:Graph is undirected, extracting directed edge embeddings
INFO:root:Number of edges: 59996
INFO:root:Embedding extraction time: 0.17
INFO:root:Graph is undirected, extracting directed edge embeddings
INFO:root:Number of edges: 139990
INFO:root:Embedding extraction time: 0.72
INFO:root:Graph is undirected, extracting directed edge embeddings
I

The exported files can be used as training and testing sets for edge prediction, which is now a simple binary classification task (label 1 for edges from the positive graphs, label 0 for edges from the negative graphs):

(positive_train_graph_edge_embeddings file + negative_train_graph_edge_embeddings file) = train set

(positive_test_graph_edge_embeddings file + negative_test_graph_edge_embeddings file) = test set

In [18]:
# Sanity check: read back the positive test examples exported for holdout 0.
# os.path.join keeps this working when holdouts_path has no trailing separator.
pd.read_parquet(
    os.path.join(holdouts_path, "positive_test_graph_edge_embeddings_0.parquet")
)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,-0.806920,0.062056,-0.206826,-0.035563,-0.094563,-0.261423,0.077907,-0.175314,0.073117,-0.426094,0.048928,0.377359,0.328561,0.028090,0.802782,0.045007,-0.188311,-0.184459,-0.173846,-0.016656
1,-0.806920,0.062056,-0.206826,-0.035563,-0.094563,-0.261423,0.077907,-0.175314,0.073117,-0.426094,0.487222,0.334561,0.154995,0.450544,0.058697,-0.286533,-0.119449,0.181404,0.410793,-0.349573
2,-0.806920,0.062056,-0.206826,-0.035563,-0.094563,-0.261423,0.077907,-0.175314,0.073117,-0.426094,-0.286262,-0.504342,0.228032,-0.222679,0.006809,-0.022498,-0.025316,0.243179,0.446422,0.549983
3,-0.806920,0.062056,-0.206826,-0.035563,-0.094563,-0.261423,0.077907,-0.175314,0.073117,-0.426094,-0.149182,0.154333,0.775750,-0.268467,-0.137775,0.150468,-0.426213,-0.021402,0.152033,-0.182229
4,-0.806920,0.062056,-0.206826,-0.035563,-0.094563,-0.261423,0.077907,-0.175314,0.073117,-0.426094,-0.294812,0.566878,0.464734,0.131198,0.101751,0.188239,0.144733,0.117530,0.451143,0.272889
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59991,0.112412,-0.522191,0.134784,-0.095157,0.570256,-0.252883,0.348185,0.338896,0.046346,0.245133,0.282569,-0.318671,-0.339753,-0.440880,0.108045,-0.280387,-0.278693,-0.102189,0.461030,0.343290
59992,-0.115788,0.397033,-0.331055,0.470426,0.157614,-0.249525,-0.081765,0.566293,-0.285589,-0.044962,-0.198160,-0.132650,0.021006,-0.828258,-0.115931,0.097759,-0.387358,-0.216924,-0.152456,0.115525
59993,-0.115788,0.397033,-0.331055,0.470426,0.157614,-0.249525,-0.081765,0.566293,-0.285589,-0.044962,-0.232784,0.109959,0.525203,-0.316771,-0.493267,-0.412713,-0.011535,0.105518,0.223014,0.287916
59994,-0.115788,0.397033,-0.331055,0.470426,0.157614,-0.249525,-0.081765,0.566293,-0.285589,-0.044962,-0.186139,-0.198390,0.614144,0.532921,0.099838,-0.308250,-0.095671,0.337069,-0.184902,-0.053594
