In [None]:
import json
import logging
import os

# Read the run configuration; `working_dir` is the root of every path used below.
with open('config.json', 'r') as config_file:
    config = json.load(config_file)
path = config['working_dir']

# Embeddings are read from <working_dir>/output_embedding and results are
# written to its `results` sub-folder.
input_dir = os.path.join(path,'output_embedding')
print('input_dir:',input_dir)
output_dir = os.path.join(path,'output_embedding/results')
print('output_dir:',output_dir)

# Make sure both folders exist before anything is read or written.
for directory in (input_dir, output_dir):
    os.makedirs(directory, exist_ok=True)

# All log records (DEBUG and above) go to a file inside the results folder.
logging.basicConfig(
    filename=os.path.join(output_dir,'output_prediction.log'),
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.DEBUG,
)

# Load View

In [None]:
from grape import Graph
import pandas as pd

# Locations of the node and edge tables come from the configuration file.
nodes_df_path = config['nodes_file_path']
print('nodes_df_path:',nodes_df_path)
edges_df_path = config['edges_file_path']
print('edges_df_path:',edges_df_path)

# Both tables are tab-separated.
nodes_df = pd.read_csv(nodes_df_path, sep="\t")
edges_df = pd.read_csv(edges_df_path, sep="\t")

# Build the same view twice from the same tables: first directed, then undirected.
# The two graphs differ only in the `directed` flag.
view_directed, view_undirected = (
    Graph.from_pd(
        edges_df=edges_df,
        nodes_df=nodes_df,
        node_name_column="name",
        node_type_column="type",
        edge_src_column="subject",
        edge_dst_column="object",
        edge_type_column="predicate",
        directed=is_directed,
        name="RNA-KG VIEW_properties",
    )
    for is_directed in (True, False)
)

# Custom edge-prediction functions using scikit-learn

In [None]:
# trying to use the models directly from sci-kit learn instead of grape
import random
import time
import numpy as np
from sklearn.metrics import balanced_accuracy_score, roc_auc_score
from IPython.display import clear_output
import math
import grape


def get_edge_type_ids_list(graph):
    """Return the edge type id of every directed edge of ``graph``.

    For each edge id in ``range(graph.get_number_of_directed_edges())`` the
    edge type name is looked up and then translated to its edge type id.

    Args:
        graph: object exposing ``get_number_of_directed_edges``,
            ``get_edge_type_name_from_edge_id`` and
            ``get_edge_type_id_from_edge_type_name``.

    Returns:
        list: one edge type id per directed edge, in edge-id order.
    """
    total_edges = graph.get_number_of_directed_edges()
    # Lazy generator: name lookup and id lookup stay interleaved per edge.
    type_names = (
        graph.get_edge_type_name_from_edge_id(idx) for idx in range(total_edges)
    )
    return [
        graph.get_edge_type_id_from_edge_type_name(type_name)
        for type_name in type_names
    ]


def extract_embeddings_for_graph(grape_embedding, grape_graph):
    """Build one feature vector per directed edge of ``grape_graph``.

    Each edge vector is the concatenation
    ``[source node embedding, edge type embedding, destination node embedding]``.
    Only the first node-embedding table and the first edge-type-embedding table
    of ``grape_embedding`` are used. When ``get_all_edge_type_embeddings``
    raises ``ValueError`` (or returns an empty sequence) the edge type part is
    omitted and a warning is logged.

    Args:
        grape_embedding: object exposing ``get_all_node_embedding()`` and
            ``get_all_edge_type_embeddings()``; each returned table is indexed
            with ``.loc[name]``.
        grape_graph: graph exposing ``get_number_of_directed_edges()``,
            ``get_edge_node_names(directed=True)`` and
            ``get_edge_type_name_from_edge_id(edge_id)``.

    Returns:
        list[numpy.ndarray]: one 1-D array per directed edge, in edge-id order.
    """
    before = time.time()
    node_embedding = grape_embedding.get_all_node_embedding()
    try:
        edge_type_embedding = grape_embedding.get_all_edge_type_embeddings()
    except ValueError as e:
        logging.warning(
            f"Error while extracting edge type embeddings: {e}, using empty list instead"
        )
        edge_type_embedding = []

    number_of_edges = grape_graph.get_number_of_directed_edges()
    edge_node_names = grape_graph.get_edge_node_names(directed=True)
    embeddings = []
    if number_of_edges > 0:
        # Loop-invariant lookups are resolved once instead of once per edge.
        node_vectors = node_embedding[0]
        edge_type_vectors = (
            edge_type_embedding[0] if len(edge_type_embedding) > 0 else None
        )
        no_predicate = np.empty(0)
        for edge_id in range(number_of_edges):
            source_name = edge_node_names[edge_id][0]
            destination_name = edge_node_names[edge_id][1]
            if edge_type_vectors is None:
                # Without edge type embeddings the predicate name is not needed.
                predicate_embedding = no_predicate
            else:
                predicate = grape_graph.get_edge_type_name_from_edge_id(edge_id)
                predicate_embedding = edge_type_vectors.loc[predicate].values
            embeddings.append(
                np.concatenate(
                    [
                        node_vectors.loc[source_name].values,
                        predicate_embedding,
                        node_vectors.loc[destination_name].values,
                    ]
                )
            )
    logging.info(f"Embedding extraction time:{time.time()-before}")
    return embeddings


def edge_pred_pairs_sklearn(
    graph,
    embedder,
    edge_pred_model,
    pairs_to_predict,
    name_for_df=None,
    clear_output=False,
    train_size=0.7,
    number_of_holdouts=5,
    seed=42,
    use_scale_free_distribution=True,
    train_on_filtered=True,
    training_unbalance_rate=1.0,
    verbose=False,
    binary=True,
):
    """Run the holdout pipeline for several node-type pairs and collect the results.

    ``edge_prediction_pipeline_sklearn`` is called once per entry of
    ``pairs_to_predict``; the rows it returns (one per holdout) are gathered
    into a single DataFrame.

    Args:
        graph: graph forwarded to the pipeline.
        embedder: embedder or precomputed embedding forwarded to the pipeline.
        edge_pred_model: classifier forwarded to the pipeline; its
            ``module.ClassName`` is stored in the result.
        pairs_to_predict: sequence of (source node type, destination node type).
        name_for_df: value for the ``name`` column; defaults to
            ``"-<module.ClassName of edge_pred_model>"``.
        clear_output: forwarded as ``clear_output_holdout``.
        train_size, number_of_holdouts, seed, use_scale_free_distribution,
        train_on_filtered, training_unbalance_rate, verbose, binary:
            forwarded unchanged to the pipeline.

    Returns:
        pandas.DataFrame: one row per (pair, holdout) with the metric columns
        listed in ``columns`` plus ``edge_pred_model``, ``embedding_model``
        (currently an empty string) and ``name``. The index restarts at 0 for
        every pair. An empty ``pairs_to_predict`` yields an empty frame that
        still has all the columns.
    """
    columns = [
        "Graph",
        "Embedder",
        "Model",
        "Source Type",
        "Destination Type",
        "Train on filtered",
        "Training set size",
        "Testing set size",
        "Training balanced accuracy",
        "Positive balanced accuracy",
        "Negative balanced accuracy",
        "Mean balanced accuracy",  # ,'AUC'
    ]

    # One frame per pair, concatenated once at the end instead of growing a
    # DataFrame inside the loop.
    per_pair_frames = []
    for i, pair_to_predict in enumerate(pairs_to_predict):
        print(f"Predicting pair: {pair_to_predict} ({i+1}/{len(pairs_to_predict)})")

        # Negative edges are sampled from the full graph (to avoid false
        # negatives); with train_on_filtered the model only sees edges of the
        # node-type pair being predicted.
        holdout_rows = edge_prediction_pipeline_sklearn(
            graph,
            edge_pred_model,
            embedder,
            pair_to_predict,
            train_on_filtered=train_on_filtered,
            train_size=train_size,
            number_of_holdouts=number_of_holdouts,
            seed=seed,
            verbose=verbose,
            clear_output_holdout=clear_output,
            use_scale_free_distribution=use_scale_free_distribution,
            training_unbalance_rate=training_unbalance_rate,
            binary=binary,
        )
        per_pair_frames.append(pd.DataFrame(holdout_rows, columns=columns))

    if per_pair_frames:
        df_results = pd.concat(per_pair_frames)
    else:
        # Nothing to predict: still return a frame with the expected columns
        # instead of failing when the column names are assigned.
        df_results = pd.DataFrame(columns=columns)

    edge_pred_model_name = (
        edge_pred_model.__class__.__module__ + "." + edge_pred_model.__class__.__name__
    )
    df_results["edge_pred_model"] = edge_pred_model_name
    # Placeholder: the embedder name is not recorded (embedder.model_name() is
    # not available for precomputed embeddings).
    df_results["embedding_model"] = ""
    df_results["name"] = (
        f"-{edge_pred_model_name}" if name_for_df is None else name_for_df
    )
    return df_results


def edge_prediction_pipeline_sklearn(
    graph,
    model,
    embedder,
    pair_to_predict,
    train_on_filtered=True,
    train_size=0.7,
    number_of_holdouts=5,
    seed=42,
    verbose=False,
    clear_output_holdout=True,
    use_scale_free_distribution=True,
    training_unbalance_rate=1.0,
    binary=True,
):
    """Train and evaluate ``model`` on repeated connected holdouts for one node-type pair.

    For each holdout the function:
      1. splits ``graph`` with ``connected_holdout`` and asserts that the train
         graph has the same connected-component statistics as the full graph;
      2. filters the train and test graphs to edges going from
         ``pair_to_predict[0]`` to ``pair_to_predict[1]``;
      3. obtains embeddings (``embedder`` itself when it is a
         ``grape.EmbeddingResult``, otherwise ``embedder.fit_transform`` on the
         unfiltered train graph);
      4. samples negative training edges from the full graph, fits ``model`` on
         positive + negative edge vectors and scores it on the training data;
      5. samples negative test edges, predicts on the positive and negative
         test edges and computes balanced accuracy scores.

    Args:
        graph: grape graph to split and to sample negative edges from.
        model: estimator with ``fit(X, y)`` returning the fitted estimator and
            ``predict(X)``; it is re-fitted on every holdout.
        embedder: a ``grape.EmbeddingResult`` (used as is) or an object with
            ``fit_transform(graph)``.
        pair_to_predict: indexable; item 0 is the source node type name and
            item 1 the destination node type name.
        train_on_filtered: when True the positive training edges are those of
            the filtered train graph, otherwise all edges of the train graph.
            The same choice determines the base count for negative sampling.
        train_size: forwarded to ``graph.connected_holdout``.
        number_of_holdouts: number of holdout iterations.
        seed: seeds Python's global ``random`` module; one ``random_state`` per
            holdout is then drawn from it.
        verbose: when True, training scores and test graph sizes are logged.
        clear_output_holdout: when True, ``IPython.display.clear_output`` is
            called at the start of every holdout.
        use_scale_free_distribution: forwarded to ``sample_negative_graph``.
        training_unbalance_rate: number of negative training samples is
            ``ceil(number of positive training edges * training_unbalance_rate)``.
        binary: when True labels are 1 (positive) / 0 (negative); otherwise
            positives are labelled with their edge type id and negatives with -1.

    Returns:
        list[tuple]: one tuple per holdout with, in order: graph name, an empty
        string (embedder name placeholder), ``module.ClassName`` of the model,
        source type, destination type, ``train_on_filtered``, training set size,
        testing set size, training balanced accuracy, positive test score,
        negative test score and the mean of the last two.

    Raises:
        AssertionError: if the train graph's connected-component statistics
            differ from those of the full graph.
    """
    # NOTE: this reseeds the process-wide RNG of the `random` module.
    random.seed(seed)

    results = []

    for i in range(number_of_holdouts):
        # clean the cell output at each iteration to avoid huge cell outputs
        if clear_output_holdout:
            clear_output(wait=True)
        # use connected monte carlo to obtain a training set that has the same connectivity guarantees as full graph
        logging.info(f"Generating holdout {i+1}/{number_of_holdouts}")
        # The same random_state drives the holdout and the negative training
        # sample; random_state + 1 is used for the negative test sample.
        random_state = random.randrange(0, 100000)
        train_graph, positive_test_graph = graph.connected_holdout(
            train_size=train_size, random_state=random_state
        )

        # check if number of connected components is the same in the training set and full graph
        logging.debug(train_graph.get_number_of_connected_components())
        assert (
            train_graph.get_number_of_connected_components()
            == graph.get_number_of_connected_components()
        )

        logging.info("Filtering train and test graph by source/destination node type")
        # keep only the edges (source-destination node type) we are interested in
        train_graph_filtered = train_graph.filter_from_names(
            source_node_type_name_to_keep=[pair_to_predict[0]],
            destination_node_type_name_to_keep=[pair_to_predict[1]],
        )
        test_graph_filtered = positive_test_graph.filter_from_names(
            source_node_type_name_to_keep=[pair_to_predict[0]],
            destination_node_type_name_to_keep=[pair_to_predict[1]],
        )

        # check if embedder is of class grape.EmbeddingResult
        if not isinstance(embedder, grape.EmbeddingResult):
            # calculate the embedding on the not filtered train graph
            logging.info("Training embedding on unfiltered train graph")
            before = time.time()

            train_embedding = embedder.fit_transform(train_graph)

            logging.info(f"Embedding time:{time.time()-before}")
        else:
            # Precomputed embeddings are reused unchanged for every holdout.
            logging.info("Using precalculated embeddings")
            train_embedding = embedder

        logging.info("Training model using the filtered train graph")

        logging.info("Generating negative training graph")
        before = time.time()
        # Negatives are sized relative to the positives that will actually be
        # used for training (filtered or unfiltered train graph).
        number_of_negative_samples = (
            int(
                math.ceil(
                    train_graph_filtered.get_number_of_directed_edges()
                    * training_unbalance_rate
                )
            )
            if train_on_filtered
            else int(
                math.ceil(
                    train_graph.get_number_of_directed_edges() * training_unbalance_rate
                )
            )
        )

        logging.info(f"Number of negative samples: {number_of_negative_samples}")
        train_negative_graph = graph.sample_negative_graph(  # using the full graph to generate the negative edges to avoid false negatives
            number_of_negative_samples=number_of_negative_samples,
            # only_from_same_component=True,
            random_state=random_state,
            use_scale_free_distribution=use_scale_free_distribution,
            # sample_edge_types=False, # TODO: check if this is correct and test what happens if it is True
            source_node_types_names=[pair_to_predict[0]] if pair_to_predict else None,
            destination_node_types_names=(
                [pair_to_predict[1]] if pair_to_predict else None
            ),
        )
        logging.info(train_negative_graph.get_unique_edge_type_ids())
        logging.info(
            f"Number of edge types in negative training graph: {len(train_negative_graph.get_unique_edge_type_ids())}"
        )
        logging.info(f"Negative training graph generation time:{time.time()-before}")
        logging.info("Extracting embeddings for the negative training graph")
        before = time.time()
        train_negative_graph_embeddings = extract_embeddings_for_graph(
            train_embedding, train_negative_graph
        )
        logging.info(
            f"Negative training graph embedding extraction time:{time.time()-before}"
        )

        # The two branches below are parallel: they differ only in which graph
        # supplies the positive training edges (and in the variable names).
        if train_on_filtered:
            logging.info(train_graph_filtered.get_unique_edge_type_ids())
            logging.info(
                f"Number of edge types in positive training graph: {len(train_graph_filtered.get_unique_edge_type_ids())}"
            )
            # generate the list of embeddings for the train_graph_filtered
            logging.info("Extracting embeddings for the positive training graph")
            train_positive_graph_filtered_embeddings = extract_embeddings_for_graph(
                train_embedding, train_graph_filtered
            )
            # combine the positive and negative embeddings
            train_graph_embeddings = np.concatenate(
                [
                    train_positive_graph_filtered_embeddings,
                    train_negative_graph_embeddings,
                ]
            )
            # generate the list of labels for the train_graph_filtered
            if binary:
                train_graph_filtered_labels = [
                    1 for _ in range(len(train_positive_graph_filtered_embeddings))
                ] + [0 for _ in range(len(train_negative_graph_embeddings))]
            else:
                # Multiclass: positives carry their edge type id, negatives -1.
                positive_train_edge_types = get_edge_type_ids_list(train_graph_filtered)
                negative_train_edge_types = [
                    -1 for _ in range(len(train_negative_graph_embeddings))
                ]  # get_edge_type_ids_list(train_negative_graph)
                train_graph_filtered_labels = (
                    positive_train_edge_types + negative_train_edge_types
                )

            # train the model
            logging.info("Training model on the filtered training graph")
            model = model.fit(train_graph_embeddings, train_graph_filtered_labels)
        else:
            logging.info(train_graph.get_unique_edge_type_ids())
            logging.info(
                f"Number of edge types in positive training graph: {len(train_graph.get_unique_edge_type_ids())}"
            )
            # generate the list of embeddings for the train_graph
            logging.info("Extracting embeddings for the positive training graph")
            train_positive_graph_embeddings = extract_embeddings_for_graph(
                train_embedding, train_graph
            )
            # combine the positive and negative embeddings
            train_graph_embeddings = np.concatenate(
                [train_positive_graph_embeddings, train_negative_graph_embeddings]
            )
            # generate the list of labels for the train_graph
            if binary:
                train_graph_labels = [
                    1 for _ in range(len(train_positive_graph_embeddings))
                ] + [0 for _ in range(len(train_negative_graph_embeddings))]
            else:
                # Multiclass: positives carry their edge type id, negatives -1.
                positive_train_edge_types = get_edge_type_ids_list(train_graph)
                negative_train_edge_types = [
                    -1 for _ in range(len(train_negative_graph_embeddings))
                ]  # get_edge_type_ids_list(train_negative_graph)
                train_graph_labels = (
                    positive_train_edge_types + negative_train_edge_types
                )
            # train the model
            logging.info("Training model on the training graph")
            model = model.fit(train_graph_embeddings, train_graph_labels)

        # Score the fitted model on the data it was trained on: all samples,
        # then positives only, then negatives only.
        logging.info("Evaluating model on train set")
        train_pred = model.predict(train_graph_embeddings)
        if train_on_filtered:
            logging.info("Evaluating model on filtered positive train set")
            pos_train_pred = model.predict(train_positive_graph_filtered_embeddings)
        else:
            logging.info("Evaluating model on positive train set")
            pos_train_pred = model.predict(train_positive_graph_embeddings)
        logging.info("Evaluating model on negative train set")
        neg_train_pred = model.predict(train_negative_graph_embeddings)

        training_set_size = len(train_pred)

        # NOTE(review): in the positive-only / negative-only scores y_true holds
        # a single class; confirm how balanced_accuracy_score treats that case.
        if binary:
            train_score = (
                balanced_accuracy_score(train_graph_filtered_labels, train_pred)
                if train_on_filtered
                else balanced_accuracy_score(train_graph_labels, train_pred)
            )
            pos_train_score = balanced_accuracy_score(
                [1 for _ in range(len(pos_train_pred))], pos_train_pred
            )
            neg_train_score = balanced_accuracy_score(
                [0 for _ in range(len(neg_train_pred))], neg_train_pred
            )
        else:
            train_score = (
                balanced_accuracy_score(train_graph_filtered_labels, train_pred)
                if train_on_filtered
                else balanced_accuracy_score(train_graph_labels, train_pred)
            )
            pos_train_score = balanced_accuracy_score(
                positive_train_edge_types, pos_train_pred
            )
            neg_train_score = balanced_accuracy_score(
                negative_train_edge_types, neg_train_pred
            )

        if verbose:
            logging.info(f"Balanced accuracy score TRAINING: {train_score}")
            logging.info(
                f"Balanced accuracy positive score TRAINING: {pos_train_score}"
            )
            logging.info(
                f"Balanced accuracy negative score TRAINING: {neg_train_score}"
            )

        logging.info("Creating a graph with the negative edges for testing")
        # create graph with negative edges for testing
        negative_test_graph = graph.sample_negative_graph(
            # number_of_negative_samples=test_graph_filtered.get_number_of_edges(), # this option creates only half the edges
            number_of_negative_samples=test_graph_filtered.get_number_of_directed_edges(),
            # only_from_same_component=True,
            source_node_types_names=[pair_to_predict[0]],
            destination_node_types_names=[pair_to_predict[1]],
            random_state=random_state
            + 1,  # to avoid the same random state as the negative training graph
            use_scale_free_distribution=use_scale_free_distribution,
        )
        logging.info(negative_test_graph.get_unique_edge_type_ids())
        logging.info(
            f"Number of edge types in negative test graph: {len(negative_test_graph.get_unique_edge_type_ids())}"
        )

        if verbose:
            logging.info(
                f"#edges in positive test graph: {test_graph_filtered.get_number_of_directed_edges()}"
            )
            logging.info(
                f"#edges in negative test graph: {negative_test_graph.get_number_of_directed_edges()}"
            )

        # use model to predict on the positive edges
        # (hard predictions via predict(); predict_proba is not used)
        logging.info("Using the model to predict the existence of positive edges")
        pos_pred = model.predict(
            extract_embeddings_for_graph(train_embedding, test_graph_filtered)
        )

        # use model to predict on the negative edges
        logging.info("Using the model to predict the non-existence of negative edges")
        neg_pred = model.predict(
            extract_embeddings_for_graph(train_embedding, negative_test_graph)
        )

        testing_set_size = len(pos_pred) + len(neg_pred)

        # calculate balanced accuracy score for positive and negative predictions
        if binary:
            pos_score = balanced_accuracy_score(
                [1 for _ in range(len(pos_pred))], pos_pred
            )
            neg_score = balanced_accuracy_score(
                [0 for _ in range(len(neg_pred))], neg_pred
            )
        else:
            positive_test_edge_types = get_edge_type_ids_list(test_graph_filtered)
            negative_test_edge_types = [
                -1 for _ in range(len(neg_pred))
            ]  # get_edge_type_ids_list(negative_test_graph)
            pos_score = balanced_accuracy_score(positive_test_edge_types, pos_pred)
            neg_score = balanced_accuracy_score(negative_test_edge_types, neg_pred)
            # The overall multiclass score is only logged, not returned.
            overall_score = balanced_accuracy_score(
                positive_test_edge_types + negative_test_edge_types,
                np.concatenate([pos_pred, neg_pred]),
            )
            logging.info(f"Overall balanced accuracy score: {overall_score}")
        logging.info(f"Balanced accuracy positive score: {pos_score}")
        logging.info(f"Balanced accuracy negative score: {neg_score}")
        avg_score = (pos_score + neg_score) / 2
        logging.info(f"Balanced accuracy mean score: {avg_score}")

        # AUC is currently not computed (see the commented-out 'AUC' result
        # column); roc_auc_score would need scores rather than hard predictions
        # - TODO confirm before enabling.

        # Only some estimators (e.g. a single decision tree) expose get_depth.
        if hasattr(model, "get_depth"):
            logging.info(f"Tree depth: {model.get_depth()}")

        model_name = model.__class__.__module__ + "." + model.__class__.__name__

        # Field order must match the `columns` list in edge_pred_pairs_sklearn.
        results.append(
            (
                graph.get_name(),
                "",#embedder.model_name(),
                model_name,
                pair_to_predict[0],
                pair_to_predict[1],
                train_on_filtered,
                training_set_size,
                testing_set_size,
                train_score,
                pos_score,
                neg_score,
                avg_score,
            )  # ,auc_score
        )
    return results

# Specific Edge Prediction

### Load naïve embedders (and store)

In [None]:
# Naïve embedders

# embedder_transE = TransEEnsmallen(random_state=seed) #default embedding_size=100
# embedder_line = FirstOrderLINEEnsmallen(
#     random_state=seed, enable_cache=False, embedding_size=100, verbose=False
# )
# embedder_node2vec = Node2VecSkipGramEnsmallen(random_state=seed, embedding_size=100)

# graph_embedding_transe = embedder_transE.fit_transform(view_directed) # NB. directed graph 
# graph_embedding_line = embedder_line.fit_transform(view_undirected) # NB. undirected graph
# graph_embedding_node2vec = embedder_node2vec.fit_transform(view_undirected) # NB. undirected graph

In [None]:
# Store naïve embeddings
import numpy as np

def store_embedding(embedding, folder, filename):
    """Write every node-embedding table of ``embedding`` to a TSV file.

    Table ``i`` is written to ``<folder>/<filename>_<i>.tsv`` with two columns:
    ``name`` (the table index) and ``embedding`` (the row values as a Python
    list literal, e.g. ``[0.1, 0.2]``).

    The tables returned by ``embedding`` are not modified, ``folder`` is
    created when missing, and the path is built with ``os.path.join`` so that
    ``folder`` works with or without a trailing separator.

    Args:
        embedding: object exposing ``number_of_embeddings()`` and
            ``get_node_embedding_from_index(i)`` returning a DataFrame.
        folder: destination directory.
        filename: file name prefix (without index suffix and extension).
    """
    if folder:
        os.makedirs(folder, exist_ok=True)
    for i in range(embedding.number_of_embeddings()):
        node_table = embedding.get_node_embedding_from_index(i)
        # Build a separate one-column frame instead of adding an `embedding`
        # column to (and renaming the index of) the embedding's own table.
        vectors = pd.DataFrame(
            {'embedding': node_table.to_numpy().tolist()},
            index=node_table.index.rename('name'),
        )
        vectors.to_csv(os.path.join(folder, f'{filename}_{i}.tsv'), sep='\t')

# `graph_embedding_transe` is only defined when the naïve-embedder cell above is
# un-commented and run; guard the call so a fresh Restart & Run All does not
# stop here with a NameError.
if 'graph_embedding_transe' in globals():
    store_embedding(graph_embedding_transe, 'store_embeddings/', 'transe_dim100')
else:
    print('graph_embedding_transe is not defined: skipping store_embedding')
# store_embedding(graph_embedding_line, 'store_embeddings/', 'line_dim100')
# store_embedding(graph_embedding_node2vec, 'store_embeddings/', 'node2vec_dim100')

In [None]:
# Load embedding via BioBERT / DNABERT

def process_embeddings(input_text, input_seq, model_name, prefix="openai"):
    """Concatenate external embeddings with stored graph embeddings, per node.

    Reads two tab-separated files with ``name`` and ``embedding`` columns
    (``embedding`` holding a Python list literal):

    * ``input_seq``
    * ``<path>/prediction_output/<model_name>_dim100_0.tsv``

    Nodes present in both files (inner join on ``name``) get the vector
    ``input_seq embedding + graph embedding``. The result is written, one
    column per dimension, to ``<output_dir>/<prefix>_<model_name>100_0.csv``.
    The model name and the three vector lengths are printed.

    Relies on the module-level ``path`` and ``output_dir``.

    Args:
        input_text: currently unused (the code that read it is disabled).
        input_seq: path of the TSV with the external embeddings.
        model_name: graph embedding model name used in the file names.
        prefix: prefix of the output file name. The default ``"openai"``
            matches the file names loaded later in this notebook
            (e.g. ``openai_line100``) - NOTE(review): confirm the intended value.

    Raises:
        ValueError: if one of the files has no rows, or if the two files have
            no node name in common.
    """
    import ast  # local import: `ast` is not imported at file level

    print(model_name)

    def _read_vectors(tsv_path):
        """Load a name/embedding TSV and parse every embedding string once."""
        table = pd.read_csv(tsv_path, sep="\t", usecols=['name', 'embedding'])
        vectors = table.set_index('name')['embedding'].apply(ast.literal_eval)
        if len(vectors) == 0:
            raise ValueError(f"no embeddings found in {tsv_path}")
        print(len(vectors.iloc[0]))
        return vectors

    external_vectors = _read_vectors(input_seq)
    graph_vectors = _read_vectors(
        os.path.join(path, f"prediction_output/{model_name}_dim100_0.tsv")
    )

    # Inner join on the node name: only nodes present in both files are kept.
    merged = pd.merge(
        external_vectors.to_frame('embedding_l'),
        graph_vectors.to_frame('embedding_r'),
        left_index=True,
        right_index=True,
    )
    if merged.empty:
        raise ValueError(
            f"no node names in common between {input_seq} and the {model_name} embeddings"
        )

    # List concatenation: external vector first, graph vector second.
    combined = [
        left + right
        for left, right in zip(merged['embedding_l'], merged['embedding_r'])
    ]
    print(len(combined[0]))

    # One column per embedding dimension, indexed by node name.
    df_final = pd.DataFrame(combined, index=merged.index)
    df_final.to_csv(os.path.join(output_dir, f'{prefix}_{model_name}100_0.csv'))


In [None]:
# Input tables produced by the embedding step (both live in the results folder).
input_text = os.path.join(output_dir, "embedded_text_nan_filled.tsv")
input_seq = os.path.join(output_dir, "BERT_embeddings.tsv")

# Combine the external embeddings with each stored graph embedding in turn.
for graph_model_name in ('line', 'node2vec', 'transe'):
    process_embeddings(input_text, input_seq, graph_model_name)

### Load external embeddings

In [None]:
# Load the external embeddings
import pandas as pd
import grape

# Feel free to update this function to match the format of your embeddings.
# Each CSV is read with its first column as the index (the node names).
def load_node_embedding(folder, filename):
    """Load precomputed node embeddings into a ``grape.EmbeddingResult``.

    Files ``<folder>/<filename>_0.csv``, ``<filename>_1.csv``, ... are read in
    order until the first missing index; each one becomes a node-embedding
    table whose index is the first CSV column.

    Args:
        folder: directory containing the CSV files.
        filename: file name prefix, without the ``_<i>`` suffix and extension.

    Returns:
        grape.EmbeddingResult: built with ``embedding_method_name=filename``
        and the loaded tables as ``node_embeddings``.

    Raises:
        FileNotFoundError: if ``<filename>_0.csv`` does not exist, i.e. no
            embedding table could be loaded at all.
    """
    embeddings = []
    while True:
        csv_path = os.path.join(folder, f'{filename}_{len(embeddings)}.csv')
        try:
            embeddings.append(pd.read_csv(csv_path, index_col=0))
        except FileNotFoundError:
            # The first missing index marks the end of the sequence.
            break
    if not embeddings:
        # Fail loudly instead of building an embedding with no tables.
        raise FileNotFoundError(f'load_node_embedding: file not found: {csv_path}')
    logging.info(
        f'load_node_embedding: loaded {len(embeddings)} table(s) for {filename} from {folder}'
    )
    return grape.EmbeddingResult(
        embedding_method_name=filename, node_embeddings=embeddings
    )

# TODO: Change the folder and filename to match the location of your embeddings
# node_embedding = load_node_embedding(folder='store_embeddings/', filename='emb_concat_transe_100_llm') 

## Decision Tree

In [None]:
from grape.embedders import TransEEnsmallen, FirstOrderLINEEnsmallen, Node2VecSkipGramEnsmallen

# from grape.edge_prediction import DecisionTreeEdgePrediction, RandomForestEdgePrediction
from sklearn.tree import DecisionTreeClassifier

# Seed shared by the classifier and by every holdout run in this section.
seed = 42

# (source node type, destination node type) pairs whose edges are predicted.
pairs_to_predict = [
    ("miRNA", "Gene"),
    ("miRNA", "Disease"),
    ("miRNA", "Phenotype"),
    ("Gene", "Disease"),
    ("Gene", "Phenotype"),
]

# Decision tree with default hyper-parameters; only the random state is fixed.
model_tree = DecisionTreeClassifier(random_state=seed)

In [None]:
# LINE embeddings, evaluated on the undirected view with the decision tree.
# The file on disk must carry the `_0` suffix (e.g. `openai_line100_0.csv`),
# but the suffix must not be included in `filename`.
node_embedding = load_node_embedding(filename='openai_line100', folder=input_dir)

# Binary task (edge / no edge), 5 holdouts.
results_fun_line_tree_b = edge_pred_pairs_sklearn(
    graph=view_undirected,
    embedder=node_embedding,
    edge_pred_model=model_tree,
    pairs_to_predict=pairs_to_predict,
    binary=True,
    number_of_holdouts=5,
    seed=seed,
    use_scale_free_distribution=True,
    clear_output=True,
    verbose=True,
)
results_fun_line_tree_b.to_csv(
    os.path.join(output_dir, 'openai_line_100_tree_binary_undirected.csv')
)

# Multiclass task (edge type, -1 for negatives), 1 holdout.
results_fun_line_tree_m = edge_pred_pairs_sklearn(
    graph=view_undirected,
    embedder=node_embedding,
    edge_pred_model=model_tree,
    pairs_to_predict=pairs_to_predict,
    binary=False,
    number_of_holdouts=1,
    seed=seed,
    use_scale_free_distribution=True,
    clear_output=True,
    verbose=True,
)
results_fun_line_tree_m.to_csv(
    os.path.join(output_dir, 'openai_line_100_tree_multiclass_undirected.csv')
)

In [None]:
# node2vec embeddings, evaluated on the undirected view with the decision tree.
# The file on disk must carry the `_0` suffix (e.g. `openai_node2vec100_0.csv`),
# but the suffix must not be included in `filename`.
node_embedding = load_node_embedding(filename='openai_node2vec100', folder=input_dir)

# Binary task (edge / no edge), 5 holdouts.
results_fun_node2vec_tree_b = edge_pred_pairs_sklearn(
    graph=view_undirected,
    embedder=node_embedding,
    edge_pred_model=model_tree,
    pairs_to_predict=pairs_to_predict,
    binary=True,
    number_of_holdouts=5,
    seed=seed,
    use_scale_free_distribution=True,
    clear_output=True,
    verbose=True,
)
results_fun_node2vec_tree_b.to_csv(
    os.path.join(output_dir, 'openai_node2vec_100_tree_binary_undirected.csv')
)

# Multiclass task (edge type, -1 for negatives), 1 holdout.
results_fun_node2vec_tree_m = edge_pred_pairs_sklearn(
    graph=view_undirected,
    embedder=node_embedding,
    edge_pred_model=model_tree,
    pairs_to_predict=pairs_to_predict,
    binary=False,
    number_of_holdouts=1,
    seed=seed,
    use_scale_free_distribution=True,
    clear_output=True,
    verbose=True,
)
results_fun_node2vec_tree_m.to_csv(
    os.path.join(output_dir, 'openai_node2vec_100_tree_multiclass_undirected.csv')
)

In [None]:
# TransE embeddings, evaluated on the directed view with the decision tree.
# The file on disk must carry the `_0` suffix (e.g. `openai_transe100_0.csv`),
# but the suffix must not be included in `filename`.
node_embedding = load_node_embedding(filename='openai_transe100', folder=input_dir)

# Knobs you may want to change in the calls below:
#   graph           - view_directed or view_undirected
#   embedder        - an embedder, or precomputed embeddings from load_node_embedding
#   edge_pred_model - usually DecisionTreeClassifier or RandomForestClassifier
#   binary          - True for edge / no edge, False to predict the edge type

# Binary task, 5 holdouts.
results_fun_transE_tree_b = edge_pred_pairs_sklearn(
    graph=view_directed,
    embedder=node_embedding,
    edge_pred_model=model_tree,
    pairs_to_predict=pairs_to_predict,
    binary=True,
    number_of_holdouts=5,
    seed=seed,
    use_scale_free_distribution=True,
    clear_output=True,
    verbose=True,
)
results_fun_transE_tree_b.to_csv(
    os.path.join(output_dir, 'openai_transe_100_tree_binary_directed.csv')
)

# Multiclass task, 5 holdouts.
results_fun_transE_tree_m = edge_pred_pairs_sklearn(
    graph=view_directed,
    embedder=node_embedding,
    edge_pred_model=model_tree,
    pairs_to_predict=pairs_to_predict,
    binary=False,
    number_of_holdouts=5,
    seed=seed,
    use_scale_free_distribution=True,
    clear_output=True,
    verbose=True,
)
results_fun_transE_tree_m.to_csv(
    os.path.join(output_dir, 'openai_transe_100_tree_multiclass_directed.csv')
)

In [None]:
# group results by source_type and destination_type, calculate mean and std for each group
# results_fun_transE_tree.groupby(["Source Type", "Destination Type"])[
#     [
#         "Positive balanced accuracy",
#         "Negative balanced accuracy",
#         "Mean balanced accuracy",
#     ]
# ].agg(["mean", "std"])

## Random Forest

In [None]:
from grape.embedders import TransEEnsmallen, FirstOrderLINEEnsmallen, Node2VecSkipGramEnsmallen

# from grape.edge_prediction import DecisionTreeEdgePrediction, RandomForestEdgePrediction
from sklearn.ensemble import RandomForestClassifier

# Seed shared by the classifier and by every holdout run in this section.
seed = 42

# (source node type, destination node type) pairs whose edges are predicted.
pairs_to_predict = [
    ("miRNA", "Gene"),
    ("miRNA", "Disease"),
    ("miRNA", "Phenotype"),
    ("Gene", "Disease"),
    ("Gene", "Phenotype"),
]

# Random forest with default hyper-parameters, fixed random state, 6 parallel jobs.
model_forest = RandomForestClassifier(n_jobs=6, random_state=seed)

In [None]:
# LINE embeddings, evaluated on the undirected view with the random forest.
# The file on disk must carry the `_0` suffix (e.g. `openai_line100_0.csv`),
# but the suffix must not be included in `filename`.
node_embedding = load_node_embedding(filename='openai_line100', folder=input_dir)

# Binary task (edge / no edge), 5 holdouts.
results_fun_line_forest_b = edge_pred_pairs_sklearn(
    graph=view_undirected,
    embedder=node_embedding,
    edge_pred_model=model_forest,
    pairs_to_predict=pairs_to_predict,
    binary=True,
    number_of_holdouts=5,
    seed=seed,
    use_scale_free_distribution=True,
    clear_output=True,
    verbose=True,
)
results_fun_line_forest_b.to_csv(
    os.path.join(output_dir, 'openai_line_100_forest_binary_undirected.csv')
)

# Multiclass task (edge type, -1 for negatives), 1 holdout.
results_fun_line_forest_m = edge_pred_pairs_sklearn(
    graph=view_undirected,
    embedder=node_embedding,
    edge_pred_model=model_forest,
    pairs_to_predict=pairs_to_predict,
    binary=False,
    number_of_holdouts=1,
    seed=seed,
    use_scale_free_distribution=True,
    clear_output=True,
    verbose=True,
)
results_fun_line_forest_m.to_csv(
    os.path.join(output_dir, 'openai_line_100_forest_multiclass_undirected.csv')
)

In [None]:
# node2vec embeddings, evaluated on the undirected view with the random forest.
# The file on disk must carry the `_0` suffix (e.g. `openai_node2vec100_0.csv`),
# but the suffix must not be included in `filename`.
node_embedding = load_node_embedding(filename='openai_node2vec100', folder=input_dir)

# Binary task (edge / no edge), 5 holdouts.
results_fun_node2vec_forest_b = edge_pred_pairs_sklearn(
    graph=view_undirected,
    embedder=node_embedding,
    edge_pred_model=model_forest,
    pairs_to_predict=pairs_to_predict,
    binary=True,
    number_of_holdouts=5,
    seed=seed,
    use_scale_free_distribution=True,
    clear_output=True,
    verbose=True,
)
results_fun_node2vec_forest_b.to_csv(
    os.path.join(output_dir, 'openai_node2vec_100_forest_binary_undirected.csv')
)

# Multiclass task (edge type, -1 for negatives), 1 holdout.
results_fun_node2vec_forest_m = edge_pred_pairs_sklearn(
    graph=view_undirected,
    embedder=node_embedding,
    edge_pred_model=model_forest,
    pairs_to_predict=pairs_to_predict,
    binary=False,
    number_of_holdouts=1,
    seed=seed,
    use_scale_free_distribution=True,
    clear_output=True,
    verbose=True,
)
results_fun_node2vec_forest_m.to_csv(
    os.path.join(output_dir, 'openai_node2vec_100_forest_multiclass_undirected.csv')
)

In [None]:
# TransE embeddings, evaluated on the directed view with the random forest.
# The file on disk must carry the `_0` suffix (e.g. `openai_transe100_0.csv`),
# but the suffix must not be included in `filename`.
node_embedding = load_node_embedding(filename='openai_transe100', folder=input_dir)

# Knobs you may want to change in the calls below:
#   graph           - view_directed or view_undirected
#   embedder        - an embedder, or precomputed embeddings from load_node_embedding
#   edge_pred_model - usually DecisionTreeClassifier or RandomForestClassifier
#   binary          - True for edge / no edge, False to predict the edge type

# Binary task, 5 holdouts.
results_fun_transE_forest_b = edge_pred_pairs_sklearn(
    graph=view_directed,
    embedder=node_embedding,
    edge_pred_model=model_forest,
    pairs_to_predict=pairs_to_predict,
    binary=True,
    number_of_holdouts=5,
    seed=seed,
    use_scale_free_distribution=True,
    clear_output=True,
    verbose=True,
)
results_fun_transE_forest_b.to_csv(
    os.path.join(output_dir, 'openai_transe_100_forest_binary_directed.csv')
)

# Multiclass task, 5 holdouts.
results_fun_transE_forest_m = edge_pred_pairs_sklearn(
    graph=view_directed,
    embedder=node_embedding,
    edge_pred_model=model_forest,
    pairs_to_predict=pairs_to_predict,
    binary=False,
    number_of_holdouts=5,
    seed=seed,
    use_scale_free_distribution=True,
    clear_output=True,
    verbose=True,
)
results_fun_transE_forest_m.to_csv(
    os.path.join(output_dir, 'openai_transe_100_forest_multiclass_directed.csv')
)

In [None]:
# results_fun_transE_forest.groupby(["Source Type", "Destination Type"])[
#     [
#         "Positive balanced accuracy",
#         "Negative balanced accuracy",
#         "Mean balanced accuracy",
#     ]
# ].agg(["mean", "std"])