In [None]:
import logging

# Keep the notebook output quiet by default: only warnings and errors are shown.
logging.basicConfig(level=logging.WARNING)
# Also set the root logger level directly, so the threshold does not depend on basicConfig alone.
logging.root.setLevel(logging.WARNING)

# Load View

In [None]:
import pandas as pd
# Load the edge triples and their PubMed evidence from the entire edges file,
# rename the columns to subject/predicate/object naming, and keep only
# the edges that carry at least one PubMed ID.
edges_complete = (
    pd.read_csv(
        "complete_edges.csv",
        usecols=[':START_ID', ':TYPE', ':END_ID', 'PubMedID:string[]'],
    )
    .rename(
        columns={
            ':START_ID': 'subject',
            ':TYPE': 'predicate',
            ':END_ID': 'object',
            'PubMedID:string[]': 'PubMedID',
        }
    )
    .dropna(subset=['PubMedID'])
)
len(edges_complete)

In [None]:
def count_pmids(pmid_cell):
    """Count the PubMed IDs stored in one ';'-separated cell.

    Blank segments (for example the one produced by a trailing ';' or by
    ';;') are not counted, so the result always equals the number of IDs
    obtained by splitting on ';' and discarding empty/whitespace-only pieces.

    Parameters
    ----------
    pmid_cell : str or missing value
        Cell content such as "123;456"; NaN/None means "no PMIDs".

    Returns
    -------
    int
        Number of non-empty PMIDs in the cell (0 for missing cells).
    """
    if pd.isna(pmid_cell):
        return 0
    return sum(1 for pmid in str(pmid_cell).split(";") if pmid.strip())

# Total PMID mentions over all edges (a PMID cited by several edges is counted once per edge).
total_pmids = edges_complete["PubMedID"].map(count_pmids).sum()
total_pmids

In [None]:
def extract_pmids(pmid_cell):
    """Split one ';'-separated PubMedID cell into a list of trimmed IDs.

    Missing cells give an empty list; empty or whitespace-only segments
    are dropped.
    """
    if pd.isna(pmid_cell):
        return []
    pmids = []
    for segment in str(pmid_cell).split(";"):
        cleaned = segment.strip()
        if cleaned:
            pmids.append(cleaned)
    return pmids

# One list of PMIDs per edge, then the distinct PMIDs across the whole file.
all_pmids = edges_complete["PubMedID"].apply(extract_pmids)
unique_pmids = {pmid for pmid_list in all_pmids for pmid in pmid_list}
len(unique_pmids)

In [None]:
from grape import Graph
import pandas as pd

path_to_folder_with_graph_files = "./data/"

nodes_df_path = f"{path_to_folder_with_graph_files}nodes.tsv"
edges_df_path = f"{path_to_folder_with_graph_files}edges.tsv"
nodes_df = pd.read_csv(nodes_df_path, sep="\t")

# Left merge: every edge of the view is kept; edges without a matching
# (subject, predicate, object) row in edges_complete get a missing PubMedID.
edges_df = pd.read_csv(edges_df_path, sep="\t").merge(
    edges_complete,
    on=['subject', 'predicate', 'object'],
    how='left',
)
edges_df

In [None]:
# Number of view edges that have PubMed evidence attached.
len(edges_df.dropna(subset=['PubMedID']))

In [None]:
# Total PMID mentions over the edges of the view (missing cells count as 0).
total_pmids = edges_df["PubMedID"].map(count_pmids).sum()
total_pmids

In [None]:
# Distinct PMIDs cited by the edges of the view; this set drives the Entrez lookups below.
all_pmids = edges_df["PubMedID"].apply(extract_pmids)
unique_pmids = {pmid for pmid_list in all_pmids for pmid in pmid_list}
len(unique_pmids)

In [None]:
from Bio import Entrez
from time import sleep
from tqdm import tqdm
import json
import os

Entrez.email = "emanuele.cavalleri@unimi.it"
# Credentials must not be stored in the notebook: the NCBI API key is read from
# the NCBI_API_KEY environment variable. Without a key the requests still work,
# but NCBI allows fewer requests per second.
Entrez.api_key = os.environ.get("NCBI_API_KEY")

# NCBI E-utilities allow about 10 requests/s with an API key and 3 requests/s without one.
request_delay = 0.15 if Entrez.api_key else 0.4

pmid_cache_path = "pmid_to_year.json"

# Resume from the on-disk cache so that re-running the cell only fetches new PMIDs.
if os.path.exists(pmid_cache_path):
    with open(pmid_cache_path, "r") as f:
        pmid_to_year = json.load(f)
else:
    pmid_to_year = {}

to_process = [pmid for pmid in unique_pmids if pmid not in pmid_to_year]

for pmid in tqdm(to_process, desc="Fetching publication years"):
    try:
        handle = Entrez.efetch(db="pubmed", id=pmid)
        try:
            record = Entrez.read(handle)
        finally:
            # Always release the HTTP response, even if parsing fails.
            handle.close()
        # NOTE(review): "DateRevised" looks like the record's last-revision date rather
        # than the publication date - confirm this is the intended field.
        pmid_to_year[pmid] = record['PubmedArticle'][0]['MedlineCitation'].get("DateRevised", [])['Year']
    except Exception:
        # Best effort: any failure (network error, missing field, no article) is stored as "NA".
        # NOTE(review): transient errors are cached too and are never retried on later runs.
        pmid_to_year[pmid] = "NA"
    finally:
        sleep(request_delay)
        # Persist the cache periodically and after the last PMID.
        if len(pmid_to_year) % 100 == 0 or pmid == to_process[-1]:
            with open(pmid_cache_path, "w") as f:
                json.dump(pmid_to_year, f)

In [None]:
import json
import matplotlib.pyplot as plt
import pandas as pd
from collections import Counter
import numpy as np

with open("pmid_to_year.json", "r") as f:
    pmid_to_year = json.load(f)

# Map every cached year to a display bucket: two aggregated ranges for the
# older years, one bucket per year afterwards. "NA" and non-numeric values are skipped.
processed_years = []
for year in pmid_to_year.values():
    if year == "NA":
        continue
    try:
        y = int(year)
    except ValueError:
        continue
    if y <= 2012:
        label = "2002–2012"
    elif y <= 2017:
        label = "2013–2017"
    else:
        label = str(y)
    processed_years.append(label)

year_counts = Counter(processed_years)

# The aggregated buckets get artificial ranks that sort before every single-year bucket.
bucket_rank = {"2002–2012": 2010, "2013–2017": 2011}
df = pd.DataFrame(year_counts.items(), columns=["year", "count"]).sort_values(
    by="year",
    key=lambda col: col.map(
        lambda label: bucket_rank[label] if label in bucket_rank else int(label)
    ),
)

# Same light-blue RGB colour for every bar.
df["color"] = df["year"].apply(lambda _label: [176/255, 205/255, 241/255])

# Plot
fig, ax = plt.subplots(figsize=(4.5, 3.5))
bars = ax.bar(df["year"], df["count"], color=df["color"].tolist(), width=0.8)

def format_large_number(value):
    """Format a count compactly: millions as 'NM', thousands as 'Nk', smaller values unchanged.

    The scaled number is rendered with no decimals.
    """
    for threshold, suffix in ((1_000_000, "M"), (1_000, "k")):
        if value >= threshold:
            return f"{value / threshold:.0f}{suffix}"
    return str(value)

# Write the compact count just above each bar.
for bar, value in zip(bars, df["count"]):
    x_center = bar.get_x() + bar.get_width()/2
    ax.text(x_center, bar.get_height() * 1.02, format_large_number(value),
            ha='center', va='bottom', fontsize=9)

ax.set_yscale("log")
ax.set_ylim(1e3, 10 ** 4.3)
ax.tick_params(axis='x', labelrotation=45, labelsize=9)
ax.tick_params(axis='y', labelsize=9)

# Minimal look: no axis titles, no y tick labels or tick marks, no grid, no frame.
ax.set(xlabel="", ylabel="")
ax.tick_params(axis='y', which='both', labelleft=False, length=0)
ax.grid(False)
for spine in ax.spines.values():
    spine.set_visible(False)

plt.tight_layout()
plt.savefig('pubmed_years.png', dpi=400, bbox_inches='tight', pad_inches=0)
plt.show()

In [None]:
# Keep an independent snapshot of the merged edge table before it is reshaped.
edges_df_ = edges_df.copy(deep=True)

In [None]:
with open("pmid_to_year.json", "r") as f:
    pmid_to_year = json.load(f)

# Turn the ';'-separated PubMedID cell into one row per (edge, PMID).
# Edges without PMIDs keep a single row with a missing PubMedID.
pmid_lists = edges_df['PubMedID'].str.split(';')
edges_df = (
    edges_df
    .drop(columns=['PubMedID'])
    .assign(PubMedID=pmid_lists)
    .explode('PubMedID')
)

def extract_year(pmid_string):
    """Return the year of the first PMID in the cell that has a usable cached year.

    PMIDs are looked up in the module-level ``pmid_to_year`` mapping; entries
    that are missing, empty, "NA" or not convertible to int are ignored.
    Returns None for missing/non-string cells or when no PMID has a usable year.
    """
    if pd.isna(pmid_string) or not isinstance(pmid_string, str):
        return None
    years = []
    for raw_pmid in pmid_string.split(";"):
        year = pmid_to_year.get(raw_pmid.strip())
        if not year or year == "NA":
            continue
        try:
            years.append(int(year))
        except ValueError:
            pass
    if years:
        return years[0]
    return None

# Attach the cached year of each row's PMID (missing when no usable year is cached).
edges_df = edges_df.assign(year=edges_df["PubMedID"].apply(extract_year))
edges_df

In [None]:
import matplotlib.pyplot as plt
from collections import Counter

# Attach the node type of the subject (-> type_x) and of the object (-> type_y).
# Both merges are inner joins, so edges whose endpoints are not in nodes_df are dropped.
node_types = nodes_df[['name', 'type']]
df = (
    edges_df
    .merge(node_types, left_on='subject', right_on='name')
    .drop(columns=['name'])
    .merge(node_types, left_on='object', right_on='name')
    .drop(columns=['name'])
)

def mirna_relation_type(row):
    """Classify an edge by the (case-insensitive) node types of its two endpoints.

    miRNA pairs are checked first (Gene, then Disease, then Phenotype), then
    Gene pairs (Disease, then Phenotype); anything else is 'Other'. The
    orientation of the edge does not matter.
    """
    node_kinds = (row['type_x'].lower(), row['type_y'].lower())

    if 'mirna' in node_kinds:
        mirna_partners = (
            ('gene', 'miRNA-Gene'),
            ('disease', 'miRNA-Disease'),
            ('phenotype', 'miRNA-Phenotype'),
        )
        for partner, label in mirna_partners:
            if partner in node_kinds:
                return label

    if 'gene' in node_kinds:
        gene_partners = (
            ('disease', 'Gene-Disease'),
            ('phenotype', 'Gene-Phenotype'),
        )
        for partner, label in gene_partners:
            if partner in node_kinds:
                return label

    return 'Other'

# Label every edge with its relation category, then keep only edges with a known year.
df['mirna_relation_type'] = df.apply(mirna_relation_type, axis=1)

df = df.dropna(subset=['year'])

def simplify_year(y):
    """Map a year to its display bucket.

    Years up to 2012 go to "2002–2012", 2013-2017 to "2013–2017", and later
    years keep their own label. Values that cannot be converted to an integer
    year (None, NaN, infinity, non-numeric text) give None.

    Only conversion errors are caught, so unrelated exceptions such as
    KeyboardInterrupt are no longer silently swallowed.
    """
    try:
        y = int(float(y))
    except (TypeError, ValueError, OverflowError):
        return None
    if y <= 2012:
        return "2002–2012"
    if y <= 2017:
        return "2013–2017"
    return str(y)

# Bucket the years and drop rows whose year could not be bucketed.
df = df.assign(year_group=df['year'].apply(simplify_year))
df = df.dropna(subset=['year_group'])

counts_all = Counter(df['year_group'])


def _year_group_counts(relation):
    """Count the edges of one relation category per year bucket."""
    return Counter(df.loc[df['mirna_relation_type'] == relation, 'year_group'])


counts_mirna_gene = _year_group_counts('miRNA-Gene')
counts_mirna_disease = _year_group_counts('miRNA-Disease')
counts_mirna_phenotype = _year_group_counts('miRNA-Phenotype')
counts_gene_disease = _year_group_counts('Gene-Disease')
counts_gene_phenotype = _year_group_counts('Gene-Phenotype')

# Aggregated buckets get artificial ranks that place them before the single-year buckets.
_bucket_order = {"2002–2012": 2010, "2013–2017": 2015}
years_sorted = sorted(
    counts_all,
    key=lambda label: _bucket_order[label] if label in _bucket_order else int(label),
)

_relation_counters = {
    'miRNA-Gene': counts_mirna_gene,
    'miRNA-Disease': counts_mirna_disease,
    'miRNA-Phenotype': counts_mirna_phenotype,
    'Gene-Disease': counts_gene_disease,
    'Gene-Phenotype': counts_gene_phenotype,
}

# One row per year bucket, one column per plotted series (0 where a category has no edges).
plot_df = pd.DataFrame({
    'year': years_sorted,
    'all_relations': [counts_all[y] for y in years_sorted],
    **{
        relation: [counter.get(y, 0) for y in years_sorted]
        for relation, counter in _relation_counters.items()
    },
})

# RGB colour per series.
colors = {
    'all_relations': [176/255, 205/255, 241/255],
    'miRNA-Gene': [255/255, 127/255, 80/255],         # Coral
    'miRNA-Disease': [100/255, 149/255, 237/255],     # CornflowerBlue
    'miRNA-Phenotype': [60/255, 179/255, 113/255],    # MediumSeaGreen
    'Gene-Disease': [100/255, 149/255, 237/255],      # CornflowerBlue
    'Gene-Phenotype': [60/255, 179/255, 113/255],     # MediumSeaGreen
}

# Plot
fig, ax = plt.subplots(figsize=(7, 4.5))
# Six series are drawn per year bucket: 6 * bar_width must stay below 1 so that
# the groups of neighbouring ticks do not run into each other.
bar_width = 0.15
x = range(len(plot_df))

# Centre the six bars on each tick, exactly one bar width apart
# (offsets -2.5, -1.5, -0.5, +0.5, +1.5, +2.5), so that no two bars overlap.
positions_all = [i - 2.5*bar_width for i in x]
positions_gene = [i - 1.5*bar_width for i in x]
positions_disease = [i - 0.5*bar_width for i in x]
positions_phenotype = [i + 0.5*bar_width for i in x]
positions_disease2 = [i + 1.5*bar_width for i in x]
positions_phenotype2 = [i + 2.5*bar_width for i in x]

# NOTE(review): in `colors` the Gene-* series reuse the colours of the miRNA-* series,
# so they can only be told apart by position - consider distinct colours or hatching.
ax.bar(positions_all, plot_df['all_relations'], width=bar_width, color=colors['all_relations'], label='Tutte le relazioni')
ax.bar(positions_gene, plot_df['miRNA-Gene'], width=bar_width, color=colors['miRNA-Gene'], label='miRNA-Gene')
ax.bar(positions_disease, plot_df['miRNA-Disease'], width=bar_width, color=colors['miRNA-Disease'], label='miRNA-Disease')
ax.bar(positions_phenotype, plot_df['miRNA-Phenotype'], width=bar_width, color=colors['miRNA-Phenotype'], label='miRNA-Phenotype')
ax.bar(positions_disease2, plot_df['Gene-Disease'], width=bar_width, color=colors['Gene-Disease'], label='Gene-Disease')
ax.bar(positions_phenotype2, plot_df['Gene-Phenotype'], width=bar_width, color=colors['Gene-Phenotype'], label='Gene-Phenotype')

def format_large_number(value):
    """Format a count compactly: millions as 'NM', thousands as 'Nk', smaller values unchanged.

    The scaled number is rendered with no decimals.
    """
    for threshold, suffix in ((1_000_000, "M"), (1_000, "k")):
        if value >= threshold:
            return f"{value / threshold:.0f}{suffix}"
    return str(value)

# Pair every plotted column with the x positions of its own bars, so that a value
# label can never end up above the bar of a different series (the Gene-Disease bars
# are drawn at positions_disease2, the Gene-Phenotype bars at positions_phenotype2).
labelled_series = [
    ('all_relations', positions_all),
    ('miRNA-Gene', positions_gene),
    ('miRNA-Disease', positions_disease),
    ('miRNA-Phenotype', positions_phenotype),
    ('Gene-Disease', positions_disease2),
    ('Gene-Phenotype', positions_phenotype2),
]

for i in x:
    for column, positions in labelled_series:
        val = plot_df.at[i, column]
        # Zero counts have no visible bar on a log axis, so they get no label.
        if val > 0:
            ax.text(positions[i], val * 1.02, format_large_number(val), ha='center', va='bottom', fontsize=7)

ax.set_xticks(x)
ax.set_xticklabels(plot_df['year'], rotation=45, ha='right', fontsize=9)
ax.set_yscale('log')
ax.set_ylim(1e2, 10**4.5)
ax.tick_params(axis='y', labelsize=9)
ax.set_ylabel('Numero di relazioni (log scale)', fontsize=9)
ax.grid(False)
for spine in ax.spines.values():
    spine.set_visible(False)
ax.tick_params(axis='y', which='both', length=0)
ax.legend(fontsize=9, loc='upper left')

plt.tight_layout()
plt.savefig("mirna_relation_types.png", dpi=400, bbox_inches='tight', pad_inches=0)
plt.show()

In [None]:
# Temporal split: rows whose evidence year is before 2025 (or unknown) are used for
# training, rows with 2025+ evidence are held out for testing.
train_edges_df = edges_df[(edges_df['year']<2025) | (edges_df['year'].isna())]
test_edges_df = edges_df[edges_df['year']>=2025]

# edges_df holds one row per (edge, PMID), so the same (subject, predicate, object)
# triple can have both pre-2025 and 2025+ evidence. Such an edge is already part of
# the training graph; keeping it in the test set would leak training edges into the
# evaluation, so it is removed from the test set.
edge_key = ['subject', 'predicate', 'object']
train_triples = pd.MultiIndex.from_frame(train_edges_df[edge_key])
test_triples = pd.MultiIndex.from_frame(test_edges_df[edge_key])
test_edges_df = test_edges_df[~test_triples.isin(train_triples)]

# Keep only the nodes that appear in at least one training edge.
nodes_df = nodes_df[(nodes_df['name'].isin(train_edges_df['subject'])) | (nodes_df['name'].isin(train_edges_df['object']))]

# A test edge is usable only if both endpoints are nodes of the training graph.
test_edges_df = test_edges_df[test_edges_df['subject'].isin(nodes_df['name'])]
test_edges_df = test_edges_df[test_edges_df['object'].isin(nodes_df['name'])]

In [None]:
# Persist the held-out edges as a tab-separated file without the index column.
test_edges_df.to_csv(path_or_buf="test_edges.tsv", index=False, sep="\t")

In [None]:
# Build the directed, typed training graph from the training edges and their nodes.
view_directed = Graph.from_pd(
    name="RNA-KG VIEW_properties",
    directed=True,
    # node table and its columns
    nodes_df=nodes_df,
    node_name_column="name",
    node_type_column="type",
    # edge table and its columns
    edges_df=train_edges_df,
    edge_src_column="subject",
    edge_dst_column="object",
    edge_type_column="predicate",
)

# Custom functions for edge prediction with GRAPE embeddings and scikit-learn classifiers

In [None]:
# trying to use the models directly from sci-kit learn instead of grape
import random
import time
import numpy as np
from sklearn.metrics import balanced_accuracy_score, roc_auc_score
from IPython.display import clear_output
import math
import grape


def get_edge_type_ids_list(graph):
    """Return the edge type id of every directed edge of ``graph``, in edge-id order.

    For each edge id the edge type name is read first and then translated
    to its id through the graph.
    """
    return [
        graph.get_edge_type_id_from_edge_type_name(
            graph.get_edge_type_name_from_edge_id(edge_id)
        )
        for edge_id in range(graph.get_number_of_directed_edges())
    ]


def extract_embeddings_for_graph(grape_embedding, grape_graph):
    """Build one feature vector per directed edge of ``grape_graph``.

    Each vector is the concatenation [source node embedding, edge type
    embedding, destination node embedding], looked up by name in the first
    node / edge-type embedding table of ``grape_embedding``. If the edge type
    embeddings cannot be retrieved (ValueError), a warning is logged and the
    edge type part is left out.

    Returns
    -------
    list of numpy.ndarray
        Edge feature vectors, in edge-id order.
    """
    started_at = time.time()
    node_embedding = grape_embedding.get_all_node_embedding()
    try:
        edge_type_embedding = grape_embedding.get_all_edge_type_embeddings()
    except ValueError as e:
        logging.warning(
            f"Error while extracting edge type embeddings: {e}, using empty list instead"
        )
        edge_type_embedding = []
    has_edge_type_embedding = len(edge_type_embedding) > 0

    edge_count = grape_graph.get_number_of_directed_edges()
    edge_node_names = grape_graph.get_edge_node_names(directed=True)

    embeddings = []
    for edge_id in range(edge_count):
        source_name = edge_node_names[edge_id][0]
        destination_name = edge_node_names[edge_id][1]
        edge_type_name = grape_graph.get_edge_type_name_from_edge_id(edge_id)

        source_vector = node_embedding[0].loc[source_name].values
        if has_edge_type_embedding:
            edge_type_vector = edge_type_embedding[0].loc[edge_type_name].values
        else:
            edge_type_vector = np.empty(0)
        destination_vector = node_embedding[0].loc[destination_name].values

        embeddings.append(
            np.concatenate([source_vector, edge_type_vector, destination_vector])
        )
    logging.info(f"Embedding extraction time:{time.time()-started_at}")
    return embeddings


def edge_pred_pairs_sklearn(
    graph,
    embedder,
    edge_pred_model,
    pairs_to_predict,
    name_for_df=None,
    clear_output=False,
    train_size=0.7,
    number_of_holdouts=5,
    seed=42,
    use_scale_free_distribution=True,
    train_on_filtered=True,
    training_unbalance_rate=1.0,
    verbose=False,
    binary=True,
):
    """Run the sklearn edge-prediction pipeline for several node-type pairs.

    For every (source type, destination type) pair in ``pairs_to_predict``
    this calls ``edge_prediction_pipeline_sklearn`` and collects its
    per-holdout results in a single DataFrame.

    Parameters
    ----------
    graph, embedder, edge_pred_model
        Forwarded to ``edge_prediction_pipeline_sklearn``.
    pairs_to_predict : sized iterable of (source type, destination type)
        Node-type pairs to evaluate. An empty collection gives an empty result table.
    name_for_df : str, optional
        Value of the "name" column; defaults to "-<model module>.<model class>".
    clear_output : bool
        Forwarded as ``clear_output_holdout``.
    train_size, number_of_holdouts, seed, use_scale_free_distribution,
    train_on_filtered, training_unbalance_rate, verbose, binary
        Forwarded unchanged to ``edge_prediction_pipeline_sklearn``.

    Returns
    -------
    pandas.DataFrame
        One row per (pair, holdout) with the metric columns listed in
        ``columns`` below, plus "edge_pred_model", "embedding_model" (currently
        always empty) and "name". The row index restarts at 0 for every pair.
    """
    columns = [
        "Graph",
        "Embedder",
        "Model",
        "Source Type",
        "Destination Type",
        "Train on filtered",
        "Training set size",
        "Testing set size",
        "Training balanced accuracy",
        "Positive balanced accuracy",
        "Negative balanced accuracy",
        "Mean balanced accuracy",
    ]

    # Collect one frame per pair and concatenate once at the end, instead of
    # growing a DataFrame inside the loop.
    per_pair_frames = []
    for i, pair_to_predict in enumerate(pairs_to_predict):
        print(f"Predicting pair: {pair_to_predict} ({i+1}/{len(pairs_to_predict)})")

        # Negative edges are sampled from the full graph to avoid false negatives;
        # with train_on_filtered the model is trained only on edges of this type pair.
        pair_results = edge_prediction_pipeline_sklearn(
            graph,
            edge_pred_model,
            embedder,
            pair_to_predict,
            train_on_filtered=train_on_filtered,
            train_size=train_size,
            number_of_holdouts=number_of_holdouts,
            seed=seed,
            verbose=verbose,
            clear_output_holdout=clear_output,
            use_scale_free_distribution=use_scale_free_distribution,
            training_unbalance_rate=training_unbalance_rate,
            binary=binary,
        )
        per_pair_frames.append(pd.DataFrame(pair_results, columns=columns))

    # With no pairs there is nothing to concatenate: return an empty table that
    # still has the expected columns instead of failing on the column assignment.
    if per_pair_frames:
        df_results = pd.concat(per_pair_frames)
    else:
        df_results = pd.DataFrame(columns=columns)

    edge_pred_model_name = (
        edge_pred_model.__class__.__module__ + "." + edge_pred_model.__class__.__name__
    )
    df_results["edge_pred_model"] = edge_pred_model_name
    # The embedder name is not recorded (a precomputed embedding may be passed instead of an embedder).
    df_results["embedding_model"] = ""
    df_results["name"] = (
        f"-{edge_pred_model_name}"
        if name_for_df is None
        else name_for_df
    )
    return df_results


def edge_prediction_pipeline_sklearn(
    graph,
    model,
    embedder,
    pair_to_predict,
    train_on_filtered=True,
    train_size=0.7,
    number_of_holdouts=5,
    seed=42,
    verbose=False,
    clear_output_holdout=True,
    use_scale_free_distribution=True,
    training_unbalance_rate=1.0,
    binary=True,
):
    """Train and evaluate a scikit-learn edge classifier over repeated connected holdouts.

    For every holdout the graph is split with ``graph.connected_holdout``. Positive
    edges and negative edges sampled from the full graph are turned into feature
    vectors with ``extract_embeddings_for_graph``, ``model`` is fitted on them and
    scored with balanced accuracy on the training data and on the held-out edges of
    the requested node-type pair. After the last holdout the fitted model is pickled
    with joblib to "<graph name>_<model module>.<model class>.pkl" in the working
    directory.

    Parameters
    ----------
    graph : graph object
        Must provide ``connected_holdout``, ``sample_negative_graph``,
        ``get_number_of_connected_components`` and ``get_name``.
    model : estimator with ``fit`` and ``predict``
        Fitted again on every holdout.
    embedder : grape.EmbeddingResult or object with ``fit_transform``
        A precomputed embedding is used as-is for every holdout; anything else is
        fitted on each holdout's unfiltered training graph.
        NOTE(review): a precomputed embedding was presumably fitted on a graph that
        contains the held-out edges - confirm this is acceptable for the evaluation.
    pair_to_predict : (source node type name, destination node type name)
        Node-type pair whose edges are evaluated.
    train_on_filtered : bool
        If True, positive training edges are only the training edges of this type
        pair; otherwise all training edges are used.
    train_size : float
        Forwarded to ``graph.connected_holdout``.
    number_of_holdouts : int
        Number of holdouts. With 0 nothing is trained or saved and [] is returned.
    seed : int
        Seeds Python's ``random`` module, from which each holdout's random state is drawn.
    verbose : bool
        Also log the training scores and the sizes of the test graphs.
    clear_output_holdout : bool
        Clear the notebook cell output at the start of every holdout.
    use_scale_free_distribution : bool
        Forwarded to ``graph.sample_negative_graph``.
    training_unbalance_rate : float
        Number of negative training edges per positive training edge.
    binary : bool
        If True, labels are 1 (edge) / 0 (no edge). Otherwise positive edges are
        labelled with their edge type id and negative edges with -1.

    Returns
    -------
    list of tuple
        One tuple per holdout: (graph name, "" as embedder placeholder, model name,
        source type, destination type, train_on_filtered, training set size, testing
        set size, training balanced accuracy, positive test balanced accuracy,
        negative test balanced accuracy, mean of the two test scores).
    """
    random.seed(seed)

    results = []

    for i in range(number_of_holdouts):
        # Clear the cell output at each iteration to avoid huge cell outputs.
        if clear_output_holdout:
            clear_output(wait=True)
        # Connected holdout: the training graph is expected to keep the same
        # connected components as the full graph (checked by the assert below).
        logging.info(f"Generating holdout {i+1}/{number_of_holdouts}")
        random_state = random.randrange(0, 100000)
        train_graph, positive_test_graph = graph.connected_holdout(
            train_size=train_size, random_state=random_state
        )

        logging.debug(train_graph.get_number_of_connected_components())
        assert (
            train_graph.get_number_of_connected_components()
            == graph.get_number_of_connected_components()
        )

        logging.info("Filtering train and test graph by source/destination node type")
        # Keep only the edges of the (source type, destination type) pair of interest.
        train_graph_filtered = train_graph.filter_from_names(
            source_node_type_name_to_keep=[pair_to_predict[0]],
            destination_node_type_name_to_keep=[pair_to_predict[1]],
        )
        test_graph_filtered = positive_test_graph.filter_from_names(
            source_node_type_name_to_keep=[pair_to_predict[0]],
            destination_node_type_name_to_keep=[pair_to_predict[1]],
        )

        if not isinstance(embedder, grape.EmbeddingResult):
            # Fit the embedding on the unfiltered training graph of this holdout.
            logging.info("Training embedding on unfiltered train graph")
            before = time.time()

            train_embedding = embedder.fit_transform(train_graph)

            logging.info(f"Embedding time:{time.time()-before}")
        else:
            logging.info("Using precalculated embeddings")
            train_embedding = embedder

        logging.info("Training model using the filtered train graph")

        # Graph that provides the positive training edges.
        positive_train_graph = (
            train_graph_filtered if train_on_filtered else train_graph
        )

        logging.info("Generating negative training graph")
        before = time.time()
        number_of_negative_samples = int(
            math.ceil(
                positive_train_graph.get_number_of_directed_edges()
                * training_unbalance_rate
            )
        )

        logging.info(f"Number of negative samples: {number_of_negative_samples}")
        # Negatives are sampled from the full graph to avoid false negatives.
        train_negative_graph = graph.sample_negative_graph(
            number_of_negative_samples=number_of_negative_samples,
            random_state=random_state,
            use_scale_free_distribution=use_scale_free_distribution,
            source_node_types_names=[pair_to_predict[0]] if pair_to_predict else None,
            destination_node_types_names=(
                [pair_to_predict[1]] if pair_to_predict else None
            ),
        )
        logging.info(train_negative_graph.get_unique_edge_type_ids())
        logging.info(
            f"Number of edge types in negative training graph: {len(train_negative_graph.get_unique_edge_type_ids())}"
        )
        logging.info(f"Negative training graph generation time:{time.time()-before}")
        logging.info("Extracting embeddings for the negative training graph")
        before = time.time()
        train_negative_graph_embeddings = extract_embeddings_for_graph(
            train_embedding, train_negative_graph
        )
        logging.info(
            f"Negative training graph embedding extraction time:{time.time()-before}"
        )

        logging.info(positive_train_graph.get_unique_edge_type_ids())
        logging.info(
            f"Number of edge types in positive training graph: {len(positive_train_graph.get_unique_edge_type_ids())}"
        )
        logging.info("Extracting embeddings for the positive training graph")
        train_positive_embeddings = extract_embeddings_for_graph(
            train_embedding, positive_train_graph
        )
        # Training matrix: positive edges first, then negative edges.
        train_graph_embeddings = np.concatenate(
            [train_positive_embeddings, train_negative_graph_embeddings]
        )

        # Labels, in the same order as the training matrix.
        if binary:
            positive_train_labels = [1] * len(train_positive_embeddings)
            negative_train_labels = [0] * len(train_negative_graph_embeddings)
        else:
            positive_train_labels = get_edge_type_ids_list(positive_train_graph)
            negative_train_labels = [-1] * len(train_negative_graph_embeddings)
        train_labels = positive_train_labels + negative_train_labels

        logging.info(
            "Training model on the filtered training graph"
            if train_on_filtered
            else "Training model on the training graph"
        )
        model = model.fit(train_graph_embeddings, train_labels)

        logging.info("Evaluating model on train set")
        train_pred = model.predict(train_graph_embeddings)
        logging.info(
            "Evaluating model on filtered positive train set"
            if train_on_filtered
            else "Evaluating model on positive train set"
        )
        pos_train_pred = model.predict(train_positive_embeddings)
        logging.info("Evaluating model on negative train set")
        neg_train_pred = model.predict(train_negative_graph_embeddings)

        training_set_size = len(train_pred)

        train_score = balanced_accuracy_score(train_labels, train_pred)
        pos_train_score = balanced_accuracy_score(
            positive_train_labels, pos_train_pred
        )
        neg_train_score = balanced_accuracy_score(
            negative_train_labels, neg_train_pred
        )

        if verbose:
            logging.info(f"Balanced accuracy score TRAINING: {train_score}")
            logging.info(
                f"Balanced accuracy positive score TRAINING: {pos_train_score}"
            )
            logging.info(
                f"Balanced accuracy negative score TRAINING: {neg_train_score}"
            )

        logging.info("Creating a graph with the negative edges for testing")
        # As many negative test edges as there are directed positive test edges.
        negative_test_graph = graph.sample_negative_graph(
            number_of_negative_samples=test_graph_filtered.get_number_of_directed_edges(),
            source_node_types_names=[pair_to_predict[0]],
            destination_node_types_names=[pair_to_predict[1]],
            # Different random state than the one used for the negative training graph.
            random_state=random_state + 1,
            use_scale_free_distribution=use_scale_free_distribution,
        )
        logging.info(negative_test_graph.get_unique_edge_type_ids())
        logging.info(
            f"Number of edge types in negative test graph: {len(negative_test_graph.get_unique_edge_type_ids())}"
        )

        if verbose:
            logging.info(
                f"#edges in positive test graph: {test_graph_filtered.get_number_of_directed_edges()}"
            )
            logging.info(
                f"#edges in negative test graph: {negative_test_graph.get_number_of_directed_edges()}"
            )

        logging.info("Using the model to predict the existence of positive edges")
        pos_pred = model.predict(
            extract_embeddings_for_graph(train_embedding, test_graph_filtered)
        )

        logging.info("Using the model to predict the non-existence of negative edges")
        neg_pred = model.predict(
            extract_embeddings_for_graph(train_embedding, negative_test_graph)
        )

        testing_set_size = len(pos_pred) + len(neg_pred)

        # Balanced accuracy on the positive and on the negative test edges.
        if binary:
            pos_score = balanced_accuracy_score([1] * len(pos_pred), pos_pred)
            neg_score = balanced_accuracy_score([0] * len(neg_pred), neg_pred)
        else:
            positive_test_edge_types = get_edge_type_ids_list(test_graph_filtered)
            negative_test_edge_types = [-1] * len(neg_pred)
            pos_score = balanced_accuracy_score(positive_test_edge_types, pos_pred)
            neg_score = balanced_accuracy_score(negative_test_edge_types, neg_pred)
            overall_score = balanced_accuracy_score(
                positive_test_edge_types + negative_test_edge_types,
                np.concatenate([pos_pred, neg_pred]),
            )
            logging.info(f"Overall balanced accuracy score: {overall_score}")
        logging.info(f"Balanced accuracy positive score: {pos_score}")
        logging.info(f"Balanced accuracy negative score: {neg_score}")
        avg_score = (pos_score + neg_score) / 2
        logging.info(f"Balanced accuracy mean score: {avg_score}")

        if hasattr(model, "get_depth"):
            logging.info(f"Tree depth: {model.get_depth()}")

        model_name = model.__class__.__module__ + "." + model.__class__.__name__

        results.append(
            (
                graph.get_name(),
                "",  # embedder name placeholder
                model_name,
                pair_to_predict[0],
                pair_to_predict[1],
                train_on_filtered,
                training_set_size,
                testing_set_size,
                train_score,
                pos_score,
                neg_score,
                avg_score,
            )
        )

    # With zero holdouts no model was trained: there is nothing to save, and
    # model_name was never set, so return the empty result list right away.
    if not results:
        logging.warning("No holdouts were run: no model was trained or saved")
        return results

    # Persist the model fitted on the last holdout.
    import joblib
    model_path = f"{graph.get_name()}_{model_name}.pkl"
    joblib.dump(model, model_path)
    logging.info(f"{model_path} saved")

    return results

# Specific Edge Prediction

In [None]:
from grape.embedders import TransEEnsmallen

# Fit a TransE embedding (fixed random_state for reproducibility) on the
# directed view; the result is reused by every prediction section below.
embedder_transE = TransEEnsmallen(random_state=42)
graph_embedding_transe = embedder_transE.fit_transform(view_directed)

## Random Forest - Multi-class

In [None]:
# Print a timestamp marking when this run started (the message is in Italian:
# "The current date and time are:").
# assumes `datetime` is the class imported via `from datetime import datetime`
# in an earlier cell — TODO confirm
data_ora_corrente = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print("La data e l'ora correnti sono:", data_ora_corrente)

In [None]:
# Load the tab-separated test edges; later cells join on its
# subject / predicate / object columns.
test_edges_df = pd.read_csv("test_edges.tsv", sep="\t")

# miRNA-Gene

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seed shared by the classifier below and by the evaluation call.
seed = 42

# Random forest with default hyper-parameters, fitted with 6 parallel jobs.
model_forest = RandomForestClassifier(random_state=seed, n_jobs=6)
# Node-type pairs evaluated in this section; exactly one pair is active here,
# the commented-out ones are enabled in their own sections of the notebook.
pairs_to_predict = [
    ("miRNA", "Gene"),
    #("miRNA", "Disease"),
    #("miRNA", "Phenotype"),
    #("Gene", "Disease"),
    #("Gene", "Phenotype"),
]


In [None]:
# Switch the root logger to DEBUG so the logging.info messages emitted by
# edge_pred_pairs_sklearn become visible.
# NOTE: logging.basicConfig was already called in the first cell of the
# notebook; when that cell has run, this second call does nothing (no
# force=True) and only the setLevel call below changes the verbosity.
logging.basicConfig(level=logging.DEBUG) 
logging.getLogger().setLevel(logging.DEBUG)

# Train and evaluate the random forest on the TransE embedding for the
# selected node-type pair, with a single holdout.
# NOTE(review): binary=False presumably selects the multi-class (edge type)
# targets, as the section heading suggests — confirm against the function.
results_fun_transE_forest = edge_pred_pairs_sklearn(
    view_directed,
    graph_embedding_transe,
    model_forest,
    pairs_to_predict,
    seed=seed,
    clear_output=True,
    use_scale_free_distribution=True,
    verbose=True,
    number_of_holdouts=1,
    binary=False,
)

In [None]:
# edge_pred_pairs_sklearn returns its `results` list of per-holdout tuples,
# and a list has no .groupby: build the table first. The guard keeps the cell
# re-runnable and leaves an already tabular result untouched.
# Column order follows the tuple appended inside edge_pred_pairs_sklearn.
if not isinstance(results_fun_transE_forest, pd.DataFrame):
    results_fun_transE_forest = pd.DataFrame(
        results_fun_transE_forest,
        columns=[
            "Graph",
            "Embedder",
            "Model",
            "Source Type",
            "Destination Type",
            "Train on filtered",
            "Training set size",
            "Testing set size",
            "Train score",
            "Positive balanced accuracy",
            "Negative balanced accuracy",
            "Mean balanced accuracy",
        ],
    )

# Mean and standard deviation of the balanced accuracies per node-type pair
# (the standard deviation is NaN when there is a single holdout).
results_fun_transE_forest.groupby(["Source Type", "Destination Type"])[
    [
        "Positive balanced accuracy",
        "Negative balanced accuracy",
        "Mean balanced accuracy",
    ]
].agg(["mean", "std"])

In [None]:
import joblib
import numpy as np
from tqdm import tqdm

def predict_on_graph(graph, embedder, model_path, include_edge_types=True, batch_size=65536):
    """
    Score every directed edge of a graph with a previously saved sklearn model.

    One feature vector is built per edge by concatenating, in this order, the
    subject node embedding, the edge type embedding (only when
    ``include_edge_types`` is true and an embedding exists for the predicate)
    and the object node embedding. Edges are sent to the model in batches
    rather than one at a time.

    Parameters
    ----------
    graph : grape.Graph
        The graph whose directed edges are scored.
    embedder : grape.EmbeddingResult or compatible embedder
        Object providing ``get_all_node_embedding()`` and, optionally,
        ``get_all_edge_type_embeddings()``. The first element of each result is
        used and is looked up by node name / edge type name.
    model_path : str
        Path to the pre-trained sklearn model saved with joblib.
    include_edge_types : bool, optional
        Whether to include edge type embeddings in the concatenated input, by default True.
    batch_size : int, optional
        Maximum number of edges passed to the model per call, by default 65536.

    Returns
    -------
    predictions : np.ndarray
        One entry per edge: column 1 of ``predict_proba`` when the model
        provides it, otherwise the output of ``predict``.
    edge_ids : list
        List of edge (subject, predicate, object) identifiers corresponding to each prediction.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # joblib.load unpickles the file, so model_path must point to a trusted file.
    model = joblib.load(model_path)

    node_embeddings = embedder.get_all_node_embedding()[0]

    # None (rather than an empty dict, which has no `.index`) marks the absence
    # of edge type embeddings, so the membership test below stays valid.
    edge_type_embeddings = None
    if include_edge_types:
        try:
            edge_type_embeddings = embedder.get_all_edge_type_embeddings()[0]
        except Exception as error:
            logging.warning(
                "Edge type embeddings unavailable, using node embeddings only: %s", error
            )

    edge_node_names = graph.get_edge_node_names(directed=True)
    number_of_edges = graph.get_number_of_directed_edges()
    has_proba = hasattr(model, "predict_proba")

    score_chunks = []
    edge_ids = []
    with tqdm(total=number_of_edges, desc="Predicting edges") as progress:
        for start in range(0, number_of_edges, batch_size):
            stop = min(start + batch_size, number_of_edges)
            features = []
            for edge_id in range(start, stop):
                subj, obj = edge_node_names[edge_id]
                predicate = graph.get_edge_type_name_from_edge_id(edge_id)

                parts = [node_embeddings.loc[subj].values]
                if edge_type_embeddings is not None and predicate in edge_type_embeddings.index:
                    parts.append(edge_type_embeddings.loc[predicate].values)
                parts.append(node_embeddings.loc[obj].values)

                features.append(np.concatenate(parts))
                edge_ids.append((subj, predicate, obj))

            # One model call per batch: per-edge calls pay the estimator's
            # fixed overhead for every single edge.
            batch = np.stack(features)
            if has_proba:
                # NOTE(review): column 1 is the second entry of model.classes_;
                # for a model trained with binary=False confirm that this is
                # the class whose probability is wanted.
                score_chunks.append(model.predict_proba(batch)[:, 1])
            else:
                score_chunks.append(model.predict(batch))
            progress.update(stop - start)

    predictions = np.concatenate(score_chunks) if score_chunks else np.array([])
    return predictions, edge_ids


In [None]:
# Attach the node type of the subject ("type_x") and of the object ("type_y")
# to every test edge.
test_edges_df_ = (
    test_edges_df
    .merge(nodes_df[['name', 'type']], left_on='subject', right_on='name')
    .drop(columns=['name'])
    .merge(nodes_df[['name', 'type']], left_on='object', right_on='name')
    .drop(columns=['name'])
)

# Select the edges linking one of the requested type pairs, in either direction.
mask = False
for type1, type2 in pairs_to_predict:
    forward = (test_edges_df_['type_x'] == type1) & (test_edges_df_['type_y'] == type2)
    backward = (test_edges_df_['type_x'] == type2) & (test_edges_df_['type_y'] == type1)
    mask |= forward | backward

test_edges_df_filtered = (
    test_edges_df_[mask][['subject', 'predicate', 'object']]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [None]:
# Build a directed graph containing only the filtered test edges; the full
# node table is passed so node names and types match the training view.
test_view = Graph.from_pd(
    edges_df=test_edges_df_filtered,
    nodes_df=nodes_df,
    node_name_column="name",
    node_type_column="type",
    edge_src_column="subject",
    edge_dst_column="object",
    edge_type_column="predicate",
    directed=True,
    name="RNA-KG VIEW_properties test set",
)

# Score the test edges with the model saved by edge_pred_pairs_sklearn, which
# writes "<graph name>_<model module>.<model class>.pkl".
# NOTE(review): the hard-coded path presumably matches
# view_directed.get_name() == "RNA-KG VIEW_properties" — confirm.
predictions, edge_triples = predict_on_graph(
    graph=test_view,
    embedder=graph_embedding_transe,
    model_path="RNA-KG VIEW_properties_sklearn.ensemble._forest.RandomForestClassifier.pkl",
    include_edge_types=True
)

In [None]:
# One row per scored edge: the triple plus its prediction, persisted to CSV.
df = pd.DataFrame(
    edge_triples, columns=["subject", "predicate", "object"]
).assign(prediction=predictions)
df.to_csv("predictions_mirnagene.csv", index=False)

In [None]:
import pandas as pd
# Keep only the columns written by the prediction step before merging: the
# enriched table is saved back to the same file, so without this selection a
# re-run of the cell would merge twice and produce "_x"/"_y" duplicates.
mirnagene = pd.read_csv("predictions_mirnagene.csv")[
    ["subject", "predicate", "object", "prediction"]
]
# Attach the remaining test-edge columns (e.g. `year`, used by the plots below).
mirnagene = mirnagene.merge(test_edges_df, on=['subject', 'predicate', 'object'], how='left')
mirnagene.to_csv("predictions_mirnagene.csv", index=False)
mirnagene

In [None]:
# Count and fraction of predictions at or above / below the decision threshold.
predictions = mirnagene['prediction'].to_numpy()
threshold = 0.5
at_or_above = predictions >= threshold
total = len(predictions)
(
    at_or_above.sum(),
    total,
    len(predictions[at_or_above]) / total,
    len(predictions[predictions < threshold]) / total,
)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Distribution of the prediction scores for each value of the `year` column.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=mirnagene, x='year', y='prediction', ax=ax)
ax.set_title('Distribuzione delle Predizioni per Anno')
ax.set_xlabel('Anno')
ax.set_ylabel('Valori Predetti')
ax.tick_params(axis='x', labelrotation=45)
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Drop the edges whose predicted score is exactly 0; the cells below recompute
# the threshold statistics and the box plot on the remaining rows.
mirnagene = mirnagene[mirnagene['prediction'] != 0]

In [None]:
# Same threshold statistics as before, now on the non-zero predictions only.
predictions = mirnagene['prediction'].to_numpy()
threshold = 0.5
at_or_above = predictions >= threshold
total = len(predictions)
(
    at_or_above.sum(),
    total,
    len(predictions[at_or_above]) / total,
    len(predictions[predictions < threshold]) / total,
)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Per-year distribution of the prediction scores after removing the zeros.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=mirnagene, x='year', y='prediction', ax=ax)
ax.set_title('Distribuzione delle Predizioni per Anno')
ax.set_xlabel('Anno')
ax.set_ylabel('Valori Predetti')
ax.tick_params(axis='x', labelrotation=45)
ax.grid(True)
fig.tight_layout()
plt.show()

# miRNA-Disease

In [None]:
from sklearn.ensemble import RandomForestClassifier

seed = 42

model_forest = RandomForestClassifier(random_state=seed, n_jobs=6)
pairs_to_predict = [
    #("miRNA", "Gene"),
    ("miRNA", "Disease"),
    #("miRNA", "Phenotype"),
    #("Gene", "Disease"),
    #("Gene", "Phenotype"),
]


In [None]:
logging.basicConfig(level=logging.DEBUG) 
logging.getLogger().setLevel(logging.DEBUG)

results_fun_transE_forest = edge_pred_pairs_sklearn(
    view_directed,
    graph_embedding_transe,
    model_forest,
    pairs_to_predict,
    seed=seed,
    clear_output=True,
    use_scale_free_distribution=True,
    verbose=True,
    number_of_holdouts=1,
    binary=False,
)

In [None]:
# edge_pred_pairs_sklearn returns its `results` list of per-holdout tuples,
# and a list has no .groupby: build the table first. The guard keeps the cell
# re-runnable and leaves an already tabular result untouched.
# Column order follows the tuple appended inside edge_pred_pairs_sklearn.
if not isinstance(results_fun_transE_forest, pd.DataFrame):
    results_fun_transE_forest = pd.DataFrame(
        results_fun_transE_forest,
        columns=[
            "Graph",
            "Embedder",
            "Model",
            "Source Type",
            "Destination Type",
            "Train on filtered",
            "Training set size",
            "Testing set size",
            "Train score",
            "Positive balanced accuracy",
            "Negative balanced accuracy",
            "Mean balanced accuracy",
        ],
    )

# Mean and standard deviation of the balanced accuracies per node-type pair
# (the standard deviation is NaN when there is a single holdout).
results_fun_transE_forest.groupby(["Source Type", "Destination Type"])[
    [
        "Positive balanced accuracy",
        "Negative balanced accuracy",
        "Mean balanced accuracy",
    ]
].agg(["mean", "std"])

In [None]:
import joblib
import numpy as np
from tqdm import tqdm

def predict_on_graph(graph, embedder, model_path, include_edge_types=True, batch_size=65536):
    """
    Score every directed edge of a graph with a previously saved sklearn model.

    One feature vector is built per edge by concatenating, in this order, the
    subject node embedding, the edge type embedding (only when
    ``include_edge_types`` is true and an embedding exists for the predicate)
    and the object node embedding. Edges are sent to the model in batches
    rather than one at a time.

    Parameters
    ----------
    graph : grape.Graph
        The graph whose directed edges are scored.
    embedder : grape.EmbeddingResult or compatible embedder
        Object providing ``get_all_node_embedding()`` and, optionally,
        ``get_all_edge_type_embeddings()``. The first element of each result is
        used and is looked up by node name / edge type name.
    model_path : str
        Path to the pre-trained sklearn model saved with joblib.
    include_edge_types : bool, optional
        Whether to include edge type embeddings in the concatenated input, by default True.
    batch_size : int, optional
        Maximum number of edges passed to the model per call, by default 65536.

    Returns
    -------
    predictions : np.ndarray
        One entry per edge: column 1 of ``predict_proba`` when the model
        provides it, otherwise the output of ``predict``.
    edge_ids : list
        List of edge (subject, predicate, object) identifiers corresponding to each prediction.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # joblib.load unpickles the file, so model_path must point to a trusted file.
    model = joblib.load(model_path)

    node_embeddings = embedder.get_all_node_embedding()[0]

    # None (rather than an empty dict, which has no `.index`) marks the absence
    # of edge type embeddings, so the membership test below stays valid.
    edge_type_embeddings = None
    if include_edge_types:
        try:
            edge_type_embeddings = embedder.get_all_edge_type_embeddings()[0]
        except Exception as error:
            logging.warning(
                "Edge type embeddings unavailable, using node embeddings only: %s", error
            )

    edge_node_names = graph.get_edge_node_names(directed=True)
    number_of_edges = graph.get_number_of_directed_edges()
    has_proba = hasattr(model, "predict_proba")

    score_chunks = []
    edge_ids = []
    with tqdm(total=number_of_edges, desc="Predicting edges") as progress:
        for start in range(0, number_of_edges, batch_size):
            stop = min(start + batch_size, number_of_edges)
            features = []
            for edge_id in range(start, stop):
                subj, obj = edge_node_names[edge_id]
                predicate = graph.get_edge_type_name_from_edge_id(edge_id)

                parts = [node_embeddings.loc[subj].values]
                if edge_type_embeddings is not None and predicate in edge_type_embeddings.index:
                    parts.append(edge_type_embeddings.loc[predicate].values)
                parts.append(node_embeddings.loc[obj].values)

                features.append(np.concatenate(parts))
                edge_ids.append((subj, predicate, obj))

            # One model call per batch: per-edge calls pay the estimator's
            # fixed overhead for every single edge.
            batch = np.stack(features)
            if has_proba:
                # NOTE(review): column 1 is the second entry of model.classes_;
                # for a model trained with binary=False confirm that this is
                # the class whose probability is wanted.
                score_chunks.append(model.predict_proba(batch)[:, 1])
            else:
                score_chunks.append(model.predict(batch))
            progress.update(stop - start)

    predictions = np.concatenate(score_chunks) if score_chunks else np.array([])
    return predictions, edge_ids


In [None]:
test_edges_df_ = test_edges_df.merge(
    nodes_df[['name', 'type']], left_on='subject', right_on='name'
).drop(columns=['name']).merge(
    nodes_df[['name', 'type']], left_on='object', right_on='name'
).drop(columns=['name'])

mask = False
for type1, type2 in pairs_to_predict:
    mask |= ((test_edges_df_['type_x'] == type1) & (test_edges_df_['type_y'] == type2)) | \
            ((test_edges_df_['type_x'] == type2) & (test_edges_df_['type_y'] == type1))

test_edges_df_filtered = test_edges_df_[mask][['subject', 'predicate', 'object']].drop_duplicates().reset_index(drop=True)

In [None]:
test_view = Graph.from_pd(
    edges_df=test_edges_df_filtered,
    nodes_df=nodes_df,
    node_name_column="name",
    node_type_column="type",
    edge_src_column="subject",
    edge_dst_column="object",
    edge_type_column="predicate",
    directed=True,
    name="RNA-KG VIEW_properties test set",
)

predictions, edge_triples = predict_on_graph(
    graph=test_view,
    embedder=graph_embedding_transe,
    model_path="RNA-KG VIEW_properties_sklearn.ensemble._forest.RandomForestClassifier.pkl",
    include_edge_types=True
)

In [None]:
df = pd.DataFrame(edge_triples, columns=["subject", "predicate", "object"])
df["prediction"] = predictions
df.to_csv("predictions_mirnadisease.csv", index=False)

In [None]:
import pandas as pd
# Keep only the columns written by the prediction step before merging: the
# enriched table is saved back to the same file, so without this selection a
# re-run of the cell would merge twice and produce "_x"/"_y" duplicates.
mirnadisease = pd.read_csv("predictions_mirnadisease.csv")[
    ["subject", "predicate", "object", "prediction"]
]
# Attach the remaining test-edge columns (e.g. `year`, used by the plots below).
mirnadisease = mirnadisease.merge(test_edges_df, on=['subject', 'predicate', 'object'], how='left')
mirnadisease.to_csv("predictions_mirnadisease.csv", index=False)
mirnadisease

In [None]:
predictions = mirnadisease['prediction'].to_numpy()
threshold = 0.5
(predictions>=threshold).sum(), len(predictions), \
len(predictions[predictions>=threshold])/len(predictions), len(predictions[predictions<threshold])/len(predictions)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Distribution of the prediction scores for each value of the `year` column,
# with the same title and axis labels as the miRNA-Gene figures.
plt.figure(figsize=(12, 6))
sns.boxplot(data=mirnadisease, x='year', y='prediction')
plt.title('Distribuzione delle Predizioni per Anno')
plt.xlabel('Anno')
plt.ylabel('Valori Predetti')
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
mirnadisease = mirnadisease[mirnadisease['prediction'] != 0]

In [None]:
predictions = mirnadisease['prediction'].to_numpy()
threshold = 0.5
(predictions>=threshold).sum(), len(predictions), \
len(predictions[predictions>=threshold])/len(predictions), len(predictions[predictions<threshold])/len(predictions)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Per-year distribution of the non-zero prediction scores, with the same
# title and axis labels as the miRNA-Gene figures.
plt.figure(figsize=(12, 6))
sns.boxplot(data=mirnadisease, x='year', y='prediction')
plt.title('Distribuzione delle Predizioni per Anno')
plt.xlabel('Anno')
plt.ylabel('Valori Predetti')
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()
plt.show()

# miRNA-Phenotype

In [None]:
from sklearn.ensemble import RandomForestClassifier

seed = 42

model_forest = RandomForestClassifier(random_state=seed, n_jobs=6)
pairs_to_predict = [
    #("miRNA", "Gene"),
    #("miRNA", "Disease"),
    ("miRNA", "Phenotype"),
    #("Gene", "Disease"),
    #("Gene", "Phenotype"),
]


In [None]:
logging.basicConfig(level=logging.DEBUG) 
logging.getLogger().setLevel(logging.DEBUG)

results_fun_transE_forest = edge_pred_pairs_sklearn(
    view_directed,
    graph_embedding_transe,
    model_forest,
    pairs_to_predict,
    seed=seed,
    clear_output=True,
    use_scale_free_distribution=True,
    verbose=True,
    number_of_holdouts=1,
    binary=False,
)

In [None]:
# edge_pred_pairs_sklearn returns its `results` list of per-holdout tuples,
# and a list has no .groupby: build the table first. The guard keeps the cell
# re-runnable and leaves an already tabular result untouched.
# Column order follows the tuple appended inside edge_pred_pairs_sklearn.
if not isinstance(results_fun_transE_forest, pd.DataFrame):
    results_fun_transE_forest = pd.DataFrame(
        results_fun_transE_forest,
        columns=[
            "Graph",
            "Embedder",
            "Model",
            "Source Type",
            "Destination Type",
            "Train on filtered",
            "Training set size",
            "Testing set size",
            "Train score",
            "Positive balanced accuracy",
            "Negative balanced accuracy",
            "Mean balanced accuracy",
        ],
    )

# Mean and standard deviation of the balanced accuracies per node-type pair
# (the standard deviation is NaN when there is a single holdout).
results_fun_transE_forest.groupby(["Source Type", "Destination Type"])[
    [
        "Positive balanced accuracy",
        "Negative balanced accuracy",
        "Mean balanced accuracy",
    ]
].agg(["mean", "std"])

In [None]:
import joblib
import numpy as np
from tqdm import tqdm

def predict_on_graph(graph, embedder, model_path, include_edge_types=True, batch_size=65536):
    """
    Score every directed edge of a graph with a previously saved sklearn model.

    One feature vector is built per edge by concatenating, in this order, the
    subject node embedding, the edge type embedding (only when
    ``include_edge_types`` is true and an embedding exists for the predicate)
    and the object node embedding. Edges are sent to the model in batches
    rather than one at a time.

    Parameters
    ----------
    graph : grape.Graph
        The graph whose directed edges are scored.
    embedder : grape.EmbeddingResult or compatible embedder
        Object providing ``get_all_node_embedding()`` and, optionally,
        ``get_all_edge_type_embeddings()``. The first element of each result is
        used and is looked up by node name / edge type name.
    model_path : str
        Path to the pre-trained sklearn model saved with joblib.
    include_edge_types : bool, optional
        Whether to include edge type embeddings in the concatenated input, by default True.
    batch_size : int, optional
        Maximum number of edges passed to the model per call, by default 65536.

    Returns
    -------
    predictions : np.ndarray
        One entry per edge: column 1 of ``predict_proba`` when the model
        provides it, otherwise the output of ``predict``.
    edge_ids : list
        List of edge (subject, predicate, object) identifiers corresponding to each prediction.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # joblib.load unpickles the file, so model_path must point to a trusted file.
    model = joblib.load(model_path)

    node_embeddings = embedder.get_all_node_embedding()[0]

    # None (rather than an empty dict, which has no `.index`) marks the absence
    # of edge type embeddings, so the membership test below stays valid.
    edge_type_embeddings = None
    if include_edge_types:
        try:
            edge_type_embeddings = embedder.get_all_edge_type_embeddings()[0]
        except Exception as error:
            logging.warning(
                "Edge type embeddings unavailable, using node embeddings only: %s", error
            )

    edge_node_names = graph.get_edge_node_names(directed=True)
    number_of_edges = graph.get_number_of_directed_edges()
    has_proba = hasattr(model, "predict_proba")

    score_chunks = []
    edge_ids = []
    with tqdm(total=number_of_edges, desc="Predicting edges") as progress:
        for start in range(0, number_of_edges, batch_size):
            stop = min(start + batch_size, number_of_edges)
            features = []
            for edge_id in range(start, stop):
                subj, obj = edge_node_names[edge_id]
                predicate = graph.get_edge_type_name_from_edge_id(edge_id)

                parts = [node_embeddings.loc[subj].values]
                if edge_type_embeddings is not None and predicate in edge_type_embeddings.index:
                    parts.append(edge_type_embeddings.loc[predicate].values)
                parts.append(node_embeddings.loc[obj].values)

                features.append(np.concatenate(parts))
                edge_ids.append((subj, predicate, obj))

            # One model call per batch: per-edge calls pay the estimator's
            # fixed overhead for every single edge.
            batch = np.stack(features)
            if has_proba:
                # NOTE(review): column 1 is the second entry of model.classes_;
                # for a model trained with binary=False confirm that this is
                # the class whose probability is wanted.
                score_chunks.append(model.predict_proba(batch)[:, 1])
            else:
                score_chunks.append(model.predict(batch))
            progress.update(stop - start)

    predictions = np.concatenate(score_chunks) if score_chunks else np.array([])
    return predictions, edge_ids


In [None]:
test_edges_df_ = test_edges_df.merge(
    nodes_df[['name', 'type']], left_on='subject', right_on='name'
).drop(columns=['name']).merge(
    nodes_df[['name', 'type']], left_on='object', right_on='name'
).drop(columns=['name'])

mask = False
for type1, type2 in pairs_to_predict:
    mask |= ((test_edges_df_['type_x'] == type1) & (test_edges_df_['type_y'] == type2)) | \
            ((test_edges_df_['type_x'] == type2) & (test_edges_df_['type_y'] == type1))

test_edges_df_filtered = test_edges_df_[mask][['subject', 'predicate', 'object']].drop_duplicates().reset_index(drop=True)

In [None]:
test_view = Graph.from_pd(
    edges_df=test_edges_df_filtered,
    nodes_df=nodes_df,
    node_name_column="name",
    node_type_column="type",
    edge_src_column="subject",
    edge_dst_column="object",
    edge_type_column="predicate",
    directed=True,
    name="RNA-KG VIEW_properties test set",
)

predictions, edge_triples = predict_on_graph(
    graph=test_view,
    embedder=graph_embedding_transe,
    model_path="RNA-KG VIEW_properties_sklearn.ensemble._forest.RandomForestClassifier.pkl",
    include_edge_types=True
)

In [None]:
df = pd.DataFrame(edge_triples, columns=["subject", "predicate", "object"])
df["prediction"] = predictions
df.to_csv("predictions_mirnaphenotype.csv", index=False)

In [None]:
import pandas as pd
# Keep only the columns written by the prediction step before merging: the
# enriched table is saved back to the same file, so without this selection a
# re-run of the cell would merge twice and produce "_x"/"_y" duplicates.
mirnaphenotype = pd.read_csv("predictions_mirnaphenotype.csv")[
    ["subject", "predicate", "object", "prediction"]
]
# Attach the remaining test-edge columns (e.g. `year`, used by the plots below).
mirnaphenotype = mirnaphenotype.merge(test_edges_df, on=['subject', 'predicate', 'object'], how='left')
mirnaphenotype.to_csv("predictions_mirnaphenotype.csv", index=False)
mirnaphenotype

In [None]:
predictions = mirnaphenotype['prediction'].to_numpy()
threshold = 0.5
(predictions>=threshold).sum(), len(predictions), \
len(predictions[predictions>=threshold])/len(predictions), len(predictions[predictions<threshold])/len(predictions)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Distribution of the prediction scores for each value of the `year` column,
# with the same title and axis labels as the miRNA-Gene figures.
plt.figure(figsize=(12, 6))
sns.boxplot(data=mirnaphenotype, x='year', y='prediction')
plt.title('Distribuzione delle Predizioni per Anno')
plt.xlabel('Anno')
plt.ylabel('Valori Predetti')
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
mirnaphenotype = mirnaphenotype[mirnaphenotype['prediction'] != 0]

In [None]:
predictions = mirnaphenotype['prediction'].to_numpy()
threshold = 0.5
(predictions>=threshold).sum(), len(predictions), \
len(predictions[predictions>=threshold])/len(predictions), len(predictions[predictions<threshold])/len(predictions)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Per-year distribution of the non-zero prediction scores, with the same
# title and axis labels as the miRNA-Gene figures.
plt.figure(figsize=(12, 6))
sns.boxplot(data=mirnaphenotype, x='year', y='prediction')
plt.title('Distribuzione delle Predizioni per Anno')
plt.xlabel('Anno')
plt.ylabel('Valori Predetti')
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()
plt.show()

# Gene-Disease

In [None]:
from sklearn.ensemble import RandomForestClassifier

seed = 42

model_forest = RandomForestClassifier(random_state=seed, n_jobs=6)
pairs_to_predict = [
    #("miRNA", "Gene"),
    #("miRNA", "Disease"),
    #("miRNA", "Phenotype"),
    ("Gene", "Disease"),
    #("Gene", "Phenotype"),
]


In [None]:
logging.basicConfig(level=logging.DEBUG) 
logging.getLogger().setLevel(logging.DEBUG)

results_fun_transE_forest = edge_pred_pairs_sklearn(
    view_directed,
    graph_embedding_transe,
    model_forest,
    pairs_to_predict,
    seed=seed,
    clear_output=True,
    use_scale_free_distribution=True,
    verbose=True,
    number_of_holdouts=1,
    binary=False,
)

In [None]:
# edge_pred_pairs_sklearn returns its `results` list of per-holdout tuples,
# and a list has no .groupby: build the table first. The guard keeps the cell
# re-runnable and leaves an already tabular result untouched.
# Column order follows the tuple appended inside edge_pred_pairs_sklearn.
if not isinstance(results_fun_transE_forest, pd.DataFrame):
    results_fun_transE_forest = pd.DataFrame(
        results_fun_transE_forest,
        columns=[
            "Graph",
            "Embedder",
            "Model",
            "Source Type",
            "Destination Type",
            "Train on filtered",
            "Training set size",
            "Testing set size",
            "Train score",
            "Positive balanced accuracy",
            "Negative balanced accuracy",
            "Mean balanced accuracy",
        ],
    )

# Mean and standard deviation of the balanced accuracies per node-type pair
# (the standard deviation is NaN when there is a single holdout).
results_fun_transE_forest.groupby(["Source Type", "Destination Type"])[
    [
        "Positive balanced accuracy",
        "Negative balanced accuracy",
        "Mean balanced accuracy",
    ]
].agg(["mean", "std"])

In [None]:
import joblib
import numpy as np
from tqdm import tqdm

def predict_on_graph(graph, embedder, model_path, include_edge_types=True, batch_size=65536):
    """
    Score every directed edge of a graph with a previously saved sklearn model.

    One feature vector is built per edge by concatenating, in this order, the
    subject node embedding, the edge type embedding (only when
    ``include_edge_types`` is true and an embedding exists for the predicate)
    and the object node embedding. Edges are sent to the model in batches
    rather than one at a time.

    Parameters
    ----------
    graph : grape.Graph
        The graph whose directed edges are scored.
    embedder : grape.EmbeddingResult or compatible embedder
        Object providing ``get_all_node_embedding()`` and, optionally,
        ``get_all_edge_type_embeddings()``. The first element of each result is
        used and is looked up by node name / edge type name.
    model_path : str
        Path to the pre-trained sklearn model saved with joblib.
    include_edge_types : bool, optional
        Whether to include edge type embeddings in the concatenated input, by default True.
    batch_size : int, optional
        Maximum number of edges passed to the model per call, by default 65536.

    Returns
    -------
    predictions : np.ndarray
        One entry per edge: column 1 of ``predict_proba`` when the model
        provides it, otherwise the output of ``predict``.
    edge_ids : list
        List of edge (subject, predicate, object) identifiers corresponding to each prediction.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # joblib.load unpickles the file, so model_path must point to a trusted file.
    model = joblib.load(model_path)

    node_embeddings = embedder.get_all_node_embedding()[0]

    # None (rather than an empty dict, which has no `.index`) marks the absence
    # of edge type embeddings, so the membership test below stays valid.
    edge_type_embeddings = None
    if include_edge_types:
        try:
            edge_type_embeddings = embedder.get_all_edge_type_embeddings()[0]
        except Exception as error:
            logging.warning(
                "Edge type embeddings unavailable, using node embeddings only: %s", error
            )

    edge_node_names = graph.get_edge_node_names(directed=True)
    number_of_edges = graph.get_number_of_directed_edges()
    has_proba = hasattr(model, "predict_proba")

    score_chunks = []
    edge_ids = []
    with tqdm(total=number_of_edges, desc="Predicting edges") as progress:
        for start in range(0, number_of_edges, batch_size):
            stop = min(start + batch_size, number_of_edges)
            features = []
            for edge_id in range(start, stop):
                subj, obj = edge_node_names[edge_id]
                predicate = graph.get_edge_type_name_from_edge_id(edge_id)

                parts = [node_embeddings.loc[subj].values]
                if edge_type_embeddings is not None and predicate in edge_type_embeddings.index:
                    parts.append(edge_type_embeddings.loc[predicate].values)
                parts.append(node_embeddings.loc[obj].values)

                features.append(np.concatenate(parts))
                edge_ids.append((subj, predicate, obj))

            # One model call per batch: per-edge calls pay the estimator's
            # fixed overhead for every single edge.
            batch = np.stack(features)
            if has_proba:
                # NOTE(review): column 1 is the second entry of model.classes_;
                # for a model trained with binary=False confirm that this is
                # the class whose probability is wanted.
                score_chunks.append(model.predict_proba(batch)[:, 1])
            else:
                score_chunks.append(model.predict(batch))
            progress.update(stop - start)

    predictions = np.concatenate(score_chunks) if score_chunks else np.array([])
    return predictions, edge_ids


In [None]:
test_edges_df_ = test_edges_df.merge(
    nodes_df[['name', 'type']], left_on='subject', right_on='name'
).drop(columns=['name']).merge(
    nodes_df[['name', 'type']], left_on='object', right_on='name'
).drop(columns=['name'])

mask = False
for type1, type2 in pairs_to_predict:
    mask |= ((test_edges_df_['type_x'] == type1) & (test_edges_df_['type_y'] == type2)) | \
            ((test_edges_df_['type_x'] == type2) & (test_edges_df_['type_y'] == type1))

test_edges_df_filtered = test_edges_df_[mask][['subject', 'predicate', 'object']].drop_duplicates().reset_index(drop=True)

In [None]:
test_view = Graph.from_pd(
    edges_df=test_edges_df_filtered,
    nodes_df=nodes_df,
    node_name_column="name",
    node_type_column="type",
    edge_src_column="subject",
    edge_dst_column="object",
    edge_type_column="predicate",
    directed=True,
    name="RNA-KG VIEW_properties test set",
)

predictions, edge_triples = predict_on_graph(
    graph=test_view,
    embedder=graph_embedding_transe,
    model_path="RNA-KG VIEW_properties_sklearn.ensemble._forest.RandomForestClassifier.pkl",
    include_edge_types=True
)

In [None]:
df = pd.DataFrame(edge_triples, columns=["subject", "predicate", "object"])
df["prediction"] = predictions
df.to_csv("predictions_genedisease.csv", index=False)

In [None]:
import pandas as pd
# Keep only the columns written by the prediction step before merging: the
# enriched table is saved back to the same file, so without this selection a
# re-run of the cell would merge twice and produce "_x"/"_y" duplicates.
genedisease = pd.read_csv("predictions_genedisease.csv")[
    ["subject", "predicate", "object", "prediction"]
]
# Attach the remaining test-edge columns (e.g. `year`, used by the plots below).
genedisease = genedisease.merge(test_edges_df, on=['subject', 'predicate', 'object'], how='left')
genedisease.to_csv("predictions_genedisease.csv", index=False)
genedisease

In [None]:
predictions = genedisease['prediction'].to_numpy()
threshold = 0.5
(predictions>=threshold).sum(), len(predictions), \
len(predictions[predictions>=threshold])/len(predictions), len(predictions[predictions<threshold])/len(predictions)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Distribution of the prediction scores for each value of the `year` column,
# with the same title and axis labels as the plot drawn after the zero filter.
plt.figure(figsize=(12, 6))
sns.boxplot(data=genedisease, x='year', y='prediction')
plt.title('Distribuzione delle Predizioni per Anno')
plt.xlabel('Anno')
plt.ylabel('Valori Predetti')
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
genedisease = genedisease[genedisease['prediction'] != 0]

In [None]:
predictions = genedisease['prediction'].to_numpy()
threshold = 0.5
(predictions>=threshold).sum(), len(predictions), \
len(predictions[predictions>=threshold])/len(predictions), len(predictions[predictions<threshold])/len(predictions)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(12, 6))
sns.boxplot(data=genedisease, x='year', y='prediction')
plt.title('Distribuzione delle Predizioni per Anno')
plt.xlabel('Anno')
plt.ylabel('Valori Predetti')
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()
plt.show()

# Gene-Phenotype

In [None]:
from sklearn.ensemble import RandomForestClassifier

seed = 42

model_forest = RandomForestClassifier(random_state=seed, n_jobs=6)
pairs_to_predict = [
    #("miRNA", "Gene"),
    #("miRNA", "Disease"),
    #("miRNA", "Phenotype"),
    #("Gene", "Disease"),
    ("Gene", "Phenotype"),
]


In [None]:
logging.basicConfig(level=logging.DEBUG) 
logging.getLogger().setLevel(logging.DEBUG)

results_fun_transE_forest = edge_pred_pairs_sklearn(
    view_directed,
    graph_embedding_transe,
    model_forest,
    pairs_to_predict,
    seed=seed,
    clear_output=True,
    use_scale_free_distribution=True,
    verbose=True,
    number_of_holdouts=1,
    binary=False,
)

In [None]:
# edge_pred_pairs_sklearn returns its `results` list of per-holdout tuples,
# and a list has no .groupby: build the table first. The guard keeps the cell
# re-runnable and leaves an already tabular result untouched.
# Column order follows the tuple appended inside edge_pred_pairs_sklearn.
if not isinstance(results_fun_transE_forest, pd.DataFrame):
    results_fun_transE_forest = pd.DataFrame(
        results_fun_transE_forest,
        columns=[
            "Graph",
            "Embedder",
            "Model",
            "Source Type",
            "Destination Type",
            "Train on filtered",
            "Training set size",
            "Testing set size",
            "Train score",
            "Positive balanced accuracy",
            "Negative balanced accuracy",
            "Mean balanced accuracy",
        ],
    )

# Mean and standard deviation of the balanced accuracies per node-type pair
# (the standard deviation is NaN when there is a single holdout).
results_fun_transE_forest.groupby(["Source Type", "Destination Type"])[
    [
        "Positive balanced accuracy",
        "Negative balanced accuracy",
        "Mean balanced accuracy",
    ]
].agg(["mean", "std"])

In [None]:
import joblib
import numpy as np
from tqdm import tqdm

def predict_on_graph(graph, embedder, model_path, include_edge_types=True, batch_size=65536):
    """
    Score every directed edge of a graph with a previously saved sklearn model.

    One feature vector is built per edge by concatenating, in this order, the
    subject node embedding, the edge type embedding (only when
    ``include_edge_types`` is true and an embedding exists for the predicate)
    and the object node embedding. Edges are sent to the model in batches
    rather than one at a time.

    Parameters
    ----------
    graph : grape.Graph
        The graph whose directed edges are scored.
    embedder : grape.EmbeddingResult or compatible embedder
        Object providing ``get_all_node_embedding()`` and, optionally,
        ``get_all_edge_type_embeddings()``. The first element of each result is
        used and is looked up by node name / edge type name.
    model_path : str
        Path to the pre-trained sklearn model saved with joblib.
    include_edge_types : bool, optional
        Whether to include edge type embeddings in the concatenated input, by default True.
    batch_size : int, optional
        Maximum number of edges passed to the model per call, by default 65536.

    Returns
    -------
    predictions : np.ndarray
        One entry per edge: column 1 of ``predict_proba`` when the model
        provides it, otherwise the output of ``predict``.
    edge_ids : list
        List of edge (subject, predicate, object) identifiers corresponding to each prediction.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # joblib.load unpickles the file, so model_path must point to a trusted file.
    model = joblib.load(model_path)

    node_embeddings = embedder.get_all_node_embedding()[0]

    # None (rather than an empty dict, which has no `.index`) marks the absence
    # of edge type embeddings, so the membership test below stays valid.
    edge_type_embeddings = None
    if include_edge_types:
        try:
            edge_type_embeddings = embedder.get_all_edge_type_embeddings()[0]
        except Exception as error:
            logging.warning(
                "Edge type embeddings unavailable, using node embeddings only: %s", error
            )

    edge_node_names = graph.get_edge_node_names(directed=True)
    number_of_edges = graph.get_number_of_directed_edges()
    has_proba = hasattr(model, "predict_proba")

    score_chunks = []
    edge_ids = []
    with tqdm(total=number_of_edges, desc="Predicting edges") as progress:
        for start in range(0, number_of_edges, batch_size):
            stop = min(start + batch_size, number_of_edges)
            features = []
            for edge_id in range(start, stop):
                subj, obj = edge_node_names[edge_id]
                predicate = graph.get_edge_type_name_from_edge_id(edge_id)

                parts = [node_embeddings.loc[subj].values]
                if edge_type_embeddings is not None and predicate in edge_type_embeddings.index:
                    parts.append(edge_type_embeddings.loc[predicate].values)
                parts.append(node_embeddings.loc[obj].values)

                features.append(np.concatenate(parts))
                edge_ids.append((subj, predicate, obj))

            # One model call per batch: per-edge calls pay the estimator's
            # fixed overhead for every single edge.
            batch = np.stack(features)
            if has_proba:
                # NOTE(review): column 1 is the second entry of model.classes_;
                # for a model trained with binary=False confirm that this is
                # the class whose probability is wanted.
                score_chunks.append(model.predict_proba(batch)[:, 1])
            else:
                score_chunks.append(model.predict(batch))
            progress.update(stop - start)

    predictions = np.concatenate(score_chunks) if score_chunks else np.array([])
    return predictions, edge_ids


In [None]:
test_edges_df_ = test_edges_df.merge(
    nodes_df[['name', 'type']], left_on='subject', right_on='name'
).drop(columns=['name']).merge(
    nodes_df[['name', 'type']], left_on='object', right_on='name'
).drop(columns=['name'])

mask = False
for type1, type2 in pairs_to_predict:
    mask |= ((test_edges_df_['type_x'] == type1) & (test_edges_df_['type_y'] == type2)) | \
            ((test_edges_df_['type_x'] == type2) & (test_edges_df_['type_y'] == type1))

test_edges_df_filtered = test_edges_df_[mask][['subject', 'predicate', 'object']].drop_duplicates().reset_index(drop=True)

In [None]:
test_view = Graph.from_pd(
    edges_df=test_edges_df_filtered,
    nodes_df=nodes_df,
    node_name_column="name",
    node_type_column="type",
    edge_src_column="subject",
    edge_dst_column="object",
    edge_type_column="predicate",
    directed=True,
    name="RNA-KG VIEW_properties test set",
)

predictions, edge_triples = predict_on_graph(
    graph=test_view,
    embedder=graph_embedding_transe,
    model_path="RNA-KG VIEW_properties_sklearn.ensemble._forest.RandomForestClassifier.pkl",
    include_edge_types=True
)

In [None]:
df = pd.DataFrame(edge_triples, columns=["subject", "predicate", "object"])
df["prediction"] = predictions
df.to_csv("predictions_genephenotype.csv", index=False)

In [None]:
import pandas as pd
# Keep only the columns written by the prediction step before merging: the
# enriched table is saved back to the same file, so without this selection a
# re-run of the cell would merge twice and produce "_x"/"_y" duplicates.
genephenotype = pd.read_csv("predictions_genephenotype.csv")[
    ["subject", "predicate", "object", "prediction"]
]
# Attach the remaining test-edge columns (e.g. `year`, used by the plots below).
genephenotype = genephenotype.merge(test_edges_df, on=['subject', 'predicate', 'object'], how='left')
genephenotype.to_csv("predictions_genephenotype.csv", index=False)
genephenotype

In [None]:
predictions = genephenotype['prediction'].to_numpy()
threshold = 0.5
(predictions>=threshold).sum(), len(predictions), \
len(predictions[predictions>=threshold])/len(predictions), len(predictions[predictions<threshold])/len(predictions)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Distribution of the prediction scores for each value of the `year` column,
# with the same title and axis labels as the plot drawn after the zero filter.
plt.figure(figsize=(12, 6))
sns.boxplot(data=genephenotype, x='year', y='prediction')
plt.title('Distribuzione delle Predizioni per Anno')
plt.xlabel('Anno')
plt.ylabel('Valori Predetti')
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
genephenotype = genephenotype[genephenotype['prediction'] != 0]

In [None]:
predictions = genephenotype['prediction'].to_numpy()
threshold = 0.5
(predictions>=threshold).sum(), len(predictions), \
len(predictions[predictions>=threshold])/len(predictions), len(predictions[predictions<threshold])/len(predictions)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(12, 6))
sns.boxplot(data=genephenotype, x='year', y='prediction')
plt.title('Distribuzione delle Predizioni per Anno')
plt.xlabel('Anno')
plt.ylabel('Valori Predetti')
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()
plt.show()

# Results

# TransE

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# One prediction CSV per relation type, read from the "TransE_test>=2022" folder.
files = {
    "miRNA–Gene": "TransE_test>=2022/predictions_mirnagene.csv",
    "miRNA–Disease": "TransE_test>=2022/predictions_mirnadisease.csv",
    "miRNA–Phenotype": "TransE_test>=2022/predictions_mirnaphenotype.csv",
    "Gene–Disease": "TransE_test>=2022/predictions_genedisease.csv",
    "Gene–Phenotype": "TransE_test>=2022/predictions_genephenotype.csv"
}

# Load each file, normalise the suffixed column names left by a repeated merge
# and tag the rows with their relation type.
dfs = []
for name, path in files.items():
    df = (
        pd.read_csv(path)
        .rename(columns={'year_x': 'year', 'prediction_x': 'prediction'})
        .assign(relation_type=name)
    )
    dfs.append(df)

all_data = pd.concat(dfs, ignore_index=True)

# Two views of the same table: every row, and only the rows scored above/below zero.
with_zeros = all_data.copy()
nonzero_rows = all_data["prediction"] != 0
without_zeros = all_data[nonzero_rows].copy()

def plot_box(data, title, filename):
    """Draw and save box plots of the prediction scores per relation type.

    Parameters
    ----------
    data : pd.DataFrame
        Table with the columns ``relation_type``, ``prediction`` and ``year``.
    title : str
        Title shown above the figure.
    filename : str
        Path the figure is saved to (300 dpi).
    """
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.boxplot(
        data=data,
        x="relation_type",
        y="prediction",
        hue="year",
        palette="Set2",
        showfliers=False,
        ax=ax,
    )
    # Dashed reference line at the 0.5 score.
    ax.axhline(0.5, color="darkred", linestyle="--", linewidth=1)
    ax.set_title(title)
    ax.set_ylabel("Prediction Score")
    ax.set_xlabel("")
    ax.tick_params(axis="x", labelrotation=45)
    ax.legend(title="Year", loc='upper right', fontsize=8)
    fig.tight_layout()
    fig.savefig(filename, dpi=300)
    plt.show()

# Plot both views for the "TransE_test>=2022" predictions.
# NOTE(review): the cells for the other test cut-offs save to these same two
# file names, so each run overwrites the PNGs written by the previous one.
plot_box(with_zeros, "predictions_with_zeros", "predictions_with_zeros.png")
plot_box(without_zeros, "predictions_without_zeros", "predictions_without_zeros.png")


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# One prediction CSV per relation type, read from the "TransE_test>=2023" folder.
files = {
    "miRNA–Gene": "TransE_test>=2023/predictions_mirnagene.csv",
    "miRNA–Disease": "TransE_test>=2023/predictions_mirnadisease.csv",
    "miRNA–Phenotype": "TransE_test>=2023/predictions_mirnaphenotype.csv",
    "Gene–Disease": "TransE_test>=2023/predictions_genedisease.csv",
    "Gene–Phenotype": "TransE_test>=2023/predictions_genephenotype.csv"
}

# Load each file, normalise the suffixed column names left by a repeated merge
# and tag the rows with their relation type.
dfs = []
for name, path in files.items():
    df = (
        pd.read_csv(path)
        .rename(columns={'year_x': 'year', 'prediction_x': 'prediction'})
        .assign(relation_type=name)
    )
    dfs.append(df)

all_data = pd.concat(dfs, ignore_index=True)

# Two views of the same table: every row, and only the rows scored above/below zero.
with_zeros = all_data.copy()
nonzero_rows = all_data["prediction"] != 0
without_zeros = all_data[nonzero_rows].copy()

def plot_box(data, title, filename):
    """Draw grouped box plots of prediction scores, save them and show them.

    Args:
        data: Table (e.g. a pandas DataFrame) providing the `relation_type`,
            `prediction` and `year` columns.
        title: Text placed above the axes.
        filename: Path the figure is written to at 300 dpi.
    """
    fig, ax = plt.subplots(figsize=(8, 4))
    # One group of boxes per relation type, one box per year; fliers hidden.
    sns.boxplot(
        data=data,
        x="relation_type",
        y="prediction",
        hue="year",
        palette="Set2",
        showfliers=False,
        ax=ax,
    )
    # Dashed horizontal reference line at a score of 0.5.
    ax.axhline(0.5, color="darkred", linestyle="--", linewidth=1)
    ax.set_title(title)
    ax.set_ylabel("Prediction Score")
    ax.set_xlabel("")
    plt.xticks(rotation=45)
    ax.legend(title="Year", loc='upper right', fontsize=8)
    fig.tight_layout()
    fig.savefig(filename, dpi=300)
    plt.show()

# Tag the title and the output path with the model and test cutoff so that
# figures produced for different model/cutoff combinations do not share one
# file name (and therefore do not overwrite each other on disk).
run_tag = "TransE_2023"
plot_box(with_zeros, f"predictions_with_zeros ({run_tag})", f"{run_tag}_predictions_with_zeros.png")
plot_box(without_zeros, f"predictions_without_zeros ({run_tag})", f"{run_tag}_predictions_without_zeros.png")


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Prediction CSVs, one per relation type, under the `TransE_test>=2024` folder.
files = {
    "miRNA–Gene": "TransE_test>=2024/predictions_mirnagene.csv",
    "miRNA–Disease": "TransE_test>=2024/predictions_mirnadisease.csv",
    "miRNA–Phenotype": "TransE_test>=2024/predictions_mirnaphenotype.csv",
    "Gene–Disease": "TransE_test>=2024/predictions_genedisease.csv",
    "Gene–Phenotype": "TransE_test>=2024/predictions_genephenotype.csv",
}

# Read each CSV, rename the `year_x` / `prediction_x` columns to
# `year` / `prediction`, and tag every row with its relation name.
dfs = []
for name, path in files.items():
    df = (
        pd.read_csv(path)
        .rename(columns={'year_x': 'year', 'prediction_x': 'prediction'})
        .assign(relation_type=name)
    )
    dfs.append(df)

# Stack all relation types into one frame with a fresh integer index.
all_data = pd.concat(dfs, ignore_index=True)

# Two views of the data: every row, and only rows whose score is not exactly 0.
with_zeros = all_data.copy()
without_zeros = all_data.loc[all_data["prediction"].ne(0)].copy()

def plot_box(data, title, filename):
    """Draw grouped box plots of prediction scores, save them and show them.

    Args:
        data: Table (e.g. a pandas DataFrame) providing the `relation_type`,
            `prediction` and `year` columns.
        title: Text placed above the axes.
        filename: Path the figure is written to at 300 dpi.
    """
    fig, ax = plt.subplots(figsize=(8, 4))
    # One group of boxes per relation type, one box per year; fliers hidden.
    sns.boxplot(
        data=data,
        x="relation_type",
        y="prediction",
        hue="year",
        palette="Set2",
        showfliers=False,
        ax=ax,
    )
    # Dashed horizontal reference line at a score of 0.5.
    ax.axhline(0.5, color="darkred", linestyle="--", linewidth=1)
    ax.set_title(title)
    ax.set_ylabel("Prediction Score")
    ax.set_xlabel("")
    plt.xticks(rotation=45)
    ax.legend(title="Year", loc='upper right', fontsize=8)
    fig.tight_layout()
    fig.savefig(filename, dpi=300)
    plt.show()

# Tag the title and the output path with the model and test cutoff so that
# figures produced for different model/cutoff combinations do not share one
# file name (and therefore do not overwrite each other on disk).
run_tag = "TransE_2024"
plot_box(with_zeros, f"predictions_with_zeros ({run_tag})", f"{run_tag}_predictions_with_zeros.png")
plot_box(without_zeros, f"predictions_without_zeros ({run_tag})", f"{run_tag}_predictions_without_zeros.png")


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Prediction CSVs, one per relation type, under the `TransE_test>=2025` folder.
files = {
    "miRNA–Gene": "TransE_test>=2025/predictions_mirnagene.csv",
    "miRNA–Disease": "TransE_test>=2025/predictions_mirnadisease.csv",
    "miRNA–Phenotype": "TransE_test>=2025/predictions_mirnaphenotype.csv",
    "Gene–Disease": "TransE_test>=2025/predictions_genedisease.csv",
    "Gene–Phenotype": "TransE_test>=2025/predictions_genephenotype.csv",
}

# Read each CSV, rename the `year_x` / `prediction_x` columns to
# `year` / `prediction`, and tag every row with its relation name.
dfs = []
for name, path in files.items():
    df = (
        pd.read_csv(path)
        .rename(columns={'year_x': 'year', 'prediction_x': 'prediction'})
        .assign(relation_type=name)
    )
    dfs.append(df)

# Stack all relation types into one frame with a fresh integer index.
all_data = pd.concat(dfs, ignore_index=True)

# Two views of the data: every row, and only rows whose score is not exactly 0.
with_zeros = all_data.copy()
without_zeros = all_data.loc[all_data["prediction"].ne(0)].copy()

def plot_box(data, title, filename):
    """Draw grouped box plots of prediction scores, save them and show them.

    Args:
        data: Table (e.g. a pandas DataFrame) providing the `relation_type`,
            `prediction` and `year` columns.
        title: Text placed above the axes.
        filename: Path the figure is written to at 300 dpi.
    """
    fig, ax = plt.subplots(figsize=(8, 4))
    # One group of boxes per relation type, one box per year; fliers hidden.
    sns.boxplot(
        data=data,
        x="relation_type",
        y="prediction",
        hue="year",
        palette="Set2",
        showfliers=False,
        ax=ax,
    )
    # Dashed horizontal reference line at a score of 0.5.
    ax.axhline(0.5, color="darkred", linestyle="--", linewidth=1)
    ax.set_title(title)
    ax.set_ylabel("Prediction Score")
    ax.set_xlabel("")
    plt.xticks(rotation=45)
    ax.legend(title="Year", loc='upper right', fontsize=8)
    fig.tight_layout()
    fig.savefig(filename, dpi=300)
    plt.show()

# Tag the title and the output path with the model and test cutoff so that
# figures produced for different model/cutoff combinations do not share one
# file name (and therefore do not overwrite each other on disk).
run_tag = "TransE_2025"
plot_box(with_zeros, f"predictions_with_zeros ({run_tag})", f"{run_tag}_predictions_with_zeros.png")
plot_box(without_zeros, f"predictions_without_zeros ({run_tag})", f"{run_tag}_predictions_without_zeros.png")


# Node2Vec

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Prediction CSVs, one per relation type, under the `Node2Vec_test>=2022` folder.
files = {
    "miRNA–Gene": "Node2Vec_test>=2022/predictions_mirnagene.csv",
    "miRNA–Disease": "Node2Vec_test>=2022/predictions_mirnadisease.csv",
    "miRNA–Phenotype": "Node2Vec_test>=2022/predictions_mirnaphenotype.csv",
    "Gene–Disease": "Node2Vec_test>=2022/predictions_genedisease.csv",
    "Gene–Phenotype": "Node2Vec_test>=2022/predictions_genephenotype.csv",
}

# Read each CSV, rename the `year_x` / `prediction_x` columns to
# `year` / `prediction`, and tag every row with its relation name.
dfs = []
for name, path in files.items():
    df = (
        pd.read_csv(path)
        .rename(columns={'year_x': 'year', 'prediction_x': 'prediction'})
        .assign(relation_type=name)
    )
    dfs.append(df)

# Stack all relation types into one frame with a fresh integer index.
all_data = pd.concat(dfs, ignore_index=True)

# Two views of the data: every row, and only rows whose score is not exactly 0.
with_zeros = all_data.copy()
without_zeros = all_data.loc[all_data["prediction"].ne(0)].copy()

def plot_box(data, title, filename):
    """Draw grouped box plots of prediction scores, save them and show them.

    Args:
        data: Table (e.g. a pandas DataFrame) providing the `relation_type`,
            `prediction` and `year` columns.
        title: Text placed above the axes.
        filename: Path the figure is written to at 300 dpi.
    """
    fig, ax = plt.subplots(figsize=(8, 4))
    # One group of boxes per relation type, one box per year; fliers hidden.
    sns.boxplot(
        data=data,
        x="relation_type",
        y="prediction",
        hue="year",
        palette="Set2",
        showfliers=False,
        ax=ax,
    )
    # Dashed horizontal reference line at a score of 0.5.
    ax.axhline(0.5, color="darkred", linestyle="--", linewidth=1)
    ax.set_title(title)
    ax.set_ylabel("Prediction Score")
    ax.set_xlabel("")
    plt.xticks(rotation=45)
    ax.legend(title="Year", loc='upper right', fontsize=8)
    fig.tight_layout()
    fig.savefig(filename, dpi=300)
    plt.show()

# Tag the title and the output path with the model and test cutoff so that
# figures produced for different model/cutoff combinations do not share one
# file name (and therefore do not overwrite each other on disk).
run_tag = "Node2Vec_2022"
plot_box(with_zeros, f"predictions_with_zeros ({run_tag})", f"{run_tag}_predictions_with_zeros.png")

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Prediction CSVs, one per relation type, under the `Node2Vec_test>=2023` folder.
files = {
    "miRNA–Gene": "Node2Vec_test>=2023/predictions_mirnagene.csv",
    "miRNA–Disease": "Node2Vec_test>=2023/predictions_mirnadisease.csv",
    "miRNA–Phenotype": "Node2Vec_test>=2023/predictions_mirnaphenotype.csv",
    "Gene–Disease": "Node2Vec_test>=2023/predictions_genedisease.csv",
    "Gene–Phenotype": "Node2Vec_test>=2023/predictions_genephenotype.csv",
}

# Read each CSV, rename the `year_x` / `prediction_x` columns to
# `year` / `prediction`, and tag every row with its relation name.
dfs = []
for name, path in files.items():
    df = (
        pd.read_csv(path)
        .rename(columns={'year_x': 'year', 'prediction_x': 'prediction'})
        .assign(relation_type=name)
    )
    dfs.append(df)

# Stack all relation types into one frame with a fresh integer index.
all_data = pd.concat(dfs, ignore_index=True)

# Two views of the data: every row, and only rows whose score is not exactly 0.
with_zeros = all_data.copy()
without_zeros = all_data.loc[all_data["prediction"].ne(0)].copy()

def plot_box(data, title, filename):
    """Draw grouped box plots of prediction scores, save them and show them.

    Args:
        data: Table (e.g. a pandas DataFrame) providing the `relation_type`,
            `prediction` and `year` columns.
        title: Text placed above the axes.
        filename: Path the figure is written to at 300 dpi.
    """
    fig, ax = plt.subplots(figsize=(8, 4))
    # One group of boxes per relation type, one box per year; fliers hidden.
    sns.boxplot(
        data=data,
        x="relation_type",
        y="prediction",
        hue="year",
        palette="Set2",
        showfliers=False,
        ax=ax,
    )
    # Dashed horizontal reference line at a score of 0.5.
    ax.axhline(0.5, color="darkred", linestyle="--", linewidth=1)
    ax.set_title(title)
    ax.set_ylabel("Prediction Score")
    ax.set_xlabel("")
    plt.xticks(rotation=45)
    ax.legend(title="Year", loc='upper right', fontsize=8)
    fig.tight_layout()
    fig.savefig(filename, dpi=300)
    plt.show()

# Tag the title and the output path with the model and test cutoff so that
# figures produced for different model/cutoff combinations do not share one
# file name (and therefore do not overwrite each other on disk).
run_tag = "Node2Vec_2023"
plot_box(with_zeros, f"predictions_with_zeros ({run_tag})", f"{run_tag}_predictions_with_zeros.png")

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Prediction CSVs, one per relation type, under the `Node2Vec_test>=2024` folder.
files = {
    "miRNA–Gene": "Node2Vec_test>=2024/predictions_mirnagene.csv",
    "miRNA–Disease": "Node2Vec_test>=2024/predictions_mirnadisease.csv",
    "miRNA–Phenotype": "Node2Vec_test>=2024/predictions_mirnaphenotype.csv",
    "Gene–Disease": "Node2Vec_test>=2024/predictions_genedisease.csv",
    "Gene–Phenotype": "Node2Vec_test>=2024/predictions_genephenotype.csv",
}

# Read each CSV, rename the `year_x` / `prediction_x` columns to
# `year` / `prediction`, and tag every row with its relation name.
dfs = []
for name, path in files.items():
    df = (
        pd.read_csv(path)
        .rename(columns={'year_x': 'year', 'prediction_x': 'prediction'})
        .assign(relation_type=name)
    )
    dfs.append(df)

# Stack all relation types into one frame with a fresh integer index.
all_data = pd.concat(dfs, ignore_index=True)

# Two views of the data: every row, and only rows whose score is not exactly 0.
with_zeros = all_data.copy()
without_zeros = all_data.loc[all_data["prediction"].ne(0)].copy()

def plot_box(data, title, filename):
    """Draw grouped box plots of prediction scores, save them and show them.

    Args:
        data: Table (e.g. a pandas DataFrame) providing the `relation_type`,
            `prediction` and `year` columns.
        title: Text placed above the axes.
        filename: Path the figure is written to at 300 dpi.
    """
    fig, ax = plt.subplots(figsize=(8, 4))
    # One group of boxes per relation type, one box per year; fliers hidden.
    sns.boxplot(
        data=data,
        x="relation_type",
        y="prediction",
        hue="year",
        palette="Set2",
        showfliers=False,
        ax=ax,
    )
    # Dashed horizontal reference line at a score of 0.5.
    ax.axhline(0.5, color="darkred", linestyle="--", linewidth=1)
    ax.set_title(title)
    ax.set_ylabel("Prediction Score")
    ax.set_xlabel("")
    plt.xticks(rotation=45)
    ax.legend(title="Year", loc='upper right', fontsize=8)
    fig.tight_layout()
    fig.savefig(filename, dpi=300)
    plt.show()

# Tag the title and the output path with the model and test cutoff so that
# figures produced for different model/cutoff combinations do not share one
# file name (and therefore do not overwrite each other on disk).
run_tag = "Node2Vec_2024"
plot_box(with_zeros, f"predictions_with_zeros ({run_tag})", f"{run_tag}_predictions_with_zeros.png")


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Prediction CSVs, one per relation type, under the `Node2Vec_test>=2025` folder.
files = {
    "miRNA–Gene": "Node2Vec_test>=2025/predictions_mirnagene.csv",
    "miRNA–Disease": "Node2Vec_test>=2025/predictions_mirnadisease.csv",
    "miRNA–Phenotype": "Node2Vec_test>=2025/predictions_mirnaphenotype.csv",
    "Gene–Disease": "Node2Vec_test>=2025/predictions_genedisease.csv",
    "Gene–Phenotype": "Node2Vec_test>=2025/predictions_genephenotype.csv",
}

# Read each CSV, rename the `year_x` / `prediction_x` columns to
# `year` / `prediction`, and tag every row with its relation name.
dfs = []
for name, path in files.items():
    df = (
        pd.read_csv(path)
        .rename(columns={'year_x': 'year', 'prediction_x': 'prediction'})
        .assign(relation_type=name)
    )
    dfs.append(df)

# Stack all relation types into one frame with a fresh integer index.
all_data = pd.concat(dfs, ignore_index=True)

# Two views of the data: every row, and only rows whose score is not exactly 0.
with_zeros = all_data.copy()
without_zeros = all_data.loc[all_data["prediction"].ne(0)].copy()

def plot_box(data, title, filename):
    """Draw grouped box plots of prediction scores, save them and show them.

    Args:
        data: Table (e.g. a pandas DataFrame) providing the `relation_type`,
            `prediction` and `year` columns.
        title: Text placed above the axes.
        filename: Path the figure is written to at 300 dpi.
    """
    fig, ax = plt.subplots(figsize=(8, 4))
    # One group of boxes per relation type, one box per year; fliers hidden.
    sns.boxplot(
        data=data,
        x="relation_type",
        y="prediction",
        hue="year",
        palette="Set2",
        showfliers=False,
        ax=ax,
    )
    # Dashed horizontal reference line at a score of 0.5.
    ax.axhline(0.5, color="darkred", linestyle="--", linewidth=1)
    ax.set_title(title)
    ax.set_ylabel("Prediction Score")
    ax.set_xlabel("")
    plt.xticks(rotation=45)
    ax.legend(title="Year", loc='upper right', fontsize=8)
    fig.tight_layout()
    fig.savefig(filename, dpi=300)
    plt.show()

# Tag the title and the output path with the model and test cutoff so that
# figures produced for different model/cutoff combinations do not share one
# file name (and therefore do not overwrite each other on disk).
run_tag = "Node2Vec_2025"
plot_box(with_zeros, f"predictions_with_zeros ({run_tag})", f"{run_tag}_predictions_with_zeros.png")


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Prediction CSVs under the `Node2Vec_test>=2023` folder. Only the three miRNA
# relations are loaded here; the two Gene relations are commented out.
files = {
    "miRNA–Gene": "Node2Vec_test>=2023/predictions_mirnagene.csv",
    "miRNA–Disease": "Node2Vec_test>=2023/predictions_mirnadisease.csv",
    "miRNA–Phenotype": "Node2Vec_test>=2023/predictions_mirnaphenotype.csv",
    # "Gene–Disease": "Node2Vec_test>=2023/predictions_genedisease.csv",
    # "Gene–Phenotype": "Node2Vec_test>=2023/predictions_genephenotype.csv",
}

dfs = []
for name, path in files.items():
    df = pd.read_csv(path)
    df = df.rename(columns={'year_x': 'year', 'prediction_x': 'prediction'})
    # Strip only a trailing ".0" (float formatting such as "2023.0").
    # The pattern is an explicit, anchored regex: a bare '.0' would be treated
    # as "any character followed by 0" on pandas versions where `regex`
    # defaults to True (turning "2020.0" into ""), and as a literal it would
    # still remove ".0" from the middle of a value.
    df['year'] = df['year'].astype(str).str.replace(r'\.0$', '', regex=True)
    # Keep only rows whose year is purely numeric (drops e.g. "nan").
    df = df[df['year'].str.isnumeric()]
    # `year` is a string column at this point, so this is a lexicographic sort.
    df = df.sort_values(by='year')
    df["relation_type"] = name
    dfs.append(df)

# Stack all relation types into one frame with a fresh integer index.
all_data = pd.concat(dfs, ignore_index=True)

# Two views of the data: every row, and only rows whose score is not exactly 0.
with_zeros = all_data.copy()
without_zeros = all_data[all_data["prediction"] != 0].copy()

def plot_box(data, title, filename):
    """Draw a compact grouped box plot of prediction scores, save and show it.

    Args:
        data: Table (e.g. a pandas DataFrame) providing the `relation_type`,
            `prediction` and `year` columns.
        title: Text placed above the axes (may be empty).
        filename: Path the figure is written to at 400 dpi.
    """
    fig, ax = plt.subplots(figsize=(4, 2.5))
    # One group of boxes per relation type, one box per year; fliers hidden.
    sns.boxplot(
        data=data,
        x="relation_type",
        y="prediction",
        hue="year",
        palette="Set2",
        showfliers=False,
        ax=ax,
    )
    # Dashed horizontal reference line at a score of 0.5.
    ax.axhline(0.5, color="darkred", linestyle="--", linewidth=1)
    ax.set_title(title)
    # Both axis labels are left blank.
    ax.set_ylabel("")
    ax.set_xlabel("")
    # Small tick labels to suit the small figure size.
    plt.xticks(fontsize=8)
    plt.yticks(fontsize=8)
    ax.legend(title="Year", loc='lower right', fontsize=6, title_fontsize=7)
    fig.tight_layout()
    fig.savefig(filename, dpi=400)
    plt.show()

# Render the untitled figure from the unfiltered data.
plot_box(
    data=with_zeros,
    title="",
    filename="timestratified_predictions.png",
)

In [None]:
# Rows scored above 0.5, total rows, and the share of rows above 0.5.
hits = all_data[all_data['prediction'] > 0.5]
len(hits), len(all_data), len(hits) / len(all_data)

In [None]:
# Per-year row totals.
total_by_year = all_data.groupby('year').size().sort_index()

# Per-year count of rows scored above 0.5. Reindexing on the totals keeps a
# year that has no row above 0.5 as an explicit 0; without it that year is
# absent from the counts and its ratio below becomes NaN instead of 0.0.
above_by_year = (
    all_data[all_data['prediction'] > 0.5]
    .groupby('year')
    .size()
    .reindex(total_by_year.index, fill_value=0)
)

# Displayed as: counts above 0.5, totals, and the per-year share above 0.5.
above_by_year, total_by_year, above_by_year / total_by_year