## Dependencies

In [1]:
! pip install -U grape





In [2]:
! pip freeze | grep "grape"
! pip freeze | grep "embiggen"
! pip freeze | grep "ensmallen"

grape==0.1.24
embiggen==0.11.38
ensmallen==0.8.24


In [1]:
# imports
from grape import Graph
import pandas as pd

from grape.edge_prediction import PerceptronEdgePrediction
from grape.embedders import FirstOrderLINEEnsmallen
# from grape.datasets.string import HomoSapiens

2022-11-02 05:07:23.093980: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-11-02 05:07:23.094038: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [5]:
!ls ./input

merged_edge_list.tsv  merged_node_list.tsv


## Independent evaluation of different edges

In [7]:
g = Graph.from_csv(
  directed=False, 
  node_path='./input/merged_node_list.tsv',
  edge_path='./input/merged_edge_list.tsv',
  verbose=True,
  nodes_column='node',
  node_list_node_types_column='node_type',
  default_node_type='None',
  sources_column='head',
  destinations_column='tail',
  edge_list_edge_types_column='edge_type',
  name="CVD KG"
)
g = g.remove_disconnected_nodes()

ValueError: The column 'edge_type' is not present in the header:
 ["head", "relation", "tail", "weight"]
Note that the separator used was `	`.

In [3]:
# how many of each edge type?
g.get_edge_type_names_counts_hashmap()

{'in_protein_group': 61142,
 'ClinVar': 3920,
 'interacts_with': 373456,
 'MeSH_hierarchy': 25454,
 'MeSH_is': 25952,
 'ClinGen': 12988,
 'is_gene_product': 75888,
 'MeSH_CVD': 38,
 'in_species_cardiac_proteome': 56058,
 'CaseOLAP_score': 62320,
 'participates_in_pathway': 360594}

In [12]:
def label_predictions_with_ground_truth(pred_df, test_graph, return_bool = True):
    test_edges = pd.DataFrame(test_graph.get_edge_node_names(directed=False))
    test_edges.columns = ['sources','destinations']
    
    labeled_pred_df = pred_df.merge(test_edges,how='left',indicator=True)
    test_truth = []
    for b in labeled_pred_df['_merge'] == 'both':
        if return_bool:
            y_ = b
        else:
            # return 1 or 0
            y_ = 1 if b else 0
        test_truth += [y_]
    labeled_pred_df['ground_truth'] = test_truth
    labeled_pred_df = labeled_pred_df.drop('_merge', axis=1)
    return labeled_pred_df


def label_negative_sample_pred(pred_df):
    pred_df['ground_truth'] = False
    return pred_df


def evaluate_predictions(m, labeled_pred_df):
    # convert DataFrame to numpy used for evaluation
    y_label = labeled_pred_df['ground_truth'].to_numpy()
    y_score = labeled_pred_df['predictions'].to_numpy()
    
    # handle case where no ground truth (for negative sampled edges)
    all_false = len(set(y_label)) == 1 and (~y_label[0])
    if all_false:
        e1 = {'auroc':float('NaN'),'auprc':float('NaN')}
    else:
        e1 = m.evaluate_prediction_probabilities(y_label,y_score)
    e2 = m.evaluate_predictions(y_label,y_score)
    return e1 | e2

In [6]:
def get_edge_type_to_node_types_mapping(g, directed=False):
    '''
    This function returns a mapping from edge_type -> (from_types,to_types)
    '''
    # gather node and edge types for every edge as a DataFrame
    edge_to_type_dict = {h:[] for h in ["from","to","from_type","to_type","edge_type"]}
    for from_node_id,to_node_id in g.get_edge_node_ids(directed=False):
        # get node types
        from_node_type_ids = g.get_node_type_ids_from_node_id(from_node_id)
        from_node_type = [g.get_node_type_name_from_node_type_id(i) for i in from_node_type_ids]
        to_node_type_ids = g.get_node_type_ids_from_node_id(to_node_id)
        to_node_type =  [g.get_node_type_name_from_node_type_id(i) for i in to_node_type_ids]

        # get edge type
        edge_id = g.get_edge_id_from_node_ids(from_node_id,to_node_id)
        edge_type = g.get_edge_type_name_from_edge_id(edge_id)

        # append to dict
        edge_to_type_dict['from'] += [from_node_id]
        edge_to_type_dict['to'] += [to_node_id]
        edge_to_type_dict['from_type'] += [from_node_type]
        edge_to_type_dict['to_type'] += [to_node_type]
        edge_to_type_dict['edge_type'] += [edge_type]
    edge_to_type_df = pd.DataFrame(edge_to_type_dict)

    # take unique node types for each edge type
    edge_type_to_node_types = {}
    for edge_type in set(edge_to_type_df['edge_type']):
        # only rows with the specified edge_type
        sub_df = edge_to_type_df[edge_to_type_df['edge_type'] == edge_type]
        
        # get unique pairs of node types
        unique_node_type_pairs = set()
        for from_node_types, to_node_types in zip(sub_df['from_type'],sub_df['to_type']):
            # enumerate all pairs, since these are lists of node types
            pairs = [(f,t) for f in from_node_types for t in to_node_types]
            
            # if undirected, do not include reverse node type 
            # i.e. include (type_1,type_2) but not (type_2,type_1)
            if not directed:                
                pairs_sorted = set()
                for f,t in pairs:
                    # sort pair alphabetically
                    pair = (f,t) if (f < t) else (t,f) 
                    # keep unique pairs
                    pairs_sorted.add(pair)
                pairs = pairs_sorted
                
            # add unique pairs
            unique_node_type_pairs = unique_node_type_pairs.union(pairs)
            
        edge_type_to_node_types[edge_type] = list(unique_node_type_pairs)
    return edge_type_to_node_types

edge_type_to_node_types_mapping = get_edge_type_to_node_types_mapping(g,directed=False)
edge_type_to_node_types_mapping

{'in_species_cardiac_proteome': [('Gene', 'Species')],
 'in_protein_group': [('Protein', 'UniRef90_Protein_Group')],
 'participates_in_pathway': [('Protein', 'Reactome_Pathway')],
 'MeSH_hierarchy': [('MeSH_Tree_Disease', 'MeSH_Tree_Disease')],
 'MeSH_is': [('MeSH_Disease', 'MeSH_Tree_Disease')],
 'MeSH_CVD': [('CVD', 'MeSH_Tree_Disease')],
 'interacts_with': [('Protein', 'Protein')],
 'is_gene_product': [('Gene', 'Protein')],
 'ClinVar': [('MeSH_Disease', 'Protein')],
 'CaseOLAP_score': [('CVD', 'UniRef90_Protein_Group')],
 'ClinGen': [('MeSH_Disease', 'Protein')]}

In [13]:
def independent_edge_evaluation(g, node_types):
    
    # get the node types we will be predicting edge between
    source_node_type_list = [node_types[0]]
    destination_node_type_list = [node_types[1]]
    print(source_node_type_list,destination_node_type_list)
    
    # split graph into train/test
    train, test = g.connected_holdout(train_size=0.7)

    # train embedding on train graph
    embedding = FirstOrderLINEEnsmallen().fit_transform(train)

    # train model on train graph
    model = PerceptronEdgePrediction(
        edge_features=None,
        number_of_edges_per_mini_batch=32,
        edge_embeddings="CosineSimilarity"
    )
    model.fit(
        graph=train, 
        node_features=embedding
    )
    
    # predictions for train, test, and negative sampled graph
    train_pred_df = model.predict_proba_bipartite_graph_from_edge_node_types(graph=train, 
                                        node_features=embedding, 
                                        source_node_types = source_node_type_list,
                                        destination_node_types = destination_node_type_list,
                                        return_predictions_dataframe=True
                                    )
    test_pred_df = model.predict_proba_bipartite_graph_from_edge_node_types(graph=test, 
                                        node_features=embedding, 
                                        source_node_types = source_node_type_list,
                                        destination_node_types = destination_node_type_list,
                                        return_predictions_dataframe=True
                                    )
    negative_sampled_pred_df = model.predict_proba_bipartite_graph_from_edge_node_types(
                                    graph=g.sample_negative_graph(number_of_negative_samples=test.get_number_of_edges()),
                                    node_features=embedding, 
                                    source_node_types = source_node_type_list,
                                    destination_node_types = destination_node_type_list,
                                    return_predictions_dataframe=True
                                )
    
    # label predictions
    labeled_train_pred_df = label_predictions_with_ground_truth(train_pred_df,train)
    labeled_test_pred_df = label_predictions_with_ground_truth(test_pred_df,test)
    labeled_negative_sample_pred_df = label_negative_sample_pred(negative_sampled_pred_df)
    
    # evaluation
    data = [evaluate_predictions(model,labeled_train_pred_df),
            evaluate_predictions(model,labeled_test_pred_df),
            evaluate_predictions(model,labeled_negative_sample_pred_df)]
    eval_df = pd.DataFrame(data)
    
    return eval_df

edge_pair = edge_type_to_node_types_mapping['CaseOLAP_score'][0]
independent_edge_evaluation(g,edge_pair)

['CVD'] ['UniRef90_Protein_Group']
<embiggen.edge_prediction.edge_prediction_ensmallen.perceptron.PerceptronEdgePrediction object at 0x7f9cbcb927f0>
<embiggen.edge_prediction.edge_prediction_ensmallen.perceptron.PerceptronEdgePrediction object at 0x7f9cbcb927f0>
<embiggen.edge_prediction.edge_prediction_ensmallen.perceptron.PerceptronEdgePrediction object at 0x7f9cbcb927f0>


Unnamed: 0,auroc,auprc,markedness,informedness,recall,specificity,balanced_accuracy,prevalence_threshold,negative_predictive_value,positive_likelyhood_ratio,...,false_omission_rate,diagnostic_odds_ratio,matthews_correlation_coefficient,false_discovery_rate,fall_out,prevalence,precision,accuracy,f1_score,fowlkes_mallows_index
0,0.873651,0.311496,0.192625,0.618093,0.987636,0.630457,0.809047,0.379535,0.998233,2.672586,...,0.001767,136.282171,0.345051,0.805608,0.369543,0.08281,0.194392,0.660035,0.324846,0.438165
1,0.757725,0.076638,0.054756,0.438936,0.845886,0.59305,0.719468,0.409545,0.991649,2.078599,...,0.008351,7.998707,0.15503,0.936893,0.40695,0.031388,0.063107,0.600986,0.117451,0.231043
2,,,0.0,,,0.579273,,,1.0,,...,0.0,,,1.0,0.420727,0.0,0.0,0.579273,0.0,


In [None]:
edge_pair[0]

In [None]:
reverse_caseolap = (edge_type_to_node_types_mapping['CaseOLAP_score'][1],
                   edge_type_to_node_types_mapping['CaseOLAP_score'][0])
independent_edge_evaluation(g,reverse_caseolap)

In [14]:
def independent_edge_evaluation(g, node_types,test_reverse=False):
    
#     # get the node types we will be predicting edge between
#     source_node_type_list = node_types[0]
#     destination_node_type_list = node_types[1]
    
    # get the node types we will be predicting edge between
    source_node_type_list = [node_types[0]]
    destination_node_type_list = [node_types[1]]
    print(source_node_type_list,destination_node_type_list)
    
    
    # split graph into train/test
    train, test = g.connected_holdout(train_size=0.7)

    # train embedding on train graph
    embedding = FirstOrderLINEEnsmallen().fit_transform(train)

    # train model on train graph
    model = PerceptronEdgePrediction(
        edge_features=None,
        number_of_edges_per_mini_batch=32,
        edge_embeddings="CosineSimilarity"
    )
    model.fit(
        graph=train, 
        node_features=embedding
    )

    # predictions for train, test, and negative sampled graph
    train_pred_df = model.predict_proba_bipartite_graph_from_edge_node_types(graph=train, 
                                        node_features=embedding, 
                                        source_node_types = source_node_type_list,
                                        destination_node_types = destination_node_type_list,
                                        return_predictions_dataframe=True
                                    )
    test_pred_df = model.predict_proba_bipartite_graph_from_edge_node_types(graph=test, 
                                        node_features=embedding, 
                                        source_node_types = source_node_type_list,
                                        destination_node_types = destination_node_type_list,
                                        return_predictions_dataframe=True
                                    )
    negative_sampled_pred_df = model.predict_proba_bipartite_graph_from_edge_node_types(
                                    graph=g.sample_negative_graph(number_of_negative_samples=test.get_number_of_edges()),
                                    node_features=embedding, 
                                    source_node_types = source_node_type_list,
                                    destination_node_types = destination_node_type_list,
                                    return_predictions_dataframe=True
                                )

    # label predictions
    labeled_train_pred_df = label_predictions_with_ground_truth(train_pred_df,train)
    labeled_test_pred_df = label_predictions_with_ground_truth(test_pred_df,test)
    labeled_negative_sample_pred_df = label_negative_sample_pred(negative_sampled_pred_df)
    
    # evaluation
    data = [evaluate_predictions(model,labeled_train_pred_df),
            evaluate_predictions(model,labeled_test_pred_df),
            evaluate_predictions(model,labeled_negative_sample_pred_df)]
    eval_df = pd.DataFrame(data)
    
    ''' REVERSE '''
    # swapping source and destination nodes results in different results...
    if test_reverse:
        # predictions for train, test, and negative sampled graph
        train_pred_df = model.predict_proba_bipartite_graph_from_edge_node_types(graph=train, 
                                            node_features=embedding, 
                                            source_node_types = destination_node_type_list,
                                            destination_node_types = source_node_type_list,
                                            return_predictions_dataframe=True
                                        )
        test_pred_df = model.predict_proba_bipartite_graph_from_edge_node_types(graph=test, 
                                            node_features=embedding, 
                                            source_node_types = destination_node_type_list,
                                            destination_node_types = source_node_type_list,
                                            return_predictions_dataframe=True
                                        )
        negative_sampled_pred_df = model.predict_proba_bipartite_graph_from_edge_node_types(
                                        graph=g.sample_negative_graph(number_of_negative_samples=test.get_number_of_edges()),
                                        node_features=embedding, 
                                        source_node_types = destination_node_type_list,
                                        destination_node_types = source_node_type_list,
                                        return_predictions_dataframe=True
                                    )

        # label predictions
        labeled_train_pred_df = label_predictions_with_ground_truth(train_pred_df,train)
        labeled_test_pred_df = label_predictions_with_ground_truth(test_pred_df,test)
        labeled_negative_sample_pred_df = label_negative_sample_pred(negative_sampled_pred_df)

        # evaluation
        data += [evaluate_predictions(model,labeled_train_pred_df)]
        data += [evaluate_predictions(model,labeled_test_pred_df)]
        data += [evaluate_predictions(model,labeled_negative_sample_pred_df)]
        
    # convert to dataframe
    eval_df = pd.DataFrame(data)
    
    # add experiment column
    experiments = ['test','train','negative_sample']
    if test_reverse:
        experiments += [h+"_reverse" for h in experiments]
    eval_df.insert(loc=0, column='experiment', value=experiments)
    
    return eval_df

temp_df = independent_edge_evaluation(g,edge_type_to_node_types_mapping['CaseOLAP_score'][0], test_reverse=True)
temp_df.drop([2,5])

['CVD'] ['UniRef90_Protein_Group']


Unnamed: 0,experiment,auroc,auprc,miss_rate,false_omission_rate,markedness,fall_out,fowlkes_mallows_index,prevalence_threshold,recall,...,threat_score,matthews_correlation_coefficient,false_discovery_rate,negative_predictive_value,precision,specificity,accuracy,positive_likelyhood_ratio,prevalence,balanced_accuracy
0,test,0.873619,0.311358,0.012016,0.001717,0.192842,0.36928,0.43843,0.379409,0.987984,...,0.1941,0.345415,0.805441,0.998283,0.194559,0.63072,0.660305,2.675433,0.08281,0.809352
1,train,0.757041,0.076599,0.153381,0.008308,0.054885,0.406707,0.231302,0.409368,0.846619,...,0.062478,0.155386,0.936807,0.991692,0.063193,0.593293,0.601245,2.081645,0.031388,0.719956
3,test_reverse,0.870654,0.181043,0.009661,0.000715,0.100244,0.394993,0.316202,0.387083,0.990339,...,0.10086,0.244295,0.899041,0.999285,0.100959,0.605007,0.621526,2.507234,0.042869,0.797673
4,train_reverse,0.80178,0.066066,0.077861,0.002978,0.045621,0.409146,0.211695,0.399797,0.922139,...,0.0484,0.152981,0.951401,0.997022,0.048599,0.590854,0.598196,2.253815,0.022162,0.756496


In [15]:
def label_predictions_with_ground_truth(pred_df, test_graph, return_bool = True, directed=False):
    test_edges = set(test_graph.get_edge_node_names(directed=False))
    # consider reverse edges if directed is False
    if not directed:
        test_edges = test_edges.union([(t,f) for (f,t) in test_edges])
    
    # convert to dataframe to label edges
    test_edges_df = pd.DataFrame(test_edges)
    test_edges_df.columns = ['sources','destinations']
    
    labeled_pred_df = pred_df.merge(test_edges_df,how='left',indicator=True)
    test_truth = []
    for b in labeled_pred_df['_merge'] == 'both':
        if return_bool:
            y_ = b
        else:
            # return 1 or 0
            y_ = 1 if b else 0
        test_truth += [y_]
    labeled_pred_df['ground_truth'] = test_truth
    labeled_pred_df = labeled_pred_df.drop('_merge', axis=1)
    return labeled_pred_df


temp_df = independent_edge_evaluation(g,edge_type_to_node_types_mapping['CaseOLAP_score'][0], test_reverse=True)
temp_df.drop([2,5])
# labeled_rev_train_pred_df = label_predictions_with_ground_truth(rev_train_pred_df,train,directed=False)
# labeled_rev_train_pred_df[labeled_rev_train_pred_df['ground_truth']]

['CVD'] ['UniRef90_Protein_Group']


Unnamed: 0,experiment,auroc,auprc,specificity,f1_score,informedness,miss_rate,positive_likelyhood_ratio,accuracy,prevalence_threshold,...,false_omission_rate,negative_likelyhood_ratio,prevalence,false_discovery_rate,recall,threat_score,balanced_accuracy,fowlkes_mallows_index,matthews_correlation_coefficient,diagnostic_odds_ratio
0,test,0.896688,0.491091,0.661217,0.455039,0.649913,0.011304,2.918377,0.702374,0.369232,...,0.002451,0.017096,0.125679,0.704473,0.988696,0.294531,0.824956,0.540542,0.436433,170.7026
1,train,0.783484,0.14204,0.605436,0.198423,0.483631,0.121805,2.225734,0.620042,0.401302,...,0.011255,0.201185,0.05355,0.888153,0.878195,0.110138,0.741816,0.313406,0.220566,11.063125
3,test_reverse,0.896688,0.491092,0.661217,0.455039,0.649913,0.011304,2.918377,0.702374,0.369232,...,0.002451,0.017096,0.125679,0.704473,0.988696,0.294531,0.824956,0.540542,0.436433,170.7026
4,train_reverse,0.783484,0.14204,0.605436,0.198423,0.483631,0.121805,2.225734,0.620042,0.401302,...,0.011255,0.201185,0.05355,0.888153,0.878195,0.110138,0.741816,0.313406,0.220566,11.063125


In [18]:
# imports
from grape import Graph
import pandas as pd

from grape.edge_prediction import PerceptronEdgePrediction
from grape.embedders import FirstOrderLINEEnsmallen

def label_predictions_with_ground_truth(pred_df, test_graph, return_bool = True):
    test_edges = pd.DataFrame(test_graph.get_edge_node_names(directed=False))
    test_edges.columns = ['sources','destinations']
    
    labeled_pred_df = pred_df.merge(test_edges,how='left',indicator=True)
    test_truth = []
    for b in labeled_pred_df['_merge'] == 'both':
        if return_bool:
            y_ = b
        else:
            # return 1 or 0
            y_ = 1 if b else 0
        test_truth += [y_]
    labeled_pred_df['ground_truth'] = test_truth
    labeled_pred_df = labeled_pred_df.drop('_merge', axis=1)
    return labeled_pred_df


def label_negative_sample_pred(pred_df):
    pred_df['ground_truth'] = False
    return pred_df


def evaluate_predictions(m, labeled_pred_df):
    # convert DataFrame to numpy used for evaluation
    y_label = labeled_pred_df['ground_truth'].to_numpy()
    y_score = labeled_pred_df['predictions'].to_numpy()
    
    # handle case where no ground truth (for negative sampled edges)
    all_false = len(set(y_label)) == 1 and (~y_label[0])
    if all_false:
        e1 = {'auroc':float('NaN'),'auprc':float('NaN')}
    else:
        e1 = m.evaluate_prediction_probabilities(y_label,y_score)
    e2 = m.evaluate_predictions(y_label,y_score)
    return e1 | e2

def independent_edge_evaluation(g, node_types):
    
    # get the node types we will be predicting edge between
    source_node_type_list = [node_types[0]] # these need to be in a list for grape
    destination_node_type_list = [node_types[1]]
    
    # split graph into train/test
    train, test = g.connected_holdout(train_size=0.7)

    # train embedding on train graph
    embedding = FirstOrderLINEEnsmallen().fit_transform(train)

    # train model on train graph
    model = PerceptronEdgePrediction(
        edge_features=None,
        number_of_edges_per_mini_batch=32,
        edge_embeddings="CosineSimilarity"
    )
    model.fit(
        graph=train, 
        node_features=embedding
    )
    
    # predictions for train, test, and negative sampled graph
    train_pred_df = model.predict_proba_bipartite_graph_from_edge_node_types(graph=train, 
                                        node_features=embedding, 
                                        source_node_types = source_node_type_list,
                                        destination_node_types = destination_node_type_list,
                                        return_predictions_dataframe=True
                                    )
    test_pred_df = model.predict_proba_bipartite_graph_from_edge_node_types(graph=test, 
                                        node_features=embedding, 
                                        source_node_types = source_node_type_list,
                                        destination_node_types = destination_node_type_list,
                                        return_predictions_dataframe=True
                                    )
    negative_sampled_pred_df = model.predict_proba_bipartite_graph_from_edge_node_types(
                                    graph=g.sample_negative_graph(number_of_negative_samples=test.get_number_of_edges()),
                                    node_features=embedding, 
                                    source_node_types = source_node_type_list,
                                    destination_node_types = destination_node_type_list,
                                    return_predictions_dataframe=True
                                )
    
    # label predictions
    labeled_train_pred_df = label_predictions_with_ground_truth(train_pred_df,train)
    labeled_test_pred_df = label_predictions_with_ground_truth(test_pred_df,test)
    labeled_negative_sample_pred_df = label_negative_sample_pred(negative_sampled_pred_df)
    
    # evaluation
    data = [evaluate_predictions(model,labeled_train_pred_df),
            evaluate_predictions(model,labeled_test_pred_df),
            evaluate_predictions(model,labeled_negative_sample_pred_df)]
    eval_df = pd.DataFrame(data)
    
    # add experiment column
    experiments = ['test','train','negative_sample']
    eval_df.insert(loc=0, column='experiment', value=experiments)
    
    return eval_df

# load graph
g = Graph.from_csv(
  directed=False, 
  node_path='merged_nodes.tsv',
  edge_path='merged_edges.tsv',
  verbose=True,
  nodes_column='node',
  node_list_node_types_column='node_type',
  default_node_type='None',
  sources_column='head',
  destinations_column='tail',
  edge_list_edge_types_column='edge_type',
  name="CVD KG"
)
g = g.remove_disconnected_nodes()

# load edge types
edge_type_to_node_types_mapping = get_edge_type_to_node_types_mapping(g,directed=False)

# compute edge evaluation for 
edge_pair = edge_type_to_node_types_mapping['CaseOLAP_score'][0]
independent_edge_evaluation(g,edge_pair)

Unnamed: 0,experiment,auroc,auprc,threat_score,specificity,fowlkes_mallows_index,negative_predictive_value,markedness,informedness,f1_score,...,prevalence,positive_likelyhood_ratio,balanced_accuracy,prevalence_threshold,recall,false_omission_rate,diagnostic_odds_ratio,precision,negative_likelyhood_ratio,accuracy
0,test,0.873595,0.311321,0.194144,0.630858,0.438468,0.998273,0.19288,0.618772,0.32516,...,0.08281,2.676245,0.809386,0.379374,0.987914,0.001727,139.694813,0.194606,0.019158,0.660426
1,train,0.756981,0.076565,0.062512,0.593436,0.231389,0.991704,0.054931,0.440238,0.117668,...,0.031388,2.082826,0.720119,0.409299,0.846802,0.008296,8.068157,0.063227,0.258154,0.601389
2,negative_sample,,,0.0,0.579618,,1.0,0.0,,0.0,...,0.0,,,,,0.0,,0.0,,0.579618


In [19]:
import os
import yaml
import pandas as pd
from grape import Graph

# node embedding imports
from embiggen.embedders.ensmallen_embedders.degree_spine import DegreeSPINE
from embiggen.embedders import GLEEEnsmallen
from embiggen.embedders import HOPEEnsmallen
from embiggen.embedders.pykeen_embedders.distmult import DistMultPyKEEN
from embiggen.embedders.pykeen_embedders.hole import HolEPyKEEN

# edge prediction imports
from embiggen.edge_prediction.edge_prediction_model import AbstractEdgePredictionModel
from grape.edge_prediction import PerceptronEdgePrediction
from grape.edge_prediction import MLPEdgePrediction
from grape.edge_prediction import GNNEdgePrediction


def label_predictions_with_ground_truth(pred_df, test_graph, return_bool = True):
    test_edges = pd.DataFrame(test_graph.get_edge_node_names(directed=False))
    test_edges.columns = ['sources','destinations']
    
    labeled_pred_df = pred_df.merge(test_edges,how='left',indicator=True)
    test_truth = []
    for b in labeled_pred_df['_merge'] == 'both':
        if return_bool:
            y_ = b
        else:
            # return 1 or 0
            y_ = 1 if b else 0
        test_truth += [y_]
    labeled_pred_df['ground_truth'] = test_truth
    labeled_pred_df = labeled_pred_df.drop('_merge', axis=1)
    return labeled_pred_df


def label_negative_sample_pred(pred_df):
    pred_df['ground_truth'] = False
    return pred_df


def evaluate_predictions(m, labeled_pred_df):
    # convert DataFrame to numpy used for evaluation
    y_label = labeled_pred_df['ground_truth'].to_numpy()
    y_score = labeled_pred_df['predictions'].to_numpy()
    
    # handle case where no ground truth (for negative sampled edges)
    all_false = len(set(y_label)) == 1 and (~y_label[0])
    if all_false:
        e1 = {'auroc':float('NaN'),'auprc':float('NaN')}
    else:
        e1 = m.evaluate_prediction_probabilities(y_label,y_score)
    e2 = m.evaluate_predictions(y_label,y_score)
    return e1 | e2

def get_edge_type_to_node_types_mapping(g, directed=False):
    '''
    This function returns a mapping from edge_type -> (from_types,to_types)
    '''
    # gather node and edge types for every edge as a DataFrame
    edge_to_type_dict = {h:[] for h in ["from","to","from_type","to_type","edge_type"]}
    for from_node_id,to_node_id in g.get_edge_node_ids(directed=False):
        # get node types
        from_node_type_ids = g.get_node_type_ids_from_node_id(from_node_id)
        from_node_type = [g.get_node_type_name_from_node_type_id(i) for i in from_node_type_ids]
        to_node_type_ids = g.get_node_type_ids_from_node_id(to_node_id)
        to_node_type =  [g.get_node_type_name_from_node_type_id(i) for i in to_node_type_ids]

        # get edge type
        edge_id = g.get_edge_id_from_node_ids(from_node_id,to_node_id)
        edge_type = g.get_edge_type_name_from_edge_id(edge_id)

        # append to dict
        edge_to_type_dict['from'] += [from_node_id]
        edge_to_type_dict['to'] += [to_node_id]
        edge_to_type_dict['from_type'] += [from_node_type]
        edge_to_type_dict['to_type'] += [to_node_type]
        edge_to_type_dict['edge_type'] += [edge_type]
    edge_to_type_df = pd.DataFrame(edge_to_type_dict)

    # take unique node types for each edge type
    edge_type_to_node_types = {}
    for edge_type in set(edge_to_type_df['edge_type']):
        # only rows with the specified edge_type
        sub_df = edge_to_type_df[edge_to_type_df['edge_type'] == edge_type]
        
        # get unique pairs of node types
        unique_node_type_pairs = set()
        for from_node_types, to_node_types in zip(sub_df['from_type'],sub_df['to_type']):
            # enumerate all pairs, since these are lists of node types
            pairs = [(f,t) for f in from_node_types for t in to_node_types]
            
            # if undirected, do not include reverse node type 
            # i.e. include (type_1,type_2) but not (type_2,type_1)
            if not directed:                
                pairs_sorted = set()
                for f,t in pairs:
                    # sort pair alphabetically
                    pair = (f,t) if (f < t) else (t,f) 
                    # keep unique pairs
                    pairs_sorted.add(pair)
                pairs = pairs_sorted
                
            # add unique pairs
            unique_node_type_pairs = unique_node_type_pairs.union(pairs)
            
        edge_type_to_node_types[edge_type] = list(unique_node_type_pairs)
    return edge_type_to_node_types


subgraph_description_file = "./subgraphs/subgraphs_summary.tsv"
results_table_file_name = "./2022-09-20_independent_evaluation_results_table.csv"

# file listing all sub-graphs to test
subgraph_df = pd.read_csv(subgraph_description_file,sep="\t")

for node_path, edge_path, kg_description, kg_num in zip(subgraph_df['Node file'],
                                                        subgraph_df['Edge file'],
                                                        subgraph_df['Resources Included'],
                                                        subgraph_df['Experiment Number']):
    kg_name = 'CKG %d: %s'%(kg_num,kg_description)

    # load graph
    g = Graph.from_csv(
      directed=False,
      node_path=node_path,
      edge_path=edge_path,
      verbose=True,
      nodes_column='id',
      node_list_node_types_column='category',
      default_node_type='None',
      sources_column='subject',
      destinations_column='object',
      edge_list_edge_types_column='predicate',
      weights_column = 'score',
      default_weight=1.0,
      name=kg_name
    )
    g = g.remove_disconnected_nodes()
    
    # split graph into train/test
    train, test = g.connected_holdout(train_size=0.7)

    # load edge types
    edge_type_to_node_types_mapping = get_edge_type_to_node_types_mapping(g,directed=False)
    # to compute edge evaluation only for CaseOLAP_score edges
    edge_pair = edge_type_to_node_types_mapping['CaseOLAP_score'][0]
    
    # train all node embeddings
    embedding_models = {'SPINE':DegreeSPINE().fit_transform(train),
                    'GLEE':GLEEEnsmallen().fit_transform(train),
                    'HOPE':HOPEEnsmallen().fit_transform(train),
                    'DistMultPyKEEN':DistMultPyKEEN().fit_transform(train),
                    'HolEPyKEEN':HolEPyKEEN().fit_transform(train)}
    
    # store results of gridsearch here
    results_df = None

    # grid search over node embedding models 
    for node_embedding_model_name,embedding in node_models.items():
                
        # grid search over edge prediction models
        edge_prediciton_models = {'Perceptron': PerceptronEdgePrediction(edge_features=None,
                                           number_of_edges_per_mini_batch=32,
                                           edge_embeddings="CosineSimilarity"),
                                  'MLP': MLPEdgePrediction(edge_features=None,
                                           number_of_edges_per_mini_batch=32,
                                           edge_embeddings="CosineSimilarity"),
                                  'GNN': GNNEdgePrediction(edge_features=None,
                                           number_of_edges_per_mini_batch=32,
                                           edge_embeddings="CosineSimilarity")}
        
        for edge_pred_model_name, model in edge_prediciton_models.items():
            
            # train model
            model.fit(
                graph=train, 
                node_features=embedding
            )
            
            # predictions for train, test, and negative sampled graph
            train_pred_df = model.predict_proba_bipartite_graph_from_edge_node_types(graph=train, 
                                                node_features=embedding, 
                                                source_node_types = source_node_type_list,
                                                destination_node_types = destination_node_type_list,
                                                return_predictions_dataframe=True
                                            )
            test_pred_df = model.predict_proba_bipartite_graph_from_edge_node_types(graph=test, 
                                                node_features=embedding, 
                                                source_node_types = source_node_type_list,
                                                destination_node_types = destination_node_type_list,
                                                return_predictions_dataframe=True
                                            )
            negative_sampled_pred_df = model.predict_proba_bipartite_graph_from_edge_node_types(
                                            graph=g.sample_negative_graph(number_of_negative_samples=test.get_number_of_edges()),
                                            node_features=embedding, 
                                            source_node_types = source_node_type_list,
                                            destination_node_types = destination_node_type_list,
                                            return_predictions_dataframe=True
                                        )

            # label predictions
            labeled_train_pred_df = label_predictions_with_ground_truth(train_pred_df,train)
            labeled_test_pred_df = label_predictions_with_ground_truth(test_pred_df,test)
            labeled_negative_sample_pred_df = label_negative_sample_pred(negative_sampled_pred_df)

            # evaluation
            data = [evaluate_predictions(model,labeled_train_pred_df),
                    evaluate_predictions(model,labeled_test_pred_df),
                    evaluate_predictions(model,labeled_negative_sample_pred_df)]
            
            
            ### Make result dataframe ###
            eval_df = pd.DataFrame(data)

            # add experiment column
            experiments = ['test','train','negative_sample']
            eval_df.insert(loc=0, column='experiment', value=experiments)
            
            # add edge prediction and node embedding models
            eval_df.insert(loc=0, column='edge_prediction_model', value = edge_pred_model_name)
            eval_df.insert(loc=0, column='node_embedding', value = node_embedding_model_name)
            
            # add kg graph name
            eval_df.insert(loc=0, column='graph_name', value = kg_name)
            
            # store and save results into one big table
            if not results_df:
                results_df = eval_df
            # append new results
            results_df = results_df.append(eval_df, ignore_index = True)
            results_df.to_csv(results_table_file_name)
            
            exp_tag = "kg: %s, node_emb: %s, edge_pred_model: %s"%(kg_name,
                                                                   node_embedding_model_name,
                                                                   ege_pred_model_name)
            print("Finished %s"%exp_tag)
print("Completed.")

ModuleNotFoundError: No module named 'pykeen'

In [None]:
from grape.edge_prediction import PerceptronEdgePrediction
from grape.edge_prediction import MLPEdgePrediction
from grape.edge_prediction import GNNEdgePrediction

from embiggen import get_available_models_for_edge_prediction, get_available_models_for_node_embedding
from embiggen.edge_prediction.edge_prediction_model import AbstractEdgePredictionModel
get_available_models_for_edge_prediction()

In [None]:
!ls 