In [1]:
import argparse

import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader

from ogb.linkproppred import PygLinkPropPredDataset, Evaluator

from logger import Logger

import numpy as np

import pandas as pd

import networkx as nx
from tqdm import tqdm
from scipy.spatial.distance import squareform
from matplotlib.pyplot import figure
import seaborn as sns
from sklearn.model_selection import train_test_split

In [2]:
dataset = PygLinkPropPredDataset(name='ogbl-collab')
split_edge = dataset.get_edge_split()
data = dataset[0]

In [3]:
data

Data(num_nodes=235868, edge_index=[2, 2358104], x=[235868, 128], edge_weight=[2358104, 1], edge_year=[2358104, 1])

In [4]:
device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
device = torch.device(device)

In [5]:
device

device(type='cpu')

In [6]:
x = data.x
pos_train_edge = split_edge['train']['edge'].to(x.device)
pos_valid_edge = split_edge['valid']['edge'].to(x.device)
neg_valid_edge = split_edge['valid']['edge_neg'].to(x.device)
pos_test_edge = split_edge['test']['edge'].to(x.device)
neg_test_edge = split_edge['test']['edge_neg'].to(x.device)

In [7]:
pos_train_edge_df = pd.DataFrame(pos_train_edge.numpy())
pos_valid_edge_df = pd.DataFrame(pos_valid_edge.numpy())
neg_valid_edge_df = pd.DataFrame(neg_valid_edge.numpy())
pos_test_edge_df = pd.DataFrame(pos_test_edge.numpy())
neg_test_edge_df = pd.DataFrame(neg_test_edge.numpy())

In [8]:
pos_train_nodes = list(set(pos_train_edge_df[0]).union(set(pos_train_edge_df[1])))
print('Nodes in training: ', len(pos_train_nodes))

Nodes in training:  235868


In [9]:
G_ddi = nx.from_pandas_edgelist(pos_train_edge_df, 0, 1, create_using=nx.Graph())

In [10]:
k_train = np.array([[G_ddi.degree(k) if k in G_ddi.nodes else 0 for k in G_ddi.nodes]]).T

In [11]:
G_ddi_nodes = list(G_ddi.nodes)

In [12]:
k_train_list = list(k_train)

In [18]:
pos_test_edge_pred = []
inductive_count = 0

for index, row in tqdm(pos_test_edge_df.iterrows()):
    try:
        node_0 = row[0]
        node_1 = row[1]
        k_0 = k_train_list[G_ddi_nodes.index(node_0)]
        k_1 = k_train_list[G_ddi_nodes.index(node_1)]
        pos_test_edge_pred.append(k_0*k_1/(sum(k_train_list))) ## sum of degrees = 2 * number of edges 
        
    except:
        inductive_count = inductive_count + 1

46329it [1:14:29, 10.37it/s]


In [19]:
neg_test_edge_pred = []
#inductive_count_neg = 0

for index, row in tqdm(neg_test_edge_df.iterrows()):
    try:
        node_0 = row[0]
        node_1 = row[1]
        k_0 = k_train_list[G_ddi_nodes.index(node_0)]
        k_1 = k_train_list[G_ddi_nodes.index(node_1)]
        neg_test_edge_pred.append(k_0*k_1/(sum(k_train_list))) ## sum of degrees = 2 * number of edges 
        
    except:
        inductive_count = inductive_count + 1

100000it [2:57:00,  9.42it/s]


In [20]:
evaluator = Evaluator(name='ogbl-ddi')

In [25]:
K = 20
test_hits = evaluator.eval({
            'y_pred_pos': np.array(pos_test_edge_pred).reshape(len(pos_test_edge_pred)),
            'y_pred_neg': np.array(neg_test_edge_pred).reshape(len(neg_test_edge_pred)),
        })[f'hits@{K}']

In [26]:
test_hits

0.04806924388611884

In [28]:
print('Total edge pairs not found: ', inductive_count)

Total edge pairs not found:  45


In [33]:
from sklearn import metrics

In [34]:
fpr, tpr, thresholds = metrics.roc_curve([1] * len(pos_test_edge_pred) + [0] * len(neg_test_edge_pred), pos_test_edge_pred + neg_test_edge_pred)
metrics.auc(fpr, tpr)

0.8628438284071926

In [36]:
from sklearn.metrics import average_precision_score

In [37]:
average_precision_score([1] * len(pos_test_edge_pred) + [0] * len(neg_test_edge_pred), pos_test_edge_pred + neg_test_edge_pred)

0.7838922212381295