In [1]:
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
import itertools
import numpy as np
import scipy.sparse as sp
import tensorflow as tf

In [2]:
import dgl.data

dataset = dgl.data.CoraGraphDataset()
g = dataset[0]

  NumNodes: 2708
  NumEdges: 10556
  NumFeats: 1433
  NumClasses: 7
  NumTrainingSamples: 140
  NumValidationSamples: 500
  NumTestSamples: 1000
Done loading data from cached files.


In [16]:
# Split edge set for training and testing
u, v = g.edges()

eids = np.arange(g.number_of_edges())
eids = np.random.permutation(eids)
test_size = int(len(eids) * 0.1)
train_size = g.number_of_edges() - test_size
test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]
train_pos_u, train_pos_v = u[eids[test_size:]], v[eids[test_size:]]

# Find all negative edges and split them for training and testing
adj = sp.coo_matrix((np.ones(len(u)), (u.numpy(), v.numpy())))
adj_neg = 1 - adj.todense() - np.eye(g.number_of_nodes())
neg_u, neg_v = np.where(adj_neg != 0)

neg_eids = np.random.choice(len(neg_u), g.number_of_edges() // 2)
test_neg_u, test_neg_v = neg_u[neg_eids[:test_size]], neg_v[neg_eids[:test_size]]
train_neg_u, train_neg_v = neg_u[neg_eids[test_size:]], neg_v[neg_eids[test_size:]]

In [17]:
train_g = dgl.remove_edges(g, eids[:test_size])

In [18]:
from dgl.nn import SAGEConv

# ----------- 2. create model -------------- #
# build a two-layer GraphSAGE model
class GraphSAGE(nn.Module):
    def __init__(self, in_feats, h_feats):
        super(GraphSAGE, self).__init__()
        self.conv1 = SAGEConv(in_feats, h_feats, 'mean')
        self.conv2 = SAGEConv(h_feats, h_feats, 'mean')

    def forward(self, g, in_feat):
        h = self.conv1(g, in_feat)
        h = F.relu(h)
        h = self.conv2(g, h)
        return h

In [19]:
train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=g.number_of_nodes())
train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=g.number_of_nodes())

test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=g.number_of_nodes())
test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=g.number_of_nodes())

In [20]:
import dgl.function as fn

class DotPredictor(nn.Module):
    def forward(self, g, h):
        with g.local_scope():
            g.ndata['h'] = h
            # Compute a new edge feature named 'score' by a dot-product between the
            # source node feature 'h' and destination node feature 'h'.
            g.apply_edges(fn.u_dot_v('h', 'h', 'score'))
            # u_dot_v returns a 1-element vector for each edge so you need to squeeze it.
            return g.edata['score'][:, 0]

In [21]:
class MLPPredictor(nn.Module):
    def __init__(self, h_feats):
        super().__init__()
        self.W1 = nn.Linear(h_feats * 2, h_feats)
        self.W2 = nn.Linear(h_feats, 1)

    def apply_edges(self, edges):
        """
        Computes a scalar score for each edge of the given graph.

        Parameters
        ----------
        edges :
            Has three members ``src``, ``dst`` and ``data``, each of
            which is a dictionary representing the features of the
            source nodes, the destination nodes, and the edges
            themselves.

        Returns
        -------
        dict
            A dictionary of new edge features.
        """
        h = torch.cat([edges.src['h'], edges.dst['h']], 1)
        return {'score': self.W2(F.relu(self.W1(h))).squeeze(1)}

    def forward(self, g, h):
        with g.local_scope():
            g.ndata['h'] = h
            g.apply_edges(self.apply_edges)
            return g.edata['score']

In [22]:
model = GraphSAGE(train_g.ndata['feat'].shape[1], 16)
# You can replace DotPredictor with MLPPredictor.
#pred = MLPPredictor(16)
pred = DotPredictor()

def compute_loss(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score])
    labels = torch.cat([torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])])
    return F.binary_cross_entropy_with_logits(scores, labels)

def compute_auc(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score]).numpy()
    labels = torch.cat(
        [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]).numpy()
    return roc_auc_score(labels, scores)

In [23]:
# ----------- 3. set up loss and optimizer -------------- #
# in this case, loss will in training loop
optimizer = torch.optim.Adam(itertools.chain(model.parameters(), pred.parameters()), lr=0.01)

# ----------- 4. training -------------------------------- #
all_logits = []
for e in range(100):
    # forward
    h = model(train_g, train_g.ndata['feat'])
    pos_score = pred(train_pos_g, h)
    neg_score = pred(train_neg_g, h)
    loss = compute_loss(pos_score, neg_score)

    # backward
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if e % 5 == 0:
        print('In epoch {}, loss: {}'.format(e, loss))

# ----------- 5. check results ------------------------ #
from sklearn.metrics import roc_auc_score
with torch.no_grad():
    pos_score = pred(test_pos_g, h)
    neg_score = pred(test_neg_g, h)
    print('AUC', compute_auc(pos_score, neg_score))



In epoch 0, loss: 0.6926009058952332
In epoch 5, loss: 0.6188351511955261
In epoch 10, loss: 0.6028839349746704
In epoch 15, loss: 0.5761141777038574
In epoch 20, loss: 0.5345200300216675
In epoch 25, loss: 0.48386064171791077
In epoch 30, loss: 0.43791279196739197
In epoch 35, loss: 0.3974567651748657
In epoch 40, loss: 0.35329943895339966
In epoch 45, loss: 0.3209739327430725
In epoch 50, loss: 0.2918635606765747
In epoch 55, loss: 0.2707023024559021
In epoch 60, loss: 0.24770532548427582
In epoch 65, loss: 0.226336270570755
In epoch 70, loss: 0.20706160366535187
In epoch 75, loss: 0.1881372630596161
In epoch 80, loss: 0.17024725675582886
In epoch 85, loss: 0.15257792174816132
In epoch 90, loss: 0.13577206432819366
In epoch 95, loss: 0.1198478639125824
AUC 0.87250600840053


In [49]:
# Test for positive edges

number_of_edges = len(train_pos_g.edges()[0].numpy())
all_edge_ends = list(train_pos_g.edges()[0].numpy()) + list(train_pos_g.edges()[1].numpy())

node_degree_dict = dict()

for node in set(all_edge_ends):
    node_degree_dict[node] = all_edge_ends.count(node)
    
    
test_list = [list(test_pos_g.edges()[0].numpy()), list(test_pos_g.edges()[1].numpy())]
test_size = len(test_pos_g.edges()[0].numpy())
test_pos_predicted_config = []

for j in range(test_size):
    if test_list[0][j] in node_degree_dict.keys() and test_list[1][j] in node_degree_dict.keys():
        test_pos_predicted_config.append(node_degree_dict[test_list[0][j]]*node_degree_dict[test_list[1][j]]/(2*number_of_edges))
    elif test_list[0][j] in node_degree_dict.keys():
        test_pos_predicted_config.append(node_degree_dict[test_list[0][j]]*1/(2*number_of_edges))
    elif test_list[1][j] in node_degree_dict.keys():
        test_pos_predicted_config.append(1*node_degree_dict[test_list[1][j]]/(2*number_of_edges))
    else:
        test_pos_predicted_config.append(1/(2*number_of_edges))

In [50]:
# Test for negative edges

number_of_edges = len(train_neg_g.edges()[0].numpy())
all_edge_ends = list(train_neg_g.edges()[0].numpy()) + list(train_neg_g.edges()[1].numpy())

node_degree_dict = dict()

for node in set(all_edge_ends):
    node_degree_dict[node] = all_edge_ends.count(node)
    
    
test_list = [list(test_neg_g.edges()[0].numpy()), list(test_neg_g.edges()[1].numpy())]
test_size = len(test_neg_g.edges()[0].numpy())
test_neg_predicted_config = []

for j in range(test_size):
    if test_list[0][j] in node_degree_dict.keys() and test_list[1][j] in node_degree_dict.keys():
        test_neg_predicted_config.append(node_degree_dict[test_list[0][j]]*node_degree_dict[test_list[1][j]]/(2*number_of_edges))
    elif test_list[0][j] in node_degree_dict.keys():
        test_neg_predicted_config.append(node_degree_dict[test_list[0][j]]*1/(2*number_of_edges))
    elif test_list[1][j] in node_degree_dict.keys():
        test_neg_predicted_config.append(1*node_degree_dict[test_list[1][j]]/(2*number_of_edges))
    else:
        test_neg_predicted_config.append(1/(2*number_of_edges))

In [51]:
def compute_auc_config(pos_score, neg_score):
    scores = pos_score + neg_score
    labels = list(np.ones(len(pos_score))) + list(np.zeros(len(neg_score)))
    return roc_auc_score(labels, scores)

In [52]:
print('AUC', compute_auc_config(test_pos_predicted_config, test_neg_predicted_config))

AUC 0.8292473214887357
