In [1]:
%%capture
!git clone https://github.com/joerg84/Graph_Powered_ML_Workshop.git
!rsync -av Graph_Powered_ML_Workshop/ ./ --exclude=.git
!pip3 install dgl
!pip3 install numpy
!pip3 install torch==1.9.1
!pip3 install networkx
!pip3 install matplotlib

In [2]:
%matplotlib inline

In [3]:
import itertools
import networkx as nx
import matplotlib.pyplot as plt
import dgl
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
from dgl.nn.pytorch import GraphConv
import json
import itertools
import scipy.sparse as sp
from dgl.nn import SAGEConv

DGL backend not selected or invalid.  Assuming PyTorch for now.


Setting the default backend to "pytorch". You can change it in the ~/.dgl/config.json file or export the DGLBACKEND environment variable.  Valid options are: pytorch, mxnet, tensorflow (all lowercase)


Using backend: pytorch


In [4]:
from google.colab import files
uploaded = files.upload()

Saving isolated_nodes.csv to isolated_nodes.csv
Saving node_classification.csv to node_classification.csv
Saving node_features_text.json to node_features_text.json
Saving training_graph.csv to training_graph.csv


In [16]:
# Create the source and target nodes:
data_path = "training_graph.csv"
df = pd.read_csv(data_path)
src = df["node1"].to_numpy( dtype = np.int_ )
dst = df["node2"].to_numpy( dtype = np.int_ )
    
# Make the graph bidirectional as the data provided is undirected
y = np.concatenate([dst, src] )
x = np.concatenate([src, dst] )
g = dgl.graph((x, y))


In [17]:
# Open the json file for the features and load it as a dict in python
with open('node_features_text.json') as json_file:
    data = json.load(json_file)

# Open the node classification file as an array
classif = pd.read_csv( 'node_classification.csv' ).page_type.to_numpy( dtype = np.int_ )

# Calculate the maximum column in the one-hot coded file
col_len = max( [ max( x ) for x in data.values() ] ) + 1

# Construct a feature matrix out of the one-hot coded data
# featMat = np.zeros( ( len(data.keys()), col_len + 1 ), dtype = int )
featMat = torch.zeros( len(data.keys()), col_len + 1, dtype=torch.float32)
# featMat = [ [0 for i in range( col_len ) ] for j in range( len(data.keys()) ) ]
for key in data:
  for idx in data[key]:
    featMat[ int(key) ][ int(idx) ] = 1
  
# concatenate the classification of every node to the end of the one-hot
# coded feature
for i in range( len( featMat ) ):
  featMat[i][-1] = int( classif[ i ] )
  # featMat[ i ].append( int( classif[ i ] ) )

# featMat = np.array( [np.array(xi) for xi in featMat])

In [18]:
g.ndata[ 'feat' ] = torch.tensor( featMat, dtype = torch.float32 )
g.ndata[ 'feat' ][0].type()
g

  """Entry point for launching an IPython kernel.


Graph(num_nodes=22470, num_edges=264076,
      ndata_schemes={'feat': Scheme(shape=(4715,), dtype=torch.float32)}
      edata_schemes={})

In [19]:
u, v = g.edges()

eids = np.arange(g.number_of_edges())
eids = np.random.permutation(eids)
test_size = int(len(eids) * 0.1)
train_size = g.number_of_edges() - test_size
test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]
train_pos_u, train_pos_v = u[eids[test_size:]], v[eids[test_size:]]

# Find all negative edges and split them for training and testing
adj = sp.coo_matrix((np.ones(len(u)), (u.numpy(), v.numpy())))
adj_neg = 1 - adj.todense() - np.eye(g.number_of_nodes())
neg_u, neg_v = np.where(adj_neg != 0)

neg_eids = np.random.choice(len(neg_u), g.number_of_edges() // 2)
test_neg_u, test_neg_v = neg_u[neg_eids[:test_size]], neg_v[neg_eids[:test_size]]
train_neg_u, train_neg_v = neg_u[neg_eids[test_size:]], neg_v[neg_eids[test_size:]]

In [20]:
train_g = dgl.remove_edges(g, eids[:test_size])
# train_g.ndata['x'].shape[1]

In [21]:
class GraphSAGE(nn.Module):
    def __init__(self, in_feats, h_feats):
        super(GraphSAGE, self).__init__()
        self.conv1 = SAGEConv(in_feats, h_feats, 'mean')
        self.conv2 = SAGEConv(h_feats, h_feats, 'mean')

    def forward(self, g, in_feat):
        h = self.conv1(g, in_feat)
        h = F.relu(h)
        h = self.conv2(g, h)
        return h

In [22]:
train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=g.number_of_nodes())
train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=g.number_of_nodes())

test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=g.number_of_nodes())
test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=g.number_of_nodes())

In [23]:
import dgl.function as fn

class DotPredictor(nn.Module):
    def forward(self, g, h):
        with g.local_scope():
            g.ndata[ 'h' ] = h
            # Compute a new edge feature named 'score' by a dot-product between the
            # source node feature 'h' and destination node feature 'h'.
            g.apply_edges(fn.u_dot_v('h', 'h', 'score'))
            # u_dot_v returns a 1-element vector for each edge so you need to squeeze it.
            return g.edata['score'][:, 0]

In [24]:
model = GraphSAGE(train_g.ndata['feat'].shape[1], 16)
# You can replace DotPredictor with MLPPredictor.
#pred = MLPPredictor(16)
pred = DotPredictor()

def compute_loss(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score])
    labels = torch.cat([torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])])
    return F.binary_cross_entropy_with_logits(scores, labels)

def compute_auc(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score]).numpy()
    labels = torch.cat(
        [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]).numpy()
    return roc_auc_score(labels, scores)

In [25]:
# ----------- 3. set up loss and optimizer -------------- #
# in this case, loss will in training loop
optimizer = torch.optim.Adam(itertools.chain(model.parameters(), pred.parameters()), lr=0.01)

# ----------- 4. training -------------------------------- #
all_logits = []
for e in range(100):
    # forward
    h = model(train_g, train_g.ndata['feat'])
    pos_score = pred(train_pos_g, h)
    neg_score = pred(train_neg_g, h)
    loss = compute_loss(pos_score, neg_score)

    # backward
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if e % 5 == 0:
        print('In epoch {}, loss: {}'.format(e, loss))

# ----------- 5. check results ------------------------ #
from sklearn.metrics import roc_auc_score
with torch.no_grad():
    pos_score = pred(test_pos_g, h)
    neg_score = pred(test_neg_g, h)
    print('AUC', compute_auc(pos_score, neg_score))

In epoch 0, loss: 0.6033222079277039
In epoch 5, loss: 0.4779208302497864
In epoch 10, loss: 0.42895135283470154
In epoch 15, loss: 0.39269599318504333
In epoch 20, loss: 0.3646790683269501
In epoch 25, loss: 0.35295429825782776
In epoch 30, loss: 0.338355153799057
In epoch 35, loss: 0.3255649209022522
In epoch 40, loss: 0.31748101115226746
In epoch 45, loss: 0.3117625117301941
In epoch 50, loss: 0.3064935803413391
In epoch 55, loss: 0.3019779622554779
In epoch 60, loss: 0.2981290817260742
In epoch 65, loss: 0.29474130272865295
In epoch 70, loss: 0.291605681180954
In epoch 75, loss: 0.28867536783218384
In epoch 80, loss: 0.28596463799476624
In epoch 85, loss: 0.283858060836792
In epoch 90, loss: 0.28131330013275146
In epoch 95, loss: 0.2789929211139679
AUC 0.9564790102593214


In [40]:
labels = files.upload()

Saving test_edges.csv to test_edges.csv
Saving test_labels.csv to test_labels (2).csv


In [42]:
label_path = "test_labels.csv"
test_edges_path = "test_edges.csv"
df = pd.read_csv(label_path)
clsf = df["label"].to_numpy( dtype = np.int_ )
df2 = pd.read_csv(test_edges_path)
src = df2["node1"].to_numpy( dtype = np.int_ )
dst = df2["node2"].to_numpy( dtype = np.int_ )


pos_truth_u = []
pos_truth_v = []
neg_truth_u = []
neg_truth_v = []
for i in range( len(clsf) ):
  if ( int(clsf[i]) ):
    pos_truth_u.append( src[ i ] )
    pos_truth_v.append( dst[ i ] )
  else:
    neg_truth_u.append( src[ i ] )
    neg_truth_v.append( dst[ i ] )

truth_pos_g = dgl.graph((pos_truth_u, pos_truth_v), num_nodes=g.number_of_nodes())
truth_neg_g = dgl.graph((neg_truth_u, neg_truth_v), num_nodes=g.number_of_nodes())
# ----------- 6. check ground truth rsults ------------------------ #
with torch.no_grad():
    posT_score = pred(truth_pos_g, h)
    negT_score = pred(truth_neg_g, h)
    print('AUC', compute_auc(posT_score, negT_score))

AUC 0.8819001032636351
