In [1]:
%%capture
!git clone https://github.com/joerg84/Graph_Powered_ML_Workshop.git
!rsync -av Graph_Powered_ML_Workshop/ ./ --exclude=.git
!pip3 install dgl
!pip3 install numpy
!pip3 install torch==1.9.1
!pip3 install networkx
!pip3 install matplotlib

In [2]:
%matplotlib inline

In [3]:
import itertools
import networkx as nx
import matplotlib.pyplot as plt
import dgl
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
from dgl.nn.pytorch import GraphConv
import json
import itertools
import scipy.sparse as sp
from dgl.nn import SAGEConv

Using backend: pytorch


In [4]:
import dgl.data

dataset = dgl.data.CoraGraphDataset()
g = dataset[0]
g.ndata[ 'feat' ][0].type()
g

  NumNodes: 2708
  NumEdges: 10556
  NumFeats: 1433
  NumClasses: 7
  NumTrainingSamples: 140
  NumValidationSamples: 500
  NumTestSamples: 1000
Done loading data from cached files.


Graph(num_nodes=2708, num_edges=10556,
      ndata_schemes={'feat': Scheme(shape=(1433,), dtype=torch.float32), 'label': Scheme(shape=(), dtype=torch.int64), 'test_mask': Scheme(shape=(), dtype=torch.bool), 'train_mask': Scheme(shape=(), dtype=torch.bool), 'val_mask': Scheme(shape=(), dtype=torch.bool)}
      edata_schemes={})

In [5]:
from google.colab import files
uploaded = files.upload()

Saving isolated_nodes.csv to isolated_nodes (1).csv
Saving node_classification.csv to node_classification (1).csv
Saving node_features_text.json to node_features_text (1).json
Saving training_graph.csv to training_graph (1).csv


In [6]:
def build_network():
    # Create the source and target nodes:
    src = np.array([], dtype = np.int_)
    dst = np.array([], dtype = np.int_)
    count = 0
    data_path = "training_graph.csv"
    df = pd.read_csv(data_path)
    src = df["node1"].to_numpy( dtype = np.int_ )
    dst = df["node2"].to_numpy( dtype = np.int_ )
    
    # Make the graph bidirectional as the data provided is undirected
    y = np.concatenate([dst, src] )
    x = np.concatenate([src, dst] )
    return dgl.graph((x, y))

In [7]:
g = build_network()
g

Graph(num_nodes=22470, num_edges=264076,
      ndata_schemes={}
      edata_schemes={})

In [8]:
# Open the json file for the features and load it as a dict in python
with open('node_features_text.json') as json_file:
    data = json.load(json_file)

# Open the node classification file as an array
classif = pd.read_csv( 'node_classification.csv' ).page_type.to_numpy( dtype = np.int_ )

# Calculate the maximum column in the one-hot coded file
col_len = max( [ max( x ) for x in data.values() ] ) + 1

# Construct a feature matrix out of the one-hot coded data
# featMat = np.zeros( ( len(data.keys()), col_len + 1 ), dtype = int )
featMat = torch.zeros( len(data.keys()), col_len + 1, dtype=torch.float32)
# featMat = [ [0 for i in range( col_len ) ] for j in range( len(data.keys()) ) ]
for key in data:
  for idx in data[key]:
    featMat[ int(key) ][ int(idx) ] = 1
  
# concatenate the classification of every node to the end of the one-hot
# coded feature
for i in range( len( featMat ) ):
  featMat[i][-1] = int( classif[ i ] )
  # featMat[ i ].append( int( classif[ i ] ) )

# featMat = np.array( [np.array(xi) for xi in featMat])

In [9]:
g.ndata[ 'feat' ] = torch.tensor( featMat, dtype = torch.float32 )
g.ndata[ 'feat' ][0].type()
g

  """Entry point for launching an IPython kernel.


Graph(num_nodes=22470, num_edges=264076,
      ndata_schemes={'feat': Scheme(shape=(4715,), dtype=torch.float32)}
      edata_schemes={})

In [10]:
u, v = g.edges()

eids = np.arange(g.number_of_edges())
eids = np.random.permutation(eids)
test_size = int(len(eids) * 0.1)
train_size = g.number_of_edges() - test_size
test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]
train_pos_u, train_pos_v = u[eids[test_size:]], v[eids[test_size:]]

# Find all negative edges and split them for training and testing
adj = sp.coo_matrix((np.ones(len(u)), (u.numpy(), v.numpy())))
adj_neg = 1 - adj.todense() - np.eye(g.number_of_nodes())
neg_u, neg_v = np.where(adj_neg != 0)

neg_eids = np.random.choice(len(neg_u), g.number_of_edges() // 2)
test_neg_u, test_neg_v = neg_u[neg_eids[:test_size]], neg_v[neg_eids[:test_size]]
train_neg_u, train_neg_v = neg_u[neg_eids[test_size:]], neg_v[neg_eids[test_size:]]

In [11]:
train_g = dgl.remove_edges(g, eids[:test_size])
# train_g.ndata['x'].shape[1]

In [12]:
class GraphSAGE(nn.Module):
    def __init__(self, in_feats, h_feats):
        super(GraphSAGE, self).__init__()
        self.conv1 = SAGEConv(in_feats, h_feats, 'mean')
        self.conv2 = SAGEConv(h_feats, h_feats, 'mean')

    def forward(self, g, in_feat):
        h = self.conv1(g, in_feat)
        h = F.relu(h)
        h = self.conv2(g, h)
        return h

In [13]:
train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=g.number_of_nodes())
train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=g.number_of_nodes())

test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=g.number_of_nodes())
test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=g.number_of_nodes())

In [14]:
import dgl.function as fn

class DotPredictor(nn.Module):
    def forward(self, g, h):
        with g.local_scope():
            g.ndata[ 'h' ] = h
            # Compute a new edge feature named 'score' by a dot-product between the
            # source node feature 'h' and destination node feature 'h'.
            g.apply_edges(fn.u_dot_v('h', 'h', 'score'))
            # u_dot_v returns a 1-element vector for each edge so you need to squeeze it.
            return g.edata['score'][:, 0]

In [15]:
model = GraphSAGE(train_g.ndata['feat'].shape[1], 16)
# You can replace DotPredictor with MLPPredictor.
#pred = MLPPredictor(16)
pred = DotPredictor()

def compute_loss(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score])
    labels = torch.cat([torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])])
    return F.binary_cross_entropy_with_logits(scores, labels)

def compute_auc(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score]).numpy()
    labels = torch.cat(
        [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]).numpy()
    return roc_auc_score(labels, scores)

In [16]:
# ----------- 3. set up loss and optimizer -------------- #
# in this case, loss will in training loop
optimizer = torch.optim.Adam(itertools.chain(model.parameters(), pred.parameters()), lr=0.01)

# ----------- 4. training -------------------------------- #
all_logits = []
for e in range(100):
    # forward
    print(type(train_g.ndata['feat'][0][0].item()))
    h = model(train_g, train_g.ndata['feat'])
    pos_score = pred(train_pos_g, h)
    neg_score = pred(train_neg_g, h)
    loss = compute_loss(pos_score, neg_score)

    # backward
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if e % 5 == 0:
        print('In epoch {}, loss: {}'.format(e, loss))

# ----------- 5. check results ------------------------ #
from sklearn.metrics import roc_auc_score
with torch.no_grad():
    pos_score = pred(test_pos_g, h)
    neg_score = pred(test_neg_g, h)
    print('AUC', compute_auc(pos_score, neg_score))

<class 'float'>
In epoch 0, loss: 0.6166734099388123
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
In epoch 5, loss: 0.5121425986289978
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
In epoch 10, loss: 0.41029733419418335
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
In epoch 15, loss: 0.3730793297290802
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
In epoch 20, loss: 0.34785956144332886
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
In epoch 25, loss: 0.3330219089984894
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
In epoch 30, loss: 0.32246193289756775
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
In epoch 35, loss: 0.3143177628517151
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
In epoch 40, loss: 0.3090357780456543
<