In [None]:
# Import libraries
import sys

import networkx as nx
import pandas as pd
import numpy as np
import os

import stellargraph as sg
from stellargraph import StellarGraph
from stellargraph.mapper import GraphSAGENodeGenerator, FullBatchNodeGenerator
from stellargraph.layer import GraphSAGE, GCN

import tensorflow as tf
from tensorflow import keras
from keras import layers, optimizers, losses, metrics, Model
from sklearn import preprocessing, feature_extraction, model_selection

import json

In [None]:
# Report how many GPU devices TensorFlow detects (nothing is configured here)
gpu_devices = tf.config.list_physical_devices('GPU')

print("Num GPUs Available: ", len(gpu_devices))

## Load Data and Preprocessing

In [None]:
# Load the Cora citation graph from its GraphML file into a NetworkX graph
G = nx.read_graphml( "data/cora/cora.graphml" )

print(G) # should print a graph with 2708 nodes and 5429 edges (directed)

# In this case we have created an undirected graph, so the number of edges differs from the original paper

In [None]:
# Show the graph-level attribute dictionary of the loaded NetworkX graph
G.graph

In [None]:
# Pretty-print the first (node_id, attribute_dict) pair to inspect the raw node attributes
print( json.dumps( list( G.nodes(data=True) )[0], indent = 4 ) ) # 1-1432: weights, 1433: subject

In [None]:
# Peek at the 'subject' attribute of the first node only
first_node = next( iter( G.nodes() ), None )
if first_node is not None:
    print( G.nodes[first_node]['subject'] )

In [None]:
# For each node, gather every attribute except 'subject' and 'label' into a
# single 'feature' list. Values are taken in the attribute dict's iteration
# order, so all nodes are assumed to store their weights in the same order
# -- TODO confirm for this GraphML file.
for node in G.nodes():
    attrs = G.nodes[node]
    if 'feature' in attrs:
        # Already encoded (the cell was re-run): encoding again would nest the
        # previous 'feature' list inside the new one and corrupt the vector.
        continue
    attrs['feature'] = [ val for key, val in attrs.items() if key not in ('subject', 'label') ]

In [None]:
# Show the first (node_id, attributes) entry to verify the new 'feature' attribute
first_entry = next( iter( G.nodes( data=True ) ), None )
if first_entry is not None:
    print( first_entry )

In [None]:
# Import kept so that any later cell relying on deepcopy keeps working;
# the cleanup below no longer needs a copy of the graph.
from copy import deepcopy

# Remove the redundant per-word weight attributes from every node, keeping
# only 'subject', 'label' and the consolidated 'feature' list.
for node in G.nodes():
    attrs = G.nodes[node]
    # Iterate over a snapshot of the keys: popping from the live dict while
    # iterating it raises "dictionary changed size during iteration". A key
    # snapshot replaces the full deep copy of the graph used previously.
    for key in list( attrs ):
        if key not in ('subject', 'label', 'feature'):
            attrs.pop(key)

In [None]:
# Print the first node with its attributes after the cleanup step
first_entry = next( iter( G.nodes( data=True ) ), None )
if first_entry is not None:
    print( first_entry )

In [None]:
# Pretty-print the first (node_id, attribute_dict) pair as indented JSON
print( json.dumps( list( G.nodes(data=True) )[0], indent = 4 ) )

In [None]:
# Pretty-print the first ten edges (with their attribute dicts) as indented JSON
print( json.dumps( list( G.edges(data=True) )[0:10], indent = 4 ) )

## Create StellarGraph

In [None]:
# Build a StellarGraph from the NetworkX graph, reading each node's feature
# vector from its 'feature' attribute.
# NOTE(review): node_type_default appears to name the node *type* given to
# nodes without a type attribute; "subject" here would then be a type name,
# not the class label -- confirm this is intended.
SG = StellarGraph.from_networkx(
    graph=G, node_type_default="subject", node_features="feature"
)

print( SG.info() )

In [None]:
# Build node_subjects: a pandas Series mapping node ID (index) -> subject (value)
node_ids = list( G.nodes() )
subject_values = [ G.nodes[node_id]['subject'] for node_id in node_ids ]
node_subjects = pd.Series( subject_values, index = node_ids )
print( node_subjects.head() )

In [None]:
# Split into train (10% of nodes) and test sets, stratified by subject so both
# sets keep the class proportions. random_state is fixed so that
# Restart & Run All reproduces the same split, matching the seeded splits
# used for the other models in this notebook.
train_subjects, test_subjects = model_selection.train_test_split(
    node_subjects, train_size=0.1, test_size=None, stratify=node_subjects, random_state=42
)

print( train_subjects )

In [None]:
from collections import Counter
Counter(train_subjects) # per-class counts of the training split; check for class imbalance

In [None]:
# Encode subjects as a binary indicator matrix (one column per class).
# The binarizer is fitted on the training subjects only and reused for the test subjects.
le = preprocessing.LabelBinarizer()
train_targets = le.fit_transform(train_subjects)
test_targets = le.transform(test_subjects)

In [None]:
print( train_targets ) # one row per training node, holding its binarized subject label

## Define benchmarking metrics

In [None]:
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support, roc_auc_score


def benchmarking_result( targets, predictions, average='macro' ) :
    """Compute accuracy, precision, recall and F1 for class-index predictions.

    Parameters
    ----------
    targets : array-like
        True class indices, one per sample.
    predictions : array-like
        Predicted class indices, same shape as ``targets``.
    average : str, default 'macro'
        Averaging mode forwarded to ``precision_recall_fscore_support``.
        The previous hard-coded 'micro' mode makes precision, recall and F1
        all equal to accuracy for single-label multiclass data, so the four
        returned numbers were always identical. 'macro' averages the
        per-class scores instead. Pass ``average='micro'`` to get the old
        values back.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)``.

    Raises
    ------
    ValueError
        If ``targets`` and ``predictions`` do not have the same shape.
    """
    # Coerce to arrays: comparing two plain lists with == yields a single
    # bool rather than an element-wise result, which would silently give 0/1.
    targets = np.asarray( targets )
    predictions = np.asarray( predictions )
    if targets.shape != predictions.shape:
        raise ValueError(
            "targets and predictions must have the same shape, got {} and {}".format(
                targets.shape, predictions.shape
            )
        )

    # Accuracy: fraction of samples whose predicted class equals the target
    accuracy = np.mean( targets == predictions )

    # Precision, recall, f1-score (the fourth value, support, is not needed)
    precision, recall, f1, _ = precision_recall_fscore_support(
        targets, predictions, average=average
    )

    return accuracy, precision, recall, f1


## GraphSAGE model

In [None]:
# Mini-batch size and number of neighbours sampled at each hop.
# num_samples has three entries, matching the three layer sizes passed to
# GraphSAGE further down.
batch_size = 50
num_samples = [10, 10, 5]

generator = GraphSAGENodeGenerator( SG, batch_size, num_samples )

In [None]:
# Training data flow over the training node IDs and their targets, shuffled by the generator
train_gen = generator.flow( train_subjects.index, train_targets, shuffle = True )

In [None]:
# Three-layer GraphSAGE encoder (hidden sizes 32, 32, 16) with bias and 50% dropout
graphsage_model = GraphSAGE(
    layer_sizes=[32, 32, 16], generator=generator, bias=True, dropout=0.5
)

In [None]:
# Shape of the training targets; its second dimension sizes the output layer below
print( train_targets.shape )

In [None]:
# Get the encoder's input/output tensors and attach a softmax classification
# layer with one unit per target column.
x_inp, x_out = graphsage_model.in_out_tensors()
prediction = layers.Dense(units=train_targets.shape[1], activation="softmax")(x_out)

### Train model

In [None]:
# Assemble the Keras model and compile it with Adam (lr 0.005),
# categorical cross-entropy loss and accuracy as the tracked metric.
model = Model(inputs=x_inp, outputs=prediction)
model.compile(
    optimizer=optimizers.Adam( learning_rate=0.005 ),
    loss=losses.categorical_crossentropy,
    metrics=["acc"],
)

In [None]:
# Data flow over the test node IDs and their targets (no shuffle argument is passed)
test_gen = generator.flow(test_subjects.index, test_targets)

In [None]:
# Train for 20 epochs. shuffle=False is passed to fit() because train_gen was
# already created with shuffle=True.
# NOTE(review): the test generator is used as validation data here, so the
# test metrics reported later are not from data unseen during training runs.
history = model.fit(
    train_gen, epochs=20, validation_data=test_gen, verbose=2, shuffle=False
)

In [None]:
# Plot the metrics recorded in the training history
sg.utils.plot_history(history)

In [None]:
# Evaluate on the test generator and list every metric the model reports
test_metrics = model.evaluate(test_gen)
print("\nTest Set Metrics:")
for metric_name, metric_value in zip(model.metrics_names, test_metrics):
    print("\t{}: {:0.4f}".format(metric_name, metric_value))

### Benchmark model

In [None]:
# Convert the binarized test targets and the predicted class probabilities
# into class indices (argmax over the class axis) for benchmarking.
targets = np.argmax( test_targets, axis=1 )
predictions = np.argmax( model.predict(test_gen), axis=1 )

In [None]:
# Benchmark the test-set predictions. Note that train_acc holds the accuracy
# on the *test* targets despite its name.
train_acc, precision, recall, f1 = benchmarking_result( targets, predictions )

print( "Accuracy: ", train_acc )
print( "Precision: ", precision )
print( "Recall: ", recall )
print( "F1: ", f1 )

# The original author flagged this result as wrong.
# NOTE(review): if benchmarking_result uses micro averaging, precision, recall
# and F1 all equal accuracy for single-label multiclass data, which may be
# what was observed here -- confirm.

In [None]:
# Predict class probabilities for every node in the graph (training nodes included)
all_nodes = node_subjects.index
all_mapper = generator.flow(all_nodes)
all_predictions = model.predict(all_mapper)

In [None]:
# Map the predicted outputs back to subject names with the fitted binarizer
node_predictions = le.inverse_transform(all_predictions)

In [None]:
# Side-by-side table of predicted vs. true subject, indexed by node ID
df = pd.DataFrame({"Predicted": node_predictions, "True": node_subjects})
df.head(10)

In [None]:
# Flag each node whose predicted subject matches the true one, then count matches and misses
df['Correct'] = df['Predicted'] == df['True']
df['Correct'].value_counts()

In [None]:
# Fraction of correctly classified nodes. The mean of a boolean column is that
# fraction directly, and unlike value_counts()[True] it does not raise KeyError
# when no prediction is correct.
# NOTE: df covers every node, training nodes included, so this is not a held-out score.
accuracy = df['Correct'].mean()
print( accuracy )

## GCN model

In [None]:
# Split into 140 training nodes, 500 validation nodes and the remainder as
# test nodes, stratified by subject at each step.
train_subjects, test_subjects = model_selection.train_test_split(
    node_subjects, train_size=140, test_size=None, stratify=node_subjects, random_state=42
)

# The second split is seeded as well; otherwise the validation/test membership
# (and therefore the reported metrics) changes on every run.
val_subjects, test_subjects = model_selection.train_test_split(
    test_subjects, train_size=500, test_size=None, stratify=test_subjects, random_state=42
)

In [None]:
# Per-class counts of the training split.
# NOTE(review): the original note asks why these equal the demo's counts;
# presumably because a stratified split with a fixed train_size pins the
# per-class counts -- confirm.
train_subjects.value_counts().to_frame() # why is equal to demo?

In [None]:
# Encode subjects as a binary indicator matrix; the binarizer is fitted on
# the training subjects and reused for the validation and test subjects.
le = preprocessing.LabelBinarizer()
train_targets = le.fit_transform(train_subjects)
val_targets = le.transform(val_subjects)
test_targets = le.transform(test_subjects)

In [None]:
# Full-batch generator over the whole graph, configured for GCN
generator = FullBatchNodeGenerator(SG, method="gcn")

In [None]:
# Training data flow over the training node IDs and their targets
train_gen = generator.flow(train_subjects.index, train_targets)

In [None]:
# Two-layer GCN (16 units each, ReLU activations) with 50% dropout
gcn = GCN(
    layer_sizes=[16, 16], activations=["relu", "relu"], generator=generator, dropout=0.5
)

In [None]:
# Input and output tensors of the GCN encoder
x_inp, x_out = gcn.in_out_tensors()

In [None]:
# Softmax classification layer with one unit per target column.
# Note: this rebinds `predictions`, which previously held GraphSAGE class indices.
predictions = layers.Dense(units=train_targets.shape[1], activation="softmax")(x_out)

### Train model

In [None]:
# Assemble and compile the GCN classifier: Adam (lr 0.005), categorical
# cross-entropy loss, accuracy metric. This rebinds `model`.
model = Model(inputs=x_inp, outputs=predictions)
model.compile(
    optimizer=optimizers.Adam( learning_rate=0.005 ),
    loss=losses.categorical_crossentropy,
    metrics=["acc"],
)

In [None]:
# Validation data flow over the validation node IDs and their targets
val_gen = generator.flow(val_subjects.index, val_targets)

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop when validation accuracy has not improved for 50 epochs and restore
# the weights from the best epoch.
es_callback = EarlyStopping(monitor="val_acc", patience=50, restore_best_weights=True)

In [None]:
# Train for up to 200 epochs, validating on val_gen with early stopping
history = model.fit(
    train_gen,
    epochs=200,
    validation_data=val_gen,
    verbose=2,
    shuffle=False,  # this should be False, since shuffling data means shuffling the whole graph
    callbacks=[es_callback],
)

In [None]:
# Plot the metrics recorded in the GCN training history
sg.utils.plot_history(history)

In [None]:
# Data flow over the test node IDs and their targets
test_gen = generator.flow(test_subjects.index, test_targets)

In [None]:
# Evaluate the GCN on the test generator and list every metric the model reports
test_metrics = model.evaluate(test_gen)
print("\nTest Set Metrics:")
for metric_name, metric_value in zip(model.metrics_names, test_metrics):
    print("\t{}: {:0.4f}".format(metric_name, metric_value))

## GAT model

In [None]:
from stellargraph.mapper import FullBatchNodeGenerator
from stellargraph.layer import GAT

In [None]:
# Split into 140 training nodes, 500 validation nodes and the remainder as
# test nodes, stratified by subject at each step.
train_subjects, test_subjects = model_selection.train_test_split(
    node_subjects, train_size=140, test_size=None, stratify=node_subjects, random_state=42
)

# The second split is seeded as well; otherwise the validation/test membership
# (and therefore the reported metrics) changes on every run.
val_subjects, test_subjects = model_selection.train_test_split(
    test_subjects, train_size=500, test_size=None, stratify=test_subjects, random_state=42
)

In [None]:
from collections import Counter

# Per-class counts of the training split
Counter(train_subjects)

In [None]:
# Encode subjects as a binary indicator matrix; the binarizer is fitted on
# the training subjects and reused for the validation and test subjects.
le = preprocessing.LabelBinarizer()
train_targets = le.fit_transform(train_subjects)
val_targets = le.transform(val_subjects)
test_targets = le.transform(test_subjects)

In [None]:
# Full-batch generator over the whole graph, configured for GAT
# (the GAT model itself is built in a later cell)
generator = FullBatchNodeGenerator(SG, method="gat")

In [None]:
# Training data flow over the training node IDs and their targets
train_gen = generator.flow(train_subjects.index, train_targets)

In [None]:
# Two-layer GAT (16 units each, ELU activations) with 8 attention heads,
# 50% dropout on inputs and on attention coefficients, and no output normalization.
gat = GAT(
    layer_sizes=[16, 16],
    activations=["elu", "elu"],
    attn_heads=8,
    generator=generator,
    in_dropout=0.5,
    attn_dropout=0.5,
    normalize=None,
)

In [None]:
# Input and output tensors of the GAT encoder
x_inp, x_out = gat.in_out_tensors()

In [None]:
# Softmax classification layer with one unit per target column
predictions = layers.Dense(units=train_targets.shape[1], activation="softmax")(x_out)

In [None]:
# Assemble and compile the GAT classifier: Adam (lr 0.005), categorical
# cross-entropy loss, accuracy metric. This rebinds `model`.
model = Model(inputs=x_inp, outputs= predictions)
model.compile(
    optimizer=optimizers.Adam( learning_rate=0.005 ),
    loss=losses.categorical_crossentropy,
    metrics=["acc"],
)

In [None]:
# Validation data flow over the validation node IDs and their targets
val_gen = generator.flow(val_subjects.index, val_targets)

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Create the checkpoint directory. exist_ok=True makes the cell safe to re-run
# and avoids the check-then-create race of a separate isdir() test.
os.makedirs("logs", exist_ok=True)
es_callback = EarlyStopping(
    monitor="val_acc", patience=20
)  # patience is the number of epochs to wait before early stopping in case of no further improvement
# Save only the weights of the epoch with the best validation accuracy
mc_callback = ModelCheckpoint(
    "logs/best_model.h5", monitor="val_acc", save_best_only=True, save_weights_only=True
)

In [None]:
# Train for up to 200 epochs, validating on val_gen, with early stopping and
# best-weights checkpointing
history = model.fit(
    train_gen,
    epochs=200,
    validation_data=val_gen,
    verbose=2,
    shuffle=False,  # this should be False, since shuffling data means shuffling the whole graph
    callbacks=[es_callback, mc_callback],
)

In [None]:
# Plot the metrics recorded in the GAT training history
sg.utils.plot_history(history)

In [None]:
# Reload the weights written by the ModelCheckpoint callback (best validation accuracy)
model.load_weights("logs/best_model.h5")

In [None]:
# Data flow over the test node IDs and their targets
test_gen = generator.flow(test_subjects.index, test_targets)

In [None]:
# Evaluate the GAT on the test generator and list every metric the model reports
test_metrics = model.evaluate(test_gen)
print("\nTest Set Metrics:")
for metric_name, metric_value in zip(model.metrics_names, test_metrics):
    print("\t{}: {:0.4f}".format(metric_name, metric_value))

In [None]:
# Length reported by test_gen (via the built-in len rather than calling __len__ directly)
print( len( test_gen ) )

In [None]:
# Predict on the test generator as a first step towards an F1 score.
# f1_score is imported here but not yet called within this cell.
from sklearn.metrics import f1_score


y_pred = model.predict(test_gen)

In [None]:
# Print the shape of y_pred (no y_true is defined or printed in this cell)
print(y_pred.shape)