In [1]:
import mxnet as mx
from mxnet import gluon
from mxnet import nd
from mxnet import autograd
from mxnet.gluon import nn

In [2]:
import numpy as np
import pickle as pkl
import networkx as nx
import scipy.sparse as sp
from scipy.sparse.linalg.eigen.arpack import eigsh
import sys

In [3]:
ctx = mx.cpu()

In [4]:
class gcn_layer(nn.HybridBlock):
    def __init__(self, num_of_filters, **kwargs):
        super(gcn_layer, self).__init__(**kwargs)
        with self.name_scope():
            self.fc = nn.Dense(num_of_filters)
        
    def hybrid_forward(self, F, x, A_):
        '''
        Parameters
        ----------
        A_, D^{-1/2} A D^{-1/2}
        '''
        return self.fc(F.dot(A_, x))
        
        
class GCN(nn.HybridBlock):
    def __init__(self, **kwargs):
        super(GCN, self).__init__(**kwargs)
        with self.name_scope():
            self.gcn1 = gcn_layer(256)
            self.gcn2 = gcn_layer(7)
        
    def hybrid_forward(self, F, x, A_):
        return self.gcn2(F.relu(self.gcn1(x, A_)), A_)

In [5]:
def parse_index_file(filename):
    """Parse index file."""
    index = []
    for line in open(filename):
        index.append(int(line.strip()))
    return index


def sample_mask(idx, l):
    """Create mask."""
    mask = np.zeros(l)
    mask[idx] = 1
    return np.array(mask, dtype=np.bool)

def load_data(dataset_str):
    """
    Loads input data from gcn/data directory
    ind.dataset_str.x => the feature vectors of the training instances as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.tx => the feature vectors of the test instances as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.allx => the feature vectors of both labeled and unlabeled training instances
        (a superset of ind.dataset_str.x) as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.y => the one-hot labels of the labeled training instances as numpy.ndarray object;
    ind.dataset_str.ty => the one-hot labels of the test instances as numpy.ndarray object;
    ind.dataset_str.ally => the labels for instances in ind.dataset_str.allx as numpy.ndarray object;
    ind.dataset_str.graph => a dict in the format {index: [index_of_neighbor_nodes]} as collections.defaultdict
        object;
    ind.dataset_str.test.index => the indices of test instances in graph, for the inductive setting as list object.
    All objects above must be saved using python pickle module.
    :param dataset_str: Dataset name
    :return: All data input files loaded (as well the training/test data).
    """
    names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
    objects = []
    for i in range(len(names)):
        with open("data/ind.{}.{}".format(dataset_str, names[i]), 'rb') as f:
            if sys.version_info > (3, 0):
                objects.append(pkl.load(f, encoding='latin1'))
            else:
                objects.append(pkl.load(f))

    x, y, tx, ty, allx, ally, graph = tuple(objects)
    test_idx_reorder = parse_index_file("data/ind.{}.test.index".format(dataset_str))
    test_idx_range = np.sort(test_idx_reorder)

    if dataset_str == 'citeseer':
        # Fix citeseer dataset (there are some isolated nodes in the graph)
        # Find isolated nodes, add them as zero-vecs into the right position
        test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder)+1)
        tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
        tx_extended[test_idx_range-min(test_idx_range), :] = tx
        tx = tx_extended
        ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))
        ty_extended[test_idx_range-min(test_idx_range), :] = ty
        ty = ty_extended

    features = sp.vstack((allx, tx)).tolil()
    features[test_idx_reorder, :] = features[test_idx_range, :]
    adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))

    labels = np.vstack((ally, ty))
    labels[test_idx_reorder, :] = labels[test_idx_range, :]

    idx_test = test_idx_range.tolist()
    idx_train = range(len(y))
    idx_val = range(len(y), len(y)+500)

    train_mask = sample_mask(idx_train, labels.shape[0])
    val_mask = sample_mask(idx_val, labels.shape[0])
    test_mask = sample_mask(idx_test, labels.shape[0])

    y_train = np.zeros(labels.shape)
    y_val = np.zeros(labels.shape)
    y_test = np.zeros(labels.shape)
    y_train[train_mask, :] = labels[train_mask, :]
    y_val[val_mask, :] = labels[val_mask, :]
    y_test[test_mask, :] = labels[test_mask, :]

    return adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask

In [6]:
adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask = load_data('cora')

In [7]:
features = nd.array(features.toarray(), ctx = ctx)
y_train = nd.array(y_train, ctx = ctx)
y_val = nd.array(y_val, ctx = ctx)

In [8]:
A_tilde = adj.toarray() + np.identity(adj.shape[0])
D = A_tilde.sum(axis = 1)
A_ = nd.array(np.diag(D ** -0.5).dot(A_tilde).dot(np.diag(D ** -0.5)), ctx = ctx)
A_.shape

(2708, 2708)

In [9]:
idx = np.arange(len(A_))

In [10]:
net = GCN()
net.initialize(ctx = ctx)
net.hybridize()
output = net(features, A_)

In [11]:
loss_function = gluon.loss.SoftmaxCrossEntropyLoss()

In [12]:
trainer = gluon.Trainer(net.collect_params(), 'adam', {'learning_rate': 1e-3})

In [13]:
for epoch in range(100):
    with autograd.record():
        output = net(features, A_)
        l = loss_function(output[idx[train_mask]], nd.argmax(y_train[idx[train_mask]], axis = 1))
    l.backward()
    trainer.step(1)
    print('training loss: %.2f'%(l.mean().asnumpy()[0]))
    
    output = net(features, A_)
    l = loss_function(output[idx[val_mask]], nd.argmax(y_val[idx[val_mask]], axis = 1))
    print('validation loss %.2f'%(l.mean().asnumpy()[0]))
    print()

training loss: 1.95
validation loss 1.93

training loss: 1.91
validation loss 1.91

training loss: 1.88
validation loss 1.90

training loss: 1.85
validation loss 1.88

training loss: 1.81
validation loss 1.86

training loss: 1.77
validation loss 1.84

training loss: 1.74
validation loss 1.82

training loss: 1.70
validation loss 1.80

training loss: 1.66
validation loss 1.78

training loss: 1.61
validation loss 1.75

training loss: 1.57
validation loss 1.73

training loss: 1.52
validation loss 1.70

training loss: 1.47
validation loss 1.67

training loss: 1.42
validation loss 1.64

training loss: 1.37
validation loss 1.61

training loss: 1.31
validation loss 1.57

training loss: 1.26
validation loss 1.54

training loss: 1.20
validation loss 1.51

training loss: 1.15
validation loss 1.47

training loss: 1.09
validation loss 1.44

training loss: 1.03
validation loss 1.40

training loss: 0.98
validation loss 1.37

training loss: 0.92
validation loss 1.33

training loss: 0.87
validation los

In [14]:
output = net(features, A_)
from sklearn.metrics import accuracy_score
accuracy_score(np.argmax(y_test[idx[test_mask]], axis = 1), nd.argmax(output[idx[test_mask]], axis = 1).asnumpy())

0.78800000000000003

# Co-Train

In [15]:
t = 50
alpha = 1e-6
Lambda = np.identity(len(A_))
L = np.diag(D) - adj

In [16]:
from numpy.linalg import inv

In [17]:
P = inv(L + alpha * Lambda)

In [18]:
train_label = nd.argmax(y_train[idx[train_mask]], axis = 1).asnumpy()

In [19]:
train_idx = set(idx[train_mask].tolist())

In [20]:
train_dict = dict(zip(train_idx, map(int, train_label)))

In [21]:
for k in range(y_train.shape[1]):
    nodes = idx[train_mask][train_label == k]
    probability = P[:, nodes].sum(axis = 1).flatten()
    for i in np.argsort(probability).tolist()[0][::-1][:t]:
        if i in train_dict:
            continue
        train_dict[i] = k

In [22]:
len(train_dict)

350

In [23]:
new_train_index = sorted(train_dict.keys())
new_train_label = [train_dict[i] for i in new_train_index]

In [24]:
net2 = GCN()
net2.initialize(ctx = ctx)
net2.hybridize()

trainer = gluon.Trainer(net2.collect_params(), 'adam', {'learning_rate': 1e-3})

for epoch in range(100):
    with autograd.record():
        output = net2(features, A_)
        l = loss_function(output[new_train_index], nd.array(new_train_label, ctx = ctx))
    l.backward()
    trainer.step(1)
    print('training loss: %.2f'%(l.mean().asnumpy()[0]))
    
    output = net2(features, A_)
    l = loss_function(output[idx[val_mask]], nd.argmax(y_val[idx[val_mask]], axis = 1))
    print('validation loss %.2f'%(l.mean().asnumpy()[0]))
    print()

training loss: 1.95
validation loss 1.93

training loss: 1.91
validation loss 1.92

training loss: 1.88
validation loss 1.90

training loss: 1.84
validation loss 1.89

training loss: 1.81
validation loss 1.87

training loss: 1.77
validation loss 1.85

training loss: 1.73
validation loss 1.83

training loss: 1.69
validation loss 1.81

training loss: 1.65
validation loss 1.79

training loss: 1.61
validation loss 1.77

training loss: 1.56
validation loss 1.74

training loss: 1.51
validation loss 1.72

training loss: 1.46
validation loss 1.69

training loss: 1.41
validation loss 1.66

training loss: 1.36
validation loss 1.63

training loss: 1.30
validation loss 1.60

training loss: 1.25
validation loss 1.57

training loss: 1.19
validation loss 1.54

training loss: 1.14
validation loss 1.50

training loss: 1.08
validation loss 1.47

training loss: 1.02
validation loss 1.44

training loss: 0.97
validation loss 1.40

training loss: 0.91
validation loss 1.37

training loss: 0.86
validation los

In [25]:
output = net(features, A_)
from sklearn.metrics import accuracy_score
accuracy_score(np.argmax(y_test[idx[test_mask]], axis = 1), nd.argmax(output[idx[test_mask]], axis = 1).asnumpy())

0.78800000000000003