In [1]:
'''

Much of this code is adapted from code written by 
Xifeng Guo, E-mail: `guoxifeng1990@163.com`, Github: `https://github.com/XifengGuo/CapsNet-Keras`
'''


import keras.backend as K
import tensorflow as tf
from keras import initializers, layers, activations
from keras.regularizers import l2


class Length(layers.Layer):
    """
    Compute the length of vectors. This is used to compute a Tensor that has the same shape with y_true in margin_loss.
    Using this layer as model's output can directly predict labels by using `y_pred = np.argmax(model.predict(x), 1)`
    inputs: shape=[None, num_vectors, dim_vector]
    output: shape=[None, num_vectors]
    
    Author: Xifeng Guo, E-mail: `guoxifeng1990@163.com`, Github: `https://github.com/XifengGuo/CapsNet-Keras`
    """
    def call(self, inputs, **kwargs):
        return K.sqrt(K.sum(K.square(inputs), -1))

    def compute_output_shape(self, input_shape):
        return input_shape[:-1]


def squash(vectors, axis=-1):
    """
    The non-linear activation used in Capsule. It drives the length of a large vector to near 1 and small vector to 0
    :param vectors: some vectors to be squashed, N-dim tensor
    :param axis: the axis to squash
    :return: a Tensor with same shape as input vectors
    """
    s_squared_norm = K.sum(K.square(vectors), axis, keepdims=True)
    scale = s_squared_norm / (1 + s_squared_norm) / K.sqrt(s_squared_norm + K.epsilon())
    return scale * vectors

class CapsuleLayer(layers.Layer):
    """
    The capsule layer. It is similar to Dense layer. Dense layer has `in_num` inputs, each is a scalar, the output of the 
    neuron from the former layer, and it has `out_num` output neurons. CapsuleLayer just expand the output of the neuron
    from scalar to vector. So its input shape = [None, input_num_capsule, input_dim_capsule] and output shape = \
    [None, num_capsule, dim_capsule]. For Dense Layer, input_dim_capsule = dim_capsule = 1.
    
    :param num_capsule: number of capsules in this layer
    :param dim_capsule: dimension of the output vectors of the capsules in this layer
    :param num_routing: number of iterations for the routing algorithm

    Author: Xifeng Guo, E-mail: `guoxifeng1990@163.com`, Github: `https://github.com/XifengGuo/CapsNet-Keras`
    """
    def __init__(self, num_capsule, dim_capsule, num_routing=3,
                 kernel_initializer='glorot_uniform', kernel_regularizer=l2(1e-2),
                 **kwargs):
        super(CapsuleLayer, self).__init__(**kwargs)
        self.num_capsule = num_capsule
        self.dim_capsule = dim_capsule
        self.num_routing = num_routing
        self.kernel_initializer = initializers.get(kernel_initializer)
        self.kernel_regularizer = kernel_regularizer

    def build(self, input_shape):
        assert len(input_shape) >= 3, "The input Tensor should have shape=[None, input_num_capsule, input_dim_capsule]"
        self.input_num_capsule = input_shape[1]
        self.input_dim_capsule = input_shape[2]

        # Transform matrix
        self.W = self.add_weight(shape=[self.num_capsule, self.input_num_capsule,
                                        self.dim_capsule, self.input_dim_capsule],
                                 initializer=self.kernel_initializer,
                                 regularizer=self.kernel_regularizer,
                                 name='W')

        self.built = True

    def call(self, inputs, training=None):
        # inputs.shape=[None, input_num_capsule, input_dim_capsule]
        # inputs_expand.shape=[None, 1, input_num_capsule, input_dim_capsule]
        inputs_expand = K.expand_dims(inputs, 1)

        # Replicate num_capsule dimension to prepare being multiplied by W
        # inputs_tiled.shape=[None, num_capsule, input_num_capsule, input_dim_capsule]
        inputs_tiled = K.tile(inputs_expand, [1, self.num_capsule, 1, 1])

        # Compute `inputs * W` by scanning inputs_tiled on dimension 0.
        # x.shape=[num_capsule, input_num_capsule, input_dim_capsule]
        # W.shape=[num_capsule, input_num_capsule, dim_capsule, input_dim_capsule]
        # Regard the first two dimensions as `batch` dimension,
        # then matmul: [input_dim_capsule] x [dim_capsule, input_dim_capsule]^T -> [dim_capsule].
        # inputs_hat.shape = [None, num_capsule, input_num_capsule, dim_capsule]
        inputs_hat = K.map_fn(lambda x: K.batch_dot(x, self.W, [2, 3]), elems=inputs_tiled)

        """
        # Begin: routing algorithm V1, dynamic ------------------------------------------------------------#
        # The prior for coupling coefficient, initialized as zeros.
        b = K.zeros(shape=[self.batch_size, self.num_capsule, self.input_num_capsule])

        def body(i, b, outputs):
            c = tf.nn.softmax(b, dim=1)  # dim=2 is the num_capsule dimension
            outputs = squash(K.batch_dot(c, inputs_hat, [2, 2]))
            if i != 1:
                b = b + K.batch_dot(outputs, inputs_hat, [2, 3])
            return [i-1, b, outputs]

        cond = lambda i, b, inputs_hat: i > 0
        loop_vars = [K.constant(self.num_routing), b, K.sum(inputs_hat, 2, keepdims=False)]
        shape_invariants = [tf.TensorShape([]),
                            tf.TensorShape([None, self.num_capsule, self.input_num_capsule]),
                            tf.TensorShape([None, self.num_capsule, self.dim_capsule])]
        _, _, outputs = tf.while_loop(cond, body, loop_vars, shape_invariants)
        # End: routing algorithm V1, dynamic ------------------------------------------------------------#
        """
        # Begin: Routing algorithm ---------------------------------------------------------------------#
        # In forward pass, `inputs_hat_stopped` = `inputs_hat`;
        # In backward, no gradient can flow from `inputs_hat_stopped` back to `inputs_hat`.
        inputs_hat_stopped = K.stop_gradient(inputs_hat)
        
        # The prior for coupling coefficient, initialized as zeros.
        # b.shape = [None, self.num_capsule, self.input_num_capsule].
        b = tf.zeros(shape=[K.shape(inputs_hat)[0], self.num_capsule, self.input_num_capsule])

        assert self.num_routing > 0, 'The num_routing should be > 0.'
        for i in range(self.num_routing):
            # c.shape=[batch_size, num_capsule, input_num_capsule]
            c = tf.nn.softmax(b, dim=1)

            # At last iteration, use `inputs_hat` to compute `outputs` in order to backpropagate gradient
            if i == self.num_routing - 1:
                # c.shape =  [batch_size, num_capsule, input_num_capsule]
                # inputs_hat.shape=[None, num_capsule, input_num_capsule, dim_capsule]
                # The first two dimensions as `batch` dimension,
                # then matmal: [input_num_capsule] x [input_num_capsule, dim_capsule] -> [dim_capsule].
                # outputs.shape=[None, num_capsule, dim_capsule]
                outputs = squash(K.batch_dot(c, inputs_hat, [2, 2]))  # [None, 10, 16]
            else:  # Otherwise, use `inputs_hat_stopped` to update `b`. No gradients flow on this path.
                outputs = squash(K.batch_dot(c, inputs_hat_stopped, [2, 2]))

                # outputs.shape =  [None, num_capsule, dim_capsule]
                # inputs_hat.shape=[None, num_capsule, input_num_capsule, dim_capsule]
                # The first two dimensions as `batch` dimension,
                # then matmal: [dim_capsule] x [input_num_capsule, dim_capsule]^T -> [input_num_capsule].
                # b.shape=[batch_size, num_capsule, input_num_capsule]
                b += K.batch_dot(outputs, inputs_hat_stopped, [2, 3])
        # End: Routing algorithm -----------------------------------------------------------------------#

        return outputs

    def compute_output_shape(self, input_shape):
        return tuple([None, self.num_capsule, self.dim_capsule])


def PrimaryCap(x, L, dim_capsule, num_capsule, kernel_size, activation=None):
    """
    Apply GraphConv `n_channels` times and concatenate all capsules
    :param inputs: 3D tensor, shape=[None, N, channels]
    :param dim_capsule: the dim of the output vector of capsule
    :param n_channels: the number of types of capsules
    :return: output tensor, shape=[None, num_capsule, dim_capsule]

    Author: David McDonald, E-mail: `dxm237@cs.bham.ac.uk`
    """
    output = GraphConv(num_filters=dim_capsule*num_capsule, 
                       kernel_size=kernel_size, activation=activation,
                           name='primarycap_GraphConv')([L, x])
    outputs = layers.Reshape(target_shape=[-1, dim_capsule], name='primarycap_reshape')(output)
    return layers.Lambda(squash, name='primarycap_squash')(outputs)

class GraphConv(layers.Layer):
    """
    A layer to perform a convolutional filter on a graph

    :param L_hat: rescaled graph laplacian: L_hat = 2L/lambda_max - identity(N)
    :param num_filters: number of filters
    :param kernel_size: size of kernel to use
    
    Author: David McDonald, Email: `dxm237@cs.bham.ac.uk'
    """
    def __init__(self, num_filters, kernel_size, activation=None,
                 kernel_initializer='glorot_uniform', kernel_regularizer=l2(1e-2),
                 **kwargs):
        super(GraphConv, self).__init__(**kwargs)
        # SparseTensor
#         self.L_hat = L_hat
        self.num_filters = num_filters
        self.kernel_size = kernel_size
        self.activation = activation
        self.kernel_initializer = initializers.get(kernel_initializer)
        self.kernel_regularizer = kernel_regularizer

    def build(self, input_shape):
        '''
        L input shape[0] = [batch_size, batch_size]
        x input_shape[1] = [batch_size, D, input_channels]
        '''
        
        _, D, num_channels = input_shape[1]
        
        # chebyshev co-efficients
        self.W = self.add_weight(shape=[num_channels * (self.kernel_size + 1), self.num_filters],
                                 initializer=self.kernel_initializer, regularizer=self.kernel_regularizer,
                                 name='chebyshev_co-efficients')
        
        # bias term ?
        self.bias = self.add_weight(shape=[1, D, self.num_filters], 
                                    initializer=self.kernel_initializer, regularizer=self.kernel_regularizer,
                                   name="bias")

        self.built = True
        
    def sparse_matmul(self, L, x):
        '''
        L.shape = [N, N]
        x.shape = [D, N, C]
        
        permute dimensions of x to [N, D, C]
        reshape to [N, D*C]
        sparse to dense matmul with L: [N, N] @ [N, D*C] -> [N, D*C]
        reshape to [N, D, C]
        permute axes to [D, N, C]
        
        ----------------------------------------------------
        
        L.shape = [N, N]
        x.shape = [N, D, C]
        
        reshape to [N, D*C]
        sparse to dense matmul with L: [N, N] @ [N, D*C] -> [N, D*C]
        reshape to [N, D, C]
        
        '''
        N = K.shape(x)[0]
        D = K.shape(x)[1]
        C = K.shape(x)[2]
        
        x_transformed = x
#         x_transformed = K.permute_dimensions(x_transformed, [1,0,2])
        x_transformed = K.reshape(x_transformed, [N, D*C])
        x_transformed = tf.matmul(L, x_transformed)
#         x_transformed =  K.reshape(x_transformed, [N, D, C])
#         return K.permute_dimensions(x_transformed, [1, 0, 2])
        return K.reshape(x_transformed, [N, D, C])
        
    def compute_x_hat(self, L, x):
        '''
        efficient computation of filter using recurrence relation of Chebyshev polynomials
        T_k(x) = 2xT_k-1(x) - T_k-2(x)
        T_0(x) = 1
        T_1(x) = x
        
        input shape is [None, D, input_channels]
        output shape is [None, D, filters]
        
        '''

        x_hat = [x]
        
        if self.kernel_size > 0:
            x_hat.append(self.sparse_matmul(L, x))
        
        for k in range(self.kernel_size - 1):
            x_hat.append(2 * self.sparse_matmul(L, x_hat[-1]) - x_hat[-2])
    
        # concatenate to combine input_channels and kernel_size axes
        return K.concatenate(x_hat, axis=-1)
        

    def call(self, inputs):

        '''
        chebyshev polynomial for filtering using coefficients started in kernel
        '''
        L, x = inputs
        x_hat = self.compute_x_hat(L, x)
        # x_hat shape = [None, D, input_channels*filter_size]
        # W shape = [input_channels*filter_size, num_filters]
        # output shape = [None, D, num_filters]
        output = K.dot(x_hat, self.W)
        
#         print output.shape
#         print self.bias.shape
        
        output += self.bias

        if self.activation is not None:
            output = activations.get(self.activation)(output)
        
        return output
        
    def compute_output_shape(self, input_shape):
        '''
        L input shape[0] = [batch_size, batch_size]
        x input_shape[1] = [batch_size, D, input_channels]
        
        output_shape is [None, D, num_filters]
        '''
        return tuple([None, input_shape[1][1], self.num_filters])

Using TensorFlow backend.


In [2]:
def masked_crossentropy(y_true, y_pred):
    '''
    y_true shape = [None, num_classes]
    y_pred shape = [None, 2*  num_classes]
    
    '''    
    mask, y_pred = tf.split(y_pred, num_or_size_splits=2, axis=-1)
    
    y_pred = K.clip(y_pred, min_value=K.epsilon(), max_value=1-K.epsilon())
    
    return -K.sum(mask * y_true * K.log(y_pred))

def masked_margin_loss(y_true, y_pred):
    """
    Margin loss for Eq.(4). When y_true[i, :] contains not just one `1`, this loss should work too. Not test it.
    :param y_true: [None, n_classes]
    :param y_pred: [None, 2 * num_classes]
    :return: a scalar loss value.
    """

    mask, y_pred = tf.split(y_pred, num_or_size_splits=2, axis=-1)
    
    L = y_true * K.square(K.maximum(0., 0.9 - y_pred)) + \
        0.5 * (1 - y_true) * K.square(K.maximum(0., y_pred - 0.1))
        
    L *= mask

    return K.sum(K.sum(L, 1))

In [3]:
import numpy as np
import scipy as sp
import pandas as pd

from keras.models import Model
from keras.layers import Input

import networkx as nx

In [4]:
with open("../data/cora/cites.tsv", "rb") as inf:
    next(inf, "")
    G = nx.read_edgelist(inf, delimiter="\t", )

X = sp.sparse.load_npz("../data/cora/cited_words.npz")
Y = sp.sparse.load_npz("../data/cora/paper_labels.npz")

N, D = X.shape
_, num_classes = Y.shape
C = 1

X = np.reshape(X.toarray(), [N, D, C]).astype(np.float32)
Y = Y.toarray().astype(np.float32)

In [5]:
# G = nx.karate_club_graph()
# N = nx.number_of_nodes(G)

# X = np.identity(N)
# map_ = {"Mr. Hi" : 0, "Officer" : 1}
# Y = np.zeros((N, 2))
# for i, v in enumerate(nx.get_node_attributes(G, "club").values()):
#     Y[i, map_[v]] = 1
    
# N, D = X.shape
# _, num_classes = Y.shape
# C = 1

# X = np.reshape(X, [N, D, C])

In [6]:
p = 20
assignments = Y.argmax(axis=1)
patterns_to_remove = np.concatenate([np.random.permutation(np.where(assignments==i)[0])[:-p] 
                                     for i in range(num_classes)])
mask = np.ones(Y.shape, dtype=np.float32)
mask[patterns_to_remove] = 0

In [7]:
L = nx.laplacian_matrix(G).astype(np.float32)
l, U = sp.sparse.linalg.eigen.arpack.eigsh(L, k=1, which="LM")

L_hat = (2 * L / l[0] - sp.sparse.identity(L.shape[0], dtype=np.float32)).astype(np.float32)

In [8]:
def sparse_matrix_power(M, n):
    if n==0:
        return sp.sparse.identity(M.shape[0])
    if n%2==0:
        x = sparse_matrix_power(M, n/2)
        return x.dot(x)
    return M.dot(sparse_matrix_power(M, n-1))

def pad_neighbours(N, neighbours, n):
    num_neighbours = len(neighbours)
    if num_neighbours < n:
        non_neighbours = np.arange(N)[np.isin(np.arange(N), neighbours, assume_unique=True, invert=True)]
        neighbours = np.append(neighbours, np.random.choice(non_neighbours, replace=False, size=(n-num_neighbours)))
    return neighbours

def n_hop_neighbours(L, batch_size=100, n=2):
    N = L.shape[0]
    I, J = np.nonzero(sparse_matrix_power(L, n))
    return [pad_neighbours(N, J[I==i], batch_size) for i in range(N)]

def generator(L, X, Y, mask, num_nodes=1, num_neighbours=100, n=2):
    n_hop_nodes = n_hop_neighbours(L, batch_size=batch_size, n=n)
    N = L.shape[0]
    i = 0
    nodes = np.random.permutation(N)
    batch = np.zeros(num_nodes*num_neighbours, dtype=np.int)
    while True:
        
        for node in range(num_nodes):
            if i == N:
                i -= N
                nodes = np.random.permutation(N)
            n = nodes[i]
            neighbours = np.random.choice(n_hop_nodes[n], size=num_neighbours-1, replace=False )
            batch[node*num_neighbours] = n
            batch[node*num_neighbours+1:(node+1)*num_neighbours] = neighbours
            i += 1
        if (mask[batch]>0).any():
            yield [X[batch], L[batch][:, batch].todense(), mask[batch]], Y[batch]

In [9]:
num_nodes = 2
num_neighbours = 25
batch_size = num_nodes * num_neighbours

In [10]:
x = Input(shape=(D, C), name="signal_input")
L_input = Input(shape=(batch_size,),  name="L_input")
mask_input = Input(shape=(num_classes,), name="mask_input")

conv1 = GraphConv(num_filters=32, kernel_size=1, activation="relu", 
                  name="conv1")([L_input, x])
primary_cap = PrimaryCap(x=conv1, L=L_input, 
                         dim_capsule=8, num_capsule=32, kernel_size=1)
secondary_cap = CapsuleLayer(dim_capsule=16, num_capsule=num_classes, 
                             num_routing=3, name="secondary_cap")(primary_cap)
secondary_cap_length = Length(name="length")(secondary_cap)

outputs = layers.Concatenate()([mask_input, secondary_cap_length])

model = Model([x, L_input, mask_input], [outputs])

model.compile(optimizer="adam", #loss=[masked_margin_loss])
              loss=masked_crossentropy)

In [11]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
L_input (InputLayer)            (None, 50)           0                                            
__________________________________________________________________________________________________
signal_input (InputLayer)       (None, 1433, 1)      0                                            
__________________________________________________________________________________________________
conv1 (GraphConv)               (None, 1433, 32)     45920       L_input[0][0]                    
                                                                 signal_input[0][0]               
__________________________________________________________________________________________________
primarycap_GraphConv (GraphConv (None, 1433, 256)    383232      L_input[0][0]                    
          

In [12]:
from keras.callbacks import TerminateOnNaN

In [13]:
gen = generator(L_hat, X, Y, mask, num_nodes=num_nodes, num_neighbours=num_neighbours, n=2)

In [14]:
model.fit_generator(generator=gen,
                    epochs=1, steps_per_epoch=N/2, callbacks=[TerminateOnNaN()])

Epoch 1/1
 155/1354 [==>...........................] - ETA: 54:49 - loss: 40.9714Batch 155: Invalid loss, terminating training
 156/1354 [==>...........................] - ETA: 54:46 - loss: nan   

<keras.callbacks.History at 0x7fbf3c46f550>

In [15]:
for l in model.layers:
    print l.get_weights()

[]
[]
[array([[ nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan,
         nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan,
         nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan],
       [ nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan,
         nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan,
         nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan]], dtype=float32), array([[[        nan,         nan,         nan, ...,         nan,
                 nan,         nan],
        [        nan,         nan,         nan, ...,         nan,
                 nan,         nan],
        [        nan,         nan,         nan, ...,         nan,
                 nan, -0.00012895],
        ..., 
        [        nan,         nan,         nan, ...,         nan,
         -0.00286441,         nan],
        [        nan,         nan,         nan, ...,         nan,
                 nan, -0.00073145],
        [        n

In [71]:
[X_test, L_test, mask_test], Y_test = gen.next()

In [75]:
Y_test.dtype

dtype('float32')

In [17]:
Y_test.argmax(axis=1)

array([6, 6, 6, 6, 6, 6, 0, 5, 6, 4, 6, 6, 4, 4, 3, 2, 6, 5, 6, 6, 0, 6, 1,
       6, 6])

In [18]:
predections = model.predict([X_test, L_test, mask_test])

In [19]:
predections.argmax(axis=1)

array([7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
       7, 7])

In [20]:
predections

array([[  0.,   0.,   0.,   0.,   0.,   0.,   0.,  inf,  inf,  inf,  inf,
         inf,  inf,  inf],
       [  0.,   0.,   0.,   0.,   0.,   0.,   0.,  inf,  inf,  inf,  inf,
         inf,  inf,  inf],
       [  0.,   0.,   0.,   0.,   0.,   0.,   0.,  inf,  inf,  inf,  inf,
         inf,  inf,  inf],
       [  0.,   0.,   0.,   0.,   0.,   0.,   0.,  inf,  inf,  inf,  inf,
         inf,  inf,  inf],
       [  0.,   0.,   0.,   0.,   0.,   0.,   0.,  inf,  inf,  inf,  inf,
         inf,  inf,  inf],
       [  1.,   1.,   1.,   1.,   1.,   1.,   1.,  inf,  inf,  inf,  inf,
         inf,  inf,  inf],
       [  0.,   0.,   0.,   0.,   0.,   0.,   0.,  inf,  inf,  inf,  inf,
         inf,  inf,  inf],
       [  0.,   0.,   0.,   0.,   0.,   0.,   0.,  inf,  inf,  inf,  inf,
         inf,  inf,  inf],
       [  0.,   0.,   0.,   0.,   0.,   0.,   0.,  inf,  inf,  inf,  inf,
         inf,  inf,  inf],
       [  0.,   0.,   0.,   0.,   0.,   0.,   0.,  inf,  inf,  inf,  inf,
         inf,  in