In [14]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [15]:
import numpy as np
import tensorflow as tf
from typing import Tuple, List
import scipy.sparse as sp
import os
import pandas as pd
import networkx as nx

In [16]:
CLASS_WEIGTHS = [0.5,0.5]
NUM_ROLLS = 4
TEST_SHARE = 0.3
NUM_EPOCH = 50
LEARNING_RATE = 1e-3

## Create user labels

In [None]:
DATA = "/content/drive/MyDrive/data/processed_graphs.csv"
df = pd.read_csv(DATA)
df.head()

Unnamed: 0.2,idx,date,Unnamed: 0,from,to,amount,timestamp,fromIsPhi,toIsPhi,isPhi,Unnamed: 0.1
0,0,2016-11-30,87613,644996311924151884824924215840682271362850202643,1249665516472213179549841288377450362199154772633,0.984401,1480464000.0,0,0,0,
1,0,2016-11-30,87614,644996311924151884824924215840682271362850202643,119639879959584517570242490761449175088357594858,98.431564,1480465000.0,0,0,0,
2,0,2016-11-30,87615,644996311924151884824924215840682271362850202643,154744709618196328765779089700513530202844453134,11.650149,1480466000.0,0,0,0,
3,0,2016-11-30,87616,644996311924151884824924215840682271362850202643,132541851783283011242317528707846741855867577893,0.5,1480466000.0,0,0,0,
4,0,2016-11-30,87617,644996311924151884824924215840682271362850202643,218017970891763324453226269432084353002068817149,1.025745,1480469000.0,0,0,0,


In [None]:
df['idx'].max()

677

In [None]:
def get_unique_labelled(col, target_col):
  uniques = df.drop_duplicates(subset=col)
  return pd.DataFrame().assign(Users=df[col], Target=df[target_col])

In [None]:
unique_froms = get_unique_labelled("from", "fromIsPhi")
unique_tos = get_unique_labelled("to", "toIsPhi")
users = pd.concat([unique_froms, unique_tos]).drop_duplicates()

In [None]:
users = users.set_index("Users")
users # correct no of users

Unnamed: 0_level_0,Target
Users,Unnamed: 1_level_1
644996311924151884824924215840682271362850202643,0
744704369617947813251945981798011406624304165083,0
198215763635881730413814114272029314331152169154,0
181754426868731309256468892704777390517471106230,0
98863497962045004479775482974222130412383101711,0
...,...
856032292418037557214196133134941805797228363120,0
1182442082327443807850778649107894758400187517045,0
745225797796186069702020169108028652965690717208,0
716031830206199786309621055838584775743061589905,0


In [None]:
users.to_csv("/content/drive/MyDrive/data/processed_graphs_usr_labels.csv")

In [None]:
del users

In [None]:
I = np.eye(2)
targets = np.array([0,1,0,1,1])
I[targets]

array([[1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.]])

## EvolveGCN

In [17]:
def reset_metrics(list_of_metrics):
    for m in list_of_metrics:
        m.reset_states()

In [18]:
def convert_scipy_CRS_space_to_tensor(sparce_mat: sp.csr.csr_matrix):
    sparce_mat_coo = sparce_mat.tocoo()
    indices = np.transpose(np.array([sparce_mat_coo.row, sparce_mat_coo.col]))
    return tf.SparseTensor(indices, sparce_mat_coo.data, sparce_mat_coo.shape)


def normalize_adjencency_mat(adj_mat: sp.csr.csr_matrix):
    assert len(adj_mat.shape) == 2

    d = np.array(adj_mat.sum(axis=-1))[...,0]
    d = np.sqrt(d)
    d = sp.diags(d).tocsr()
    a = adj_mat + sp.eye(adj_mat.shape[-1],dtype=adj_mat.dtype)
    return d @ a @ d

In [19]:
class HGRUCell(tf.keras.layers.Layer):
    def __init__(self, units: int, activation='tanh',
                    recurrent_activation='sigmoid',
                    use_bias=True,
                    kernel_initializer='glorot_uniform',
                    recurrent_initializer='orthogonal',
                    bias_initializer='zeros',
                    dtype=tf.float32):
        super(HGRUCell, self).__init__(dtype=dtype)

        self.units = int(units)
        self.activation = tf.keras.activations.get(activation)
        self.recurrent_activation = tf.keras.activations.get(recurrent_activation)
        self.use_bias = use_bias
        self.kernel_initializer = tf.keras.initializers.get(kernel_initializer)
        self.recurrent_initializer = tf.keras.initializers.get(recurrent_initializer)
        self.bias_initializer = tf.keras.initializers.get(bias_initializer)

    def build(self, input_shape):
        inp_shape = tf.TensorShape(input_shape[0])
        rec_shape = tf.TensorShape(input_shape[1])
        last_dim_inp = inp_shape[-1]
        last_dim_rec = rec_shape[-1]

        self.kernel_inp_x = self.add_weight(
            'kernel_input_x',
            shape=[last_dim_inp, 2*self.units],
            initializer=self.kernel_initializer,
            dtype=self.dtype,
            trainable=True)
        self.kernel_inp_h = self.add_weight(
            'kernel_input_h',
            shape=[last_dim_rec, 2*self.units],
            initializer=self.kernel_initializer,
            dtype=self.dtype,
            trainable=True)

        self.kernel_rec_x = self.add_weight(
            "kernel_recurrent_x",
            shape=[last_dim_inp, self.units],
            initializer=self.recurrent_initializer,
            dtype=self.dtype,
            trainable=True)
        self.kernel_rec_h = self.add_weight(
            "kernel_recurrent_h",
            shape=[last_dim_rec, self.units],
            initializer=self.recurrent_initializer,
            dtype=self.dtype,
            trainable=True)

        if self.use_bias:
            self.bias_inp = self.add_weight(
                "bias_input",
                shape=[1, 2*self.units],
                initializer=self.bias_initializer,
                dtype=self.dtype,
                trainable=True)
            self.bias_rec = self.add_weight(
                "bias_recurrent",
                shape=[1, self.units],
                initializer=self.bias_initializer,
                dtype=self.dtype,
                trainable=True)

        self.built = True

    def call(self, inputs: Tuple[tf.Tensor,tf.Tensor], training=None, mask=None):
        X,H = inputs
        ZR = self.activation(tf.matmul(X,self.kernel_inp_x) + tf.matmul(H, self.kernel_inp_h) + self.bias_inp)
        Z, R = tf.split(ZR,2,axis=-1)
        H_new = self.recurrent_activation(tf.matmul(X,self.kernel_rec_x) + tf.matmul(R*H, self.kernel_rec_h) + self.bias_rec)
        H_new = (1 - Z) * H + Z * H_new
        return H_new

    def get_initial_state(self, input_shape) -> tf.Tensor:
        inp_shape = tf.TensorShape(input_shape)
        return tf.zeros(inp_shape[-1:] + [self.units])

In [20]:
class GCNLayer(tf.keras.layers.Layer):
    def __init__(self, units: int, activation=None, kernel_initializer="glorot_uniform", dtype=tf.float32):
        super(GCNLayer, self).__init__(dtype=dtype)

        self.units = int(units)
        self.activation = tf.keras.activations.get(activation)
        self.kernel_initializer = tf.keras.initializers.get(kernel_initializer)


    def build(self, input_shape):
        last_dim = tf.TensorShape(input_shape[1])[-1]

        self.kernel = self.add_weight(
            'kernel',
            shape=[last_dim, self.units],
            initializer=self.kernel_initializer,
            dtype=self.dtype,
            trainable=True)
        self.built = True

    def call(self, inputs: Tuple[tf.SparseTensor,tf.Tensor], training=None, mask=None):
        adj, nodes = inputs
        return self.activation(tf.matmul(tf.sparse.sparse_dense_matmul(adj,nodes),self.kernel))

In [21]:
class SummarizeLayer(tf.keras.layers.Layer):
    def __init__(self,kernel_initializer="glorot_uniform", dtype=tf.float32):
        super(SummarizeLayer, self).__init__(dtype=dtype)
        self.kernel_initializer = kernel_initializer

    def build(self, input_shape):
        last_dim_inp = tf.TensorShape(input_shape[0])[-1]

        self.p = self.add_weight(
            'p',
            shape=[last_dim_inp],
            initializer=self.kernel_initializer,
            dtype=self.dtype,
            trainable=True)

        self.built = True

    def call(self, inputs: Tuple[tf.Tensor, tf.Tensor], training=None, mask=None):
        x,k = inputs
        y = tf.linalg.matvec(x,self.p) / tf.linalg.norm(self.p)
        top_y = tf.math.top_k(y,k)
        out = tf.gather(x,top_y.indices,axis=0) * tf.expand_dims(tf.tanh(top_y.values),-1)
        return out

In [22]:
class EGCUH(tf.keras.Model):
    def __init__(self, gru_cell: HGRUCell, summarize: SummarizeLayer, activation=None, dtype=tf.float32):
        super(EGCUH, self).__init__(dtype=dtype)

        self.gru_cell = gru_cell
        self.activation = tf.keras.activations.get(activation)
        self.summarize = summarize

    def call(self, inputs: Tuple[tf.SparseTensor,tf.Tensor,tf.Tensor], training=None, mask=None) -> Tuple[tf.Tensor,tf.Tensor]:
        adj,nodes,weigths = inputs
        node_summary = self.summarize([nodes,tf.shape(weigths)[-1]])
        weigths_new = self.gru_cell([tf.transpose(node_summary),weigths])

        nodes_new = self.activation(tf.matmul(tf.sparse.sparse_dense_matmul(adj,nodes),weigths_new))
        return nodes_new, weigths_new

    def get_initial_weigths(self, input_shape) -> tf.Tensor:
        return self.gru_cell.get_initial_state(input_shape)

In [23]:
class EvolveGCN(tf.keras.Model):
    def __init__(self, layers: List[EGCUH]):
        super(EvolveGCN, self).__init__()
        self.layers_ = layers

    def call(self, inputs: Tuple[tf.SparseTensor,tf.Tensor,List[tf.Tensor]], training=None, mask=None):
        adj, nodes, weigts = inputs
        new_weigths = []
        for i in range(len(self.layers_)):
            nodes,nw = self.layers_[i]([adj,nodes,weigts[i]])
            new_weigths.append(nw)
        return nodes, new_weigths

    def get_initial_weigths(self, input_shape) -> List[tf.Tensor]:
        states = []
        s = input_shape
        for l in self.layers_:
            s = l.get_initial_weigths(s)
            states.append(s)
            s = tf.shape(s)
        return states

In [24]:
graphs_path = "/content/drive/MyDrive/data/processed_graphs.csv"
labels_path = "/content/drive/MyDrive/data/processed_graphs_usr_labels.csv"
graphs_df = pd.read_csv(graphs_path)
node_labels = pd.read_csv(labels_path)

In [25]:
node_labels = node_labels.set_index("Users")

In [26]:
class DatasetLoader(object):
    def __init__(self, test_portion=0.3, MAX_ID=677, graphs_df=graphs_df,node_labels=node_labels):
        self.graphs_df = graphs_df
        self.node_labels = node_labels
        train_len = int(MAX_ID-test_portion)
        graph_ids = np.arange(MAX_ID)
        np.random.shuffle(graph_ids)
        self.train_range = graph_ids[:train_len]
        self.test_range = graph_ids[train_len:]

    @property
    def num_classes(self):
        return 2

    def test_batch_iterator(self):
        for id in self.test_range:
          new_graph = self.graphs_df[self.graphs_df['idx'] == id]
          graph_df = new_graph.reset_index().drop("idx",axis=1)
          graph_df["date"] = graph_df["date"].astype(str)
          G = nx.convert_matrix.from_pandas_edgelist(graph_df, "from", "to", edge_attr=True,create_using=nx.MultiDiGraph())
          node_attrs = {}
          for node in G.nodes():
            node_attrs[node] = {"addr": node, "label": self.node_labels.loc[node]["Target"]}
          nx.classes.function.set_node_attributes(G, node_attrs)
        one_hot = np.eye(2, dtype=np.float32)
        nodes, targets = [], []
        for _, d in G.nodes(data=True):
              targets.append(d["label"])
              nodes.append(int(d["addr"]))
        nodes = np.vstack(nodes).astype(np.float32)
        targets = one_hot[np.array(targets)]
        adjacency_mat = normalize_adjencency_mat(nx.adjacency_matrix(G).astype(np.float32))
        adjacency_mat = convert_scipy_CRS_space_to_tensor(adjacency_mat)
        yield(G, nodes,targets, adjacency_mat)

    def train_batch_iterator(self):
        for id in self.train_range:
          new_graph = self.graphs_df[self.graphs_df['idx'] == id]
          graph_df = new_graph.reset_index().drop("idx",axis=1)
          graph_df["date"] = graph_df["date"].astype(str)
          G = nx.convert_matrix.from_pandas_edgelist(graph_df, "from", "to", edge_attr=True,create_using=nx.MultiDiGraph())
          node_attrs = {}
          for node in G.nodes():
            node_attrs[node] = {"addr": node, "label": self.node_labels.loc[node]["Target"]}
          nx.classes.function.set_node_attributes(G, node_attrs)
        one_hot = np.eye(2, dtype=np.float32)
        nodes, targets = [], []
        for _, d in G.nodes(data=True):
              targets.append(d["label"])
              nodes.append(int(d["addr"]))
        nodes = np.vstack(nodes).astype(np.float32)
        targets = one_hot[np.array(targets)]
        adjacency_mat = normalize_adjencency_mat(nx.adjacency_matrix(G).astype(np.float32))
        adjacency_mat = convert_scipy_CRS_space_to_tensor(adjacency_mat)
        yield(G, nodes,targets, adjacency_mat)

In [None]:
dl = DatasetLoader(test_portion=TEST_SHARE)

model = EvolveGCN([
    EGCUH(HGRUCell(2),SummarizeLayer(),activation="relu"),
    EGCUH(HGRUCell(dl.num_classes),SummarizeLayer())
])

optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
loss_func = tf.keras.losses.CategoricalCrossentropy(from_logits=True)

def run_model(adj,nodes,targets,training=False):
    weigths = tf.reduce_sum(CLASS_WEIGTHS * targets, axis=-1)
    states = model.get_initial_weigths(tf.shape(nodes))
    logits = []
    for i in range(NUM_ROLLS):
        l, states = model([adj, nodes, states], training=training)
        logits.append(l)

    loss = sum(loss_func(targets, l, sample_weight=weigths) for l in logits)
    return logits[-1], loss, weigths


train_loss_metric = tf.keras.metrics.Mean()
train_accuracy_metric = tf.keras.metrics.Accuracy()
train_precision_metric = tf.keras.metrics.Precision()
train_recall_metric = tf.keras.metrics.Recall()
test_loss_metric = tf.keras.metrics.Mean()
test_accuracy_metric = tf.keras.metrics.Accuracy()
test_precision_metric = tf.keras.metrics.Precision()
test_recall_metric = tf.keras.metrics.Recall()
metrics = [train_loss_metric,train_accuracy_metric,train_precision_metric,train_recall_metric
            ,test_loss_metric,test_accuracy_metric,test_precision_metric,test_recall_metric]

for epoch in range(NUM_EPOCH):
    reset_metrics(metrics)

    for _, n, t, adj in dl.train_batch_iterator():
        with tf.GradientTape() as tape:
            logits, loss, weigths = run_model(adj, n, t, training=True)

        grads = tape.gradient(loss, model.trainable_weights)
        optimizer.apply_gradients(zip(grads,model.trainable_weights))

        y_true = tf.cast(tf.argmax(t,axis=-1) == 0,tf.float32)
        y_pred = tf.cast(tf.argmax(logits,axis=-1) == 0,tf.float32)
        train_loss_metric(loss)
        train_accuracy_metric(tf.argmax(t,axis=-1), tf.argmax(logits,axis=-1), sample_weight=weigths)
        train_precision_metric(y_true, y_pred)
        train_recall_metric(y_true, y_pred)

    for _, n, t, adj in dl.test_batch_iterator():
        logits, loss, weigths = run_model(adj, n, t)

        y_true = tf.cast(tf.argmax(t, axis=-1) == 0, tf.float32)
        y_pred = tf.cast(tf.argmax(logits, axis=-1) == 0, tf.float32)
        test_loss_metric(loss)
        test_accuracy_metric(tf.argmax(t,axis=-1), tf.argmax(logits,axis=-1), sample_weight=weigths)
        test_precision_metric(y_true, y_pred)
        test_recall_metric(y_true, y_pred)

    print("Epoch: {}\nTRAIN Loss: {:.5}| Accuracy: {:.4}| Precision: {:.4}| Recall: {:.4}\nTEST Loss: {:.4}| Accuracy: {:.4}| Precision: {:.4}| Recall: {:.4}".format(
        epoch, train_loss_metric.result().numpy(), train_accuracy_metric.result().numpy(),
        train_precision_metric.result().numpy(),train_recall_metric.result().numpy(),
        test_loss_metric.result().numpy(), test_accuracy_metric.result().numpy(),
        test_precision_metric.result().numpy(),test_recall_metric.result().numpy()
    ))

Epoch: 0
TRAIN Loss: nan| Accuracy: 1.0| Precision: 1.0| Recall: 1.0
TEST Loss: nan| Accuracy: 1.0| Precision: 1.0| Recall: 1.0
Epoch: 1
TRAIN Loss: nan| Accuracy: 1.0| Precision: 1.0| Recall: 1.0
TEST Loss: nan| Accuracy: 1.0| Precision: 1.0| Recall: 1.0
Epoch: 2
TRAIN Loss: nan| Accuracy: 1.0| Precision: 1.0| Recall: 1.0
TEST Loss: nan| Accuracy: 1.0| Precision: 1.0| Recall: 1.0
Epoch: 3
TRAIN Loss: nan| Accuracy: 1.0| Precision: 1.0| Recall: 1.0
TEST Loss: nan| Accuracy: 1.0| Precision: 1.0| Recall: 1.0
Epoch: 4
TRAIN Loss: nan| Accuracy: 1.0| Precision: 1.0| Recall: 1.0
TEST Loss: nan| Accuracy: 1.0| Precision: 1.0| Recall: 1.0
Epoch: 5
TRAIN Loss: nan| Accuracy: 1.0| Precision: 1.0| Recall: 1.0
TEST Loss: nan| Accuracy: 1.0| Precision: 1.0| Recall: 1.0
Epoch: 6
TRAIN Loss: nan| Accuracy: 1.0| Precision: 1.0| Recall: 1.0
TEST Loss: nan| Accuracy: 1.0| Precision: 1.0| Recall: 1.0
Epoch: 7
TRAIN Loss: nan| Accuracy: 1.0| Precision: 1.0| Recall: 1.0
TEST Loss: nan| Accuracy: 1.0| Prec