In [5]:
import torch
import numpy as np
from torch_geometric.data import Data
from sklearn.metrics import classification_report, roc_auc_score
import pandas as pd
from torch_geometric.utils import to_dense_adj
from pygsp import graphs

In [1]:
def time_step_split_helper(new_nodes, new_edges, labels):
    """
    Split the graph into one subgraph per time step and store node features,
    edges (represented by adjacency list), and labels separately for every
    time step between the smallest and the largest value found in the time
    column of new_nodes.

    Args:
        new_nodes     A dataframe of the node features; column 0 holds the node
                        id, column 1 the time step, the remaining columns the
                        features
        new_edges     A dataframe of the graph's adjacency list; columns 0 and 1
                        hold the node ids of the two endpoints of each edge
        labels        A dataframe whose column 1 holds the integer class of
                        each node; rows are expected to align with new_nodes.
                        One is subtracted from every class value.

    Returns:
        features_t    A list of (|N_t|, d) feature matrices by timestamp
        edge_indices  A list of (2, |E_t|) adjacency list by timestamp; a time
                        step without edges yields an empty (2, 0) tensor
        labels_t      A list of (|N_t|) labels by timestamp

    Raises:
        KeyError      If an edge references a node id missing from new_nodes
    """
    features = torch.as_tensor(new_nodes.iloc[:, 2:].to_numpy(), dtype=torch.float32)
    times = torch.as_tensor(new_nodes.iloc[:, 1].to_numpy(), dtype=torch.long).reshape(-1)
    labels = torch.as_tensor(labels.iloc[:, 1].to_numpy().astype(int),
                             dtype=torch.long).reshape(-1) - 1
    nodes_id = torch.as_tensor(new_nodes.iloc[:, 0].to_numpy(), dtype=torch.long).reshape(-1)

    # Nothing to split: torch.min / torch.max would fail on an empty tensor.
    if times.numel() == 0:
        return [], [], []

    min_t = int(times.min())
    max_t = int(times.max())

    # Construct nodes of the directed graph for each time step;
    # features by time step are stored in "features_t"; labels by
    # time step are stored in "labels_t"
    features_t = []
    labels_t = []

    # <key, value> = <node_id, (node_index_in_time_t_subgraph, time_t)>
    id2idx = {}
    for t in range(min_t, max_t + 1):
        mask = times == t
        features_t.append(features[mask])
        labels_t.append(labels[mask])
        for local_idx, node_id in enumerate(nodes_id[mask].tolist()):
            id2idx[node_id] = (local_idx, t)

    # Construct adjacency lists of the directed graph (non-symmetric) for each time step.
    edge_idx_t = [[] for _ in range(min_t, max_t + 1)]
    # Pull the two id columns out once; per-row .iloc access is very slow.
    sources = new_edges.iloc[:, 0].tolist()
    targets = new_edges.iloc[:, 1].tolist()
    for src, dst in zip(sources, targets):
        src_idx, src_t = id2idx[src]
        dst_idx, dst_t = id2idx[dst]
        if src_t != dst_t:
            # The target index is local to another time step's subgraph, so the
            # edge cannot be represented inside a single per-step subgraph.
            continue
        # Offset by min_t (not a hard-coded 1) so any first time step works.
        edge_idx_t[src_t - min_t].append([src_idx, dst_idx])

    # reshape(len, 2) keeps the (2, |E_t|) layout even when |E_t| == 0.
    edge_indices = [
        torch.tensor(pairs, dtype=torch.long).reshape(len(pairs), 2).t().contiguous()
        for pairs in edge_idx_t
    ]
    return features_t, edge_indices, labels_t

In [7]:
def time_step_split(new_nodes, new_edges, labels, device, train_lt = 31, val_lt = 36, test_lt = 49,
                    illicit_label = 0):
    """
    Create and return the training, validation, and test set, splitted by time step,
    where each subgraph at time t is considered as an input of GCN model.

    Args:
        new_nodes      A dataframe of the node features
        new_edges      A dataframe of the graph's adjacency list
        labels         A dataframe of the node labels
        device         Computing device
        train_lt       The last time step index of training set
        val_lt         The last time step index of validation set
        test_lt        The last time step index of test set
        illicit_label  The label value counted as "illicit" in graph_info.
                         NOTE(review): defaults to 0, which is the value
                         get_ilicit_predictions treats as illicit in this
                         notebook -- confirm against the label encoding.

    Returns:
        data          A dictionary that stores training, validation, and test set,
                        each value is a list of Data object
        graph_info    A matrix where each row contains information of the time-step subgraph
                      [time_step, num_of_nodes, num_of_edges, num_of_illicit_nodes];
                      time_step is the 0-based position of the subgraph in the split lists

    Raises:
        IndexError    If test_lt exceeds the number of time steps found in the data
    """
    features_t, edge_indices, labels_t = time_step_split_helper(new_nodes, new_edges, labels)

    num_steps = len(labels_t)
    graph_info = np.zeros((num_steps, 4), dtype = np.int64)
    for t in range(num_steps):
        graph_info[t, :] = [
            t,
            features_t[t].shape[0],
            # numel() // 2 is also correct for an edge tensor that has no columns.
            edge_indices[t].numel() // 2,
            int((labels_t[t] == illicit_label).sum()),
        ]

    def _to_data_list(first, last):
        # One Data object per time step in [first, last), moved to the target device.
        return [Data(x = features_t[idx], edge_index = edge_indices[idx],
                     y = labels_t[idx]).to(device) for idx in range(first, last)]

    data = {
        'train': _to_data_list(0, train_lt),
        'val': _to_data_list(train_lt, val_lt),
        'test': _to_data_list(val_lt, test_lt),
    }

    return data, graph_info

In [None]:
def create_graph_helper(new_nodes, new_edges, labels):
    """
    Build the tensors describing the whole graph at once (no split by time
    step): node features, edges (represented by adjacency list), and labels.

    Args:
        new_nodes     A dataframe of the node features; column 0 holds the node
                        id, column 1 is skipped, the remaining columns are the
                        features
        new_edges     A dataframe of the graph's adjacency list; columns 0 and 1
                        hold the node ids of the two endpoints of each edge
        labels        A dataframe whose column 1 holds the integer class of
                        each node. One is subtracted from every class value.

    Returns:
        features_t    A (|N|, d) feature matrix
        edge_indices  A (2, |E|) adjacency list whose entries are row positions
                        in new_nodes; an empty edge list yields a (2, 0) tensor
        labels_t      A (|N|) label vector

    Raises:
        KeyError      If an edge references a node id missing from new_nodes
    """
    features_t = torch.as_tensor(new_nodes.iloc[:, 2:].to_numpy(), dtype=torch.float32)
    labels_t = torch.as_tensor(labels.iloc[:, 1].to_numpy().astype(int),
                               dtype=torch.long).reshape(-1) - 1

    # <key, value> = <node_id, row position of the node in new_nodes>
    id2idx = {node_id: row for row, node_id in enumerate(new_nodes.iloc[:, 0].tolist())}

    # Pull the two id columns out once; per-row .iloc access is very slow.
    sources = new_edges.iloc[:, 0].tolist()
    targets = new_edges.iloc[:, 1].tolist()
    edge_pairs = [[id2idx[src], id2idx[dst]] for src, dst in zip(sources, targets)]

    # reshape(len, 2) keeps the (2, |E|) layout even when there are no edges.
    edge_indices = torch.tensor(edge_pairs, dtype=torch.long).reshape(len(edge_pairs), 2).t().contiguous()

    return features_t, edge_indices, labels_t

In [0]:
def create_graph(nodes, edges, labels, device='cpu'):
    """
    Assemble the whole graph as a single Data object.

    Args:
        nodes     A dataframe of the node features
        edges     A dataframe of the graph's adjacency list
        labels    A dataframe of the node labels
        device    Computing device the Data object is moved to

    Returns:
        data          A single Data object holding x, edge_index and y as
                        produced by create_graph_helper
        graph_info    A zero-filled int64 matrix with one row per label and
                        4 columns; nothing in this function writes to it
    """
    x, edge_index, y = create_graph_helper(nodes, edges, labels)
    whole_graph = Data(x = x, edge_index = edge_index, y = y).to(device)
    info_placeholder = np.zeros((len(y), 4), dtype = np.int64)
    return whole_graph, info_placeholder


In [2]:
@torch.no_grad()
def get_ilicit_predictions(model, loader, test_mask, device):
    """
    Find which illicit nodes (true label 0) the model classifies correctly.

    Predictions from every batch of the loader are gathered before the
    comparison, so no batch is discarded.

    Args:
        model       The model; called as model(x, edge_index)
        loader      An iterable of batches exposing x, edge_index, y and to()
        test_mask   Index/mask applied to the predictions and labels of every batch
        device      Computing device

    Returns:
        correct     np.where-style tuple with the positions, within the subset
                      of masked nodes whose true label is 0, that were predicted
                      correctly
        wrong       Same, for the positions that were predicted incorrectly
    """
    model.eval()
    preds, truths = [], []
    for batch in loader:
        batch = batch.to(device)
        logits = model(batch.x, batch.edge_index)
        preds.append(logits.argmax(dim=-1)[test_mask].cpu().numpy())
        truths.append(batch.y[test_mask].cpu().numpy())

    if not preds:
        # Empty loader: no predictions at all, return empty index tuples.
        return (np.array([], dtype=np.intp),), (np.array([], dtype=np.intp),)

    y_pred = np.concatenate(preds)
    y_true = np.concatenate(truths)

    illicit = np.where(y_true == 0)[0]

    correct = np.where(y_pred[illicit] == y_true[illicit])
    wrong = np.where(y_pred[illicit] != y_true[illicit])
    return correct, wrong

NameError: name 'torch' is not defined

In [8]:
@torch.no_grad()
def test(model, loader, loss_fn, test_mask, device):
    """
    Evaluate the model on every batch of the loader.

    Nodes whose label equals 2 are excluded from both the report and the loss.
    The classification report is computed once over the predictions of all
    batches together, so it is not limited to the last batch.

    Args:
        model       The model; called as model(x, edge_index)
        loader      An iterable of batches exposing x, edge_index, y and to()
        loss_fn     The loss function
        test_mask   Accepted for interface compatibility; not used here
        device      Computing device

    Returns:
        res         The classification_report dictionary over all batches
                      ({} when the loader is empty)
        loss        The mean of the per-batch losses (nan when the loader is empty)
    """
    model.eval()
    losses = []
    all_true, all_pred = [], []
    for batch in loader:
        batch = batch.to(device)
        labelled = batch.y != 2

        y_pred = model(batch.x, batch.edge_index)[labelled, :]
        y_true = batch.y[labelled]
        losses.append(loss_fn(y_pred, y_true).item())
        all_true.append(y_true.cpu())
        all_pred.append(y_pred.argmax(dim=-1).cpu())

    if not losses:
        return {}, float('nan')

    res = classification_report(torch.cat(all_true).numpy(), torch.cat(all_pred).numpy(),
                                output_dict=True, zero_division=0)
    return res, np.mean(losses)

In [1]:
def train(model, train_data, train_mask, optimizer, loss_fn, device):
    """
    Train the model for one optimisation step by using the given optimizer and loss_fn.

    The loss is computed per batch on the nodes whose label differs from 2,
    averaged over all batches, and back-propagated once.

    Args:
        model       The GCN model; called as model(x, edge_index)
        train_data  An iterable of Data objects that store x, edge_index, and
                      labels only for training set
        train_mask  Accepted for interface compatibility; not used here
        optimizer   The optimizer
        loss_fn     The loss function
        device      Computing device

    Returns
        The average prediction loss of each time step in the training set
          by the given loss function

    Raises
        ValueError  If train_data yields no batch
    """
    model.train()
    optimizer.zero_grad()

    batch_losses = []
    for batch in train_data:
        batch = batch.to(device)
        labelled = batch.y != 2

        # Call the module itself rather than .forward() so registered hooks run.
        logits = model(batch.x, batch.edge_index)[labelled, :]
        batch_losses.append(loss_fn(logits, batch.y[labelled]))

    if not batch_losses:
        raise ValueError("train_data is empty; nothing to train on")

    mean_loss = torch.stack(batch_losses).mean()
    mean_loss.backward()
    optimizer.step()
    return mean_loss.item()

In [1]:
def isConnected(edge_index, num_nodes = None):
    """
    Computes whether a graph is connected or not, based on its edge index.

    :param edge_index: edge index tensor, passed to to_dense_adj
    :param num_nodes: optional total number of nodes, forwarded to to_dense_adj
        as max_num_nodes. When omitted, to_dense_adj sizes the adjacency matrix
        from the indices present in edge_index, so isolated nodes with a larger
        index than any edge endpoint are not part of the check.
    :return: the value of pygsp's Graph.is_connected() for the dense adjacency matrix
    """
    # Example of how this was used before (and it worked):
    # adj = to_dense_adj(data['train'][0].edge_index).squeeze(0)
    adj = to_dense_adj(edge_index, max_num_nodes = num_nodes).squeeze(0)
    g = graphs.Graph(adj)
    return g.is_connected()