In [5]:
import torch
import numpy as np
from torch_geometric.data import Data
from sklearn.metrics import classification_report, roc_auc_score
import pandas as pd

In [1]:
def time_step_split_helper(new_nodes, new_edges, labels):
    """
    Split the graph and store node features, edges (represented by adjacency list),
    and labels separately by timestamp t (e.g. from 1 to 49).

    Args:
        new_nodes     A dataframe of the node features; column 0 is read as the
                        node id, column 1 as the time step, and all remaining
                        columns as the feature vector
        new_edges     A dataframe of the graph's adjacency list; columns 0 and 1
                        are read as the source and target node ids
        labels        A dataframe whose column 1 holds one integer label per row
                        of new_nodes (same row order); every label is stored
                        shifted down by one (e.g. 1 -> 0, 2 -> 1)

    Returns:
        features_t    A list of (|N_t|, d) feature matrices by timestamp
        edge_indices  A list of (2, |E_t|) adjacency list by timestamp; a time
                        step without any edge yields an empty (2, 0) tensor
        labels_t      A list of (|N_t|) labels by timestamp

    Raises:
        KeyError      If an edge refers to a node id that is absent from new_nodes
    """

    features = torch.tensor(new_nodes.iloc[:, 2:].to_numpy(), dtype=torch.float32)
    times = torch.tensor(new_nodes.iloc[:, 1].to_numpy(), dtype=torch.long)
    nodes_id = torch.tensor(new_nodes.iloc[:, 0].to_numpy(), dtype=torch.long)
    shifted_labels = torch.tensor(labels.iloc[:, 1].to_numpy().astype(int),
                                  dtype=torch.long) - 1

    # Plain Python ints so they can be used for range() and list offsets.
    min_t = int(times.min())
    max_t = int(times.max())
    time_steps = range(min_t, max_t + 1)

    # Construct nodes of the directed graph for each time step;
    # features by time step are stored in "features_t"; labels by
    # time step are stored in "labels_t"
    features_t = []
    labels_t = []

    # Create a dictionary where
    # <key, value> = <node_id, (node_index_in_time_t_subgraph, time_t)>.
    id2idx = {}
    for t in time_steps:
        in_step = times == t
        features_t.append(features[in_step, :])
        labels_t.append(shifted_labels[in_step])
        for i, node_id in enumerate(nodes_id[in_step].tolist()):
            id2idx[node_id] = (i, t)

    # Construct adjacency lists of the directed graph (non-symmetric) for each time step.
    # An edge is filed under the time step of its source node while the target is
    # addressed by its index inside the target's own time step, so both endpoints
    # are assumed to belong to the same time step.
    edge_idx_t = [[] for _ in time_steps]
    sources = new_edges.iloc[:, 0].tolist()
    targets = new_edges.iloc[:, 1].tolist()
    for source_id, target_id in zip(sources, targets):
        source_idx, source_t = id2idx[source_id]
        target_idx, _ = id2idx[target_id]
        # Offset by the first observed time step instead of a hard-coded 1.
        edge_idx_t[source_t - min_t].append([source_idx, target_idx])

    # An empty Python list would become a 1-D tensor of shape (0,), so time steps
    # without edges get an explicit (2, 0) tensor to keep the documented layout.
    edge_indices = [
        torch.tensor(edges, dtype=torch.long).t() if edges
        else torch.empty((2, 0), dtype=torch.long)
        for edges in edge_idx_t
    ]
    return features_t, edge_indices, labels_t

In [7]:
def time_step_split(new_nodes, new_edges, labels, device, train_lt = 31, val_lt = 36, test_lt = 49):
    """
    Create and return the training, validation, and test sets, split by time step,
    where each subgraph at time t is considered as an input of GCN model.

    Args:
        new_nodes     A dataframe of the node features
        new_edges     A dataframe of the graph's adjacency list
        labels        A dataframe of the node labels (forwarded unchanged to
                        time_step_split_helper)
        device        Computing device
        train_lt      The last time step index of training set; the training set
                        is made of the subgraphs at list positions [0, train_lt)
        val_lt        The last time step index of validation set; the validation
                        set is made of the positions [train_lt, val_lt)
        test_lt       The last time step index of test set; the test set is made
                        of the positions [val_lt, test_lt)

    Returns:
        data          A dictionary that stores training, validation, and test set,
                        each value is a list of Data object
        graph_info    A matrix where each row contains information of the time-step subgraph
                      [time_step, num_of_nodes, num_of_edges, num_of_illicit_nodes];
                        time_step is the 0-based position of the subgraph in the
                        per-time-step lists, and num_of_illicit_nodes counts the
                        nodes whose label equals 1
                        (NOTE(review): confirm that 1 is the illicit class once the
                        helper has shifted the labels)
    """
    features_t, edge_indices, labels_t = time_step_split_helper(new_nodes, new_edges, labels)

    num_steps = len(labels_t)
    graph_info = np.zeros((num_steps, 4), dtype = np.int64)
    for t in range(num_steps):
        graph_info[t, :] = [
            t,
            features_t[t].shape[0],
            # Every edge contributes two entries, so this also yields 0 for an
            # empty edge tensor whatever its shape is.
            edge_indices[t].numel() // 2,
            int((labels_t[t] == 1).sum()),
        ]

    def build_split(start, stop):
        # One Data object per time-step subgraph in [start, stop), moved to device.
        return [Data(x = features_t[idx], edge_index = edge_indices[idx],
                     y = labels_t[idx]).to(device) for idx in range(start, stop)]

    data = {
        'train': build_split(0, train_lt),
        'val': build_split(train_lt, val_lt),
        'test': build_split(val_lt, test_lt),
    }

    return data, graph_info

In [8]:
@torch.no_grad()
def test(model, data, device, save_model_results=False):
    """
    Test the model by using the given split datasets.

    Args:
        model                 The GCN model
        data                  A dictionary of Data objects that store x, edge_index, and labels
                                for three sets; it must provide the keys 'train',
                                'val' and 'test', each mapping to a list of batches
        device                Computing device every batch is moved to before the
                                forward pass
        save_model_results    A boolean determining whether we save the model results;
                                when set, the predictions of the last processed
                                batch are written to 'gcn_ind.csv'

    Returns
        Six values: the classification report (a DataFrame) of the training,
          validation, and test set, followed by the auc-roc score of the
          training, validation, and test set; every metric is the mean over
          the batches of the respective set
    """

    model.eval()
    # Metrics of the model on each data set, keyed by the set's name.
    results = {}
    # Predictions of the most recently processed batch (used when saving).
    last_pred = None
    for name, data_list in data.items():
        batch_reports = []
        auc_roc_sum = 0
        for batch in data_list:
            batch = batch.to(device)
            out = model.forward(batch)
            last_pred = out.argmax(dim=-1)
            # scikit-learn works on host arrays, so bring labels and predictions
            # back from the computing device before handing them over.
            y_true = batch.y.reshape(-1).cpu().numpy()
            y_hat = last_pred.reshape(-1).cpu().numpy()
            batch_reports.append(classification_report(y_true, y_hat,
                                                       output_dict=True, zero_division=0))
            # NOTE(review): the AUC is computed from hard class predictions
            # (argmax), not from class scores.
            auc_roc_sum += roc_auc_score(y_true, y_hat)

        # Average every entry of the per-batch reports; the keys of the first
        # report decide which entries are aggregated.
        report = {}
        for key, first_value in batch_reports[0].items():
            values = [sub_report[key] for sub_report in batch_reports]
            if isinstance(first_value, dict):
                report[key] = pd.DataFrame(values).mean().to_dict()
            else:
                report[key] = np.mean(np.array(values))
        results[name] = {'report': pd.DataFrame(report),
                         'auc_roc': auc_roc_sum / len(data_list)}

    ### TODO: what is the criterion to save the model results, the whole prediction
    ### y_pred and y_true? or only the test sets' prediction?
    if save_model_results and last_pred is not None:
        print ("Saving Model Predictions")

        data_new = {}
        data_new ['y_pred'] = last_pred.view(-1).cpu().detach().numpy()

        df = pd.DataFrame(data=data_new )
        # Save locally as csv
        df.to_csv('gcn_ind.csv', sep=',', index=False)

    return results['train']['report'], results['val']['report'], results['test']['report'], \
        results['train']['auc_roc'], results['val']['auc_roc'], results['test']['auc_roc']

In [None]:
def train(model, train_data, optimizer, loss_fn, device):
    """
    Run one optimisation step of the model over all training time steps.

    Args:
        model       The GCN model
        train_data  The list of Data objects (x, edge_index, and labels),
                      one per time step of the training set
        optimizer   The optimizer
        loss_fn     The loss function
        device      Computing device each time-step graph is moved to

    Returns
        The mean, over the training time steps, of the prediction loss
          measured by the given loss function (as a Python float)
    """
    model.train()
    optimizer.zero_grad()

    # One slot per time step; every slot is filled with that step's loss.
    step_losses = torch.zeros(len(train_data), dtype=torch.float32).to(device)
    for step, graph in enumerate(train_data):
        graph = graph.to(device)
        step_losses[step] = loss_fn(model.forward(graph), graph.y)

    # A single backward pass / parameter update on the averaged loss.
    mean_loss = step_losses.mean()
    mean_loss.backward()
    optimizer.step()
    return mean_loss.item()