In [1]:
import os
from yaml import Loader
import yaml
from tqdm.notebook import tqdm

import pandas as pd
import networkx as nx
import numpy as np
from sklearn.decomposition import PCA

import torch
from torch_geometric import data as DATA
from torch_geometric.data import Data, DataLoader
from torch_geometric.transforms import RandomLinkSplit
from torch_geometric.utils import k_hop_subgraph, to_scipy_sparse_matrix
from torch_geometric.data import Data, InMemoryDataset
import torch.nn.functional as F

from scipy.sparse.csgraph import shortest_path
from itertools import chain

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
device

device(type='cuda')

# Edgelist

In [4]:
edgeframe = pd.read_csv('./Training/training_graph.csv', 
                        sep=',', header=0, names=['source', 'dest'])
edgeframe = edgeframe[edgeframe['source'] != edgeframe['dest']]

G = nx.from_pandas_edgelist(edgeframe, 'source', 'dest')

edge_index = []
for e1, e2 in G.edges:
    edge_index.append([e1, e2])

# Features

In [5]:
features = pd.read_csv("./DOC2Vec_features.csv")
features = features.sort_values(by=['id'])
features = features.drop(columns=['id'])
features = torch.Tensor(features.to_numpy()).to(device)

In [6]:
#pca = PCA()
#pca.fit(features)
#pca_grid_feature_list = pca.transform(features)
#features = torch.Tensor(pca_grid_feature_list).to(device)

In [7]:
# Make and Store graph
ap_graph = DATA.Data(
    x = features.to(device),
    edge_index = torch.LongTensor(edge_index).transpose(1, 0).to(device),
)

ap_graph.root = './'

In [8]:
ap_graph

Data(x=[22470, 128], edge_index=[2, 131889], root='./')

In [11]:
class SEALDataset(InMemoryDataset):
    def __init__(self, dataset, num_hops, split='train'):
        self.data = dataset
        self.num_hops = num_hops
        super().__init__(dataset.root)
        index = ['train', 'val', 'test'].index(split)
        #self.data, self.slices = torch.load(self.processed_paths[index])

    @property
    def processed_file_names(self):
        return ['SEAL_train_data.pt', 'SEAL_val_data.pt', 'SEAL_test_data.pt']

    def process(self):
        transform = RandomLinkSplit(num_val=0.05, num_test=0.1,
                                    is_undirected=True, split_labels=True,
                                   add_negative_train_samples=True
                                   )
        
        train_data, val_data, test_data = transform(self.data)
        print("Train Data:")
        print(train_data)
        print("Val Data:")
        print(val_data)
        print("Test Data:")
        print(test_data)
        
        self._max_z = 0

        # Collect a list of subgraphs for training, validation and testing:
        train_pos_data_list = self.extract_enclosing_subgraphs(
            train_data.edge_index, train_data.pos_edge_label_index, 1)
        train_neg_data_list = self.extract_enclosing_subgraphs(
            train_data.edge_index, train_data.neg_edge_label_index, 0)

        val_pos_data_list = self.extract_enclosing_subgraphs(
            val_data.edge_index, val_data.pos_edge_label_index, 1)
        val_neg_data_list = self.extract_enclosing_subgraphs(
            val_data.edge_index, val_data.neg_edge_label_index, 0)

        test_pos_data_list = self.extract_enclosing_subgraphs(
            test_data.edge_index, test_data.pos_edge_label_index, 1)
        test_neg_data_list = self.extract_enclosing_subgraphs(
            test_data.edge_index, test_data.neg_edge_label_index, 0)

        # Convert node labeling to one-hot features.
        for data in chain(train_pos_data_list, train_neg_data_list,
                          val_pos_data_list, val_neg_data_list,
                          test_pos_data_list, test_neg_data_list):
            # We solely learn links from structure, dropping any node features:
            data.x = F.one_hot(data.z, self._max_z + 1).to(torch.float)

        torch.save(self.collate(train_pos_data_list + train_neg_data_list),
                'SEAL_train_data.pt')
        torch.save(self.collate(val_pos_data_list + val_neg_data_list),
                   'SEAL_val_data.pt')
        torch.save(self.collate(test_pos_data_list + test_neg_data_list),
                   'SEAL_test_data.pt')

    def extract_enclosing_subgraphs(self, edge_index, edge_label_index, y):
        data_list = []
        for src, dst in edge_label_index.t().tolist():
            sub_nodes, sub_edge_index, mapping, _ = k_hop_subgraph(
                [src, dst], self.num_hops, edge_index, relabel_nodes=True)
            src, dst = mapping.tolist()

            # Remove target link from the subgraph.
            mask1 = (sub_edge_index[0] != src) | (sub_edge_index[1] != dst)
            mask2 = (sub_edge_index[0] != dst) | (sub_edge_index[1] != src)
            sub_edge_index = sub_edge_index[:, mask1 & mask2]

            # Calculate node labeling.
            z = self.drnl_node_labeling(sub_edge_index, src, dst,
                                        num_nodes=sub_nodes.size(0))

            data = Data(x = self.data.x[sub_nodes], z=z,
                        edge_index = sub_edge_index, y=y)
            data_list.append(data)

        return data_list

    def drnl_node_labeling(self, edge_index, src, dst, num_nodes=None):
        # Double-radius node labeling (DRNL).
        src, dst = (dst, src) if src > dst else (src, dst)
        adj = to_scipy_sparse_matrix(edge_index, num_nodes=num_nodes).tocsr()

        idx = list(range(src)) + list(range(src + 1, adj.shape[0]))
        adj_wo_src = adj[idx, :][:, idx]

        idx = list(range(dst)) + list(range(dst + 1, adj.shape[0]))
        adj_wo_dst = adj[idx, :][:, idx]

        dist2src = shortest_path(adj_wo_dst, directed=False, unweighted=True,
                                 indices=src)
        dist2src = np.insert(dist2src, dst, 0, axis=0)
        dist2src = torch.from_numpy(dist2src)

        dist2dst = shortest_path(adj_wo_src, directed=False, unweighted=True,
                                 indices=dst - 1)
        dist2dst = np.insert(dist2dst, src, 0, axis=0)
        dist2dst = torch.from_numpy(dist2dst)

        dist = dist2src + dist2dst
        dist_over_2, dist_mod_2 = dist // 2, dist % 2

        z = 1 + torch.min(dist2src, dist2dst)
        z += dist_over_2 * (dist_over_2 + dist_mod_2 - 1)
        z[src] = 1.
        z[dst] = 1.
        z[torch.isnan(z)] = 0.

        self._max_z = max(int(z.max()), self._max_z)

        return z.to(torch.long)

In [12]:
seal_graph = SEALDataset(ap_graph, num_hops=1, split='train')

Processing...
  dist_over_2, dist_mod_2 = dist // 2, dist % 2


Train Data:
Data(x=[22470, 128], edge_index=[2, 120606], root='./', pos_edge_label=[60303], pos_edge_label_index=[2, 60303], neg_edge_label=[60303], neg_edge_label_index=[2, 60303])
Val Data:
Data(x=[22470, 128], edge_index=[2, 120606], root='./', pos_edge_label=[3547], pos_edge_label_index=[2, 3547], neg_edge_label=[3547], neg_edge_label_index=[2, 3547])
Test Data:
Data(x=[22470, 128], edge_index=[2, 127700], root='./', pos_edge_label=[7094], pos_edge_label_index=[2, 7094], neg_edge_label=[7094], neg_edge_label_index=[2, 7094])


KeyboardInterrupt: 