### train_test_split

In [4]:
import numpy as np
import random
import torch

def create_random_split(graph, split : list):
    """Attach random, disjoint train/val/test boolean node masks to ``graph``.

    The nodes are shuffled with the ``random`` module and then cut into three
    consecutive slices whose sizes are proportional to the weights in
    ``split``. The weights are normalised by their sum, so ``[1, 1, 8]``,
    ``[10, 10, 80]`` and ``[0.1, 0.1, 0.8]`` all describe the same split.

    Args:
        graph: object exposing ``num_nodes``; it is modified in place and
            receives the attributes ``train_mask``, ``val_mask`` and
            ``test_mask`` (bool tensors of length ``num_nodes``).
        split: three non-negative weights ``[train, val, test]``.

    Raises:
        AssertionError: if ``split`` does not contain exactly three entries.
        ValueError: if the weights do not sum to a positive value.
    """
    assert(len(split) == 3)

    total = sum(split)
    if total <= 0:
        raise ValueError("split weights must sum to a positive value")

    num_nodes = graph.num_nodes

    idx = np.arange(num_nodes)
    random.shuffle(idx)
    perm = torch.as_tensor(idx, dtype=torch.long)

    # Train and validation sizes are rounded down; the test set takes every
    # remaining node so that the three masks always cover the whole graph.
    train_size = int(num_nodes * (split[0] / total))
    val_size = int(num_nodes * (split[1] / total))

    train_mask = torch.zeros(num_nodes, dtype=torch.bool)
    val_mask = torch.zeros(num_nodes, dtype=torch.bool)
    test_mask = torch.zeros(num_nodes, dtype=torch.bool)

    train_mask[perm[:train_size]] = True
    val_mask[perm[train_size: train_size + val_size]] = True
    test_mask[perm[train_size + val_size:]] = True

    graph.train_mask = train_mask
    graph.val_mask = val_mask
    graph.test_mask = test_mask

# create_random_split(graph, [1, 1, 8])

### subgraph sampling

In [5]:
import torch
import random
import numpy as np
from torch_geometric.utils import to_networkx, degree, k_hop_subgraph
from torch_geometric.data import Data

def _convert_to_nodeDegreeFeatures(graphs):
    """Return copies of ``graphs`` whose ``x`` is a one-hot node-degree encoding.

    The one-hot width is shared by all graphs: it is ``maxdegree + 1``, where
    ``maxdegree`` is the largest node degree found in the undirected networkx
    view of any input graph. The per-node degree that gets encoded is the
    number of times the node appears in ``edge_index[0]``.

    NOTE(review): the width comes from the undirected networkx degree while
    the encoded value comes from ``edge_index[0]``; this assumes the latter
    never exceeds the former (e.g. no duplicated edges) — confirm for new data.

    Args:
        graphs: iterable of PyG ``Data`` objects with an ``edge_index``.

    Returns:
        list: cloned graphs, in input order, with ``x`` replaced by a float
        tensor of shape ``[num_nodes, maxdegree + 1]``.
    """
    graphs = list(graphs)  # iterated twice below

    maxdegree = 0
    for graph in graphs:
        g = to_networkx(graph, to_undirected=True)
        # default=0 keeps a graph without nodes from raising on max().
        gdegree = max(dict(g.degree).values(), default=0)
        if gdegree > maxdegree:
            maxdegree = gdegree

    new_graphs = []
    for graph in graphs:
        deg = degree(graph.edge_index[0], graph.num_nodes, dtype=torch.long)
        # Use the fully qualified name: the notebook never imports
        # ``torch.nn.functional as F``.
        deg = torch.nn.functional.one_hot(deg, num_classes=maxdegree + 1).to(torch.float)

        new_graph = graph.clone()
        new_graph['x'] = deg
        new_graphs.append(new_graph)

    return new_graphs

def to_egoNet(graph, ego, hop_number):
    """Extract the ``hop_number``-hop ego network around node ``ego``.

    Args:
        graph: PyG ``Data`` with ``edge_index``, ``x`` and ``y``.
        ego: index (or indices) of the central node(s) in ``graph``.
        hop_number: number of hops to expand from ``ego``.

    Returns:
        Data: a new graph holding the nodes reached by ``k_hop_subgraph``,
        with ``x``/``y`` sliced to those nodes and ``edge_index`` re-indexed
        so that node ``i`` of the result is the ``i``-th selected node.
    """
    # relabel_nodes=True lets PyG do the re-indexing in one vectorised step
    # instead of rebuilding a lookup dict for every edge endpoint, and keeps
    # edge_index an integer tensor even when the subgraph has no edges.
    # num_nodes is passed explicitly so an ego whose index is larger than any
    # index appearing in edge_index (an isolated node) is still valid.
    sub_nodes, sub_edge_index, _, _ = k_hop_subgraph(
        ego,
        hop_number,
        graph.edge_index,
        relabel_nodes=True,
        num_nodes=graph.num_nodes,
    )

    egonet = Data(edge_index=sub_edge_index, x=graph.x[sub_nodes], y=graph.y[sub_nodes])
    return egonet

def get_egonetworks(graph, ego_number, hop_number, sampling, dataset_split : bool = False):
    """Sample ego networks from ``graph``.

    Args:
        graph: PyG ``Data`` with ``x``, ``y`` and ``edge_index`` (on CPU).
        ego_number: number of ego networks to sample. For ``'random'`` this
            is the total; for ``'byLabel'`` it is the number per label. It is
            capped at ``graph.num_nodes`` and, for ``'byLabel'``, at the
            number of nodes carrying each label.
        hop_number: radius of every ego network, in hops.
        sampling: ``'random'`` (centres drawn uniformly from all nodes) or
            ``'byLabel'`` (centres drawn separately for every label value).
        dataset_split: when true, every ego network gets train/val/test masks
            from ``create_random_split`` with weights ``[1, 1, 8]``.

    Returns:
        tuple: ``(egonetworks, num_graphs, num_features, num_labels)``.
        ``egonetworks`` is a list for ``'random'`` and a dict mapping each
        label value to a list for ``'byLabel'``. ``num_features`` and
        ``num_labels`` are read from the input ``graph``.

    Raises:
        ValueError: if ``sampling`` is not one of the supported strategies.
    """
    if sampling not in ('random', 'byLabel'):
        raise ValueError(
            "unknown sampling strategy {!r}; expected 'random' or 'byLabel'".format(sampling)
        )

    ego_number = min(ego_number, graph.num_nodes)

    num_graphs = ego_number
    num_features = graph.num_node_features
    num_labels = len(graph.y.unique())

    if sampling == 'random':
        egos = random.sample(range(graph.num_nodes), ego_number)
        print("random ego central nodes:{}".format(egos))
        egonetworks = [to_egoNet(graph, ego, hop_number) for ego in egos]

        # Fall back to degree features only when the ego networks carry none.
        if egonetworks and 'x' not in egonetworks[0]:
            egonetworks = _convert_to_nodeDegreeFeatures(egonetworks)

        all_graphs = list(egonetworks)
    else:
        egos_byLabel = {}
        for label in graph.y.unique():
            idx_label = torch.nonzero(graph.y == label, as_tuple=True)[0].tolist()
            # ego_number is per label here; never ask for more centres than
            # there are nodes with this label.
            egos_byLabel[label.item()] = random.sample(idx_label, min(ego_number, len(idx_label)))

        egonetworks = {
            k: [to_egoNet(graph, ego, hop_number) for ego in v]
            for k, v in egos_byLabel.items()
        }
        num_graphs = sum(len(v) for v in egonetworks.values())

        # Inspect an actual graph (not the per-label list) for features.
        first_graph = next((g for gs in egonetworks.values() for g in gs), None)
        if first_graph is not None and 'x' not in first_graph:
            egonetworks = {k: _convert_to_nodeDegreeFeatures(v) for k, v in egonetworks.items()}

        all_graphs = [g for gs in egonetworks.values() for g in gs]

    if dataset_split:
        # Iterate the graphs themselves so this also works for the dict
        # returned by 'byLabel' sampling.
        for subgraph in all_graphs:
            create_random_split(subgraph, [1, 1, 8])

    return egonetworks, num_graphs, num_features, num_labels


def log_subgraphs(subgraphs):
    """Print a short report for every subgraph in ``subgraphs``.

    For each subgraph, in order, this prints a numbered banner, the subgraph
    itself, the distinct label values found in ``subgraph.y`` and, per label,
    how many nodes carry it.
    """
    position = 0
    for subgraph in subgraphs:
        banner = "----------ego {}----------".format(position)
        print(banner)
        print(subgraph)

        present_labels = subgraph.y.unique()
        print("node label: ", present_labels)

        for label in present_labels:
            node_count = (subgraph.y == label).sum().item()
            print("{} label node number: ".format(label), node_count)

        position += 1


# subgraphs, num_graphs, num_features, num_labels = get_egonetworks(graph.cpu(), ego_number=6, hop_number=10, sampling='random', dataset_split=False)
# log_subgraphs(subgraphs)

### Mat2Graph

In [3]:
import scipy.io as sio
import numpy as np
import torch
from torch_geometric.data import Data

def load_network(file:str, dataset_split: bool = False) -> Data:
    """Load a MATLAB ``.mat`` network file into a PyG ``Data`` graph.

    The file is expected to hold three variables: ``'attrb'`` (node
    attributes), ``'network'`` (a SciPy sparse adjacency matrix) and
    ``'group'`` (per-node label scores; the arg-max along axis 1 becomes the
    label).

    Args:
        file: path of the ``.mat`` file.
        dataset_split: when true, train/val/test masks are attached with
            ``create_random_split`` using weights ``[1, 1, 8]``.

    Returns:
        Data: graph with float ``x``, long ``edge_index`` of shape ``[2, E]``
        containing every ``(row, col)`` whose adjacency value equals 1, and
        integer labels ``y``.
    """
    net = sio.loadmat(file)
    x, y = net['attrb'], net['group']

    # Read the edges straight from the sparse matrix instead of building the
    # dense N x N array. CSR with duplicates summed (which also sorts the
    # column indices) converts to COO in row-major order, i.e. the same edge
    # order a dense ``np.where(a == 1)`` scan would produce.
    adj = net['network'].tocsr()
    adj.sum_duplicates()
    adj = adj.tocoo()
    keep = adj.data == 1
    edges = np.vstack((adj.row[keep], adj.col[keep]))

    A = torch.tensor(edges, dtype = torch.long)
    X = torch.tensor(x, dtype = torch.float)
    Y = torch.tensor(np.argmax(y, axis = 1))

    graph = Data(x=X, edge_index=A, y=Y)

    if dataset_split:
        create_random_split(graph, [1, 1, 8])

    return graph

# graph = load_network(file, dataset_split = False)

### save subgraph

In [22]:
import pickle

def save_egographs(data: list, outfile):
    '''
    Pickle ``data`` to ``outfile`` and print a confirmation line.

    data: [subgraphs, num_graphs, num_features, num_labels]
    outfile: file to save (opened in binary write mode, so an existing file
        is overwritten)
    '''
    # The context manager guarantees the handle is flushed and closed even if
    # pickling fails part-way through.
    with open(outfile, 'wb') as handle:
        pickle.dump(data, handle)
    print(f"Wrote to {outfile}.")

### network visualization

In [5]:
import pyecharts
from pyecharts import options as opts
from pyecharts.charts import Graph
import numpy as np
import torch_geometric

from pyecharts.globals import CurrentConfig, NotebookType
CurrentConfig.NOTEBOOK_TYPE = NotebookType.JUPYTER_NOTEBOOK

def visualize(g : torch_geometric.data.Data = None, gName : str = None):
    """Build a pyecharts ``Graph`` chart of ``g`` with nodes grouped by label.

    Every node is named after its index and assigned the category given by
    its entry in ``g.y``; every column of ``g.edge_index`` becomes a link.
    ``gName`` is used as the chart title. The chart object is returned so the
    caller can render it.
    """
    repulsion_size = 800000

    # One chart node per graph node, categorised by its label.
    nodes = []
    for idx, label in zip(range(g.num_nodes), g.y):
        nodes.append(opts.GraphNode(name=str(idx), category=label.item()))

    # One chart link per (source, target) pair in edge_index.
    links = []
    for s, t in zip(g.edge_index[0], g.edge_index[1]):
        links.append(opts.GraphLink(source=s.item(), target=t.item()))

    # One legend category per distinct label value.
    categories = [opts.GraphCategory(name=str(i)) for i in np.unique(g.y)]

    chart = Graph()
    chart = chart.add("", nodes, links, categories, repulsion=repulsion_size)
    chart = chart.set_global_opts(title_opts=opts.TitleOpts(title=gName))

    chart.load_javascript()

    return chart

### Dataset1 process

In [6]:
# Relative paths of the three dataset1 .mat network files (ACM, Citation, DBLP)
# that are passed to load_network below.
acm_path = "dataset1/acmv9.mat"
citation_path = "dataset1/citationv1.mat"
dblp_path = "dataset1/dblpv7.mat"

In [None]:
from describe import describe

# Load each network with dataset_split=True, so load_network also attaches
# random train/val/test masks, then print a summary via the external
# `describe` helper. The globals acm / citation / dblp are reused by the
# save cells further down.
acm = load_network(acm_path, True)
print("acm:")
describe(acm, infos=['basic', 'label', 'degree'], deg_types = ['out'])

citation = load_network(citation_path, True)
print("citation:")
describe(citation, infos=['basic', 'label', 'degree'], deg_types = ['out'])

dblp = load_network(dblp_path, True)
print("dblp:")
describe(dblp, infos=['basic', 'label', 'degree'], deg_types = ['out'])

In [53]:
# c = visualize(acm, "acm")
# c.render_notebook()

In [54]:
# c = visualize(citation, "citation")
# c.render_notebook()

In [55]:
# c = visualize(dblp, "dblp")
# c.render_notebook()

In [None]:
# Save the ac2d pickles: two randomly centred 10-hop ego networks each from
# acm and citation, followed by the full dblp graph as the last entry.
# Requires acm, citation and dblp from the loading cell above.
# NOTE(review): no random seed is set in the visible cells, so the sampled
# ego networks differ between runs — confirm whether that is intended.
acm_subgraphs, acm_num_graphs, num_features, num_labels = get_egonetworks(acm, ego_number=2, hop_number=10, sampling='random', dataset_split=True)
log_subgraphs(acm_subgraphs)

# num_features / num_labels are overwritten here, so the saved values are the
# ones reported for citation.
citation_subgraphs, citation_num_graphs, num_features, num_labels = get_egonetworks(citation, ego_number=2, hop_number=10, sampling='random', dataset_split=True)
log_subgraphs(citation_subgraphs)

subgraphs = acm_subgraphs + citation_subgraphs
subgraphs.append(dblp)
num_graphs = acm_num_graphs + citation_num_graphs + 1

# First file: the three full graphs; second file: ego networks plus dblp,
# with the total graph count embedded in the file name.
save_egographs([[acm, citation, dblp], 3, num_features, num_labels], outfile='dataset1/ac2d_3.pkl')
save_egographs([subgraphs, num_graphs, num_features, num_labels], outfile=f'dataset1/ac2d_{num_graphs}.pkl')

In [None]:
# Save the ad2c pickles: two randomly centred 10-hop ego networks each from
# acm and dblp, followed by the full citation graph as the last entry.
# Requires acm, citation and dblp from the loading cell above.
# NOTE(review): no random seed is set in the visible cells, so the sampled
# ego networks differ between runs — confirm whether that is intended.
acm_subgraphs, acm_num_graphs, num_features, num_labels = get_egonetworks(acm, ego_number=2, hop_number=10, sampling='random', dataset_split=True)
log_subgraphs(acm_subgraphs)

# num_features / num_labels are overwritten here, so the saved values are the
# ones reported for dblp.
dblp_subgraphs, dblp_num_graphs, num_features, num_labels = get_egonetworks(dblp, ego_number=2, hop_number=10, sampling='random', dataset_split=True)
log_subgraphs(dblp_subgraphs)

subgraphs = acm_subgraphs + dblp_subgraphs
subgraphs.append(citation)
num_graphs = acm_num_graphs + dblp_num_graphs + 1

# First file: the three full graphs; second file: ego networks plus citation,
# with the total graph count embedded in the file name.
save_egographs([[acm, dblp, citation], 3, num_features, num_labels], outfile='dataset1/ad2c_3.pkl')
save_egographs([subgraphs, num_graphs, num_features, num_labels], outfile=f'dataset1/ad2c_{num_graphs}.pkl')

In [None]:
# Save the cd2a pickles: two randomly centred 10-hop ego networks each from
# citation and dblp, followed by the full acm graph as the last entry.
# Requires acm, citation and dblp from the loading cell above.
# NOTE(review): no random seed is set in the visible cells, so the sampled
# ego networks differ between runs — confirm whether that is intended.
citation_subgraphs, citation_num_graphs, num_features, num_labels = get_egonetworks(citation, ego_number=2, hop_number=10, sampling='random', dataset_split=True)
log_subgraphs(citation_subgraphs)

# num_features / num_labels are overwritten here, so the saved values are the
# ones reported for dblp.
dblp_subgraphs, dblp_num_graphs, num_features, num_labels = get_egonetworks(dblp, ego_number=2, hop_number=10, sampling='random', dataset_split=True)
log_subgraphs(dblp_subgraphs)

subgraphs = citation_subgraphs + dblp_subgraphs
subgraphs.append(acm)
num_graphs = citation_num_graphs + dblp_num_graphs + 1

# First file: the three full graphs; second file: ego networks plus acm,
# with the total graph count embedded in the file name.
save_egographs([[citation, dblp, acm], 3, num_features, num_labels], outfile='dataset1/cd2a_3.pkl')
save_egographs([subgraphs, num_graphs, num_features, num_labels], outfile=f'dataset1/cd2a_{num_graphs}.pkl')