# Generated Datasets

In [None]:
import networkx as nx
import numpy as np
from graph_generation.data import synthetic_graphs
import torch as th
from pathlib import Path
import pickle


# Root directory for the raw inputs read and the pickled datasets written by the
# cells below. The path is relative, so it resolves against the current working
# directory of the notebook.
data_path = Path("../data")

# Spectre Datasets

In [None]:
def split_dataset(graphs):
    """Randomly split ``graphs`` into train, validation and test lists.

    20% of the items (rounded) go to the test split; of the remainder, 80%
    (rounded) go to the train split and whatever is left forms the validation
    split. The shuffle uses a torch generator seeded with 1234, so repeated
    calls on an input of the same length select the same positions.

    Args:
        graphs: Indexable sequence of items to split.

    Returns:
        A ``(train, val, test)`` tuple of lists holding the selected items.
    """
    # The split sizes and the seed follow "github.com/KarolisMart/SPECTRE".
    total = len(graphs)
    n_test = int(round(total * 0.2))
    n_train = int(round((total - n_test) * 0.8))
    n_val = total - n_train - n_test

    rng = th.Generator().manual_seed(1234)
    parts = th.utils.data.random_split(graphs, [n_train, n_val, n_test], generator=rng)

    # Materialise each subset as a plain list of the original items.
    train_part, val_part, test_part = (
        [graphs[idx] for idx in part.indices] for part in parts
    )
    return train_part, val_part, test_part

## Synthetic Datasets

In [None]:
# SPECTRE file to convert; the output pickle is named after its prefix.
name = "planar_64_200.pt" # or sbm_200.pt
# Index 0 of the loaded object is iterated as a collection of adjacency tensors.
adjs = th.load(data_path / "spectre/" / name)[0]
graphs = [nx.from_numpy_array(adj.numpy().astype(bool)) for adj in adjs]

train, val, test = split_dataset(graphs)
dataset = {
    "train": train,
    "val": val,
    "test": test,
}


# save the dataset
# Derive the output name from the input prefix ("planar" or "sbm") so that
# switching `name` does not overwrite the other dataset's pickle.
output_name = name.split("_")[0] + ".pkl"
with open(data_path / output_name, "wb") as f:
    pickle.dump(dataset, f)

## Protein and Point Cloud


In [None]:
def load(min_size, max_size, name, largest_cc=False):
    """Load a multi-graph dataset from text files and split it.

    Reads ``{name}/{name}_A.txt`` (comma-separated node pairs, one edge per
    row) and ``{name}/{name}_graph_indicator.txt`` (one graph id per node, in
    node order; node ids are taken to start at 1) from ``data_path``. Prints
    node/edge count statistics of the kept graphs and splits them with
    ``split_dataset``.

    Args:
        min_size: Smallest node count (inclusive) a graph may have to be kept.
        max_size: Largest node count (inclusive) a graph may have to be kept.
        name: Dataset name, used for both the directory and the file prefix.
        largest_cc: If true, a disconnected graph is reduced to its largest
            connected component. The size filter is applied before this
            reduction.

    Returns:
        Dict with keys ``"train"``, ``"val"`` and ``"test"`` mapping to lists
        of networkx graphs.
    """
    with open(data_path / f"{name}/{name}_A.txt", "rb") as edge_file:
        edge_rows = np.loadtxt(edge_file, delimiter=',').astype(int)

    with open(data_path / f"{name}/{name}_graph_indicator.txt", "rb") as indicator_file:
        graph_of_node = np.loadtxt(indicator_file, delimiter=',').astype(int)

    # Build one graph holding the edges of every dataset graph, without self-loops.
    # NOTE: only nodes that occur in at least one edge end up in this graph.
    full_graph = nx.Graph()
    full_graph.add_edges_from([tuple(row) for row in edge_rows])
    full_graph.remove_edges_from(nx.selfloop_edges(full_graph))

    # Position k in the indicator file describes node id k + 1.
    node_ids = np.arange(graph_of_node.shape[0]) + 1
    last_graph_id = graph_of_node.max()

    graphs = []
    for graph_id in range(1, last_graph_id + 1):
        candidate = full_graph.subgraph(node_ids[graph_of_node == graph_id])
        n_nodes = candidate.number_of_nodes()
        if n_nodes < min_size or n_nodes > max_size:
            continue
        if largest_cc and not nx.is_connected(candidate):
            biggest = max(nx.connected_components(candidate), key=len)
            candidate = candidate.subgraph(biggest)
        # Round-trip through a boolean adjacency matrix to obtain a standalone
        # graph instead of a view on the combined graph.
        adjacency = nx.to_scipy_sparse_array(candidate).astype(bool)
        graphs.append(nx.from_scipy_sparse_array(adjacency))

    node_counts = [g.number_of_nodes() for g in graphs]
    edge_counts = [g.number_of_edges() for g in graphs]
    print(f"max nodes: {max(node_counts)}")
    print(f"min nodes: {min(node_counts)}")
    print(f"avg nodes: {np.mean(node_counts)}")
    print(f"std nodes: {np.std(node_counts)}")
    print(f"max edges: {max(edge_counts)}")
    print(f"min edges: {min(edge_counts)}")
    print(f"avg edges: {np.mean(edge_counts)}")
    print(f"std edges: {np.std(edge_counts)}")

    train, val, test = split_dataset(graphs)
    return dict(train=train, val=val, test=test)

In [None]:
# Protein: keep only "DD" graphs with 100 to 500 nodes (inclusive),
# without reducing them to their largest connected component.
min_size, max_size = 100, 500

dataset = load(min_size=min_size, max_size=max_size, name="DD", largest_cc=False)

with (data_path / "protein.pkl").open("wb") as f:
    pickle.dump(dataset, f)

In [None]:
# Point Cloud: keep "FIRSTMM_DB" graphs with up to 10000 nodes and reduce
# disconnected ones to their largest connected component.
min_size, max_size = 0, 10000

dataset = load(min_size=min_size, max_size=max_size, name="FIRSTMM_DB", largest_cc=True)

with (data_path / "point_cloud.pkl").open("wb") as f:
    pickle.dump(dataset, f)

# Ours

In [None]:
# Number of graphs requested per split when generating the synthetic datasets below.
train_len, val_len, test_len = 128, 32, 40

## Tree

In [None]:
# Generate the three splits of 64-node trees; each split uses its own seed
# (train: 0, val: 1, test: 2).
train_graphs, val_graphs, test_graphs = (
    synthetic_graphs.generate_tree_graphs(num_graphs=count, min_size=64, max_size=64, seed=seed)
    for seed, count in enumerate((train_len, val_len, test_len))
)

dataset = dict(train=train_graphs, val=val_graphs, test=test_graphs)

# save the dataset
with (data_path / "tree.pkl").open("wb") as f:
    pickle.dump(dataset, f)


## Extrapolation & Interpolation


In [None]:
# (min_size, max_size) node-count intervals used to generate training graphs.
train_intervals_extrapolation = [
    (32, 64),
]
train_intervals_interpolation = [
    (32, 64),
    (128, 160),
]
# Node counts used for the validation and test graphs: 48 to 144 in steps of 16.
test_graph_sizes = list(range(48, 144 + 1, 16))

In [None]:
generator = synthetic_graphs.generate_planar_graphs # or synthetic_graphs.generate_tree_graphs
# Training size intervals for this run.
train_intervals = train_intervals_extrapolation # or train_intervals_interpolation

# Spread the training budget evenly over the intervals. The divisor must be the
# number of intervals: the length of a single (min, max) interval is always 2,
# which would halve the training set whenever there is only one interval.
graphs_per_interval = train_len // len(train_intervals)

train_graphs = []
for min_nodes, max_nodes in train_intervals:
    train_graphs += generator(num_graphs=graphs_per_interval, min_size=min_nodes, max_size=max_nodes, seed=0)

# Validation and test sets contain a fixed number of graphs for every test size.
val_graphs = []
for size in test_graph_sizes:
    val_graphs += generator(num_graphs=val_len, min_size=size, max_size=size, seed=1)

test_graphs = []
for size in test_graph_sizes:
    test_graphs += generator(num_graphs=test_len, min_size=size, max_size=size, seed=2)

dataset = {
    "train": train_graphs,
    "val": val_graphs,
    "test": test_graphs,
}

# save the dataset
# NOTE: adjust the file name when switching the generator or the intervals above.
with open(data_path / "planar_extrapolation.pkl", "wb") as f:
    pickle.dump(dataset, f)