In [1]:
import os
import warnings

import numpy as np
import torch

In [2]:
# Dataset selection and loading of the two raw text files used to build the graphs.

# Define dataset name
dataset_name = "ENZYMES"

# Define datasets destination
datasets_folder = "datasets/"
os.makedirs(datasets_folder, exist_ok=True)

# File paths of the edge list and of the node-to-graph index list
edges_file = os.path.join(datasets_folder, dataset_name, f"{dataset_name}.edges")
graph_idx_file = os.path.join(datasets_folder, dataset_name, f"{dataset_name}.graph_idx")

# Load edge list: one comma-separated "row,col" pair per line.
# ndmin=2 keeps the array two-dimensional even when the file holds a single edge,
# so iterating over it always yields (row, col) pairs.
edges = np.loadtxt(edges_file, dtype=int, delimiter=",", ndmin=2)

# Load graph index list: one graph id per line.
# ndmin=1 keeps the array one-dimensional even when the file holds a single line,
# so it can always be sliced and iterated.
graph_idx = np.loadtxt(graph_idx_file, dtype=int, ndmin=1)

In [None]:
# Build one sparse boolean adjacency matrix per graph.
#
# `graph_idx[i]` is the graph id of node `i + 1` (node IDs are 1-based positions in
# the graph index file) and `edges` holds (row, col) node-ID pairs. Both are read
# sequentially: the nodes of a graph are the next run of equal ids in `graph_idx`,
# and its edges are the next run of rows in `edges` whose endpoints both belong to
# that graph. The result is `adj_matrices`, mapping str(sequence number) to an
# (N, N) sparse COO tensor of dtype bool.

# Keep node IDs file and edges file row indexes to read graphs sequentially
nodes_file_row = 0
edges_file_row = 0

# Extract graph ids (np.unique returns them sorted)
unique_graph_ids = np.unique(graph_idx)

# Data structure for storing graphs.
# It is never filled in this cell; kept so the name stays defined.
graphs = []

# Dictionary of adjacency matrices tensors
adj_matrices = {}

# Sequence number associated to the current graph
seq_num = 0

# Total number of nodes of the graphs processed so far; subtracted from the
# global node IDs to obtain 0-based indices local to the current graph
tot_num_nodes = 0

for current_graph_id in unique_graph_ids:

    # Data structures for storing node IDs and edges
    current_graph_node_ids = []
    current_graph_edges = []

    # Extract node IDs of current graph: consume rows until the graph id changes
    for idx, graph_id in enumerate(graph_idx[nodes_file_row:], start=nodes_file_row):
        if graph_id != current_graph_id:
            break
        current_graph_node_ids.append(idx + 1)

    # Advance the node cursor past the rows just consumed (also for the last graph)
    nodes_file_row += len(current_graph_node_ids)

    # Number of nodes of the current graph
    N = len(current_graph_node_ids)

    # Set view of the node IDs for constant-time membership tests in the edge loop
    current_graph_node_id_set = set(current_graph_node_ids)

    # Local (row, col) index pairs of the sparse adjacency matrix
    edge_pairs = []

    # Structure to store edges already in the current graph
    seen_edges = set()

    # Extract edges of current graph
    for row, col in edges[edges_file_row:]:
        row, col = int(row), int(col)

        # The first edge not fully inside this graph ends the graph's edge run
        if row not in current_graph_node_id_set or col not in current_graph_node_id_set:
            break

        # Count at which row we are reading the file
        edges_file_row += 1

        # Order the endpoints so (a, b) and (b, a) are recognised as the same edge
        edge = (row, col) if row <= col else (col, row)
        if edge in seen_edges:
            continue
        seen_edges.add(edge)

        current_graph_edges.append((row, col))

        # Add indices for the sparse tensor
        edge_pairs.append([row - 1 - tot_num_nodes, col - 1 - tot_num_nodes])

    # Convert index pairs to a (2, E) tensor. The explicit reshape keeps the tensor
    # two-dimensional when the graph has no edges (E == 0); without it the tensor
    # would be 1-D and the steps below would fail.
    indices = torch.tensor(edge_pairs, dtype=torch.long).reshape(len(edge_pairs), 2).t()

    # Symmetrize adjacency matrix by appending every edge with swapped endpoints.
    # NOTE: a self-loop (a, a) is its own mirror, so it ends up stored twice.
    reversed_indices = indices.flip(0)
    indices = torch.cat([indices, reversed_indices], dim=1)

    # Add key-value pair to the adjacency matrices dictionary
    adj_matrices[str(seq_num)] = torch.sparse_coo_tensor(
        indices, torch.ones(indices.size(1)), size=(N, N), dtype=bool
    )

    tot_num_nodes = tot_num_nodes + N

    # Increment sequence number
    seq_num += 1

# Every edge row should have been consumed by exactly one graph. Leftover rows mean
# the sequential scan stopped early and the graphs from that point on lack edges.
if edges_file_row != len(edges):
    warnings.warn(
        f"{len(edges) - edges_file_row} of {len(edges)} edge rows were not assigned to any graph; "
        "check that the edge list is grouped by graph in the same order as the graph index file."
    )

In [4]:
# Destination of the serialized adjacency matrices: "<dataset_name>.pth" inside
# the dataset's own folder
output_name = f"{dataset_name}.pth"
output_file = os.path.join(datasets_folder, dataset_name, output_name)

# Write the whole dictionary of graph tensors to the .pth file in one call
torch.save(obj=adj_matrices, f=output_file)