In [1]:
import os
import numpy as np

In [2]:
# Define dataset name
dataset_name = "COLLAB"

# Define datasets destination (created if missing; the dataset files themselves
# are expected to already exist under <datasets_folder>/<dataset_name>/)
datasets_folder = "datasets/"
os.makedirs(datasets_folder, exist_ok=True)

# File paths of the edge list and of the per-node graph assignment list
edges_file = os.path.join(datasets_folder, dataset_name, f"{dataset_name}.edges")
graph_idx_file = os.path.join(datasets_folder, dataset_name, f"{dataset_name}.graph_idx")

# Load the edge list (one comma-separated "row,col" pair per line) and the graph
# id of every node (one integer per line).
# ndmin pins the array rank: without it np.loadtxt squeezes a single-line file,
# so a one-edge file would yield a 1-D array that cannot be unpacked as
# (row, col) pairs and a one-node file would yield a 0-D array that cannot be sliced.
edges = np.loadtxt(edges_file, dtype=int, delimiter=",", ndmin=2)
graph_idx = np.loadtxt(graph_idx_file, dtype=int, ndmin=1)

In [3]:
# Define parameters for Node2Vec
# These module-level values are read by the training function and by the
# per-graph loop further down in the notebook.

# Set dimension of node embeddings
# (the stored node features have one extra column: the normalized degree)
embedding_dim = 5

# Set length and number of random walks
# (passed to Node2Vec as walk_length and walks_per_node)
walk_length = 5
walks_per_node = 10

# Set context size for Skip-Gram
# (passed to Node2Vec as context_size)
context_size = 5

# Set number of training epochs
# (every graph's Node2Vec model is trained for this many passes over its loader)
num_epochs = 5

# Set learning rate
# (used by the SparseAdam optimizer)
learning_rate = 0.001

In [4]:
import torch

In [5]:
# Pick the device used to train Node2Vec: CUDA when available, CPU otherwise
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [6]:
# Define train procedure for Node2Vec
def train(node2vec, loader, optimizer, epochs=None, target_device=None):
    """Train a Node2Vec model on its random-walk loader.

    Args:
        node2vec: Model exposing ``train()`` and ``loss(pos_rw, neg_rw)``.
        loader: Iterable yielding ``(pos_rw, neg_rw)`` tensor pairs.
        optimizer: Optimizer bound to the model parameters.
        epochs: Number of passes over ``loader``. Defaults to the notebook-level
            ``num_epochs`` when omitted.
        target_device: Device the sampled walks are moved to before computing
            the loss. Defaults to the notebook-level ``device`` when omitted.

    Returns:
        The sum of all batch losses divided by the number of epochs, i.e. the
        mean per-epoch cumulative loss (not a per-batch mean). ``0.0`` when the
        loader yields no batches.

    Raises:
        ValueError: If the number of epochs is not positive.
    """
    # Fall back to the notebook configuration only when the caller did not
    # pass explicit values, so existing three-argument calls keep working.
    if epochs is None:
        epochs = num_epochs
    if target_device is None:
        target_device = device

    # Fail early with a clear message instead of a ZeroDivisionError at the end
    if epochs <= 0:
        raise ValueError(f"epochs must be a positive integer, got {epochs}")

    # Set Node2Vec model to training mode
    node2vec.train()
    total_loss = 0.0

    # Run training epochs
    for _ in range(epochs):
        # Iterate over random walk samples
        for pos_rw, neg_rw in loader:
            optimizer.zero_grad()
            loss = node2vec.loss(pos_rw.to(target_device), neg_rw.to(target_device))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

    # Return training loss accumulated per epoch
    return total_loss / epochs

In [7]:
from torch_geometric.nn import Node2Vec
from torch_geometric.data import Data
from torch_geometric.utils import degree

In [8]:
# Keep node IDs file and edges file row indexes to read graphs sequentially.
# Both files are consumed in a single forward pass: the rows of each graph are
# expected to be contiguous and to appear in the order of the sorted graph ids.
nodes_file_row = 0
edges_file_row = 0

# Extract graph ids
unique_graph_ids = np.unique(graph_idx)

# Data structure for storing graphs
graphs = []

# Dictionary mapping graph id -> node feature tensor
# (Node2Vec embedding columns followed by one normalized-degree column)
graph_node_embeddings = {}

# Sequence number associated to the current graph
seq_num = 0

# Total number of nodes among all graphs processed so far
tot_num_nodes = 0

for current_graph_id in unique_graph_ids:

    # Count the consecutive rows of graph_idx that belong to the current graph.
    # Node IDs are 1-based row numbers, so the graph owns one contiguous ID range.
    N = 0
    for graph_id in graph_idx[nodes_file_row:]:
        if graph_id != current_graph_id:
            break
        N += 1

    # Inclusive range of global node IDs of the current graph
    first_node_id = nodes_file_row + 1
    last_node_id = nodes_file_row + N
    nodes_file_row = nodes_file_row + N

    # Edge endpoints re-indexed to 0..N-1 (one [row, col] pair per undirected edge)
    indices = []

    # Structure to store edges already in the current graph
    seen_edges = set()

    # Extract edges of current graph: read until the first edge that leaves the
    # current node ID range (a range comparison replaces the per-edge list scan)
    for row, col in edges[edges_file_row:]:
        row, col = int(row), int(col)
        row_inside = first_node_id <= row <= last_node_id
        col_inside = first_node_id <= col <= last_node_id
        if not (row_inside and col_inside):
            break

        # Order the endpoints so that (a, b) and (b, a) count as the same edge
        edge = (row, col) if row <= col else (col, row)
        if edge not in seen_edges:
            seen_edges.add(edge)

            # Add indices for the sparse tensor, local to the current graph
            indices.append([row - first_node_id, col - first_node_id])

        # Count at which row we are reading the file
        edges_file_row = edges_file_row + 1

    # Convert indices to a (2, num_edges) tensor; the reshape keeps the shape
    # two-dimensional even when the graph has no edges
    indices = torch.tensor(indices, dtype=torch.long).reshape(-1, 2).t()

    # Symmetrize adjacency matrix
    reversed_indices = indices.flip(0)
    indices = torch.cat([indices, reversed_indices], dim=1)

    # Create Pytorch Geometric Data object for Node2Vec
    num_nodes = N
    data = Data(edge_index=indices, num_nodes=num_nodes)
    data = data.to(device)

    # Initialize Node2Vec model.
    # num_nodes is passed explicitly: otherwise the model would size itself from
    # the largest index found in edge_index and nodes without edges at the end
    # of the ID range would get no embedding row.
    node2vec = Node2Vec(
        edge_index=data.edge_index,
        embedding_dim=embedding_dim,
        walk_length=walk_length,
        context_size=context_size,
        walks_per_node=walks_per_node,
        num_negative_samples=1,
        num_nodes=num_nodes,
        sparse=True
    )

    # Move Node2Vec model to GPU (when available)
    node2vec = node2vec.to(device)

    # Create DataLoader object
    loader = node2vec.loader(batch_size=32, shuffle=True, num_workers=4)

    # Define optimizer to train Node2Vec
    optimizer = torch.optim.SparseAdam(list(node2vec.parameters()), lr=learning_rate)

    # Train Node2Vec
    avg_loss = train(node2vec, loader, optimizer)

    # Extract the trained node embeddings (one row per node)
    embeddings = node2vec.embedding.weight.detach().to(device)

    # Compute normalized degree of each node: degree in the symmetrized edge
    # list divided by the number of nodes of the graph, as a column vector
    normalized_degrees = degree(data.edge_index[0], num_nodes, dtype=torch.float) / num_nodes
    normalized_degrees = normalized_degrees.view(-1, 1)

    # Concatenate node embeddings and normalized degree
    node_features = torch.cat([embeddings, normalized_degrees], dim=1)

    # Store node features of current graph under a plain Python int key, so the
    # saved dictionary does not contain pickled NumPy scalar objects
    # (NumPy integer keys still find the same entry on lookup)
    graph_node_embeddings[int(current_graph_id)] = node_features

    # Total number of nodes among all previous graphs
    tot_num_nodes = tot_num_nodes + N

In [9]:
# Destination of the serialized node features, next to the dataset's input files
output_file = os.path.join(
    datasets_folder,
    dataset_name,
    f"{dataset_name}.pt",
)

# Write the {graph id: node feature tensor} dictionary to the .pt file
torch.save(obj=graph_node_embeddings, f=output_file)