## Baselines
This notebook preprocesses the hypergraphs so the baselines can learn from them. For the diffusion / GAN / VAE baselines, it converts the hypergraphs to padded images in which a white pixel is a 1 and a black pixel is a 0.

For HyperPA, it outputs the approximated edge-size distribution and the number of new hyperedges introduced by each node.

## Subset Sampling (unfinished baseline)

In [None]:
import pickle
import random

# Read the pickled dataset; only its 'train' entry is used below
dataset_name = "hypergraphErdosRenyi"
pkl_filename = f"../data/{dataset_name}.pkl"
with open(pkl_filename, 'rb') as f:
    data = pickle.load(f)

# Each output line has the form "<edge sizes>;<edge members, flattened>"
output_filename = f"{dataset_name}-seqs.txt"

with open(output_filename, 'w') as out_file:
    for hnx_hypergraph in data['train']:
        hyperedges = hnx_hypergraph.edges

        # Write several randomly ordered serializations of the same hypergraph
        for _ in range(5):  # Adjust the number of repetitions as needed
            keys = list(hyperedges)
            random.shuffle(keys)

            # Sizes and members are listed in the same (shuffled) edge order
            sizes = [len(hyperedges[key]) for key in keys]
            elements = [member for key in keys for member in hyperedges[key]]

            size_field = ','.join(map(str, sizes))
            element_field = ','.join(map(str, elements))
            line = f"{size_field};{element_field}\n"
            out_file.write(line)

print(f"Output written to {output_filename}")

## HyperPA

In [1]:
import os
import pickle
import hypernetx as hnx
import numpy as np
from collections import Counter

# Where the pickled datasets live and where the two HyperPA inputs are written
dataset_dir = '../data'
output_dir_simplex = 'hyperpa_SS/simplex per node'
output_dir_size_distribution = 'hyperpa_SS/size distribution'

# Create both output folders up front; existing folders are left untouched
for _output_dir in (output_dir_simplex, output_dir_size_distribution):
    os.makedirs(_output_dir, exist_ok=True)

def write_simplex_per_node_distribution(filename, hypergraph):
    """Write the approximated "new hyperedges per node" distribution to a file.

    The ``m`` hyperedges are attributed to the ``n`` nodes round-robin, so
    every node ends up with either ``m // n`` or ``m // n + 1`` hyperedges.
    Line ``k`` (0-based) of the output holds the integer percentage of nodes
    that were attributed exactly ``k`` hyperedges; the lines sum to 100.

    Args:
        filename: Path of the text file to create or overwrite.
        hypergraph: Object whose ``nodes`` and ``edges`` support ``len()``.

    Raises:
        ValueError: If the hypergraph has no nodes (the distribution is
            undefined; the previous loop-based version never terminated
            when edges were present in that case).
    """
    n = len(hypergraph.nodes)
    m = len(hypergraph.edges)

    if n == 0:
        raise ValueError(
            "cannot compute a per-node distribution for a hypergraph without nodes"
        )

    # Closed form of the round-robin attribution: `extra` nodes receive
    # `base + 1` hyperedges and the remaining `n - extra` nodes receive `base`.
    base, extra = divmod(m, n)
    counts = [0] * base + [n - extra]
    if extra:
        counts.append(extra)

    # Convert counts to percentages and round
    percentages = [round(count / n * 100) for count in counts]

    # Rounding can leave the total slightly off; nudge the largest bucket up
    # or the smallest non-zero bucket down until the sum is exactly 100%.
    while sum(percentages) != 100:
        if sum(percentages) < 100:
            percentages[percentages.index(max(percentages))] += 1
        else:
            percentages[percentages.index(min(filter(lambda x: x != 0, percentages)))] -= 1

    # One percentage per line
    with open(filename, 'w') as f:
        for percentage in percentages:
            f.write(f"{percentage}\n")

def write_size_distribution(filename, hypergraph, max_size=25):
    """Write the hyperedge-size distribution of ``hypergraph`` to a file.

    Line ``k`` (1-based) of the output holds the integer percentage of
    counted hyperedges that contain exactly ``k`` nodes, for
    ``k = 1 .. max_size``; the lines sum to 100.

    Hyperedges larger than ``max_size`` are ignored, as are empty hyperedges
    (previously a size of 0 indexed position ``-1`` and was miscounted as the
    largest size).

    Args:
        filename: Path of the text file to create or overwrite.
        hypergraph: Object whose ``edges`` attribute iterates over hyperedge
            keys and maps each key to a sized collection of nodes.
        max_size: Largest hyperedge size that gets its own bucket
            (default 25, the previously hard-coded value).

    Raises:
        ValueError: If no hyperedge has a size between 1 and ``max_size``,
            since the percentages would be undefined.
    """
    size_counts = [0] * max_size
    for hyperedge in hypergraph.edges:
        size = len(hypergraph.edges[hyperedge])
        # Lower bound guards against negative-index wraparound for empty edges
        if 1 <= size <= max_size:
            size_counts[size - 1] += 1

    total_edges = sum(size_counts)
    if total_edges == 0:
        raise ValueError(
            f"hypergraph has no hyperedge with a size between 1 and {max_size}"
        )

    # Convert counts to percentages
    percentages = [round(count / total_edges * 100) for count in size_counts]

    # Rounding can leave the total slightly off; nudge the largest bucket up
    # or the smallest non-zero bucket down until the sum is exactly 100%.
    while sum(percentages) != 100:
        if sum(percentages) < 100:
            percentages[percentages.index(max(percentages))] += 1
        else:
            percentages[percentages.index(min(filter(lambda x: x != 0, percentages)))] -= 1

    # One percentage per line
    with open(filename, 'w') as f:
        for percentage in percentages:
            f.write(f"{percentage}\n")

# Walk every pickled dataset in the dataset directory
for file in os.listdir(dataset_dir):
    if not file.endswith('.pkl'):
        continue

    file_path = os.path.join(dataset_dir, file)
    with open(file_path, 'rb') as f:
        hypergraphs = pickle.load(f)

    # File name without its 4-character '.pkl' suffix; prefixes the outputs
    dataset_stem = file[:-4]

    # Produce both HyperPA input files for each hypergraph under the 'test' key
    for i, hypergraph in enumerate(hypergraphs['test']):
        output_file_simplex = os.path.join(output_dir_simplex, f"{dataset_stem}_{i}.txt")
        output_file_size_distribution = os.path.join(output_dir_size_distribution, f"{dataset_stem}_{i}.txt")

        write_simplex_per_node_distribution(output_file_simplex, hypergraph)
        write_size_distribution(output_file_size_distribution, hypergraph)

## Diffusion / GAN / VAE

In [2]:
import numpy as np
import hypernetx as hn
import random
from PIL import Image
import os
import pickle
from pathlib import Path

def matrix_to_image(matrix, save_path):
    """Save ``matrix`` as an 8-bit grayscale image at ``save_path``.

    Entries are multiplied by 255 before the uint8 conversion, so a 1 maps to
    pixel value 255 (white) and a 0 maps to pixel value 0 (black).
    """
    pixels = np.uint8(matrix * 255)
    grayscale = Image.fromarray(pixels, 'L')  # 'L' mode for grayscale
    grayscale.save(save_path)

def pad_matrix(matrix, max_rows, max_cols):
    """Return a new ``max_rows`` x ``max_cols`` integer array with ``matrix``
    copied into its top-left corner and zeros everywhere else.

    The input is not modified.
    """
    canvas = np.zeros(shape=(max_rows, max_cols), dtype=int)
    n_rows, n_cols = matrix.shape
    # Everything outside this window keeps its zero padding
    canvas[0:n_rows, 0:n_cols] = matrix
    return canvas

def shuffle_matrix(matrix):
    """Randomly permute the rows and then the columns of ``matrix``.

    The shuffling uses NumPy's global random state and reorders ``matrix``
    itself (in place); the returned object is the transpose of the shuffled
    transpose, i.e. it has the same orientation as the input.
    """
    # First pass: np.random.shuffle reorders along the first axis (rows)
    np.random.shuffle(matrix)

    # Second pass: the rows of the transpose are the columns of `matrix`
    transposed = matrix.T
    np.random.shuffle(transposed)

    # Flip back to the original orientation
    return transposed.T

def process_dataset(dataset_name, input_folder, output_base_folder):
    """Render the 'train' hypergraphs of one dataset as padded PNG images.

    Loads ``<input_folder>/<dataset_name>.pkl``, and for every hypergraph
    under its 'train' key writes five independently shuffled copies of the
    dense incidence matrix to
    ``<output_base_folder>/<dataset_name>/train/matrix_<i>_<j>.png``.
    All images share one size: the largest node count by the largest edge
    count found among the 'train' hypergraphs.
    """
    # Load the pickled dataset
    pkl_path = os.path.join(input_folder, f'{dataset_name}.pkl')
    with open(pkl_path, 'rb') as file:
        data = pickle.load(file)

    hypergraphs = data['train']

    # Common image size: pad every matrix up to the largest one
    node_counts = [len(list(H.nodes)) for H in hypergraphs]
    edge_counts = [len(list(H.edges)) for H in hypergraphs]
    max_rows = np.max(node_counts)
    max_cols = np.max(edge_counts)

    # Destination folder for this dataset's images
    output_folder = os.path.join(output_base_folder, dataset_name, 'train')
    os.makedirs(output_folder, exist_ok=True)

    for i, H in enumerate(hypergraphs):
        for j in range(5):  # Save each matrix multiple times with different shuffles
            # A fresh dense matrix is built each time, so the in-place
            # shuffle never touches a previously saved copy
            dense = H.incidence_matrix().todense()
            padded = pad_matrix(shuffle_matrix(dense), max_rows, max_cols)

            image_path = os.path.join(output_folder, f'matrix_{i}_{j}.png')
            matrix_to_image(padded, image_path)

# Location of the pickled datasets and root of the generated image tree
input_folder = '../data'
output_base_folder = 'diffusion/data'

# Convert every .pkl file in the input folder; the file stem is the dataset name
pkl_paths = Path(input_folder).glob('*.pkl')
for file_path in pkl_paths:
    dataset_name = file_path.stem
    process_dataset(dataset_name, input_folder, output_base_folder)