In [31]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from torchvision.utils import save_image
from PIL import Image
import os
from pathlib import Path

# Define the VAE
class VAE(nn.Module):
    def __init__(self, latent_dim, img_shape):
        super(VAE, self).__init__()
        self.latent_dim = latent_dim
        self.img_shape = img_shape

        # Encoder
        self.encoder = nn.Sequential(
            nn.Linear(int(torch.prod(torch.tensor(img_shape))), 512),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(512, 256),
            nn.LeakyReLU(0.2, inplace=True),
        )
        
        self.fc_mu = nn.Linear(256, latent_dim)
        self.fc_var = nn.Linear(256, latent_dim)

        # Decoder
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 256),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(256, 512),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(512, int(torch.prod(torch.tensor(img_shape)))),
            nn.Tanh()
        )

    def encode(self, x):
        x = self.encoder(x)
        mu = self.fc_mu(x)
        log_var = self.fc_var(x)
        return mu, log_var

    def reparameterize(self, mu, log_var):
        std = torch.exp(0.5 * log_var)
        eps = torch.randn_like(std)
        return mu + eps * std

    def decode(self, z):
        return self.decoder(z)

    def forward(self, x):
        x = x.view(x.size(0), -1)
        mu, log_var = self.encode(x)
        z = self.reparameterize(mu, log_var)
        return self.decode(z), mu, log_var

In [32]:
class ImageDataset(Dataset):
    def __init__(self, folder_path):
        self.folder_path = folder_path
        self.image_files = [f for f in os.listdir(folder_path) if f.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff'))]
        
        self.transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize([0.5], [0.5])  # This works for both grayscale and color images
        ])

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        img_name = os.path.join(self.folder_path, self.image_files[idx])
        image = Image.open(img_name)
        image = self.transform(image)
        return image

def load_dataset(folder_name, batch_size=32):
    dataset = ImageDataset(folder_path=folder_name)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True)

In [33]:
def train_vae(folder_name, num_epochs=100, latent_dim=100, initial_lr=0.0002, model_save_path='vae_model.pth'):
    dataloader = load_dataset(folder_name)
    
    # Get the shape of the first image to determine the input size for the VAE
    first_batch = next(iter(dataloader))
    img_shape = first_batch[0].shape
    
    vae = VAE(latent_dim, img_shape)
    
    optimizer = optim.Adam(vae.parameters(), lr=initial_lr)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs)

    # Reconstruction + KL divergence losses summed over all elements and batch
    def loss_function(recon_x, x, mu, log_var):
        BCE = nn.functional.binary_cross_entropy(recon_x, x.view(-1, int(torch.prod(torch.tensor(img_shape)))), reduction='sum')
        KLD = -0.5 * torch.sum(1 + log_var - mu.pow(2) - log_var.exp())
        return BCE + KLD

    for epoch in range(num_epochs):
        total_loss = 0
        for i, imgs in enumerate(dataloader):
            optimizer.zero_grad()
            
            recon_batch, mu, log_var = vae(imgs)
            loss = loss_function(recon_batch, imgs, mu, log_var)
            
            loss.backward()
            total_loss += loss.item()
            optimizer.step()

        scheduler.step()
        avg_loss = total_loss / len(dataloader.dataset)
        print(f'Epoch [{epoch + 1}/{num_epochs}], Average loss: {avg_loss:.4f}, LR: {scheduler.get_last_lr()[0]:.6f}')

    # Save the trained model
    torch.save({
        'vae_state_dict': vae.state_dict(),
    }, model_save_path)

In [38]:
import torch
import torchvision.transforms.functional as TF
from skimage.filters import threshold_local
import numpy as np

def adaptive_threshold(tensor, block_size=35, offset=10):
    # Convert torch tensor to numpy array
    np_image = tensor.squeeze().cpu().numpy()
    
    # Ensure the image is in the correct range [0, 255]
    np_image = (np_image - np_image.min()) / (np_image.max() - np_image.min()) * 255
    np_image = np_image.astype(np.uint8)
    
    # Apply adaptive thresholding
    # thresh = threshold_local(np_image, block_size, offset=offset)
    binary = np_image > 100
    
    # Convert back to torch tensor
    binary_tensor = torch.from_numpy(binary.astype(np.float32)).unsqueeze(0)
    
    return binary_tensor

def sample_from_vae(model_path, num_samples, latent_dim=100, img_shape=(1, 28, 28), device='cpu'):
    vae = VAE(latent_dim, img_shape)
    checkpoint = torch.load(model_path)
    vae.load_state_dict(checkpoint['vae_state_dict'])
    vae.eval()
    vae.to(device)
    
    samples = []
    with torch.no_grad():
        z = torch.randn(num_samples, latent_dim, device=device)
        gen_imgs = vae.decode(z)
        samples = [adaptive_threshold(img.cpu().view(img_shape)) for img in gen_imgs]
    
    return samples

def save_samples(samples, folder_path):
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
    
    for i, sample in enumerate(samples):
        save_image(sample, os.path.join(folder_path, f'sample_{i}.png'))

In [21]:
import os
from pathlib import Path
import torch.optim as optim
from torch.optim.lr_scheduler import CosineAnnealingLR

def train_all_datasets(base_data_folder, base_model_folder, num_epochs=100, latent_dim=100, initial_lr=0.0002):
    for dataset_folder in Path(base_data_folder).iterdir():
        if dataset_folder.is_dir():
            dataset_name = dataset_folder.name
            train_folder = dataset_folder / 'train'
            
            if train_folder.exists():
                model_save_path = os.path.join(base_model_folder, f'{dataset_name}_vae.pth')
                
                print(f"Training VAE for dataset: {dataset_name}")
                
                train_vae(
                    folder_name=str(train_folder),
                    num_epochs=num_epochs,
                    latent_dim=latent_dim,
                    initial_lr=initial_lr,
                    model_save_path=model_save_path
                )
                print(f"VAE for {dataset_name} saved to {model_save_path}")
            else:
                print(f"Warning: Train folder not found for {dataset_name}")

# Usage
base_data_folder = 'data'
base_model_folder = 'models'
train_all_datasets(base_data_folder, base_model_folder, num_epochs=100, initial_lr=0.0002)

Training DCGAN for dataset: hypergraphErdosRenyi
Epoch [1/100], D_loss: 0.3722, G_loss: 0.6743, LR: 0.000200
Epoch [2/100], D_loss: 0.5198, G_loss: 0.8616, LR: 0.000200
Epoch [3/100], D_loss: 0.6752, G_loss: 0.3070, LR: 0.000200
Epoch [4/100], D_loss: 0.8007, G_loss: 0.2326, LR: 0.000199
Epoch [5/100], D_loss: 0.7187, G_loss: 0.2946, LR: 0.000199
Epoch [6/100], D_loss: 0.5864, G_loss: 0.9891, LR: 0.000198
Epoch [7/100], D_loss: 0.8120, G_loss: 0.9363, LR: 0.000198
Epoch [8/100], D_loss: 0.6232, G_loss: 1.6396, LR: 0.000197
Epoch [9/100], D_loss: 0.7723, G_loss: 0.3278, LR: 0.000196
Epoch [10/100], D_loss: 0.6949, G_loss: 0.5316, LR: 0.000195
Epoch [11/100], D_loss: 0.7064, G_loss: 0.6541, LR: 0.000194
Epoch [12/100], D_loss: 0.6721, G_loss: 0.5889, LR: 0.000193
Epoch [13/100], D_loss: 0.6195, G_loss: 0.8037, LR: 0.000192
Epoch [14/100], D_loss: 0.6921, G_loss: 0.6807, LR: 0.000190
Epoch [15/100], D_loss: 0.6894, G_loss: 0.9438, LR: 0.000189
Epoch [16/100], D_loss: 0.7983, G_loss: 0.321

Epoch [33/100], D_loss: 0.6513, G_loss: 0.6947, LR: 0.000151
Epoch [34/100], D_loss: 0.6991, G_loss: 0.6729, LR: 0.000148
Epoch [35/100], D_loss: 0.6625, G_loss: 0.6736, LR: 0.000145
Epoch [36/100], D_loss: 0.6827, G_loss: 0.6775, LR: 0.000143
Epoch [37/100], D_loss: 0.6482, G_loss: 0.7159, LR: 0.000140
Epoch [38/100], D_loss: 0.6833, G_loss: 0.6737, LR: 0.000137
Epoch [39/100], D_loss: 0.6859, G_loss: 0.7803, LR: 0.000134
Epoch [40/100], D_loss: 0.7081, G_loss: 0.6483, LR: 0.000131
Epoch [41/100], D_loss: 0.6873, G_loss: 0.6469, LR: 0.000128
Epoch [42/100], D_loss: 0.6828, G_loss: 0.8553, LR: 0.000125
Epoch [43/100], D_loss: 0.6960, G_loss: 0.7647, LR: 0.000122
Epoch [44/100], D_loss: 0.6705, G_loss: 0.6731, LR: 0.000119
Epoch [45/100], D_loss: 0.6841, G_loss: 0.7493, LR: 0.000116
Epoch [46/100], D_loss: 0.6746, G_loss: 0.7635, LR: 0.000113
Epoch [47/100], D_loss: 0.7120, G_loss: 1.1470, LR: 0.000109
Epoch [48/100], D_loss: 0.6864, G_loss: 0.7164, LR: 0.000106
Epoch [49/100], D_loss: 

In [39]:
def load_and_generate_samples(base_model_folder, base_data_folder, num_samples=10, latent_dim=100):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    for model_path in Path(base_model_folder).glob('*dcgan.pth'):
        dataset_name = model_path.stem[:-6]  # Remove '_dcgan' from the name
        train_folder = Path(base_data_folder) / dataset_name / 'train'
        
        if not train_folder.exists():
            print(f"Warning: Train folder not found for {dataset_name}")
            continue
        
        # Get dimensions from the first image in the train folder
        first_image_path = next(train_folder.glob('*.*'), None)
        if first_image_path is None:
            print(f"Warning: No images found in train folder for {dataset_name}")
            continue
        
        with Image.open(first_image_path) as img:
            channels = len(img.getbands())
            width, height = img.size
        
        print(f"Processing dataset: {dataset_name}")
        print(f"Image dimensions: {channels}x{height}x{width}")
        
        generated_images = sample_from_gan(model_path, num_samples, latent_dim, img_shape=(1, height, width))
        
        # Save samples
        output_folder = Path(base_data_folder) / dataset_name / 'generated_samples_dcgan'
        os.makedirs(output_folder, exist_ok=True)
        
        for i, img in enumerate(generated_images):
            save_image(img, str(output_folder / f"sample_{i+1}.png"), normalize=True)
        
        print(f"Generated samples saved in {output_folder}")

# Usage
latent_dim=100
base_model_folder = 'models'
base_data_folder = 'data'
load_and_generate_samples(base_model_folder, base_data_folder, num_samples=5)

Processing dataset: hypergraphErdosRenyi
Image dimensions: 1x32x117
Generated samples saved in data/hypergraphErdosRenyi/generated_samples_dcgan
Processing dataset: hypergraphSBM
Image dimensions: 1x32x104
Generated samples saved in data/hypergraphSBM/generated_samples_dcgan


  checkpoint = torch.load(model_path)


In [15]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(count_parameters(model))

3101953


hypergraphErdosRenyi
Metrics for dataset hypergraphErdosRenyi:
NodeNumDiff: 0.0
NodeDegreeDistrWasserstein: 56.65078125000001
EdgeSizeDistrWasserstein: 15.150107649212696
Spectral: 0.4784787119201299
CentralityCloseness: 0.41243045405411494
CentralityBetweenness: 0.008582975109095349
CentralityHarmonic: 55.62403072745771
Uniqueness: 1.0
Novelty: 1.0


hypergraphEgo
Metrics for dataset hypergraphEgo:
NodeNumDiff: 24.5
NodeDegreeDistrWasserstein: 43.69269389185671
EdgeSizeDistrWasserstein: 69.42772737056265
Spectral: 0.2543679135663608
CentralityCloseness: 0.03370621793060704
CentralityBetweenness: 0.0006188258000384554
CentralityHarmonic: 20.650962036149703
Uniqueness: 1.0
Novelty: 1.0
ValidEgo: 0.0


hypergraphSBM
Metrics for dataset hypergraphSBM:
NodeNumDiff: 0.3
NodeDegreeDistrWasserstein: 50.26655412066247
EdgeSizeDistrWasserstein: 14.860459433040077
Spectral: 0.4041841979537313
CentralityCloseness: 0.43757615606593847
CentralityBetweenness: 0.014774737305912942
CentralityHarmonic: