In [12]:
# Clean GPU memory
import torch
import gc

if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.synchronize()
    print("✓ GPU cache cleared")

gc.collect()
print("✓ Memory cleaned")

# Clone fresh repo
!rm -rf DDN-training-classic
!git clone https://github.com/EmreDinc10/DDN-training-classic.git
%cd DDN-training-classic
!pip install -r requirements.txt
print("✓ Repo cloned and dependencies installed")

✓ GPU cache cleared
✓ Memory cleaned
Cloning into 'DDN-training-classic'...
remote: Enumerating objects: 85, done.[K
remote: Counting objects: 100% (85/85), done.[K
remote: Compressing objects: 100% (68/68), done.[Kompressing objects:  16% (11/68)[K
remote: Total 85 (delta 25), reused 75 (delta 15), pack-reused 0 (from 0)[K
Receiving objects: 100% (85/85), 127.73 KiB | 2.78 MiB/s, done.
Resolving deltas: 100% (25/25), done.
/content/DDN-training-classic/DDN-training-classic
Collecting git+https://github.com/openai/CLIP.git (from -r requirements.txt (line 9))
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-47af_9c6
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-47af_9c6
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25l[?25hdone
✓ Repo cloned and dependencies installed


In [13]:
# Create MNIST dataset as directory (no ZIP issues)
import torchvision
from torchvision.datasets import MNIST
import os
from PIL import Image

# Clean up
!rm -rf datasets/mnist-32x32
!rm -f datasets/mnist-*.zip

# Create directory
os.makedirs("datasets/mnist-32x32", exist_ok=True)

# Download MNIST
print("Downloading MNIST...")
mnist_train = MNIST(root="datasets/mnist_temp", train=True, download=True, transform=None)

# Save images directly to directory (32x32)
print("Converting to images (32x32)...")
for idx in range(len(mnist_train)):
    img, label = mnist_train[idx]
    img_rgb = img.convert('RGB').resize((32, 32), Image.LANCZOS)
    # Save directly to directory
    img_rgb.save(f"datasets/mnist-32x32/{idx:05d}_{label}.png")
    if (idx + 1) % 10000 == 0:
        print(f"  Processed {idx + 1}/{len(mnist_train)}")

# Cleanup temp
!rm -rf datasets/mnist_temp
print(f"✓ Dataset ready: datasets/mnist-32x32/ ({len(mnist_train)} images)")

Downloading MNIST...


100%|██████████| 9.91M/9.91M [00:00<00:00, 42.0MB/s]
100%|██████████| 28.9k/28.9k [00:00<00:00, 1.05MB/s]
100%|██████████| 1.65M/1.65M [00:00<00:00, 9.68MB/s]
100%|██████████| 4.54k/4.54k [00:00<00:00, 9.84MB/s]


Converting to images (32x32)...
  Processed 10000/60000
  Processed 20000/60000
  Processed 30000/60000
  Processed 40000/60000
  Processed 50000/60000
  Processed 60000/60000
✓ Dataset ready: datasets/mnist-32x32/ (60000 images)


In [24]:
import os
import torch
import dnnlib
from training import training_loop

# NOTE(review): the recorded run of this cell stopped with
# "TypeError: unsupported operand type(s) for +: 'NoneType' and 'str'" while the
# network summary was being printed. No traceback was captured, so the failing
# line is unknown from here — TODO capture the full traceback and confirm.

# Single-process rendezvous settings; values already present in the
# environment are left untouched.
for env_key, env_default in (
    ("MASTER_ADDR", "localhost"),
    ("MASTER_PORT", "29500"),
    ("RANK", "0"),
    ("LOCAL_RANK", "0"),
    ("WORLD_SIZE", "1"),
):
    os.environ.setdefault(env_key, env_default)

# Create the one-rank process group once per kernel session.
if not torch.distributed.is_initialized():
    cuda_ok = torch.cuda.is_available()
    torch.distributed.init_process_group(
        backend="nccl" if cuda_ok else "gloo",
        init_method="env://",
        world_size=1,
        rank=0,
    )
    if cuda_ok:
        torch.cuda.set_device(0)
        print(f"✓ Using GPU: {torch.cuda.get_device_name(0)}")

# Training configuration. The dataset is read from a directory, not a ZIP.
c = dnnlib.EasyDict(
    dataset_kwargs=dnnlib.EasyDict(
        class_name="training.dataset.ImageFolderDataset",
        path="datasets/mnist-32x32",  # directory path, not ZIP
        use_labels=False,
        xflip=False,
        cache=True,
    ),
    # prefetch_factor is deliberately not passed because num_workers=0.
    data_loader_kwargs=dnnlib.EasyDict(pin_memory=True, num_workers=0),
    network_kwargs=dnnlib.EasyDict(
        class_name="training.networks.DDNPrecond",
        model_type="PHDDN",
        augment_dim=0,
        dropout=0.0,
        use_fp16=True,
    ),
    loss_kwargs=dnnlib.EasyDict(class_name="training.loss.DDNLoss"),
    optimizer_kwargs=dnnlib.EasyDict(
        class_name="torch.optim.Adam", lr=1e-4, betas=[0.9, 0.999], eps=1e-8
    ),
    batch_size=32,
    batch_gpu=32,
    total_kimg=10,
    ema_halflife_kimg=0,
    loss_scaling=1.0,
    cudnn_benchmark=True,
    kimg_per_tick=1,
    snapshot_ticks=5,
    state_dump_ticks=10,
    run_dir="/content/training-runs/mnist-test",
)
os.makedirs(c.run_dir, exist_ok=True)

# Class-level switches on the sddn output layer, set before training starts.
import sddn
sddn.DiscreteDistributionOutput.learn_residual = True
sddn.DiscreteDistributionOutput.chain_dropout = 0.05

# Instantiate the dataset once to check it loads and to copy its resolution
# and length into the dataset kwargs; the probe object is then discarded.
probe_dataset = dnnlib.util.construct_class_by_name(**c.dataset_kwargs)
probe_res = probe_dataset.resolution
probe_len = len(probe_dataset)
c.dataset_kwargs.resolution = probe_res
c.dataset_kwargs.max_size = probe_len
print(f"✓ Dataset: {probe_dataset.name}, {probe_res}x{probe_res}, {probe_len} images")
del probe_dataset

# Summary banner printed before the run starts.
rule = "=" * 60
print("\n".join([
    "\n" + rule,
    "STARTING TRAINING",
    rule,
    f"Dataset: {c.dataset_kwargs.path}",
    f"Output: {c.run_dir}",
    f"Batch size: {c.batch_size}",
    f"Total kimg: {c.total_kimg}",
    rule + "\n",
]))

# Run training with every config entry passed as a keyword argument.
training_loop.training_loop(**c)

✓ Dataset: mnist-32x32, 32x32, 60000 images

STARTING TRAINING
Dataset: datasets/mnist-32x32
Output: /content/training-runs/mnist-test
Batch size: 32
Total kimg: 10

Loading dataset...
Constructing network...

DDNPrecond              Parameters  Buffers  Output shape  Datatype
---                     ---         ---      ---           ---     
model.block_1x1_0       1377792     2        -             -       
model.block_2x2_0_up    1443584     10       -             -       
model.block_2x2_1       1377792     2        -             -       
model.block_4x4_0_up    1443584     10       -             -       
model.block_4x4_1       1377792     2        -             -       
model.block_4x4_2       1377792     2        -             -       
model.block_4x4_3       1377792     2        -             -       
model.block_8x8_0_up    1443584     10       -             -       
model.block_8x8_1       1377792     2        -             -       
model.block_8x8_2       1377792     2     

TypeError: unsupported operand type(s) for +: 'NoneType' and 'str'

In [23]:
!cd /content/DDN-training-classic && git pull

remote: Enumerating objects: 15, done.[K
remote: Counting objects: 100% (15/15), done.[K
remote: Compressing objects: 100% (6/6), done.[K
remote: Total 12 (delta 9), reused 9 (delta 6), pack-reused 0 (from 0)[K
Unpacking objects: 100% (12/12), 1.63 KiB | 418.00 KiB/s, done.
From https://github.com/EmreDinc10/DDN-training-classic
   4fb9e34..1399125  main       -> origin/main
Updating 4fb9e34..1399125
Fast-forward
 training/training_loop.py | 34 [32m+++++++++++++++++++++++++++++[m[31m-----[m
 1 file changed, 29 insertions(+), 5 deletions(-)
