In [None]:
# ============================================================================
# CELL 0: Check GPU Availability
# ============================================================================
# IMPORTANT: Enable GPU first!
# Go to: Runtime -> Change runtime type -> Hardware accelerator -> GPU

import torch

print("="*60)
print("GPU CHECK")
print("="*60)
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU device: {torch.cuda.get_device_name(0)}")
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
    print("✓ GPU is ready!")
else:
    print("⚠ WARNING: No GPU detected!")
    print("Please enable GPU: Runtime -> Change runtime type -> GPU")
    print("Then restart runtime and run this cell again.")
print("="*60)

Cloning into 'DDN-training-classic'...
remote: Enumerating objects: 60, done.[K
remote: Counting objects: 100% (60/60), done.[K
remote: Compressing objects: 100% (52/52), done.[K
remote: Total 60 (delta 9), reused 57 (delta 6), pack-reused 0 (from 0)[K
Receiving objects: 100% (60/60), 117.53 KiB | 7.35 MiB/s, done.
Resolving deltas: 100% (9/9), done.
/content/DDN-training-classic
Collecting git+https://github.com/openai/CLIP.git (from -r requirements.txt (line 9))
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-x8jw98it
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-x8jw98it
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sddn>=0.0.2 (from -r requirements.txt (line 1))
  Downloading sddn-0.0.2-py3-none-any.whl.metadata (606 bytes)
Collecting boxx>=0.10.16 (from -r requirements.txt (line 2))


In [None]:
# ============================================================================
# CELL 1: Setup
# ============================================================================
# Make sure GPU is enabled (check previous cell output)

!git clone https://github.com/EmreDinc10/DDN-training-classic.git
%cd DDN-training-classic
!pip install -r requirements.txt

# Verify GPU again after setup
import torch
if not torch.cuda.is_available():
    print("⚠ ERROR: GPU still not available. Please enable GPU in Runtime settings!")
    raise RuntimeError("GPU required for training")

print("✓ Setup complete!")


In [2]:
# ============================================================================
# CELL 2: Prepare MNIST Dataset
# ============================================================================
import torchvision
from torchvision.datasets import MNIST
import os

# Create datasets directory
os.makedirs("datasets", exist_ok=True)
os.makedirs("datasets/mnist_temp", exist_ok=True)

# Download MNIST and save as images
print("Downloading MNIST dataset...")
mnist_train = MNIST(root="datasets/mnist_temp", train=True, download=True, transform=None)

# Save images to temporary directory
print("Converting MNIST to images...")
for idx in range(len(mnist_train)):
    img, label = mnist_train[idx]
    # Convert to RGB (3 channels)
    img_rgb = img.convert('RGB')
    img_rgb.save(f"datasets/mnist_temp/{idx:05d}_{label}.png")

# Create ZIP file using dataset_tool.py
print("Creating ZIP dataset...")
!python dataset_tool.py --source=datasets/mnist_temp --dest=datasets/mnist-28x28.zip

# Cleanup
!rm -rf datasets/mnist_temp
print("✓ MNIST dataset ready at: datasets/mnist-28x28.zip")

Downloading MNIST dataset...


100%|██████████| 9.91M/9.91M [00:00<00:00, 133MB/s]
100%|██████████| 28.9k/28.9k [00:00<00:00, 36.2MB/s]
100%|██████████| 1.65M/1.65M [00:00<00:00, 26.4MB/s]
100%|██████████| 4.54k/4.54k [00:00<00:00, 7.48MB/s]


Converting MNIST to images...
Creating ZIP dataset...
  0% 0/60000 [00:00<?, ?it/s]
Error: Image width/height after scale and crop are required to be power-of-two
✓ MNIST dataset ready at: datasets/mnist-28x28.zip


In [3]:
# ============================================================================
# CELL 3: Configure and Run Training
# ============================================================================
import os

# Set configuration for MNIST
os.environ["DATA_PATH"] = "datasets/mnist-28x28.zip"
os.environ["OUTDIR"] = "/content/training-runs/mnist-test"
os.environ["BATCH_SIZE"] = "32"
os.environ["BATCH_GPU"] = "32"
os.environ["TOTAL_KIMG"] = "10"  # 10k images = ~10 minutes

print("Configuration:")
print(f"  Dataset: {os.environ['DATA_PATH']}")
print(f"  Output: {os.environ['OUTDIR']}")
print(f"  Batch size: {os.environ['BATCH_SIZE']}")
print(f"  Training duration: {os.environ['TOTAL_KIMG']}k images (~{int(os.environ['TOTAL_KIMG']) * 0.5:.1f} min)")

# Run training
!python train_minimal.py

Configuration:
  Dataset: datasets/mnist-28x28.zip
  Output: /content/training-runs/mnist-test
  Batch size: 32
  Training duration: 10k images (~5.0 min)
Traceback (most recent call last):
  File "/content/DDN-training-classic/train_minimal.py", line 13, in <module>
    dist.init()
  File "/content/DDN-training-classic/torch_utils/distributed.py", line 28, in init
    torch.distributed.init_process_group(backend=backend, init_method="env://")
  File "/usr/local/lib/python3.12/dist-packages/torch/distributed/c10d_logger.py", line 81, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/distributed/c10d_logger.py", line 95, in wrapper
    func_return = func(*args, **kwargs)
                  ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/distributed/distributed_c10d.py", line 1769, in init_process_group
    default_pg, _ = _new_process_group_helper(
                    ^^^^^^^^^^^^