In [None]:
import torch

# Check if CUDA (NVIDIA GPU support) is available
print(f"CUDA Available: {torch.cuda.is_available()}")
print(f"GPU Name: {torch.cuda.get_device_name(0)}" if torch.cuda.is_available() else "No GPU detected")

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, random_split
from torchvision.datasets import FashionMNIST
from torchvision.transforms import ToTensor
from torchvision.utils import save_image
import matplotlib.pyplot as plt
import numpy as np
from sklearn.manifold import TSNE
import os

# -------------------- GPU Configuration -------------------- 
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device} (GPU: {torch.cuda.get_device_name(0)})" if torch.cuda.is_available() else "Using CPU")

# -------------------- Model Definition --------------------
class VAE(nn.Module):
    def __init__(self, latent_dim=20):
        super(VAE, self).__init__()
        self.latent_dim = latent_dim
        
        # Encoder
        self.encoder = nn.Sequential(
            nn.Conv2d(1, 32, 3, stride=2, padding=1),
            nn.ReLU(),
            nn.Conv2d(32, 64, 3, stride=2, padding=1),
            nn.ReLU(),
            nn.Flatten(),
            nn.Linear(64*7*7, 256),
            nn.ReLU(),
            nn.Linear(256, 2*latent_dim)
        )
        
        # Decoder
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 64*7*7),
            nn.ReLU(),
            nn.Unflatten(1, (64, 7, 7)),
            nn.ConvTranspose2d(64, 32, 3, stride=2, padding=1, output_padding=1),
            nn.ReLU(),
            nn.ConvTranspose2d(32, 1, 3, stride=2, padding=1, output_padding=1),
            nn.Sigmoid()
        )

    def encode(self, x):
        h = self.encoder(x)
        mu, log_var = torch.chunk(h, 2, dim=1)
        return mu, log_var

    def decode(self, z):
        return self.decoder(z)  # Now explicitly defined

    def reparameterize(self, mu, log_var):
        std = torch.exp(0.5 * log_var)
        eps = torch.randn_like(std)
        return mu + eps * std

    def forward(self, x):
        mu, log_var = self.encode(x)
        z = self.reparameterize(mu, log_var)
        x_recon = self.decode(z)
        return x_recon, mu, log_var

# -------------------- Data Loaders with GPU Pinning --------------------
train_dataset = FashionMNIST(root='data', train=True, download=True, transform=ToTensor())
test_dataset = FashionMNIST(root='data', train=False, download=True, transform=ToTensor())

# Train/Validation split
train_size = int(0.8 * len(train_dataset))
val_size = len(train_dataset) - train_size
train_dataset, val_dataset = random_split(train_dataset, [train_size, val_size])

batch_size = 256  # Larger batch size for GPU efficiency
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, 
                         pin_memory=True, num_workers=4)  # GPU-optimized loading
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, 
                       pin_memory=True, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# -------------------- Training Setup --------------------
model = VAE(latent_dim=20).to(device)  # Entire model moved to GPU
optimizer = optim.Adam(model.parameters(), lr=1e-3)

def loss_function(recon_x, x, mu, log_var):
    BCE = F.binary_cross_entropy(recon_x, x, reduction='sum') / x.size(0)
    KLD = -0.5 * torch.sum(1 + log_var - mu.pow(2) - log_var.exp()) / x.size(0)
    return BCE + KLD, BCE, KLD

# -------------------- GPU-Optimized Training Loop --------------------
epochs = 30
train_losses, val_losses = [], []

os.makedirs("samples", exist_ok=True)

for epoch in range(epochs):
    # Training phase
    model.train()
    train_loss = 0.0
    for batch_idx, (data, _) in enumerate(train_loader):
        data = data.to(device, non_blocking=True)
        optimizer.zero_grad()
        recon_batch, mu, log_var = model(data)
        loss, _, _ = loss_function(recon_batch, data, mu, log_var)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    
    # Validation phase
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for data, _ in val_loader:
            data = data.to(device, non_blocking=True)
            recon, mu, log_var = model(data)
            loss, _, _ = loss_function(recon, data, mu, log_var)
            val_loss += loss.item()
    
    # Save losses
    train_loss /= len(train_loader)
    val_loss /= len(val_loader)
    train_losses.append(train_loss)
    val_losses.append(val_loss)
    
    # Save samples every epoch
    with torch.no_grad():
        # Reconstruction samples
        test_images, _ = next(iter(test_loader))
        test_images = test_images.to(device)
        recon_images, _, _ = model(test_images)
        
        # Concatenate original and reconstructed
        comparison = torch.cat([test_images[:8], recon_images[:8]]).cpu()
        save_image(comparison, f'samples/recon_epoch_{epoch+1}.png', nrow=8)  # Now works
        
        # Generate new images
        z = torch.randn(64, model.latent_dim).to(device)
        generated = model.decode(z).cpu()
        save_image(generated, f'samples/generated_epoch_{epoch+1}.png', nrow=8)
    
    print(f'Epoch {epoch+1}/{epochs} | Train Loss: {train_loss:.3f} | Val Loss: {val_loss:.3f}')

# -------------------- GPU-Specific Performance Tips --------------------
# 1. Use mixed-precision training (uncomment below)
# from torch.cuda.amp import GradScaler, autocast
# scaler = GradScaler()
# Inside training loop:
# with autocast():
#     recon_batch, mu, log_var = model(data)
#     loss, bce, kld = loss_function(...)
# scaler.scale(loss).backward()
# scaler.step(optimizer)
# scaler.update()

# 2. Use torch.backends.cudnn.benchmark = True (if input sizes don't vary)
torch.backends.cudnn.benchmark = True

# -------------------- Visualization (Move tensors to CPU) --------------------
# Reconstructions
model.eval()
test_images, _ = next(iter(test_loader))
with torch.no_grad():
    test_images = test_images.to(device)
    recon_images, _, _ = model(test_images)
    test_images = test_images.cpu()  # Move back to CPU for plotting
    recon_images = recon_images.cpu()

# Latent Space Visualization (Partial example)
latent_vectors = []
labels = []
with torch.no_grad():
    for data, label in test_loader:
        data = data.to(device)
        mu, _ = model.encode(data)
        latent_vectors.append(mu.cpu())  # Move to CPU for sklearn
        labels.append(label)
latent_vectors = torch.cat(latent_vectors, dim=0).numpy()

Using device: cuda (GPU: NVIDIA GeForce RTX 3060 Laptop GPU)
Epoch 1/30 | Train Loss: 336.272 | Val Loss: 274.681
Epoch 2/30 | Train Loss: 265.141 | Val Loss: 259.823
Epoch 3/30 | Train Loss: 256.102 | Val Loss: 253.439
Epoch 4/30 | Train Loss: 250.792 | Val Loss: 248.937
Epoch 5/30 | Train Loss: 247.891 | Val Loss: 247.142
Epoch 6/30 | Train Loss: 246.000 | Val Loss: 245.882
Epoch 7/30 | Train Loss: 244.863 | Val Loss: 244.214
Epoch 8/30 | Train Loss: 243.779 | Val Loss: 243.652
Epoch 9/30 | Train Loss: 243.065 | Val Loss: 242.875
Epoch 10/30 | Train Loss: 242.350 | Val Loss: 242.154
Epoch 11/30 | Train Loss: 241.756 | Val Loss: 241.880
Epoch 12/30 | Train Loss: 241.251 | Val Loss: 241.523
Epoch 13/30 | Train Loss: 240.893 | Val Loss: 240.819
Epoch 14/30 | Train Loss: 240.523 | Val Loss: 240.619
Epoch 15/30 | Train Loss: 240.089 | Val Loss: 240.307
Epoch 16/30 | Train Loss: 239.788 | Val Loss: 239.946
Epoch 17/30 | Train Loss: 239.598 | Val Loss: 240.027
Epoch 18/30 | Train Loss: 239.