In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import torch.nn.functional as F
# Per-channel (mean, std) handed to Normalize in both pipelines.
NORM_MEAN = (0.4914, 0.4822, 0.4465)
NORM_STD = (0.247, 0.243, 0.261)

# Training pipeline: convert to a tensor and normalise first, then apply the
# random augmentations (horizontal flip plus a small translate / shear /
# rescale with no rotation) to the already-normalised tensor.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(NORM_MEAN, NORM_STD),
    transforms.RandomHorizontalFlip(),
    transforms.RandomAffine(degrees=0, translate=(0.05, 0.05), shear=5, scale=(0.8, 1.2)),
])

# Test pipeline: tensor conversion and normalisation only, no augmentation.
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(NORM_MEAN, NORM_STD),
])

bs = 512

# Batch size and worker / host-to-device transfer options shared by both loaders.
loader_opts = dict(
    batch_size=bs,
    num_workers=8,
    pin_memory=True,
    persistent_workers=True,
    prefetch_factor=2,
)

# Only the training loader shuffles; the test loader keeps dataset order.
trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, shuffle=True, **loader_opts)
testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform_test)
testloader = torch.utils.data.DataLoader(testset, shuffle=False, **loader_opts)



class RecurrentBlock(nn.Module):
    """Small conv feature extractor followed by a single-step LSTM and a linear head.

    The forward pass applies two unpadded 3x3 convolutions with ELU, a max-pool
    and dropout, flattens the result into a sequence of length 1, runs it
    through the LSTM and maps the LSTM output to 10 logits via two linear layers.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 16, 3)
        self.conv2 = nn.Conv2d(16, 32, 3)
        self.pool1 = nn.MaxPool2d(2, 2, padding=1)
        self.dropout1 = nn.Dropout(0.2)
        # 7200 has to match the flattened pooled feature map; for a 3x32x32
        # input that is 32 channels * 15 * 15.
        self.lstm = nn.LSTM(7200, 1024, batch_first=True)
        self.fc1 = nn.Linear(1024, 2048)
        self.fc2 = nn.Linear(2048, 10)

    def forward(self, x):
        """Return class logits of shape (batch, 10) for the image batch `x`."""
        features = x
        for conv in (self.conv1, self.conv2):
            features = F.elu(conv(features))
        features = self.dropout1(self.pool1(features))

        # The whole feature map becomes one timestep: (batch, 1, features).
        sequence = features.view(x.size(0), 1, -1)
        lstm_out, _ = self.lstm(sequence)

        # NOTE(review): fc1 and fc2 are applied back to back with no activation
        # in between -- confirm that this is intended.
        hidden = self.fc1(lstm_out[:, -1, :])
        return self.fc2(hidden)

# Setup device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

model = RecurrentBlock().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


def count_correct(net, loader):
    """Switch `net` to eval mode and return (correct, total) counts over `loader`."""
    net.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            # Predicted class = index of the largest logit in each row.
            _, predicted = net(images).max(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    return correct, total


# Training
for epoch in range(50):
    model.train()
    for images, labels in trainloader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        loss = criterion(model(images), labels)
        loss.backward()
        optimizer.step()
    # The reported value is the loss of the last mini-batch of the epoch only.
    print(f'Epoch {epoch+1}, Loss: {loss.item():.4f}')

    # Evaluation every fifth epoch (5, 10, ..., 50)
    if (epoch + 1) % 5 == 0:
        correct, total = count_correct(model, testloader)
        print(f'Accuracy: {100 * correct / total:.2f}%')

Using device: cuda
Epoch 1, Loss: 1.3597
Epoch 2, Loss: 1.0364
Epoch 3, Loss: 1.1021
Epoch 4, Loss: 0.9369
Epoch 5, Loss: 0.7919
Accuracy: 71.50%
Epoch 6, Loss: 0.7472
Epoch 7, Loss: 0.7629
Epoch 8, Loss: 0.6519


KeyboardInterrupt: 

Accuracy: 64.63%
Epoch 42, Loss: 0.3226
Epoch 43, Loss: 0.2490
Epoch 44, Loss: 0.2770
Epoch 45, Loss: 0.2752
Epoch 46, Loss: 0.3371
Accuracy: 64.41%
Epoch 47, Loss: 0.2480
Epoch 48, Loss: 0.2738
Epoch 49, Loss: 0.2071
Epoch 50, Loss: 0.1946

## One-step gradient (truncated BPTT, K = 1)

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms

# Data loading
def _base_steps():
    """Return the steps common to both pipelines: tensor conversion + normalisation."""
    return [
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.247, 0.243, 0.261)),
    ]


def _build_loader(dataset, batch_size, shuffle):
    """Create a DataLoader over `dataset` with the worker settings used in this cell."""
    return torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=8,
        pin_memory=True,
        persistent_workers=True,
        prefetch_factor=2,
    )


# Training images are additionally flipped and slightly translated / sheared /
# rescaled at random; test images only get the base steps.
transform = transforms.Compose(_base_steps() + [
    transforms.RandomHorizontalFlip(),
    transforms.RandomAffine(degrees=0, translate=(0.05, 0.05), shear=5, scale=(0.8, 1.2)),
])
transform_test = transforms.Compose(_base_steps())

# Training uses shuffled batches of 512, evaluation ordered batches of 64.
trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
trainloader = _build_loader(trainset, batch_size=512, shuffle=True)
testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform_test)
testloader = _build_loader(testset, batch_size=64, shuffle=False)

# RNN model trained with a "1-step gradient" (truncated BPTT, K=1):
# - the first T-1 timesteps run under no_grad and only produce the state (h, c)
# - the final timestep runs with gradients enabled and produces the logits
class RecurrentBlock(nn.Module):
    """LSTM classifier that reads each sample as 32 timesteps of 96 values.

    In training mode only the last timestep is part of the autograd graph, so
    the backward pass covers a single LSTM step plus the linear head. In eval
    mode the whole sequence is processed in one ordinary LSTM call.
    """

    def __init__(self):
        super().__init__()
        self.hidden_size = 128 * 8  # 1024
        self.input_size = 96        # 3 * 32
        self.seq_len = 32

        self.rnn = nn.LSTM(self.input_size, self.hidden_size, batch_first=True)
        self.fc = nn.Linear(self.hidden_size, 10)

    def forward(self, x):
        """Return logits of shape (batch, 10).

        Args:
            x: tensor holding seq_len * input_size (= 3072) values per sample,
               e.g. a (batch, 3, 32, 32) image batch. It does not have to be
               contiguous in memory.
        """
        # reshape instead of view: view raises on non-contiguous inputs (for
        # example a permuted or channels-last batch), reshape copies when needed.
        # NOTE(review): for a (batch, 3, 32, 32) input each timestep is 96
        # consecutive values of the flattened (C, H, W) tensor, i.e. three rows
        # of one channel rather than one row across all channels -- confirm
        # this is the intended sequence layout.
        seq = x.reshape(x.size(0), self.seq_len, self.input_size)

        if self.training:
            # First T-1 steps without tracking gradients. Tensors created under
            # no_grad are not attached to any graph, so the returned state can
            # be fed to the last step directly (no extra detach required).
            with torch.no_grad():
                _, state = self.rnn(seq[:, :-1, :])  # each of (h, c): (num_layers, B, H)
            # Last step with gradients, starting from the gradient-free state.
            out_last, _ = self.rnn(seq[:, -1:, :], state)  # (B, 1, H)
        else:
            # In eval, just run the full sequence normally.
            out_last, _ = self.rnn(seq)  # (B, T, H)

        # Classify from the output of the last timestep.
        return self.fc(out_last[:, -1, :])

# Setup device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

model = RecurrentBlock().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


def tally_predictions(net, loader):
    """Put `net` in eval mode and return (correct, total) counts over `loader`."""
    net.eval()
    correct = total = 0
    with torch.no_grad():
        for batch_images, batch_labels in loader:
            batch_images = batch_images.to(device)
            batch_labels = batch_labels.to(device)
            # Index of the largest logit per row is the predicted class.
            _, predicted = net(batch_images).max(dim=1)
            total += batch_labels.size(0)
            correct += (predicted == batch_labels).sum().item()
    return correct, total


# Training
for epoch in range(50):
    model.train()
    for images, labels in trainloader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        # In training mode the model keeps gradients for the last timestep only.
        loss = criterion(model(images), labels)
        loss.backward()
        optimizer.step()
    # The printed loss belongs to the final mini-batch of the epoch.
    print(f'Epoch {epoch+1}, Loss: {loss.item():.4f}')

    # Evaluate on the 1st, 6th, 11th, ... epoch (0-based index divisible by 5).
    if epoch % 5 == 0:
        correct, total = tally_predictions(model, testloader)
        print(f'Accuracy: {100 * correct / total:.2f}%')

Using device: cuda
Epoch 1, Loss: 1.9162
Accuracy: 29.74%


KeyboardInterrupt: 

## Testing profiler


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import torch.profiler

# Data loading
# Training pipeline: tensor conversion and normalisation followed by random
# augmentation (horizontal flip, small translate / shear / rescale).
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.247, 0.243, 0.261)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomAffine(
        degrees=0,
        translate=(0.05, 0.05),
        shear=5,
        scale=(0.8, 1.2)
    ),
])
# Evaluation pipeline: same tensor conversion and normalisation but no random
# augmentation, so test accuracy is measured on unmodified images and does not
# change from run to run because of the data transform.
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.247, 0.243, 0.261)),
])
trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=512, shuffle=True, num_workers=8, pin_memory=True, persistent_workers=True, prefetch_factor=2)
# The test split must use the deterministic pipeline, not the training one.
testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform_test)
testloader = torch.utils.data.DataLoader(testset, batch_size=64, shuffle=False, num_workers=8)

# RNN model: LSTM over a 32-step view of the input, trained with gradients
# on the last timestep only.
class RecurrentBlock(nn.Module):
    """LSTM classifier that reads each sample as 32 timesteps of 96 values.

    In training mode the first T-1 timesteps run without gradient tracking and
    only the last timestep is part of the autograd graph. In eval mode the
    whole sequence is processed in one ordinary LSTM call.
    """

    def __init__(self):
        super().__init__()
        self.hidden_size = 128 * 8  # 1024
        self.input_size = 96        # 3 * 32
        self.seq_len = 32

        self.rnn = nn.LSTM(self.input_size, self.hidden_size, batch_first=True)
        self.fc = nn.Linear(self.hidden_size, 10)

    def forward(self, x):
        """Return logits of shape (batch, 10).

        Args:
            x: tensor holding seq_len * input_size (= 3072) values per sample,
               e.g. a (batch, 3, 32, 32) image batch. It does not have to be
               contiguous in memory.
        """
        # reshape instead of view: view raises on non-contiguous inputs (for
        # example a permuted or channels-last batch), reshape copies when needed.
        seq = x.reshape(x.size(0), self.seq_len, self.input_size)

        if self.training:
            # First T-1 steps without tracking gradients. Tensors created under
            # no_grad are not attached to any graph, so the returned state can
            # be passed on as-is (no extra detach required).
            with torch.no_grad():
                _, state = self.rnn(seq[:, :-1, :])  # each of (h, c): (num_layers, B, H)
            # Last step with gradients, starting from the gradient-free state.
            out_last, _ = self.rnn(seq[:, -1:, :], state)  # (B, 1, H)
        else:
            # In eval, just run the full sequence normally.
            out_last, _ = self.rnn(seq)  # (B, T, H)

        # Classify from the output of the last timestep.
        return self.fc(out_last[:, -1, :])

# Setup device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

model = RecurrentBlock().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training with profiler
# Profiler configuration: CPU and CUDA activity, a wait=1 / warmup=1 / active=3
# step schedule that runs once, and traces written under ./profiler_logs.
profiled_activities = [
    torch.profiler.ProfilerActivity.CPU,
    torch.profiler.ProfilerActivity.CUDA,
]
profile_schedule = torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=1)
trace_handler = torch.profiler.tensorboard_trace_handler('./profiler_logs')

with torch.profiler.profile(
    activities=profiled_activities,
    schedule=profile_schedule,
    on_trace_ready=trace_handler,
    record_shapes=True,
    profile_memory=True,
    with_stack=True,
) as prof:
    model.train()
    for epoch in range(2):
        for step, (images, labels) in enumerate(trainloader):
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            loss = criterion(model(images), labels)
            loss.backward()
            optimizer.step()
            # Tell the profiler that one training step has finished.
            prof.step()
            # Only the first 11 mini-batches of each epoch are run here.
            if step >= 10:
                break
        print(f'Epoch {epoch+1}, Loss: {loss.item():.4f}')

# Print profiling results
report_sections = (
    ("\nTop 20 operations by CUDA time:", "cuda_time_total"),
    ("\nTop 20 operations by CPU time:", "cpu_time_total"),
)
for heading, sort_key in report_sections:
    print(heading)
    print(prof.key_averages().table(sort_by=sort_key, row_limit=20))

# Continue regular training
# Epoch indices 2..4, reported as epochs 3..5.
for epoch in range(2, 5):
    model.train()
    for images, labels in trainloader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        loss = criterion(model(images), labels)
        loss.backward()
        optimizer.step()
    # The printed loss is that of the last mini-batch of the epoch.
    print(f'Epoch {epoch+1}, Loss: {loss.item():.4f}')

# Evaluation
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for images, labels in testloader:
        images, labels = images.to(device), labels.to(device)
        # Predicted class = index of the largest logit in each row.
        _, predicted = model(images).max(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f'Accuracy: {100 * correct / total:.2f}%')