In [None]:
from itertools import islice

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.profiler import profile, record_function, ProfilerActivity
from torch.utils.data import DataLoader, TensorDataset
from torchvision import models
from torchvision.datasets import CIFAR10
from torchvision.models import resnet18
from tqdm import tqdm

In [None]:
# Set device: use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define transforms
# Training images get random-crop / horizontal-flip augmentation; test images do not.
# Both are normalised per channel with mean 0.5 and std 0.5.
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

# Load CIFAR-10 dataset (downloaded into ./data on first use)
trainset = CIFAR10(root='./data', train=True, download=True, transform=transform_train)
trainloader = DataLoader(trainset, batch_size=16, shuffle=True, num_workers=2)

testset = CIFAR10(root='./data', train=False, download=True, transform=transform_test)
testloader = DataLoader(testset, batch_size=16, shuffle=False, num_workers=2)

# Build an untrained (randomly initialised) ResNet-18 with a 10-class output layer.
# No weights argument is passed: the default on every torchvision release is
# "no pre-trained weights", and omitting the legacy `pretrained=` keyword avoids
# the deprecation warning newer releases emit for it.
model = resnet18(num_classes=10).to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

# Training function
def train(epoch):
    """Run one full pass over ``trainloader``, updating ``model`` in place.

    Uses the notebook-level ``model``, ``trainloader``, ``criterion``,
    ``optimizer`` and ``device``. The progress bar description shows the mean
    loss over the batches processed so far.

    Args:
        epoch: Zero-based epoch index; displayed one-based in the progress bar.
    """
    model.train()
    loss_sum = 0.0
    progress = tqdm(enumerate(trainloader), total=len(trainloader))
    for step, (images, labels) in progress:
        images = images.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        loss = criterion(model(images), labels)
        loss.backward()
        optimizer.step()

        loss_sum += loss.item()
        # step is zero-based, so step + 1 is the number of batches seen so far.
        mean_loss = loss_sum / (step + 1)
        progress.set_description(f'Epoch {epoch+1}, Loss: {mean_loss:.4f}')

# Number of epochs
num_epochs = 1

# Profile training loop
# Only ask for CUDA events when a GPU is actually present, so the same cell
# runs cleanly on CPU-only machines.
profiler_activities = [ProfilerActivity.CPU]
if torch.cuda.is_available():
    profiler_activities.append(ProfilerActivity.CUDA)

# NOTE(review): no profiler schedule is configured here, so the whole of every
# epoch is recorded; keep num_epochs small to bound memory use and runtime.
with profile(activities=profiler_activities, record_shapes=True) as prof:
    for epoch in range(num_epochs):
        train(epoch)

# Show the ten operators with the largest total CPU time.
print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))

# Test the network
def test():
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for data in testloader:
            images, labels = data
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    print('Accuracy of the network on the 10000 test images: %d %%' % (100 * correct / total))

# Evaluate the model trained above on the CIFAR-10 test split.
test()

In [None]:
# Prepare a synthetic dataset
batch_size = 32
num_samples = 1000
num_classes = 1000  # Shared by the random labels and the model's output layer
input_size = (3, 224, 224)  # Typical input size for ResNet18

# Generate random data
inputs = torch.randn(num_samples, *input_size)
targets = torch.randint(0, num_classes, (num_samples,))

dataset = TensorDataset(inputs, targets)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Initialize an untrained ResNet18 model.
# NOTE: this rebinds `model`, `criterion` and `optimizer` from the CIFAR-10 cell.
# The legacy `pretrained=` keyword is omitted; the default is already
# "no pre-trained weights" and newer torchvision warns when it is passed.
model = resnet18(num_classes=num_classes)
model.train()

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

def train_with_profiler(dataloader, model, criterion, optimizer,
                        max_batches=6, schedule=None, on_trace_ready=None):
    """Train for a few batches under ``torch.profiler`` and return the profiler.

    Args:
        dataloader: Iterable yielding ``(inputs, targets)`` batches.
        model: Callable mapping ``inputs`` to outputs accepted by ``criterion``.
        criterion: Loss callable invoked as ``criterion(outputs, targets)``.
        optimizer: Optimizer whose ``zero_grad()`` / ``step()`` drive the update.
        max_batches: Number of batches to run (default 6, the count the
            original demonstration loop processed). ``None`` runs the whole
            loader.
        schedule: Optional profiler schedule, forwarded to ``profile``; e.g.
            ``torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=1)``.
        on_trace_ready: Optional trace callback, forwarded to ``profile``; e.g.
            ``torch.profiler.tensorboard_trace_handler('./logs')``.

    Returns:
        The finished ``profile`` object, so callers can inspect it, e.g.
        ``prof.key_averages().table(...)``.
    """
    # Only request CUDA events when a GPU is actually present.
    activities = [ProfilerActivity.CPU]
    if torch.cuda.is_available():
        activities.append(ProfilerActivity.CUDA)

    # Move each batch to wherever the model's parameters live. Plain callables
    # and parameter-free modules leave the batch where the loader put it.
    parameters = model.parameters() if hasattr(model, "parameters") else iter(())
    first_param = next(iter(parameters), None)
    target_device = first_param.device if first_param is not None else None

    with profile(activities=activities,
                 schedule=schedule,
                 on_trace_ready=on_trace_ready,
                 record_shapes=True,
                 profile_memory=True,
                 with_stack=True) as prof:
        # islice stops after max_batches without fetching an extra batch.
        for inputs, targets in islice(dataloader, max_batches):
            if target_device is not None:
                inputs, targets = inputs.to(target_device), targets.to(target_device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            prof.step()  # Inform the profiler that one step has finished.

    return prof

# Profile a short training run of the synthetic-data ResNet18 set up above.
train_with_profiler(
    dataloader=dataloader,
    model=model,
    criterion=criterion,
    optimizer=optimizer,
)

In [8]:
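# To browse profiler traces in TensorBoard, run the command below in a shell.
# It only shows data once traces have been written to ./logs, e.g. by passing
# on_trace_ready=torch.profiler.tensorboard_trace_handler('./logs') to the profiler:
#   tensorboard --logdir=./logs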