In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import torch.distributed as dist
import os
import argparse
from torch.nn.parallel import DistributedDataParallel as DDP

# Command-line arguments. The local rank is the index of this process (and of
# the GPU it drives) on the current node. Launchers deliver it differently:
#   * legacy `torch.distributed.launch` passes `--local_rank=N`
#   * PyTorch >= 2.0 `torch.distributed.launch` passes `--local-rank=N`
#   * `torchrun` passes no flag and exports the LOCAL_RANK environment variable
# Both spellings are accepted and the default comes from LOCAL_RANK, so the
# script works under all of them. -1 still means "not provided".
parser = argparse.ArgumentParser(description='PyTorch CIFAR10 Training with DDP')
parser.add_argument('--local_rank', '--local-rank', dest='local_rank',
                    default=int(os.environ.get('LOCAL_RANK', -1)), type=int,
                    help='local process rank (GPU index on this node) for distributed training')

def main():
    """Train a small CNN on CIFAR-10 with DistributedDataParallel (one process per GPU).

    Expects to be started by a distributed launcher (`torchrun` or
    `python -m torch.distributed.launch`) that sets up the rendezvous
    environment read by `dist.init_process_group`.
    """
    args = parser.parse_args()

    # `torch.cuda.set_device` silently ignores negative indices, so a missing
    # `--local_rank` (default -1) would leave every process on GPU 0 and hand
    # DDP an invalid device id. Fall back to the LOCAL_RANK variable exported
    # by torchrun before giving up and using device 0.
    local_rank = args.local_rank
    if local_rank < 0:
        local_rank = int(os.environ.get('LOCAL_RANK', 0))

    torch.cuda.set_device(local_rank)
    dist.init_process_group(backend='nccl')  # NCCL: collective backend for CUDA tensors
    try:
        _train(local_rank)
        print('Finished Training')
    finally:
        # Always release the process group, even if training raised.
        dist.destroy_process_group()


class _Net(nn.Module):
    """LeNet-style CNN: two conv+pool stages followed by three linear layers.

    The `16 * 5 * 5` flatten size assumes 3x32x32 inputs (CIFAR-10 images):
    32 -> conv5 -> 28 -> pool -> 14 -> conv5 -> 10 -> pool -> 5.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)   # in_channels, out_channels, kernel_size
        self.pool = nn.MaxPool2d(2, 2)    # 2x2 window, stride 2 (halves H and W)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)      # 10 output logits, one per class

    def forward(self, x):
        """Return class logits of shape (batch, 10) for a batch of images."""
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        # Flatten each sample's feature maps into one vector for the linear layers.
        x = x.view(-1, 16 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)


def _load_trainset(local_rank):
    """Return the normalised CIFAR-10 training set, downloading it race-free.

    All ranks on a node share `./data`. Letting them download and extract
    concurrently can corrupt the archive, so local rank 0 goes first while the
    other ranks wait at a barrier, then find the files already in place.
    """
    transform = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

    if local_rank != 0:
        dist.barrier()  # wait until local rank 0 has finished downloading
    try:
        trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                                download=True, transform=transform)
    finally:
        if local_rank == 0:
            # Release the waiting ranks even if the download failed, so they
            # fail fast on their own instead of hanging at the barrier.
            dist.barrier()
    return trainset


def _train(local_rank, epochs=10, log_every=2000):
    """Run the DDP training loop on GPU `local_rank`.

    Args:
        local_rank: index of the CUDA device this process trains on.
        epochs: number of passes over this rank's shard of the dataset.
        log_every: print the mean loss after this many mini-batches.
    """
    device = torch.device('cuda', local_rank)

    trainset = _load_trainset(local_rank)
    # DistributedSampler gives each process a disjoint shard and does the
    # shuffling itself, hence shuffle=False on the DataLoader.
    train_sampler = torch.utils.data.distributed.DistributedSampler(trainset)
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=4,
                                              shuffle=False, num_workers=2,
                                              sampler=train_sampler)

    net = _Net().to(device)
    net = DDP(net, device_ids=[local_rank], output_device=local_rank)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

    for epoch in range(epochs):
        # Re-seed the sampler so each epoch uses a different shuffle.
        train_sampler.set_epoch(epoch)
        running_loss = 0.0
        for i, (inputs, labels) in enumerate(trainloader):
            inputs = inputs.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()

            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()  # DDP all-reduces gradients across processes here
            optimizer.step()

            running_loss += loss.item()
            if i % log_every == log_every - 1:
                print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / log_every:.3f}')
                running_loss = 0.0

# Script entry point: run training only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    main()


In [None]:
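# Launch one training process per GPU (4 here); the launcher passes the local rank to each process:
# python -m torch.distributed.launch --nproc_per_node=4 train_script.py
# Note: torch.distributed.launch is deprecated in favour of `torchrun`, which supplies the
# rank through the LOCAL_RANK environment variable instead of a --local_rank argument.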