In [158]:
!nvidia-smi

Thu Oct 14 13:56:11 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.64.00    Driver Version: 440.64.00    CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla T4            Off  | 00000000:5E:00.0 Off |                    0 |
| N/A   67C    P0    29W /  70W |  15101MiB / 15109MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage    

In [77]:
%%writefile launch_ddp_MNIST.py

import numpy
import os
from datetime import datetime
import argparse
import torchvision
import torchvision.transforms as transforms
import sys
import tempfile
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.optim as optim
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-n", "--nodes", default = 1, type = int, metavar = "N", help = "кол-во обработчиков (default: 1)")
    parser.add_argument("-g", "--gpus", default = 1, type = int, help = "число гпу на каждом обработчике")
    parser.add_argument("-nr", "--nr", default = 0, type = int, help = "глобальный ранг") # для каждого новог процесса создается новаывй (но это просиходит под капотом)
    parser.add_argument("--epochs", default = 2, type = int, help = "колво эпох обучения", metavar = "N")
    args = parser.parse_args()
    
    args.world_size = args.gpus * args.nodes
    os.environ["MASTER_ADDR"] = '0.0.0.0' # обявление мастер ноды
    os.environ["MASTER_PORT"] = '8809'
    # create processes
    mp.spawn(train, nprocs = args.gpus, args = (args,)) # paramneter server
    
    
class ConvNet(nn.Module):
    def __init__(self, num_classes = 10):
        super(ConvNet, self).__init__()
        # convoluution
        self.layer1 = nn.Sequential(
            nn.Conv2d(1,16,kernel_size = 5, stride = 1, padding = 2),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride = 2))
        
        #convolution
        self.layer2 = nn.Sequential(
            nn.Conv2d(16,32,kernel_size = 5, stride = 1, padding = 2),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride = 2))
        # full layer 
        self.fc = nn.Linear(7*7*32, num_classes)
        
    def forward(self,x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = out.reshape(out.size(0), -1)
        out = self.fc(out)
        return out
    
    
def train(gpu, args):
    rank = args.nr * args.gpus + gpu
    dist.init_process_group(backend = "nccl", init_method = "env://", world_size = args.world_size, rank = rank)
    torch.manual_seed(0)
    model = ConvNet()
    torch.cuda.set_device(gpu)
    model.cuda(gpu)
    batch_size = 100
    criterion = nn.CrossEntropyLoss().cuda(gpu)
    optimizer = torch.optim.SGD(model.parameters(), 1e-4)
    model = nn.parallel.DistributedDataParallel(model, device_ids=[gpu])
    
    train_dataset = torchvision.datasets.MNIST(root='./data', train=True, transform = transforms.ToTensor(),download=True)
    
    # для того чтобы постоянно нге считыыввать весь датасет сущесвтует DistributedSampler который считывыает только те блоки которые им нужны
    
    # привродим текущий датасет, колв-во устройств и каким рангов производиться текущая обработка
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset, num_replicas=args.world_size, rank=rank)
    
    # именно он юудет производить считку данных
    train_loader = torch.utils.data.DataLoader(dataset = train_dataset, batch_size = batch_size, shuffle = False, num_workers=0, pin_memory=True, sampler=train_sampler)
    
    
    start = datetime.now()
    total_step = len(train_loader) # кол-во шагов в нашем загрузчике данных
    for epoch in range(args.epochs):
        for i, (images, labels) in enumerate(train_loader):
            images = images.cuda(non_blocking=True)
            labels = labels.cuda(non_blocking=True)
            
            outputs = model(images)
            loss = criterion(outputs, labels)
            optimizer.zero_grad()
            loss.backward()
            
            optimizer.step()
            if (i+1) % 100 == 0:
                print(epoch, i, loss, gpu)

    if gpu == 0:
        print("Обучениен завершено за " + str(datetime.now() - start))
            
            
if __name__=="__main__":
    main()

Overwriting launch_ddp_MNIST.py


In [78]:
! python3 launch_ddp_MNIST.py -n 1 -g 1

  return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)
0 99 tensor(2.1141, device='cuda:0', grad_fn=<NllLossBackward>) 0
0 199 tensor(2.0650, device='cuda:0', grad_fn=<NllLossBackward>) 0
0 299 tensor(1.9299, device='cuda:0', grad_fn=<NllLossBackward>) 0
0 399 tensor(1.7870, device='cuda:0', grad_fn=<NllLossBackward>) 0
0 499 tensor(1.6639, device='cuda:0', grad_fn=<NllLossBackward>) 0
0 599 tensor(1.5558, device='cuda:0', grad_fn=<NllLossBackward>) 0
1 99 tensor(1.5122, device='cuda:0', grad_fn=<NllLossBackward>) 0
1 199 tensor(1.3986, device='cuda:0', grad_fn=<NllLossBackward>) 0
1 299 tensor(1.3406, device='cuda:0', grad_fn=<NllLossBackward>) 0
1 399 tensor(1.2922, device='cuda:0', grad_fn=<NllLossBackward>) 0
1 499 tensor(1.2061, device='cuda:0', grad_fn=<NllLossBackward>) 0
1 599 tensor(1.1375, device='cuda:0', grad_fn=<NllLossBackward>) 0
Обюучениен завершено за 0:00:13.155057
