In [None]:
import torch, time, gc

# Timing utilities
start_time = None

def start_timer():
    """Reset CUDA memory statistics and start the wall-clock timer.

    Collects unreferenced Python objects, releases cached CUDA blocks, clears
    the peak-memory counters, waits for pending GPU work and then stores the
    current time in the module-level ``start_time``.
    """
    global start_time
    gc.collect()
    torch.cuda.empty_cache()
    # reset_max_memory_allocated() is deprecated; reset_peak_memory_stats()
    # is its documented replacement.
    torch.cuda.reset_peak_memory_stats()
    # Wait for earlier kernels so they are not billed to the timed region.
    torch.cuda.synchronize()
    start_time = time.time()

def end_timer_and_print(local_msg):
    """Stop the timer and print the elapsed time and peak tensor memory.

    Args:
        local_msg: Heading printed, after a blank line, above the statistics.
    """
    # Wait for outstanding GPU work before taking the end timestamp.
    torch.cuda.synchronize()
    stopped_at = time.time()
    print("\n" + local_msg)
    elapsed_sec = stopped_at - start_time
    print(f"Total execution time = {elapsed_sec:.3f} sec")
    peak_bytes = torch.cuda.max_memory_allocated()
    print(f"Max memory used by tensors = {peak_bytes} bytes")

In [None]:
import os
import argparse
import torchvision
import torchvision.transforms as transforms
import torch
from torch import nn
import gc



def main():
    """Build the default options and benchmark every precision variant.

    Trains and evaluates the model three times in a row: standard precision,
    mixed precision, and float16 everywhere.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-n', '--nodes', default=1, type=int, metavar='N',
                        help='number of nodes (default: 1)')
    parser.add_argument('-g', '--gpus', default=1, type=int,
                        help='number of gpus per node')
    parser.add_argument('-nr', '--nr', default=0, type=int,
                        help='ranking within the nodes')
    parser.add_argument('--epochs', default=5, type=int, metavar='N',
                        help='number of total epochs to run')
    # An explicit empty argv keeps argparse from reading the notebook
    # kernel's own command line, so only the defaults above apply.
    args = parser.parse_args([])
    args.world_size = args.gpus * args.nodes
    # NOTE(review): nothing in this cell reads these variables; presumably
    # left over from a distributed launch script - confirm before removing.
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '8888'

    test(train(args, False, False), False)  # Standard
    test(train(args, True, False), False)  # Mixed precision
    test(train(args, False, True), True)  # F16 everywhere


def train(args, amp, f16):
    """Train ResNet-18 on CIFAR-10 in the requested precision and print timing.

    Args:
        args: Parsed options; only ``args.epochs`` is read here.
        amp: Enable the gradient scaler and run the forward pass under
            float16 autocast (mixed precision).
        f16: Cast the model weights to float16 and run the forward pass under
            float16 autocast. The gradient scaler is controlled by ``amp`` only.

    Returns:
        The trained model, resident on the CUDA device.
    """
    torch.manual_seed(42)
    # NOTE(review): the hub model is created with its default classifier size
    # while CIFAR-10 has 10 labels - confirm whether num_classes=10 was intended.
    model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet18', pretrained=False)

    model = model.to(torch.float16 if f16 else torch.float32)
    model = model.cuda()
    model.train()
    batch_size = 100
    # Progress is logged once every `batch_size` optimisation steps; the
    # cadence gets its own name so it is not mistaken for a batch-size check.
    log_interval = batch_size

    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(model.parameters(), 1e-4)
    scaler = torch.cuda.amp.GradScaler(enabled=amp)
    train_dataset = torchvision.datasets.CIFAR10(root='./data',
                                                 train=True,
                                                 transform=transforms.ToTensor(),
                                                 download=True)
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=batch_size,
                                               shuffle=True,
                                               num_workers=0,
                                               pin_memory=True)

    total_step = len(train_loader)
    start_timer()
    for epoch in range(args.epochs):
        for i, (images, labels) in enumerate(train_loader):
            images = images.cuda(non_blocking=True)
            labels = labels.cuda(non_blocking=True)
            optimizer.zero_grad()
            with torch.autocast("cuda", dtype=torch.float16, enabled=(amp or f16)):
                output = model(images)
                loss = criterion(output, labels)
            # With the scaler disabled these calls reduce to a plain
            # backward() / optimizer.step().
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            if (i + 1) % log_interval == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(epoch + 1, args.epochs, i + 1, total_step, loss.item()))

    if f16:
        label = "F16 precision:"
    elif amp:
        label = "Mixed precision:"
    else:
        # Spelling fixed: this label used to be printed as "Standart".
        label = "Standard precision:"
    end_timer_and_print(label)

    return model

def test(model, f16, batch_size = 100):
    """Evaluate ``model`` on the CIFAR-10 test split and print its accuracy.

    Args:
        model: Network to evaluate; it is switched to eval mode and cast to
            float16 or float32 depending on ``f16``.
        f16: When True, cast the model to float16 and run inference under
            float16 autocast; otherwise use float32 with autocast disabled.
        batch_size: Number of images per evaluation batch.
    """
    model.eval()
    model = model.to(torch.float16 if f16 else torch.float32)

    eval_set = torchvision.datasets.CIFAR10(root='./data',
                                            train=False,
                                            transform=transforms.ToTensor(),
                                            download=True)
    eval_loader = torch.utils.data.DataLoader(dataset=eval_set,
                                              batch_size=batch_size,
                                              shuffle=False,
                                              num_workers=0,
                                              pin_memory=True)
    hits, seen = 0, 0
    with torch.no_grad():
        for batch, targets in eval_loader:
            with torch.autocast("cuda", dtype=torch.float16, enabled=f16):
                batch = batch.cuda(non_blocking=True)
                targets = targets.cuda(non_blocking=True)
                logits = model(batch)

                # Index of the largest logit is the predicted class.
                guesses = torch.max(logits.data, 1)[1]
                seen += targets.size(0)
                hits += (guesses == targets).sum().item()
    print(f'Accuracy of the network on the 10000 test images: {100 * hits // seen} %')


# Run the full benchmark when this cell is executed as the main module.
if __name__ == '__main__':
    main()


Using cache found in /root/.cache/torch/hub/pytorch_vision_v0.10.0


Files already downloaded and verified




Epoch [1/5], Step [100/500], Loss: 6.9183
Epoch [1/5], Step [200/500], Loss: 6.5926
Epoch [1/5], Step [300/500], Loss: 6.2373
Epoch [1/5], Step [400/500], Loss: 5.9018
Epoch [1/5], Step [500/500], Loss: 5.5489
Epoch [2/5], Step [100/500], Loss: 5.1552
Epoch [2/5], Step [200/500], Loss: 4.6138
Epoch [2/5], Step [300/500], Loss: 4.2194
Epoch [2/5], Step [400/500], Loss: 4.2819
Epoch [2/5], Step [500/500], Loss: 3.8655
Epoch [3/5], Step [100/500], Loss: 3.5685
Epoch [3/5], Step [200/500], Loss: 3.3137
Epoch [3/5], Step [300/500], Loss: 3.1518
Epoch [3/5], Step [400/500], Loss: 2.9987
Epoch [3/5], Step [500/500], Loss: 2.8093
Epoch [4/5], Step [100/500], Loss: 2.8529
Epoch [4/5], Step [200/500], Loss: 2.4370
Epoch [4/5], Step [300/500], Loss: 2.5947
Epoch [4/5], Step [400/500], Loss: 2.4655
Epoch [4/5], Step [500/500], Loss: 2.3294
Epoch [5/5], Step [100/500], Loss: 2.2491
Epoch [5/5], Step [200/500], Loss: 2.3896
Epoch [5/5], Step [300/500], Loss: 2.1843
Epoch [5/5], Step [400/500], Loss:

Using cache found in /root/.cache/torch/hub/pytorch_vision_v0.10.0


Files already downloaded and verified
Epoch [1/5], Step [100/500], Loss: 6.9275
Epoch [1/5], Step [200/500], Loss: 6.5791
Epoch [1/5], Step [300/500], Loss: 6.2466
Epoch [1/5], Step [400/500], Loss: 5.8921
Epoch [1/5], Step [500/500], Loss: 5.5237
Epoch [2/5], Step [100/500], Loss: 5.1916
Epoch [2/5], Step [200/500], Loss: 4.6644
Epoch [2/5], Step [300/500], Loss: 4.2550
Epoch [2/5], Step [400/500], Loss: 4.2432
Epoch [2/5], Step [500/500], Loss: 3.9107
Epoch [3/5], Step [100/500], Loss: 3.5985
Epoch [3/5], Step [200/500], Loss: 3.3166
Epoch [3/5], Step [300/500], Loss: 3.1856
Epoch [3/5], Step [400/500], Loss: 2.9745
Epoch [3/5], Step [500/500], Loss: 2.8188
Epoch [4/5], Step [100/500], Loss: 2.8825
Epoch [4/5], Step [200/500], Loss: 2.4484
Epoch [4/5], Step [300/500], Loss: 2.5785
Epoch [4/5], Step [400/500], Loss: 2.5000
Epoch [4/5], Step [500/500], Loss: 2.3009
Epoch [5/5], Step [100/500], Loss: 2.1745
Epoch [5/5], Step [200/500], Loss: 2.3812
Epoch [5/5], Step [300/500], Loss: 2.1

Using cache found in /root/.cache/torch/hub/pytorch_vision_v0.10.0


Files already downloaded and verified
Epoch [1/5], Step [100/500], Loss: 7.0600
Epoch [1/5], Step [200/500], Loss: 6.8269
Epoch [1/5], Step [300/500], Loss: 6.5725
Epoch [1/5], Step [400/500], Loss: 6.4223
Epoch [1/5], Step [500/500], Loss: 6.1407
Epoch [2/5], Step [100/500], Loss: 5.9111
Epoch [2/5], Step [200/500], Loss: 5.5568
Epoch [2/5], Step [300/500], Loss: 5.2705
Epoch [2/5], Step [400/500], Loss: 5.1285
Epoch [2/5], Step [500/500], Loss: 5.0193
Epoch [3/5], Step [100/500], Loss: 4.6782
Epoch [3/5], Step [200/500], Loss: 4.4520
Epoch [3/5], Step [300/500], Loss: 4.3200
Epoch [3/5], Step [400/500], Loss: 4.1896
Epoch [3/5], Step [500/500], Loss: 4.0132
Epoch [4/5], Step [100/500], Loss: 3.9109
Epoch [4/5], Step [200/500], Loss: 3.5569
Epoch [4/5], Step [300/500], Loss: 3.5630
Epoch [4/5], Step [400/500], Loss: 3.4555
Epoch [4/5], Step [500/500], Loss: 3.3631
Epoch [5/5], Step [100/500], Loss: 3.3713
Epoch [5/5], Step [200/500], Loss: 3.2780
Epoch [5/5], Step [300/500], Loss: 3.0

In [None]:
from apex import amp as amp_lib
import os
from datetime import datetime
import argparse
import torch.multiprocessing as mp
import torchvision
import torchvision.transforms as transforms
import torch
import torch.nn as nn
import torch.distributed as dist
import apex
import gc


def main():
    """Build the default options and benchmark the two Apex variants.

    Trains and evaluates the model twice: mixed precision, then float16
    everywhere.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-n', '--nodes', default=1, type=int, metavar='N',
                        help='number of nodes (default: 1)')
    parser.add_argument('-g', '--gpus', default=1, type=int,
                        help='number of gpus per node')
    parser.add_argument('-nr', '--nr', default=0, type=int,
                        help='ranking within the nodes')
    parser.add_argument('--epochs', default=5, type=int, metavar='N',
                        help='number of total epochs to run')
    # An explicit empty argv keeps argparse from reading the notebook
    # kernel's own command line, so only the defaults above apply.
    args = parser.parse_args([])
    args.world_size = args.gpus * args.nodes
    # NOTE(review): nothing in this cell reads these variables; presumably
    # left over from a distributed launch script - confirm before removing.
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '8888'

    test(train(args, True, False), False)  # Mixed precision
    test(train(args, False, True), True)  # F16 everywhere


def train(args, amp, f16):
    """Train ResNet-18 on CIFAR-10 through NVIDIA Apex and print timing.

    Args:
        args: Parsed options; only ``args.epochs`` is read here.
        amp: Select Apex opt level "O2" (mixed precision). Takes precedence
            over ``f16`` when both are set.
        f16: Select Apex opt level "O3" (pure FP16).

    When neither flag is set the model is initialised with opt level "O0"
    (FP32) instead of failing on an unbound ``opt_level``.

    Returns:
        The Apex-initialised, trained model.
    """
    torch.manual_seed(42)
    # NOTE(review): the hub model is created with its default classifier size
    # while CIFAR-10 has 10 labels - confirm whether num_classes=10 was intended.
    model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet18', pretrained=False)
    model = model.to(torch.float32)
    model = model.cuda()
    model.train()
    batch_size = 100
    # Progress is logged once every `batch_size` optimisation steps.
    log_interval = batch_size

    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(model.parameters(), 1e-4)
    # Choose the opt level and the report label together so the printed
    # heading always names the mode that was actually used.
    if amp:
        opt_level, label = "O2", "Mixed precision:"
    elif f16:
        opt_level, label = "O3", "F16 precision:"
    else:
        opt_level, label = "O0", "Standard precision:"
    model, optimizer = amp_lib.initialize(model, optimizer, opt_level=opt_level)
    train_dataset = torchvision.datasets.CIFAR10(root='./data',
                                                 train=True,
                                                 transform=transforms.ToTensor(),
                                                 download=True)
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=batch_size,
                                               shuffle=True,
                                               num_workers=0,
                                               pin_memory=True)

    total_step = len(train_loader)
    start_timer()
    for epoch in range(args.epochs):
        for i, (images, labels) in enumerate(train_loader):
            images = images.cuda(non_blocking=True)
            labels = labels.cuda(non_blocking=True)
            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)

            # Backward and optimize; Apex scales the loss before backward().
            optimizer.zero_grad()
            with amp_lib.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            optimizer.step()
            if (i + 1) % log_interval == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(
                    epoch + 1,
                    args.epochs,
                    i + 1,
                    total_step,
                    loss.item()))

    end_timer_and_print(label)

    return model

def test(model, f16, batch_size = 100):
    """Evaluate ``model`` on the CIFAR-10 test split and print its accuracy.

    Args:
        model: Network to evaluate; it is switched to eval mode first.
        f16: Not read by this version; accepted so the call signature matches
            the other benchmark cells.
        batch_size: Number of images per evaluation batch.
    """
    model.eval()
    eval_set = torchvision.datasets.CIFAR10(root='./data',
                                            train=False,
                                            transform=transforms.ToTensor(),
                                            download=True)
    eval_loader = torch.utils.data.DataLoader(dataset=eval_set,
                                              batch_size=batch_size,
                                              shuffle=False,
                                              num_workers=0,
                                              pin_memory=True)
    hits, seen = 0, 0
    with torch.no_grad():
        for batch, targets in eval_loader:
            batch = batch.cuda(non_blocking=True)
            targets = targets.cuda(non_blocking=True)
            logits = model(batch)

            # Index of the largest logit is the predicted class.
            guesses = torch.max(logits.data, 1)[1]
            seen += targets.size(0)
            hits += (guesses == targets).sum().item()
    print(f'Accuracy of the network on the 10000 test images: {100 * hits // seen} %')



# Run the Apex benchmark when this cell is executed as the main module.
if __name__ == '__main__':
    main()


Using cache found in /root/.cache/torch/hub/pytorch_vision_v0.10.0
  optimizer._amp_stash.dummy_overflow_buf = torch.cuda.IntTensor([0]);


Selected optimization level O2:  FP16 training with FP32 batchnorm and FP32 master weights.

Defaults for this optimization level are:
enabled                : True
opt_level              : O2
cast_model_type        : torch.float16
patch_torch_functions  : False
keep_batchnorm_fp32    : True
master_weights         : True
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O2
cast_model_type        : torch.float16
patch_torch_functions  : False
keep_batchnorm_fp32    : True
master_weights         : True
loss_scale             : dynamic
Files already downloaded and verified
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 32768.0
Epoch [1/5], Step [100/500], Loss: 6.9227
Epoch [1/5], Step [200/500], Loss: 6.6222
Epoch [1/5], Step [300/500], Loss: 6.2470
Epoch [1/5], Step [400/500], Loss: 5.9257
Epoch [1/5], Ste

Using cache found in /root/.cache/torch/hub/pytorch_vision_v0.10.0


Files already downloaded and verified
Epoch [1/5], Step [100/500], Loss: 7.0409
Epoch [1/5], Step [200/500], Loss: 6.8294
Epoch [1/5], Step [300/500], Loss: 6.5850
Epoch [1/5], Step [400/500], Loss: 6.4001
Epoch [1/5], Step [500/500], Loss: 6.1566
Epoch [2/5], Step [100/500], Loss: 5.8349
Epoch [2/5], Step [200/500], Loss: 5.5495
Epoch [2/5], Step [300/500], Loss: 5.1978
Epoch [2/5], Step [400/500], Loss: 5.1349
Epoch [2/5], Step [500/500], Loss: 4.9295
Epoch [3/5], Step [100/500], Loss: 4.7122
Epoch [3/5], Step [200/500], Loss: 4.4653
Epoch [3/5], Step [300/500], Loss: 4.2619
Epoch [3/5], Step [400/500], Loss: 4.2164
Epoch [3/5], Step [500/500], Loss: 3.9607
Epoch [4/5], Step [100/500], Loss: 3.9320
Epoch [4/5], Step [200/500], Loss: 3.5850
Epoch [4/5], Step [300/500], Loss: 3.5993
Epoch [4/5], Step [400/500], Loss: 3.4297
Epoch [4/5], Step [500/500], Loss: 3.3723
Epoch [5/5], Step [100/500], Loss: 3.3171
Epoch [5/5], Step [200/500], Loss: 3.2811
Epoch [5/5], Step [300/500], Loss: 3.0

In [None]:
import os
import argparse
import torchvision
import torchvision.transforms as transforms
import torch
from torch import nn
import gc



def main():
    """Build the default options and benchmark every precision variant.

    Trains and evaluates the model three times in a row: standard precision,
    mixed precision, and float16 everywhere.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-n', '--nodes', default=1, type=int, metavar='N',
                        help='number of nodes (default: 1)')
    parser.add_argument('-g', '--gpus', default=1, type=int,
                        help='number of gpus per node')
    parser.add_argument('-nr', '--nr', default=0, type=int,
                        help='ranking within the nodes')
    parser.add_argument('--epochs', default=5, type=int, metavar='N',
                        help='number of total epochs to run')
    # An explicit empty argv keeps argparse from reading the notebook
    # kernel's own command line, so only the defaults above apply.
    args = parser.parse_args([])
    args.world_size = args.gpus * args.nodes
    # NOTE(review): nothing in this cell reads these variables; presumably
    # left over from a distributed launch script - confirm before removing.
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '8888'

    test(train(args, False, False), False)  # Standard
    test(train(args, True, False), False)  # Mixed precision
    test(train(args, False, True), True)  # F16 everywhere


def train(args, amp, f16):
    """Train ResNet-18 on CIFAR-10 (batch size 32) and print timing.

    Args:
        args: Parsed options; only ``args.epochs`` is read here.
        amp: Enable the gradient scaler and run the forward pass under
            float16 autocast (mixed precision).
        f16: Cast the model weights to float16 and run the forward pass under
            float16 autocast. The gradient scaler is controlled by ``amp`` only.

    Returns:
        The trained model, resident on the CUDA device.
    """
    torch.manual_seed(42)
    # NOTE(review): the hub model is created with its default classifier size
    # while CIFAR-10 has 10 labels - confirm whether num_classes=10 was intended.
    model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet18', pretrained=False)

    model = model.to(torch.float16 if f16 else torch.float32)
    model = model.cuda()
    model.train()
    batch_size = 32
    # Progress is logged once every `batch_size` optimisation steps; the
    # cadence gets its own name so it is not mistaken for a batch-size check.
    log_interval = batch_size

    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(model.parameters(), 1e-4)
    scaler = torch.cuda.amp.GradScaler(enabled=amp)
    train_dataset = torchvision.datasets.CIFAR10(root='./data',
                                                 train=True,
                                                 transform=transforms.ToTensor(),
                                                 download=True)
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=batch_size,
                                               shuffle=True,
                                               num_workers=0,
                                               pin_memory=True)

    total_step = len(train_loader)
    start_timer()
    for epoch in range(args.epochs):
        for i, (images, labels) in enumerate(train_loader):
            images = images.cuda(non_blocking=True)
            labels = labels.cuda(non_blocking=True)
            optimizer.zero_grad()
            with torch.autocast("cuda", dtype=torch.float16, enabled=(amp or f16)):
                output = model(images)
                loss = criterion(output, labels)
            # With the scaler disabled these calls reduce to a plain
            # backward() / optimizer.step().
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            if (i + 1) % log_interval == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(epoch + 1, args.epochs, i + 1, total_step, loss.item()))

    if f16:
        label = "F16 precision:"
    elif amp:
        label = "Mixed precision:"
    else:
        # Spelling fixed: this label used to be printed as "Standart".
        label = "Standard precision:"
    end_timer_and_print(label)

    return model

def test(model, f16, batch_size = 32):
    """Evaluate ``model`` on the CIFAR-10 test split and print its accuracy.

    Args:
        model: Network to evaluate; it is switched to eval mode and cast to
            float16 or float32 depending on ``f16``.
        f16: When True, cast the model to float16 and run inference under
            float16 autocast; otherwise use float32 with autocast disabled.
        batch_size: Number of images per evaluation batch.
    """
    model.eval()
    target_dtype = torch.float16 if f16 else torch.float32
    model = model.to(target_dtype)

    held_out = torchvision.datasets.CIFAR10(root='./data',
                                            train=False,
                                            transform=transforms.ToTensor(),
                                            download=True)
    held_out_loader = torch.utils.data.DataLoader(dataset=held_out,
                                                  batch_size=batch_size,
                                                  shuffle=False,
                                                  num_workers=0,
                                                  pin_memory=True)
    n_right = 0
    n_seen = 0
    with torch.no_grad():
        for inputs, truth in held_out_loader:
            with torch.autocast("cuda", dtype=torch.float16, enabled=f16):
                inputs = inputs.cuda(non_blocking=True)
                truth = truth.cuda(non_blocking=True)
                scores = model(inputs)

                # Index of the largest score is the predicted class.
                picked = torch.max(scores.data, 1)[1]
                n_seen += truth.size(0)
                n_right += (picked == truth).sum().item()
    print(f'Accuracy of the network on the 10000 test images: {100 * n_right // n_seen} %')


# Run the batch-size-32 benchmark when this cell is executed as the main module.
if __name__ == '__main__':
    main()


Using cache found in /root/.cache/torch/hub/pytorch_vision_v0.10.0


Files already downloaded and verified




Epoch [1/5], Step [32/1563], Loss: 7.2048
Epoch [1/5], Step [64/1563], Loss: 7.0966
Epoch [1/5], Step [96/1563], Loss: 7.2747
Epoch [1/5], Step [128/1563], Loss: 6.8985
Epoch [1/5], Step [160/1563], Loss: 6.7508
Epoch [1/5], Step [192/1563], Loss: 6.4326
Epoch [1/5], Step [224/1563], Loss: 6.5897
Epoch [1/5], Step [256/1563], Loss: 6.4370
Epoch [1/5], Step [288/1563], Loss: 6.2156
Epoch [1/5], Step [320/1563], Loss: 6.1928
Epoch [1/5], Step [352/1563], Loss: 5.9927
Epoch [1/5], Step [384/1563], Loss: 5.8934
Epoch [1/5], Step [416/1563], Loss: 5.5741
Epoch [1/5], Step [448/1563], Loss: 5.4609
Epoch [1/5], Step [480/1563], Loss: 5.6098
Epoch [1/5], Step [512/1563], Loss: 5.5587
Epoch [1/5], Step [544/1563], Loss: 5.4192
Epoch [1/5], Step [576/1563], Loss: 4.9475
Epoch [1/5], Step [608/1563], Loss: 4.9196
Epoch [1/5], Step [640/1563], Loss: 4.9008
Epoch [1/5], Step [672/1563], Loss: 5.0907
Epoch [1/5], Step [704/1563], Loss: 4.5843
Epoch [1/5], Step [736/1563], Loss: 4.2456
Epoch [1/5], S

Using cache found in /root/.cache/torch/hub/pytorch_vision_v0.10.0


Files already downloaded and verified
Epoch [1/5], Step [32/1563], Loss: 7.2458
Epoch [1/5], Step [64/1563], Loss: 7.1433
Epoch [1/5], Step [96/1563], Loss: 7.2489
Epoch [1/5], Step [128/1563], Loss: 6.9563
Epoch [1/5], Step [160/1563], Loss: 6.8020
Epoch [1/5], Step [192/1563], Loss: 6.5088
Epoch [1/5], Step [224/1563], Loss: 6.6088
Epoch [1/5], Step [256/1563], Loss: 6.3727
Epoch [1/5], Step [288/1563], Loss: 6.2603
Epoch [1/5], Step [320/1563], Loss: 6.0604
Epoch [1/5], Step [352/1563], Loss: 6.1151
Epoch [1/5], Step [384/1563], Loss: 5.9714
Epoch [1/5], Step [416/1563], Loss: 5.5478
Epoch [1/5], Step [448/1563], Loss: 5.4944
Epoch [1/5], Step [480/1563], Loss: 5.5835
Epoch [1/5], Step [512/1563], Loss: 5.5022
Epoch [1/5], Step [544/1563], Loss: 5.4563
Epoch [1/5], Step [576/1563], Loss: 4.8080
Epoch [1/5], Step [608/1563], Loss: 5.0001
Epoch [1/5], Step [640/1563], Loss: 4.9603
Epoch [1/5], Step [672/1563], Loss: 5.0196
Epoch [1/5], Step [704/1563], Loss: 4.8317
Epoch [1/5], Step [

Using cache found in /root/.cache/torch/hub/pytorch_vision_v0.10.0


Files already downloaded and verified
Epoch [1/5], Step [32/1563], Loss: 7.2729
Epoch [1/5], Step [64/1563], Loss: 7.2175
Epoch [1/5], Step [96/1563], Loss: 7.3866
Epoch [1/5], Step [128/1563], Loss: 7.0946
Epoch [1/5], Step [160/1563], Loss: 6.9354
Epoch [1/5], Step [192/1563], Loss: 6.6085
Epoch [1/5], Step [224/1563], Loss: 6.7767
Epoch [1/5], Step [256/1563], Loss: 6.5791
Epoch [1/5], Step [288/1563], Loss: 6.4921
Epoch [1/5], Step [320/1563], Loss: 6.4249
Epoch [1/5], Step [352/1563], Loss: 6.4243
Epoch [1/5], Step [384/1563], Loss: 6.2076
Epoch [1/5], Step [416/1563], Loss: 5.8491
Epoch [1/5], Step [448/1563], Loss: 5.9547
Epoch [1/5], Step [480/1563], Loss: 5.9396
Epoch [1/5], Step [512/1563], Loss: 5.9614
Epoch [1/5], Step [544/1563], Loss: 5.9929
Epoch [1/5], Step [576/1563], Loss: 5.3516
Epoch [1/5], Step [608/1563], Loss: 5.3922
Epoch [1/5], Step [640/1563], Loss: 5.3430
Epoch [1/5], Step [672/1563], Loss: 5.5290
Epoch [1/5], Step [704/1563], Loss: 5.3318
Epoch [1/5], Step [

In [None]:
from apex import amp as amp_lib
import os
from datetime import datetime
import argparse
import torch.multiprocessing as mp
import torchvision
import torchvision.transforms as transforms
import torch
import torch.nn as nn
import torch.distributed as dist
import apex
import gc


def main():
    """Build the default options and benchmark the two Apex variants.

    Trains and evaluates the model twice: mixed precision, then float16
    everywhere.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-n', '--nodes', default=1, type=int, metavar='N',
                        help='number of nodes (default: 1)')
    parser.add_argument('-g', '--gpus', default=1, type=int,
                        help='number of gpus per node')
    parser.add_argument('-nr', '--nr', default=0, type=int,
                        help='ranking within the nodes')
    parser.add_argument('--epochs', default=5, type=int, metavar='N',
                        help='number of total epochs to run')
    # An explicit empty argv keeps argparse from reading the notebook
    # kernel's own command line, so only the defaults above apply.
    args = parser.parse_args([])
    args.world_size = args.gpus * args.nodes
    # NOTE(review): nothing in this cell reads these variables; presumably
    # left over from a distributed launch script - confirm before removing.
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '8888'

    test(train(args, True, False), False)  # Mixed precision
    test(train(args, False, True), True)  # F16 everywhere


def train(args, amp, f16):
    """Train ResNet-18 on CIFAR-10 (batch size 32) through NVIDIA Apex.

    Args:
        args: Parsed options; only ``args.epochs`` is read here.
        amp: Select Apex opt level "O2" (mixed precision). Takes precedence
            over ``f16`` when both are set.
        f16: Select Apex opt level "O3" (pure FP16).

    When neither flag is set the model is initialised with opt level "O0"
    (FP32) instead of failing on an unbound ``opt_level``.

    Returns:
        The Apex-initialised, trained model.
    """
    torch.manual_seed(42)
    # NOTE(review): the hub model is created with its default classifier size
    # while CIFAR-10 has 10 labels - confirm whether num_classes=10 was intended.
    model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet18', pretrained=False)
    model = model.to(torch.float32)
    model = model.cuda()
    # Put the network in training mode explicitly rather than relying on the
    # state the freshly loaded module happens to be in.
    model.train()
    batch_size = 32
    # Progress is logged once every `batch_size` optimisation steps.
    log_interval = batch_size

    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(model.parameters(), 1e-4)
    # Choose the opt level and the report label together so the printed
    # heading always names the mode that was actually used.
    if amp:
        opt_level, label = "O2", "Mixed precision:"
    elif f16:
        opt_level, label = "O3", "F16 precision:"
    else:
        opt_level, label = "O0", "Standard precision:"
    model, optimizer = amp_lib.initialize(model, optimizer, opt_level=opt_level)
    train_dataset = torchvision.datasets.CIFAR10(root='./data',
                                                 train=True,
                                                 transform=transforms.ToTensor(),
                                                 download=True)
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=batch_size,
                                               shuffle=True,
                                               num_workers=0,
                                               pin_memory=True)

    total_step = len(train_loader)
    start_timer()
    for epoch in range(args.epochs):
        for i, (images, labels) in enumerate(train_loader):
            images = images.cuda(non_blocking=True)
            labels = labels.cuda(non_blocking=True)
            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)

            # Backward and optimize; Apex scales the loss before backward().
            optimizer.zero_grad()
            with amp_lib.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            optimizer.step()
            if (i + 1) % log_interval == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(
                    epoch + 1,
                    args.epochs,
                    i + 1,
                    total_step,
                    loss.item()))

    end_timer_and_print(label)

    return model

def test(model, f16, batch_size = 32):
    """Evaluate ``model`` on the CIFAR-10 test split and print its accuracy.

    Args:
        model: Network to evaluate.
        f16: Not read by this version; accepted so the call signature matches
            the other benchmark cells.
        batch_size: Number of images per evaluation batch.
    """
    # Switch to inference mode so BatchNorm uses its running statistics and
    # does not keep updating them from test batches.
    model.eval()
    test_dataset = torchvision.datasets.CIFAR10(
        root='./data',
        train=False,
        transform=transforms.ToTensor(),
        download=True
    )
    test_loader = torch.utils.data.DataLoader(
        dataset=test_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=0,
        pin_memory=True,)
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in test_loader:
            images = images.cuda(non_blocking=True)
            labels = labels.cuda(non_blocking=True)
            outputs = model(images)

            # Index of the largest logit is the predicted class.
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    print(f'Accuracy of the network on the 10000 test images: {100 * correct // total} %')



# Run the batch-size-32 Apex benchmark when this cell is executed as the main module.
if __name__ == '__main__':
    main()

Using cache found in /root/.cache/torch/hub/pytorch_vision_v0.10.0


Selected optimization level O2:  FP16 training with FP32 batchnorm and FP32 master weights.

Defaults for this optimization level are:
enabled                : True
opt_level              : O2
cast_model_type        : torch.float16
patch_torch_functions  : False
keep_batchnorm_fp32    : True
master_weights         : True
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O2
cast_model_type        : torch.float16
patch_torch_functions  : False
keep_batchnorm_fp32    : True
master_weights         : True
loss_scale             : dynamic
Files already downloaded and verified
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 32768.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 16384.0
Epoch [1/5], Step [32/1563], Loss: 7.2597
Epoch [1/5], Step [64/1563], Loss: 7.1872
Epoch [1/5], Step [9

Using cache found in /root/.cache/torch/hub/pytorch_vision_v0.10.0


Files already downloaded and verified
Epoch [1/5], Step [32/1563], Loss: 7.2834
Epoch [1/5], Step [64/1563], Loss: 7.1748
Epoch [1/5], Step [96/1563], Loss: 7.3359
Epoch [1/5], Step [128/1563], Loss: 7.0357
Epoch [1/5], Step [160/1563], Loss: 6.8382
Epoch [1/5], Step [192/1563], Loss: 6.5858
Epoch [1/5], Step [224/1563], Loss: 6.8189
Epoch [1/5], Step [256/1563], Loss: 6.6004
Epoch [1/5], Step [288/1563], Loss: 6.4512
Epoch [1/5], Step [320/1563], Loss: 6.3442
Epoch [1/5], Step [352/1563], Loss: 6.4114
Epoch [1/5], Step [384/1563], Loss: 6.2304
Epoch [1/5], Step [416/1563], Loss: 5.9504
Epoch [1/5], Step [448/1563], Loss: 5.8842
Epoch [1/5], Step [480/1563], Loss: 5.9356
Epoch [1/5], Step [512/1563], Loss: 5.9028
Epoch [1/5], Step [544/1563], Loss: 5.8081
Epoch [1/5], Step [576/1563], Loss: 5.4526
Epoch [1/5], Step [608/1563], Loss: 5.5709
Epoch [1/5], Step [640/1563], Loss: 5.5099
Epoch [1/5], Step [672/1563], Loss: 5.4788
Epoch [1/5], Step [704/1563], Loss: 5.2500
Epoch [1/5], Step [

In [None]:
from apex import amp as amp_lib
import os
from datetime import datetime
import argparse
import torch.multiprocessing as mp
import torchvision
import torchvision.transforms as transforms
import torch
import torch.nn as nn
import torch.distributed as dist
import apex
import gc


def main():
    """Build the default options (15 epochs) and benchmark the Apex variants.

    Trains and evaluates the model twice: mixed precision, then float16
    everywhere.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-n', '--nodes', default=1, type=int, metavar='N',
                        help='number of nodes (default: 1)')
    parser.add_argument('-g', '--gpus', default=1, type=int,
                        help='number of gpus per node')
    parser.add_argument('-nr', '--nr', default=0, type=int,
                        help='ranking within the nodes')
    parser.add_argument('--epochs', default=15, type=int, metavar='N',
                        help='number of total epochs to run')
    # An explicit empty argv keeps argparse from reading the notebook
    # kernel's own command line, so only the defaults above apply.
    args = parser.parse_args([])
    args.world_size = args.gpus * args.nodes
    # NOTE(review): nothing in this cell reads these variables; presumably
    # left over from a distributed launch script - confirm before removing.
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '8888'

    test(train(args, True, False), False)  # Mixed precision
    test(train(args, False, True), True)  # F16 everywhere


def train(args, amp, f16):
    """Train ResNet-18 on CIFAR-10 through NVIDIA Apex and print timing.

    Args:
        args: Parsed options; only ``args.epochs`` is read here.
        amp: Select Apex opt level "O2" (mixed precision). Takes precedence
            over ``f16`` when both are set.
        f16: Select Apex opt level "O3" (pure FP16).

    When neither flag is set the model is initialised with opt level "O0"
    (FP32) instead of failing on an unbound ``opt_level``.

    Returns:
        The Apex-initialised, trained model.
    """
    torch.manual_seed(42)
    # NOTE(review): the hub model is created with its default classifier size
    # while CIFAR-10 has 10 labels - confirm whether num_classes=10 was intended.
    model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet18', pretrained=False)
    model = model.to(torch.float32)
    model = model.cuda()
    model.train()
    batch_size = 100
    # Progress is logged once every `batch_size` optimisation steps.
    log_interval = batch_size

    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(model.parameters(), 1e-4)
    # Choose the opt level and the report label together so the printed
    # heading always names the mode that was actually used.
    if amp:
        opt_level, label = "O2", "Mixed precision:"
    elif f16:
        opt_level, label = "O3", "F16 precision:"
    else:
        opt_level, label = "O0", "Standard precision:"
    model, optimizer = amp_lib.initialize(model, optimizer, opt_level=opt_level)
    train_dataset = torchvision.datasets.CIFAR10(root='./data',
                                                 train=True,
                                                 transform=transforms.ToTensor(),
                                                 download=True)
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=batch_size,
                                               shuffle=True,
                                               num_workers=0,
                                               pin_memory=True)

    total_step = len(train_loader)
    start_timer()
    for epoch in range(args.epochs):
        for i, (images, labels) in enumerate(train_loader):
            images = images.cuda(non_blocking=True)
            labels = labels.cuda(non_blocking=True)
            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)

            # Backward and optimize; Apex scales the loss before backward().
            optimizer.zero_grad()
            with amp_lib.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            optimizer.step()
            if (i + 1) % log_interval == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(
                    epoch + 1,
                    args.epochs,
                    i + 1,
                    total_step,
                    loss.item()))

    end_timer_and_print(label)

    return model

def test(model, f16, batch_size = 100):
  """Score `model` on the CIFAR-10 test split and print its top-1 accuracy.

  Args:
    model: network to evaluate; it is put into eval mode here.
    f16: accepted so callers can use the same signature as the other
      variants of this function; this version never reads it.
    batch_size: number of images per evaluation batch.
  """
  model.eval()

  eval_set = torchvision.datasets.CIFAR10(root='./data',
                                          train=False,
                                          transform=transforms.ToTensor(),
                                          download=True)
  eval_loader = torch.utils.data.DataLoader(dataset=eval_set,
                                            batch_size=batch_size,
                                            shuffle=False,
                                            num_workers=0,
                                            pin_memory=True)

  # Running tallies of correctly classified and seen samples.
  correct, total = 0, 0
  with torch.no_grad():
    for batch_images, batch_labels in eval_loader:
      batch_images = batch_images.cuda(non_blocking=True)
      batch_labels = batch_labels.cuda(non_blocking=True)
      logits = model(batch_images)

      # Index of the largest logit along dim 1 is the predicted class.
      predicted = logits.data.max(1).indices
      total += batch_labels.size(0)
      correct += (predicted == batch_labels).sum().item()

  # Floor division: the percentage is reported as a whole number.
  print(f'Accuracy of the network on the 10000 test images: {100 * correct // total} %')



# Entry point: call main() when this code is executed directly
# (i.e. when __name__ is '__main__') rather than imported as a module.
if __name__ == '__main__':
    main()

Using cache found in /root/.cache/torch/hub/pytorch_vision_v0.10.0


Selected optimization level O2:  FP16 training with FP32 batchnorm and FP32 master weights.

Defaults for this optimization level are:
enabled                : True
opt_level              : O2
cast_model_type        : torch.float16
patch_torch_functions  : False
keep_batchnorm_fp32    : True
master_weights         : True
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O2
cast_model_type        : torch.float16
patch_torch_functions  : False
keep_batchnorm_fp32    : True
master_weights         : True
loss_scale             : dynamic
Files already downloaded and verified
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 32768.0
Epoch [1/15], Step [100/500], Loss: 6.9227
Epoch [1/15], Step [200/500], Loss: 6.6222
Epoch [1/15], Step [300/500], Loss: 6.2470
Epoch [1/15], Step [400/500], Loss: 5.9257
Epoch [1/15]

Using cache found in /root/.cache/torch/hub/pytorch_vision_v0.10.0


Selected optimization level O3:  Pure FP16 training.
Defaults for this optimization level are:
enabled                : True
opt_level              : O3
cast_model_type        : torch.float16
patch_torch_functions  : False
keep_batchnorm_fp32    : False
master_weights         : False
loss_scale             : 1.0
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O3
cast_model_type        : torch.float16
patch_torch_functions  : False
keep_batchnorm_fp32    : False
master_weights         : False
loss_scale             : 1.0
Files already downloaded and verified
Epoch [1/15], Step [100/500], Loss: 7.0409
Epoch [1/15], Step [200/500], Loss: 6.8294
Epoch [1/15], Step [300/500], Loss: 6.5850
Epoch [1/15], Step [400/500], Loss: 6.4001
Epoch [1/15], Step [500/500], Loss: 6.1566
Epoch [2/15], Step [100/500], Loss: 5.8349
Epoch [2/15], Step [200/500], Loss: 5.5495
Epoch [

In [None]:
import os
import argparse
import torchvision
import torchvision.transforms as transforms
import torch
from torch import nn
import gc



def main():
    """Build the run configuration and train/evaluate in three precision modes.

    The configuration comes from argparse defaults only (see the note on
    `parse_args([])` below). For each mode a model is trained with `train`
    and then scored with `test`:

    1. standard precision (amp=False, f16=False),
    2. mixed precision    (amp=True,  f16=False),
    3. float16 everywhere (amp=False, f16=True).
    """
    parser = argparse.ArgumentParser()
    # The help text previously described "data loading workers (default: 4)",
    # which matched neither the flag's name nor its default of 1.
    parser.add_argument('-n', '--nodes', default=1, type=int, metavar='N',
                        help='number of nodes (default: 1)')
    parser.add_argument('-g', '--gpus', default=1, type=int,
                        help='number of gpus per node')
    parser.add_argument('-nr', '--nr', default=0, type=int,
                        help='ranking within the nodes')
    parser.add_argument('--epochs', default=15, type=int, metavar='N',
                        help='number of total epochs to run')
    # An explicit empty argument list is parsed, so the process's own command
    # line is never read and every option takes its default.
    args = parser.parse_args([])
    args.world_size = args.gpus * args.nodes
    # NOTE(review): these look like torch.distributed rendezvous settings;
    # nothing visible in this cell reads them - confirm before removing.
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '8888'

    test(train(args, False, False), False) # Standard
    test(train(args, True, False), False) # Mixed precision
    test(train(args, False, True), True) # F16 everywhere


def train(args, amp, f16, log_interval=100):
    """Train a freshly initialised ResNet-18 on CIFAR-10 and report time/memory.

    Args:
        args: namespace providing `epochs`, the number of passes over the
            training set.
        amp: if true, run the forward pass under float16 autocast and enable
            gradient scaling (mixed precision).
        f16: if true, cast the model weights to float16 and run the forward
            pass under float16 autocast. Gradient scaling is controlled by
            `amp` only.
        log_interval: print the current loss every `log_interval` steps.
            Defaults to 100, the cadence the original code obtained by
            reusing `batch_size` as the modulus. 0 or None disables logging.

    Returns:
        The trained model; it is left on the GPU and in train mode.

    Timing and peak-memory figures are printed through the module-level
    `start_timer` / `end_timer_and_print` helpers.
    """
    # Seed before building the model and the shuffling loader so that every
    # precision mode starts from the same random state.
    torch.manual_seed(42)
    model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet18', pretrained=False)

    model = model.to(torch.float16 if f16 else torch.float32)
    model = model.cuda()
    model.train()
    batch_size = 100

    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(model.parameters(), 1e-4)
    # With enabled=False the scaler's scale/step/update calls are pass-throughs,
    # so the same loop body serves all three modes.
    scaler = torch.cuda.amp.GradScaler(enabled=amp)
    train_dataset = torchvision.datasets.CIFAR10(root='./data',
                                                 train=True,
                                                 transform=transforms.ToTensor(),
                                                 download=True)
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=batch_size,
                                               shuffle=True,
                                               num_workers=0,
                                               pin_memory=True)

    total_step = len(train_loader)
    use_autocast = amp or f16
    start_timer()
    for epoch in range(args.epochs):
        for i, (images, labels) in enumerate(train_loader):
            # Host-to-device copies are unaffected by autocast, so they are
            # done outside the autocast region.
            images = images.cuda(non_blocking=True)
            labels = labels.cuda(non_blocking=True)

            optimizer.zero_grad()
            with torch.autocast("cuda", dtype=torch.float16, enabled=use_autocast):
                output = model(images)
                loss = criterion(output, labels)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            # Logging cadence is its own setting; it used to be tied to
            # batch_size, which only worked because both happened to be 100.
            if log_interval and (i + 1) % log_interval == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(epoch + 1, args.epochs, i + 1, total_step, loss.item()))

    if f16:
        label = "F16 precision:"
    elif amp:
        label = "Mixed precision:"
    else:
        # Spelling fixed from "Standart".
        label = "Standard precision:"
    end_timer_and_print(label)

    return model

def test(model, f16, batch_size = 100):
  """Score `model` on the CIFAR-10 test split and print its top-1 accuracy.

  Args:
    model: network to evaluate; it is put into eval mode here.
    f16: when true the model is cast to float16 and each batch is processed
      under CUDA float16 autocast; otherwise the model is cast to float32
      and autocast is disabled.
    batch_size: number of images per evaluation batch.
  """
  model.eval()
  eval_dtype = torch.float16 if f16 else torch.float32
  model = model.to(eval_dtype)

  eval_set = torchvision.datasets.CIFAR10(root='./data',
                                          train=False,
                                          transform=transforms.ToTensor(),
                                          download=True)
  eval_loader = torch.utils.data.DataLoader(dataset=eval_set,
                                            batch_size=batch_size,
                                            shuffle=False,
                                            num_workers=0,
                                            pin_memory=True)

  # Running tallies of correctly classified and seen samples.
  correct, total = 0, 0
  with torch.no_grad():
    for batch_images, batch_labels in eval_loader:
      with torch.autocast("cuda", dtype=torch.float16, enabled=f16):
        batch_images = batch_images.cuda(non_blocking=True)
        batch_labels = batch_labels.cuda(non_blocking=True)
        logits = model(batch_images)

        # Index of the largest logit along dim 1 is the predicted class.
        predicted = logits.data.max(1).indices
        total += batch_labels.size(0)
        correct += (predicted == batch_labels).sum().item()

  # Floor division: the percentage is reported as a whole number.
  print(f'Accuracy of the network on the 10000 test images: {100 * correct // total} %')


# Entry point: call main() when this code is executed directly
# (i.e. when __name__ is '__main__') rather than imported as a module.
if __name__ == '__main__':
    main()

Using cache found in /root/.cache/torch/hub/pytorch_vision_v0.10.0


Files already downloaded and verified
Epoch [1/15], Step [100/500], Loss: 6.9299
Epoch [1/15], Step [200/500], Loss: 6.6007
Epoch [1/15], Step [300/500], Loss: 6.2419
Epoch [1/15], Step [400/500], Loss: 5.8822
Epoch [1/15], Step [500/500], Loss: 5.5221
Epoch [2/15], Step [100/500], Loss: 5.1349
Epoch [2/15], Step [200/500], Loss: 4.6227
Epoch [2/15], Step [300/500], Loss: 4.2473
Epoch [2/15], Step [400/500], Loss: 4.2443
Epoch [2/15], Step [500/500], Loss: 3.8726
Epoch [3/15], Step [100/500], Loss: 3.6085
Epoch [3/15], Step [200/500], Loss: 3.3341
Epoch [3/15], Step [300/500], Loss: 3.1640
Epoch [3/15], Step [400/500], Loss: 2.9709
Epoch [3/15], Step [500/500], Loss: 2.7818
Epoch [4/15], Step [100/500], Loss: 2.8493
Epoch [4/15], Step [200/500], Loss: 2.4426
Epoch [4/15], Step [300/500], Loss: 2.5043
Epoch [4/15], Step [400/500], Loss: 2.4742
Epoch [4/15], Step [500/500], Loss: 2.2990
Epoch [5/15], Step [100/500], Loss: 2.2421
Epoch [5/15], Step [200/500], Loss: 2.3866
Epoch [5/15], St

Using cache found in /root/.cache/torch/hub/pytorch_vision_v0.10.0


Files already downloaded and verified
Epoch [1/15], Step [100/500], Loss: 6.9275
Epoch [1/15], Step [200/500], Loss: 6.5791
Epoch [1/15], Step [300/500], Loss: 6.2466
Epoch [1/15], Step [400/500], Loss: 5.8921
Epoch [1/15], Step [500/500], Loss: 5.5237
Epoch [2/15], Step [100/500], Loss: 5.1916
Epoch [2/15], Step [200/500], Loss: 4.6644
Epoch [2/15], Step [300/500], Loss: 4.2550
Epoch [2/15], Step [400/500], Loss: 4.2432
Epoch [2/15], Step [500/500], Loss: 3.9107
Epoch [3/15], Step [100/500], Loss: 3.5985
Epoch [3/15], Step [200/500], Loss: 3.3166
Epoch [3/15], Step [300/500], Loss: 3.1856
Epoch [3/15], Step [400/500], Loss: 2.9745
Epoch [3/15], Step [500/500], Loss: 2.8188
Epoch [4/15], Step [100/500], Loss: 2.8825
Epoch [4/15], Step [200/500], Loss: 2.4484
Epoch [4/15], Step [300/500], Loss: 2.5785
Epoch [4/15], Step [400/500], Loss: 2.5000
Epoch [4/15], Step [500/500], Loss: 2.3009
Epoch [5/15], Step [100/500], Loss: 2.1745
Epoch [5/15], Step [200/500], Loss: 2.3812
Epoch [5/15], St

Using cache found in /root/.cache/torch/hub/pytorch_vision_v0.10.0


Files already downloaded and verified
Epoch [1/15], Step [100/500], Loss: 7.0600
Epoch [1/15], Step [200/500], Loss: 6.8269
Epoch [1/15], Step [300/500], Loss: 6.5725
Epoch [1/15], Step [400/500], Loss: 6.4223
Epoch [1/15], Step [500/500], Loss: 6.1407
Epoch [2/15], Step [100/500], Loss: 5.9111
Epoch [2/15], Step [200/500], Loss: 5.5568
Epoch [2/15], Step [300/500], Loss: 5.2705
Epoch [2/15], Step [400/500], Loss: 5.1285
Epoch [2/15], Step [500/500], Loss: 5.0193
Epoch [3/15], Step [100/500], Loss: 4.6782
Epoch [3/15], Step [200/500], Loss: 4.4520
Epoch [3/15], Step [300/500], Loss: 4.3200
Epoch [3/15], Step [400/500], Loss: 4.1896
Epoch [3/15], Step [500/500], Loss: 4.0132
Epoch [4/15], Step [100/500], Loss: 3.9109
Epoch [4/15], Step [200/500], Loss: 3.5569
Epoch [4/15], Step [300/500], Loss: 3.5630
Epoch [4/15], Step [400/500], Loss: 3.4555
Epoch [4/15], Step [500/500], Loss: 3.3631
Epoch [5/15], Step [100/500], Loss: 3.3713
Epoch [5/15], Step [200/500], Loss: 3.2780
Epoch [5/15], St