In [21]:
# https://pytorch.org/docs/stable/notes/cuda.html

# 0. Make sure devices can be discovered

In [1]:
# https://discuss.pytorch.org/t/i-have-3-gpu-why-torch-cuda-device-count-only-return-1/7245/4

import torch
import sys
print('__Python VERSION:', sys.version)
print('__pyTorch VERSION:', torch.__version__)
print('__CUDA VERSION')
from subprocess import call
# call(["nvcc", "--version"]) does not work
! nvcc --version
print('__CUDNN VERSION:', torch.backends.cudnn.version())
print('__Number CUDA Devices:', torch.cuda.device_count())
print('__Devices')
call(["nvidia-smi", "--format=csv", "--query-gpu=index,name,driver_version,memory.total,memory.used,memory.free"])
print('Active CUDA Device: GPU', torch.cuda.current_device())

print ('Available devices ', torch.cuda.device_count())
print ('Current cuda device ', torch.cuda.current_device())

__Python VERSION: 3.7.7 (default, May  7 2020, 21:25:33) 
[GCC 7.3.0]
__pyTorch VERSION: 1.5.1
__CUDA VERSION
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2019 NVIDIA Corporation
Built on Sun_Jul_28_19:07:16_PDT_2019
Cuda compilation tools, release 10.1, V10.1.243
__CUDNN VERSION: 7603
__Number CUDA Devices: 2
__Devices
Active CUDA Device: GPU 0
Available devices  2
Current cuda device  0


In [2]:
!nvidia-smi

Mon Oct 26 02:14:14 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.36.06    Driver Version: 450.36.06    CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           On   | 00000001:00:00.0 Off |                    0 |
| N/A   42C    P8    27W / 149W |      3MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla K80           On   | 00000002:00:00.0 Off |                    0 |
| N/A   34C    P8    33W / 149W |      3MiB / 11441MiB |      0%      Default |
|       

# 1. Multiple devices & How to use them

## 1.0 Simplest way to put tensors on different devices

In [3]:
cuda0 = torch.device('cuda:0') # equivalent to torch.device('cuda', 0); a plain torch.device('cuda') refers to the current CUDA device instead
cuda1 = torch.device('cuda:1') # equivalent to torch.device('cuda', 1)
cuda2 = torch.device('cuda:2') # only GPUs 0 and 1 exist here; building the handle is fine, using it fails (see the "will raise error" cell)

In [5]:
# assigned to device 0
x = torch.tensor([1., 2.], device=cuda0)
x

tensor([1., 2.], device='cuda:0')

In [6]:
# assigned to device 1
x = torch.tensor([1., 2.], device=cuda1)
x

tensor([1., 2.], device='cuda:1')

In [7]:
# cuda:2 does not exist on this 2-GPU machine, so the allocation is expected to fail.
# The error is caught and printed so that "Restart Kernel -> Run All" can continue
# past this cell instead of stopping here.
try:
    x = torch.tensor([1., 2.], device=cuda2)
    # only reached on a machine that really has a third GPU
    print(x)
except RuntimeError as err:
    print(f'{type(err).__name__}: {err}')

RuntimeError: CUDA error: invalid device ordinal

## 1.1 Device Scope

In [8]:
# https://pytorch.org/docs/stable/notes/cuda.html

# Inside this block GPU 0 is the selected CUDA device.
with torch.cuda.device(0):
    # explicit device= at construction -> cuda:0; the surrounding scope plays no role
    x1 = torch.tensor([1., 2.], device=cuda0)

    # explicit target passed to .to() -> cuda:0; again independent of the scope
    x2 = torch.tensor([1., 2.]).to(device=cuda0)

    # bare .cuda() names no target, so the scope decides -> cuda:0
    x3 = torch.tensor([1., 2.]).cuda()

    # explicit target passed to .cuda() -> cuda:0; independent of the scope
    x4 = torch.tensor([1., 2.]).cuda(cuda0)

# one device per line, in the order x1 .. x4
print(x1.device, x2.device, x3.device, x4.device, sep='\n')

cuda:0
cuda:0
cuda:0
cuda:0


In [9]:
# https://pytorch.org/docs/stable/notes/cuda.html

# Inside this block GPU 1 is the selected CUDA device.
with torch.cuda.device(1):
    # explicit device= at construction -> cuda:0; the surrounding scope plays no role
    x1 = torch.tensor([1., 2.], device=cuda0)

    # explicit target passed to .to() -> cuda:0; again independent of the scope
    x2 = torch.tensor([1., 2.]).to(device=cuda0)

    # bare .cuda() names no target, so the scope decides -> cuda:1
    x3 = torch.tensor([1., 2.]).cuda()

    # explicit target passed to .cuda() -> cuda:0; independent of the scope
    x4 = torch.tensor([1., 2.]).cuda(cuda0)

# one device per line, in the order x1 .. x4
print(x1.device, x2.device, x3.device, x4.device, sep='\n')

cuda:0
cuda:0
cuda:1
cuda:0


## 1.2 Copy from device 0 to device 1

In [6]:
x

tensor([1., 2.], device='cuda:0')

In [7]:
# .to() hands back a new tensor on cuda:1; x itself stays on cuda:0, as the next cell shows
x.to(device=cuda1)

tensor([1., 2.], device='cuda:1')

In [8]:
x

tensor([1., 2.], device='cuda:0')

# 2. Put tensors on devices, delete them, and free up GPU memory

## 2.0 Restart Kernel, put one tensor on device 0

In [1]:
import torch

In [2]:
cuda0 = torch.device('cuda:0') # equivalent to torch.device('cuda', 0); a plain torch.device('cuda') refers to the current CUDA device instead
cuda1 = torch.device('cuda:1') # equivalent to torch.device('cuda', 1)

In [3]:
# 300,000,000 int8 zeros (300 MB), created directly on GPU 0.  Passing device= avoids
# first building the same 300 MB tensor in host memory and then copying it over.
x = torch.zeros(300000000, dtype=torch.int8, device=cuda0)

In [4]:
!nvidia-smi

Mon Oct 26 02:52:30 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.36.06    Driver Version: 450.36.06    CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           On   | 00000001:00:00.0 Off |                    0 |
| N/A   56C    P0    57W / 149W |    542MiB / 11441MiB |     18%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla K80           On   | 00000002:00:00.0 Off |                    0 |
| N/A   35C    P8    34W / 149W |      3MiB / 11441MiB |      0%      Default |
|       

In [5]:
# https://discuss.pytorch.org/t/how-to-delete-a-tensor-in-gpu-to-free-up-memory/48879/12

def print_cuda_memory_allocation_for_device(device_to_print_stats):
    """Print two CUDA memory counters for one device, one number per line.

    Line 1 is ``torch.cuda.memory_allocated`` (bytes currently occupied by
    tensors) and line 2 is ``torch.cuda.memory_reserved`` (bytes currently held
    by PyTorch's caching allocator).  Either number can be 0, e.g. for a device
    that holds no tensors.

    Args:
        device_to_print_stats: device handle, forwarded unchanged to both
            ``torch.cuda`` queries.

    Returns:
        None; the numbers are only printed.
    """
    allocated_bytes = torch.cuda.memory_allocated(device_to_print_stats)
    print(allocated_bytes)
    reserved_bytes = torch.cuda.memory_reserved(device_to_print_stats)
    print(reserved_bytes)

In [6]:
print_cuda_memory_allocation_for_device(cuda0)

300000256
301989888


In [7]:
print_cuda_memory_allocation_for_device(cuda1)

0
0


In [8]:
del x

In [9]:
print_cuda_memory_allocation_for_device(cuda0)

0
301989888


In [10]:
print_cuda_memory_allocation_for_device(cuda1)

0
0


In [11]:
!nvidia-smi

Mon Oct 26 02:52:31 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.36.06    Driver Version: 450.36.06    CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           On   | 00000001:00:00.0 Off |                    0 |
| N/A   56C    P0    56W / 149W |    542MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla K80           On   | 00000002:00:00.0 Off |                    0 |
| N/A   35C    P8    34W / 149W |      3MiB / 11441MiB |      0%      Default |
|       

In [12]:
torch.cuda.empty_cache()

In [13]:
print_cuda_memory_allocation_for_device(cuda0)

0
0


In [14]:
print_cuda_memory_allocation_for_device(cuda1)

0
0


In [15]:
!nvidia-smi

Mon Oct 26 02:52:32 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.36.06    Driver Version: 450.36.06    CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           On   | 00000001:00:00.0 Off |                    0 |
| N/A   56C    P0    57W / 149W |    254MiB / 11441MiB |     18%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla K80           On   | 00000002:00:00.0 Off |                    0 |
| N/A   35C    P8    34W / 149W |      3MiB / 11441MiB |      0%      Default |
|       

## 2.1 Restart Kernel, put one tensor on each device

In [1]:
!nvidia-smi

Mon Oct 26 03:32:16 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.36.06    Driver Version: 450.36.06    CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           On   | 00000001:00:00.0 Off |                    0 |
| N/A   62C    P8    38W / 149W |      3MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla K80           On   | 00000002:00:00.0 Off |                    0 |
| N/A   46C    P8    62W / 149W |      3MiB / 11441MiB |      0%      Default |
|       

In [1]:
import torch

In [2]:
cuda0 = torch.device('cuda:0') # equivalent to torch.device('cuda', 0); a plain torch.device('cuda') refers to the current CUDA device instead
cuda1 = torch.device('cuda:1') # equivalent to torch.device('cuda', 1)

In [3]:
# https://pytorch.org/docs/stable/cuda.html#torch.cuda.memory_allocated

# device 0 - everything is zero
print(torch.cuda.memory_summary(cuda0))
# device 1 -- should give same result 
# print(torch.cuda.memory_summary(cuda1))

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |       0 B  |       0 B  |       0 B  |       0 B  |
|       from large pool |       0 B  |       0 B  |       0 B  |       0 B  |
|       from small pool |       0 B  |       0 B  |       0 B  |       0 B  |
|---------------------------------------------------------------------------|
| Active memory         |       0 B  |       0 B  |       0 B  |       0 B  |
|       from large pool |       0 B  |       0 B  |       0 B  |       0 B  |
|       from small pool |       0 B  |       0 B  |       0 B  |       0 B  |
|---------------------------------------------------------------

In [4]:
# One 300,000,000-element int8 tensor (300 MB) per GPU, created directly on the target
# device.  Passing device= avoids building each tensor in host memory first and copying it.
x = torch.zeros(300000000, dtype=torch.int8, device=cuda0)
y = torch.zeros(300000000, dtype=torch.int8, device=cuda1)

In [5]:
!nvidia-smi

Mon Oct 26 03:34:23 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.36.06    Driver Version: 450.36.06    CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           On   | 00000001:00:00.0 Off |                    0 |
| N/A   59C    P0    58W / 149W |    542MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla K80           On   | 00000002:00:00.0 Off |                    0 |
| N/A   45C    P0    71W / 149W |    542MiB / 11441MiB |      4%      Default |
|       

In [6]:
def print_cuda_memory_allocation_for_device(device_to_print_stats):
    """Print two CUDA memory counters for one device, one number per line.

    Line 1 is ``torch.cuda.memory_allocated`` (bytes currently occupied by
    tensors) and line 2 is ``torch.cuda.memory_reserved`` (bytes currently held
    by PyTorch's caching allocator).  Either number can be 0, e.g. for a device
    that holds no tensors.

    Args:
        device_to_print_stats: device handle, forwarded unchanged to both
            ``torch.cuda`` queries.

    Returns:
        None; the numbers are only printed.
    """
    allocated_bytes = torch.cuda.memory_allocated(device_to_print_stats)
    print(allocated_bytes)
    reserved_bytes = torch.cuda.memory_reserved(device_to_print_stats)
    print(reserved_bytes)

In [7]:
# both allocated and reserved are non-zero because of tensor x on device 0
print_cuda_memory_allocation_for_device(cuda0)

300000256
301989888


In [8]:
# both allocated and reserved are non-zero because of tensor y on device 1
print_cuda_memory_allocation_for_device(cuda1)

300000256
301989888


In [9]:
# check detailed memory stats
# https://pytorch.org/docs/stable/cuda.html#torch.cuda.memory_stats

# device 0 should give non-zero result because x is on device 0
torch.cuda.memory_stats(cuda0)
# device 1 -- should give same result because y is on device 1
# torch.cuda.memory_stats(cuda1)

OrderedDict([('active.all.allocated', 1),
             ('active.all.current', 1),
             ('active.all.freed', 0),
             ('active.all.peak', 1),
             ('active.large_pool.allocated', 1),
             ('active.large_pool.current', 1),
             ('active.large_pool.freed', 0),
             ('active.large_pool.peak', 1),
             ('active.small_pool.allocated', 0),
             ('active.small_pool.current', 0),
             ('active.small_pool.freed', 0),
             ('active.small_pool.peak', 0),
             ('active_bytes.all.allocated', 300000256),
             ('active_bytes.all.current', 300000256),
             ('active_bytes.all.freed', 0),
             ('active_bytes.all.peak', 300000256),
             ('active_bytes.large_pool.allocated', 300000256),
             ('active_bytes.large_pool.current', 300000256),
             ('active_bytes.large_pool.freed', 0),
             ('active_bytes.large_pool.peak', 300000256),
             ('active_bytes.small_p

In [10]:
# https://pytorch.org/docs/stable/cuda.html#torch.cuda.memory_allocated

# device 0 should give non-zero result because x is on device 0
print(torch.cuda.memory_summary(cuda0))
# device 1 -- should give same result because y is on device 1
# print(torch.cuda.memory_summary(cuda1))

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |  292969 KB |  292969 KB |  292969 KB |       0 B  |
|       from large pool |  292969 KB |  292969 KB |  292969 KB |       0 B  |
|       from small pool |       0 KB |       0 KB |       0 KB |       0 B  |
|---------------------------------------------------------------------------|
| Active memory         |  292969 KB |  292969 KB |  292969 KB |       0 B  |
|       from large pool |  292969 KB |  292969 KB |  292969 KB |       0 B  |
|       from small pool |       0 KB |       0 KB |       0 KB |       0 B  |
|---------------------------------------------------------------

## 2.2 Delete tensor on device 0

In [11]:
del x

In [12]:
# allocated is zero because x is deleted from device 0
# reserved is still the same number because we haven't cleared x from device 0 's cache
print_cuda_memory_allocation_for_device(cuda0)

0
301989888


In [13]:
# both allocated and reserved are non-zero because of tensor y on device 1
print_cuda_memory_allocation_for_device(cuda1)

300000256
301989888


In [14]:
# cache is not cleared
!nvidia-smi

Mon Oct 26 03:34:24 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.36.06    Driver Version: 450.36.06    CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           On   | 00000001:00:00.0 Off |                    0 |
| N/A   59C    P0    58W / 149W |    542MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla K80           On   | 00000002:00:00.0 Off |                    0 |
| N/A   45C    P0    71W / 149W |    542MiB / 11441MiB |      0%      Default |
|       

In [15]:
# device 0 -- harder to read see table below
torch.cuda.memory_stats(cuda0)

OrderedDict([('active.all.allocated', 1),
             ('active.all.current', 0),
             ('active.all.freed', 1),
             ('active.all.peak', 1),
             ('active.large_pool.allocated', 1),
             ('active.large_pool.current', 0),
             ('active.large_pool.freed', 1),
             ('active.large_pool.peak', 1),
             ('active.small_pool.allocated', 0),
             ('active.small_pool.current', 0),
             ('active.small_pool.freed', 0),
             ('active.small_pool.peak', 0),
             ('active_bytes.all.allocated', 300000256),
             ('active_bytes.all.current', 0),
             ('active_bytes.all.freed', 300000256),
             ('active_bytes.all.peak', 300000256),
             ('active_bytes.large_pool.allocated', 300000256),
             ('active_bytes.large_pool.current', 0),
             ('active_bytes.large_pool.freed', 300000256),
             ('active_bytes.large_pool.peak', 300000256),
             ('active_bytes.small_p

In [16]:
# device 1 -- result same as before
torch.cuda.memory_stats(cuda1)

OrderedDict([('active.all.allocated', 1),
             ('active.all.current', 1),
             ('active.all.freed', 0),
             ('active.all.peak', 1),
             ('active.large_pool.allocated', 1),
             ('active.large_pool.current', 1),
             ('active.large_pool.freed', 0),
             ('active.large_pool.peak', 1),
             ('active.small_pool.allocated', 0),
             ('active.small_pool.current', 0),
             ('active.small_pool.freed', 0),
             ('active.small_pool.peak', 0),
             ('active_bytes.all.allocated', 300000256),
             ('active_bytes.all.current', 300000256),
             ('active_bytes.all.freed', 0),
             ('active_bytes.all.peak', 300000256),
             ('active_bytes.large_pool.allocated', 300000256),
             ('active_bytes.large_pool.current', 300000256),
             ('active_bytes.large_pool.freed', 0),
             ('active_bytes.large_pool.peak', 300000256),
             ('active_bytes.small_p

In [19]:
# Notice that Cur Usage of Allocated memory, Active memory and Non-releasable memory all dropped to zero;
# the freed bytes now show up under Tot Freed
print(torch.cuda.memory_summary(cuda0))

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |       0 B  |  292969 KB |  292969 KB |  292969 KB |
|       from large pool |       0 B  |  292969 KB |  292969 KB |  292969 KB |
|       from small pool |       0 B  |       0 KB |       0 KB |       0 KB |
|---------------------------------------------------------------------------|
| Active memory         |       0 B  |  292969 KB |  292969 KB |  292969 KB |
|       from large pool |       0 B  |  292969 KB |  292969 KB |  292969 KB |
|       from small pool |       0 B  |       0 KB |       0 KB |       0 KB |
|---------------------------------------------------------------

In [18]:
# device 1 -- same as before
print(torch.cuda.memory_summary(cuda1))

|                  PyTorch CUDA memory summary, device ID 1                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |  292969 KB |  292969 KB |  292969 KB |       0 B  |
|       from large pool |  292969 KB |  292969 KB |  292969 KB |       0 B  |
|       from small pool |       0 KB |       0 KB |       0 KB |       0 B  |
|---------------------------------------------------------------------------|
| Active memory         |  292969 KB |  292969 KB |  292969 KB |       0 B  |
|       from large pool |  292969 KB |  292969 KB |  292969 KB |       0 B  |
|       from small pool |       0 KB |       0 KB |       0 KB |       0 B  |
|---------------------------------------------------------------

## 2.3 Empty cache of device 0

In [20]:
torch.cuda.empty_cache()

In [21]:
# device 0
# allocated is zero because x is deleted from device 0
# reserved is zero because we cleared the cache
print_cuda_memory_allocation_for_device(cuda0)

0
0


In [22]:
# device 1 - same as before, tensor y is still on device 1
print_cuda_memory_allocation_for_device(cuda1)

300000256
301989888


In [23]:
# memory on device 0 does not fall back to the 3MiB of a fresh kernel: 254MiB stay in use.
# PyTorch's allocator holds none of it (memory_reserved is 0 above), so empty_cache() cannot
# return it -- presumably this is the CUDA context created on first use (TODO confirm).
!nvidia-smi

Mon Oct 26 03:35:41 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.36.06    Driver Version: 450.36.06    CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           On   | 00000001:00:00.0 Off |                    0 |
| N/A   61C    P0    58W / 149W |    254MiB / 11441MiB |     21%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla K80           On   | 00000002:00:00.0 Off |                    0 |
| N/A   46C    P0    72W / 149W |    542MiB / 11441MiB |      0%      Default |
|       

In [24]:
# device 0 
torch.cuda.memory_stats(cuda0)

OrderedDict([('active.all.allocated', 1),
             ('active.all.current', 0),
             ('active.all.freed', 1),
             ('active.all.peak', 1),
             ('active.large_pool.allocated', 1),
             ('active.large_pool.current', 0),
             ('active.large_pool.freed', 1),
             ('active.large_pool.peak', 1),
             ('active.small_pool.allocated', 0),
             ('active.small_pool.current', 0),
             ('active.small_pool.freed', 0),
             ('active.small_pool.peak', 0),
             ('active_bytes.all.allocated', 300000256),
             ('active_bytes.all.current', 0),
             ('active_bytes.all.freed', 300000256),
             ('active_bytes.all.peak', 300000256),
             ('active_bytes.large_pool.allocated', 300000256),
             ('active_bytes.large_pool.current', 0),
             ('active_bytes.large_pool.freed', 300000256),
             ('active_bytes.large_pool.peak', 300000256),
             ('active_bytes.small_p

In [25]:
# Notice that GPU reserved memory is cleared after we empty the cache
print(torch.cuda.memory_summary(cuda0))

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |       0 B  |  292969 KB |  292969 KB |  292969 KB |
|       from large pool |       0 B  |  292969 KB |  292969 KB |  292969 KB |
|       from small pool |       0 B  |       0 KB |       0 KB |       0 KB |
|---------------------------------------------------------------------------|
| Active memory         |       0 B  |  292969 KB |  292969 KB |  292969 KB |
|       from large pool |       0 B  |  292969 KB |  292969 KB |  292969 KB |
|       from small pool |       0 B  |       0 KB |       0 KB |       0 KB |
|---------------------------------------------------------------

## 2.4 Put new tensor on device 0

In [26]:
# 300,000,000 int8 zeros (300 MB), created directly on GPU 0.  Passing device= avoids
# first building the same 300 MB tensor in host memory and then copying it over.
z = torch.zeros(300000000, dtype=torch.int8, device=cuda0)

In [27]:
print_cuda_memory_allocation_for_device(cuda0)

300000256
301989888


In [28]:
print_cuda_memory_allocation_for_device(cuda1)

300000256
301989888


In [29]:
# memory on device 0 is 542MiB again: the 254MiB that stayed in use after empty_cache()
# plus the 288MiB (301989888 bytes) that the allocator reserved for z
!nvidia-smi

Mon Oct 26 03:36:55 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.36.06    Driver Version: 450.36.06    CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           On   | 00000001:00:00.0 Off |                    0 |
| N/A   61C    P0    58W / 149W |    542MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla K80           On   | 00000002:00:00.0 Off |                    0 |
| N/A   47C    P0    71W / 149W |    542MiB / 11441MiB |      0%      Default |
|       

In [30]:
# Cur Usage and Peak Usage are the same as when tensor x was first put on device 0;
# Tot Alloc and Tot Freed are cumulative, so they now also count the deleted x
print(torch.cuda.memory_summary(cuda0))

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |  292969 KB |  292969 KB |  585938 KB |  292969 KB |
|       from large pool |  292969 KB |  292969 KB |  585938 KB |  292969 KB |
|       from small pool |       0 KB |       0 KB |       0 KB |       0 KB |
|---------------------------------------------------------------------------|
| Active memory         |  292969 KB |  292969 KB |  585938 KB |  292969 KB |
|       from large pool |  292969 KB |  292969 KB |  585938 KB |  292969 KB |
|       from small pool |       0 KB |       0 KB |       0 KB |       0 KB |
|---------------------------------------------------------------

# 3. What if I don't empty cache? (Restart Kernel)

In [1]:
import torch

In [2]:
# Explicit handles for the two GPUs. torch.device('cuda', 0) is the same device as
# torch.device('cuda:0'), and torch.device('cuda', 1) the same as torch.device('cuda:1').
cuda0 = torch.device('cuda', 0)
cuda1 = torch.device('cuda', 1)

In [3]:
# Place one 300,000,000-element int8 tensor (1 byte per element) on each GPU.
# Passing device=... creates the tensor directly on the GPU instead of building a
# host tensor first and copying it over with .cuda().
x = torch.zeros(300000000, dtype=torch.int8, device=cuda0)
y = torch.zeros(300000000, dtype=torch.int8, device=cuda1)

In [4]:
# Both GPUs show up as 542MiB used in the Memory-Usage column below.
!nvidia-smi

Mon Oct 26 03:46:22 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.36.06    Driver Version: 450.36.06    CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           On   | 00000001:00:00.0 Off |                    0 |
| N/A   60C    P0    58W / 149W |    542MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla K80           On   | 00000002:00:00.0 Off |                    0 |
| N/A   45C    P0    71W / 149W |    542MiB / 11441MiB |     30%      Default |
|       

In [5]:
# Allocator statistics for device 0 right after the first allocation:
# one tensor (292969 KB) is resident and nothing has been freed yet.
print(torch.cuda.memory_summary(device=cuda0))

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |  292969 KB |  292969 KB |  292969 KB |       0 B  |
|       from large pool |  292969 KB |  292969 KB |  292969 KB |       0 B  |
|       from small pool |       0 KB |       0 KB |       0 KB |       0 B  |
|---------------------------------------------------------------------------|
| Active memory         |  292969 KB |  292969 KB |  292969 KB |       0 B  |
|       from large pool |  292969 KB |  292969 KB |  292969 KB |       0 B  |
|       from small pool |       0 KB |       0 KB |       0 KB |       0 B  |
|---------------------------------------------------------------

In [6]:
# Rebind x and y ten times without emptying the cache. Each new tensor is created
# before the name is rebound, so the old and the new tensor briefly coexist.
# device=... allocates directly on the GPU rather than building a host tensor and
# copying it over with .cuda() on every iteration.
for i in range(10):
    x = torch.zeros(300000000, dtype=torch.int8, device=cuda0)
    y = torch.zeros(300000000, dtype=torch.int8, device=cuda1)

In [7]:
# Memory-Usage increased from 542MiB to 830MiB on both GPUs after the loop.
!nvidia-smi

Mon Oct 26 03:46:46 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.36.06    Driver Version: 450.36.06    CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           On   | 00000001:00:00.0 Off |                    0 |
| N/A   61C    P0    58W / 149W |    830MiB / 11441MiB |     29%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla K80           On   | 00000002:00:00.0 Off |                    0 |
| N/A   46C    P0    71W / 149W |    830MiB / 11441MiB |     49%      Default |
|       

In [8]:
# Allocator statistics for device 0 after the 10-iteration loop:
# current usage is still one tensor (292969 KB), peak usage is two tensors (585938 KB),
# and total freed is 2861 MB, i.e. roughly 286 MB per replaced tensor.
print(torch.cuda.memory_summary(device=cuda0))

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |  292969 KB |  585938 KB |    3147 MB |    2861 MB |
|       from large pool |  292969 KB |  585938 KB |    3147 MB |    2861 MB |
|       from small pool |       0 KB |       0 KB |       0 MB |       0 MB |
|---------------------------------------------------------------------------|
| Active memory         |  292969 KB |  585938 KB |    3147 MB |    2861 MB |
|       from large pool |  292969 KB |  585938 KB |    3147 MB |    2861 MB |
|       from small pool |       0 KB |       0 KB |       0 MB |       0 MB |
|---------------------------------------------------------------

In [9]:
# KB in MB
# NOTE(review): the summary's "KB" look like 1024-byte units — the 10 freed tensors
# total 2861 MB above (~286 MB each), which matches 292969 / 1024 rather than / 1000.
# Confirm against the PyTorch docs before relying on this figure.
(292969 / 1000.0)

292.969

In [17]:
# Rebind x and y thirty more times, still without emptying the cache.
# device=... allocates directly on the GPU rather than building a host tensor and
# copying it over with .cuda() on every iteration.
for i in range(30):
    x = torch.zeros(300000000, dtype=torch.int8, device=cuda0)
    y = torch.zeros(300000000, dtype=torch.int8, device=cuda1)

In [18]:
# Memory-Usage remains at 830MiB on both GPUs after the 30 extra iterations.
!nvidia-smi

Mon Oct 26 03:54:22 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.36.06    Driver Version: 450.36.06    CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           On   | 00000001:00:00.0 Off |                    0 |
| N/A   62C    P0    58W / 149W |    830MiB / 11441MiB |     29%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla K80           On   | 00000002:00:00.0 Off |                    0 |
| N/A   47C    P0    71W / 149W |    830MiB / 11441MiB |     49%      Default |
|       

In [19]:
# Allocator statistics for device 0 after the 30 additional iterations:
# current (292969 KB) and peak (585938 KB) usage are unchanged,
# while total freed has grown from 2861 MB to 11444 MB.
print(torch.cuda.memory_summary(device=cuda0))

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |  292969 KB |  585938 KB |   11730 MB |   11444 MB |
|       from large pool |  292969 KB |  585938 KB |   11730 MB |   11444 MB |
|       from small pool |       0 KB |       0 KB |       0 MB |       0 MB |
|---------------------------------------------------------------------------|
| Active memory         |  292969 KB |  585938 KB |   11730 MB |   11444 MB |
|       from large pool |  292969 KB |  585938 KB |   11730 MB |   11444 MB |
|       from small pool |       0 KB |       0 KB |       0 MB |       0 MB |
|---------------------------------------------------------------

In [21]:
# Same 30-iteration reallocation, but call torch.cuda.empty_cache() at the end of
# every iteration.
# device=... allocates directly on the GPU rather than building a host tensor and
# copying it over with .cuda() on every iteration.
for i in range(30):
    x = torch.zeros(300000000, dtype=torch.int8, device=cuda0)
    y = torch.zeros(300000000, dtype=torch.int8, device=cuda1)
    torch.cuda.empty_cache()

In [22]:
# Memory-Usage drops back from 830MiB to 542MiB on both GPUs now that the loop
# calls torch.cuda.empty_cache().
!nvidia-smi

Mon Oct 26 03:55:48 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.36.06    Driver Version: 450.36.06    CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           On   | 00000001:00:00.0 Off |                    0 |
| N/A   62C    P0    58W / 149W |    542MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla K80           On   | 00000002:00:00.0 Off |                    0 |
| N/A   47C    P0    71W / 149W |    542MiB / 11441MiB |      0%      Default |
|       

In [23]:
# Allocator statistics for device 0 after the loop that calls empty_cache():
# current (292969 KB) and peak (585938 KB) usage are unchanged,
# while total freed has grown further, from 11444 MB to 20313 MB.
print(torch.cuda.memory_summary(device=cuda0))

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |  292969 KB |  585938 KB |   20599 MB |   20313 MB |
|       from large pool |  292969 KB |  585938 KB |   20599 MB |   20313 MB |
|       from small pool |       0 KB |       0 KB |       0 MB |       0 MB |
|---------------------------------------------------------------------------|
| Active memory         |  292969 KB |  585938 KB |   20599 MB |   20313 MB |
|       from large pool |  292969 KB |  585938 KB |   20599 MB |   20313 MB |
|       from small pool |       0 KB |       0 KB |       0 MB |       0 MB |
|---------------------------------------------------------------