# Demo: Creating Tensors on CUDA-enabled devices 

In [1]:
import torch

In [2]:
# Check whether the cuda is available
torch.cuda.is_available()

True

In [3]:
# In order to initialize the CUDA state for PyTorch, you can call torch.cuda.init(). 
# This is required when interacting with PyTorch's C API. 
# When you are working with Python, the CUDA state is initilized on demand, so it's not really needed here in this case.

torch.cuda.init()

In [4]:
# At any point in time, when you're working with PyTorch, torch.cuda keeps track of the currently selected GPU and 
# all CUDA tensors that you allocate will by default, be created on that device.

# the gpus with indexed positions, it will return index of the current device
torch.cuda.current_device()

0

In [5]:
# Number of CUDA emabled devices available for PyTorch to use by running torch.cuda.device_count().

torch.cuda.device_count()

1

In [6]:
# If you want to use PyTorch to monitor how much memory your tensors occupy, you can call torch.cuda.memory_allocated().

torch.cuda.memory_allocated()

0

In [7]:
# Behind the scenes PyTorch uses a caching memeroy allocator to speed up memory allocations to your tensors - 
# this allows fast memory deallocation without device synchronizations between your different CUDA devices.

# FutureWarning: torch.cuda.memory_cached has been renamed to torch.cuda.memory_reserved
torch.cuda.memory_reserved()

0

In [8]:
# "cuda" refers to the default CUDA device used by PyTorch (on which tensors will be created) - 
# this is something that can be changed using the device context manager.

cuda = torch.device("cuda")

cuda

device(type='cuda')

In [9]:
# If you want to access to a specific CUDA device using the device's context manager, you will reference it using an index.

# but we should have only one cuda device which is a dedicated GPU of your laptop.
# So, cuda1 and cuda2 are not really valid.
# The default cuda device is at index 0.
cuda0 = torch.device("cuda:0")
cuda1 = torch.device("cuda:1")
cuda2 = torch.device("cuda:2")

display(cuda0)
display(cuda1)
display(cuda2)

device(type='cuda', index=0)

device(type='cuda', index=1)

device(type='cuda', index=2)

In [10]:
# When you create a torch tensor and you haven't specified a CUDA device, this tensor, by default, is created on the CPU.

x = torch.tensor([10., 20.])

# x is a tensor created on CPU, because it has not CUDA device associated with it.
x

tensor([10., 20.])

In [11]:
# If you want to create a tensor on GPU, you need to explicitly specify the device parameter, device equal to cuda
# (meaning that it is going to use defauly cuda device)
x_default = torch.tensor([10., 20.], device=cuda) # device=cuda means device="cuda:0"

x_default

tensor([10., 20.], device='cuda:0')

In [12]:
x0 = torch.tensor([10.0, 20.0], device=cuda0)

x0

tensor([10., 20.], device='cuda:0')

In [13]:
torch.cuda.memory_allocated()

1024

In [14]:
torch.cuda.memory_reserved()

2097152

In [15]:
# Let's create another tensor, x1, explicitly on the device cuda1.

# will result in an error, since there is no 2nd GPU
x1 = torch.tensor([10.0, 20.0], device=cuda1)

x1

RuntimeError: CUDA error: invalid device ordinal
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [16]:
# The same happens for the 3rd cuda device we obtained a reference to.

x2 = torch.tensor([10.0, 20.0], device=cuda2)

RuntimeError: CUDA error: invalid device ordinal
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

# Demo: Working with the Device Context Manager

In [17]:
# If you want to create a copy of an object in CUDA memory, you can call the cuda() function
# If the tensor is already in CUDA memory and on the correct device, no copy is performed.

# Let's copy our CPU tensor, x, to CUDA memory (to the default CUDA device).
y = x.cuda()

y

tensor([10., 20.], device='cuda:0')

In [18]:
# now create a copy of x1 (which was actually created on cuda:1) on default device cuda:0
y0 = x1.cuda() # since we do not have second GPU, there is no tensor created on, so this operation will fail

y0

NameError: name 'x1' is not defined

In [20]:
# You can use a with context in order to change the default device for PyTorch, using the device context manager.

print("Outside with context: ", torch.cuda.current_device())

# Only within this "with" block, the device context is 1, outside it is 0.
with torch.cuda.device(1): # there is no device 1, so error will occur
    print("Inside with context: ", torch.cuda.current_device())
    
print("Outside with context: ", torch.cuda.current_device())

Outside with context:  0


RuntimeError: CUDA error: invalid device ordinal
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [22]:
with torch.cuda.device(1):
    # when we explicitly specify a device within a with context the tensor is created on the specified device.
    # since we have not specified a cuda device here, tensor a will be created on CPU.
    a = torch.tensor([10.0, 20.0])
    # here, we explicitly create it on cuda:0, so, it will not be in cuda:1 as we specifed by context manager
    a0 = torch.tensor([10.0, 20.0], device=cuda0)
    # Here it will be created on default device, which is actually the device we set using context manager, cuda:1,
    # so, the default for the current with context is cuda:1
    a1 = torch.tensor([10.0, 20.0], device=cuda)
    


RuntimeError: CUDA error: invalid device ordinal
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [23]:
# If you want to explicitly move a tensor from one GPU to another, you have to perform a copy using the cuda() function that
# we saw earlier or the to() function that we are going to see now.

# torch.Tensor.to() function is used to perform tensor data type or device conversion.

# a1 was originally on cuda0, b1 has been created on cuda1.
b1 = a0.to(device=cuda1)

b1

NameError: name 'a0' is not defined

In [24]:
# let's create an example for copying between CPU and GPU

# tensor on CPU
my_cpu_tensor = torch.tensor([10.0, 20.0])
display(my_cpu_tensor)

# create a copy on device cuda:0. cuda refers to cuda:0.
my_gpu0_tensor= my_cpu_tensor.to(device=cuda)
display(my_gpu0_tensor)

tensor([10., 20.])

tensor([10., 20.], device='cuda:0')

In [26]:
# What if we tried to perform an operation on tensors located on two different devices?
# a is on cpu, a0 is in cuda:0, this will fail
# sum_a = a + a0
# a1 is on cuda:1, a0 is on cuda:0, this will fail as well
# sum_a = a1 + a0
# this should work since a1 and x1 are in cuda:1
# the resulting tensor will be on the same device as the original tensors on which the operation is was performed
# sum_ax = a1 + x1

sum_cpu_gpu0 = my_cpu_tensor + my_gpu0_tensor

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!

In [27]:
torch.cuda.memory_allocated()

2048

In [28]:
torch.cuda.memory_reserved()

2097152

In [29]:
# torch.cuda.empty_cache() will allow you to free up all of the unused cache memory within your caching allocator.

torch.cuda.empty_cache()

In [30]:
# if the cache memory shows the same value, this means we do not have any unused memory in our cache.
torch.cuda.memory_reserved()

2097152

In [33]:
# If you want to create a new tensor on the same device, you can use PyTorch operations prefixed with the new_ .
# Operations prefixed with "new_" create new tensors of the same type on the same device.

display(x)

# create a new 2x2 tensor based on x, x's data type and x's device, and fill the tensor with 1.1
# x was on CPU, the new tensor will be on cpu as well
preserve_context = x.new_full([2, 2], fill_value=1.1)
display(preserve_context)

tensor([10., 20.])

tensor([[1.1000, 1.1000],
        [1.1000, 1.1000]])

In [35]:
# Now let's do it for cuda:0 device
display(x0)

# create a new tensor using x0, x0's data type and x0's device.
# x0 was on cuda:0, the new tensor will on the cuda:0 as well
preserve_context0 = x.new_full([2, 2], fill_value=1.1)
display(preserve_context0)

tensor([10., 20.], device='cuda:0')

tensor([[1.1000, 1.1000],
        [1.1000, 1.1000]])