In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import gc

## To use a device you should know two things:
- How to move tensors/models to the device
- How to control GPU memory (allocate/free)

In [2]:
# Choose the device: if "cuda:0" is printed, the environment is ready to train with CUDA.
device_name = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print(device)

cuda:0


In [3]:
# Show how much GPU memory is currently allocated on device 0, in GB.
allocated_gb = torch.cuda.memory_allocated(0) / 1024 / 1024 / 1024
print("torch.cuda.memory_allocated: %fGB" % allocated_gb)

torch.cuda.memory_allocated: 0.000000GB


In [4]:
# Next, let's see how to run a computation on the device.
# torch.tensor(..., device=device) places each tensor on the target device
# without a separate .to() call; dtype=torch.float32 keeps the same float
# values the legacy torch.Tensor constructor produced.
a = torch.tensor([1, 2, 3], dtype=torch.float32, device=device)
b = torch.tensor([2, 3, 4], dtype=torch.float32, device=device)
c = a @ b  # dot product of the two 1-D tensors

print(a, b, c)

tensor([1., 2., 3.], device='cuda:0') tensor([2., 3., 4.], device='cuda:0') tensor(20., device='cuda:0')


In [5]:
# Allocated GPU memory (GB) on device 0 after the computation above.
allocated_gb = torch.cuda.memory_allocated(0) / 1024 / 1024 / 1024
print("torch.cuda.memory_allocated: %fGB" % allocated_gb)

torch.cuda.memory_allocated: 0.007936GB


### You just move the tensors to the device, and all computations automatically take place on the GPU

In [6]:
class FCN(nn.Module):
  """Fully connected network: an input layer, a chain of hidden layers, and an output layer.

  Args:
    n_input: number of input features.
    n_output: number of output features.
    n_hidden: hidden widths; each consecutive pair of entries defines one hidden Linear layer.
    activation_fc: callable applied after the input layer and after every hidden layer
      (it is not applied to the output layer).
  """

  def __init__(self, n_input, n_output, n_hidden=(32, 32), activation_fc=F.relu):
    super().__init__()
    self.input_layer = nn.Linear(n_input, n_hidden[0])
    # One Linear per consecutive (in_width, out_width) pair of hidden sizes.
    width_pairs = zip(n_hidden[:-1], n_hidden[1:])
    self.hidden_layers = nn.ModuleList([nn.Linear(w_in, w_out) for w_in, w_out in width_pairs])
    self.output_layer = nn.Linear(n_hidden[-1], n_output)
    self.activation_fc = activation_fc

  def forward(self, x):
    """Pass x through all layers and return the un-activated output of the last layer."""
    out = self.activation_fc(self.input_layer(x))
    for layer in self.hidden_layers:
      out = self.activation_fc(layer(out))
    return self.output_layer(out)

In [7]:
# The training data has to be on the same device as the model.
# (The name `train_lables` is kept as spelled because a later cell releases it by that name.)
train_features = torch.tensor([1, 2, 3], dtype=torch.float32, device=device)
train_lables = torch.tensor([2, 3], dtype=torch.float32, device=device)

net = FCN(3, 2).to(device=device)
criterion = nn.MSELoss()
optimizer = optim.SGD(net.parameters(), lr=0.01)

# Make one learning step.
optimizer.zero_grad()  # backward() accumulates into .grad, so clear old gradients first
loss = criterion(net(train_features), train_lables)  # loss modules are called as (input, target)
loss.backward()
optimizer.step()

In [8]:
# Allocated GPU memory (GB) on device 0 after the training step.
allocated_gb = torch.cuda.memory_allocated(0) / 1024 / 1024 / 1024
print("torch.cuda.memory_allocated: %fGB" % allocated_gb)

torch.cuda.memory_allocated: 0.015884GB


#### Here ^^^ you can see that you also need to move the training data to the device

# *PyTorch cleans up memory automatically, so usually you do not need to think about memory management*
 But there is a guide, below this cell, on how to control memory

## Next I'll show how to free memory; this is important, because your training process will break if your GPU runs out of memory
#### There are two ways to clean GPU memory:
 - delete variables (a crude way to release memory (it relies on PyTorch's gc feature))
 - use empty_cache (the natural method to release memory)

In [9]:
# Let's look at the size of the allocated memory (GB) before freeing anything.
allocated_gb = torch.cuda.memory_allocated(0) / 1024 / 1024 / 1024
print("torch.cuda.memory_allocated: %fGB" % allocated_gb)

torch.cuda.memory_allocated: 0.015884GB


In [10]:
# First way to free memory: delete the variables that reference the tensors.
del a, b, c
allocated_gb = torch.cuda.memory_allocated(0) / 1024 / 1024 / 1024
print("torch.cuda.memory_allocated: %fGB" % allocated_gb)

torch.cuda.memory_allocated: 0.015883GB


In [11]:
# Second way: drop every reference to the GPU objects, then empty the cache.
train_features, train_lables = None, None
net = None
loss = None
optimizer = None

# Run the Python garbage collector first so that objects kept alive only by
# reference cycles are released before the cache is emptied.
gc.collect()
# empty_cache() releases unused cached memory held by PyTorch's allocator; that
# is reported by torch.cuda.memory_reserved(). memory_allocated() below counts
# only the memory occupied by live tensors.
torch.cuda.empty_cache()
print("torch.cuda.memory_allocated: %fGB"%(torch.cuda.memory_allocated(0)/1024/1024/1024))

torch.cuda.memory_allocated: 0.015869GB


#### Not all memory was released (apparently torch keeps this memory in order to reuse it for the same purposes), 
#### because if we initialize the model again, we won't see the memory usage double

In [13]:
# Build the model again, move it to the device, and report the allocated memory (GB).
net = FCN(n_input=3, n_output=2)
net = net.to(device=device)
allocated_gb = torch.cuda.memory_allocated(0) / 1024 / 1024 / 1024
print("torch.cuda.memory_allocated: %fGB" % allocated_gb)

torch.cuda.memory_allocated: 0.015875GB
