In [1]:
import random
import math
import torch
import torch.nn as nn
import torch.optim as optim

from ModelParallel import ModelParallel, get_device_free_memory

import time

In [2]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda', index=0)

In [3]:
devices = [torch.device('cuda:' + str(i)) for i in range(4)]

In [4]:
batch_size = 32

In [5]:
X = [random.random() * 10 for i in range(1000)]

In [6]:
y = [math.exp(x) for x in X]

In [7]:
X = torch.tensor(X,dtype=torch.float).unsqueeze(-1)

In [8]:
y = torch.tensor(y,dtype=torch.float).unsqueeze(-1)

# SIGNLE-GPU

In [9]:
class ModelS(ModelParallel):
  def __init__(self):
    super(ModelS, self).__init__()
    self.layer1 = nn.Linear(1, 10200,bias=False)
    self.layer2 = nn.Linear(10200, 10200,bias=False)
    self.layer3 = nn.Linear(10200, 10200,bias=False)
    self.layer4 = nn.Linear(10200, 10200,bias=False)
    self.layer5 = nn.Linear(10200, 10200,bias=False)
    self.layer6 = nn.Linear(10200, 10200,bias=False)
    self.layerLast = nn.Linear(10200, 1,bias=False)

  def forward(self, x):
    x = self.layer1(x)
    x = self.layer2(x)
    x = self.layer3(x)
    x = self.layer4(x)
    x = self.layer5(x)
    x = self.layer6(x)
    x = self.layerLast(x)
    return x

In [10]:
model = ModelS()
model.to(device)

ModelS(
  (layer1): Linear(in_features=1, out_features=10200, bias=False)
  (layer2): Linear(in_features=10200, out_features=10200, bias=False)
  (layer3): Linear(in_features=10200, out_features=10200, bias=False)
  (layer4): Linear(in_features=10200, out_features=10200, bias=False)
  (layer5): Linear(in_features=10200, out_features=10200, bias=False)
  (layer6): Linear(in_features=10200, out_features=10200, bias=False)
  (layerLast): Linear(in_features=10200, out_features=1, bias=False)
)

In [11]:
for dev in devices:
  print(dev, get_device_free_memory(dev))

cuda:0 9640623616
cuda:1 11721506816
cuda:2 11721506816
cuda:3 11721506816


In [12]:
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [13]:
model.train()

t1 = time.time()

nb_batch = 0

for epoch in range(1):
  mae = 0
  nb_batch = 0
  for idx in range(0, len(X) - batch_size, batch_size):
    features = X[idx:idx+batch_size,:].to(device)
    #targets = torch.tensor(y[idx:idx+batch_size],dtype=torch.float).to(device)

    pred = model(features)

    targets = y[idx:idx+batch_size,:].to(pred.device)

    loss = criterion(pred, targets)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    mae += loss.item()
    nb_batch += 1

  print("epoch {} - mae {}".format(epoch, mae / nb_batch))

t2 = time.time()
(t2 - t1) / nb_batch

epoch 0 - mae 32256041710062.13


0.1815309909082228

# MULTI-GPU

In [14]:
class ModelP(ModelParallel):
  def __init__(self):
    super(ModelP, self).__init__()
    self.layer1 = self.mp_m(nn.Linear(1, 10200,bias=False), 0)
    self.layer2 = self.mp_m(nn.Linear(10200, 10200,bias=False), 1)
    self.layer3 = self.mp_m(nn.Linear(10200, 10200,bias=False), 0.5)
    self.layer4 = self.mp_m(nn.Linear(10200, 10200,bias=False), 0.5)
    self.layer5 = self.mp_m(nn.Linear(10200, 10200,bias=False), 1)
    self.layer6 = self.mp_m(nn.Linear(10200, 10200,bias=False), 1)
    self.layerLast = self.mp_m(nn.Linear(10200, 1,bias=False), 0)

  def forward(self, x):
    x = self.mp_f(self.layer1, x)
    x = self.mp_f(self.layer2, x)
    x = self.mp_f(self.layer3, x)
    x = self.mp_f(self.layer4, x)
    x = self.mp_f(self.layer5, x)
    x = self.mp_f(self.layer6, x)
    x = self.mp_f(self.layerLast, x)
    return x

In [15]:
model = ModelP()
model.to_devices(devices)

In [16]:
for dev in devices:
  print(dev, get_device_free_memory(dev))

cuda:0 3397971968
cuda:1 11305305600
cuda:2 10889186304
cuda:3 10889145344


In [17]:
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [18]:
model.train()

t1 = time.time()

nb_batch = 0

for epoch in range(1):
  mae = 0
  nb_batch = 0
  for idx in range(0, len(X) - batch_size, batch_size):
    features = X[idx:idx+batch_size,:]
    #targets = torch.tensor(y[idx:idx+batch_size],dtype=torch.float).to(device)

    pred = model(features)

    targets = y[idx:idx+batch_size,:].to(pred.device)

    loss = criterion(pred, targets)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    mae += loss.item()
    nb_batch += 1

  print("epoch {} - mae {}".format(epoch, mae / nb_batch))

t2 = time.time()
(t2 - t1) / nb_batch

epoch 0 - mae 9300210459076.355


0.12062265796046104

In [19]:
for dev in devices:
  print(dev, get_device_free_memory(dev))

cuda:0 7559740416
cuda:1 10056700416
cuda:2 8392224768
cuda:3 8392060928
