In [2]:
import transformers
import torch
import torch.nn as nn

In [None]:
# Reference layer: a stock fully connected layer mapping 10 inputs to 5 outputs.
layer = nn.Linear(in_features=10, out_features=5)

In [None]:
# Show the weight's dimensions as a torch.Size.
layer.weight.size()

In [4]:
from torch import Tensor
from torch.nn.parameter import Parameter
from torch.nn.modules.module import Module
import math
from torch.nn import init, functional as F

class ParallelLinear(Module):
    """Linear layer whose weight matrix is split column-wise over two devices.

    The first ``in_features // 2`` input features are multiplied by
    ``left_weight`` and the remaining ones by ``right_weight``; the two
    partial products are moved to a common device, summed, and the bias
    (if any) is added once at the end.

    Device placement at construction time:
        * two or more CUDA devices: ``left_weight`` on ``cuda:1``,
          ``right_weight`` on ``cuda:0``;
        * exactly one CUDA device: both halves on ``cuda:0``;
        * no CUDA device: both halves on the CPU.
    The bias is always created on the CPU.

    Args:
        in_features: size of the last dimension of the input.
        out_features: size of the last dimension of the output.
        bias: if ``False`` the layer has no additive bias.
        dtype: dtype of the created parameters (``None`` uses the default).
    """
    __constants__ = ['in_features', 'out_features']
    in_features: int
    out_features: int
    left_weight: Tensor
    right_weight: Tensor

    def __init__(self, in_features: int, out_features: int, bias: bool = True,
                 dtype=None) -> None:
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features

        left_device, right_device = self._pick_devices()
        # The left half gets the floor of the split, the right half the rest,
        # so odd widths put the extra column on the right.
        split = in_features // 2

        self.left_weight = Parameter(
            torch.empty((out_features, split), device=left_device, dtype=dtype))
        self.right_weight = Parameter(
            torch.empty((out_features, in_features - split), device=right_device, dtype=dtype))

        if bias:
            self.bias = Parameter(
                torch.empty(out_features, device=torch.device("cpu"), dtype=dtype))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()

    @staticmethod
    def _pick_devices():
        """Return ``(left_device, right_device)`` for the machine's CUDA device count."""
        gpu_count = torch.cuda.device_count()
        if gpu_count >= 2:
            return torch.device("cuda:1"), torch.device("cuda:0")
        if gpu_count == 1:
            single = torch.device("cuda:0")
            return single, single
        cpu = torch.device("cpu")
        return cpu, cpu

    def reset_parameters(self) -> None:
        """Draw both weight halves and the bias from U(-1/sqrt(in_features), 1/sqrt(in_features)).

        kaiming_uniform_ with a=sqrt(5) reduces to uniform(-1/sqrt(fan_in),
        1/sqrt(fan_in)); see https://github.com/pytorch/pytorch/issues/57109.
        The bound is computed from the full ``in_features`` rather than from
        the width of one half, because the two halves together form a single
        weight matrix with ``in_features`` columns.
        """
        bound = 1 / math.sqrt(self.in_features) if self.in_features > 0 else 0
        init.uniform_(self.left_weight, -bound, bound)
        init.uniform_(self.right_weight, -bound, bound)
        if self.bias is not None:
            init.uniform_(self.bias, -bound, bound)

    def forward(self, input: Tensor) -> Tensor:
        """Apply the split linear map to ``input``.

        Args:
            input: tensor whose last dimension has ``in_features`` entries;
                any number of leading dimensions (including none) is accepted.

        Returns:
            Tensor with the last dimension replaced by ``out_features``. It
            lives on the bias's device, or on ``input``'s device when the
            layer has no bias.
        """
        split = self.in_features // 2
        gather_device = self.bias.device if self.bias is not None else input.device

        # Slice the feature axis (always the last one) and ship each slice to
        # wherever the matching weight half currently lives.
        left_input = input[..., :split].to(self.left_weight.device)
        right_input = input[..., split:].to(self.right_weight.device)

        left_product = F.linear(left_input, self.left_weight)
        right_product = F.linear(right_input, self.right_weight)

        output = left_product.to(gather_device) + right_product.to(gather_device)
        if self.bias is not None:
            output = output + self.bias
        return output

    def extra_repr(self) -> str:
        return 'in_features={}, out_features={}, bias={}'.format(
            self.in_features, self.out_features, self.bias is not None
        )

In [None]:
# Small split layer for interactive inspection: 10 inputs -> 100 outputs.
plinear = ParallelLinear(in_features=10, out_features=100)

In [None]:
# Create a small batch whose width matches the layer, so `data` exists when
# this cell is run on a fresh kernel instead of leaking in from another cell.
data = torch.randn(4, plinear.in_features)
plinear.left_weight.shape, plinear.right_weight.shape, data.shape

In [None]:
# Use a batch sized for this layer and call the module itself (not .forward())
# so that any registered hooks run as well.
probe_batch = torch.randn(4, plinear.in_features)
plinear(probe_batch).shape

In [3]:
from torch.utils.data import Dataset, DataLoader
import time 

# Experiment settings used by the dataset, the loader and the training loop.
input_size, output_size = 768, 10        # width of each input row / each target row
batch_size, data_size = 10_000, 100_000  # rows per batch / rows in the dataset
learning_rate = 1e-3                     # step size handed to SGD
epoch = 10                               # number of passes over the dataset

class RandomDataset(Dataset):
    """In-memory dataset of random inputs paired with random targets.

    Both tensors are drawn once with ``torch.randn`` at construction time.

    Args:
        size: number of features in each input row.
        length: number of rows in the dataset.
        target_size: number of values in each target row. When omitted, the
            notebook-level ``output_size`` setting is used, which keeps the
            original two-argument call working unchanged.
    """

    def __init__(self, size, length, target_size=None):
        if target_size is None:
            # Fall back to the module-level configuration value.
            target_size = output_size
        self.len = length
        self.data = torch.randn(length, size)
        self.output = torch.randn(length, target_size)

    def __getitem__(self, index):
        """Return the ``(input, target)`` pair stored at ``index``."""
        return self.data[index], self.output[index]

    def __len__(self):
        """Return the number of rows in the dataset."""
        return self.len

# Serve the random dataset in fixed-order mini-batches.
rand_loader = DataLoader(
    RandomDataset(input_size, data_size),
    batch_size=batch_size,
    shuffle=False,
)

class ParallelNet(nn.Module):
    """Six fully connected ``ParallelLinear`` layers with tanh activations.

    Layer widths run 768 -> 100 -> 1000 -> 100000 -> 10000 -> 1000 -> 10.
    The final activations are viewed as rows of 10 values and passed through
    a softmax over dim 1.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = ParallelLinear(768, 100)
        self.fc2 = ParallelLinear(100, 1000)
        self.fc3 = ParallelLinear(1000, 100000)
        self.fc4 = ParallelLinear(100000, 10000)
        self.fc5 = ParallelLinear(10000, 1000)
        self.fc6 = ParallelLinear(1000, 10)

    def forward(self, x):
        """Run ``x`` through all six layers and return the softmax output."""
        hidden = x
        for fc in (self.fc1, self.fc2, self.fc3, self.fc4, self.fc5, self.fc6):
            hidden = F.tanh(fc(hidden))
        return F.softmax(hidden.view(-1, 10), dim=1)
  
model = ParallelNet()

criterion = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

# 1-based count of optimisation steps across all epochs (printed in the log).
step = 1
start = time.time()
# `epoch` holds the configured number of passes; a separate loop variable keeps
# that setting from being overwritten while the loop runs.
for epoch_idx in range(epoch):
    for data, target in rand_loader:
        # Batches are passed as loaded; ParallelLinear moves its input slices
        # to the devices it needs inside forward().
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        print("Train Step : {}\tLoss : {:3f}".format(step, loss.item()))
        step += 1
end = time.time()
print("Time taken : {}".format(end - start))
torch.cuda.empty_cache()

Train Step : 1	Loss : 1.006647
Train Step : 2	Loss : 1.013926
Train Step : 3	Loss : 1.009988
Train Step : 4	Loss : 1.013408
Train Step : 5	Loss : 1.012748
Train Step : 6	Loss : 1.009166


KeyboardInterrupt: 

: 

In [5]:
from torch.utils.data import Dataset, DataLoader
import time 

# Experiment settings for the single-device baseline run.
input_size, output_size = 1, 10          # width of each input row / each target row
batch_size, data_size = 10_000, 100_000  # rows per batch / rows in the dataset
learning_rate = 1e-3                     # step size handed to SGD
epoch = 10                               # number of passes over the dataset

class RandomDataset(Dataset):
    """In-memory dataset of random inputs paired with random targets.

    Both tensors are drawn once with ``torch.randn`` at construction time.

    Args:
        size: number of features in each input row.
        length: number of rows in the dataset.
        target_size: number of values in each target row. When omitted, the
            notebook-level ``output_size`` setting is used, which keeps the
            original two-argument call working unchanged.
    """

    def __init__(self, size, length, target_size=None):
        if target_size is None:
            # Fall back to the module-level configuration value.
            target_size = output_size
        self.len = length
        self.data = torch.randn(length, size)
        self.output = torch.randn(length, target_size)

    def __getitem__(self, index):
        """Return the ``(input, target)`` pair stored at ``index``."""
        return self.data[index], self.output[index]

    def __len__(self):
        """Return the number of rows in the dataset."""
        return self.len

# Serve the random dataset in fixed-order mini-batches.
rand_loader = DataLoader(
    RandomDataset(input_size, data_size),
    batch_size=batch_size,
    shuffle=False,
)

class MyNet(nn.Module):
    """Baseline network: six ``nn.Linear`` layers with tanh activations.

    Layer widths run 1 -> 100 -> 1000 -> 100000 -> 10000 -> 1000 -> 10.
    The final activations are viewed as rows of 10 values and passed through
    a softmax over dim 1.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(1, 100)
        self.fc2 = nn.Linear(100, 1000)
        self.fc3 = nn.Linear(1000, 100000)
        self.fc4 = nn.Linear(100000, 10000)
        self.fc5 = nn.Linear(10000, 1000)
        self.fc6 = nn.Linear(1000, 10)

    def forward(self, x):
        """Run ``x`` through all six layers and return the softmax output."""
        hidden = x
        for fc in (self.fc1, self.fc2, self.fc3, self.fc4, self.fc5, self.fc6):
            hidden = F.tanh(fc(hidden))
        return F.softmax(hidden.view(-1, 10), dim=1)

# Use the first GPU when CUDA is available; otherwise fall back to the CPU so
# the cell still runs on a machine without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Module.to() moves the parameters in place and returns the module itself.
model = MyNet().to(device)


criterion = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

# 1-based count of optimisation steps across all epochs (printed in the log).
step = 1
start = time.time()
# `epoch` holds the configured number of passes; a separate loop variable keeps
# that setting from being overwritten while the loop runs.
for epoch_idx in range(epoch):
    for data, target in rand_loader:
        data = data.to(device)
        target = target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        print("Train Step : {}\tLoss : {:3f}".format(step, loss.item()))
        step += 1
end = time.time()
print("Time taken : {}".format(end - start))
torch.cuda.empty_cache()

OutOfMemoryError: CUDA out of memory. Tried to allocate 3.73 GiB (GPU 0; 23.69 GiB total capacity; 19.54 GiB already allocated; 138.06 MiB free; 23.25 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# Split layer with the same 10 -> 5 dimensions as the reference nn.Linear layer.
parallelLinear = ParallelLinear(in_features=10, out_features=5)

In [None]:
# A single sample with an explicit batch dimension, i.e. shape (1, 10).
my_tensor = torch.tensor([[1.0, 1.0, 1.2, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 2.1]])

# Call the module itself rather than .forward() so registered hooks run too.
parallelLinear(my_tensor)

In [None]:
# Call the module itself rather than .forward() so registered hooks run too.
layer(my_tensor)

In [None]:
# Open question: when should the bias be added to the tensor?

In [None]:
# Three values arranged as a column, shape (3, 1).
sample_tensor = torch.tensor([0.0, 0.1, 0.2])[:, None]

In [None]:
# Display the column tensor as the cell output.
sample_tensor