In [1]:
import gc
import math
import torch
from torch import nn

In [2]:
class ResidualBiGRU(nn.Module):
    """GRU followed by a two-stage feed-forward head and a skip connection.

    The GRU output is widened by ``fc1`` to twice its width, projected back
    down to ``hidden_size`` by ``fc2`` (each projection is followed by
    LayerNorm and ReLU), and the block input is added to the result.

    Args:
        hidden_size: Feature width of both the block input and its output.
        n_layers: Number of stacked layers inside the wrapped ``nn.GRU``.
        bidir: Whether the wrapped GRU runs in both directions.
    """

    def __init__(self, hidden_size, n_layers=1, bidir=True):
        super().__init__()

        self.hidden_size = hidden_size
        self.n_layers = n_layers

        self.gru = nn.GRU(
            input_size=hidden_size,
            hidden_size=hidden_size,
            num_layers=n_layers,
            batch_first=True,
            bidirectional=bidir,
        )

        # A bidirectional GRU concatenates both directions on the feature axis.
        gru_width = hidden_size * (2 if bidir else 1)
        expanded_width = gru_width * 2

        self.fc1 = nn.Linear(gru_width, expanded_width)
        self.ln1 = nn.LayerNorm(expanded_width)
        self.fc2 = nn.Linear(expanded_width, hidden_size)
        self.ln2 = nn.LayerNorm(hidden_size)

    def forward(self, x, h=None):
        """Run the block on a batch-first sequence.

        Args:
            x: Input of shape ``(batch, seq_len, hidden_size)`` (the GRU is
                built with ``batch_first=True``).
            h: Optional initial hidden state forwarded to ``nn.GRU`` as-is;
                ``None`` lets the GRU start from zeros.

        Returns:
            ``(out, new_h)``: ``out`` has the same shape as ``x``; ``new_h``
            is the final hidden state returned by the GRU.
        """
        gru_out, new_h = self.gru(x, h)

        expanded = nn.functional.relu(self.ln1(self.fc1(gru_out)))
        projected = nn.functional.relu(self.ln2(self.fc2(expanded)))

        # Skip connection: valid because fc2 maps back to hidden_size.
        return projected + x, new_h

class MultiResidualBiGRU(nn.Module):
    """Stack of ``ResidualBiGRU`` blocks between an input and an output projection.

    Args:
        input_size: Feature width of the raw input.
        hidden_size: Feature width used by every residual block.
        out_size: Feature width of the per-step output.
        n_layers: Number of ``ResidualBiGRU`` blocks (each wraps a 1-layer GRU).
        bidir: Whether the GRUs inside the blocks are bidirectional.
    """

    def __init__(self, input_size, hidden_size, out_size, n_layers, bidir=True):
        super().__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.out_size = out_size
        self.n_layers = n_layers

        self.fc_in = nn.Linear(input_size, hidden_size)
        self.ln = nn.LayerNorm(hidden_size)

        blocks = []
        for _ in range(n_layers):
            blocks.append(ResidualBiGRU(hidden_size, n_layers=1, bidir=bidir))
        self.res_bigrus = nn.ModuleList(blocks)

        self.fc_out = nn.Linear(hidden_size, out_size)

    def forward(self, x, h=None):
        """Project the input, run it through every residual block, project out.

        Args:
            x: Batch-first input whose last dimension is ``input_size``.
            h: Optional per-block initial hidden states, indexed by block
                position; ``None`` starts every block without a state.

        Returns:
            ``(out, new_h)``: ``out`` has last dimension ``out_size``;
            ``new_h`` is a list with one final hidden state per block.
        """
        initial_states = [None] * self.n_layers if h is None else h

        x = nn.functional.relu(self.ln(self.fc_in(x)))

        new_h = []
        for layer_idx, block in enumerate(self.res_bigrus):
            x, layer_state = block(x, initial_states[layer_idx])
            new_h.append(layer_state)

        return self.fc_out(x), new_h

In [3]:
# Model dimensions; input_size and out_size are reused by the Transformer
# measurement loop further down.
input_size, hidden_size, out_size, n_layers = 10, 64, 2, 5

# Snapshot the bytes held by CUDA tensors, build the model on the GPU, then
# turn the snapshot into the delta attributable to the model's tensors.
allocated_memory_residual_bigru = torch.cuda.memory_allocated()
residual_bigru = MultiResidualBiGRU(
    input_size=input_size,
    hidden_size=hidden_size,
    out_size=out_size,
    n_layers=n_layers,
).to("cuda")
allocated_memory_residual_bigru = (
    torch.cuda.memory_allocated() - allocated_memory_residual_bigru
)
print(f"ResidualBiGRU 모델의 GPU 할당 메모리: {allocated_memory_residual_bigru} bytes")

# Drop the model so its tensors do not count toward later measurements.
del residual_bigru
gc.collect()

ResidualBiGRU 모델의 GPU 할당 메모리: 2009600 bytes


130

In [4]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a sequence-first tensor.

    A table of shape ``(max_len, 1, d_model)`` is precomputed and stored as
    the non-trainable buffer ``pe``: even feature columns hold sines, odd
    columns hold cosines. Dropout is applied after the addition.

    Args:
        d_model: Feature width of the tensors passed to ``forward``. Both
            even and odd values are supported.
        dropout: Dropout probability applied to the encoded output.
        max_len: Number of positions in the precomputed table.
    """

    def __init__(self, d_model, dropout=0.3, max_len=24*60):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # There are ceil(d_model / 2) frequencies but only floor(d_model / 2)
        # odd columns, so drop the last frequency for the cosine half when
        # d_model is odd (a no-op slice when d_model is even).
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # (max_len, d_model) -> (max_len, 1, d_model) so it broadcasts over batch.
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Encode positions along dim 0 of ``x`` and apply dropout.

        Args:
            x: Tensor laid out as ``(seq_len, batch, d_model)``. The table is
                sliced by ``x.size(0)``, so dim 0 must be the position axis;
                batch-first callers have to transpose first. ``seq_len`` is
                expected not to exceed ``max_len``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

class Transformer(nn.Module):
    """Per-step sequence model: input projection, positional encoding, Transformer encoder, output projection.

    The model works on batch-first tensors, i.e. ``(batch, seq_len, features)``.

    Args:
        input_size: Feature width of the raw input.
        out_size: Feature width of the per-step output.
        max_len: Number of positions precomputed by the positional encoding.
        hidden_size: Model width (``d_model``) used inside the encoder.
        n_layers: Number of encoder layers.
        nhead: Number of attention heads per encoder layer.
    """

    def __init__(self, input_size, out_size, max_len, hidden_size, n_layers, nhead):
        super(Transformer, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.out_size = out_size
        self.n_layers = n_layers

        self.fc_in = nn.Linear(input_size, hidden_size)
        self.ln = nn.LayerNorm(hidden_size)

        # Request batch-first layout through the constructor instead of
        # patching the attention module's attribute after the fact.
        encoder_layers = nn.TransformerEncoderLayer(hidden_size, nhead, batch_first=True)

        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, n_layers)

        self.fc_out = nn.Linear(hidden_size, out_size)
        self.pos_encoder = PositionalEncoding(hidden_size, max_len=max_len)

    def forward(self, x):
        """Map a batch of sequences to per-step outputs.

        Args:
            x: Tensor of shape ``(batch, seq_len, input_size)``.

        Returns:
            Tensor of shape ``(batch, seq_len, out_size)``.
        """
        x = self.fc_in(x)
        x = self.ln(x)
        x = nn.functional.relu(x)
        # PositionalEncoding indexes positions along dim 0, while x is
        # batch-first here. Without the transposes the encoding would be
        # selected by batch index and broadcast across all time steps.
        x = self.pos_encoder(x.transpose(0, 1)).transpose(0, 1)
        x = self.transformer_encoder(x)
        x = self.fc_out(x)
        return x

In [5]:
# Transformer configurations to profile. Each row is
# [max_len, hidden_size, n_layers, nhead], the order in which the
# measurement loop unpacks it and passes it to Transformer.
params = [
    [max_len, hidden, layers, heads]
    for max_len, hidden, layers, heads in (
        (5000, 64, 5, 8),
        (24 * 60, 64, 5, 8),
        (24 * 60, 32, 5, 8),
        (24 * 60, 32, 2, 4),
        (24 * 60, 16, 1, 4),
    )
]

In [6]:
# For every configuration: snapshot the bytes held by CUDA tensors, build the
# model on the GPU, report the delta, then drop the model before the next run.
for seq_cap, width, depth, heads in params:
    allocated_memory_transformer = torch.cuda.memory_allocated()
    transformer = Transformer(
        input_size,
        out_size,
        max_len=seq_cap,
        hidden_size=width,
        n_layers=depth,
        nhead=heads,
    ).to("cuda")
    allocated_memory_transformer = (
        torch.cuda.memory_allocated() - allocated_memory_transformer
    )

    print()
    print(f"Transformer 모델의 GPU 할당 메모리: {allocated_memory_transformer} bytes")

    del transformer
    gc.collect()


Transformer 모델의 GPU 할당 메모리: 6917120 bytes

Transformer 모델의 GPU 할당 메모리: 6005760 bytes

Transformer 모델의 GPU 할당 메모리: 2950656 bytes

Transformer 모델의 GPU 할당 메모리: 1293312 bytes

Transformer 모델의 GPU 할당 메모리: 373760 bytes
