# PackedSequence & PaddedSequence

In [None]:
import torch
import numpy as np
from torch.nn.utils.rnn import pad_sequence, pack_sequence, pack_padded_sequence, pad_packed_sequence

## 예제 데이터

batch size가 5이고, sequence 중 가장 긴 길이는 13

In [None]:
# Toy corpus: five words/phrases taken from a random word generator
data = [
    'hello world',
    'midnight',
    'calculation',
    'path',
    'short circuit',
]

# Build the vocabulary: '<pad>' goes first so the padding token maps to index 0.
# NOTE: set iteration order is arbitrary, so the order of the remaining
# characters (and therefore their indices) is not guaranteed to be the same
# from one interpreter run to the next.
char_set = ['<pad>', *set(''.join(data))]
char2idx = dict(zip(char_set, range(len(char_set))))  # character -> index lookup table
print('char_set:', char_set)
print('char_set length:', len(char_set))

char_set: ['<pad>', 'm', 'i', 'd', 'l', 'n', 'r', 't', 's', 'a', 'e', ' ', 'p', 'u', 'c', 'h', 'w', 'o', 'g']
char_set length: 19


In [None]:
# Convert character to index and make list of tensors
X = [torch.LongTensor([char2idx[char] for char in seq]) for seq in data]

# Check converted result
for sequence in X: print(sequence)

tensor([15, 10,  4,  4, 17, 11, 16, 17,  6,  4,  3])
tensor([ 1,  2,  3,  5,  2, 18, 15,  7])
tensor([14,  9,  4, 14, 13,  4,  9,  7,  2, 17,  5])
tensor([12,  9,  7, 15])
tensor([ 8, 15, 17,  6,  7, 11, 14,  2,  6, 14, 13,  2,  7])


sequence의 길이가 제각각



In [None]:
# Record the length of each sequence (consumed later by 'pack_padded_sequence')
lengths = list(map(len, X))
print('lengths:', lengths)

lengths: [11, 8, 11, 4, 13]


# Sequence 데이터의 경우 어떻게 batch로 묶을까
Text나 audio처럼 sequence 형식인 데이터의 경우 길이가 각각 다르기 때문에 하나의 batch로 만들어주기 위해서 일반적으로 제일 긴 sequence 길이에 맞춰 뒷부분에 padding을 추가.
※ 위 방식이 일반적으로 많이 쓰이는 padding 방식
하지만 PyTorch에서는 'PackedSequence'를 쓰면 padding 없이도 정확히 필요한 부분까지만 병렬 계산 가능

# `pad_sequence` 함수 -> PaddedSequence (그냥 Tensor) 만들기

In [None]:
# Right-pad every sequence with 0 (the '<pad>' index) up to the longest length,
# producing one tensor of shape (Batch x Maximum_Sequence_Length).
# X itself is left unchanged; the padded result is a new tensor.
padded_sequence = pad_sequence(X, batch_first=True, padding_value=0)
print(padded_sequence)
print(padded_sequence.shape)

tensor([[15, 10,  4,  4, 17, 11, 16, 17,  6,  4,  3,  0,  0],
        [ 1,  2,  3,  5,  2, 18, 15,  7,  0,  0,  0,  0,  0],
        [14,  9,  4, 14, 13,  4,  9,  7,  2, 17,  5,  0,  0],
        [12,  9,  7, 15,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 8, 15, 17,  6,  7, 11, 14,  2,  6, 14, 13,  2,  7]])
torch.Size([5, 13])


# `pack_sequence` 함수를 이용하여 PackedSequence 만들기

input을 길이에 따른 내림차순으로 정렬

In [None]:
# Order the batch from longest to shortest sequence.
# sorted() is stable, so sequences of equal length keep their original relative order.
sorted_idx = sorted(range(len(lengths)), key=lambda i: lengths[i], reverse=True)
sorted_X = [X[i] for i in sorted_idx]

# Show the reordered sequences
for sequence in sorted_X:
    print(sequence)

tensor([ 8, 15, 17,  6,  7, 11, 14,  2,  6, 14, 13,  2,  7])
tensor([15, 10,  4,  4, 17, 11, 16, 17,  6,  4,  3])
tensor([14,  9,  4, 14, 13,  4,  9,  7,  2, 17,  5])
tensor([ 1,  2,  3,  5,  2, 18, 15,  7])
tensor([12,  9,  7, 15])


'pack_sequence'를 이용하여 PackedSequence 만들기

In [None]:
# Pack the length-sorted sequences into a single PackedSequence.
# The input was sorted by decreasing length beforehand, which pack_sequence
# requires by default.
packed_sequence = pack_sequence(sorted_X)
print(packed_sequence)

PackedSequence(data=tensor([ 8, 15, 14,  1, 12, 15, 10,  9,  2,  9, 17,  4,  4,  3,  7,  6,  4, 14,
         5, 15,  7, 17, 13,  2, 11, 11,  4, 18, 14, 16,  9, 15,  2, 17,  7,  7,
         6,  6,  2, 14,  4, 17, 13,  3,  5,  2,  7]), batch_sizes=tensor([5, 5, 5, 5, 4, 4, 4, 4, 3, 3, 3, 1, 1]))


# Embedding 적용
one-hot character embedding

In [None]:
# one-hot embedding using PaddedSequence
# Row i of the identity matrix is the one-hot vector for token index i
eye = torch.eye(len(char_set)) # Identity matrix of shape (len(char_set), len(char_set))
# Indexing with the (batch, max_len) index tensor looks up one row per token;
# padded positions (index 0) receive the one-hot vector of '<pad>', not all zeros
embedded_tensor = eye[padded_sequence]
print(embedded_tensor.shape) # shape: (Batch_size, max_sequence_length, number_of_input_tokens)

torch.Size([5, 13, 19])


In [None]:
# One-hot encode each length-sorted sequence, then pack the embedded sequences.
# The packed data holds one row per real (non-padded) token.
embedded_packed_seq = pack_sequence([eye[seq] for seq in sorted_X])
print(embedded_packed_seq.data.shape)

torch.Size([47, 19])


# RNN 모델

In [None]:
# Vanilla RNN over one-hot character vectors; inputs are given batch-first
rnn = torch.nn.RNN(
    input_size=len(char_set),  # width of each one-hot vector
    hidden_size=30,
    batch_first=True,
)

PaddedSequence -> RNN

In [None]:
# Run the padded one-hot batch through the RNN; the output covers all
# max_seq_length time steps of every sequence, padded positions included
rnn_output, hidden = rnn(embedded_tensor)
print(rnn_output.shape) # shape: (batch_size, max_seq_length, hidden_size)
print(hidden.shape)     # shape: (num_layers * num_directions, batch_size, hidden_size)

torch.Size([5, 13, 30])
torch.Size([1, 5, 30])


PackedSequence -> RNN

In [None]:
# Run the packed batch through the same RNN.
# The output is itself a PackedSequence: its .data has one row per real token.
rnn_output, hidden = rnn(embedded_packed_seq)
print(rnn_output.data.shape)
# hidden is a plain tensor, so its shape can be read directly
print(hidden.shape)

torch.Size([47, 30])
torch.Size([1, 5, 30])


# pad_packed_sequence
'PackedSequence'를 다시 padding이 된 Tensor인 'PaddedSequence'로 되돌려주는 함수


In [None]:
# Convert the PackedSequence back into a padded batch-first tensor; the second
# return value holds the length of each sequence, in the (sorted) batch order
unpacked_sequence, seq_lengths = pad_packed_sequence(embedded_packed_seq, batch_first=True)
print(unpacked_sequence.shape)
print(seq_lengths)

torch.Size([5, 13, 19])
tensor([13, 11, 11,  8,  4])


# pack_padded_sequence
Padding이 된 Tensor인 'PaddedSequence'를 'PackedSequence'로 바꾸어주는 함수

In [None]:
# Pad the length-sorted index sequences into one batch-first tensor, then
# one-hot encode it by looking up rows of the identity matrix
embedded_padded_sequence = eye[pad_sequence(sorted_X, batch_first=True)]
print(embedded_padded_sequence.shape)

torch.Size([5, 13, 19])


padding이 된 Tensor를 PackedSequence로 변환

In [None]:
# The lengths must line up row-for-row with embedded_padded_sequence, which was
# built from sorted_X, so read them off those same sequences
sorted_lengths = [len(seq) for seq in sorted_X]
new_packed_sequence = pack_padded_sequence(
    embedded_padded_sequence, sorted_lengths, batch_first=True
)
print(new_packed_sequence.data.shape)
print(new_packed_sequence.batch_sizes)

torch.Size([47, 19])
tensor([5, 5, 5, 5, 4, 4, 4, 4, 3, 3, 3, 1, 1])
