In [3]:
import torch
import numpy as np
from torch.nn.utils.rnn import pad_sequence, pack_sequence, pack_padded_sequence, pad_packed_sequence

In [4]:
# Toy corpus: five strings of different lengths, used to build variable-length batches.
data = [
    'hello world',
    'midnight',
    'calculation',
    'path',
    'short circuit',
]

# Vocabulary: the "<pad>" token comes first (so it gets index 0), followed by every
# distinct character that appears in the corpus.
# NOTE(review): the characters come out of a set, so their order -- and therefore
# the indices assigned below -- is not guaranteed to be the same after a kernel restart.
char_set = ["<pad>"] + list(set("".join(data)))

# Reverse lookup: character -> its position in char_set.
char2idx = dict(zip(char_set, range(len(char_set))))

print("char_set: ", char_set)
print("char_set length:", len(char_set))

char_set:  ['<pad>', 'd', 'g', 'o', 'l', 'r', 's', 'h', 'u', 'm', 'e', ' ', 'n', 'c', 'p', 'a', 'w', 'i', 't']
char_set length: 19


In [5]:
# Encode every string as a 1-D int64 tensor of character indices (one tensor per string).
X = [
    torch.tensor([char2idx[ch] for ch in text], dtype=torch.long)
    for text in data
]

# Show each encoded sequence on its own line.
print(*X, sep="\n")

tensor([ 7, 10,  4,  4,  3, 11, 16,  3,  5,  4,  1])
tensor([ 9, 17,  1, 12, 17,  2,  7, 18])
tensor([13, 15,  4, 13,  8,  4, 15, 18, 17,  3, 12])
tensor([14, 15, 18,  7])
tensor([ 6,  7,  3,  5, 18, 11, 13, 17,  5, 13,  8, 17, 18])


In [7]:
# Number of characters in each encoded sequence, in the same order as X.
lengths = list(map(len, X))
print("lengths : ", lengths)

lengths :  [11, 8, 11, 4, 13]


## pad_sequence 이용해서 PaddedSequence 만들기

In [8]:
# Stack the variable-length sequences into a single (batch, max_len) tensor.
# Shorter rows are right-filled with pad_sequence's default padding value 0,
# which is also the index of "<pad>" in char_set.
padded_sequence = pad_sequence(X, batch_first=True)
print(padded_sequence, padded_sequence.shape, sep="\n")

tensor([[ 7, 10,  4,  4,  3, 11, 16,  3,  5,  4,  1,  0,  0],
        [ 9, 17,  1, 12, 17,  2,  7, 18,  0,  0,  0,  0,  0],
        [13, 15,  4, 13,  8,  4, 15, 18, 17,  3, 12,  0,  0],
        [14, 15, 18,  7,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 6,  7,  3,  5, 18, 11, 13, 17,  5, 13,  8, 17, 18]])
torch.Size([5, 13])


In [10]:
## pack_sequence 이용해서 PackedSequence 만들기

In [9]:
# Positions of X ordered from the longest sequence to the shortest.
# Python's sort is stable, so sequences of equal length keep their original relative order.
sorted_idx = sorted(range(len(lengths)), key=lambda i: lengths[i], reverse=True)

# The sequences themselves, rearranged into that longest-first order.
sorted_X = [X[i] for i in sorted_idx]

print(*sorted_X, sep="\n")

tensor([ 6,  7,  3,  5, 18, 11, 13, 17,  5, 13,  8, 17, 18])
tensor([ 7, 10,  4,  4,  3, 11, 16,  3,  5,  4,  1])
tensor([13, 15,  4, 13,  8,  4, 15, 18, 17,  3, 12])
tensor([ 9, 17,  1, 12, 17,  2,  7, 18])
tensor([14, 15, 18,  7])


In [12]:
# Pack the length-sorted sequences into a PackedSequence.
# enforce_sorted=True is the default: the input list must already be ordered
# longest-first, which is why sorted_X is used here rather than X.
packed_sequence = pack_sequence(sorted_X, enforce_sorted=True)
print(packed_sequence)

PackedSequence(data=tensor([ 6,  7, 13,  9, 14,  7, 10, 15, 17, 15,  3,  4,  4,  1, 18,  5,  4, 13,
        12,  7, 18,  3,  8, 17, 11, 11,  4,  2, 13, 16, 15,  7, 17,  3, 18, 18,
         5,  5, 17, 13,  4,  3,  8,  1, 12, 17, 18]), batch_sizes=tensor([5, 5, 5, 5, 4, 4, 4, 4, 3, 3, 3, 1, 1]), sorted_indices=None, unsorted_indices=None)


## Embedding 적용

In [13]:
# One-hot "embedding" table: row i of the identity matrix is the one-hot vector for index i.
eye = torch.eye(len(char_set))

# Indexing with the (batch, max_len) index tensor looks up one row per character,
# producing (batch, max_len, vocab_size). Padding positions hold index 0, so they
# receive the "<pad>" one-hot row rather than an all-zero vector.
embedded_tensor = eye[padded_sequence]
print(embedded_tensor.size())

torch.Size([5, 13, 19])


In [14]:
# One-hot encode each length-sorted sequence on its own (each becomes (seq_len, vocab_size))
# and pack the results; sorted_X already holds X reordered by sorted_idx.
embedded_packed_seq = pack_sequence([eye[seq] for seq in sorted_X])
print(embedded_packed_seq.data.shape)

torch.Size([47, 19])


## RNN 모델 만들기

In [15]:
# Vanilla RNN: one-hot characters in (vocab_size features), 30-dimensional hidden state out.
# batch_first=True means padded tensor inputs/outputs are laid out as (batch, seq, feature).
rnn = torch.nn.RNN(
    input_size=len(char_set),
    hidden_size=30,
    batch_first=True,
)

In [16]:
# Run the RNN on the padded one-hot batch.
# NOTE(review): a plain padded tensor carries no length information, so the RNN
# presumably also steps over the padding positions of the shorter sequences.
rnn_output, hidden = rnn(embedded_tensor)
print(rnn_output.shape, hidden.shape, sep="\n")

torch.Size([5, 13, 30])
torch.Size([1, 5, 30])


In [18]:
# Run the same RNN on the packed one-hot batch.
# The output is again a PackedSequence, so its flat tensor is read through .data;
# the final hidden state is an ordinary tensor and is inspected directly.
rnn_output, hidden = rnn(embedded_packed_seq)
print(rnn_output.data.shape)
print(hidden.shape)

torch.Size([47, 30])
torch.Size([1, 5, 30])


## pad_packed_sequence

In [21]:
# Inverse of packing: expand the PackedSequence back into a padded
# (batch, max_len, vocab_size) tensor plus the true length of every sequence.
unpacked_sequence, seq_lengths = pad_packed_sequence(
    embedded_packed_seq,
    batch_first=True,
)
print(unpacked_sequence.shape, seq_lengths, sep="\n")

torch.Size([5, 13, 19])
tensor([13, 11, 11,  8,  4])


## pack_padded_sequence

In [22]:
# Pad the length-sorted index sequences first, then one-hot encode the whole padded
# batch by looking up rows of the identity matrix.
embedded_padded_sequence = eye[
    pad_sequence(sorted_X, batch_first=True)
]
print(embedded_padded_sequence.size())

torch.Size([5, 13, 19])


In [24]:
# Sequence lengths in the same longest-first order as the rows of embedded_padded_sequence.
sorted_lengths = [len(seq) for seq in sorted_X]

# Pack the padded batch; the lengths tell pack_padded_sequence how many leading
# time steps of each row are real data, so the padding is left out of the result.
new_packed_sequence = pack_padded_sequence(
    embedded_padded_sequence,
    sorted_lengths,
    batch_first=True,
)
print(new_packed_sequence.data.shape, new_packed_sequence.batch_sizes, sep="\n")

torch.Size([47, 19])
tensor([5, 5, 5, 5, 4, 4, 4, 4, 3, 3, 3, 1, 1])
