In [15]:
# 데이터 전처리
import numpy as np

with open('../Dataset/pg2265.txt', 'r', encoding = 'utf-8') as f :
    text = f.read()

text = text[16247:] # 구텐베르그 페이지의 파일이 수정됨.
chars = set(text)
char2int = {ch:i for i, ch in enumerate(chars)}
int2char = dict(enumerate(chars))
text_ints = np.array([char2int[ch] for ch in text], dtype = np.int32)

print(len(text))
print(len(chars))

162850
65


In [16]:
def reshape_data(sequence, batch_size, num_steps) : 
    mini_batch_length = batch_size * num_steps
    num_batches = int(len(sequence) / mini_batch_length)
    if num_batches * mini_batch_length + 1 > len(sequence) : 
        num_batches = num_batches - 1
    # 전체 배치에 포함되지 않는 시퀀스 끝부분은 삭제
    x = sequence[0: num_batches * mini_batch_length]
    y = sequence[1: num_batches * mini_batch_length + 1]
    # x와 y를 시퀀스 배치의 리스트로 나눔
    x_batch_splits = np.split(x, batch_size)
    y_batch_splits = np.split(y, batch_size)
    # 합침 (size = batch_size * mini_batch_length)
    x = np.stack(x_batch_splits)
    y = np.stack(y_batch_splits)
    
    return x, y

In [17]:
# test

train_x, train_y = reshape_data(text_ints, 64, 10)
print(train_x.shape)
print(train_x[0, :10])
print(train_y[0, :10])
print(''.join(int2char[i] for i in train_x[0, :10]))
print(''.join(int2char[i] for i in train_y[0, :10]))

(64, 2540)
[15 31 44 38 15 47 30  2 44 53]
[31 44 38 15 47 30  2 44 53 29]
The Traged
he Tragedi


In [18]:
def create_batch_generator(data_x, data_y, num_steps) : 
    batch_size, tot_batch_length = data_x.shape[0:2]
    num_batches = int(tot_batch_length / num_steps)
    
    for b in range(num_batches) : 
        yield (data_x[:, b * num_steps : (b + 1) * num_steps],
              data_y[:, b * num_steps : (b + 1) * num_steps])

In [19]:
# test

bgen = create_batch_generator(train_x[:, :100], train_y[:, :100], 15)

for x, y in bgen : 
    print(x.shape, y.shape, end = ' ')
    print(''.join(int2char[i] for i in x[0, :]).replace('\n', '*'), ' ',
         ''.join(int2char[i] for i in y[0, :]).replace('\n', '*'))

(64, 15) (64, 15) The Tragedie of   he Tragedie of 
(64, 15) (64, 15)  Hamlet**Actus    Hamlet**Actus P
(64, 15) (64, 15) Primus. Scoena    rimus. Scoena P
(64, 15) (64, 15) Prima.**Enter B   rima.**Enter Ba
(64, 15) (64, 15) arnardo and Fra   rnardo and Fran
(64, 15) (64, 15) ncisco two Cent   cisco two Centi


In [20]:
batch_size = 64
num_steps = 100
train_x, train_y = reshape_data(text_ints, batch_size, num_steps)
print(train_x.shape, train_y.shape)

(64, 2500) (64, 2500)


In [21]:
from tensorflow.keras.utils import to_categorical

train_encoded_x = to_categorical(train_x) # 원-핫 인코딩된 벡터로 변환
train_encoded_y = to_categorical(train_y)
print(train_encoded_x.shape, train_encoded_y.shape)

(64, 2500, 65) (64, 2500, 65)


In [22]:
print(np.max(train_x), np.max(train_y))

64 64


In [24]:
# 모델 생성
from tensorflow.keras import models, layers

char_model = models.Sequential()

In [25]:
num_classes = len(chars) # 텍스트에 있는 모든 글자 수
char_model.add(layers.LSTM(128, input_shape = (None, num_classes), return_sequences = True))
# 가변 길이 시퀀스를 처리하기 위해 Input이 (None, 원-핫 인코딩 벡터 크기)
# 샘플링 : batch = 1, num_steps = 1 // 훈련 : batch = 64, num_steps = 100

In [26]:
char_model.add(layers.TimeDistributed(layers.Dense(num_classes, activation = 'softmax')))
# 모든 타임 스텝에 대한 손실을 계산해야 하므로 LSTM의 3차원 텐서를 다루어야 함.
# 따라서 Flatten을 넣지 않고 LSTM 층의 출력을 타임 스텝 순으로 Dense 층에 주입하고 결과를 받아
# 다시 타임 스텝 순으로 쌓도록 함

In [27]:
char_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, None, 128)         99328     
_________________________________________________________________
time_distributed (TimeDistri (None, None, 65)          8385      
Total params: 107,713
Trainable params: 107,713
Non-trainable params: 0
_________________________________________________________________


In [29]:
from tensorflow.keras.optimizers import Adam

adam = Adam(clipnorm = 5.0)

In [30]:
char_model.compile(loss = 'categorical_crossentropy', optimizer = adam)

In [31]:
for i in range(500) : 
    bgen = create_batch_generator(train_encoded_x, train_encoded_y, num_steps)
    char_model.fit_generator(bgen, steps_per_epoch = 25, epochs = 1,
                            verbose = 0)

In [32]:
# 텍스트 생성

np.random.seed(42)

def get_top_char(probas, char_size, top_n = 5) :
    p = np.squeeze(probas)
    p[np.argsort(p)[:-top_n]] = 0.0
    p = p / np.sum(p)
    ch_id = np.random.choice(char_size, 1, p = p)[0]
    return ch_id

In [33]:
seed_text = "The "

for ch in seed_text : 
    num = [char2int[ch]]
    onehot = to_categorical(num, num_classes = 65)
    onehot = np.expand_dims(onehot, axis = 0)
    probas = char_model.predict(onehot)

num = get_top_char(probas, len(chars))
seed_text += int2char[num]

In [34]:
# 500번 반복해보기

for i in range(500) : 
    onehot = to_categorical([num], num_classes = 65)
    onehot = np.expand_dims(onehot, axis = 0)
    probas = char_model.predict(onehot)
    num = get_top_char(probas, len(chars))
    seed_text += int2char[num]

print(seed_text)

The wene thes teroush wings anges, there t wingrono aterer  this we wat tenghend thies thoner t men as, t waro mauster  mar me th me, withe anode athin the the me m. m.  witonge man  thill me ather winder thes, mus me mat mes thour  me ast wingrer te    wie tod auero  menourere anende winge wau  windongh ande w wit,

 with me  m. wis thit m. asth angs arsthe,  as and   thithe,  t winers winde w wer me,
  we t ther m. where, to  wit au mursthe w   m. m.  thinghind  wie  te,
  tod me merther ath wauren
