# 텐서플로(TensorFlow) RNN 텍스트 생성

In [1]:
import tensorflow as tf
import numpy as np
import os 
import time

In [2]:
%%javascript
// Set the notebook output area's auto-scroll threshold to 20.
// NOTE(review): presumably this is the number of output lines after which a
// cell's output is shown in a scrolling box -- confirm against the notebook front end.
IPython.OutputArea.auto_scroll_threshold = 20

<IPython.core.display.Javascript object>

In [3]:
# Download the Shakespeare dataset
path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')
# Read the raw bytes and decode them as UTF-8. The context manager closes the
# file handle deterministically instead of leaving it to the garbage collector.
with open(path_to_file, 'rb') as corpus_file:
  text = corpus_file.read().decode(encoding='utf-8')
# Preview the first 200 characters of the corpus.
print(text[:200])

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you


In [4]:
# Show the same 200-character prefix in repr form so that newlines are
# visible as escape sequences.
print(f"{text[:200]!r}")

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you'


In [7]:
# Total length of the text (number of characters in the decoded string)
len(text)

1115394

In [8]:
# Collect the distinct characters of the corpus in sorted order, then print
# the first ten of them followed by the total count.
vocab = sorted({ch for ch in text})
print(vocab[:10], len(vocab), sep='\n')

['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3']
65


# 텍스트 전처리

In [9]:
# Step 1. Build the character dictionary.
# Each character in vocab is mapped to its position (index) in that list.
char2idx = dict(zip(vocab, range(len(vocab))))
print(char2idx)

{'\n': 0, ' ': 1, '!': 2, '$': 3, '&': 4, "'": 5, ',': 6, '-': 7, '.': 8, '3': 9, ':': 10, ';': 11, '?': 12, 'A': 13, 'B': 14, 'C': 15, 'D': 16, 'E': 17, 'F': 18, 'G': 19, 'H': 20, 'I': 21, 'J': 22, 'K': 23, 'L': 24, 'M': 25, 'N': 26, 'O': 27, 'P': 28, 'Q': 29, 'R': 30, 'S': 31, 'T': 32, 'U': 33, 'V': 34, 'W': 35, 'X': 36, 'Y': 37, 'Z': 38, 'a': 39, 'b': 40, 'c': 41, 'd': 42, 'e': 43, 'f': 44, 'g': 45, 'h': 46, 'i': 47, 'j': 48, 'k': 49, 'l': 50, 'm': 51, 'n': 52, 'o': 53, 'p': 54, 'q': 55, 'r': 56, 's': 57, 't': 58, 'u': 59, 'v': 60, 'w': 61, 'x': 62, 'y': 63, 'z': 64}


In [10]:
# Reverse lookup: an array whose element i is the character with index i.
idx2char = np.asarray(vocab)
# Spot-check a single entry.
idx2char[49]

'k'

In [11]:
# Step 2. Convert the whole text to integer indices.
print(char2idx['i'])
text_as_int = np.array(list(map(char2idx.__getitem__, text)))
print(text_as_int.size)

# Compare only the first five items before and after the conversion.
print('text before chaging int : ', text[:5])
print('text after chaging int  ; ', text_as_int[:5])

47
1115394
text before chaging int :  First
text after chaging int  ;  [18 47 56 57 58]


# 데이터셋 생성

In [12]:
# Hyper-parameters for the training dataset.
window_size = 100        # time steps per input/target sequence
shuffle_buffer = 10_000  # buffer size handed to Dataset.shuffle
batch_size = 64          # examples per training batch

In [14]:

def windowed_dataset(series, window_size, shuffle_buffer, batch_size, drop_remainder=True):
  """Build a shuffled, batched dataset of (input, target) windows from a series.

  Every run of ``window_size + 1`` consecutive elements (stride 1) becomes one
  example: the input is the first ``window_size`` elements of the run and the
  target is the same run shifted one step ahead.

  Args:
    series: Sequence of values to slice into windows (a trailing axis of
      size 1 is appended before slicing).
    window_size: Number of time steps in each input and each target sequence.
    shuffle_buffer: Buffer size passed to ``Dataset.shuffle``.
    batch_size: Number of examples per batch.
    drop_remainder: When True (the default), a trailing batch holding fewer
      than ``batch_size`` examples is dropped so every emitted batch has the
      same size. A model built with a fixed batch dimension (for example a
      stateful RNN) cannot consume a short final batch. Pass False to keep
      the partial batch.

  Returns:
    A ``tf.data.Dataset`` yielding ``(inputs, targets)`` pairs. For a 1-D
    ``series`` each element of the pair has shape
    ``(batch, window_size, 1)``.
  """
  # Add a trailing feature axis so each time step is a length-1 vector.
  series = tf.expand_dims(series, -1)
  ds = tf.data.Dataset.from_tensor_slices(series)
  # Sliding windows of window_size + 1 steps; incomplete windows at the end
  # of the series are discarded.
  ds = ds.window(window_size + 1, shift=1, drop_remainder=True)
  # Each window is itself a dataset; flatten it into a single tensor.
  ds = ds.flat_map(lambda w: w.batch(window_size + 1))
  ds = ds.shuffle(shuffle_buffer)
  # Input = all but the last step, target = all but the first step.
  ds = ds.map(lambda w: (w[:-1], w[1:]))
  return ds.batch(batch_size, drop_remainder=drop_remainder).prefetch(1)

# text_as_int is already a NumPy array, so np.asarray hands it over as-is
# instead of making the extra full copy that np.array would.
train_data = windowed_dataset(np.asarray(text_as_int), window_size, shuffle_buffer, batch_size)


In [15]:
# Size of the character-level vocabulary
vocab_size = len(vocab)
vocab_size

65

In [16]:
# Embedding dimension
embedding_dim = 256
# Number of RNN units
rnn_units = 1024

# Assemble the training model layer by layer:
# embedding -> stateful LSTM -> dense layer with one output per character.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(
    vocab_size,
    embedding_dim,
    # Fixed batch size, unspecified sequence length.
    batch_input_shape=[batch_size, None],
))
model.add(tf.keras.layers.LSTM(
    rnn_units,
    stateful=True,
    return_sequences=True,
    recurrent_initializer='glorot_uniform',
))
model.add(tf.keras.layers.Dense(vocab_size))

In [17]:
# Print the layer-by-layer summary (output shapes and parameter counts) of the model.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (64, None, 256)           16640     
                                                                 
 lstm (LSTM)                 (64, None, 1024)          5246976   
                                                                 
 dense (Dense)               (64, None, 65)            66625     
                                                                 
Total params: 5,330,241
Trainable params: 5,330,241
Non-trainable params: 0
_________________________________________________________________


In [19]:
# Path under which the checkpoint weights are written
checkpoint_path = '/content/my_checkpt.ckpt'

# Save weights only, and only when the monitored training loss improves;
# verbose=1 logs each save decision.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    monitor='loss',
    verbose=1,
    save_best_only=True,
    save_weights_only=True,
)

In [20]:
# Define the loss function

def loss(labels, logits):
  """Return the sparse categorical cross-entropy of `logits` against `labels`.

  `logits` are treated as unnormalized scores (from_logits=True).
  """
  cross_entropy = tf.keras.losses.sparse_categorical_crossentropy
  return cross_entropy(y_true=labels, y_pred=logits, from_logits=True)

In [21]:
# Configure training: Adam optimizer, the custom loss defined in this
# notebook, and accuracy reported under the name 'acc'.
model.compile(
    loss=loss,
    optimizer='adam',
    metrics=['acc'],
)

In [22]:
# Train for 10 epochs of 1720 steps each, with the checkpoint callback attached.
model.fit(
    train_data,
    steps_per_epoch=1720,
    epochs=10,
    callbacks=[checkpoint_callback],
)

Epoch 1/10
Epoch 1: loss improved from inf to 0.68153, saving model to /content/my_checkpt.ckpt
Epoch 2/10
Epoch 2: loss improved from 0.68153 to 0.31110, saving model to /content/my_checkpt.ckpt
Epoch 3/10
Epoch 3: loss improved from 0.31110 to 0.28116, saving model to /content/my_checkpt.ckpt
Epoch 4/10
Epoch 4: loss improved from 0.28116 to 0.28074, saving model to /content/my_checkpt.ckpt
Epoch 5/10
Epoch 5: loss did not improve from 0.28074
Epoch 6/10
Epoch 6: loss improved from 0.28074 to 0.28006, saving model to /content/my_checkpt.ckpt
Epoch 7/10
Epoch 7: loss did not improve from 0.28006
Epoch 8/10
Epoch 8: loss did not improve from 0.28006
Epoch 9/10
Epoch 9: loss did not improve from 0.28006
Epoch 10/10
Epoch 10: loss did not improve from 0.28006


<keras.callbacks.History at 0x7f3bd006a390>

# 예측을 위한 모델 재정의

In [23]:
# Rebuild the same architecture with the batch size changed to 1

model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(
    vocab_size,
    embedding_dim,
    # Batch size 1, unspecified sequence length.
    batch_input_shape=[1, None],
))
model.add(tf.keras.layers.LSTM(
    rnn_units,
    stateful=True,
    return_sequences=True,
    recurrent_initializer='glorot_uniform',
))
model.add(tf.keras.layers.Dense(vocab_size))

In [24]:
# Load the weights stored at checkpoint_path into the current model.
model.load_weights(checkpoint_path)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f3be67c18d0>

In [25]:
# Build the model for input shape (1, None): batch size 1, sequence length unspecified.
model.build(tf.TensorShape([1,None]))

In [26]:
# Print the layer-by-layer summary (output shapes and parameter counts) of the model.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (1, None, 256)            16640     
                                                                 
 lstm_1 (LSTM)               (1, None, 1024)           5246976   
                                                                 
 dense_1 (Dense)             (1, None, 65)             66625     
                                                                 
Total params: 5,330,241
Trainable params: 5,330,241
Non-trainable params: 0
_________________________________________________________________


In [27]:
# Use generate_text to predict characters one after another

def generate_text(model, start_string, num_generate=1000, temperature=1.0):
  """Generate text by repeatedly sampling the next character from `model`.

  The start string is fed to the model once; after that each sampled
  character becomes the next single-character input, while the model's
  recurrent state is carried over between calls.

  Relies on the module-level lookups `char2idx` (character -> index) and
  `idx2char` (index -> character).

  Args:
    model: Callable model that takes a batch of one sequence of character
      ids and returns per-step logits with a leading batch axis of size 1.
    start_string: Non-empty seed text; every character must be a key of
      `char2idx`.
    num_generate: Number of characters to generate (default 1000).
    temperature: Positive scaling factor applied to the logits before
      sampling. Lower values give more predictable text, higher values
      give more surprising text (default 1.0).

  Returns:
    `start_string` followed by the `num_generate` generated characters.

  Raises:
    ValueError: If `start_string` is empty or `temperature` is not positive.
    KeyError: If `start_string` contains a character missing from `char2idx`.
  """
  if not start_string:
    # With no seed there is no last time step to sample from.
    raise ValueError('start_string must contain at least one character')
  if temperature <= 0:
    # Dividing the logits by zero or a negative value is meaningless.
    raise ValueError('temperature must be positive')

  # Vectorize the seed text and add the batch axis.
  input_eval = tf.expand_dims([char2idx[s] for s in start_string], 0)

  # Generated characters are collected here and joined once at the end.
  text_generated = []

  # Start from a clean recurrent state.
  model.reset_states()
  for _ in range(num_generate):
    predictions = model(input_eval)
    # Remove the batch axis.
    predictions = tf.squeeze(predictions, 0)

    # Sample from the categorical distribution defined by the scaled logits;
    # only the sample for the last time step is used.
    predictions = predictions / temperature
    predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

    # Feed the sampled character back in as the next input; the previous
    # hidden state stays inside the model.
    input_eval = tf.expand_dims([predicted_id], 0)

    text_generated.append(idx2char[predicted_id])

  return start_string + ''.join(text_generated)

In [29]:
# Generate and print a sample seeded with the prompt "ROMEO: ".
print(generate_text(model, "ROMEO: "))

ROMEO: 'Tis England's words.

WARWICK:
Then 'twas my turn to fly, not prove inferior to yourself.
You that love me and Warwick, follow me.

GLOUCESTER:

KING EDWARD IV:
Clarence and son are you would not have bestow'd the heir
Of the Lord Bonville on your new wife'll tell you how am at our leasure of crowns:
What daye and taintanly, EDWARD IV:
What if both Lewis and Lady Bona,
And repart me, and your high adventure very e all's,
Thou stats:
But stay their holds this attempt,
Applaud the man that love me and Warwick, nd heart
From WARD IV:
Why, so! then am I sure of victers, and murd's coverture,
Thy brother being carelessly encamp
To lid him and Margaret:
But if your title to the crown be well, Edward block.
To rise and take his natural rest in our hunchoman,
Inters can his title, smooths the wrong. But what said Hath mad.

QUEEN MARGARET:
Those gracious in the people's eye?

WARWICK:
And mine;
So home fair queen and mistress
Smiles at her news, while Warwick frowns at his.

PRINCE EDW