**AI 노바투스과정**

Week 4: 딥러닝 기초 II (2021-08-27)

실습 #3 [**심화 실습**] : 순환 신경망 학습 실습하기 (텍스트 데이터 생성하기)

In [1]:
### 필요 패키지 불러오기

import numpy as np
import tensorflow.keras as keras
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf

## 1. 데이터셋 준비

In [2]:
# Download the dataset and keep it as one string (a sequence of characters).

shakespeare_url = "https://homl.info/shakespeare"
filepath = keras.utils.get_file("shakespeare.txt", shakespeare_url)
# Decode explicitly as UTF-8 so the result does not depend on the
# platform's default locale encoding.
with open(filepath, encoding="utf-8") as f:
  shakespeare_text = f.read()

Downloading data from https://homl.info/shakespeare


In [3]:
# Inspect the data: print the first 1000 characters.

print(shakespeare_text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [4]:
# Encode every character as an integer.
# char_level=True makes the tokenizer work on single characters instead of words.
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer.fit_on_texts(shakespeare_text)
# The tokenizer's ids start at 1, so subtract 1 to obtain 0-based ids
# (1..39 -> 0..38).
[encoded] = np.array(tokenizer.texts_to_sequences([shakespeare_text])) - 1

In [5]:
# Try the tokenizer: encode a sample word into character ids.

tokenizer.texts_to_sequences(["Apple"])

[[5, 23, 23, 12, 2]]

In [6]:
# Decode the character ids back into text.
tokenizer.sequences_to_texts([[5, 23, 23, 12, 2]])

['a p p l e']

In [7]:
# Number of distinct characters (size of the tokenizer's vocabulary).

max_id=len(tokenizer.word_index)
max_id

39

In [8]:
# Total number of characters.
# NOTE(review): presumably this works because fit_on_texts was given a single
# string, so every character was counted as one "document" -- confirm against
# the Tokenizer documentation.

dataset_size=tokenizer.document_count
dataset_size

1115394

In [9]:
# Show the character -> integer id mapping held by the tokenizer.
tokenizer.word_index

{'\n': 11,
 ' ': 1,
 '!': 31,
 '$': 39,
 '&': 38,
 "'": 28,
 ',': 18,
 '-': 32,
 '.': 27,
 '3': 37,
 ':': 24,
 ';': 29,
 '?': 30,
 'a': 5,
 'b': 22,
 'c': 19,
 'd': 13,
 'e': 2,
 'f': 20,
 'g': 21,
 'h': 7,
 'i': 6,
 'j': 33,
 'k': 25,
 'l': 12,
 'm': 15,
 'n': 10,
 'o': 4,
 'p': 23,
 'q': 34,
 'r': 9,
 's': 8,
 't': 3,
 'u': 14,
 'v': 26,
 'w': 17,
 'x': 35,
 'y': 16,
 'z': 36}

In [10]:
# Split off the training set: the first 40% of the encoded characters.

train_size = dataset_size * 40 // 100
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])

In [11]:
# Use window() to cut the single long sequence into many overlapping windows
# of length n_steps + 1 (= 101), each starting one character after the previous one.
n_steps = 100
window_length = n_steps + 1
dataset = (
    dataset
    .window(window_length, shift=1, drop_remainder=True)
    # Each window is itself a dataset; batch it so it becomes one tensor.
    .flat_map(lambda window: window.batch(window_length))
)

In [12]:
# Shuffle the windows, group them into batches, then split every window into
# its first 100 characters (inputs) and its last 100 characters (targets).

batch_size = 10000
dataset = (
    dataset
    .shuffle(10000)
    .batch(batch_size)
    # The targets are the inputs shifted one character ahead, so at every
    # position the label is the character that comes next.
    .map(lambda chunk: (chunk[:, :-1], chunk[:, 1:]))
)

In [13]:
# Turn every integer id of the inputs into a one-hot vector; the targets
# stay as integer ids.
# prefetch() fetches upcoming data ahead of time so that loading overlaps
# with training; its argument says how much to keep ready in advance.

dataset = (
    dataset
    .map(lambda inputs, targets: (tf.one_hot(inputs, depth=max_id), targets))
    .prefetch(1)
)

## 2. GRU 모형 만들고 학습하기

In [14]:
# GPU setup: abort early unless TensorFlow reports the first GPU device.
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
  print(f'Found GPU at: {device_name}')
else:
  raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


In [15]:
# Two stacked GRU layers followed by a softmax over the characters that is
# applied at every time step.
model = keras.models.Sequential([
    keras.layers.GRU(64, return_sequences=True, input_shape=[None, max_id],
                     dropout=0.2, recurrent_dropout=0),
    keras.layers.GRU(64, return_sequences=True, dropout=0.2, recurrent_dropout=0),
    keras.layers.TimeDistributed(keras.layers.Dense(max_id, activation="softmax")),
])

# The targets are integer ids, hence the sparse categorical cross-entropy loss.
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")
history = model.fit(dataset, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [16]:
# Same shape that was passed to the first GRU layer: the sequence length is
# left as None and each step has max_id features.
input_shape=[None, max_id]
input_shape

[None, 39]

In [17]:
model.summary()
# GRU parameter count: 3 * (n^2 + m*n + 2n)
#   n = number of units (output size)
#   m = number of input features
# Dense layer parameter count: 64*39 + 39

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 gru (GRU)                   (None, None, 64)          20160     
                                                                 
 gru_1 (GRU)                 (None, None, 64)          24960     
                                                                 
 time_distributed (TimeDistr  (None, None, 39)         2535      
 ibuted)                                                         
                                                                 
Total params: 47,655
Trainable params: 47,655
Non-trainable params: 0
_________________________________________________________________


## 3. 글자 생성하기

In [18]:
# Helper that converts input text into integer ids and then into one-hot vectors.

def preprocess(texts):
  """Encode `texts` with the fitted tokenizer and one-hot encode the ids.

  The tokenizer's ids are shifted down by 1 before encoding, and the
  one-hot depth is `max_id`.
  """
  char_ids = tokenizer.texts_to_sequences(texts)
  zero_based_ids = np.array(char_ids) - 1
  return tf.one_hot(zero_based_ids, max_id)

In [19]:
import numpy as np
# Predict the character that follows "How are yo".
X_new = preprocess(["How are yo"])
y_proba=model.predict(X_new)
# Pick the most probable character id at every time step.
Y_pred=y_proba.argmax(axis=2)
tokenizer.sequences_to_texts(Y_pred + 1)[0][-1] # last character of the first sentence

'u'

In [20]:
# Decode the predicted ids of every time step (+1 restores the tokenizer's 1-based ids).
tokenizer.sequences_to_texts(Y_pred + 1)

['e u   t n d   t o u']

In [25]:
# "hel" 예측

X_new = preprocess(["hel"])
y_proba=model.predict(X_new)
# Pick the most probable character id at every time step.
Y_pred=y_proba.argmax(axis=2)
print(Y_pred)
# Decode the prediction and keep only the last character.
tokenizer.sequences_to_texts(Y_pred+1)[0][-1]

[[ 1  0 11]]


'l'

In [None]:
# Step 1: decode the predicted ids; the result is a list with one string per input text.
tokenizer.sequences_to_texts(Y_pred+1)

['r   s']

In [None]:
# Step 2: take the decoded string of the first (and only) input text.
tokenizer.sequences_to_texts(Y_pred+1)[0]

'r   s'

In [None]:
# Step 3: take the last character of that string, i.e. the prediction for the final time step.
tokenizer.sequences_to_texts(Y_pred+1)[0][-1]

's'

## 4. 가짜 글 생성하기

In [30]:
# The higher the temperature, the closer the selection probabilities of all
# characters get to each other; the lower it is, the more predictable the result.
def next_char(text, temperature=1):
  """Sample the character that follows `text` from the model's prediction.

  Args:
    text: the text produced so far; it is encoded with `preprocess`.
    temperature: strictly positive divisor applied to the log-probabilities
      before sampling.

  Returns:
    The sampled character, decoded by the tokenizer.

  Raises:
    ValueError: if `temperature` is not strictly positive (dividing by zero
      or by a negative value would not give a meaningful distribution).
  """
  if temperature <= 0:
    raise ValueError(f"temperature must be positive, got {temperature!r}")
  X_new = preprocess([text])
  # Keep only the prediction for the last time step of the single input text.
  y_proba = model.predict(X_new)[0, -1:, :]
  rescaled_logits = tf.math.log(y_proba) / temperature
  # Draw one id at random from the rescaled distribution; +1 maps the
  # 0-based id back to the tokenizer's 1-based ids.
  char_id = tf.random.categorical(rescaled_logits, num_samples=1) + 1
  return tokenizer.sequences_to_texts(char_id.numpy())[0]

In [31]:
def complete_text(text, n_chars=50, temperature=1):
  """Extend `text` by `n_chars` characters, sampling one character at a time.

  Each new character comes from `next_char`, which is given everything
  produced so far together with `temperature`.
  """
  generated = text
  for _step in range(n_chars):
    generated += next_char(generated, temperature)
  return generated

In [32]:
print(complete_text("t", temperature=0.2))


t the sow the sore the the the so ding and the nous


In [33]:
print(complete_text("t", temperature=2))

t: aus sfou&n !zel?!p$dk;

opp. wovn'?id:eremt co.f


## 5. Stateful RNN 모형으로 해보기

상태유지(stateful) 모드에서는 현재 배치를 처리한 뒤의 은닉 상태가 다음 배치의 초기 상태로 전달된다.

In [None]:
## Prepare the training data

# ***CAUTION*** every input sequence has to continue the previous one, so
# use shift=n_steps and do not call shuffle().


dataset = (
    tf.data.Dataset.from_tensor_slices(encoded[:train_size])
    .window(window_length, shift=n_steps, drop_remainder=True)
    .flat_map(lambda window: window.batch(window_length))
    # One window per batch.
    .batch(1)
    # Inputs are all but the last character; targets are all but the first.
    .map(lambda chunk: (chunk[:, :-1], chunk[:, 1:]))
    .map(lambda inputs, targets: (tf.one_hot(inputs, depth=max_id), targets))
    .prefetch(1)
)

In [None]:
# Build the model skeleton (with stateful=True on the recurrent layers).

batch_size = 1
model = keras.models.Sequential([
    # The first layer is given the fixed batch size through batch_input_shape.
    keras.layers.GRU(128, return_sequences=True, stateful=True,
                     dropout=0.2, recurrent_dropout=0,
                     batch_input_shape=[batch_size, None, max_id]),
    keras.layers.GRU(128, return_sequences=True, stateful=True,
                     dropout=0.2, recurrent_dropout=0),
    keras.layers.TimeDistributed(keras.layers.Dense(max_id, activation="softmax")),
])


In [None]:
# Callback that resets the recurrent states before moving on to the next epoch.

class ResetStatesCallback(keras.callbacks.Callback):
  """Reset the model's states at the beginning of every epoch."""

  def on_epoch_begin(self, epoch, logs=None):
    # `logs` defaults to None so the hook can also be called with only `epoch`.
    self.model.reset_states()

In [None]:
# Compile and train the model; the callback resets the states at the start of each epoch.

model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")
model.fit(dataset, epochs=20, callbacks=[ResetStatesCallback()])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fe0d4ae8510>