In [59]:
from __future__ import print_function
from tensorflow.keras.callbacks import LambdaCallback
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout, BatchNormalization
from tensorflow.keras.optimizers import RMSprop, Adam
from keras.utils.data_utils import get_file

import numpy as np
import random, sys, io, warnings

warnings.filterwarnings('ignore')

random.seed(228)

In [54]:
path = get_file('nietzsche.txt', origin='https://s3.amazonaws.com/text-datasets/nietzsche.txt')

with io.open(path, encoding='utf-8') as f:
    text = f.read().lower()
    
print(f'텍스트 총 길이 : {len(text):,}개')

텍스트 총 길이 : 600,893개


In [51]:
chars = sorted(set(text))

In [53]:
char_indices = {c:idx for idx, c in enumerate(chars)}
indices_char = {idx:c for idx, c in enumerate(chars)}

print(f'고유 텍스트 수 : {len(chars)}개')

고유 텍스트 수 : 57개


In [47]:
# 텍스트를 maxlen 만큼의 집합으로 분리

maxlen = 40      # 추출할 텍스트 시퀀스의 최대값
step = 3         # 세 개의 텍스트씩 건너 새로운 시퀀스 생성

sentences = []   # 추출 시퀀스 컨테이너
next_chars = []  # 시퀀스 다음 글자(Target) 컨테이너

for idx in range(0, len(text) - maxlen, step): # idx = 0, 3, 6, ...
    sentences.append(text[idx: idx + maxlen])
    next_chars.append(text[idx + maxlen])
    
print(f'시퀀스의 총 개수 : {len(sentences):,}개')

시퀀스의 총 개수 : 200,285개


In [38]:
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)  # (200285, 40, 57)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)          # (200285, 57)

# 문자를 이진수 배열로 벡터화

for sentences_idx, seperate_sentence in enumerate(sentences):
    for text_idx, char in enumerate(seperate_sentence):
        # x : 고유 텍스트가 존재하는 부분에 1
        x[sentences_idx, text_idx, char_indices[char]] = 1
    # y : Target의 고유 텍스트 1
    y[sentences_idx, char_indices[next_chars[sentences_idx]]] = 1

In [45]:
# LSTM 모델 정의

lstm_model = Sequential()

lstm_model.add(LSTM(256, input_shape=(maxlen, len(chars))))
lstm_model.add(Dense(128, activation='relu'))
lstm_model.add(Dropout(0.3))

lstm_model.add(BatchNormalization())
lstm_model.add(Dense(len(chars), activation='softmax'))

In [46]:
optimizer = Adam() # 오차역전파 최적화 함수
lstm_model.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [89]:
def sample(preds, temperature=1.0):
    '''
    모델의 예측값을 통해 새로운 글자를 샘플링하는 함수
    '''
    preds = np.asarray(preds).astype('float64') # 확률 분포 배열 추출
    preds = np.log(preds) / temperature         # 자연로그값 / 온도 (가중치 조절 분포 생성)
    exp_preds = np.exp(preds)                   # e ^ preds : 밑이 자연상수 e인 지수함수로 변환
    preds = exp_preds / np.sum(exp_preds)       # 무작위로 새로운 글자 샘플링
    probas = np.random.multinomial(n=1, pvals=preds, size=1)
    return np.argmax(probas)

In [114]:
start_index = random.randint(0, len(text) - maxlen - 1)
start_index

154067

In [115]:
for epoch in range(1, 30): # 30 Epoch 훈련
    print(f'Epoch : {epoch}')
    lstm_model.fit(x, y, batch_size=64, epochs=1)            # 데이터에서 한 번만 반복해서 모델 학습
    
    seed_text = text[start_index: start_index + maxlen] # 무작위로 시드 텍스트를 선택한다.
    print(f'Seed Text : "{seed_text}"')
    
    for temperature in [0.2, 0.5, 1.0, 1.2]:           # 여러 가지 샘플링 온도를 시도
        
        generated_text = seed_text
        sys.stdout.write(generated_text)
        
        for i in range(400): # 시드 텍스트에서 시작해서 400개의 글자 생성
            sampled = np.zeros((1, maxlen, len(chars)))  # One-Hot Encoding
            
            for text_idx, char in enumerate(generated_text): 
                sampled[0, text_idx, char_indices[char]] = 1.

            # 다음 글자를 샘플링
            preds = lstm_model.predict(sampled, verbose=0)[0]
            next_index = sample(preds, temperature)
            next_char = chars[next_index]
            
            generated_text += next_char
            generated_text = generated_text[1:]
            
        print(generated_text)

Epoch : 1
Seed Text : "o hostile and averse to the
new; and gen"
o hostile and averse to the
new; and gentanding and entire morality of a sort of
o hostile and averse to the
new; and gen cansafient man in short, perhaps it is 
o hostile and averse to the
new; and gene and the general that is, is a crude to
o hostile and averse to the
new; and genan philosophy,
and hers, also have feele
Epoch : 2
Seed Text : "o hostile and averse to the
new; and gen"
o hostile and averse to the
new; and genll these species and so far as the stand
o hostile and averse to the
new; and gend and such a state of
christianity than 
o hostile and averse to the
new; and gener of forms be than it beywndance theref
o hostile and averse to the
new; and gen
seems
honest dequiofstealing, setsit, s
Epoch : 3
Seed Text : "o hostile and averse to the
new; and gen"
o hostile and averse to the
new; and genbably makes itself these special these a
o hostile and averse to the
new; and geny, or enother turnified so denein mo

new; and gen order to live one is to been consists e
o hostile and averse to the
new; and gentain
thing our foll. it goated if they
r
Epoch : 19
Seed Text : "o hostile and averse to the
new; and gen"
o hostile and averse to the
new; and gencious and and in the strange than the st
o hostile and averse to the
new; and genes of man all the person in every and pa
o hostile and averse to the
new; and gen
dissimility it is deny
for knowledge, e
o hostile and averse to the
new; and genon of the complete: and to man in that s
Epoch : 20
Seed Text : "o hostile and averse to the
new; and gen"
o hostile and averse to the
new; and generion of the same and and the most self-
o hostile and averse to the
new; and geno the blood fight of man, with the sarni
o hostile and averse to the
new; and gen an exertions is untimilates, in its or 
o hostile and averse to the
new; and genwer, it)--full to the gigneriin ou der n
Epoch : 21
Seed Text : "o hostile and averse to the
new; and gen"
o hostile and averse

In [116]:
generated_text

'd of light, and scrult, it may nese able'

In [139]:
seed_text = ' you please take me so far away, you\'re ' # 임의의 영어
print(f'Seed Text : "{seed_text}"')

generated_text = seed_text

for i in range(600): # 시드 텍스트에서 시작해서 600개의 글자 생성
    sampled = np.zeros((1, maxlen, len(chars)))  # One-Hot Encoding

    for text_idx, char in enumerate(generated_text): 
        sampled[0, text_idx, char_indices[char]] = 1.

    # 다음 글자를 샘플링
    preds = lstm_model.predict(sampled, verbose=0)[0]
    next_index = sample(preds, 0.7)
    next_char = chars[next_index]

    generated_text += next_char
    generated_text = generated_text[1:]
    
print(f'Generated Text : {generated_text}')

Seed Text : " you please take me so far away, you're "
Generated Text : itself the religious prise in indiness:-
