# 글자 단위 RNN 언어 모델(Char RNNLM)
- https://wikidocs.net/48649
- 입출력의 단위를 단어 레벨(word-level)에서 글자 레벨(character-level)로 변경하여 RNN을 구현
- 다 대 일(many-to-one) 구조의 RNN

## Import

In [22]:
import numpy as np

from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.preprocessing.sequence import pad_sequences

## Load dataset

In [2]:
# Toy training corpus: a short English poem held inline as a single string.
# The leading/trailing newlines and the blank lines between stanzas are part
# of the literal.
text='''
I get on with life as a programmer,
I like to contemplate beer.
But when I start to daydream,
My mind turns straight to wine.

Do I love wine more than beer?

I like to use words about beer.
But when I stop my talking,
My mind turns straight to wine.

I hate bugs and errors.
But I just think back to wine,
And I'm happy once again.

I like to hang out with programming and deep learning.
But when left alone,
My mind turns straight to wine.
'''

In [3]:
# Collapse every run of whitespace (including the newlines in the literal)
# into a single space so the corpus becomes one continuous line.
tokens = text.split()
text = " ".join(tokens)
text  # bare expression: shown as the cell output

"I get on with life as a programmer, I like to contemplate beer. But when I start to daydream, My mind turns straight to wine. Do I love wine more than beer? I like to use words about beer. But when I stop my talking, My mind turns straight to wine. I hate bugs and errors. But I just think back to wine, And I'm happy once again. I like to hang out with programming and deep learning. But when left alone, My mind turns straight to wine."

In [4]:
# Character vocabulary: the distinct characters of the corpus, in sorted order.
char_vocab = sorted(set(text))
print(char_vocab)

[' ', "'", ',', '.', '?', 'A', 'B', 'D', 'I', 'M', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'y']


In [11]:
# Map each vocabulary character to its position in the sorted vocabulary.
char_to_index = dict(zip(char_vocab, range(len(char_vocab))))
print(char_to_index)

{' ': 0, "'": 1, ',': 2, '.': 3, '?': 4, 'A': 5, 'B': 6, 'D': 7, 'I': 8, 'M': 9, 'a': 10, 'b': 11, 'c': 12, 'd': 13, 'e': 14, 'f': 15, 'g': 16, 'h': 17, 'i': 18, 'j': 19, 'k': 20, 'l': 21, 'm': 22, 'n': 23, 'o': 24, 'p': 25, 'r': 26, 's': 27, 't': 28, 'u': 29, 'v': 30, 'w': 31, 'y': 32}


In [12]:
# Slide a window of `length` characters over the corpus, one character at a
# time. Each window holds the input characters followed by the character to
# predict, so `length` is the input size plus one.
# NOTE(review): the range stops at len(text) - 1, so the window that ends on
# the very last character of the corpus is never produced.
length = 11
sequences = [text[end - length:end] for end in range(length, len(text))]
print(f"총 훈련 샘플 수: {len(sequences)}")

총 훈련 샘플 수: 426


In [14]:
sequences[:10]

['I get on wi',
 ' get on wit',
 'get on with',
 'et on with ',
 't on with l',
 ' on with li',
 'on with lif',
 'n with life',
 ' with life ',
 'with life a']

In [15]:
# Integer-encode every window: one list of vocabulary indices per sample.
X = [[char_to_index[symbol] for symbol in window] for window in sequences]
X[:10]

[[8, 0, 16, 14, 28, 0, 24, 23, 0, 31, 18],
 [0, 16, 14, 28, 0, 24, 23, 0, 31, 18, 28],
 [16, 14, 28, 0, 24, 23, 0, 31, 18, 28, 17],
 [14, 28, 0, 24, 23, 0, 31, 18, 28, 17, 0],
 [28, 0, 24, 23, 0, 31, 18, 28, 17, 0, 21],
 [0, 24, 23, 0, 31, 18, 28, 17, 0, 21, 18],
 [24, 23, 0, 31, 18, 28, 17, 0, 21, 18, 15],
 [23, 0, 31, 18, 28, 17, 0, 21, 18, 15, 14],
 [0, 31, 18, 28, 17, 0, 21, 18, 15, 14, 0],
 [31, 18, 28, 17, 0, 21, 18, 15, 14, 0, 10]]

In [18]:
# Stack the encoded windows into a 2-D integer array, then split it:
# every column except the last is the input, the last column is the target.
sequences = np.array(X)
X, y = sequences[:, :-1], sequences[:, -1]

In [20]:
# Number of distinct characters in the corpus.
vocab_size = len(char_vocab)
vocab_size  # bare expression: shown as the cell output

33

In [21]:
# One-hot encode inputs and targets over the character vocabulary.
# NOTE: `sequences` is rebound here to the list of per-sample one-hot matrices,
# and `X` / `y` are replaced by their one-hot versions.
sequences = [to_categorical(x, num_classes=vocab_size) for x in X]
X = np.array(sequences)
y = to_categorical(y, num_classes=vocab_size)
X.shape, y.shape  # bare expression: shown as the cell output

((426, 10, 33), (426, 33))

## Build model

In [23]:
# Many-to-one network: a single LSTM layer reads the one-hot character window
# and a softmax layer scores every vocabulary character as the next one.
model = Sequential([
    LSTM(80, input_shape=X.shape[1:]),
    Dense(vocab_size, activation="softmax"),
])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 80)                36480     
_________________________________________________________________
dense (Dense)                (None, 33)                2673      
Total params: 39,153
Trainable params: 39,153
Non-trainable params: 0
_________________________________________________________________


In [24]:
# Targets are one-hot vectors over the vocabulary, hence categorical
# cross-entropy; accuracy is tracked as an additional metric.
model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"]
)
# Train on all samples for 100 epochs; no validation data is passed.
model.fit(X, y, epochs=100, verbose=2)

Train on 426 samples
Epoch 1/100
426/426 - 2s - loss: 3.4616 - accuracy: 0.1385
Epoch 2/100
426/426 - 0s - loss: 3.2681 - accuracy: 0.1972
Epoch 3/100
426/426 - 0s - loss: 3.0460 - accuracy: 0.1972
Epoch 4/100
426/426 - 0s - loss: 2.9939 - accuracy: 0.1972
Epoch 5/100
426/426 - 0s - loss: 2.9516 - accuracy: 0.1972
Epoch 6/100
426/426 - 0s - loss: 2.9370 - accuracy: 0.1972
Epoch 7/100
426/426 - 0s - loss: 2.9187 - accuracy: 0.1972
Epoch 8/100
426/426 - 0s - loss: 2.8917 - accuracy: 0.1972
Epoch 9/100
426/426 - 0s - loss: 2.8772 - accuracy: 0.1972
Epoch 10/100
426/426 - 0s - loss: 2.8475 - accuracy: 0.1972
Epoch 11/100
426/426 - 0s - loss: 2.8076 - accuracy: 0.1972
Epoch 12/100
426/426 - 0s - loss: 2.7723 - accuracy: 0.2066
Epoch 13/100
426/426 - 0s - loss: 2.7249 - accuracy: 0.2042
Epoch 14/100
426/426 - 0s - loss: 2.6679 - accuracy: 0.2089
Epoch 15/100
426/426 - 0s - loss: 2.6378 - accuracy: 0.2488
Epoch 16/100
426/426 - 0s - loss: 2.5794 - accuracy: 0.2277
Epoch 17/100
426/426 - 0s - 

<tensorflow.python.keras.callbacks.History at 0x7fa1ca5e3a58>

## Predict

In [25]:
def sentence_generation(model, char_to_index, seq_length, seed_text, n):
    """Greedily generate ``n`` characters that continue ``seed_text``.

    At every step the last ``seq_length`` characters are one-hot encoded,
    passed to ``model.predict`` and the highest-scoring character is appended.

    Args:
        model: Object whose ``predict(batch, verbose=0)`` returns one row of
            per-character scores for a batch of shape
            ``(1, seq_length, len(char_to_index))``.
        char_to_index: Mapping from character to integer index. Indices are
            used as one-hot positions, so they must lie in
            ``range(len(char_to_index))``.
        seq_length: Number of characters the model takes as input (positive).
        seed_text: Initial text. If it is shorter than ``seq_length`` the
            window is left-padded with index 0; only its last ``seq_length``
            characters are fed to the model.
        n: Number of characters to generate.

    Returns:
        ``seed_text`` followed by the ``n`` generated characters.

    Raises:
        KeyError: If ``seed_text`` contains a character missing from
            ``char_to_index``, or the model predicts an index that no
            character maps to.
    """
    vocab_size = len(char_to_index)
    # Reverse lookup built once instead of scanning the mapping on every step.
    index_to_char = {index: char for char, index in char_to_index.items()}
    # Row i of the identity matrix is the one-hot vector for index i.
    one_hot = np.eye(vocab_size, dtype="float32")

    # Only the most recent seq_length characters influence the prediction, so
    # keep a fixed-size window of indices rather than re-encoding the whole
    # (growing) text each iteration.
    window = [char_to_index[char] for char in seed_text]
    if len(window) > seq_length:
        window = window[len(window) - seq_length:]
    else:
        # Left-pad with index 0, the same default fill value that
        # pad_sequences(padding='pre') uses.
        window = [0] * (seq_length - len(window)) + window

    generated = []
    for _ in range(n):
        encoded = one_hot[window][np.newaxis, ...]  # (1, seq_length, vocab)
        # Sequential.predict_classes was removed in TensorFlow 2.6; take the
        # argmax of the predicted scores instead.
        scores = model.predict(encoded, verbose=0)
        predicted = int(np.argmax(scores, axis=-1)[0])
        generated.append(index_to_char[predicted])
        # Slide the window: drop the oldest index, append the new prediction.
        window = window[1:] + [predicted]

    return seed_text + "".join(generated)

In [26]:
# Seed is exactly 10 characters long, matching the seq_length argument.
print(sentence_generation(model, char_to_index, 10, 'I get on w', 80))

I get on with life as a programmer, I like to contemplate beer. But when I stap ty taleeg 


In [28]:
# Seed is only 8 characters long, i.e. shorter than the seq_length argument of 10.
print(sentence_generation(model, char_to_index, 10, 'I have a', 80))

I have aggs ano wto minpt tinn maayk lonr mmin I to ne tt alin g? I liet to cse tlaloggo


In [33]:
# Seed is only 6 characters long, i.e. shorter than the seq_length argument of 10.
print(sentence_generation(model, char_to_index, 10, 'I hate', 80))

I hatepgg ciy minng I mhket o win th soet tcy to byee, I likn to cse tpalogbbe... But 
