# RNN with keras Review

In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [9]:
text="""경마장에 있는 말이 뛰고 있다\n그의 말이 법이다\n가는 말이 고와야 오는 말이 곱다"""
print(text)

경마장에 있는 말이 뛰고 있다
그의 말이 법이다
가는 말이 고와야 오는 말이 곱다


**토크나이징**은 t.fit_on_texts([text]) 이렇게!

In [10]:
t = Tokenizer()
t.fit_on_texts([text])

총 11개의 vocab이 어절 단위 토크나이징으로 생성되었음

In [11]:
t.index_word

{1: '말이',
 2: '경마장에',
 3: '있는',
 4: '뛰고',
 5: '있다',
 6: '그의',
 7: '법이다',
 8: '가는',
 9: '고와야',
 10: '오는',
 11: '곱다'}

**학습시퀀스** 생성하기

In [12]:
sequeces = list()
for line in text.split('\n'):
    encoded = t.texts_to_sequences([line])[0]
    for i in range(1, len(encoded)):
        print()

[2, 3, 1, 4, 5]
[6, 1, 7]
[8, 1, 9, 10, 1, 11]


In [17]:
sequeces = list()
for line in text.split('\n'):
    encoded = t.texts_to_sequences([line])[0]
    for i in range(1, len(encoded)):
        print(encoded[:i+1])

[2, 3]
[2, 3, 1]
[2, 3, 1, 4]
[2, 3, 1, 4, 5]
[6, 1]
[6, 1, 7]
[8, 1]
[8, 1, 9]
[8, 1, 9, 10]
[8, 1, 9, 10, 1]
[8, 1, 9, 10, 1, 11]


In [26]:
sequences = list()
for line in text.split('\n'):
    encoded = t.texts_to_sequences([line])[0]
    for i in range(1, len(encoded)+1):
        sequence = encoded[:i]
        sequences.append(sequence)
sequences

[[2],
 [2, 3],
 [2, 3, 1],
 [2, 3, 1, 4],
 [2, 3, 1, 4, 5],
 [6],
 [6, 1],
 [6, 1, 7],
 [8],
 [8, 1],
 [8, 1, 9],
 [8, 1, 9, 10],
 [8, 1, 9, 10, 1],
 [8, 1, 9, 10, 1, 11]]

시퀀스를 생성했다면 이젠 **시퀀스패딩!** <br/>
시퀀스 패딩은 maxlen을 설정하고 pad_sequences 매소드..

In [27]:
max_len = max([len(seq) for seq in sequences])
sequences = pad_sequences(sequences, maxlen=max_len, padding='pre')
sequences

array([[ 0,  0,  0,  0,  0,  2],
       [ 0,  0,  0,  0,  2,  3],
       [ 0,  0,  0,  2,  3,  1],
       [ 0,  0,  2,  3,  1,  4],
       [ 0,  2,  3,  1,  4,  5],
       [ 0,  0,  0,  0,  0,  6],
       [ 0,  0,  0,  0,  6,  1],
       [ 0,  0,  0,  6,  1,  7],
       [ 0,  0,  0,  0,  0,  8],
       [ 0,  0,  0,  0,  8,  1],
       [ 0,  0,  0,  8,  1,  9],
       [ 0,  0,  8,  1,  9, 10],
       [ 0,  8,  1,  9, 10,  1],
       [ 8,  1,  9, 10,  1, 11]])

시퀀스 패딩까지 완료 되었다면 이젠 **학습셋 분할** 해야지 ㅇㅇ

In [28]:
X = sequences[:, :-1]
y = sequences[:, -1]

print(X)
print(y)

[[ 0  0  0  0  0]
 [ 0  0  0  0  2]
 [ 0  0  0  2  3]
 [ 0  0  2  3  1]
 [ 0  2  3  1  4]
 [ 0  0  0  0  0]
 [ 0  0  0  0  6]
 [ 0  0  0  6  1]
 [ 0  0  0  0  0]
 [ 0  0  0  0  8]
 [ 0  0  0  8  1]
 [ 0  0  8  1  9]
 [ 0  8  1  9 10]
 [ 8  1  9 10  1]]
[ 2  3  1  4  5  6  1  7  8  1  9 10  1 11]


학습셋까지 분할했으면 벌써 **모델링**이네?

In [56]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, SimpleRNN
import numpy as np

vocab_size = len(t.word_index) + 1
model = Sequential()
model.add(Embedding(vocab_size, 2, input_length=max_len - 1)) # y레이블을 분리했기 때문
model.add(SimpleRNN(2))
model.add(Dense(vocab_size, activation='softmax'))
# model.compile(loss='categorical_crossentropy', optimizer='adam')
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [57]:
vocab_size, max_len

(12, 6)

임시로 테스트 한번 넣어보고

In [58]:
model.predict(np.array([[0, 1, 4, 2, 1]]))
# max_len

array([[0.08305098, 0.08359986, 0.08534104, 0.08622256, 0.08723344,
        0.08693309, 0.08061866, 0.08229353, 0.08060069, 0.0822025 ,
        0.08106726, 0.08083653]], dtype=float32)

In [59]:
model.fit(X, y, epochs=200, verbose=2)

Train on 14 samples
Epoch 1/200
14/14 - 1s - loss: 2.4827 - accuracy: 0.0714
Epoch 2/200
14/14 - 0s - loss: 2.4814 - accuracy: 0.0714
Epoch 3/200
14/14 - 0s - loss: 2.4802 - accuracy: 0.0714
Epoch 4/200
14/14 - 0s - loss: 2.4789 - accuracy: 0.0714
Epoch 5/200
14/14 - 0s - loss: 2.4777 - accuracy: 0.0714
Epoch 6/200
14/14 - 0s - loss: 2.4764 - accuracy: 0.0714
Epoch 7/200
14/14 - 0s - loss: 2.4751 - accuracy: 0.0714
Epoch 8/200
14/14 - 0s - loss: 2.4738 - accuracy: 0.0714
Epoch 9/200
14/14 - 0s - loss: 2.4726 - accuracy: 0.0714
Epoch 10/200
14/14 - 0s - loss: 2.4713 - accuracy: 0.0714
Epoch 11/200
14/14 - 0s - loss: 2.4700 - accuracy: 0.1429
Epoch 12/200
14/14 - 0s - loss: 2.4687 - accuracy: 0.1429
Epoch 13/200
14/14 - 0s - loss: 2.4674 - accuracy: 0.1429
Epoch 14/200
14/14 - 0s - loss: 2.4661 - accuracy: 0.1429
Epoch 15/200
14/14 - 0s - loss: 2.4648 - accuracy: 0.1429
Epoch 16/200
14/14 - 0s - loss: 2.4635 - accuracy: 0.1429
Epoch 17/200
14/14 - 0s - loss: 2.4621 - accuracy: 0.1429
Epo

Epoch 142/200
14/14 - 0s - loss: 2.2396 - accuracy: 0.3571
Epoch 143/200
14/14 - 0s - loss: 2.2377 - accuracy: 0.3571
Epoch 144/200
14/14 - 0s - loss: 2.2358 - accuracy: 0.3571
Epoch 145/200
14/14 - 0s - loss: 2.2339 - accuracy: 0.3571
Epoch 146/200
14/14 - 0s - loss: 2.2320 - accuracy: 0.3571
Epoch 147/200
14/14 - 0s - loss: 2.2301 - accuracy: 0.3571
Epoch 148/200
14/14 - 0s - loss: 2.2282 - accuracy: 0.3571
Epoch 149/200
14/14 - 0s - loss: 2.2263 - accuracy: 0.3571
Epoch 150/200
14/14 - 0s - loss: 2.2244 - accuracy: 0.3571
Epoch 151/200
14/14 - 0s - loss: 2.2225 - accuracy: 0.3571
Epoch 152/200
14/14 - 0s - loss: 2.2206 - accuracy: 0.3571
Epoch 153/200
14/14 - 0s - loss: 2.2187 - accuracy: 0.3571
Epoch 154/200
14/14 - 0s - loss: 2.2168 - accuracy: 0.3571
Epoch 155/200
14/14 - 0s - loss: 2.2149 - accuracy: 0.3571
Epoch 156/200
14/14 - 0s - loss: 2.2130 - accuracy: 0.3571
Epoch 157/200
14/14 - 0s - loss: 2.2111 - accuracy: 0.3571
Epoch 158/200
14/14 - 0s - loss: 2.2092 - accuracy: 0.35

<tensorflow.python.keras.callbacks.History at 0x1c3a362e320>

학습이 완료됐으니 **테스트**를 해보자

In [60]:
sample = t.texts_to_sequences(['말이'])
sample = pad_sequences(sample, maxlen=max_len -1)
pred_id = np.argmax(model.predict(sample))
t.sequences_to_texts([[pred_id]])

['말이']