# RNN

# 1. import modules

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import numpy as np

# 2. define input sentences

In [5]:
# Toy corpus: three Korean sentences, one per line, each terminated by a newline.
text = "\n".join([
    "경마장에 있는 말이 뛰고 있다",
    "그의 말이 법이다",
    "가는 말이 고와야 오는 말이 곱다",
]) + "\n"
print(text)

경마장에 있는 말이 뛰고 있다
그의 말이 법이다
가는 말이 고와야 오는 말이 곱다



# 3. word tokenizing

In [10]:
# Create the Keras tokenizer with its default settings.
t = Tokenizer()
# Build the word index from the corpus; the whole text is passed as one document.
t.fit_on_texts([text])
# Vocabulary size is the number of distinct words plus one: word ids start
# at 1 and 0 is the value used for padding.
vocab_size = len(t.word_index) + 1
# Display the result (12 for this corpus).
vocab_size

12

In [11]:
# Number of occurrences of each word in the fitted corpus, in order of first appearance.
t.word_counts

OrderedDict([('경마장에', 1),
             ('있는', 1),
             ('말이', 4),
             ('뛰고', 1),
             ('있다', 1),
             ('그의', 1),
             ('법이다', 1),
             ('가는', 1),
             ('고와야', 1),
             ('오는', 1),
             ('곱다', 1)])

In [12]:
# Word -> integer id mapping; the most frequent word ('말이') has id 1.
t.word_index

{'말이': 1,
 '경마장에': 2,
 '있는': 3,
 '뛰고': 4,
 '있다': 5,
 '그의': 6,
 '법이다': 7,
 '가는': 8,
 '고와야': 9,
 '오는': 10,
 '곱다': 11}

In [14]:
# Encode a sentence made only of known words into its list of word ids.
t.texts_to_sequences(['경마장에 뛰고 있는 말이 있다'])

[[2, 4, 3, 1, 5]]

In [18]:
# Words that are not in the vocabulary are dropped from the output;
# only the known word '있는' (id 3) survives here.
t.texts_to_sequences(['경마장에서 달리고 있는 말도 있는가'])

[[3]]

# 4. define training sequences

In [33]:
# Turn every sentence into its growing prefixes: [w1, w2], [w1, w2, w3], ...
# The last id of each prefix later serves as the label for the ids before it.
sequences = []
for sentence in text.split('\n'):
    token_ids = t.texts_to_sequences([sentence])[0]
    # A prefix needs at least two ids, so empty or one-word lines add nothing.
    for end in range(2, len(token_ids) + 1):
        prefix = token_ids[:end]
        print(end - 1)
        print(prefix)
        # Each prefix is stored as a NumPy array.
        sequences.append(np.array(prefix))

print('total number of training samples: ', len(sequences))

1
[2, 3]
2
[2, 3, 1]
3
[2, 3, 1, 4]
4
[2, 3, 1, 4, 5]
1
[6, 1]
2
[6, 1, 7]
1
[8, 1]
2
[8, 1, 9]
3
[8, 1, 9, 10]
4
[8, 1, 9, 10, 1]
5
[8, 1, 9, 10, 1, 11]
total number of training samples:  11


In [34]:
# Inspect the collected variable-length training sequences.
sequences

[array([2, 3]),
 array([2, 3, 1]),
 array([2, 3, 1, 4]),
 array([2, 3, 1, 4, 5]),
 array([6, 1]),
 array([6, 1, 7]),
 array([8, 1]),
 array([8, 1, 9]),
 array([ 8,  1,  9, 10]),
 array([ 8,  1,  9, 10,  1]),
 array([ 8,  1,  9, 10,  1, 11])]

# 5. define max length and pad sequences

In [31]:
# Comparison only: pad with zeros on the right (padding='post').
# The result is kept in its own variable so that `sequences` still holds the
# unpadded samples. Overwriting `sequences` here would make the later
# padding='pre' call a no-op on a fresh top-to-bottom run, because every row
# would already have length max_len.
max_len = max(len(seq) for seq in sequences)
print('max length: ', max_len)

sequences_post = pad_sequences(sequences, maxlen=max_len, padding='post')
sequences_post

max length:  6


array([[ 2,  3,  0,  0,  0,  0],
       [ 2,  3,  1,  0,  0,  0],
       [ 2,  3,  1,  4,  0,  0],
       [ 2,  3,  1,  4,  5,  0],
       [ 6,  1,  0,  0,  0,  0],
       [ 6,  1,  7,  0,  0,  0],
       [ 8,  1,  0,  0,  0,  0],
       [ 8,  1,  9,  0,  0,  0],
       [ 8,  1,  9, 10,  0,  0],
       [ 8,  1,  9, 10,  1,  0],
       [ 8,  1,  9, 10,  1, 11]])

In [35]:
# Left-pad every sample with zeros (padding='pre') up to the longest sample,
# so the most recent word always ends up in the last column.
max_len = max(map(len, sequences))
print('max length: ', max_len)

sequences = pad_sequences(sequences, padding='pre', maxlen=max_len)
sequences

max length:  6


array([[ 0,  0,  0,  0,  2,  3],
       [ 0,  0,  0,  2,  3,  1],
       [ 0,  0,  2,  3,  1,  4],
       [ 0,  2,  3,  1,  4,  5],
       [ 0,  0,  0,  0,  6,  1],
       [ 0,  0,  0,  6,  1,  7],
       [ 0,  0,  0,  0,  8,  1],
       [ 0,  0,  0,  8,  1,  9],
       [ 0,  0,  8,  1,  9, 10],
       [ 0,  8,  1,  9, 10,  1],
       [ 8,  1,  9, 10,  1, 11]])

# 6. split the padded sequences into X (inputs) and y (labels)

In [45]:
# All columns but the last form the model input; the last column is the
# word id the model should predict.
X, y = sequences[:, :-1], sequences[:, -1]

for part in (X, y):
    print(part)

[[ 0  0  0  0  2]
 [ 0  0  0  2  3]
 [ 0  0  2  3  1]
 [ 0  2  3  1  4]
 [ 0  0  0  0  6]
 [ 0  0  0  6  1]
 [ 0  0  0  0  8]
 [ 0  0  0  8  1]
 [ 0  0  8  1  9]
 [ 0  8  1  9 10]
 [ 8  1  9 10  1]]
[ 3  1  4  5  1  7  1  9 10  1 11]


In [46]:
# One-hot encode the labels, one column per vocabulary index.
# The labels are read from the last column of `sequences` rather than from
# `y` itself, so re-running this cell never one-hot encodes an array that is
# already one-hot.
y = to_categorical(sequences[:, -1], num_classes=vocab_size)
print(y)
print(vocab_size)

[[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]
12


# 7. Modeling

In [49]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, SimpleRNN

# Next-word model: word ids -> 2-d embeddings -> SimpleRNN(3) -> softmax.
model = Sequential()
# Input length is max_len - 1 because the label column was split off from each sequence.
model.add(Embedding(vocab_size, 2, input_length=max_len-1))
model.add(SimpleRNN(3))
# One output unit per vocabulary index, matching the width of the one-hot
# labels (num_classes=vocab_size) instead of a hard-coded 11 + 1.
model.add(Dense(vocab_size, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [60]:
# Check the model output for one padded input of length 5: one probability
# per vocabulary index (12 values). This cell comes before the fit cell in notebook order.
model.predict(np.array([[0, 0, 0, 2, 1]]))

array([[0.08398467, 0.0832266 , 0.08282747, 0.08498497, 0.08260415,
        0.08329845, 0.08230419, 0.08137365, 0.08470046, 0.08386508,
        0.0835427 , 0.08328769]], dtype=float32)

In [61]:
# The softmax outputs should sum to 1, up to floating-point rounding.
model.predict(np.array([[0, 0, 0, 2, 1]])).sum()

1.0000001

In [101]:
# Train for 200 epochs and keep the returned History object, so the loss and
# accuracy curves can be inspected later and the object's repr is not echoed
# as cell output.
# NOTE(review): the saved log starts at loss ~1.05 / accuracy ~0.73 in epoch 1,
# which suggests the model had already been fitted in earlier runs of this
# cell; a fresh Restart & Run All will likely start from a higher loss. Confirm.
history = model.fit(X, y, epochs=200, verbose=2)

Train on 11 samples
Epoch 1/200
11/11 - 0s - loss: 1.0512 - accuracy: 0.7273
Epoch 2/200
11/11 - 0s - loss: 1.0498 - accuracy: 0.7273
Epoch 3/200
11/11 - 0s - loss: 1.0484 - accuracy: 0.7273
Epoch 4/200
11/11 - 0s - loss: 1.0471 - accuracy: 0.7273
Epoch 5/200
11/11 - 0s - loss: 1.0457 - accuracy: 0.7273
Epoch 6/200
11/11 - 0s - loss: 1.0443 - accuracy: 0.7273
Epoch 7/200
11/11 - 0s - loss: 1.0430 - accuracy: 0.7273
Epoch 8/200
11/11 - 0s - loss: 1.0416 - accuracy: 0.7273
Epoch 9/200
11/11 - 0s - loss: 1.0403 - accuracy: 0.7273
Epoch 10/200
11/11 - 0s - loss: 1.0389 - accuracy: 0.7273
Epoch 11/200
11/11 - 0s - loss: 1.0376 - accuracy: 0.7273
Epoch 12/200
11/11 - 0s - loss: 1.0362 - accuracy: 0.7273
Epoch 13/200
11/11 - 0s - loss: 1.0349 - accuracy: 0.7273
Epoch 14/200
11/11 - 0s - loss: 1.0336 - accuracy: 0.7273
Epoch 15/200
11/11 - 0s - loss: 1.0322 - accuracy: 0.7273
Epoch 16/200
11/11 - 0s - loss: 1.0309 - accuracy: 0.7273
Epoch 17/200
11/11 - 0s - loss: 1.0296 - accuracy: 0.7273
Epo

Epoch 142/200
11/11 - 0s - loss: 0.8916 - accuracy: 0.8182
Epoch 143/200
11/11 - 0s - loss: 0.8907 - accuracy: 0.8182
Epoch 144/200
11/11 - 0s - loss: 0.8898 - accuracy: 0.8182
Epoch 145/200
11/11 - 0s - loss: 0.8889 - accuracy: 0.8182
Epoch 146/200
11/11 - 0s - loss: 0.8880 - accuracy: 0.8182
Epoch 147/200
11/11 - 0s - loss: 0.8871 - accuracy: 0.8182
Epoch 148/200
11/11 - 0s - loss: 0.8862 - accuracy: 0.8182
Epoch 149/200
11/11 - 0s - loss: 0.8854 - accuracy: 0.8182
Epoch 150/200
11/11 - 0s - loss: 0.8845 - accuracy: 0.8182
Epoch 151/200
11/11 - 0s - loss: 0.8836 - accuracy: 0.8182
Epoch 152/200
11/11 - 0s - loss: 0.8827 - accuracy: 0.8182
Epoch 153/200
11/11 - 0s - loss: 0.8819 - accuracy: 0.8182
Epoch 154/200
11/11 - 0s - loss: 0.8810 - accuracy: 0.8182
Epoch 155/200
11/11 - 0s - loss: 0.8801 - accuracy: 0.8182
Epoch 156/200
11/11 - 0s - loss: 0.8793 - accuracy: 0.8182
Epoch 157/200
11/11 - 0s - loss: 0.8784 - accuracy: 0.8182
Epoch 158/200
11/11 - 0s - loss: 0.8776 - accuracy: 0.81

<tensorflow.python.keras.callbacks.History at 0x2007eca5cf8>

In [102]:
# Encode a one-word prompt and left-pad it to the model's input length.
# max_len - 1 is the same expression used for the Embedding input length, so
# the prompt width follows the corpus instead of a hard-coded 6-1.
sample = t.texts_to_sequences(['말이'])
sample = pad_sequences(sample, maxlen=max_len - 1)
sample

array([[0, 0, 0, 0, 1]])

In [103]:
# Run the model once and reuse the result for both printouts, instead of
# performing two identical forward passes.
probs = model.predict(sample)
print(probs)
print(probs.sum())

[[0.00676516 0.49597344 0.00693486 0.11797989 0.08602361 0.01723134
  0.01364004 0.05981395 0.00651001 0.09689014 0.07537879 0.01685869]]
0.99999994


In [104]:
# Take the vocabulary index with the highest probability and map it back to its word.
pred_id = model.predict(sample).argmax()
print(pred_id)
print('predicted word: ', t.sequences_to_texts([[pred_id]]))

1
predicted word:  ['말이']


In [105]:
# Print the word behind every vocabulary id. Word ids run from 1 to
# vocab_size - 1 (0 is the padding value), so the bound is taken from
# vocab_size instead of a hard-coded 12.
for word_id in range(1, vocab_size):
    print(t.sequences_to_texts([[word_id]]))

['말이']
['경마장에']
['있는']
['뛰고']
['있다']
['그의']
['법이다']
['가는']
['고와야']
['오는']
['곱다']


# 8. test with different input words

In [106]:
# Predict the word that follows a single known word.
sample = t.texts_to_sequences(['경마장에'])
print('sample: ', sample)
# Pad to the model's input length (max_len - 1) rather than a hard-coded 6-1.
sample = pad_sequences(sample, maxlen=max_len - 1)
pred_id = np.argmax(model.predict(sample))
print('predicted word: ', t.sequences_to_texts([[pred_id]]))

sample:  [[2]]
predicted word:  ['말이']


In [107]:
# Unknown token: the word is not in the vocabulary, so its encoding is empty
# and the padded model input consists only of zeros.
sample = t.texts_to_sequences(['경륜장에는'])
print('sample: ', sample)
# Pad to the model's input length (max_len - 1) rather than a hard-coded 6-1.
sample = pad_sequences(sample, maxlen=max_len - 1)
pred_id = np.argmax(model.predict(sample))
print('predicted word: ', t.sequences_to_texts([[pred_id]]))

sample:  [[]]
predicted word:  ['말이']


In [108]:
# Predict the next word after a three-word prefix of the third training sentence.
sample = t.texts_to_sequences(['가는 말이 고와야'])
# Pad to the model's input length (max_len - 1) rather than a hard-coded 6-1.
sample = pad_sequences(sample, maxlen=max_len - 1)
print('sample: ', sample)
pred_id = np.argmax(model.predict(sample))
print('predicted word: ', t.sequences_to_texts([[pred_id]]))

sample:  [[0 0 8 1 9]]
predicted word:  ['오는']


In [109]:
# Predict the next word after a four-word prefix of the first training sentence.
sample = t.texts_to_sequences(['경마장에 있는 말이 뛰고'])
# Pad to the model's input length (max_len - 1) rather than a hard-coded 6-1.
sample = pad_sequences(sample, maxlen=max_len - 1)
print('sample: ', sample)
pred_id = np.argmax(model.predict(sample))
print('predicted word: ', t.sequences_to_texts([[pred_id]]))

sample:  [[0 2 3 1 4]]
predicted word:  ['있다']
