# LSTM 및 GRU를 활용한 언어 모형 생성 
---
RNN(순환신경망)의 대표 모형인 LSTM과 GRU를 활용하여 언어 모형 / 문장 생성 모델 실습

## 2. 언어 모형 만들기

### 데이터 불러오기

In [1]:
import tensorflow.keras
import pandas as pd

In [2]:
# Load the tab-separated file without a header row; pandas then labels the
# columns 0 (review text) and 1 (0/1 label), as the preview below shows.
df = pd.read_csv('amazon_cells_labelled.txt', sep="\t", header=None)
df.head()

Unnamed: 0,0,1
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


#### 토큰화

In [3]:
# Keras text tokenizer created with its default settings.
tok = tensorflow.keras.preprocessing.text.Tokenizer()

In [4]:
# Build the vocabulary from the review texts (column 0), then encode every
# review as a list of integer word indices.
tok.fit_on_texts(df[0])
seq = tok.texts_to_sequences(df[0])

In [5]:
# Show the raw text of the first review (row 0, column 0).
df.iloc[0, 0]

'So there is no way for me to plug it in here in the US unless I go by a converter.'

#### 텍스트의 시작과 끝을 나타내는 단어를 사전에 추가

In [6]:
# Register two extra vocabulary entries that mark the start and end of a text.
# Each one takes the next free index and is stored in both lookup directions.
start = len(tok.word_index) + 1
tok.word_index["<START>"] = start
tok.index_word[start] = "<START>"

end = len(tok.word_index) + 1
tok.word_index["<END>"] = end
tok.index_word[end] = "<END>"

In [7]:
# prev_seq: model input, each sentence wrapped as <START> ... <END>.
# next_seq: prediction target, the same sentence followed by <END> only,
# i.e. the input shifted left by one token.
# NOTE(review): because the input keeps <END>, it is one token longer than its target.
prev_seq = [[start] + s + [end] for s in seq]
next_seq = [s + [end] for s in seq]

#### 패딩

In [8]:
# Length of the longest input sequence; used as the common padded length.
max_len = max(map(len, prev_seq))

In [9]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [10]:
# padding='post' appends zeros after each sequence ('pre' would prepend them).
prev_pad = pad_sequences(prev_seq, maxlen=max_len, padding='post')
next_pad = pad_sequences(next_seq, maxlen=max_len, padding='post')

In [11]:
# Inspect the first padded input sequence.
prev_pad[0]

array([1879,   33,  117,    5,   53,  214,   11,   47,    8,  155,    4,
         19,  337,   19,    1,  546,  416,    2,  241,  190,    6,  812,
       1880,    0,    0,    0,    0,    0,    0,    0,    0,    0])

In [12]:
# Inspect the first padded target sequence.
next_pad[0]

array([  33,  117,    5,   53,  214,   11,   47,    8,  155,    4,   19,
        337,   19,    1,  546,  416,    2,  241,  190,    6,  812, 1880,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0])

### 모델 만들기

In [13]:
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, TimeDistributed

In [14]:
# Hold out 20% of the (input, target) pairs; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(prev_pad, next_pad, test_size=0.2, random_state=1234)

In [15]:
# Vocabulary size for the model: every entry in index_word plus one extra slot
# for index 0, which the padded arrays use as the padding value.
Num_words = len(tok.index_word)+1

In [16]:
# Word-level language model: embedding -> LSTM -> per-timestep softmax.
rnn = Sequential([
    # mask_zero=True treats index 0 (the padding value) as a masked timestep.
    Embedding(input_dim=Num_words, output_dim=8, input_length=max_len, mask_zero=True),
    # return_sequences=True emits an output at every timestep instead of only the last one.
    LSTM(16, return_sequences=True),
    # One probability distribution over the Num_words vocabulary entries per timestep.
    TimeDistributed(Dense(Num_words, activation='softmax')),
])

rnn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 32, 8)             15048     
                                                                 
 lstm (LSTM)                 (None, 32, 16)            1600      
                                                                 
 time_distributed (TimeDistr  (None, 32, 1881)         31977     
 ibuted)                                                         
                                                                 
Total params: 48,625
Trainable params: 48,625
Non-trainable params: 0
_________________________________________________________________


#### 출력 형태 맞추기
---
모델의 출력 형태는 (None, 32, 1881)인데 정답 데이터(y_train)는 (800, 32) 형태이므로 차원 수가 맞지 않음.

정답 데이터의 마지막에 크기 1인 축을 덧붙여 (800, 32, 1) 형태로 맞춰줌

In [17]:
import numpy as np

In [18]:
# Check the target shape before adding the extra axis.
y_train.shape

(800, 32)

In [19]:
# Append a trailing axis of size 1 so the 2-D targets become 3-D.
y_train_dims = y_train[..., np.newaxis]
y_train_dims.shape

(800, 32, 1)

### 학습하기

In [20]:
from keras.optimizers import Adam

In [21]:
# Targets are integer word ids, so use sparse categorical cross-entropy.
# sample_weight_mode="temporal" is intentionally not passed: it is only relevant
# when per-timestep (2-D) sample weights are supplied to fit(), it is not a
# requirement of TimeDistributed, and newer Keras releases no longer accept
# the argument in compile().
# NOTE(review): learning_rate=0.1 is far above Adam's default of 0.001 —
# confirm this is intentional.
rnn.compile(optimizer=Adam(learning_rate=0.1),
            loss='sparse_categorical_crossentropy',
            metrics=['acc'])

In [23]:
# Train for 10 epochs on the training inputs and the 3-D targets; no validation data is passed.
rnn.fit(X_train, y_train_dims, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2100e949a00>

In [24]:
# y_train itself is unchanged by the expand_dims step; still 2-D.
y_train.shape

(800, 32)

## 생성 모델을 바탕으로 다음 단어 예측하기

#### 예측값 사전 확인

In [25]:
# Decode the first 10 token ids of the first input sequence back into words.
[tok.index_word[token_id] for token_id in prev_seq[0][:10]]

['<START>', 'so', 'there', 'is', 'no', 'way', 'for', 'me', 'to', 'plug']

In [26]:
# The 11th token of the first sequence: the word that actually follows the
# 10-token prefix decoded above.
i = prev_seq[0][10]
tok.index_word[i]

'it'

#### RNN 활용 예측

In [27]:
# Use the first 10 tokens of the first sequence as a prompt, right-padded to max_len.
new_sentence = [prev_seq[0][:10]]
new_pad = pad_sequences(new_sentence, maxlen=max_len, padding='post')

In [28]:
# Run the model on the padded prompt; the result holds one distribution over
# the vocabulary for every timestep.
next_words = rnn.predict(new_pad)
next_words.shape



(1, 32, 1881)

In [30]:
# The prompt occupies timesteps 0..len-1, so the distribution for the word that
# follows it is the output at the LAST prompt timestep (index 9 for a 10-token
# prompt). Index 10 is a zero-padded, masked timestep and should not be read.
next_words[0, len(new_sentence[0]) - 1].argmax()

4

In [31]:
# Translate the predicted index back into its word.
tok.index_word[4]

'it'

In [32]:
# Greedy decoding: start from <START> alone and repeatedly append the most
# probable next word until <END> is produced or the buffer is full.
new_sentence = [[start]]
new_pad = pad_sequences(new_sentence, maxlen=max_len, padding='post')

for i in range(max_len - 1):
    next_words = rnn.predict(new_pad)      # distributions for every timestep
    word = np.argmax(next_words[0, i])     # most probable successor of position i
    print(tok.index_word[word])
    new_pad[0, i + 1] = word               # feed the choice back as the next input
    if word == end:                        # stop once the end marker is generated
        break

i
have
had
to
use
<END>


In [33]:
import numpy.random

In [36]:
# Stochastic decoding: start from <START> alone and repeatedly sample the next
# word from the predicted distribution until <END> is drawn or the buffer is full.
new_sentence = [[start]]
new_pad = pad_sequences(new_sentence, max_len, padding='post')

for i in range(max_len - 1):
    next_words = rnn.predict(new_pad)

    # Sample the next word according to the predicted probabilities.
    # Index 0 is the padding value: it has no entry in tok.index_word and,
    # written into new_pad, would become a masked timestep in the middle of
    # the sentence. Exclude it and renormalise (astype returns a copy, so the
    # prediction array itself is not modified).
    probs = next_words[0, i].astype('float64')
    probs[0] = 0.0
    probs /= probs.sum()
    word = numpy.random.choice(Num_words, p=probs)

    print(tok.index_word[word])
    new_pad[0, i + 1] = word
    if word == end:  # stop once the end marker is generated
        break

i
told
extra
years
when
no
do
but
any
while
it's
makes
unacceptable
<END>
