# RNN

## Keras로 RNN 구현하기

In [None]:
from keras.models import Sequential
from keras.layers import SimpleRNN

In [None]:
model = Sequential()
model.add(SimpleRNN(3, input_shape=(2,10)))

In [None]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn_3 (SimpleRNN)     (None, 3)                 42        
Total params: 42
Trainable params: 42
Non-trainable params: 0
_________________________________________________________________


In [None]:
model = Sequential()
model.add(SimpleRNN(3, batch_input_shape=(8,2,10)))

In [None]:
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn_4 (SimpleRNN)     (8, 3)                    42        
Total params: 42
Trainable params: 42
Non-trainable params: 0
_________________________________________________________________


In [None]:
model = Sequential()
model.add(SimpleRNN(3, batch_input_shape=(8,2,10), return_sequences=True))

In [None]:
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn_5 (SimpleRNN)     (8, 2, 3)                 42        
Total params: 42
Trainable params: 42
Non-trainable params: 0
_________________________________________________________________


## 파이썬으로 RNN 구현하기

$$ h_t = tanh(W_x X_t + W_h H_{t-1} + b)$$

```
의사코드
hidden_State_t = 0
for input_t in input_length: #각 시점마다 입력을 받는다.
    output_t = tanh(input_t, hidden_stat_t) #각 시점에 대해서 입력과 은닉 상태를 가지고 연산
    hidden_stat_t = output_t # 계산결과는 현재 시점의 은닉 상태가 된다.
```

In [None]:
import numpy as np
timesteps = 10  # 시점의 수 
input_dim = 4
hidden_size = 8  #은닉 상태의 크기 (메모리 셀의 용량?)

inputs = np.random.random((timesteps, input_dim))

hidden_state_t = np.zeros((hidden_size, ))
# 초기 은닉 상태는 0(벡터)로 초기화
# 은닉 상태의 크기 hidden size로 은닉 상태를 만듬

In [None]:
print(hidden_state_t)

[0. 0. 0. 0. 0. 0. 0. 0.]


In [None]:
Wx = np.random.random((hidden_size, input_dim))  # (8, 4) 입력에 대한 가중치
Wh = np.random.random((hidden_size, hidden_size))  # (8, 8) 은닉 상태에 대한 가중치
b = np.random.random((hidden_size,))  # (8,) 편향 

In [None]:
print(np.shape(Wx))  # (은닉 상태의 크기 * 입력의 차원)
print(np.shape(Wh))  # (은닉 상태의 크기 * 은닉 상태의 크기)
print(np.shape(b))  # (은닉 상태의 크기)

(8, 4)
(8, 8)
(8,)


In [None]:
total_hidden_state = []

# 메모리 셀 동작
for input_t in inputs:
  output_t = np.tanh(np.dot(Wx, input_t)+ np.dot(Wh, hidden_state_t)+b) 
  total_hidden_state.append(list(output_t))
  print(np.shape(total_hidden_state)) #각 시점 t별 메모리 셀의 출력의 크기는 (timestep, output_dim)

total_hidden_state = np.stack(total_hidden_state, axis=0)  # ( 1, 2, 3)

print(total_hidden_state) # (timesteps , outputdim) 크기

(1, 8)
(2, 8)
(3, 8)
(4, 8)
(5, 8)
(6, 8)
(7, 8)
(8, 8)
(9, 8)
(10, 8)
[[0.97548489 0.97022677 0.82556418 0.97270961 0.98506094 0.92232357
  0.93803503 0.94089292]
 [0.97943734 0.97456319 0.90187539 0.97976983 0.99124966 0.92560473
  0.96789186 0.95683747]
 [0.96224665 0.96295215 0.77117555 0.96069401 0.97495061 0.91238416
  0.90253291 0.92478352]
 [0.75812835 0.84831063 0.80821043 0.89469448 0.92666568 0.83199089
  0.87482068 0.88897119]
 [0.98896821 0.98701806 0.90346561 0.98251519 0.99326374 0.92558597
  0.96693605 0.96446478]
 [0.84725367 0.80255666 0.57034852 0.84592306 0.89255554 0.6745315
  0.72731566 0.87089095]
 [0.96893155 0.92656848 0.6838951  0.95826377 0.97510236 0.86496246
  0.89514638 0.92173411]
 [0.92257133 0.92461872 0.80385827 0.91717752 0.95826435 0.76956992
  0.88309675 0.9250734 ]
 [0.9643405  0.94957018 0.88529009 0.96533602 0.98555988 0.86363321
  0.9560291  0.94999034]
 [0.95712978 0.92591774 0.65982406 0.93993953 0.96322996 0.83061131
  0.85436312 0.91362053]]

## 양방향 순환 신경망 (Bidirectional Recurrent Neural Network)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Bidirectional

model = Sequential()
model.add(Bidirectional(SimpleRNN(hidden_size, return_sequences= True), input_shape=(timesteps, input_dim)))

In [None]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional (Bidirectional (None, 10, 16)            208       
Total params: 208
Trainable params: 208
Non-trainable params: 0
_________________________________________________________________


In [None]:
model = Sequential()
model.add(Bidirectional(SimpleRNN(hidden_size, return_sequences = True), input_shape=(timesteps, input_dim)))
model.add(Bidirectional(SimpleRNN(hidden_size, return_sequences=True)))
model.add(Bidirectional(SimpleRNN(hidden_size, return_sequences=True)))
model.add(Bidirectional(SimpleRNN(hidden_size, return_sequences=True)))

In [None]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_1 (Bidirection (None, 10, 16)            208       
_________________________________________________________________
bidirectional_2 (Bidirection (None, 10, 16)            400       
_________________________________________________________________
bidirectional_3 (Bidirection (None, 10, 16)            400       
_________________________________________________________________
bidirectional_4 (Bidirection (None, 10, 16)            400       
Total params: 1,408
Trainable params: 1,408
Non-trainable params: 0
_________________________________________________________________


``` 
Quiz
1. Embedding을 사용하여 단어 집합(Vocabulary)의 크기가 5,000이고, 임베딩 벡터의 차원은 100입니다.
2. 은닉층에서 Simple RNN을 사용하여 은닉상태의 크기는 128이다.
3. 훈련에 사용하는 모든 샘플의 길이는 30으로 가정
4. 이진 분류를 수행하는 모델로, 출력층의 뉴련은 1개로 시그모이드 함수를 사용
5. 은닉층은 1개
총 파라미터 갯수를 구하시오.
```

```
Embedding = 5,000(input) * 100(embedding) = 500,000
Wx = 100(embedding) * 128(hidden) = 12,800
Wh = 128 * 128 = 16,384
bias(hidden) = 128
Wy = 128
bias(output (y) ) = 1

total = 529,441
```

## 임의의 입력으로 SimpleRNN과 LSTM 이해하기

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import SimpleRNN, LSTM, Bidirectional

In [None]:
train_x = [[0.1, 4.2, 1.5, 1.1, 2.8], [1.0, 3.1, 2.5, 0.7, 1.1], [0.3, 2.1, 1.5, 2.1, 0.1], [2.2, 1.4, 0.5, 0.9, 1.1]]
print(np.shape(train_x))  # 단어 벡터 차원은 5, 문장의 길이가 4인 경우를 가정

(4, 5)


In [None]:
train_x = [[[0.1, 4.2, 1.5, 1.1, 2.8], [1.0, 3.1, 2.5, 0.7, 1.1], [0.3, 2.1, 1.5, 2.1, 0.1], [2.2, 1.4, 0.5, 0.9, 1.1]]]
train_x = np.array(train_x, dtype=np.float32)

print(train_x.shape)  # (batch_size, timesteps, input_dim)

(1, 4, 5)


In [None]:
rnn = SimpleRNN(3)  # rnn = SimpleRNN(3, return_sequences=False, return_state=False)와 동일
hidden_state = rnn(train_x)

print('hidden state : {}, shape: {}'.format(hidden_state, hidden_state.shape))

hidden state : [[-0.8143862  -0.08219036 -0.99989784]], shape: (1, 3)


In [None]:
rnn = SimpleRNN(3, return_sequences=True)
hidden_state = rnn(train_x)

print('hidden states : {}, shape : {}'.format(hidden_state, hidden_state.shape))

hidden states : [[[-0.41060805  0.9882579  -0.9967479 ]
  [-0.5616771   0.66002697 -0.9981759 ]
  [-0.9336512   0.32918617 -0.9656355 ]
  [ 0.18236862  0.6887267  -0.98288196]]], shape : (1, 4, 3)


In [None]:
rnn = SimpleRNN(3, return_sequences=True, return_state=True)
hidden_state, last_state = rnn(train_x)

print('hidden states: {}, shape:{}'.format(hidden_state, hidden_state.shape))
print('last hidden state: {}, shape : {}'.format(last_state, last_state.shape))

hidden states: [[[ 0.974603   -0.71189094  0.96084327]
  [ 0.9579967  -0.89243335  0.48994237]
  [ 0.3339227  -0.48368046  0.45302042]
  [ 0.47572416 -0.13298574  0.2446679 ]]], shape:(1, 4, 3)
last hidden state: [[ 0.47572416 -0.13298574  0.2446679 ]], shape : (1, 3)


In [None]:
rnn = SimpleRNN(3, return_sequences=False, return_state=True)
hidden_state, last_state = rnn(train_x)

print('hidden state ; {}, shape: {}'.format(hidden_state, hidden_state.shape))
print('last hidden state : {}, shape : {}'.format(last_state, last_state.shape))

hidden state ; [[-0.19887167  0.77704924 -0.9805013 ]], shape: (1, 3)
last hidden state : [[-0.19887167  0.77704924 -0.9805013 ]], shape : (1, 3)


## LSTM 이해하기

In [None]:
lstm = LSTM(3, return_sequences=False, return_state=True)
hidden_state, last_state, last_cell_state = lstm(train_x)

print('hidden state : {}, shape :{}'.format(hidden_state, hidden_state.shape))
print('last hidden state: {}, shape : {}'.format(last_state, last_state.shape))
print('last cell state : {}, shape : {}'.format(last_cell_state, last_cell_state.shape))

hidden state : [[-0.18315928 -0.15889569  0.00512844]], shape :(1, 3)
last hidden state: [[-0.18315928 -0.15889569  0.00512844]], shape : (1, 3)
last cell state : [[-0.26263916 -0.20663036  0.01880284]], shape : (1, 3)


In [9]:
lstm = LSTM(3, return_sequences=True, return_state=True)
hidden_state, last_hidden_state, last_cell_state = lstm(train_x)

print('hidden state : {}, shape : {}'.format(hidden_state, hidden_state.shape))
print('last hidden state: {}, shape: {}'.format(last_hidden_state, last_hidden_state.shape))
print('last cell state : {}, shape : {}'.format(last_cell_state, last_cell_state.shape))

hidden state : [[[ 0.05965044  0.00293815 -0.00715881]
  [ 0.22557531  0.0092026   0.08662733]
  [ 0.1512053   0.02101911 -0.00074633]
  [ 0.10630693 -0.0073412   0.05954108]]], shape : (1, 4, 3)
last hidden state: [[ 0.10630693 -0.0073412   0.05954108]], shape: (1, 3)
last cell state : [[ 0.56940705 -0.06561618  0.18111369]], shape : (1, 3)


## Bidirectional (LSTM) 이해하기

In [10]:
k_init = tf.keras.initializers.Constant(value= 0.1)
b_init = tf.keras.initializers.Constant(value=0)
r_init = tf.keras.initializers.Constant(value= 0.1)

In [11]:
bilstm = Bidirectional(LSTM(3, return_sequences=False, return_state=True, kernel_initializer=k_init, bias_initializer=b_init, recurrent_initializer=r_init))
hidden_state, forward_h, forward_c, backward_h, backward_c = bilstm(train_x)

print('hidden state : {}, shape: {}'.format(hidden_state, hidden_state.shape))
print('forward state : {}, shape : {}'.format(forward_h, forward_h.shape))
print('backward state : {}, shape : {}'.format(backward_h, backward_h.shape))

hidden state : [[0.63031393 0.63031393 0.63031393 0.7038734  0.7038734  0.7038734 ]], shape: (1, 6)
forward state : [[0.63031393 0.63031393 0.63031393]], shape : (1, 3)
backward state : [[0.7038734 0.7038734 0.7038734]], shape : (1, 3)


In [12]:
bilstm = Bidirectional(LSTM(3, return_sequences=True, return_state=True, kernel_initializer=k_init, bias_initializer=b_init, recurrent_initializer=r_init))
hidden_state, forward_h, forward_c, backward_h, backward_c= bilstm(train_x)

print('hidden state : {}, shape:{}'.format(hidden_state, hidden_state.shape))
print('forward state : {}, shape : {}'.format(forward_h, forward_h.shape))
print('backward state : {}, shape : {}'.format(backward_h, backward_h.shape))

hidden state : [[[0.35906473 0.35906473 0.35906473 0.7038734  0.7038734  0.7038734 ]
  [0.5511133  0.5511133  0.5511133  0.58863586 0.58863586 0.58863586]
  [0.59115744 0.59115744 0.59115744 0.3951699  0.3951699  0.3951699 ]
  [0.63031393 0.63031393 0.63031393 0.21942244 0.21942244 0.21942244]]], shape:(1, 4, 6)
forward state : [[0.63031393 0.63031393 0.63031393]], shape : (1, 3)
backward state : [[0.7038734 0.7038734 0.7038734]], shape : (1, 3)


## RNN을 이용한 텍스트 생성(Text Generation)
- 다 대 일 (many to one)구조의 RNN을 사용하여 문맥을 반영해서 텍스트를 생성하는 모델을 만들어 보자
- '경마장에 있는 말이 뛰고 있다'
- '그의 말이 법이다'
- '가는 말이 고와야 오는 말이 곱다'

In [13]:
import pandas as pd
# 위 3문장을 재구성하면, 아래와 같이 총 11개의 샘플이 구성됨

content = [['경마장에', '있는'],['경마장에 있는','말이'],['경마장에 있는 말이', '뛰고'],['경마장에 있는 말이 뛰고','있다'],['그의','말이'],['그의 말이','법이다'],['가는', '말이'],['가는 말이','고와야'],['가는 말이 고와야','오는'],['가는 말이 고와야 오는','말이'],['가는 말이 고와야 오는 말이','곱다']]
df = pd.DataFrame(content)
df.index = ['1', '2', '3', '4', '5', '6', '7', '8', '9','10','11']
df.columns = ['X', 'y']

df

Unnamed: 0,X,y
1,경마장에,있는
2,경마장에 있는,말이
3,경마장에 있는 말이,뛰고
4,경마장에 있는 말이 뛰고,있다
5,그의,말이
6,그의 말이,법이다
7,가는,말이
8,가는 말이,고와야
9,가는 말이 고와야,오는
10,가는 말이 고와야 오는,말이


### 데이터에 대한 이해와 전처리

In [14]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
from tensorflow.keras.utils import to_categorical

In [15]:
text = """ 경마장에 있는 말이 뛰고 있다 \n
그의 말이 법이다\n
가는 말이 고와야 오는 말이 곱다\n"""

In [16]:
# 단어집합을 생성, 크기를 확인
t = Tokenizer()
t.fit_on_texts([text])
vocab_size = len(t.word_index) +1

# 케라스 토크나이저 정수 인코딩은 인덱스가 1부터 시작,
# 케라스 원-핫 인코딩에서 배열의 인덱스가 0부터 시작하기 때문에
# 배열의 크기를 실제 단어 집합의 크기보다 +1로 생성해야함 그래서 미리 +1 선언
print('단어 집합의 크기 : %d'% vocab_size)

단어 집합의 크기 : 12


In [17]:
print(t.word_index)

{'말이': 1, '경마장에': 2, '있는': 3, '뛰고': 4, '있다': 5, '그의': 6, '법이다': 7, '가는': 8, '고와야': 9, '오는': 10, '곱다': 11}


In [18]:
# 훈련 데이터 만들기
sequences = list()
for line in text.split('\n'):
  encoded = t.texts_to_sequences([line])[0]
  for i in range(1, len(encoded)):
    sequence = encoded[:i+1]
    sequences.append(sequence)

print('학습에 사용할 샘플의 갯수 : %d' % len(sequences))

학습에 사용할 샘플의 갯수 : 11


In [19]:
print(sequences)

[[2, 3], [2, 3, 1], [2, 3, 1, 4], [2, 3, 1, 4, 5], [6, 1], [6, 1, 7], [8, 1], [8, 1, 9], [8, 1, 9, 10], [8, 1, 9, 10, 1], [8, 1, 9, 10, 1, 11]]


In [20]:
max_len = max(len(l) for l in sequences)  # 모든 샘플에서 길이가 가장 긴 샘플의 길이 출력
print('샘플의 최대 길이 : {}'.format(max_len))

샘플의 최대 길이 : 6


In [21]:
sequences = pad_sequences(sequences, maxlen = max_len, padding='pre')

In [22]:
print(sequences)

[[ 0  0  0  0  2  3]
 [ 0  0  0  2  3  1]
 [ 0  0  2  3  1  4]
 [ 0  2  3  1  4  5]
 [ 0  0  0  0  6  1]
 [ 0  0  0  6  1  7]
 [ 0  0  0  0  8  1]
 [ 0  0  0  8  1  9]
 [ 0  0  8  1  9 10]
 [ 0  8  1  9 10  1]
 [ 8  1  9 10  1 11]]


In [23]:
sequences = np.array(sequences)
x = sequences[:, :-1]
y = sequences[:, -1]

# 리스트의 마지막 값을 제외하고 저장한 것은 x
# 리스트의 마지막 값만 저장한 것은 y, --> 레이블에 해당

In [24]:
print(x)

[[ 0  0  0  0  2]
 [ 0  0  0  2  3]
 [ 0  0  2  3  1]
 [ 0  2  3  1  4]
 [ 0  0  0  0  6]
 [ 0  0  0  6  1]
 [ 0  0  0  0  8]
 [ 0  0  0  8  1]
 [ 0  0  8  1  9]
 [ 0  8  1  9 10]
 [ 8  1  9 10  1]]


In [25]:
print(y)

[ 3  1  4  5  1  7  1  9 10  1 11]


In [26]:
y = to_categorical(y, num_classes=vocab_size)

In [27]:
print(y)

[[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]


### 모델 설계하기

In [28]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, SimpleRNN

In [29]:
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=max_len-1))
model.add(SimpleRNN(32))
model.add(Dense(vocab_size, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(x, y, epochs=200, verbose=2)

Epoch 1/200
1/1 - 1s - loss: 2.4679 - accuracy: 0.2727
Epoch 2/200
1/1 - 0s - loss: 2.4520 - accuracy: 0.3636
Epoch 3/200
1/1 - 0s - loss: 2.4361 - accuracy: 0.4545
Epoch 4/200
1/1 - 0s - loss: 2.4200 - accuracy: 0.4545
Epoch 5/200
1/1 - 0s - loss: 2.4038 - accuracy: 0.4545
Epoch 6/200
1/1 - 0s - loss: 2.3872 - accuracy: 0.4545
Epoch 7/200
1/1 - 0s - loss: 2.3704 - accuracy: 0.3636
Epoch 8/200
1/1 - 0s - loss: 2.3531 - accuracy: 0.3636
Epoch 9/200
1/1 - 0s - loss: 2.3354 - accuracy: 0.3636
Epoch 10/200
1/1 - 0s - loss: 2.3172 - accuracy: 0.3636
Epoch 11/200
1/1 - 0s - loss: 2.2986 - accuracy: 0.3636
Epoch 12/200
1/1 - 0s - loss: 2.2793 - accuracy: 0.3636
Epoch 13/200
1/1 - 0s - loss: 2.2596 - accuracy: 0.3636
Epoch 14/200
1/1 - 0s - loss: 2.2393 - accuracy: 0.3636
Epoch 15/200
1/1 - 0s - loss: 2.2186 - accuracy: 0.3636
Epoch 16/200
1/1 - 0s - loss: 2.1975 - accuracy: 0.3636
Epoch 17/200
1/1 - 0s - loss: 2.1761 - accuracy: 0.3636
Epoch 18/200
1/1 - 0s - loss: 2.1544 - accuracy: 0.3636
E

<tensorflow.python.keras.callbacks.History at 0x7ff77d229550>

In [30]:
# 문장을 생성하는 함수 만들어서 출력하기
def sentence_generation(model, t, current_word, n): # 모델, 토크나이저, 현재 단어, 반복할 횟수
    init_word = current_word # 처음 들어온 단어도 마지막에 같이 출력하기위해 저장
    sentence = ''
    for _ in range(n): # n번 반복
        encoded = t.texts_to_sequences([current_word])[0] # 현재 단어에 대한 정수 인코딩
        encoded = pad_sequences([encoded], maxlen=5, padding='pre') # 데이터에 대한 패딩
        result = model.predict_classes(encoded, verbose=0)
    # 입력한 X(현재 단어)에 대해서 Y를 예측하고 Y(예측한 단어)를 result에 저장.
        for word, index in t.word_index.items(): 
            if index == result: # 만약 예측한 단어와 인덱스와 동일한 단어가 있다면
                break # 해당 단어가 예측 단어이므로 break
        current_word = current_word + ' '  + word # 현재 단어 + ' ' + 예측 단어를 현재 단어로 변경
        sentence = sentence + ' ' + word # 예측 단어를 문장에 저장
    # for문이므로 이 행동을 다시 반복
    sentence = init_word + sentence
    return sentence

In [31]:
print(sentence_generation(model, t, '경마장에', 4))



경마장에 있는 말이 뛰고 있다


In [32]:
print(sentence_generation(model, t, '그의', 2))

그의 말이 법이다




In [33]:
print(sentence_generation(model, t, '가는', 5))

가는 말이 고와야 오는 말이 곱다




## LSTM을 이용한 텍스트 생성

In [34]:
import pandas as pd
from string import punctuation
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
from tensorflow.keras.utils import to_categorical

In [36]:
!pwd

/content


In [37]:
ls

ArticlesApril2018.csv  [0m[01;34msample_data[0m/


In [38]:
df = pd.read_csv('./ArticlesApril2018.csv')

df.head()

Unnamed: 0,articleID,articleWordCount,byline,documentType,headline,keywords,multimedia,newDesk,printPage,pubDate,sectionName,snippet,source,typeOfMaterial,webURL
0,5adf6684068401528a2aa69b,781,By JOHN BRANCH,article,Former N.F.L. Cheerleaders’ Settlement Offer: ...,"['Workplace Hazards and Violations', 'Football...",68,Sports,0,2018-04-24 17:16:49,Pro Football,"“I understand that they could meet with us, pa...",The New York Times,News,https://www.nytimes.com/2018/04/24/sports/foot...
1,5adf653f068401528a2aa697,656,By LISA FRIEDMAN,article,E.P.A. to Unveil a New Rule. Its Effect: Less ...,"['Environmental Protection Agency', 'Pruitt, S...",68,Climate,0,2018-04-24 17:11:21,Unknown,The agency plans to publish a new regulation T...,The New York Times,News,https://www.nytimes.com/2018/04/24/climate/epa...
2,5adf4626068401528a2aa628,2427,By PETE WELLS,article,"The New Noma, Explained","['Restaurants', 'Noma (Copenhagen, Restaurant)...",66,Dining,0,2018-04-24 14:58:44,Unknown,What’s it like to eat at the second incarnatio...,The New York Times,News,https://www.nytimes.com/2018/04/24/dining/noma...
3,5adf40d2068401528a2aa619,626,By JULIE HIRSCHFELD DAVIS and PETER BAKER,article,Unknown,"['Macron, Emmanuel (1977- )', 'Trump, Donald J...",68,Washington,0,2018-04-24 14:35:57,Europe,President Trump welcomed President Emmanuel Ma...,The New York Times,News,https://www.nytimes.com/2018/04/24/world/europ...
4,5adf3d64068401528a2aa60f,815,By IAN AUSTEN and DAN BILEFSKY,article,Unknown,"['Toronto, Ontario, Attack (April, 2018)', 'Mu...",68,Foreign,0,2018-04-24 14:21:21,Canada,"Alek Minassian, 25, a resident of Toronto’s Ri...",The New York Times,News,https://www.nytimes.com/2018/04/24/world/canad...


In [39]:
print('열의 갯수 : ', len(df.columns))

print(df.columns)

열의 갯수 :  15
Index(['articleID', 'articleWordCount', 'byline', 'documentType', 'headline',
       'keywords', 'multimedia', 'newDesk', 'printPage', 'pubDate',
       'sectionName', 'snippet', 'source', 'typeOfMaterial', 'webURL'],
      dtype='object')


### 전처리

In [40]:
# Null이 있는지 확인
df['headline'].isnull().values.any()

False

In [41]:
# headline 열에서 모든 신문 기사의 제목을 뽑아서 하나의 리스트로 저장하자
headline = []
headline.extend(list(df.headline.values)) #헤드라인의 값들을 리스트로 저장

In [42]:
# 상위 5개만 출력
headline[:5]

['Former N.F.L. Cheerleaders’ Settlement Offer: $1 and a Meeting With Goodell',
 'E.P.A. to Unveil a New Rule. Its Effect: Less Science in Policymaking.',
 'The New Noma, Explained',
 'Unknown',
 'Unknown']

In [43]:
# 총 샘플의 갯수 
print('총 샘플의 갯수 : {}'.format(len(headline)))

총 샘플의 갯수 : 1324


In [44]:
# unknown값을 가진 샘플 제거
# 제거 후 샘플의 갯수 출력
headline = [n for n in headline if n != "Unknown"]
print('노이즈 값 제거 후 샘플의 갯수 : {}'.format(len(headline)))

노이즈 값 제거 후 샘플의 갯수 : 1214


In [45]:
# 기존에 출력했던 5개의 샘플을 출력
headline[:5]

['Former N.F.L. Cheerleaders’ Settlement Offer: $1 and a Meeting With Goodell',
 'E.P.A. to Unveil a New Rule. Its Effect: Less Science in Policymaking.',
 'The New Noma, Explained',
 'How a Bag of Texas Dirt  Became a Times Tradition',
 'Is School a Place for Self-Expression?']

In [46]:
def repreprocessing(s):
  s = s.encode("utf8").decode("ascii", 'ignore')
  return ''.join(c for c in s if c not in punctuation).lower() # 구두점 제거와 동시에 소문자화

text = [repreprocessing(x) for x in headline]
text[:5]

['former nfl cheerleaders settlement offer 1 and a meeting with goodell',
 'epa to unveil a new rule its effect less science in policymaking',
 'the new noma explained',
 'how a bag of texas dirt  became a times tradition',
 'is school a place for selfexpression']

In [47]:
t = Tokenizer()
t.fit_on_texts(text)
vocab_size = len(t.word_index) + 1

print('단어 집합의 크기 : %d' % vocab_size)

단어 집합의 크기 : 3494


In [48]:
sequences = list()

In [49]:
for line in text:
  encoded = t.texts_to_sequences([line])[0] #각 샘플에 대한 정수 인코딩
  for i in range(1, len(encoded)):
    sequence = encoded[:i+1]
    sequences.append(sequence)

sequences[:11]

[[99, 269],
 [99, 269, 371],
 [99, 269, 371, 1115],
 [99, 269, 371, 1115, 582],
 [99, 269, 371, 1115, 582, 52],
 [99, 269, 371, 1115, 582, 52, 7],
 [99, 269, 371, 1115, 582, 52, 7, 2],
 [99, 269, 371, 1115, 582, 52, 7, 2, 372],
 [99, 269, 371, 1115, 582, 52, 7, 2, 372, 10],
 [99, 269, 371, 1115, 582, 52, 7, 2, 372, 10, 1116],
 [100, 3]]

In [50]:
index_to_word = {}

for key, value in t.word_index.items(): 
  index_to_word[value] = key

print('빈도수 상위 582번 단어 : {}'.format(index_to_word[582]))

빈도수 상위 582번 단어 : offer


In [51]:
max_len = max(len(l) for l in sequences)

print('샘플의 최대 길이 : {}'.format(max_len))

샘플의 최대 길이 : 24


In [52]:
sequences = pad_sequences(sequences, maxlen=max_len, padding='pre')

print(sequences[:3])

[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0   99  269]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0   99  269  371]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0   99  269  371 1115]]


In [53]:
sequences = np.array(sequences)

x = sequences[:, :-1]
y = sequences[:, -1]

In [54]:
print(x[:3])

[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0  99]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0  99 269]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0  99 269 371]]


In [55]:
print(y[:3])

[ 269  371 1115]


In [56]:
y = to_categorical(y, num_classes=vocab_size)

In [57]:
print(y)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


### 모델 설계

In [58]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM

In [59]:
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=max_len-1))
model.add(LSTM(128))
model.add(Dense(vocab_size, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(x, y, epochs=200, verbose=2)

Epoch 1/200
244/244 - 8s - loss: 7.6610 - accuracy: 0.0254
Epoch 2/200
244/244 - 6s - loss: 7.1173 - accuracy: 0.0310
Epoch 3/200
244/244 - 6s - loss: 6.9782 - accuracy: 0.0381
Epoch 4/200
244/244 - 6s - loss: 6.8576 - accuracy: 0.0402
Epoch 5/200
244/244 - 6s - loss: 6.7134 - accuracy: 0.0437
Epoch 6/200
244/244 - 6s - loss: 6.5479 - accuracy: 0.0463
Epoch 7/200
244/244 - 6s - loss: 6.3630 - accuracy: 0.0499
Epoch 8/200
244/244 - 6s - loss: 6.1715 - accuracy: 0.0563
Epoch 9/200
244/244 - 6s - loss: 5.9856 - accuracy: 0.0598
Epoch 10/200
244/244 - 6s - loss: 5.8071 - accuracy: 0.0659
Epoch 11/200
244/244 - 6s - loss: 5.6370 - accuracy: 0.0698
Epoch 12/200
244/244 - 6s - loss: 5.4747 - accuracy: 0.0733
Epoch 13/200
244/244 - 6s - loss: 5.3217 - accuracy: 0.0792
Epoch 14/200
244/244 - 6s - loss: 5.1745 - accuracy: 0.0827
Epoch 15/200
244/244 - 6s - loss: 5.0313 - accuracy: 0.0951
Epoch 16/200
244/244 - 6s - loss: 4.8960 - accuracy: 0.0980
Epoch 17/200
244/244 - 6s - loss: 4.7669 - accura

<tensorflow.python.keras.callbacks.History at 0x7ff77543cb90>

In [60]:
# 문장을 생성하는 함수 만들어서 출력하기
def sentence_generation(model, t, current_word, n): # 모델, 토크나이저, 현재 단어, 반복할 횟수
    init_word = current_word # 처음 들어온 단어도 마지막에 같이 출력하기위해 저장
    sentence = ''
    for _ in range(n): # n번 반복
        encoded = t.texts_to_sequences([current_word])[0] # 현재 단어에 대한 정수 인코딩
        encoded = pad_sequences([encoded], maxlen=23, padding='pre') # 데이터에 대한 패딩
        result = model.predict_classes(encoded, verbose=0)
    # 입력한 X(현재 단어)에 대해서 Y를 예측하고 Y(예측한 단어)를 result에 저장.
        for word, index in t.word_index.items(): 
            if index == result: # 만약 예측한 단어와 인덱스와 동일한 단어가 있다면
                break # 해당 단어가 예측 단어이므로 break
        current_word = current_word + ' '  + word # 현재 단어 + ' ' + 예측 단어를 현재 단어로 변경
        sentence = sentence + ' ' + word # 예측 단어를 문장에 저장
    # for문이므로 이 행동을 다시 반복
    sentence = init_word + sentence
    return sentence

In [61]:
print(sentence_generation(model, t, 'i', 10))



i cant jump ship from facebook yet the camera island with


In [62]:
print(sentence_generation(model, t, 'how', 10))



how to make facebook more accountable into presidency not they think


In [64]:
print(sentence_generation(model, t, 'former', 10))



former playboy model is free to discuss alleged affair it more


In [63]:
print(sentence_generation(model, t, 'former', 5))



former playboy model is free to
