In [116]:
import numpy as np
import pandas as pd
from string import punctuation
import tensorflow as tf
import urllib.request
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, LSTM, Bidirectional, Embedding, Dense, TimeDistributed


### RNN

In [24]:
model = Sequential()

#출력값: (batch_size, timesteps, output_dim)
model.add(SimpleRNN(3, batch_input_shape=(8, 2, 10), return_sequences=True)) #은닉 노드 3, 은닉층은 1
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 simple_rnn_6 (SimpleRNN)    (8, 2, 3)                 42        
                                                                 
Total params: 42 (168.00 Byte)
Trainable params: 42 (168.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [25]:
timesteps = 10 #시점의 수 - 문장의 길이
input_dim = 4 #입력 차원
hidden_units = 8 #은닉 상태 크기

inputs = np.random.random((timesteps, input_dim))

# 초기은닉상태는 0(벡터)로 초기화
hidden_state_t = np.zeros((hidden_units,))
print('초기은닉상태 :',hidden_state_t)

초기은닉상태 : [0. 0. 0. 0. 0. 0. 0. 0.]


In [27]:
# (8, 4)크기의 2D 텐서 생성. 입력에 대한 가중치.
Wx = np.random.random((hidden_units, input_dim))

# (8, 8)크기의 2D 텐서 생성.은닉 상태에 대한 가중치.
Wh = np.random.random((hidden_units, hidden_units))

# (8,)크기의 1D 텐서 생성. 편향(bias).
b = np.random.random((hidden_units,))

print('가 중 치 Wx의 크 기(shape) :',np.shape(Wx))
print('가 중 치 Wh의 크 기(shape) :',np.shape(Wh))
print('편 향 의 크 기(shape) :',np.shape(b))

가 중 치 Wx의 크 기(shape) : (8, 4)
가 중 치 Wh의 크 기(shape) : (8, 8)
편 향 의 크 기(shape) : (8,)


In [28]:
# Wx * Xt + Wh * Ht-1 + b(bias)
# 각 시점 t별 메모리 셀의 출력의 크기는 (timestep t, output_dim)
total_hidden_states = []

for input_t in inputs:
  output_t = np.tanh(np.dot(Wx, input_t) + np.dot(Wh, hidden_state_t) + b)
  total_hidden_states.append(list(output_t)) # 리스트 추가
  hidden_state_t = output_t

# 출력 깔끔하게 저일
total_hidden_states = np.stack(total_hidden_states, axis = 0)

# (timesteps, output_dim)
print('모든 시점의 hidden state:')
print(total_hidden_states)

모든 시점의 hidden state:
[[0.88597634 0.76583135 0.87536722 0.75211718 0.77892872 0.79620327
  0.60624149 0.52245371]
 [0.9999276  0.9996539  0.99984447 0.9994971  0.99949186 0.99994218
  0.99806458 0.99434387]
 [0.99999397 0.99996193 0.99997756 0.99991895 0.99995311 0.99999635
  0.99946006 0.99753134]
 [0.9999953  0.99999033 0.99995879 0.99992522 0.99998822 0.99999142
  0.99952895 0.99394091]
 [0.99999606 0.99997953 0.99998326 0.99993531 0.99998353 0.99999498
  0.99952441 0.99661567]
 [0.99999943 0.99999185 0.99999704 0.99998568 0.99999264 0.99999898
  0.99993974 0.99951815]
 [0.99997865 0.99994828 0.99993447 0.99977739 0.99990454 0.99998863
  0.99914535 0.99297844]
 [0.99999605 0.9999871  0.9999729  0.99993729 0.99998316 0.99999469
  0.99965427 0.99657983]
 [0.99999759 0.99998554 0.99998734 0.99995621 0.99998763 0.99999652
  0.99969356 0.99773986]
 [0.99999868 0.99998634 0.99999335 0.9999746  0.99998065 0.99999868
  0.99990378 0.99934269]]


### 층 늘려보기

In [29]:
model = Sequential()
model.add(SimpleRNN(hidden_units, input_length=10, input_dim=5, return_sequences=True))
model.add(SimpleRNN(hidden_units, return_sequences=True))

In [39]:
timesteps = 10
input_dim = 5
hidden_units = 8 # hidden unit의 개수에 따라 output shape의 크기가 달라짐

# 양방향 RNN
model = Sequential()
model.add(Bidirectional(SimpleRNN(hidden_units, return_sequences=True),
input_shape=(timesteps, input_dim)))
model.summary()

Model: "sequential_14"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional_27 (Bidirect  (None, 10, 16)            224       
 ional)                                                          
                                                                 
Total params: 224 (896.00 Byte)
Trainable params: 224 (896.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [40]:
# 은닉층이 4개인 Bi-RNN
model = Sequential()
model.add(Bidirectional(SimpleRNN(hidden_units, return_sequences=True),
input_shape=(timesteps, input_dim)))
model.add(Bidirectional(SimpleRNN(hidden_units, return_sequences=True)))
model.add(Bidirectional(SimpleRNN(hidden_units, return_sequences=True)))
model.add(Bidirectional(SimpleRNN(hidden_units, return_sequences=True)))
model.summary()

Model: "sequential_15"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional_28 (Bidirect  (None, 10, 16)            224       
 ional)                                                          
                                                                 
 bidirectional_29 (Bidirect  (None, 10, 16)            400       
 ional)                                                          
                                                                 
 bidirectional_30 (Bidirect  (None, 10, 16)            400       
 ional)                                                          
                                                                 
 bidirectional_31 (Bidirect  (None, 10, 16)            400       
 ional)                                                          
                                                                 
Total params: 1424 (5.56 KB)
Trainable params: 1424 (

## Data

In [None]:
# 학습 데이터의 형상 확인하기
# batch size = 1
train_X = [[0.1, 4.2, 1.5, 1.1, 2.8], [1.0, 3.1, 2.5, 0.7, 1.1], [0.3, 2.1, 1.5, 2.1, 0.1], [2.2, 1.4, 0.5, 0.9, 1.1]]
print(np.shape(train_X))

(4, 5)


In [None]:
# 2차원 tensor를 3차원 tensor로 만들기
train_X = [[[0.1, 4.2, 1.5, 1.1, 2.8], [1.0, 3.1, 2.5, 0.7, 1.1], [0.3, 2.1, 1.5, 2.1, 0.1], [2.2, 1.4, 0.5, 0.9, 1.1]]]
train_X = np.array(train_X, dtype=np.float32)
print(train_X.shape)

(1, 4, 5)


## Simple RNN
- return_sequences\
모든 시점의 은닉 상태를 출력할지를 결정하는 인자\
True 이면 모든 시점의 은닉 상태를 출력\
False 이면 마지막 시점의 은닉 상태만을 출력

- return_state\
마지막 시점의 은닉 상태를 출력할지를 결정하는 인자

SimpleRNN = \
tf.keras.layers.SimpleRNN(
    units,
    activation='tanh',
    use_bias=True,
    kernel_constraint=None,
    recurrent_constraint=None,
    dropout=0.0,
    recurrent_dropout=0.0,
    return_sequences=False,
    return_state=False,
    stateful=False)

- units = 은닉 노드 수
- 기본 활성화 함수 = tanh




In [None]:
# rnn = SimpleRNN(3, return_sequences=False, return_state=False)와 동일.
rnn = SimpleRNN(3)

hidden_state = rnn(train_X)
print('hidden state : {}, shape: {}'.format(hidden_state, hidden_state.shape))

hidden state : [[ 0.08891981 -0.13903677 -0.5548742 ]], shape: (1, 3)


In [None]:
# 모든 hidden state 반환
rnn = SimpleRNN(3, return_sequences=True)

hidden_states = rnn(train_X)
print('hidden states : {}, shape: {}'.format(hidden_states, hidden_states.shape))

hidden states : [[[-0.97428906  0.17683886 -0.8382877 ]
  [-0.93320745  0.8497672  -0.74947494]
  [-0.9706838   0.7800281  -0.6492494 ]
  [-0.2591262   0.77266616  0.88132215]]], shape: (1, 4, 3)


In [None]:
# 마지막 시점 hidden state 출력하기
rnn = SimpleRNN(3, return_sequences=False, return_state=True)
hidden_state, last_state = rnn(train_X)

print('hidden state : {}, shape: {}'.format(hidden_state, hidden_state.shape))
print('last hidden state : {}, shape: {}'.format(last_state, last_state.shape))

hidden state : [[-0.96035916  0.92250663 -0.7715755 ]], shape: (1, 3)
last hidden state : [[-0.96035916  0.92250663 -0.7715755 ]], shape: (1, 3)


### LSTM

In [None]:
# return_sequences=False 마지막 시점의 hidden state 출력
# last_cell_state: 마지막 셀의 상태 return
lstm = LSTM(3, return_sequences=False, return_state=True)
hidden_state, last_state, last_cell_state = lstm(train_X)

print('hidden state : {}, shape: {}'.format(hidden_state, hidden_state.shape))
print('last hidden state : {}, shape: {}'.format(last_state, last_state.shape))
print('last cell state : {}, shape: {}'.format(last_cell_state, last_cell_state.shape))

hidden state : [[-0.18912607  0.31478125 -0.84932125]], shape: (1, 3)
last hidden state : [[-0.18912607  0.31478125 -0.84932125]], shape: (1, 3)
last cell state : [[-0.30425426  0.7598704  -2.1993682 ]], shape: (1, 3)


In [None]:
# return_sequences=True 모든 hidden state 반환
# return_state=True 마지막 hidden state 및 cell state 반환
lstm = LSTM(3, return_sequences=True, return_state=True)
hidden_states, last_hidden_state, last_cell_state = lstm(train_X)

print('hidden states : {}, shape: {}'.format(hidden_states, hidden_states.shape))
print('last hidden state : {}, shape: {}'.format(last_hidden_state,
last_hidden_state.shape))
print('last cell state : {}, shape: {}'.format(last_cell_state, last_cell_state.shape))

hidden states : [[[-3.6141244e-01  6.1760384e-01  3.8209141e-03]
  [-2.0709430e-01  5.7021034e-01 -5.2918703e-04]
  [-1.8941352e-01  3.8377562e-01 -7.4299395e-02]
  [-2.7542940e-01  3.2115528e-01 -2.0646188e-02]]], shape: (1, 4, 3)
last hidden state : [[-0.2754294   0.32115528 -0.02064619]], shape: (1, 3)
last cell state : [[-0.6587205   0.6345121  -0.08454297]], shape: (1, 3)


### Bi-LSTM

In [None]:
# hidden state 고정
# tf.keras.initializers.Constant: 상수 값으로 tensor를 생성하는 initializer
#  value = 가중치를 초기화할 때 사용할 상수값

k_init = tf.keras.initializers.Constant(value=0.1) # 가중치(kernal) 초기화 값
b_init = tf.keras.initializers.Constant(value=0) # bias 초기화 값
r_init = tf.keras.initializers.Constant(value=0.1) # 순환 시 선형 변환에 사용되는 가중치 행렬 초기화 값

In [None]:
# object로 출력
print(k_init)

<keras.src.initializers.initializers.Constant object at 0x7965ec9b23b0>


In [None]:
# Bi-LSTM 쌓기
# keras.layers.LSTM 문서 참고
# 마지막 hidden state 확인하기
bilstm = Bidirectional(LSTM(3, return_sequences=False, return_state=True, kernel_initializer=k_init, bias_initializer=b_init,recurrent_initializer=r_init))
hidden_states, forward_h, forward_c, backward_h, backward_c = bilstm(train_X)

print('hidden states : {}, shape: {}'.format(hidden_states, hidden_states.shape))
print('forward state : {}, shape: {}'.format(forward_h, forward_h.shape))
print('backward state : {}, shape: {}'.format(backward_h, backward_h.shape))

hidden states : [[0.6303139  0.6303139  0.6303139  0.70387346 0.70387346 0.70387346]], shape: (1, 6)
forward state : [[0.6303139 0.6303139 0.6303139]], shape: (1, 3)
backward state : [[0.70387346 0.70387346 0.70387346]], shape: (1, 3)


In [None]:
# 모든 hidden state 확인하기
# return_sequences=True
bilstm = Bidirectional(LSTM(3, return_sequences=True, return_state=True,kernel_initializer=k_init, bias_initializer=b_init,
recurrent_initializer=r_init))
hidden_states, forward_h, forward_c, backward_h, backward_c = bilstm(train_X)

print('hidden states : {}, shape: {}'.format(hidden_states, hidden_states.shape))
print('forward state : {}, shape: {}'.format(forward_h, forward_h.shape))
print('backward state : {}, shape: {}'.format(backward_h, backward_h.shape))

hidden states : [[[0.35906476 0.35906476 0.35906476 0.70387346 0.70387346 0.70387346]
  [0.5511133  0.5511133  0.5511133  0.5886358  0.5886358  0.5886358 ]
  [0.5911575  0.5911575  0.5911575  0.39516988 0.39516988 0.39516988]
  [0.6303139  0.6303139  0.6303139  0.2194224  0.2194224  0.2194224 ]]], shape: (1, 4, 6)
forward state : [[0.6303139 0.6303139 0.6303139]], shape: (1, 3)
backward state : [[0.70387346 0.70387346 0.70387346]], shape: (1, 3)


### RNN을 이용한 text 생성

In [41]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical #one-hot encoding

In [47]:
text = """경마장에 있는 말이 뛰고 있다.\n그의 말이 법이다.\n가는 말이 고와야 오는 말이 곱다."""

In [45]:
tokenizer = Tokenizer() #tokenizer 선언
tokenizer.fit_on_texts([text])
vocab_size = len(tokenizer.word_index) + 1 #0부터 시작이니까
print('단어집합의 크기: %d' % vocab_size)

단어집합의 크기: 12


In [46]:
# 정수 인덱스 출력
print(tokenizer.word_index)

{'말이': 1, '경마장에': 2, '있는': 3, '뛰고': 4, '있다': 5, '그의': 6, '법이다': 7, '가는': 8, '고와야': 9, '오는': 10, '곱다': 11}


In [48]:
# train data set
sequences = list()

for line in text.split('\n'): # \n기준 tokenize
  encoded = tokenizer.texts_to_sequences([line])[0]
  for i in range(1, len(encoded)):
    sequence = encoded[:i+1]
    sequences.append(sequence)
print('학습에 사용할 샘플의 개수: %d' % len(sequences))

학습에 사용할 샘플의 개수: 11


In [50]:
# 데이터의 길이가 맞지 않는다. -> padding 조정
print(sequences)

[[2, 3], [2, 3, 1], [2, 3, 1, 4], [2, 3, 1, 4, 5], [6, 1], [6, 1, 7], [8, 1], [8, 1, 9], [8, 1, 9, 10], [8, 1, 9, 10, 1], [8, 1, 9, 10, 1, 11]]


In [51]:
# 길이가 가장 긴 샘플 출력
max_len = max(len(l) for l in sequences)
print('샘플의 최대 길이 : {}'.format(max_len))

샘플의 최대 길이 : 6


In [52]:
# 패딩
sequences = pad_sequences(sequences, maxlen=max_len, padding='pre')

In [54]:
# label 만들기-> 마지막 문자를 label로 사용
sequences = np.array(sequences)
X = sequences[:,:-1] #[:4]까지, 5번째 까지
y = sequences[:,-1] # 마지막 6번째 문자 하나

In [55]:
# 원‑핫 인코딩
y = to_categorical(y, num_classes=vocab_size)

In [63]:
print(X)
print(y)

[[ 0  0  0  0  2]
 [ 0  0  0  2  3]
 [ 0  0  2  3  1]
 [ 0  2  3  1  4]
 [ 0  0  0  0  6]
 [ 0  0  0  6  1]
 [ 0  0  0  0  8]
 [ 0  0  0  8  1]
 [ 0  0  8  1  9]
 [ 0  8  1  9 10]
 [ 8  1  9 10  1]]
[[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]


In [65]:
# 모델
embedding_dim = 10 # 입력 차원 10
hidden_units = 32 # 은닉노드 32개

model = Sequential() # 순서대로 쌓기
model.add(Embedding(vocab_size, embedding_dim))
model.add(SimpleRNN(hidden_units))
model.add(Dense(vocab_size, activation='softmax'))
#손실함수: 크로스 엔트로피, optimizer: 아담, 평가지표: 정확도
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X, y, epochs=200, verbose=2)

Epoch 1/200
1/1 - 2s - loss: 2.4998 - accuracy: 0.1818 - 2s/epoch - 2s/step
Epoch 2/200
1/1 - 0s - loss: 2.4868 - accuracy: 0.1818 - 14ms/epoch - 14ms/step
Epoch 3/200
1/1 - 0s - loss: 2.4742 - accuracy: 0.1818 - 15ms/epoch - 15ms/step
Epoch 4/200
1/1 - 0s - loss: 2.4619 - accuracy: 0.1818 - 14ms/epoch - 14ms/step
Epoch 5/200
1/1 - 0s - loss: 2.4497 - accuracy: 0.2727 - 15ms/epoch - 15ms/step
Epoch 6/200
1/1 - 0s - loss: 2.4376 - accuracy: 0.2727 - 13ms/epoch - 13ms/step
Epoch 7/200
1/1 - 0s - loss: 2.4254 - accuracy: 0.3636 - 12ms/epoch - 12ms/step
Epoch 8/200
1/1 - 0s - loss: 2.4130 - accuracy: 0.3636 - 17ms/epoch - 17ms/step
Epoch 9/200
1/1 - 0s - loss: 2.4003 - accuracy: 0.4545 - 14ms/epoch - 14ms/step
Epoch 10/200
1/1 - 0s - loss: 2.3873 - accuracy: 0.4545 - 16ms/epoch - 16ms/step
Epoch 11/200
1/1 - 0s - loss: 2.3739 - accuracy: 0.3636 - 13ms/epoch - 13ms/step
Epoch 12/200
1/1 - 0s - loss: 2.3599 - accuracy: 0.3636 - 13ms/epoch - 13ms/step
Epoch 13/200
1/1 - 0s - loss: 2.3453 - ac

<keras.src.callbacks.History at 0x7965660c8bb0>

In [66]:
def sentence_generation(model, tokenizer, current_word, n): # 모델 , tokenizer , 현재  단어 , 반복할 횟수
  init_word = current_word
  sentence = ''

  # n번 반복
  for _ in range(n):
    #현재 word에 대한 정수 인코딩과 패딩
    encoded = tokenizer.texts_to_sequences([current_word])[0]
    encoded = pad_sequences([encoded], maxlen=5, padding='pre') # 앞쪽 패딩

    # 입력한 현재단어에 대해 Y를 예측, Y를 result에 저장
    result = model.predict(encoded, verbose=0) #예측
    result = np.argmax(result, axis=1) #가장 높은 값의 index 가져오기 -> 가장 높은 확률 가져오기

    for word, index in tokenizer.word_index.items():
      # 예측한 단어와 인덱스가 동일한 단어가 있다면 break
      if index == result:
        break

    # 현재 단어 + ' ' + 예측 단어를 현재 단어로 변경
    current_word = current_word + ' ' + word

    # 예측 단어를 문장에 저장
    sentence = sentence + ' ' + word

  sentence = init_word + sentence
  return sentence

In [67]:
print(sentence_generation(model, tokenizer, '가 는', 5))
# 아직 멍청한데?

가 는 말이 말이 있는 말이 곱다


In [82]:
df = pd.read_csv('/content/ArticlesApril2018.csv')
df.head()

Unnamed: 0,articleID,articleWordCount,byline,documentType,headline,keywords,multimedia,newDesk,printPage,pubDate,sectionName,snippet,source,typeOfMaterial,webURL
0,5adf6684068401528a2aa69b,781,By JOHN BRANCH,article,Former N.F.L. Cheerleaders’ Settlement Offer: ...,"['Workplace Hazards and Violations', 'Football...",68,Sports,0,2018-04-24 17:16:49,Pro Football,"“I understand that they could meet with us, pa...",The New York Times,News,https://www.nytimes.com/2018/04/24/sports/foot...
1,5adf653f068401528a2aa697,656,By LISA FRIEDMAN,article,E.P.A. to Unveil a New Rule. Its Effect: Less ...,"['Environmental Protection Agency', 'Pruitt, S...",68,Climate,0,2018-04-24 17:11:21,Unknown,The agency plans to publish a new regulation T...,The New York Times,News,https://www.nytimes.com/2018/04/24/climate/epa...
2,5adf4626068401528a2aa628,2427,By PETE WELLS,article,"The New Noma, Explained","['Restaurants', 'Noma (Copenhagen, Restaurant)...",66,Dining,0,2018-04-24 14:58:44,Unknown,What’s it like to eat at the second incarnatio...,The New York Times,News,https://www.nytimes.com/2018/04/24/dining/noma...
3,5adf40d2068401528a2aa619,626,By JULIE HIRSCHFELD DAVIS and PETER BAKER,article,Unknown,"['Macron, Emmanuel (1977- )', 'Trump, Donald J...",68,Washington,0,2018-04-24 14:35:57,Europe,President Trump welcomed President Emmanuel Ma...,The New York Times,News,https://www.nytimes.com/2018/04/24/world/europ...
4,5adf3d64068401528a2aa60f,815,By IAN AUSTEN and DAN BILEFSKY,article,Unknown,"['Toronto, Ontario, Attack (April, 2018)', 'Mu...",68,Foreign,0,2018-04-24 14:21:21,Canada,"Alek Minassian, 25, a resident of Toronto’s Ri...",The New York Times,News,https://www.nytimes.com/2018/04/24/world/canad...


In [83]:
print('열 의 개 수: ',len(df.columns))
print(df.columns)

열 의 개 수:  15
Index(['articleID', 'articleWordCount', 'byline', 'documentType', 'headline',
       'keywords', 'multimedia', 'newDesk', 'printPage', 'pubDate',
       'sectionName', 'snippet', 'source', 'typeOfMaterial', 'webURL'],
      dtype='object')


In [84]:
# Null 값 검사
print(df['headline'].isnull().values.any())

False


In [85]:
# headline의 값을 list로 저장
headline = []

headline.extend(list(df.headline.values))
headline[:5]

['Former N.F.L. Cheerleaders’ Settlement Offer: $1 and a Meeting With Goodell',
 'E.P.A. to Unveil a New Rule. Its Effect: Less Science in Policymaking.',
 'The New Noma, Explained',
 'Unknown',
 'Unknown']

In [86]:
print('samples : {}'.format(len(headline)))

samples : 1324


In [87]:
# noise 제거하기 - unknown 불필요
headline = [word for word in headline if word != "Unknown"]
print('samples without noise : {}'.format(len(headline)))

samples without noise : 1214


In [88]:
# 제거 후 check
headline[:5]

['Former N.F.L. Cheerleaders’ Settlement Offer: $1 and a Meeting With Goodell',
 'E.P.A. to Unveil a New Rule. Its Effect: Less Science in Policymaking.',
 'The New Noma, Explained',
 'How a Bag of Texas Dirt  Became a Times Tradition',
 'Is School a Place for Self-Expression?']

In [89]:
# text 전처리 - 구두점 제거& 단어의 소문자화
def repreprocessing(raw_sentence):
  preproceseed_sentence = raw_sentence.encode("utf8").decode("ascii",'ignore')
  return ''.join(word for word in preproceseed_sentence if word not in punctuation).lower()

# headline list에 있는 모든 요소 전처리
preprocessed_headline = [repreprocessing(x) for x in headline]
preprocessed_headline[:5]

['former nfl cheerleaders settlement offer 1 and a meeting with goodell',
 'epa to unveil a new rule its effect less science in policymaking',
 'the new noma explained',
 'how a bag of texas dirt  became a times tradition',
 'is school a place for selfexpression']

In [90]:
tokenizer = Tokenizer()

#문자 데이터를 입력받아서 리스트의 형태로 변환한다.
tokenizer.fit_on_texts(preprocessed_headline)
vocab_size = len(tokenizer.word_index) + 1
print('vocab size : %d' % vocab_size)

vocab size : 3494


In [93]:
# 최종적으로 원하는 훈련 데이터의 형태
# 하나의 단어를 예측하기 위해 이전에 등장한 단어들을 모두 참고하는 것.
sequences = list()
for sentence in preprocessed_headline:
  # 정수 인코딩
  encoded = tokenizer.texts_to_sequences([sentence])[0] #접근하기 위해 뒤에 [0]
  for i in range(1, len(encoded)):
    sequence = encoded[:i+1]
    sequences.append(sequence)
sequences[:11]

[[99, 269],
 [99, 269, 371],
 [99, 269, 371, 1115],
 [99, 269, 371, 1115, 582],
 [99, 269, 371, 1115, 582, 52],
 [99, 269, 371, 1115, 582, 52, 7],
 [99, 269, 371, 1115, 582, 52, 7, 2],
 [99, 269, 371, 1115, 582, 52, 7, 2, 372],
 [99, 269, 371, 1115, 582, 52, 7, 2, 372, 10],
 [99, 269, 371, 1115, 582, 52, 7, 2, 372, 10, 1116],
 [100, 3]]

In [94]:
# 인덱스를 단어로 바꾸귀 휘해 index_to_word 생성
index_to_word = {}
for key, value in tokenizer.word_index.items():
  index_to_word[value] = key
print('빈도수 상위 582번 단어 : {}'.format(index_to_word[582]))

#582번 인덱스 = offer

빈도수 상위 582번 단어 : offer


In [95]:
# padding
max_len = max(len(l) for l in sequences)
print('sample max len : {}'.format(max_len))

sample max len : 24


In [96]:
sequences = pad_sequences(sequences, maxlen=max_len, padding='pre')
print(sequences[:3])

[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0   99  269]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0   99  269  371]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0   99  269  371 1115]]


In [97]:
# 맨 우측 단어 = label
sequences = np.array(sequences)
X = sequences[:,:-1]
y = sequences[:,-1]

In [98]:
y = to_categorical(y, num_classes=vocab_size)

In [99]:
# model
embedding_dim = 10
hidden_units = 128 #은닉 노드 128개


model = Sequential() # 순차적으로 쌓기
model.add(Embedding(vocab_size, embedding_dim)) #입력 크기
model.add(LSTM(hidden_units))
model.add(Dense(vocab_size, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X, y, epochs=200, verbose=2)

Epoch 1/200
244/244 - 16s - loss: 7.6430 - accuracy: 0.0304 - 16s/epoch - 64ms/step
Epoch 2/200
244/244 - 4s - loss: 7.1204 - accuracy: 0.0310 - 4s/epoch - 17ms/step
Epoch 3/200
244/244 - 3s - loss: 6.9780 - accuracy: 0.0310 - 3s/epoch - 14ms/step
Epoch 4/200
244/244 - 3s - loss: 6.8603 - accuracy: 0.0404 - 3s/epoch - 11ms/step
Epoch 5/200
244/244 - 2s - loss: 6.7430 - accuracy: 0.0415 - 2s/epoch - 8ms/step
Epoch 6/200
244/244 - 2s - loss: 6.5644 - accuracy: 0.0466 - 2s/epoch - 9ms/step
Epoch 7/200
244/244 - 2s - loss: 6.3854 - accuracy: 0.0509 - 2s/epoch - 6ms/step
Epoch 8/200
244/244 - 2s - loss: 6.1877 - accuracy: 0.0573 - 2s/epoch - 7ms/step
Epoch 9/200
244/244 - 2s - loss: 5.9837 - accuracy: 0.0601 - 2s/epoch - 9ms/step
Epoch 10/200
244/244 - 2s - loss: 5.7911 - accuracy: 0.0633 - 2s/epoch - 8ms/step
Epoch 11/200
244/244 - 2s - loss: 5.6098 - accuracy: 0.0656 - 2s/epoch - 10ms/step
Epoch 12/200
244/244 - 2s - loss: 5.4346 - accuracy: 0.0736 - 2s/epoch - 8ms/step
Epoch 13/200
244/2

<keras.src.callbacks.History at 0x79655961ff70>

In [102]:
# 이전과 동일
def sentence_generation(model, tokenizer, current_word, n):
  init_word = current_word
  sentence = ''

  for _ in range(n):
    encoded = tokenizer.texts_to_sequences([current_word])[0]
    encoded = pad_sequences([encoded], maxlen=max_len-1, padding='pre')

    result = model.predict(encoded, verbose=0)
    result = np.argmax(result, axis=1)

    for word, index in tokenizer.word_index.items():
      if index == result:
        break

    current_word = current_word + ' ' + word
    sentence = sentence + ' ' + word

  sentence = init_word + sentence

  return sentence

In [103]:
print(sentence_generation(model, tokenizer, 'i', 10))

i want to be rich and im not sorry seen say


### 문자 단위 RNN 언어 모델 (Char RNNLM)

In [105]:
# 데이터 로드
urllib.request.urlretrieve("http://www.gutenberg.org/files/11/11-0.txt", filename="11-0.txt")

f = open('11-0.txt', 'rb') # 읽기위해 파일 열고
sentences = []
for sentence in f: # 데이터로부터 한 줄씩 읽는다.
    sentence = sentence.strip() # strip()을 통해 \r, \n을 제거한다.
    sentence = sentence.lower() # 소문자화.
    sentence = sentence.decode('ascii', 'ignore') # \xe2\x80\x99 등과 같은 바이트 열 제거
    if len(sentence) > 0:
        sentences.append(sentence)
f.close() # 파일 닫기

In [106]:
# 문장 token화 상태는 아니기 때문에 일단 합치고 token화 하기
total_data = ' '.join(sentences)
print('문자열의 길이 또는 총 문자의 개수: %d' % len(total_data))

문자열의 길이 또는 총 문자의 개수: 159484


In [107]:
# 문자집합 만들기 - 알파벳, 특수 기호
# set: 중복된 항목 제거
# list: 제거된 항목으로 다시 리스트 생성
# sorted: 정렬
char_vocab = sorted(list(set(total_data)))
vocab_size = len(char_vocab)
print ('문자 집합의 크기 : {}'.format(vocab_size))

문자 집합의 크기 : 56


In [108]:
# 문자에 고유한 정수 부여
# 문자로 정수 리턴
char_to_index = dict((char, index) for index, char in enumerate(char_vocab))
print('문자 집합 :',char_to_index)

문자 집합 : {' ': 0, '!': 1, '"': 2, '#': 3, '$': 4, '%': 5, "'": 6, '(': 7, ')': 8, '*': 9, ',': 10, '-': 11, '.': 12, '/': 13, '0': 14, '1': 15, '2': 16, '3': 17, '4': 18, '5': 19, '6': 20, '7': 21, '8': 22, '9': 23, ':': 24, ';': 25, '?': 26, '[': 27, ']': 28, '_': 29, 'a': 30, 'b': 31, 'c': 32, 'd': 33, 'e': 34, 'f': 35, 'g': 36, 'h': 37, 'i': 38, 'j': 39, 'k': 40, 'l': 41, 'm': 42, 'n': 43, 'o': 44, 'p': 45, 'q': 46, 'r': 47, 's': 48, 't': 49, 'u': 50, 'v': 51, 'w': 52, 'x': 53, 'y': 54, 'z': 55}


In [109]:
# 정수로 문자 리턴
index_to_char = {}
for key, value in char_to_index.items():
    index_to_char[value] = key

In [110]:
# data 예시
# appl (입력 시퀀스) -> pple (예측해야하는 시퀀스)
train_X = 'appl'
train_y = 'pple'

In [111]:
seq_length = 60

# 문자열의 길이를 seq_length로 나누면 전처리 후 생겨날 샘플 수
n_samples = int(np.floor((len(total_data) - 1) / seq_length))
print ('샘플의 수 : {}'.format(n_samples))

샘플의 수 : 2658


In [112]:
train_X = []
train_y = []

for i in range(n_samples):
    # 0:60 -> 60:120 -> 120:180로 loop를 돌면서 문장 샘플을 1개씩 pick. seq_length = 60
    X_sample = total_data[i * seq_length: (i + 1) * seq_length]

    # 정수 인코딩 -> 정수 리스트 반환
    X_encoded = [char_to_index[c] for c in X_sample]
    train_X.append(X_encoded)

    # 오른쪽으로 1칸 쉬프트
    y_sample = total_data[i * seq_length + 1: (i + 1) * seq_length + 1]
    y_encoded = [char_to_index[c] for c in y_sample]
    train_y.append(y_encoded)

In [114]:
print('X 데이터의 첫번째 문장 샘플 :',train_X[0])
print('y 데이터의 첫번째 문장 샘플 :',train_y[0])
print('-'*50)
print('X 데이터의 첫번째 문장 샘플 디코딩 :',[index_to_char[i] for i in train_X[0]])
print('y 데이터의 첫번째 문장 샘플 디코딩 :',[index_to_char[i] for i in train_y[0]])

X 데이터의 첫번째 문장 샘플 : [49, 37, 34, 0, 45, 47, 44, 39, 34, 32, 49, 0, 36, 50, 49, 34, 43, 31, 34, 47, 36, 0, 34, 31, 44, 44, 40, 0, 44, 35, 0, 30, 41, 38, 32, 34, 48, 0, 30, 33, 51, 34, 43, 49, 50, 47, 34, 48, 0, 38, 43, 0, 52, 44, 43, 33, 34, 47, 41, 30]
y 데이터의 첫번째 문장 샘플 : [37, 34, 0, 45, 47, 44, 39, 34, 32, 49, 0, 36, 50, 49, 34, 43, 31, 34, 47, 36, 0, 34, 31, 44, 44, 40, 0, 44, 35, 0, 30, 41, 38, 32, 34, 48, 0, 30, 33, 51, 34, 43, 49, 50, 47, 34, 48, 0, 38, 43, 0, 52, 44, 43, 33, 34, 47, 41, 30, 43]
--------------------------------------------------
X 데이터의 첫번째 문장 샘플 디코딩 : ['t', 'h', 'e', ' ', 'p', 'r', 'o', 'j', 'e', 'c', 't', ' ', 'g', 'u', 't', 'e', 'n', 'b', 'e', 'r', 'g', ' ', 'e', 'b', 'o', 'o', 'k', ' ', 'o', 'f', ' ', 'a', 'l', 'i', 'c', 'e', 's', ' ', 'a', 'd', 'v', 'e', 'n', 't', 'u', 'r', 'e', 's', ' ', 'i', 'n', ' ', 'w', 'o', 'n', 'd', 'e', 'r', 'l', 'a']
y 데이터의 첫번째 문장 샘플 디코딩 : ['h', 'e', ' ', 'p', 'r', 'o', 'j', 'e', 'c', 't', ' ', 'g', 'u', 't', 'e', 'n', 'b', 'e', 'r', 'g

In [115]:
train_X = to_categorical(train_X)
train_y = to_categorical(train_y)

print('train_X의 크기(shape) : {}'.format(train_X.shape)) # 원-핫 인코딩
print('train_y의 크기(shape) : {}'.format(train_y.shape)) # 원-핫 인코딩

# data size = 2658
# input_dim = 56
# input_length = 60

train_X의 크기(shape) : (2658, 60, 56)
train_y의 크기(shape) : (2658, 60, 56)


In [117]:
hidden_units = 256 # 은닉 노드 수

model = Sequential()
model.add(LSTM(hidden_units, input_shape=(None, train_X.shape[2]), return_sequences=True))
model.add(LSTM(hidden_units, return_sequences=True))
model.add(TimeDistributed(Dense(vocab_size, activation='softmax')))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(train_X, train_y, epochs=80, verbose=2)

Epoch 1/80
84/84 - 7s - loss: 3.0750 - accuracy: 0.1820 - 7s/epoch - 85ms/step
Epoch 2/80
84/84 - 1s - loss: 2.7172 - accuracy: 0.2477 - 775ms/epoch - 9ms/step
Epoch 3/80
84/84 - 1s - loss: 2.3851 - accuracy: 0.3315 - 785ms/epoch - 9ms/step
Epoch 4/80
84/84 - 1s - loss: 2.2364 - accuracy: 0.3648 - 778ms/epoch - 9ms/step
Epoch 5/80
84/84 - 1s - loss: 2.1290 - accuracy: 0.3916 - 1s/epoch - 14ms/step
Epoch 6/80
84/84 - 2s - loss: 2.0527 - accuracy: 0.4098 - 2s/epoch - 20ms/step
Epoch 7/80
84/84 - 1s - loss: 1.9788 - accuracy: 0.4307 - 782ms/epoch - 9ms/step
Epoch 8/80
84/84 - 1s - loss: 1.9150 - accuracy: 0.4467 - 855ms/epoch - 10ms/step
Epoch 9/80
84/84 - 1s - loss: 1.8602 - accuracy: 0.4617 - 1s/epoch - 17ms/step
Epoch 10/80
84/84 - 1s - loss: 1.8071 - accuracy: 0.4768 - 1s/epoch - 18ms/step
Epoch 11/80
84/84 - 1s - loss: 1.7598 - accuracy: 0.4884 - 1s/epoch - 16ms/step
Epoch 12/80
84/84 - 1s - loss: 1.7159 - accuracy: 0.4996 - 773ms/epoch - 9ms/step
Epoch 13/80
84/84 - 1s - loss: 1.677

<keras.src.callbacks.History at 0x79657435af50>

In [120]:
def sentence_generation(model, length):
    # 문자에 대한 랜덤한 정수 생성
    ix = [np.random.randint(vocab_size)]

    # 랜덤한 정수로부터 맵핑되는 문자 생성
    y_char = [index_to_char[ix[-1]]]
    print(ix[-1],'번 문자',y_char[-1],'로 예측을 시작!')

    # (1, length, 55) 크기의 X 생성. 즉, LSTM의 입력 시퀀스 생성
    X = np.zeros((1, length, vocab_size))

    for i in range(length):
        # X[0][i][예측한 문자의 인덱스] = 1, 즉, 예측 문자를 다음 입력 시퀀스에 추가
        X[0][i][ix[-1]] = 1
        print(index_to_char[ix[-1]], end="")
        ix = np.argmax(model.predict(X[:, :i+1, :])[0], 1)
        y_char.append(index_to_char[ix[-1]])
    return ('').join(y_char)

In [121]:
result = sentence_generation(model, 100)
print(result)

31 번 문자 b 로 예측을 시작!
berg literary archive foundation, the manager of the project gutenberg ebook alices adventures in won
