### **순환 신경망**

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN

model = Sequential()
model.add(SimpleRNN(3, input_shape=(2,10))) #출력층이 3
# model.add(SimpleRNN(3, input_length=2, input_dim=10))와 동일함.
model.summary()
#hidden unit도 3

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 simple_rnn (SimpleRNN)      (None, 3)                 42        
                                                                 
Total params: 42 (168.00 Byte)
Trainable params: 42 (168.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
model = Sequential()
model.add(SimpleRNN(3, batch_input_shape=(8,2,10))) #batch_size 미리 정의
model.summary()


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 simple_rnn_1 (SimpleRNN)    (8, 3)                    42        
                                                                 
Total params: 42 (168.00 Byte)
Trainable params: 42 (168.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
model = Sequential()
model.add(SimpleRNN(3, batch_input_shape=(8,2,10), return_sequences=True)) #return_sequences 값을 true -> 은닉상태 출력
model.summary()


Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 simple_rnn_2 (SimpleRNN)    (8, 2, 3)                 42        
                                                                 
Total params: 42 (168.00 Byte)
Trainable params: 42 (168.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
import numpy as np

timesteps = 10
input_dim = 4
hidden_units = 8

# 입력에 해당되는 2D 텐서
inputs = np.random.random((timesteps, input_dim))

# 초기 은닉 상태는 0(벡터)로 초기화
hidden_state_t = np.zeros((hidden_units,))

print('초기 은닉 상태 :',hidden_state_t)


초기 은닉 상태 : [0. 0. 0. 0. 0. 0. 0. 0.]


In [None]:
Wx = np.random.random((hidden_units, input_dim))  # (8, 4)크기의 2D 텐서 생성. 입력에 대한 가중치.
Wh = np.random.random((hidden_units, hidden_units)) # (8, 8)크기의 2D 텐서 생성. 은닉 상태에 대한 가중치.
b = np.random.random((hidden_units,)) # (8,)크기의 1D 텐서 생성. 이 값은 편향(bias).

print('가중치 Wx의 크기(shape) :',np.shape(Wx))
print('가중치 Wh의 크기(shape) :',np.shape(Wh))
print('편향의 크기(shape) :',np.shape(b))


가중치 Wx의 크기(shape) : (8, 4)
가중치 Wh의 크기(shape) : (8, 8)
편향의 크기(shape) : (8,)


In [None]:
total_hidden_states = []

# 각 시점 별 입력값.
for input_t in inputs:

  # Wx * Xt + Wh * Ht-1 + b(bias)
  output_t = np.tanh(np.dot(Wx,input_t) + np.dot(Wh,hidden_state_t) + b)

  # 각 시점 t별 메모리 셀의 출력의 크기는 (timestep t, output_dim)
  # 각 시점의 은닉 상태의 값을 계속해서 누적
  total_hidden_states.append(list(output_t))
  hidden_state_t = output_t

# 출력 시 값을 깔끔하게 해주는 용도.
total_hidden_states = np.stack(total_hidden_states, axis = 0)

# (timesteps, output_dim)
print('모든 시점의 은닉 상태 :')
print(total_hidden_states)


모든 시점의 은닉 상태 :
[[0.69747514 0.9520125  0.82406018 0.87947816 0.86533404 0.95485367
  0.72802468 0.79849338]
 [0.99976415 0.99998336 0.99999095 0.99999547 0.99916752 0.9999936
  0.99909236 0.99999604]
 [0.99993536 0.99999528 0.99999812 0.99999788 0.99968765 0.99999715
  0.99966391 0.99999896]
 [0.99994989 0.99999572 0.99999851 0.99999645 0.99974042 0.99999744
  0.99950693 0.99999856]
 [0.99991803 0.99999288 0.99999732 0.99999711 0.99946438 0.99999534
  0.99957504 0.99999852]
 [0.99992828 0.9999931  0.99999736 0.99999598 0.99942784 0.99999514
  0.99949505 0.99999815]
 [0.99986867 0.99998707 0.99999577 0.99999744 0.99873986 0.99999154
  0.99952585 0.99999826]
 [0.99994802 0.99999671 0.99999902 0.99999909 0.99970984 0.99999852
  0.9997079  0.99999941]
 [0.99978608 0.99997766 0.9999906  0.99999364 0.99924683 0.99998205
  0.99946686 0.99999693]
 [0.9999018  0.99999115 0.99999706 0.99999617 0.99963199 0.99999463
  0.99948314 0.99999821]]


In [None]:
#은닉층 두개
model = Sequential()
model.add(SimpleRNN(hidden_units, input_length=10, input_dim=5, return_sequences=True))
model.add(SimpleRNN(hidden_units, return_sequences=True))
model.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 simple_rnn_10 (SimpleRNN)   (None, 10, 8)             112       
                                                                 
 simple_rnn_11 (SimpleRNN)   (None, 10, 8)             136       
                                                                 
Total params: 248 (992.00 Byte)
Trainable params: 248 (992.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
#양방향
from tensorflow.keras.layers import Bidirectional

timesteps = 10
input_dim = 5

model = Sequential()
model.add(Bidirectional(SimpleRNN(hidden_units, return_sequences=True), input_shape=(timesteps, input_dim)))


In [None]:
model = Sequential()
model.add(Bidirectional(SimpleRNN(hidden_units, return_sequences=True), input_shape=(timesteps, input_dim)))
model.add(Bidirectional(SimpleRNN(hidden_units, return_sequences=True)))
model.add(Bidirectional(SimpleRNN(hidden_units, return_sequences=True)))
model.add(Bidirectional(SimpleRNN(hidden_units, return_sequences=True)))
model.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional_6 (Bidirecti  (None, 10, 16)            224       
 onal)                                                           
                                                                 
 bidirectional_7 (Bidirecti  (None, 10, 16)            400       
 onal)                                                           
                                                                 
 bidirectional_8 (Bidirecti  (None, 10, 16)            400       
 onal)                                                           
                                                                 
 bidirectional_9 (Bidirecti  (None, 10, 16)            400       
 onal)                                                           
                                                                 
Total params: 1424 (5.56 KB)
Trainable params: 1424 (5

### **케라스의 SimpleRNN과 LSTM 이해하기**

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import SimpleRNN, LSTM, Bidirectional

train_X = [[0.1, 4.2, 1.5, 1.1, 2.8], [1.0, 3.1, 2.5, 0.7, 1.1], [0.3, 2.1, 1.5, 2.1, 0.1], [2.2, 1.4, 0.5, 0.9, 1.1]]
print(np.shape(train_X))
#단어 벡터의 차원은 5

(4, 5)


In [None]:
train_X = [[[0.1, 4.2, 1.5, 1.1, 2.8], [1.0, 3.1, 2.5, 0.7, 1.1], [0.3, 2.1, 1.5, 2.1, 0.1], [2.2, 1.4, 0.5, 0.9, 1.1]]]
train_X = np.array(train_X, dtype=np.float32)
print(train_X.shape)
#배치크기 1을 추가하여 3D텐서로 변경

(1, 4, 5)


In [None]:
rnn = SimpleRNN(3)
# rnn = SimpleRNN(3, return_sequences=False, return_state=False)와 동일.
hidden_state = rnn(train_X)

print('hidden state : {}, shape: {}'.format(hidden_state, hidden_state.shape))


hidden state : [[-0.9948007   0.80878836 -0.7302251 ]], shape: (1, 3)


In [None]:
rnn = SimpleRNN(3, return_sequences=True) #return_sequence=true로 해서 은닉상태 출력
hidden_states = rnn(train_X)

print('hidden states : {}, shape: {}'.format(hidden_states, hidden_states.shape))


hidden states : [[[-0.99957275 -0.97253996  0.6890963 ]
  [-0.9982143  -0.850565    0.04091832]
  [-0.98033166  0.4973345   0.7508743 ]
  [-0.7624582  -0.8807801   0.98852897]]], shape: (1, 4, 3)


In [None]:
rnn = SimpleRNN(3, return_sequences=True, return_state=True) #은닉상태 출력, 마지막 시점의 은닉상태 출력
hidden_states, last_state = rnn(train_X)

print('hidden states : {}, shape: {}'.format(hidden_states, hidden_states.shape))
print('last hidden state : {}, shape: {}'.format(last_state, last_state.shape))


hidden states : [[[ 0.5517705   0.9243687   0.9988912 ]
  [-0.70834166  0.25067243  0.70666975]
  [-0.8267746   0.99512553  0.9526299 ]
  [-0.8751349   0.97075963  0.90179384]]], shape: (1, 4, 3)
last hidden state : [[-0.8751349   0.97075963  0.90179384]], shape: (1, 3)


In [None]:
rnn = SimpleRNN(3, return_sequences=False, return_state=True) #모두 마지막 시점의 은닉 상태 출력
hidden_state, last_state = rnn(train_X)

print('hidden state : {}, shape: {}'.format(hidden_state, hidden_state.shape))
print('last hidden state : {}, shape: {}'.format(last_state, last_state.shape))


hidden state : [[ 0.9189658  -0.64069396  0.9764519 ]], shape: (1, 3)
last hidden state : [[ 0.9189658  -0.64069396  0.9764519 ]], shape: (1, 3)


In [None]:
lstm = LSTM(3, return_sequences=False, return_state=True)
hidden_state, last_state, last_cell_state = lstm(train_X)

print('hidden state : {}, shape: {}'.format(hidden_state, hidden_state.shape)) #마지막 은닉상태와 shape 출력
print('last hidden state : {}, shape: {}'.format(last_state, last_state.shape))
print('last cell state : {}, shape: {}'.format(last_cell_state, last_cell_state.shape)) #셀 상태까지


hidden state : [[-0.29802382  0.26670095  0.12661272]], shape: (1, 3)
last hidden state : [[-0.29802382  0.26670095  0.12661272]], shape: (1, 3)
last cell state : [[-0.62705266  0.35841253  0.47645295]], shape: (1, 3)


In [None]:
lstm = LSTM(3, return_sequences=True, return_state=True)
hidden_states, last_hidden_state, last_cell_state = lstm(train_X)

print('hidden states : {}, shape: {}'.format(hidden_states, hidden_states.shape))#모든 시점의 은닉상태 출력
print('last hidden state : {}, shape: {}'.format(last_hidden_state, last_hidden_state.shape))
print('last cell state : {}, shape: {}'.format(last_cell_state, last_cell_state.shape))


hidden states : [[[-0.22552623 -0.04550451  0.39649454]
  [-0.2122874  -0.03874421  0.43619826]
  [-0.3538742   0.08344807  0.5200256 ]
  [-0.4719701  -0.00708192  0.30684656]]], shape: (1, 4, 3)
last hidden state : [[-0.4719701  -0.00708192  0.30684656]], shape: (1, 3)
last cell state : [[-0.82930833 -0.01085801  0.8804437 ]], shape: (1, 3)


In [None]:
k_init = tf.keras.initializers.Constant(value=0.1)
b_init = tf.keras.initializers.Constant(value=0)
r_init = tf.keras.initializers.Constant(value=0.1)


In [None]:
bilstm = Bidirectional(LSTM(3, return_sequences=False, return_state=True, \
                            kernel_initializer=k_init, bias_initializer=b_init, recurrent_initializer=r_init)) #마지막 시점 은닉상태 출력
hidden_states, forward_h, forward_c, backward_h, backward_c = bilstm(train_X)

print('hidden states : {}, shape: {}'.format(hidden_states, hidden_states.shape))
print('forward state : {}, shape: {}'.format(forward_h, forward_h.shape)) #정방향 은닉상태
print('backward state : {}, shape: {}'.format(backward_h, backward_h.shape))#역방향 은닉상태


hidden states : [[0.6303138 0.6303138 0.6303138 0.7038734 0.7038734 0.7038734]], shape: (1, 6)
forward state : [[0.6303138 0.6303138 0.6303138]], shape: (1, 3)
backward state : [[0.7038734 0.7038734 0.7038734]], shape: (1, 3)


In [None]:
bilstm = Bidirectional(LSTM(3, return_sequences=True, return_state=True, \
                            kernel_initializer=k_init, bias_initializer=b_init, recurrent_initializer=r_init)) #모든 상태 은닉상태 출력
hidden_states, forward_h, forward_c, backward_h, backward_c = bilstm(train_X)


In [None]:
print('hidden states : {}, shape: {}'.format(hidden_states, hidden_states.shape))
print('forward state : {}, shape: {}'.format(forward_h, forward_h.shape))
print('backward state : {}, shape: {}'.format(backward_h, backward_h.shape))


hidden states : [[[0.35906473 0.35906473 0.35906473 0.7038734  0.7038734  0.7038734 ]
  [0.55111325 0.55111325 0.55111325 0.58863586 0.58863586 0.58863586]
  [0.59115744 0.59115744 0.59115744 0.3951699  0.3951699  0.3951699 ]
  [0.6303138  0.6303138  0.6303138  0.21942244 0.21942244 0.21942244]]], shape: (1, 4, 6)
forward state : [[0.6303138 0.6303138 0.6303138]], shape: (1, 3)
backward state : [[0.7038734 0.7038734 0.7038734]], shape: (1, 3)


### **RNN을 이용한 텍스트 생성**

In [1]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

text = """경마장에 있는 말이 뛰고 있다\n
그의 말이 법이다\n
가는 말이 고와야 오는 말이 곱다\n"""

tokenizer = Tokenizer() #토크나이저
tokenizer.fit_on_texts([text])
vocab_size = len(tokenizer.word_index) + 1 #패딩을 위한 0을 고려하여 +1
print('단어 집합의 크기 : %d' % vocab_size)
print(tokenizer.word_index)


단어 집합의 크기 : 12
{'말이': 1, '경마장에': 2, '있는': 3, '뛰고': 4, '있다': 5, '그의': 6, '법이다': 7, '가는': 8, '고와야': 9, '오는': 10, '곱다': 11}


In [2]:
print(vocab_size)

12


In [3]:
sequences = list()
for line in text.split('\n'): # 줄바꿈 문자를 기준으로 문장 토큰화
  #줄바꿈을 기준으로 문장을 가져와서 숫자로 바꿔서 저장함 -> 결국 encoded의 길이는 단어의 개수
    encoded = tokenizer.texts_to_sequences([line])[0] #경마장에 있는 말이 뛰고 있다
    print(encoded)
    for i in range(1, len(encoded)):
        sequence = encoded[:i+1]
        sequences.append(sequence)
        #경마장에 있는
        #경마장에 있는 말이
        #경마장에 있는 말이 뛰고
        #경마장에 있는 말이 뛰고 있다

print('학습에 사용할 샘플의 개수: %d' % len(sequences))


[2, 3, 1, 4, 5]
[]
[6, 1, 7]
[]
[8, 1, 9, 10, 1, 11]
[]
학습에 사용할 샘플의 개수: 11


In [None]:
max_len = max(len(l) for l in sequences) # 모든 샘플에서 길이가 가장 긴 샘플의 길이 출력
print('샘플의 최대 길이 : {}'.format(max_len))
#전체 샘플의 길이를 일치시켜 주어야하기 때문에 가장 긴 샘플의 길이 확인

샘플의 최대 길이 : 6


In [None]:
sequences = pad_sequences(sequences, maxlen=max_len, padding='pre') #패딩 - 앞부터 0으로 채워주기
print(sequences)

[[ 0  0  0  0  2  3]
 [ 0  0  0  2  3  1]
 [ 0  0  2  3  1  4]
 [ 0  2  3  1  4  5]
 [ 0  0  0  0  6  1]
 [ 0  0  0  6  1  7]
 [ 0  0  0  0  8  1]
 [ 0  0  0  8  1  9]
 [ 0  0  8  1  9 10]
 [ 0  8  1  9 10  1]
 [ 8  1  9 10  1 11]]


In [None]:
sequences = np.array(sequences)
X = sequences[:,:-1]
y = sequences[:,-1]
#X와 y로 분리하기
print(X)
print(y)

[[ 0  0  0  0  2]
 [ 0  0  0  2  3]
 [ 0  0  2  3  1]
 [ 0  2  3  1  4]
 [ 0  0  0  0  6]
 [ 0  0  0  6  1]
 [ 0  0  0  0  8]
 [ 0  0  0  8  1]
 [ 0  0  8  1  9]
 [ 0  8  1  9 10]
 [ 8  1  9 10  1]]
[ 3  1  4  5  1  7  1  9 10  1 11]


In [None]:
y = to_categorical(y, num_classes=vocab_size)
#레이블에 대한 원핫인코딩 수행
print(y)

[[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, SimpleRNN

embedding_dim = 10
hidden_units = 32

model = Sequential()
model.add(Embedding(vocab_size, embedding_dim))#첫번째 인자 : 단어사전의 크기, 두번째 인자 : 임베딩 차원 / 투사층
#12개의 단어를 포함한 Embedding 레이어를 선언할 것이고 각 단어는 10차원으로 분산 표현할 것이다
model.add(SimpleRNN(hidden_units))#은닉층
model.add(Dense(vocab_size, activation='softmax'))#활성화 함수 = 소프트맥스 / 출력층
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])#손실함수 : 크로스 엔트로피 함수
model.fit(X, y, epochs=200, verbose=2)#epoch : 200



Epoch 1/200
1/1 - 1s - loss: 2.5041 - accuracy: 0.0000e+00 - 1s/epoch - 1s/step
Epoch 2/200
1/1 - 0s - loss: 2.4906 - accuracy: 0.0909 - 7ms/epoch - 7ms/step
Epoch 3/200
1/1 - 0s - loss: 2.4777 - accuracy: 0.0909 - 9ms/epoch - 9ms/step
Epoch 4/200
1/1 - 0s - loss: 2.4652 - accuracy: 0.3636 - 8ms/epoch - 8ms/step
Epoch 5/200
1/1 - 0s - loss: 2.4530 - accuracy: 0.3636 - 8ms/epoch - 8ms/step
Epoch 6/200
1/1 - 0s - loss: 2.4409 - accuracy: 0.3636 - 7ms/epoch - 7ms/step
Epoch 7/200
1/1 - 0s - loss: 2.4289 - accuracy: 0.4545 - 7ms/epoch - 7ms/step
Epoch 8/200
1/1 - 0s - loss: 2.4167 - accuracy: 0.5455 - 7ms/epoch - 7ms/step
Epoch 9/200
1/1 - 0s - loss: 2.4044 - accuracy: 0.4545 - 7ms/epoch - 7ms/step
Epoch 10/200
1/1 - 0s - loss: 2.3917 - accuracy: 0.4545 - 9ms/epoch - 9ms/step
Epoch 11/200
1/1 - 0s - loss: 2.3786 - accuracy: 0.4545 - 9ms/epoch - 9ms/step
Epoch 12/200
1/1 - 0s - loss: 2.3651 - accuracy: 0.4545 - 6ms/epoch - 6ms/step
Epoch 13/200
1/1 - 0s - loss: 2.3509 - accuracy: 0.4545 - 8

<keras.src.callbacks.History at 0x784f0373e6e0>

In [None]:
def sentence_generation(model, tokenizer, current_word, n): # 모델, 토크나이저, 현재 단어, 반복할 횟수
    init_word = current_word
    sentence = ''

    # n번 반복
    for _ in range(n):
        # 현재 단어에 대한 정수 인코딩과 패딩
        encoded = tokenizer.texts_to_sequences([current_word])[0]
        encoded = pad_sequences([encoded], maxlen=5, padding='pre')
        # 입력한 X(현재 단어)에 대해서 Y를 예측하고 Y(예측한 단어)를 result에 저장.
        result = model.predict(encoded, verbose=0)
        result = np.argmax(result, axis=1)

        for word, index in tokenizer.word_index.items():
            # 만약 예측한 단어와 인덱스와 동일한 단어가 있다면 break
            if index == result:
                break

        # 현재 단어 + ' ' + 예측 단어를 현재 단어로 변경
        current_word = current_word + ' '  + word

        # 예측 단어를 문장에 저장
        sentence = sentence + ' ' + word

    sentence = init_word + sentence
    return sentence


In [None]:
print(sentence_generation(model, tokenizer, '경마장에', 4))


경마장에 있는 말이 뛰고 있다


In [None]:
print(sentence_generation(model, tokenizer, '그의', 2))


그의 말이 법이다


In [None]:
print(sentence_generation(model, tokenizer, '가는', 5))


가는 말이 고와야 오는 말이 곱다


In [None]:
import pandas as pd
import numpy as np
from string import punctuation

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

df = pd.read_csv('/content/drive/MyDrive/ArticlesApril2018.csv')
df.head()


Unnamed: 0,articleID,articleWordCount,byline,documentType,headline,keywords,multimedia,newDesk,printPage,pubDate,sectionName,snippet,source,typeOfMaterial,webURL
0,5adf6684068401528a2aa69b,781,By JOHN BRANCH,article,Former N.F.L. Cheerleaders’ Settlement Offer: ...,"['Workplace Hazards and Violations', 'Football...",68,Sports,0,2018-04-24 17:16:49,Pro Football,"“I understand that they could meet with us, pa...",The New York Times,News,https://www.nytimes.com/2018/04/24/sports/foot...
1,5adf653f068401528a2aa697,656,By LISA FRIEDMAN,article,E.P.A. to Unveil a New Rule. Its Effect: Less ...,"['Environmental Protection Agency', 'Pruitt, S...",68,Climate,0,2018-04-24 17:11:21,Unknown,The agency plans to publish a new regulation T...,The New York Times,News,https://www.nytimes.com/2018/04/24/climate/epa...
2,5adf4626068401528a2aa628,2427,By PETE WELLS,article,"The New Noma, Explained","['Restaurants', 'Noma (Copenhagen, Restaurant)...",66,Dining,0,2018-04-24 14:58:44,Unknown,What’s it like to eat at the second incarnatio...,The New York Times,News,https://www.nytimes.com/2018/04/24/dining/noma...
3,5adf40d2068401528a2aa619,626,By JULIE HIRSCHFELD DAVIS and PETER BAKER,article,Unknown,"['Macron, Emmanuel (1977- )', 'Trump, Donald J...",68,Washington,0,2018-04-24 14:35:57,Europe,President Trump welcomed President Emmanuel Ma...,The New York Times,News,https://www.nytimes.com/2018/04/24/world/europ...
4,5adf3d64068401528a2aa60f,815,By IAN AUSTEN and DAN BILEFSKY,article,Unknown,"['Toronto, Ontario, Attack (April, 2018)', 'Mu...",68,Foreign,0,2018-04-24 14:21:21,Canada,"Alek Minassian, 25, a resident of Toronto’s Ri...",The New York Times,News,https://www.nytimes.com/2018/04/24/world/canad...


In [None]:
print('열의 개수: ',len(df.columns)) #열 개수 확인
print(df.columns) #열 종류 확인
print(df['headline'].isnull().values.any())#null값 확인


열의 개수:  15
Index(['articleID', 'articleWordCount', 'byline', 'documentType', 'headline',
       'keywords', 'multimedia', 'newDesk', 'printPage', 'pubDate',
       'sectionName', 'snippet', 'source', 'typeOfMaterial', 'webURL'],
      dtype='object')
False


In [None]:
headline = []
# 헤드라인의 값들을 리스트로 저장
headline.extend(list(df.headline.values))
headline[:5]


['Former N.F.L. Cheerleaders’ Settlement Offer: $1 and a Meeting With Goodell',
 'E.P.A. to Unveil a New Rule. Its Effect: Less Science in Policymaking.',
 'The New Noma, Explained',
 'Unknown',
 'Unknown']

In [None]:
print('총 샘플의 개수 : {}'.format(len(headline)))
headline = [word for word in headline if word != "Unknown"]
print('노이즈값 제거 후 샘플의 개수 : {}'.format(len(headline)))


총 샘플의 개수 : 1324
노이즈값 제거 후 샘플의 개수 : 1214


In [None]:
#전처리 함수
def repreprocessing(raw_sentence):
    preproceseed_sentence = raw_sentence.encode("utf8").decode("ascii",'ignore')
    # 구두점 제거, 소문자화
    return ''.join(word for word in preproceseed_sentence if word not in punctuation).lower()

preprocessed_headline = [repreprocessing(x) for x in headline]
preprocessed_headline[:5]


['former nfl cheerleaders settlement offer 1 and a meeting with goodell',
 'epa to unveil a new rule its effect less science in policymaking',
 'the new noma explained',
 'how a bag of texas dirt  became a times tradition',
 'is school a place for selfexpression']

In [None]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(preprocessed_headline) #정수 인코딩
vocab_size = len(tokenizer.word_index) + 1
print('단어 집합의 크기 : %d' % vocab_size)


단어 집합의 크기 : 3494


In [None]:
sequences = list()

for sentence in preprocessed_headline:

    # 각 샘플에 대한 정수 인코딩
    encoded = tokenizer.texts_to_sequences([sentence])[0]
    for i in range(1, len(encoded)):
        sequence = encoded[:i+1]
        sequences.append(sequence)
        #위에서 한 경마장에~와 같은 흐름

sequences[:11]


[[99, 269],
 [99, 269, 371],
 [99, 269, 371, 1115],
 [99, 269, 371, 1115, 582],
 [99, 269, 371, 1115, 582, 52],
 [99, 269, 371, 1115, 582, 52, 7],
 [99, 269, 371, 1115, 582, 52, 7, 2],
 [99, 269, 371, 1115, 582, 52, 7, 2, 372],
 [99, 269, 371, 1115, 582, 52, 7, 2, 372, 10],
 [99, 269, 371, 1115, 582, 52, 7, 2, 372, 10, 1116],
 [100, 3]]

In [None]:
index_to_word = {}
for key, value in tokenizer.word_index.items(): # 인덱스를 단어로 바꾸기 위해 index_to_word를 생성
    index_to_word[value] = key

print('빈도수 상위 582번 단어 : {}'.format(index_to_word[582]))


빈도수 상위 582번 단어 : offer


In [None]:
max_len = max(len(l) for l in sequences)
print('샘플의 최대 길이 : {}'.format(max_len)) #24
sequences = pad_sequences(sequences, maxlen=max_len, padding='pre') #24로 패딩(앞에0)
print(sequences[:3])


샘플의 최대 길이 : 24
[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0   99  269]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0   99  269  371]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0   99  269  371 1115]]


In [None]:
sequences = np.array(sequences)
X = sequences[:,:-1]
y = sequences[:,-1]
print(X[:3])
print(y[:3])


[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0  99]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0  99 269]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0  99 269 371]]
[ 269  371 1115]


In [None]:
y = to_categorical(y, num_classes=vocab_size)
#원핫인코딩 수행

In [None]:
print(vocab_size)

3494


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM

embedding_dim = 10
hidden_units = 128

model = Sequential()
model.add(Embedding(vocab_size, embedding_dim)) #3494 단어를 가지고 임베딩층 생성, 차원은 10
model.add(LSTM(hidden_units)) #은닉층
model.add(Dense(vocab_size, activation='softmax')) #소프트맥스 함수
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])#크로스엔트로피
model.fit(X, y, epochs=200, verbose=2)


Epoch 1/200
244/244 - 12s - loss: 7.6453 - accuracy: 0.0282 - 12s/epoch - 48ms/step
Epoch 2/200
244/244 - 7s - loss: 7.1149 - accuracy: 0.0300 - 7s/epoch - 31ms/step
Epoch 3/200
244/244 - 10s - loss: 6.9774 - accuracy: 0.0305 - 10s/epoch - 39ms/step
Epoch 4/200
244/244 - 9s - loss: 6.8544 - accuracy: 0.0401 - 9s/epoch - 36ms/step
Epoch 5/200
244/244 - 8s - loss: 6.7112 - accuracy: 0.0434 - 8s/epoch - 34ms/step
Epoch 6/200
244/244 - 9s - loss: 6.5440 - accuracy: 0.0491 - 9s/epoch - 39ms/step
Epoch 7/200
244/244 - 7s - loss: 6.3585 - accuracy: 0.0528 - 7s/epoch - 31ms/step
Epoch 8/200
244/244 - 11s - loss: 6.1644 - accuracy: 0.0577 - 11s/epoch - 47ms/step
Epoch 9/200
244/244 - 8s - loss: 5.9762 - accuracy: 0.0604 - 8s/epoch - 34ms/step
Epoch 10/200
244/244 - 8s - loss: 5.7949 - accuracy: 0.0647 - 8s/epoch - 35ms/step
Epoch 11/200
244/244 - 9s - loss: 5.6299 - accuracy: 0.0697 - 9s/epoch - 39ms/step
Epoch 12/200
244/244 - 7s - loss: 5.4722 - accuracy: 0.0747 - 7s/epoch - 30ms/step
Epoch 1

<keras.src.callbacks.History at 0x784f11007c10>

In [None]:
def sentence_generation(model, tokenizer, current_word, n): # 모델, 토크나이저, 현재 단어, 반복할 횟수
    init_word = current_word
    sentence = ''

    # n번 반복
    for _ in range(n):
        encoded = tokenizer.texts_to_sequences([current_word])[0]
        encoded = pad_sequences([encoded], maxlen=max_len-1, padding='pre')

        # 입력한 X(현재 단어)에 대해서 y를 예측하고 y(예측한 단어)를 result에 저장.
        result = model.predict(encoded, verbose=0)
        result = np.argmax(result, axis=1)

        for word, index in tokenizer.word_index.items():
            # 만약 예측한 단어와 인덱스와 동일한 단어가 있다면
            if index == result:
                break

        # 현재 단어 + ' ' + 예측 단어를 현재 단어로 변경
        current_word = current_word + ' '  + word

        # 예측 단어를 문장에 저장
        sentence = sentence + ' ' + word

    sentence = init_word + sentence
    return sentence


In [None]:
print(sentence_generation(model, tokenizer, 'i', 10))



i cant jump ship from facebook yet them up about that


In [None]:
print(sentence_generation(model, tokenizer, 'how', 10))


how to make facebook more accountable on team trump ends in


In [None]:
import numpy as np
import urllib.request
from tensorflow.keras.utils import to_categorical

# 데이터 로드
urllib.request.urlretrieve("http://www.gutenberg.org/files/11/11-0.txt", filename="11-0.txt")

f = open('11-0.txt', 'rb')
sentences = []
for sentence in f: # 데이터로부터 한 줄씩 읽는다.
    sentence = sentence.strip() # strip()을 통해 \r, \n을 제거한다.
    sentence = sentence.lower() # 소문자화.
    sentence = sentence.decode('ascii', 'ignore') # \xe2\x80\x99 등과 같은 바이트 열 제거
    if len(sentence) > 0: #리스트에 추가
        sentences.append(sentence)
f.close()


In [None]:
sentences[:5]


['the project gutenberg ebook of alices adventures in wonderland, by lewis carroll',
 'this ebook is for the use of anyone anywhere in the united states and',
 'most other parts of the world at no cost and with almost no restrictions',
 'whatsoever. you may copy it, give it away or re-use it under the terms',
 'of the project gutenberg license included with this ebook or online at']

In [None]:
total_data = ' '.join(sentences) #모든 문장을 하나의 문자열로 합치기
print('문자열의 길이 또는 총 문자의 개수: %d' % len(total_data))


문자열의 길이 또는 총 문자의 개수: 159484


In [None]:
print(total_data[:200])


the project gutenberg ebook of alices adventures in wonderland, by lewis carroll this ebook is for the use of anyone anywhere in the united states and most other parts of the world at no cost and with


In [None]:
char_vocab = sorted(list(set(total_data)))
vocab_size = len(char_vocab)
print ('문자 집합의 크기 : {}'.format(vocab_size))


문자 집합의 크기 : 56


In [None]:
# 문자에 고유한 정수 부여
char_to_index = dict((char, index) for index, char in enumerate(char_vocab))
print('문자 집합 :',char_to_index)
#구두점, 특수문자 존재/ 26개의 알파벳 소문자 집합

문자 집합 : {' ': 0, '!': 1, '"': 2, '#': 3, '$': 4, '%': 5, "'": 6, '(': 7, ')': 8, '*': 9, ',': 10, '-': 11, '.': 12, '/': 13, '0': 14, '1': 15, '2': 16, '3': 17, '4': 18, '5': 19, '6': 20, '7': 21, '8': 22, '9': 23, ':': 24, ';': 25, '?': 26, '[': 27, ']': 28, '_': 29, 'a': 30, 'b': 31, 'c': 32, 'd': 33, 'e': 34, 'f': 35, 'g': 36, 'h': 37, 'i': 38, 'j': 39, 'k': 40, 'l': 41, 'm': 42, 'n': 43, 'o': 44, 'p': 45, 'q': 46, 'r': 47, 's': 48, 't': 49, 'u': 50, 'v': 51, 'w': 52, 'x': 53, 'y': 54, 'z': 55}


In [None]:
index_to_char = {}
for key, value in char_to_index.items():
    index_to_char[value] = key


In [None]:
seq_length = 60

# 문자열의 길이를 seq_length로 나누면 전처리 후 생겨날 샘플 수
n_samples = int(np.floor((len(total_data) - 1) / seq_length))
print ('샘플의 수 : {}'.format(n_samples))


샘플의 수 : 2658


In [None]:
train_X = []
train_y = []

for i in range(n_samples):
    # 0:60 -> 60:120 -> 120:180로 loop를 돌면서 문장 샘플을 1개씩 pick.
    X_sample = total_data[i * seq_length: (i + 1) * seq_length]

    # 정수 인코딩
    X_encoded = [char_to_index[c] for c in X_sample]
    train_X.append(X_encoded)

    # 오른쪽으로 1칸 쉬프트
    y_sample = total_data[i * seq_length + 1: (i + 1) * seq_length + 1]
    y_encoded = [char_to_index[c] for c in y_sample]
    train_y.append(y_encoded)


In [None]:
print('X 데이터의 첫번째 샘플 :',train_X[0])
print('y 데이터의 첫번째 샘플 :',train_y[0])
print('-'*50)
print('X 데이터의 첫번째 샘플 디코딩 :',[index_to_char[i] for i in train_X[0]])
print('y 데이터의 첫번째 샘플 디코딩 :',[index_to_char[i] for i in train_y[0]])



X 데이터의 첫번째 샘플 : [49, 37, 34, 0, 45, 47, 44, 39, 34, 32, 49, 0, 36, 50, 49, 34, 43, 31, 34, 47, 36, 0, 34, 31, 44, 44, 40, 0, 44, 35, 0, 30, 41, 38, 32, 34, 48, 0, 30, 33, 51, 34, 43, 49, 50, 47, 34, 48, 0, 38, 43, 0, 52, 44, 43, 33, 34, 47, 41, 30]
y 데이터의 첫번째 샘플 : [37, 34, 0, 45, 47, 44, 39, 34, 32, 49, 0, 36, 50, 49, 34, 43, 31, 34, 47, 36, 0, 34, 31, 44, 44, 40, 0, 44, 35, 0, 30, 41, 38, 32, 34, 48, 0, 30, 33, 51, 34, 43, 49, 50, 47, 34, 48, 0, 38, 43, 0, 52, 44, 43, 33, 34, 47, 41, 30, 43]
--------------------------------------------------
X 데이터의 첫번째 샘플 디코딩 : ['t', 'h', 'e', ' ', 'p', 'r', 'o', 'j', 'e', 'c', 't', ' ', 'g', 'u', 't', 'e', 'n', 'b', 'e', 'r', 'g', ' ', 'e', 'b', 'o', 'o', 'k', ' ', 'o', 'f', ' ', 'a', 'l', 'i', 'c', 'e', 's', ' ', 'a', 'd', 'v', 'e', 'n', 't', 'u', 'r', 'e', 's', ' ', 'i', 'n', ' ', 'w', 'o', 'n', 'd', 'e', 'r', 'l', 'a']
y 데이터의 첫번째 샘플 디코딩 : ['h', 'e', ' ', 'p', 'r', 'o', 'j', 'e', 'c', 't', ' ', 'g', 'u', 't', 'e', 'n', 'b', 'e', 'r', 'g', ' ', 'e',

In [None]:
print(train_X[1])


[43, 33, 10, 0, 31, 54, 0, 41, 34, 52, 38, 48, 0, 32, 30, 47, 47, 44, 41, 41, 0, 49, 37, 38, 48, 0, 34, 31, 44, 44, 40, 0, 38, 48, 0, 35, 44, 47, 0, 49, 37, 34, 0, 50, 48, 34, 0, 44, 35, 0, 30, 43, 54, 44, 43, 34, 0, 30, 43, 54]


In [None]:
print(train_y[1])


[33, 10, 0, 31, 54, 0, 41, 34, 52, 38, 48, 0, 32, 30, 47, 47, 44, 41, 41, 0, 49, 37, 38, 48, 0, 34, 31, 44, 44, 40, 0, 38, 48, 0, 35, 44, 47, 0, 49, 37, 34, 0, 50, 48, 34, 0, 44, 35, 0, 30, 43, 54, 44, 43, 34, 0, 30, 43, 54, 52]


In [None]:
train_X = to_categorical(train_X)
train_y = to_categorical(train_y)

print('train_X의 크기(shape) : {}'.format(train_X.shape)) # 원-핫 인코딩
print('train_y의 크기(shape) : {}'.format(train_y.shape)) # 원-핫 인코딩


train_X의 크기(shape) : (2658, 60, 56)
train_y의 크기(shape) : (2658, 60, 56)


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, TimeDistributed

hidden_units = 256

model = Sequential()
model.add(LSTM(hidden_units, input_shape=(None, train_X.shape[2]), return_sequences=True)) #은닉상태 = 256
model.add(LSTM(hidden_units, return_sequences=True)) #LSTM 은닉츠 2개 사용
model.add(TimeDistributed(Dense(vocab_size, activation='softmax')))#소프트맥스함수
#문자 집합 크기 만큼의 뉴런을 배치하여 모델 설계 -> 전결합층

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) #크로스 엔트로피 - 손실함수
model.fit(train_X, train_y, epochs=80, verbose=2)


Epoch 1/80
84/84 - 45s - loss: 3.0673 - accuracy: 0.1835 - 45s/epoch - 541ms/step
Epoch 2/80
84/84 - 41s - loss: 2.7953 - accuracy: 0.2287 - 41s/epoch - 493ms/step
Epoch 3/80
84/84 - 42s - loss: 2.4475 - accuracy: 0.3150 - 42s/epoch - 501ms/step
Epoch 4/80
84/84 - 41s - loss: 2.3089 - accuracy: 0.3469 - 41s/epoch - 486ms/step
Epoch 5/80
84/84 - 40s - loss: 2.2129 - accuracy: 0.3689 - 40s/epoch - 477ms/step
Epoch 6/80
84/84 - 40s - loss: 2.1341 - accuracy: 0.3885 - 40s/epoch - 477ms/step
Epoch 7/80
84/84 - 41s - loss: 2.0809 - accuracy: 0.4011 - 41s/epoch - 483ms/step
Epoch 8/80
84/84 - 41s - loss: 2.0284 - accuracy: 0.4141 - 41s/epoch - 490ms/step
Epoch 9/80
84/84 - 40s - loss: 1.9825 - accuracy: 0.4280 - 40s/epoch - 473ms/step
Epoch 10/80
84/84 - 40s - loss: 1.9412 - accuracy: 0.4388 - 40s/epoch - 477ms/step
Epoch 11/80
84/84 - 41s - loss: 1.9045 - accuracy: 0.4490 - 41s/epoch - 491ms/step
Epoch 12/80
84/84 - 42s - loss: 1.8709 - accuracy: 0.4580 - 42s/epoch - 495ms/step
Epoch 13/80
8

<keras.src.callbacks.History at 0x7e65495a7e20>

In [None]:
def sentence_generation(model, length):
    # 문자에 대한 랜덤한 정수 생성
    ix = [np.random.randint(vocab_size)]

    # 랜덤한 정수로부터 맵핑되는 문자 생성
    y_char = [index_to_char[ix[-1]]]
    print(ix[-1],'번 문자',y_char[-1],'로 예측을 시작!')

    # (1, length, 55) 크기의 X 생성. 즉, LSTM의 입력 시퀀스 생성
    X = np.zeros((1, length, vocab_size))

    for i in range(length):
        # X[0][i][예측한 문자의 인덱스] = 1, 즉, 예측 문자를 다음 입력 시퀀스에 추가
        X[0][i][ix[-1]] = 1
        print(index_to_char[ix[-1]], end="")
        ix = np.argmax(model.predict(X[:, :i+1, :])[0], 1)
        y_char.append(index_to_char[ix[-1]])
    return ('').join(y_char)


In [None]:
result = sentence_generation(model, 100)
print(result)


0 번 문자   로 예측을 시작!
 and the mouse went should like it, you know im all her confusing this talk not timidly im a poor man


In [None]:
import numpy as np
from tensorflow.keras.utils import to_categorical


raw_text = '''
I get on with life as a programmer,
I like to contemplate beer.
But when I start to daydream,
My mind turns straight to wine.

Do I love wine more than beer?

I like to use words about beer.
But when I stop my talking,
My mind turns straight to wine.

I hate bugs and errors.
But I just think back to wine,
And I'm happy once again.

I like to hang out with programming and deep learning.
But when left alone,
My mind turns straight to wine.
'''


In [None]:
tokens = raw_text.split()
raw_text = ' '.join(tokens)
print(raw_text)
#단락 구분 없애고 하나의 문자열로 재저장

I get on with life as a programmer, I like to contemplate beer. But when I start to daydream, My mind turns straight to wine. Do I love wine more than beer? I like to use words about beer. But when I stop my talking, My mind turns straight to wine. I hate bugs and errors. But I just think back to wine, And I'm happy once again. I like to hang out with programming and deep learning. But when left alone, My mind turns straight to wine.


In [None]:
# 중복을 제거한 문자 집합 생성
char_vocab = sorted(list(set(raw_text)))
vocab_size = len(char_vocab)
print('문자 집합 :',char_vocab)
print ('문자 집합의 크기 : {}'.format(vocab_size))


문자 집합 : [' ', "'", ',', '.', '?', 'A', 'B', 'D', 'I', 'M', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'y']
문자 집합의 크기 : 33


In [None]:
char_to_index = dict((char, index) for index, char in enumerate(char_vocab)) # 문자에 고유한 정수 인덱스 부여
print(char_to_index)


{' ': 0, "'": 1, ',': 2, '.': 3, '?': 4, 'A': 5, 'B': 6, 'D': 7, 'I': 8, 'M': 9, 'a': 10, 'b': 11, 'c': 12, 'd': 13, 'e': 14, 'f': 15, 'g': 16, 'h': 17, 'i': 18, 'j': 19, 'k': 20, 'l': 21, 'm': 22, 'n': 23, 'o': 24, 'p': 25, 'r': 26, 's': 27, 't': 28, 'u': 29, 'v': 30, 'w': 31, 'y': 32}


In [None]:
length = 11
sequences = []
for i in range(length, len(raw_text)):
    seq = raw_text[i-length:i] # 길이 11의 문자열을 지속적으로 만든다. (입력 시퀀스 길이는 10이지만 다음 문자도 필요함)
    sequences.append(seq) #리스트에 추
print('총 훈련 샘플의 수: %d' % len(sequences))


총 훈련 샘플의 수: 426


In [None]:
sequences[:10]


['I get on wi',
 ' get on wit',
 'get on with',
 'et on with ',
 't on with l',
 ' on with li',
 'on with lif',
 'n with life',
 ' with life ',
 'with life a']

In [None]:
encoded_sequences = []
for sequence in sequences: # 전체 데이터에서 문장 샘플을 1개씩 꺼낸다.
    encoded_sequence = [char_to_index[char] for char in sequence] # 문장 샘플에서 각 문자에 대해서 정수 인코딩을 수행.
    encoded_sequences.append(encoded_sequence)


In [None]:
encoded_sequences[:5]


[[8, 0, 16, 14, 28, 0, 24, 23, 0, 31, 18],
 [0, 16, 14, 28, 0, 24, 23, 0, 31, 18, 28],
 [16, 14, 28, 0, 24, 23, 0, 31, 18, 28, 17],
 [14, 28, 0, 24, 23, 0, 31, 18, 28, 17, 0],
 [28, 0, 24, 23, 0, 31, 18, 28, 17, 0, 21]]

In [None]:
encoded_sequences = np.array(encoded_sequences)

# 맨 마지막 위치의 문자를 분리
X_data = encoded_sequences[:,:-1]
# 맨 마지막 위치의 문자를 저장
y_data = encoded_sequences[:,-1]


In [None]:
print(X_data[:5])
print(y_data[:5])


[[ 8  0 16 14 28  0 24 23  0 31]
 [ 0 16 14 28  0 24 23  0 31 18]
 [16 14 28  0 24 23  0 31 18 28]
 [14 28  0 24 23  0 31 18 28 17]
 [28  0 24 23  0 31 18 28 17  0]]
[18 28 17  0 21]


In [None]:
# 원-핫 인코딩
X_data_one_hot = [to_categorical(encoded, num_classes=vocab_size) for encoded in X_data]
X_data_one_hot = np.array(X_data_one_hot)
y_data_one_hot = to_categorical(y_data, num_classes=vocab_size)


In [None]:
print(X_data_one_hot.shape)


(426, 10, 33)


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.preprocessing.sequence import pad_sequences

hidden_units = 64

model = Sequential()
model.add(LSTM(hidden_units, input_shape=(X_data_one_hot.shape[1], X_data_one_hot.shape[2]))) #은닉상태 64, input shape = (10,33)
model.add(Dense(vocab_size, activation='softmax')) #소프트맥스 함수 사용, 출력층으로 문자 집합 크기만큼 뉴런 배치

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) #크로스 엔트로피 함수 사용
model.fit(X_data_one_hot, y_data_one_hot, epochs=100, verbose=2)


Epoch 1/100
14/14 - 3s - loss: 3.4791 - accuracy: 0.0892 - 3s/epoch - 225ms/step
Epoch 2/100
14/14 - 0s - loss: 3.4068 - accuracy: 0.1972 - 111ms/epoch - 8ms/step
Epoch 3/100
14/14 - 0s - loss: 3.1959 - accuracy: 0.1972 - 111ms/epoch - 8ms/step
Epoch 4/100
14/14 - 0s - loss: 3.0076 - accuracy: 0.1972 - 122ms/epoch - 9ms/step
Epoch 5/100
14/14 - 0s - loss: 2.9615 - accuracy: 0.1972 - 112ms/epoch - 8ms/step
Epoch 6/100
14/14 - 0s - loss: 2.9341 - accuracy: 0.1972 - 104ms/epoch - 7ms/step
Epoch 7/100
14/14 - 0s - loss: 2.9124 - accuracy: 0.1972 - 104ms/epoch - 7ms/step
Epoch 8/100
14/14 - 0s - loss: 2.8979 - accuracy: 0.1972 - 106ms/epoch - 8ms/step
Epoch 9/100
14/14 - 0s - loss: 2.8751 - accuracy: 0.1972 - 115ms/epoch - 8ms/step
Epoch 10/100
14/14 - 0s - loss: 2.8436 - accuracy: 0.1972 - 109ms/epoch - 8ms/step
Epoch 11/100
14/14 - 0s - loss: 2.8154 - accuracy: 0.1995 - 109ms/epoch - 8ms/step
Epoch 12/100
14/14 - 0s - loss: 2.7779 - accuracy: 0.2066 - 114ms/epoch - 8ms/step
Epoch 13/100
1

<keras.src.callbacks.History at 0x7e6543e0af20>

In [None]:
def sentence_generation(model, char_to_index, seq_length, seed_text, n):

    # 초기 시퀀스
    init_text = seed_text
    sentence = ''

    # 다음 문자 예측은 총 n번만 반복.
    for _ in range(n):
        encoded = [char_to_index[char] for char in seed_text] # 현재 시퀀스(아래 예시에서는 I get on w)에 대한 정수 인코딩
        encoded = pad_sequences([encoded], maxlen=seq_length, padding='pre') # 데이터에 대한 패딩
        encoded = to_categorical(encoded, num_classes=len(char_to_index))#원핫인코딩

        # 입력한 X(현재 시퀀스)에 대해서 y를 예측하고 y(예측한 문자)를 result에 저장.
        result = model.predict(encoded, verbose=0)
        result = np.argmax(result, axis=1)

        for char, index in char_to_index.items():
            if index == result:
                break

        # 현재 시퀀스 + 예측 문자를 현재 시퀀스로 변경
        seed_text = seed_text + char

        # 예측 문자를 문장에 저장
        sentence = sentence + char

    # n번의 다음 문자 예측이 끝나면 최종 완성된 문장을 리턴.
    sentence = init_text + sentence
    return sentence


In [None]:
print(sentence_generation(model, char_to_index, 10, 'I get on w', 80))


I get on with life as a programmer, I like to cange mlnd moeer. aBu hhe I laat bo ee.. But
