# Generate sentence using reuters dataset

## Import

In [68]:
import numpy as np
from tqdm import tqdm

from tensorflow.keras.datasets import reuters
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences

## Load reuters dataset

In [54]:
(x_train, y_train), (x_test, y_test) = reuters.load_data(
    path='reuters.npz', num_words=None, test_split=0,
)
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((11228,), (11228,), (0,), (0,))

In [55]:
word2index = reuters.get_word_index(path="reuters_word_index.json")
index2word = {i:w for w, i in word2index.items()}
len(word2index), len(index2word)

(30979, 30979)

In [65]:
texts = []
char_vocab = []
for text in x_train:
    text = [index2word[w] for w in text if w in index2word]
    text = " ".join(text)
    char_vocab = list(set(char_vocab + list(set(text))))
    texts.append(text)
char_vocab = sorted(char_vocab)
len(texts), len(char_vocab)

(11228, 39)

In [66]:
index2char = {i:c for i, c in enumerate(char_vocab)}
char2index = {c:i for i, c in index2char.items()}
len(index2char), len(char2index)

(39, 39)

In [85]:
length = 11
sequences = []
for text in tqdm(texts):
    len_text = len(text)
    for j in range(len_text-length+1):
        seq = [char2index[c] for c in text[j:j+length]]
        sequences.append(seq)
print(f"총 훈련 샘플 수: {len(sequences)}")

100%|██████████| 11228/11228 [00:15<00:00, 742.44it/s]

총 훈련 샘플 수: 8968495





In [86]:
sequences[:10]

[[32, 20, 17, 0, 35, 13, 32, 32, 21, 17, 0],
 [20, 17, 0, 35, 13, 32, 32, 21, 17, 0, 26],
 [17, 0, 35, 13, 32, 32, 21, 17, 0, 26, 27],
 [0, 35, 13, 32, 32, 21, 17, 0, 26, 27, 26],
 [35, 13, 32, 32, 21, 17, 0, 26, 27, 26, 16],
 [13, 32, 32, 21, 17, 0, 26, 27, 26, 16, 21],
 [32, 32, 21, 17, 0, 26, 27, 26, 16, 21, 31],
 [32, 21, 17, 0, 26, 27, 26, 16, 21, 31, 15],
 [21, 17, 0, 26, 27, 26, 16, 21, 31, 15, 30],
 [17, 0, 26, 27, 26, 16, 21, 31, 15, 30, 21]]

In [87]:
sequences = np.array(sequences)
X = sequences[:,:-1]
y = sequences[:,-1]
X.shape, y.shape

((8968495, 10), (8968495,))

In [88]:
vocab_size = len(char_vocab)
vocab_size

39

In [90]:
sequences = []
for x in tqdm(X):
    sequences.append(to_categorical(x, num_classes=vocab_size))
len(sequences)

100%|██████████| 8968495/8968495 [08:47<00:00, 17005.37it/s] 


8968495

In [91]:
X = np.array(sequences)
y = to_categorical(y, num_classes=vocab_size)
X.shape, y.shape

((8968495, 10, 39), (8968495, 39))

## build model

In [92]:
model = Sequential()
model.add(LSTM(80, input_shape=(X.shape[1], X.shape[2])))
model.add(Dense(vocab_size, activation="softmax"))
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 80)                38400     
_________________________________________________________________
dense (Dense)                (None, 39)                3159      
Total params: 41,559
Trainable params: 41,559
Non-trainable params: 0
_________________________________________________________________


In [93]:
model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"]
)
model.fit(X, y, epochs=10, verbose=2)

Train on 8968495 samples
Epoch 1/10
8968495/8968495 - 1556s - loss: 1.4617 - accuracy: 0.5747
Epoch 2/10
8968495/8968495 - 1556s - loss: 1.3318 - accuracy: 0.6087
Epoch 3/10
8968495/8968495 - 8167s - loss: 1.3093 - accuracy: 0.6144
Epoch 4/10
8968495/8968495 - 1581s - loss: 1.2985 - accuracy: 0.6171
Epoch 5/10
8968495/8968495 - 1534s - loss: 1.2918 - accuracy: 0.6189
Epoch 6/10
8968495/8968495 - 1534s - loss: 1.2873 - accuracy: 0.6201
Epoch 7/10
8968495/8968495 - 1531s - loss: 1.2842 - accuracy: 0.6208
Epoch 8/10
8968495/8968495 - 1542s - loss: 1.2818 - accuracy: 0.6214
Epoch 9/10
8968495/8968495 - 1555s - loss: 1.2800 - accuracy: 0.6219
Epoch 10/10
8968495/8968495 - 1614s - loss: 1.2783 - accuracy: 0.6224


<tensorflow.python.keras.callbacks.History at 0x7fba3ba25780>

## Predict

In [94]:
def sentence_generation(model, char_to_index, seq_length, seed_text, n):
# 모델, 인덱스 정보, 문장 길이, 초기 시퀀스, 반복 횟수
    init_text = seed_text # 문장 생성에 사용할 초기 시퀀스
    sentence = ''

    for _ in range(n): # n번 반복
        encoded = [char_to_index[char] for char in seed_text] # 현재 시퀀스에 대한 정수 인코딩
        encoded = pad_sequences([encoded], maxlen=seq_length, padding='pre') # 데이터에 대한 패딩
        encoded = to_categorical(encoded, num_classes=len(char_to_index))
        result = model.predict_classes(encoded, verbose=0)
        # 입력한 X(현재 시퀀스)에 대해서 y를 예측하고 y(예측한 글자)를 result에 저장.
        for char, index in char_to_index.items(): # 만약 예측한 글자와 인덱스와 동일한 글자가 있다면
            if index == result: # 해당 글자가 예측 글자이므로 break
                break
        seed_text=seed_text + char # 현재 시퀀스 + 예측 글자를 현재 시퀀스로 변경
        sentence=sentence + char # 예측 글자를 문장에 저장
        # for문이므로 이 작업을 다시 반복

    sentence = init_text + sentence
    return sentence

In [96]:
print(sentence_generation(model, char2index, 10, "Citi gets its first female CEO".lower(), 1000))


citi gets its first female ceott and in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in this mln in 