# 4.11 LSTM 기사분류
로이터 뉴스 데이터는 총 11,228개의 뉴스 기사가 46개의 카테고리로 나누어진 텍스트 데이터이다.

To Learn:
* 각 단어는 전체 텍스트에서의 출현 빈도 순위에 따라 정수 인덱스로 변환된다 ==> 이를 수행하는 함수는 Tokenizer()
* pad_sequences
* to_categorical
* Embedding

![](https://cl.ly/ed0710345d0f/Image%2525202019-10-27%252520at%25252010.07.20%252520AM.png)
![](https://cl.ly/ba3956aa4f47/image182.png)



In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd


In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
# Seed both NumPy's and TensorFlow's random number generators with the same
# value; intended to make repeated runs of the notebook comparable.
seed = 0
np.random.seed(seed)
tf.random.set_seed(seed)

In [4]:
# Load the Reuters newswire dataset bundled with Keras.
# num_words=1000 : word-index cap passed to the loader (the Embedding layer
#                  further down uses the same vocabulary size of 1000).
# test_split=0.2 : hold out 20% of the articles as the test set.
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.reuters.load_data(
    num_words=1000, test_split=0.2) 

In [5]:
# Number of topic classes: the labels are integer ids, so (assuming they start
# at 0) the class count is the largest id plus one.
category = y_train.max() + 1
category

46

In [6]:
# Show the first article. Each word has been replaced by an integer index
# derived from its frequency rank over all texts; Tokenizer() is the function
# that performs this kind of word-to-integer encoding.
x_train[0]

[1,
 2,
 2,
 8,
 43,
 10,
 447,
 5,
 25,
 207,
 270,
 5,
 2,
 111,
 16,
 369,
 186,
 90,
 67,
 7,
 89,
 5,
 19,
 102,
 6,
 19,
 124,
 15,
 90,
 67,
 84,
 22,
 482,
 26,
 7,
 48,
 4,
 49,
 8,
 864,
 39,
 209,
 154,
 6,
 151,
 6,
 83,
 11,
 15,
 22,
 155,
 11,
 15,
 7,
 48,
 9,
 2,
 2,
 504,
 6,
 258,
 6,
 272,
 11,
 15,
 22,
 134,
 44,
 11,
 15,
 16,
 8,
 197,
 2,
 90,
 67,
 52,
 29,
 209,
 30,
 32,
 132,
 6,
 109,
 15,
 17,
 12]

In [7]:
# Number of articles in the training split.
len(x_train)

8982

In [8]:
# Training labels before one-hot encoding: one integer class id per article.
y_train

array([ 3,  4,  3, ..., 25,  3, 25])

In [9]:
# First training article again, still an unpadded list of word indices.
x_train[0]

[1,
 2,
 2,
 8,
 43,
 10,
 447,
 5,
 25,
 207,
 270,
 5,
 2,
 111,
 16,
 369,
 186,
 90,
 67,
 7,
 89,
 5,
 19,
 102,
 6,
 19,
 124,
 15,
 90,
 67,
 84,
 22,
 482,
 26,
 7,
 48,
 4,
 49,
 8,
 864,
 39,
 209,
 154,
 6,
 151,
 6,
 83,
 11,
 15,
 22,
 155,
 11,
 15,
 7,
 48,
 9,
 2,
 2,
 504,
 6,
 258,
 6,
 272,
 11,
 15,
 22,
 134,
 44,
 11,
 15,
 16,
 8,
 197,
 2,
 90,
 67,
 52,
 29,
 209,
 30,
 32,
 132,
 6,
 109,
 15,
 17,
 12]

In [10]:
# Length (in word indices) of the first article before padding.
len(x_train[0])

87

In [11]:
# Length of the second article; it differs from the first, so the articles
# have to be brought to a common length before training.
len(x_train[1])

56

In [12]:
# Preprocessing
# Each article (x) has a different number of words, so pad/truncate every
# sequence to exactly 100 word indices.
x_train = tf.keras.preprocessing.sequence.pad_sequences(x_train, maxlen=100)
x_test = tf.keras.preprocessing.sequence.pad_sequences(x_test, maxlen=100)
# One-hot encode the labels. The class count is passed explicitly: without
# num_classes, to_categorical infers the width from the largest label in each
# array separately, so the train and test matrices could end up with different
# widths (or a width that does not match the model's output layer) if one
# split happens to lack the highest class id.
y_train = tf.keras.utils.to_categorical(y_train, num_classes=category)
y_test = tf.keras.utils.to_categorical(y_test, num_classes=category)

In [13]:
# First article after pad_sequences(maxlen=100).
x_train[0]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         1,   2,   2,   8,  43,  10, 447,   5,  25, 207, 270,   5,   2,
       111,  16, 369, 186,  90,  67,   7,  89,   5,  19, 102,   6,  19,
       124,  15,  90,  67,  84,  22, 482,  26,   7,  48,   4,  49,   8,
       864,  39, 209, 154,   6, 151,   6,  83,  11,  15,  22, 155,  11,
        15,   7,  48,   9,   2,   2, 504,   6, 258,   6, 272,  11,  15,
        22, 134,  44,  11,  15,  16,   8, 197,   2,  90,  67,  52,  29,
       209,  30,  32, 132,   6, 109,  15,  17,  12], dtype=int32)

In [14]:
# Length of the first article after padding to maxlen=100.
len(x_train[0])

100

In [None]:
# Training labels after to_categorical (one-hot encoded).
y_train

In [None]:
# One-hot label vector of the first training article.
y_train[0]

In [None]:
# How many features?   -> 100: each word index is embedded as a 100-dim vector.
# How many time steps? -> 100: every article was padded to 100 word indices.
model = tf.keras.models.Sequential()
# Map each of the 1000 possible word indices to a 100-dimensional vector.
model.add(tf.keras.layers.Embedding(input_dim=1000, output_dim=100))
# Recurrent layer with 100 units that reads the embedded sequence.
model.add(tf.keras.layers.LSTM(units=100, activation='tanh'))
# Output layer: one softmax probability per news category (46 in total).
model.add(tf.keras.layers.Dense(units=46, activation='softmax'))

In [None]:
# Configure training: categorical cross-entropy matches the one-hot labels and
# softmax output; Adam is the optimizer and accuracy is tracked as a metric.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train for 100 epochs with mini-batches of 20 articles. The test split is
# passed as validation data, so its loss/accuracy are recorded for every epoch
# in the returned History object.
history = model.fit(
    x_train,
    y_train,
    batch_size=20,
    epochs=100,
    validation_data=(x_test, y_test),
)

In [None]:
# Evaluate the trained model on the test split; with the compile settings
# above this reports the loss and the accuracy metric.
model.evaluate(x_test,y_test)

In [None]:
# Per-epoch loss curves recorded by model.fit.
# 'val_loss' is the loss on the validation data given to fit (the test split);
# it is stored as y_vloss because that is the name the plotting cell reads.
y_vloss = history.history['val_loss']
# 'loss' is the training loss.
y_loss = history.history['loss']

In [None]:
# Plot the validation (red) and training (blue) loss against the epoch index.
x_len = np.arange(len(y_loss))
plt.plot(x_len, y_vloss, marker='.', c='red', label='Testset_loss')
plt.plot(x_len, y_loss, marker='.', c='blue', label='Trainset_loss')

plt.legend(loc='upper right')
plt.grid()
plt.xlabel('epoch')
# The pyplot function is `ylabel`; the misspelled `ylable` raises AttributeError.
plt.ylabel('loss')
plt.show()