# 訓練計畫
- ## num_words 改變
    - 10000
    - 5000
- ## max_len 改變
    - 100
    - 125
    - 150
- ## output_dim 改變
    - 128
    - 256
- ## dropout 如果出現overfitting嚴重才考慮
- ## optimizer 因為adam是很棒的優化器, 暫時不考慮
- ## SimpleRNN, GRU, LSTM
    - SimpleRNN
    - GRU
    - LSTM

In [0]:

# import must-use modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## 1. 讀入深度學習套件

In [0]:
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, GRU, SimpleRNN
from tensorflow.keras.layers import Embedding

## 2. 讀入data

In [0]:
# Keep at most 10000 word indices; anything beyond that becomes "unknown",
# which keeps the dictionary from growing too large.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=10000)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [0]:
# Print the shape of each split, one per line.
print(*(arr.shape for arr in (x_train, x_test, y_train, y_test)), sep='\n')

(25000,)
(25000,)
(25000,)
(25000,)


In [0]:
# Check how many word indices a single review (sample 18) contains.

len(x_train[18])

212

In [0]:
# Compare with sample 0: the raw inputs do not all have the same length.

len(x_train[0])

218

In [0]:
# Labels of the two samples inspected above, one per line.
print(y_train[18], y_train[0], sep='\n')

0
1


## 3. 資料處理

In [0]:
# Fix every review to a length of 100: longer ones are cut, shorter ones are padded.
# Open question from the author: doesn't cutting lose important information?
# With the pad_sequences defaults (padding='pre', truncating='pre') the last
# 100 tokens are kept, so the beginning of a long review is what gets dropped.

x_train, x_test = (
    sequence.pad_sequences(split, maxlen=100)
    for split in (x_train, x_test)
)

In [0]:
# After padding, sample 0 has the fixed length set by maxlen.
x_train[0].shape

(100,)

In [0]:
# Sample 18 now has the same fixed length as sample 0.
x_train[18].shape

(100,)

## 4. 模型建立

In [0]:
# Baseline network: word embedding -> LSTM -> single sigmoid output unit.
model = Sequential([
    # input_dim equals the num_words value used when loading the data.
    Embedding(input_dim=10000, output_dim=128),
    LSTM(256, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 128)         1280000   
_________________________________________________________________
lstm (LSTM)                  (None, 256)               394240    
_________________________________________________________________
dense (Dense)                (None, 1)                 257       
Total params: 1,674,497
Trainable params: 1,674,497
Non-trainable params: 0
_________________________________________________________________


In [0]:
# The labels are 0/1 and the output is one sigmoid unit, so train with
# binary cross-entropy and report accuracy.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

## 5. 模型訓練

In [0]:
# Train the baseline for 5 epochs with mini-batches of 32 samples.
# NOTE(review): the test split is passed as validation_data, so the val_*
# metrics are measured on the test set — confirm this is intended.
model.fit(x_train, y_train, batch_size=32, epochs=5,
         validation_data=(x_test, y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fb25b5c1ac8>

In [0]:
# Serialize the model architecture to JSON and store the weights in HDF5 (.h5) format.

model_json = model.to_json()
# Write through a context manager so the file is flushed and closed right away
# instead of leaving an open handle behind in the kernel.
with open('imdb_model_architecture.json', 'w') as arch_file:
    arch_file.write(model_json)
model.save_weights('imdb_model_weights.h5')

## num_words 改變
- ### 改變原因: 想知道不同的字典大小對模型結果的影響。

In [0]:
## num_words 5000
# Same pipeline as the baseline, but the vocabulary is capped at 5000 words.

(x_train_5t, y_train_5t), (x_test_5t, y_test_5t) = imdb.load_data(num_words=5000)

# Pad / truncate both splits to 100 tokens.
x_train_5t, x_test_5t = (
    sequence.pad_sequences(split, maxlen=100)
    for split in (x_train_5t, x_test_5t)
)

# input_dim has to match the num_words value used above.
model_5t = Sequential([
    Embedding(input_dim=5000, output_dim=128),
    LSTM(256, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

model_5t.summary()

model_5t.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Validation metrics are computed on the test split.
model_5t.fit(
    x_train_5t, y_train_5t,
    batch_size=32, epochs=5,
    validation_data=(x_test_5t, y_test_5t),
)


# Model: "sequential"
# _________________________________________________________________
# Layer (type)                 Output Shape              Param #   
# =================================================================
# embedding (Embedding)        (None, None, 128)         640000    
# _________________________________________________________________
# lstm (LSTM)                  (None, 256)               394240    
# _________________________________________________________________
# dense (Dense)                (None, 1)                 257       
# =================================================================
# Total params: 1,034,497
# Trainable params: 1,034,497
# Non-trainable params: 0
# _________________________________________________________________
# Train on 25000 samples, validate on 25000 samples
# Epoch 1/5
# 25000/25000 [==============================] - 397s 16ms/sample - loss: 0.4854 - accuracy: 0.7626 - val_loss: 0.3937 - val_accuracy: 0.8258
# Epoch 2/5
# 25000/25000 [==============================] - 380s 15ms/sample - loss: 0.3547 - accuracy: 0.8488 - val_loss: 0.4008 - val_accuracy: 0.8214
# Epoch 3/5
# 25000/25000 [==============================] - 482s 19ms/sample - loss: 0.2934 - accuracy: 0.8776 - val_loss: 0.3656 - val_accuracy: 0.8395
# Epoch 4/5
# 25000/25000 [==============================] - 434s 17ms/sample - loss: 0.2501 - accuracy: 0.8993 - val_loss: 0.3651 - val_accuracy: 0.8491
# Epoch 5/5
# 25000/25000 [==============================] - 427s 17ms/sample - loss: 0.2128 - accuracy: 0.9170 - val_loss: 0.4292 - val_accuracy: 0.8342

## max_len 改變
- ### 改變原因：想知道 max_len 對模型的影響。推測 max_len 至少要大於最長的評論長度會比較好, 因為這樣不會有資料被截斷而損失, 但補上的 padding 可能會帶來一些雜訊。

In [0]:
## max_len: 150
# Baseline settings (num_words=10000) with reviews padded / truncated to 150 tokens.
# NOTE(review): the `_5t` names are reused from the 5000-word experiment even
# though num_words is 10000 here; running this cell overwrites those variables.

(x_train_5t, y_train_5t), (x_test_5t, y_test_5t) = imdb.load_data(num_words=10000)

x_train_5t, x_test_5t = (
    sequence.pad_sequences(split, maxlen=150)
    for split in (x_train_5t, x_test_5t)
)

model_5t = Sequential([
    Embedding(input_dim=10000, output_dim=128),
    LSTM(256, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

model_5t.summary()

model_5t.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Validation metrics are computed on the test split.
model_5t.fit(
    x_train_5t, y_train_5t,
    batch_size=32, epochs=5,
    validation_data=(x_test_5t, y_test_5t),
)


# Model: "sequential_1"
# _________________________________________________________________
# Layer (type)                 Output Shape              Param #   
# =================================================================
# embedding_1 (Embedding)      (None, None, 128)         1280000   
# _________________________________________________________________
# lstm_1 (LSTM)                (None, 256)               394240    
# _________________________________________________________________
# dense_1 (Dense)              (None, 1)                 257       
# =================================================================
# Total params: 1,674,497
# Trainable params: 1,674,497
# Non-trainable params: 0
# _________________________________________________________________
# Train on 25000 samples, validate on 25000 samples
# Epoch 1/5
# 25000/25000 [==============================] - 625s 25ms/sample - loss: 0.4941 - accuracy: 0.7614 - val_loss: 0.3913 - val_accuracy: 0.8317
# Epoch 2/5
# 25000/25000 [==============================] - 637s 25ms/sample - loss: 0.3813 - accuracy: 0.8341 - val_loss: 0.3901 - val_accuracy: 0.8348
# Epoch 3/5
# 25000/25000 [==============================] - 621s 25ms/sample - loss: 0.3511 - accuracy: 0.8501 - val_loss: 0.4491 - val_accuracy: 0.7885
# Epoch 4/5
# 25000/25000 [==============================] - 608s 24ms/sample - loss: 0.2547 - accuracy: 0.8972 - val_loss: 0.4213 - val_accuracy: 0.8324
# Epoch 5/5
# 25000/25000 [==============================] - 737s 29ms/sample - loss: 0.1960 - accuracy: 0.9251 - val_loss: 0.3681 - val_accuracy: 0.8528

In [28]:
## max_len: 125
# Baseline settings (num_words=10000) with reviews padded / truncated to 125 tokens.
# NOTE(review): the `_5t` names are reused from the 5000-word experiment even
# though num_words is 10000 here; running this cell overwrites those variables.

(x_train_5t, y_train_5t), (x_test_5t, y_test_5t) = imdb.load_data(num_words=10000)

x_train_5t, x_test_5t = (
    sequence.pad_sequences(split, maxlen=125)
    for split in (x_train_5t, x_test_5t)
)

model_5t = Sequential([
    Embedding(input_dim=10000, output_dim=128),
    LSTM(256, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

model_5t.summary()

model_5t.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Validation metrics are computed on the test split.
model_5t.fit(
    x_train_5t, y_train_5t,
    batch_size=32, epochs=5,
    validation_data=(x_test_5t, y_test_5t),
)

Model: "sequential_14"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_14 (Embedding)     (None, None, 128)         1280000   
_________________________________________________________________
lstm_11 (LSTM)               (None, 256)               394240    
_________________________________________________________________
dense_14 (Dense)             (None, 1)                 257       
Total params: 1,674,497
Trainable params: 1,674,497
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fdebb537198>

## output_dim 改變
- ### 改變原因：壓縮到多少維度才會比較適合？改變此參數有沒有甚麼規則可循？

In [27]:
## outdim: 256
# Baseline settings, but the embedding vectors have 256 dimensions instead of 128.
# NOTE(review): the `_5t` names are reused from the 5000-word experiment even
# though num_words is 10000 here; running this cell overwrites those variables.

(x_train_5t, y_train_5t), (x_test_5t, y_test_5t) = imdb.load_data(num_words=10000)

x_train_5t, x_test_5t = (
    sequence.pad_sequences(split, maxlen=100)
    for split in (x_train_5t, x_test_5t)
)

model_5t = Sequential([
    Embedding(input_dim=10000, output_dim=256),
    LSTM(256, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

model_5t.summary()

model_5t.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Validation metrics are computed on the test split.
model_5t.fit(
    x_train_5t, y_train_5t,
    batch_size=32, epochs=5,
    validation_data=(x_test_5t, y_test_5t),
)

Model: "sequential_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_13 (Embedding)     (None, None, 256)         2560000   
_________________________________________________________________
lstm_10 (LSTM)               (None, 256)               525312    
_________________________________________________________________
dense_13 (Dense)             (None, 1)                 257       
Total params: 3,085,569
Trainable params: 3,085,569
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fdea7b447b8>

## RNN 基礎結構改變
- ### 改變原因：想知道在這個問題上, 是否如同上課所述, SimpleRNN < LSTM <= GRU

In [26]:
## SimpleRNN
# Baseline settings, with the LSTM layer replaced by a SimpleRNN of the same width.
# NOTE(review): the `_5t` names are reused from the 5000-word experiment even
# though num_words is 10000 here; running this cell overwrites those variables.

(x_train_5t, y_train_5t), (x_test_5t, y_test_5t) = imdb.load_data(num_words=10000)

x_train_5t, x_test_5t = (
    sequence.pad_sequences(split, maxlen=100)
    for split in (x_train_5t, x_test_5t)
)

model_5t = Sequential([
    Embedding(input_dim=10000, output_dim=128),
    SimpleRNN(256, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

model_5t.summary()

model_5t.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Validation metrics are computed on the test split.
model_5t.fit(
    x_train_5t, y_train_5t,
    batch_size=32, epochs=5,
    validation_data=(x_test_5t, y_test_5t),
)

Model: "sequential_12"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     (None, None, 128)         1280000   
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 256)               98560     
_________________________________________________________________
dense_12 (Dense)             (None, 1)                 257       
Total params: 1,378,817
Trainable params: 1,378,817
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fdf5006a6a0>

In [25]:
## GRU
# Baseline settings, with the LSTM layer replaced by a GRU of the same width.
# NOTE(review): the `_5t` names are reused from the 5000-word experiment even
# though num_words is 10000 here; running this cell overwrites those variables.

(x_train_5t, y_train_5t), (x_test_5t, y_test_5t) = imdb.load_data(num_words=10000)

x_train_5t, x_test_5t = (
    sequence.pad_sequences(split, maxlen=100)
    for split in (x_train_5t, x_test_5t)
)

model_5t = Sequential([
    Embedding(input_dim=10000, output_dim=128),
    GRU(256, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

model_5t.summary()

model_5t.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Validation metrics are computed on the test split.
model_5t.fit(
    x_train_5t, y_train_5t,
    batch_size=32, epochs=5,
    validation_data=(x_test_5t, y_test_5t),
)

Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, None, 128)         1280000   
_________________________________________________________________
gru (GRU)                    (None, 256)               296448    
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 257       
Total params: 1,576,705
Trainable params: 1,576,705
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fdeb9fa5f28>