# 初始準備

In [1]:
%env KERAS_BACKEND = tensorflow

env: KERAS_BACKEND=tensorflow


In [2]:
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt

# 讀入IMDB電影數據庫

In [3]:
from keras.datasets import imdb

In [5]:
# Load the IMDB reviews already encoded as lists of integer word indices.
# num_words=10000 restricts the vocabulary to the 10,000 most frequent words
# (per the Keras imdb.load_data documentation).
(x_train,y_train),(x_test,y_test) = imdb.load_data(num_words=10000)

In [6]:
# Number of reviews in the training split.
len(x_train)

25000

In [7]:
# Number of reviews in the test split.
len(x_test)

25000

# 輸入資料部分

In [9]:
# Inspect a single encoded review (index 99): a list of integer word indices.
x_train[99]

[1,
 1230,
 3765,
 566,
 97,
 189,
 102,
 86,
 7,
 32,
 4,
 973,
 16,
 55,
 355,
 18,
 14,
 20,
 4,
 64,
 542,
 173,
 16,
 4,
 893,
 2115,
 5376,
 250,
 39,
 8013,
 4,
 1362,
 2,
 14,
 102,
 47,
 57,
 599,
 633,
 6,
 1317,
 2,
 8,
 6,
 189,
 20,
 57,
 206,
 57,
 116,
 5,
 57,
 836,
 82,
 6,
 1317,
 2,
 3728,
 2,
 9,
 6,
 52,
 284,
 21,
 29,
 9,
 38,
 2245,
 5,
 1044,
 11,
 14,
 15,
 45,
 619,
 50,
 71,
 6,
 171,
 531,
 15,
 71,
 424,
 8,
 30,
 163,
 6211,
 4,
 1629,
 189,
 212,
 102,
 5,
 57,
 31,
 1498,
 11,
 4,
 311,
 13,
 197,
 15,
 14,
 20,
 16,
 1150,
 1479,
 5,
 13,
 161,
 990,
 692,
 5,
 1706,
 12,
 69,
 77,
 1194,
 8,
 3245,
 2001,
 553,
 67,
 14,
 20,
 48,
 25,
 423,
 13,
 131,
 124,
 51,
 25,
 122,
 236,
 1506,
 198,
 4,
 64,
 552,
 7,
 415,
 37,
 62,
 169,
 14,
 20,
 60,
 2602,
 629,
 5,
 615,
 14,
 9,
 8,
 25,
 1230,
 3765,
 570,
 231,
 189,
 102,
 14,
 20,
 166,
 2039,
 168,
 40,
 2450,
 5486,
 3298]

* 每一個數字都代表一個字,數字越小代表該字出現頻率越高 (注意:Keras 預設保留 0 作為補齊、1 作為序列起始標記、2 作為未收錄的字,實際的字是依頻率排名再加 3 編號,所以每則影評都以 1 開頭)

In [10]:
# Show the token counts of the first ten training reviews on a single line
# to illustrate that the reviews have different lengths.
for review_idx in range(10):
    review_length = len(x_train[review_idx])
    print(review_length, end=', ')

218, 189, 141, 550, 147, 43, 123, 562, 233, 130, 

* 可以發現每個影評長度不同

# 輸出資料部分

In [12]:
# First ten training labels.
y_train[:10]

array([1, 0, 0, 1, 0, 0, 1, 0, 1, 0], dtype=int64)

* 1為正評，0為負評

# 送入神經網路的輸入處理

雖然RNN可以處理不同長度的輸入，但為了方便性還是會有以下設定

* 設定輸入文字長度上限
* 把每段文字弄成一樣長，太短補0

In [13]:
from keras.preprocessing import sequence

In [14]:
# Force every review to exactly 100 tokens so the data becomes a rectangular array:
# shorter reviews are zero-padded, longer ones are truncated.
# NOTE: by Keras default both padding and truncation happen at the START of the
# sequence (padding='pre', truncating='pre') — see the pad_sequences documentation.
x_train = sequence.pad_sequences(x_train, maxlen=100)
x_test = sequence.pad_sequences(x_test, maxlen=100)

In [15]:
# Sanity check: the padded training data should be (number of reviews, maxlen).
x_train.shape

(25000, 100)

# 打造RNN神經網路

這裡使用LSTM,基本上使用哪種RNN寫法都差不多

* 先將10000維的文字壓到128維
* 然後用150個LSTM(不一定要跟前面一樣)
* 最後一個output,直接用sigmoid輸出

In [16]:
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM

In [17]:
# Start with an empty layer stack; layers are appended in the following cells.
model = Sequential()

In [18]:
# Map each of the 10,000 possible word indices to a trainable 128-dimensional vector.
# The first argument must match the num_words used when loading the dataset.
model.add(Embedding(10000,128))

In [19]:
# Recurrent layer with 150 LSTM units; it yields one 150-dimensional vector per
# review (see the (None, 150) output shape reported by model.summary()).
model.add(LSTM(150))

In [20]:
# Single sigmoid output unit: a score between 0 and 1 for the binary
# negative (0) / positive (1) label.
model.add(Dense(1, activation='sigmoid'))

# 組裝

* 這次用binary_crossentropy做loss function,另外用Adam學習法。

In [21]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 128)         1280000   
_________________________________________________________________
lstm (LSTM)                  (None, 150)               167400    
_________________________________________________________________
dense (Dense)                (None, 1)                 151       
Total params: 1,447,551
Trainable params: 1,447,551
Non-trainable params: 0
_________________________________________________________________


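LSTM 的參數量:3(LSTM的三個gate:遺忘、輸入、輸出) * (128(上層輸入)+150(鄰居傳來的資料)+1(bias))* 150 = 125550;再加上計算cell state候選值的那一組權重 (128+150+1)*150 = 41850,合計 4 組共 167400,與上表 LSTM 層的參數數量相同。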

In [22]:
# Parameters in ONE LSTM weight set:
# (embedding inputs + recurrent inputs + bias) * number of units.
(128+150+1)*150

41850

In [23]:
# Three gate weight sets (3 * 41850 = 125550) plus the cell-candidate weight set
# (41850) give the LSTM layer's total, matching the Param # shown by model.summary().
125550+41850

167400

In [24]:
# Configure training: binary cross-entropy matches the single sigmoid output,
# Adam is the optimizer, and accuracy is reported alongside the loss.
model.compile(loss='binary_crossentropy',
             optimizer='adam',
             metrics=['accuracy'])

# 訓練

In [25]:
# Train on the full training split for 15 epochs in mini-batches of 32.
# NOTE(review): no validation data is passed, so overfitting cannot be observed
# during training; the test loss recorded later in this notebook (~0.80) suggests
# it occurs — consider validation_split / fewer epochs / early stopping.
# NOTE(review): the returned History object is not stored, so its repr becomes
# the cell output; assign it to a variable if training curves are needed.
model.fit(x_train, y_train,
         batch_size=32,
         epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x1e5d571b220>

# 分數

In [26]:
# Evaluate on the held-out test split; with the compile settings used above the
# result is indexed as score[0] = loss and score[1] = accuracy.
score = model.evaluate(x_test,y_test)



In [27]:
# Report the test-set metrics returned by model.evaluate: loss first, then accuracy.
test_loss = score[0]
test_accuracy = score[1]
print('測試資料的 loss', test_loss)
print('測試資料的正確率', test_accuracy)

測試資料的 loss 0.7985097765922546
測試資料的正確率 0.8215600252151489


# 儲存結果

In [28]:
# Serialize only the network architecture (no weights) to a JSON string.
model_json = model.to_json()

# Write it through a context manager so the file is flushed and closed
# deterministically; the previous bare open(...).write(...) never closed the
# handle explicitly and relied on garbage collection to do so.
with open('imdb_model_architecture.json', 'w') as arch_file:
    chars_written = arch_file.write(model_json)

# Keep the character count as the cell's displayed value, as before.
chars_written

1965

In [29]:
# Save only the trained weights to an HDF5 file; the architecture is stored
# separately in the JSON file written in the previous cell.
model.save_weights('imdb_model_weights.h5')

### 另一種存的方式

In [30]:
# Alternative: store the whole model in a single HDF5 file
# (architecture and weights together, per the Keras Model.save documentation).
model.save('myrnn.h5')