## 資料集準備及說明

In [None]:
import tensorflow
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, Flatten

import numpy as np

In [None]:
# Load the IMDb dataset; on the first load the dataset is downloaded automatically.
top_words = 1000  # cap the vocabulary size at 1000 words
(X_train, Y_train), (X_test, Y_test) = imdb.load_data(num_words=top_words)

In [None]:
# Report the shape of every split, one line per array.
for caption, split in (
    ("X_train.shape: ", X_train),
    ("Y_train.shape: ", Y_train),
    ("X_test.shape: ", X_test),
    ("Y_test.shape: ", Y_test),
):
    print(caption, split.shape)

In [None]:
# Show the first training review (as stored in the NumPy array) and,
# on the next line, its label.
print(X_train[0], Y_train[0], sep="\n")

In [None]:
# Largest word index found in any training review:
# take each review's maximum, then the maximum of those.
max_index = max(map(max, X_train))
print("Max Index: ", max_index)

In [None]:
# Build the dictionary used to decode review text (word -> index),
# and look up the index of the word "we" as an example.
word_index = imdb.get_word_index()
we_index = word_index["we"]
print("'we' index:", we_index)

In [None]:
# Invert the vocabulary (index -> word) so an index can be looked up,
# then show which word we_index maps back to.
# A dict comprehension builds the mapping directly instead of first
# materialising a temporary list of (index, word) tuples.
decode_word_map = {index: word for word, index in word_index.items()}
print(decode_word_map[we_index])

In [None]:
# Decode the first training review back into text.
# Each stored index is shifted down by 3 before the lookup; indices with no
# entry in decode_word_map are rendered as "?".
# NOTE(review): the 3 presumably compensates for imdb.load_data's default
# index offset (reserved padding/start/unknown ids) - confirm against the docs.
first_review = X_train[0]
decoded_indices = [decode_word_map.get(token - 3, "?") for token in first_review]
print(decoded_indices)

# Join the decoded words into one readable string.
decoded_review = " ".join(decoded_indices)
print(decoded_review)

In [None]:
# Text preprocessing: force every review to exactly max_words tokens
# (shorter reviews are padded, longer ones are truncated).
# NOTE(review): the original comment says the *first* 100 words are read, but
# pad_sequences is called with its defaults here; per the Keras docs those are
# padding="pre" / truncating="pre", i.e. the *last* max_words tokens are kept.
# Confirm which behaviour is intended.
max_words = 100

X_train, X_test = (
    sequence.pad_sequences(split, maxlen=max_words) for split in (X_train, X_test)
)

for caption, padded in (("X_train.shape: ", X_train), ("X_test.shape: ", X_test)):
    print(caption, padded.shape)

In [None]:
# Show the first training review again, now that it has been padded/truncated.
print(X_train[0])

## 搭建MLP模型

In [None]:
seed = 10
np.random.seed(seed)  # fix NumPy's global random seed
# TensorFlow keeps its own global seed that np.random.seed does not touch;
# set it as well so TensorFlow-side randomness is seeded from the same value.
tensorflow.random.set_seed(seed)

In [None]:
# Define the model: embedding -> dropout -> flatten -> 256-unit hidden layer
# -> dropout -> single sigmoid output.
model = Sequential(
    [
        # The Embedding layer turns word indices into vectors; it must be the first layer.
        Embedding(top_words, 32, input_length=max_words),
        Dropout(0.25),
        Flatten(),
        Dense(256, activation="relu"),
        Dropout(0.25),
        Dense(1, activation="sigmoid"),
    ]
)
model.summary()  # print the model summary

In [None]:
# Compile the model: binary cross-entropy loss, Adam optimizer, accuracy metric.
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

# Train the model, passing validation_split=0.2 so part of the training data
# is used for validation; the returned History object is kept in `history`.
history = model.fit(
    X_train,
    Y_train,
    validation_split=0.2,
    epochs=10,
    batch_size=128,
    verbose=2,
)

In [None]:
# Evaluate the model on the test split and print the test accuracy
# formatted to two decimal places.
loss, accuracy = model.evaluate(X_test, Y_test, verbose=0)
print("測試資料集的準確度 = {:.2f}".format(accuracy))

## max_words改用500，每篇文章讀取更多單字

In [None]:
# Data preprocessing: use 500 tokens per review instead of 100.
# X_train / X_test were already cut down to 100 tokens by the earlier
# pad_sequences call, so re-padding those arrays to 500 would only add 400
# padding zeros and the model would never see any additional words.
# Reload the raw variable-length reviews (same vocabulary size) before padding.
(X_train, Y_train), (X_test, Y_test) = imdb.load_data(num_words=top_words)
max_words = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_words)
X_test = sequence.pad_sequences(X_test, maxlen=max_words)

In [None]:
# Define the model: embedding -> dropout -> flatten -> 256-unit hidden layer
# -> dropout -> single sigmoid output.
model = Sequential(
    [
        # The Embedding layer turns word indices into vectors; it must be the first layer.
        Embedding(top_words, 32, input_length=max_words),
        Dropout(0.25),
        Flatten(),
        Dense(256, activation="relu"),
        Dropout(0.25),
        Dense(1, activation="sigmoid"),
    ]
)
model.summary()  # print the model summary

In [None]:
# Compile the model: binary cross-entropy loss, Adam optimizer, accuracy metric.
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

# Train the model, passing validation_split=0.2 so part of the training data
# is used for validation; the returned History object is kept in `history`.
history = model.fit(
    X_train,
    Y_train,
    validation_split=0.2,
    epochs=10,
    batch_size=128,
    verbose=2,
)

In [None]:
# Evaluate the model on the test split and print the test accuracy
# formatted to two decimal places.
loss, accuracy = model.evaluate(X_test, Y_test, verbose=0)
print("測試資料集的準確度 = {:.2f}".format(accuracy))

## top_words設定成10000，表示字典的單字量為10000個
## max_words設定500，每篇文章讀取500個單字
## MLP模型設定兩層隱藏層，各128個神經元

In [None]:
# Load the IMDb dataset, this time with a 10000-word vocabulary.
top_words = 10000
(X_train, Y_train), (X_test, Y_test) = imdb.load_data(num_words=top_words)

# Data preprocessing: pad/truncate every review to max_words tokens.
max_words = 500
X_train, X_test = (
    sequence.pad_sequences(split, maxlen=max_words) for split in (X_train, X_test)
)

In [None]:
# Define the model: embedding -> dropout -> flatten -> two 128-unit hidden
# layers (each followed by dropout) -> single sigmoid output.
model = Sequential(
    [
        Embedding(top_words, 32, input_length=max_words),
        Dropout(0.25),
        Flatten(),
        Dense(128, activation="relu"),
        Dropout(0.25),
        Dense(128, activation="relu"),
        Dropout(0.25),
        Dense(1, activation="sigmoid"),
    ]
)
model.summary()  # print the model summary

In [None]:
# Compile the model: binary cross-entropy loss, Adam optimizer, accuracy metric.
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

# Train the model, passing validation_split=0.2 so part of the training data
# is used for validation; the returned History object is kept in `history`.
history = model.fit(
    X_train,
    Y_train,
    validation_split=0.2,
    epochs=10,
    batch_size=128,
    verbose=2,
)

In [None]:
# Evaluate the model on the test split and print the test accuracy
# formatted to two decimal places.
loss, accuracy = model.evaluate(X_test, Y_test, verbose=0)
print("測試資料集的準確度 = {:.2f}".format(accuracy))