<a href="https://colab.research.google.com/github/Elwing-Chou/ml1206/blob/main/imdb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf

dataset = tf.keras.utils.get_file(
    fname="aclImdb.tar.gz", 
    origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz", 
    extract=True,
)

In [None]:
import glob
dn = os.path.dirname(dataset)
glob.glob("/root/.keras/datasets/aclImdb/train/pos/*")

In [2]:
import os
import glob
import pandas as pd
def getdata(mid):
    dn = os.path.dirname(dataset)
    posfn = glob.glob(os.path.join(dn, "aclImdb", mid, "pos", "*"))
    negfn = glob.glob(os.path.join(dn, "aclImdb", mid, "neg", "*"))
    contents = []
    for fn in posfn + negfn:
        with open(fn, encoding="utf-8") as f:
            contents.append(f.read())
    df = pd.DataFrame({
        "content":contents,
        "sentiment":[1] * len(posfn) + [0] * len(negfn)
    })
    return df

In [3]:
train_df = getdata("train")
test_df = getdata("test")

In [None]:
test_df

In [17]:
# 1. tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
tok = Tokenizer(num_words=3000)
tok.fit_on_texts(train_df["content"])

In [20]:
# tok.index_word
# tok.word_index

In [None]:
# 2. sequenize: 文字化成數字(數字最大值:3000)
x_train_seq = tok.texts_to_sequences(train_df["content"])
x_test_seq = tok.texts_to_sequences(test_df["content"])
pd.DataFrame(x_train_seq)

In [None]:
# 3. padding: 變成一樣長
from tensorflow.keras.preprocessing.sequence import pad_sequences
x_train_pad = pad_sequences(x_train_seq, maxlen=512)
x_test_pad = pad_sequences(x_test_seq, maxlen=512)
pd.DataFrame(x_train_pad)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense

layers = [
    # 3001(種) * 128(情緒) -> 384128
    Embedding(3001, 128, mask_zero=True, input_length=512),
    GlobalAveragePooling1D(),
    Dense(2, activation="softmax")
]
model = Sequential(layers)
model.summary()

In [28]:
# 不用做one-hot
from tensorflow.keras.losses import SparseCategoricalCrossentropy
model.compile(loss=SparseCategoricalCrossentropy(),
              optimizer="adam",
              metrics=["accuracy"])


In [29]:
import numpy as np
y_train = np.array(train_df["sentiment"])
y_test = np.array(test_df["sentiment"])

In [30]:
# 1 round: 54000 / 100 -> 540次梯度下降
# verbose: 0(quiet) 1(full) 2(no progress bar)
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
callbacks = [
    ModelCheckpoint("imdb.h5", save_best_only=True),
    EarlyStopping(patience=5, restore_best_weights=True)
]
model.fit(x_train_pad,
          y_train,
          batch_size=100,
          epochs=100,
          verbose=2,
          validation_split=0.1,
          callbacks=callbacks)

Epoch 1/100
225/225 - 9s - loss: 0.5676 - accuracy: 0.7234 - val_loss: 0.5088 - val_accuracy: 0.7620 - 9s/epoch - 38ms/step
Epoch 2/100
225/225 - 8s - loss: 0.3583 - accuracy: 0.8656 - val_loss: 0.3659 - val_accuracy: 0.8404 - 8s/epoch - 34ms/step
Epoch 3/100
225/225 - 7s - loss: 0.2904 - accuracy: 0.8894 - val_loss: 0.3370 - val_accuracy: 0.8528 - 7s/epoch - 33ms/step
Epoch 4/100
225/225 - 7s - loss: 0.2596 - accuracy: 0.8995 - val_loss: 0.3767 - val_accuracy: 0.8360 - 7s/epoch - 33ms/step
Epoch 5/100
225/225 - 7s - loss: 0.2414 - accuracy: 0.9063 - val_loss: 0.3254 - val_accuracy: 0.8632 - 7s/epoch - 33ms/step
Epoch 6/100
225/225 - 8s - loss: 0.2287 - accuracy: 0.9116 - val_loss: 0.3021 - val_accuracy: 0.8780 - 8s/epoch - 34ms/step
Epoch 7/100
225/225 - 7s - loss: 0.2199 - accuracy: 0.9160 - val_loss: 0.3479 - val_accuracy: 0.8576 - 7s/epoch - 33ms/step
Epoch 8/100
225/225 - 7s - loss: 0.2131 - accuracy: 0.9190 - val_loss: 0.3485 - val_accuracy: 0.8604 - 7s/epoch - 33ms/step
Epoch 9/

<keras.callbacks.History at 0x7f2a27a9bb90>

In [32]:
model.evaluate(x_test_pad, y_test)



[0.2892310321331024, 0.8817200064659119]