<a href="https://colab.research.google.com/github/Elwing-Chou/ml1216/blob/main/embedding_gap.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf

# Fetch the IMDB review archive under the local name "aclImdb.tar.gz" and ask
# Keras to extract it; `dataset` holds the path that get_file returns.
IMDB_ARCHIVE_URL = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
dataset = tf.keras.utils.get_file(
    origin=IMDB_ARCHIVE_URL,
    fname="aclImdb.tar.gz",
    extract=True,
)

In [None]:
import os
import glob
import pandas as pd

# Directory that contains the extracted "aclImdb" folder.
# Depending on the Keras release, get_file(extract=True) returns either the path
# of the downloaded archive (data extracted next to it) or the extraction
# directory itself, so accept both layouts.
if os.path.isdir(os.path.join(dataset, "aclImdb")):
    base = dataset
else:
    base = os.path.dirname(dataset)
def get_data(category, root=None):
    """Load one split of the extracted IMDB reviews into a DataFrame.

    Reads every ``*.txt`` / ``*.TXT`` file under
    ``<root>/aclImdb/<category>/pos`` (label 1) and then
    ``<root>/aclImdb/<category>/neg`` (label 0).

    Args:
        category: Sub-directory of ``aclImdb`` to read, e.g. ``"train"``
            or ``"test"``.
        root: Directory that contains the ``aclImdb`` folder. Defaults to
            the module-level ``base``.

    Returns:
        A pandas DataFrame with the columns ``content`` (file text) and
        ``target`` (1 for ``pos``, 0 for ``neg``). All positive rows come
        first; within each label rows are ordered by file path.
    """
    if root is None:
        root = base
    contents, targets = [], []
    for label_dir, label in (("pos", 1), ("neg", 0)):
        folder = os.path.join(root, "aclImdb", category, label_dir)
        # Where glob matches case-insensitively, both patterns return the same
        # files; a set keeps each path once instead of loading it twice.
        paths = set(glob.glob(os.path.join(folder, "*.txt")))
        paths.update(glob.glob(os.path.join(folder, "*.TXT")))
        # glob order is OS-dependent; sort so the row order is reproducible.
        for path in sorted(paths):
            with open(path, "r", encoding="utf-8") as f:
                contents.append(f.read())
            targets.append(label)
    return pd.DataFrame({
        "content": contents,
        "target": targets
    })
# Load the "train" and "test" splits into separate DataFrames.
train_df = get_data(category="train")
test_df = get_data(category="test")

In [None]:
# tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
tok = Tokenizer(num_words=3000)
# Fit the tokenizer on the training texts only.
train_corpus = train_df["content"]
tok.fit_on_texts(train_corpus)

In [None]:
# Convert the review texts of both splits to integer sequences with the
# tokenizer fitted above.
train_texts = train_df["content"]
test_texts = test_df["content"]
x_train_seq = tok.texts_to_sequences(train_texts)
x_test_seq = tok.texts_to_sequences(test_texts)

In [None]:
# Only the last expression of a cell is displayed, so print the intermediate
# look-ups explicitly, and preview small slices instead of building/dumping
# the complete structures.
print(tok.index_word[3005])
print(list(tok.word_index.items())[:10])
pd.DataFrame(x_train_seq[:5])

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Apply the same length setting (maxlen=512) to both splits.
pad_kwargs = dict(maxlen=512)
x_train_pad = pad_sequences(x_train_seq, **pad_kwargs)
x_test_pad = pad_sequences(x_test_seq, **pad_kwargs)

In [None]:
# Tabular view of the padded training sequences.
padded_preview = pd.DataFrame(x_train_pad)
padded_preview

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Dense, Dropout, Flatten, GlobalAveragePooling1D
# Classifier: token embedding -> average over the sequence -> 2-way softmax.
embedding = Embedding(
    input_dim=3001,     # embedding table is 3001 x 128
    output_dim=128,
    mask_zero=True,
    input_length=512,
)
pooling = GlobalAveragePooling1D()
classifier = Dense(2, activation="softmax")
layers = [embedding, pooling, classifier]
model = Sequential(layers)
model.summary()

In [None]:
from tensorflow.keras.losses import SparseCategoricalCrossentropy
# Integer class labels (0/1) with a 2-unit softmax output, hence the sparse
# categorical cross-entropy loss.
loss_fn = SparseCategoricalCrossentropy()
model.compile(
    optimizer="adam",
    loss=loss_fn,
    metrics=["accuracy"],
)

In [None]:
# Label columns of the two splits.
y_train, y_test = train_df["target"], test_df["target"]

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
import numpy as np

callbacks = [
    EarlyStopping(patience=5, restore_best_weights=True),
    ModelCheckpoint("model.h5", save_best_only=True)
]

# get_data() appends every positive review before any negative one, and Keras
# takes the validation_split fraction from the END of the arrays before it
# shuffles. Without reordering, the held-out slice would come from the tail,
# i.e. from the negative reviews only. Permute the samples once with a fixed
# seed so the training and validation parts both contain a mix of labels.
SHUFFLE_SEED = 42
shuffle_idx = np.random.default_rng(SHUFFLE_SEED).permutation(len(x_train_pad))
x_train_shuffled = np.asarray(x_train_pad)[shuffle_idx]
y_train_shuffled = np.asarray(y_train)[shuffle_idx]

# validation_split: fraction of the data held out for validation
# batch_size: number of samples per gradient-descent update
# epochs: maximum number of passes over the training data
# updates per epoch: ceil(training samples left after the split / batch_size)
# verbose: amount of logging (0: quiet, 1: progress bar, 2: one line per epoch)
model.fit(x_train_shuffled,
     y_train_shuffled,
     validation_split=0.1,
     batch_size=200,
     epochs=50,
     verbose=2,
     callbacks=callbacks)

In [None]:
# Evaluate the trained model on the padded test sequences and their labels.
model.evaluate(x=x_test_pad, y=y_test)