<a href="https://colab.research.google.com/github/Elwing-Chou/ml0602/blob/main/sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf

# Download the IMDB review archive into get_file's cache directory and unpack
# it there; `dataset` holds the path that get_file returns.
# The archive is fetched over HTTPS so it cannot be altered in transit.
dataset = tf.keras.utils.get_file(
    fname="aclImdb.tar.gz",
    origin="https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
    extract=True,
)

In [None]:
import os
import glob
# Peek at one positive training review to see what the raw text looks like.
# The corpus is expected in an "aclImdb" folder next to the path get_file returned.
dn = os.path.split(dataset)[0]
fn = os.path.join(dn, "aclImdb", "train", "pos", "*")
print(fn)
# glob returns matches in arbitrary order; sort so the same file is shown on
# every run.
fns = sorted(glob.glob(fn))
if not fns:
    # Fail with the pattern in the message instead of a bare IndexError below.
    raise FileNotFoundError(f"no files matched {fn!r}")
with open(fns[0], "r", encoding="utf-8") as f:
    print(f.read())

In [None]:
import pandas as pd
def get_data(base="train", root=None):
    """Load one split of the extracted aclImdb corpus into a DataFrame.

    Args:
        base: Name of the split sub-directory to read, e.g. "train" or "test".
        root: Directory that contains the split folders. When None (default)
            it is the "aclImdb" folder next to the path stored in the
            notebook-level `dataset` variable.
            NOTE(review): this assumes `dataset` is the archive path; some
            newer Keras releases may return the extraction directory from
            get_file(extract=True) instead - confirm for your version.

    Returns:
        pandas.DataFrame with two columns: "contents" (the text of each
        ``*.txt`` file) and "ans" (1 for files under ``pos/``, 0 for files
        under ``neg/``). All pos rows come first, then all neg rows; within
        each label the rows follow sorted file-path order.
    """
    if root is None:
        root = os.path.join(os.path.split(dataset)[0], "aclImdb")
    contents, target = [], []
    # One loop for both labels instead of two copy-pasted ones.
    for label_dir, label in (("pos", 1), ("neg", 0)):
        pattern = os.path.join(root, base, label_dir, "*.txt")
        # glob order is arbitrary; sorting makes the row order reproducible.
        for p in sorted(glob.glob(pattern)):
            with open(p, "r", encoding="utf-8") as f:
                contents.append(f.read())
            target.append(label)
    return pd.DataFrame({
        "contents": contents,
        "ans": target
    })
# Read both corpus splits, train first and then test.
train_df, test_df = (get_data(base=split) for split in ("train", "test"))

In [None]:
# Step1. Tokenize
# Index 0 is never assigned to a word, because 0 is reserved for padding
from tensorflow.keras.preprocessing.text import Tokenizer
# Build the word index from the training reviews only; num_words limits how
# many of the most frequent words are kept when texts become sequences.
vocab_cap = 3000
tok = Tokenizer(num_words=vocab_cap)
tok.fit_on_texts(train_df["contents"])

In [None]:
# Step2. Sequenize
# Inspection hints: tok.word_index (word -> index) and
# tok.index_word[3000] (index -> word)
# Reviews (tokens) -> integer sequences
x_train_seq = tok.texts_to_sequences(train_df["contents"])
x_test_seq = tok.texts_to_sequences(test_df["contents"])
# Preview only the first few reviews: the sequences are ragged, so a frame
# built from all of them is padded out to the longest review just to be
# displayed.
pd.DataFrame(x_train_seq[:5])

In [None]:
# Fixed length, in tokens, that every review is padded/truncated to in the
# padding step; also passed to the Embedding layer as input_length.
SEQ_LEN = 512

In [None]:
# Step3. Padding
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Bring every sequence to exactly SEQ_LEN tokens. The pad_sequences defaults
# are spelled out: zeros are added at the front and over-long reviews lose
# their leading tokens.
x_train_pad = pad_sequences(
    x_train_seq, maxlen=SEQ_LEN, padding="pre", truncating="pre"
)
x_test_pad = pad_sequences(
    x_test_seq, maxlen=SEQ_LEN, padding="pre", truncating="pre"
)
pd.DataFrame(x_train_pad)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense
# The embedding table size is read from the tokenizer (num_words + 1) so it
# cannot drift out of sync when the vocabulary cap is changed.
# With num_words=3000 and 128 coefficients per token: 3001 * 128 = 384128 weights.
# Tunables:
#   - vocabulary size: change num_words on the tokenizer (e.g. 3000 -> 4000)
#   - embedding width: smaller for easy tasks, larger for hard ones (128 -> 256/64)
#   - input_length: follows SEQ_LEN (e.g. 512 -> 256)
layers = [
    # mask_zero=True marks index 0 (the padding value) as masked.
    Embedding(tok.num_words + 1, 128, mask_zero=True, input_length=SEQ_LEN),
    # Average the token vectors of a review into a single vector.
    GlobalAveragePooling1D(),
    # Two-way softmax over the 0/1 labels.
    Dense(2, activation="softmax")
]
model = Sequential(layers)
model.summary()

In [None]:
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
# Integer class labels + softmax output -> sparse categorical cross-entropy,
# optimised with Adam at its default settings; accuracy is tracked as a metric.
loss_fn = SparseCategoricalCrossentropy()
optimizer = Adam()
model.compile(loss=loss_fn, optimizer=optimizer, metrics=["accuracy"])

In [None]:
import numpy as np
# Pull the 0/1 "ans" column of each split out as a standalone NumPy array.
y_train = train_df["ans"].to_numpy(copy=True)
y_test = test_df["ans"].to_numpy(copy=True)

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
# validation_split: fraction of the training rows held out for validation
#   (0.1 -> 90% used for training, 10% for validation)
# epochs: upper bound on passes over the training rows; EarlyStopping below
#   can end training sooner
# batch_size: how many samples are seen before each gradient update
# verbose: 0 (silent), 1 (most detailed), 2 (no progress bar)
callbacks = [
    EarlyStopping(patience=5, restore_best_weights=True),
    ModelCheckpoint("model.h5", save_best_only=True)
]
# h5: https://www.hdfgroup.org/downloads/hdfview/
# get_data() stacks every positive review before every negative one, and Keras
# takes the validation_split slice from the END of the arrays before any
# shuffling. Without this permutation the validation set would hold a single
# class and mislead both callbacks. A fixed seed keeps the split reproducible.
shuffle_idx = np.random.default_rng(42).permutation(len(x_train_pad))
model.fit(x_train_pad[shuffle_idx],
          y_train[shuffle_idx],
          validation_split=0.1,
          epochs=100,
          batch_size=200,
          verbose=2,
          callbacks=callbacks)

In [1]:
# Score the trained model on the test split. This cell needs `model`,
# `x_test_pad` and `y_test` from the cells above in the same kernel session.
model.evaluate(x=x_test_pad, y=y_test)

NameError: ignored