<a href="https://colab.research.google.com/github/Elwing-Chou/ml0223/blob/main/sentiment_gap.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf

# Download (and cache) the Large Movie Review Dataset archive and unpack it.
# Fetched over HTTPS rather than plain HTTP so the archive cannot be tampered
# with in transit; no checksum is verified here.
dataset = tf.keras.utils.get_file(
    fname="aclImdb.tar.gz",
    origin="https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
    extract=True,
)

In [None]:
import glob
import os
# Locate the directory that contains the extracted "aclImdb" folder.
# The original code assumed get_file() returns the archive *file* path, with
# the archive unpacked next to it. Some newer Keras releases appear to return
# the extraction *directory* instead (TODO confirm for the installed version),
# so accept either form.
dn = dataset if os.path.isdir(dataset) else os.path.dirname(dataset)

# Print one example review per class. glob order is OS-dependent, so sort to
# show the same file on every run.
for label in ("pos", "neg"):
    fn = sorted(glob.glob(os.path.join(dn, "aclImdb", "train", label, "*")))
    with open(fn[0], "r", encoding="utf-8") as f:
        print(label + ":", f.read())

In [None]:
import pandas as pd
def getdata(base):
    """Load one labelled review split from disk into a DataFrame.

    Args:
        base: Directory containing ``pos`` and ``neg`` sub-directories, each
            holding one UTF-8 text file per review.

    Returns:
        pandas.DataFrame with two columns: ``article`` (the file contents) and
        ``ans`` (1 for files under ``pos``, 0 for files under ``neg``). All
        ``pos`` rows come first, followed by all ``neg`` rows; within a label
        the rows follow sorted file-path order.
    """
    datas = {"article": [], "ans": []}
    for subdir, label in (("pos", 1), ("neg", 0)):
        pattern = os.path.join(base, subdir, "*")
        # glob returns files in arbitrary, OS-dependent order; sorting makes
        # the row order (and anything derived from it) reproducible.
        for fn in sorted(glob.glob(pattern)):
            with open(fn, "r", encoding="utf-8") as f:
                datas["article"].append(f.read())
            datas["ans"].append(label)
    return pd.DataFrame(datas)
# Resolve the two split directories under the extracted archive, then load
# each one into its own DataFrame.
imdb_root = os.path.join(dn, "aclImdb")
train, test = (os.path.join(imdb_root, split) for split in ("train", "test"))
train_df, test_df = getdata(train), getdata(test)

In [None]:
# Show the raw text of the review stored under index label 0.
train_df.loc[0, "article"]

In [None]:
# Tokenize: I -> 1 love -> 2 you -> 3
from tensorflow.keras.preprocessing.text import Tokenizer
# Build the vocabulary from the training reviews only, then encode both
# splits as lists of integer word ids using that same vocabulary.
tok = Tokenizer(num_words=3000)
tok.fit_on_texts(train_df["article"])
x_train_seq, x_test_seq = (
    tok.texts_to_sequences(frame["article"]) for frame in (train_df, test_df)
)

In [None]:
# tok.word_index maps word -> id; id 0 is never assigned to a word because it
# is reserved for padding.
# Only the last expression of a cell is echoed, so print this lookup explicitly.
print(tok.index_word[19])
# Words are missing from the round-tripped text because they fall outside the
# 3000 most frequent words kept by the tokenizer.
# Decode just the first sequence instead of all of them to show one example.
tok.sequences_to_texts(x_train_seq[:1])[0]

In [None]:
# Preview a handful of encoded reviews. The sequences have different lengths,
# so pandas pads short rows with NaN; building the frame from every review
# would allocate a very large, mostly-NaN table just to display a few rows.
pd.DataFrame(x_train_seq[:5])

In [None]:
# Fixed sequence length used when padding/truncating the encoded reviews.
# Author's note: in their experience 512 and 1024 give about the same result.
MAXLEN = 512

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Bring every encoded review to exactly MAXLEN ids (pad_sequences defaults
# are used for the padding/truncating side).
x_train_pad, x_test_pad = (
    pad_sequences(seqs, maxlen=MAXLEN) for seqs in (x_train_seq, x_test_seq)
)

In [None]:
# Tabular view of the padded training matrix (one row per review).
pd.DataFrame(data=x_train_pad)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Dense, Dropout
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D
# Size the embedding table from the tokenizer's vocabulary cap instead of
# repeating the literal, so the two cannot drift apart
# (num_words + 1 = 3001 with the current tokenizer, same as before).
vocab_size = tok.num_words + 1

# Embedding -> mean over the (masked) time axis -> 2-way softmax.
layers = [
    # mask_zero=True treats id 0 (the padding value) as "no token".
    Embedding(vocab_size, 128, mask_zero=True, input_length=MAXLEN),
    GlobalAveragePooling1D(),
    Dense(2, activation="softmax"),
]
model = Sequential(layers)
model.summary()

In [None]:

# The sparse variant of cross-entropy takes integer class labels directly,
# so there is no need to one-hot encode the targets by hand.
from tensorflow.keras.losses import SparseCategoricalCrossentropy
loss_fn = SparseCategoricalCrossentropy()
model.compile(optimizer="adam", loss=loss_fn, metrics=["accuracy"])

In [None]:
import numpy as np
# Pull the integer labels out of each DataFrame as NumPy arrays.
y_train, y_test = (np.array(frame["ans"]) for frame in (train_df, test_df))

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
# Stop once val_loss has not improved for 5 epochs (restoring the best
# weights), and keep the best model seen so far on disk.
callbacks = [
    EarlyStopping(patience=5, restore_best_weights=True),
    ModelCheckpoint("sentiment.h5", save_best_only=True),
]

# Keras carves validation_split off the END of the arrays, before any
# shuffling. getdata() emits every positive review first and every negative
# review after, so without this permutation the 10% validation slice would
# contain negative reviews only, and val_loss (which drives both callbacks
# above) would be meaningless. A fixed seed keeps the split reproducible.
shuffle_idx = np.random.default_rng(42).permutation(len(y_train))

model.fit(
    x_train_pad[shuffle_idx],
    y_train[shuffle_idx],
    batch_size=200,
    epochs=50,
    validation_split=0.1,
    verbose=2,
    callbacks=callbacks,
)

In [None]:
# Report loss and accuracy on the held-out test split.
model.evaluate(x=x_test_pad, y=y_test)