<a href="https://colab.research.google.com/github/Elwing-Chou/ml0804/blob/master/sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf

# Download the IMDB review archive and unpack it (extract=True).
dataset = tf.keras.utils.get_file(
    fname="aclImdb.tar.gz",
    # HTTPS rather than plain HTTP: the archive is unpacked on this machine,
    # so it should not be fetched over a channel that can be tampered with.
    origin="https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
    extract=True,
)
print("extract on:", dataset)

In [None]:
import glob
import pandas as pd
import os
def getdata(dataset, t):
    """Load one split of the extracted IMDB reviews into a DataFrame.

    Parameters
    ----------
    dataset : str
        Path produced by the download step. Normally this is the archive
        file, with the extracted ``aclImdb`` folder located next to it. A
        directory that directly contains ``aclImdb`` is accepted as well.
    t : str
        Sub-folder of ``aclImdb`` to read, e.g. ``"train"`` or ``"test"``.

    Returns
    -------
    pandas.DataFrame
        Column ``content`` holds the text of each review file and column
        ``sentiment`` is 1 for files under ``pos/`` and 0 for files under
        ``neg/``. Positive rows come first, then negative rows; inside each
        group the rows follow the sorted file paths. A missing or empty
        split yields an empty frame.
    """
    # NOTE(review): some Keras versions may hand back the extraction directory
    # instead of the archive path when extract=True - confirm against the
    # installed version. Fall back to the original "folder next to the
    # archive" layout otherwise.
    if os.path.isdir(os.path.join(dataset, "aclImdb")):
        base = dataset
    else:
        base = os.path.dirname(dataset)
    dn = os.path.join(base, "aclImdb", t)

    def _review_files(label):
        # Both spellings of the extension are searched. Where glob matching is
        # case-insensitive (e.g. Windows) each pattern returns the same files,
        # so collect into a set to avoid loading every review twice. Sorting
        # makes the row order independent of directory listing order.
        found = set()
        for pattern in ("*.txt", "*.TXT"):
            found.update(glob.glob(os.path.join(dn, label, pattern)))
        return sorted(found)

    pos = _review_files("pos")
    neg = _review_files("neg")
    contents = []
    for fn in pos + neg:
        with open(fn, encoding="utf-8") as f:
            contents.append(f.read())
    return pd.DataFrame({
        "content": contents,
        "sentiment": [1] * len(pos) + [0] * len(neg)
    })
# Build both splits with the same loader, then preview the training frame.
train_df, test_df = (getdata(dataset, split) for split in ("train", "test"))
train_df

In [None]:
# Preview the test split.
test_df

In [None]:
# (For Chinese text: segment into words first, then join them with spaces via " ".join())
# Preprocessing step 1: tokenize (build the mapping between words and their integer ids)
from tensorflow.keras.preprocessing.text import Tokenizer
tok = Tokenizer(num_words=3000)
tok.fit_on_texts(train_df["content"])

In [None]:
# Token ids never include 0; that index is reserved for padding.
# tok.word_index
# len(tok.word_index)

In [None]:
# Step 2: to sequence (actually convert the words into their integer ids)
x_train_seq, x_test_seq = (
    tok.texts_to_sequences(frame["content"]) for frame in (train_df, test_df)
)
pd.DataFrame(x_train_seq)

In [None]:
# Step 3: truncate the long sequences and pad the short ones to one fixed length
from tensorflow.keras.preprocessing.sequence import pad_sequences
x_train_pad, x_test_pad = (
    pad_sequences(seqs, maxlen=512) for seqs in (x_train_seq, x_test_seq)
)
pd.DataFrame(x_train_pad)

In [None]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Dense, Dropout, Flatten, Lambda, GlobalAveragePooling1D
from tensorflow.keras.models import Sequential
# Embedding(number of distinct input ids, number of output (sentiment) dimensions)
# Weight table: 3001 * 256
layers = []
layers.append(Embedding(3001, 256, mask_zero=True, input_length=512))
layers.append(GlobalAveragePooling1D())
layers.append(Dense(2, activation="softmax"))
model = Sequential(layers)
model.summary()

In [None]:
from tensorflow.keras.losses import SparseCategoricalCrossentropy
# Labels are plain integers (0 / 1) scored against the 2-unit softmax output,
# hence the sparse variant of categorical cross-entropy.
model.compile(
    optimizer="adam",
    loss=SparseCategoricalCrossentropy(),
    metrics=["accuracy"],
)

In [None]:
import numpy as np
# Label vectors as independent NumPy copies of the "sentiment" columns.
y_train = train_df["sentiment"].to_numpy(copy=True)
y_test = test_df["sentiment"].to_numpy(copy=True)

In [None]:
# batch_size: how many samples are seen before each weight update (typically 20-200)
# epochs: how many full passes over the training data (e.g. 10 epochs over 60000 samples -> 60000*10)
# verbose: 0 (quiet), 1 (default), 2 (one line per epoch; my preference)
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
callbacks = [
    ModelCheckpoint("sentiment.h5", save_best_only=True),
    EarlyStopping(patience=5, restore_best_weights=True)
]
# validation_split holds out the LAST 10% of the arrays, taken before Keras
# shuffles anything. The training frame lists every positive review ahead of
# every negative one, so un-permuted data would produce a validation set made
# up of label 0 only, and the checkpoint / early-stopping decisions would be
# based on a single class. Permute once with a fixed seed so the hold-out
# contains both classes and the run stays reproducible.
shuffle_idx = np.random.default_rng(42).permutation(len(y_train))
model.fit(x_train_pad[shuffle_idx],
     y_train[shuffle_idx],
     validation_split=0.1,
     batch_size=200,
     epochs=100,
     verbose=2,
     callbacks=callbacks)

In [None]:
# Loss and accuracy on the test split.
model.evaluate(x=x_test_pad, y=y_test)

In [None]:
# Stand-alone model made of a single Embedding layer. It receives the trained
# embedding weights so the vector learned for one token can be inspected.
newl = [
  Embedding(3001, 256, mask_zero=True, input_length=1)
]
newmodel = Sequential(newl)
# set_weights needs the layer's weight table to exist already.
# NOTE(review): depending on the Keras version the model may not be built at
# this point; building for a (batch, 1) input is skipped when already built.
if not newmodel.built:
    newmodel.build((None, 1))
w = model.layers[0].get_weights()
newmodel.layers[0].set_weights(w)
n = tok.word_index["the"]
# Feed an explicit NumPy array of shape (1, 1). A nested Python list can be
# read by Keras as a list of separate inputs, or rejected, depending on version.
print("the長這樣:", newmodel.predict(np.array([[n]])))