<a href="https://colab.research.google.com/github/Elwing-Chou/tibaml0315/blob/main/imdb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf

# Download the aclImdb sentiment archive and unpack it; `dataset` holds the
# path returned by `get_file`.
# The archive is fetched over HTTPS rather than plain HTTP so it cannot be
# altered in transit.
dataset = tf.keras.utils.get_file(
    fname="aclImdb.tar.gz",
    origin="https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
    extract=True,
)

Downloading data from http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


In [2]:

import os
import glob
import pandas as pd
def getdata(mid, base_dir=None):
    """Load one split of the extracted aclImdb corpus into a DataFrame.

    Args:
        mid: Name of the split sub-directory under ``aclImdb`` (this notebook
            uses ``"train"`` and ``"test"``).
        base_dir: Directory that contains the ``aclImdb`` folder. When omitted
            it is derived from the module-level ``dataset`` path: ``dataset``
            itself if it already contains ``aclImdb``, otherwise the directory
            that ``dataset`` lives in.

    Returns:
        A DataFrame with a ``content`` column (the text of each file) and a
        ``sentiment`` column (1 for files under ``pos``, 0 for files under
        ``neg``). All ``pos`` rows come first, then all ``neg`` rows, each
        group in sorted path order.

    Raises:
        FileNotFoundError: If the ``pos`` or ``neg`` directory of the
            requested split does not exist.
    """
    if base_dir is None:
        # `dataset` may point at the downloaded archive (the extracted tree
        # then sits next to it) or at a directory that already holds `aclImdb`.
        if os.path.isdir(os.path.join(dataset, "aclImdb")):
            base_dir = dataset
        else:
            base_dir = os.path.dirname(dataset)
    split_dir = os.path.join(base_dir, "aclImdb", mid)

    def list_files(label):
        # Return the sorted file paths of one label directory of this split.
        label_dir = os.path.join(split_dir, label)
        if not os.path.isdir(label_dir):
            # Fail loudly instead of silently returning an empty DataFrame.
            raise FileNotFoundError(f"review directory not found: {label_dir}")
        # glob order is arbitrary: sort for a reproducible row order, and
        # escape the directory so glob metacharacters in it are literal.
        return sorted(glob.glob(os.path.join(glob.escape(label_dir), "*")))

    posfn = list_files("pos")
    negfn = list_files("neg")

    contents = []
    for fn in posfn + negfn:
        with open(fn, encoding="utf-8") as f:
            contents.append(f.read())
    return pd.DataFrame({
        "content": contents,
        "sentiment": [1] * len(posfn) + [0] * len(negfn),
    })
# Load both splits up front; later cells read `train_df` and `test_df`.
train_df = getdata(mid="train")
test_df = getdata(mid="test")

In [None]:
# Display the test-split DataFrame to sanity-check the loaded text and labels.
test_df

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense
# Vocabulary of 3000 common words plus one padding id (0). Each review is fed
# in as 512 ids drawn from that vocabulary, and every id is mapped to a
# 100-dimensional semantic vector.
layers = [
    # No activation; parameters: 3001 (vocabulary entries) * 100 -> 300100.
    Embedding(
        input_dim=3001,
        output_dim=100,
        mask_zero=True,
        input_length=512,
    ),
    # Collapse the per-token vectors of a review into one averaged vector.
    GlobalAveragePooling1D(),
    # Two softmax outputs, one per sentiment class.
    Dense(units=2, activation="softmax"),
]
model = Sequential(layers=layers)
model.summary()

In [12]:
from tensorflow.keras.losses import SparseCategoricalCrossentropy
# The labels are integer class ids (0/1) scored against the two softmax
# outputs, hence the sparse categorical cross-entropy loss.
model.compile(
    optimizer="adam",
    loss=SparseCategoricalCrossentropy(),
    metrics=["accuracy"],
)

In [13]:
# Tokenize: map words to integer ids by building a dictionary of the 3000 most common words
from tensorflow.keras.preprocessing.text import Tokenizer
# Build the word -> id vocabulary from the training reviews only, so the test
# split contributes nothing to it.
tok = Tokenizer(num_words=3000)
tok.fit_on_texts(texts=train_df["content"])

In [None]:
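# tok.word_index
# tok.index_word
# Check: in this case punctuation and newlines can safely be dropped
# tok.word_index["?"]
# Stop words (ignoring uninformative words): not needed here; training against the labels adjusts uninformative words away on its own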

In [None]:
import pandas as pd
# Sequencing: turn each review's words into integer ids using the vocabulary
# held by `tok`.
x_train_seq = tok.texts_to_sequences(train_df["content"])
x_test_seq = tok.texts_to_sequences(test_df["content"])
# Preview only the first few sequences. They have different lengths, so
# wrapping the whole corpus in a DataFrame would build a very wide,
# NaN-padded table just for display.
pd.DataFrame(x_train_seq[:5])

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Bring every id sequence to the fixed length of 512 that the model's
# Embedding layer declares as its input_length.
x_train_pad, x_test_pad = (
    pad_sequences(seqs, maxlen=512) for seqs in (x_train_seq, x_test_seq)
)
pd.DataFrame(x_train_pad)

In [32]:
import numpy as np
# Extract the 0/1 sentiment labels of each split as NumPy arrays.
y_train, y_test = (
    np.array(frame["sentiment"]) for frame in (train_df, test_df)
)

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import numpy as np

# `validation_split` takes the validation samples from the *end* of the arrays,
# before any shuffling, and the training frame lists every positive review
# ahead of every negative one. Left in that order, the held-out 10% would
# contain negative reviews only, which skews both the training class balance
# and the validation metrics the callbacks rely on. Shuffle once with a fixed
# seed so the split is representative and reproducible.
shuffle_idx = np.random.default_rng(42).permutation(len(x_train_pad))

c = [
    # Persist the best model seen so far to disk.
    ModelCheckpoint("imdb.h5", save_best_only=True),
    # Stop after 5 epochs without improvement and restore the best weights.
    EarlyStopping(patience=5, restore_best_weights=True)
]
model.fit(x_train_pad[shuffle_idx],
     y_train[shuffle_idx],
     batch_size=100,
     epochs=40,
     validation_split=0.1,
     callbacks=c)

In [35]:
# Score the trained model on the test split; the result lists the loss
# followed by the compiled accuracy metric.
model.evaluate(x=x_test_pad, y=y_test)



[0.28666284680366516, 0.8818399906158447]

In [45]:
# Free-text review to classify (exposed as a Colab form field).
review = "It's barely even a movie, given it functions mostly as a big, long ad for Nintendo products (Mario games, in this case). It has that lovely Illumination\u2122 style of humor throughout it all, the stunning shallow writing we've all grown to love from them, and the immaculate character development we certainly expect to be blown away by. It's awful, really. There are so many plot points that make zero sense and function merely as a way of getting the film moving without really adding anything to the story, it all feels infinitely pointless and hollow.  At least the world they built is pretty, that's the one positive."#@param {type:"string"}
# Apply the same preprocessing as for training: word ids, then length 512.
seq = tok.texts_to_sequences([review])
pad = pad_sequences(seq, maxlen=512)
# Single-item batch in, so the first row holds this review's class probabilities.
pre = model.predict(pad)
prob = pre[0]
# Position i of the softmax output corresponds to label i (0 = neg, 1 = pos).
trans = ["neg", "pos"]
for label, p in zip(trans, prob):
    print(label, "的機率是:", p)

neg 的機率是: 0.9205835
pos 的機率是: 0.07941649
