<a href="https://colab.research.google.com/github/Elwing-Chou/tibaml1027/blob/main/imdb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf

# Download the IMDB review archive through the Keras file cache and extract it.
# `dataset` holds the path returned by get_file.
dataset = tf.keras.utils.get_file(
    origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
    fname="aclImdb.tar.gz",
    extract=True,
)

Downloading data from http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


In [4]:
# Display the path returned by get_file.
dataset

'/root/.keras/datasets/aclImdb.tar.gz'

In [2]:
import os
import glob
import pandas as pd
def getdata(mid, base_dir=None):
    """Load one split of the extracted aclImdb corpus into a DataFrame.

    Parameters
    ----------
    mid : str
        Sub-directory of ``aclImdb`` to read (the notebook uses ``"train"``
        and ``"test"``).
    base_dir : str, optional
        Directory that contains the extracted ``aclImdb`` folder. When omitted,
        the directory of the downloaded archive (global ``dataset``) is used.

    Returns
    -------
    pandas.DataFrame
        Column ``content`` holds the text of each file and column
        ``sentiment`` is 1 for files under ``pos`` and 0 for files under
        ``neg``. All ``pos`` rows come first, followed by all ``neg`` rows.

    Raises
    ------
    FileNotFoundError
        If the ``pos`` or the ``neg`` folder of the split yields no files.
    """
    dn = os.path.dirname(dataset) if base_dir is None else base_dir
    split_dir = os.path.join(dn, "aclImdb", mid)
    # glob returns entries in arbitrary, filesystem-dependent order; sorting
    # makes the row order identical on every run and machine.
    posfn = sorted(glob.glob(os.path.join(split_dir, "pos", "*")))
    negfn = sorted(glob.glob(os.path.join(split_dir, "neg", "*")))
    # Fail loudly instead of silently returning an empty or one-class frame.
    if not posfn or not negfn:
        raise FileNotFoundError(
            "no review files found under {!r} (expected 'pos' and 'neg' folders)".format(split_dir)
        )
    contents = []
    for fn in posfn + negfn:
        with open(fn, encoding="utf-8") as f:
            contents.append(f.read())
    df = pd.DataFrame({
        "content": contents,
        "sentiment": [1] * len(posfn) + [0] * len(negfn)
    })
    return df

In [5]:
# Read both corpus splits into DataFrames.
train_df = getdata(mid="train")
test_df = getdata(mid="test")

In [7]:
# Preview the test split.
test_df

Unnamed: 0,content,sentiment
0,"Valentine ""Dogkiller"" Dussaut and Joe ""The Jud...",1
1,Ruthless mercenary Bruno Rivera (Paul Naschy i...,1
2,Should we take the opening shot as a strange f...,1
3,"Being a retired medical/health field ""toiler i...",1
4,I wasn't expecting a great deal from this film...,1
...,...,...
24995,"I give it a 2, because of the beautiful Medite...",0
24996,How bad can you make a film. A good question w...,0
24997,"Oh, my. Oh, this is a *really* bad movie. The ...",0
24998,Apparently re-cut episodes from the Gangbuster...,0


In [13]:
# Tokenize: turn each word into an integer id.
# The vocabulary is fitted on the training reviews only; num_words should stay
# in sync with INPUT_DIM, which sizes the embedding table.
from tensorflow.keras.preprocessing.text import Tokenizer
tok = Tokenizer(num_words=3000)
tok.fit_on_texts(texts=train_df["content"])

In [None]:
# tok.word_index
# tok.index_word

In [14]:
# Sequence: convert every review into its list of integer token ids.
x_train_seq, x_test_seq = [
    tok.texts_to_sequences(frame["content"]) for frame in (train_df, test_df)
]

In [39]:
# Number of token ids per review after padding/truncation (passed as `maxlen`
# to pad_sequences and as `input_length` to the Embedding layer).
INPUT_LENGTH = 512
# Vocabulary size; the Embedding layers are built with INPUT_DIM + 1 rows.
INPUT_DIM = 3000
# Size of the vector the Embedding layer produces for each token.
OUTPUT_DIM = 128

In [None]:
# pd.DataFrame(x_train_seq)
# Padding: cut long reviews and zero-fill short ones so every row has the same length.
# "pre" is the pad_sequences default for both options; it is spelled out here
# so the behaviour is visible at the call site.
from tensorflow.keras.preprocessing.sequence import pad_sequences
x_train_pad = pad_sequences(
    x_train_seq, maxlen=INPUT_LENGTH, padding="pre", truncating="pre"
)
x_test_pad = pad_sequences(
    x_test_seq, maxlen=INPUT_LENGTH, padding="pre", truncating="pre"
)
pd.DataFrame(x_train_pad)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense, Dropout
# Dense classifier on top of the flattened embedding sequence.
layers = [
    # Embedding table: 3001 (token ids) x 128 (learned features per token)
    Embedding(
        input_dim=INPUT_DIM + 1,
        output_dim=OUTPUT_DIM,
        mask_zero=True,
        input_length=INPUT_LENGTH,
    ),
    Flatten(),
    Dense(units=256, activation="relu"),
    Dropout(rate=0.25),
    Dense(units=2, activation="softmax"),
]
model = Sequential(layers=layers)
model.summary()

In [40]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense
# Averaging classifier: embed every token, average over the sequence, classify.
layers = [
    # Embedding table: 3001 (token ids) x 128 (learned features per token)
    Embedding(
        input_dim=INPUT_DIM + 1,
        output_dim=OUTPUT_DIM,
        mask_zero=True,
        input_length=INPUT_LENGTH,
    ),
    GlobalAveragePooling1D(),
    Dense(units=2, activation="softmax"),
]
model = Sequential(layers=layers)
model.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_8 (Embedding)     (None, 512, 128)          384128    
                                                                 
 global_average_pooling1d_3   (None, 128)              0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dense_9 (Dense)             (None, 2)                 258       
                                                                 
Total params: 384,386
Trainable params: 384,386
Non-trainable params: 0
_________________________________________________________________


In [41]:
# One output unit (binary classification): BinaryCrossentropy
#     p * log(1/q) + (1 - p) * log(1/(1 - q))
# Several output units (multi-class classification): CategoricalCrossentropy
#     sum_i p_i * log(1/q_i)
# The model ends in a 2-unit softmax and the labels are integer class ids,
# hence the sparse categorical variant.
from tensorflow.keras.losses import SparseCategoricalCrossentropy
model.compile(
    # the optimizer can be given by its string name
    optimizer="adam",
    loss=SparseCategoricalCrossentropy(),
    metrics=["accuracy"],
)

In [42]:
import numpy as np
# Integer class labels (0 = negative, 1 = positive) as NumPy arrays.
y_train, y_test = (
    np.array(frame["sentiment"]) for frame in (train_df, test_df)
)

In [None]:
# batch_size: number of samples consumed per gradient-descent step (tens to hundreds).
# epochs: maximum number of passes over the training data (this is what ends
#         training unless EarlyStopping fires first).
# With batch_size=200, one epoch performs ceil(n_fit / 200) gradient steps, where
# n_fit is the 90% of the rows that remain after validation_split=0.1.
# verbose=0 (quiet), 1 (default, progress bar), 2 (one line per epoch, no bar)
import numpy as np
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
callbacks = [
    # Stop once the monitored validation metric has not improved for 5 epochs
    # and roll the model back to its best weights.
    EarlyStopping(patience=5, restore_best_weights=True),
    # Only overwrite the checkpoint file when the monitored metric improves.
    ModelCheckpoint("imdb.h5", save_best_only=True)
]
# Keras takes the validation_split fraction from the LAST rows of x/y, before
# any shuffling. The training frame lists every positive review first and every
# negative review afterwards, so without this permutation the validation set
# would contain a single class and mislead both callbacks above.
shuffle_idx = np.random.default_rng(42).permutation(len(y_train))
model.fit(x_train_pad[shuffle_idx],
          y_train[shuffle_idx],
          batch_size=200,
          epochs=100,
          validation_split=0.1,
          verbose=2,
          callbacks=callbacks)


In [44]:
# Score the trained model on the test split; with the compile settings used
# in this notebook the returned list is [loss, accuracy].
model.evaluate(x_test_pad, y_test)



[0.29357779026031494, 0.8814399838447571]

In [None]:
# Inference model: a fresh Embedding declared without input_length, followed by
# the already-trained layers of `model`.
l = [
    Embedding(input_dim=INPUT_DIM + 1, output_dim=OUTPUT_DIM, mask_zero=True)
]
remain = model.layers[1:]
model_use = Sequential([*l, *remain])
# The layers in `remain` are the same objects used by `model`, so only the new
# embedding needs the trained weights copied over.
embedding_weights = model.layers[0].get_weights()
model_use.layers[0].set_weights(embedding_weights)
model_use.summary()

In [61]:
# Classify one review typed in by the user.
import numpy as np

review = input("影評:")
review_seq = tok.texts_to_sequences([review])
# Keep only the last INPUT_LENGTH tokens, mirroring the default "pre"
# truncation that pad_sequences applied to the training data.
token_ids = review_seq[0][-INPUT_LENGTH:]
if not token_ids:
    # None of the words are in the tokenizer vocabulary, so there is nothing
    # for the model to embed or average.
    print("No known words in this review - nothing to predict.")
else:
    # predict() gets an explicit (1, n_tokens) integer array, not a nested list.
    proba = model_use.predict(np.array([token_ids]))[0]
    # Index in `trans` == class id: 0 is negative, 1 is positive.
    trans = ["neg", "pos"]
    for p, sentiment in zip(proba, trans):
        print(sentiment, ":", p)

影評:My toes have just about uncurled enough to free me to diss this over-sentimental piece of tosh. There's a fine art, I believe, in concocting a classic Christmas movie, but the best of them - "It's A Wonderful Life", "The Bishop's Wife", "Miracle on 34th Street" etc, all manage it by being natural and unforced, showing real people in real situations, but their lives overtaken in one way or another by that unclassifiable Christmas spirit.  The so-called fare on show here is however so calculated and lacking in good humour (in every sense of the phrase) that what's left is an overwrought, overplayed, overdone piece of contrived hokum.  Quite how an ordinary member of the public is meant to relate to high-flying movie-trailer producer Cameron Diaz in her all mod cons dream house, replete with swimming pool and electric curtains, or publishing executive Jude Law as a grieving widower, Jack Black as a cuckolded movie soundtrack composer or even the most downbeat character, Kate Winslet as