<a href="https://colab.research.google.com/github/Elwing-Chou/tibaml0826/blob/main/sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
import tensorflow as tf

# Download (and cache) the Large Movie Review Dataset, then unpack it.
# The archive is fetched over HTTPS so the tarball that gets auto-extracted
# cannot be tampered with in transit.
# NOTE(review): getdata() below assumes `dataset` is a path whose parent
# directory contains the extracted "aclImdb" folder -- confirm for the
# installed Keras version.
dataset = tf.keras.utils.get_file(
    fname="aclImdb.tar.gz",
    origin="https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
    extract=True,
)

In [16]:
import os
import glob
import pandas as pd
def getdata(mid, base_dir=None):
    """Load one split of the aclImdb corpus into a DataFrame.

    Args:
        mid: Name of the split sub-directory under ``aclImdb`` (the notebook
            uses ``"train"`` and ``"test"``).
        base_dir: Directory that contains the ``aclImdb`` folder. When omitted,
            the directory of the module-level ``dataset`` path is used.

    Returns:
        A DataFrame with a ``content`` column (raw review text) and a
        ``sentiment`` column (1 for files under ``pos``, 0 for files under
        ``neg``). All positive rows come first, followed by all negative rows.

    Raises:
        FileNotFoundError: If neither the ``pos`` nor the ``neg`` directory
            of the requested split contains any file.
    """
    if base_dir is None:
        base_dir = os.path.dirname(dataset)
    split_dir = os.path.join(base_dir, "aclImdb", mid)
    # glob returns entries in filesystem order; sort so the row order (and
    # anything derived from it) is reproducible across machines and runs.
    posfn = sorted(glob.glob(os.path.join(split_dir, "pos", "*")))
    negfn = sorted(glob.glob(os.path.join(split_dir, "neg", "*")))
    if not posfn and not negfn:
        # Fail loudly instead of returning an empty frame that would only
        # surface as a confusing error much later in the pipeline.
        raise FileNotFoundError(
            "no review files found under {!r}".format(split_dir)
        )
    contents = []
    for fn in posfn + negfn:
        with open(fn, encoding="utf-8") as f:
            contents.append(f.read())
    df = pd.DataFrame({
        "content": contents,
        "sentiment": [1] * len(posfn) + [0] * len(negfn)
    })
    return df
# Load both corpus splits; "train" is read first, then "test".
train_df, test_df = map(getdata, ("train", "test"))

In [17]:
# Preview the test split (pandas abbreviates the middle rows when rendering).
test_df

Unnamed: 0,content,sentiment
0,After I've seen this movie I find it hard to u...,1
1,I had never heard about this movie when it was...,1
2,I really liked the first part of this film in ...,1
3,"Most of the criticism of ""Attack of Show"" is f...",1
4,"To my eternal shame, I've never seen a silent ...",1
...,...,...
24995,The first film was quite hip and had amusing m...,0
24996,Divorced single mom in picturesque seaside tow...,0
24997,"It's schmaltzy, but then what else did you exp...",0
24998,What a shame. This could have been good. The m...,0


In [18]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, GlobalAveragePooling1D
# Embedding table: 3001 rows (original note: 3000 frequent words + 1 padding
# id), each a 100-dimensional vector -> 3001 * 100 = 300100 weights.
# NOTE(review): the tokenizer is created with num_words=3000; confirm the
# largest id it emits so the table size matches exactly.
layers = [
    Embedding(input_dim=3001, output_dim=100, input_length=512, mask_zero=True),
    # Collapse the 512 time steps into one averaged 100-dim vector.
    GlobalAveragePooling1D(),
    # Two-way softmax: one probability per sentiment class.
    Dense(units=2, activation="softmax"),
]
model = Sequential()
for layer in layers:
    model.add(layer)
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 512, 100)          300100    
                                                                 
 global_average_pooling1d_1   (None, 100)              0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dense_1 (Dense)             (None, 2)                 202       
                                                                 
Total params: 300,302
Trainable params: 300,302
Non-trainable params: 0
_________________________________________________________________


In [19]:
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
# Integer class labels against a softmax output -> sparse categorical
# cross-entropy; Adam is used with its default settings.
loss_fn = SparseCategoricalCrossentropy()
optimizer = Adam()
model.compile(optimizer=optimizer, loss=loss_fn, metrics=["accuracy"])

In [22]:
# Step 1 - tokenize: map every word to an integer index.
from tensorflow.keras.preprocessing.text import Tokenizer
tok = Tokenizer(num_words=3000)
# The vocabulary is learned from the training reviews only.
train_texts = train_df["content"]
tok.fit_on_texts(train_texts)

In [33]:
# Optional sanity check: uncomment to inspect the fitted vocabulary mappings.
# tok.word_index
# tok.index_word

In [None]:
# Convert every review into a list of word indices with the fitted tokenizer.
x_train_seq = tok.texts_to_sequences(train_df["content"])
x_test_seq = tok.texts_to_sequences(test_df["content"])
# Preview only the first few sequences: turning the whole ragged list into a
# DataFrame pads every row to the longest review, which is slow and
# memory-hungry for output that gets truncated on display anyway.
pd.DataFrame(x_train_seq[:5])

In [None]:
# Step 2 - truncate long sequences and pad short ones to a fixed length of 512.
from tensorflow.keras.preprocessing.sequence import pad_sequences
x_train_pad, x_test_pad = (
    pad_sequences(seqs, maxlen=512) for seqs in (x_train_seq, x_test_seq)
)
pd.DataFrame(x_train_pad)

In [38]:
import numpy as np
# Pull the 0/1 sentiment labels out of each split as NumPy arrays.
y_train, y_test = (
    np.array(frame["sentiment"]) for frame in (train_df, test_df)
)

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
callbacks = [
    # Keep only the best model seen so far on disk.
    ModelCheckpoint("imdb.h5", save_best_only=True),
    # Stop after 5 epochs without improvement and roll back to the best weights.
    EarlyStopping(patience=5, restore_best_weights=True)
]
# getdata() stacks every positive review before every negative one, and Keras
# carves validation_split off the *last* rows before any shuffling. Without
# reordering, the validation set would contain negative reviews only, which
# skews both the validation metrics and the class balance of the training
# portion. Shuffle once with a fixed seed so the split is mixed yet repeatable.
shuffle_idx = np.random.default_rng(42).permutation(len(y_train))
# verbose=2 prints one line per epoch instead of a progress bar.
model.fit(x_train_pad[shuffle_idx],
     y_train[shuffle_idx],
     batch_size=200,
     epochs=50,
     validation_split=0.1,
     callbacks=callbacks,
     verbose=2)

In [40]:
# Score the trained model on the held-out test split; displays [loss, accuracy].
model.evaluate(x=x_test_pad, y=y_test)



[0.29148074984550476, 0.8804799914360046]

In [46]:
# Interactive demo: classify a single comment typed by the user.
comment = input("comment:")
comment_tok = tok.texts_to_sequences([comment])
labels = ["neg", "pos"]
if not comment_tok[0]:
    # No in-vocabulary word: the padded row would be all zeros, which
    # mask_zero=True masks out entirely, leaving the average-pooling layer
    # with nothing to average. Report that instead of printing a
    # meaningless score.
    print("No known vocabulary words in the comment; cannot score it.")
else:
    # Same preprocessing as the training data: pad/truncate to 512 tokens.
    comment_pad = pad_sequences(comment_tok, maxlen=512)
    pre = model.predict(comment_pad)[0]
    for label, prob in zip(labels, pre):
        print(label, "的機率:", prob)

comment:Whoever's thinking about watching this movie, don't. Absolute garbage.  They've ruined everything they had built since the first Iron man came out. Burned it to the ground. I am utterly disappointed and shocked.  This made me never wanna watch another Marvel movie again. Ever.  I was so excited about this movie, I couldn't wait for it to come out. Now, I can't find words to describe how terrible this movie was. This has ruined my day.
neg 的機率: 0.99622214
pos 的機率: 0.0037778444
