<a href="https://colab.research.google.com/github/Elwing-Chou/tibaml1017/blob/main/sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf

dataset = tf.keras.utils.get_file(
    fname="aclImdb.tar.gz", 
    origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz", 
    extract=True,
)

Downloading data from http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


In [2]:
import os
import glob
import pandas as pd
def getdata(mid):
    dn = os.path.dirname(dataset)
    posfn = glob.glob(os.path.join(dn, "aclImdb", mid, "pos", "*"))
    negfn = glob.glob(os.path.join(dn, "aclImdb", mid, "neg", "*"))
    contents = []
    for fn in posfn + negfn:
        with open(fn, encoding="utf-8") as f:
            contents.append(f.read())
    df = pd.DataFrame({
        "content":contents,
        "sentiment":[1] * len(posfn) + [0] * len(negfn)
    })
    return df
train_df = getdata("train")
test_df = getdata("test")

In [3]:
test_df

Unnamed: 0,content,sentiment
0,Four stories about the drug trade in Europe be...,1
1,Lloyd Hamilton was one of the most imaginative...,1
2,SPOILER WARNING<br /><br />We've all heard the...,1
3,One of the most pleasurable aspects of movie v...,1
4,I would hope so and how can I get involved?<br...,1
...,...,...
24995,"La Ragazza del Vagone Letto, or Terror Express...",0
24996,"When I saw this in the cinema, I remember winc...",0
24997,"Why Lori Petty was cast as tank girl, I'll nev...",0
24998,No emotion. Bad music (and I am a reformed eig...,0


In [6]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense

layers = [
    # 3001: 情緒化對象(3000種最常用詞匯+padding(0))
    # 100: 情緒個數(100-500)
    # input_length: 一篇文章你要看幾個詞彙(128-512)
    # 一篇文章我會蒐集512詞彙, 這512詞彙是在3000最常用單字裡, 每一個詞彙會被我化作100維度的向量
    # 係數個數: 3001(種詞彙) * 100(個情緒) = 300100
    Embedding(3001, 100, mask_zero=True, input_length=512),
    GlobalAveragePooling1D(),
    Dense(2, activation="softmax")
]
model = Sequential(layers)
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 512, 100)          300100    
                                                                 
 global_average_pooling1d_1   (None, 100)              0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dense_1 (Dense)             (None, 2)                 202       
                                                                 
Total params: 300,302
Trainable params: 300,302
Non-trainable params: 0
_________________________________________________________________


In [7]:
from tensorflow.keras.losses import SparseCategoricalCrossentropy
model.compile(loss=SparseCategoricalCrossentropy(),
       optimizer="adam",
       metrics=["accuracy"])

In [10]:
# tokenize: 
from tensorflow.keras.preprocessing.text import Tokenizer
tok = Tokenizer(num_words=3000)
tok.fit_on_texts(train_df["content"])
# 你可以看一下fit的結果
# tok.word_index
# tok.index_word

In [13]:
# 文章token化: sequence
x_train_seq = tok.texts_to_sequences(train_df["content"])
x_test_seq = tok.texts_to_sequences(test_df["content"])
pd.DataFrame(x_train_seq)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1806,1807,1808,1809,1810,1811,1812,1813,1814,1815
0,8,1,19,523,31,104,115,365,2665.0,2888.0,...,,,,,,,,,,
1,312,1807,755,53,3,369,36,1,308.0,30.0,...,,,,,,,,,,
2,6,3,348,20,459,4,48,386,99.0,382.0,...,,,,,,,,,,
3,1230,215,96,562,3,678,15,1,11.0,19.0,...,,,,,,,,,,
4,10,432,2,216,2,171,635,42,1.0,330.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,440,1947,13,239,1,88,4,29,55.0,35.0,...,,,,,,,,,,
24996,11,6,1085,784,120,41,28,974,1.0,974.0,...,,,,,,,,,,
24997,11,17,66,3,897,672,4,706,297.0,35.0,...,,,,,,,,,,
24998,171,36,1,2592,48,123,572,5,1.0,84.0,...,,,,,,,,,,


In [19]:
# padding and truncating
from tensorflow.keras.preprocessing.sequence import pad_sequences
x_train_pad = pad_sequences(x_train_seq, maxlen=512)
x_test_pad = pad_sequences(x_test_seq, maxlen=512)
pd.DataFrame(x_train_pad)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511
0,0,0,0,0,0,0,0,0,0,0,...,1,1664,10,59,404,199,11,19,104,53
1,0,0,0,0,0,0,0,0,0,0,...,440,2095,10,437,11,1519,256,1992,43,1
2,1984,33,89,356,98,170,4,1,540,1,...,2374,634,1226,5,1,245,1085,43,4,155
3,0,0,0,0,0,0,0,0,0,0,...,215,11,19,20,285,9,1011,5,409,1433
4,0,0,0,0,0,0,0,0,0,0,...,871,553,41,36,1,42,443,2,7,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,0,0,0,0,0,0,0,0,0,0,...,1056,146,45,22,178,5,291,43,339,155
24996,0,0,0,0,0,0,0,0,0,0,...,10,188,819,43,135,10,89,456,41,95
24997,0,0,0,0,0,0,0,0,0,0,...,17,77,127,53,8,1,348,17,1049,512
24998,0,0,0,0,0,0,0,0,0,0,...,1006,21,3,75,194,18,54,437,15,1019


In [16]:
import numpy as np
y_train = np.array(train_df["sentiment"])
y_test = np.array(test_df["sentiment"])
y_train

array([1, 1, 1, ..., 0, 0, 0])

In [None]:
# batch_size: 我看多少筆, 把所有筆的斜率做一個平均, 再調整
# 圖片小(不吃那麼多ram):100-200 圖片大(吃很多ram):10-20
# 60000筆, 200比一次調整  -> 300次調整
# epochs: 整份資料看幾遍(你覺得模型訓練好就停下來)
# validation_split: 切出一部份資料來做模型確認
# epochs + validation: 什麼時候停下來(val_loss平滑的時候就停下來)
# epoch: 60000筆(90%) -> 54000 / 200 -> 270調整
# verbose: 決定log印多少 1(default):進度條 2:沒有進度條, 0:完全不印
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
callbacks = [
    ModelCheckpoint("sentiment.h5", save_best_only=True),
    EarlyStopping(patience=5, restore_best_weights=True)
]
model.fit(x_train_pad,
     y_train,
     batch_size=200,
     epochs=50,
     validation_split=0.1,
     callbacks=callbacks,
     verbose=2)

In [18]:
model.evaluate(x_test_pad, y_test)



[0.2917490005493164, 0.8812400102615356]

In [None]:
layers = [
    Embedding(3001, 100, mask_zero=True),
    GlobalAveragePooling1D(),
    Dense(2, activation="softmax")
]
infer = Sequential(layers)
infer.layers[0].set_weights(model.layers[0].get_weights())
infer.layers[2].set_weights(model.layers[2].get_weights())
infer.summary()

In [25]:
comment = "This film just proves how important was Stan Lee and his ingenious story telling was critical to Marvel.  Marvel just spent millions of dollers to show a dodgeball match between Wanda and the rest of the cast with a Halloween backdrop.  It is no wonder the makers had to rely on over the top VFX as there is absolutely no coherent story to weave.  Money down the drain...dont even watch if someone shows it to you for free...." #@param {type:"string"}
seq = tok.texts_to_sequences([comment])
prob = infer.predict(seq)[0]
trans = ["neg", "pos"]
for t, p in zip(trans, prob):
    print(t, "的機率:", p)

neg 的機率: 0.9771408
pos 的機率: 0.022859199


In [26]:
layers = [
    Embedding(3001, 100, mask_zero=True),
    GlobalAveragePooling1D(),
]
infer = Sequential(layers)
infer.layers[0].set_weights(model.layers[0].get_weights())
infer.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, None, 100)         300100    
                                                                 
 global_average_pooling1d_3   (None, 100)              0         
 (GlobalAveragePooling1D)                                        
                                                                 
Total params: 300,100
Trainable params: 300,100
Non-trainable params: 0
_________________________________________________________________


In [31]:
from scipy.spatial.distance import cosine
comment1 = "horror" #@param {type:"string"}
comment2 = "scary" #@param {type:"string"}
seq1 = tok.texts_to_sequences([comment1])
v1 = infer.predict(seq1)[0]
seq2 = tok.texts_to_sequences([comment2])
v2 = infer.predict(seq2)[0]
# 越趨近於1越相似
print("相似度:", 1 - cosine(v1, v2))

相似度: 0.8046953082084656
