<a href="https://colab.research.google.com/github/Bao3333/MachineLearning/blob/main/Sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf

# Download the Large Movie Review (IMDB) archive into the Keras cache and
# unpack it next to the archive. `dataset` holds the path of the downloaded
# .tar.gz file (not the extracted folder).
# The archive is fetched over HTTPS so the file that gets auto-extracted
# cannot be tampered with in transit.
dataset = tf.keras.utils.get_file(
    fname="aclImdb.tar.gz",
    origin="https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
    extract=True,
)

Downloading data from http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


In [None]:
import glob
# Show where Keras stored the downloaded archive.
print(dataset)

/root/.keras/datasets/aclImdb.tar.gz


In [None]:
import os
import glob
import pandas as pd
def getdata(mid):
    """Load one split of the extracted IMDB reviews into a DataFrame.

    Args:
        mid: Name of the sub-directory under ``aclImdb`` to read
            (the notebook uses "train" and "test").

    Returns:
        pandas.DataFrame with two columns: ``content`` (the raw text of each
        review file) and ``sentiment`` (1 for files under ``pos``, 0 for files
        under ``neg``). All positive rows come first, followed by all
        negative rows.
    """
    split_dir = os.path.join(os.path.dirname(dataset), "aclImdb", mid)
    # glob yields files in arbitrary, filesystem-dependent order; sorting
    # makes the row order reproducible across machines and runs.
    posfn = sorted(glob.glob(os.path.join(split_dir, "pos", "*")))
    negfn = sorted(glob.glob(os.path.join(split_dir, "neg", "*")))
    contents = []
    for fn in posfn + negfn:
        with open(fn, encoding="utf-8") as f:
            contents.append(f.read())
    return pd.DataFrame({
        "content": contents,
        "sentiment": [1] * len(posfn) + [0] * len(negfn),
    })
# Read both splits of the corpus into DataFrames (train first, then test).
train_df, test_df = map(getdata, ("train", "test"))

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, GlobalAveragePooling1D

layers = [
    # input_dim=3001: 3000 word indices + 1 padding index.
    # Embedding parameters: 3001 (rows) * 100 (weights per row) = 300100.
    # Weights actually adjusted for one review: 512 * 100 = 51200.
    # input_length=512 is the number of tokens kept per review;
    # mask_zero=True means index 0 fills in missing tokens and is not
    # counted when the result is computed.
    Embedding(input_dim=3001, output_dim=100, input_length=512, mask_zero=True),
    # Collapse the per-token vectors of a review into one 100-d vector.
    GlobalAveragePooling1D(),
    # Two outputs (neg, pos) as probabilities.
    Dense(units=2, activation="softmax"),
]
model = Sequential(layers)
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 512, 100)          300100    
                                                                 
 global_average_pooling1d_1   (None, 100)              0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dense_1 (Dense)             (None, 2)                 202       
                                                                 
Total params: 300,302
Trainable params: 300,302
Non-trainable params: 0
_________________________________________________________________


In [None]:
from tensorflow.python import metrics
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
# Integer class labels (0/1) with a 2-way softmax output, hence the sparse
# categorical cross-entropy loss.
optimizer = Adam()
loss_fn = SparseCategoricalCrossentropy()
model.compile(optimizer=optimizer, loss=loss_fn, metrics=["accuracy"])

In [None]:
# Step 1 - tokenize: turn each word into an integer index.
from tensorflow.keras.preprocessing.text import Tokenizer
# num_words=3000 caps the vocabulary used when texts are later converted to
# sequences; the word index is learned from the training reviews only.
tok = Tokenizer(num_words=3000)
tok.fit_on_texts(train_df["content"])

In [None]:
# Then replace the words of every review with their token indices.
x_train_seq, x_test_seq = (
    tok.texts_to_sequences(frame["content"]) for frame in (train_df, test_df)
)
# Preview the variable-length sequences as a table (short reviews show NaN).
pd.DataFrame(x_train_seq)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1806,1807,1808,1809,1810,1811,1812,1813,1814,1815
0,8,159,779,1794,2847,6,32,4,3.0,144.0,...,,,,,,,,,,
1,2006,15,2471,2553,2225,159,179,1264,542.0,14.0,...,,,,,,,,,,
2,10,66,112,107,3,1289,17,363,10.0,66.0,...,,,,,,,,,,
3,11,19,2640,22,8,36,1,76,137.0,85.0,...,,,,,,,,,,
4,931,37,1,88,2,1241,22,97,123.0,166.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,22,123,76,12,5,40,513,32,531.0,39.0,...,,,,,,,,,,
24996,63,9,20,11,28,172,1258,6,1669.0,3.0,...,,,,,,,,,,
24997,4,1088,7,7,11,215,3,17,118.0,114.0,...,,,,,,,,,,
24998,11,17,8,242,13,1161,1092,42,753.0,16.0,...,,,,,,,,,,


In [None]:
# Step 2 - truncate long reviews and pad short ones to a fixed length
# (both truncation and padding can be applied at the front or at the back;
# the Keras defaults, spelled out here, act on the front).
from tensorflow.keras.preprocessing.sequence import pad_sequences
x_train_pad = pad_sequences(x_train_seq, maxlen=512, padding="pre", truncating="pre")
x_test_pad = pad_sequences(x_test_seq, maxlen=512, padding="pre", truncating="pre")
pd.DataFrame(x_train_pad)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511
0,0,0,0,0,0,0,0,0,0,0,...,7,7,422,1600,156,1,2104,691,332,348
1,0,0,0,0,0,0,0,0,0,0,...,14,424,2951,825,15,1693,1275,8,17,448
2,0,0,0,0,0,0,0,0,0,0,...,48,3,17,55,5,377,58,1079,17,1588
3,0,0,0,0,0,0,0,0,0,0,...,28,40,1383,131,151,1802,705,43,4,155
4,0,0,0,0,0,0,0,0,0,0,...,353,18,6,478,718,15,360,4,29,2078
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,0,0,0,0,0,0,0,0,0,0,...,22,77,27,484,440,1290,22,1,372,248
24996,0,0,0,0,0,0,0,0,0,0,...,95,25,218,534,5,373,3,52,1330,19
24997,0,0,0,0,0,0,0,0,0,0,...,18,538,2,2624,8,3,1073,93,9,6
24998,0,0,0,0,0,0,0,0,0,0,...,22,178,5,780,7,7,297,238,339,1465


In [None]:
import numpy as np
# Sentiment labels (1 = pos, 0 = neg) as NumPy arrays for Keras.
y_train, y_test = (
    np.array(frame["sentiment"]) for frame in (train_df, test_df)
)

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

callbacks = [
    # Stop after 5 epochs without improvement of the monitored validation
    # metric and roll back to the best weights seen.
    EarlyStopping(patience=5, restore_best_weights=True),
    # Keep only the best model on disk.
    ModelCheckpoint("imdb.h5", save_best_only=True)
]

# getdata() returns every positive review followed by every negative review,
# and Keras takes `validation_split` from the LAST rows of x/y before any
# shuffling. Without reordering, the validation set would contain negative
# reviews only (and the training set would be skewed towards positives).
# Shuffle once, with a fixed seed, so the split is representative and
# reproducible.
shuffle_order = np.random.default_rng(42).permutation(len(x_train_pad))

model.fit(x_train_pad[shuffle_order],
          y_train[shuffle_order],
          batch_size=200,
          epochs=50,
          callbacks=callbacks,
          validation_split=0.1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50


<keras.callbacks.History at 0x7fbfaaf501d0>

In [None]:
# Report [loss, accuracy] on the test split, which was not used for training.
model.evaluate(x_test_pad, y_test)



[0.2959665358066559, 0.8807600140571594]

In [None]:
# Classify a single comment typed by the user.
comment = input("comment:")
comment_seq = tok.texts_to_sequences([comment])
if not comment_seq[0]:
    # No word of the comment is in the tokenizer vocabulary, so the padded
    # input would be all zeros. With mask_zero=True every time step is then
    # masked and the average pooling yields NaN probabilities.
    print("No in-vocabulary words found in the comment; cannot predict.")
else:
    comment_pad = pad_sequences(comment_seq, maxlen=512)
    pre = model.predict(comment_pad)[0]
    # Index 0 is the negative class, index 1 the positive class
    # (matches the 0/1 labels produced by getdata).
    labels = ["neg", "pos"]
    for l, p in zip(labels, pre):
        print(l, "的機率:", p)

comment:"neg"
neg 的機率: nan
pos 的機率: nan


In [None]:
# Compare sentences by turning them into vectors: this model stops after the
# pooling layer, so its output is the averaged embedding of the input.
layers = [
    Embedding(input_dim=3001, output_dim=100, mask_zero=True),
    GlobalAveragePooling1D(),
]
partial = Sequential(layers)
partial.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, None, 100)         300100    
                                                                 
 global_average_pooling1d_2   (None, 100)              0         
 (GlobalAveragePooling1D)                                        
                                                                 
Total params: 300,100
Trainable params: 300,100
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Copy the trained embedding weights from the classifier into the
# vector-only model so both embed words identically.
trained_embedding, extractor_embedding = model.layers[0], partial.layers[0]
w = trained_embedding.get_weights()
extractor_embedding.set_weights(w)

In [None]:
# Use cosine similarity to compare the averaged embeddings of two sentences.
from scipy.spatial.distance import cosine

sentence1 = "great movie" #@param {type:"string"}
sentence2 = "recommend" #@param {type:"string"}

s1, s2 = tok.texts_to_sequences([sentence1, sentence2])
if not s1 or not s2:
    # A sentence made only of out-of-vocabulary words becomes an empty
    # sequence; there is nothing to average, so no vector can be computed.
    print("One of the sentences has no in-vocabulary words; similarity is undefined.")
else:
    # Feed each sentence as an explicit batch of one: shape (1, seq_len).
    v1 = partial.predict(np.array([s1]))[0]
    v2 = partial.predict(np.array([s2]))[0]
    # scipy's `cosine` is a distance, so similarity = 1 - distance.
    print("相似度:", 1 - cosine(v1, v2))

相似度: 0.9878920912742615
