In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
#文字嵌入法(word embeddings) 又稱 密集文字向量(dense word vector)
#one-hot encoding 向量: 稀疏、高維度、強制編碼
#word embeddings 向量: 密集、低維度、從資料中學習
#建立文字嵌入向量: 1.訓練模型時(ex: 文檔分類、情感預測)，使用Embedding layer同時學習文字嵌入向量。此方法與學習神經網路權重的方式相同。
#               2.用其他機器學習模型已計算(訓練好)的文字嵌入向量。此方式稱為"預先訓練的文字嵌入法(pretrained word embeddings)"

In [3]:
#用Keras的Embedding Layer實作文字嵌入法
from tensorflow import keras
from tensorflow.keras import layers, models

In [4]:
# Create a standalone Embedding layer.
#   input_dim  - number of possible tokens (1 + the maximum word index)
#   output_dim - dimensionality of each embedding vector the layer outputs
embedding_layer = layers.Embedding(input_dim=1000, output_dim=64)

In [5]:
# Load IMDB and arrange it into a form suitable for the Embedding layer.
# Everything else in this notebook comes from `tensorflow.keras`, so the
# dataset loader is imported from there too rather than from the standalone
# `keras` package; this avoids mixing two Keras distributions.
from tensorflow.keras.datasets import imdb
from tensorflow.keras import preprocessing

In [6]:
max_features = 10000  # number of most-frequent words kept as features
maxlen = 20  # only the last 20 words of each review are used (they must be among the max_features most frequent words)

# Load the reviews as lists of integers (word indices).
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(x_train.shape)  # (25000,): 25,000 reviews, each review being one sample

# Convert the integer lists into 2D integer tensors of shape (samples, maxlen).
pad_to_maxlen = preprocessing.sequence.pad_sequences
x_train = pad_to_maxlen(x_train, maxlen=maxlen)
x_test = pad_to_maxlen(x_test, maxlen=maxlen)
print(x_train.shape)  # (25000, 20): only the last 20 words of every review remain

print(x_train[0])

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
(25000,)
(25000, 20)
[  65   16   38 1334   88   12   16  283    5   16 4472  113  103   32
   15   16 5345   19  178   32]


In [7]:
# Build a classifier whose Embedding layer learns the word vectors during training.
model = models.Sequential()
# The embedding table size is taken from max_features (instead of a repeated
# literal 10000) so it always matches the vocabulary cap passed to
# imdb.load_data. input_length fixes the sequence length so the embeddings
# can be flattened afterwards. Output shape: (samples, maxlen, 8), where 8 is
# the embedding dimensionality.
model.add(layers.Embedding(max_features, 8, input_length=maxlen))
# Flatten the 3D embedding output into a 2D tensor of shape (samples, maxlen * 8).
model.add(layers.Flatten())
# Binary classifier on top; output shape (samples, 1).
model.add(layers.Dense(1, activation='sigmoid'))

2022-08-23 03:20:09.465168: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


In [8]:
# Configure training: binary cross-entropy loss, RMSprop optimizer, and the 'acc' metric.
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['acc'])

In [9]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 20, 8)             80000     
_________________________________________________________________
flatten (Flatten)            (None, 160)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 161       
Total params: 80,161
Trainable params: 80,161
Non-trainable params: 0
_________________________________________________________________


In [10]:
# Train for 10 epochs with mini-batches of 32 samples, holding out 20% of the
# training data for validation. The returned object is bound to the correctly
# spelled name `history`.
history = model.fit(x_train, y_train,
                    epochs=10,
                    batch_size=32,
                    validation_split=0.2)
# The original cell bound the result to the misspelled name `histiry`; keep it
# as an alias so any later cell that still refers to it continues to work.
histiry = history

Epoch 1/10


2022-08-23 03:20:09.686295: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [11]:
#僅展平嵌入向量的序列資料，並在頂部訓練單個密集層會導致模型分別處理輸入序列資料(評論)中的每個文字，而未考慮文字關係與句子結構。
#因此需在嵌入向量序列資料頂部加上循環層或1D卷積層，以學習將每個序列資料做為一個整體考慮在內的特徵。

In [12]:
#使用預先訓練的文字嵌入向量(Pretrained word embeddings)
#當資料不足時，可使用預先訓練的文字嵌入向量，但其需具備涵蓋語言結構的普遍特性
#無足夠資料用於學習真正的特徵時，拿通用的特徵來使用
#ex: Word2vec 演算法, GloVe 全域向量文字表示法