## 專案流程
### - 資料讀取(keras built-in datasets)
### - 資料處理(pad)
### - 模型建立
### - 超參數調整

In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [2]:
# Data loading
vocab_size = 20000          # vocabulary size: only 20000 words are considered
max_len = 200               # maximum review length (in tokens)

# Use the constants above instead of repeating the literals so the dataset
# and the embedding table built later cannot silently disagree.
# NOTE(review): the shapes printed below have different train/test sample
# counts, which suggests `maxlen` drops reviews longer than max_len rather
# than truncating them -- confirm against the `imdb.load_data` documentation.
(x_train, y_train), (x_test, y_test) = keras.datasets.imdb.load_data(num_words=vocab_size, maxlen=max_len)
x_train.shape, y_train.shape, x_test.shape, y_test.shape

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])


((14244,), (14244,), (14669,), (14669,))

In [7]:
# Inspect the first training sample: a plain Python list of integer word indices.
x_train[0]

[1,
 194,
 1153,
 194,
 8255,
 78,
 228,
 5,
 6,
 1463,
 4369,
 5012,
 134,
 26,
 4,
 715,
 8,
 118,
 1634,
 14,
 394,
 20,
 13,
 119,
 954,
 189,
 102,
 5,
 207,
 110,
 3103,
 21,
 14,
 69,
 188,
 8,
 30,
 23,
 7,
 4,
 249,
 126,
 93,
 4,
 114,
 9,
 2300,
 1523,
 5,
 647,
 4,
 116,
 9,
 35,
 8163,
 4,
 229,
 9,
 340,
 1322,
 4,
 118,
 9,
 4,
 130,
 4901,
 19,
 4,
 1002,
 5,
 89,
 29,
 952,
 46,
 37,
 4,
 455,
 9,
 45,
 43,
 38,
 1543,
 1905,
 398,
 4,
 1649,
 26,
 6853,
 5,
 163,
 11,
 3215,
 10156,
 4,
 1153,
 9,
 194,
 775,
 7,
 8255,
 11596,
 349,
 2637,
 148,
 605,
 15358,
 8003,
 15,
 123,
 125,
 68,
 2,
 6853,
 15,
 349,
 165,
 4362,
 98,
 5,
 4,
 228,
 9,
 43,
 2,
 1157,
 15,
 299,
 120,
 5,
 120,
 174,
 11,
 220,
 175,
 136,
 50,
 9,
 4373,
 228,
 8255,
 5,
 2,
 656,
 245,
 2350,
 5,
 4,
 9837,
 131,
 152,
 491,
 18,
 2,
 32,
 7464,
 1212,
 14,
 9,
 6,
 371,
 78,
 22,
 625,
 64,
 1382,
 9,
 8,
 168,
 145,
 23,
 4,
 1690,
 15,
 16,
 4,
 1355,
 5,
 28,
 6,
 52,
 154,
 462,
 33,


In [8]:
# Label of the first training sample.
y_train[0]

0

In [9]:
# Pad every review to the same length (max_len) so the samples can be fed to the model.
# padding='post' places the padding after the real tokens.
pad_options = dict(maxlen=max_len, padding='post')

x_train = keras.preprocessing.sequence.pad_sequences(x_train, **pad_options)
x_test = keras.preprocessing.sequence.pad_sequences(x_test, **pad_options)

In [11]:
# After padding both splits are 2-D arrays of shape (num_samples, max_len).
x_train.shape, x_test.shape

((14244, 200), (14669, 200))

In [26]:
# Encoder, PositionalEmbedding
# 細節可以看Attention is all you need論文

class TransformerBlock(layers.Layer):
    """Single Transformer encoder block: self-attention followed by a feed-forward network.

    Each of the two sub-layers is followed by dropout, a residual (shortcut)
    connection and layer normalization, in that order.

    Args:
        embed_dim: Width of the last axis of the input. It is passed as
            ``key_dim`` to ``MultiHeadAttention`` and used as the output width
            of the feed-forward network so the residual sums are
            shape-compatible.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
        rate: Dropout rate applied after each sub-layer.
    """

    def __init__(self, embed_dim=32, num_heads=8, ff_dim=32, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential([
            layers.Dense(units=ff_dim, activation='relu'),
            # The projection back to embed_dim is linear, as in
            # "Attention Is All You Need": FFN(x) = max(0, x W1 + b1) W2 + b2.
            # A ReLU here would force the residual update to be non-negative.
            layers.Dense(units=embed_dim),
        ])
        self.drop1 = layers.Dropout(rate)
        self.drop2 = layers.Dropout(rate)
        self.layer_norm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layer_norm2 = layers.LayerNormalization(epsilon=1e-6)

    def call(self, inputs, training=None):
        """Apply the block to ``inputs``.

        Args:
            inputs: Tensor whose last axis has size ``embed_dim``.
            training: Optional flag forwarded to the dropout layers. ``None``
                lets Keras decide from the calling context.

        Returns:
            Tensor with the same shape as ``inputs``.
        """
        # Self-attention: inputs are passed as query, value and key.
        att_output = self.att(inputs, inputs, inputs)
        att_output = self.drop1(att_output, training=training)
        out1 = self.layer_norm1(att_output + inputs)    # residual shortcut
        ffn_output = self.ffn(out1)
        ffn_output = self.drop2(ffn_output, training=training)
        out2 = self.layer_norm2(ffn_output + out1)      # residual shortcut
        return out2


class PositionalWordEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        vocab_size: Number of rows of the token embedding table.
        max_len: Number of rows of the position embedding table, i.e. the
            longest sequence the layer can embed.
        embed_dim: Output width of both embedding tables.
    """

    def __init__(self, vocab_size, max_len, embed_dim):
        super().__init__()
        # `input_length` is not passed: it is deprecated in newer Keras
        # releases and is not needed, because the sequence length is read
        # from the input tensor in `call`.
        self.pos_embedding = layers.Embedding(input_dim=max_len, output_dim=embed_dim)
        self.word_embedding = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)

    def call(self, inputs):
        """Embed integer token ids and add the matching position vectors.

        Args:
            inputs: Integer tensor of token ids; the last axis is the
                sequence axis.

        Returns:
            ``position_embedding + token_embedding``; the position vectors
            are broadcast over the leading axes of the token embeddings.
        """
        # Build position ids 0 .. seq_len-1 from the runtime input shape.
        seq_len = tf.shape(inputs)[-1]
        position_ids = tf.range(start=0, limit=seq_len, delta=1)
        position_vectors = self.pos_embedding(position_ids)
        token_vectors = self.word_embedding(inputs)

        return position_vectors + token_vectors

[tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model?version=nightly)

In [38]:
class Transformer(tf.keras.Model):
    """Transformer-encoder classifier configured from a ``params`` dict.

    Pipeline: positional + word embedding -> stack of ``TransformerBlock``s ->
    global average pooling over the sequence axis -> dropout -> dense ->
    dropout -> output dense layer.

    Args:
        params: Mapping with the keys ``vocab_size``, ``max_len``,
            ``embed_dim``, ``num_heads``, ``ff_dim``, ``rate``, ``layers``,
            ``units``, ``units_out`` and ``activation``.
    """

    def __init__(self, params):
        super().__init__()
        self.embedding = PositionalWordEmbedding(
            params['vocab_size'], params['max_len'], params['embed_dim']
        )

        # Stack params['layers'] encoder blocks one after another.
        encoder_stack = []
        for _ in range(params['layers']):
            encoder_stack.append(
                TransformerBlock(
                    params['embed_dim'], params['num_heads'], params['ff_dim'], params['rate']
                )
            )
        self.transformer_block = keras.Sequential(encoder_stack)

        # Despite its name, `flatten` averages over the sequence axis.
        self.flatten = layers.GlobalAveragePooling1D()
        self.drop1 = layers.Dropout(params['rate'])
        self.dense1 = layers.Dense(units=params['units'], activation='relu')
        self.drop2 = layers.Dropout(params['rate'])
        self.dense_out = layers.Dense(units=params['units_out'], activation=params['activation'])

    def call(self, inputs):
        """Run ``inputs`` through every stage in order and return the output layer's result."""
        stages = (
            self.embedding,
            self.transformer_block,
            self.flatten,
            self.drop1,
            self.dense1,
            self.drop2,
        )
        hidden = inputs
        for stage in stages:
            hidden = stage(hidden)
        return self.dense_out(hidden)

In [39]:
## Hyper-parameter setup and training

params = {
    # Reuse the preprocessing constants so the embedding tables always match
    # the data that was loaded and padded above.
    'max_len': max_len,
    'vocab_size': vocab_size,
    'embed_dim': 32,
    'num_heads': 2,
    'ff_dim': 32,
    'rate': 0.1,
    'units': 16,
    'units_out': 2,
    'activation': 'softmax',
    'layers': 1,
}


model = Transformer(params)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['acc'])
# NOTE(review): the test split doubles as validation data here, so the
# reported validation metrics are not an independent held-out estimate.
model.fit(x_train, y_train, batch_size=128, epochs=5, validation_data=(x_test, y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f601cf1f450>

In [40]:
# Print the layer-by-layer overview of the model (output shapes and parameter counts).
model.summary()

Model: "transformer_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
positional_word_embedding_2  multiple                  646400    
_________________________________________________________________
sequential_17 (Sequential)   (None, 200, 32)           10656     
_________________________________________________________________
global_average_pooling1d_8 ( multiple                  0         
_________________________________________________________________
dropout_34 (Dropout)         multiple                  0         
_________________________________________________________________
dense_34 (Dense)             multiple                  528       
_________________________________________________________________
dropout_35 (Dropout)         multiple                  0         
_________________________________________________________________
dense_35 (Dense)             multiple               

In [None]:
# Reference only (kept commented out): a Functional-API variant of a similar model.
# embed_dim = 32  # Embedding size for each token
# num_heads = 2  # Number of attention heads
# ff_dim = 32  # Hidden layer size in feed forward network inside transformer

# inputs = layers.Input(shape=(max_len,))
# embedding_layer = PositionalWordEmbedding(vocab_size, max_len, embed_dim)
# x = embedding_layer(inputs)
# transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
# x = transformer_block(x)
# x = layers.GlobalAveragePooling1D()(x)
# x = layers.Dropout(0.1)(x)
# x = layers.Dense(20, activation="relu")(x)
# x = layers.Dropout(0.1)(x)
# outputs = layers.Dense(2, activation="softmax")(x)

# model = keras.Model(inputs=inputs, outputs=outputs)