In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from dataclasses import dataclass
import pandas as pd
import numpy as np
import glob
import re
from pprint import pprint

In [2]:
@dataclass
class Config:
    MAX_LEN = 256
    BATCH_SIZE = 128
    LR = 0.001
    VOCAB_SIZE = 30000
    EMBED_DIM = 128
    NUM_HEAD = 8  # used in bert model
    FF_DIM = 128  # used in bert model
    NUM_LAYERS = 12


config = Config()
config

Config()

In [3]:
!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz        # 下載
!tar -xf aclImdb_v1.tar.gz                                                      # 解壓縮

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 80.2M  100 80.2M    0     0  54.5M      0  0:00:01  0:00:01 --:--:-- 54.5M


In [4]:
def get_text_list_from_files(files):
    text_list = []
    for name in files:
        with open(name) as f:
            for line in f:
                text_list.append(line)
    return text_list


def get_data_from_text_files(folder_name):
    pos_files = glob.glob('aclImdb/' + folder_name + "/pos/*.txt")
    pos_texts = get_text_list_from_files(pos_files)
    neg_files = glob.glob('aclImdb/' + folder_name + '/neg/*.txt')
    neg_texts = get_text_list_from_files(neg_files)
    df = pd.DataFrame(
        {
            'review': pos_texts + neg_texts,
            'sentiment': [0] * len(pos_texts) + [1] * len(neg_texts)
        }
    )
    df = df.sample(len(df)).reset_index(drop=True)      # shuffle + reset_index
    return df

train_df = get_data_from_text_files('train')
test_df = get_data_from_text_files('test')

all_data = train_df.append(other=test_df)               # df.append(other): 直接append到後面 
all_data.shape

(50000, 2)

In [5]:
def custom_standarization(input_data):
    """
        客製化的處理text資料, 用於text layer。
        處理:
            小寫、去除html tag、去除標點符號
    """
    lowercase = tf.strings.lower(input=input_data)
    stripped_html = tf.strings.regex_replace(
        input=lowercase,
        pattern="<br />",
        rewrite=' '
    )
    return tf.strings.regex_replace(
        input=stripped_html,
        pattern=re.escape(pattern="!#$%&'()*+,-./:;<=>?@\^_`{|}~"),
        rewrite=''
    )


def get_vectorize_layer(texts, vocab_size, max_seq, special_tokens=["MASK"]):
    """Build Text vectorization layer

    Args:
      texts (list): List of string i.e input texts
      vocab_size (int): vocab size
      max_seq (int): Maximum sequence lenght.
      special_tokens (list, optional): List of special tokens. Defaults to ['[MASK]'].

    Returns:
        layers.Layer: Return TextVectorization Keras Layer
    """
    vectorizer_layer = TextVectorization(
        max_tokens=vocab_size,
        standardize=custom_standarization,
        output_mode='int',
        output_sequence_length=max_seq
    )
    vectorizer_layer.adapt(texts)           # fit

    # 插入mask token
    vocab = vectorizer_layer.get_vocabulary()   # 30000
    vocab = vocab[2 : vocab_size - len(special_tokens)] + ["[mask]"]    # 0:2 ---> '', '[UNK]', 不要這兩個why?
    vectorizer_layer.set_vocabulary(vocab)      # 29998
    return vectorizer_layer


# 文字處理layer init
vectorize_layer = get_vectorize_layer(
    all_data.review.values.tolist(),
    config.VOCAB_SIZE,
    config.MAX_LEN,
    special_tokens=["[mask]"],
)

In [6]:
# MASK TOEKN看看

mask_token_id = vectorize_layer([['[mask]']]).numpy()[0][0]   # return (1, max_len)
mask_token_id

29999

In [7]:
def encode(texts):
    """
        將文字輸入做處理、轉換成模型可處理的vector，也可將其變成end-to-end model，
        此方法是先在資料集處理。
    """
    encoded_texts = vectorize_layer(texts)
    return encoded_texts.numpy()

In [8]:
encode(train_df.review.values)      # 輸入轉換成向量

array([[   9,  599,   10, ...,    0,    0,    0],
       [   9,   25,    6, ...,    0,    0,    0],
       [   3,  185,  291, ...,    0,    0,    0],
       ...,
       [ 856,   34,    5, ...,    0,    0,    0],
       [3150,    9,   80, ...,    0,    0,    0],
       [   9,  197,   10, ...,    0,    0,    0]])

In [9]:
vectorize_layer.get_vocabulary()[:3]

['', '[UNK]', 'the']

In [10]:
# masked 總數估計

25000 * 256 * 0.15

960000.0

### masked inputs and labels 釋例

In [11]:
### masked inputs and labels 釋例

encoded_texts = encode(train_df.review.values)  # (25000, 256) ---> 25000筆, 句子長度為256, 也就是256個word。
# print(encoded_texts.shape)

input_mask = np.random.rand(*encoded_texts.shape) < 0.15        # (25000, 256) ---> boolean matrix, 根據論文mask 15%
input_mask[encoded_texts <= 2] = False                            # ['', '[UNK]', 'the'] 是不去mask的。

labels = (-1) * np.ones(encoded_texts.shape, dtype=int)         # ---> 真實mask的labels, -1代表忽略
labels[input_mask] = encoded_texts[input_mask]                  # 給予真實的word token, 真正masked的資料之答案

# print(labels[input_mask].shape)                                 # 586446, 可能每次會不一樣, 根據seed, 代表的是: 在訓練語言模型時, 會預測的Word總數。
#### 上面完成了mask資料的y部分(預測的地方)

#### 輸入部分
encoded_texts_masked = np.copy(encoded_texts)       # 複製, 要修改
input_mask_2mask = input_mask & (np.random.rand(*encoded_texts.shape) < 0.90)       # (25000, 256) 的boolean matrix, 根據論文15%裡面有90%使用[mask], 
encoded_texts_masked[input_mask_2mask] = mask_token_id          # 將其word token改成[mask]的token, 也就是29999, 數量會是上面596446的大概9成: 528488

# 剩下10%去隨機token
input_mask_2random = input_mask * (np.random.rand(*encoded_texts.shape) < 0.10)
encoded_texts_masked[input_mask_2random] = np.random.randint(
    low=3,
    high=mask_token_id,
    size=input_mask_2random.sum()
)

## 最後一步: 樣本權重設定, 一般是數值weight, 0代表忽略
sample_weights = np.ones(labels.shape)              # (25000, 256)
sample_weights[labels == -1] = 0

y_labels = np.copy(encoded_texts)                   # (25000, 256), 原始句子, 也就是真實答案

#### Masked之後的輸入token


In [12]:
encoded_texts_masked, encoded_texts_masked.shape

(array([[    9,   599,    10, ...,     0,     0,     0],
        [    9,    25, 29999, ...,     0,     0,     0],
        [    3,   185,   291, ...,     0,     0,     0],
        ...,
        [  856,    34,     5, ...,     0,     0,     0],
        [ 3150,     9,    80, ...,     0,     0,     0],
        [29999,   197, 29999, ...,     0,     0,     0]]), (25000, 256))

#### 真實labels, 原始句子

In [13]:
y_labels, y_labels.shape

(array([[   9,  599,   10, ...,    0,    0,    0],
        [   9,   25,    6, ...,    0,    0,    0],
        [   3,  185,  291, ...,    0,    0,    0],
        ...,
        [ 856,   34,    5, ...,    0,    0,    0],
        [3150,    9,   80, ...,    0,    0,    0],
        [   9,  197,   10, ...,    0,    0,    0]]), (25000, 256))

#### 樣本權重

In [14]:
sample_weights, sample_weights.shape

(array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [1., 0., 1., ..., 0., 0., 0.]]), (25000, 256))

In [15]:
def get_masked_input_and_labels(encoded_texts):
    """

    """
    # 準備label(target)
    # 根據BERT論文, 將15%的資料做masking
    inp_mask = np.random.rand(*encoded_texts.shape) < 0.15      # boolean matrix
    inp_mask[encoded_texts <= 2] = False                        # <=2的token是特殊token('', '[UNK]'),不mask, 但這邊好像把'the'也不mask了
    # 將目標預設為-1,  代表忽略
    labels = -1 * np.ones(encoded_texts.shape, dtype=int)
    # set labels for masked tokens
    labels[inp_mask] = encoded_texts[inp_mask]                  # 這就是被mask的真實label, 其餘為-1是忽略的

    # 準備輸入
    encoded_texts_masked = np.copy(encoded_texts)
    # 根據BERT論文, 將15%將要mask的資料裡面, 90%去真正使用[mask]
    inp_mask_2mask = inp_mask & (np.random.rand(*encoded_texts.shape) < 0.90)
    encoded_texts_masked[inp_mask_2mask] = mask_token_id        # 真正mask的資料, x_data

    # 剩下10%去random token
    inp_mask_2random = inp_mask_2mask & (np.random.rand(*encoded_texts.shape) < 0.10)
    encoded_texts_masked[inp_mask_2random] = np.random.randint(
        low=3, high=mask_token_id, size=inp_mask_2random.sum()
    )       # token(3~mask_token_id(不包含)), 都可能

    # 準備樣本權重去pass to fit() method
    sample_weights = np.ones(labels.shape)
    sample_weights[labels == -1] = 0            # 忽略的權重會直接變成0, 就不會更新該樣本權重

    y_labels = np.copy(encoded_texts)           # y_label, 真實答案

    return encoded_texts_masked, y_labels, sample_weights

In [16]:
# 訓練資料準備

# 這是語義分析的資料
x_train = encode(train_df.review.values)
y_train = train_df.sentiment.values
train_classifier_ds = (
    tf.data.Dataset.from_tensor_slices(
        tensors=(x_train, y_train)
    ).shuffle(1000).batch(config.BATCH_SIZE)
)


x_test = encode(test_df.review.values)
y_test = test_df.sentiment.values
test_classifier_ds = tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(
    config.BATCH_SIZE
)

# Build dataset for end to end model input (will be used at the end)
test_raw_classifier_ds = tf.data.Dataset.from_tensor_slices(
    (test_df.review.values, y_test)
).batch(config.BATCH_SIZE)


# MLM 訓練資料
x_all_review = encode(all_data.review.values)
x_masked_train, y_masked_labels, sample_weights = get_masked_input_and_labels(
    x_all_review
)

mlm_ds = tf.data.Dataset.from_tensor_slices(
    (x_masked_train, y_masked_labels, sample_weights)
).shuffle(1000).batch(config.BATCH_SIZE)

### BERT model建立

In [17]:
def bert_module(query, key, value, i):
    """
        Encoder block
    """
    attention_output = layers.MultiHeadAttention(
        num_heads=config.NUM_HEAD,
        key_dim=config.EMBED_DIM // config.NUM_HEAD,
        name=f'encoder_{i}/multiheadattention'
    )(query, key, value)
    attention_output = layers.Dropout(rate=0.1, name=f'encoder_{i}/att_dropout')(attention_output)
    attention_output = layers.LayerNormalization(
        epsilon=1e-1, name=f'encoder_{i}/att_layernormalization'
    )(query + attention_output)                 # short-cut

    # ffn
    ffn = keras.Sequential([
        layers.Dense(config.FF_DIM, activation='relu'),
        layers.Dense(config.EMBED_DIM),          
    ], name=f"encoder_{i}/ffn")
    ffn_output = ffn(attention_output)
    ffn_output = layers.Dropout(0.1, name=f"encoder_{i}/ffn_dropout")(
        ffn_output
    )
    sequence_output = layers.LayerNormalization(
        epsilon=1e-6, name=f"encoder_{i}/ffn_layernormalization"
    )(attention_output + ffn_output)            # short-cut
    return sequence_output


def get_pos_encoding_matrix(max_len, d_emb):
    """
        初始化的參數, 放入postional embedding讓其繼續學習。
    """
    pos_enc = np.array(
        [
            [pos / np.power(10000, 2 * (j // 2) / d_emb) for j in range(d_emb)]
            if pos != 0
            else np.zeros(d_emb)
            for pos in range(max_len)
        ]
    )
    pos_enc[1:, 0::2] = np.sin(pos_enc[1:, 0::2])  # dim 2i
    pos_enc[1:, 1::2] = np.cos(pos_enc[1:, 1::2])  # dim 2i+1
    return pos_enc


loss_fn = keras.losses.SparseCategoricalCrossentropy(
    reduction=tf.keras.losses.Reduction.NONE
)
loss_tracker = tf.keras.metrics.Mean(name="loss")       # : Computes the (weighted) mean of the given values.

#### [sample_weight](https://keras.io/zh/models/model/)
- 就是在說訓練中, 樣本權重會使用、注意哪一些，常用在序列相關的資料中!mask data也是!
#### [Model.fit](https://www.tensorflow.org/guide/keras/customizing_what_happens_in_fit?hl=zh-cn#%E6%94%AF%E6%8C%81_sample_weight_%E5%92%8C_class_weight)

In [18]:
# MLM 模型

class MaskedLanguageModel(tf.keras.Model):
    def train_step(self, inputs):
        # override 訓練步驟, mode.fit()的客製化
        if len(inputs) == 3:
            features, labels, sample_weight = inputs
        else:
            features, labels = inputs
            sample_weight = None
        
        # tf梯度紀錄, forward
        with tf.GradientTape() as tape:
            predictions =  self(features, training=True)        # 前面建好的model layers
            loss = loss_fn(
                y_true=labels,
                y_pred=predictions,
                sample_weight=sample_weight                     # 重點之一, model.compile也可以設定, 但不知道效果一樣嗎?
            )
        
        # 梯度計算, backward
        trainable_vars = self.trainable_variables               # Model的模型訓練參數
        gradients = tape.gradient(loss, trainable_vars)

        # 梯度更新
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))

        # 計算model metrics
        loss_tracker.update_state(loss, sample_weight=sample_weight)    # 重點之一, metrics設定那些才列入計算

        # Return a dict mapping metric names to current value
        return {"loss": loss_tracker.result()}          # 平常使用keras看到的高階API, return dict(metrics: value)
    

    @property
    def metrics(self):
        # We list our `Metric` objects here so that `reset_states()` can be
        # called automatically at the start of each epoch
        # or at the start of `evaluate()`.
        # If you don't implement this property, you have to call
        # `reset_states()` yourself at the time of your choosing.
        return [loss_tracker]




In [19]:
mask_token_id

29999

In [20]:
def create_masked_language_bert_model():
    """
        建造MLM with BERT
    """
    inputs = keras.Input(shape=(config.MAX_LEN,), dtype='int64')
    word_embedding = layers.Embedding(
        input_dim=config.VOCAB_SIZE,
        output_dim=config.EMBED_DIM,
        name='word_embedding'
    )(inputs)
    position_embeddings = layers.Embedding(
        input_dim=config.MAX_LEN,
        output_dim=config.EMBED_DIM,
        weights=[get_pos_encoding_matrix(config.MAX_LEN, config.EMBED_DIM)],    # 完全讓其自己學也可以
        name="position_embedding",
    )(tf.range(start=0, limit=config.MAX_LEN, delta=1))
    # 其實還需要一個segment embedding, 但沒有NSP任務, 先忽略
    embedding =  word_embedding + position_embeddings
    encoder_output = embedding
    
    # Encoder block的堆疊
    for i in range(config.NUM_LAYERS):
        encoder_output = bert_module(encoder_output, encoder_output, encoder_output, i)

    mlm_output = layers.Dense(config.VOCAB_SIZE, name='mlm_cls', activation='softmax')(encoder_output)
    mlm_model = MaskedLanguageModel(inputs, mlm_output, name='masked_bert_model')

    optimizer = keras.optimizers.Adam(learning_rate=config.LR)
    mlm_model.compile(optimizer=optimizer)      # 不需要loss 是因為前面MLM已經設定了
    return mlm_model


# token <-> index轉換的dict
id2token = dict(enumerate(vectorize_layer.get_vocabulary()))
token2id = {y: x for x, y in id2token.items()}

class MaskedTextGenerator(keras.callbacks.Callback):
    """
        Callback使用, 每一個epoch之後的操作
    """
    def __init__(self, sample_tokens, top_k=5):
        self.sample_tokens = sample_tokens
        self.k = top_k

    def decode(self, tokens):
        return " ".join([id2token[t] for t in tokens if t != 0])

    def convert_ids_to_tokens(self, id):
        return id2token[id]

    # 先要override的method, 在epoch結束會做的事情! 觀察用
    def on_epoch_end(self, epoch, logs=None):
        prediction = self.model.predict(self.sample_tokens)     # (batch_size=1, 256, 30000)
        print(prediction.shape)

        masked_index = np.where(self.sample_tokens == mask_token_id) 
        print(masked_index)                             # (array([0]), array([4])), 為什有0是因為np.where
        masked_index = masked_index[1]                  # 故此處真正要的是index 1, 也就是[4]
        print(masked_index)                             # [4]: mask token的位置
        mask_prediction = prediction[0][masked_index]   # mask的預測值, 因為是分類, 有vocab_size這麼多個機率, 找top_k個就好!
        print(mask_prediction)

        top_indices = mask_prediction[0].argsort()[-self.k :][::-1]     # argsort()會將機率的index由小到大排序, 故先找到最後面的k個, 再從後往前數
        print(top_indices)                                              # 例: [19 24 1371 103 122]

        values = mask_prediction[0][top_indices]                        # 例: [0.5538519  0.130241   0.07007366 0.04236298 0.01845231]
        print(values)

        for i in range(len(top_indices)):
            p = top_indices[i]                  # word token, 也就是index
            v = values[i]                       # 機率
            tokens = np.copy(sample_tokens[0])
            tokens[masked_index[0]] = p
            result = {
                "input_text": self.decode(sample_tokens[0].numpy()),
                "prediction": self.decode(tokens),
                "probability": v,
                "predicted mask token": self.convert_ids_to_tokens(p),
            }
            pprint(result)
        
            # {'input_text': 'i have watched this [mask] and it was awesome',
            # 'predicted mask token': 'worst',
            # 'prediction': 'i have watched this worst and it was awesome',
            # 'probability': 0.022767955}


sample_tokens = vectorize_layer(["I have watched this [mask] and it was awesome"])
generator_callback = MaskedTextGenerator(sample_tokens.numpy())

bert_masked_model = create_masked_language_bert_model()
bert_masked_model.summary()

Model: "masked_bert_model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 256)]        0                                            
__________________________________________________________________________________________________
word_embedding (Embedding)      (None, 256, 128)     3840000     input_1[0][0]                    
__________________________________________________________________________________________________
tf.__operators__.add (TFOpLambd (None, 256, 128)     0           word_embedding[0][0]             
__________________________________________________________________________________________________
encoder_0/multiheadattention (M (None, 256, 128)     66048       tf.__operators__.add[0][0]       
                                                                 tf.__operators__.

In [54]:
"""
    **np.where解說: https://numpy.org/doc/stable/reference/generated/numpy.where.html
    2種用法:
        1. np.where(condition): return True的位置(tuple)。
            假受condtion為一個(1, 256)的ndarray/tensor, 且裡面有一個True在[0, 4]的位置, 則會return (0, 4)
            若為(1, 1, 256), ... 且裡面有一個True在[0, 0, 4] --- return (0, 0, 4)
            故代表, 每一個會return 一個tuple, 而tuple的順序值, 為各個維度的index。
        2. np.where(condition, x, y): return conditon資料, True用x替代, False用y替代
"""

np.where(sample_tokens == mask_token_id)

(array([0]), array([4]))

In [None]:
for id_ in sample_tokens[0]:
    print(id_.numpy(), id2token[id_.numpy()])

In [None]:
# MLM訓練

bert_masked_model.fit(mlm_ds, epochs=5, callbacks=[generator_callback])
bert_masked_model.save("bert_mlm_imdb.h5")

In [33]:
# 更了解輸入、輸出

one_batch = mlm_ds.take(1)

for one_ds in one_batch:
    x, y, weights = one_ds
    print(x)
    print(y)
    print(weights)

    print(bert_masked_model.predict(one_batch))     # (batch_size, max_len, vocab_size): 預測出所有的機率。
    

tf.Tensor(
[[    9  2075   547 ...     0     0     0]
 [    1     4   303 ...     0     0     0]
 [    9 29999 29999 ...     0     0     0]
 ...
 [ 2849   549     3 ...     0     0     0]
 [   40 29999    66 ...     0     0     0]
 [  193  2684 10974 ... 11642     7   299]], shape=(128, 256), dtype=int64)
tf.Tensor(
[[    9  2075   547 ...     0     0     0]
 [    1     4   303 ...     0     0     0]
 [    9   471    48 ...     0     0     0]
 ...
 [ 2849   549     3 ...     0     0     0]
 [   40    23    66 ...     0     0     0]
 [  193  2684 10974 ... 11642     7   299]], shape=(128, 256), dtype=int64)
tf.Tensor(
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 1. 1. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]], shape=(128, 256), dtype=float64)
(128, 256, 30000)
[[[5.9711203e-07 7.2227135e-07 6.1290308e-07 ... 2.3189561e-06
   5.3074524e-07 5.8734702e-07]
  [5.9711124e-07 7.2226896e-07 6.1289751e-07 ... 2.3189484e-06
   5.3074

### 結果
- 根據觀察, cloze task並沒有訓練好, 故合理推測其實應沒有學習到應有的語意關係
- 可能原因
    - Encoder 太深
    - 訓練資料不夠多

### BERT的強大 - Finetune


In [None]:
# 使用預訓練模型
mlm_model = keras.models.load_model(
    "bert_mlm_imdb.h5", custom_objects={"MaskedLanguageModel": MaskedLanguageModel}
)

pretrained_bert_model = tf.keras.Model(
    mlm_model.input, mlm_model.get_layer("encoder_0/ffn_layernormalization").output # 到這邊剛好是MLM分類任務前, 也就是學到的語意關係
)

# 凍結之前訓練的權重, 可選擇
pretrained_bert_model.trainable = False

def create_classifier_bert_model():
    inputs = layers.Input((config.MAX_LEN,), dtype=tf.int64)
    sequence_output = pretrained_bert_model(inputs)
    pooled_output = layers.GlobalMaxPooling1D()(sequence_output)        # 類似flattn, 原先3維
    hidden_layer = layers.Dense(64, activation="relu")(pooled_output)   
    outputs = layers.Dense(1, activation="sigmoid")(hidden_layer)       # 二分類問題, 故Dense(1)
    classifer_model = keras.Model(inputs, outputs, name="classification")
    optimizer = keras.optimizers.Adam()
    classifer_model.compile(
        optimizer=optimizer, loss="binary_crossentropy", metrics=["accuracy"]
    )
    return classifer_model

classifer_model = create_classifier_bert_model()
classifer_model.summary()

# Train the classifier with frozen BERT stage(凍結下的訓練)
classifer_model.fit(
    train_classifier_ds,
    epochs=5,
    validation_data=test_classifier_ds,
)

# Unfreeze the BERT model for fine-tuning(不凍結下訓練, 代表MLM的參數也會跟著變動)
pretrained_bert_model.trainable = True
optimizer = keras.optimizers.Adam()
classifer_model.compile(
    optimizer=optimizer, loss="binary_crossentropy", metrics=["accuracy"]
)
classifer_model.fit(
    train_classifier_ds,
    epochs=5,
    validation_data=test_classifier_ds,
)

Model: "classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         [(None, 256)]             0         
_________________________________________________________________
model (Functional)           (None, 256, 128)          3939584   
_________________________________________________________________
global_max_pooling1d (Global (None, 128)               0         
_________________________________________________________________
dense_120 (Dense)            (None, 64)                8256      
_________________________________________________________________
dense_121 (Dense)            (None, 1)                 65        
Total params: 3,947,905
Trainable params: 8,321
Non-trainable params: 3,939,584
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f66983cae50>

### end-to-end
- 之前的textvector之在資料集方面處理的, 也可以透過把layer embed 到model內, 得到end-to-end的模型

In [None]:
def get_end_to_end(model):
    """
        基於原先的模型, 將輸入處多放入一個"文本處理"layer也就是: TextVectorization得到的layer,
        就可以得到end-to-end model
    """
    inputs_string = keras.Input(shape=(1,), dtype="string")     # 文本(inputs)
    indices = vectorize_layer(inputs_string)                    # 轉成向量
    outputs = model(indices)                                    # outputs
    end_to_end_model = keras.Model(inputs_string, outputs, name="end_to_end_model")
    optimizer = keras.optimizers.Adam(learning_rate=config.LR)
    end_to_end_model.compile(
        optimizer=optimizer, loss="binary_crossentropy", metrics=["accuracy"]
    )
    return end_to_end_model


end_to_end_classification_model = get_end_to_end(classifer_model)
end_to_end_classification_model.evaluate(test_raw_classifier_ds)



[0.7545887231826782, 0.5]

### 修正
- Encoder
    12 -> 1

In [None]:
@dataclass
class Config:
    MAX_LEN = 256
    BATCH_SIZE = 128
    LR = 0.001
    VOCAB_SIZE = 30000
    EMBED_DIM = 128
    NUM_HEAD = 8  # used in bert model
    FF_DIM = 128  # used in bert model
    NUM_LAYERS = 1


config = Config()
config

Config()

In [None]:
sample_tokens = vectorize_layer(["I have watched this [mask] and it was awesome"])
generator_callback = MaskedTextGenerator(sample_tokens.numpy())

bert_masked_model_1layer = create_masked_language_bert_model()
bert_masked_model_1layer.summary()

Model: "masked_bert_model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_8 (InputLayer)            [(None, 256)]        0                                            
__________________________________________________________________________________________________
word_embedding (Embedding)      (None, 256, 128)     3840000     input_8[0][0]                    
__________________________________________________________________________________________________
tf.__operators__.add_125 (TFOpL (None, 256, 128)     0           word_embedding[0][0]             
__________________________________________________________________________________________________
encoder_0/multiheadattention (M (None, 256, 128)     66048       tf.__operators__.add_125[0][0]   
                                                                 tf.__operators__.

In [None]:
bert_masked_model_1layer.fit(mlm_ds, epochs=5, callbacks=[generator_callback])
bert_masked_model_1layer.save("bert_mlm_imdb_1layer.h5")

Epoch 1/5
(1, 256, 30000)
(array([0]), array([4]))
[4]
[[6.066066e-08 7.209404e-08 6.055639e-08 ... 2.872794e-06 5.329966e-07
  7.219587e-08]]
[10  9  3 19  5]
[0.04216674 0.03485606 0.03185284 0.02736021 0.01999529]
{'input_text': 'i have watched this [mask] and it was awesome',
 'predicted mask token': 'this',
 'prediction': 'i have watched this this and it was awesome',
 'probability': 0.042166743}
{'input_text': 'i have watched this [mask] and it was awesome',
 'predicted mask token': 'i',
 'prediction': 'i have watched this i and it was awesome',
 'probability': 0.03485606}
{'input_text': 'i have watched this [mask] and it was awesome',
 'predicted mask token': 'a',
 'prediction': 'i have watched this a and it was awesome',
 'probability': 0.03185284}
{'input_text': 'i have watched this [mask] and it was awesome',
 'predicted mask token': 'movie',
 'prediction': 'i have watched this movie and it was awesome',
 'probability': 0.027360214}
{'input_text': 'i have watched this [mask] 

In [None]:
def user_bert_for_pretrained(model_file='bert_mlm_imdb_1layer.h5'):# 使用預訓練模型
    mlm_model = keras.models.load_model(
        model_file, custom_objects={"MaskedLanguageModel": MaskedLanguageModel}
    )

    pretrained_bert_model = tf.keras.Model(
        mlm_model.input, mlm_model.get_layer("encoder_0/ffn_layernormalization").output # 到這邊剛好是MLM分類任務前, 也就是學到的語意關係
    )

    # 凍結之前訓練的權重, 可選擇
    pretrained_bert_model.trainable = False

    def create_classifier_bert_model():
        inputs = layers.Input((config.MAX_LEN,), dtype=tf.int64)
        sequence_output = pretrained_bert_model(inputs)
        pooled_output = layers.GlobalMaxPooling1D()(sequence_output)        # 類似flattn, 原先3維
        hidden_layer = layers.Dense(64, activation="relu")(pooled_output)   
        outputs = layers.Dense(1, activation="sigmoid")(hidden_layer)       # 二分類問題, 故Dense(1)
        classifer_model = keras.Model(inputs, outputs, name="classification")
        optimizer = keras.optimizers.Adam()
        classifer_model.compile(
            optimizer=optimizer, loss="binary_crossentropy", metrics=["accuracy"]
        )
        return classifer_model

    classifer_model = create_classifier_bert_model()
    classifer_model.summary()

    # Train the classifier with frozen BERT stage(凍結下的訓練)
    classifer_model.fit(
        train_classifier_ds,
        epochs=5,
        validation_data=test_classifier_ds,
    )

    # Unfreeze the BERT model for fine-tuning(不凍結下訓練, 代表MLM的參數也會跟著變動)
    pretrained_bert_model.trainable = True
    optimizer = keras.optimizers.Adam()
    classifer_model.compile(
        optimizer=optimizer, loss="binary_crossentropy", metrics=["accuracy"]
    )
    classifer_model.fit(
        train_classifier_ds,
        epochs=5,
        validation_data=test_classifier_ds,
    )
    return classifer_model

classifier_bert_1layer = user_bert_for_pretrained()

Model: "classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_10 (InputLayer)        [(None, 256)]             0         
_________________________________________________________________
model_2 (Functional)         (None, 256, 128)          3939584   
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 128)               0         
_________________________________________________________________
dense_126 (Dense)            (None, 64)                8256      
_________________________________________________________________
dense_127 (Dense)            (None, 1)                 65        
Total params: 3,947,905
Trainable params: 8,321
Non-trainable params: 3,939,584
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
classifier_bert_1layer.evaluate(test_classifier_ds)



[0.8669344782829285, 0.8154000043869019]

### Encoder layers改變結果
- 可以發現第一個epoch loss就降得很快, 測試語句也很快的找出正確答案, 可以得知有部分抓取語意。
- 另外也發現, 有沒有凍結參數效果差很多, 如果沒有凍結, 效果fine tune得很快, 但結論有待商榷。

> 下次加超參數fine-tune、NSP任務、wordpiece等等