In [None]:
!pip uninstall -y keras
# 2.12
!pip install keras==2.12.0
!pip install keras-bert==0.81.1

In [2]:
# 安裝keras-bert函式庫

# 下載對應需要使用的bert預訓練模型
# wget參數: -q(安靜模式)
!wget -q https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
# unzip參數: -o(overrite 直接覆蓋不詢問)
!unzip -o uncased_L-12_H-768_A-12.zip

Archive:  uncased_L-12_H-768_A-12.zip
   creating: uncased_L-12_H-768_A-12/
  inflating: uncased_L-12_H-768_A-12/bert_model.ckpt.meta  
  inflating: uncased_L-12_H-768_A-12/bert_model.ckpt.data-00000-of-00001  
  inflating: uncased_L-12_H-768_A-12/vocab.txt  
  inflating: uncased_L-12_H-768_A-12/bert_model.ckpt.index  
  inflating: uncased_L-12_H-768_A-12/bert_config.json  


In [3]:
# Hyper-parameters
# SEQ_LEN: how many tokens of each text are kept. At least 256 is recommended; 512 is the
# maximum, but it makes the model too large to fit in RAM.
SEQ_LEN = 256
# BATCH_SIZE: number of samples per batch (one weight update per batch); lower it if RAM runs out.
BATCH_SIZE = 16

In [4]:
# Locate the files of the pre-trained model
import os

pretrained_path = 'uncased_L-12_H-768_A-12'
# In order: the model configuration, the model checkpoint itself,
# and the vocabulary listing every supported token.
config_path, checkpoint_path, vocab_path = (
    os.path.join(pretrained_path, filename)
    for filename in ('bert_config.json', 'bert_model.ckpt', 'vocab.txt')
)


In [5]:
# BERT works on sub-word tokens, which caps the vocabulary size at 30522
token_dict = {}
with open(vocab_path, mode='r', encoding='utf8') as vocab_file:
    for raw_line in vocab_file:
        # One token per line; its id is the dictionary size at insertion time.
        token_dict[raw_line.strip()] = len(token_dict)
print("辭典長度:", len(token_dict))

辭典長度: 30522


In [8]:
from keras_bert import load_trained_model_from_checkpoint

# Load the pre-trained checkpoint with adapter layers enabled; adapters are used
# here to reach a higher accuracy.
# layer_num is only referenced by the disabled `trainable=[...]` alternative below.
layer_num = 12
model = load_trained_model_from_checkpoint(
    config_path,
    checkpoint_path,
    # NOTE(review): presumably keeps the pre-training heads — a later cell looks up
    # the 'NSP-Dense' layer and model.input lists an 'Input-Masked' input; confirm.
    training=True,
    use_adapter=True,
    # Disabled alternative: pass the names of the trainable layers directly.
    # trainable=['Encoder-{}-MultiHeadSelfAttention-Adapter'.format(i + 1) for i in range(layer_num)] +
    # ['Encoder-{}-FeedForward-Adapter'.format(i + 1) for i in range(layer_num)] +
    # ['Encoder-{}-MultiHeadSelfAttention-Norm'.format(i + 1) for i in range(layer_num)] +
    # ['Encoder-{}-FeedForward-Norm'.format(i + 1) for i in range(layer_num)],
    # Everything is requested as non-trainable here; the next cell switches the
    # adapter / norm layers back on by name.
    trainable=False,
    seq_len=SEQ_LEN
)



In [12]:
# Name fragments of the layers that should be fine-tuned
# (the adapter layers and the `-Norm` layers).
allow = [
    "MultiHeadSelfAttention-Adapter",
    "FeedForward-Adapter",
    "MultiHeadSelfAttention-Norm",
    "FeedForward-Norm"
]
# Mark every layer whose name contains one of the fragments as trainable.
for layer in model.layers:
    if any(fragment in layer.name for fragment in allow):
        layer.trainable = True

In [13]:
# Summary of the pre-trained model
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 Input-Token (InputLayer)       [(None, 256)]        0           []                               
                                                                                                  
 Input-Segment (InputLayer)     [(None, 256)]        0           []                               
                                                                                                  
 Embedding-Token (TokenEmbeddin  [(None, 256, 768),  23440896    ['Input-Token[0][0]']            
 g)                              (30522, 768)]                                                    
                                                                                                  
 Embedding-Segment (Embedding)  (None, 256, 768)     1536        ['Input-Segment[0][0]']      

In [14]:
# Download the IMDB dataset
from keras.utils import get_file
# `dataset` receives the local path of the downloaded archive; extract=True also
# unpacks it (a later cell reads the `aclImdb` folder next to the archive).
dataset = get_file(
    fname="aclImdb.tar.gz",
    # Fetched over HTTPS rather than plain HTTP so the archive cannot be
    # tampered with in transit.
    origin="https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
    extract=True,
)
print("下載位址:", dataset)

Downloading data from http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
下載位址: /root/.keras/datasets/aclImdb.tar.gz


In [15]:
from keras_bert import Tokenizer
# Build the tokenizer from the token -> id dictionary read from vocab.txt.
tokenizer = Tokenizer(token_dict)

In [16]:
# Some words are split into sub-words; continuation pieces are prefixed with ##
# Words that are not in the vocabulary are converted to the [UNK] token
# [101] -> [CLS] (token representing the whole text)
# [102] -> [SEP] (token separating two segments)
# [100] -> [UNK] (unknown-word token)
token_ids, segment_ids = tokenizer.encode("I am Coldplay 粉絲")
print("[Encoding]:", token_ids)
print("[Segement]:", segment_ids)
print("[Decode]:", tokenizer.decode(token_ids))

[Encoding]: [101, 1045, 2572, 3147, 13068, 100, 100, 102]
[Segement]: [0, 0, 0, 0, 0, 0, 0, 0]
[Decode]: ['i', 'am', 'cold', '##play', '[UNK]', '[UNK]']


In [17]:
# With a sentence pair, the segment ids are filled in for you: 0 for the first
# sentence and 1 for the second.
sentence_pair = dict(first="I am hungry", second="I can barely eat a Gorilla.")
pair_ids, pair_segments = tokenizer.encode(**sentence_pair)
print("[Encoding]:", pair_ids)
print("[Segement]:", pair_segments)
print("[Decode]:", tokenizer.decode(pair_ids))

[Encoding]: [101, 1045, 2572, 7501, 102, 1045, 2064, 4510, 4521, 1037, 23526, 1012, 102]
[Segement]: [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
[Decode]: (['i', 'am', 'hungry'], ['i', 'can', 'barely', 'eat', 'a', 'gorilla', '.'])


In [18]:
# 準備資料流
import glob
from tqdm import tqdm
import numpy as np
def load_data(path):
    """Tokenize every review under ``path`` and return shuffled model inputs.

    Files in ``<path>/neg`` are labelled 0 and files in ``<path>/pos`` are
    labelled 1. Each file is encoded with the module-level ``tokenizer`` using
    ``max_len=SEQ_LEN``; reviews and labels are then shuffled together with
    ``np.random.shuffle``.

    Args:
        path: Directory that contains the ``neg`` and ``pos`` sub-folders.

    Returns:
        A tuple ``([token_ids, segment_ids], labels)``. ``segment_ids`` is an
        all-zero array with the same shape as ``token_ids``; the segment ids
        returned by the tokenizer are discarded because each review is encoded
        as a single sentence.
    """
    samples = []
    for folder, sentiment in (('neg', 0), ('pos', 1)):
        # Sort the matches: glob does not promise an order, and a stable
        # pre-shuffle order keeps the result reproducible when NumPy is seeded.
        filenames = sorted(glob.glob("{}/{}/*".format(path, folder)))
        for fn in tqdm(filenames):
            with open(fn, 'r', encoding="utf-8") as f:
                text = f.read()
            ids, _ = tokenizer.encode(text, max_len=SEQ_LEN)
            samples.append((ids, sentiment))
    # Shuffle (ids, label) pairs together so inputs and labels stay aligned.
    np.random.shuffle(samples)
    indices = np.array([ids for ids, _ in samples])
    sentiments = np.array([label for _, label in samples])

    return [indices, np.zeros_like(indices)], sentiments

In [19]:
# Load the dataset: the extracted `aclImdb` folder sits next to the downloaded archive
imdb_root = os.path.join(os.path.dirname(dataset), 'aclImdb')
train_path = os.path.join(imdb_root, 'train')
test_path = os.path.join(imdb_root, 'test')

# The training split is loaded first, then the test split.
(train_x, train_y), (test_x, test_y) = load_data(train_path), load_data(test_path)

100%|██████████| 12500/12500 [00:37<00:00, 333.66it/s]
100%|██████████| 12500/12500 [00:37<00:00, 333.98it/s]
100%|██████████| 12500/12500 [00:34<00:00, 363.94it/s]
100%|██████████| 12500/12500 [00:36<00:00, 344.16it/s]


In [20]:
# We normally do not need the third input; it is only useful when you want to
# continue pre-training the model.
model.input

[<KerasTensor: shape=(None, 256) dtype=float32 (created by layer 'Input-Token')>,
 <KerasTensor: shape=(None, 256) dtype=float32 (created by layer 'Input-Segment')>,
 <KerasTensor: shape=(None, 256) dtype=float32 (created by layer 'Input-Masked')>]

In [23]:
# We do not use the encoding of every token, because that would make training
# take much longer. Instead we take the encoding that represents the whole
# text, i.e. the [CLS] encoding; the layer that outputs it is named NSP-Dense.
import keras

pooled_cls = model.get_layer('NSP-Dense').output
# Optional: insert keras.layers.Dense(units=128, activation='relu') before the head.
classifier_head = keras.layers.Dense(units=2, activation='softmax')
# Only the first two inputs (tokens and segments) are kept.
model = keras.models.Model(model.inputs[:2], classifier_head(pooled_cls))
model.compile(
    optimizer="adam",
    loss='sparse_categorical_crossentropy',
    metrics=['sparse_categorical_accuracy']
)

In [24]:
# Summary of the classifier built on top of the pre-trained model
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 Input-Token (InputLayer)       [(None, 256)]        0           []                               
                                                                                                  
 Input-Segment (InputLayer)     [(None, 256)]        0           []                               
                                                                                                  
 Embedding-Token (TokenEmbeddin  [(None, 256, 768),  23440896    ['Input-Token[0][0]']            
 g)                              (30522, 768)]                                                    
                                                                                                  
 Embedding-Segment (Embedding)  (None, 256, 768)     1536        ['Input-Segment[0][0]']    

In [25]:
# Number of passes over the training data
EPOCHS = 2
# Train the classifier; validation_split=0.1 sets aside 10% of the training
# data for validation.
model.fit(
    train_x,
    train_y,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_split=0.1
)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7aab2f1ec610>

In [26]:
# This should reach an accuracy of roughly 91%~92% on the test set
model.evaluate(test_x, test_y)



[0.2132416069507599, 0.9156399965286255]