In [1]:
# Install keras-bert with the %pip magic so the package lands in the
# environment of the running kernel (a bare !pip can target a different
# interpreter).
# NOTE(review): the version is unpinned — pin it once a known-good release
# for this notebook has been confirmed.
%pip install -q keras-bert
# Download the pretrained uncased L-12/H-768/A-12 BERT checkpoint archive.
# -nc skips the download when the archive already exists; without it a re-run
# fetches the file again and stores it under a ".1" suffix that is never used.
!wget -q -nc https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
# Unpack next to the notebook, overwriting files left by an earlier run.
!unzip -o uncased_L-12_H-768_A-12.zip

  Building wheel for keras-bert (setup.py) ... [?25l[?25hdone
  Building wheel for keras-transformer (setup.py) ... [?25l[?25hdone
  Building wheel for keras-pos-embd (setup.py) ... [?25l[?25hdone
  Building wheel for keras-multi-head (setup.py) ... [?25l[?25hdone
  Building wheel for keras-layer-normalization (setup.py) ... [?25l[?25hdone
  Building wheel for keras-position-wise-feed-forward (setup.py) ... [?25l[?25hdone
  Building wheel for keras-embed-sim (setup.py) ... [?25l[?25hdone
  Building wheel for keras-self-attention (setup.py) ... [?25l[?25hdone
Archive:  uncased_L-12_H-768_A-12.zip
   creating: uncased_L-12_H-768_A-12/
  inflating: uncased_L-12_H-768_A-12/bert_model.ckpt.meta  
  inflating: uncased_L-12_H-768_A-12/bert_model.ckpt.data-00000-of-00001  
  inflating: uncased_L-12_H-768_A-12/vocab.txt  
  inflating: uncased_L-12_H-768_A-12/bert_model.ckpt.index  
  inflating: uncased_L-12_H-768_A-12/bert_config.json  


In [0]:
# Notebook-wide settings; edit here rather than in the cells below.
SEQ_LEN = 512  # passed as seq_len when loading BERT and as max_len when tokenizing reviews
BATCH_SIZE = 32  # batch_size given to model.fit
EPOCHS = 2  # epochs given to model.fit

In [0]:
import os
# Folder created by unzipping the checkpoint archive in the setup cell.
pretrained_path = 'uncased_L-12_H-768_A-12'

# Files inside that folder that the later cells read: the model config,
# the checkpoint prefix, and the vocabulary list.
config_path, checkpoint_path, vocab_path = (
    os.path.join(pretrained_path, filename)
    for filename in ('bert_config.json', 'bert_model.ckpt', 'vocab.txt')
)


In [0]:
from keras_bert import load_trained_model_from_checkpoint

# BERT tokenizes into sub-words, which keeps the dictionary size at 30522
# entries at most.
# Build the token -> id mapping: each stripped line of the vocabulary file is
# a token, and its id is the number of entries already in the dictionary.
token_dict = {}
with open(vocab_path, mode='r', encoding='utf8') as f:
    for line in f:
        token_dict[line.strip()] = len(token_dict)
print("辭典長度:", len(token_dict))


# Build the keras-bert model from the unpacked config file and checkpoint.
# The sequence length is fixed to SEQ_LEN, the same value later used as
# max_len when tokenizing, so tokenized reviews match the model's input size.
# NOTE(review): training=False / trainable=False presumably drop the
# pre-training heads and freeze the BERT weights — confirm against the
# keras-bert documentation.
model = load_trained_model_from_checkpoint(
    config_path,
    checkpoint_path,
    training=False,
    trainable=False,
    seq_len=SEQ_LEN,
)

In [0]:
import tensorflow as tf

# Download the aclImdb_v1 archive with Keras' get_file helper and extract it.
# `dataset` holds the value get_file returns; a later cell takes
# os.path.dirname(dataset) to locate the extracted 'aclImdb' folder.
# NOTE(review): the origin is plain HTTP and no checksum argument is passed,
# so the download is not integrity-checked.
dataset = tf.keras.utils.get_file(
    fname="aclImdb.tar.gz", 
    origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz", 
    extract=True,
)

In [0]:
from keras_bert import Tokenizer
# Tokenizer backed by the vocabulary mapping built from vocab.txt; it is the
# notebook-level object that load_data() uses to encode every review.
tokenizer = Tokenizer(token_dict)

In [8]:
# Demo: encode one sentence and decode it back.
# Some words are split into sub-words; continuation pieces carry a leading "##".
# Words the vocabulary does not know are converted to the [UNK] token.
# Special ids that appear in the encoding:
#   101 -> [CLS]  (token that represents the whole text)
#   102 -> [SEP]  (separator between two segments)
#   100 -> [UNK]  (unknown-word token)
en, seg = tokenizer.encode("I am Coldplay 粉絲")
de = tokenizer.decode(en)

print("[Encoding]:", en)
print("[Segement]:", seg)
print("[Decode]:", de)

[Encoding]: [101, 1045, 2572, 3147, 13068, 100, 100, 102]
[Segement]: [0, 0, 0, 0, 0, 0, 0, 0]
[Decode]: ['i', 'am', 'cold', '##play', '[UNK]', '[UNK]']


In [9]:
# Demo: encode a sentence pair. With two sentences the tokenizer fills the
# segment ids for you — 0 for the first sentence, 1 for the second.
en, seg = tokenizer.encode(first="I am hungry", second="I can barely eat a Gorilla.")
de = tokenizer.decode(en)

print("[Encoding]:", en)
print("[Segement]:", seg)
print("[Decode]:", de)

[Encoding]: [101, 1045, 2572, 7501, 102, 1045, 2064, 4510, 4521, 1037, 23526, 1012, 102]
[Segement]: [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
[Decode]: (['i', 'am', 'hungry'], ['i', 'can', 'barely', 'eat', 'a', 'gorilla', '.'])


In [0]:
# 準備資料流
import glob
from tqdm import tqdm
import numpy as np

def load_data(path, seed=None):
    """Tokenize every review under ``path`` and return shuffled model inputs.

    Reads each file in ``<path>/neg`` (label 0) and ``<path>/pos`` (label 1),
    encodes its text with the notebook-level ``tokenizer`` using
    ``max_len=SEQ_LEN``, and shuffles reviews and labels together.

    Args:
        path: Directory that contains the ``neg`` and ``pos`` sub-folders.
        seed: Optional integer seed for the shuffle. ``None`` (the default)
            draws from NumPy's global random state, so existing calls keep
            their unseeded behaviour.

    Returns:
        ``([token_ids, segment_ids], labels)``: ``token_ids`` is the array of
        encoded reviews, ``segment_ids`` is an all-zero array shaped like
        ``token_ids``, and ``labels`` holds one 0/1 entry per review.

    Raises:
        FileNotFoundError: If either class folder contains no files, which
            would otherwise yield empty or single-class arrays without
            any warning.
    """
    token_ids, labels = [], []
    for folder, sentiment in (('neg', 0), ('pos', 1)):
        # Sort so the pre-shuffle order does not depend on the order in which
        # the filesystem lists files; needed for a seeded shuffle to be
        # reproducible.
        filenames = sorted(glob.glob("{}/{}/*".format(path, folder)))
        if not filenames:
            raise FileNotFoundError(
                "No review files found in {}/{}".format(path, folder)
            )
        for fn in tqdm(filenames):
            with open(fn, 'r', encoding="utf-8") as f:
                text = f.read()
            # Only the token ids are kept; the segment ids returned by the
            # tokenizer are discarded and rebuilt as zeros below.
            ids, _ = tokenizer.encode(text, max_len=SEQ_LEN)
            token_ids.append(ids)
            labels.append(sentiment)

    token_ids = np.array(token_ids)
    labels = np.array(labels)

    # Apply one permutation to both arrays so reviews stay aligned with labels.
    rng = np.random if seed is None else np.random.RandomState(seed)
    order = rng.permutation(len(labels))
    token_ids, labels = token_ids[order], labels[order]

    return [token_ids, np.zeros_like(token_ids)], labels

# Load the dataset: the archive is extracted next to the downloaded file, so
# the 'aclImdb' folder is looked up in the directory that holds `dataset`.
imdb_root = os.path.join(os.path.dirname(dataset), 'aclImdb')
train_path = os.path.join(imdb_root, 'train')
test_path = os.path.join(imdb_root, 'test')

# Tokenize both splits (each call shows one progress bar per class folder).
train_x, train_y = load_data(train_path)
test_x, test_y = load_data(test_path)

100%|██████████| 12500/12500 [00:35<00:00, 349.71it/s]
100%|██████████| 12500/12500 [00:36<00:00, 338.37it/s]
100%|██████████| 12500/12500 [00:35<00:00, 355.02it/s]
100%|██████████| 12500/12500 [00:35<00:00, 349.58it/s]


In [0]:
import keras
from keras.layers import SimpleRNN
# Put a small classification head on top of the loaded BERT model:
# a 64-unit SimpleRNN over BERT's output, then a 2-way softmax.
# NOTE: this cell rebinds `model`. Running it again without first re-running
# the cell that loads BERT would build the RNN on the classifier's own output.
inputs = model.inputs[:2]
rnn_features = SimpleRNN(units=64)(model.output)
outputs = keras.layers.Dense(2, activation='softmax')(rnn_features)
model = keras.models.Model(inputs=inputs, outputs=outputs)

# The sparse loss and metric match the integer 0/1 labels built by load_data().
model.compile(
    optimizer="adam",
    loss='sparse_categorical_crossentropy',
    metrics=['sparse_categorical_accuracy'],
)

In [0]:
# Print the layer-by-layer table (output shapes, parameter counts, connections)
# for the BERT + classifier model.
model.summary()

Model: "model_11"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input-Token (InputLayer)        (None, 512)          0                                            
__________________________________________________________________________________________________
Input-Segment (InputLayer)      (None, 512)          0                                            
__________________________________________________________________________________________________
Embedding-Token (TokenEmbedding [(None, 512, 768), ( 23440896    Input-Token[0][0]                
__________________________________________________________________________________________________
Embedding-Segment (Embedding)   (None, 512, 768)     1536        Input-Segment[0][0]              
___________________________________________________________________________________________

In [0]:
# Train the classifier on the tokenized training reviews, setting aside 10%
# of them for validation. The History object returned by fit() is the cell's
# last expression, so its repr is shown as the cell output.
model.fit(
    x=train_x,
    y=train_y,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.1,
)

Train on 22492 samples, validate on 2500 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f5eddbbedd8>

In [0]:
# Score the trained model on the test split; the result is a two-element list:
# the compiled loss followed by the compiled metric (sparse categorical accuracy).
model.evaluate(test_x, test_y)



[0.32487024319156643, 0.8572343149807938]