In [1]:
from bert_serving.client import BertClient
# Open a client connection to a bert-as-service server.
# NOTE(review): assumes a server is already running and reachable with the client's default settings.
bc = BertClient()
# Encode three sentences; the displayed result is a float32 array with one row per sentence.
bc.encode(['First do it', 'then do it right', 'then do it better'])

array([[ 0.22590299,  0.30924726, -0.08256867, ..., -0.14456667,
        -0.47119054, -0.07071219],
       [ 0.37245134,  0.35599762,  0.09506325, ...,  0.09425778,
        -0.4482554 , -0.17623922],
       [ 0.26777843,  0.22399886, -0.0217796 , ...,  0.02863549,
        -0.37525114,  0.02919415]], dtype=float32)

https://github.com/hanxiao/bert-as-service#building-a-qa-semantic-search-engine-in-3-minutes

# Building a QA semantic search engine

In [2]:
import numpy as np

In [3]:
prefix_q = '##### **Q:** '
# Keep only the README lines that begin with the question marker, and store
# them with the marker and surrounding whitespace removed.
with open('README.md', encoding='utf-8') as fp:
    questions = [
        line.replace(prefix_q, '').strip()
        for line in fp
        if line.startswith(prefix_q) and line.strip()
    ]
# Report how many questions were found and their mean length in
# whitespace-separated words (`%d` truncates the mean to an integer).
print('%d questions loaded, avg. len of %d'
      % (len(questions), np.mean([len(q.split()) for q in questions])))

35 questions loaded, avg. len of 9


In [4]:
# Encode every loaded question in one call; the result holds one vector per question.
doc_vecs = bc.encode(questions)

In [10]:
# (number of questions, embedding size)
doc_vecs.shape

(35, 768)

In [6]:
topk = 5

query = input('your question: ')
query_vec = bc.encode([query])[0]
# Score every stored question by its dot product with the query vector,
# divided by that question vector's L2 norm. The query's own norm is NOT
# divided out, so the scores are not cosine similarities (they are not
# confined to [-1, 1]); the ranking is still the same as a cosine ranking
# because the omitted factor is identical for every row.
score = (query_vec * doc_vecs).sum(axis=1) / np.linalg.norm(doc_vecs, axis=1)
# Indices of the `topk` highest scores, best first.
topk_idx = np.argsort(score)[::-1][:topk]
for idx in topk_idx:
    print('> %s\t%s' % (score[idx], questions[idx]))

your question: 你会说汉语吗
> 14.697959	Can I use my own tokenizer?
> 14.687681	Do I need to do segmentation for Chinese?
> 14.655649	How can I choose `num_worker`?
> 14.632836	Why my (English) word is tokenized to `##something`?
> 14.62494	What is backend based on?


# Serving a fine-tuned BERT model

# Getting ELMo-like contextual word embedding

In [12]:
# Server settings this section relies on
# (NOTE(review): presumably supplied when the server was started — confirm):
# max_seq_len = 25
# pooling_strategy = NONE

# Reconnect and encode two short sentences; with the settings above the result
# keeps one vector per token position instead of one vector per sentence.
bc = BertClient()
vec = bc.encode(['hey you', 'whats up?'])

In [14]:
# (sentences, token positions, embedding size)
vec.shape

(2, 25, 768)

In [16]:
vec[0].shape
# all per-position embeddings of the first sentence, `hey you`

(25, 768)

In [18]:
vec[0][0].shape
# position 0: embedding of the `[CLS]` token

(768,)

In [21]:
vec[0][1].shape
# position 1: embedding of the word `hey`

(768,)

In [22]:
vec[0][2].shape
# position 2: embedding of the word `you`

(768,)

In [23]:
vec[0][3].shape
# position 3: embedding of the `[SEP]` token

(768,)

In [24]:
vec[0][4].shape
# position 4: embedding of the first padding symbol

(768,)

In [25]:
# Index 25 is one past the last position: `vec[0]` has 25 rows (indices 0..24),
# so this lookup raises the IndexError shown below.
vec[0][25].shape

IndexError: index 25 is out of bounds for axis 0 with size 25

# Using your own tokenizer

In [44]:
texts = ['你好 世界!', '美好 一天']

# Pre-tokenize on whitespace: each sentence becomes a list of tokens.
texts2 = list(map(str.split, texts))

# Send the token lists as-is; `is_tokenized=True` marks the input as already tokenized.
vecs = bc.encode(texts2, is_tokenized=True)

In [45]:
# (sentences, token positions, embedding size)
vecs.shape

(2, 25, 768)

In [46]:
# Raw (untokenized) strings are sent here.
# NOTE(review): `show_tokens=True` presumably asks for the tokenization to be reported as well — confirm against the client docs.
bc.encode(['你好 世界!', '这是 它'], show_tokens=True)

array([[[-0.01175493, -0.1617977 ,  0.29648292, ...,  0.6362557 ,
         -0.04364629, -0.27905396],
        [ 0.47149712, -0.7719901 , -0.73191035, ...,  0.12688503,
         -0.2823674 , -0.19017912],
        [ 0.50740826, -0.44305798, -0.15977177, ...,  0.29794553,
          0.02039958, -0.23108397],
        ...,
        [-0.        , -0.        , -0.        , ..., -0.        ,
         -0.        , -0.        ],
        [-0.        , -0.        ,  0.        , ..., -0.        ,
         -0.        , -0.        ],
        [ 0.        ,  0.        , -0.        , ..., -0.        ,
         -0.        , -0.        ]],

       [[-0.14682214,  0.4779205 ,  0.6809331 , ...,  0.5676709 ,
         -0.0326115 , -0.93247014],
        [ 0.5639583 ,  0.34819496,  1.2245796 , ...,  0.39399296,
         -0.68489134, -0.94545054],
        [-0.05322995, -0.2393229 ,  1.0470473 , ...,  0.7995612 ,
         -0.11019681, -1.1336169 ],
        ...,
        [-0.        ,  0.        ,  0.        , ...,  

In [48]:
# Same kind of call with pre-split token lists (`is_tokenized=True`); note that 'thisis' is passed as a single token.
# NOTE(review): `show_tokens=True` presumably asks for the tokenization to be reported as well — confirm against the client docs.
bc.encode([['hello', 'world!'], ['thisis', 'it']], show_tokens=True, is_tokenized=True)

array([[[ 0.27740484,  0.7500669 ,  0.4284198 , ...,  0.6141869 ,
         -0.20193532, -0.26091242],
        [ 0.4758171 ,  0.0606777 ,  0.6419624 , ..., -0.52978677,
          0.04691685, -0.18648772],
        [ 0.7059512 ,  0.07687826, -0.04954352, ..., -0.12742816,
         -0.35104132, -0.04731564],
        ...,
        [ 0.        ,  0.        ,  0.        , ..., -0.        ,
         -0.        , -0.        ],
        [ 0.        ,  0.        ,  0.        , ..., -0.        ,
         -0.        , -0.        ],
        [ 0.        ,  0.        ,  0.        , ..., -0.        ,
         -0.        , -0.        ]],

       [[-0.8403784 ,  1.0385634 , -0.34069625, ...,  0.66673994,
         -0.3021288 , -0.5121763 ],
        [-0.89954877,  1.0046595 , -0.46250865, ...,  0.18021399,
         -0.2635792 , -0.8112389 ],
        [-0.1540899 ,  0.5245218 , -0.5092169 , ...,  0.13944007,
          0.28740644, -0.11593796],
        ...,
        [-0.        ,  0.        ,  0.        , ...,  

Note: the pretrained Chinese BERT model released by Google is character-based.

# Using BertClient with tf.data API

In [50]:
import tensorflow as tf
batch_size = 256
num_parallel_calls = 4
# Keep more clients in the pool than there are parallel map calls
# (presumably so that `bc_clients.pop()` in `get_encodes` never finds the pool empty — TODO confirm).
num_clients = 2 * num_parallel_calls

# Pool of clients shared by the parallel `get_encodes` calls.
bc_clients = [
    BertClient(show_server_config=False)
    for _ in range(num_clients)
]


def get_encodes(x):
    """Encode one batch of JSON lines into BERT features and labels.

    Args:
        x: an iterable of `batch_size` lines, each a JSON object with a
            `raw_text` field and a `label` field.

    Returns:
        A `(features, labels)` pair: whatever `encode` returns for the batch's
        `raw_text` values, and the list of the batch's `label` values.

    Raises:
        IndexError: if the shared `bc_clients` pool is empty.
    """
    # Imported locally because no visible cell of this notebook imports `json`.
    import json

    samples = [json.loads(l) for l in x]
    text = [s['raw_text'] for s in samples]
    labels = [s['label'] for s in samples]
    # Borrow a client from the shared pool ...
    bc_client = bc_clients.pop()
    try:
        features = bc_client.encode(text)
    finally:
        # ... and always hand it back, even when `encode` raises; otherwise
        # every failed batch would permanently shrink the pool.
        bc_clients.append(bc_client)
    return features, labels


# Input pipeline: read `train_fp` line by line, group the lines into batches,
# encode each batch through `get_encodes` (wrapped in `tf.py_func`), and
# expose the next batch as a dict of tensors.
# NOTE: `train_fp` is not defined in any visible cell and must be set first.
ds = (
    tf.data.TextLineDataset(train_fp)
    .batch(batch_size)
    .map(
        lambda lines: tf.py_func(get_encodes, [lines], [tf.float32, tf.string]),
        num_parallel_calls=num_parallel_calls,
    )
    .map(lambda feats, labels: {'feature': feats, 'label': labels})
    .make_one_shot_iterator()
    .get_next()
)

NameError: name 'train_fp' is not defined

# Training a text classifier using BERT features and tf.estimator API

In [None]:
# Classifier trained on the BERT features.
# NOTE(review): `DNNClassifier`, `laws`, `laws_str` and `run_config` are not
# defined or imported in any visible cell; they must be provided before this runs.
# The `'feature'` column name matches the key of the feature dict built by
# `input_fn`, and the width 768 matches the vectors shown earlier in this notebook.
estimator = DNNClassifier(
    hidden_units=[512],
    feature_columns=[tf.feature_column.numeric_column('feature', shape=(768,))],
    n_classes=len(laws),
    config=run_config,
    label_vocabulary=laws_str,
    dropout=0.1)

def input_fn(fp):
    """Build the shuffled, repeated, batched and BERT-encoded dataset for text file `fp`.

    Each element is a `({'feature': features}, labels)` pair produced by
    running `get_encodes` on one batch of lines.
    """
    dataset = tf.data.TextLineDataset(fp)
    dataset = dataset.apply(tf.contrib.data.shuffle_and_repeat(buffer_size=10000))
    dataset = dataset.batch(batch_size)
    # `get_encodes` is plain Python, so it is wrapped in `tf.py_func`.
    dataset = dataset.map(
        lambda lines: tf.py_func(get_encodes, [lines], [tf.float32, tf.string]),
        num_parallel_calls=num_parallel_calls)
    dataset = dataset.map(lambda feats, labels: ({'feature': feats}, labels))
    return dataset.prefetch(20)

# Train and evaluate from two files, both read through the same `input_fn`.
# NOTE(review): `train_fp` and `eval_fp` are not defined in any visible cell, and
# `TrainSpec`, `EvalSpec` and `train_and_evaluate` are not imported in any visible cell.
train_spec = TrainSpec(input_fn=lambda: input_fn(train_fp))
eval_spec = EvalSpec(input_fn=lambda: input_fn(eval_fp), throttle_secs=0)
train_and_evaluate(estimator, train_spec, eval_spec)