# BioMedBERT: BREATHE -> (BERT+BioBERT)

In [1]:
# !sudo pip install tensorflow==1.15

In [2]:
import os
import sys
import json
import tensorflow as tf

In [3]:
# Display the TensorFlow version that this kernel actually imported.
tf.__version__

'1.15.0'

Save model assets and checkpoints to GCS

In [4]:
# Name of the GCS bucket; a later cell turns it into the "gs://<bucket>" path prefix.
BUCKET_NAME = "ekaba-assets"
# Directory name under which the BERT config, vocabulary file and checkpoints are looked up.
MODEL_DIR = "biomedbert_base"
# MODEL_DIR is a relative path at this point, so this creates the directory
# relative to the current working directory rather than inside the bucket.
tf.io.gfile.mkdir(MODEL_DIR)

Hyperparameter configuration for BERT-base

In [5]:
# VOC_SIZE = 32000
# VOC_FNAME = "biomedbert-8M.txt"

In [6]:
# # use this for BERT-base

# bert_base_config = {
#     "attention_probs_dropout_prob": 0.1,
#     "directionality": "bidi",
#     "hidden_act": "gelu",
#     "hidden_dropout_prob": 0.1,
#     "hidden_size": 768,
#     "initializer_range": 0.02,
#     "intermediate_size": 3072,
#     "max_position_embeddings": 512,
#     "num_attention_heads": 12,
#     "num_hidden_layers": 12,
#     "pooler_fc_size": 768,
#     "pooler_num_attention_heads": 12,
#     "pooler_num_fc_layers": 3,
#     "pooler_size_per_head": 128,
#     "pooler_type": "first_token_transform",
#     "type_vocab_size": 2,
#     "vocab_size": VOC_SIZE
# }

# with open("{}/bert_config.json".format(MODEL_DIR), "w") as fo:
#     json.dump(bert_base_config, fo, indent=2)

In [7]:
# # update vocab
# !cp ../vocabulary/full_text/biomedbert-8M.txt biomedbert_base/biomedbert-8M.txt

In [8]:
# # move to GCS
# !gsutil -m cp -r $MODEL_DIR gs://ekaba-assets/

In [9]:
# Import the BERT modules.
# The "bert" directory is put on sys.path in addition to being imported as a
# package -- presumably so that modules inside that checkout can import each
# other by bare name (TODO confirm against the bert sources).
# NOTE(review): the path is relative, so this depends on the kernel's working directory.
sys.path.append("bert")
from bert import modeling, optimization, tokenization
from bert.run_pretraining import input_fn_builder, model_fn_builder

In [10]:
import logging
# Configure logging: fetch the logger named 'tensorflow' and set its threshold
# to INFO so that the log.info(...) calls in later cells are emitted.
log = logging.getLogger('tensorflow')
log.setLevel(logging.INFO)

In [11]:
# Directory holding the pre-training data shards and the vocabulary file name.
PRETRAINING_DIR = "pre_trained_data_full_biomed"
VOC_FNAME = "biomedbert-8M.txt"

# ---- Input data pipeline ----
TRAIN_BATCH_SIZE = 128  # 128 -> 12.8K -> 1.2K
MAX_PREDICTIONS = 20 #@param {type:"integer"}
MAX_SEQ_LENGTH = 128 #@param {type:"integer"}
MASKED_LM_PROB = 0.15 #@param

# ---- Training procedure ----
EVAL_BATCH_SIZE = 128  # 64, 128 - 12.8K -> 1.2K
LEARNING_RATE = 1e-5  # 2e-5
TRAIN_STEPS = 100000000  # 1M -> 100M
SAVE_CHECKPOINTS_STEPS = 25000  # 2500 -> 25K
NUM_TPU_CORES = 128

# ---- Storage locations ----
# With a bucket name configured everything lives under gs://<bucket>;
# with an empty name the current directory is used instead.
BUCKET_PATH = "gs://{}".format(BUCKET_NAME) if BUCKET_NAME else "."

BERT_GCS_DIR = "{}/{}".format(BUCKET_PATH, MODEL_DIR)
DATA_GCS_DIR = "{}/{}".format(BUCKET_PATH, PRETRAINING_DIR)

VOCAB_FILE = os.path.join(BERT_GCS_DIR, VOC_FNAME)
CONFIG_FILE = os.path.join(BERT_GCS_DIR, "bert_config.json")

# Most recent checkpoint found in the model directory, used as the starting point,
# e.g. gs://ekaba-assets/biomedbert_base/model.ckpt-20577500
INIT_CHECKPOINT = tf.train.latest_checkpoint(BERT_GCS_DIR)

# Model hyperparameters are read from the JSON config stored next to the checkpoints.
bert_config = modeling.BertConfig.from_json_file(CONFIG_FILE)

# Every file in the data directory whose name ends in "tfrecord" is one input shard.
input_files = tf.io.gfile.glob(os.path.join(DATA_GCS_DIR,'*tfrecord'))

# Report what was resolved so the run can be checked at a glance.
log.info("Using checkpoint: {}".format(INIT_CHECKPOINT))
log.info("Using {} data shards".format(len(input_files)))

INFO:tensorflow:Using checkpoint: gs://ekaba-assets/biomedbert_base/model.ckpt-20577500
INFO:tensorflow:Using 424 data shards


**Train on TPUs**

In [12]:
# Name of the Cloud TPU this notebook trains on.
# It is set through os.environ instead of `export` inside a %%bash cell: a
# %%bash cell runs in its own shell subprocess, so a variable exported there
# disappears when the cell finishes and is never visible to the Python kernel
# or to any later cell.
os.environ["TPU_NAME"] = "biomedbert-preempt"
print(os.environ["TPU_NAME"])

biomedbert-preempt


In [13]:
USE_TPU = True

# Shorthand for the TF1-compat TPU estimator namespace used several times below.
tpu_api = tf.compat.v1.estimator.tpu

# Model function built by the BERT pre-training script from the loaded config,
# the starting checkpoint and the optimisation settings defined earlier.
model_fn = model_fn_builder(
    bert_config=bert_config,
    init_checkpoint=INIT_CHECKPOINT,
    learning_rate=LEARNING_RATE,
    num_train_steps=TRAIN_STEPS,
    num_warmup_steps=10,
    use_tpu=USE_TPU,
    use_one_hot_embeddings=True,
)

# Locate the TPU by name within the given GCP project and zone.
tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
    tpu='biomedbert-preempt',
    zone='europe-west4-a',
    project='ai-vs-covid19',
    job_name='biomedbert',
)

# TPU-specific settings; iterations_per_loop is deliberately given the same
# value as the checkpoint interval.
tpu_config = tpu_api.TPUConfig(
    iterations_per_loop=SAVE_CHECKPOINTS_STEPS,
    num_shards=NUM_TPU_CORES,
    per_host_input_for_training=tpu_api.InputPipelineConfig.PER_HOST_V2,
)

# Checkpoints are written to the same directory the initial checkpoint was read from.
run_config = tpu_api.RunConfig(
    cluster=tpu_cluster_resolver,
    model_dir=BERT_GCS_DIR,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
    tpu_config=tpu_config,
)

estimator = tpu_api.TPUEstimator(
    use_tpu=USE_TPU,
    model_fn=model_fn,
    config=run_config,
    train_batch_size=TRAIN_BATCH_SIZE,
    eval_batch_size=EVAL_BATCH_SIZE,
)

# Training input pipeline over the tfrecord shards found earlier.
train_input_fn = input_fn_builder(
    input_files=input_files,
    max_seq_length=MAX_SEQ_LENGTH,
    max_predictions_per_seq=MAX_PREDICTIONS,
    is_training=True,
    num_cpu_threads=64,
)

RuntimeError: TPU "biomedbert-preempt" is not yet ready; state: "PREEMPTED"

In [None]:
# Start pre-training: feed the estimator from the training input pipeline and
# pass TRAIN_STEPS as the max_steps bound.
estimator.train(input_fn=train_input_fn, max_steps=TRAIN_STEPS)