# BioMedBERT BigQuery Data Analysis / Pre-training

In [None]:
# imports
import os
import json
import numpy as np
import pandas as pd
import textwrap
import tensorflow as tf
from google.cloud import bigquery

In [None]:
# GCP project the BigQuery client is created for.
project_id = 'ai-vs-covid19'
# BigQuery client bound to that project; the query cells below run through it.
client = bigquery.Client(project=project_id)

## Query Analysis

In [None]:
# Get number of rows
# NOTE: `.total` selects the `total` column of the result DataFrame, so
# `row_count` is a pandas Series holding the count rather than a plain int.
row_count = client.query('''
  SELECT 
    COUNT(*) as total
  FROM `ai-vs-covid19.BigBioMedBERT2.ncbi_comm_use`''').to_dataframe().total
row_count

In [None]:
# get column names
# Reads the `column_name` values recorded for table `ncbi_comm_use` in the
# dataset's INFORMATION_SCHEMA.COLUMNS view and loads them into a DataFrame.
col_names = client.query('''
  SELECT column_name
  FROM `ai-vs-covid19.BigBioMedBERT2`.INFORMATION_SCHEMA.COLUMNS
  WHERE table_name = 'ncbi_comm_use'
''').to_dataframe()
col_names

In [None]:
# get general table information schema
# Same table filter as the column-name query, but selects every field of the
# INFORMATION_SCHEMA.COLUMN_FIELD_PATHS view instead of just the column name.
table_schema = client.query('''
  SELECT *
  FROM `ai-vs-covid19.BigBioMedBERT2`.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS
  WHERE table_name = 'ncbi_comm_use'
''').to_dataframe()
table_schema

In [None]:
# select first 10 rows
# NOTE(review): the query has LIMIT 10 but no ORDER BY, so which ten rows come
# back is presumably not guaranteed to be stable between runs - confirm.
# Later cells index this frame's `Body` column for sample text.
first_10_rows = client.query('''
  SELECT *
  FROM `ai-vs-covid19.BigBioMedBERT2.ncbi_comm_use`
  LIMIT 10
''').to_dataframe()
first_10_rows

In [None]:
# print body text
def print_body(body_series: str) -> None:
  """Print a text body wrapped to textwrap's default line width.

  Args:
    body_series: The text to print. Despite the parameter name, the callers
      in this notebook pass a single string (one ``Body`` value), which is
      what ``textwrap.fill`` expects.

  Returns:
    None. The wrapped text is written to stdout.
  """
  print(textwrap.fill(body_series))

In [None]:
print_body(first_10_rows.Body[3])

## Data Preprocessing

In [None]:
!pip install sentencepiece
# !git clone https://github.com/google-research/bert

In [None]:
# base imports
import os
import sys
import nltk
import sentencepiece as spm

In [None]:
#import bert modules
sys.path.append("bert")
from bert import modeling, optimization, tokenization
from bert.run_pretraining import input_fn_builder, model_fn_builder

### Preprocess text
Remove punctuation, lowercase the text, and drop symbols that cannot be encoded as UTF-8.

In [None]:
# Tokenizer that extracts runs of word characters. The pattern is a raw
# string because "\w" in a regular literal is an invalid escape sequence.
regex_tokenizer = nltk.RegexpTokenizer(r"\w+")

def normalize_text(text):
  """Return a lowercased, punctuation-free, single-spaced version of ``text``.

  Args:
    text: Any object; it is converted with ``str()`` before processing.

  Returns:
    The word tokens of the lowercased text joined by single spaces. Line
    breaks and other whitespace are dropped along with punctuation.
  """
  # lowercase text
  text = str(text).lower()
  # drop characters that cannot be encoded as UTF-8 (e.g. lone surrogates)
  text = text.encode("utf-8", "ignore").decode()
  # remove punctuation symbols: keep only the word tokens
  text = " ".join(regex_tokenizer.tokenize(text))
  return text

In [None]:
# example to normalize text from ``Body`` in BioMedBERT dataset
print_body(normalize_text(first_10_rows.Body[1]))

## Create Expanded csv dataset

In [None]:
import os
import glob
import pandas as pd
from google.cloud import storage

In [None]:
storage_client = storage.Client(project=project_id)

In [None]:
bucket=storage_client.get_bucket('big_bio_med_bert_dump_csv')
# List all objects that satisfy the filter.
blobs=bucket.list_blobs(prefix='ncbi_comm_use')

In [None]:
# Materialise the listing iterator so it can be measured and sliced below.
blob = list(blobs)

In [None]:
print(len(blob))
print(len(blob) // 5)
print((len(blob) // 5)*5)

In [None]:
# Generator that cuts a sequence into consecutive fixed-size chunks.
def split_list(data, chunk):
    """Lazily yield consecutive slices of ``data`` with up to ``chunk`` items.

    The last slice is shorter when ``len(data)`` is not a multiple of
    ``chunk``; an empty ``data`` yields nothing.
    """
    # Start offsets of the slices: 0, chunk, 2 * chunk, ...
    offsets = range(0, len(data), chunk)
    yield from (data[start:start + chunk] for start in offsets)

In [None]:
# Split the blob list into chunks of 50 blobs and bind them to five names.
# NOTE: the tuple unpacking needs exactly five chunks, i.e. between 201 and
# 250 blobs; any other blob count raises ValueError on this line.
blob_split_1, blob_split_2, blob_split_3, blob_split_4, blob_split_5 = list(split_list(blob, 50))

In [None]:
print(len(blob))
print(len(blob_split_1))
print(len(blob_split_2))
print(len(blob_split_3))
print(len(blob_split_4))
print(len(blob_split_5))

In [None]:
blob_split_1[0:5]

In [None]:
blob_split_2[0:5]

In [None]:
def download_to_local(folder, blob_lst):
    """Download every blob in ``blob_lst`` into the local directory ``folder``.

    Each blob is saved under the last ``/``-separated component of its name,
    so blobs from different prefixes that share a base name overwrite each
    other.

    Args:
        folder: Local directory to download into; created if missing.
        blob_lst: Iterable of objects exposing ``name`` and
            ``download_to_filename(path)``.
    """
    print('File download Started…. Wait for the job to complete.')
    # Create the folder if needed; exist_ok avoids the check-then-create race.
    os.makedirs(folder, exist_ok=True)
    # One API call per blob.
    for blob in blob_lst:
        file_name = blob.name.split('/')[-1]
        if not file_name:
            # A name ending in "/" has no file component, so there is nothing
            # sensible to write locally.
            print('Skipped {} (no file name)'.format(blob.name))
            continue
        destination_uri = '{}/{}'.format(folder, file_name)
        blob.download_to_filename(destination_uri)
        print('Exported {} to {}'.format(blob.name, destination_uri))

In [None]:
# !rm -rf data #data_1 data_2 data_3 data_4 #data_5
# !rm ncbi_comm_use_csv_A.csv

In [None]:
# download first part of csv's
# download_to_local('data', blob)
# download_to_local('data_1', blob_split_1)
# download_to_local('data_2', blob_split_2)
# download_to_local('data_3', blob_split_3)
# download_to_local('data_4', blob_split_4)
# download_to_local('data_5', blob_split_5)

In [None]:
# make combined csv
def combined_csv(data_folder):
    """Concatenate every ``*.csv`` file in ``data_folder`` into one DataFrame.

    Files are read in sorted name order so the row order is reproducible.
    The per-file row indexes are kept as they are (not reset).

    Args:
        data_folder: Directory that directly contains the CSV files.

    Returns:
        A DataFrame with the rows of all files stacked.

    Raises:
        ValueError: If the folder contains no ``.csv`` files.
    """
    extension = 'csv'
    # glob returns files in arbitrary order; sort for a deterministic result.
    all_filenames = sorted(glob.glob('{}/*.{}'.format(data_folder, extension)))
    if not all_filenames:
        raise ValueError(
            'No .{} files found in {!r}'.format(extension, data_folder))
    # combine all files in the list
    return pd.concat([pd.read_csv(f) for f in all_filenames])

In [None]:
# blob_csv_A = combined_csv('data')
# blob_csv = combined_csv('data')

In [None]:
# len(blob_csv)

In [None]:
# blob_csv.to_csv('gs://ekaba-assets/ncbi_comm_use.csv')

In [None]:
# blob_csv_A.to_csv( "ncbi_comm_use_csv_A.csv", index=False, encoding='utf-8-sig')
# blob_csv.to_csv( "ncbi_comm_use.csv", index=False, encoding='utf-8-sig')

In [None]:
# copy files from gcs bucket
# !gsutil -m cp gs://ekaba-assets/ncbi_comm_use_BODY.csv .

In [None]:
# body = pd.read_csv('ncbi_comm_use.csv')

In [None]:
# body_sel = body[['Body']]

In [None]:
# body_sel.to_csv('gs://ekaba-assets/ncbi_comm_use_BODY.csv')

In [None]:
# body_sel.to_csv( "ncbi_comm_use_BODY.csv", index=False, encoding='utf-8-sig')

In [None]:
# remove FULL ncbi_comm_use.csv
# !rm ncbi_comm_use.csv

In [None]:
# convert csv to txt
import csv
import sys
maxInt = sys.maxsize
# Raise the csv module's per-field size cap so very long fields can be parsed.
csv.field_size_limit(maxInt)

csv_file = 'ncbi_comm_use_BODY.csv'
txt_file = 'ncbi_comm_use_BODY.txt'
# The input is opened first so a missing CSV does not leave behind an empty
# output file. 'utf-8-sig' reads UTF-8 with or without a BOM, and newline=''
# is what the csv module asks for so quoted line breaks are parsed correctly.
# The output is written as UTF-8, matching how it is read back later.
with open(csv_file, "r", encoding="utf-8-sig", newline="") as my_input_file, \
        open(txt_file, "w", encoding="utf-8") as my_output_file:
    # One output line per CSV row, fields joined by a single space.
    for row in csv.reader(my_input_file):
        my_output_file.write(" ".join(row) + '\n')

In [None]:
# move text file to GCS
# !gsutil -m cp ncbi_comm_use_BODY.txt gs://ekaba-assets/

In [None]:
# remove csv file
# !rm ncbi_comm_use_BODY.csv

In [None]:
from tensorflow.keras.utils import Progbar
def count_lines(filename):
  """Return the number of lines in the text file at ``filename``."""
  with open(filename) as handle:
    # Iterating the handle yields one item per line.
    return sum(1 for _ in handle)

In [None]:
# Normalize the raw text dump line by line into a new file.
RAW_DATA_FPATH = "ncbi_comm_use_BODY.txt"
PRC_DATA_FPATH = "processed_ncbi_comm_use_BODY.txt"

# Size the progress bar from a first pass over the raw file.
total_lines = count_lines(RAW_DATA_FPATH)
bar = Progbar(total_lines)

# Read and write one line at a time; every input line produces exactly one
# output line, and the bar advances once per line.
with open(RAW_DATA_FPATH, encoding="utf-8") as raw_file, \
    open(PRC_DATA_FPATH, "w", encoding="utf-8") as processed_file:
  for raw_line in raw_file:
    processed_file.write(normalize_text(raw_line) + "\n")
    bar.add(1)

In [None]:
# move processed text file to GCS
# !gsutil -m cp processed_ncbi_comm_use_BODY.txt gs://ekaba-assets/

In [None]:
# remove intermediate files
# !rm ncbi_comm_use_BODY.csv ncbi_comm_use_BODY.txt #processed_ncbi_comm_use_BODY.txt

## Building the vocabulary

In [None]:
# Inputs for SentencePiece training: the normalized corpus, the prefix used
# for the trainer's output files, and the vocabulary sizing parameters.
PRC_DATA_FPATH = "processed_ncbi_comm_use_BODY.txt"
MODEL_PREFIX = "biomedbert" #@param {type: "string"}
VOC_SIZE = 32000 #@param {type:"integer"}
SUBSAMPLE_SIZE = 12800000 #@param {type:"integer"}
NUM_PLACEHOLDERS = 256 #@param {type:"integer"}

# Trainer flag string. The requested vocab size is VOC_SIZE minus
# NUM_PLACEHOLDERS; the notebook later prepends control symbols and pads the
# vocabulary with [UNUSED_i] entries up to VOC_SIZE.
# NOTE(review): --bos_id=-1 / --eos_id=-1 presumably disable the BOS/EOS
# pieces - confirm against the SentencePiece trainer options.
SPM_COMMAND = ('--input={} --model_prefix={} '
               '--vocab_size={} --input_sentence_size={} '
               '--shuffle_input_sentence=true ' 
               '--bos_id=-1 --eos_id=-1').format(
               PRC_DATA_FPATH, MODEL_PREFIX, 
               VOC_SIZE - NUM_PLACEHOLDERS, SUBSAMPLE_SIZE)

In [None]:
spm.SentencePieceTrainer.Train(SPM_COMMAND)

In [None]:
!ls

In [None]:
!head -n 30 tokenizer.vocab

In [None]:
def read_sentencepiece_vocab(filepath):
  """Return the pieces listed in a SentencePiece ``.vocab`` file.

  Each line contributes the text before its first tab. The entry on the
  first line (the <unk> token) is left out of the result.
  """
  with open(filepath, encoding='utf-8') as vocab_file:
    pieces = [entry.split("\t")[0] for entry in vocab_file]
  # skip the first <unk> token
  return pieces[1:]

In [None]:
import random

snt_vocab = read_sentencepiece_vocab("{}.vocab".format(MODEL_PREFIX))
print("Learnt vocab size: {}".format(len(snt_vocab)))
print("Sample tokens: {}".format(random.sample(snt_vocab, 10)))

In [None]:
def parse_sentencepiece_token(token):
    """Convert a SentencePiece piece to BERT WordPiece notation.

    A piece starting with the "▁" marker loses that marker; any other piece
    is treated as a continuation and gets a "##" prefix.
    """
    is_word_start = token.startswith("▁")
    return token[1:] if is_word_start else "##" + token

In [None]:
bert_vocab = list(map(parse_sentencepiece_token, snt_vocab))

In [None]:
ctrl_symbols = ["[PAD]","[UNK]","[CLS]","[SEP]","[MASK]"]
bert_vocab = ctrl_symbols + bert_vocab

In [None]:
bert_vocab += ["[UNUSED_{}]".format(i) for i in range(VOC_SIZE - len(bert_vocab))]
print(len(bert_vocab))

In [None]:
# write vocabulary to file
VOC_FNAME = "vocab.txt"

# Explicit UTF-8: the tokens come from a UTF-8 SentencePiece vocabulary and
# may be non-ASCII, so the output must not depend on the platform's default
# encoding. One token per line, in vocabulary order.
with open(VOC_FNAME, "w", encoding="utf-8") as fo:
  fo.writelines(token + "\n" for token in bert_vocab)

In [None]:
bert_tokenizer = tokenization.FullTokenizer(VOC_FNAME)
bert_tokenizer.tokenize(first_10_rows.Body[0])[0:20]

## Generating pre-training data

In [None]:
# sharding the dataset
!mkdir ./shards
!split -a 4 -l 5560 -d $PRC_DATA_FPATH ./shards/shard_
!ls ./shards/

In [None]:
MAX_SEQ_LENGTH = 128 #@param {type:"integer"}
MASKED_LM_PROB = 0.15 #@param
MAX_PREDICTIONS = 20 #@param {type:"integer"}
DO_LOWER_CASE = True #@param {type:"boolean"}
PROCESSES = 2 #@param {type:"integer"}
PRETRAINING_DIR = "pretraining_data" #@param {type:"string"}

For each shard, we need to call the `create_pretraining_data.py` script.

In [None]:
# Shell pipeline that runs bert/create_pretraining_data.py once per shard
# file, PROCESSES at a time. The doubled braces render as the literal "{}"
# that xargs replaces with each shard name.
XARGS_CMD = (
    "ls ./shards/ | "
    "xargs -n 1 -P {processes} -I{{}} "
    "python3 bert/create_pretraining_data.py "
    "--input_file=./shards/{{}} "
    "--output_file={out_dir}/{{}}.tfrecord "
    "--vocab_file={vocab} "
    "--do_lower_case={lower} "
    "--max_predictions_per_seq={max_preds} "
    "--max_seq_length={seq_len} "
    "--masked_lm_prob={mlm_prob} "
    "--random_seed=34 "
    "--dupe_factor=5"
).format(
    processes=PROCESSES,
    out_dir=PRETRAINING_DIR,
    vocab=VOC_FNAME,
    lower=DO_LOWER_CASE,
    max_preds=MAX_PREDICTIONS,
    seq_len=MAX_SEQ_LENGTH,
    mlm_prob=MASKED_LM_PROB,
)

In [None]:
tf.gfile.MkDir(PRETRAINING_DIR)
!$XARGS_CMD

In [None]:
# !gsutil -m cp -r gs://ekaba-assets/pre_trained_data .

# Train the BioMedBERT model

In [None]:
# !gcloud auth application-default login --no-launch-browser

In [None]:
# !pip install --user tensorflow==1.15.0

In [None]:
# !pip install --upgrade "cloud-tpu-profiler"

In [None]:
# !export PATH="$PATH:`python -m site --user-base`/bin"

In [1]:
import os
import sys
import json
import tensorflow as tf
# import tensorflow.compat.v2 as tf2

In [2]:
tf.__version__

'1.15.0'

Save model assets and checkpoints to GCS

In [3]:
BUCKET_NAME = "ekaba-assets"
MODEL_DIR = "bert_model_10M"
tf.io.gfile.mkdir(MODEL_DIR)

Hyperparameter configuration for BERT-base

In [4]:
VOC_SIZE = 32000
VOC_FNAME = "biomedbert-8M.txt"

In [5]:
# use this for BERT-base

# Model hyperparameters that get serialized to bert_config.json; the
# vocabulary size follows VOC_SIZE.
bert_base_config = {
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "type_vocab_size": 2,
  "vocab_size": VOC_SIZE,
}

# Write the config next to the other model assets in MODEL_DIR.
config_path = "{}/bert_config.json".format(MODEL_DIR)
with open(config_path, "w") as config_out:
    config_out.write(json.dumps(bert_base_config, indent=2))
  
# with open("{}/{}".format(MODEL_DIR, VOC_FNAME), "w") as fo:
#   for token in bert_vocab:
#     fo.write(token+"\n")

In [6]:
# !gsutil -m cp -r $MODEL_DIR gs://$BUCKET_NAME

In [7]:
# if BUCKET_NAME:
#   !gsutil -m cp -r $MODEL_DIR $PRETRAINING_DIR gs://$BUCKET_NAME

In [8]:
#import bert modules
sys.path.append("bert")
from bert import modeling, optimization, tokenization
from bert.run_pretraining import input_fn_builder, model_fn_builder

In [9]:
import logging
# configure logging
log = logging.getLogger('tensorflow')
log.setLevel(logging.INFO)

In [10]:
# BUCKET_NAME = "ekaba-assets" # "ekaba-assets" #@param {type:"string"}
# MODEL_DIR = "bert_model_sat_18th_april" #@param {type:"string"}
# Directory (under the bucket) holding the TFRecord shards, and the name of
# the vocabulary file inside the model directory.
PRETRAINING_DIR = "pre_trained_data" #@param {type:"string"}
VOC_FNAME = "biomedbert-8M.txt" #@param {type:"string"}

# Input data pipeline config
TRAIN_BATCH_SIZE = 128 #@param {type:"integer"}
MAX_PREDICTIONS = 20 #@param {type:"integer"}
MAX_SEQ_LENGTH = 128 #@param {type:"integer"}
MASKED_LM_PROB = 0.15 #@param

# Training procedure config
EVAL_BATCH_SIZE = 128 #64
LEARNING_RATE = 1e-5  #2e-5
TRAIN_STEPS = 10000000 #@param {type:"integer"}
SAVE_CHECKPOINTS_STEPS = 2500 #@param {type:"integer"}
NUM_TPU_CORES = 128

# Root for model and data directories: the GCS bucket when one is configured,
# otherwise the current working directory.
BUCKET_PATH = "gs://{}".format(BUCKET_NAME) if BUCKET_NAME else "."

BERT_GCS_DIR = "{}/{}".format(BUCKET_PATH, MODEL_DIR)
DATA_GCS_DIR = "{}/{}".format(BUCKET_PATH, PRETRAINING_DIR)

# These are normally gs:// URIs, which always use "/" as the separator, so
# they are built with string formatting instead of the OS-specific
# os.path.join (same result on POSIX, correct everywhere).
VOCAB_FILE = "{}/{}".format(BERT_GCS_DIR, VOC_FNAME)
CONFIG_FILE = "{}/{}".format(BERT_GCS_DIR, "bert_config.json")

# Latest checkpoint found in the model directory; training resumes from it.
INIT_CHECKPOINT = tf.train.latest_checkpoint(BERT_GCS_DIR)

bert_config = modeling.BertConfig.from_json_file(CONFIG_FILE)
input_files = tf.io.gfile.glob("{}/{}".format(DATA_GCS_DIR, '*tfrecord'))

# Lazy %-style arguments: the message is only formatted if it is emitted.
log.info("Using checkpoint: %s", INIT_CHECKPOINT)
log.info("Using %d data shards", len(input_files))

INFO:tensorflow:Using checkpoint: gs://ekaba-assets/bert_model_10M/model.ckpt-397500
INFO:tensorflow:Using 10000 data shards


In [11]:
print(VOCAB_FILE)
print(CONFIG_FILE)
print(BERT_GCS_DIR)
print(INIT_CHECKPOINT)

gs://ekaba-assets/bert_model_10M/biomedbert-8M.txt
gs://ekaba-assets/bert_model_10M/bert_config.json
gs://ekaba-assets/bert_model_10M
gs://ekaba-assets/bert_model_10M/model.ckpt-397500


In [12]:
# export VOCAB_FILE=gs://ekaba-assets/bert_model_sat_18th_april/biomedbert-8M.txt
# export CONFIG_FILE=gs://ekaba-assets/bert_model_sat_18th_april/bert_config.json
# export INIT_CHECKPOINT=gs://ekaba-assets/bert_model_sat_18th_april/model.ckpt-1000000

In [13]:
# CONFIG_FILE='BioMedBERT/notebooks/bert_model_sat_18th_april/bert_config.json'

**Train on TPUs**

In [14]:
# if 'TPU_NAME' in os.environ:
#     log.info("Using TPU runtime")
#     USE_TPU = True
#     TPU_ADDRESS = 'grpc://' + '10.250.1.2:8470'

In [15]:
# !export BERT_GCS_DIR=BERT_GCS_DIR

In [16]:
# !echo $BERT_GCS_DIR

In [17]:
# !export TPU_NAME=for-ekaba-tpu

In [18]:
# !export STORAGE_BUCKET=gs://ekaba-assets/
# !export MODEL_DIR=gs://ekaba-assets/bert_model

In [19]:
# !capture_tpu_profile --tpu=${TPU_NAME} --logdir=${BERT_GCS_DIR}

In [20]:
# Build everything needed to train on TPU: the model function, the cluster
# resolver, the run configuration, the estimator and the training input_fn.
USE_TPU = True

# Model function for BERT pre-training, initialised from INIT_CHECKPOINT.
# NOTE(review): `num_warmup_steps=10` is hard-coded while TRAIN_STEPS is
# 10,000,000 - confirm this is intended.
# NOTE(review): `log_dir` is presumably an extension in the local `bert`
# copy of model_fn_builder - confirm against that module.
model_fn = model_fn_builder(
      bert_config=bert_config,
      init_checkpoint=INIT_CHECKPOINT,
      learning_rate=LEARNING_RATE,
      num_train_steps=TRAIN_STEPS,
      num_warmup_steps=10,
      use_tpu=USE_TPU,
      use_one_hot_embeddings=True,
      log_dir=BERT_GCS_DIR
)

# tpu_cluster_resolver =  tf.distribute.cluster_resolver.TPUClusterResolver(
#     tpu=TPU_ADDRESS, zone='us-central1-a', project='ai-vs-covid19', job_name='biomedbert')

# No `tpu=` argument is passed here, unlike the commented-out variant above.
# NOTE(review): the TPU is presumably resolved from the environment - confirm.
tpu_cluster_resolver =  tf.distribute.cluster_resolver.TPUClusterResolver(
    zone='europe-west4-a', project='ai-vs-covid19', job_name='biomedbert')

# Checkpoints go to BERT_GCS_DIR every SAVE_CHECKPOINTS_STEPS steps; the same
# value is used for iterations_per_loop, and num_shards is NUM_TPU_CORES.
run_config = tf.compat.v1.estimator.tpu.RunConfig(
    cluster=tpu_cluster_resolver,
    model_dir=BERT_GCS_DIR,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
    tpu_config=tf.compat.v1.estimator.tpu.TPUConfig(
        iterations_per_loop=SAVE_CHECKPOINTS_STEPS,
        num_shards=NUM_TPU_CORES,
        per_host_input_for_training=tf.compat.v1.estimator.tpu.InputPipelineConfig.PER_HOST_V2))

# Estimator that ties the model function to the run configuration and the
# train/eval batch sizes.
estimator = tf.compat.v1.estimator.tpu.TPUEstimator(
    use_tpu=USE_TPU,
    model_fn=model_fn,
    config=run_config,
    train_batch_size=TRAIN_BATCH_SIZE,
    eval_batch_size=EVAL_BATCH_SIZE)
  
# Training input function over the TFRecord shards found in DATA_GCS_DIR.
train_input_fn = input_fn_builder(
        input_files=input_files,
        max_seq_length=MAX_SEQ_LENGTH,
        max_predictions_per_seq=MAX_PREDICTIONS,
        is_training=True)

INFO:tensorflow:Using config: {'_save_checkpoints_secs': None, '_model_dir': 'gs://ekaba-assets/bert_model_10M', '_tf_random_seed': None, '_protocol': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fadd5673978>, '_cluster': <tensorflow.python.distribute.cluster_resolver.tpu_cluster_resolver.TPUClusterResolver object at 0x7fadd559a3c8>, '_device_fn': None, '_keep_checkpoint_max': 5, '_eval_distribute': None, '_experimental_max_worker_delay_secs': None, '_task_id': 0, '_session_config': allow_soft_placement: true
cluster_def {
  job {
    name: "biomedbert"
    tasks {
      key: 0
      value: "10.250.1.8:8470"
    }
    tasks {
      key: 1
      value: "10.250.1.5:8470"
    }
    tasks {
      key: 2
      value: "10.250.1.14:8470"
    }
    tasks {
      key: 3
      value: "10.250.1.11:8470"
    }
    tasks {
      key: 4
      value: "10.250.1.13:8470"
    }
    tasks {
      key: 5
      value: "10.250.1.10:8470"
    }
    

In [None]:
estimator.train(input_fn=train_input_fn, max_steps=TRAIN_STEPS)

INFO:tensorflow:Querying Tensorflow master (grpc://10.250.1.8:8470) for TPU system metadata.
INFO:tensorflow:Found TPU system:
INFO:tensorflow:*** Num TPU Cores: 128
INFO:tensorflow:*** Num TPU Workers: 16
INFO:tensorflow:*** Num TPU Cores Per Worker: 8
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:biomedbert/replica:0/task:0/device:CPU:0, CPU, -1, 6300658717287230532)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:biomedbert/replica:0/task:0/device:TPU:0, TPU, 17179869184, 10342637076824321847)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:biomedbert/replica:0/task:0/device:TPU:1, TPU, 17179869184, 16631036280802566097)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:biomedbert/replica:0/task:0/device:TPU:2, TPU, 17179869184, 2278662464832429552)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:biomedbert/replica:0/task:0/device:TPU:3, TPU, 17179869184, 1597258585061134100)
INFO:tensorflow:*** Available Device: _Devi

## Extract pre-trained contextual embeddings

In [None]:
# !echo 'Who was Jim Henson ? ||| Jim Henson was a puppeteer' > ./input.txt #/tmp/input.txt

In [None]:
INPUT_TXT = 'input_fra.txt'
OUTPUT_FILE = 'output_fra.jsonl'
processes = 2

In [None]:
# Command line for bert/extract_features.py: reads INPUT_TXT, writes
# OUTPUT_FILE, and uses the vocabulary, config and checkpoint set up for
# training. Layers, sequence length and batch size are fixed in the template.
XARGS_CMD = (
    "python3 bert/extract_features.py "
    "--input_file={input_file} "
    "--output_file={output_file} "
    "--vocab_file={vocab_file} "
    "--bert_config_file={config_file} "
    "--init_checkpoint={checkpoint} "
    "--layers=-1,-2,-3,-4 "
    "--max_seq_length=128 "
    "--batch_size=8 "
).format(
    input_file=INPUT_TXT,
    output_file=OUTPUT_FILE,
    vocab_file=VOCAB_FILE,
    config_file=CONFIG_FILE,
    checkpoint=INIT_CHECKPOINT,
)

In [None]:
!$XARGS_CMD

In [None]:
# !python3 bert/extract_features.py --input_file='input.txt' --output_file='output.jsonl' --vocab_file=VOCAB_FILE --bert_config_file=bert_config --init_checkpoint=INIT_CHECKPOINT --layers=-1,-2,-3,-4 --max_seq_length=128 --batch_size=8

In [None]:
biomedbert code extract embeddings "input.txt" "vocabulary/ncbi/biomedbert-8M.txt"


