In [None]:
# @markdown Copyright 2020 The ALBERT Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

# ALBERT End to End (Fine-tuning + Predicting) with Cloud TPU

## Set up environment

In [None]:
# Colab line magic selecting the TensorFlow 1.x runtime; the cells below use
# TF1-only APIs (tensorflow.contrib, tf.Session, tf.logging).
%tensorflow_version 1.x

In [None]:
# Clone the source code from GitHub.
import sys

!test -d albert || git clone https://github.com/google-research/albert albert
if not 'albert' in sys.path:
  sys.path += ['albert']
  
!pip install -q sentencepiece

# Download glue data.
! test -d download_glue_repo || git clone https://gist.github.com/60c2bdb54d156a41194446737ce03e2e.git download_glue_repo

In [None]:
import os
import pprint
import json
import logging
import numpy as np
import itertools
import collections
import subprocess

from google.colab import auth

import tensorflow as tf
from tensorflow.contrib import (
  cluster_resolver as contrib_cluster_resolver,
  tpu as contrib_tpu
)

from albert import (
    tokenization,
    modeling,
    classifier_utils,
    fine_tuning_utils,
    tags_utils
)

In [None]:
#@markdown # Configure LOGGING
LOGGING_LEVEL = "DEBUG" #@param ["DEBUG", "INFO", "WARNING", "ERROR"]

# Numeric scale of the TF_CPP_MIN_LOG_LEVEL environment variable (see the table
# at the end of this cell).
LOGGING_LEVEL_TO_INT = {
    "DEBUG": 0,
    "INFO": 1,
    "WARNING": 2,
    "ERROR": 3
}
LOGGING_VERBOSITY = LOGGING_LEVEL_TO_INT.get(LOGGING_LEVEL, 0)

# The same choice on the Python `logging` scale. Unknown names fall back to
# DEBUG, mirroring the default used for LOGGING_VERBOSITY above.
PYTHON_LOGGING_LEVEL = {
    "DEBUG": logging.DEBUG,
    "INFO": logging.INFO,
    "WARNING": logging.WARNING,
    "ERROR": logging.ERROR
}.get(LOGGING_LEVEL, logging.DEBUG)

log = logging.getLogger('tensorflow')
log.setLevel(PYTHON_LOGGING_LEVEL)

# create formatter and add it to the handlers
formatter = logging.Formatter('%(asctime)s :  %(message)s')
sh = logging.StreamHandler()
# The handler follows the selected level: a handler pinned to INFO would drop
# DEBUG records even when the logger itself is set to DEBUG.
sh.setLevel(PYTHON_LOGGING_LEVEL)
sh.setFormatter(formatter)
log.handlers = [sh]

# tf.logging verbosity is expressed with the same numeric values as the Python
# `logging` levels, so the value can be passed through directly.
tf.logging.set_verbosity(PYTHON_LOGGING_LEVEL)
# The environment variable expects the numeric level (0-3), not the level name.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = str(LOGGING_VERBOSITY)

# configure logging
# NOTE(review): tf.autograph.set_verbosity treats larger numbers as MORE
# verbose, which looks inverted relative to the table below -- confirm intent.
tf.autograph.set_verbosity(LOGGING_VERBOSITY)
#   Level | Level for Humans | Level Description
#  -------|------------------|------------------------------------
#   0     | DEBUG            | [Default] Print all messages
#   1     | INFO             | Filter out INFO messages
#   2     | WARNING          | Filter out INFO & WARNING messages
#   3     | ERROR            | Filter out all messages

In [None]:
# Set up Colab TPU running environment and verify connection to TPU device

# Upload credentials to TPU to access GCS bucket.
auth.authenticate_user()

# Both names are always defined so that later cells can reference them on
# CPU/GPU runtimes too (TPU_ADDRESS stays None when no TPU is attached).
USE_TPU = 'COLAB_TPU_ADDR' in os.environ
TPU_ADDRESS = None

if USE_TPU:
  log.info("Using TPU runtime")
  TPU_ADDRESS = 'grpc://' + os.environ['COLAB_TPU_ADDR']

  with tf.Session(TPU_ADDRESS) as session:
    log.info('TPU address is ' + TPU_ADDRESS)
    print('TPU devices:')
    pprint.pprint(session.list_devices())
    # Upload credentials to TPU.
    with open('/content/adc.json', 'r') as f:
      auth_info = json.load(f)
    tf.contrib.cloud.configure_gcs(session, credentials=auth_info)

else:
  log.warning('Not connected to TPU runtime')

In [None]:
#@markdown # GCS and TF hub configuration

FROM_TF_HUB = False #@param {type:"boolean"}

#@markdown ---
#@markdown ### GCS model (fill if not FROM_TF_HUB)

PROJECT_ID = '' #@param {type:"string"}
os.environ['GOOGLE_CLOUD_PROJECT'] = PROJECT_ID

!gcloud config set project $PROJECT_ID
PRETRAINING_DIR = "" #@param {type:"string"}
BUCKET_NAME = "" #@param {type:"string"}
MODEL_NAME = "" #@param {type:"string"}
PHASE = "1" #@param ["1", "2"]

BASE_DIR = "gs://" + BUCKET_NAME
if not BASE_DIR or BASE_DIR == "gs://":
  log.warning("WARNING: BUCKET_NAME is not set. "
              "You will not be able to train the model.")
else:
  BUCKET_PATH = "gs://{}".format(BUCKET_NAME)

TASK_DATA_DIR = 'glue_data'

#@markdown ---
#@markdown ### TF hub model (fill if FROM_TF_HUB)

if FROM_TF_HUB:
  ALBERT_MODEL_HUB = 'https://tfhub.dev/google/albert_' + ALBERT_MODEL + '/' + VERSION
  ALBERT_MODEL = 'base' #@param ["base", "large", "xlarge", "xxlarge"]
  VERSION = "3" #@param ["1", "2", "3"]
else:
  ALBERT_MODEL_HUB = None
  ALBERT_GCS_DIR = "{}/{}".format(BUCKET_PATH, MODEL_NAME)
  DATA_GCS_DIR = "{}/{}".format(BUCKET_PATH, PRETRAINING_DIR)
  ALBERT_CONFIG_FILE = os.path.join(ALBERT_GCS_DIR, "albert_config.json")

if not FROM_TF_HUB and (not BASE_DIR or BASE_DIR == "gs://"):
  raise ValueError("You must configure at least one of"
                   "`TF_HUB` and `BUCKET_NAME`")

In [None]:
#@markdown # Vocabulary definition
# The two file names below are given relative to ALBERT_GCS_DIR.
VOC_SIZE = 30000 #@param {type: "integer"}
VOCAB_FILE = "30k-clean-v2.vocab" #@param {type:"string"}
SPM_MODEL_FILE = "30k-clean-v2.model" #@param {type:"string"}
DO_LOWER_CASE = True #@param {type:"boolean"}

# Turn the relative names into full paths inside the model directory.
VOCAB_FILE = os.path.join(ALBERT_GCS_DIR, VOCAB_FILE)
SPM_MODEL_FILE = os.path.join(ALBERT_GCS_DIR, SPM_MODEL_FILE)
# Only warn when a file is missing; nothing is raised in this cell.
if not tf.io.gfile.exists(VOCAB_FILE):
   log.warning("WARNING: vocab file: {} does not exists. You will not be able to evaluate the model.".format(VOCAB_FILE))
if not tf.io.gfile.exists(SPM_MODEL_FILE):
   log.warning("WARNING: spm model file: {} does not exists. You will not be able to evaluate the model.".format(SPM_MODEL_FILE))

In [None]:
#@markdown # Evaluation and optimization parameters

# DO_TRAIN = True #@param {type:"boolean"}
# DO_EVAL = True #@param {type:"boolean"}
# DO_EVAL_ON_PTB = True #@param {type:"boolean"}
# DO_PREDICT = True #@param {type:"boolean"}
# TASK = "MRPC" #@param ["MRPC", "MNLI", "QNLI", "QQP", "RTE", "SST-2", "CoLA", "STS-B", "WNLI"]
# Root directory under which the Train / Eval / Predict cells create one
# sub-directory per GLUE task.
GLUE_DIR = os.path.join(ALBERT_GCS_DIR, 'GLUE')
# When False, the per-task outputs go to an extra 'SUM' sub-directory.
USE_BOS = True #@param {type:"boolean"}

if not tf.io.gfile.exists(GLUE_DIR):
  tf.gfile.MakeDirs(GLUE_DIR)
  
# TAU is only passed to the tags_utils model builder further down.
TAU = 5e-4 #@param {type:"number"}
NUM_TPU_CORES = 8 #@param {type:"integer"}
OPTIMIZER = "lamb" #@param ["adamw", "lamb"]
EVAL_BATCH_SIZE = 32 #@param {type:"integer"}
PREDICT_BATCH_SIZE = 32 #@param {type:"integer"}
MAX_SEQ_LENGTH = 128 #@param {type:"integer"}
NUM_ACCUMULATION_STEPS =  1#@param {type:"integer"}
MAX_EPOCHS = 4 #@param {type:"integer"}
WARMUP_STEPS = 200 #@param {type:"integer"}
# The two settings below are derived per run in the Train cell (one checkpoint
# per epoch), so the form fields are disabled here.
# SAVE_CHECKPOINTS_STEPS = 200 #@param {type:"integer"}
# ITERATIONS_PER_LOOP = 200 #@param {type:"integer"}
# ITERATIONS_PER_LOOP = int(min(ITERATIONS_PER_LOOP,
#                               SAVE_CHECKPOINTS_STEPS))

In [None]:
#@markdown # Hyper parameters search range

TRAIN_BATCH_SIZES = 16, 32 #@param
TRAIN_BATCH_SIZES = [int(batch_size) for batch_size in TRAIN_BATCH_SIZES]
LEARNING_RATES = 5e-5, 3e-5, 2e-5 #@param
LEARNING_RATES = [float(rate) for rate in LEARNING_RATES]
N_SEEDS = 5 #@param {type:"integer"}
MULTI_SEEDS_TASKS = "CoLA, STS-B, RTE, MRPC, SST-2, WNLI" #@param {type:"string"}
# Lower-cased, whitespace-trimmed names of the tasks trained with N_SEEDS seeds.
MULTI_SEED_TASKS = [
    str(task).lower().strip() for task in MULTI_SEEDS_TASKS.split(',')
]

# Axes of the grid; the key order fixes the order of the values in each
# (learning_rate, batch_size) tuple produced below.
hyper_parameters_gridsearch = {
    'learning_rate': LEARNING_RATES,
    'batch_size': TRAIN_BATCH_SIZES
}

# Every combination of the axes above.
hyper_parameters_values = list(
    itertools.product(*hyper_parameters_gridsearch.values()))

In [None]:
# Latest checkpoint of the selected pre-training phase; it is passed to the
# model builders below as `init_checkpoint`.
INIT_CHECKPOINT = tf.train.latest_checkpoint(
    os.path.join(ALBERT_GCS_DIR, "phase_{}".format(PHASE)))
# The placeholder is filled in here (it used to be logged as a literal "{}").
log.info('***** Model checkpoint: {} *****'.format(INIT_CHECKPOINT))

In [None]:
# Download the GLUE data into TASK_DATA_DIR with the script cloned in the
# setup cell. No --tasks filter is passed; the commented-out lines are the
# remains of a single-task variant that renamed the task around the call.

# if TASK == 'STS-B':
#   TASK = 'STS'
# elif TASK == 'SST-2':
#   TASK = 'SST'

!python download_glue_repo/download_glue_data.py --data_dir=$TASK_DATA_DIR 
# --tasks=$TASK
print('***** Task data directory: {} *****'.format(TASK_DATA_DIR))

# if TASK == 'STS':
#   TASK = 'STS-B'
# elif TASK == 'SST':
 #  TASK = 'SST-2'

In [None]:
# Config definition

# Lower-cased task name -> processor class used to read that task's data.
processors = {
    "cola": classifier_utils.ColaProcessor,
    "mnli": classifier_utils.MnliProcessor,
    "mismnli": classifier_utils.MisMnliProcessor,
    "mrpc": classifier_utils.MrpcProcessor,
    "rte": classifier_utils.RteProcessor,
    "sst-2": classifier_utils.Sst2Processor,
    "sts-b": classifier_utils.StsbProcessor,
    "qqp": classifier_utils.QqpProcessor,
    "qnli": classifier_utils.QnliProcessor,
    "wnli": classifier_utils.WnliProcessor,
    "diagnostic": classifier_utils.AXProcessor,
}

if FROM_TF_HUB:
  albert_config = None  # Get the config from TF-Hub.
else:
  albert_config = modeling.AlbertConfig.from_json_file(ALBERT_CONFIG_FILE)
  # The requested sequence length must fit the model's position embeddings.
  if albert_config.max_position_embeddings < MAX_SEQ_LENGTH:
    raise ValueError(
        "Cannot use sequence length %d because the ALBERT model "
        "was only trained up to sequence length %d" %
        (MAX_SEQ_LENGTH, albert_config.max_position_embeddings))

# A cluster resolver is only built when a TPU is attached.
tpu_cluster_resolver = (
    contrib_cluster_resolver.TPUClusterResolver(
        TPU_ADDRESS, zone=None, project=PROJECT_ID)
    if USE_TPU and TPU_ADDRESS else None)

is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2

tokenizer = fine_tuning_utils.create_vocab(
    vocab_file=VOCAB_FILE,
    do_lower_case=DO_LOWER_CASE,
    spm_model_file=SPM_MODEL_FILE,
    hub_module=ALBERT_MODEL_HUB)

# tokenizer = tokenization.FullTokenizer(
#       vocab_file=VOCAB_FILE, 
#       do_lower_case=True,
#       spm_model_file=SPM_FILE)

# Cell output: tokenization of a sample sentence as a quick sanity check.
tokenizer.tokenize("This here's an example of using the ALBERT tokenizer")

In [None]:
def train_(estimator, task_name, label_list, max_seq_length, tokenizer, 
           task_data_dir, output_dir, use_tpu, train_batch_size, train_steps):
  """Trains `estimator` on the training split of one task.

  The training examples are read through the module-level `processor` (set by
  the calling cell), converted to a TFRecord file inside `output_dir` unless
  that file already exists, and fed to `estimator.train`.

  Args:
    estimator: estimator to train.
    task_name: lower-cased task name; also used as the record file prefix.
    label_list: labels handed to the feature conversion.
    max_seq_length: sequence length used for conversion and for the input fn.
    tokenizer: tokenizer handed to the feature conversion.
    task_data_dir: directory the processor reads the examples from.
    output_dir: directory that holds `<task_name>_train.tf_record`.
    use_tpu: forwarded to the input function builder.
    train_batch_size: batch size logged and forwarded as `bsz`.
    train_steps: value passed to `estimator.train` as `max_steps`.
  """
  examples = processor.get_train_examples(task_data_dir)
  record_path = os.path.join(output_dir, task_name + "_train.tf_record")

  # The conversion is skipped when a record file from a previous call exists.
  record_is_cached = tf.gfile.Exists(record_path)
  if not record_is_cached:
    classifier_utils.file_based_convert_examples_to_features(
        examples, label_list, max_seq_length, tokenizer,
        record_path, task_name)

  tf.logging.info("***** Running training *****")
  tf.logging.info("  Num examples = %d", len(examples))
  tf.logging.info("  Batch size = %d", train_batch_size)
  tf.logging.info("  Num steps = %d", train_steps)

  # tags_utils.file_based_input_fn_builder(
  input_fn = classifier_utils.file_based_input_fn_builder(
      input_file=record_path,
      seq_length=max_seq_length,
      is_training=True,
      drop_remainder=True,
      task_name=task_name,
      use_tpu=use_tpu,
      bsz=train_batch_size)

  estimator.train(input_fn=input_fn, max_steps=train_steps)

## Train

In [None]:
# Fine-tune every task for each (learning rate, batch size) combination and,
# for the tasks listed in MULTI_SEED_TASKS, for N_SEEDS runs per combination.
for TASK in ["MRPC", "MNLI", "QNLI", "QQP", "RTE", "SST-2", "CoLA", "STS-B"]:

  print(TASK)

  if USE_BOS:
    OUTPUT_DIR = os.path.join(GLUE_DIR, TASK)
  else:
    OUTPUT_DIR = os.path.join(GLUE_DIR, TASK, 'SUM')

  if not tf.io.gfile.exists(OUTPUT_DIR):
    tf.gfile.MakeDirs(OUTPUT_DIR)

  log.info('***** Model output directory: {} *****'.format(OUTPUT_DIR))

  task_name = TASK.lower()
  if task_name not in processors:
    raise ValueError("Task not found: %s" % (task_name))

  # `processor` is also read as a global by train_().
  processor = processors[task_name](
      use_spm=bool(SPM_MODEL_FILE),
      do_lower_case=DO_LOWER_CASE)

  n_seeds_ = N_SEEDS if task_name in MULTI_SEED_TASKS else 1

  label_list = processor.get_labels()
  train_examples = processor.get_train_examples(TASK_DATA_DIR)
  n_train_examples = len(train_examples)

  for learning_rate, train_batch_size in hyper_parameters_values:
    # One checkpoint and one TPU loop per epoch. These values depend only on
    # the batch size, so they are computed once per grid point.
    n_iterations_per_epoch = n_train_examples // train_batch_size
    num_train_steps = int(MAX_EPOCHS * n_iterations_per_epoch)
    SAVE_CHECKPOINTS_STEPS = n_iterations_per_epoch
    ITERATIONS_PER_LOOP = n_iterations_per_epoch

    for seed in range(n_seeds_):

      # NOTE(review): `seed` only names the run directory; it is not passed to
      # the estimator configuration -- confirm this is intended.
      if task_name in ['mismnli', "diagnostic"]:
        run_output_dir = os.path.join(os.path.join(GLUE_DIR, 'MNLI'), 
            str(learning_rate), str(train_batch_size), str(seed))
      else:
        run_output_dir = os.path.join(
            OUTPUT_DIR, str(learning_rate), str(train_batch_size), str(seed))
      print(run_output_dir)

      if not tf.io.gfile.exists(run_output_dir):
        # Creates a directory and all parent/intermediate directories.
        tf.gfile.MakeDirs(run_output_dir)

      run_config = contrib_tpu.RunConfig(
        # cluster=tpu_cluster_resolver,
        # TPU_ADDRESS is only read on TPU runtimes, so the CPU/GPU fallback
        # mentioned below does not fail on an undefined name.
        master=TPU_ADDRESS if USE_TPU else None,
        model_dir=run_output_dir,
        save_checkpoints_steps=int(SAVE_CHECKPOINTS_STEPS),
        keep_checkpoint_max=0,
        tpu_config=contrib_tpu.TPUConfig(
          iterations_per_loop=ITERATIONS_PER_LOOP,
          num_shards=NUM_TPU_CORES,
          per_host_input_for_training=is_per_host))

      # model_fn = tags_utils.model_fn_builder(
      model_fn = classifier_utils.model_fn_builder(
        albert_config=albert_config,
        num_labels=len(label_list),
        init_checkpoint=INIT_CHECKPOINT,
        learning_rate=learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=WARMUP_STEPS,
        use_tpu=USE_TPU,
        use_one_hot_embeddings=USE_TPU,
        task_name=task_name,
        hub_module=ALBERT_MODEL_HUB,
        optimizer=OPTIMIZER,)
        # tau=TAU,
        # use_bos=USE_BOS)

      # If TPU is not available, this will fall back to Estimator on CPU or GPU.
      estimator = contrib_tpu.TPUEstimator(
        use_tpu=USE_TPU,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=train_batch_size,
        eval_batch_size=EVAL_BATCH_SIZE,
        export_to_tpu=False)

      if task_name not in ['mismnli', "diagnostic"]:
        train_(
            estimator,
            task_name,
            label_list,
            MAX_SEQ_LENGTH,
            tokenizer,
            TASK_DATA_DIR,
            run_output_dir,
            USE_TPU,
            train_batch_size,
            num_train_steps)
        

## Eval

In [None]:
# One evaluated-checkpoint candidate: the task it belongs to, the
# hyper-parameters and seed of the run, the checkpoint's position within the
# run, and the path of its ".index" file.
Runs = collections.namedtuple(
    'Runs',
    ['task', 'learning_rate', 'batch_size', 'seed', 'epoch', 'ckpt_path'])

def _find_valid_cands(filenames):
  candidates = []
  for filename in filenames:
    if filename.endswith(".index"):
      ckpt_name = filename[:-6]
      idx = int(ckpt_name.split("-")[-1])
      candidates.append((idx, filename))
  candidates.sort(key=lambda t: t[0]) 
  return candidates

def _find_runs(output_dir, task_name, stop_ckpt=None):
  """Collects the checkpoints to evaluate below `output_dir`.

  Every directory without sub-directories is treated as one run directory laid
  out as `<...>/<learning_rate>/<batch_size>/<seed>`. Its checkpoints are
  ordered by step and the first two are skipped.

  Args:
    output_dir: root directory to walk.
    task_name: value stored in the `task` field of every returned entry.
    stop_ckpt: optional container of checkpoint paths to leave out.

  Returns:
    A list of `Runs` tuples; `epoch` is the position of the checkpoint within
    its run directory and `ckpt_path` the path of its ".index" file.
  """
  runs = []
  for run_dir, subdirs, files in tf.io.gfile.walk(output_dir):
    if subdirs:
      continue  # only leaf directories hold checkpoints of a run
    for epoch, (step, cand) in enumerate(_find_valid_cands(files)):
      if epoch < 2:
        continue
      ckpt_path = os.path.join(run_dir, cand)
      if stop_ckpt is not None and ckpt_path in stop_ckpt:
        continue
      # The walked path may or may not end with "/", so the separator is
      # stripped before taking the last three components. Without it, a path
      # lacking the trailing slash would shift the fields by one.
      lr, bsz, seed = run_dir.rstrip('/').split('/')[-3:]
      runs.append(Runs(task_name, float(lr), int(bsz), int(seed),
                       int(epoch), ckpt_path))
  return runs

In [None]:
def get_eval_results(output_eval_file, task_name):
  """Downloads an evaluation results file and returns its lines.

  The file is copied with `gsutil cp` to `eval_results_<task_name>.txt` in the
  current working directory.

  Args:
    output_eval_file: source path handed to `gsutil cp`.
    task_name: used to name the local copy.

  Returns:
    The stripped, non-empty lines of the file, or an empty list when the copy
    failed or produced no file.
  """
  local_copy = "eval_results_{}.txt".format(task_name)

  # Remove the copy left by an earlier call: otherwise a failed download would
  # silently return results that no longer match `output_eval_file`.
  if os.path.exists(local_copy):
    os.remove(local_copy)

  # An argument list keeps paths containing whitespace intact.
  exit_code = subprocess.call(
      ["gsutil", "cp", output_eval_file, local_copy],
      stdout=subprocess.DEVNULL)
  if exit_code != 0 or not os.path.exists(local_copy):
    return []

  with open(local_copy, 'r') as f:
    stripped = (line.strip() for line in f)
    # Blank lines carry no result and would break the tab-split parsers.
    return [line for line in stripped if line]

def get_already_eval_ckpt(eval_results):
  """Returns the checkpoint path recorded in each evaluation result line.

  Args:
    eval_results: iterable of lines with exactly seven tab-separated fields:
      task, learning rate, batch size, epoch, seed, checkpoint path, result.

  Returns:
    The checkpoint paths, in the order of the input lines.
  """
  def _ckpt_of(line):
    # Unpacking keeps the "exactly seven fields" requirement.
    _task, _lr, _bsz, _epoch, _seed, ckpt, _result = line.split('\t')
    return ckpt

  return [_ckpt_of(line) for line in eval_results]

In [None]:
def eval_(estimator, task_name, label_list, max_seq_length, tokenizer, 
           task_data_dir, output_dir, use_tpu, eval_batch_size, checkpoint_path):
  """Evaluates one checkpoint on the dev split of a task.

  The dev examples are read through the module-level `processor` (set by the
  calling cell), converted to a TFRecord file inside `output_dir` unless that
  file already exists, and passed to `estimator.evaluate`.

  Args:
    estimator: estimator used for the evaluation.
    task_name: lower-cased task name; also used as the record file prefix.
    label_list: labels handed to the feature conversion.
    max_seq_length: sequence length used for conversion and for the input fn.
    tokenizer: tokenizer handed to the feature conversion.
    task_data_dir: directory the processor reads the examples from.
    output_dir: directory that holds `<task_name>_eval.tf_record`.
    use_tpu: whether to pad the examples and fix the number of steps.
    eval_batch_size: batch size used for padding and for the input fn.
    checkpoint_path: checkpoint to evaluate.

  Returns:
    The metrics dictionary returned by `estimator.evaluate`.
  """
  eval_examples = processor.get_dev_examples(task_data_dir)
  num_actual_eval_examples = len(eval_examples)
  if use_tpu:
    # TPU requires a fixed batch size for all batches, therefore the number
    # of examples must be a multiple of the batch size, or else examples
    # will get dropped. So we pad with fake examples which are ignored
    # later on. These do NOT count towards the metric (all tf.metrics
    # support a per-instance weight, and these get a weight of 0.0).
    while len(eval_examples) % eval_batch_size != 0:
      eval_examples.append(classifier_utils.PaddingInputExample())

  eval_file = os.path.join(output_dir, task_name + "_eval.tf_record")
  # The record file is reused when it already exists.
  if not tf.gfile.Exists(eval_file):
    classifier_utils.file_based_convert_examples_to_features(
        eval_examples, label_list, max_seq_length, tokenizer,
        eval_file, task_name)

  tf.logging.info("***** Running evaluation *****")
  tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                  len(eval_examples), num_actual_eval_examples,
                  len(eval_examples) - num_actual_eval_examples)
  tf.logging.info("  Batch size = %d", eval_batch_size)

  # This tells the estimator to run through the entire set.
  eval_steps = None
  # However, if running eval on the TPU, you will need to specify the
  # number of steps.
  if use_tpu:
    assert len(eval_examples) % eval_batch_size == 0
    eval_steps = len(eval_examples) // eval_batch_size

  # tags_utils
  eval_input_fn = classifier_utils.file_based_input_fn_builder(
      input_file=eval_file,
      seq_length=max_seq_length,
      is_training=False,
      drop_remainder=bool(use_tpu),
      task_name=task_name,
      use_tpu=use_tpu,
      bsz=eval_batch_size)

  result = estimator.evaluate(
      input_fn=eval_input_fn,
      steps=eval_steps,
      checkpoint_path=checkpoint_path)

  tf.logging.info("***** Eval results *****")
  for key in sorted(result.keys()):
    tf.logging.info("  %s = %s", key, str(result[key]))
  return result

In [None]:
# Evaluate every not-yet-evaluated checkpoint of every task and record one
# tab-separated line per checkpoint in <OUTPUT_DIR>/eval_results.txt.
for TASK in ["MRPC", "MNLI", "MISMNLI", "QNLI", "QQP", "RTE", "SST-2", "CoLA", "STS-B"]:

  print(TASK)

  if USE_BOS:
    OUTPUT_DIR = os.path.join(GLUE_DIR, TASK)
  else:
    OUTPUT_DIR = os.path.join(GLUE_DIR, TASK, 'SUM')

  if not tf.io.gfile.exists(OUTPUT_DIR):
    tf.gfile.MakeDirs(OUTPUT_DIR)

  output_eval_file = os.path.join(OUTPUT_DIR, "eval_results.txt")

  task_name = TASK.lower()
  if task_name not in processors:
    raise ValueError("Task not found: %s" % (task_name))

  # `processor` is also read as a global by eval_().
  processor = processors[task_name](
      use_spm=bool(SPM_MODEL_FILE),
      do_lower_case=DO_LOWER_CASE)

  label_list = processor.get_labels()
  # The MNLI-derived tasks are evaluated with the checkpoints trained on MNLI.
  if task_name in ['mismnli', "diagnostic"]:
    run_output_dir = os.path.join(GLUE_DIR, 'MNLI')
  else:
    run_output_dir = os.path.join(GLUE_DIR, TASK)
  print(run_output_dir)

  runs = _find_runs(run_output_dir, task_name)
  tf.logging.info(" found %i runs.", len(runs))

  # Results of earlier sessions are read BEFORE the file is opened for writing.
  eval_results = get_eval_results(output_eval_file, task_name)
  print(eval_results)

  already_eval_ckpt = get_already_eval_ckpt(eval_results)
  print(already_eval_ckpt)

  runs = [run for run in runs if run.ckpt_path not in already_eval_ckpt]

  # The context manager closes the file even when an evaluation fails, so no
  # writer is left open for the next execution of this cell.
  with tf.gfile.GFile(output_eval_file, "w") as writer:
    # Opening with "w" starts from an empty file, so the earlier results are
    # written back first. Otherwise the checkpoints skipped above would
    # disappear from the results file.
    for previous_line in eval_results:
      writer.write(previous_line + "\n")
    writer.flush()

    for run in runs:
      print(run)
      tf.logging.info("Eval {}.".format(run))

      run_config = contrib_tpu.RunConfig(
        # TPU_ADDRESS is only read on TPU runtimes.
        master=TPU_ADDRESS if USE_TPU else None,
        # model_dir=run_output_dir,
        tpu_config=contrib_tpu.TPUConfig(
          iterations_per_loop=1000,
          num_shards=NUM_TPU_CORES,
          per_host_input_for_training=is_per_host))

      # `ckpt_path` points at the ".index" file; [:-6] removes that suffix.
      ckpt_prefix = run.ckpt_path[:-6]

      # tags_utils
      model_fn = classifier_utils.model_fn_builder(
        albert_config=albert_config,
        num_labels=len(label_list),
        init_checkpoint=ckpt_prefix,
        learning_rate=run.learning_rate,
        num_train_steps=1000,
        num_warmup_steps=1000,
        use_tpu=USE_TPU,
        use_one_hot_embeddings=USE_TPU,
        task_name=task_name,
        hub_module=ALBERT_MODEL_HUB,
        optimizer=OPTIMIZER,)
        # tau=TAU,
        # use_bos=USE_BOS)

      estimator = contrib_tpu.TPUEstimator(
        use_tpu=USE_TPU,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=run.batch_size,
        eval_batch_size=EVAL_BATCH_SIZE,
        export_to_tpu=False)

      dev_results = eval_(
          estimator,
          task_name,
          label_list,
          MAX_SEQ_LENGTH,
          tokenizer,
          TASK_DATA_DIR,
          OUTPUT_DIR,
          USE_TPU,
          EVAL_BATCH_SIZE,
          ckpt_prefix)

      # Metric recorded for the task.
      if task_name == "sts-b":
        key_name = "pearson"
      elif task_name == "cola":
        key_name = "matthew_corr"
      else:
        key_name = "eval_accuracy"

      for key in sorted(dev_results.keys()):
        tf.logging.info("  %s = %s", key, str(dev_results[key]))

      # Field order: task, learning rate, batch size, epoch, seed, checkpoint
      # path, metric -- the order the result parsers in this notebook expect.
      result_line = "{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
          task_name, run.learning_rate, run.batch_size, run.epoch, run.seed,
          run.ckpt_path, dev_results[key_name])
      tf.logging.info(result_line)
      writer.write(result_line)
      writer.flush()

In [None]:
# NOTE(review): this cell looks like a leftover experiment. It defines none of
# task_name, OUTPUT_DIR, n_seeds_, n_train_examples or label_list itself, so it
# runs with whatever values the previously executed cells left in the kernel.
# It also opens <OUTPUT_DIR>/eval_results.txt in "w" mode, and it is the only
# cell that builds the model with tags_utils.model_fn_builder (passing tau and
# use_bos). Confirm whether it should be part of a Run-All execution.

# Close a writer that a previous (possibly interrupted) execution left open;
# any error, including `writer` not being defined yet, is ignored.
try:
  writer.close()
except:
  pass

  
output_eval_file = os.path.join(OUTPUT_DIR, "eval_results.txt")
writer = tf.gfile.GFile(output_eval_file, "w")

for hyperparameters in hyper_parameters_values:
  for seed in range(n_seeds_):

    learning_rate, train_batch_size = hyperparameters
    # NOTE(review): every task except 'rte' / 'mismnli' is sent to the 'mnli'
    # directory here, and the same condition guards train_() below; this
    # differs from the Train cell above -- confirm the intended condition.
    if not task_name in ['rte', 'mismnli']:
      run_output_dir = os.path.join(os.path.join(GLUE_DIR, 'mnli'), 
          str(learning_rate), str(train_batch_size), str(seed))
    else:
      run_output_dir = os.path.join(
          OUTPUT_DIR, str(learning_rate), str(train_batch_size), str(seed))
    print(run_output_dir)

    # One checkpoint and one TPU loop per epoch.
    n_iterations_per_epoch = int(n_train_examples / train_batch_size)
    num_train_steps = int(MAX_EPOCHS * n_iterations_per_epoch)
    SAVE_CHECKPOINTS_STEPS = int(n_iterations_per_epoch)
    ITERATIONS_PER_LOOP = int(n_iterations_per_epoch)

    if not tf.io.gfile.exists(run_output_dir):
      # Creates a directory and all parent/intermediate directories.
      tf.gfile.MakeDirs(run_output_dir)

    run_config = contrib_tpu.RunConfig(
      # cluster=tpu_cluster_resolver,
      master=TPU_ADDRESS,
      model_dir=run_output_dir,
      save_checkpoints_steps=int(SAVE_CHECKPOINTS_STEPS),
      keep_checkpoint_max=0,
      tpu_config=contrib_tpu.TPUConfig(
        iterations_per_loop=ITERATIONS_PER_LOOP,
        num_shards=NUM_TPU_CORES,
        per_host_input_for_training=is_per_host))

    model_fn = tags_utils.model_fn_builder(
      albert_config=albert_config,
      num_labels=len(label_list),
      init_checkpoint=INIT_CHECKPOINT,
      learning_rate=learning_rate,
      num_train_steps=num_train_steps,
      num_warmup_steps=WARMUP_STEPS,
      use_tpu=USE_TPU,
      use_one_hot_embeddings=USE_TPU,
      task_name=task_name,
      hub_module=ALBERT_MODEL_HUB,
      optimizer=OPTIMIZER,
      tau=TAU,
      use_bos=USE_BOS)

    # If TPU is not available, this will fall back to Estimator on CPU or GPU.
    estimator = contrib_tpu.TPUEstimator(
      use_tpu=USE_TPU,
      model_fn=model_fn,
      config=run_config,
      train_batch_size=train_batch_size,
      eval_batch_size=EVAL_BATCH_SIZE,
      export_to_tpu=False)
    
    if not task_name in ['rte', 'mismnli']:
      train_(
          estimator, 
          task_name,
          label_list, 
          MAX_SEQ_LENGTH, 
          tokenizer, 
          TASK_DATA_DIR, 
          run_output_dir, 
          USE_TPU, 
          train_batch_size, 
          num_train_steps)
    
    if not task_name in ['rte']:
      runs = _find_runs(run_output_dir, task_name)
      tf.logging.info(" found %i runs.", len(runs))
      for run in runs:
        print(run)
        tf.logging.info("Eval {}.".format(run))

        # ckpt_path points at the ".index" file; [:-6] removes that suffix.
        dev_results = eval_(
            estimator, 
            task_name, 
            label_list, 
            MAX_SEQ_LENGTH, 
            tokenizer, 
            TASK_DATA_DIR, 
            OUTPUT_DIR, 
            USE_TPU, 
            EVAL_BATCH_SIZE,
            run.ckpt_path[:-6])
    
        # Metric recorded for the task.
        if task_name == "sts-b":
          key_name = "pearson"
        elif task_name == "cola":
          key_name = "matthew_corr"
        else:
          key_name = "eval_accuracy"

        for key in sorted(dev_results.keys()):
          tf.logging.info("  %s = %s", key, str(dev_results[key]))
          # writer.write("%s = %s\n" % (key, str(dev_results[key])))
        # Field order: task, learning rate, batch size, epoch, seed,
        # checkpoint path, metric.
        tf.logging.info("{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
            task_name, run.learning_rate, run.batch_size, run.epoch, run.seed, run.ckpt_path, dev_results[key_name]))
        writer.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
            task_name, run.learning_rate, run.batch_size, run.epoch, run.seed, run.ckpt_path, dev_results[key_name]))
        writer.flush()

writer.close()    

## Predict

In [None]:
def eval_results_to_dict(eval_results, task_name):
  """Groups the recorded results of one task by hyper-parameter setting.

  Args:
    eval_results: iterable of lines with exactly seven tab-separated fields:
      task, learning rate, batch size, epoch, seed, checkpoint path, result.
    task_name: only lines whose task field equals this value are used.

  Returns:
    A dict mapping "<learning rate>_<batch size>_<epoch>" to the list of
    results (as floats) recorded for that setting, in input order.
  """
  grouped = {}
  for line in eval_results:
    task, lr, bsz, epoch, seed, ckpt_path, dev_results = line.split('\t')
    if task != task_name:
      continue
    setting = "{}_{}_{}".format(lr, bsz, epoch)
    grouped.setdefault(setting, []).append(float(dev_results))
  return grouped

def get_best_hp(eval_results, task_name):
  """Finds the hyper-parameter setting with the highest mean result.

  Args:
    eval_results: result lines as accepted by `eval_results_to_dict`.
    task_name: task whose results are considered.

  Returns:
    `(best_mean, best_key)` where `best_key` has the form
    "<learning rate>_<batch size>_<epoch>". When no setting has a mean above
    -1 the result is `(-1, '')`.
  """
  scores_by_setting = eval_results_to_dict(eval_results, task_name)
  best_mean, best_setting = -1, ''
  for setting in scores_by_setting:
    mean_score = np.mean(scores_by_setting[setting])
    # Strict comparison: on ties the setting seen first is kept.
    if mean_score > best_mean:
      best_mean, best_setting = mean_score, setting
  return best_mean, best_setting

def get_best_ckpt(eval_results, best_dev_key, task_name):
  """Finds the best single checkpoint recorded for one setting of a task.

  Args:
    eval_results: iterable of lines with exactly seven tab-separated fields:
      task, learning rate, batch size, epoch, seed, checkpoint path, result.
    best_dev_key: setting to look at, "<learning rate>_<batch size>_<epoch>".
    task_name: task whose lines are considered.

  Returns:
    `(best_result, checkpoint_path)` of the matching line with the highest
    result, or `(-1, "")` when no matching line has a result above -1.
  """
  top_score, top_ckpt = -1, ""
  for line in eval_results:
    task, lr, bsz, epoch, seed, ckpt_path, dev_results = line.split('\t')
    if task != task_name:
      continue
    if "{}_{}_{}".format(lr, bsz, epoch) != best_dev_key:
      continue
    score = float(dev_results)
    # Strict comparison: on ties the line seen first is kept.
    if score > top_score:
      top_score, top_ckpt = score, ckpt_path
  return top_score, top_ckpt

In [None]:
def predict_(task_name, label_list, max_seq_length, tokenizer, 
             task_data_dir, output_dir, use_tpu, predict_batch_size,
             checkpoint_path):
  """Runs prediction on the test split of a task with one checkpoint.

  The test examples are read through the module-level `processor` (set by the
  calling cell) and written to `<output_dir>/predict.tf_record`. The estimator
  is then built for the given checkpoint.

  Args:
    task_name: lower-cased task name.
    label_list: labels handed to the feature conversion and model builder.
    max_seq_length: sequence length used for conversion and for the input fn.
    tokenizer: tokenizer handed to the feature conversion.
    task_data_dir: directory the processor reads the examples from.
    output_dir: directory that receives `predict.tf_record`.
    use_tpu: whether to pad the examples and run on TPU.
    predict_batch_size: batch size used for padding, input fn and estimator.
    checkpoint_path: path of the checkpoint's ".index" file, laid out as
      `<...>/<learning_rate>/<batch_size>/<seed>/<name>.index`.

  Returns:
    The list of prediction dictionaries yielded by `estimator.predict`.
  """
  predict_examples = processor.get_test_examples(task_data_dir)

  num_actual_predict_examples = len(predict_examples)
  if use_tpu:
    # TPU requires a fixed batch size for all batches, therefore the number
    # of examples must be a multiple of the batch size, or else examples
    # will get dropped. So we pad with fake examples which are ignored
    # later on.
    while len(predict_examples) % predict_batch_size != 0:
      predict_examples.append(classifier_utils.PaddingInputExample())

  predict_file = os.path.join(output_dir, "predict.tf_record")
  classifier_utils.file_based_convert_examples_to_features(
      predict_examples, label_list,
      max_seq_length, tokenizer,
      predict_file, task_name)

  tf.logging.info("***** Running prediction*****")
  tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                  len(predict_examples), num_actual_predict_examples,
                  len(predict_examples) - num_actual_predict_examples)
  tf.logging.info("  Batch size = %d", predict_batch_size)

  # tags_utils
  predict_input_fn = classifier_utils.file_based_input_fn_builder(
    input_file=predict_file,
    seq_length=max_seq_length,
    is_training=False,
    drop_remainder=bool(use_tpu),
    task_name=task_name,
    use_tpu=use_tpu,
    bsz=predict_batch_size)

  run_config = contrib_tpu.RunConfig(
    # TPU_ADDRESS is only read on TPU runtimes.
    master=TPU_ADDRESS if use_tpu else None,
    # model_dir=run_output_dir,
    tpu_config=contrib_tpu.TPUConfig(
      iterations_per_loop=1000,
      num_shards=NUM_TPU_CORES,
      per_host_input_for_training=is_per_host))

  # The run's hyper-parameters are encoded in the checkpoint path itself (see
  # Args). They are read from the `checkpoint_path` argument, not from a
  # global of the calling cell.
  path_parts = checkpoint_path.split('/')
  learning_rate = float(path_parts[-4])
  train_batch_size = int(path_parts[-3])
  # [:-6] removes the ".index" suffix.
  checkpoint_prefix = checkpoint_path[:-6]

  # tags_utils
  model_fn = classifier_utils.model_fn_builder(
    albert_config=albert_config,
    num_labels=len(label_list),
    init_checkpoint=checkpoint_prefix,
    learning_rate=learning_rate,
    num_train_steps=1000,
    num_warmup_steps=1000,
    use_tpu=use_tpu,
    use_one_hot_embeddings=use_tpu,
    task_name=task_name,
    hub_module=ALBERT_MODEL_HUB,
    optimizer=OPTIMIZER,)
    # tau=TAU,
    # use_bos=USE_BOS)

  estimator = contrib_tpu.TPUEstimator(
    use_tpu=use_tpu,
    model_fn=model_fn,
    config=run_config,
    train_batch_size=train_batch_size,
    # Same batch size as the padding above, so no padded batch is incomplete.
    predict_batch_size=predict_batch_size,
    export_to_tpu=False)

  result = estimator.predict(
    input_fn=predict_input_fn,
    checkpoint_path=checkpoint_prefix)

  return list(result)


def write_prediction_file(output_predict_file, output_submit_file, result):
  """Writes the raw probabilities and the submission file for one task.

  Reads the module-level `processor`, `TASK_DATA_DIR`, `task_name` and
  `label_list` set by the calling cell.

  Args:
    output_predict_file: receives one line of tab-separated class
      probabilities per test example.
    output_submit_file: receives an "index<TAB>prediction" header followed by
      one "<guid><TAB><label>" line per test example.
    result: predictions aligned with the test examples; each entry provides
      "probabilities" and "predictions".

  Raises:
    AssertionError: when the number of written lines differs from the number
      of test examples.
  """
  test_examples = processor.get_test_examples(TASK_DATA_DIR)
  expected_lines = len(test_examples)
  written = 0

  with tf.gfile.GFile(output_predict_file, "w") as pred_writer,\
      tf.gfile.GFile(output_submit_file, "w") as sub_writer:
    sub_writer.write("index" + "\t" + "prediction\n")
    tf.logging.info("***** Predict results *****")

    for position, (example, prediction) in enumerate(
        zip(test_examples, result)):
      probabilities = prediction["probabilities"]
      if position >= expected_lines:
        break

      pred_writer.write(
          "\t".join(str(probability) for probability in probabilities) + "\n")

      if task_name == "sts-b":
        # The regression output is clipped to the range [0.0, 5.0].
        label = str(max(min(prediction["predictions"], 5.0), 0.0))
      else:
        label = label_list[int(prediction["predictions"])]
      sub_writer.write(example.guid + "\t" + label + "\n")
      written += 1

  num_written_lines = written
  assert num_written_lines == expected_lines

In [None]:
# Predict the test split of every task with its best evaluated checkpoint and
# write one submission file per task to <GLUE_DIR>/submission.
for TASK in ["MRPC", "MNLI", "MISMNLI", "QNLI", "QQP", "RTE", "SST-2", "CoLA", "STS-B", "diagnostic", "WNLI"]:  # "MRPC", "MNLI", "MISMNLI", "QNLI", "QQP", "RTE", "SST-2", "CoLA", "STS-B", "diagnostic", "WNLI"

  print(TASK)

  if USE_BOS:
    OUTPUT_DIR = os.path.join(GLUE_DIR, TASK)
  else:
    OUTPUT_DIR = os.path.join(GLUE_DIR, TASK, 'SUM')

  if not tf.io.gfile.exists(OUTPUT_DIR):
    tf.gfile.MakeDirs(OUTPUT_DIR)

  # `task_name` is set before it is tested below. Previously the test saw the
  # value left by the previous iteration.
  task_name = TASK.lower()
  if task_name not in processors:
    raise ValueError("Task not found: %s" % (task_name))

  # The diagnostic set has no results of its own; it uses the MNLI ones.
  if task_name == "diagnostic":
    output_eval_file = os.path.join(GLUE_DIR, 'MNLI', "eval_results.txt")
    eval_task_name = "mnli"
  else:
    output_eval_file = os.path.join(OUTPUT_DIR, "eval_results.txt")
    eval_task_name = task_name
  print(output_eval_file)

  # `processor`, `task_name` and `label_list` are also read as globals by
  # predict_() and write_prediction_file().
  processor = processors[task_name](
      use_spm=bool(SPM_MODEL_FILE),
      do_lower_case=DO_LOWER_CASE)

  label_list = processor.get_labels()

  # File name used in the submission when it differs from the task name.
  TASK_SAVE_NAME = ""
  if TASK == "MNLI":
    TASK_SAVE_NAME = "MNLI-m"
  elif TASK == "MISMNLI":
    TASK_SAVE_NAME = "MNLI-mm"
  elif TASK == "diagnostic":
    TASK_SAVE_NAME = "AX"

  submission_dir = os.path.join(GLUE_DIR, 'submission')
  if not tf.io.gfile.exists(submission_dir):
    tf.gfile.MakeDirs(submission_dir)
  output_predict_file = os.path.join(OUTPUT_DIR, "test_results.tsv")
  output_submit_file = os.path.join(
      submission_dir, "{}.tsv".format(TASK_SAVE_NAME or TASK))

  if TASK == "WNLI":
    # Always predict majority class
    # 146 is presumably the size of the WNLI test set; write_prediction_file
    # asserts that the number of written lines matches the test examples.
    results = [{"probabilities": [1., 0.], "predictions": 0} for _ in range(146)]
    print(output_submit_file)
    write_prediction_file(output_predict_file, output_submit_file, results)
    continue

  eval_results = get_eval_results(output_eval_file, eval_task_name)
  print(eval_results)

  best_dev_result, best_dev_key = get_best_hp(eval_results, eval_task_name)
  print(best_dev_result, best_dev_key)

  best_dev_result_all, best_dev_result_all_ckpt = get_best_ckpt(
      eval_results, best_dev_key, eval_task_name)
  print(best_dev_result_all, best_dev_result_all_ckpt)

  # Without an evaluated checkpoint there is nothing to predict with; stop with
  # an explicit message instead of failing later while parsing an empty path.
  if not best_dev_result_all_ckpt:
    raise ValueError(
        "No evaluated checkpoint found for task %s in %s; run the Eval cell "
        "first." % (task_name, output_eval_file))

  results = predict_(task_name,
                     label_list,
                     MAX_SEQ_LENGTH,
                     tokenizer,
                     TASK_DATA_DIR,
                     OUTPUT_DIR,
                     USE_TPU,
                     PREDICT_BATCH_SIZE,
                     best_dev_result_all_ckpt)

  write_prediction_file(output_predict_file, output_submit_file, results)

In [None]:
!mkdir submission

In [None]:
# Copy the eleven per-task submission files written by the Predict cell from
# the bucket into the local submission/ directory.
!gsutil -m cp \
  "gs://$BUCKET_NAME/$MODEL_NAME/GLUE/submission/AX.tsv" \
  "gs://$BUCKET_NAME/$MODEL_NAME/GLUE/submission/CoLA.tsv" \
  "gs://$BUCKET_NAME/$MODEL_NAME/GLUE/submission/MNLI-m.tsv" \
  "gs://$BUCKET_NAME/$MODEL_NAME/GLUE/submission/MNLI-mm.tsv" \
  "gs://$BUCKET_NAME/$MODEL_NAME/GLUE/submission/MRPC.tsv" \
  "gs://$BUCKET_NAME/$MODEL_NAME/GLUE/submission/QNLI.tsv" \
  "gs://$BUCKET_NAME/$MODEL_NAME/GLUE/submission/QQP.tsv" \
  "gs://$BUCKET_NAME/$MODEL_NAME/GLUE/submission/RTE.tsv" \
  "gs://$BUCKET_NAME/$MODEL_NAME/GLUE/submission/SST-2.tsv" \
  "gs://$BUCKET_NAME/$MODEL_NAME/GLUE/submission/STS-B.tsv" \
  "gs://$BUCKET_NAME/$MODEL_NAME/GLUE/submission/WNLI.tsv" \
  submission/

In [None]:
# Line count of every downloaded submission file, for comparison with the
# reference counts listed in the next cell.
!wc -l submission/*.tsv

In [None]:
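# Reference line counts: one header line plus one line per test example.
#    1105 AX.tsv
#    1064 CoLA.tsv
#    9848 MNLI-mm.tsv
#    9797 MNLI-m.tsv
#    1726 MRPC.tsv
#    5464 QNLI.tsv
#  390966 QQP.tsv
#    3001 RTE.tsv
#    1822 SST-2.tsv
#    1380 STS-B.tsv
#     147 WNLI.tsv
#  426597 total  (NOTE: the eleven counts listed above add up to 426320)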

In [None]:
# Bundle the submission/ directory into a single archive, submission.zip.
!zip -r submission.zip submission