In [10]:
import re

import tensorflow as tf

In [5]:
def _mirrored_cross_device_ops(all_reduce_alg, num_packs):
    """Return a CrossDeviceOps based on all_reduce_alg and num_packs.
    Args:
        all_reduce_alg: a string specifying which cross device op to pick, or None.
        num_packs: an integer specifying number of packs for the cross device op.
    Returns:
        tf.distribute.CrossDeviceOps object or None.
    Raises:
        ValueError: if `all_reduce_alg` not in [None, "nccl", "hierarchical_copy"].
    """
    if all_reduce_alg is None:
        return None
    mirrored_all_reduce_options = {
                  "nccl": tf.distribute.NcclAllReduce,
                  "hierarchical_copy": tf.distribute.HierarchicalCopyAllReduce
                }
    if all_reduce_alg not in mirrored_all_reduce_options:
        raise ValueError(
                "When used with `mirrored`, valid values for all_reduce_alg are "
                "[`nccl`, `hierarchical_copy`].  Supplied value: {}".format(
                    all_reduce_alg))
    cross_device_ops_class = mirrored_all_reduce_options[all_reduce_alg]
    return cross_device_ops_class(num_packs=num_packs)

In [6]:
def tpu_initialize(tpu_address):
    """Set up the TPU system for TF 2.x training and return its resolver.

    Args:
        tpu_address: string, bns address of master TPU worker. The values ""
            and "local" skip the explicit cluster connection step.

    Returns:
        The `TPUClusterResolver` created for `tpu_address`.
    """
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=tpu_address)

    # Only addresses other than ""/"local" get an explicit cluster connection.
    needs_cluster_connection = tpu_address not in ("", "local")
    if needs_cluster_connection:
        tf.config.experimental_connect_to_cluster(resolver)

    tf.tpu.experimental.initialize_tpu_system(resolver)
    return resolver

In [7]:
def get_distribution_strategy(distribution_strategy="mirrored",
                              num_gpus=0,
                              all_reduce_alg=None,
                              num_packs=1,
                              tpu_address=None,
                              **kwargs):
    """Return a DistributionStrategy for running the model.

    Args:
        distribution_strategy: a string (case insensitive) naming the strategy
            to use. Handled values are "off", "one_device", "mirrored" and
            "tpu". "off" means not to use a distribution strategy (None is
            returned); "tpu" means to use TPUStrategy using `tpu_address`.
            Every other value -- including "parameter_server" and
            "multi_worker_mirrored" -- is rejected with a ValueError.
        num_gpus: Number of GPUs to run this model. 0 selects the CPU device
            for "one_device" and "mirrored".
        all_reduce_alg: Optional. Only used by "mirrored": valid values are
            "nccl" and "hierarchical_copy". With None, `cross_device_ops=None`
            is passed to `MirroredStrategy`.
        num_packs: Optional. Sets `num_packs` on the all-reduce op built for
            `MirroredStrategy`.
        tpu_address: Optional. Only used by "tpu", where it is forwarded
            unchanged to `tpu_initialize`; no validation is done here.
        **kwargs: Additional kwargs for internal usages. They are discarded.

    Returns:
        A `tf.distribute.Strategy` object, or None when `distribution_strategy`
        is "off".

    Raises:
        ValueError: if `num_gpus` is negative; if `distribution_strategy` is
            not a string; if it is "off" or "one_device" and `num_gpus` is
            larger than 1; or if it names a strategy this function does not
            handle.
    """
    del kwargs  # Accepted for call-site compatibility only.

    if num_gpus < 0:
        raise ValueError("`num_gpus` can not be negative.")

    if not isinstance(distribution_strategy, str):
        msg = ("distribution_strategy must be a string but got: %s." %
               (distribution_strategy,))
        # An unquoted `off` in yaml is parsed as the boolean False.
        if distribution_strategy == False:  # pylint: disable=singleton-comparison,g-explicit-bool-comparison
            msg += (" If you meant to pass the string 'off', make sure you add "
                    "quotes around 'off' so that yaml interprets it as a string "
                    "instead of a bool.")
        raise ValueError(msg)

    distribution_strategy = distribution_strategy.lower()

    if distribution_strategy == "off":
        if num_gpus > 1:
            raise ValueError("When {} GPUs are specified, distribution_strategy "
                             "flag cannot be set to `off`.".format(num_gpus))
        return None

    if distribution_strategy == "tpu":
        # When tpu_address is an empty string, we communicate with local TPUs.
        cluster_resolver = tpu_initialize(tpu_address)
        return tf.distribute.TPUStrategy(cluster_resolver)

    if distribution_strategy == "one_device":
        if num_gpus == 0:
            return tf.distribute.OneDeviceStrategy("device:CPU:0")
        if num_gpus > 1:
            raise ValueError("`OneDeviceStrategy` can not be used for more than "
                             "one device.")
        return tf.distribute.OneDeviceStrategy("device:GPU:0")

    if distribution_strategy == "mirrored":
        if num_gpus == 0:
            devices = ["device:CPU:0"]
        else:
            devices = ["device:GPU:%d" % i for i in range(num_gpus)]
        return tf.distribute.MirroredStrategy(
            devices=devices,
            cross_device_ops=_mirrored_cross_device_ops(all_reduce_alg,
                                                        num_packs))

    # "parameter_server" and "multi_worker_mirrored" are not handled here and
    # end up in this error together with any other unknown name.
    raise ValueError("Unrecognized Distribution Strategy: %r" %
                     distribution_strategy)

In [6]:
class AdamWeightDecay(tf.keras.optimizers.Adam):
    """Adam optimizer with per-variable weight decay and global-norm clipping.

    Before each Adam update, every variable selected by `_do_use_weight_decay`
    is reduced by `learning_rate * weight_decay_rate * var`. Gradients are
    optionally clipped by their global norm in `apply_gradients`.

    NOTE(review): the overrides below (`_prepare_local`,
    `_resource_apply_dense`, `_resource_apply_sparse`, `_fallback_apply_state`)
    are hooks of the base optimizer -- confirm the installed TensorFlow version
    exposes them on `tf.keras.optimizers.Adam`.
    """

    def __init__(self,
                 learning_rate=0.001,
                 beta_1=0.9,
                 beta_2=0.999,
                 epsilon=1e-7,
                 amsgrad=False,
                 weight_decay_rate=0.0,
                 include_in_weight_decay=None,
                 exclude_from_weight_decay=None,
                 gradient_clip_norm=1.0,
                 name='AdamWeightDecay',
                 **kwargs):
        """Creates the optimizer.

        Args:
            learning_rate: forwarded to the base Adam optimizer.
            beta_1: forwarded to the base Adam optimizer.
            beta_2: forwarded to the base Adam optimizer.
            epsilon: forwarded to the base Adam optimizer.
            amsgrad: forwarded to the base Adam optimizer.
            weight_decay_rate: decay factor; 0 disables weight decay.
            include_in_weight_decay: optional iterable of regex patterns;
                variables whose name matches any of them are always decayed.
            exclude_from_weight_decay: optional iterable of regex patterns;
                variables whose name matches any of them are not decayed
                (unless an include pattern matched first).
            gradient_clip_norm: global norm used for gradient clipping; values
                that are not greater than 0 disable clipping.
            name: forwarded to the base Adam optimizer.
            **kwargs: forwarded to the base Adam optimizer.
        """
        # Keywords keep each value bound to the intended base-class parameter.
        super(AdamWeightDecay, self).__init__(
            learning_rate=learning_rate,
            beta_1=beta_1,
            beta_2=beta_2,
            epsilon=epsilon,
            amsgrad=amsgrad,
            name=name,
            **kwargs)
        self.weight_decay_rate = weight_decay_rate
        self.gradient_clip_norm = gradient_clip_norm
        self._include_in_weight_decay = include_in_weight_decay
        self._exclude_from_weight_decay = exclude_from_weight_decay

    @classmethod
    def from_config(cls, config):
        """Creates an optimizer from its config with WarmUp custom object."""
        custom_objects = {'WarmUp': WarmUp}
        return super(AdamWeightDecay, cls).from_config(
            config, custom_objects=custom_objects)

    def _prepare_local(self, var_device, var_dtype, apply_state):
        """Adds the weight decay rate to the per-(device, dtype) apply state."""
        super(AdamWeightDecay, self)._prepare_local(var_device, var_dtype,
                                                    apply_state)
        apply_state[(var_device, var_dtype)]['weight_decay_rate'] = tf.constant(
            self.weight_decay_rate, name='adam_weight_decay_rate')

    def _decay_weights_op(self, var, learning_rate, apply_state):
        """Returns the op that decays `var`, or a no-op if `var` is exempt."""
        do_decay = self._do_use_weight_decay(var.name)
        if do_decay:
            return var.assign_sub(
                learning_rate * var *
                apply_state[(var.device, var.dtype.base_dtype)]['weight_decay_rate'],
                use_locking=self._use_locking)
        return tf.no_op()

    def apply_gradients(self,
                        grads_and_vars,
                        name=None,
                        experimental_aggregate_gradients=True):
        """Optionally clips the gradients, then applies them via the base class.

        Args:
            grads_and_vars: iterable of (gradient, variable) pairs.
            name: forwarded to the base `apply_gradients`.
            experimental_aggregate_gradients: forwarded to the base
                `apply_gradients`; clipping only happens when this is true.

        Returns:
            Whatever the base class `apply_gradients` returns.
        """
        grads, tvars = list(zip(*grads_and_vars))
        if experimental_aggregate_gradients and self.gradient_clip_norm > 0.0:
            # when experimental_aggregate_gradients = False, apply_gradients() no
            # longer implicitly allreduce gradients, users manually allreduce gradient
            # and passed the allreduced grads_and_vars. For now, the
            # clip_by_global_norm will be moved to before the explicit allreduce to
            # keep the math the same as TF 1 and pre TF 2.2 implementation.
            (grads, _) = tf.clip_by_global_norm(
                grads, clip_norm=self.gradient_clip_norm)
        # The update must run on every path, not only when clipping happened;
        # otherwise callers that disable clipping or aggregation would get None
        # and no variable would ever change.
        return super(AdamWeightDecay, self).apply_gradients(
            zip(grads, tvars),
            name=name,
            experimental_aggregate_gradients=experimental_aggregate_gradients)

    def _get_lr(self, var_device, var_dtype, apply_state):
        """Retrieves the learning rate with the given state."""
        if apply_state is None:
            # NOTE(review): `_decayed_lr_t` is not defined in this class --
            # confirm the base optimizer provides it.
            return self._decayed_lr_t[var_dtype], {}

        apply_state = apply_state or {}
        coefficients = apply_state.get((var_device, var_dtype))
        if coefficients is None:
            coefficients = self._fallback_apply_state(var_device, var_dtype)
            apply_state[(var_device, var_dtype)] = coefficients

        return coefficients['lr_t'], dict(apply_state=apply_state)

    def _resource_apply_dense(self, grad, var, apply_state=None):
        """Decays `var` (if selected), then runs the dense Adam update."""
        lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
        decay = self._decay_weights_op(var, lr_t, apply_state)
        # The control dependency orders the decay before the Adam update.
        with tf.control_dependencies([decay]):
            return super(AdamWeightDecay, self)._resource_apply_dense(
                grad, var, **kwargs)

    def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
        """Decays `var` (if selected), then runs the sparse Adam update."""
        lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
        decay = self._decay_weights_op(var, lr_t, apply_state)
        # The control dependency orders the decay before the Adam update.
        with tf.control_dependencies([decay]):
            return super(AdamWeightDecay, self)._resource_apply_sparse(
                grad, var, indices, **kwargs)

    def get_config(self):
        """Returns the base config plus every option this subclass adds.

        All four subclass options are constructor arguments, so a config
        produced here rebuilds an equivalent optimizer through `from_config`.
        """
        config = super(AdamWeightDecay, self).get_config()
        config.update({
            'weight_decay_rate': self.weight_decay_rate,
            'gradient_clip_norm': self.gradient_clip_norm,
            'include_in_weight_decay': self._include_in_weight_decay,
            'exclude_from_weight_decay': self._exclude_from_weight_decay,
        })
        return config

    def _do_use_weight_decay(self, param_name):
        """Whether to use L2 weight decay for `param_name`."""
        if self.weight_decay_rate == 0:
            return False

        # Include patterns are checked first, so they win over exclude patterns.
        if self._include_in_weight_decay:
            for r in self._include_in_weight_decay:
                if re.search(r, param_name) is not None:
                    return True

        if self._exclude_from_weight_decay:
            for r in self._exclude_from_weight_decay:
                if re.search(r, param_name) is not None:
                    return False
        return True

NameError: name 'tf' is not defined

In [11]:
class WarmUp(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Polynomial warmup that hands over to a wrapped decay schedule.

    While `step < warmup_steps` the rate is
    `initial_learning_rate * (step / warmup_steps) ** power`; from then on the
    value of `decay_schedule_fn(step)` is used.
    """

    def __init__(self,
                 initial_learning_rate,
                 decay_schedule_fn,
                 warmup_steps,
                 power=1.0,
                 name=None):
        """Stores the warmup parameters and the schedule used after warmup."""
        super().__init__()
        self.name = name
        self.power = power
        self.warmup_steps = warmup_steps
        self.decay_schedule_fn = decay_schedule_fn
        self.initial_learning_rate = initial_learning_rate

    def __call__(self, step):
        """Returns the learning rate to use at `step`."""
        with tf.name_scope(self.name or 'WarmUp') as name:
            # Polynomial warmup: the rate grows with the fraction of warmup
            # steps completed, raised to `power`.
            current_step = tf.cast(step, tf.float32)
            total_warmup = tf.cast(self.warmup_steps, tf.float32)
            fraction_done = current_step / total_warmup
            ramped_rate = (
                self.initial_learning_rate *
                tf.math.pow(fraction_done, self.power))

        def _during_warmup():
            return ramped_rate

        def _after_warmup():
            return self.decay_schedule_fn(step)

        return tf.cond(
            current_step < total_warmup,
            _during_warmup,
            _after_warmup,
            name=name)

    def get_config(self):
        """Returns the constructor arguments as a dict."""
        return dict(
            initial_learning_rate=self.initial_learning_rate,
            decay_schedule_fn=self.decay_schedule_fn,
            warmup_steps=self.warmup_steps,
            power=self.power,
            name=self.name,
        )