In [1]:
# chose the current file directory as the working directory
import os

os.chdir("/teamspace/studios/this_studio/rubikscubesolver")

In [2]:
from tqdm import tqdm

import wandb  # for logging
import time
from dataclasses import dataclass

import jax
import jax.numpy as jnp
import flax.nnx as nnx

import optax

from rubiktransformer.model_diffusion_dt import RubikDTTransformer
import rubiktransformer.dataset as dataset
from rubiktransformer.trainer import reshape_sample

cuda_plugin_extension is not found.


In [3]:
@dataclass
class Config:
    """Configuration class"""

    jax_key: jnp.ndarray = jax.random.PRNGKey(46)
    rngs = nnx.Rngs(45)
    batch_size: int = 128
    lr_1: float = 4e-3
    lr_2: float = 4e-3
    nb_games: int = 128 * 100
    len_seq: int = 20
    nb_step: int = 1000000
    log_every_step: int = 10
    log_eval_every_step: int = 10
    log_policy_reward_every_step: int = 10
    add_data_every_step: int = 500


config = Config()

# init wandb config
user = "forbu14"
project = "RubikTransformer"
display_name = "experiment_" + time.strftime("%Y%m%d-%H%M%S")

wandb.init(entity=user, project=project, name=display_name)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mforbu14[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
transformer = RubikDTTransformer(rngs=config.rngs, causal=True)

scheduler = optax.linear_schedule(init_value=0.0, end_value=1.0, transition_steps=4000)

# init optimizer
optimizer_dd = optax.chain(
    optax.clip_by_global_norm(1.0),
    optax.lion(config.lr_1 / 100.0),
    # optax.adamw(config.lr_1/10.),
    optax.scale_by_schedule(scheduler),
)

optimizer_diffuser = nnx.Optimizer(transformer, optimizer_dd)

# metrics
metrics_train = nnx.MultiMetric(
    loss=nnx.metrics.Average("loss"),
    loss_cross_entropy=nnx.metrics.Average("loss_cross_entropy"),
)

metrics_eval = nnx.MultiMetric(
    loss_eval=nnx.metrics.Average("loss_eval"),

    loss_cross_entropy_eval=nnx.metrics.Average("loss_cross_entropy_eval"),
)

In [5]:
# gather data from the environment
# init models and optimizers
env, buffer = dataset.init_env_buffer(sample_batch_size=config.batch_size)
env, buffer_eval = dataset.init_env_buffer(sample_batch_size=config.batch_size)


nb_games = config.nb_games
len_seq = config.len_seq

state_first = jnp.zeros((6, 3, 3))
state_next = jnp.zeros((len_seq, 6, 3, 3))
action = jnp.zeros((len_seq, 3))
action_proba = jnp.zeros((len_seq, 9))

# transform state to int8 type
state_first = state_first.astype(jnp.int8)
state_next = state_next.astype(jnp.int8)

# action to int32 type
action = action.astype(jnp.int32)

reward = jnp.zeros((1))

jit_step = jax.jit(env.step)

buffer_list = buffer.init(
    {
        "action": action,
        "reward": reward,
        "state_histo": state_next,
    }
)

buffer_list_eval = buffer_eval.init(
    {
        "action": action,
        "reward": reward,
        "state_histo": state_next,
    }
)

In [6]:
def step_fn(state, key):
    """
    Simple step function
    We choose a random action
    """

    action = jax.random.randint(
        key=key,
        minval=env.action_spec.minimum,
        maxval=env.action_spec.maximum,
        shape=(3,),
    )

    new_state, timestep = jit_step(state, action)
    timestep.extras["action"] = action

    return new_state, timestep


def run_n_steps(state, key, n):
    random_keys = jax.random.split(key, n)
    state, rollout = jax.lax.scan(step_fn, state, random_keys)

    return rollout


vmap_reset = jax.vmap(jax.jit(env.reset))
vmap_step = jax.vmap(run_n_steps, in_axes=(0, 0, None))

In [7]:
nnx.display(transformer)

In [8]:
key, subkey = jax.random.split(config.jax_key)
config.jax_key = key

buffer, buffer_list = dataset.fast_gathering_data_diffusion(
    env,
    vmap_reset,
    vmap_step,
    int(config.nb_games / 10),
    config.len_seq,
    buffer,
    buffer_list,
    subkey,
)

In [9]:
sample = buffer.sample(buffer_list, subkey)

def reshape_diffusion_setup(sample, key=jax.random.PRNGKey(0)):
    sample.experience["state_histo"] = sample.experience["state_histo"].reshape(
        (sample.experience["state_histo"].shape[0], sample.experience["state_histo"].shape[1], 54)
    )

    # one hot encoding for state_histo
    sample.experience["state_histo"] = jax.nn.one_hot(
        sample.experience["state_histo"],
        num_classes=6,
        axis=-1,
    )

    # batch creation
    batch  = sample.experience
    len_seq = batch["state_histo"].shape[1]

    time_step = jax.random.uniform(
        key, (batch["state_histo"].shape[0], 1, 1, 1)
    )  # random value between 0 and 1

    batch['time_step'] = time_step

    # now contact the value to have the context for the rectified flow setup
    batch["context"] = jnp.concatenate([batch["reward"], time_step[:, :, 0, 0]], axis=1)

    batch["state_past"] = batch["state_histo"][:, :len_seq//2, :, :]
    batch["state_future"] = batch["state_histo"][:, len_seq//2:, :, :]

    # now we generate the random noise for the rectified flow setup
    random_noise = jax.random.normal(key, batch["state_future"].shape)
    simplex_noise = jax.nn.softmax(random_noise, axis=-1)

    batch["state_future_noise"] = (
        (1 - time_step) * simplex_noise + time_step * batch["state_future"]
    )
    
    return batch


sample = reshape_diffusion_setup(sample)


In [10]:
def loss_fn_transformer_rf(model: RubikDTTransformer, batch):
    # rectified flow setup
    state_past, state_future = model(
        batch["state_past"], batch["state_future_noise"], batch["context"]
    )

    loss_crossentropy = optax.softmax_cross_entropy(
        logits=state_future, labels=batch["state_future"]
    ).mean(axis=[1, 2])

    weight = jnp.clip(1. / (1. - batch["time_step"][:, 0, 0, 0]), a_min=0.005, a_max=1.5)

    loss_cross_entropy_weight = loss_crossentropy * weight

    return loss_cross_entropy_weight.mean(), (loss_crossentropy.mean())


@nnx.jit
def train_step_transformer_rf(
    model: RubikDTTransformer,
    optimizer: nnx.Optimizer,
    metrics: nnx.MultiMetric,
    batch,
):
    """Train for a single step."""

    grad_fn = nnx.value_and_grad(loss_fn_transformer_rf, has_aux=True)
    (loss, (loss_crossentropy)), grads = grad_fn(model, batch)
    metrics.update(
        loss=loss, loss_cross_entropy=loss_crossentropy
    )
    optimizer.update(grads)

In [11]:
key, subkey = jax.random.split(config.jax_key)
config.jax_key = key

buffer, buffer_list = dataset.fast_gathering_data_diffusion(
    env,
    vmap_reset,
    vmap_step,
    config.nb_games * 10, # old is int(config.nb_games * 10.0),
    config.len_seq,
    buffer,
    buffer_list,
    subkey,
)

In [None]:
# transformer model calibration
for idx_step in tqdm(range(config.nb_step)):
    # training for world model
    key, subkey = jax.random.split(config.jax_key)
    config.jax_key = key

    if idx_step % config.add_data_every_step == 0:
        buffer, buffer_list = dataset.fast_gathering_data_diffusion(
            env,
            vmap_reset,
            vmap_step,
            int(config.nb_games // 10),
            config.len_seq,
            buffer,
            buffer_list,
            config.jax_key,
        )

    sample = buffer.sample(buffer_list, subkey)
    sample = reshape_diffusion_setup(sample, subkey)

    # we update the policy
    train_step_transformer_rf(
        transformer, optimizer_diffuser, metrics_train, sample
    )

    if idx_step % config.log_every_step == 0:
        metrics_train_result = metrics_train.compute()
        print(metrics_train_result)

        wandb.log(metrics_train_result, step=idx_step)
        metrics_train.reset()

    if idx_step % config.log_eval_every_step == 0:
        key, subkey = jax.random.split(config.jax_key)
        config.jax_key = key

        buffer_eval, buffer_list_eval = dataset.fast_gathering_data_diffusion(
            env,
            vmap_reset,
            vmap_step,
            int(128),
            config.len_seq,
            buffer_eval,
            buffer_list_eval,
            subkey,
        )

        sample = buffer_eval.sample(buffer_list_eval, subkey)
        sample = reshape_diffusion_setup(sample, subkey)

        loss, (loss_crossentropy) = loss_fn_transformer_rf(
            transformer, sample
        )

        metrics_eval.update(
            loss_eval=loss,
            loss_cross_entropy_eval=loss_crossentropy,
        )
        wandb.log(metrics_eval.compute(), step=idx_step)

        metrics_eval.reset()

  0%|          | 0/1000000 [00:00<?, ?it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)
{'loss': Array(2.135381, dtype=float32), 'loss_cross_entropy': Array(2.135381, dtype=float32)}


  0%|          | 3/1000000 [00:16<1156:47:54,  4.16s/it]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 8/1000000 [00:16<293:10:30,  1.06s/it] 

{'loss': Array(2.1401873, dtype=float32), 'loss_cross_entropy': Array(2.1401873, dtype=float32)}


  0%|          | 13/1000000 [00:17<146:48:14,  1.89it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 19/1000000 [00:17<60:05:42,  4.62it/s] 

{'loss': Array(2.1336591, dtype=float32), 'loss_cross_entropy': Array(2.1336591, dtype=float32)}


  0%|          | 21/1000000 [00:18<78:52:32,  3.52it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 29/1000000 [00:18<32:16:50,  8.60it/s]

{'loss': Array(2.1236262, dtype=float32), 'loss_cross_entropy': Array(2.1236262, dtype=float32)}


  0%|          | 33/1000000 [00:19<45:18:57,  6.13it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 40/1000000 [00:20<25:19:04, 10.97it/s]

{'loss': Array(2.107849, dtype=float32), 'loss_cross_entropy': Array(2.107849, dtype=float32)}


  0%|          | 42/1000000 [00:21<50:59:23,  5.45it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 48/1000000 [00:21<29:16:37,  9.49it/s]

{'loss': Array(2.0918367, dtype=float32), 'loss_cross_entropy': Array(2.0918367, dtype=float32)}


  0%|          | 53/1000000 [00:22<39:26:29,  7.04it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 59/1000000 [00:23<37:39:39,  7.38it/s]

{'loss': Array(2.0689101, dtype=float32), 'loss_cross_entropy': Array(2.0689101, dtype=float32)}


  0%|          | 61/1000000 [00:24<56:55:17,  4.88it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 70/1000000 [00:24<26:56:07, 10.31it/s]

{'loss': Array(2.0414886, dtype=float32), 'loss_cross_entropy': Array(2.0414886, dtype=float32)}


  0%|          | 72/1000000 [00:25<51:47:40,  5.36it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 80/1000000 [00:26<25:22:43, 10.94it/s]

{'loss': Array(2.016842, dtype=float32), 'loss_cross_entropy': Array(2.016842, dtype=float32)}


  0%|          | 82/1000000 [00:26<50:22:28,  5.51it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 90/1000000 [00:27<24:01:20, 11.56it/s]

{'loss': Array(1.9870316, dtype=float32), 'loss_cross_entropy': Array(1.9870316, dtype=float32)}


  0%|          | 92/1000000 [00:28<47:04:12,  5.90it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 100/1000000 [00:28<24:12:14, 11.48it/s]

{'loss': Array(1.9571841, dtype=float32), 'loss_cross_entropy': Array(1.9571841, dtype=float32)}


  0%|          | 102/1000000 [00:29<47:46:02,  5.81it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 110/1000000 [00:29<24:22:27, 11.40it/s]

{'loss': Array(1.9232781, dtype=float32), 'loss_cross_entropy': Array(1.9232781, dtype=float32)}


  0%|          | 112/1000000 [00:30<50:42:54,  5.48it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 119/1000000 [00:31<26:33:02, 10.46it/s]

{'loss': Array(1.8997074, dtype=float32), 'loss_cross_entropy': Array(1.8997074, dtype=float32)}


  0%|          | 121/1000000 [00:32<70:49:43,  3.92it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 129/1000000 [00:32<32:10:11,  8.63it/s]

{'loss': Array(1.869777, dtype=float32), 'loss_cross_entropy': Array(1.869777, dtype=float32)}


  0%|          | 131/1000000 [00:33<55:16:27,  5.02it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 139/1000000 [00:34<26:44:56, 10.38it/s]

{'loss': Array(1.8438791, dtype=float32), 'loss_cross_entropy': Array(1.8438791, dtype=float32)}


  0%|          | 141/1000000 [00:35<50:51:43,  5.46it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 149/1000000 [00:35<25:35:05, 10.86it/s]

{'loss': Array(1.8214076, dtype=float32), 'loss_cross_entropy': Array(1.8214076, dtype=float32)}


  0%|          | 153/1000000 [00:36<40:02:39,  6.94it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 159/1000000 [00:36<24:51:30, 11.17it/s]

{'loss': Array(1.8007246, dtype=float32), 'loss_cross_entropy': Array(1.8007246, dtype=float32)}


  0%|          | 161/1000000 [00:37<48:23:20,  5.74it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 169/1000000 [00:38<25:37:01, 10.84it/s]

{'loss': Array(1.7797664, dtype=float32), 'loss_cross_entropy': Array(1.7797664, dtype=float32)}


  0%|          | 173/1000000 [00:39<40:30:35,  6.86it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 179/1000000 [00:39<24:05:45, 11.53it/s]

{'loss': Array(1.756506, dtype=float32), 'loss_cross_entropy': Array(1.756506, dtype=float32)}


  0%|          | 183/1000000 [00:40<42:21:01,  6.56it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 190/1000000 [00:41<33:33:40,  8.28it/s]

{'loss': Array(1.7421988, dtype=float32), 'loss_cross_entropy': Array(1.7421988, dtype=float32)}


  0%|          | 192/1000000 [00:42<55:54:05,  4.97it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 199/1000000 [00:42<27:41:45, 10.03it/s]

{'loss': Array(1.7246903, dtype=float32), 'loss_cross_entropy': Array(1.7246903, dtype=float32)}


  0%|          | 201/1000000 [00:43<52:18:06,  5.31it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 210/1000000 [00:43<24:30:31, 11.33it/s]

{'loss': Array(1.706753, dtype=float32), 'loss_cross_entropy': Array(1.706753, dtype=float32)}


  0%|          | 212/1000000 [00:44<46:42:50,  5.95it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 220/1000000 [00:45<26:36:42, 10.44it/s]

{'loss': Array(1.6947157, dtype=float32), 'loss_cross_entropy': Array(1.6947157, dtype=float32)}


  0%|          | 222/1000000 [00:46<51:05:02,  5.44it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 229/1000000 [00:46<26:24:47, 10.51it/s]

{'loss': Array(1.6821057, dtype=float32), 'loss_cross_entropy': Array(1.6821057, dtype=float32)}


  0%|          | 231/1000000 [00:47<50:43:42,  5.47it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 238/1000000 [00:47<26:54:40, 10.32it/s]

{'loss': Array(1.6702147, dtype=float32), 'loss_cross_entropy': Array(1.6702147, dtype=float32)}


  0%|          | 243/1000000 [00:48<39:02:29,  7.11it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 250/1000000 [00:49<44:21:43,  6.26it/s]

{'loss': Array(1.6605242, dtype=float32), 'loss_cross_entropy': Array(1.6605242, dtype=float32)}


  0%|          | 252/1000000 [00:50<67:43:18,  4.10it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 260/1000000 [00:50<30:03:28,  9.24it/s]

{'loss': Array(1.6514502, dtype=float32), 'loss_cross_entropy': Array(1.6514502, dtype=float32)}


  0%|          | 262/1000000 [00:51<53:20:10,  5.21it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 270/1000000 [00:52<26:07:56, 10.63it/s]

{'loss': Array(1.6430658, dtype=float32), 'loss_cross_entropy': Array(1.6430658, dtype=float32)}


  0%|          | 272/1000000 [00:53<50:10:40,  5.53it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 280/1000000 [00:53<25:06:18, 11.06it/s]

{'loss': Array(1.6346047, dtype=float32), 'loss_cross_entropy': Array(1.6346047, dtype=float32)}


  0%|          | 282/1000000 [00:54<49:29:43,  5.61it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 289/1000000 [00:54<26:39:06, 10.42it/s]

{'loss': Array(1.6293144, dtype=float32), 'loss_cross_entropy': Array(1.6293144, dtype=float32)}


  0%|          | 291/1000000 [00:55<52:59:23,  5.24it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 300/1000000 [00:56<24:05:45, 11.52it/s]

{'loss': Array(1.6229838, dtype=float32), 'loss_cross_entropy': Array(1.6229838, dtype=float32)}


  0%|          | 302/1000000 [00:56<54:11:22,  5.12it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 309/1000000 [00:57<27:00:39, 10.28it/s]

{'loss': Array(1.6172917, dtype=float32), 'loss_cross_entropy': Array(1.6172917, dtype=float32)}


  0%|          | 311/1000000 [00:58<52:44:12,  5.27it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 319/1000000 [00:59<34:09:25,  8.13it/s]

{'loss': Array(1.6129887, dtype=float32), 'loss_cross_entropy': Array(1.6129887, dtype=float32)}


  0%|          | 321/1000000 [01:00<55:44:02,  4.98it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 329/1000000 [01:00<26:17:40, 10.56it/s]

{'loss': Array(1.6087477, dtype=float32), 'loss_cross_entropy': Array(1.6087477, dtype=float32)}


  0%|          | 331/1000000 [01:01<53:30:52,  5.19it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 340/1000000 [01:01<24:14:55, 11.45it/s]

{'loss': Array(1.6061596, dtype=float32), 'loss_cross_entropy': Array(1.6061596, dtype=float32)}


  0%|          | 342/1000000 [01:02<48:14:45,  5.76it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 349/1000000 [01:03<26:31:47, 10.47it/s]

{'loss': Array(1.6035296, dtype=float32), 'loss_cross_entropy': Array(1.6035296, dtype=float32)}


  0%|          | 353/1000000 [01:04<41:20:26,  6.72it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 359/1000000 [01:04<24:40:16, 11.26it/s]

{'loss': Array(1.6020815, dtype=float32), 'loss_cross_entropy': Array(1.6020815, dtype=float32)}


  0%|          | 363/1000000 [01:05<42:38:05,  6.51it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 369/1000000 [01:05<25:08:00, 11.05it/s]

{'loss': Array(1.5996504, dtype=float32), 'loss_cross_entropy': Array(1.5996504, dtype=float32)}


  0%|          | 373/1000000 [01:06<40:59:09,  6.77it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 379/1000000 [01:07<37:55:32,  7.32it/s]

{'loss': Array(1.5982045, dtype=float32), 'loss_cross_entropy': Array(1.5982045, dtype=float32)}


  0%|          | 383/1000000 [01:08<47:45:09,  5.81it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 389/1000000 [01:08<26:43:35, 10.39it/s]

{'loss': Array(1.5973551, dtype=float32), 'loss_cross_entropy': Array(1.5973551, dtype=float32)}


  0%|          | 393/1000000 [01:09<43:29:31,  6.38it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 399/1000000 [01:10<26:12:58, 10.59it/s]

{'loss': Array(1.5965773, dtype=float32), 'loss_cross_entropy': Array(1.5965773, dtype=float32)}


  0%|          | 403/1000000 [01:11<43:12:16,  6.43it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 409/1000000 [01:11<25:07:19, 11.05it/s]

{'loss': Array(1.5960716, dtype=float32), 'loss_cross_entropy': Array(1.5960716, dtype=float32)}


  0%|          | 413/1000000 [01:12<41:01:25,  6.77it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 419/1000000 [01:12<24:33:06, 11.31it/s]

{'loss': Array(1.5953164, dtype=float32), 'loss_cross_entropy': Array(1.5953164, dtype=float32)}


  0%|          | 421/1000000 [01:13<53:44:28,  5.17it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 428/1000000 [01:14<26:48:59, 10.35it/s]

{'loss': Array(1.5947459, dtype=float32), 'loss_cross_entropy': Array(1.5947459, dtype=float32)}


  0%|          | 431/1000000 [01:14<47:08:25,  5.89it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 440/1000000 [01:16<44:25:22,  6.25it/s]

{'loss': Array(1.5940342, dtype=float32), 'loss_cross_entropy': Array(1.5940342, dtype=float32)}


  0%|          | 442/1000000 [01:16<64:54:57,  4.28it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 449/1000000 [01:17<30:07:13,  9.22it/s]

{'loss': Array(1.5941054, dtype=float32), 'loss_cross_entropy': Array(1.5941054, dtype=float32)}


  0%|          | 453/1000000 [01:18<44:51:15,  6.19it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 460/1000000 [01:18<23:47:40, 11.67it/s]

{'loss': Array(1.593597, dtype=float32), 'loss_cross_entropy': Array(1.593597, dtype=float32)}


  0%|          | 462/1000000 [01:19<50:22:52,  5.51it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 470/1000000 [01:20<25:05:04, 11.07it/s]

{'loss': Array(1.5922475, dtype=float32), 'loss_cross_entropy': Array(1.5922475, dtype=float32)}


  0%|          | 472/1000000 [01:20<54:57:42,  5.05it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 480/1000000 [01:21<25:09:08, 11.04it/s]

{'loss': Array(1.5917381, dtype=float32), 'loss_cross_entropy': Array(1.5917381, dtype=float32)}


  0%|          | 482/1000000 [01:22<48:27:14,  5.73it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 490/1000000 [01:22<23:31:14, 11.80it/s]

{'loss': Array(1.5916208, dtype=float32), 'loss_cross_entropy': Array(1.5916208, dtype=float32)}


  0%|          | 492/1000000 [01:23<54:24:25,  5.10it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 499/1000000 [01:23<26:17:13, 10.56it/s]

{'loss': Array(1.5901626, dtype=float32), 'loss_cross_entropy': Array(1.5901626, dtype=float32)}


  0%|          | 501/1000000 [01:31<299:40:03,  1.08s/it]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 508/1000000 [01:32<116:00:29,  2.39it/s]

{'loss': Array(1.589477, dtype=float32), 'loss_cross_entropy': Array(1.589477, dtype=float32)}


  0%|          | 513/1000000 [01:33<83:00:48,  3.34it/s] 

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 519/1000000 [01:33<40:47:02,  6.81it/s]

{'loss': Array(1.5891417, dtype=float32), 'loss_cross_entropy': Array(1.5891417, dtype=float32)}


  0%|          | 521/1000000 [01:34<64:41:09,  4.29it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 530/1000000 [01:35<27:24:47, 10.13it/s]

{'loss': Array(1.5886483, dtype=float32), 'loss_cross_entropy': Array(1.5886483, dtype=float32)}


  0%|          | 532/1000000 [01:35<51:47:55,  5.36it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 539/1000000 [01:36<27:28:57, 10.10it/s]

{'loss': Array(1.5888028, dtype=float32), 'loss_cross_entropy': Array(1.5888028, dtype=float32)}


  0%|          | 541/1000000 [01:37<52:04:38,  5.33it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 550/1000000 [01:37<24:22:07, 11.39it/s]

{'loss': Array(1.5885557, dtype=float32), 'loss_cross_entropy': Array(1.5885557, dtype=float32)}


  0%|          | 552/1000000 [01:38<47:33:56,  5.84it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 560/1000000 [01:38<23:46:22, 11.68it/s]

{'loss': Array(1.5883409, dtype=float32), 'loss_cross_entropy': Array(1.5883409, dtype=float32)}


  0%|          | 562/1000000 [01:39<52:02:46,  5.33it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 569/1000000 [01:40<40:55:11,  6.78it/s]

{'loss': Array(1.5880108, dtype=float32), 'loss_cross_entropy': Array(1.5880108, dtype=float32)}


  0%|          | 571/1000000 [01:41<62:26:06,  4.45it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 580/1000000 [01:42<26:23:30, 10.52it/s]

{'loss': Array(1.5877079, dtype=float32), 'loss_cross_entropy': Array(1.5877079, dtype=float32)}


  0%|          | 582/1000000 [01:42<52:38:07,  5.27it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 590/1000000 [01:43<24:44:36, 11.22it/s]

{'loss': Array(1.5876025, dtype=float32), 'loss_cross_entropy': Array(1.5876025, dtype=float32)}


  0%|          | 592/1000000 [01:44<49:08:23,  5.65it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 600/1000000 [01:44<24:44:00, 11.22it/s]

{'loss': Array(1.5867671, dtype=float32), 'loss_cross_entropy': Array(1.5867671, dtype=float32)}


  0%|          | 602/1000000 [01:45<49:25:58,  5.62it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 608/1000000 [01:45<29:14:07,  9.50it/s]

{'loss': Array(1.5864232, dtype=float32), 'loss_cross_entropy': Array(1.5864232, dtype=float32)}


  0%|          | 613/1000000 [01:46<40:05:11,  6.93it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 620/1000000 [01:47<23:28:38, 11.82it/s]

{'loss': Array(1.5864793, dtype=float32), 'loss_cross_entropy': Array(1.5864793, dtype=float32)}


  0%|          | 622/1000000 [01:48<47:59:56,  5.78it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 629/1000000 [01:48<25:51:07, 10.74it/s]

{'loss': Array(1.5860404, dtype=float32), 'loss_cross_entropy': Array(1.5860404, dtype=float32)}


  0%|          | 633/1000000 [01:49<56:05:47,  4.95it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 639/1000000 [01:50<31:00:25,  8.95it/s]

{'loss': Array(1.5854603, dtype=float32), 'loss_cross_entropy': Array(1.5854603, dtype=float32)}


  0%|          | 643/1000000 [01:51<45:14:29,  6.14it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 649/1000000 [01:51<26:10:00, 10.61it/s]

{'loss': Array(1.5853318, dtype=float32), 'loss_cross_entropy': Array(1.5853318, dtype=float32)}


  0%|          | 653/1000000 [01:52<40:39:22,  6.83it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 659/1000000 [01:52<24:02:10, 11.55it/s]

{'loss': Array(1.5847429, dtype=float32), 'loss_cross_entropy': Array(1.5847429, dtype=float32)}


  0%|          | 663/1000000 [01:53<41:37:03,  6.67it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 670/1000000 [01:54<22:32:49, 12.31it/s]

{'loss': Array(1.5841554, dtype=float32), 'loss_cross_entropy': Array(1.5841554, dtype=float32)}


  0%|          | 672/1000000 [01:55<52:14:57,  5.31it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 679/1000000 [01:55<27:34:07, 10.07it/s]

{'loss': Array(1.5831896, dtype=float32), 'loss_cross_entropy': Array(1.5831896, dtype=float32)}


  0%|          | 683/1000000 [01:56<44:16:22,  6.27it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 690/1000000 [01:56<23:18:08, 11.91it/s]

{'loss': Array(1.5820878, dtype=float32), 'loss_cross_entropy': Array(1.5820878, dtype=float32)}
context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 699/1000000 [01:58<33:48:30,  8.21it/s]

{'loss': Array(1.5811342, dtype=float32), 'loss_cross_entropy': Array(1.5811342, dtype=float32)}


  0%|          | 703/1000000 [01:59<46:48:41,  5.93it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 710/1000000 [02:00<27:09:16, 10.22it/s]

{'loss': Array(1.5801123, dtype=float32), 'loss_cross_entropy': Array(1.5801123, dtype=float32)}


  0%|          | 712/1000000 [02:01<53:44:29,  5.17it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 720/1000000 [02:01<25:50:34, 10.74it/s]

{'loss': Array(1.5784479, dtype=float32), 'loss_cross_entropy': Array(1.5784479, dtype=float32)}


  0%|          | 722/1000000 [02:02<52:18:48,  5.31it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 730/1000000 [02:02<25:58:46, 10.68it/s]

{'loss': Array(1.5765914, dtype=float32), 'loss_cross_entropy': Array(1.5765914, dtype=float32)}


  0%|          | 732/1000000 [02:03<50:47:00,  5.47it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 740/1000000 [02:04<25:28:36, 10.90it/s]

{'loss': Array(1.5744909, dtype=float32), 'loss_cross_entropy': Array(1.5744909, dtype=float32)}


  0%|          | 742/1000000 [02:04<49:54:43,  5.56it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 750/1000000 [02:05<25:04:01, 11.07it/s]

{'loss': Array(1.5717472, dtype=float32), 'loss_cross_entropy': Array(1.5717472, dtype=float32)}


  0%|          | 752/1000000 [02:06<54:10:30,  5.12it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 759/1000000 [02:07<40:52:18,  6.79it/s]

{'loss': Array(1.56772, dtype=float32), 'loss_cross_entropy': Array(1.56772, dtype=float32)}


  0%|          | 763/1000000 [02:08<49:48:19,  5.57it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 768/1000000 [02:08<30:28:37,  9.11it/s]

{'loss': Array(1.5632147, dtype=float32), 'loss_cross_entropy': Array(1.5632147, dtype=float32)}


  0%|          | 773/1000000 [02:09<42:09:03,  6.58it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 780/1000000 [02:09<23:46:06, 11.68it/s]

{'loss': Array(1.5564483, dtype=float32), 'loss_cross_entropy': Array(1.5564483, dtype=float32)}


  0%|          | 782/1000000 [02:10<51:02:05,  5.44it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 789/1000000 [02:11<26:49:11, 10.35it/s]

{'loss': Array(1.5477256, dtype=float32), 'loss_cross_entropy': Array(1.5477256, dtype=float32)}


  0%|          | 791/1000000 [02:12<52:11:50,  5.32it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 799/1000000 [02:12<25:10:14, 11.03it/s]

{'loss': Array(1.5386417, dtype=float32), 'loss_cross_entropy': Array(1.5386417, dtype=float32)}


  0%|          | 801/1000000 [02:13<49:41:51,  5.58it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 810/1000000 [02:13<24:10:12, 11.48it/s]

{'loss': Array(1.5251468, dtype=float32), 'loss_cross_entropy': Array(1.5251468, dtype=float32)}


  0%|          | 812/1000000 [02:14<46:36:45,  5.95it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 819/1000000 [02:15<48:29:33,  5.72it/s]

{'loss': Array(1.5158702, dtype=float32), 'loss_cross_entropy': Array(1.5158702, dtype=float32)}


  0%|          | 823/1000000 [02:16<52:54:28,  5.25it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 830/1000000 [02:16<26:56:40, 10.30it/s]

{'loss': Array(1.510241, dtype=float32), 'loss_cross_entropy': Array(1.510241, dtype=float32)}


  0%|          | 832/1000000 [02:17<55:05:46,  5.04it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 839/1000000 [02:18<27:57:35,  9.93it/s]

{'loss': Array(1.4991102, dtype=float32), 'loss_cross_entropy': Array(1.4991102, dtype=float32)}


  0%|          | 841/1000000 [02:19<53:35:51,  5.18it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 850/1000000 [02:19<24:28:02, 11.34it/s]

{'loss': Array(1.4860034, dtype=float32), 'loss_cross_entropy': Array(1.4860034, dtype=float32)}


  0%|          | 852/1000000 [02:20<48:53:13,  5.68it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 860/1000000 [02:20<25:06:59, 11.05it/s]

{'loss': Array(1.4765571, dtype=float32), 'loss_cross_entropy': Array(1.4765571, dtype=float32)}


  0%|          | 862/1000000 [02:21<48:08:47,  5.76it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 869/1000000 [02:22<26:15:06, 10.57it/s]

{'loss': Array(1.4651885, dtype=float32), 'loss_cross_entropy': Array(1.4651885, dtype=float32)}


  0%|          | 873/1000000 [02:23<41:36:47,  6.67it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 879/1000000 [02:23<25:04:15, 11.07it/s]

{'loss': Array(1.4501907, dtype=float32), 'loss_cross_entropy': Array(1.4501907, dtype=float32)}


  0%|          | 881/1000000 [02:24<48:30:01,  5.72it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 890/1000000 [02:25<31:41:08,  8.76it/s]

{'loss': Array(1.4379433, dtype=float32), 'loss_cross_entropy': Array(1.4379433, dtype=float32)}


  0%|          | 892/1000000 [02:26<56:14:43,  4.93it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 898/1000000 [02:26<31:09:11,  8.91it/s]

{'loss': Array(1.4245975, dtype=float32), 'loss_cross_entropy': Array(1.4245975, dtype=float32)}


  0%|          | 903/1000000 [02:27<41:16:39,  6.72it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 908/1000000 [02:27<27:40:52, 10.03it/s]

{'loss': Array(1.4117655, dtype=float32), 'loss_cross_entropy': Array(1.4117655, dtype=float32)}


  0%|          | 911/1000000 [02:28<47:34:18,  5.83it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 919/1000000 [02:29<25:41:57, 10.80it/s]

{'loss': Array(1.39444, dtype=float32), 'loss_cross_entropy': Array(1.39444, dtype=float32)}


  0%|          | 923/1000000 [02:30<40:33:05,  6.84it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 930/1000000 [02:30<23:29:42, 11.81it/s]

{'loss': Array(1.3822275, dtype=float32), 'loss_cross_entropy': Array(1.3822275, dtype=float32)}


  0%|          | 932/1000000 [02:31<49:00:14,  5.66it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 940/1000000 [02:31<24:59:27, 11.10it/s]

{'loss': Array(1.3680387, dtype=float32), 'loss_cross_entropy': Array(1.3680387, dtype=float32)}


  0%|          | 942/1000000 [02:32<51:14:36,  5.42it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 950/1000000 [02:33<38:28:15,  7.21it/s]

{'loss': Array(1.3521672, dtype=float32), 'loss_cross_entropy': Array(1.3521672, dtype=float32)}


  0%|          | 952/1000000 [02:34<57:57:48,  4.79it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 960/1000000 [02:34<27:37:14, 10.05it/s]

{'loss': Array(1.3407137, dtype=float32), 'loss_cross_entropy': Array(1.3407137, dtype=float32)}


  0%|          | 962/1000000 [02:35<51:27:40,  5.39it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 970/1000000 [02:36<26:30:16, 10.47it/s]

{'loss': Array(1.3175071, dtype=float32), 'loss_cross_entropy': Array(1.3175071, dtype=float32)}


  0%|          | 972/1000000 [02:36<51:24:01,  5.40it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 980/1000000 [02:37<25:23:30, 10.93it/s]

{'loss': Array(1.3076029, dtype=float32), 'loss_cross_entropy': Array(1.3076029, dtype=float32)}


  0%|          | 982/1000000 [02:38<49:16:29,  5.63it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 988/1000000 [02:38<29:13:21,  9.50it/s]

{'loss': Array(1.2956771, dtype=float32), 'loss_cross_entropy': Array(1.2956771, dtype=float32)}


  0%|          | 991/1000000 [02:39<46:40:12,  5.95it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 999/1000000 [02:39<25:51:29, 10.73it/s]

{'loss': Array(1.2733215, dtype=float32), 'loss_cross_entropy': Array(1.2733215, dtype=float32)}


  0%|          | 1001/1000000 [02:47<277:53:32,  1.00s/it]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1010/1000000 [02:48<109:52:03,  2.53it/s]

{'loss': Array(1.2600378, dtype=float32), 'loss_cross_entropy': Array(1.2600378, dtype=float32)}


  0%|          | 1012/1000000 [02:49<112:03:06,  2.48it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1018/1000000 [02:49<53:23:04,  5.20it/s] 

{'loss': Array(1.2459862, dtype=float32), 'loss_cross_entropy': Array(1.2459862, dtype=float32)}


  0%|          | 1023/1000000 [02:50<52:36:06,  5.28it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1029/1000000 [02:50<28:47:20,  9.64it/s]

{'loss': Array(1.2244707, dtype=float32), 'loss_cross_entropy': Array(1.2244707, dtype=float32)}


  0%|          | 1033/1000000 [02:52<44:55:31,  6.18it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1039/1000000 [02:52<25:19:53, 10.95it/s]

{'loss': Array(1.2089063, dtype=float32), 'loss_cross_entropy': Array(1.2089063, dtype=float32)}


  0%|          | 1043/1000000 [02:53<42:08:19,  6.59it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1049/1000000 [02:53<24:46:16, 11.20it/s]

{'loss': Array(1.1913664, dtype=float32), 'loss_cross_entropy': Array(1.1913664, dtype=float32)}


  0%|          | 1053/1000000 [02:54<40:20:45,  6.88it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1059/1000000 [02:54<25:15:28, 10.99it/s]

{'loss': Array(1.1711016, dtype=float32), 'loss_cross_entropy': Array(1.1711016, dtype=float32)}


  0%|          | 1061/1000000 [02:55<50:18:00,  5.52it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1070/1000000 [02:56<24:39:43, 11.25it/s]

{'loss': Array(1.1537464, dtype=float32), 'loss_cross_entropy': Array(1.1537464, dtype=float32)}


  0%|          | 1072/1000000 [02:57<52:00:13,  5.34it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1079/1000000 [02:58<36:34:44,  7.59it/s]

{'loss': Array(1.1484298, dtype=float32), 'loss_cross_entropy': Array(1.1484298, dtype=float32)}


  0%|          | 1083/1000000 [02:59<46:01:18,  6.03it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1090/1000000 [02:59<24:17:55, 11.42it/s]

{'loss': Array(1.1372083, dtype=float32), 'loss_cross_entropy': Array(1.1372083, dtype=float32)}


  0%|          | 1092/1000000 [03:00<52:34:37,  5.28it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1099/1000000 [03:00<27:57:09,  9.93it/s]

{'loss': Array(1.1109102, dtype=float32), 'loss_cross_entropy': Array(1.1109102, dtype=float32)}


  0%|          | 1103/1000000 [03:01<43:47:11,  6.34it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1110/1000000 [03:02<23:46:09, 11.67it/s]

{'loss': Array(1.1044337, dtype=float32), 'loss_cross_entropy': Array(1.1044337, dtype=float32)}


  0%|          | 1112/1000000 [03:02<51:03:46,  5.43it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1119/1000000 [03:03<26:37:25, 10.42it/s]

{'loss': Array(1.0866228, dtype=float32), 'loss_cross_entropy': Array(1.0866228, dtype=float32)}


  0%|          | 1123/1000000 [03:04<42:51:17,  6.47it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1129/1000000 [03:04<25:26:14, 10.91it/s]

{'loss': Array(1.0809323, dtype=float32), 'loss_cross_entropy': Array(1.0809323, dtype=float32)}


  0%|          | 1131/1000000 [03:05<51:35:02,  5.38it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1140/1000000 [03:06<36:40:32,  7.57it/s]

{'loss': Array(1.0637424, dtype=float32), 'loss_cross_entropy': Array(1.0637424, dtype=float32)}


  0%|          | 1142/1000000 [03:07<56:19:40,  4.93it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1150/1000000 [03:07<28:21:58,  9.78it/s]

{'loss': Array(1.0530542, dtype=float32), 'loss_cross_entropy': Array(1.0530542, dtype=float32)}


  0%|          | 1152/1000000 [03:08<52:54:05,  5.24it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1159/1000000 [03:09<27:34:25, 10.06it/s]

{'loss': Array(1.0404619, dtype=float32), 'loss_cross_entropy': Array(1.0404619, dtype=float32)}


  0%|          | 1161/1000000 [03:10<54:36:29,  5.08it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1168/1000000 [03:10<28:10:18,  9.85it/s]

{'loss': Array(1.0129144, dtype=float32), 'loss_cross_entropy': Array(1.0129144, dtype=float32)}


  0%|          | 1171/1000000 [03:11<48:36:59,  5.71it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1178/1000000 [03:11<26:51:57, 10.33it/s]

{'loss': Array(1.0039333, dtype=float32), 'loss_cross_entropy': Array(1.0039333, dtype=float32)}


  0%|          | 1181/1000000 [03:12<47:04:49,  5.89it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1189/1000000 [03:13<25:10:23, 11.02it/s]

{'loss': Array(1.0079678, dtype=float32), 'loss_cross_entropy': Array(1.0079678, dtype=float32)}


  0%|          | 1193/1000000 [03:14<41:16:08,  6.72it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1198/1000000 [03:14<26:14:42, 10.57it/s]

{'loss': Array(0.98950857, dtype=float32), 'loss_cross_entropy': Array(0.98950857, dtype=float32)}


  0%|          | 1201/1000000 [03:15<47:58:04,  5.78it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1209/1000000 [03:16<31:42:03,  8.75it/s]

{'loss': Array(0.9805188, dtype=float32), 'loss_cross_entropy': Array(0.9805188, dtype=float32)}


  0%|          | 1211/1000000 [03:17<58:26:01,  4.75it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1220/1000000 [03:17<25:28:24, 10.89it/s]

{'loss': Array(0.96263903, dtype=float32), 'loss_cross_entropy': Array(0.96263903, dtype=float32)}


  0%|          | 1222/1000000 [03:18<51:22:27,  5.40it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1230/1000000 [03:18<25:09:54, 11.02it/s]

{'loss': Array(0.970319, dtype=float32), 'loss_cross_entropy': Array(0.970319, dtype=float32)}


  0%|          | 1232/1000000 [03:19<52:58:32,  5.24it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1240/1000000 [03:20<25:12:37, 11.00it/s]

{'loss': Array(0.94085467, dtype=float32), 'loss_cross_entropy': Array(0.94085467, dtype=float32)}


  0%|          | 1242/1000000 [03:21<52:57:21,  5.24it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1248/1000000 [03:21<29:03:34,  9.55it/s]

{'loss': Array(0.92883676, dtype=float32), 'loss_cross_entropy': Array(0.92883676, dtype=float32)}


  0%|          | 1251/1000000 [03:22<48:20:24,  5.74it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1259/1000000 [03:22<25:54:02, 10.71it/s]

{'loss': Array(0.9227541, dtype=float32), 'loss_cross_entropy': Array(0.9227541, dtype=float32)}


  0%|          | 1261/1000000 [03:23<48:51:58,  5.68it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1270/1000000 [03:24<38:37:42,  7.18it/s]

{'loss': Array(0.90686667, dtype=float32), 'loss_cross_entropy': Array(0.90686667, dtype=float32)}


  0%|          | 1272/1000000 [03:25<63:52:43,  4.34it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1278/1000000 [03:25<34:03:35,  8.15it/s]

{'loss': Array(0.8922989, dtype=float32), 'loss_cross_entropy': Array(0.8922989, dtype=float32)}


  0%|          | 1281/1000000 [03:26<51:02:13,  5.44it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1290/1000000 [03:27<25:24:53, 10.92it/s]

{'loss': Array(0.8867084, dtype=float32), 'loss_cross_entropy': Array(0.8867084, dtype=float32)}


  0%|          | 1292/1000000 [03:28<46:45:05,  5.93it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1300/1000000 [03:28<24:42:01, 11.23it/s]

{'loss': Array(0.8794324, dtype=float32), 'loss_cross_entropy': Array(0.8794324, dtype=float32)}


  0%|          | 1302/1000000 [03:29<47:23:03,  5.85it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1310/1000000 [03:29<24:24:27, 11.37it/s]

{'loss': Array(0.85794556, dtype=float32), 'loss_cross_entropy': Array(0.85794556, dtype=float32)}


  0%|          | 1312/1000000 [03:30<49:15:25,  5.63it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1318/1000000 [03:30<29:00:47,  9.56it/s]

{'loss': Array(0.86711067, dtype=float32), 'loss_cross_entropy': Array(0.86711067, dtype=float32)}


  0%|          | 1321/1000000 [03:31<46:26:21,  5.97it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1329/1000000 [03:32<42:31:17,  6.52it/s]

{'loss': Array(0.866169, dtype=float32), 'loss_cross_entropy': Array(0.866169, dtype=float32)}


  0%|          | 1331/1000000 [03:33<61:14:39,  4.53it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1340/1000000 [03:34<27:57:56,  9.92it/s]

{'loss': Array(0.8488787, dtype=float32), 'loss_cross_entropy': Array(0.8488787, dtype=float32)}


  0%|          | 1342/1000000 [03:34<48:04:47,  5.77it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1350/1000000 [03:35<26:02:36, 10.65it/s]

{'loss': Array(0.8259124, dtype=float32), 'loss_cross_entropy': Array(0.8259124, dtype=float32)}


  0%|          | 1352/1000000 [03:36<47:56:46,  5.79it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1358/1000000 [03:36<28:27:52,  9.75it/s]

{'loss': Array(0.80587703, dtype=float32), 'loss_cross_entropy': Array(0.80587703, dtype=float32)}


  0%|          | 1363/1000000 [03:37<38:11:32,  7.26it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1369/1000000 [03:37<24:20:58, 11.39it/s]

{'loss': Array(0.8290399, dtype=float32), 'loss_cross_entropy': Array(0.8290399, dtype=float32)}


  0%|          | 1373/1000000 [03:38<38:29:04,  7.21it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1379/1000000 [03:38<24:15:58, 11.43it/s]

{'loss': Array(0.8033892, dtype=float32), 'loss_cross_entropy': Array(0.8033892, dtype=float32)}


  0%|          | 1381/1000000 [03:39<46:06:38,  6.02it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1390/1000000 [03:40<23:51:41, 11.63it/s]

{'loss': Array(0.78806055, dtype=float32), 'loss_cross_entropy': Array(0.78806055, dtype=float32)}


  0%|          | 1392/1000000 [03:41<51:27:09,  5.39it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1399/1000000 [03:42<36:19:05,  7.64it/s]

{'loss': Array(0.7894793, dtype=float32), 'loss_cross_entropy': Array(0.7894793, dtype=float32)}


  0%|          | 1401/1000000 [03:42<59:50:35,  4.64it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1410/1000000 [03:43<26:06:24, 10.63it/s]

{'loss': Array(0.7863285, dtype=float32), 'loss_cross_entropy': Array(0.7863285, dtype=float32)}


  0%|          | 1412/1000000 [03:44<47:39:33,  5.82it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1418/1000000 [03:44<28:23:02,  9.77it/s]

{'loss': Array(0.7672703, dtype=float32), 'loss_cross_entropy': Array(0.7672703, dtype=float32)}


  0%|          | 1421/1000000 [03:45<46:29:44,  5.97it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1430/1000000 [03:45<24:04:10, 11.52it/s]

{'loss': Array(0.7745437, dtype=float32), 'loss_cross_entropy': Array(0.7745437, dtype=float32)}


  0%|          | 1432/1000000 [03:46<47:10:26,  5.88it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1440/1000000 [03:47<25:00:27, 11.09it/s]

{'loss': Array(0.7462983, dtype=float32), 'loss_cross_entropy': Array(0.7462983, dtype=float32)}


  0%|          | 1442/1000000 [03:48<49:40:22,  5.58it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1448/1000000 [03:48<29:23:37,  9.44it/s]

{'loss': Array(0.74013513, dtype=float32), 'loss_cross_entropy': Array(0.74013513, dtype=float32)}


  0%|          | 1451/1000000 [03:49<47:31:24,  5.84it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1459/1000000 [03:50<38:14:33,  7.25it/s]

{'loss': Array(0.72419196, dtype=float32), 'loss_cross_entropy': Array(0.72419196, dtype=float32)}


  0%|          | 1461/1000000 [03:51<57:11:33,  4.85it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1470/1000000 [03:51<26:25:24, 10.50it/s]

{'loss': Array(0.7318619, dtype=float32), 'loss_cross_entropy': Array(0.7318619, dtype=float32)}


  0%|          | 1472/1000000 [03:52<47:53:03,  5.79it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1478/1000000 [03:52<28:49:57,  9.62it/s]

{'loss': Array(0.69425565, dtype=float32), 'loss_cross_entropy': Array(0.69425565, dtype=float32)}


  0%|          | 1481/1000000 [03:53<46:23:38,  5.98it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1490/1000000 [03:54<24:23:36, 11.37it/s]

{'loss': Array(0.7215164, dtype=float32), 'loss_cross_entropy': Array(0.7215164, dtype=float32)}


  0%|          | 1492/1000000 [03:54<44:48:03,  6.19it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1499/1000000 [03:55<27:25:35, 10.11it/s]

{'loss': Array(0.7167545, dtype=float32), 'loss_cross_entropy': Array(0.7167545, dtype=float32)}


  0%|          | 1501/1000000 [04:02<293:00:57,  1.06s/it]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1510/1000000 [04:03<92:15:47,  3.01it/s] 

{'loss': Array(0.69418174, dtype=float32), 'loss_cross_entropy': Array(0.69418174, dtype=float32)}


  0%|          | 1512/1000000 [04:03<97:23:18,  2.85it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1520/1000000 [04:04<60:53:46,  4.55it/s]

{'loss': Array(0.69335306, dtype=float32), 'loss_cross_entropy': Array(0.69335306, dtype=float32)}


  0%|          | 1522/1000000 [04:05<76:45:24,  3.61it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1528/1000000 [04:06<40:43:44,  6.81it/s]

{'loss': Array(0.67682403, dtype=float32), 'loss_cross_entropy': Array(0.67682403, dtype=float32)}


  0%|          | 1531/1000000 [04:07<54:41:31,  5.07it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1540/1000000 [04:07<26:34:00, 10.44it/s]

{'loss': Array(0.6780991, dtype=float32), 'loss_cross_entropy': Array(0.6780991, dtype=float32)}


  0%|          | 1542/1000000 [04:08<47:11:04,  5.88it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1548/1000000 [04:08<28:41:58,  9.66it/s]

{'loss': Array(0.6848592, dtype=float32), 'loss_cross_entropy': Array(0.6848592, dtype=float32)}


  0%|          | 1551/1000000 [04:09<45:28:47,  6.10it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1559/1000000 [04:09<25:25:30, 10.91it/s]

{'loss': Array(0.6515812, dtype=float32), 'loss_cross_entropy': Array(0.6515812, dtype=float32)}


  0%|          | 1561/1000000 [04:10<49:47:13,  5.57it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1570/1000000 [04:11<24:18:56, 11.41it/s]

{'loss': Array(0.652137, dtype=float32), 'loss_cross_entropy': Array(0.652137, dtype=float32)}


  0%|          | 1572/1000000 [04:12<46:37:14,  5.95it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1578/1000000 [04:12<28:25:06,  9.76it/s]

{'loss': Array(0.6637025, dtype=float32), 'loss_cross_entropy': Array(0.6637025, dtype=float32)}


  0%|          | 1581/1000000 [04:13<46:21:05,  5.98it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1588/1000000 [04:14<34:51:06,  7.96it/s]

{'loss': Array(0.63909364, dtype=float32), 'loss_cross_entropy': Array(0.63909364, dtype=float32)}


  0%|          | 1591/1000000 [04:15<50:36:26,  5.48it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1599/1000000 [04:15<26:24:40, 10.50it/s]

{'loss': Array(0.64905274, dtype=float32), 'loss_cross_entropy': Array(0.64905274, dtype=float32)}


  0%|          | 1603/1000000 [04:16<39:27:50,  7.03it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1609/1000000 [04:16<24:42:48, 11.22it/s]

{'loss': Array(0.6231575, dtype=float32), 'loss_cross_entropy': Array(0.6231575, dtype=float32)}


  0%|          | 1611/1000000 [04:17<47:56:50,  5.78it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1620/1000000 [04:18<23:43:01, 11.69it/s]

{'loss': Array(0.6294414, dtype=float32), 'loss_cross_entropy': Array(0.6294414, dtype=float32)}


  0%|          | 1622/1000000 [04:18<47:23:29,  5.85it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1628/1000000 [04:19<28:43:24,  9.66it/s]

{'loss': Array(0.62007254, dtype=float32), 'loss_cross_entropy': Array(0.62007254, dtype=float32)}


  0%|          | 1631/1000000 [04:20<46:01:31,  6.03it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1639/1000000 [04:20<24:57:34, 11.11it/s]

{'loss': Array(0.60942584, dtype=float32), 'loss_cross_entropy': Array(0.60942584, dtype=float32)}


  0%|          | 1641/1000000 [04:21<47:45:49,  5.81it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1648/1000000 [04:22<41:09:16,  6.74it/s]

{'loss': Array(0.6289971, dtype=float32), 'loss_cross_entropy': Array(0.6289971, dtype=float32)}


  0%|          | 1651/1000000 [04:23<55:34:54,  4.99it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1660/1000000 [04:23<26:02:51, 10.65it/s]

{'loss': Array(0.6183906, dtype=float32), 'loss_cross_entropy': Array(0.6183906, dtype=float32)}


  0%|          | 1662/1000000 [04:24<47:55:34,  5.79it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1668/1000000 [04:24<28:46:49,  9.64it/s]

{'loss': Array(0.6235861, dtype=float32), 'loss_cross_entropy': Array(0.6235861, dtype=float32)}


  0%|          | 1671/1000000 [04:25<46:42:40,  5.94it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1679/1000000 [04:26<25:48:14, 10.75it/s]

{'loss': Array(0.6057455, dtype=float32), 'loss_cross_entropy': Array(0.6057455, dtype=float32)}


  0%|          | 1683/1000000 [04:27<41:39:35,  6.66it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1690/1000000 [04:27<23:18:29, 11.90it/s]

{'loss': Array(0.5911462, dtype=float32), 'loss_cross_entropy': Array(0.5911462, dtype=float32)}


  0%|          | 1692/1000000 [04:28<51:03:01,  5.43it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1700/1000000 [04:28<25:11:23, 11.01it/s]

{'loss': Array(0.5830214, dtype=float32), 'loss_cross_entropy': Array(0.5830214, dtype=float32)}


  0%|          | 1702/1000000 [04:29<54:05:35,  5.13it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1709/1000000 [04:30<29:18:16,  9.46it/s]

{'loss': Array(0.5730605, dtype=float32), 'loss_cross_entropy': Array(0.5730605, dtype=float32)}


  0%|          | 1711/1000000 [04:31<76:36:33,  3.62it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1719/1000000 [04:32<32:06:46,  8.64it/s]

{'loss': Array(0.5851135, dtype=float32), 'loss_cross_entropy': Array(0.5851135, dtype=float32)}


  0%|          | 1721/1000000 [04:32<54:45:21,  5.06it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1730/1000000 [04:33<25:20:00, 10.95it/s]

{'loss': Array(0.5702849, dtype=float32), 'loss_cross_entropy': Array(0.5702849, dtype=float32)}


  0%|          | 1732/1000000 [04:34<47:00:08,  5.90it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1738/1000000 [04:34<28:16:19,  9.81it/s]

{'loss': Array(0.55883455, dtype=float32), 'loss_cross_entropy': Array(0.55883455, dtype=float32)}


  0%|          | 1743/1000000 [04:35<40:30:33,  6.85it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1750/1000000 [04:35<23:33:39, 11.77it/s]

{'loss': Array(0.5780471, dtype=float32), 'loss_cross_entropy': Array(0.5780471, dtype=float32)}


  0%|          | 1752/1000000 [04:36<49:36:25,  5.59it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1758/1000000 [04:37<27:46:58,  9.98it/s]

{'loss': Array(0.5611991, dtype=float32), 'loss_cross_entropy': Array(0.5611991, dtype=float32)}


  0%|          | 1763/1000000 [04:38<39:11:44,  7.07it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1770/1000000 [04:38<22:18:57, 12.43it/s]

{'loss': Array(0.5604396, dtype=float32), 'loss_cross_entropy': Array(0.5604396, dtype=float32)}


  0%|          | 1772/1000000 [04:39<47:45:27,  5.81it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1780/1000000 [04:40<32:22:29,  8.56it/s]

{'loss': Array(0.5600655, dtype=float32), 'loss_cross_entropy': Array(0.5600655, dtype=float32)}


  0%|          | 1782/1000000 [04:41<60:29:11,  4.58it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1789/1000000 [04:41<28:02:16,  9.89it/s]

{'loss': Array(0.545736, dtype=float32), 'loss_cross_entropy': Array(0.545736, dtype=float32)}


  0%|          | 1791/1000000 [04:42<54:48:58,  5.06it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1799/1000000 [04:42<26:15:31, 10.56it/s]

{'loss': Array(0.5454672, dtype=float32), 'loss_cross_entropy': Array(0.5454672, dtype=float32)}


  0%|          | 1803/1000000 [04:43<41:04:32,  6.75it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1809/1000000 [04:44<25:06:52, 11.04it/s]

{'loss': Array(0.5466252, dtype=float32), 'loss_cross_entropy': Array(0.5466252, dtype=float32)}


  0%|          | 1811/1000000 [04:45<48:23:34,  5.73it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1820/1000000 [04:45<22:39:32, 12.24it/s]

{'loss': Array(0.5341348, dtype=float32), 'loss_cross_entropy': Array(0.5341348, dtype=float32)}


  0%|          | 1822/1000000 [04:46<50:31:11,  5.49it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1830/1000000 [04:46<25:05:54, 11.05it/s]

{'loss': Array(0.52137595, dtype=float32), 'loss_cross_entropy': Array(0.52137595, dtype=float32)}


  0%|          | 1832/1000000 [04:47<48:16:46,  5.74it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1840/1000000 [04:48<37:33:34,  7.38it/s]

{'loss': Array(0.547935, dtype=float32), 'loss_cross_entropy': Array(0.547935, dtype=float32)}


  0%|          | 1842/1000000 [04:49<60:18:12,  4.60it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1850/1000000 [04:50<30:47:09,  9.01it/s]

{'loss': Array(0.5285633, dtype=float32), 'loss_cross_entropy': Array(0.5285633, dtype=float32)}


  0%|          | 1852/1000000 [04:51<55:01:55,  5.04it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1859/1000000 [04:51<28:02:21,  9.89it/s]

{'loss': Array(0.5162942, dtype=float32), 'loss_cross_entropy': Array(0.5162942, dtype=float32)}


  0%|          | 1863/1000000 [04:52<42:39:59,  6.50it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1868/1000000 [04:52<27:27:14, 10.10it/s]

{'loss': Array(0.50240755, dtype=float32), 'loss_cross_entropy': Array(0.50240755, dtype=float32)}


  0%|          | 1873/1000000 [04:53<39:16:03,  7.06it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1879/1000000 [04:53<23:53:09, 11.61it/s]

{'loss': Array(0.5324311, dtype=float32), 'loss_cross_entropy': Array(0.5324311, dtype=float32)}


  0%|          | 1883/1000000 [04:54<40:52:02,  6.78it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1889/1000000 [04:55<24:53:35, 11.14it/s]

{'loss': Array(0.5167782, dtype=float32), 'loss_cross_entropy': Array(0.5167782, dtype=float32)}


  0%|          | 1891/1000000 [04:56<53:14:30,  5.21it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1899/1000000 [04:56<25:22:58, 10.92it/s]

{'loss': Array(0.49745527, dtype=float32), 'loss_cross_entropy': Array(0.49745527, dtype=float32)}


  0%|          | 1901/1000000 [04:57<71:29:34,  3.88it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1909/1000000 [04:58<31:51:48,  8.70it/s]

{'loss': Array(0.48917317, dtype=float32), 'loss_cross_entropy': Array(0.48917317, dtype=float32)}


  0%|          | 1913/1000000 [04:59<45:08:11,  6.14it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1920/1000000 [04:59<24:26:12, 11.35it/s]

{'loss': Array(0.49208498, dtype=float32), 'loss_cross_entropy': Array(0.49208498, dtype=float32)}


  0%|          | 1922/1000000 [05:00<53:23:08,  5.19it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1928/1000000 [05:01<29:50:05,  9.29it/s]

{'loss': Array(0.5021489, dtype=float32), 'loss_cross_entropy': Array(0.5021489, dtype=float32)}


  0%|          | 1933/1000000 [05:02<41:00:03,  6.76it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1940/1000000 [05:02<22:57:52, 12.07it/s]

{'loss': Array(0.51025045, dtype=float32), 'loss_cross_entropy': Array(0.51025045, dtype=float32)}


  0%|          | 1942/1000000 [05:03<49:39:09,  5.58it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1950/1000000 [05:03<24:30:27, 11.31it/s]

{'loss': Array(0.48457295, dtype=float32), 'loss_cross_entropy': Array(0.48457295, dtype=float32)}


  0%|          | 1952/1000000 [05:04<49:53:14,  5.56it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1959/1000000 [05:04<26:42:34, 10.38it/s]

{'loss': Array(0.4847281, dtype=float32), 'loss_cross_entropy': Array(0.4847281, dtype=float32)}


  0%|          | 1961/1000000 [05:05<54:18:39,  5.10it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1968/1000000 [05:06<36:47:55,  7.53it/s]

{'loss': Array(0.46610114, dtype=float32), 'loss_cross_entropy': Array(0.46610114, dtype=float32)}


  0%|          | 1971/1000000 [05:07<53:17:25,  5.20it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1979/1000000 [05:08<27:32:30, 10.07it/s]

{'loss': Array(0.48526952, dtype=float32), 'loss_cross_entropy': Array(0.48526952, dtype=float32)}


  0%|          | 1983/1000000 [05:09<41:42:22,  6.65it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1988/1000000 [05:09<26:26:28, 10.48it/s]

{'loss': Array(0.4858679, dtype=float32), 'loss_cross_entropy': Array(0.4858679, dtype=float32)}


  0%|          | 1993/1000000 [05:10<39:26:35,  7.03it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 1999/1000000 [05:10<24:06:57, 11.50it/s]

{'loss': Array(0.46515355, dtype=float32), 'loss_cross_entropy': Array(0.46515355, dtype=float32)}


  0%|          | 2001/1000000 [05:18<316:17:06,  1.14s/it]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2009/1000000 [05:18<101:40:49,  2.73it/s]

{'loss': Array(0.45671543, dtype=float32), 'loss_cross_entropy': Array(0.45671543, dtype=float32)}


  0%|          | 2013/1000000 [05:19<82:37:46,  3.35it/s] 

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2020/1000000 [05:20<38:10:24,  7.26it/s]

{'loss': Array(0.45629773, dtype=float32), 'loss_cross_entropy': Array(0.45629773, dtype=float32)}


  0%|          | 2022/1000000 [05:20<63:25:28,  4.37it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2029/1000000 [05:21<44:01:57,  6.30it/s]

{'loss': Array(0.4755089, dtype=float32), 'loss_cross_entropy': Array(0.4755089, dtype=float32)}


  0%|          | 2033/1000000 [05:22<51:35:38,  5.37it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2039/1000000 [05:23<27:50:26,  9.96it/s]

{'loss': Array(0.45783478, dtype=float32), 'loss_cross_entropy': Array(0.45783478, dtype=float32)}


  0%|          | 2041/1000000 [05:24<55:44:57,  4.97it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2049/1000000 [05:24<25:41:48, 10.79it/s]

{'loss': Array(0.4756353, dtype=float32), 'loss_cross_entropy': Array(0.4756353, dtype=float32)}


  0%|          | 2053/1000000 [05:25<42:30:24,  6.52it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2059/1000000 [05:25<24:38:10, 11.25it/s]

{'loss': Array(0.44810197, dtype=float32), 'loss_cross_entropy': Array(0.44810197, dtype=float32)}


  0%|          | 2063/1000000 [05:26<42:24:16,  6.54it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2069/1000000 [05:27<25:09:12, 11.02it/s]

{'loss': Array(0.4417517, dtype=float32), 'loss_cross_entropy': Array(0.4417517, dtype=float32)}


  0%|          | 2071/1000000 [05:28<48:48:20,  5.68it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2079/1000000 [05:28<25:05:16, 11.05it/s]

{'loss': Array(0.47527573, dtype=float32), 'loss_cross_entropy': Array(0.47527573, dtype=float32)}


  0%|          | 2083/1000000 [05:29<39:20:01,  7.05it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2090/1000000 [05:30<44:34:43,  6.22it/s]

{'loss': Array(0.45562482, dtype=float32), 'loss_cross_entropy': Array(0.45562482, dtype=float32)}


  0%|          | 2092/1000000 [05:31<65:37:47,  4.22it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2100/1000000 [05:31<29:25:31,  9.42it/s]

{'loss': Array(0.45586357, dtype=float32), 'loss_cross_entropy': Array(0.45586357, dtype=float32)}


  0%|          | 2102/1000000 [05:32<51:09:02,  5.42it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2108/1000000 [05:32<29:55:39,  9.26it/s]

{'loss': Array(0.4500465, dtype=float32), 'loss_cross_entropy': Array(0.4500465, dtype=float32)}


  0%|          | 2113/1000000 [05:33<40:04:17,  6.92it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2120/1000000 [05:34<23:43:44, 11.68it/s]

{'loss': Array(0.43837324, dtype=float32), 'loss_cross_entropy': Array(0.43837324, dtype=float32)}


  0%|          | 2122/1000000 [05:35<50:31:17,  5.49it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2128/1000000 [05:35<28:34:56,  9.70it/s]

{'loss': Array(0.44791603, dtype=float32), 'loss_cross_entropy': Array(0.44791603, dtype=float32)}


  0%|          | 2131/1000000 [05:36<50:25:49,  5.50it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2140/1000000 [05:36<24:25:35, 11.35it/s]

{'loss': Array(0.45070973, dtype=float32), 'loss_cross_entropy': Array(0.45070973, dtype=float32)}


  0%|          | 2142/1000000 [05:37<47:56:52,  5.78it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2150/1000000 [05:38<24:01:02, 11.54it/s]

{'loss': Array(0.44289184, dtype=float32), 'loss_cross_entropy': Array(0.44289184, dtype=float32)}


  0%|          | 2152/1000000 [05:38<51:54:15,  5.34it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2159/1000000 [05:39<36:58:20,  7.50it/s]

{'loss': Array(0.4294737, dtype=float32), 'loss_cross_entropy': Array(0.4294737, dtype=float32)}


  0%|          | 2163/1000000 [05:40<48:31:21,  5.71it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2170/1000000 [05:41<24:54:03, 11.13it/s]

{'loss': Array(0.43475643, dtype=float32), 'loss_cross_entropy': Array(0.43475643, dtype=float32)}


  0%|          | 2172/1000000 [05:42<52:50:22,  5.25it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2178/1000000 [05:42<29:57:51,  9.25it/s]

{'loss': Array(0.44353557, dtype=float32), 'loss_cross_entropy': Array(0.44353557, dtype=float32)}


  0%|          | 2181/1000000 [05:43<46:53:04,  5.91it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2189/1000000 [05:43<25:23:55, 10.91it/s]

{'loss': Array(0.46212316, dtype=float32), 'loss_cross_entropy': Array(0.46212316, dtype=float32)}


  0%|          | 2193/1000000 [05:44<39:35:29,  7.00it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2200/1000000 [05:45<24:48:55, 11.17it/s]

{'loss': Array(0.45895097, dtype=float32), 'loss_cross_entropy': Array(0.45895097, dtype=float32)}


  0%|          | 2202/1000000 [05:46<50:32:47,  5.48it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2209/1000000 [05:46<26:00:50, 10.65it/s]

{'loss': Array(0.4024416, dtype=float32), 'loss_cross_entropy': Array(0.4024416, dtype=float32)}


  0%|          | 2211/1000000 [05:47<50:46:51,  5.46it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2220/1000000 [05:48<34:04:18,  8.13it/s]

{'loss': Array(0.42950922, dtype=float32), 'loss_cross_entropy': Array(0.42950922, dtype=float32)}


  0%|          | 2222/1000000 [05:49<59:05:02,  4.69it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2228/1000000 [05:49<31:53:54,  8.69it/s]

{'loss': Array(0.43532392, dtype=float32), 'loss_cross_entropy': Array(0.43532392, dtype=float32)}


  0%|          | 2231/1000000 [05:50<51:24:40,  5.39it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2239/1000000 [05:50<27:07:20, 10.22it/s]

{'loss': Array(0.41735077, dtype=float32), 'loss_cross_entropy': Array(0.41735077, dtype=float32)}


  0%|          | 2243/1000000 [05:51<41:17:40,  6.71it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2249/1000000 [05:52<24:28:55, 11.32it/s]

{'loss': Array(0.44117552, dtype=float32), 'loss_cross_entropy': Array(0.44117552, dtype=float32)}


  0%|          | 2251/1000000 [05:53<53:26:55,  5.19it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2260/1000000 [05:53<23:13:29, 11.93it/s]

{'loss': Array(0.41245824, dtype=float32), 'loss_cross_entropy': Array(0.41245824, dtype=float32)}


  0%|          | 2262/1000000 [05:54<51:35:18,  5.37it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2269/1000000 [05:54<26:01:37, 10.65it/s]

{'loss': Array(0.41371354, dtype=float32), 'loss_cross_entropy': Array(0.41371354, dtype=float32)}


  0%|          | 2273/1000000 [05:55<42:50:37,  6.47it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2279/1000000 [05:56<24:57:03, 11.11it/s]

{'loss': Array(0.42947945, dtype=float32), 'loss_cross_entropy': Array(0.42947945, dtype=float32)}


  0%|          | 2281/1000000 [05:57<79:09:46,  3.50it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2289/1000000 [05:58<33:08:19,  8.36it/s]

{'loss': Array(0.40942368, dtype=float32), 'loss_cross_entropy': Array(0.40942368, dtype=float32)}


  0%|          | 2293/1000000 [05:59<44:47:07,  6.19it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2300/1000000 [05:59<24:36:37, 11.26it/s]

{'loss': Array(0.41433784, dtype=float32), 'loss_cross_entropy': Array(0.41433784, dtype=float32)}


  0%|          | 2302/1000000 [06:00<53:46:34,  5.15it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2310/1000000 [06:00<25:13:04, 10.99it/s]

{'loss': Array(0.40793592, dtype=float32), 'loss_cross_entropy': Array(0.40793592, dtype=float32)}


  0%|          | 2312/1000000 [06:01<50:24:42,  5.50it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2320/1000000 [06:01<25:20:44, 10.93it/s]

{'loss': Array(0.3981512, dtype=float32), 'loss_cross_entropy': Array(0.3981512, dtype=float32)}


  0%|          | 2322/1000000 [06:02<50:18:24,  5.51it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2329/1000000 [06:03<26:20:59, 10.52it/s]

{'loss': Array(0.40720263, dtype=float32), 'loss_cross_entropy': Array(0.40720263, dtype=float32)}


  0%|          | 2333/1000000 [06:04<40:55:57,  6.77it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2338/1000000 [06:04<26:33:56, 10.43it/s]

{'loss': Array(0.3838645, dtype=float32), 'loss_cross_entropy': Array(0.3838645, dtype=float32)}


  0%|          | 2341/1000000 [06:05<48:39:51,  5.69it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2350/1000000 [06:06<31:06:26,  8.91it/s]

{'loss': Array(0.41198573, dtype=float32), 'loss_cross_entropy': Array(0.41198573, dtype=float32)}


  0%|          | 2352/1000000 [06:07<55:30:33,  4.99it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2358/1000000 [06:07<30:52:53,  8.97it/s]

{'loss': Array(0.38722226, dtype=float32), 'loss_cross_entropy': Array(0.38722226, dtype=float32)}


  0%|          | 2363/1000000 [06:08<41:34:02,  6.67it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2369/1000000 [06:09<25:08:54, 11.02it/s]

{'loss': Array(0.3994782, dtype=float32), 'loss_cross_entropy': Array(0.3994782, dtype=float32)}


  0%|          | 2373/1000000 [06:09<40:44:49,  6.80it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2380/1000000 [06:10<23:14:23, 11.92it/s]

{'loss': Array(0.39439234, dtype=float32), 'loss_cross_entropy': Array(0.39439234, dtype=float32)}


  0%|          | 2382/1000000 [06:11<49:30:45,  5.60it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2389/1000000 [06:11<26:28:13, 10.47it/s]

{'loss': Array(0.40153083, dtype=float32), 'loss_cross_entropy': Array(0.40153083, dtype=float32)}


  0%|          | 2393/1000000 [06:12<42:30:19,  6.52it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2399/1000000 [06:12<24:17:43, 11.41it/s]

{'loss': Array(0.38489255, dtype=float32), 'loss_cross_entropy': Array(0.38489255, dtype=float32)}


  0%|          | 2403/1000000 [06:13<40:44:19,  6.80it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2409/1000000 [06:14<37:38:09,  7.36it/s]

{'loss': Array(0.40822154, dtype=float32), 'loss_cross_entropy': Array(0.40822154, dtype=float32)}


  0%|          | 2411/1000000 [06:15<60:28:39,  4.58it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2420/1000000 [06:16<25:58:24, 10.67it/s]

{'loss': Array(0.3821139, dtype=float32), 'loss_cross_entropy': Array(0.3821139, dtype=float32)}


  0%|          | 2422/1000000 [06:16<52:45:46,  5.25it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2430/1000000 [06:17<24:03:05, 11.52it/s]

{'loss': Array(0.3957931, dtype=float32), 'loss_cross_entropy': Array(0.3957931, dtype=float32)}


  0%|          | 2432/1000000 [06:18<54:14:21,  5.11it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2439/1000000 [06:18<26:35:29, 10.42it/s]

{'loss': Array(0.37517485, dtype=float32), 'loss_cross_entropy': Array(0.37517485, dtype=float32)}


  0%|          | 2443/1000000 [06:19<42:15:07,  6.56it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2449/1000000 [06:19<24:18:38, 11.40it/s]

{'loss': Array(0.37509555, dtype=float32), 'loss_cross_entropy': Array(0.37509555, dtype=float32)}


  0%|          | 2453/1000000 [06:21<43:15:10,  6.41it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2459/1000000 [06:21<24:14:39, 11.43it/s]

{'loss': Array(0.39952534, dtype=float32), 'loss_cross_entropy': Array(0.39952534, dtype=float32)}


  0%|          | 2461/1000000 [06:22<53:27:41,  5.18it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2470/1000000 [06:23<41:58:44,  6.60it/s]

{'loss': Array(0.3939094, dtype=float32), 'loss_cross_entropy': Array(0.3939094, dtype=float32)}


  0%|          | 2472/1000000 [06:24<60:42:19,  4.56it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2479/1000000 [06:24<31:03:17,  8.92it/s]

{'loss': Array(0.35774425, dtype=float32), 'loss_cross_entropy': Array(0.35774425, dtype=float32)}


  0%|          | 2483/1000000 [06:25<45:50:41,  6.04it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2490/1000000 [06:25<24:54:08, 11.13it/s]

{'loss': Array(0.36295706, dtype=float32), 'loss_cross_entropy': Array(0.36295706, dtype=float32)}


  0%|          | 2492/1000000 [06:26<53:26:33,  5.18it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2499/1000000 [06:27<27:12:55, 10.18it/s]

{'loss': Array(0.40816274, dtype=float32), 'loss_cross_entropy': Array(0.40816274, dtype=float32)}


  0%|          | 2501/1000000 [06:34<316:31:37,  1.14s/it]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2510/1000000 [06:35<88:59:00,  3.11it/s] 

{'loss': Array(0.38168907, dtype=float32), 'loss_cross_entropy': Array(0.38168907, dtype=float32)}


  0%|          | 2512/1000000 [06:36<99:00:24,  2.80it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2518/1000000 [06:36<45:13:59,  6.13it/s]

{'loss': Array(0.37287942, dtype=float32), 'loss_cross_entropy': Array(0.37287942, dtype=float32)}


  0%|          | 2521/1000000 [06:37<60:47:28,  4.56it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2530/1000000 [06:37<26:07:35, 10.61it/s]

{'loss': Array(0.37580445, dtype=float32), 'loss_cross_entropy': Array(0.37580445, dtype=float32)}


  0%|          | 2532/1000000 [06:38<53:32:23,  5.18it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2539/1000000 [06:39<37:27:30,  7.40it/s]

{'loss': Array(0.3781, dtype=float32), 'loss_cross_entropy': Array(0.3781, dtype=float32)}


  0%|          | 2541/1000000 [06:40<62:44:55,  4.42it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2550/1000000 [06:41<26:10:34, 10.58it/s]

{'loss': Array(0.3963794, dtype=float32), 'loss_cross_entropy': Array(0.3963794, dtype=float32)}


  0%|          | 2552/1000000 [06:42<54:00:17,  5.13it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2559/1000000 [06:42<27:22:27, 10.12it/s]

{'loss': Array(0.36762148, dtype=float32), 'loss_cross_entropy': Array(0.36762148, dtype=float32)}


  0%|          | 2561/1000000 [06:43<52:35:11,  5.27it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2568/1000000 [06:43<26:53:08, 10.31it/s]

{'loss': Array(0.37644932, dtype=float32), 'loss_cross_entropy': Array(0.37644932, dtype=float32)}


  0%|          | 2573/1000000 [06:44<40:03:49,  6.92it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2579/1000000 [06:44<24:01:11, 11.53it/s]

{'loss': Array(0.37311277, dtype=float32), 'loss_cross_entropy': Array(0.37311277, dtype=float32)}


  0%|          | 2581/1000000 [06:45<54:55:37,  5.04it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2589/1000000 [06:46<25:39:57, 10.79it/s]

{'loss': Array(0.3622091, dtype=float32), 'loss_cross_entropy': Array(0.3622091, dtype=float32)}


  0%|          | 2593/1000000 [06:47<40:19:52,  6.87it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2599/1000000 [06:48<40:24:44,  6.86it/s]

{'loss': Array(0.38018066, dtype=float32), 'loss_cross_entropy': Array(0.38018066, dtype=float32)}


  0%|          | 2603/1000000 [06:49<49:17:48,  5.62it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2609/1000000 [06:49<26:18:06, 10.53it/s]

{'loss': Array(0.3803798, dtype=float32), 'loss_cross_entropy': Array(0.3803798, dtype=float32)}


  0%|          | 2611/1000000 [06:50<57:37:23,  4.81it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2620/1000000 [06:50<24:49:21, 11.16it/s]

{'loss': Array(0.3551761, dtype=float32), 'loss_cross_entropy': Array(0.3551761, dtype=float32)}


  0%|          | 2622/1000000 [06:51<55:15:06,  5.01it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2630/1000000 [06:52<24:19:44, 11.39it/s]

{'loss': Array(0.3643842, dtype=float32), 'loss_cross_entropy': Array(0.3643842, dtype=float32)}


  0%|          | 2632/1000000 [06:53<53:00:10,  5.23it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2640/1000000 [06:53<23:23:38, 11.84it/s]

{'loss': Array(0.3624304, dtype=float32), 'loss_cross_entropy': Array(0.3624304, dtype=float32)}


  0%|          | 2642/1000000 [06:54<53:53:09,  5.14it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2649/1000000 [06:54<25:48:34, 10.73it/s]

{'loss': Array(0.36043712, dtype=float32), 'loss_cross_entropy': Array(0.36043712, dtype=float32)}


  0%|          | 2653/1000000 [06:55<43:50:39,  6.32it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2660/1000000 [06:56<43:23:39,  6.38it/s]

{'loss': Array(0.37156907, dtype=float32), 'loss_cross_entropy': Array(0.37156907, dtype=float32)}


  0%|          | 2662/1000000 [06:57<65:19:57,  4.24it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2670/1000000 [06:58<30:10:55,  9.18it/s]

{'loss': Array(0.3846761, dtype=float32), 'loss_cross_entropy': Array(0.3846761, dtype=float32)}


  0%|          | 2672/1000000 [06:58<53:56:07,  5.14it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2679/1000000 [06:59<27:23:51, 10.11it/s]

{'loss': Array(0.35352436, dtype=float32), 'loss_cross_entropy': Array(0.35352436, dtype=float32)}


  0%|          | 2681/1000000 [07:00<55:18:48,  5.01it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2689/1000000 [07:00<26:23:08, 10.50it/s]

{'loss': Array(0.38490424, dtype=float32), 'loss_cross_entropy': Array(0.38490424, dtype=float32)}


  0%|          | 2693/1000000 [07:01<42:05:57,  6.58it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2699/1000000 [07:01<24:45:50, 11.19it/s]

{'loss': Array(0.35881278, dtype=float32), 'loss_cross_entropy': Array(0.35881278, dtype=float32)}


  0%|          | 2701/1000000 [07:02<50:01:02,  5.54it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2709/1000000 [07:03<25:14:51, 10.97it/s]

{'loss': Array(0.36921966, dtype=float32), 'loss_cross_entropy': Array(0.36921966, dtype=float32)}


  0%|          | 2711/1000000 [07:04<49:11:54,  5.63it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2720/1000000 [07:04<23:17:54, 11.89it/s]

{'loss': Array(0.36800495, dtype=float32), 'loss_cross_entropy': Array(0.36800495, dtype=float32)}


  0%|          | 2722/1000000 [07:05<53:35:37,  5.17it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2730/1000000 [07:06<34:42:10,  7.98it/s]

{'loss': Array(0.34870568, dtype=float32), 'loss_cross_entropy': Array(0.34870568, dtype=float32)}


  0%|          | 2732/1000000 [07:07<56:57:57,  4.86it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2739/1000000 [07:07<29:07:19,  9.51it/s]

{'loss': Array(0.35289714, dtype=float32), 'loss_cross_entropy': Array(0.35289714, dtype=float32)}


  0%|          | 2741/1000000 [07:08<53:45:02,  5.15it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2750/1000000 [07:09<24:34:15, 11.27it/s]

{'loss': Array(0.36783424, dtype=float32), 'loss_cross_entropy': Array(0.36783424, dtype=float32)}


  0%|          | 2752/1000000 [07:10<54:11:48,  5.11it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2760/1000000 [07:10<24:53:31, 11.13it/s]

{'loss': Array(0.3646344, dtype=float32), 'loss_cross_entropy': Array(0.3646344, dtype=float32)}


  0%|          | 2762/1000000 [07:11<54:10:27,  5.11it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2769/1000000 [07:11<27:17:07, 10.15it/s]

{'loss': Array(0.3517289, dtype=float32), 'loss_cross_entropy': Array(0.3517289, dtype=float32)}


  0%|          | 2773/1000000 [07:12<42:53:36,  6.46it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2779/1000000 [07:13<24:41:05, 11.22it/s]

{'loss': Array(0.33539706, dtype=float32), 'loss_cross_entropy': Array(0.33539706, dtype=float32)}


  0%|          | 2783/1000000 [07:14<41:10:44,  6.73it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2790/1000000 [07:14<36:39:10,  7.56it/s]

{'loss': Array(0.3391618, dtype=float32), 'loss_cross_entropy': Array(0.3391618, dtype=float32)}


  0%|          | 2792/1000000 [07:15<60:25:42,  4.58it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2800/1000000 [07:16<27:55:17,  9.92it/s]

{'loss': Array(0.34770462, dtype=float32), 'loss_cross_entropy': Array(0.34770462, dtype=float32)}


  0%|          | 2802/1000000 [07:17<52:01:14,  5.32it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2809/1000000 [07:17<26:38:51, 10.39it/s]

{'loss': Array(0.3569623, dtype=float32), 'loss_cross_entropy': Array(0.3569623, dtype=float32)}


  0%|          | 2813/1000000 [07:18<41:16:02,  6.71it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2820/1000000 [07:18<23:28:31, 11.80it/s]

{'loss': Array(0.35513318, dtype=float32), 'loss_cross_entropy': Array(0.35513318, dtype=float32)}


  0%|          | 2822/1000000 [07:19<54:21:01,  5.10it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2828/1000000 [07:20<31:20:00,  8.84it/s]

{'loss': Array(0.32745788, dtype=float32), 'loss_cross_entropy': Array(0.32745788, dtype=float32)}


  0%|          | 2833/1000000 [07:21<41:08:12,  6.73it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2840/1000000 [07:21<22:39:51, 12.22it/s]

{'loss': Array(0.35850498, dtype=float32), 'loss_cross_entropy': Array(0.35850498, dtype=float32)}


  0%|          | 2842/1000000 [07:22<48:29:29,  5.71it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2850/1000000 [07:23<47:28:09,  5.84it/s]

{'loss': Array(0.3598362, dtype=float32), 'loss_cross_entropy': Array(0.3598362, dtype=float32)}


  0%|          | 2852/1000000 [07:24<70:51:50,  3.91it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2860/1000000 [07:24<28:14:42,  9.81it/s]

{'loss': Array(0.33220455, dtype=float32), 'loss_cross_entropy': Array(0.33220455, dtype=float32)}


  0%|          | 2862/1000000 [07:25<59:29:01,  4.66it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2869/1000000 [07:26<28:25:57,  9.74it/s]

{'loss': Array(0.335781, dtype=float32), 'loss_cross_entropy': Array(0.335781, dtype=float32)}


  0%|          | 2873/1000000 [07:27<43:02:06,  6.44it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2880/1000000 [07:27<24:07:30, 11.48it/s]

{'loss': Array(0.33520067, dtype=float32), 'loss_cross_entropy': Array(0.33520067, dtype=float32)}


  0%|          | 2882/1000000 [07:28<51:57:11,  5.33it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2888/1000000 [07:28<28:19:46,  9.78it/s]

{'loss': Array(0.34286654, dtype=float32), 'loss_cross_entropy': Array(0.34286654, dtype=float32)}


  0%|          | 2893/1000000 [07:29<41:18:44,  6.70it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2899/1000000 [07:30<25:09:24, 11.01it/s]

{'loss': Array(0.3422371, dtype=float32), 'loss_cross_entropy': Array(0.3422371, dtype=float32)}


  0%|          | 2903/1000000 [07:31<43:41:53,  6.34it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2909/1000000 [07:31<24:27:14, 11.33it/s]

{'loss': Array(0.34391874, dtype=float32), 'loss_cross_entropy': Array(0.34391874, dtype=float32)}


  0%|          | 2911/1000000 [07:32<53:57:57,  5.13it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2920/1000000 [07:33<30:39:15,  9.04it/s]

{'loss': Array(0.36185282, dtype=float32), 'loss_cross_entropy': Array(0.36185282, dtype=float32)}


  0%|          | 2922/1000000 [07:34<57:14:46,  4.84it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2929/1000000 [07:34<28:13:14,  9.81it/s]

{'loss': Array(0.33486035, dtype=float32), 'loss_cross_entropy': Array(0.33486035, dtype=float32)}


  0%|          | 2933/1000000 [07:35<44:39:16,  6.20it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2939/1000000 [07:35<26:11:34, 10.57it/s]

{'loss': Array(0.33506468, dtype=float32), 'loss_cross_entropy': Array(0.33506468, dtype=float32)}


  0%|          | 2943/1000000 [07:36<39:19:57,  7.04it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2949/1000000 [07:37<23:47:27, 11.64it/s]

{'loss': Array(0.32637197, dtype=float32), 'loss_cross_entropy': Array(0.32637197, dtype=float32)}


  0%|          | 2951/1000000 [07:38<52:39:51,  5.26it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2959/1000000 [07:38<25:24:28, 10.90it/s]

{'loss': Array(0.33971882, dtype=float32), 'loss_cross_entropy': Array(0.33971882, dtype=float32)}


  0%|          | 2963/1000000 [07:39<40:44:06,  6.80it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2968/1000000 [07:39<26:19:25, 10.52it/s]

{'loss': Array(0.33607742, dtype=float32), 'loss_cross_entropy': Array(0.33607742, dtype=float32)}


  0%|          | 2973/1000000 [07:40<39:44:33,  6.97it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2979/1000000 [07:41<37:37:15,  7.36it/s]

{'loss': Array(0.34380654, dtype=float32), 'loss_cross_entropy': Array(0.34380654, dtype=float32)}


  0%|          | 2983/1000000 [07:42<47:01:01,  5.89it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2989/1000000 [07:42<26:43:35, 10.36it/s]

{'loss': Array(0.3344677, dtype=float32), 'loss_cross_entropy': Array(0.3344677, dtype=float32)}


  0%|          | 2993/1000000 [07:44<45:01:34,  6.15it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 2999/1000000 [07:44<26:14:50, 10.55it/s]

{'loss': Array(0.3412865, dtype=float32), 'loss_cross_entropy': Array(0.3412865, dtype=float32)}


  0%|          | 3003/1000000 [07:52<213:04:08,  1.30it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3010/1000000 [07:52<81:21:31,  3.40it/s] 

{'loss': Array(0.3330845, dtype=float32), 'loss_cross_entropy': Array(0.3330845, dtype=float32)}


  0%|          | 3012/1000000 [07:53<92:34:56,  2.99it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3019/1000000 [07:53<39:11:03,  7.07it/s]

{'loss': Array(0.31993303, dtype=float32), 'loss_cross_entropy': Array(0.31993303, dtype=float32)}


  0%|          | 3021/1000000 [07:54<59:28:55,  4.66it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3030/1000000 [07:54<26:13:00, 10.56it/s]

{'loss': Array(0.34597322, dtype=float32), 'loss_cross_entropy': Array(0.34597322, dtype=float32)}


  0%|          | 3032/1000000 [07:55<50:44:51,  5.46it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3040/1000000 [07:56<48:21:13,  5.73it/s]

{'loss': Array(0.32447824, dtype=float32), 'loss_cross_entropy': Array(0.32447824, dtype=float32)}


  0%|          | 3042/1000000 [07:57<71:07:56,  3.89it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3049/1000000 [07:58<32:45:20,  8.45it/s]

{'loss': Array(0.32296297, dtype=float32), 'loss_cross_entropy': Array(0.32296297, dtype=float32)}


  0%|          | 3053/1000000 [07:59<46:27:54,  5.96it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3059/1000000 [07:59<25:51:19, 10.71it/s]

{'loss': Array(0.30729786, dtype=float32), 'loss_cross_entropy': Array(0.30729786, dtype=float32)}


  0%|          | 3061/1000000 [08:00<54:42:37,  5.06it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3070/1000000 [08:00<24:08:05, 11.47it/s]

{'loss': Array(0.29952967, dtype=float32), 'loss_cross_entropy': Array(0.29952967, dtype=float32)}


  0%|          | 3072/1000000 [08:01<47:41:05,  5.81it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3078/1000000 [08:01<28:28:25,  9.73it/s]

{'loss': Array(0.3497381, dtype=float32), 'loss_cross_entropy': Array(0.3497381, dtype=float32)}


  0%|          | 3083/1000000 [08:03<40:27:37,  6.84it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3090/1000000 [08:03<23:12:41, 11.93it/s]

{'loss': Array(0.33145294, dtype=float32), 'loss_cross_entropy': Array(0.33145294, dtype=float32)}


  0%|          | 3092/1000000 [08:04<50:44:37,  5.46it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3100/1000000 [08:04<23:53:34, 11.59it/s]

{'loss': Array(0.32333013, dtype=float32), 'loss_cross_entropy': Array(0.32333013, dtype=float32)}


  0%|          | 3102/1000000 [08:05<55:32:20,  4.99it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3109/1000000 [08:06<37:53:31,  7.31it/s]

{'loss': Array(0.32421947, dtype=float32), 'loss_cross_entropy': Array(0.32421947, dtype=float32)}


  0%|          | 3111/1000000 [08:07<61:27:44,  4.51it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3119/1000000 [08:07<27:54:40,  9.92it/s]

{'loss': Array(0.326544, dtype=float32), 'loss_cross_entropy': Array(0.326544, dtype=float32)}


  0%|          | 3123/1000000 [08:08<42:01:19,  6.59it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3130/1000000 [08:09<23:39:43, 11.70it/s]

{'loss': Array(0.32509, dtype=float32), 'loss_cross_entropy': Array(0.32509, dtype=float32)}


  0%|          | 3132/1000000 [08:10<50:25:15,  5.49it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3140/1000000 [08:10<24:04:29, 11.50it/s]

{'loss': Array(0.3241134, dtype=float32), 'loss_cross_entropy': Array(0.3241134, dtype=float32)}


  0%|          | 3142/1000000 [08:11<53:25:59,  5.18it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3150/1000000 [08:11<23:58:20, 11.55it/s]

{'loss': Array(0.32525828, dtype=float32), 'loss_cross_entropy': Array(0.32525828, dtype=float32)}


  0%|          | 3152/1000000 [08:12<55:51:02,  4.96it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3160/1000000 [08:13<24:37:51, 11.24it/s]

{'loss': Array(0.31749257, dtype=float32), 'loss_cross_entropy': Array(0.31749257, dtype=float32)}


  0%|          | 3162/1000000 [08:14<57:10:56,  4.84it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3170/1000000 [08:15<38:37:49,  7.17it/s]

{'loss': Array(0.33358547, dtype=float32), 'loss_cross_entropy': Array(0.33358547, dtype=float32)}


  0%|          | 3172/1000000 [08:16<65:06:17,  4.25it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3180/1000000 [08:16<26:36:17, 10.41it/s]

{'loss': Array(0.33835638, dtype=float32), 'loss_cross_entropy': Array(0.33835638, dtype=float32)}


  0%|          | 3182/1000000 [08:17<56:10:14,  4.93it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3189/1000000 [08:17<27:55:36,  9.91it/s]

{'loss': Array(0.31639525, dtype=float32), 'loss_cross_entropy': Array(0.31639525, dtype=float32)}


  0%|          | 3193/1000000 [08:18<42:49:57,  6.46it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3199/1000000 [08:19<25:00:42, 11.07it/s]

{'loss': Array(0.3335032, dtype=float32), 'loss_cross_entropy': Array(0.3335032, dtype=float32)}


  0%|          | 3201/1000000 [08:19<53:01:57,  5.22it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3209/1000000 [08:20<24:35:41, 11.26it/s]

{'loss': Array(0.32022852, dtype=float32), 'loss_cross_entropy': Array(0.32022852, dtype=float32)}


  0%|          | 3213/1000000 [08:21<43:11:49,  6.41it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3218/1000000 [08:21<27:13:27, 10.17it/s]

{'loss': Array(0.32427946, dtype=float32), 'loss_cross_entropy': Array(0.32427946, dtype=float32)}


  0%|          | 3221/1000000 [08:22<49:28:10,  5.60it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3229/1000000 [08:23<25:58:27, 10.66it/s]

{'loss': Array(0.3193263, dtype=float32), 'loss_cross_entropy': Array(0.3193263, dtype=float32)}


  0%|          | 3231/1000000 [08:24<71:31:30,  3.87it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3240/1000000 [08:25<29:25:26,  9.41it/s]

{'loss': Array(0.3115478, dtype=float32), 'loss_cross_entropy': Array(0.3115478, dtype=float32)}


  0%|          | 3242/1000000 [08:25<56:23:32,  4.91it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3250/1000000 [08:26<25:59:23, 10.65it/s]

{'loss': Array(0.3092007, dtype=float32), 'loss_cross_entropy': Array(0.3092007, dtype=float32)}


  0%|          | 3252/1000000 [08:27<50:23:33,  5.49it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3260/1000000 [08:27<25:25:52, 10.89it/s]

{'loss': Array(0.3317255, dtype=float32), 'loss_cross_entropy': Array(0.3317255, dtype=float32)}


  0%|          | 3262/1000000 [08:28<53:30:32,  5.17it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3269/1000000 [08:28<27:30:46, 10.06it/s]

{'loss': Array(0.3434129, dtype=float32), 'loss_cross_entropy': Array(0.3434129, dtype=float32)}


  0%|          | 3271/1000000 [08:29<53:54:07,  5.14it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3280/1000000 [08:30<25:00:27, 11.07it/s]

{'loss': Array(0.31081945, dtype=float32), 'loss_cross_entropy': Array(0.31081945, dtype=float32)}


  0%|          | 3282/1000000 [08:31<52:15:32,  5.30it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3289/1000000 [08:31<26:50:36, 10.31it/s]

{'loss': Array(0.33702993, dtype=float32), 'loss_cross_entropy': Array(0.33702993, dtype=float32)}


  0%|          | 3291/1000000 [08:32<54:07:18,  5.12it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3300/1000000 [08:33<29:55:49,  9.25it/s]

{'loss': Array(0.31085014, dtype=float32), 'loss_cross_entropy': Array(0.31085014, dtype=float32)}


  0%|          | 3302/1000000 [08:34<53:54:02,  5.14it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3310/1000000 [08:34<26:16:35, 10.54it/s]

{'loss': Array(0.33624122, dtype=float32), 'loss_cross_entropy': Array(0.33624122, dtype=float32)}


  0%|          | 3312/1000000 [08:35<52:34:03,  5.27it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3319/1000000 [08:36<27:42:32,  9.99it/s]

{'loss': Array(0.33178732, dtype=float32), 'loss_cross_entropy': Array(0.33178732, dtype=float32)}


  0%|          | 3323/1000000 [08:37<42:14:16,  6.55it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3329/1000000 [08:37<24:10:49, 11.45it/s]

{'loss': Array(0.33467537, dtype=float32), 'loss_cross_entropy': Array(0.33467537, dtype=float32)}


  0%|          | 3333/1000000 [08:38<42:04:26,  6.58it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3339/1000000 [08:38<24:48:51, 11.16it/s]

{'loss': Array(0.30881643, dtype=float32), 'loss_cross_entropy': Array(0.30881643, dtype=float32)}


  0%|          | 3341/1000000 [08:39<50:52:51,  5.44it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3349/1000000 [08:39<25:29:29, 10.86it/s]

{'loss': Array(0.3047407, dtype=float32), 'loss_cross_entropy': Array(0.3047407, dtype=float32)}


  0%|          | 3353/1000000 [08:40<41:35:03,  6.66it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3360/1000000 [08:41<37:01:07,  7.48it/s]

{'loss': Array(0.31502822, dtype=float32), 'loss_cross_entropy': Array(0.31502822, dtype=float32)}


  0%|          | 3362/1000000 [08:42<59:20:25,  4.67it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3370/1000000 [08:43<27:30:12, 10.07it/s]

{'loss': Array(0.32907137, dtype=float32), 'loss_cross_entropy': Array(0.32907137, dtype=float32)}


  0%|          | 3372/1000000 [08:44<52:17:54,  5.29it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3379/1000000 [08:44<27:39:01, 10.01it/s]

{'loss': Array(0.31699368, dtype=float32), 'loss_cross_entropy': Array(0.31699368, dtype=float32)}


  0%|          | 3381/1000000 [08:45<55:04:06,  5.03it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3390/1000000 [08:45<25:13:05, 10.98it/s]

{'loss': Array(0.2939019, dtype=float32), 'loss_cross_entropy': Array(0.2939019, dtype=float32)}


  0%|          | 3392/1000000 [08:46<48:04:58,  5.76it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3398/1000000 [08:47<29:01:48,  9.54it/s]

{'loss': Array(0.30976245, dtype=float32), 'loss_cross_entropy': Array(0.30976245, dtype=float32)}


  0%|          | 3403/1000000 [08:48<39:27:53,  7.01it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3410/1000000 [08:48<22:57:59, 12.05it/s]

{'loss': Array(0.29266667, dtype=float32), 'loss_cross_entropy': Array(0.29266667, dtype=float32)}


  0%|          | 3412/1000000 [08:49<47:55:50,  5.78it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3420/1000000 [08:50<44:33:46,  6.21it/s]

{'loss': Array(0.32776147, dtype=float32), 'loss_cross_entropy': Array(0.32776147, dtype=float32)}


  0%|          | 3422/1000000 [08:51<64:48:28,  4.27it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3430/1000000 [08:51<28:04:53,  9.86it/s]

{'loss': Array(0.31413236, dtype=float32), 'loss_cross_entropy': Array(0.31413236, dtype=float32)}


  0%|          | 3432/1000000 [08:52<55:02:27,  5.03it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3439/1000000 [08:52<27:36:11, 10.03it/s]

{'loss': Array(0.3122898, dtype=float32), 'loss_cross_entropy': Array(0.3122898, dtype=float32)}


  0%|          | 3443/1000000 [08:53<43:26:40,  6.37it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3448/1000000 [08:54<27:25:22, 10.09it/s]

{'loss': Array(0.31018475, dtype=float32), 'loss_cross_entropy': Array(0.31018475, dtype=float32)}


  0%|          | 3451/1000000 [08:55<49:20:55,  5.61it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3459/1000000 [08:55<25:02:13, 11.06it/s]

{'loss': Array(0.3114566, dtype=float32), 'loss_cross_entropy': Array(0.3114566, dtype=float32)}


  0%|          | 3463/1000000 [08:56<42:34:00,  6.50it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3469/1000000 [08:56<24:59:03, 11.08it/s]

{'loss': Array(0.29558593, dtype=float32), 'loss_cross_entropy': Array(0.29558593, dtype=float32)}


  0%|          | 3473/1000000 [08:57<39:46:07,  6.96it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3478/1000000 [08:57<26:00:37, 10.64it/s]

{'loss': Array(0.30456004, dtype=float32), 'loss_cross_entropy': Array(0.30456004, dtype=float32)}


  0%|          | 3481/1000000 [08:58<46:04:37,  6.01it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3490/1000000 [08:59<29:20:40,  9.43it/s]

{'loss': Array(0.3073549, dtype=float32), 'loss_cross_entropy': Array(0.3073549, dtype=float32)}


  0%|          | 3492/1000000 [09:00<57:31:37,  4.81it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3499/1000000 [09:01<28:05:10,  9.86it/s]

{'loss': Array(0.30635965, dtype=float32), 'loss_cross_entropy': Array(0.30635965, dtype=float32)}


  0%|          | 3501/1000000 [09:08<299:14:13,  1.08s/it]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3510/1000000 [09:09<85:58:45,  3.22it/s] 

{'loss': Array(0.2960963, dtype=float32), 'loss_cross_entropy': Array(0.2960963, dtype=float32)}


  0%|          | 3512/1000000 [09:10<97:10:01,  2.85it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3520/1000000 [09:10<39:07:35,  7.07it/s]

{'loss': Array(0.2967211, dtype=float32), 'loss_cross_entropy': Array(0.2967211, dtype=float32)}


  0%|          | 3522/1000000 [09:11<61:08:24,  4.53it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3530/1000000 [09:11<26:44:46, 10.35it/s]

{'loss': Array(0.2997681, dtype=float32), 'loss_cross_entropy': Array(0.2997681, dtype=float32)}


  0%|          | 3532/1000000 [09:12<55:20:13,  5.00it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3538/1000000 [09:13<28:54:26,  9.58it/s]

{'loss': Array(0.3321348, dtype=float32), 'loss_cross_entropy': Array(0.3321348, dtype=float32)}


  0%|          | 3541/1000000 [09:14<50:42:09,  5.46it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3550/1000000 [09:15<40:22:30,  6.86it/s]

{'loss': Array(0.32679924, dtype=float32), 'loss_cross_entropy': Array(0.32679924, dtype=float32)}


  0%|          | 3552/1000000 [09:15<62:18:19,  4.44it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3559/1000000 [09:16<29:29:36,  9.38it/s]

{'loss': Array(0.32754162, dtype=float32), 'loss_cross_entropy': Array(0.32754162, dtype=float32)}


  0%|          | 3561/1000000 [09:17<55:13:37,  5.01it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3568/1000000 [09:17<27:52:53,  9.93it/s]

{'loss': Array(0.3048826, dtype=float32), 'loss_cross_entropy': Array(0.3048826, dtype=float32)}


  0%|          | 3571/1000000 [09:18<49:31:28,  5.59it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3580/1000000 [09:18<24:21:34, 11.36it/s]

{'loss': Array(0.29681128, dtype=float32), 'loss_cross_entropy': Array(0.29681128, dtype=float32)}


  0%|          | 3582/1000000 [09:19<46:41:39,  5.93it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3590/1000000 [09:20<26:08:41, 10.59it/s]

{'loss': Array(0.3022501, dtype=float32), 'loss_cross_entropy': Array(0.3022501, dtype=float32)}


  0%|          | 3592/1000000 [09:21<54:04:24,  5.12it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3599/1000000 [09:21<27:16:06, 10.15it/s]

{'loss': Array(0.3177463, dtype=float32), 'loss_cross_entropy': Array(0.3177463, dtype=float32)}


  0%|          | 3601/1000000 [09:22<54:17:51,  5.10it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3610/1000000 [09:22<24:37:39, 11.24it/s]

{'loss': Array(0.29896155, dtype=float32), 'loss_cross_entropy': Array(0.29896155, dtype=float32)}


  0%|          | 3612/1000000 [09:24<67:42:36,  4.09it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3619/1000000 [09:24<33:14:20,  8.33it/s]

{'loss': Array(0.31943327, dtype=float32), 'loss_cross_entropy': Array(0.31943327, dtype=float32)}


  0%|          | 3621/1000000 [09:25<57:13:45,  4.84it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3630/1000000 [09:26<25:01:40, 11.06it/s]

{'loss': Array(0.30361754, dtype=float32), 'loss_cross_entropy': Array(0.30361754, dtype=float32)}


  0%|          | 3632/1000000 [09:26<52:01:09,  5.32it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3639/1000000 [09:27<26:23:51, 10.48it/s]

{'loss': Array(0.3247068, dtype=float32), 'loss_cross_entropy': Array(0.3247068, dtype=float32)}


  0%|          | 3643/1000000 [09:28<43:02:21,  6.43it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3649/1000000 [09:28<24:41:02, 11.21it/s]

{'loss': Array(0.3070212, dtype=float32), 'loss_cross_entropy': Array(0.3070212, dtype=float32)}


  0%|          | 3653/1000000 [09:29<42:23:20,  6.53it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3659/1000000 [09:30<25:30:23, 10.85it/s]

{'loss': Array(0.30358884, dtype=float32), 'loss_cross_entropy': Array(0.30358884, dtype=float32)}


  0%|          | 3663/1000000 [09:31<43:47:42,  6.32it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3668/1000000 [09:31<27:07:51, 10.20it/s]

{'loss': Array(0.32543367, dtype=float32), 'loss_cross_entropy': Array(0.32543367, dtype=float32)}


  0%|          | 3673/1000000 [09:32<39:46:39,  6.96it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3680/1000000 [09:33<33:07:46,  8.35it/s]

{'loss': Array(0.31754205, dtype=float32), 'loss_cross_entropy': Array(0.31754205, dtype=float32)}


  0%|          | 3682/1000000 [09:34<55:31:01,  4.99it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3689/1000000 [09:34<28:27:40,  9.72it/s]

{'loss': Array(0.2985321, dtype=float32), 'loss_cross_entropy': Array(0.2985321, dtype=float32)}


  0%|          | 3691/1000000 [09:35<56:05:49,  4.93it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3699/1000000 [09:35<26:26:32, 10.47it/s]

{'loss': Array(0.30376348, dtype=float32), 'loss_cross_entropy': Array(0.30376348, dtype=float32)}


  0%|          | 3701/1000000 [09:36<49:35:53,  5.58it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3709/1000000 [09:37<26:09:31, 10.58it/s]

{'loss': Array(0.29484627, dtype=float32), 'loss_cross_entropy': Array(0.29484627, dtype=float32)}


  0%|          | 3713/1000000 [09:38<41:10:51,  6.72it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3720/1000000 [09:38<22:52:50, 12.10it/s]

{'loss': Array(0.31379816, dtype=float32), 'loss_cross_entropy': Array(0.31379816, dtype=float32)}


  0%|          | 3722/1000000 [09:39<47:48:48,  5.79it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3728/1000000 [09:39<28:07:31,  9.84it/s]

{'loss': Array(0.31792337, dtype=float32), 'loss_cross_entropy': Array(0.31792337, dtype=float32)}


  0%|          | 3731/1000000 [09:40<47:45:56,  5.79it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3740/1000000 [09:41<38:36:31,  7.17it/s]

{'loss': Array(0.31682155, dtype=float32), 'loss_cross_entropy': Array(0.31682155, dtype=float32)}


  0%|          | 3742/1000000 [09:42<61:02:11,  4.53it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3749/1000000 [09:42<29:32:25,  9.37it/s]

{'loss': Array(0.30532542, dtype=float32), 'loss_cross_entropy': Array(0.30532542, dtype=float32)}


  0%|          | 3753/1000000 [09:43<45:28:41,  6.09it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3759/1000000 [09:44<25:43:39, 10.76it/s]

{'loss': Array(0.32147437, dtype=float32), 'loss_cross_entropy': Array(0.32147437, dtype=float32)}


  0%|          | 3761/1000000 [09:45<55:15:06,  5.01it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3768/1000000 [09:45<27:34:33, 10.04it/s]

{'loss': Array(0.276794, dtype=float32), 'loss_cross_entropy': Array(0.276794, dtype=float32)}


  0%|          | 3771/1000000 [09:46<48:09:31,  5.75it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3779/1000000 [09:46<24:43:35, 11.19it/s]

{'loss': Array(0.28721595, dtype=float32), 'loss_cross_entropy': Array(0.28721595, dtype=float32)}


  0%|          | 3783/1000000 [09:47<40:57:31,  6.76it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3789/1000000 [09:48<24:21:04, 11.36it/s]

{'loss': Array(0.31005263, dtype=float32), 'loss_cross_entropy': Array(0.31005263, dtype=float32)}


  0%|          | 3793/1000000 [09:49<43:46:51,  6.32it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3799/1000000 [09:49<24:40:39, 11.21it/s]

{'loss': Array(0.30397615, dtype=float32), 'loss_cross_entropy': Array(0.30397615, dtype=float32)}


  0%|          | 3803/1000000 [09:51<60:38:39,  4.56it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3809/1000000 [09:51<30:24:11,  9.10it/s]

{'loss': Array(0.31793854, dtype=float32), 'loss_cross_entropy': Array(0.31793854, dtype=float32)}


  0%|          | 3813/1000000 [09:52<45:29:03,  6.08it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3819/1000000 [09:52<24:56:11, 11.10it/s]

{'loss': Array(0.284542, dtype=float32), 'loss_cross_entropy': Array(0.284542, dtype=float32)}


  0%|          | 3821/1000000 [09:53<54:39:02,  5.06it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3829/1000000 [09:54<25:47:06, 10.73it/s]

{'loss': Array(0.30173805, dtype=float32), 'loss_cross_entropy': Array(0.30173805, dtype=float32)}


  0%|          | 3833/1000000 [09:54<40:09:44,  6.89it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3839/1000000 [09:55<24:44:21, 11.19it/s]

{'loss': Array(0.3125002, dtype=float32), 'loss_cross_entropy': Array(0.3125002, dtype=float32)}


  0%|          | 3843/1000000 [09:56<42:05:47,  6.57it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3850/1000000 [09:56<23:12:26, 11.92it/s]

{'loss': Array(0.30480608, dtype=float32), 'loss_cross_entropy': Array(0.30480608, dtype=float32)}


  0%|          | 3852/1000000 [09:57<53:19:54,  5.19it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3859/1000000 [09:57<27:08:34, 10.19it/s]

{'loss': Array(0.28864852, dtype=float32), 'loss_cross_entropy': Array(0.28864852, dtype=float32)}


  0%|          | 3863/1000000 [09:59<43:32:40,  6.35it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3868/1000000 [09:59<40:59:51,  6.75it/s]

{'loss': Array(0.30201623, dtype=float32), 'loss_cross_entropy': Array(0.30201623, dtype=float32)}


  0%|          | 3871/1000000 [10:00<57:50:52,  4.78it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3879/1000000 [10:01<28:25:19,  9.74it/s]

{'loss': Array(0.29016376, dtype=float32), 'loss_cross_entropy': Array(0.29016376, dtype=float32)}


  0%|          | 3883/1000000 [10:02<40:42:36,  6.80it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3889/1000000 [10:02<25:12:13, 10.98it/s]

{'loss': Array(0.31944218, dtype=float32), 'loss_cross_entropy': Array(0.31944218, dtype=float32)}


  0%|          | 3893/1000000 [10:03<39:49:33,  6.95it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3900/1000000 [10:03<23:10:28, 11.94it/s]

{'loss': Array(0.30620906, dtype=float32), 'loss_cross_entropy': Array(0.30620906, dtype=float32)}


  0%|          | 3902/1000000 [10:04<50:36:54,  5.47it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3909/1000000 [10:05<26:33:00, 10.42it/s]

{'loss': Array(0.30692458, dtype=float32), 'loss_cross_entropy': Array(0.30692458, dtype=float32)}


  0%|          | 3911/1000000 [10:05<56:08:05,  4.93it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3920/1000000 [10:06<24:21:39, 11.36it/s]

{'loss': Array(0.3275762, dtype=float32), 'loss_cross_entropy': Array(0.3275762, dtype=float32)}


  0%|          | 3922/1000000 [10:07<53:43:30,  5.15it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3929/1000000 [10:08<42:35:02,  6.50it/s]

{'loss': Array(0.2801426, dtype=float32), 'loss_cross_entropy': Array(0.2801426, dtype=float32)}


  0%|          | 3933/1000000 [10:09<50:54:43,  5.43it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3940/1000000 [10:09<26:33:39, 10.42it/s]

{'loss': Array(0.31148893, dtype=float32), 'loss_cross_entropy': Array(0.31148893, dtype=float32)}


  0%|          | 3942/1000000 [10:10<55:27:39,  4.99it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3950/1000000 [10:11<26:35:37, 10.40it/s]

{'loss': Array(0.30171263, dtype=float32), 'loss_cross_entropy': Array(0.30171263, dtype=float32)}


  0%|          | 3952/1000000 [10:11<51:20:13,  5.39it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3960/1000000 [10:12<25:42:46, 10.76it/s]

{'loss': Array(0.31073862, dtype=float32), 'loss_cross_entropy': Array(0.31073862, dtype=float32)}


  0%|          | 3962/1000000 [10:13<51:20:43,  5.39it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3968/1000000 [10:13<29:15:47,  9.45it/s]

{'loss': Array(0.3275853, dtype=float32), 'loss_cross_entropy': Array(0.3275853, dtype=float32)}


  0%|          | 3973/1000000 [10:14<39:12:34,  7.06it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3979/1000000 [10:14<24:16:33, 11.40it/s]

{'loss': Array(0.31503952, dtype=float32), 'loss_cross_entropy': Array(0.31503952, dtype=float32)}


  0%|          | 3983/1000000 [10:15<43:02:33,  6.43it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 3988/1000000 [10:16<27:08:08, 10.20it/s]

{'loss': Array(0.3110081, dtype=float32), 'loss_cross_entropy': Array(0.3110081, dtype=float32)}


  0%|          | 3991/1000000 [10:17<49:05:39,  5.64it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4000/1000000 [10:18<30:23:25,  9.10it/s]

{'loss': Array(0.30001935, dtype=float32), 'loss_cross_entropy': Array(0.30001935, dtype=float32)}


  0%|          | 4002/1000000 [10:25<312:03:28,  1.13s/it]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4010/1000000 [10:26<106:13:57,  2.60it/s]

{'loss': Array(0.3093378, dtype=float32), 'loss_cross_entropy': Array(0.3093378, dtype=float32)}


  0%|          | 4012/1000000 [10:26<110:39:52,  2.50it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4020/1000000 [10:27<43:38:38,  6.34it/s] 

{'loss': Array(0.31244045, dtype=float32), 'loss_cross_entropy': Array(0.31244045, dtype=float32)}


  0%|          | 4022/1000000 [10:28<63:18:32,  4.37it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4028/1000000 [10:28<34:46:48,  7.95it/s]

{'loss': Array(0.29292285, dtype=float32), 'loss_cross_entropy': Array(0.29292285, dtype=float32)}


  0%|          | 4031/1000000 [10:29<50:40:47,  5.46it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4040/1000000 [10:29<25:11:17, 10.98it/s]

{'loss': Array(0.30754203, dtype=float32), 'loss_cross_entropy': Array(0.30754203, dtype=float32)}


  0%|          | 4042/1000000 [10:30<47:18:54,  5.85it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4048/1000000 [10:31<29:03:23,  9.52it/s]

{'loss': Array(0.29041395, dtype=float32), 'loss_cross_entropy': Array(0.29041395, dtype=float32)}


  0%|          | 4053/1000000 [10:32<38:33:30,  7.17it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4059/1000000 [10:32<36:55:53,  7.49it/s]

{'loss': Array(0.27786493, dtype=float32), 'loss_cross_entropy': Array(0.27786493, dtype=float32)}


  0%|          | 4061/1000000 [10:33<56:35:58,  4.89it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4070/1000000 [10:34<26:29:28, 10.44it/s]

{'loss': Array(0.30007678, dtype=float32), 'loss_cross_entropy': Array(0.30007678, dtype=float32)}


  0%|          | 4072/1000000 [10:35<48:22:47,  5.72it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4080/1000000 [10:35<25:06:55, 11.01it/s]

{'loss': Array(0.28094393, dtype=float32), 'loss_cross_entropy': Array(0.28094393, dtype=float32)}


  0%|          | 4082/1000000 [10:36<47:14:00,  5.86it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4088/1000000 [10:36<28:17:40,  9.78it/s]

{'loss': Array(0.315004, dtype=float32), 'loss_cross_entropy': Array(0.315004, dtype=float32)}


  0%|          | 4091/1000000 [10:37<47:29:30,  5.83it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4100/1000000 [10:38<24:32:58, 11.27it/s]

{'loss': Array(0.30721304, dtype=float32), 'loss_cross_entropy': Array(0.30721304, dtype=float32)}


  0%|          | 4102/1000000 [10:38<44:35:30,  6.20it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4108/1000000 [10:39<27:43:24,  9.98it/s]

{'loss': Array(0.28324372, dtype=float32), 'loss_cross_entropy': Array(0.28324372, dtype=float32)}


  0%|          | 4111/1000000 [10:40<47:44:19,  5.79it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4119/1000000 [10:41<42:42:56,  6.48it/s]

{'loss': Array(0.30110487, dtype=float32), 'loss_cross_entropy': Array(0.30110487, dtype=float32)}


  0%|          | 4123/1000000 [10:42<48:06:31,  5.75it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4129/1000000 [10:42<28:18:30,  9.77it/s]

{'loss': Array(0.29034525, dtype=float32), 'loss_cross_entropy': Array(0.29034525, dtype=float32)}


  0%|          | 4131/1000000 [10:43<49:45:15,  5.56it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4140/1000000 [10:43<24:02:52, 11.50it/s]

{'loss': Array(0.2968033, dtype=float32), 'loss_cross_entropy': Array(0.2968033, dtype=float32)}


  0%|          | 4142/1000000 [10:44<47:07:06,  5.87it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4148/1000000 [10:44<28:40:33,  9.65it/s]

{'loss': Array(0.29661843, dtype=float32), 'loss_cross_entropy': Array(0.29661843, dtype=float32)}


  0%|          | 4153/1000000 [10:45<39:29:19,  7.01it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4158/1000000 [10:46<27:02:20, 10.23it/s]

{'loss': Array(0.2851061, dtype=float32), 'loss_cross_entropy': Array(0.2851061, dtype=float32)}


  0%|          | 4163/1000000 [10:47<37:42:47,  7.33it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4169/1000000 [10:47<23:53:36, 11.58it/s]

{'loss': Array(0.27974272, dtype=float32), 'loss_cross_entropy': Array(0.27974272, dtype=float32)}


  0%|          | 4171/1000000 [10:48<46:10:23,  5.99it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4180/1000000 [10:48<23:07:52, 11.96it/s]

{'loss': Array(0.29453787, dtype=float32), 'loss_cross_entropy': Array(0.29453787, dtype=float32)}


  0%|          | 4182/1000000 [10:49<45:08:48,  6.13it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4190/1000000 [10:50<32:52:12,  8.42it/s]

{'loss': Array(0.31448826, dtype=float32), 'loss_cross_entropy': Array(0.31448826, dtype=float32)}


  0%|          | 4192/1000000 [10:51<53:44:10,  5.15it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4200/1000000 [10:51<27:13:46, 10.16it/s]

{'loss': Array(0.29563692, dtype=float32), 'loss_cross_entropy': Array(0.29563692, dtype=float32)}


  0%|          | 4202/1000000 [10:52<50:47:36,  5.45it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4208/1000000 [10:52<29:22:20,  9.42it/s]

{'loss': Array(0.28313074, dtype=float32), 'loss_cross_entropy': Array(0.28313074, dtype=float32)}


  0%|          | 4211/1000000 [10:53<46:04:45,  6.00it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4220/1000000 [10:54<23:56:03, 11.56it/s]

{'loss': Array(0.29349554, dtype=float32), 'loss_cross_entropy': Array(0.29349554, dtype=float32)}


  0%|          | 4222/1000000 [10:55<46:15:49,  5.98it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4230/1000000 [10:55<24:49:41, 11.14it/s]

{'loss': Array(0.28937116, dtype=float32), 'loss_cross_entropy': Array(0.28937116, dtype=float32)}


  0%|          | 4232/1000000 [10:56<47:26:49,  5.83it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4240/1000000 [10:56<24:18:39, 11.38it/s]

{'loss': Array(0.2810031, dtype=float32), 'loss_cross_entropy': Array(0.2810031, dtype=float32)}


  0%|          | 4242/1000000 [10:57<46:48:16,  5.91it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4249/1000000 [10:58<40:33:04,  6.82it/s]

{'loss': Array(0.2983647, dtype=float32), 'loss_cross_entropy': Array(0.2983647, dtype=float32)}


  0%|          | 4251/1000000 [10:59<60:07:40,  4.60it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4260/1000000 [10:59<26:03:13, 10.62it/s]

{'loss': Array(0.2928142, dtype=float32), 'loss_cross_entropy': Array(0.2928142, dtype=float32)}


  0%|          | 4262/1000000 [11:00<50:11:58,  5.51it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4270/1000000 [11:01<26:04:26, 10.61it/s]

{'loss': Array(0.29047295, dtype=float32), 'loss_cross_entropy': Array(0.29047295, dtype=float32)}


  0%|          | 4272/1000000 [11:01<51:44:02,  5.35it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4280/1000000 [11:02<25:08:08, 11.00it/s]

{'loss': Array(0.29086003, dtype=float32), 'loss_cross_entropy': Array(0.29086003, dtype=float32)}


  0%|          | 4282/1000000 [11:03<52:36:49,  5.26it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4288/1000000 [11:03<28:50:29,  9.59it/s]

{'loss': Array(0.30189323, dtype=float32), 'loss_cross_entropy': Array(0.30189323, dtype=float32)}


  0%|          | 4293/1000000 [11:04<40:32:20,  6.82it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4299/1000000 [11:04<25:18:58, 10.93it/s]

{'loss': Array(0.28562292, dtype=float32), 'loss_cross_entropy': Array(0.28562292, dtype=float32)}


  0%|          | 4303/1000000 [11:06<44:18:13,  6.24it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4310/1000000 [11:07<44:38:28,  6.20it/s]

{'loss': Array(0.3113683, dtype=float32), 'loss_cross_entropy': Array(0.3113683, dtype=float32)}


  0%|          | 4312/1000000 [11:07<65:43:50,  4.21it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4319/1000000 [11:08<30:40:41,  9.02it/s]

{'loss': Array(0.2938487, dtype=float32), 'loss_cross_entropy': Array(0.2938487, dtype=float32)}


  0%|          | 4323/1000000 [11:09<43:34:13,  6.35it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4329/1000000 [11:09<24:45:23, 11.17it/s]

{'loss': Array(0.31740817, dtype=float32), 'loss_cross_entropy': Array(0.31740817, dtype=float32)}


  0%|          | 4333/1000000 [11:10<43:18:34,  6.39it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4340/1000000 [11:10<22:43:24, 12.17it/s]

{'loss': Array(0.31788588, dtype=float32), 'loss_cross_entropy': Array(0.31788588, dtype=float32)}


  0%|          | 4342/1000000 [11:11<48:49:27,  5.66it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4348/1000000 [11:12<26:50:27, 10.30it/s]

{'loss': Array(0.26972637, dtype=float32), 'loss_cross_entropy': Array(0.26972637, dtype=float32)}


  0%|          | 4351/1000000 [11:13<48:28:40,  5.71it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4360/1000000 [11:13<23:14:01, 11.90it/s]

{'loss': Array(0.31099758, dtype=float32), 'loss_cross_entropy': Array(0.31099758, dtype=float32)}


  0%|          | 4362/1000000 [11:14<50:07:12,  5.52it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4370/1000000 [11:14<24:36:44, 11.24it/s]

{'loss': Array(0.29017383, dtype=float32), 'loss_cross_entropy': Array(0.29017383, dtype=float32)}


  0%|          | 4372/1000000 [11:15<50:44:31,  5.45it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4380/1000000 [11:16<35:18:01,  7.83it/s]

{'loss': Array(0.32527205, dtype=float32), 'loss_cross_entropy': Array(0.32527205, dtype=float32)}


  0%|          | 4382/1000000 [11:17<60:04:49,  4.60it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4389/1000000 [11:17<29:05:29,  9.51it/s]

{'loss': Array(0.28404662, dtype=float32), 'loss_cross_entropy': Array(0.28404662, dtype=float32)}


  0%|          | 4393/1000000 [11:18<42:41:25,  6.48it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4399/1000000 [11:19<25:25:56, 10.87it/s]

{'loss': Array(0.29001868, dtype=float32), 'loss_cross_entropy': Array(0.29001868, dtype=float32)}


  0%|          | 4401/1000000 [11:19<48:42:42,  5.68it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4408/1000000 [11:20<26:18:24, 10.51it/s]

{'loss': Array(0.28966904, dtype=float32), 'loss_cross_entropy': Array(0.28966904, dtype=float32)}


  0%|          | 4411/1000000 [11:21<46:35:57,  5.93it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4419/1000000 [11:21<25:18:40, 10.93it/s]

{'loss': Array(0.3195055, dtype=float32), 'loss_cross_entropy': Array(0.3195055, dtype=float32)}


  0%|          | 4421/1000000 [11:22<50:29:52,  5.48it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4429/1000000 [11:23<25:47:05, 10.73it/s]

{'loss': Array(0.29295713, dtype=float32), 'loss_cross_entropy': Array(0.29295713, dtype=float32)}


  0%|          | 4433/1000000 [11:24<42:31:06,  6.50it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4440/1000000 [11:25<45:45:50,  6.04it/s]

{'loss': Array(0.31177494, dtype=float32), 'loss_cross_entropy': Array(0.31177494, dtype=float32)}


  0%|          | 4442/1000000 [11:25<68:39:27,  4.03it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4450/1000000 [11:26<30:04:57,  9.19it/s]

{'loss': Array(0.2820668, dtype=float32), 'loss_cross_entropy': Array(0.2820668, dtype=float32)}


  0%|          | 4452/1000000 [11:27<52:35:05,  5.26it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4459/1000000 [11:27<27:49:49,  9.94it/s]

{'loss': Array(0.310047, dtype=float32), 'loss_cross_entropy': Array(0.310047, dtype=float32)}


  0%|          | 4461/1000000 [11:28<53:06:50,  5.21it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4469/1000000 [11:28<25:29:28, 10.85it/s]

{'loss': Array(0.2860817, dtype=float32), 'loss_cross_entropy': Array(0.2860817, dtype=float32)}


  0%|          | 4473/1000000 [11:29<40:12:21,  6.88it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4479/1000000 [11:30<26:09:40, 10.57it/s]

{'loss': Array(0.3005957, dtype=float32), 'loss_cross_entropy': Array(0.3005957, dtype=float32)}


  0%|          | 4481/1000000 [11:31<55:08:49,  5.01it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4489/1000000 [11:31<26:07:44, 10.58it/s]

{'loss': Array(0.28769216, dtype=float32), 'loss_cross_entropy': Array(0.28769216, dtype=float32)}


  0%|          | 4493/1000000 [11:32<42:18:09,  6.54it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4500/1000000 [11:32<23:49:31, 11.61it/s]

{'loss': Array(0.27195057, dtype=float32), 'loss_cross_entropy': Array(0.27195057, dtype=float32)}
context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4510/1000000 [11:41<107:58:24,  2.56it/s]

{'loss': Array(0.30402133, dtype=float32), 'loss_cross_entropy': Array(0.30402133, dtype=float32)}


  0%|          | 4512/1000000 [11:42<111:03:33,  2.49it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4519/1000000 [11:42<48:33:40,  5.69it/s] 

{'loss': Array(0.29041252, dtype=float32), 'loss_cross_entropy': Array(0.29041252, dtype=float32)}


  0%|          | 4521/1000000 [11:43<69:44:32,  3.96it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4529/1000000 [11:43<30:31:07,  9.06it/s]

{'loss': Array(0.29711092, dtype=float32), 'loss_cross_entropy': Array(0.29711092, dtype=float32)}


  0%|          | 4533/1000000 [11:44<41:56:51,  6.59it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4539/1000000 [11:45<26:05:28, 10.60it/s]

{'loss': Array(0.28782925, dtype=float32), 'loss_cross_entropy': Array(0.28782925, dtype=float32)}


  0%|          | 4543/1000000 [11:46<41:44:43,  6.62it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4550/1000000 [11:46<23:01:05, 12.01it/s]

{'loss': Array(0.30981925, dtype=float32), 'loss_cross_entropy': Array(0.30981925, dtype=float32)}


  0%|          | 4552/1000000 [11:47<52:53:03,  5.23it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4559/1000000 [11:47<27:04:01, 10.22it/s]

{'loss': Array(0.29433882, dtype=float32), 'loss_cross_entropy': Array(0.29433882, dtype=float32)}


  0%|          | 4561/1000000 [11:48<52:24:05,  5.28it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4569/1000000 [11:49<38:47:47,  7.13it/s]

{'loss': Array(0.28643215, dtype=float32), 'loss_cross_entropy': Array(0.28643215, dtype=float32)}


  0%|          | 4571/1000000 [11:50<60:48:23,  4.55it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4580/1000000 [11:51<26:21:18, 10.49it/s]

{'loss': Array(0.28922775, dtype=float32), 'loss_cross_entropy': Array(0.28922775, dtype=float32)}


  0%|          | 4582/1000000 [11:51<53:13:57,  5.19it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4589/1000000 [11:52<26:41:49, 10.36it/s]

{'loss': Array(0.31208515, dtype=float32), 'loss_cross_entropy': Array(0.31208515, dtype=float32)}


  0%|          | 4591/1000000 [11:53<52:23:43,  5.28it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4599/1000000 [11:53<25:31:34, 10.83it/s]

{'loss': Array(0.3076582, dtype=float32), 'loss_cross_entropy': Array(0.3076582, dtype=float32)}


  0%|          | 4603/1000000 [11:54<41:36:40,  6.64it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4609/1000000 [11:54<24:17:47, 11.38it/s]

{'loss': Array(0.28982034, dtype=float32), 'loss_cross_entropy': Array(0.28982034, dtype=float32)}


  0%|          | 4611/1000000 [11:55<54:11:10,  5.10it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4620/1000000 [11:56<24:12:15, 11.42it/s]

{'loss': Array(0.2991055, dtype=float32), 'loss_cross_entropy': Array(0.2991055, dtype=float32)}


  0%|          | 4622/1000000 [11:57<51:08:26,  5.41it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4629/1000000 [11:58<46:04:44,  6.00it/s]

{'loss': Array(0.29514843, dtype=float32), 'loss_cross_entropy': Array(0.29514843, dtype=float32)}


  0%|          | 4633/1000000 [11:59<51:35:23,  5.36it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4639/1000000 [11:59<27:49:57,  9.93it/s]

{'loss': Array(0.30776247, dtype=float32), 'loss_cross_entropy': Array(0.30776247, dtype=float32)}


  0%|          | 4643/1000000 [12:00<43:36:08,  6.34it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4648/1000000 [12:00<27:17:46, 10.13it/s]

{'loss': Array(0.27534378, dtype=float32), 'loss_cross_entropy': Array(0.27534378, dtype=float32)}


  0%|          | 4653/1000000 [12:01<39:41:05,  6.97it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4659/1000000 [12:01<23:53:05, 11.58it/s]

{'loss': Array(0.29184994, dtype=float32), 'loss_cross_entropy': Array(0.29184994, dtype=float32)}


  0%|          | 4661/1000000 [12:02<52:17:42,  5.29it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4670/1000000 [12:03<23:11:18, 11.92it/s]

{'loss': Array(0.28687033, dtype=float32), 'loss_cross_entropy': Array(0.28687033, dtype=float32)}


  0%|          | 4672/1000000 [12:04<50:03:06,  5.52it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4679/1000000 [12:04<25:38:21, 10.78it/s]

{'loss': Array(0.29028532, dtype=float32), 'loss_cross_entropy': Array(0.29028532, dtype=float32)}


  0%|          | 4681/1000000 [12:05<54:24:15,  5.08it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4689/1000000 [12:05<26:20:24, 10.50it/s]

{'loss': Array(0.28971013, dtype=float32), 'loss_cross_entropy': Array(0.28971013, dtype=float32)}


  0%|          | 4691/1000000 [12:06<49:39:10,  5.57it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4699/1000000 [12:07<33:10:22,  8.33it/s]

{'loss': Array(0.28621712, dtype=float32), 'loss_cross_entropy': Array(0.28621712, dtype=float32)}


  0%|          | 4703/1000000 [12:08<45:03:59,  6.13it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4710/1000000 [12:09<24:59:18, 11.06it/s]

{'loss': Array(0.29677162, dtype=float32), 'loss_cross_entropy': Array(0.29677162, dtype=float32)}


  0%|          | 4712/1000000 [12:09<49:43:44,  5.56it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4718/1000000 [12:10<29:07:21,  9.49it/s]

{'loss': Array(0.30077526, dtype=float32), 'loss_cross_entropy': Array(0.30077526, dtype=float32)}


  0%|          | 4721/1000000 [12:11<50:29:46,  5.48it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4730/1000000 [12:11<23:20:22, 11.85it/s]

{'loss': Array(0.28146967, dtype=float32), 'loss_cross_entropy': Array(0.28146967, dtype=float32)}


  0%|          | 4732/1000000 [12:12<51:06:03,  5.41it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4739/1000000 [12:12<26:22:12, 10.48it/s]

{'loss': Array(0.29515, dtype=float32), 'loss_cross_entropy': Array(0.29515, dtype=float32)}


  0%|          | 4741/1000000 [12:13<52:55:33,  5.22it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4749/1000000 [12:14<26:08:49, 10.57it/s]

{'loss': Array(0.2857524, dtype=float32), 'loss_cross_entropy': Array(0.2857524, dtype=float32)}


  0%|          | 4751/1000000 [12:15<50:21:50,  5.49it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4759/1000000 [12:16<40:39:11,  6.80it/s]

{'loss': Array(0.28260684, dtype=float32), 'loss_cross_entropy': Array(0.28260684, dtype=float32)}


  0%|          | 4763/1000000 [12:17<50:45:35,  5.45it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4770/1000000 [12:17<25:58:24, 10.64it/s]

{'loss': Array(0.2947393, dtype=float32), 'loss_cross_entropy': Array(0.2947393, dtype=float32)}


  0%|          | 4772/1000000 [12:18<52:49:18,  5.23it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4779/1000000 [12:18<26:31:52, 10.42it/s]

{'loss': Array(0.29533324, dtype=float32), 'loss_cross_entropy': Array(0.29533324, dtype=float32)}


  0%|          | 4783/1000000 [12:19<41:54:56,  6.60it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4790/1000000 [12:19<23:06:46, 11.96it/s]

{'loss': Array(0.29794675, dtype=float32), 'loss_cross_entropy': Array(0.29794675, dtype=float32)}


  0%|          | 4792/1000000 [12:20<51:52:35,  5.33it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4799/1000000 [12:21<26:13:36, 10.54it/s]

{'loss': Array(0.2778534, dtype=float32), 'loss_cross_entropy': Array(0.2778534, dtype=float32)}


  0%|          | 4801/1000000 [12:22<50:20:29,  5.49it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4809/1000000 [12:22<24:59:55, 11.06it/s]

{'loss': Array(0.29487163, dtype=float32), 'loss_cross_entropy': Array(0.29487163, dtype=float32)}


  0%|          | 4813/1000000 [12:23<42:03:03,  6.57it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4820/1000000 [12:24<43:28:33,  6.36it/s]

{'loss': Array(0.30652615, dtype=float32), 'loss_cross_entropy': Array(0.30652615, dtype=float32)}


  0%|          | 4822/1000000 [12:25<64:10:55,  4.31it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4830/1000000 [12:25<27:31:50, 10.04it/s]

{'loss': Array(0.28266808, dtype=float32), 'loss_cross_entropy': Array(0.28266808, dtype=float32)}


  0%|          | 4832/1000000 [12:26<54:41:04,  5.06it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4840/1000000 [12:27<25:05:23, 11.02it/s]

{'loss': Array(0.29935133, dtype=float32), 'loss_cross_entropy': Array(0.29935133, dtype=float32)}


  0%|          | 4842/1000000 [12:27<50:42:58,  5.45it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4850/1000000 [12:28<24:57:10, 11.08it/s]

{'loss': Array(0.30099446, dtype=float32), 'loss_cross_entropy': Array(0.30099446, dtype=float32)}


  0%|          | 4852/1000000 [12:29<50:21:28,  5.49it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4858/1000000 [12:29<29:37:03,  9.33it/s]

{'loss': Array(0.2973753, dtype=float32), 'loss_cross_entropy': Array(0.2973753, dtype=float32)}


  0%|          | 4861/1000000 [12:30<48:21:45,  5.72it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4869/1000000 [12:30<26:11:22, 10.55it/s]

{'loss': Array(0.28759074, dtype=float32), 'loss_cross_entropy': Array(0.28759074, dtype=float32)}


  0%|          | 4871/1000000 [12:31<49:42:06,  5.56it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4880/1000000 [12:32<24:13:33, 11.41it/s]

{'loss': Array(0.2840161, dtype=float32), 'loss_cross_entropy': Array(0.2840161, dtype=float32)}


  0%|          | 4882/1000000 [12:33<47:07:08,  5.87it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4890/1000000 [12:34<32:40:59,  8.46it/s]

{'loss': Array(0.31104484, dtype=float32), 'loss_cross_entropy': Array(0.31104484, dtype=float32)}


  0%|          | 4892/1000000 [12:34<52:10:47,  5.30it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4899/1000000 [12:35<29:20:24,  9.42it/s]

{'loss': Array(0.29980645, dtype=float32), 'loss_cross_entropy': Array(0.29980645, dtype=float32)}


  0%|          | 4903/1000000 [12:36<43:36:46,  6.34it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4909/1000000 [12:36<25:31:50, 10.83it/s]

{'loss': Array(0.27821907, dtype=float32), 'loss_cross_entropy': Array(0.27821907, dtype=float32)}


  0%|          | 4911/1000000 [12:37<49:48:22,  5.55it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4920/1000000 [12:37<23:53:17, 11.57it/s]

{'loss': Array(0.29638344, dtype=float32), 'loss_cross_entropy': Array(0.29638344, dtype=float32)}


  0%|          | 4922/1000000 [12:38<46:22:17,  5.96it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4930/1000000 [12:39<24:48:21, 11.14it/s]

{'loss': Array(0.2577327, dtype=float32), 'loss_cross_entropy': Array(0.2577327, dtype=float32)}


  0%|          | 4932/1000000 [12:39<48:32:52,  5.69it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4939/1000000 [12:40<26:23:04, 10.48it/s]

{'loss': Array(0.30753437, dtype=float32), 'loss_cross_entropy': Array(0.30753437, dtype=float32)}


  0%|          | 4943/1000000 [12:41<41:18:03,  6.69it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4948/1000000 [12:42<47:20:19,  5.84it/s]

{'loss': Array(0.29036394, dtype=float32), 'loss_cross_entropy': Array(0.29036394, dtype=float32)}


  0%|          | 4951/1000000 [12:43<60:08:47,  4.60it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4960/1000000 [12:43<27:43:33,  9.97it/s]

{'loss': Array(0.28235096, dtype=float32), 'loss_cross_entropy': Array(0.28235096, dtype=float32)}


  0%|          | 4962/1000000 [12:44<49:33:35,  5.58it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4968/1000000 [12:44<29:35:27,  9.34it/s]

{'loss': Array(0.28954816, dtype=float32), 'loss_cross_entropy': Array(0.28954816, dtype=float32)}


  0%|          | 4971/1000000 [12:45<47:38:50,  5.80it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4980/1000000 [12:46<24:35:34, 11.24it/s]

{'loss': Array(0.28265032, dtype=float32), 'loss_cross_entropy': Array(0.28265032, dtype=float32)}


  0%|          | 4982/1000000 [12:46<44:11:57,  6.25it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4988/1000000 [12:47<27:30:26, 10.05it/s]

{'loss': Array(0.3022047, dtype=float32), 'loss_cross_entropy': Array(0.3022047, dtype=float32)}


  0%|          | 4993/1000000 [12:48<38:09:29,  7.24it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  0%|          | 4999/1000000 [12:48<24:34:47, 11.24it/s]

{'loss': Array(0.29942968, dtype=float32), 'loss_cross_entropy': Array(0.29942968, dtype=float32)}


  1%|          | 5001/1000000 [12:55<258:43:59,  1.07it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5010/1000000 [12:56<89:03:49,  3.10it/s] 

{'loss': Array(0.27582657, dtype=float32), 'loss_cross_entropy': Array(0.27582657, dtype=float32)}
context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5018/1000000 [12:58<58:57:35,  4.69it/s] 

{'loss': Array(0.3018817, dtype=float32), 'loss_cross_entropy': Array(0.3018817, dtype=float32)}


  1%|          | 5021/1000000 [12:58<65:52:35,  4.20it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5030/1000000 [12:59<30:39:56,  9.01it/s]

{'loss': Array(0.2935208, dtype=float32), 'loss_cross_entropy': Array(0.2935208, dtype=float32)}


  1%|          | 5032/1000000 [13:00<51:56:40,  5.32it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5038/1000000 [13:00<30:57:29,  8.93it/s]

{'loss': Array(0.29173967, dtype=float32), 'loss_cross_entropy': Array(0.29173967, dtype=float32)}


  1%|          | 5041/1000000 [13:01<47:05:33,  5.87it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5050/1000000 [13:01<24:24:25, 11.32it/s]

{'loss': Array(0.27757367, dtype=float32), 'loss_cross_entropy': Array(0.27757367, dtype=float32)}


  1%|          | 5052/1000000 [13:02<45:30:10,  6.07it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5058/1000000 [13:02<28:06:54,  9.83it/s]

{'loss': Array(0.2962034, dtype=float32), 'loss_cross_entropy': Array(0.2962034, dtype=float32)}


  1%|          | 5063/1000000 [13:04<39:16:23,  7.04it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5070/1000000 [13:04<22:58:18, 12.03it/s]

{'loss': Array(0.28817114, dtype=float32), 'loss_cross_entropy': Array(0.28817114, dtype=float32)}


  1%|          | 5072/1000000 [13:05<50:04:23,  5.52it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5079/1000000 [13:06<41:12:43,  6.71it/s]

{'loss': Array(0.30431706, dtype=float32), 'loss_cross_entropy': Array(0.30431706, dtype=float32)}


  1%|          | 5081/1000000 [13:07<61:39:55,  4.48it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5090/1000000 [13:07<26:15:04, 10.53it/s]

{'loss': Array(0.28944767, dtype=float32), 'loss_cross_entropy': Array(0.28944767, dtype=float32)}


  1%|          | 5092/1000000 [13:08<47:19:56,  5.84it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5098/1000000 [13:08<28:12:25,  9.80it/s]

{'loss': Array(0.28683326, dtype=float32), 'loss_cross_entropy': Array(0.28683326, dtype=float32)}


  1%|          | 5101/1000000 [13:09<45:30:01,  6.07it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5110/1000000 [13:09<23:45:33, 11.63it/s]

{'loss': Array(0.29170474, dtype=float32), 'loss_cross_entropy': Array(0.29170474, dtype=float32)}


  1%|          | 5112/1000000 [13:10<46:56:14,  5.89it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5120/1000000 [13:11<24:51:53, 11.11it/s]

{'loss': Array(0.2985246, dtype=float32), 'loss_cross_entropy': Array(0.2985246, dtype=float32)}


  1%|          | 5122/1000000 [13:12<47:11:30,  5.86it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5129/1000000 [13:12<26:08:36, 10.57it/s]

{'loss': Array(0.2900452, dtype=float32), 'loss_cross_entropy': Array(0.2900452, dtype=float32)}


  1%|          | 5131/1000000 [13:13<52:22:25,  5.28it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5139/1000000 [13:14<44:37:29,  6.19it/s]

{'loss': Array(0.28904176, dtype=float32), 'loss_cross_entropy': Array(0.28904176, dtype=float32)}


  1%|          | 5141/1000000 [13:15<65:28:34,  4.22it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5150/1000000 [13:15<28:01:00,  9.86it/s]

{'loss': Array(0.2947783, dtype=float32), 'loss_cross_entropy': Array(0.2947783, dtype=float32)}


  1%|          | 5152/1000000 [13:16<49:24:31,  5.59it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5160/1000000 [13:16<25:19:44, 10.91it/s]

{'loss': Array(0.31876037, dtype=float32), 'loss_cross_entropy': Array(0.31876037, dtype=float32)}


  1%|          | 5162/1000000 [13:17<48:01:14,  5.75it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5170/1000000 [13:18<24:59:39, 11.06it/s]

{'loss': Array(0.29014823, dtype=float32), 'loss_cross_entropy': Array(0.29014823, dtype=float32)}


  1%|          | 5172/1000000 [13:18<48:12:30,  5.73it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5180/1000000 [13:19<24:56:46, 11.08it/s]

{'loss': Array(0.27848458, dtype=float32), 'loss_cross_entropy': Array(0.27848458, dtype=float32)}


  1%|          | 5182/1000000 [13:20<50:13:01,  5.50it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5189/1000000 [13:20<26:17:13, 10.51it/s]

{'loss': Array(0.28923476, dtype=float32), 'loss_cross_entropy': Array(0.28923476, dtype=float32)}


  1%|          | 5191/1000000 [13:21<50:42:02,  5.45it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5200/1000000 [13:21<23:29:01, 11.77it/s]

{'loss': Array(0.29040208, dtype=float32), 'loss_cross_entropy': Array(0.29040208, dtype=float32)}


  1%|          | 5202/1000000 [13:22<44:52:52,  6.16it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5210/1000000 [13:23<32:02:08,  8.63it/s]

{'loss': Array(0.29369572, dtype=float32), 'loss_cross_entropy': Array(0.29369572, dtype=float32)}


  1%|          | 5212/1000000 [13:24<52:11:46,  5.29it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5218/1000000 [13:24<30:28:19,  9.07it/s]

{'loss': Array(0.30260926, dtype=float32), 'loss_cross_entropy': Array(0.30260926, dtype=float32)}


  1%|          | 5221/1000000 [13:25<49:54:43,  5.54it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5230/1000000 [13:26<25:26:26, 10.86it/s]

{'loss': Array(0.28573498, dtype=float32), 'loss_cross_entropy': Array(0.28573498, dtype=float32)}


  1%|          | 5232/1000000 [13:26<45:04:46,  6.13it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5239/1000000 [13:27<26:38:47, 10.37it/s]

{'loss': Array(0.2704786, dtype=float32), 'loss_cross_entropy': Array(0.2704786, dtype=float32)}


  1%|          | 5243/1000000 [13:28<43:10:33,  6.40it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5249/1000000 [13:28<25:16:27, 10.93it/s]

{'loss': Array(0.28227076, dtype=float32), 'loss_cross_entropy': Array(0.28227076, dtype=float32)}


  1%|          | 5253/1000000 [13:29<42:41:31,  6.47it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5259/1000000 [13:30<25:07:58, 10.99it/s]

{'loss': Array(0.31182465, dtype=float32), 'loss_cross_entropy': Array(0.31182465, dtype=float32)}


  1%|          | 5261/1000000 [13:30<56:03:19,  4.93it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5269/1000000 [13:31<39:13:18,  7.04it/s]

{'loss': Array(0.28872666, dtype=float32), 'loss_cross_entropy': Array(0.28872666, dtype=float32)}


  1%|          | 5273/1000000 [13:32<49:15:03,  5.61it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5278/1000000 [13:33<29:38:47,  9.32it/s]

{'loss': Array(0.2761032, dtype=float32), 'loss_cross_entropy': Array(0.2761032, dtype=float32)}


  1%|          | 5281/1000000 [13:34<49:47:10,  5.55it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5290/1000000 [13:34<24:28:47, 11.29it/s]

{'loss': Array(0.29874617, dtype=float32), 'loss_cross_entropy': Array(0.29874617, dtype=float32)}


  1%|          | 5292/1000000 [13:35<48:04:19,  5.75it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5300/1000000 [13:35<24:57:09, 11.07it/s]

{'loss': Array(0.29912588, dtype=float32), 'loss_cross_entropy': Array(0.29912588, dtype=float32)}


  1%|          | 5302/1000000 [13:36<47:54:37,  5.77it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5310/1000000 [13:37<25:00:17, 11.05it/s]

{'loss': Array(0.29301012, dtype=float32), 'loss_cross_entropy': Array(0.29301012, dtype=float32)}


  1%|          | 5312/1000000 [13:38<48:40:21,  5.68it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5319/1000000 [13:38<26:16:07, 10.52it/s]

{'loss': Array(0.2926465, dtype=float32), 'loss_cross_entropy': Array(0.2926465, dtype=float32)}


  1%|          | 5321/1000000 [13:39<52:49:14,  5.23it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5329/1000000 [13:40<45:35:50,  6.06it/s]

{'loss': Array(0.2823237, dtype=float32), 'loss_cross_entropy': Array(0.2823237, dtype=float32)}


  1%|          | 5333/1000000 [13:41<52:08:09,  5.30it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5338/1000000 [13:41<32:00:28,  8.63it/s]

{'loss': Array(0.26501408, dtype=float32), 'loss_cross_entropy': Array(0.26501408, dtype=float32)}


  1%|          | 5343/1000000 [13:42<41:52:26,  6.60it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5350/1000000 [13:43<24:13:01, 11.41it/s]

{'loss': Array(0.31010085, dtype=float32), 'loss_cross_entropy': Array(0.31010085, dtype=float32)}


  1%|          | 5352/1000000 [13:43<52:17:32,  5.28it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5360/1000000 [13:44<24:09:13, 11.44it/s]

{'loss': Array(0.2880915, dtype=float32), 'loss_cross_entropy': Array(0.2880915, dtype=float32)}


  1%|          | 5362/1000000 [13:45<55:20:47,  4.99it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5369/1000000 [13:45<26:57:46, 10.25it/s]

{'loss': Array(0.3005642, dtype=float32), 'loss_cross_entropy': Array(0.3005642, dtype=float32)}


  1%|          | 5373/1000000 [13:46<42:10:32,  6.55it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5379/1000000 [13:46<24:56:02, 11.08it/s]

{'loss': Array(0.29762897, dtype=float32), 'loss_cross_entropy': Array(0.29762897, dtype=float32)}


  1%|          | 5381/1000000 [13:47<47:45:27,  5.79it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5389/1000000 [13:48<24:55:07, 11.09it/s]

{'loss': Array(0.28763747, dtype=float32), 'loss_cross_entropy': Array(0.28763747, dtype=float32)}


  1%|          | 5391/1000000 [13:49<49:31:14,  5.58it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5400/1000000 [13:50<32:08:11,  8.60it/s]

{'loss': Array(0.28404546, dtype=float32), 'loss_cross_entropy': Array(0.28404546, dtype=float32)}


  1%|          | 5402/1000000 [13:51<59:28:32,  4.65it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5410/1000000 [13:51<27:47:41,  9.94it/s]

{'loss': Array(0.27116144, dtype=float32), 'loss_cross_entropy': Array(0.27116144, dtype=float32)}


  1%|          | 5412/1000000 [13:52<53:28:50,  5.17it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5418/1000000 [13:52<28:51:28,  9.57it/s]

{'loss': Array(0.3035923, dtype=float32), 'loss_cross_entropy': Array(0.3035923, dtype=float32)}


  1%|          | 5423/1000000 [13:53<40:42:57,  6.79it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5430/1000000 [13:54<23:54:51, 11.55it/s]

{'loss': Array(0.29313853, dtype=float32), 'loss_cross_entropy': Array(0.29313853, dtype=float32)}


  1%|          | 5432/1000000 [13:54<51:09:06,  5.40it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5439/1000000 [13:55<26:55:21, 10.26it/s]

{'loss': Array(0.2999874, dtype=float32), 'loss_cross_entropy': Array(0.2999874, dtype=float32)}


  1%|          | 5443/1000000 [13:56<41:44:16,  6.62it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5449/1000000 [13:56<25:07:54, 10.99it/s]

{'loss': Array(0.27409926, dtype=float32), 'loss_cross_entropy': Array(0.27409926, dtype=float32)}


  1%|          | 5451/1000000 [13:57<49:34:43,  5.57it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5460/1000000 [13:58<33:53:52,  8.15it/s]

{'loss': Array(0.3034526, dtype=float32), 'loss_cross_entropy': Array(0.3034526, dtype=float32)}


  1%|          | 5462/1000000 [13:59<58:05:20,  4.76it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5469/1000000 [13:59<29:01:36,  9.52it/s]

{'loss': Array(0.2991021, dtype=float32), 'loss_cross_entropy': Array(0.2991021, dtype=float32)}


  1%|          | 5471/1000000 [14:00<56:47:10,  4.86it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5480/1000000 [14:01<24:31:47, 11.26it/s]

{'loss': Array(0.29401985, dtype=float32), 'loss_cross_entropy': Array(0.29401985, dtype=float32)}


  1%|          | 5482/1000000 [14:02<51:53:52,  5.32it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5490/1000000 [14:02<23:30:48, 11.75it/s]

{'loss': Array(0.28133148, dtype=float32), 'loss_cross_entropy': Array(0.28133148, dtype=float32)}


  1%|          | 5492/1000000 [14:03<52:49:40,  5.23it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5500/1000000 [14:03<24:27:45, 11.29it/s]

{'loss': Array(0.28672594, dtype=float32), 'loss_cross_entropy': Array(0.28672594, dtype=float32)}


  1%|          | 5502/1000000 [14:11<287:41:20,  1.04s/it]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5508/1000000 [14:11<118:27:42,  2.33it/s]

{'loss': Array(0.27689025, dtype=float32), 'loss_cross_entropy': Array(0.27689025, dtype=float32)}


  1%|          | 5513/1000000 [14:12<81:35:30,  3.39it/s] 

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5519/1000000 [14:13<39:11:37,  7.05it/s]

{'loss': Array(0.316094, dtype=float32), 'loss_cross_entropy': Array(0.316094, dtype=float32)}


  1%|          | 5523/1000000 [14:14<65:06:37,  4.24it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5530/1000000 [14:14<30:31:45,  9.05it/s]

{'loss': Array(0.31309015, dtype=float32), 'loss_cross_entropy': Array(0.31309015, dtype=float32)}


  1%|          | 5532/1000000 [14:15<58:56:49,  4.69it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5539/1000000 [14:16<28:27:52,  9.70it/s]

{'loss': Array(0.2826937, dtype=float32), 'loss_cross_entropy': Array(0.2826937, dtype=float32)}


  1%|          | 5543/1000000 [14:17<43:08:05,  6.40it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5549/1000000 [14:17<24:26:38, 11.30it/s]

{'loss': Array(0.26080146, dtype=float32), 'loss_cross_entropy': Array(0.26080146, dtype=float32)}


  1%|          | 5553/1000000 [14:18<42:22:28,  6.52it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5559/1000000 [14:18<24:04:05, 11.48it/s]

{'loss': Array(0.27326176, dtype=float32), 'loss_cross_entropy': Array(0.27326176, dtype=float32)}


  1%|          | 5563/1000000 [14:19<40:45:08,  6.78it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5569/1000000 [14:20<25:20:43, 10.90it/s]

{'loss': Array(0.28513777, dtype=float32), 'loss_cross_entropy': Array(0.28513777, dtype=float32)}


  1%|          | 5571/1000000 [14:21<54:01:10,  5.11it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5579/1000000 [14:21<25:29:16, 10.84it/s]

{'loss': Array(0.31406626, dtype=float32), 'loss_cross_entropy': Array(0.31406626, dtype=float32)}


  1%|          | 5581/1000000 [14:22<50:16:41,  5.49it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5588/1000000 [14:23<36:13:06,  7.63it/s]

{'loss': Array(0.2803347, dtype=float32), 'loss_cross_entropy': Array(0.2803347, dtype=float32)}


  1%|          | 5591/1000000 [14:24<53:39:22,  5.15it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5600/1000000 [14:24<25:47:18, 10.71it/s]

{'loss': Array(0.27742147, dtype=float32), 'loss_cross_entropy': Array(0.27742147, dtype=float32)}


  1%|          | 5602/1000000 [14:25<48:58:48,  5.64it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5609/1000000 [14:25<27:22:23, 10.09it/s]

{'loss': Array(0.29468438, dtype=float32), 'loss_cross_entropy': Array(0.29468438, dtype=float32)}


  1%|          | 5611/1000000 [14:26<51:53:18,  5.32it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5620/1000000 [14:27<24:10:02, 11.43it/s]

{'loss': Array(0.2661664, dtype=float32), 'loss_cross_entropy': Array(0.2661664, dtype=float32)}


  1%|          | 5622/1000000 [14:28<46:31:44,  5.94it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5630/1000000 [14:28<24:15:08, 11.39it/s]

{'loss': Array(0.2728776, dtype=float32), 'loss_cross_entropy': Array(0.2728776, dtype=float32)}


  1%|          | 5632/1000000 [14:29<47:42:32,  5.79it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5638/1000000 [14:29<28:16:38,  9.77it/s]

{'loss': Array(0.30929974, dtype=float32), 'loss_cross_entropy': Array(0.30929974, dtype=float32)}


  1%|          | 5641/1000000 [14:30<47:03:22,  5.87it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5649/1000000 [14:31<37:18:15,  7.40it/s]

{'loss': Array(0.2969798, dtype=float32), 'loss_cross_entropy': Array(0.2969798, dtype=float32)}


  1%|          | 5653/1000000 [14:32<45:34:31,  6.06it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5658/1000000 [14:32<28:24:56,  9.72it/s]

{'loss': Array(0.304193, dtype=float32), 'loss_cross_entropy': Array(0.304193, dtype=float32)}


  1%|          | 5661/1000000 [14:33<47:06:46,  5.86it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5670/1000000 [14:34<23:44:27, 11.63it/s]

{'loss': Array(0.2899262, dtype=float32), 'loss_cross_entropy': Array(0.2899262, dtype=float32)}


  1%|          | 5672/1000000 [14:34<45:43:42,  6.04it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5680/1000000 [14:35<25:22:42, 10.88it/s]

{'loss': Array(0.26456937, dtype=float32), 'loss_cross_entropy': Array(0.26456937, dtype=float32)}


  1%|          | 5682/1000000 [14:36<47:31:47,  5.81it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5688/1000000 [14:36<28:13:26,  9.79it/s]

{'loss': Array(0.28636032, dtype=float32), 'loss_cross_entropy': Array(0.28636032, dtype=float32)}


  1%|          | 5691/1000000 [14:37<45:48:23,  6.03it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5700/1000000 [14:37<23:44:56, 11.63it/s]

{'loss': Array(0.27457088, dtype=float32), 'loss_cross_entropy': Array(0.27457088, dtype=float32)}


  1%|          | 5702/1000000 [14:38<44:25:23,  6.22it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5710/1000000 [14:39<40:59:28,  6.74it/s]

{'loss': Array(0.29431066, dtype=float32), 'loss_cross_entropy': Array(0.29431066, dtype=float32)}


  1%|          | 5712/1000000 [14:40<59:49:38,  4.62it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5720/1000000 [14:40<29:25:14,  9.39it/s]

{'loss': Array(0.29795623, dtype=float32), 'loss_cross_entropy': Array(0.29795623, dtype=float32)}


  1%|          | 5722/1000000 [14:41<52:05:46,  5.30it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5728/1000000 [14:41<30:08:50,  9.16it/s]

{'loss': Array(0.28592715, dtype=float32), 'loss_cross_entropy': Array(0.28592715, dtype=float32)}


  1%|          | 5731/1000000 [14:42<46:42:32,  5.91it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5740/1000000 [14:43<24:06:02, 11.46it/s]

{'loss': Array(0.2857635, dtype=float32), 'loss_cross_entropy': Array(0.2857635, dtype=float32)}


  1%|          | 5742/1000000 [14:44<44:56:51,  6.14it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5748/1000000 [14:44<27:38:59,  9.99it/s]

{'loss': Array(0.28675872, dtype=float32), 'loss_cross_entropy': Array(0.28675872, dtype=float32)}


  1%|          | 5753/1000000 [14:45<39:26:44,  7.00it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5759/1000000 [14:45<25:20:44, 10.90it/s]

{'loss': Array(0.28489116, dtype=float32), 'loss_cross_entropy': Array(0.28489116, dtype=float32)}


  1%|          | 5761/1000000 [14:46<46:50:03,  5.90it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5769/1000000 [14:47<25:00:57, 11.04it/s]

{'loss': Array(0.27960035, dtype=float32), 'loss_cross_entropy': Array(0.27960035, dtype=float32)}


  1%|          | 5771/1000000 [14:47<47:21:21,  5.83it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5779/1000000 [14:48<32:34:04,  8.48it/s]

{'loss': Array(0.30848673, dtype=float32), 'loss_cross_entropy': Array(0.30848673, dtype=float32)}


  1%|          | 5781/1000000 [14:49<53:44:02,  5.14it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5789/1000000 [14:50<28:19:30,  9.75it/s]

{'loss': Array(0.2997143, dtype=float32), 'loss_cross_entropy': Array(0.2997143, dtype=float32)}


  1%|          | 5791/1000000 [14:50<52:24:46,  5.27it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5799/1000000 [14:51<26:04:29, 10.59it/s]

{'loss': Array(0.28698528, dtype=float32), 'loss_cross_entropy': Array(0.28698528, dtype=float32)}


  1%|          | 5803/1000000 [14:52<41:19:37,  6.68it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5809/1000000 [14:52<25:08:45, 10.98it/s]

{'loss': Array(0.28372344, dtype=float32), 'loss_cross_entropy': Array(0.28372344, dtype=float32)}


  1%|          | 5811/1000000 [14:53<48:21:02,  5.71it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5820/1000000 [14:54<23:46:34, 11.61it/s]

{'loss': Array(0.30014095, dtype=float32), 'loss_cross_entropy': Array(0.30014095, dtype=float32)}


  1%|          | 5822/1000000 [14:54<47:20:41,  5.83it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5829/1000000 [14:55<26:49:52, 10.29it/s]

{'loss': Array(0.27404127, dtype=float32), 'loss_cross_entropy': Array(0.27404127, dtype=float32)}


  1%|          | 5831/1000000 [14:56<51:55:59,  5.32it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5839/1000000 [14:57<38:09:34,  7.24it/s]

{'loss': Array(0.2632419, dtype=float32), 'loss_cross_entropy': Array(0.2632419, dtype=float32)}


  1%|          | 5841/1000000 [14:58<58:29:51,  4.72it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5850/1000000 [14:58<26:36:05, 10.38it/s]

{'loss': Array(0.2755253, dtype=float32), 'loss_cross_entropy': Array(0.2755253, dtype=float32)}


  1%|          | 5852/1000000 [14:59<46:55:25,  5.89it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5858/1000000 [14:59<28:30:21,  9.69it/s]

{'loss': Array(0.27372178, dtype=float32), 'loss_cross_entropy': Array(0.27372178, dtype=float32)}


  1%|          | 5861/1000000 [15:00<46:32:06,  5.93it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5870/1000000 [15:00<24:04:38, 11.47it/s]

{'loss': Array(0.29508862, dtype=float32), 'loss_cross_entropy': Array(0.29508862, dtype=float32)}


  1%|          | 5872/1000000 [15:01<45:32:29,  6.06it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5878/1000000 [15:02<28:05:57,  9.83it/s]

{'loss': Array(0.28847075, dtype=float32), 'loss_cross_entropy': Array(0.28847075, dtype=float32)}


  1%|          | 5881/1000000 [15:02<44:58:49,  6.14it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5890/1000000 [15:03<23:37:37, 11.69it/s]

{'loss': Array(0.2759013, dtype=float32), 'loss_cross_entropy': Array(0.2759013, dtype=float32)}


  1%|          | 5892/1000000 [15:04<44:37:32,  6.19it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5900/1000000 [15:05<43:28:03,  6.35it/s]

{'loss': Array(0.3002825, dtype=float32), 'loss_cross_entropy': Array(0.3002825, dtype=float32)}


  1%|          | 5902/1000000 [15:06<62:11:03,  4.44it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5908/1000000 [15:06<34:40:22,  7.96it/s]

{'loss': Array(0.295271, dtype=float32), 'loss_cross_entropy': Array(0.295271, dtype=float32)}


  1%|          | 5913/1000000 [15:07<43:12:44,  6.39it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5919/1000000 [15:07<26:50:53, 10.29it/s]

{'loss': Array(0.2802785, dtype=float32), 'loss_cross_entropy': Array(0.2802785, dtype=float32)}


  1%|          | 5921/1000000 [15:08<48:33:38,  5.69it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5930/1000000 [15:09<23:50:58, 11.58it/s]

{'loss': Array(0.29318637, dtype=float32), 'loss_cross_entropy': Array(0.29318637, dtype=float32)}


  1%|          | 5932/1000000 [15:09<44:45:57,  6.17it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5938/1000000 [15:10<28:07:27,  9.82it/s]

{'loss': Array(0.27741182, dtype=float32), 'loss_cross_entropy': Array(0.27741182, dtype=float32)}


  1%|          | 5941/1000000 [15:11<48:56:54,  5.64it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5950/1000000 [15:11<23:42:40, 11.65it/s]

{'loss': Array(0.272592, dtype=float32), 'loss_cross_entropy': Array(0.272592, dtype=float32)}


  1%|          | 5952/1000000 [15:12<45:48:18,  6.03it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5958/1000000 [15:12<27:43:59,  9.96it/s]

{'loss': Array(0.29744262, dtype=float32), 'loss_cross_entropy': Array(0.29744262, dtype=float32)}


  1%|          | 5961/1000000 [15:13<46:21:27,  5.96it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5969/1000000 [15:14<33:11:34,  8.32it/s]

{'loss': Array(0.28908682, dtype=float32), 'loss_cross_entropy': Array(0.28908682, dtype=float32)}


  1%|          | 5971/1000000 [15:15<55:41:32,  4.96it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5980/1000000 [15:15<26:01:08, 10.61it/s]

{'loss': Array(0.31244612, dtype=float32), 'loss_cross_entropy': Array(0.31244612, dtype=float32)}


  1%|          | 5982/1000000 [15:16<49:50:20,  5.54it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 5989/1000000 [15:17<26:34:25, 10.39it/s]

{'loss': Array(0.28297594, dtype=float32), 'loss_cross_entropy': Array(0.28297594, dtype=float32)}


  1%|          | 5991/1000000 [15:18<52:15:20,  5.28it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6000/1000000 [15:18<24:30:27, 11.27it/s]

{'loss': Array(0.28776985, dtype=float32), 'loss_cross_entropy': Array(0.28776985, dtype=float32)}


  1%|          | 6002/1000000 [15:26<261:55:21,  1.05it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6008/1000000 [15:26<124:46:57,  2.21it/s]

{'loss': Array(0.29544076, dtype=float32), 'loss_cross_entropy': Array(0.29544076, dtype=float32)}


  1%|          | 6013/1000000 [15:27<89:13:39,  3.09it/s] 

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6019/1000000 [15:27<44:43:19,  6.17it/s]

{'loss': Array(0.29448667, dtype=float32), 'loss_cross_entropy': Array(0.29448667, dtype=float32)}


  1%|          | 6023/1000000 [15:28<52:15:44,  5.28it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6029/1000000 [15:29<44:20:39,  6.23it/s]

{'loss': Array(0.2739917, dtype=float32), 'loss_cross_entropy': Array(0.2739917, dtype=float32)}


  1%|          | 6031/1000000 [15:30<68:54:19,  4.01it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6040/1000000 [15:30<27:44:38,  9.95it/s]

{'loss': Array(0.27322102, dtype=float32), 'loss_cross_entropy': Array(0.27322102, dtype=float32)}


  1%|          | 6042/1000000 [15:31<48:38:03,  5.68it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6048/1000000 [15:32<28:52:29,  9.56it/s]

{'loss': Array(0.3063297, dtype=float32), 'loss_cross_entropy': Array(0.3063297, dtype=float32)}


  1%|          | 6053/1000000 [15:33<38:53:41,  7.10it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6059/1000000 [15:33<24:44:58, 11.16it/s]

{'loss': Array(0.31210515, dtype=float32), 'loss_cross_entropy': Array(0.31210515, dtype=float32)}


  1%|          | 6061/1000000 [15:34<46:45:31,  5.90it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6069/1000000 [15:34<24:53:26, 11.09it/s]

{'loss': Array(0.28466687, dtype=float32), 'loss_cross_entropy': Array(0.28466687, dtype=float32)}


  1%|          | 6071/1000000 [15:35<49:47:32,  5.54it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6080/1000000 [15:35<23:55:41, 11.54it/s]

{'loss': Array(0.2745562, dtype=float32), 'loss_cross_entropy': Array(0.2745562, dtype=float32)}


  1%|          | 6082/1000000 [15:36<45:31:25,  6.06it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6090/1000000 [15:37<43:00:42,  6.42it/s]

{'loss': Array(0.29115245, dtype=float32), 'loss_cross_entropy': Array(0.29115245, dtype=float32)}


  1%|          | 6092/1000000 [15:38<61:22:43,  4.50it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6098/1000000 [15:38<33:59:13,  8.12it/s]

{'loss': Array(0.29313245, dtype=float32), 'loss_cross_entropy': Array(0.29313245, dtype=float32)}


  1%|          | 6101/1000000 [15:39<48:38:26,  5.68it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6109/1000000 [15:40<27:01:11, 10.22it/s]

{'loss': Array(0.28787953, dtype=float32), 'loss_cross_entropy': Array(0.28787953, dtype=float32)}


  1%|          | 6113/1000000 [15:41<40:26:46,  6.83it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6119/1000000 [15:41<24:01:14, 11.49it/s]

{'loss': Array(0.28185943, dtype=float32), 'loss_cross_entropy': Array(0.28185943, dtype=float32)}


  1%|          | 6121/1000000 [15:42<53:53:05,  5.12it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6130/1000000 [15:42<23:51:42, 11.57it/s]

{'loss': Array(0.2982572, dtype=float32), 'loss_cross_entropy': Array(0.2982572, dtype=float32)}


  1%|          | 6132/1000000 [15:43<47:11:18,  5.85it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6138/1000000 [15:43<28:07:47,  9.81it/s]

{'loss': Array(0.2792264, dtype=float32), 'loss_cross_entropy': Array(0.2792264, dtype=float32)}


  1%|          | 6143/1000000 [15:44<39:03:46,  7.07it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6150/1000000 [15:45<23:47:05, 11.61it/s]

{'loss': Array(0.2651924, dtype=float32), 'loss_cross_entropy': Array(0.2651924, dtype=float32)}


  1%|          | 6152/1000000 [15:46<48:24:04,  5.70it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6160/1000000 [15:47<32:43:18,  8.44it/s]

{'loss': Array(0.28451863, dtype=float32), 'loss_cross_entropy': Array(0.28451863, dtype=float32)}


  1%|          | 6162/1000000 [15:47<53:18:59,  5.18it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6168/1000000 [15:48<30:42:26,  8.99it/s]

{'loss': Array(0.2730394, dtype=float32), 'loss_cross_entropy': Array(0.2730394, dtype=float32)}


  1%|          | 6171/1000000 [15:49<46:48:19,  5.90it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6180/1000000 [15:49<24:05:25, 11.46it/s]

{'loss': Array(0.3029679, dtype=float32), 'loss_cross_entropy': Array(0.3029679, dtype=float32)}


  1%|          | 6182/1000000 [15:50<47:04:55,  5.86it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6188/1000000 [15:50<28:49:35,  9.58it/s]

{'loss': Array(0.2890912, dtype=float32), 'loss_cross_entropy': Array(0.2890912, dtype=float32)}


  1%|          | 6191/1000000 [15:51<45:41:02,  6.04it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6200/1000000 [15:52<24:06:03, 11.45it/s]

{'loss': Array(0.28226033, dtype=float32), 'loss_cross_entropy': Array(0.28226033, dtype=float32)}


  1%|          | 6202/1000000 [15:52<43:52:08,  6.29it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6208/1000000 [15:53<27:22:29, 10.08it/s]

{'loss': Array(0.26555002, dtype=float32), 'loss_cross_entropy': Array(0.26555002, dtype=float32)}


  1%|          | 6211/1000000 [15:54<44:09:58,  6.25it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6219/1000000 [15:55<37:00:51,  7.46it/s]

{'loss': Array(0.28405333, dtype=float32), 'loss_cross_entropy': Array(0.28405333, dtype=float32)}


  1%|          | 6223/1000000 [15:56<46:32:49,  5.93it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6229/1000000 [15:56<27:46:48,  9.94it/s]

{'loss': Array(0.25939572, dtype=float32), 'loss_cross_entropy': Array(0.25939572, dtype=float32)}


  1%|          | 6231/1000000 [15:57<50:24:29,  5.48it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6239/1000000 [15:57<25:56:52, 10.64it/s]

{'loss': Array(0.28000852, dtype=float32), 'loss_cross_entropy': Array(0.28000852, dtype=float32)}


  1%|          | 6243/1000000 [15:58<40:23:19,  6.83it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6249/1000000 [15:58<24:40:07, 11.19it/s]

{'loss': Array(0.26272327, dtype=float32), 'loss_cross_entropy': Array(0.26272327, dtype=float32)}


  1%|          | 6253/1000000 [15:59<40:05:36,  6.88it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6258/1000000 [16:00<28:13:11,  9.78it/s]

{'loss': Array(0.26723516, dtype=float32), 'loss_cross_entropy': Array(0.26723516, dtype=float32)}


  1%|          | 6261/1000000 [16:01<46:44:52,  5.90it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6270/1000000 [16:01<23:26:49, 11.77it/s]

{'loss': Array(0.2867531, dtype=float32), 'loss_cross_entropy': Array(0.2867531, dtype=float32)}


  1%|          | 6272/1000000 [16:02<45:57:52,  6.01it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6280/1000000 [16:03<42:57:30,  6.43it/s]

{'loss': Array(0.29346058, dtype=float32), 'loss_cross_entropy': Array(0.29346058, dtype=float32)}


  1%|          | 6282/1000000 [16:04<61:08:08,  4.52it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6289/1000000 [16:04<29:30:08,  9.36it/s]

{'loss': Array(0.29770413, dtype=float32), 'loss_cross_entropy': Array(0.29770413, dtype=float32)}


  1%|          | 6291/1000000 [16:05<54:05:30,  5.10it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6300/1000000 [16:05<24:43:12, 11.17it/s]

{'loss': Array(0.29329434, dtype=float32), 'loss_cross_entropy': Array(0.29329434, dtype=float32)}


  1%|          | 6302/1000000 [16:06<47:30:10,  5.81it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6310/1000000 [16:07<25:02:07, 11.03it/s]

{'loss': Array(0.29475188, dtype=float32), 'loss_cross_entropy': Array(0.29475188, dtype=float32)}


  1%|          | 6312/1000000 [16:07<47:58:35,  5.75it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6318/1000000 [16:08<28:25:50,  9.71it/s]

{'loss': Array(0.28131774, dtype=float32), 'loss_cross_entropy': Array(0.28131774, dtype=float32)}


  1%|          | 6321/1000000 [16:09<45:16:12,  6.10it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6330/1000000 [16:09<23:25:12, 11.79it/s]

{'loss': Array(0.3072703, dtype=float32), 'loss_cross_entropy': Array(0.3072703, dtype=float32)}


  1%|          | 6332/1000000 [16:10<45:38:42,  6.05it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6338/1000000 [16:10<28:07:35,  9.81it/s]

{'loss': Array(0.28094006, dtype=float32), 'loss_cross_entropy': Array(0.28094006, dtype=float32)}


  1%|          | 6341/1000000 [16:11<45:13:11,  6.10it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6349/1000000 [16:12<32:37:39,  8.46it/s]

{'loss': Array(0.27972114, dtype=float32), 'loss_cross_entropy': Array(0.27972114, dtype=float32)}


  1%|          | 6351/1000000 [16:13<52:20:31,  5.27it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6360/1000000 [16:13<25:02:58, 11.02it/s]

{'loss': Array(0.2904676, dtype=float32), 'loss_cross_entropy': Array(0.2904676, dtype=float32)}


  1%|          | 6362/1000000 [16:14<46:54:19,  5.88it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6370/1000000 [16:15<25:41:02, 10.75it/s]

{'loss': Array(0.29732057, dtype=float32), 'loss_cross_entropy': Array(0.29732057, dtype=float32)}


  1%|          | 6372/1000000 [16:15<49:35:51,  5.56it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6378/1000000 [16:16<28:58:01,  9.53it/s]

{'loss': Array(0.26104587, dtype=float32), 'loss_cross_entropy': Array(0.26104587, dtype=float32)}


  1%|          | 6383/1000000 [16:17<38:27:48,  7.18it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6389/1000000 [16:17<24:30:47, 11.26it/s]

{'loss': Array(0.27821296, dtype=float32), 'loss_cross_entropy': Array(0.27821296, dtype=float32)}


  1%|          | 6391/1000000 [16:18<46:04:59,  5.99it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6400/1000000 [16:18<23:06:41, 11.94it/s]

{'loss': Array(0.26739916, dtype=float32), 'loss_cross_entropy': Array(0.26739916, dtype=float32)}


  1%|          | 6402/1000000 [16:19<45:04:11,  6.12it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6410/1000000 [16:20<37:16:33,  7.40it/s]

{'loss': Array(0.29322052, dtype=float32), 'loss_cross_entropy': Array(0.29322052, dtype=float32)}


  1%|          | 6412/1000000 [16:21<57:16:55,  4.82it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6418/1000000 [16:21<32:28:42,  8.50it/s]

{'loss': Array(0.29283637, dtype=float32), 'loss_cross_entropy': Array(0.29283637, dtype=float32)}


  1%|          | 6421/1000000 [16:22<49:27:13,  5.58it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6429/1000000 [16:23<27:03:27, 10.20it/s]

{'loss': Array(0.2763302, dtype=float32), 'loss_cross_entropy': Array(0.2763302, dtype=float32)}


  1%|          | 6431/1000000 [16:23<49:37:13,  5.56it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6440/1000000 [16:24<24:06:48, 11.45it/s]

{'loss': Array(0.285546, dtype=float32), 'loss_cross_entropy': Array(0.285546, dtype=float32)}


  1%|          | 6442/1000000 [16:25<47:35:58,  5.80it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6448/1000000 [16:25<28:42:01,  9.62it/s]

{'loss': Array(0.31277332, dtype=float32), 'loss_cross_entropy': Array(0.31277332, dtype=float32)}


  1%|          | 6451/1000000 [16:26<47:39:00,  5.79it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6459/1000000 [16:26<25:37:22, 10.77it/s]

{'loss': Array(0.28810188, dtype=float32), 'loss_cross_entropy': Array(0.28810188, dtype=float32)}


  1%|          | 6463/1000000 [16:27<40:47:25,  6.77it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6469/1000000 [16:28<24:38:57, 11.20it/s]

{'loss': Array(0.27244166, dtype=float32), 'loss_cross_entropy': Array(0.27244166, dtype=float32)}


  1%|          | 6473/1000000 [16:29<58:21:03,  4.73it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6479/1000000 [16:30<31:26:26,  8.78it/s]

{'loss': Array(0.29313484, dtype=float32), 'loss_cross_entropy': Array(0.29313484, dtype=float32)}


  1%|          | 6481/1000000 [16:31<55:39:39,  4.96it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6490/1000000 [16:31<24:11:23, 11.41it/s]

{'loss': Array(0.29426718, dtype=float32), 'loss_cross_entropy': Array(0.29426718, dtype=float32)}


  1%|          | 6492/1000000 [16:32<46:15:30,  5.97it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6498/1000000 [16:32<27:33:25, 10.01it/s]

{'loss': Array(0.28578398, dtype=float32), 'loss_cross_entropy': Array(0.28578398, dtype=float32)}


  1%|          | 6501/1000000 [16:40<240:53:14,  1.15it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6509/1000000 [16:40<92:20:58,  2.99it/s] 

{'loss': Array(0.28997627, dtype=float32), 'loss_cross_entropy': Array(0.28997627, dtype=float32)}


  1%|          | 6511/1000000 [16:41<97:20:18,  2.84it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6520/1000000 [16:41<38:56:32,  7.09it/s]

{'loss': Array(0.28231484, dtype=float32), 'loss_cross_entropy': Array(0.28231484, dtype=float32)}


  1%|          | 6522/1000000 [16:42<56:18:41,  4.90it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6528/1000000 [16:42<32:39:17,  8.45it/s]

{'loss': Array(0.27516142, dtype=float32), 'loss_cross_entropy': Array(0.27516142, dtype=float32)}


  1%|          | 6531/1000000 [16:43<47:53:58,  5.76it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6539/1000000 [16:44<33:20:56,  8.27it/s]

{'loss': Array(0.27582386, dtype=float32), 'loss_cross_entropy': Array(0.27582386, dtype=float32)}


  1%|          | 6543/1000000 [16:45<44:39:24,  6.18it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6549/1000000 [16:46<27:02:30, 10.20it/s]

{'loss': Array(0.27387112, dtype=float32), 'loss_cross_entropy': Array(0.27387112, dtype=float32)}


  1%|          | 6553/1000000 [16:47<40:48:42,  6.76it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6559/1000000 [16:47<25:12:23, 10.95it/s]

{'loss': Array(0.26333168, dtype=float32), 'loss_cross_entropy': Array(0.26333168, dtype=float32)}


  1%|          | 6561/1000000 [16:48<49:01:30,  5.63it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6570/1000000 [16:48<23:50:24, 11.58it/s]

{'loss': Array(0.30164745, dtype=float32), 'loss_cross_entropy': Array(0.30164745, dtype=float32)}


  1%|          | 6572/1000000 [16:49<45:43:45,  6.03it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6578/1000000 [16:49<27:47:16,  9.93it/s]

{'loss': Array(0.27747595, dtype=float32), 'loss_cross_entropy': Array(0.27747595, dtype=float32)}


  1%|          | 6581/1000000 [16:50<46:26:04,  5.94it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6590/1000000 [16:51<24:14:31, 11.38it/s]

{'loss': Array(0.27204362, dtype=float32), 'loss_cross_entropy': Array(0.27204362, dtype=float32)}


  1%|          | 6592/1000000 [16:51<45:03:40,  6.12it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6600/1000000 [16:52<36:12:39,  7.62it/s]

{'loss': Array(0.27386183, dtype=float32), 'loss_cross_entropy': Array(0.27386183, dtype=float32)}


  1%|          | 6602/1000000 [16:53<57:30:58,  4.80it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6610/1000000 [16:54<27:51:21,  9.91it/s]

{'loss': Array(0.28009713, dtype=float32), 'loss_cross_entropy': Array(0.28009713, dtype=float32)}


  1%|          | 6612/1000000 [16:55<53:46:13,  5.13it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6618/1000000 [16:55<30:53:17,  8.93it/s]

{'loss': Array(0.27795085, dtype=float32), 'loss_cross_entropy': Array(0.27795085, dtype=float32)}


  1%|          | 6621/1000000 [16:56<47:04:43,  5.86it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6630/1000000 [16:56<24:09:40, 11.42it/s]

{'loss': Array(0.27995262, dtype=float32), 'loss_cross_entropy': Array(0.27995262, dtype=float32)}


  1%|          | 6632/1000000 [16:57<45:07:27,  6.12it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6640/1000000 [16:58<24:32:00, 11.25it/s]

{'loss': Array(0.29432526, dtype=float32), 'loss_cross_entropy': Array(0.29432526, dtype=float32)}


  1%|          | 6642/1000000 [16:58<46:55:08,  5.88it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6648/1000000 [16:59<27:56:49,  9.87it/s]

{'loss': Array(0.28966627, dtype=float32), 'loss_cross_entropy': Array(0.28966627, dtype=float32)}


  1%|          | 6651/1000000 [17:00<45:02:22,  6.13it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6659/1000000 [17:00<24:56:29, 11.06it/s]

{'loss': Array(0.28702173, dtype=float32), 'loss_cross_entropy': Array(0.28702173, dtype=float32)}


  1%|          | 6661/1000000 [17:01<65:41:21,  4.20it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6670/1000000 [17:02<29:10:26,  9.46it/s]

{'loss': Array(0.2764117, dtype=float32), 'loss_cross_entropy': Array(0.2764117, dtype=float32)}


  1%|          | 6672/1000000 [17:03<49:08:19,  5.62it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6678/1000000 [17:03<29:17:21,  9.42it/s]

{'loss': Array(0.27208632, dtype=float32), 'loss_cross_entropy': Array(0.27208632, dtype=float32)}


  1%|          | 6681/1000000 [17:04<46:00:25,  6.00it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6690/1000000 [17:04<24:16:19, 11.37it/s]

{'loss': Array(0.27110884, dtype=float32), 'loss_cross_entropy': Array(0.27110884, dtype=float32)}


  1%|          | 6692/1000000 [17:05<46:47:28,  5.90it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6698/1000000 [17:05<28:30:40,  9.68it/s]

{'loss': Array(0.2858667, dtype=float32), 'loss_cross_entropy': Array(0.2858667, dtype=float32)}


  1%|          | 6701/1000000 [17:06<45:16:35,  6.09it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6710/1000000 [17:07<23:56:35, 11.52it/s]

{'loss': Array(0.2845405, dtype=float32), 'loss_cross_entropy': Array(0.2845405, dtype=float32)}


  1%|          | 6712/1000000 [17:08<45:13:44,  6.10it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6718/1000000 [17:08<27:51:30,  9.90it/s]

{'loss': Array(0.29538012, dtype=float32), 'loss_cross_entropy': Array(0.29538012, dtype=float32)}


  1%|          | 6721/1000000 [17:09<44:14:01,  6.24it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6730/1000000 [17:10<29:43:39,  9.28it/s]

{'loss': Array(0.30579448, dtype=float32), 'loss_cross_entropy': Array(0.30579448, dtype=float32)}


  1%|          | 6732/1000000 [17:11<54:18:36,  5.08it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6738/1000000 [17:11<30:52:11,  8.94it/s]

{'loss': Array(0.28969312, dtype=float32), 'loss_cross_entropy': Array(0.28969312, dtype=float32)}


  1%|          | 6741/1000000 [17:12<48:28:33,  5.69it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6750/1000000 [17:12<24:26:56, 11.28it/s]

{'loss': Array(0.29578507, dtype=float32), 'loss_cross_entropy': Array(0.29578507, dtype=float32)}


  1%|          | 6752/1000000 [17:13<45:09:32,  6.11it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6758/1000000 [17:14<27:57:01,  9.87it/s]

{'loss': Array(0.27552438, dtype=float32), 'loss_cross_entropy': Array(0.27552438, dtype=float32)}


  1%|          | 6761/1000000 [17:15<47:10:20,  5.85it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6768/1000000 [17:15<27:10:18, 10.15it/s]

{'loss': Array(0.29201517, dtype=float32), 'loss_cross_entropy': Array(0.29201517, dtype=float32)}


  1%|          | 6771/1000000 [17:16<46:20:13,  5.95it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6780/1000000 [17:16<23:42:22, 11.64it/s]

{'loss': Array(0.27694008, dtype=float32), 'loss_cross_entropy': Array(0.27694008, dtype=float32)}


  1%|          | 6782/1000000 [17:17<46:00:26,  6.00it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6790/1000000 [17:18<36:46:27,  7.50it/s]

{'loss': Array(0.26246247, dtype=float32), 'loss_cross_entropy': Array(0.26246247, dtype=float32)}


  1%|          | 6792/1000000 [17:19<57:02:31,  4.84it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6798/1000000 [17:19<32:23:53,  8.52it/s]

{'loss': Array(0.3039352, dtype=float32), 'loss_cross_entropy': Array(0.3039352, dtype=float32)}


  1%|          | 6801/1000000 [17:20<50:28:07,  5.47it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6810/1000000 [17:21<25:34:01, 10.79it/s]

{'loss': Array(0.29956087, dtype=float32), 'loss_cross_entropy': Array(0.29956087, dtype=float32)}


  1%|          | 6812/1000000 [17:22<46:54:57,  5.88it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6818/1000000 [17:22<28:43:48,  9.60it/s]

{'loss': Array(0.2980213, dtype=float32), 'loss_cross_entropy': Array(0.2980213, dtype=float32)}


  1%|          | 6821/1000000 [17:23<45:34:33,  6.05it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6829/1000000 [17:23<25:25:35, 10.85it/s]

{'loss': Array(0.28852764, dtype=float32), 'loss_cross_entropy': Array(0.28852764, dtype=float32)}


  1%|          | 6831/1000000 [17:24<48:20:41,  5.71it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6840/1000000 [17:24<23:39:45, 11.66it/s]

{'loss': Array(0.28443852, dtype=float32), 'loss_cross_entropy': Array(0.28443852, dtype=float32)}


  1%|          | 6842/1000000 [17:25<48:03:29,  5.74it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6850/1000000 [17:26<43:40:43,  6.32it/s]

{'loss': Array(0.29709485, dtype=float32), 'loss_cross_entropy': Array(0.29709485, dtype=float32)}


  1%|          | 6852/1000000 [17:27<64:14:06,  4.29it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6860/1000000 [17:28<28:47:58,  9.58it/s]

{'loss': Array(0.28378123, dtype=float32), 'loss_cross_entropy': Array(0.28378123, dtype=float32)}


  1%|          | 6862/1000000 [17:29<57:24:52,  4.80it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6870/1000000 [17:29<25:24:08, 10.86it/s]

{'loss': Array(0.26502135, dtype=float32), 'loss_cross_entropy': Array(0.26502135, dtype=float32)}


  1%|          | 6872/1000000 [17:30<56:46:26,  4.86it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6880/1000000 [17:30<24:47:05, 11.13it/s]

{'loss': Array(0.29494572, dtype=float32), 'loss_cross_entropy': Array(0.29494572, dtype=float32)}


  1%|          | 6882/1000000 [17:31<52:51:18,  5.22it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6890/1000000 [17:32<25:00:19, 11.03it/s]

{'loss': Array(0.27928087, dtype=float32), 'loss_cross_entropy': Array(0.27928087, dtype=float32)}


  1%|          | 6892/1000000 [17:32<50:37:53,  5.45it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6900/1000000 [17:33<24:30:26, 11.26it/s]

{'loss': Array(0.2797897, dtype=float32), 'loss_cross_entropy': Array(0.2797897, dtype=float32)}


  1%|          | 6902/1000000 [17:34<48:43:09,  5.66it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6908/1000000 [17:34<28:52:01,  9.56it/s]

{'loss': Array(0.29854763, dtype=float32), 'loss_cross_entropy': Array(0.29854763, dtype=float32)}


  1%|          | 6911/1000000 [17:35<48:50:57,  5.65it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6919/1000000 [17:36<33:51:30,  8.15it/s]

{'loss': Array(0.2747138, dtype=float32), 'loss_cross_entropy': Array(0.2747138, dtype=float32)}


  1%|          | 6921/1000000 [17:37<54:06:28,  5.10it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6930/1000000 [17:37<25:31:07, 10.81it/s]

{'loss': Array(0.2920757, dtype=float32), 'loss_cross_entropy': Array(0.2920757, dtype=float32)}


  1%|          | 6932/1000000 [17:38<47:16:23,  5.84it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6940/1000000 [17:39<25:01:54, 11.02it/s]

{'loss': Array(0.28249937, dtype=float32), 'loss_cross_entropy': Array(0.28249937, dtype=float32)}


  1%|          | 6942/1000000 [17:39<47:15:23,  5.84it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6950/1000000 [17:40<25:24:00, 10.86it/s]

{'loss': Array(0.2952109, dtype=float32), 'loss_cross_entropy': Array(0.2952109, dtype=float32)}


  1%|          | 6952/1000000 [17:41<48:23:24,  5.70it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6958/1000000 [17:41<28:24:50,  9.71it/s]

{'loss': Array(0.27962983, dtype=float32), 'loss_cross_entropy': Array(0.27962983, dtype=float32)}


  1%|          | 6963/1000000 [17:42<38:36:27,  7.14it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6968/1000000 [17:42<26:01:16, 10.60it/s]

{'loss': Array(0.27880773, dtype=float32), 'loss_cross_entropy': Array(0.27880773, dtype=float32)}


  1%|          | 6971/1000000 [17:43<45:34:50,  6.05it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6979/1000000 [17:44<36:37:29,  7.53it/s]

{'loss': Array(0.28606635, dtype=float32), 'loss_cross_entropy': Array(0.28606635, dtype=float32)}


  1%|          | 6981/1000000 [17:45<57:40:23,  4.78it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6990/1000000 [17:45<26:48:31, 10.29it/s]

{'loss': Array(0.2822549, dtype=float32), 'loss_cross_entropy': Array(0.2822549, dtype=float32)}


  1%|          | 6992/1000000 [17:46<49:19:30,  5.59it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 6998/1000000 [17:47<29:36:11,  9.32it/s]

{'loss': Array(0.28220513, dtype=float32), 'loss_cross_entropy': Array(0.28220513, dtype=float32)}


  1%|          | 7001/1000000 [17:54<234:16:28,  1.18it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7010/1000000 [17:54<86:29:25,  3.19it/s] 

{'loss': Array(0.26223096, dtype=float32), 'loss_cross_entropy': Array(0.26223096, dtype=float32)}


  1%|          | 7012/1000000 [17:55<95:48:56,  2.88it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7018/1000000 [17:56<51:11:42,  5.39it/s]

{'loss': Array(0.29157737, dtype=float32), 'loss_cross_entropy': Array(0.29157737, dtype=float32)}


  1%|          | 7023/1000000 [17:57<50:10:34,  5.50it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7029/1000000 [17:57<30:03:27,  9.18it/s]

{'loss': Array(0.3063986, dtype=float32), 'loss_cross_entropy': Array(0.3063986, dtype=float32)}


  1%|          | 7033/1000000 [17:58<41:55:40,  6.58it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7040/1000000 [17:59<42:58:59,  6.42it/s]

{'loss': Array(0.2665698, dtype=float32), 'loss_cross_entropy': Array(0.2665698, dtype=float32)}


  1%|          | 7042/1000000 [18:00<63:14:10,  4.36it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7048/1000000 [18:00<34:21:15,  8.03it/s]

{'loss': Array(0.2853247, dtype=float32), 'loss_cross_entropy': Array(0.2853247, dtype=float32)}


  1%|          | 7051/1000000 [18:01<51:52:43,  5.32it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7060/1000000 [18:01<25:37:49, 10.76it/s]

{'loss': Array(0.26847106, dtype=float32), 'loss_cross_entropy': Array(0.26847106, dtype=float32)}


  1%|          | 7062/1000000 [18:02<47:48:48,  5.77it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7068/1000000 [18:03<29:09:21,  9.46it/s]

{'loss': Array(0.27326676, dtype=float32), 'loss_cross_entropy': Array(0.27326676, dtype=float32)}


  1%|          | 7073/1000000 [18:04<39:11:14,  7.04it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7079/1000000 [18:04<25:09:27, 10.96it/s]

{'loss': Array(0.28580856, dtype=float32), 'loss_cross_entropy': Array(0.28580856, dtype=float32)}


  1%|          | 7083/1000000 [18:05<40:36:55,  6.79it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7088/1000000 [18:05<26:18:52, 10.48it/s]

{'loss': Array(0.27005854, dtype=float32), 'loss_cross_entropy': Array(0.27005854, dtype=float32)}


  1%|          | 7091/1000000 [18:06<45:15:04,  6.10it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7100/1000000 [18:06<23:18:29, 11.83it/s]

{'loss': Array(0.2753671, dtype=float32), 'loss_cross_entropy': Array(0.2753671, dtype=float32)}


  1%|          | 7102/1000000 [18:07<45:01:09,  6.13it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7110/1000000 [18:08<32:12:07,  8.56it/s]

{'loss': Array(0.30057976, dtype=float32), 'loss_cross_entropy': Array(0.30057976, dtype=float32)}


  1%|          | 7112/1000000 [18:09<52:48:25,  5.22it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7120/1000000 [18:10<27:10:50, 10.15it/s]

{'loss': Array(0.29561868, dtype=float32), 'loss_cross_entropy': Array(0.29561868, dtype=float32)}


  1%|          | 7122/1000000 [18:10<50:58:12,  5.41it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7130/1000000 [18:11<24:52:16, 11.09it/s]

{'loss': Array(0.28786427, dtype=float32), 'loss_cross_entropy': Array(0.28786427, dtype=float32)}


  1%|          | 7132/1000000 [18:12<49:19:46,  5.59it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7140/1000000 [18:12<24:51:56, 11.09it/s]

{'loss': Array(0.2902144, dtype=float32), 'loss_cross_entropy': Array(0.2902144, dtype=float32)}


  1%|          | 7142/1000000 [18:13<47:43:56,  5.78it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7150/1000000 [18:13<25:01:49, 11.02it/s]

{'loss': Array(0.2832326, dtype=float32), 'loss_cross_entropy': Array(0.2832326, dtype=float32)}


  1%|          | 7152/1000000 [18:14<48:29:16,  5.69it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7160/1000000 [18:15<26:37:51, 10.36it/s]

{'loss': Array(0.30596453, dtype=float32), 'loss_cross_entropy': Array(0.30596453, dtype=float32)}


  1%|          | 7162/1000000 [18:16<51:38:59,  5.34it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7170/1000000 [18:17<38:28:03,  7.17it/s]

{'loss': Array(0.27298874, dtype=float32), 'loss_cross_entropy': Array(0.27298874, dtype=float32)}


  1%|          | 7172/1000000 [18:17<58:28:36,  4.72it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7180/1000000 [18:18<27:30:45, 10.02it/s]

{'loss': Array(0.29581204, dtype=float32), 'loss_cross_entropy': Array(0.29581204, dtype=float32)}


  1%|          | 7182/1000000 [18:19<52:11:26,  5.28it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7188/1000000 [18:19<30:18:30,  9.10it/s]

{'loss': Array(0.2856735, dtype=float32), 'loss_cross_entropy': Array(0.2856735, dtype=float32)}


  1%|          | 7191/1000000 [18:20<48:07:57,  5.73it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7200/1000000 [18:20<24:28:49, 11.27it/s]

{'loss': Array(0.26624528, dtype=float32), 'loss_cross_entropy': Array(0.26624528, dtype=float32)}


  1%|          | 7202/1000000 [18:21<45:29:21,  6.06it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7210/1000000 [18:22<24:54:13, 11.07it/s]

{'loss': Array(0.27293968, dtype=float32), 'loss_cross_entropy': Array(0.27293968, dtype=float32)}


  1%|          | 7212/1000000 [18:22<48:18:19,  5.71it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7220/1000000 [18:23<24:06:24, 11.44it/s]

{'loss': Array(0.29612204, dtype=float32), 'loss_cross_entropy': Array(0.29612204, dtype=float32)}


  1%|          | 7222/1000000 [18:24<47:05:45,  5.86it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7230/1000000 [18:25<42:28:57,  6.49it/s]

{'loss': Array(0.27491102, dtype=float32), 'loss_cross_entropy': Array(0.27491102, dtype=float32)}


  1%|          | 7232/1000000 [18:26<60:49:16,  4.53it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7238/1000000 [18:26<34:17:40,  8.04it/s]

{'loss': Array(0.30263412, dtype=float32), 'loss_cross_entropy': Array(0.30263412, dtype=float32)}


  1%|          | 7241/1000000 [18:27<49:20:07,  5.59it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7250/1000000 [18:27<25:16:57, 10.91it/s]

{'loss': Array(0.2938343, dtype=float32), 'loss_cross_entropy': Array(0.2938343, dtype=float32)}


  1%|          | 7252/1000000 [18:28<48:36:08,  5.67it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7260/1000000 [18:29<24:26:21, 11.28it/s]

{'loss': Array(0.27701107, dtype=float32), 'loss_cross_entropy': Array(0.27701107, dtype=float32)}


  1%|          | 7262/1000000 [18:29<52:48:32,  5.22it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7270/1000000 [18:30<25:27:22, 10.83it/s]

{'loss': Array(0.2899302, dtype=float32), 'loss_cross_entropy': Array(0.2899302, dtype=float32)}


  1%|          | 7272/1000000 [18:31<55:35:34,  4.96it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7278/1000000 [18:31<29:39:42,  9.30it/s]

{'loss': Array(0.28115487, dtype=float32), 'loss_cross_entropy': Array(0.28115487, dtype=float32)}


  1%|          | 7281/1000000 [18:32<47:51:18,  5.76it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7289/1000000 [18:32<25:08:19, 10.97it/s]

{'loss': Array(0.27087894, dtype=float32), 'loss_cross_entropy': Array(0.27087894, dtype=float32)}


  1%|          | 7291/1000000 [18:33<46:15:48,  5.96it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7299/1000000 [18:34<32:11:57,  8.56it/s]

{'loss': Array(0.2792718, dtype=float32), 'loss_cross_entropy': Array(0.2792718, dtype=float32)}


  1%|          | 7301/1000000 [18:35<54:43:42,  5.04it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7310/1000000 [18:35<25:44:33, 10.71it/s]

{'loss': Array(0.28554818, dtype=float32), 'loss_cross_entropy': Array(0.28554818, dtype=float32)}


  1%|          | 7312/1000000 [18:36<48:15:04,  5.71it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7319/1000000 [18:37<25:55:53, 10.63it/s]

{'loss': Array(0.298478, dtype=float32), 'loss_cross_entropy': Array(0.298478, dtype=float32)}


  1%|          | 7323/1000000 [18:38<41:22:44,  6.66it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7329/1000000 [18:38<25:07:59, 10.97it/s]

{'loss': Array(0.28392044, dtype=float32), 'loss_cross_entropy': Array(0.28392044, dtype=float32)}


  1%|          | 7331/1000000 [18:39<49:00:57,  5.63it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7340/1000000 [18:39<23:43:44, 11.62it/s]

{'loss': Array(0.26216003, dtype=float32), 'loss_cross_entropy': Array(0.26216003, dtype=float32)}


  1%|          | 7342/1000000 [18:40<47:01:15,  5.86it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7348/1000000 [18:41<28:34:13,  9.65it/s]

{'loss': Array(0.28853136, dtype=float32), 'loss_cross_entropy': Array(0.28853136, dtype=float32)}


  1%|          | 7353/1000000 [18:41<37:24:53,  7.37it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7359/1000000 [18:42<35:59:43,  7.66it/s]

{'loss': Array(0.27531832, dtype=float32), 'loss_cross_entropy': Array(0.27531832, dtype=float32)}


  1%|          | 7361/1000000 [18:43<55:31:37,  4.97it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7370/1000000 [18:44<26:20:15, 10.47it/s]

{'loss': Array(0.2832301, dtype=float32), 'loss_cross_entropy': Array(0.2832301, dtype=float32)}


  1%|          | 7372/1000000 [18:45<48:44:20,  5.66it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7380/1000000 [18:45<25:31:32, 10.80it/s]

{'loss': Array(0.28573105, dtype=float32), 'loss_cross_entropy': Array(0.28573105, dtype=float32)}


  1%|          | 7382/1000000 [18:46<46:51:38,  5.88it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7388/1000000 [18:46<28:03:00,  9.83it/s]

{'loss': Array(0.28257394, dtype=float32), 'loss_cross_entropy': Array(0.28257394, dtype=float32)}


  1%|          | 7391/1000000 [18:47<46:19:48,  5.95it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7400/1000000 [18:47<24:11:19, 11.40it/s]

{'loss': Array(0.27314445, dtype=float32), 'loss_cross_entropy': Array(0.27314445, dtype=float32)}


  1%|          | 7402/1000000 [18:48<45:36:46,  6.04it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7408/1000000 [18:49<28:15:47,  9.76it/s]

{'loss': Array(0.25800797, dtype=float32), 'loss_cross_entropy': Array(0.25800797, dtype=float32)}


  1%|          | 7411/1000000 [18:50<45:00:02,  6.13it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7420/1000000 [18:51<42:30:32,  6.49it/s]

{'loss': Array(0.27104387, dtype=float32), 'loss_cross_entropy': Array(0.27104387, dtype=float32)}


  1%|          | 7422/1000000 [18:51<62:27:40,  4.41it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7428/1000000 [18:52<34:00:26,  8.11it/s]

{'loss': Array(0.28459463, dtype=float32), 'loss_cross_entropy': Array(0.28459463, dtype=float32)}


  1%|          | 7433/1000000 [18:53<42:33:45,  6.48it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7440/1000000 [18:53<24:34:50, 11.22it/s]

{'loss': Array(0.26393315, dtype=float32), 'loss_cross_entropy': Array(0.26393315, dtype=float32)}


  1%|          | 7442/1000000 [18:54<51:32:30,  5.35it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7448/1000000 [18:54<29:15:42,  9.42it/s]

{'loss': Array(0.26223555, dtype=float32), 'loss_cross_entropy': Array(0.26223555, dtype=float32)}


  1%|          | 7451/1000000 [18:55<48:19:49,  5.70it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7460/1000000 [18:56<24:28:08, 11.27it/s]

{'loss': Array(0.26429138, dtype=float32), 'loss_cross_entropy': Array(0.26429138, dtype=float32)}


  1%|          | 7462/1000000 [18:57<45:47:54,  6.02it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7468/1000000 [18:57<28:17:02,  9.75it/s]

{'loss': Array(0.2848053, dtype=float32), 'loss_cross_entropy': Array(0.2848053, dtype=float32)}


  1%|          | 7471/1000000 [18:58<46:09:57,  5.97it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7480/1000000 [18:58<24:14:55, 11.37it/s]

{'loss': Array(0.29022512, dtype=float32), 'loss_cross_entropy': Array(0.29022512, dtype=float32)}


  1%|          | 7482/1000000 [18:59<44:30:55,  6.19it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7490/1000000 [19:00<32:23:29,  8.51it/s]

{'loss': Array(0.27873024, dtype=float32), 'loss_cross_entropy': Array(0.27873024, dtype=float32)}


  1%|          | 7492/1000000 [19:01<51:42:48,  5.33it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7498/1000000 [19:01<30:12:45,  9.13it/s]

{'loss': Array(0.2936655, dtype=float32), 'loss_cross_entropy': Array(0.2936655, dtype=float32)}


  1%|          | 7501/1000000 [19:09<238:58:28,  1.15it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7510/1000000 [19:09<87:50:26,  3.14it/s] 

{'loss': Array(0.27315158, dtype=float32), 'loss_cross_entropy': Array(0.27315158, dtype=float32)}


  1%|          | 7512/1000000 [19:10<97:58:28,  2.81it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7520/1000000 [19:10<41:30:02,  6.64it/s]

{'loss': Array(0.27259278, dtype=float32), 'loss_cross_entropy': Array(0.27259278, dtype=float32)}


  1%|          | 7522/1000000 [19:11<61:22:12,  4.49it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7530/1000000 [19:12<29:03:25,  9.49it/s]

{'loss': Array(0.29961193, dtype=float32), 'loss_cross_entropy': Array(0.29961193, dtype=float32)}


  1%|          | 7532/1000000 [19:13<51:21:28,  5.37it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7538/1000000 [19:13<29:55:38,  9.21it/s]

{'loss': Array(0.28447315, dtype=float32), 'loss_cross_entropy': Array(0.28447315, dtype=float32)}


  1%|          | 7543/1000000 [19:14<39:50:45,  6.92it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7548/1000000 [19:15<45:01:31,  6.12it/s]

{'loss': Array(0.28500187, dtype=float32), 'loss_cross_entropy': Array(0.28500187, dtype=float32)}


  1%|          | 7551/1000000 [19:16<57:41:22,  4.78it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7560/1000000 [19:16<27:32:44, 10.01it/s]

{'loss': Array(0.28004822, dtype=float32), 'loss_cross_entropy': Array(0.28004822, dtype=float32)}


  1%|          | 7562/1000000 [19:17<48:52:58,  5.64it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7568/1000000 [19:17<29:39:27,  9.30it/s]

{'loss': Array(0.28381678, dtype=float32), 'loss_cross_entropy': Array(0.28381678, dtype=float32)}


  1%|          | 7571/1000000 [19:18<47:35:54,  5.79it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7580/1000000 [19:19<24:44:37, 11.14it/s]

{'loss': Array(0.2688298, dtype=float32), 'loss_cross_entropy': Array(0.2688298, dtype=float32)}


  1%|          | 7582/1000000 [19:19<46:19:58,  5.95it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7589/1000000 [19:20<26:25:45, 10.43it/s]

{'loss': Array(0.26503962, dtype=float32), 'loss_cross_entropy': Array(0.26503962, dtype=float32)}


  1%|          | 7591/1000000 [19:21<49:59:11,  5.51it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7600/1000000 [19:21<23:52:55, 11.54it/s]

{'loss': Array(0.27634984, dtype=float32), 'loss_cross_entropy': Array(0.27634984, dtype=float32)}


  1%|          | 7602/1000000 [19:22<46:48:55,  5.89it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7610/1000000 [19:22<24:30:04, 11.25it/s]

{'loss': Array(0.28793633, dtype=float32), 'loss_cross_entropy': Array(0.28793633, dtype=float32)}


  1%|          | 7612/1000000 [19:24<66:37:42,  4.14it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7618/1000000 [19:24<36:34:33,  7.54it/s]

{'loss': Array(0.28002632, dtype=float32), 'loss_cross_entropy': Array(0.28002632, dtype=float32)}


  1%|          | 7621/1000000 [19:25<52:45:40,  5.22it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7630/1000000 [19:26<26:12:32, 10.52it/s]

{'loss': Array(0.28705162, dtype=float32), 'loss_cross_entropy': Array(0.28705162, dtype=float32)}


  1%|          | 7632/1000000 [19:26<45:35:42,  6.05it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7639/1000000 [19:27<25:09:21, 10.96it/s]

{'loss': Array(0.28240174, dtype=float32), 'loss_cross_entropy': Array(0.28240174, dtype=float32)}


  1%|          | 7643/1000000 [19:28<40:19:28,  6.84it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7649/1000000 [19:28<23:53:51, 11.53it/s]

{'loss': Array(0.2919195, dtype=float32), 'loss_cross_entropy': Array(0.2919195, dtype=float32)}


  1%|          | 7653/1000000 [19:29<42:52:20,  6.43it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7659/1000000 [19:29<24:13:53, 11.38it/s]

{'loss': Array(0.26061222, dtype=float32), 'loss_cross_entropy': Array(0.26061222, dtype=float32)}


  1%|          | 7663/1000000 [19:30<43:24:01,  6.35it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7669/1000000 [19:31<24:44:52, 11.14it/s]

{'loss': Array(0.2777585, dtype=float32), 'loss_cross_entropy': Array(0.2777585, dtype=float32)}


  1%|          | 7673/1000000 [19:32<41:22:25,  6.66it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7678/1000000 [19:32<39:35:35,  6.96it/s]

{'loss': Array(0.28511953, dtype=float32), 'loss_cross_entropy': Array(0.28511953, dtype=float32)}


  1%|          | 7681/1000000 [19:33<54:14:31,  5.08it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7690/1000000 [19:34<25:39:40, 10.74it/s]

{'loss': Array(0.29605058, dtype=float32), 'loss_cross_entropy': Array(0.29605058, dtype=float32)}


  1%|          | 7692/1000000 [19:35<48:32:25,  5.68it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7699/1000000 [19:35<25:54:09, 10.64it/s]

{'loss': Array(0.2854817, dtype=float32), 'loss_cross_entropy': Array(0.2854817, dtype=float32)}


  1%|          | 7701/1000000 [19:36<49:48:22,  5.53it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7710/1000000 [19:36<23:36:41, 11.67it/s]

{'loss': Array(0.29443416, dtype=float32), 'loss_cross_entropy': Array(0.29443416, dtype=float32)}


  1%|          | 7712/1000000 [19:37<45:30:02,  6.06it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7720/1000000 [19:38<24:18:53, 11.34it/s]

{'loss': Array(0.26883152, dtype=float32), 'loss_cross_entropy': Array(0.26883152, dtype=float32)}


  1%|          | 7722/1000000 [19:38<48:01:44,  5.74it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7728/1000000 [19:39<28:12:52,  9.77it/s]

{'loss': Array(0.26221406, dtype=float32), 'loss_cross_entropy': Array(0.26221406, dtype=float32)}


  1%|          | 7731/1000000 [19:40<46:13:30,  5.96it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7738/1000000 [19:41<45:52:12,  6.01it/s]

{'loss': Array(0.28997332, dtype=float32), 'loss_cross_entropy': Array(0.28997332, dtype=float32)}


  1%|          | 7741/1000000 [19:41<58:32:34,  4.71it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7750/1000000 [19:42<27:12:30, 10.13it/s]

{'loss': Array(0.29124722, dtype=float32), 'loss_cross_entropy': Array(0.29124722, dtype=float32)}


  1%|          | 7752/1000000 [19:43<47:37:20,  5.79it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7758/1000000 [19:43<28:47:37,  9.57it/s]

{'loss': Array(0.27886057, dtype=float32), 'loss_cross_entropy': Array(0.27886057, dtype=float32)}


  1%|          | 7761/1000000 [19:44<45:25:18,  6.07it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7770/1000000 [19:44<23:37:27, 11.67it/s]

{'loss': Array(0.26018462, dtype=float32), 'loss_cross_entropy': Array(0.26018462, dtype=float32)}


  1%|          | 7772/1000000 [19:45<45:17:57,  6.08it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7780/1000000 [19:46<23:58:19, 11.50it/s]

{'loss': Array(0.2794265, dtype=float32), 'loss_cross_entropy': Array(0.2794265, dtype=float32)}


  1%|          | 7782/1000000 [19:46<45:04:12,  6.12it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7788/1000000 [19:47<27:21:41, 10.07it/s]

{'loss': Array(0.2804695, dtype=float32), 'loss_cross_entropy': Array(0.2804695, dtype=float32)}


  1%|          | 7793/1000000 [19:48<37:02:47,  7.44it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7799/1000000 [19:48<24:08:13, 11.42it/s]

{'loss': Array(0.27945372, dtype=float32), 'loss_cross_entropy': Array(0.27945372, dtype=float32)}


  1%|          | 7803/1000000 [19:49<51:37:36,  5.34it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7810/1000000 [19:50<27:17:52, 10.10it/s]

{'loss': Array(0.29208186, dtype=float32), 'loss_cross_entropy': Array(0.29208186, dtype=float32)}


  1%|          | 7812/1000000 [19:51<51:16:44,  5.37it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7818/1000000 [19:51<29:12:54,  9.43it/s]

{'loss': Array(0.29001, dtype=float32), 'loss_cross_entropy': Array(0.29001, dtype=float32)}


  1%|          | 7821/1000000 [19:52<46:35:48,  5.91it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7830/1000000 [19:52<24:13:34, 11.38it/s]

{'loss': Array(0.26314092, dtype=float32), 'loss_cross_entropy': Array(0.26314092, dtype=float32)}


  1%|          | 7832/1000000 [19:53<45:31:44,  6.05it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7840/1000000 [19:54<25:00:29, 11.02it/s]

{'loss': Array(0.28867084, dtype=float32), 'loss_cross_entropy': Array(0.28867084, dtype=float32)}


  1%|          | 7842/1000000 [19:54<48:34:23,  5.67it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7849/1000000 [19:55<26:24:19, 10.44it/s]

{'loss': Array(0.25937587, dtype=float32), 'loss_cross_entropy': Array(0.25937587, dtype=float32)}


  1%|          | 7853/1000000 [19:56<41:07:45,  6.70it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7860/1000000 [19:56<22:28:33, 12.26it/s]

{'loss': Array(0.27984226, dtype=float32), 'loss_cross_entropy': Array(0.27984226, dtype=float32)}


  1%|          | 7862/1000000 [19:57<46:42:19,  5.90it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7870/1000000 [19:58<32:13:27,  8.55it/s]

{'loss': Array(0.28458497, dtype=float32), 'loss_cross_entropy': Array(0.28458497, dtype=float32)}


  1%|          | 7872/1000000 [19:59<53:23:19,  5.16it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7878/1000000 [19:59<30:38:34,  8.99it/s]

{'loss': Array(0.30406788, dtype=float32), 'loss_cross_entropy': Array(0.30406788, dtype=float32)}


  1%|          | 7881/1000000 [20:00<48:25:11,  5.69it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7890/1000000 [20:00<24:39:36, 11.18it/s]

{'loss': Array(0.29716524, dtype=float32), 'loss_cross_entropy': Array(0.29716524, dtype=float32)}


  1%|          | 7892/1000000 [20:01<44:38:19,  6.17it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7900/1000000 [20:02<23:57:16, 11.50it/s]

{'loss': Array(0.26971927, dtype=float32), 'loss_cross_entropy': Array(0.26971927, dtype=float32)}


  1%|          | 7902/1000000 [20:03<45:45:22,  6.02it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7908/1000000 [20:03<27:39:45,  9.96it/s]

{'loss': Array(0.28184864, dtype=float32), 'loss_cross_entropy': Array(0.28184864, dtype=float32)}


  1%|          | 7911/1000000 [20:04<45:41:40,  6.03it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7920/1000000 [20:04<23:54:00, 11.53it/s]

{'loss': Array(0.27518415, dtype=float32), 'loss_cross_entropy': Array(0.27518415, dtype=float32)}


  1%|          | 7922/1000000 [20:05<46:33:57,  5.92it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7930/1000000 [20:06<36:44:57,  7.50it/s]

{'loss': Array(0.2806018, dtype=float32), 'loss_cross_entropy': Array(0.2806018, dtype=float32)}


  1%|          | 7932/1000000 [20:07<58:15:47,  4.73it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7938/1000000 [20:07<32:54:51,  8.37it/s]

{'loss': Array(0.28307316, dtype=float32), 'loss_cross_entropy': Array(0.28307316, dtype=float32)}


  1%|          | 7943/1000000 [20:08<41:06:15,  6.70it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7949/1000000 [20:09<25:49:49, 10.67it/s]

{'loss': Array(0.27158815, dtype=float32), 'loss_cross_entropy': Array(0.27158815, dtype=float32)}


  1%|          | 7953/1000000 [20:10<40:18:26,  6.84it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7958/1000000 [20:10<27:17:09, 10.10it/s]

{'loss': Array(0.28603616, dtype=float32), 'loss_cross_entropy': Array(0.28603616, dtype=float32)}


  1%|          | 7961/1000000 [20:11<45:41:08,  6.03it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7970/1000000 [20:11<23:17:54, 11.83it/s]

{'loss': Array(0.2916033, dtype=float32), 'loss_cross_entropy': Array(0.2916033, dtype=float32)}


  1%|          | 7972/1000000 [20:12<45:15:21,  6.09it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7980/1000000 [20:12<24:27:49, 11.26it/s]

{'loss': Array(0.26274005, dtype=float32), 'loss_cross_entropy': Array(0.26274005, dtype=float32)}


  1%|          | 7982/1000000 [20:13<47:23:36,  5.81it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7988/1000000 [20:13<28:04:24,  9.82it/s]

{'loss': Array(0.2648444, dtype=float32), 'loss_cross_entropy': Array(0.2648444, dtype=float32)}


  1%|          | 7991/1000000 [20:14<45:42:49,  6.03it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 7999/1000000 [20:15<33:15:30,  8.29it/s]

{'loss': Array(0.26986656, dtype=float32), 'loss_cross_entropy': Array(0.26986656, dtype=float32)}


  1%|          | 8001/1000000 [20:23<265:17:08,  1.04it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8010/1000000 [20:23<90:30:22,  3.04it/s] 

{'loss': Array(0.26986063, dtype=float32), 'loss_cross_entropy': Array(0.26986063, dtype=float32)}


  1%|          | 8012/1000000 [20:24<96:42:43,  2.85it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8018/1000000 [20:24<50:54:09,  5.41it/s]

{'loss': Array(0.26747423, dtype=float32), 'loss_cross_entropy': Array(0.26747423, dtype=float32)}


  1%|          | 8021/1000000 [20:25<62:15:41,  4.43it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8030/1000000 [20:26<29:28:26,  9.35it/s]

{'loss': Array(0.28060564, dtype=float32), 'loss_cross_entropy': Array(0.28060564, dtype=float32)}


  1%|          | 8032/1000000 [20:27<49:55:00,  5.52it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8040/1000000 [20:27<26:46:02, 10.29it/s]

{'loss': Array(0.30041066, dtype=float32), 'loss_cross_entropy': Array(0.30041066, dtype=float32)}


  1%|          | 8042/1000000 [20:28<51:27:03,  5.36it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8050/1000000 [20:28<25:04:13, 10.99it/s]

{'loss': Array(0.2717581, dtype=float32), 'loss_cross_entropy': Array(0.2717581, dtype=float32)}


  1%|          | 8052/1000000 [20:29<53:54:26,  5.11it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8060/1000000 [20:30<36:35:07,  7.53it/s]

{'loss': Array(0.28354138, dtype=float32), 'loss_cross_entropy': Array(0.28354138, dtype=float32)}


  1%|          | 8062/1000000 [20:31<63:00:37,  4.37it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8070/1000000 [20:32<26:56:33, 10.23it/s]

{'loss': Array(0.28991306, dtype=float32), 'loss_cross_entropy': Array(0.28991306, dtype=float32)}


  1%|          | 8072/1000000 [20:32<49:04:33,  5.61it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8078/1000000 [20:33<28:34:30,  9.64it/s]

{'loss': Array(0.27133068, dtype=float32), 'loss_cross_entropy': Array(0.27133068, dtype=float32)}


  1%|          | 8081/1000000 [20:34<44:53:33,  6.14it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8090/1000000 [20:34<23:37:42, 11.66it/s]

{'loss': Array(0.26580387, dtype=float32), 'loss_cross_entropy': Array(0.26580387, dtype=float32)}


  1%|          | 8092/1000000 [20:35<48:05:27,  5.73it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8098/1000000 [20:35<29:13:07,  9.43it/s]

{'loss': Array(0.2702673, dtype=float32), 'loss_cross_entropy': Array(0.2702673, dtype=float32)}


  1%|          | 8101/1000000 [20:36<46:15:07,  5.96it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8110/1000000 [20:37<24:16:15, 11.35it/s]

{'loss': Array(0.29213694, dtype=float32), 'loss_cross_entropy': Array(0.29213694, dtype=float32)}


  1%|          | 8112/1000000 [20:37<43:49:26,  6.29it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8119/1000000 [20:38<44:19:25,  6.22it/s]

{'loss': Array(0.30072647, dtype=float32), 'loss_cross_entropy': Array(0.30072647, dtype=float32)}


  1%|          | 8123/1000000 [20:39<51:33:26,  5.34it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8130/1000000 [20:40<28:20:16,  9.72it/s]

{'loss': Array(0.29331502, dtype=float32), 'loss_cross_entropy': Array(0.29331502, dtype=float32)}


  1%|          | 8132/1000000 [20:41<53:37:06,  5.14it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8138/1000000 [20:41<29:47:01,  9.25it/s]

{'loss': Array(0.29504076, dtype=float32), 'loss_cross_entropy': Array(0.29504076, dtype=float32)}


  1%|          | 8141/1000000 [20:42<48:04:39,  5.73it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8150/1000000 [20:42<24:33:59, 11.22it/s]

{'loss': Array(0.2820141, dtype=float32), 'loss_cross_entropy': Array(0.2820141, dtype=float32)}


  1%|          | 8152/1000000 [20:43<45:31:24,  6.05it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8158/1000000 [20:43<27:49:50,  9.90it/s]

{'loss': Array(0.2811615, dtype=float32), 'loss_cross_entropy': Array(0.2811615, dtype=float32)}


  1%|          | 8161/1000000 [20:44<44:56:00,  6.13it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8168/1000000 [20:45<27:53:39,  9.88it/s]

{'loss': Array(0.27714446, dtype=float32), 'loss_cross_entropy': Array(0.27714446, dtype=float32)}


  1%|          | 8171/1000000 [20:46<47:51:00,  5.76it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8178/1000000 [20:46<27:01:53, 10.19it/s]

{'loss': Array(0.29869932, dtype=float32), 'loss_cross_entropy': Array(0.29869932, dtype=float32)}


  1%|          | 8181/1000000 [20:47<49:33:05,  5.56it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8189/1000000 [20:48<33:34:40,  8.20it/s]

{'loss': Array(0.2728737, dtype=float32), 'loss_cross_entropy': Array(0.2728737, dtype=float32)}


  1%|          | 8191/1000000 [20:49<54:21:26,  5.07it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8200/1000000 [20:49<25:24:20, 10.84it/s]

{'loss': Array(0.2991524, dtype=float32), 'loss_cross_entropy': Array(0.2991524, dtype=float32)}


  1%|          | 8202/1000000 [20:50<48:30:07,  5.68it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8208/1000000 [20:50<29:02:30,  9.49it/s]

{'loss': Array(0.30324492, dtype=float32), 'loss_cross_entropy': Array(0.30324492, dtype=float32)}


  1%|          | 8211/1000000 [20:51<46:17:18,  5.95it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8219/1000000 [20:52<25:06:44, 10.97it/s]

{'loss': Array(0.28736088, dtype=float32), 'loss_cross_entropy': Array(0.28736088, dtype=float32)}


  1%|          | 8221/1000000 [20:53<48:53:44,  5.63it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8230/1000000 [20:53<23:55:28, 11.51it/s]

{'loss': Array(0.28023878, dtype=float32), 'loss_cross_entropy': Array(0.28023878, dtype=float32)}


  1%|          | 8232/1000000 [20:54<45:52:53,  6.00it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8240/1000000 [20:54<24:50:51, 11.09it/s]

{'loss': Array(0.2775063, dtype=float32), 'loss_cross_entropy': Array(0.2775063, dtype=float32)}


  1%|          | 8242/1000000 [20:55<49:23:23,  5.58it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8250/1000000 [20:56<37:09:53,  7.41it/s]

{'loss': Array(0.2837902, dtype=float32), 'loss_cross_entropy': Array(0.2837902, dtype=float32)}


  1%|          | 8252/1000000 [20:57<57:36:55,  4.78it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8258/1000000 [20:57<32:37:57,  8.44it/s]

{'loss': Array(0.2713404, dtype=float32), 'loss_cross_entropy': Array(0.2713404, dtype=float32)}


  1%|          | 8261/1000000 [20:58<48:54:59,  5.63it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8268/1000000 [20:59<27:45:25,  9.92it/s]

{'loss': Array(0.27875528, dtype=float32), 'loss_cross_entropy': Array(0.27875528, dtype=float32)}


  1%|          | 8271/1000000 [21:00<49:01:49,  5.62it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8278/1000000 [21:00<27:40:06,  9.96it/s]

{'loss': Array(0.25312683, dtype=float32), 'loss_cross_entropy': Array(0.25312683, dtype=float32)}


  1%|          | 8281/1000000 [21:01<46:26:21,  5.93it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8290/1000000 [21:01<23:26:30, 11.75it/s]

{'loss': Array(0.28933367, dtype=float32), 'loss_cross_entropy': Array(0.28933367, dtype=float32)}


  1%|          | 8292/1000000 [21:02<46:24:34,  5.94it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8300/1000000 [21:03<24:51:00, 11.09it/s]

{'loss': Array(0.2715116, dtype=float32), 'loss_cross_entropy': Array(0.2715116, dtype=float32)}


  1%|          | 8302/1000000 [21:03<49:18:30,  5.59it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8310/1000000 [21:04<42:30:20,  6.48it/s]

{'loss': Array(0.27392393, dtype=float32), 'loss_cross_entropy': Array(0.27392393, dtype=float32)}


  1%|          | 8312/1000000 [21:05<62:06:17,  4.44it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8318/1000000 [21:06<34:41:02,  7.94it/s]

{'loss': Array(0.26395106, dtype=float32), 'loss_cross_entropy': Array(0.26395106, dtype=float32)}


  1%|          | 8323/1000000 [21:07<40:43:37,  6.76it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8330/1000000 [21:07<23:55:39, 11.51it/s]

{'loss': Array(0.29125533, dtype=float32), 'loss_cross_entropy': Array(0.29125533, dtype=float32)}


  1%|          | 8332/1000000 [21:08<48:38:26,  5.66it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8338/1000000 [21:08<28:09:25,  9.78it/s]

{'loss': Array(0.293704, dtype=float32), 'loss_cross_entropy': Array(0.293704, dtype=float32)}


  1%|          | 8341/1000000 [21:09<46:52:57,  5.88it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8350/1000000 [21:09<23:58:02, 11.49it/s]

{'loss': Array(0.2837012, dtype=float32), 'loss_cross_entropy': Array(0.2837012, dtype=float32)}


  1%|          | 8352/1000000 [21:10<47:59:45,  5.74it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8358/1000000 [21:11<29:12:02,  9.43it/s]

{'loss': Array(0.2661532, dtype=float32), 'loss_cross_entropy': Array(0.2661532, dtype=float32)}


  1%|          | 8361/1000000 [21:12<47:08:20,  5.84it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8369/1000000 [21:12<25:24:53, 10.84it/s]

{'loss': Array(0.27280498, dtype=float32), 'loss_cross_entropy': Array(0.27280498, dtype=float32)}


  1%|          | 8371/1000000 [21:13<47:55:46,  5.75it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8379/1000000 [21:14<37:19:04,  7.38it/s]

{'loss': Array(0.27855384, dtype=float32), 'loss_cross_entropy': Array(0.27855384, dtype=float32)}


  1%|          | 8381/1000000 [21:15<58:43:12,  4.69it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8389/1000000 [21:15<27:45:10,  9.92it/s]

{'loss': Array(0.2546547, dtype=float32), 'loss_cross_entropy': Array(0.2546547, dtype=float32)}


  1%|          | 8391/1000000 [21:16<50:11:32,  5.49it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8400/1000000 [21:16<24:03:23, 11.45it/s]

{'loss': Array(0.2938341, dtype=float32), 'loss_cross_entropy': Array(0.2938341, dtype=float32)}


  1%|          | 8402/1000000 [21:17<45:46:43,  6.02it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8408/1000000 [21:18<27:54:50,  9.87it/s]

{'loss': Array(0.2926002, dtype=float32), 'loss_cross_entropy': Array(0.2926002, dtype=float32)}


  1%|          | 8411/1000000 [21:18<45:02:46,  6.11it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8420/1000000 [21:19<23:48:13, 11.57it/s]

{'loss': Array(0.2813715, dtype=float32), 'loss_cross_entropy': Array(0.2813715, dtype=float32)}


  1%|          | 8422/1000000 [21:20<47:00:10,  5.86it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8428/1000000 [21:20<28:40:01,  9.61it/s]

{'loss': Array(0.2911987, dtype=float32), 'loss_cross_entropy': Array(0.2911987, dtype=float32)}


  1%|          | 8431/1000000 [21:21<45:01:27,  6.12it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8440/1000000 [21:22<39:48:07,  6.92it/s]

{'loss': Array(0.254979, dtype=float32), 'loss_cross_entropy': Array(0.254979, dtype=float32)}


  1%|          | 8442/1000000 [21:23<59:11:31,  4.65it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8448/1000000 [21:23<34:15:17,  8.04it/s]

{'loss': Array(0.26825938, dtype=float32), 'loss_cross_entropy': Array(0.26825938, dtype=float32)}


  1%|          | 8451/1000000 [21:24<49:44:12,  5.54it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8460/1000000 [21:24<25:29:47, 10.80it/s]

{'loss': Array(0.2799072, dtype=float32), 'loss_cross_entropy': Array(0.2799072, dtype=float32)}


  1%|          | 8462/1000000 [21:25<47:54:55,  5.75it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8469/1000000 [21:26<26:00:32, 10.59it/s]

{'loss': Array(0.28019986, dtype=float32), 'loss_cross_entropy': Array(0.28019986, dtype=float32)}


  1%|          | 8471/1000000 [21:27<48:03:01,  5.73it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8480/1000000 [21:27<22:59:06, 11.98it/s]

{'loss': Array(0.2603884, dtype=float32), 'loss_cross_entropy': Array(0.2603884, dtype=float32)}


  1%|          | 8482/1000000 [21:28<54:20:22,  5.07it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8490/1000000 [21:28<24:57:21, 11.04it/s]

{'loss': Array(0.2735799, dtype=float32), 'loss_cross_entropy': Array(0.2735799, dtype=float32)}


  1%|          | 8492/1000000 [21:29<54:21:09,  5.07it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8500/1000000 [21:30<25:59:09, 10.60it/s]

{'loss': Array(0.26007485, dtype=float32), 'loss_cross_entropy': Array(0.26007485, dtype=float32)}
context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8508/1000000 [21:38<142:35:49,  1.93it/s]

{'loss': Array(0.2798658, dtype=float32), 'loss_cross_entropy': Array(0.2798658, dtype=float32)}


  1%|          | 8511/1000000 [21:39<120:52:26,  2.28it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8520/1000000 [21:39<47:05:27,  5.85it/s] 

{'loss': Array(0.26787326, dtype=float32), 'loss_cross_entropy': Array(0.26787326, dtype=float32)}


  1%|          | 8522/1000000 [21:40<64:07:31,  4.29it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8530/1000000 [21:41<28:37:17,  9.62it/s]

{'loss': Array(0.30397528, dtype=float32), 'loss_cross_entropy': Array(0.30397528, dtype=float32)}


  1%|          | 8532/1000000 [21:41<53:42:19,  5.13it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8538/1000000 [21:42<29:28:11,  9.35it/s]

{'loss': Array(0.26293805, dtype=float32), 'loss_cross_entropy': Array(0.26293805, dtype=float32)}


  1%|          | 8541/1000000 [21:43<49:07:12,  5.61it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8550/1000000 [21:43<24:28:29, 11.25it/s]

{'loss': Array(0.2663121, dtype=float32), 'loss_cross_entropy': Array(0.2663121, dtype=float32)}


  1%|          | 8552/1000000 [21:44<47:02:24,  5.85it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8560/1000000 [21:44<24:54:25, 11.06it/s]

{'loss': Array(0.29302058, dtype=float32), 'loss_cross_entropy': Array(0.29302058, dtype=float32)}


  1%|          | 8562/1000000 [21:45<48:53:19,  5.63it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8570/1000000 [21:46<37:11:50,  7.40it/s]

{'loss': Array(0.26742807, dtype=float32), 'loss_cross_entropy': Array(0.26742807, dtype=float32)}


  1%|          | 8572/1000000 [21:47<58:37:46,  4.70it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8580/1000000 [21:48<28:28:22,  9.67it/s]

{'loss': Array(0.272639, dtype=float32), 'loss_cross_entropy': Array(0.272639, dtype=float32)}


  1%|          | 8582/1000000 [21:48<53:31:00,  5.15it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8590/1000000 [21:49<25:40:02, 10.73it/s]

{'loss': Array(0.29286757, dtype=float32), 'loss_cross_entropy': Array(0.29286757, dtype=float32)}


  1%|          | 8592/1000000 [21:50<48:44:28,  5.65it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8598/1000000 [21:50<28:36:29,  9.63it/s]

{'loss': Array(0.26754883, dtype=float32), 'loss_cross_entropy': Array(0.26754883, dtype=float32)}


  1%|          | 8601/1000000 [21:51<45:43:02,  6.02it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8610/1000000 [21:51<23:46:02, 11.59it/s]

{'loss': Array(0.28992552, dtype=float32), 'loss_cross_entropy': Array(0.28992552, dtype=float32)}


  1%|          | 8612/1000000 [21:52<45:21:51,  6.07it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8620/1000000 [21:53<24:38:52, 11.17it/s]

{'loss': Array(0.27765128, dtype=float32), 'loss_cross_entropy': Array(0.27765128, dtype=float32)}


  1%|          | 8622/1000000 [21:53<47:44:48,  5.77it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8630/1000000 [21:54<43:27:18,  6.34it/s]

{'loss': Array(0.26939502, dtype=float32), 'loss_cross_entropy': Array(0.26939502, dtype=float32)}


  1%|          | 8632/1000000 [21:55<63:28:18,  4.34it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8638/1000000 [21:56<34:48:42,  7.91it/s]

{'loss': Array(0.26085615, dtype=float32), 'loss_cross_entropy': Array(0.26085615, dtype=float32)}


  1%|          | 8643/1000000 [21:57<41:56:23,  6.57it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8649/1000000 [21:57<25:57:42, 10.61it/s]

{'loss': Array(0.26175916, dtype=float32), 'loss_cross_entropy': Array(0.26175916, dtype=float32)}


  1%|          | 8651/1000000 [21:58<47:47:37,  5.76it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8660/1000000 [21:58<23:47:38, 11.57it/s]

{'loss': Array(0.26189166, dtype=float32), 'loss_cross_entropy': Array(0.26189166, dtype=float32)}


  1%|          | 8662/1000000 [21:59<45:40:02,  6.03it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8668/1000000 [21:59<27:57:40,  9.85it/s]

{'loss': Array(0.30775392, dtype=float32), 'loss_cross_entropy': Array(0.30775392, dtype=float32)}


  1%|          | 8671/1000000 [22:00<46:47:19,  5.89it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8680/1000000 [22:01<24:24:58, 11.28it/s]

{'loss': Array(0.28698847, dtype=float32), 'loss_cross_entropy': Array(0.28698847, dtype=float32)}


  1%|          | 8682/1000000 [22:02<45:46:44,  6.02it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8688/1000000 [22:02<28:09:38,  9.78it/s]

{'loss': Array(0.28468406, dtype=float32), 'loss_cross_entropy': Array(0.28468406, dtype=float32)}


  1%|          | 8691/1000000 [22:03<45:29:50,  6.05it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8699/1000000 [22:04<32:51:02,  8.38it/s]

{'loss': Array(0.2762671, dtype=float32), 'loss_cross_entropy': Array(0.2762671, dtype=float32)}


  1%|          | 8701/1000000 [22:05<51:43:44,  5.32it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8709/1000000 [22:05<26:43:50, 10.30it/s]

{'loss': Array(0.2824249, dtype=float32), 'loss_cross_entropy': Array(0.2824249, dtype=float32)}


  1%|          | 8713/1000000 [22:06<41:30:32,  6.63it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8719/1000000 [22:06<25:27:42, 10.81it/s]

{'loss': Array(0.26954386, dtype=float32), 'loss_cross_entropy': Array(0.26954386, dtype=float32)}


  1%|          | 8721/1000000 [22:07<47:55:42,  5.75it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8729/1000000 [22:08<25:27:34, 10.82it/s]

{'loss': Array(0.28999004, dtype=float32), 'loss_cross_entropy': Array(0.28999004, dtype=float32)}


  1%|          | 8733/1000000 [22:09<40:58:17,  6.72it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8739/1000000 [22:09<24:57:55, 11.03it/s]

{'loss': Array(0.25524965, dtype=float32), 'loss_cross_entropy': Array(0.25524965, dtype=float32)}


  1%|          | 8741/1000000 [22:10<48:15:34,  5.71it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8750/1000000 [22:10<23:24:19, 11.76it/s]

{'loss': Array(0.27680573, dtype=float32), 'loss_cross_entropy': Array(0.27680573, dtype=float32)}


  1%|          | 8752/1000000 [22:11<45:15:28,  6.08it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8760/1000000 [22:12<36:58:06,  7.45it/s]

{'loss': Array(0.29541597, dtype=float32), 'loss_cross_entropy': Array(0.29541597, dtype=float32)}


  1%|          | 8762/1000000 [22:13<58:07:08,  4.74it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8768/1000000 [22:13<32:52:47,  8.37it/s]

{'loss': Array(0.2931449, dtype=float32), 'loss_cross_entropy': Array(0.2931449, dtype=float32)}


  1%|          | 8771/1000000 [22:14<47:57:52,  5.74it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8780/1000000 [22:14<24:22:37, 11.29it/s]

{'loss': Array(0.28141993, dtype=float32), 'loss_cross_entropy': Array(0.28141993, dtype=float32)}


  1%|          | 8782/1000000 [22:15<48:33:24,  5.67it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8789/1000000 [22:16<26:22:43, 10.44it/s]

{'loss': Array(0.28153962, dtype=float32), 'loss_cross_entropy': Array(0.28153962, dtype=float32)}


  1%|          | 8793/1000000 [22:17<42:14:19,  6.52it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8798/1000000 [22:17<26:34:54, 10.36it/s]

{'loss': Array(0.2687821, dtype=float32), 'loss_cross_entropy': Array(0.2687821, dtype=float32)}


  1%|          | 8801/1000000 [22:18<46:49:41,  5.88it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8810/1000000 [22:18<23:32:23, 11.70it/s]

{'loss': Array(0.29337367, dtype=float32), 'loss_cross_entropy': Array(0.29337367, dtype=float32)}


  1%|          | 8812/1000000 [22:19<44:46:59,  6.15it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8820/1000000 [22:20<44:21:28,  6.21it/s]

{'loss': Array(0.29218858, dtype=float32), 'loss_cross_entropy': Array(0.29218858, dtype=float32)}


  1%|          | 8822/1000000 [22:21<63:50:28,  4.31it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8830/1000000 [22:22<30:08:35,  9.13it/s]

{'loss': Array(0.2583235, dtype=float32), 'loss_cross_entropy': Array(0.2583235, dtype=float32)}


  1%|          | 8832/1000000 [22:22<54:03:49,  5.09it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8839/1000000 [22:23<27:05:41, 10.16it/s]

{'loss': Array(0.28034592, dtype=float32), 'loss_cross_entropy': Array(0.28034592, dtype=float32)}


  1%|          | 8841/1000000 [22:24<52:31:05,  5.24it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8850/1000000 [22:24<24:30:36, 11.23it/s]

{'loss': Array(0.27269942, dtype=float32), 'loss_cross_entropy': Array(0.27269942, dtype=float32)}


  1%|          | 8852/1000000 [22:25<48:04:41,  5.73it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8858/1000000 [22:25<28:44:28,  9.58it/s]

{'loss': Array(0.26978856, dtype=float32), 'loss_cross_entropy': Array(0.26978856, dtype=float32)}


  1%|          | 8861/1000000 [22:26<46:49:41,  5.88it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8870/1000000 [22:27<24:38:21, 11.17it/s]

{'loss': Array(0.28198212, dtype=float32), 'loss_cross_entropy': Array(0.28198212, dtype=float32)}


  1%|          | 8872/1000000 [22:28<47:08:08,  5.84it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8880/1000000 [22:28<24:04:25, 11.44it/s]

{'loss': Array(0.2802975, dtype=float32), 'loss_cross_entropy': Array(0.2802975, dtype=float32)}


  1%|          | 8882/1000000 [22:29<52:07:14,  5.28it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8890/1000000 [22:30<33:36:15,  8.19it/s]

{'loss': Array(0.28727394, dtype=float32), 'loss_cross_entropy': Array(0.28727394, dtype=float32)}


  1%|          | 8892/1000000 [22:31<60:01:11,  4.59it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8900/1000000 [22:31<25:28:48, 10.80it/s]

{'loss': Array(0.26958382, dtype=float32), 'loss_cross_entropy': Array(0.26958382, dtype=float32)}


  1%|          | 8902/1000000 [22:32<53:31:09,  5.14it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8908/1000000 [22:32<28:49:38,  9.55it/s]

{'loss': Array(0.29744983, dtype=float32), 'loss_cross_entropy': Array(0.29744983, dtype=float32)}


  1%|          | 8911/1000000 [22:33<46:15:50,  5.95it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8920/1000000 [22:34<23:35:01, 11.67it/s]

{'loss': Array(0.2749137, dtype=float32), 'loss_cross_entropy': Array(0.2749137, dtype=float32)}


  1%|          | 8922/1000000 [22:35<47:10:41,  5.84it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8929/1000000 [22:35<25:34:08, 10.77it/s]

{'loss': Array(0.2541481, dtype=float32), 'loss_cross_entropy': Array(0.2541481, dtype=float32)}


  1%|          | 8931/1000000 [22:36<49:02:34,  5.61it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8940/1000000 [22:36<23:27:06, 11.74it/s]

{'loss': Array(0.26984575, dtype=float32), 'loss_cross_entropy': Array(0.26984575, dtype=float32)}


  1%|          | 8942/1000000 [22:37<45:00:29,  6.12it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8950/1000000 [22:38<36:08:55,  7.62it/s]

{'loss': Array(0.27435714, dtype=float32), 'loss_cross_entropy': Array(0.27435714, dtype=float32)}


  1%|          | 8952/1000000 [22:39<56:51:26,  4.84it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8960/1000000 [22:39<27:10:35, 10.13it/s]

{'loss': Array(0.2842547, dtype=float32), 'loss_cross_entropy': Array(0.2842547, dtype=float32)}


  1%|          | 8962/1000000 [22:40<51:08:27,  5.38it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8968/1000000 [22:41<30:00:51,  9.17it/s]

{'loss': Array(0.27576843, dtype=float32), 'loss_cross_entropy': Array(0.27576843, dtype=float32)}


  1%|          | 8973/1000000 [22:42<39:30:46,  6.97it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8979/1000000 [22:42<25:00:28, 11.01it/s]

{'loss': Array(0.28050908, dtype=float32), 'loss_cross_entropy': Array(0.28050908, dtype=float32)}


  1%|          | 8981/1000000 [22:43<48:07:04,  5.72it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8990/1000000 [22:43<24:00:17, 11.47it/s]

{'loss': Array(0.27594975, dtype=float32), 'loss_cross_entropy': Array(0.27594975, dtype=float32)}


  1%|          | 8992/1000000 [22:44<46:35:48,  5.91it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 8998/1000000 [22:44<28:16:32,  9.74it/s]

{'loss': Array(0.24871531, dtype=float32), 'loss_cross_entropy': Array(0.24871531, dtype=float32)}


  1%|          | 9001/1000000 [22:52<235:33:39,  1.17it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9009/1000000 [22:53<112:57:27,  2.44it/s]

{'loss': Array(0.27803403, dtype=float32), 'loss_cross_entropy': Array(0.27803403, dtype=float32)}


  1%|          | 9011/1000000 [22:54<114:16:35,  2.41it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9020/1000000 [22:54<43:26:39,  6.34it/s] 

{'loss': Array(0.28286424, dtype=float32), 'loss_cross_entropy': Array(0.28286424, dtype=float32)}


  1%|          | 9022/1000000 [22:55<62:32:57,  4.40it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9028/1000000 [22:55<35:22:22,  7.78it/s]

{'loss': Array(0.28414336, dtype=float32), 'loss_cross_entropy': Array(0.28414336, dtype=float32)}


  1%|          | 9033/1000000 [22:56<42:31:02,  6.47it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9040/1000000 [22:57<24:41:34, 11.15it/s]

{'loss': Array(0.29148895, dtype=float32), 'loss_cross_entropy': Array(0.29148895, dtype=float32)}


  1%|          | 9042/1000000 [22:58<50:04:39,  5.50it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9048/1000000 [22:58<28:41:55,  9.59it/s]

{'loss': Array(0.30023307, dtype=float32), 'loss_cross_entropy': Array(0.30023307, dtype=float32)}


  1%|          | 9051/1000000 [22:59<46:32:25,  5.91it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9060/1000000 [22:59<23:39:37, 11.63it/s]

{'loss': Array(0.2920968, dtype=float32), 'loss_cross_entropy': Array(0.2920968, dtype=float32)}


  1%|          | 9062/1000000 [23:00<48:11:44,  5.71it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9068/1000000 [23:00<29:21:10,  9.38it/s]

{'loss': Array(0.2749672, dtype=float32), 'loss_cross_entropy': Array(0.2749672, dtype=float32)}


  1%|          | 9071/1000000 [23:01<48:58:26,  5.62it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9079/1000000 [23:02<33:52:20,  8.13it/s]

{'loss': Array(0.28749794, dtype=float32), 'loss_cross_entropy': Array(0.28749794, dtype=float32)}


  1%|          | 9081/1000000 [23:03<53:31:53,  5.14it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9090/1000000 [23:04<25:41:19, 10.71it/s]

{'loss': Array(0.27092582, dtype=float32), 'loss_cross_entropy': Array(0.27092582, dtype=float32)}


  1%|          | 9092/1000000 [23:04<46:19:23,  5.94it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9099/1000000 [23:05<26:15:34, 10.48it/s]

{'loss': Array(0.28519458, dtype=float32), 'loss_cross_entropy': Array(0.28519458, dtype=float32)}


  1%|          | 9101/1000000 [23:06<51:09:56,  5.38it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9109/1000000 [23:06<26:13:57, 10.49it/s]

{'loss': Array(0.28720224, dtype=float32), 'loss_cross_entropy': Array(0.28720224, dtype=float32)}


  1%|          | 9111/1000000 [23:07<50:32:56,  5.45it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9120/1000000 [23:07<23:39:21, 11.64it/s]

{'loss': Array(0.2772505, dtype=float32), 'loss_cross_entropy': Array(0.2772505, dtype=float32)}


  1%|          | 9122/1000000 [23:08<46:24:26,  5.93it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9128/1000000 [23:09<28:03:34,  9.81it/s]

{'loss': Array(0.2820774, dtype=float32), 'loss_cross_entropy': Array(0.2820774, dtype=float32)}


  1%|          | 9131/1000000 [23:10<45:33:00,  6.04it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9138/1000000 [23:11<40:23:45,  6.81it/s]

{'loss': Array(0.25212523, dtype=float32), 'loss_cross_entropy': Array(0.25212523, dtype=float32)}


  1%|          | 9143/1000000 [23:12<44:44:12,  6.15it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9149/1000000 [23:12<27:11:05, 10.12it/s]

{'loss': Array(0.2732968, dtype=float32), 'loss_cross_entropy': Array(0.2732968, dtype=float32)}


  1%|          | 9151/1000000 [23:13<48:27:50,  5.68it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9160/1000000 [23:13<23:39:20, 11.63it/s]

{'loss': Array(0.28034577, dtype=float32), 'loss_cross_entropy': Array(0.28034577, dtype=float32)}


  1%|          | 9162/1000000 [23:14<44:35:00,  6.17it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9168/1000000 [23:14<27:12:14, 10.12it/s]

{'loss': Array(0.27784795, dtype=float32), 'loss_cross_entropy': Array(0.27784795, dtype=float32)}


  1%|          | 9171/1000000 [23:15<45:52:30,  6.00it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9180/1000000 [23:16<24:06:22, 11.42it/s]

{'loss': Array(0.26674482, dtype=float32), 'loss_cross_entropy': Array(0.26674482, dtype=float32)}


  1%|          | 9182/1000000 [23:16<45:11:30,  6.09it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9190/1000000 [23:17<24:29:20, 11.24it/s]

{'loss': Array(0.27653876, dtype=float32), 'loss_cross_entropy': Array(0.27653876, dtype=float32)}


  1%|          | 9192/1000000 [23:18<48:59:49,  5.62it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9200/1000000 [23:19<44:09:56,  6.23it/s]

{'loss': Array(0.27784315, dtype=float32), 'loss_cross_entropy': Array(0.27784315, dtype=float32)}


  1%|          | 9202/1000000 [23:20<62:07:31,  4.43it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9210/1000000 [23:20<29:09:02,  9.44it/s]

{'loss': Array(0.27733698, dtype=float32), 'loss_cross_entropy': Array(0.27733698, dtype=float32)}


  1%|          | 9212/1000000 [23:21<52:51:23,  5.21it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9218/1000000 [23:21<30:43:36,  8.96it/s]

{'loss': Array(0.28227833, dtype=float32), 'loss_cross_entropy': Array(0.28227833, dtype=float32)}


  1%|          | 9221/1000000 [23:22<49:31:05,  5.56it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9230/1000000 [23:23<25:04:00, 10.98it/s]

{'loss': Array(0.26573935, dtype=float32), 'loss_cross_entropy': Array(0.26573935, dtype=float32)}


  1%|          | 9232/1000000 [23:23<46:36:44,  5.90it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9238/1000000 [23:24<28:35:13,  9.63it/s]

{'loss': Array(0.27891156, dtype=float32), 'loss_cross_entropy': Array(0.27891156, dtype=float32)}


  1%|          | 9241/1000000 [23:25<47:00:09,  5.86it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9248/1000000 [23:25<27:16:13, 10.09it/s]

{'loss': Array(0.27007017, dtype=float32), 'loss_cross_entropy': Array(0.27007017, dtype=float32)}


  1%|          | 9251/1000000 [23:26<46:38:57,  5.90it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9260/1000000 [23:26<23:35:35, 11.66it/s]

{'loss': Array(0.27762836, dtype=float32), 'loss_cross_entropy': Array(0.27762836, dtype=float32)}


  1%|          | 9262/1000000 [23:27<47:37:10,  5.78it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9270/1000000 [23:28<31:49:26,  8.65it/s]

{'loss': Array(0.27194214, dtype=float32), 'loss_cross_entropy': Array(0.27194214, dtype=float32)}


  1%|          | 9272/1000000 [23:29<59:03:24,  4.66it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9280/1000000 [23:30<28:46:37,  9.56it/s]

{'loss': Array(0.27720487, dtype=float32), 'loss_cross_entropy': Array(0.27720487, dtype=float32)}


  1%|          | 9282/1000000 [23:31<57:19:12,  4.80it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9290/1000000 [23:31<24:56:52, 11.03it/s]

{'loss': Array(0.27905944, dtype=float32), 'loss_cross_entropy': Array(0.27905944, dtype=float32)}


  1%|          | 9292/1000000 [23:32<54:42:57,  5.03it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9298/1000000 [23:32<29:12:32,  9.42it/s]

{'loss': Array(0.26804015, dtype=float32), 'loss_cross_entropy': Array(0.26804015, dtype=float32)}


  1%|          | 9301/1000000 [23:33<46:45:58,  5.88it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9309/1000000 [23:34<25:20:32, 10.86it/s]

{'loss': Array(0.2930739, dtype=float32), 'loss_cross_entropy': Array(0.2930739, dtype=float32)}


  1%|          | 9311/1000000 [23:34<47:11:06,  5.83it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9318/1000000 [23:35<26:56:58, 10.21it/s]

{'loss': Array(0.28975803, dtype=float32), 'loss_cross_entropy': Array(0.28975803, dtype=float32)}


  1%|          | 9321/1000000 [23:36<47:37:35,  5.78it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9329/1000000 [23:37<36:51:55,  7.46it/s]

{'loss': Array(0.2595634, dtype=float32), 'loss_cross_entropy': Array(0.2595634, dtype=float32)}


  1%|          | 9333/1000000 [23:38<45:11:01,  6.09it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9338/1000000 [23:38<28:18:41,  9.72it/s]

{'loss': Array(0.28353092, dtype=float32), 'loss_cross_entropy': Array(0.28353092, dtype=float32)}


  1%|          | 9341/1000000 [23:39<45:42:07,  6.02it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9350/1000000 [23:39<23:05:26, 11.92it/s]

{'loss': Array(0.2903147, dtype=float32), 'loss_cross_entropy': Array(0.2903147, dtype=float32)}


  1%|          | 9352/1000000 [23:40<46:20:01,  5.94it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9358/1000000 [23:40<28:01:54,  9.82it/s]

{'loss': Array(0.29565185, dtype=float32), 'loss_cross_entropy': Array(0.29565185, dtype=float32)}


  1%|          | 9361/1000000 [23:41<45:12:51,  6.09it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9370/1000000 [23:42<23:50:57, 11.54it/s]

{'loss': Array(0.2687104, dtype=float32), 'loss_cross_entropy': Array(0.2687104, dtype=float32)}


  1%|          | 9372/1000000 [23:42<45:11:57,  6.09it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9378/1000000 [23:43<27:50:20,  9.88it/s]

{'loss': Array(0.25677237, dtype=float32), 'loss_cross_entropy': Array(0.25677237, dtype=float32)}


  1%|          | 9381/1000000 [23:44<44:46:54,  6.14it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9390/1000000 [23:45<43:57:29,  6.26it/s]

{'loss': Array(0.29573953, dtype=float32), 'loss_cross_entropy': Array(0.29573953, dtype=float32)}


  1%|          | 9392/1000000 [23:46<62:51:47,  4.38it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9398/1000000 [23:46<33:39:45,  8.17it/s]

{'loss': Array(0.2836618, dtype=float32), 'loss_cross_entropy': Array(0.2836618, dtype=float32)}


  1%|          | 9401/1000000 [23:47<49:01:43,  5.61it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9410/1000000 [23:47<24:31:10, 11.22it/s]

{'loss': Array(0.24978042, dtype=float32), 'loss_cross_entropy': Array(0.24978042, dtype=float32)}


  1%|          | 9412/1000000 [23:48<45:20:48,  6.07it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9418/1000000 [23:48<28:02:50,  9.81it/s]

{'loss': Array(0.26041803, dtype=float32), 'loss_cross_entropy': Array(0.26041803, dtype=float32)}


  1%|          | 9421/1000000 [23:49<46:39:35,  5.90it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9429/1000000 [23:50<26:56:32, 10.21it/s]

{'loss': Array(0.28151578, dtype=float32), 'loss_cross_entropy': Array(0.28151578, dtype=float32)}


  1%|          | 9431/1000000 [23:51<51:42:33,  5.32it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9440/1000000 [23:51<24:35:35, 11.19it/s]

{'loss': Array(0.27297446, dtype=float32), 'loss_cross_entropy': Array(0.27297446, dtype=float32)}


  1%|          | 9442/1000000 [23:52<47:02:25,  5.85it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9450/1000000 [23:52<25:06:17, 10.96it/s]

{'loss': Array(0.27172723, dtype=float32), 'loss_cross_entropy': Array(0.27172723, dtype=float32)}


  1%|          | 9452/1000000 [23:53<48:51:20,  5.63it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9460/1000000 [23:54<32:45:28,  8.40it/s]

{'loss': Array(0.2824538, dtype=float32), 'loss_cross_entropy': Array(0.2824538, dtype=float32)}


  1%|          | 9462/1000000 [23:55<54:35:07,  5.04it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9468/1000000 [23:55<31:07:57,  8.84it/s]

{'loss': Array(0.26802364, dtype=float32), 'loss_cross_entropy': Array(0.26802364, dtype=float32)}


  1%|          | 9471/1000000 [23:56<47:03:16,  5.85it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9479/1000000 [23:56<25:47:04, 10.67it/s]

{'loss': Array(0.2675309, dtype=float32), 'loss_cross_entropy': Array(0.2675309, dtype=float32)}


  1%|          | 9481/1000000 [23:57<48:40:14,  5.65it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9490/1000000 [23:58<23:51:41, 11.53it/s]

{'loss': Array(0.2678891, dtype=float32), 'loss_cross_entropy': Array(0.2678891, dtype=float32)}


  1%|          | 9492/1000000 [23:59<46:57:28,  5.86it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9498/1000000 [23:59<28:15:33,  9.74it/s]

{'loss': Array(0.2949867, dtype=float32), 'loss_cross_entropy': Array(0.2949867, dtype=float32)}


  1%|          | 9503/1000000 [24:07<185:06:21,  1.49it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9509/1000000 [24:07<91:22:28,  3.01it/s] 

{'loss': Array(0.3066375, dtype=float32), 'loss_cross_entropy': Array(0.3066375, dtype=float32)}


  1%|          | 9511/1000000 [24:08<99:20:31,  2.77it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9519/1000000 [24:09<54:20:43,  5.06it/s]

{'loss': Array(0.2600052, dtype=float32), 'loss_cross_entropy': Array(0.2600052, dtype=float32)}


  1%|          | 9521/1000000 [24:10<70:35:16,  3.90it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9529/1000000 [24:10<31:48:59,  8.65it/s]

{'loss': Array(0.26534924, dtype=float32), 'loss_cross_entropy': Array(0.26534924, dtype=float32)}


  1%|          | 9531/1000000 [24:11<53:09:47,  5.18it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9539/1000000 [24:11<26:23:47, 10.42it/s]

{'loss': Array(0.28405187, dtype=float32), 'loss_cross_entropy': Array(0.28405187, dtype=float32)}


  1%|          | 9543/1000000 [24:12<40:50:26,  6.74it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9548/1000000 [24:13<26:41:09, 10.31it/s]

{'loss': Array(0.2813463, dtype=float32), 'loss_cross_entropy': Array(0.2813463, dtype=float32)}


  1%|          | 9553/1000000 [24:14<38:52:17,  7.08it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9559/1000000 [24:14<24:10:30, 11.38it/s]

{'loss': Array(0.2625451, dtype=float32), 'loss_cross_entropy': Array(0.2625451, dtype=float32)}


  1%|          | 9561/1000000 [24:15<47:30:07,  5.79it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9570/1000000 [24:15<23:24:31, 11.75it/s]

{'loss': Array(0.27256903, dtype=float32), 'loss_cross_entropy': Array(0.27256903, dtype=float32)}


  1%|          | 9572/1000000 [24:16<45:59:57,  5.98it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9580/1000000 [24:17<42:40:30,  6.45it/s]

{'loss': Array(0.2496163, dtype=float32), 'loss_cross_entropy': Array(0.2496163, dtype=float32)}


  1%|          | 9582/1000000 [24:18<61:21:23,  4.48it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9588/1000000 [24:18<34:04:10,  8.08it/s]

{'loss': Array(0.2830367, dtype=float32), 'loss_cross_entropy': Array(0.2830367, dtype=float32)}


  1%|          | 9591/1000000 [24:19<49:54:46,  5.51it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9599/1000000 [24:19<26:43:20, 10.30it/s]

{'loss': Array(0.27303812, dtype=float32), 'loss_cross_entropy': Array(0.27303812, dtype=float32)}


  1%|          | 9601/1000000 [24:20<51:16:19,  5.37it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9610/1000000 [24:21<24:21:41, 11.29it/s]

{'loss': Array(0.26081005, dtype=float32), 'loss_cross_entropy': Array(0.26081005, dtype=float32)}


  1%|          | 9612/1000000 [24:22<46:06:44,  5.97it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9618/1000000 [24:22<28:04:51,  9.80it/s]

{'loss': Array(0.28133228, dtype=float32), 'loss_cross_entropy': Array(0.28133228, dtype=float32)}


  1%|          | 9621/1000000 [24:23<47:47:06,  5.76it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9630/1000000 [24:23<24:44:11, 11.12it/s]

{'loss': Array(0.29799074, dtype=float32), 'loss_cross_entropy': Array(0.29799074, dtype=float32)}


  1%|          | 9632/1000000 [24:24<47:31:28,  5.79it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9640/1000000 [24:25<27:13:57, 10.10it/s]

{'loss': Array(0.2646815, dtype=float32), 'loss_cross_entropy': Array(0.2646815, dtype=float32)}


  1%|          | 9642/1000000 [24:26<49:32:50,  5.55it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9650/1000000 [24:26<33:03:39,  8.32it/s]

{'loss': Array(0.28289482, dtype=float32), 'loss_cross_entropy': Array(0.28289482, dtype=float32)}


  1%|          | 9652/1000000 [24:27<55:14:29,  4.98it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9660/1000000 [24:28<26:14:54, 10.48it/s]

{'loss': Array(0.25747105, dtype=float32), 'loss_cross_entropy': Array(0.25747105, dtype=float32)}


  1%|          | 9662/1000000 [24:29<54:54:39,  5.01it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9670/1000000 [24:29<24:50:53, 11.07it/s]

{'loss': Array(0.26159987, dtype=float32), 'loss_cross_entropy': Array(0.26159987, dtype=float32)}


  1%|          | 9672/1000000 [24:30<57:40:37,  4.77it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9680/1000000 [24:31<25:15:20, 10.89it/s]

{'loss': Array(0.2416697, dtype=float32), 'loss_cross_entropy': Array(0.2416697, dtype=float32)}


  1%|          | 9682/1000000 [24:31<54:33:22,  5.04it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9688/1000000 [24:32<28:55:30,  9.51it/s]

{'loss': Array(0.2846096, dtype=float32), 'loss_cross_entropy': Array(0.2846096, dtype=float32)}


  1%|          | 9691/1000000 [24:33<49:02:38,  5.61it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9700/1000000 [24:33<23:10:31, 11.87it/s]

{'loss': Array(0.28442603, dtype=float32), 'loss_cross_entropy': Array(0.28442603, dtype=float32)}


  1%|          | 9702/1000000 [24:34<46:06:41,  5.97it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9710/1000000 [24:35<36:58:34,  7.44it/s]

{'loss': Array(0.26115963, dtype=float32), 'loss_cross_entropy': Array(0.26115963, dtype=float32)}


  1%|          | 9712/1000000 [24:36<56:55:50,  4.83it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9718/1000000 [24:36<32:15:48,  8.53it/s]

{'loss': Array(0.2714467, dtype=float32), 'loss_cross_entropy': Array(0.2714467, dtype=float32)}


  1%|          | 9721/1000000 [24:37<49:04:57,  5.60it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9730/1000000 [24:37<24:44:23, 11.12it/s]

{'loss': Array(0.259532, dtype=float32), 'loss_cross_entropy': Array(0.259532, dtype=float32)}


  1%|          | 9732/1000000 [24:38<45:03:44,  6.10it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9738/1000000 [24:39<27:56:03,  9.85it/s]

{'loss': Array(0.26154742, dtype=float32), 'loss_cross_entropy': Array(0.26154742, dtype=float32)}


  1%|          | 9743/1000000 [24:40<37:13:18,  7.39it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9748/1000000 [24:40<26:25:59, 10.41it/s]

{'loss': Array(0.27828184, dtype=float32), 'loss_cross_entropy': Array(0.27828184, dtype=float32)}


  1%|          | 9751/1000000 [24:41<44:36:03,  6.17it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9760/1000000 [24:41<23:05:02, 11.92it/s]

{'loss': Array(0.2659003, dtype=float32), 'loss_cross_entropy': Array(0.2659003, dtype=float32)}


  1%|          | 9762/1000000 [24:42<45:04:36,  6.10it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9770/1000000 [24:43<43:07:50,  6.38it/s]

{'loss': Array(0.26263905, dtype=float32), 'loss_cross_entropy': Array(0.26263905, dtype=float32)}


  1%|          | 9772/1000000 [24:44<62:36:00,  4.39it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9778/1000000 [24:44<34:34:02,  7.96it/s]

{'loss': Array(0.2823291, dtype=float32), 'loss_cross_entropy': Array(0.2823291, dtype=float32)}


  1%|          | 9781/1000000 [24:45<51:10:04,  5.38it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9790/1000000 [24:45<25:28:33, 10.80it/s]

{'loss': Array(0.243986, dtype=float32), 'loss_cross_entropy': Array(0.243986, dtype=float32)}


  1%|          | 9792/1000000 [24:46<45:35:26,  6.03it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9800/1000000 [24:47<24:20:50, 11.30it/s]

{'loss': Array(0.2647939, dtype=float32), 'loss_cross_entropy': Array(0.2647939, dtype=float32)}


  1%|          | 9802/1000000 [24:48<48:07:59,  5.71it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9808/1000000 [24:48<28:22:24,  9.69it/s]

{'loss': Array(0.27330852, dtype=float32), 'loss_cross_entropy': Array(0.27330852, dtype=float32)}


  1%|          | 9811/1000000 [24:49<44:27:11,  6.19it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9820/1000000 [24:49<23:32:33, 11.68it/s]

{'loss': Array(0.26235315, dtype=float32), 'loss_cross_entropy': Array(0.26235315, dtype=float32)}


  1%|          | 9822/1000000 [24:50<45:58:31,  5.98it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9828/1000000 [24:50<28:12:24,  9.75it/s]

{'loss': Array(0.28469288, dtype=float32), 'loss_cross_entropy': Array(0.28469288, dtype=float32)}


  1%|          | 9831/1000000 [24:51<44:53:59,  6.13it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9839/1000000 [24:52<32:34:37,  8.44it/s]

{'loss': Array(0.28539595, dtype=float32), 'loss_cross_entropy': Array(0.28539595, dtype=float32)}


  1%|          | 9843/1000000 [24:53<42:42:50,  6.44it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9848/1000000 [24:53<27:12:26, 10.11it/s]

{'loss': Array(0.2916557, dtype=float32), 'loss_cross_entropy': Array(0.2916557, dtype=float32)}


  1%|          | 9851/1000000 [24:54<46:40:36,  5.89it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9858/1000000 [24:55<27:47:57,  9.89it/s]

{'loss': Array(0.27900416, dtype=float32), 'loss_cross_entropy': Array(0.27900416, dtype=float32)}


  1%|          | 9861/1000000 [24:56<47:13:24,  5.82it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9870/1000000 [24:56<23:33:27, 11.68it/s]

{'loss': Array(0.2686723, dtype=float32), 'loss_cross_entropy': Array(0.2686723, dtype=float32)}


  1%|          | 9872/1000000 [24:57<45:49:20,  6.00it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9878/1000000 [24:57<28:07:05,  9.78it/s]

{'loss': Array(0.26682636, dtype=float32), 'loss_cross_entropy': Array(0.26682636, dtype=float32)}


  1%|          | 9881/1000000 [24:58<46:28:48,  5.92it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9889/1000000 [24:59<25:37:25, 10.73it/s]

{'loss': Array(0.258386, dtype=float32), 'loss_cross_entropy': Array(0.258386, dtype=float32)}


  1%|          | 9891/1000000 [24:59<46:34:11,  5.91it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9899/1000000 [25:00<37:49:11,  7.27it/s]

{'loss': Array(0.29861218, dtype=float32), 'loss_cross_entropy': Array(0.29861218, dtype=float32)}


  1%|          | 9903/1000000 [25:01<47:22:56,  5.80it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9909/1000000 [25:02<27:57:28,  9.84it/s]

{'loss': Array(0.2835444, dtype=float32), 'loss_cross_entropy': Array(0.2835444, dtype=float32)}


  1%|          | 9913/1000000 [25:03<39:40:02,  6.93it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9919/1000000 [25:03<24:33:49, 11.20it/s]

{'loss': Array(0.26470345, dtype=float32), 'loss_cross_entropy': Array(0.26470345, dtype=float32)}


  1%|          | 9921/1000000 [25:04<46:03:09,  5.97it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9930/1000000 [25:04<22:48:06, 12.06it/s]

{'loss': Array(0.2536596, dtype=float32), 'loss_cross_entropy': Array(0.2536596, dtype=float32)}


  1%|          | 9932/1000000 [25:05<45:31:14,  6.04it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9938/1000000 [25:05<27:44:54,  9.91it/s]

{'loss': Array(0.26176205, dtype=float32), 'loss_cross_entropy': Array(0.26176205, dtype=float32)}


  1%|          | 9941/1000000 [25:06<45:16:06,  6.08it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9950/1000000 [25:07<23:56:26, 11.49it/s]

{'loss': Array(0.26241866, dtype=float32), 'loss_cross_entropy': Array(0.26241866, dtype=float32)}


  1%|          | 9952/1000000 [25:07<44:39:36,  6.16it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9960/1000000 [25:08<43:11:34,  6.37it/s]

{'loss': Array(0.3160834, dtype=float32), 'loss_cross_entropy': Array(0.3160834, dtype=float32)}


  1%|          | 9962/1000000 [25:09<61:20:52,  4.48it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9969/1000000 [25:10<32:35:49,  8.44it/s]

{'loss': Array(0.28304878, dtype=float32), 'loss_cross_entropy': Array(0.28304878, dtype=float32)}


  1%|          | 9971/1000000 [25:11<55:02:55,  5.00it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9980/1000000 [25:11<24:33:10, 11.20it/s]

{'loss': Array(0.2736775, dtype=float32), 'loss_cross_entropy': Array(0.2736775, dtype=float32)}


  1%|          | 9982/1000000 [25:12<46:24:28,  5.93it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9990/1000000 [25:12<24:18:37, 11.31it/s]

{'loss': Array(0.25020453, dtype=float32), 'loss_cross_entropy': Array(0.25020453, dtype=float32)}


  1%|          | 9992/1000000 [25:13<47:28:33,  5.79it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 9998/1000000 [25:13<28:00:07,  9.82it/s]

{'loss': Array(0.25985885, dtype=float32), 'loss_cross_entropy': Array(0.25985885, dtype=float32)}


  1%|          | 10001/1000000 [25:21<239:02:36,  1.15it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 10010/1000000 [25:21<87:03:03,  3.16it/s] 

{'loss': Array(0.25640544, dtype=float32), 'loss_cross_entropy': Array(0.25640544, dtype=float32)}


  1%|          | 10012/1000000 [25:22<94:25:16,  2.91it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 10020/1000000 [25:22<42:01:01,  6.54it/s]

{'loss': Array(0.29925108, dtype=float32), 'loss_cross_entropy': Array(0.29925108, dtype=float32)}


  1%|          | 10022/1000000 [25:23<60:55:43,  4.51it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 10030/1000000 [25:24<36:47:34,  7.47it/s]

{'loss': Array(0.27369145, dtype=float32), 'loss_cross_entropy': Array(0.27369145, dtype=float32)}


  1%|          | 10032/1000000 [25:25<57:54:47,  4.75it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 10038/1000000 [25:25<32:51:51,  8.37it/s]

{'loss': Array(0.26106486, dtype=float32), 'loss_cross_entropy': Array(0.26106486, dtype=float32)}


  1%|          | 10041/1000000 [25:26<47:54:33,  5.74it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 10049/1000000 [25:27<26:27:40, 10.39it/s]

{'loss': Array(0.27225447, dtype=float32), 'loss_cross_entropy': Array(0.27225447, dtype=float32)}


  1%|          | 10053/1000000 [25:28<40:37:00,  6.77it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 10059/1000000 [25:28<24:24:24, 11.27it/s]

{'loss': Array(0.25882873, dtype=float32), 'loss_cross_entropy': Array(0.25882873, dtype=float32)}


  1%|          | 10063/1000000 [25:29<42:18:51,  6.50it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 10069/1000000 [25:29<24:19:41, 11.30it/s]

{'loss': Array(0.25737628, dtype=float32), 'loss_cross_entropy': Array(0.25737628, dtype=float32)}


  1%|          | 10073/1000000 [25:30<44:09:49,  6.23it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 10079/1000000 [25:31<24:47:10, 11.09it/s]

{'loss': Array(0.29883578, dtype=float32), 'loss_cross_entropy': Array(0.29883578, dtype=float32)}


  1%|          | 10083/1000000 [25:32<43:24:20,  6.34it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 10090/1000000 [25:33<36:43:55,  7.49it/s]

{'loss': Array(0.27401337, dtype=float32), 'loss_cross_entropy': Array(0.27401337, dtype=float32)}


  1%|          | 10092/1000000 [25:34<59:32:16,  4.62it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 10098/1000000 [25:34<32:16:22,  8.52it/s]

{'loss': Array(0.27864015, dtype=float32), 'loss_cross_entropy': Array(0.27864015, dtype=float32)}


  1%|          | 10101/1000000 [25:35<51:06:26,  5.38it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 10109/1000000 [25:35<27:00:33, 10.18it/s]

{'loss': Array(0.28099963, dtype=float32), 'loss_cross_entropy': Array(0.28099963, dtype=float32)}


  1%|          | 10111/1000000 [25:36<48:36:05,  5.66it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 10120/1000000 [25:37<23:44:16, 11.58it/s]

{'loss': Array(0.26492378, dtype=float32), 'loss_cross_entropy': Array(0.26492378, dtype=float32)}


  1%|          | 10122/1000000 [25:37<44:15:57,  6.21it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 10128/1000000 [25:38<27:24:06, 10.03it/s]

{'loss': Array(0.26743686, dtype=float32), 'loss_cross_entropy': Array(0.26743686, dtype=float32)}


  1%|          | 10131/1000000 [25:39<44:22:39,  6.20it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 10140/1000000 [25:39<23:14:26, 11.83it/s]

{'loss': Array(0.2875409, dtype=float32), 'loss_cross_entropy': Array(0.2875409, dtype=float32)}


  1%|          | 10142/1000000 [25:40<45:38:20,  6.02it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 10150/1000000 [25:41<42:44:40,  6.43it/s]

{'loss': Array(0.2780854, dtype=float32), 'loss_cross_entropy': Array(0.2780854, dtype=float32)}


  1%|          | 10152/1000000 [25:42<60:50:59,  4.52it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 10158/1000000 [25:42<33:54:07,  8.11it/s]

{'loss': Array(0.2677271, dtype=float32), 'loss_cross_entropy': Array(0.2677271, dtype=float32)}


  1%|          | 10161/1000000 [25:43<49:29:31,  5.56it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 10170/1000000 [25:43<24:54:59, 11.03it/s]

{'loss': Array(0.2725541, dtype=float32), 'loss_cross_entropy': Array(0.2725541, dtype=float32)}


  1%|          | 10172/1000000 [25:44<47:07:16,  5.83it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 10180/1000000 [25:45<25:53:39, 10.62it/s]

{'loss': Array(0.2676453, dtype=float32), 'loss_cross_entropy': Array(0.2676453, dtype=float32)}


  1%|          | 10182/1000000 [25:45<49:18:57,  5.58it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 10190/1000000 [25:46<25:23:06, 10.83it/s]

{'loss': Array(0.26589453, dtype=float32), 'loss_cross_entropy': Array(0.26589453, dtype=float32)}


  1%|          | 10192/1000000 [25:47<48:33:06,  5.66it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 10198/1000000 [25:47<28:09:19,  9.77it/s]

{'loss': Array(0.25388536, dtype=float32), 'loss_cross_entropy': Array(0.25388536, dtype=float32)}


  1%|          | 10201/1000000 [25:48<46:36:22,  5.90it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 10210/1000000 [25:48<24:13:18, 11.35it/s]

{'loss': Array(0.26405725, dtype=float32), 'loss_cross_entropy': Array(0.26405725, dtype=float32)}


  1%|          | 10212/1000000 [25:49<47:08:42,  5.83it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 10220/1000000 [25:50<33:22:50,  8.24it/s]

{'loss': Array(0.27554595, dtype=float32), 'loss_cross_entropy': Array(0.27554595, dtype=float32)}


  1%|          | 10222/1000000 [25:51<54:18:24,  5.06it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 10228/1000000 [25:51<31:31:08,  8.72it/s]

{'loss': Array(0.2607219, dtype=float32), 'loss_cross_entropy': Array(0.2607219, dtype=float32)}


  1%|          | 10231/1000000 [25:52<47:56:13,  5.74it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 10240/1000000 [25:53<24:39:50, 11.15it/s]

{'loss': Array(0.27196237, dtype=float32), 'loss_cross_entropy': Array(0.27196237, dtype=float32)}


  1%|          | 10242/1000000 [25:54<47:31:19,  5.79it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 10248/1000000 [25:54<29:05:16,  9.45it/s]

{'loss': Array(0.2740972, dtype=float32), 'loss_cross_entropy': Array(0.2740972, dtype=float32)}


  1%|          | 10251/1000000 [25:55<48:03:40,  5.72it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 10260/1000000 [25:55<24:46:18, 11.10it/s]

{'loss': Array(0.27032822, dtype=float32), 'loss_cross_entropy': Array(0.27032822, dtype=float32)}


  1%|          | 10262/1000000 [25:56<45:30:09,  6.04it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 10268/1000000 [25:56<28:02:08,  9.81it/s]

{'loss': Array(0.27948102, dtype=float32), 'loss_cross_entropy': Array(0.27948102, dtype=float32)}


  1%|          | 10271/1000000 [25:57<45:13:58,  6.08it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 10278/1000000 [25:58<40:02:08,  6.87it/s]

{'loss': Array(0.26434875, dtype=float32), 'loss_cross_entropy': Array(0.26434875, dtype=float32)}


  1%|          | 10281/1000000 [25:59<54:12:41,  5.07it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 10289/1000000 [26:00<29:02:14,  9.47it/s]

{'loss': Array(0.268353, dtype=float32), 'loss_cross_entropy': Array(0.268353, dtype=float32)}


  1%|          | 10293/1000000 [26:01<41:04:09,  6.69it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 10299/1000000 [26:01<25:02:42, 10.98it/s]

{'loss': Array(0.2695435, dtype=float32), 'loss_cross_entropy': Array(0.2695435, dtype=float32)}


  1%|          | 10303/1000000 [26:02<39:57:17,  6.88it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 10310/1000000 [26:02<22:49:02, 12.05it/s]

{'loss': Array(0.26900235, dtype=float32), 'loss_cross_entropy': Array(0.26900235, dtype=float32)}


  1%|          | 10312/1000000 [26:03<52:07:39,  5.27it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 10320/1000000 [26:04<24:43:41, 11.12it/s]

{'loss': Array(0.27713928, dtype=float32), 'loss_cross_entropy': Array(0.27713928, dtype=float32)}


  1%|          | 10322/1000000 [26:05<54:27:01,  5.05it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 10330/1000000 [26:05<25:05:35, 10.96it/s]

{'loss': Array(0.2746365, dtype=float32), 'loss_cross_entropy': Array(0.2746365, dtype=float32)}


  1%|          | 10332/1000000 [26:06<54:57:21,  5.00it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 10339/1000000 [26:06<26:04:02, 10.55it/s]

{'loss': Array(0.25982937, dtype=float32), 'loss_cross_entropy': Array(0.25982937, dtype=float32)}


  1%|          | 10341/1000000 [26:08<71:32:49,  3.84it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 10350/1000000 [26:08<29:16:13,  9.39it/s]

{'loss': Array(0.2571602, dtype=float32), 'loss_cross_entropy': Array(0.2571602, dtype=float32)}


  1%|          | 10352/1000000 [26:09<50:20:34,  5.46it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 10358/1000000 [26:09<29:48:42,  9.22it/s]

{'loss': Array(0.2766718, dtype=float32), 'loss_cross_entropy': Array(0.2766718, dtype=float32)}


  1%|          | 10363/1000000 [26:10<40:16:02,  6.83it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 10369/1000000 [26:11<24:48:24, 11.08it/s]

{'loss': Array(0.26645926, dtype=float32), 'loss_cross_entropy': Array(0.26645926, dtype=float32)}


  1%|          | 10371/1000000 [26:12<52:09:54,  5.27it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 10380/1000000 [26:12<23:15:29, 11.82it/s]

{'loss': Array(0.27043638, dtype=float32), 'loss_cross_entropy': Array(0.27043638, dtype=float32)}


  1%|          | 10382/1000000 [26:13<52:50:46,  5.20it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 10389/1000000 [26:13<25:41:26, 10.70it/s]

{'loss': Array(0.2836546, dtype=float32), 'loss_cross_entropy': Array(0.2836546, dtype=float32)}


  1%|          | 10391/1000000 [26:14<51:24:02,  5.35it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 10399/1000000 [26:15<25:39:54, 10.71it/s]

{'loss': Array(0.26976147, dtype=float32), 'loss_cross_entropy': Array(0.26976147, dtype=float32)}


  1%|          | 10401/1000000 [26:15<52:43:08,  5.21it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 10410/1000000 [26:17<30:43:36,  8.95it/s]

{'loss': Array(0.26944163, dtype=float32), 'loss_cross_entropy': Array(0.26944163, dtype=float32)}


  1%|          | 10412/1000000 [26:17<54:50:15,  5.01it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 10419/1000000 [26:18<27:47:20,  9.89it/s]

{'loss': Array(0.2801586, dtype=float32), 'loss_cross_entropy': Array(0.2801586, dtype=float32)}


  1%|          | 10421/1000000 [26:19<53:27:51,  5.14it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 10428/1000000 [26:19<28:19:40,  9.70it/s]

{'loss': Array(0.27124593, dtype=float32), 'loss_cross_entropy': Array(0.27124593, dtype=float32)}


  1%|          | 10431/1000000 [26:20<50:25:00,  5.45it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 10439/1000000 [26:20<25:57:27, 10.59it/s]

{'loss': Array(0.28148216, dtype=float32), 'loss_cross_entropy': Array(0.28148216, dtype=float32)}


  1%|          | 10443/1000000 [26:21<40:23:23,  6.81it/s]

context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


  1%|          | 10450/1000000 [26:22<23:18:57, 11.79it/s]

In [15]:
key, subkey = jax.random.split(config.jax_key)
config.jax_key = key

buffer_eval, buffer_list_eval = dataset.fast_gathering_data_diffusion(
    env,
    vmap_reset,
    vmap_step,
    int(128),
    config.len_seq,
    buffer_eval,
    buffer_list_eval,
    subkey,
)

sample = buffer_eval.sample(buffer_list_eval, subkey)
sample = reshape_diffusion_setup(sample)

loss, (loss_crossentropy) = loss_fn_transformer_rf(
    transformer, sample
)


context value shape (128, 20, 512)
transformer input shape (128, 20, 512)


In [55]:
for key in sample.keys():
    print(key)
    print(sample[key].shape)

action
(128, 20, 3)
reward
(128, 1)
state_histo
(128, 20, 54, 6)
context
(128, 2)
state_past
(128, 10, 54, 6)
state_future
(128, 10, 54, 6)
state_future_noise
(128, 10, 54, 6)


In [None]:
buffer_eval, buffer_list_eval = dataset.fast_gathering_data(
    env,
    vmap_reset,
    vmap_step_proba,
    int(128),
    config.len_seq,
    buffer_eval,
    buffer_list_eval,
    subkey,
)

TrajectoryBufferSample(experience={'action': Array([[[1.32556781e-01, 7.96739519e-01, 5.36718592e-02, ...,
         3.91646661e-03, 4.48901858e-03, 9.91594553e-01],
        [3.49070907e-01, 4.57749265e-04, 4.38157976e-01, ...,
         7.23136306e-01, 1.23497941e-01, 1.53365776e-01],
        [6.12441264e-03, 2.50436477e-02, 1.35732419e-03, ...,
         3.82237613e-01, 5.98694921e-01, 1.90675538e-02],
        ...,
        [1.41329234e-04, 2.44877161e-03, 8.43136787e-01, ...,
         2.33344346e-01, 6.42170012e-01, 1.24485560e-01],
        [6.32655225e-04, 1.77795421e-02, 9.65278149e-01, ...,
         1.25269741e-02, 3.21629345e-01, 6.65843725e-01],
        [9.08881542e-04, 1.04175135e-01, 7.50824576e-04, ...,
         9.99683421e-03, 7.89827347e-01, 2.00175866e-01]],

       [[2.03237548e-01, 7.00179100e-01, 3.63819454e-05, ...,
         9.96583939e-01, 2.39940570e-03, 1.01662707e-03],
        [7.63220847e-01, 1.11325733e-01, 3.15520242e-02, ...,
         5.45369804e-01, 4.54322606e-0

In [None]:
sample = buffer_eval.sample(buffer_list_eval, subkey)
sample = reshape_sample(sample)

In [None]:
loss, (loss_crossentropy, loss_reward) = loss_fn_transformer(
    transformer, sample.experience
)

AttributeError: 'RubikTransformer' object has no attribute 'state_mapping'

In [14]:
# save buffer, buffer_list
# in pickle
import pickle

state_weight = nnx.state(transformer)

In [15]:
state_weight

State({
  'action_mapping': {
    'bias': VariableState(
      type=Param,
      value=Array([ 4.04955857e-02,  2.82790326e-02, -7.85927773e-02,  7.52160996e-02,
             -6.11112965e-03,  6.96982583e-03, -1.17343664e-02, -1.74523471e-03,
              1.28632234e-02, -7.83682019e-02,  2.75444221e-02, -4.00350802e-02,
              1.79233290e-02,  8.38570073e-02,  2.03401130e-02,  4.92124483e-02,
              8.69528428e-02,  2.20998153e-02, -3.42875794e-02, -6.76687211e-02,
              2.17811018e-02,  8.36544111e-02, -3.08539756e-02, -8.56901798e-03,
             -6.66830465e-02,  1.15918748e-01,  5.94779989e-03,  1.72799546e-02,
             -1.16014622e-01, -6.75882176e-02, -6.16184436e-02, -5.52975051e-02,
              4.17982265e-02, -5.43787293e-02,  1.19193546e-01, -9.40112211e-03,
             -4.03175130e-02, -3.47817354e-02, -6.77642366e-03,  8.85512680e-02,
              4.83243428e-02,  6.59283325e-02, -5.58541063e-03,  3.46533172e-02,
             -6.63223118e-02

In [16]:
# save state into pickle
with open("state_probainput_vscale5.pickle", "wb") as handle:
    pickle.dump(state_weight, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [17]:
state_weight

State({
  'action_mapping': {
    'bias': VariableState(
      type=Param,
      value=Array([ 4.04955857e-02,  2.82790326e-02, -7.85927773e-02,  7.52160996e-02,
             -6.11112965e-03,  6.96982583e-03, -1.17343664e-02, -1.74523471e-03,
              1.28632234e-02, -7.83682019e-02,  2.75444221e-02, -4.00350802e-02,
              1.79233290e-02,  8.38570073e-02,  2.03401130e-02,  4.92124483e-02,
              8.69528428e-02,  2.20998153e-02, -3.42875794e-02, -6.76687211e-02,
              2.17811018e-02,  8.36544111e-02, -3.08539756e-02, -8.56901798e-03,
             -6.66830465e-02,  1.15918748e-01,  5.94779989e-03,  1.72799546e-02,
             -1.16014622e-01, -6.75882176e-02, -6.16184436e-02, -5.52975051e-02,
              4.17982265e-02, -5.43787293e-02,  1.19193546e-01, -9.40112211e-03,
             -4.03175130e-02, -3.47817354e-02, -6.77642366e-03,  8.85512680e-02,
              4.83243428e-02,  6.59283325e-02, -5.58541063e-03,  3.46533172e-02,
             -6.63223118e-02

In [19]:
sample.experience.keys()

dict_keys(['action', 'action_pred', 'reward', 'state_first', 'state_next'])

In [None]:
state_pred_transformer, reward = transformer(
    sample.experience["state_first"], sample.experience["action_pred"]
)

In [None]:
proba = jax.nn.softmax(state_pred_transformer[0, 1, :].reshape(6, 3, 3, 6))
state_prediction = jnp.argmax(proba, axis=3)
print(state_prediction)

[[[5 1 3]
  [0 0 1]
  [0 2 0]]

 [[1 3 1]
  [4 1 4]
  [4 2 5]]

 [[2 4 2]
  [5 2 0]
  [3 0 4]]

 [[5 2 2]
  [2 3 3]
  [1 1 3]]

 [[1 3 4]
  [5 4 0]
  [2 4 0]]

 [[3 5 4]
  [3 5 1]
  [0 5 5]]]


In [None]:
sample.experience["state_next"][0, 0, :].reshape(6, 3, 3)

Array([[[5, 1, 3],
        [0, 0, 1],
        [0, 2, 0]],

       [[1, 3, 1],
        [4, 1, 4],
        [4, 2, 5]],

       [[2, 4, 2],
        [5, 2, 0],
        [3, 0, 4]],

       [[5, 2, 2],
        [2, 3, 3],
        [1, 1, 3]],

       [[1, 3, 4],
        [5, 4, 0],
        [2, 4, 0]],

       [[3, 5, 4],
        [3, 5, 1],
        [0, 5, 5]]], dtype=int8)