In [1]:
# chose the current file directory as the working directory
import os 
os.chdir("/teamspace/studios/this_studio/rubikscubesolver")


In [2]:
from tqdm import tqdm

import wandb  # for logging
import time
from dataclasses import dataclass

import jax
import jax.numpy as jnp
import flax.nnx as nnx

import optax

from rubiktransformer.models import RubikTransformer, PolicyModel
import rubiktransformer.dataset as dataset
from rubiktransformer.trainer import train
from rubiktransformer.trainer import reshape_sample

cuda_plugin_extension is not found.


In [3]:
@dataclass
class Config:
    """Configuration class"""
    jax_key: jnp.ndarray = jax.random.PRNGKey(46)
    rngs = nnx.Rngs(45)
    batch_size: int = 128
    lr_1: float = 4e-3
    lr_2: float = 4e-3
    nb_games: int = 128 * 100
    len_seq: int = 20
    nb_step: int = 1000000
    log_every_step: int = 10
    log_eval_every_step: int = 10
    log_policy_reward_every_step: int = 10
    add_data_every_step: int = 500

config = Config()

# init wandb config
user = "forbu14"
project = "RubikTransformer"
display_name = "experiment_" + time.strftime("%Y%m%d-%H%M%S")

wandb.init(entity=user, project=project, name=display_name)


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mforbu14[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
policy = PolicyModel(rngs=config.rngs)
transformer = RubikTransformer(rngs=config.rngs, causal=True)

scheduler = optax.linear_schedule(init_value=0., end_value=1., transition_steps=4000)

# init optimizer
optimizer_optaxworldmodel = optax.chain(
    optax.clip_by_global_norm(1.0),
    optax.lion(config.lr_1 / 100.),
   # optax.adamw(config.lr_1/10.),
    optax.scale_by_schedule(scheduler),
)

optimizer_worldmodel = nnx.Optimizer(transformer, optimizer_optaxworldmodel)
optimizer_policy = nnx.Optimizer(policy, optax.adam(config.lr_2))

# metrics
metrics_train = nnx.MultiMetric(
    loss=nnx.metrics.Average("loss"),
    loss_reward=nnx.metrics.Average("loss_reward"),
    loss_cross_entropy=nnx.metrics.Average("loss_cross_entropy"),
)

metrics_eval = nnx.MultiMetric(
    loss_eval=nnx.metrics.Average("loss_eval"),
    loss_reward_eval=nnx.metrics.Average("loss_reward_eval"),
    loss_cross_entropy_eval=nnx.metrics.Average("loss_cross_entropy_eval"),
)


In [5]:
# import pickle

# filename = "state_probainput_vscale3.pickle"

# with open(filename, "rb") as input_file:
#     state = pickle.load(input_file)

# nnx.update(transformer, state)

In [6]:
# gather data from the environment
# init models and optimizers
env, buffer = dataset.init_env_buffer(sample_batch_size=config.batch_size)
env, buffer_eval = dataset.init_env_buffer(sample_batch_size=config.batch_size)


nb_games = config.nb_games
len_seq = config.len_seq

state_first = jnp.zeros((6, 3, 3))
state_next = jnp.zeros((len_seq, 6, 3, 3))
action = jnp.zeros((len_seq, 3))
action_proba = jnp.zeros((len_seq, 9))

# transform state to int8 type
state_first = state_first.astype(jnp.int8)
state_next = state_next.astype(jnp.int8)

# action to int32 type
action = action.astype(jnp.int32)

reward = jnp.zeros((len_seq))

jit_step = jax.jit(env.step)

buffer_list = buffer.init(
    {
        "state_first": state_first,
        "action": action,
        "reward": reward,
        "state_next": state_next,
        "action_pred": action_proba,
    }
)

buffer_list_eval = buffer_eval.init(
    {
        "state_first": state_first,
        "action": action,
        "reward": reward,
        "state_next": state_next,
        "action_pred": action_proba,
    }
)


In [7]:

def step_fn(state, key):
    """
    Simple step function
    We choose a random action
    """

    action = jax.random.randint(
        key=key,
        minval=env.action_spec.minimum,
        maxval=env.action_spec.maximum,
        shape=(3,),
    )

    new_state, timestep = jit_step(state, action)
    timestep.extras["action"] = action
    timestep.extras["action_pred"] = jnp.zeros((9,))

    return new_state, timestep

def run_n_steps(state, key, n):
    random_keys = jax.random.split(key, n)
    state, rollout = jax.lax.scan(step_fn, state, random_keys)

    return rollout

vmap_reset = jax.vmap(jax.jit(env.reset))
vmap_step = jax.vmap(run_n_steps, in_axes=(0, 0, None))

In [8]:
scale_factor_0 = 15.
scale_factor_1 = 10.

def step_fn_proba_setup(state, data):
    """
    Simple step function
    We choose a random action
    """

    key1, key2 = data

    # we choose a probability distribution over the action
    action_proba_0 = jax.random.normal(key1, shape=(6,)) * scale_factor_0
    action_proba_1 = jax.random.normal(key2, shape=(3,)) * scale_factor_1

    # softmax over the action
    action_proba_0 = jax.nn.softmax(action_proba_0)
    action_proba_1 = jax.nn.softmax(action_proba_1)

    # sample the action from the probability distribution
    action_proba_0_value = jax.random.categorical(key1, action_proba_0)
    action_proba_1_value = jax.random.categorical(key2, action_proba_1)

    action = jnp.array([action_proba_0_value, jnp.array(0), action_proba_1_value])

    # concat 
    action_proba = jnp.concatenate([action_proba_0, action_proba_1])

    new_state, timestep = jit_step(state, action)
    timestep.extras["action"] = action
    timestep.extras["action_pred"] = action_proba

    return new_state, timestep

def run_n_steps_proba(state, key, n):
    random_keys = jax.random.split(key, (n, 2))
    state, rollout = jax.lax.scan(step_fn_proba_setup, state, random_keys)

    return rollout

vmap_reset = jax.vmap(jax.jit(env.reset))
vmap_step_proba = jax.vmap(run_n_steps_proba, in_axes=(0, 0, None))

In [9]:
nnx.display(transformer)

In [10]:
key, subkey = jax.random.split(config.jax_key)
config.jax_key = key

buffer, buffer_list = dataset.fast_gathering_data(
    env,
    vmap_reset,
    vmap_step,
    int(config.nb_games / 10),
    config.len_seq,
    buffer,
    buffer_list,
    subkey,
)

In [11]:
key, subkey = jax.random.split(config.jax_key)
config.jax_key = key

buffer, buffer_list = dataset.fast_gathering_data(
    env,
    vmap_reset,
    vmap_step_proba,
    int(config.nb_games / 10),
    config.len_seq,
    buffer,
    buffer_list,
    subkey,
)

[34m[1mwandb[0m: 429 encountered (Filestream rate limit exceeded, retrying in 34.8 seconds.), retrying request


In [12]:

sample = buffer.sample(buffer_list, subkey)
sample = reshape_sample(sample)

In [13]:
sample

TrajectoryBufferSample(experience={'action': Array([[[1.0342622e-03, 9.8763175e-02, 1.9616945e-01, ...,
         9.1598278e-01, 7.9124421e-02, 4.8927194e-03],
        [6.5340154e-04, 7.4344707e-01, 2.9543904e-03, ...,
         8.2277584e-01, 1.3074802e-01, 4.6476152e-02],
        [2.9181805e-01, 3.3277448e-02, 1.3126548e-01, ...,
         1.6328433e-03, 9.9831665e-01, 5.0603030e-05],
        ...,
        [3.3290795e-04, 2.8182656e-04, 1.0129113e-05, ...,
         1.2191311e-01, 4.8528105e-01, 3.9280584e-01],
        [7.0439512e-04, 9.9126726e-01, 8.9772133e-04, ...,
         2.1325910e-02, 7.3878825e-01, 2.3988584e-01],
        [1.1097225e-02, 2.3210780e-03, 8.3933562e-02, ...,
         3.7379863e-04, 9.6265030e-01, 3.6975913e-02]],

       [[0.0000000e+00, 0.0000000e+00, 0.0000000e+00, ...,
         0.0000000e+00, 0.0000000e+00, 0.0000000e+00],
        [0.0000000e+00, 0.0000000e+00, 0.0000000e+00, ...,
         0.0000000e+00, 0.0000000e+00, 0.0000000e+00],
        [0.0000000e+00, 0.00

In [11]:

def loss_fn_transformer(model: RubikTransformer, batch):
    state_logits, reward_value = model(batch["state_first"], batch["action"])

    # reshape state_logits
    # from (batch_size, sequence_length, 324) => (batch_size, sequence_length -1, 54, 6)
    state_logits = state_logits[:, 1:, :]
    state_logits = state_logits.reshape(
        (state_logits.shape[0], state_logits.shape[1], 54, 6)
    )

    reward_value = reward_value[:, 1:]

    loss_crossentropy = optax.softmax_cross_entropy_with_integer_labels(
        logits=state_logits, labels=batch["state_next"]
    ).mean()

    loss_reward = jnp.square(reward_value - batch["reward"]).mean()

    loss = loss_crossentropy + loss_reward

    return loss, (loss_crossentropy, loss_reward)

@nnx.jit
def train_step_transformer(
    model: RubikTransformer, optimizer: nnx.Optimizer, metrics: nnx.MultiMetric, batch
):
    """Train for a single step."""
    grad_fn = nnx.value_and_grad(loss_fn_transformer, has_aux=True)
    (loss, (loss_crossentropy, loss_reward)), grads = grad_fn(model, batch)
    metrics.update(
        loss=loss, loss_reward=loss_reward, loss_cross_entropy=loss_crossentropy
    )
    optimizer.update(grads)


In [12]:
key, subkey = jax.random.split(config.jax_key)
config.jax_key = key

buffer, buffer_list = dataset.fast_gathering_data(
    env,
    vmap_reset,
    vmap_step_proba,
    config.nb_games * 10,
    config.len_seq,
    buffer,
    buffer_list,
    subkey,
)


In [13]:

# transformer model calibration
for idx_step in tqdm(range(config.nb_step)):

    # training for world model
    key, subkey = jax.random.split(config.jax_key)
    config.jax_key = key

    if idx_step % config.add_data_every_step == 0:
        buffer, buffer_list = dataset.fast_gathering_data(
            env,
            vmap_reset,
            vmap_step_proba,
            int(config.nb_games / 10.),
            config.len_seq,
            buffer,
            buffer_list,
            config.jax_key,
        )

    sample = buffer.sample(buffer_list, subkey)
    sample = reshape_sample(sample)

    # we update the policy
    train_step_transformer(
        transformer, optimizer_worldmodel, metrics_train, sample.experience
    )

    if idx_step % config.log_every_step == 0:
        metrics_train_result = metrics_train.compute()
        print(metrics_train_result)

        wandb.log(metrics_train_result, step=idx_step)
        metrics_train.reset()

    if idx_step % config.log_eval_every_step == 0:

        scale_factor = 30.

        key, subkey = jax.random.split(config.jax_key)
        config.jax_key = key
        
        buffer_eval, buffer_list_eval = dataset.fast_gathering_data(
            env,
            vmap_reset,
            vmap_step_proba,
            int(128),
            config.len_seq,
            buffer_eval,
            buffer_list_eval,
            subkey,
        )

        sample = buffer_eval.sample(buffer_list_eval, subkey)
        sample = reshape_sample(sample)

        loss, (loss_crossentropy, loss_reward) = loss_fn_transformer(transformer, sample.experience)

        metrics_eval.update(loss_eval=loss, loss_reward_eval=loss_reward, loss_cross_entropy_eval=loss_crossentropy)
        wandb.log(metrics_eval.compute(), step=idx_step)

        metrics_eval.reset()

        scale_factor = 3.



  2%|▏         | 23350/1000000 [1:12:29<35:43:47,  7.59it/s]

{'loss': Array(0.6715029, dtype=float32), 'loss_reward': Array(0.00503205, dtype=float32), 'loss_cross_entropy': Array(0.6664708, dtype=float32)}


  2%|▏         | 23358/1000000 [1:12:30<36:06:33,  7.51it/s]

{'loss': Array(0.6737969, dtype=float32), 'loss_reward': Array(0.00537699, dtype=float32), 'loss_cross_entropy': Array(0.6684199, dtype=float32)}


  2%|▏         | 23370/1000000 [1:12:32<29:14:52,  9.28it/s]

{'loss': Array(0.68048686, dtype=float32), 'loss_reward': Array(0.00512654, dtype=float32), 'loss_cross_entropy': Array(0.6753603, dtype=float32)}


  2%|▏         | 23378/1000000 [1:12:34<34:18:24,  7.91it/s]

{'loss': Array(0.66714174, dtype=float32), 'loss_reward': Array(0.00510682, dtype=float32), 'loss_cross_entropy': Array(0.6620349, dtype=float32)}


  2%|▏         | 23389/1000000 [1:12:36<35:26:43,  7.65it/s]

{'loss': Array(0.6750406, dtype=float32), 'loss_reward': Array(0.0050937, dtype=float32), 'loss_cross_entropy': Array(0.6699469, dtype=float32)}


  2%|▏         | 23400/1000000 [1:12:37<28:34:34,  9.49it/s]

{'loss': Array(0.6748676, dtype=float32), 'loss_reward': Array(0.00497825, dtype=float32), 'loss_cross_entropy': Array(0.66988933, dtype=float32)}


  2%|▏         | 23408/1000000 [1:12:39<33:28:31,  8.10it/s]

{'loss': Array(0.6800634, dtype=float32), 'loss_reward': Array(0.0051951, dtype=float32), 'loss_cross_entropy': Array(0.6748683, dtype=float32)}


  2%|▏         | 23420/1000000 [1:12:40<28:28:05,  9.53it/s]

{'loss': Array(0.67543626, dtype=float32), 'loss_reward': Array(0.00497879, dtype=float32), 'loss_cross_entropy': Array(0.67045754, dtype=float32)}


  2%|▏         | 23429/1000000 [1:12:42<35:24:45,  7.66it/s]

{'loss': Array(0.6731736, dtype=float32), 'loss_reward': Array(0.00503336, dtype=float32), 'loss_cross_entropy': Array(0.66814035, dtype=float32)}


  2%|▏         | 23440/1000000 [1:12:44<28:39:33,  9.47it/s]

{'loss': Array(0.666459, dtype=float32), 'loss_reward': Array(0.00518325, dtype=float32), 'loss_cross_entropy': Array(0.6612757, dtype=float32)}


  2%|▏         | 23448/1000000 [1:12:45<33:29:06,  8.10it/s]

{'loss': Array(0.66575074, dtype=float32), 'loss_reward': Array(0.00511006, dtype=float32), 'loss_cross_entropy': Array(0.6606407, dtype=float32)}


  2%|▏         | 23460/1000000 [1:12:47<27:09:58,  9.99it/s]

{'loss': Array(0.6758151, dtype=float32), 'loss_reward': Array(0.00500631, dtype=float32), 'loss_cross_entropy': Array(0.6708088, dtype=float32)}


  2%|▏         | 23468/1000000 [1:12:48<37:27:35,  7.24it/s]

{'loss': Array(0.6740262, dtype=float32), 'loss_reward': Array(0.00497767, dtype=float32), 'loss_cross_entropy': Array(0.66904855, dtype=float32)}


  2%|▏         | 23479/1000000 [1:12:50<30:21:56,  8.93it/s]

{'loss': Array(0.66926175, dtype=float32), 'loss_reward': Array(0.00492494, dtype=float32), 'loss_cross_entropy': Array(0.6643368, dtype=float32)}


  2%|▏         | 23490/1000000 [1:12:52<27:12:45,  9.97it/s]

{'loss': Array(0.6726063, dtype=float32), 'loss_reward': Array(0.00490978, dtype=float32), 'loss_cross_entropy': Array(0.6676965, dtype=float32)}


  2%|▏         | 23500/1000000 [1:12:54<39:20:23,  6.90it/s]

{'loss': Array(0.66329426, dtype=float32), 'loss_reward': Array(0.00519799, dtype=float32), 'loss_cross_entropy': Array(0.6580963, dtype=float32)}


  2%|▏         | 23510/1000000 [1:13:05<142:41:55,  1.90it/s]

{'loss': Array(0.68039787, dtype=float32), 'loss_reward': Array(0.00520897, dtype=float32), 'loss_cross_entropy': Array(0.67518884, dtype=float32)}


  2%|▏         | 23518/1000000 [1:13:06<70:02:47,  3.87it/s] 

{'loss': Array(0.67549145, dtype=float32), 'loss_reward': Array(0.00487651, dtype=float32), 'loss_cross_entropy': Array(0.6706149, dtype=float32)}


  2%|▏         | 23530/1000000 [1:13:08<36:20:15,  7.46it/s]

{'loss': Array(0.6697666, dtype=float32), 'loss_reward': Array(0.00507131, dtype=float32), 'loss_cross_entropy': Array(0.6646952, dtype=float32)}


  2%|▏         | 23540/1000000 [1:13:10<38:13:32,  7.10it/s]

{'loss': Array(0.67355937, dtype=float32), 'loss_reward': Array(0.00529323, dtype=float32), 'loss_cross_entropy': Array(0.6682661, dtype=float32)}


  2%|▏         | 23548/1000000 [1:13:12<36:34:17,  7.42it/s]

{'loss': Array(0.6711969, dtype=float32), 'loss_reward': Array(0.00525515, dtype=float32), 'loss_cross_entropy': Array(0.66594183, dtype=float32)}


  2%|▏         | 23558/1000000 [1:13:13<30:40:24,  8.84it/s]

{'loss': Array(0.6672033, dtype=float32), 'loss_reward': Array(0.00505563, dtype=float32), 'loss_cross_entropy': Array(0.6621477, dtype=float32)}


  2%|▏         | 23570/1000000 [1:13:15<26:36:49, 10.19it/s]

{'loss': Array(0.66784376, dtype=float32), 'loss_reward': Array(0.00503925, dtype=float32), 'loss_cross_entropy': Array(0.6628045, dtype=float32)}


  2%|▏         | 23579/1000000 [1:13:17<37:26:57,  7.24it/s]

{'loss': Array(0.66981804, dtype=float32), 'loss_reward': Array(0.0050659, dtype=float32), 'loss_cross_entropy': Array(0.6647521, dtype=float32)}


  2%|▏         | 23590/1000000 [1:13:18<28:28:39,  9.52it/s]

{'loss': Array(0.67171746, dtype=float32), 'loss_reward': Array(0.00521955, dtype=float32), 'loss_cross_entropy': Array(0.6664979, dtype=float32)}


  2%|▏         | 23600/1000000 [1:13:20<28:52:19,  9.39it/s]

{'loss': Array(0.6761125, dtype=float32), 'loss_reward': Array(0.00515031, dtype=float32), 'loss_cross_entropy': Array(0.6709623, dtype=float32)}


  2%|▏         | 23608/1000000 [1:13:21<34:16:28,  7.91it/s]

{'loss': Array(0.6738192, dtype=float32), 'loss_reward': Array(0.00510073, dtype=float32), 'loss_cross_entropy': Array(0.6687185, dtype=float32)}


  2%|▏         | 23620/1000000 [1:13:23<32:04:53,  8.45it/s]

{'loss': Array(0.67196894, dtype=float32), 'loss_reward': Array(0.00521733, dtype=float32), 'loss_cross_entropy': Array(0.6667517, dtype=float32)}


  2%|▏         | 23628/1000000 [1:13:25<34:35:57,  7.84it/s]

{'loss': Array(0.67429703, dtype=float32), 'loss_reward': Array(0.00505787, dtype=float32), 'loss_cross_entropy': Array(0.66923916, dtype=float32)}


  2%|▏         | 23640/1000000 [1:13:26<28:08:34,  9.64it/s]

{'loss': Array(0.66358584, dtype=float32), 'loss_reward': Array(0.00515185, dtype=float32), 'loss_cross_entropy': Array(0.6584341, dtype=float32)}


  2%|▏         | 23648/1000000 [1:13:28<33:01:31,  8.21it/s]

{'loss': Array(0.6647619, dtype=float32), 'loss_reward': Array(0.00511896, dtype=float32), 'loss_cross_entropy': Array(0.65964293, dtype=float32)}


  2%|▏         | 23659/1000000 [1:13:30<32:46:39,  8.27it/s]

{'loss': Array(0.67177266, dtype=float32), 'loss_reward': Array(0.00503966, dtype=float32), 'loss_cross_entropy': Array(0.666733, dtype=float32)}


  2%|▏         | 23669/1000000 [1:13:31<30:16:37,  8.96it/s]

{'loss': Array(0.6631277, dtype=float32), 'loss_reward': Array(0.00506185, dtype=float32), 'loss_cross_entropy': Array(0.658066, dtype=float32)}


  2%|▏         | 23680/1000000 [1:13:33<27:17:23,  9.94it/s]

{'loss': Array(0.6639929, dtype=float32), 'loss_reward': Array(0.00504229, dtype=float32), 'loss_cross_entropy': Array(0.6589506, dtype=float32)}


  2%|▏         | 23690/1000000 [1:13:35<39:40:49,  6.83it/s]

{'loss': Array(0.66467875, dtype=float32), 'loss_reward': Array(0.00490291, dtype=float32), 'loss_cross_entropy': Array(0.6597759, dtype=float32)}


  2%|▏         | 23698/1000000 [1:13:36<36:40:23,  7.39it/s]

{'loss': Array(0.66236705, dtype=float32), 'loss_reward': Array(0.00493452, dtype=float32), 'loss_cross_entropy': Array(0.65743256, dtype=float32)}


  2%|▏         | 23709/1000000 [1:13:38<31:07:22,  8.71it/s]

{'loss': Array(0.6721811, dtype=float32), 'loss_reward': Array(0.00534767, dtype=float32), 'loss_cross_entropy': Array(0.66683346, dtype=float32)}


  2%|▏         | 23720/1000000 [1:13:39<28:25:46,  9.54it/s]

{'loss': Array(0.6722005, dtype=float32), 'loss_reward': Array(0.00510983, dtype=float32), 'loss_cross_entropy': Array(0.66709065, dtype=float32)}


  2%|▏         | 23729/1000000 [1:13:41<43:54:33,  6.18it/s]

{'loss': Array(0.66979384, dtype=float32), 'loss_reward': Array(0.00501814, dtype=float32), 'loss_cross_entropy': Array(0.6647757, dtype=float32)}


  2%|▏         | 23740/1000000 [1:13:43<30:16:48,  8.96it/s]

{'loss': Array(0.66392964, dtype=float32), 'loss_reward': Array(0.00505451, dtype=float32), 'loss_cross_entropy': Array(0.65887517, dtype=float32)}


  2%|▏         | 23748/1000000 [1:13:45<34:51:50,  7.78it/s]

{'loss': Array(0.6633945, dtype=float32), 'loss_reward': Array(0.00506917, dtype=float32), 'loss_cross_entropy': Array(0.6583253, dtype=float32)}


  2%|▏         | 23760/1000000 [1:13:46<28:12:02,  9.62it/s]

{'loss': Array(0.6646164, dtype=float32), 'loss_reward': Array(0.00505437, dtype=float32), 'loss_cross_entropy': Array(0.659562, dtype=float32)}


  2%|▏         | 23769/1000000 [1:13:48<38:51:07,  6.98it/s]

{'loss': Array(0.667772, dtype=float32), 'loss_reward': Array(0.00517179, dtype=float32), 'loss_cross_entropy': Array(0.6626002, dtype=float32)}


  2%|▏         | 23780/1000000 [1:13:50<29:53:52,  9.07it/s]

{'loss': Array(0.66271055, dtype=float32), 'loss_reward': Array(0.00506594, dtype=float32), 'loss_cross_entropy': Array(0.6576446, dtype=float32)}


  2%|▏         | 23788/1000000 [1:13:51<34:34:19,  7.84it/s]

{'loss': Array(0.6681704, dtype=float32), 'loss_reward': Array(0.0050512, dtype=float32), 'loss_cross_entropy': Array(0.66311926, dtype=float32)}


  2%|▏         | 23800/1000000 [1:13:53<28:01:52,  9.67it/s]

{'loss': Array(0.6636757, dtype=float32), 'loss_reward': Array(0.00504451, dtype=float32), 'loss_cross_entropy': Array(0.65863115, dtype=float32)}


  2%|▏         | 23810/1000000 [1:13:55<32:20:43,  8.38it/s]

{'loss': Array(0.67324865, dtype=float32), 'loss_reward': Array(0.00512248, dtype=float32), 'loss_cross_entropy': Array(0.6681261, dtype=float32)}


  2%|▏         | 23818/1000000 [1:13:56<35:21:46,  7.67it/s]

{'loss': Array(0.66660696, dtype=float32), 'loss_reward': Array(0.00519058, dtype=float32), 'loss_cross_entropy': Array(0.66141635, dtype=float32)}


  2%|▏         | 23828/1000000 [1:13:58<30:22:23,  8.93it/s]

{'loss': Array(0.6634137, dtype=float32), 'loss_reward': Array(0.00524485, dtype=float32), 'loss_cross_entropy': Array(0.65816873, dtype=float32)}


  2%|▏         | 23840/1000000 [1:13:59<26:47:39, 10.12it/s]

{'loss': Array(0.6646552, dtype=float32), 'loss_reward': Array(0.00504025, dtype=float32), 'loss_cross_entropy': Array(0.65961504, dtype=float32)}


  2%|▏         | 23848/1000000 [1:14:01<37:03:35,  7.32it/s]

{'loss': Array(0.6679824, dtype=float32), 'loss_reward': Array(0.00523734, dtype=float32), 'loss_cross_entropy': Array(0.66274506, dtype=float32)}


  2%|▏         | 23860/1000000 [1:14:03<28:10:46,  9.62it/s]

{'loss': Array(0.66939634, dtype=float32), 'loss_reward': Array(0.00494873, dtype=float32), 'loss_cross_entropy': Array(0.6644477, dtype=float32)}


  2%|▏         | 23868/1000000 [1:14:04<33:41:42,  8.05it/s]

{'loss': Array(0.6534242, dtype=float32), 'loss_reward': Array(0.00494436, dtype=float32), 'loss_cross_entropy': Array(0.6484798, dtype=float32)}


  2%|▏         | 23879/1000000 [1:14:06<40:07:08,  6.76it/s]

{'loss': Array(0.65625495, dtype=float32), 'loss_reward': Array(0.00500831, dtype=float32), 'loss_cross_entropy': Array(0.65124655, dtype=float32)}


  2%|▏         | 23890/1000000 [1:14:08<29:54:59,  9.06it/s]

{'loss': Array(0.6565479, dtype=float32), 'loss_reward': Array(0.00511927, dtype=float32), 'loss_cross_entropy': Array(0.65142864, dtype=float32)}


  2%|▏         | 23898/1000000 [1:14:09<34:33:59,  7.84it/s]

{'loss': Array(0.6675786, dtype=float32), 'loss_reward': Array(0.00492718, dtype=float32), 'loss_cross_entropy': Array(0.6626513, dtype=float32)}


  2%|▏         | 23908/1000000 [1:14:11<32:28:35,  8.35it/s]

{'loss': Array(0.6590234, dtype=float32), 'loss_reward': Array(0.00497364, dtype=float32), 'loss_cross_entropy': Array(0.6540498, dtype=float32)}


  2%|▏         | 23919/1000000 [1:14:13<39:10:01,  6.92it/s]

{'loss': Array(0.65299493, dtype=float32), 'loss_reward': Array(0.00497452, dtype=float32), 'loss_cross_entropy': Array(0.6480204, dtype=float32)}


  2%|▏         | 23930/1000000 [1:14:15<30:27:41,  8.90it/s]

{'loss': Array(0.66124845, dtype=float32), 'loss_reward': Array(0.00514195, dtype=float32), 'loss_cross_entropy': Array(0.6561066, dtype=float32)}


  2%|▏         | 23940/1000000 [1:14:16<29:12:58,  9.28it/s]

{'loss': Array(0.65787697, dtype=float32), 'loss_reward': Array(0.00503643, dtype=float32), 'loss_cross_entropy': Array(0.65284055, dtype=float32)}


  2%|▏         | 23948/1000000 [1:14:18<34:55:10,  7.76it/s]

{'loss': Array(0.6591931, dtype=float32), 'loss_reward': Array(0.00513872, dtype=float32), 'loss_cross_entropy': Array(0.65405434, dtype=float32)}


  2%|▏         | 23959/1000000 [1:14:20<37:03:27,  7.32it/s]

{'loss': Array(0.6573066, dtype=float32), 'loss_reward': Array(0.00513735, dtype=float32), 'loss_cross_entropy': Array(0.6521692, dtype=float32)}


  2%|▏         | 23969/1000000 [1:14:21<31:48:44,  8.52it/s]

{'loss': Array(0.66630214, dtype=float32), 'loss_reward': Array(0.00503764, dtype=float32), 'loss_cross_entropy': Array(0.66126454, dtype=float32)}


  2%|▏         | 23980/1000000 [1:14:23<27:59:03,  9.69it/s]

{'loss': Array(0.6577962, dtype=float32), 'loss_reward': Array(0.00501464, dtype=float32), 'loss_cross_entropy': Array(0.6527816, dtype=float32)}


  2%|▏         | 23988/1000000 [1:14:24<34:05:57,  7.95it/s]

{'loss': Array(0.6586656, dtype=float32), 'loss_reward': Array(0.00488672, dtype=float32), 'loss_cross_entropy': Array(0.6537789, dtype=float32)}


  2%|▏         | 23999/1000000 [1:14:26<33:16:57,  8.15it/s]

{'loss': Array(0.6521612, dtype=float32), 'loss_reward': Array(0.00484861, dtype=float32), 'loss_cross_entropy': Array(0.64731264, dtype=float32)}


  2%|▏         | 24010/1000000 [1:14:38<123:47:07,  2.19it/s]

{'loss': Array(0.6591036, dtype=float32), 'loss_reward': Array(0.00499576, dtype=float32), 'loss_cross_entropy': Array(0.65410775, dtype=float32)}


  2%|▏         | 24019/1000000 [1:14:39<63:23:49,  4.28it/s] 

{'loss': Array(0.65958875, dtype=float32), 'loss_reward': Array(0.00500599, dtype=float32), 'loss_cross_entropy': Array(0.6545827, dtype=float32)}


  2%|▏         | 24030/1000000 [1:14:41<34:57:08,  7.76it/s]

{'loss': Array(0.6640192, dtype=float32), 'loss_reward': Array(0.00521036, dtype=float32), 'loss_cross_entropy': Array(0.6588089, dtype=float32)}


  2%|▏         | 24038/1000000 [1:14:43<40:42:34,  6.66it/s]

{'loss': Array(0.6487412, dtype=float32), 'loss_reward': Array(0.00486197, dtype=float32), 'loss_cross_entropy': Array(0.64387923, dtype=float32)}


  2%|▏         | 24048/1000000 [1:14:44<33:34:27,  8.07it/s]

{'loss': Array(0.66315985, dtype=float32), 'loss_reward': Array(0.00499358, dtype=float32), 'loss_cross_entropy': Array(0.6581663, dtype=float32)}


  2%|▏         | 24060/1000000 [1:14:46<26:50:57, 10.10it/s]

{'loss': Array(0.67209446, dtype=float32), 'loss_reward': Array(0.00500415, dtype=float32), 'loss_cross_entropy': Array(0.66709036, dtype=float32)}


  2%|▏         | 24070/1000000 [1:14:48<38:39:38,  7.01it/s]

{'loss': Array(0.6535732, dtype=float32), 'loss_reward': Array(0.00497583, dtype=float32), 'loss_cross_entropy': Array(0.6485974, dtype=float32)}


  2%|▏         | 24078/1000000 [1:14:49<37:05:08,  7.31it/s]

{'loss': Array(0.6559561, dtype=float32), 'loss_reward': Array(0.00499972, dtype=float32), 'loss_cross_entropy': Array(0.65095633, dtype=float32)}


  2%|▏         | 24089/1000000 [1:14:51<28:57:51,  9.36it/s]

{'loss': Array(0.6533385, dtype=float32), 'loss_reward': Array(0.00499974, dtype=float32), 'loss_cross_entropy': Array(0.64833874, dtype=float32)}


  2%|▏         | 24100/1000000 [1:14:52<26:52:22, 10.09it/s]

{'loss': Array(0.6591787, dtype=float32), 'loss_reward': Array(0.00493654, dtype=float32), 'loss_cross_entropy': Array(0.6542422, dtype=float32)}


  2%|▏         | 24108/1000000 [1:14:54<42:07:28,  6.44it/s]

{'loss': Array(0.6632243, dtype=float32), 'loss_reward': Array(0.00481955, dtype=float32), 'loss_cross_entropy': Array(0.6584047, dtype=float32)}


  2%|▏         | 24120/1000000 [1:14:56<29:19:40,  9.24it/s]

{'loss': Array(0.6652843, dtype=float32), 'loss_reward': Array(0.00520885, dtype=float32), 'loss_cross_entropy': Array(0.6600755, dtype=float32)}


  2%|▏         | 24128/1000000 [1:14:57<33:06:01,  8.19it/s]

{'loss': Array(0.6585544, dtype=float32), 'loss_reward': Array(0.00504771, dtype=float32), 'loss_cross_entropy': Array(0.6535067, dtype=float32)}


  2%|▏         | 24140/1000000 [1:14:59<27:13:36,  9.96it/s]

{'loss': Array(0.65177935, dtype=float32), 'loss_reward': Array(0.00502867, dtype=float32), 'loss_cross_entropy': Array(0.6467506, dtype=float32)}


  2%|▏         | 24148/1000000 [1:15:00<39:27:28,  6.87it/s]

{'loss': Array(0.6524307, dtype=float32), 'loss_reward': Array(0.00498165, dtype=float32), 'loss_cross_entropy': Array(0.64744914, dtype=float32)}


  2%|▏         | 24160/1000000 [1:15:02<29:05:53,  9.32it/s]

{'loss': Array(0.6597177, dtype=float32), 'loss_reward': Array(0.00526825, dtype=float32), 'loss_cross_entropy': Array(0.6544494, dtype=float32)}


  2%|▏         | 24168/1000000 [1:15:03<33:28:44,  8.10it/s]

{'loss': Array(0.6591134, dtype=float32), 'loss_reward': Array(0.00512751, dtype=float32), 'loss_cross_entropy': Array(0.6539859, dtype=float32)}


  2%|▏         | 24180/1000000 [1:15:05<27:30:53,  9.85it/s]

{'loss': Array(0.6545626, dtype=float32), 'loss_reward': Array(0.00499654, dtype=float32), 'loss_cross_entropy': Array(0.649566, dtype=float32)}


  2%|▏         | 24189/1000000 [1:15:07<34:50:54,  7.78it/s]

{'loss': Array(0.652598, dtype=float32), 'loss_reward': Array(0.00488821, dtype=float32), 'loss_cross_entropy': Array(0.6477098, dtype=float32)}


  2%|▏         | 24199/1000000 [1:15:08<28:57:12,  9.36it/s]

{'loss': Array(0.65312594, dtype=float32), 'loss_reward': Array(0.00507329, dtype=float32), 'loss_cross_entropy': Array(0.64805263, dtype=float32)}


  2%|▏         | 24210/1000000 [1:15:10<26:39:00, 10.17it/s]

{'loss': Array(0.6534427, dtype=float32), 'loss_reward': Array(0.00502722, dtype=float32), 'loss_cross_entropy': Array(0.6484155, dtype=float32)}


  2%|▏         | 24220/1000000 [1:15:12<37:25:03,  7.24it/s]

{'loss': Array(0.65777403, dtype=float32), 'loss_reward': Array(0.0050895, dtype=float32), 'loss_cross_entropy': Array(0.6526845, dtype=float32)}


  2%|▏         | 24230/1000000 [1:15:13<29:41:15,  9.13it/s]

{'loss': Array(0.66096616, dtype=float32), 'loss_reward': Array(0.00502782, dtype=float32), 'loss_cross_entropy': Array(0.6559383, dtype=float32)}


  2%|▏         | 24238/1000000 [1:15:15<34:36:09,  7.83it/s]

{'loss': Array(0.6585757, dtype=float32), 'loss_reward': Array(0.0049641, dtype=float32), 'loss_cross_entropy': Array(0.6536116, dtype=float32)}


  2%|▏         | 24250/1000000 [1:15:17<27:29:11,  9.86it/s]

{'loss': Array(0.64110726, dtype=float32), 'loss_reward': Array(0.00479626, dtype=float32), 'loss_cross_entropy': Array(0.636311, dtype=float32)}


  2%|▏         | 24258/1000000 [1:15:18<42:30:36,  6.38it/s]

{'loss': Array(0.65229225, dtype=float32), 'loss_reward': Array(0.00489597, dtype=float32), 'loss_cross_entropy': Array(0.64739627, dtype=float32)}


  2%|▏         | 24270/1000000 [1:15:20<29:07:21,  9.31it/s]

{'loss': Array(0.6548292, dtype=float32), 'loss_reward': Array(0.00488071, dtype=float32), 'loss_cross_entropy': Array(0.64994854, dtype=float32)}


  2%|▏         | 24278/1000000 [1:15:21<33:15:17,  8.15it/s]

{'loss': Array(0.64497477, dtype=float32), 'loss_reward': Array(0.00482825, dtype=float32), 'loss_cross_entropy': Array(0.6401465, dtype=float32)}


  2%|▏         | 24290/1000000 [1:15:23<27:31:41,  9.85it/s]

{'loss': Array(0.65342826, dtype=float32), 'loss_reward': Array(0.00497091, dtype=float32), 'loss_cross_entropy': Array(0.64845735, dtype=float32)}


  2%|▏         | 24298/1000000 [1:15:25<40:36:52,  6.67it/s]

{'loss': Array(0.6502439, dtype=float32), 'loss_reward': Array(0.0048809, dtype=float32), 'loss_cross_entropy': Array(0.645363, dtype=float32)}


  2%|▏         | 24310/1000000 [1:15:27<29:53:27,  9.07it/s]

{'loss': Array(0.6478762, dtype=float32), 'loss_reward': Array(0.00487973, dtype=float32), 'loss_cross_entropy': Array(0.6429965, dtype=float32)}


  2%|▏         | 24318/1000000 [1:15:28<33:49:56,  8.01it/s]

{'loss': Array(0.6426314, dtype=float32), 'loss_reward': Array(0.00494048, dtype=float32), 'loss_cross_entropy': Array(0.63769096, dtype=float32)}


  2%|▏         | 24330/1000000 [1:15:30<27:39:26,  9.80it/s]

{'loss': Array(0.6478183, dtype=float32), 'loss_reward': Array(0.00492614, dtype=float32), 'loss_cross_entropy': Array(0.6428921, dtype=float32)}


  2%|▏         | 24338/1000000 [1:15:31<40:41:02,  6.66it/s]

{'loss': Array(0.64808875, dtype=float32), 'loss_reward': Array(0.00497158, dtype=float32), 'loss_cross_entropy': Array(0.6431171, dtype=float32)}


  2%|▏         | 24350/1000000 [1:15:33<29:08:55,  9.30it/s]

{'loss': Array(0.6530854, dtype=float32), 'loss_reward': Array(0.00498174, dtype=float32), 'loss_cross_entropy': Array(0.64810365, dtype=float32)}


  2%|▏         | 24358/1000000 [1:15:34<33:20:49,  8.13it/s]

{'loss': Array(0.6537061, dtype=float32), 'loss_reward': Array(0.00487804, dtype=float32), 'loss_cross_entropy': Array(0.648828, dtype=float32)}


  2%|▏         | 24370/1000000 [1:15:36<27:30:09,  9.85it/s]

{'loss': Array(0.6435667, dtype=float32), 'loss_reward': Array(0.00491465, dtype=float32), 'loss_cross_entropy': Array(0.638652, dtype=float32)}


  2%|▏         | 24378/1000000 [1:15:38<36:58:34,  7.33it/s]

{'loss': Array(0.6477198, dtype=float32), 'loss_reward': Array(0.00505866, dtype=float32), 'loss_cross_entropy': Array(0.64266104, dtype=float32)}


  2%|▏         | 24390/1000000 [1:15:40<28:48:15,  9.41it/s]

{'loss': Array(0.64065635, dtype=float32), 'loss_reward': Array(0.00497031, dtype=float32), 'loss_cross_entropy': Array(0.6356861, dtype=float32)}


  2%|▏         | 24398/1000000 [1:15:41<33:02:50,  8.20it/s]

{'loss': Array(0.6474044, dtype=float32), 'loss_reward': Array(0.00490172, dtype=float32), 'loss_cross_entropy': Array(0.64250267, dtype=float32)}


  2%|▏         | 24410/1000000 [1:15:43<36:32:37,  7.42it/s]

{'loss': Array(0.6496772, dtype=float32), 'loss_reward': Array(0.00490312, dtype=float32), 'loss_cross_entropy': Array(0.64477414, dtype=float32)}


  2%|▏         | 24419/1000000 [1:15:44<32:05:42,  8.44it/s]

{'loss': Array(0.6492762, dtype=float32), 'loss_reward': Array(0.00510113, dtype=float32), 'loss_cross_entropy': Array(0.644175, dtype=float32)}


  2%|▏         | 24430/1000000 [1:15:46<28:02:22,  9.66it/s]

{'loss': Array(0.6484687, dtype=float32), 'loss_reward': Array(0.00505914, dtype=float32), 'loss_cross_entropy': Array(0.6434095, dtype=float32)}


  2%|▏         | 24438/1000000 [1:15:48<33:31:29,  8.08it/s]

{'loss': Array(0.64376205, dtype=float32), 'loss_reward': Array(0.00487573, dtype=float32), 'loss_cross_entropy': Array(0.6388863, dtype=float32)}


  2%|▏         | 24449/1000000 [1:15:50<36:38:01,  7.40it/s]

{'loss': Array(0.6434223, dtype=float32), 'loss_reward': Array(0.00486919, dtype=float32), 'loss_cross_entropy': Array(0.63855314, dtype=float32)}


  2%|▏         | 24460/1000000 [1:15:51<29:06:21,  9.31it/s]

{'loss': Array(0.64382565, dtype=float32), 'loss_reward': Array(0.00505995, dtype=float32), 'loss_cross_entropy': Array(0.63876563, dtype=float32)}


  2%|▏         | 24470/1000000 [1:15:53<28:27:40,  9.52it/s]

{'loss': Array(0.6435366, dtype=float32), 'loss_reward': Array(0.00494892, dtype=float32), 'loss_cross_entropy': Array(0.6385878, dtype=float32)}


  2%|▏         | 24478/1000000 [1:15:54<34:18:42,  7.90it/s]

{'loss': Array(0.6421695, dtype=float32), 'loss_reward': Array(0.00484175, dtype=float32), 'loss_cross_entropy': Array(0.6373278, dtype=float32)}


  2%|▏         | 24489/1000000 [1:15:56<36:21:55,  7.45it/s]

{'loss': Array(0.6399708, dtype=float32), 'loss_reward': Array(0.00496075, dtype=float32), 'loss_cross_entropy': Array(0.63501, dtype=float32)}


  2%|▏         | 24500/1000000 [1:15:58<28:25:52,  9.53it/s]

{'loss': Array(0.6488467, dtype=float32), 'loss_reward': Array(0.00482323, dtype=float32), 'loss_cross_entropy': Array(0.64402336, dtype=float32)}


  2%|▏         | 24508/1000000 [1:16:09<174:06:37,  1.56it/s]

{'loss': Array(0.643822, dtype=float32), 'loss_reward': Array(0.00491793, dtype=float32), 'loss_cross_entropy': Array(0.6389041, dtype=float32)}


  2%|▏         | 24520/1000000 [1:16:11<59:15:14,  4.57it/s] 

{'loss': Array(0.65194005, dtype=float32), 'loss_reward': Array(0.00492573, dtype=float32), 'loss_cross_entropy': Array(0.6470144, dtype=float32)}


  2%|▏         | 24530/1000000 [1:16:13<41:24:09,  6.54it/s]

{'loss': Array(0.64850146, dtype=float32), 'loss_reward': Array(0.00496109, dtype=float32), 'loss_cross_entropy': Array(0.6435403, dtype=float32)}


  2%|▏         | 24538/1000000 [1:16:14<38:40:13,  7.01it/s]

{'loss': Array(0.64626485, dtype=float32), 'loss_reward': Array(0.00505391, dtype=float32), 'loss_cross_entropy': Array(0.641211, dtype=float32)}


  2%|▏         | 24550/1000000 [1:16:16<28:31:53,  9.50it/s]

{'loss': Array(0.64298373, dtype=float32), 'loss_reward': Array(0.00496355, dtype=float32), 'loss_cross_entropy': Array(0.6380202, dtype=float32)}


  2%|▏         | 24558/1000000 [1:16:17<33:27:29,  8.10it/s]

{'loss': Array(0.6433601, dtype=float32), 'loss_reward': Array(0.00496557, dtype=float32), 'loss_cross_entropy': Array(0.63839453, dtype=float32)}


  2%|▏         | 24570/1000000 [1:16:19<31:00:05,  8.74it/s]

{'loss': Array(0.6512773, dtype=float32), 'loss_reward': Array(0.00514156, dtype=float32), 'loss_cross_entropy': Array(0.64613575, dtype=float32)}


  2%|▏         | 24578/1000000 [1:16:21<34:36:47,  7.83it/s]

{'loss': Array(0.64840263, dtype=float32), 'loss_reward': Array(0.00487405, dtype=float32), 'loss_cross_entropy': Array(0.64352864, dtype=float32)}


  2%|▏         | 24590/1000000 [1:16:22<29:02:25,  9.33it/s]

{'loss': Array(0.64288193, dtype=float32), 'loss_reward': Array(0.00510833, dtype=float32), 'loss_cross_entropy': Array(0.6377737, dtype=float32)}


  2%|▏         | 24600/1000000 [1:16:24<40:57:33,  6.61it/s]

{'loss': Array(0.6418374, dtype=float32), 'loss_reward': Array(0.00478966, dtype=float32), 'loss_cross_entropy': Array(0.63704777, dtype=float32)}


  2%|▏         | 24608/1000000 [1:16:26<38:54:31,  6.96it/s]

{'loss': Array(0.64147, dtype=float32), 'loss_reward': Array(0.00480813, dtype=float32), 'loss_cross_entropy': Array(0.6366618, dtype=float32)}


  2%|▏         | 24619/1000000 [1:16:28<30:30:11,  8.88it/s]

{'loss': Array(0.6410774, dtype=float32), 'loss_reward': Array(0.00485265, dtype=float32), 'loss_cross_entropy': Array(0.63622475, dtype=float32)}


  2%|▏         | 24630/1000000 [1:16:29<27:18:29,  9.92it/s]

{'loss': Array(0.6422736, dtype=float32), 'loss_reward': Array(0.00493191, dtype=float32), 'loss_cross_entropy': Array(0.6373416, dtype=float32)}


  2%|▏         | 24640/1000000 [1:16:31<34:39:01,  7.82it/s]

{'loss': Array(0.63688415, dtype=float32), 'loss_reward': Array(0.00486952, dtype=float32), 'loss_cross_entropy': Array(0.6320147, dtype=float32)}


  2%|▏         | 24650/1000000 [1:16:33<29:30:09,  9.18it/s]

{'loss': Array(0.6407853, dtype=float32), 'loss_reward': Array(0.00486536, dtype=float32), 'loss_cross_entropy': Array(0.63592, dtype=float32)}


  2%|▏         | 24660/1000000 [1:16:34<28:35:00,  9.48it/s]

{'loss': Array(0.64275354, dtype=float32), 'loss_reward': Array(0.00491203, dtype=float32), 'loss_cross_entropy': Array(0.6378414, dtype=float32)}


  2%|▏         | 24668/1000000 [1:16:36<33:16:43,  8.14it/s]

{'loss': Array(0.6422891, dtype=float32), 'loss_reward': Array(0.00488537, dtype=float32), 'loss_cross_entropy': Array(0.6374037, dtype=float32)}


  2%|▏         | 24679/1000000 [1:16:38<35:30:27,  7.63it/s]

{'loss': Array(0.6501811, dtype=float32), 'loss_reward': Array(0.00487514, dtype=float32), 'loss_cross_entropy': Array(0.64530593, dtype=float32)}


  2%|▏         | 24690/1000000 [1:16:39<28:26:45,  9.52it/s]

{'loss': Array(0.6397691, dtype=float32), 'loss_reward': Array(0.00504066, dtype=float32), 'loss_cross_entropy': Array(0.6347284, dtype=float32)}


  2%|▏         | 24698/1000000 [1:16:41<33:08:58,  8.17it/s]

{'loss': Array(0.6470437, dtype=float32), 'loss_reward': Array(0.00502408, dtype=float32), 'loss_cross_entropy': Array(0.6420197, dtype=float32)}


  2%|▏         | 24710/1000000 [1:16:42<27:57:11,  9.69it/s]

{'loss': Array(0.6445314, dtype=float32), 'loss_reward': Array(0.00490409, dtype=float32), 'loss_cross_entropy': Array(0.6396274, dtype=float32)}


  2%|▏         | 24720/1000000 [1:16:44<32:35:35,  8.31it/s]

{'loss': Array(0.6404905, dtype=float32), 'loss_reward': Array(0.00482126, dtype=float32), 'loss_cross_entropy': Array(0.6356693, dtype=float32)}


  2%|▏         | 24728/1000000 [1:16:46<35:12:33,  7.69it/s]

{'loss': Array(0.6467402, dtype=float32), 'loss_reward': Array(0.00492878, dtype=float32), 'loss_cross_entropy': Array(0.64181155, dtype=float32)}


  2%|▏         | 24740/1000000 [1:16:47<28:23:35,  9.54it/s]

{'loss': Array(0.64306736, dtype=float32), 'loss_reward': Array(0.00506747, dtype=float32), 'loss_cross_entropy': Array(0.63799995, dtype=float32)}


  2%|▏         | 24748/1000000 [1:16:49<34:17:09,  7.90it/s]

{'loss': Array(0.64364165, dtype=float32), 'loss_reward': Array(0.00493218, dtype=float32), 'loss_cross_entropy': Array(0.6387095, dtype=float32)}


  2%|▏         | 24759/1000000 [1:16:51<34:47:57,  7.78it/s]

{'loss': Array(0.63794684, dtype=float32), 'loss_reward': Array(0.00487333, dtype=float32), 'loss_cross_entropy': Array(0.6330735, dtype=float32)}


  2%|▏         | 24770/1000000 [1:16:53<29:17:30,  9.25it/s]

{'loss': Array(0.6496061, dtype=float32), 'loss_reward': Array(0.00487941, dtype=float32), 'loss_cross_entropy': Array(0.6447267, dtype=float32)}


  2%|▏         | 24778/1000000 [1:16:54<33:30:46,  8.08it/s]

{'loss': Array(0.6374491, dtype=float32), 'loss_reward': Array(0.00496935, dtype=float32), 'loss_cross_entropy': Array(0.6324797, dtype=float32)}


  2%|▏         | 24790/1000000 [1:16:56<37:07:38,  7.30it/s]

{'loss': Array(0.64203924, dtype=float32), 'loss_reward': Array(0.00482118, dtype=float32), 'loss_cross_entropy': Array(0.6372181, dtype=float32)}


  2%|▏         | 24798/1000000 [1:16:57<36:16:06,  7.47it/s]

{'loss': Array(0.63994175, dtype=float32), 'loss_reward': Array(0.00498593, dtype=float32), 'loss_cross_entropy': Array(0.6349558, dtype=float32)}


  2%|▏         | 24809/1000000 [1:16:59<29:34:02,  9.16it/s]

{'loss': Array(0.63761383, dtype=float32), 'loss_reward': Array(0.00495204, dtype=float32), 'loss_cross_entropy': Array(0.6326618, dtype=float32)}


  2%|▏         | 24820/1000000 [1:17:01<26:58:37, 10.04it/s]

{'loss': Array(0.6404621, dtype=float32), 'loss_reward': Array(0.00493297, dtype=float32), 'loss_cross_entropy': Array(0.63552916, dtype=float32)}


  2%|▏         | 24830/1000000 [1:17:03<35:36:08,  7.61it/s]

{'loss': Array(0.6387434, dtype=float32), 'loss_reward': Array(0.00485022, dtype=float32), 'loss_cross_entropy': Array(0.6338932, dtype=float32)}


  2%|▏         | 24838/1000000 [1:17:04<36:36:12,  7.40it/s]

{'loss': Array(0.63828623, dtype=float32), 'loss_reward': Array(0.00494258, dtype=float32), 'loss_cross_entropy': Array(0.63334364, dtype=float32)}


  2%|▏         | 24850/1000000 [1:17:06<29:01:56,  9.33it/s]

{'loss': Array(0.6336451, dtype=float32), 'loss_reward': Array(0.00484392, dtype=float32), 'loss_cross_entropy': Array(0.6288012, dtype=float32)}


  2%|▏         | 24858/1000000 [1:17:07<33:23:53,  8.11it/s]

{'loss': Array(0.64123404, dtype=float32), 'loss_reward': Array(0.00488402, dtype=float32), 'loss_cross_entropy': Array(0.63635, dtype=float32)}


  2%|▏         | 24868/1000000 [1:17:09<39:00:31,  6.94it/s]

{'loss': Array(0.6368737, dtype=float32), 'loss_reward': Array(0.0048433, dtype=float32), 'loss_cross_entropy': Array(0.6320304, dtype=float32)}


  2%|▏         | 24880/1000000 [1:17:11<28:22:08,  9.55it/s]

{'loss': Array(0.64050126, dtype=float32), 'loss_reward': Array(0.00503519, dtype=float32), 'loss_cross_entropy': Array(0.63546604, dtype=float32)}


  2%|▏         | 24888/1000000 [1:17:12<33:24:15,  8.11it/s]

{'loss': Array(0.63142014, dtype=float32), 'loss_reward': Array(0.00495697, dtype=float32), 'loss_cross_entropy': Array(0.62646323, dtype=float32)}


  2%|▏         | 24899/1000000 [1:17:14<29:28:23,  9.19it/s]

{'loss': Array(0.6422383, dtype=float32), 'loss_reward': Array(0.00508456, dtype=float32), 'loss_cross_entropy': Array(0.6371537, dtype=float32)}


  2%|▏         | 24909/1000000 [1:17:16<33:31:16,  8.08it/s]

{'loss': Array(0.64008206, dtype=float32), 'loss_reward': Array(0.00491375, dtype=float32), 'loss_cross_entropy': Array(0.6351684, dtype=float32)}


  2%|▏         | 24919/1000000 [1:17:18<30:42:44,  8.82it/s]

{'loss': Array(0.6311774, dtype=float32), 'loss_reward': Array(0.0048289, dtype=float32), 'loss_cross_entropy': Array(0.6263486, dtype=float32)}


  2%|▏         | 24930/1000000 [1:17:19<27:41:21,  9.78it/s]

{'loss': Array(0.63923246, dtype=float32), 'loss_reward': Array(0.00487971, dtype=float32), 'loss_cross_entropy': Array(0.6343527, dtype=float32)}


  2%|▏         | 24938/1000000 [1:17:21<33:08:51,  8.17it/s]

{'loss': Array(0.63375497, dtype=float32), 'loss_reward': Array(0.00499251, dtype=float32), 'loss_cross_entropy': Array(0.62876254, dtype=float32)}


  2%|▏         | 24949/1000000 [1:17:23<33:36:57,  8.06it/s]

{'loss': Array(0.63259226, dtype=float32), 'loss_reward': Array(0.00501156, dtype=float32), 'loss_cross_entropy': Array(0.62758076, dtype=float32)}


  2%|▏         | 24960/1000000 [1:17:24<28:27:25,  9.52it/s]

{'loss': Array(0.631756, dtype=float32), 'loss_reward': Array(0.00484327, dtype=float32), 'loss_cross_entropy': Array(0.6269128, dtype=float32)}


  2%|▏         | 24970/1000000 [1:17:26<28:55:56,  9.36it/s]

{'loss': Array(0.63371295, dtype=float32), 'loss_reward': Array(0.00499521, dtype=float32), 'loss_cross_entropy': Array(0.6287177, dtype=float32)}


  2%|▏         | 24980/1000000 [1:17:28<41:21:18,  6.55it/s]

{'loss': Array(0.634227, dtype=float32), 'loss_reward': Array(0.00484763, dtype=float32), 'loss_cross_entropy': Array(0.62937933, dtype=float32)}


  2%|▏         | 24988/1000000 [1:17:29<39:44:45,  6.81it/s]

{'loss': Array(0.6349023, dtype=float32), 'loss_reward': Array(0.00495302, dtype=float32), 'loss_cross_entropy': Array(0.6299493, dtype=float32)}


  2%|▎         | 25000/1000000 [1:17:31<28:34:37,  9.48it/s]

{'loss': Array(0.6323146, dtype=float32), 'loss_reward': Array(0.00497171, dtype=float32), 'loss_cross_entropy': Array(0.6273429, dtype=float32)}


  3%|▎         | 25008/1000000 [1:17:43<177:37:15,  1.52it/s]

{'loss': Array(0.6393928, dtype=float32), 'loss_reward': Array(0.00479666, dtype=float32), 'loss_cross_entropy': Array(0.6345961, dtype=float32)}


  3%|▎         | 25018/1000000 [1:17:45<81:01:17,  3.34it/s] 

{'loss': Array(0.6392712, dtype=float32), 'loss_reward': Array(0.00491077, dtype=float32), 'loss_cross_entropy': Array(0.63436043, dtype=float32)}


  3%|▎         | 25030/1000000 [1:17:46<37:32:56,  7.21it/s]

{'loss': Array(0.63571805, dtype=float32), 'loss_reward': Array(0.00495148, dtype=float32), 'loss_cross_entropy': Array(0.6307666, dtype=float32)}


  3%|▎         | 25038/1000000 [1:17:48<37:14:13,  7.27it/s]

{'loss': Array(0.6334447, dtype=float32), 'loss_reward': Array(0.00492898, dtype=float32), 'loss_cross_entropy': Array(0.6285158, dtype=float32)}


  3%|▎         | 25050/1000000 [1:17:49<28:17:21,  9.57it/s]

{'loss': Array(0.6211459, dtype=float32), 'loss_reward': Array(0.00486029, dtype=float32), 'loss_cross_entropy': Array(0.6162856, dtype=float32)}


  3%|▎         | 25060/1000000 [1:17:51<35:44:51,  7.58it/s]

{'loss': Array(0.6290275, dtype=float32), 'loss_reward': Array(0.00481425, dtype=float32), 'loss_cross_entropy': Array(0.62421316, dtype=float32)}


  3%|▎         | 25068/1000000 [1:17:53<36:21:59,  7.45it/s]

{'loss': Array(0.63137734, dtype=float32), 'loss_reward': Array(0.00489273, dtype=float32), 'loss_cross_entropy': Array(0.6264847, dtype=float32)}


  3%|▎         | 25080/1000000 [1:17:54<28:18:40,  9.57it/s]

{'loss': Array(0.6296084, dtype=float32), 'loss_reward': Array(0.00489832, dtype=float32), 'loss_cross_entropy': Array(0.6247101, dtype=float32)}


  3%|▎         | 25088/1000000 [1:17:56<33:15:04,  8.14it/s]

{'loss': Array(0.6314026, dtype=float32), 'loss_reward': Array(0.00467614, dtype=float32), 'loss_cross_entropy': Array(0.6267265, dtype=float32)}


  3%|▎         | 25098/1000000 [1:17:58<36:59:20,  7.32it/s]

{'loss': Array(0.6389696, dtype=float32), 'loss_reward': Array(0.00484268, dtype=float32), 'loss_cross_entropy': Array(0.6341269, dtype=float32)}


  3%|▎         | 25110/1000000 [1:17:59<28:11:30,  9.61it/s]

{'loss': Array(0.62924033, dtype=float32), 'loss_reward': Array(0.00491273, dtype=float32), 'loss_cross_entropy': Array(0.62432754, dtype=float32)}


  3%|▎         | 25118/1000000 [1:18:01<33:37:34,  8.05it/s]

{'loss': Array(0.6347001, dtype=float32), 'loss_reward': Array(0.00481846, dtype=float32), 'loss_cross_entropy': Array(0.6298816, dtype=float32)}


  3%|▎         | 25130/1000000 [1:18:02<27:34:26,  9.82it/s]

{'loss': Array(0.6287618, dtype=float32), 'loss_reward': Array(0.00485101, dtype=float32), 'loss_cross_entropy': Array(0.62391084, dtype=float32)}


  3%|▎         | 25138/1000000 [1:18:04<38:22:56,  7.06it/s]

{'loss': Array(0.62623715, dtype=float32), 'loss_reward': Array(0.00469501, dtype=float32), 'loss_cross_entropy': Array(0.6215421, dtype=float32)}


  3%|▎         | 25150/1000000 [1:18:06<28:53:36,  9.37it/s]

{'loss': Array(0.62796205, dtype=float32), 'loss_reward': Array(0.00496783, dtype=float32), 'loss_cross_entropy': Array(0.6229942, dtype=float32)}


  3%|▎         | 25158/1000000 [1:18:07<34:12:27,  7.92it/s]

{'loss': Array(0.6251423, dtype=float32), 'loss_reward': Array(0.00481549, dtype=float32), 'loss_cross_entropy': Array(0.62032676, dtype=float32)}


  3%|▎         | 25169/1000000 [1:18:09<39:58:03,  6.78it/s]

{'loss': Array(0.6322481, dtype=float32), 'loss_reward': Array(0.00490566, dtype=float32), 'loss_cross_entropy': Array(0.6273424, dtype=float32)}


  3%|▎         | 25180/1000000 [1:18:11<29:10:20,  9.28it/s]

{'loss': Array(0.6278094, dtype=float32), 'loss_reward': Array(0.00485618, dtype=float32), 'loss_cross_entropy': Array(0.6229532, dtype=float32)}


  3%|▎         | 25188/1000000 [1:18:12<34:24:59,  7.87it/s]

{'loss': Array(0.6291111, dtype=float32), 'loss_reward': Array(0.00481663, dtype=float32), 'loss_cross_entropy': Array(0.62429446, dtype=float32)}


  3%|▎         | 25200/1000000 [1:18:14<27:31:22,  9.84it/s]

{'loss': Array(0.6355559, dtype=float32), 'loss_reward': Array(0.00496233, dtype=float32), 'loss_cross_entropy': Array(0.6305936, dtype=float32)}


  3%|▎         | 25210/1000000 [1:18:16<35:15:41,  7.68it/s]

{'loss': Array(0.6262006, dtype=float32), 'loss_reward': Array(0.00484371, dtype=float32), 'loss_cross_entropy': Array(0.62135684, dtype=float32)}


  3%|▎         | 25220/1000000 [1:18:18<31:36:27,  8.57it/s]

{'loss': Array(0.62827516, dtype=float32), 'loss_reward': Array(0.00475953, dtype=float32), 'loss_cross_entropy': Array(0.6235157, dtype=float32)}


  3%|▎         | 25228/1000000 [1:18:19<35:17:03,  7.67it/s]

{'loss': Array(0.62702626, dtype=float32), 'loss_reward': Array(0.0047715, dtype=float32), 'loss_cross_entropy': Array(0.6222548, dtype=float32)}


  3%|▎         | 25240/1000000 [1:18:21<28:31:03,  9.49it/s]

{'loss': Array(0.6230087, dtype=float32), 'loss_reward': Array(0.00475994, dtype=float32), 'loss_cross_entropy': Array(0.6182488, dtype=float32)}


  3%|▎         | 25250/1000000 [1:18:23<33:13:07,  8.15it/s]

{'loss': Array(0.62420166, dtype=float32), 'loss_reward': Array(0.00483898, dtype=float32), 'loss_cross_entropy': Array(0.61936265, dtype=float32)}


  3%|▎         | 25258/1000000 [1:18:24<36:31:06,  7.41it/s]

{'loss': Array(0.62639004, dtype=float32), 'loss_reward': Array(0.00497256, dtype=float32), 'loss_cross_entropy': Array(0.62141746, dtype=float32)}


  3%|▎         | 25270/1000000 [1:18:26<28:34:18,  9.48it/s]

{'loss': Array(0.6331781, dtype=float32), 'loss_reward': Array(0.00496864, dtype=float32), 'loss_cross_entropy': Array(0.6282096, dtype=float32)}


  3%|▎         | 25278/1000000 [1:18:27<33:26:59,  8.09it/s]

{'loss': Array(0.6277347, dtype=float32), 'loss_reward': Array(0.00475387, dtype=float32), 'loss_cross_entropy': Array(0.6229808, dtype=float32)}


  3%|▎         | 25289/1000000 [1:18:29<34:10:10,  7.92it/s]

{'loss': Array(0.6236083, dtype=float32), 'loss_reward': Array(0.0047668, dtype=float32), 'loss_cross_entropy': Array(0.6188415, dtype=float32)}


  3%|▎         | 25300/1000000 [1:18:31<28:41:40,  9.44it/s]

{'loss': Array(0.6252282, dtype=float32), 'loss_reward': Array(0.00479835, dtype=float32), 'loss_cross_entropy': Array(0.62042993, dtype=float32)}


  3%|▎         | 25308/1000000 [1:18:32<33:57:08,  7.97it/s]

{'loss': Array(0.6266107, dtype=float32), 'loss_reward': Array(0.00485298, dtype=float32), 'loss_cross_entropy': Array(0.62175775, dtype=float32)}


  3%|▎         | 25320/1000000 [1:18:35<37:48:14,  7.16it/s]

{'loss': Array(0.6243749, dtype=float32), 'loss_reward': Array(0.00484512, dtype=float32), 'loss_cross_entropy': Array(0.6195298, dtype=float32)}


  3%|▎         | 25328/1000000 [1:18:36<37:14:14,  7.27it/s]

{'loss': Array(0.6190778, dtype=float32), 'loss_reward': Array(0.00470787, dtype=float32), 'loss_cross_entropy': Array(0.6143699, dtype=float32)}


  3%|▎         | 25340/1000000 [1:18:38<28:19:40,  9.56it/s]

{'loss': Array(0.6261, dtype=float32), 'loss_reward': Array(0.00477972, dtype=float32), 'loss_cross_entropy': Array(0.6213202, dtype=float32)}


  3%|▎         | 25348/1000000 [1:18:39<32:51:53,  8.24it/s]

{'loss': Array(0.6226587, dtype=float32), 'loss_reward': Array(0.00474986, dtype=float32), 'loss_cross_entropy': Array(0.61790895, dtype=float32)}


  3%|▎         | 25358/1000000 [1:18:41<41:28:23,  6.53it/s]

{'loss': Array(0.623648, dtype=float32), 'loss_reward': Array(0.00498523, dtype=float32), 'loss_cross_entropy': Array(0.61866266, dtype=float32)}


  3%|▎         | 25369/1000000 [1:18:43<30:41:40,  8.82it/s]

{'loss': Array(0.6327395, dtype=float32), 'loss_reward': Array(0.00499177, dtype=float32), 'loss_cross_entropy': Array(0.6277477, dtype=float32)}


  3%|▎         | 25380/1000000 [1:18:44<26:51:23, 10.08it/s]

{'loss': Array(0.6180792, dtype=float32), 'loss_reward': Array(0.00492656, dtype=float32), 'loss_cross_entropy': Array(0.6131526, dtype=float32)}


  3%|▎         | 25388/1000000 [1:18:46<33:21:43,  8.11it/s]

{'loss': Array(0.6247181, dtype=float32), 'loss_reward': Array(0.00480072, dtype=float32), 'loss_cross_entropy': Array(0.6199174, dtype=float32)}


  3%|▎         | 25399/1000000 [1:18:47<35:36:27,  7.60it/s]

{'loss': Array(0.6153335, dtype=float32), 'loss_reward': Array(0.00483699, dtype=float32), 'loss_cross_entropy': Array(0.6104964, dtype=float32)}


  3%|▎         | 25409/1000000 [1:18:49<29:19:07,  9.23it/s]

{'loss': Array(0.61447495, dtype=float32), 'loss_reward': Array(0.00483254, dtype=float32), 'loss_cross_entropy': Array(0.6096424, dtype=float32)}


  3%|▎         | 25420/1000000 [1:18:51<27:40:38,  9.78it/s]

{'loss': Array(0.6149913, dtype=float32), 'loss_reward': Array(0.00487229, dtype=float32), 'loss_cross_entropy': Array(0.610119, dtype=float32)}


  3%|▎         | 25428/1000000 [1:18:52<33:31:11,  8.08it/s]

{'loss': Array(0.6179779, dtype=float32), 'loss_reward': Array(0.00477122, dtype=float32), 'loss_cross_entropy': Array(0.6132066, dtype=float32)}


  3%|▎         | 25439/1000000 [1:18:54<33:18:07,  8.13it/s]

{'loss': Array(0.6226778, dtype=float32), 'loss_reward': Array(0.0050261, dtype=float32), 'loss_cross_entropy': Array(0.61765164, dtype=float32)}


  3%|▎         | 25450/1000000 [1:18:56<27:48:19,  9.74it/s]

{'loss': Array(0.6188156, dtype=float32), 'loss_reward': Array(0.00476161, dtype=float32), 'loss_cross_entropy': Array(0.61405396, dtype=float32)}


  3%|▎         | 25458/1000000 [1:18:57<32:36:48,  8.30it/s]

{'loss': Array(0.6188593, dtype=float32), 'loss_reward': Array(0.00474493, dtype=float32), 'loss_cross_entropy': Array(0.6141144, dtype=float32)}


  3%|▎         | 25470/1000000 [1:18:59<28:24:08,  9.53it/s]

{'loss': Array(0.6126323, dtype=float32), 'loss_reward': Array(0.00479043, dtype=float32), 'loss_cross_entropy': Array(0.6078418, dtype=float32)}


  3%|▎         | 25478/1000000 [1:19:01<37:47:39,  7.16it/s]

{'loss': Array(0.61819553, dtype=float32), 'loss_reward': Array(0.00495085, dtype=float32), 'loss_cross_entropy': Array(0.6132447, dtype=float32)}


  3%|▎         | 25490/1000000 [1:19:02<28:20:26,  9.55it/s]

{'loss': Array(0.6178089, dtype=float32), 'loss_reward': Array(0.00479362, dtype=float32), 'loss_cross_entropy': Array(0.6130153, dtype=float32)}


  3%|▎         | 25498/1000000 [1:19:04<33:33:45,  8.07it/s]

{'loss': Array(0.61425865, dtype=float32), 'loss_reward': Array(0.00485704, dtype=float32), 'loss_cross_entropy': Array(0.6094016, dtype=float32)}


  3%|▎         | 25509/1000000 [1:19:15<142:19:58,  1.90it/s]

{'loss': Array(0.6187512, dtype=float32), 'loss_reward': Array(0.00475547, dtype=float32), 'loss_cross_entropy': Array(0.61399573, dtype=float32)}


  3%|▎         | 25520/1000000 [1:19:17<53:28:09,  5.06it/s] 

{'loss': Array(0.6190562, dtype=float32), 'loss_reward': Array(0.00485294, dtype=float32), 'loss_cross_entropy': Array(0.61420333, dtype=float32)}


  3%|▎         | 25528/1000000 [1:19:19<43:59:58,  6.15it/s]

{'loss': Array(0.6230511, dtype=float32), 'loss_reward': Array(0.00482717, dtype=float32), 'loss_cross_entropy': Array(0.6182239, dtype=float32)}


  3%|▎         | 25540/1000000 [1:19:20<31:04:03,  8.71it/s]

{'loss': Array(0.6213291, dtype=float32), 'loss_reward': Array(0.00486701, dtype=float32), 'loss_cross_entropy': Array(0.6164622, dtype=float32)}


  3%|▎         | 25550/1000000 [1:19:22<36:21:19,  7.45it/s]

{'loss': Array(0.62296, dtype=float32), 'loss_reward': Array(0.00485386, dtype=float32), 'loss_cross_entropy': Array(0.61810607, dtype=float32)}


  3%|▎         | 25558/1000000 [1:19:24<36:22:37,  7.44it/s]

{'loss': Array(0.62172514, dtype=float32), 'loss_reward': Array(0.0047764, dtype=float32), 'loss_cross_entropy': Array(0.6169487, dtype=float32)}


  3%|▎         | 25569/1000000 [1:19:25<29:08:14,  9.29it/s]

{'loss': Array(0.6212948, dtype=float32), 'loss_reward': Array(0.0049092, dtype=float32), 'loss_cross_entropy': Array(0.61638564, dtype=float32)}


  3%|▎         | 25580/1000000 [1:19:27<28:22:38,  9.54it/s]

{'loss': Array(0.6250316, dtype=float32), 'loss_reward': Array(0.00484947, dtype=float32), 'loss_cross_entropy': Array(0.6201821, dtype=float32)}


  3%|▎         | 25590/1000000 [1:19:29<32:30:30,  8.33it/s]

{'loss': Array(0.61686546, dtype=float32), 'loss_reward': Array(0.00479818, dtype=float32), 'loss_cross_entropy': Array(0.61206716, dtype=float32)}


  3%|▎         | 25600/1000000 [1:19:30<28:46:02,  9.41it/s]

{'loss': Array(0.61505145, dtype=float32), 'loss_reward': Array(0.00459099, dtype=float32), 'loss_cross_entropy': Array(0.61046046, dtype=float32)}


  3%|▎         | 25610/1000000 [1:19:32<29:35:04,  9.15it/s]

{'loss': Array(0.6117209, dtype=float32), 'loss_reward': Array(0.00486626, dtype=float32), 'loss_cross_entropy': Array(0.6068546, dtype=float32)}


  3%|▎         | 25618/1000000 [1:19:34<34:44:44,  7.79it/s]

{'loss': Array(0.61980754, dtype=float32), 'loss_reward': Array(0.00495604, dtype=float32), 'loss_cross_entropy': Array(0.6148514, dtype=float32)}


  3%|▎         | 25629/1000000 [1:19:36<33:33:19,  8.07it/s]

{'loss': Array(0.62241095, dtype=float32), 'loss_reward': Array(0.00498449, dtype=float32), 'loss_cross_entropy': Array(0.6174265, dtype=float32)}


  3%|▎         | 25640/1000000 [1:19:37<27:35:03,  9.81it/s]

{'loss': Array(0.6207806, dtype=float32), 'loss_reward': Array(0.00489392, dtype=float32), 'loss_cross_entropy': Array(0.6158866, dtype=float32)}


  3%|▎         | 25648/1000000 [1:19:39<32:59:49,  8.20it/s]

{'loss': Array(0.6115991, dtype=float32), 'loss_reward': Array(0.00468198, dtype=float32), 'loss_cross_entropy': Array(0.606917, dtype=float32)}


  3%|▎         | 25660/1000000 [1:19:40<27:45:49,  9.75it/s]

{'loss': Array(0.61837274, dtype=float32), 'loss_reward': Array(0.00477115, dtype=float32), 'loss_cross_entropy': Array(0.6136016, dtype=float32)}


  3%|▎         | 25668/1000000 [1:19:42<37:53:48,  7.14it/s]

{'loss': Array(0.6146391, dtype=float32), 'loss_reward': Array(0.00483489, dtype=float32), 'loss_cross_entropy': Array(0.6098042, dtype=float32)}


  3%|▎         | 25680/1000000 [1:19:44<28:50:19,  9.38it/s]

{'loss': Array(0.6216337, dtype=float32), 'loss_reward': Array(0.00483563, dtype=float32), 'loss_cross_entropy': Array(0.61679804, dtype=float32)}


  3%|▎         | 25688/1000000 [1:19:45<33:31:24,  8.07it/s]

{'loss': Array(0.6132105, dtype=float32), 'loss_reward': Array(0.00469799, dtype=float32), 'loss_cross_entropy': Array(0.6085125, dtype=float32)}


  3%|▎         | 25699/1000000 [1:19:47<40:10:15,  6.74it/s]

{'loss': Array(0.6150772, dtype=float32), 'loss_reward': Array(0.00487739, dtype=float32), 'loss_cross_entropy': Array(0.61019987, dtype=float32)}


  3%|▎         | 25710/1000000 [1:19:49<30:12:21,  8.96it/s]

{'loss': Array(0.6100854, dtype=float32), 'loss_reward': Array(0.00484835, dtype=float32), 'loss_cross_entropy': Array(0.60523695, dtype=float32)}


  3%|▎         | 25718/1000000 [1:19:50<35:30:41,  7.62it/s]

{'loss': Array(0.6138191, dtype=float32), 'loss_reward': Array(0.00489105, dtype=float32), 'loss_cross_entropy': Array(0.60892814, dtype=float32)}


  3%|▎         | 25730/1000000 [1:19:52<28:17:55,  9.56it/s]

{'loss': Array(0.61995244, dtype=float32), 'loss_reward': Array(0.00483214, dtype=float32), 'loss_cross_entropy': Array(0.61512023, dtype=float32)}


  3%|▎         | 25740/1000000 [1:19:54<35:12:40,  7.69it/s]

{'loss': Array(0.6088464, dtype=float32), 'loss_reward': Array(0.00458191, dtype=float32), 'loss_cross_entropy': Array(0.6042645, dtype=float32)}


  3%|▎         | 25748/1000000 [1:19:55<35:06:02,  7.71it/s]

{'loss': Array(0.62085545, dtype=float32), 'loss_reward': Array(0.00493224, dtype=float32), 'loss_cross_entropy': Array(0.61592317, dtype=float32)}


  3%|▎         | 25760/1000000 [1:19:57<28:02:35,  9.65it/s]

{'loss': Array(0.61249304, dtype=float32), 'loss_reward': Array(0.00480627, dtype=float32), 'loss_cross_entropy': Array(0.6076868, dtype=float32)}


  3%|▎         | 25768/1000000 [1:19:58<33:44:28,  8.02it/s]

{'loss': Array(0.6069508, dtype=float32), 'loss_reward': Array(0.00458144, dtype=float32), 'loss_cross_entropy': Array(0.60236937, dtype=float32)}


  3%|▎         | 25779/1000000 [1:20:00<35:56:15,  7.53it/s]

{'loss': Array(0.60734487, dtype=float32), 'loss_reward': Array(0.00474251, dtype=float32), 'loss_cross_entropy': Array(0.60260236, dtype=float32)}


  3%|▎         | 25790/1000000 [1:20:02<29:20:19,  9.22it/s]

{'loss': Array(0.61427015, dtype=float32), 'loss_reward': Array(0.00483968, dtype=float32), 'loss_cross_entropy': Array(0.6094305, dtype=float32)}


  3%|▎         | 25798/1000000 [1:20:04<34:35:09,  7.82it/s]

{'loss': Array(0.61179316, dtype=float32), 'loss_reward': Array(0.00480236, dtype=float32), 'loss_cross_entropy': Array(0.6069908, dtype=float32)}


  3%|▎         | 25809/1000000 [1:20:05<29:27:26,  9.19it/s]

{'loss': Array(0.6119762, dtype=float32), 'loss_reward': Array(0.00493357, dtype=float32), 'loss_cross_entropy': Array(0.6070426, dtype=float32)}


  3%|▎         | 25819/1000000 [1:20:07<33:57:05,  7.97it/s]

{'loss': Array(0.60768265, dtype=float32), 'loss_reward': Array(0.00487332, dtype=float32), 'loss_cross_entropy': Array(0.60280937, dtype=float32)}


  3%|▎         | 25830/1000000 [1:20:09<28:25:45,  9.52it/s]

{'loss': Array(0.6027924, dtype=float32), 'loss_reward': Array(0.00469157, dtype=float32), 'loss_cross_entropy': Array(0.5981008, dtype=float32)}


  3%|▎         | 25838/1000000 [1:20:10<33:51:04,  7.99it/s]

{'loss': Array(0.61649114, dtype=float32), 'loss_reward': Array(0.00482022, dtype=float32), 'loss_cross_entropy': Array(0.6116709, dtype=float32)}


  3%|▎         | 25849/1000000 [1:20:12<29:15:27,  9.25it/s]

{'loss': Array(0.6195957, dtype=float32), 'loss_reward': Array(0.00480473, dtype=float32), 'loss_cross_entropy': Array(0.61479104, dtype=float32)}


  3%|▎         | 25859/1000000 [1:20:14<33:56:36,  7.97it/s]

{'loss': Array(0.61175466, dtype=float32), 'loss_reward': Array(0.00472213, dtype=float32), 'loss_cross_entropy': Array(0.6070325, dtype=float32)}


  3%|▎         | 25870/1000000 [1:20:15<27:52:02,  9.71it/s]

{'loss': Array(0.6092478, dtype=float32), 'loss_reward': Array(0.00470339, dtype=float32), 'loss_cross_entropy': Array(0.6045444, dtype=float32)}


  3%|▎         | 25878/1000000 [1:20:17<33:06:09,  8.17it/s]

{'loss': Array(0.61305106, dtype=float32), 'loss_reward': Array(0.00481349, dtype=float32), 'loss_cross_entropy': Array(0.6082375, dtype=float32)}


  3%|▎         | 25889/1000000 [1:20:19<39:35:31,  6.83it/s]

{'loss': Array(0.611549, dtype=float32), 'loss_reward': Array(0.0048026, dtype=float32), 'loss_cross_entropy': Array(0.6067464, dtype=float32)}


  3%|▎         | 25899/1000000 [1:20:20<30:17:29,  8.93it/s]

{'loss': Array(0.6052422, dtype=float32), 'loss_reward': Array(0.00455526, dtype=float32), 'loss_cross_entropy': Array(0.60068697, dtype=float32)}


  3%|▎         | 25910/1000000 [1:20:22<27:19:11,  9.90it/s]

{'loss': Array(0.6045601, dtype=float32), 'loss_reward': Array(0.00464024, dtype=float32), 'loss_cross_entropy': Array(0.59991986, dtype=float32)}


  3%|▎         | 25918/1000000 [1:20:23<33:34:26,  8.06it/s]

{'loss': Array(0.6105582, dtype=float32), 'loss_reward': Array(0.00481991, dtype=float32), 'loss_cross_entropy': Array(0.6057384, dtype=float32)}


  3%|▎         | 25930/1000000 [1:20:25<34:27:24,  7.85it/s]

{'loss': Array(0.613226, dtype=float32), 'loss_reward': Array(0.00481527, dtype=float32), 'loss_cross_entropy': Array(0.6084108, dtype=float32)}


  3%|▎         | 25938/1000000 [1:20:27<36:25:48,  7.43it/s]

{'loss': Array(0.6019677, dtype=float32), 'loss_reward': Array(0.00458965, dtype=float32), 'loss_cross_entropy': Array(0.597378, dtype=float32)}


  3%|▎         | 25950/1000000 [1:20:29<28:59:43,  9.33it/s]

{'loss': Array(0.6103351, dtype=float32), 'loss_reward': Array(0.00463946, dtype=float32), 'loss_cross_entropy': Array(0.60569566, dtype=float32)}


  3%|▎         | 25958/1000000 [1:20:30<35:03:05,  7.72it/s]

{'loss': Array(0.60605377, dtype=float32), 'loss_reward': Array(0.00474405, dtype=float32), 'loss_cross_entropy': Array(0.60130966, dtype=float32)}


  3%|▎         | 25969/1000000 [1:20:32<36:14:50,  7.46it/s]

{'loss': Array(0.609807, dtype=float32), 'loss_reward': Array(0.00460746, dtype=float32), 'loss_cross_entropy': Array(0.6051995, dtype=float32)}


  3%|▎         | 25979/1000000 [1:20:34<29:39:07,  9.12it/s]

{'loss': Array(0.60806066, dtype=float32), 'loss_reward': Array(0.00481812, dtype=float32), 'loss_cross_entropy': Array(0.6032426, dtype=float32)}


  3%|▎         | 25990/1000000 [1:20:35<27:32:16,  9.82it/s]

{'loss': Array(0.6068191, dtype=float32), 'loss_reward': Array(0.00480665, dtype=float32), 'loss_cross_entropy': Array(0.6020124, dtype=float32)}


  3%|▎         | 25998/1000000 [1:20:37<33:01:40,  8.19it/s]

{'loss': Array(0.61162704, dtype=float32), 'loss_reward': Array(0.00477109, dtype=float32), 'loss_cross_entropy': Array(0.60685605, dtype=float32)}


  3%|▎         | 26009/1000000 [1:20:49<133:55:00,  2.02it/s]

{'loss': Array(0.6115631, dtype=float32), 'loss_reward': Array(0.00469489, dtype=float32), 'loss_cross_entropy': Array(0.6068682, dtype=float32)}


  3%|▎         | 26020/1000000 [1:20:51<53:13:22,  5.08it/s] 

{'loss': Array(0.6092415, dtype=float32), 'loss_reward': Array(0.00482393, dtype=float32), 'loss_cross_entropy': Array(0.6044175, dtype=float32)}


  3%|▎         | 26028/1000000 [1:20:52<43:15:22,  6.25it/s]

{'loss': Array(0.60282654, dtype=float32), 'loss_reward': Array(0.0046572, dtype=float32), 'loss_cross_entropy': Array(0.59816945, dtype=float32)}


  3%|▎         | 26038/1000000 [1:20:54<33:09:28,  8.16it/s]

{'loss': Array(0.6075695, dtype=float32), 'loss_reward': Array(0.00495585, dtype=float32), 'loss_cross_entropy': Array(0.6026136, dtype=float32)}


  3%|▎         | 26049/1000000 [1:20:56<34:00:13,  7.96it/s]

{'loss': Array(0.6089095, dtype=float32), 'loss_reward': Array(0.0047389, dtype=float32), 'loss_cross_entropy': Array(0.60417056, dtype=float32)}


  3%|▎         | 26059/1000000 [1:20:57<30:37:28,  8.83it/s]

{'loss': Array(0.6080229, dtype=float32), 'loss_reward': Array(0.00480285, dtype=float32), 'loss_cross_entropy': Array(0.60322005, dtype=float32)}


  3%|▎         | 26070/1000000 [1:20:59<27:50:34,  9.72it/s]

{'loss': Array(0.6057885, dtype=float32), 'loss_reward': Array(0.00473759, dtype=float32), 'loss_cross_entropy': Array(0.601051, dtype=float32)}


  3%|▎         | 26080/1000000 [1:21:01<40:44:21,  6.64it/s]

{'loss': Array(0.61075103, dtype=float32), 'loss_reward': Array(0.00480846, dtype=float32), 'loss_cross_entropy': Array(0.6059425, dtype=float32)}


  3%|▎         | 26088/1000000 [1:21:02<37:15:38,  7.26it/s]

{'loss': Array(0.6037824, dtype=float32), 'loss_reward': Array(0.00464297, dtype=float32), 'loss_cross_entropy': Array(0.5991394, dtype=float32)}


  3%|▎         | 26100/1000000 [1:21:04<28:39:51,  9.44it/s]

{'loss': Array(0.60532403, dtype=float32), 'loss_reward': Array(0.0046417, dtype=float32), 'loss_cross_entropy': Array(0.6006823, dtype=float32)}


  3%|▎         | 26108/1000000 [1:21:05<34:10:17,  7.92it/s]

{'loss': Array(0.6078203, dtype=float32), 'loss_reward': Array(0.00477083, dtype=float32), 'loss_cross_entropy': Array(0.60304946, dtype=float32)}


  3%|▎         | 26120/1000000 [1:21:08<34:54:33,  7.75it/s]

{'loss': Array(0.61155295, dtype=float32), 'loss_reward': Array(0.00468142, dtype=float32), 'loss_cross_entropy': Array(0.60687155, dtype=float32)}


  3%|▎         | 26128/1000000 [1:21:09<35:48:25,  7.55it/s]

{'loss': Array(0.60069793, dtype=float32), 'loss_reward': Array(0.00491389, dtype=float32), 'loss_cross_entropy': Array(0.595784, dtype=float32)}


  3%|▎         | 26140/1000000 [1:21:11<27:59:59,  9.66it/s]

{'loss': Array(0.59886175, dtype=float32), 'loss_reward': Array(0.00463187, dtype=float32), 'loss_cross_entropy': Array(0.5942299, dtype=float32)}


  3%|▎         | 26148/1000000 [1:21:12<34:24:48,  7.86it/s]

{'loss': Array(0.6080417, dtype=float32), 'loss_reward': Array(0.00477866, dtype=float32), 'loss_cross_entropy': Array(0.6032631, dtype=float32)}


  3%|▎         | 26158/1000000 [1:21:14<39:13:41,  6.90it/s]

{'loss': Array(0.61442226, dtype=float32), 'loss_reward': Array(0.00470212, dtype=float32), 'loss_cross_entropy': Array(0.60972023, dtype=float32)}


  3%|▎         | 26169/1000000 [1:21:16<28:50:24,  9.38it/s]

{'loss': Array(0.6041943, dtype=float32), 'loss_reward': Array(0.00477896, dtype=float32), 'loss_cross_entropy': Array(0.59941536, dtype=float32)}


  3%|▎         | 26180/1000000 [1:21:17<27:48:07,  9.73it/s]

{'loss': Array(0.60353893, dtype=float32), 'loss_reward': Array(0.00457287, dtype=float32), 'loss_cross_entropy': Array(0.59896606, dtype=float32)}


  3%|▎         | 26188/1000000 [1:21:19<33:56:51,  7.97it/s]

{'loss': Array(0.6027558, dtype=float32), 'loss_reward': Array(0.00472304, dtype=float32), 'loss_cross_entropy': Array(0.5980328, dtype=float32)}


  3%|▎         | 26199/1000000 [1:21:21<33:43:06,  8.02it/s]

{'loss': Array(0.605646, dtype=float32), 'loss_reward': Array(0.00475418, dtype=float32), 'loss_cross_entropy': Array(0.60089177, dtype=float32)}


  3%|▎         | 26209/1000000 [1:21:22<29:23:36,  9.20it/s]

{'loss': Array(0.60283315, dtype=float32), 'loss_reward': Array(0.00479555, dtype=float32), 'loss_cross_entropy': Array(0.5980376, dtype=float32)}


  3%|▎         | 26219/1000000 [1:21:24<29:58:38,  9.02it/s]

{'loss': Array(0.6013458, dtype=float32), 'loss_reward': Array(0.00484767, dtype=float32), 'loss_cross_entropy': Array(0.59649813, dtype=float32)}


  3%|▎         | 26230/1000000 [1:21:26<27:28:56,  9.84it/s]

{'loss': Array(0.60087454, dtype=float32), 'loss_reward': Array(0.00469228, dtype=float32), 'loss_cross_entropy': Array(0.5961823, dtype=float32)}


  3%|▎         | 26238/1000000 [1:21:27<37:50:50,  7.15it/s]

{'loss': Array(0.6039092, dtype=float32), 'loss_reward': Array(0.00462842, dtype=float32), 'loss_cross_entropy': Array(0.5992808, dtype=float32)}


  3%|▎         | 26250/1000000 [1:21:29<28:44:29,  9.41it/s]

{'loss': Array(0.6043517, dtype=float32), 'loss_reward': Array(0.00493117, dtype=float32), 'loss_cross_entropy': Array(0.59942055, dtype=float32)}


  3%|▎         | 26260/1000000 [1:21:31<28:20:13,  9.55it/s]

{'loss': Array(0.60192424, dtype=float32), 'loss_reward': Array(0.00479581, dtype=float32), 'loss_cross_entropy': Array(0.5971285, dtype=float32)}


  3%|▎         | 26270/1000000 [1:21:33<40:21:48,  6.70it/s]

{'loss': Array(0.59671205, dtype=float32), 'loss_reward': Array(0.00462399, dtype=float32), 'loss_cross_entropy': Array(0.59208804, dtype=float32)}


  3%|▎         | 26278/1000000 [1:21:34<37:48:33,  7.15it/s]

{'loss': Array(0.59474707, dtype=float32), 'loss_reward': Array(0.00465232, dtype=float32), 'loss_cross_entropy': Array(0.5900948, dtype=float32)}


  3%|▎         | 26290/1000000 [1:21:36<28:27:39,  9.50it/s]

{'loss': Array(0.6014504, dtype=float32), 'loss_reward': Array(0.00463298, dtype=float32), 'loss_cross_entropy': Array(0.5968174, dtype=float32)}


  3%|▎         | 26298/1000000 [1:21:37<33:30:36,  8.07it/s]

{'loss': Array(0.5963411, dtype=float32), 'loss_reward': Array(0.00464352, dtype=float32), 'loss_cross_entropy': Array(0.5916975, dtype=float32)}


  3%|▎         | 26310/1000000 [1:21:39<34:36:01,  7.82it/s]

{'loss': Array(0.59781724, dtype=float32), 'loss_reward': Array(0.00454161, dtype=float32), 'loss_cross_entropy': Array(0.5932756, dtype=float32)}


  3%|▎         | 26320/1000000 [1:21:41<31:38:48,  8.55it/s]

{'loss': Array(0.6065327, dtype=float32), 'loss_reward': Array(0.00468708, dtype=float32), 'loss_cross_entropy': Array(0.6018456, dtype=float32)}


  3%|▎         | 26328/1000000 [1:21:42<35:42:39,  7.57it/s]

{'loss': Array(0.59314716, dtype=float32), 'loss_reward': Array(0.00472227, dtype=float32), 'loss_cross_entropy': Array(0.58842486, dtype=float32)}


  3%|▎         | 26340/1000000 [1:21:44<28:01:07,  9.65it/s]

{'loss': Array(0.6068949, dtype=float32), 'loss_reward': Array(0.00485553, dtype=float32), 'loss_cross_entropy': Array(0.60203946, dtype=float32)}


  3%|▎         | 26350/1000000 [1:21:46<33:23:34,  8.10it/s]

{'loss': Array(0.5916727, dtype=float32), 'loss_reward': Array(0.0046686, dtype=float32), 'loss_cross_entropy': Array(0.5870041, dtype=float32)}


  3%|▎         | 26358/1000000 [1:21:47<35:42:18,  7.57it/s]

{'loss': Array(0.5996305, dtype=float32), 'loss_reward': Array(0.00470976, dtype=float32), 'loss_cross_entropy': Array(0.59492064, dtype=float32)}


  3%|▎         | 26368/1000000 [1:21:49<31:37:32,  8.55it/s]

{'loss': Array(0.5919889, dtype=float32), 'loss_reward': Array(0.00462054, dtype=float32), 'loss_cross_entropy': Array(0.5873684, dtype=float32)}


  3%|▎         | 26380/1000000 [1:21:51<27:44:19,  9.75it/s]

{'loss': Array(0.6018345, dtype=float32), 'loss_reward': Array(0.00463392, dtype=float32), 'loss_cross_entropy': Array(0.5972006, dtype=float32)}


  3%|▎         | 26390/1000000 [1:21:53<33:07:37,  8.16it/s]

{'loss': Array(0.6061358, dtype=float32), 'loss_reward': Array(0.00493232, dtype=float32), 'loss_cross_entropy': Array(0.6012034, dtype=float32)}


  3%|▎         | 26398/1000000 [1:21:54<35:31:11,  7.61it/s]

{'loss': Array(0.5951892, dtype=float32), 'loss_reward': Array(0.00471705, dtype=float32), 'loss_cross_entropy': Array(0.59047216, dtype=float32)}


  3%|▎         | 26410/1000000 [1:21:56<28:05:12,  9.63it/s]

{'loss': Array(0.6036877, dtype=float32), 'loss_reward': Array(0.00486243, dtype=float32), 'loss_cross_entropy': Array(0.5988253, dtype=float32)}


  3%|▎         | 26418/1000000 [1:21:57<34:21:53,  7.87it/s]

{'loss': Array(0.6006671, dtype=float32), 'loss_reward': Array(0.00464947, dtype=float32), 'loss_cross_entropy': Array(0.5960176, dtype=float32)}


  3%|▎         | 26430/1000000 [1:21:59<30:39:14,  8.82it/s]

{'loss': Array(0.59752584, dtype=float32), 'loss_reward': Array(0.00456089, dtype=float32), 'loss_cross_entropy': Array(0.59296495, dtype=float32)}


  3%|▎         | 26438/1000000 [1:22:01<33:26:36,  8.09it/s]

{'loss': Array(0.5965306, dtype=float32), 'loss_reward': Array(0.00468584, dtype=float32), 'loss_cross_entropy': Array(0.59184474, dtype=float32)}


  3%|▎         | 26450/1000000 [1:22:02<27:27:07,  9.85it/s]

{'loss': Array(0.58566856, dtype=float32), 'loss_reward': Array(0.0046429, dtype=float32), 'loss_cross_entropy': Array(0.58102566, dtype=float32)}


  3%|▎         | 26459/1000000 [1:22:04<40:39:20,  6.65it/s]

{'loss': Array(0.5918194, dtype=float32), 'loss_reward': Array(0.00473874, dtype=float32), 'loss_cross_entropy': Array(0.58708066, dtype=float32)}


  3%|▎         | 26469/1000000 [1:22:06<32:27:38,  8.33it/s]

{'loss': Array(0.5953127, dtype=float32), 'loss_reward': Array(0.00471388, dtype=float32), 'loss_cross_entropy': Array(0.59059876, dtype=float32)}


  3%|▎         | 26480/1000000 [1:22:08<28:32:36,  9.47it/s]

{'loss': Array(0.59871596, dtype=float32), 'loss_reward': Array(0.00477958, dtype=float32), 'loss_cross_entropy': Array(0.5939364, dtype=float32)}


  3%|▎         | 26490/1000000 [1:22:09<29:10:27,  9.27it/s]

{'loss': Array(0.5928387, dtype=float32), 'loss_reward': Array(0.00466839, dtype=float32), 'loss_cross_entropy': Array(0.5881703, dtype=float32)}


  3%|▎         | 26500/1000000 [1:22:11<36:49:22,  7.34it/s]

{'loss': Array(0.5976804, dtype=float32), 'loss_reward': Array(0.00462404, dtype=float32), 'loss_cross_entropy': Array(0.5930564, dtype=float32)}


  3%|▎         | 26508/1000000 [1:22:23<183:28:52,  1.47it/s]

{'loss': Array(0.59909505, dtype=float32), 'loss_reward': Array(0.00470385, dtype=float32), 'loss_cross_entropy': Array(0.5943913, dtype=float32)}


  3%|▎         | 26520/1000000 [1:22:24<60:45:34,  4.45it/s] 

{'loss': Array(0.5971224, dtype=float32), 'loss_reward': Array(0.00469936, dtype=float32), 'loss_cross_entropy': Array(0.592423, dtype=float32)}


  3%|▎         | 26528/1000000 [1:22:26<44:31:11,  6.07it/s]

{'loss': Array(0.6014044, dtype=float32), 'loss_reward': Array(0.00460966, dtype=float32), 'loss_cross_entropy': Array(0.5967948, dtype=float32)}


  3%|▎         | 26539/1000000 [1:22:28<38:30:01,  7.02it/s]

{'loss': Array(0.59858793, dtype=float32), 'loss_reward': Array(0.00473625, dtype=float32), 'loss_cross_entropy': Array(0.5938516, dtype=float32)}


  3%|▎         | 26548/1000000 [1:22:29<32:08:43,  8.41it/s]

{'loss': Array(0.60508543, dtype=float32), 'loss_reward': Array(0.0048607, dtype=float32), 'loss_cross_entropy': Array(0.6002248, dtype=float32)}


  3%|▎         | 26558/1000000 [1:22:31<29:30:47,  9.16it/s]

{'loss': Array(0.5916298, dtype=float32), 'loss_reward': Array(0.00469818, dtype=float32), 'loss_cross_entropy': Array(0.58693165, dtype=float32)}


  3%|▎         | 26570/1000000 [1:22:32<26:27:14, 10.22it/s]

{'loss': Array(0.5914778, dtype=float32), 'loss_reward': Array(0.00464731, dtype=float32), 'loss_cross_entropy': Array(0.58683056, dtype=float32)}


  3%|▎         | 26580/1000000 [1:22:34<31:40:38,  8.54it/s]

{'loss': Array(0.5960215, dtype=float32), 'loss_reward': Array(0.00469365, dtype=float32), 'loss_cross_entropy': Array(0.5913278, dtype=float32)}


  3%|▎         | 26588/1000000 [1:22:36<34:14:51,  7.90it/s]

{'loss': Array(0.5960049, dtype=float32), 'loss_reward': Array(0.00476599, dtype=float32), 'loss_cross_entropy': Array(0.59123886, dtype=float32)}


  3%|▎         | 26600/1000000 [1:22:37<27:33:34,  9.81it/s]

{'loss': Array(0.59571934, dtype=float32), 'loss_reward': Array(0.00484337, dtype=float32), 'loss_cross_entropy': Array(0.590876, dtype=float32)}


  3%|▎         | 26608/1000000 [1:22:39<32:59:47,  8.19it/s]

{'loss': Array(0.59316003, dtype=float32), 'loss_reward': Array(0.00471339, dtype=float32), 'loss_cross_entropy': Array(0.5884466, dtype=float32)}


  3%|▎         | 26619/1000000 [1:22:41<33:10:57,  8.15it/s]

{'loss': Array(0.5979853, dtype=float32), 'loss_reward': Array(0.00462542, dtype=float32), 'loss_cross_entropy': Array(0.5933599, dtype=float32)}


  3%|▎         | 26630/1000000 [1:22:42<27:23:43,  9.87it/s]

{'loss': Array(0.58838785, dtype=float32), 'loss_reward': Array(0.00470642, dtype=float32), 'loss_cross_entropy': Array(0.58368146, dtype=float32)}


  3%|▎         | 26638/1000000 [1:22:44<32:10:44,  8.40it/s]

{'loss': Array(0.5940657, dtype=float32), 'loss_reward': Array(0.00470069, dtype=float32), 'loss_cross_entropy': Array(0.589365, dtype=float32)}


  3%|▎         | 26649/1000000 [1:22:46<39:08:05,  6.91it/s]

{'loss': Array(0.5933284, dtype=float32), 'loss_reward': Array(0.00487495, dtype=float32), 'loss_cross_entropy': Array(0.5884535, dtype=float32)}


  3%|▎         | 26660/1000000 [1:22:47<29:45:18,  9.09it/s]

{'loss': Array(0.5905329, dtype=float32), 'loss_reward': Array(0.00466671, dtype=float32), 'loss_cross_entropy': Array(0.58586615, dtype=float32)}


  3%|▎         | 26668/1000000 [1:22:49<33:32:05,  8.06it/s]

{'loss': Array(0.5984119, dtype=float32), 'loss_reward': Array(0.00462319, dtype=float32), 'loss_cross_entropy': Array(0.59378874, dtype=float32)}


  3%|▎         | 26680/1000000 [1:22:50<27:02:29, 10.00it/s]

{'loss': Array(0.59731627, dtype=float32), 'loss_reward': Array(0.00463469, dtype=float32), 'loss_cross_entropy': Array(0.5926816, dtype=float32)}


  3%|▎         | 26690/1000000 [1:22:52<35:07:54,  7.70it/s]

{'loss': Array(0.5941764, dtype=float32), 'loss_reward': Array(0.00467015, dtype=float32), 'loss_cross_entropy': Array(0.5895062, dtype=float32)}


  3%|▎         | 26698/1000000 [1:22:54<35:26:13,  7.63it/s]

{'loss': Array(0.58884954, dtype=float32), 'loss_reward': Array(0.0046781, dtype=float32), 'loss_cross_entropy': Array(0.58417153, dtype=float32)}


  3%|▎         | 26710/1000000 [1:22:55<28:09:29,  9.60it/s]

{'loss': Array(0.5910882, dtype=float32), 'loss_reward': Array(0.00466279, dtype=float32), 'loss_cross_entropy': Array(0.5864254, dtype=float32)}


  3%|▎         | 26718/1000000 [1:22:57<33:01:48,  8.19it/s]

{'loss': Array(0.58591276, dtype=float32), 'loss_reward': Array(0.00478885, dtype=float32), 'loss_cross_entropy': Array(0.58112395, dtype=float32)}


  3%|▎         | 26729/1000000 [1:22:59<35:09:51,  7.69it/s]

{'loss': Array(0.5928305, dtype=float32), 'loss_reward': Array(0.00465742, dtype=float32), 'loss_cross_entropy': Array(0.5881731, dtype=float32)}


  3%|▎         | 26740/1000000 [1:23:00<27:55:59,  9.68it/s]

{'loss': Array(0.5893943, dtype=float32), 'loss_reward': Array(0.00461678, dtype=float32), 'loss_cross_entropy': Array(0.58477753, dtype=float32)}


  3%|▎         | 26748/1000000 [1:23:02<33:17:32,  8.12it/s]

{'loss': Array(0.5892427, dtype=float32), 'loss_reward': Array(0.00462515, dtype=float32), 'loss_cross_entropy': Array(0.58461756, dtype=float32)}


  3%|▎         | 26760/1000000 [1:23:03<27:46:19,  9.73it/s]

{'loss': Array(0.5868451, dtype=float32), 'loss_reward': Array(0.00454022, dtype=float32), 'loss_cross_entropy': Array(0.5823049, dtype=float32)}


  3%|▎         | 26770/1000000 [1:23:05<32:13:34,  8.39it/s]

{'loss': Array(0.5866154, dtype=float32), 'loss_reward': Array(0.00468813, dtype=float32), 'loss_cross_entropy': Array(0.58192724, dtype=float32)}


  3%|▎         | 26778/1000000 [1:23:07<34:45:34,  7.78it/s]

{'loss': Array(0.5913629, dtype=float32), 'loss_reward': Array(0.00473346, dtype=float32), 'loss_cross_entropy': Array(0.5866295, dtype=float32)}


  3%|▎         | 26789/1000000 [1:23:08<28:21:55,  9.53it/s]

{'loss': Array(0.5867706, dtype=float32), 'loss_reward': Array(0.00469243, dtype=float32), 'loss_cross_entropy': Array(0.58207816, dtype=float32)}


  3%|▎         | 26800/1000000 [1:23:10<26:53:26, 10.05it/s]

{'loss': Array(0.58213043, dtype=float32), 'loss_reward': Array(0.00461907, dtype=float32), 'loss_cross_entropy': Array(0.57751137, dtype=float32)}


  3%|▎         | 26808/1000000 [1:23:12<37:46:29,  7.16it/s]

{'loss': Array(0.58496785, dtype=float32), 'loss_reward': Array(0.00471099, dtype=float32), 'loss_cross_entropy': Array(0.5802569, dtype=float32)}


  3%|▎         | 26819/1000000 [1:23:13<29:35:52,  9.13it/s]

{'loss': Array(0.5946147, dtype=float32), 'loss_reward': Array(0.00478103, dtype=float32), 'loss_cross_entropy': Array(0.5898337, dtype=float32)}


  3%|▎         | 26830/1000000 [1:23:15<26:44:18, 10.11it/s]

{'loss': Array(0.5841808, dtype=float32), 'loss_reward': Array(0.00464743, dtype=float32), 'loss_cross_entropy': Array(0.57953346, dtype=float32)}


  3%|▎         | 26840/1000000 [1:23:17<39:17:10,  6.88it/s]

{'loss': Array(0.5911507, dtype=float32), 'loss_reward': Array(0.00472206, dtype=float32), 'loss_cross_entropy': Array(0.58642864, dtype=float32)}


  3%|▎         | 26848/1000000 [1:23:18<37:10:19,  7.27it/s]

{'loss': Array(0.5857189, dtype=float32), 'loss_reward': Array(0.00452442, dtype=float32), 'loss_cross_entropy': Array(0.5811945, dtype=float32)}


  3%|▎         | 26860/1000000 [1:23:20<28:39:22,  9.43it/s]

{'loss': Array(0.5893762, dtype=float32), 'loss_reward': Array(0.00473298, dtype=float32), 'loss_cross_entropy': Array(0.5846432, dtype=float32)}


  3%|▎         | 26868/1000000 [1:23:21<33:55:38,  7.97it/s]

{'loss': Array(0.58861506, dtype=float32), 'loss_reward': Array(0.0046441, dtype=float32), 'loss_cross_entropy': Array(0.58397096, dtype=float32)}


  3%|▎         | 26879/1000000 [1:23:23<41:02:31,  6.59it/s]

{'loss': Array(0.59291416, dtype=float32), 'loss_reward': Array(0.00447803, dtype=float32), 'loss_cross_entropy': Array(0.5884361, dtype=float32)}


  3%|▎         | 26890/1000000 [1:23:25<29:16:56,  9.23it/s]

{'loss': Array(0.5855353, dtype=float32), 'loss_reward': Array(0.00476961, dtype=float32), 'loss_cross_entropy': Array(0.58076566, dtype=float32)}


  3%|▎         | 26898/1000000 [1:23:26<33:51:22,  7.98it/s]

{'loss': Array(0.5803984, dtype=float32), 'loss_reward': Array(0.00442122, dtype=float32), 'loss_cross_entropy': Array(0.57597715, dtype=float32)}


  3%|▎         | 26910/1000000 [1:23:28<27:24:28,  9.86it/s]

{'loss': Array(0.5890507, dtype=float32), 'loss_reward': Array(0.00475349, dtype=float32), 'loss_cross_entropy': Array(0.5842972, dtype=float32)}


  3%|▎         | 26920/1000000 [1:23:30<34:26:31,  7.85it/s]

{'loss': Array(0.57592547, dtype=float32), 'loss_reward': Array(0.00465459, dtype=float32), 'loss_cross_entropy': Array(0.5712709, dtype=float32)}


  3%|▎         | 26928/1000000 [1:23:31<35:25:08,  7.63it/s]

{'loss': Array(0.5787704, dtype=float32), 'loss_reward': Array(0.00454995, dtype=float32), 'loss_cross_entropy': Array(0.5742205, dtype=float32)}


  3%|▎         | 26939/1000000 [1:23:33<29:19:24,  9.22it/s]

{'loss': Array(0.5863079, dtype=float32), 'loss_reward': Array(0.00458047, dtype=float32), 'loss_cross_entropy': Array(0.58172745, dtype=float32)}


  3%|▎         | 26950/1000000 [1:23:34<26:25:00, 10.23it/s]

{'loss': Array(0.5856325, dtype=float32), 'loss_reward': Array(0.00457021, dtype=float32), 'loss_cross_entropy': Array(0.58106226, dtype=float32)}


  3%|▎         | 26960/1000000 [1:23:36<32:29:50,  8.32it/s]

{'loss': Array(0.58943766, dtype=float32), 'loss_reward': Array(0.00465993, dtype=float32), 'loss_cross_entropy': Array(0.5847778, dtype=float32)}


  3%|▎         | 26968/1000000 [1:23:38<34:09:42,  7.91it/s]

{'loss': Array(0.576715, dtype=float32), 'loss_reward': Array(0.00455695, dtype=float32), 'loss_cross_entropy': Array(0.57215804, dtype=float32)}


  3%|▎         | 26979/1000000 [1:23:39<27:39:16,  9.77it/s]

{'loss': Array(0.5700966, dtype=float32), 'loss_reward': Array(0.00461926, dtype=float32), 'loss_cross_entropy': Array(0.5654773, dtype=float32)}


  3%|▎         | 26990/1000000 [1:23:41<26:00:55, 10.39it/s]

{'loss': Array(0.57040256, dtype=float32), 'loss_reward': Array(0.00445633, dtype=float32), 'loss_cross_entropy': Array(0.56594616, dtype=float32)}


  3%|▎         | 26998/1000000 [1:23:43<37:43:57,  7.16it/s]

{'loss': Array(0.5764381, dtype=float32), 'loss_reward': Array(0.00468345, dtype=float32), 'loss_cross_entropy': Array(0.57175475, dtype=float32)}


  3%|▎         | 27010/1000000 [1:23:54<122:00:03,  2.22it/s]

{'loss': Array(0.57224226, dtype=float32), 'loss_reward': Array(0.00452414, dtype=float32), 'loss_cross_entropy': Array(0.56771815, dtype=float32)}


  3%|▎         | 27019/1000000 [1:23:56<64:21:41,  4.20it/s] 

{'loss': Array(0.5832613, dtype=float32), 'loss_reward': Array(0.00449178, dtype=float32), 'loss_cross_entropy': Array(0.5787695, dtype=float32)}


  3%|▎         | 27029/1000000 [1:23:58<49:50:32,  5.42it/s]

{'loss': Array(0.5806226, dtype=float32), 'loss_reward': Array(0.00473914, dtype=float32), 'loss_cross_entropy': Array(0.5758834, dtype=float32)}


  3%|▎         | 27040/1000000 [1:23:59<31:24:27,  8.61it/s]

{'loss': Array(0.58490825, dtype=float32), 'loss_reward': Array(0.00460841, dtype=float32), 'loss_cross_entropy': Array(0.5802998, dtype=float32)}


  3%|▎         | 27048/1000000 [1:24:01<35:13:09,  7.67it/s]

{'loss': Array(0.5793768, dtype=float32), 'loss_reward': Array(0.00474751, dtype=float32), 'loss_cross_entropy': Array(0.5746293, dtype=float32)}


  3%|▎         | 27060/1000000 [1:24:02<28:17:18,  9.55it/s]

{'loss': Array(0.5745692, dtype=float32), 'loss_reward': Array(0.00460983, dtype=float32), 'loss_cross_entropy': Array(0.5699594, dtype=float32)}


  3%|▎         | 27070/1000000 [1:24:04<34:56:15,  7.74it/s]

{'loss': Array(0.58429635, dtype=float32), 'loss_reward': Array(0.00464251, dtype=float32), 'loss_cross_entropy': Array(0.5796538, dtype=float32)}


  3%|▎         | 27080/1000000 [1:24:06<29:14:36,  9.24it/s]

{'loss': Array(0.58102053, dtype=float32), 'loss_reward': Array(0.00468463, dtype=float32), 'loss_cross_entropy': Array(0.5763359, dtype=float32)}


  3%|▎         | 27088/1000000 [1:24:07<33:12:14,  8.14it/s]

{'loss': Array(0.57556856, dtype=float32), 'loss_reward': Array(0.00449108, dtype=float32), 'loss_cross_entropy': Array(0.5710774, dtype=float32)}


  3%|▎         | 27100/1000000 [1:24:09<27:43:02,  9.75it/s]

{'loss': Array(0.5907382, dtype=float32), 'loss_reward': Array(0.00472975, dtype=float32), 'loss_cross_entropy': Array(0.5860085, dtype=float32)}


  3%|▎         | 27110/1000000 [1:24:11<31:56:32,  8.46it/s]

{'loss': Array(0.5861571, dtype=float32), 'loss_reward': Array(0.00461919, dtype=float32), 'loss_cross_entropy': Array(0.5815379, dtype=float32)}


  3%|▎         | 27118/1000000 [1:24:12<35:46:25,  7.55it/s]

{'loss': Array(0.58663386, dtype=float32), 'loss_reward': Array(0.00469573, dtype=float32), 'loss_cross_entropy': Array(0.5819381, dtype=float32)}


  3%|▎         | 27130/1000000 [1:24:14<28:04:50,  9.62it/s]

{'loss': Array(0.5883059, dtype=float32), 'loss_reward': Array(0.00464884, dtype=float32), 'loss_cross_entropy': Array(0.58365697, dtype=float32)}


  3%|▎         | 27138/1000000 [1:24:15<33:47:53,  8.00it/s]

{'loss': Array(0.5888364, dtype=float32), 'loss_reward': Array(0.00463009, dtype=float32), 'loss_cross_entropy': Array(0.5842063, dtype=float32)}


  3%|▎         | 27149/1000000 [1:24:17<33:32:38,  8.06it/s]

{'loss': Array(0.5799634, dtype=float32), 'loss_reward': Array(0.00449686, dtype=float32), 'loss_cross_entropy': Array(0.5754666, dtype=float32)}


  3%|▎         | 27160/1000000 [1:24:19<28:14:51,  9.57it/s]

{'loss': Array(0.57872075, dtype=float32), 'loss_reward': Array(0.00454651, dtype=float32), 'loss_cross_entropy': Array(0.5741743, dtype=float32)}


  3%|▎         | 27168/1000000 [1:24:20<33:52:38,  7.98it/s]

{'loss': Array(0.5813429, dtype=float32), 'loss_reward': Array(0.0044939, dtype=float32), 'loss_cross_entropy': Array(0.576849, dtype=float32)}


  3%|▎         | 27179/1000000 [1:24:22<39:26:56,  6.85it/s]

{'loss': Array(0.58733165, dtype=float32), 'loss_reward': Array(0.00480862, dtype=float32), 'loss_cross_entropy': Array(0.58252305, dtype=float32)}


  3%|▎         | 27190/1000000 [1:24:24<29:41:53,  9.10it/s]

{'loss': Array(0.5798914, dtype=float32), 'loss_reward': Array(0.00455073, dtype=float32), 'loss_cross_entropy': Array(0.5753406, dtype=float32)}


  3%|▎         | 27198/1000000 [1:24:26<34:44:38,  7.78it/s]

{'loss': Array(0.57664174, dtype=float32), 'loss_reward': Array(0.00448733, dtype=float32), 'loss_cross_entropy': Array(0.5721544, dtype=float32)}


  3%|▎         | 27210/1000000 [1:24:27<28:19:24,  9.54it/s]

{'loss': Array(0.5796988, dtype=float32), 'loss_reward': Array(0.00456802, dtype=float32), 'loss_cross_entropy': Array(0.57513076, dtype=float32)}


  3%|▎         | 27220/1000000 [1:24:29<36:08:24,  7.48it/s]

{'loss': Array(0.58567727, dtype=float32), 'loss_reward': Array(0.00454824, dtype=float32), 'loss_cross_entropy': Array(0.581129, dtype=float32)}


  3%|▎         | 27230/1000000 [1:24:31<31:55:50,  8.46it/s]

{'loss': Array(0.58186525, dtype=float32), 'loss_reward': Array(0.0047414, dtype=float32), 'loss_cross_entropy': Array(0.5771238, dtype=float32)}


  3%|▎         | 27238/1000000 [1:24:32<35:01:17,  7.72it/s]

{'loss': Array(0.58045286, dtype=float32), 'loss_reward': Array(0.00445261, dtype=float32), 'loss_cross_entropy': Array(0.57600033, dtype=float32)}


  3%|▎         | 27250/1000000 [1:24:34<27:21:54,  9.87it/s]

{'loss': Array(0.5827811, dtype=float32), 'loss_reward': Array(0.00466106, dtype=float32), 'loss_cross_entropy': Array(0.57812, dtype=float32)}


  3%|▎         | 27258/1000000 [1:24:36<39:11:59,  6.89it/s]

{'loss': Array(0.5721708, dtype=float32), 'loss_reward': Array(0.00462821, dtype=float32), 'loss_cross_entropy': Array(0.5675426, dtype=float32)}


  3%|▎         | 27268/1000000 [1:24:37<31:53:59,  8.47it/s]

{'loss': Array(0.57135147, dtype=float32), 'loss_reward': Array(0.00448738, dtype=float32), 'loss_cross_entropy': Array(0.56686413, dtype=float32)}


  3%|▎         | 27280/1000000 [1:24:39<26:58:14, 10.02it/s]

{'loss': Array(0.57642436, dtype=float32), 'loss_reward': Array(0.00452985, dtype=float32), 'loss_cross_entropy': Array(0.57189447, dtype=float32)}


  3%|▎         | 27288/1000000 [1:24:40<32:26:17,  8.33it/s]

{'loss': Array(0.58042866, dtype=float32), 'loss_reward': Array(0.00463923, dtype=float32), 'loss_cross_entropy': Array(0.5757894, dtype=float32)}


  3%|▎         | 27299/1000000 [1:24:42<32:23:51,  8.34it/s]

{'loss': Array(0.5667939, dtype=float32), 'loss_reward': Array(0.00461922, dtype=float32), 'loss_cross_entropy': Array(0.56217474, dtype=float32)}


  3%|▎         | 27310/1000000 [1:24:44<28:05:13,  9.62it/s]

{'loss': Array(0.57975245, dtype=float32), 'loss_reward': Array(0.00465047, dtype=float32), 'loss_cross_entropy': Array(0.575102, dtype=float32)}


  3%|▎         | 27320/1000000 [1:24:45<28:02:10,  9.64it/s]

{'loss': Array(0.5701873, dtype=float32), 'loss_reward': Array(0.00464641, dtype=float32), 'loss_cross_entropy': Array(0.56554085, dtype=float32)}


  3%|▎         | 27328/1000000 [1:24:47<33:13:54,  8.13it/s]

{'loss': Array(0.5722582, dtype=float32), 'loss_reward': Array(0.00453448, dtype=float32), 'loss_cross_entropy': Array(0.5677237, dtype=float32)}


  3%|▎         | 27339/1000000 [1:24:49<32:35:38,  8.29it/s]

{'loss': Array(0.56685793, dtype=float32), 'loss_reward': Array(0.0046212, dtype=float32), 'loss_cross_entropy': Array(0.5622367, dtype=float32)}


  3%|▎         | 27350/1000000 [1:24:50<27:40:39,  9.76it/s]

{'loss': Array(0.56976, dtype=float32), 'loss_reward': Array(0.00462819, dtype=float32), 'loss_cross_entropy': Array(0.5651318, dtype=float32)}


  3%|▎         | 27360/1000000 [1:24:52<27:27:59,  9.84it/s]

{'loss': Array(0.57152957, dtype=float32), 'loss_reward': Array(0.00459846, dtype=float32), 'loss_cross_entropy': Array(0.5669312, dtype=float32)}


  3%|▎         | 27370/1000000 [1:24:54<38:20:40,  7.05it/s]

{'loss': Array(0.57353234, dtype=float32), 'loss_reward': Array(0.00456146, dtype=float32), 'loss_cross_entropy': Array(0.5689708, dtype=float32)}


  3%|▎         | 27378/1000000 [1:24:55<36:46:41,  7.35it/s]

{'loss': Array(0.57465106, dtype=float32), 'loss_reward': Array(0.00467304, dtype=float32), 'loss_cross_entropy': Array(0.569978, dtype=float32)}


  3%|▎         | 27390/1000000 [1:24:57<27:57:40,  9.66it/s]

{'loss': Array(0.572299, dtype=float32), 'loss_reward': Array(0.0046436, dtype=float32), 'loss_cross_entropy': Array(0.5676555, dtype=float32)}


  3%|▎         | 27398/1000000 [1:24:58<32:18:06,  8.36it/s]

{'loss': Array(0.56984705, dtype=float32), 'loss_reward': Array(0.00457081, dtype=float32), 'loss_cross_entropy': Array(0.56527627, dtype=float32)}


  3%|▎         | 27410/1000000 [1:25:00<33:55:39,  7.96it/s]

{'loss': Array(0.56746817, dtype=float32), 'loss_reward': Array(0.00454023, dtype=float32), 'loss_cross_entropy': Array(0.5629279, dtype=float32)}


  3%|▎         | 27418/1000000 [1:25:02<35:25:45,  7.63it/s]

{'loss': Array(0.56903946, dtype=float32), 'loss_reward': Array(0.00445273, dtype=float32), 'loss_cross_entropy': Array(0.56458676, dtype=float32)}


  3%|▎         | 27430/1000000 [1:25:03<27:47:02,  9.72it/s]

{'loss': Array(0.576998, dtype=float32), 'loss_reward': Array(0.00442308, dtype=float32), 'loss_cross_entropy': Array(0.5725749, dtype=float32)}


  3%|▎         | 27438/1000000 [1:25:05<32:56:35,  8.20it/s]

{'loss': Array(0.5733704, dtype=float32), 'loss_reward': Array(0.00461032, dtype=float32), 'loss_cross_entropy': Array(0.56876, dtype=float32)}


  3%|▎         | 27448/1000000 [1:25:07<36:53:41,  7.32it/s]

{'loss': Array(0.56736803, dtype=float32), 'loss_reward': Array(0.00462327, dtype=float32), 'loss_cross_entropy': Array(0.5627448, dtype=float32)}


  3%|▎         | 27459/1000000 [1:25:08<29:55:15,  9.03it/s]

{'loss': Array(0.57694834, dtype=float32), 'loss_reward': Array(0.00452229, dtype=float32), 'loss_cross_entropy': Array(0.5724261, dtype=float32)}


  3%|▎         | 27470/1000000 [1:25:10<27:09:48,  9.95it/s]

{'loss': Array(0.56885684, dtype=float32), 'loss_reward': Array(0.00459515, dtype=float32), 'loss_cross_entropy': Array(0.5642616, dtype=float32)}


  3%|▎         | 27478/1000000 [1:25:11<32:21:20,  8.35it/s]

{'loss': Array(0.5694832, dtype=float32), 'loss_reward': Array(0.0044659, dtype=float32), 'loss_cross_entropy': Array(0.56501734, dtype=float32)}


  3%|▎         | 27489/1000000 [1:25:13<32:31:18,  8.31it/s]

{'loss': Array(0.5651606, dtype=float32), 'loss_reward': Array(0.00466992, dtype=float32), 'loss_cross_entropy': Array(0.5604906, dtype=float32)}


  3%|▎         | 27500/1000000 [1:25:15<27:40:59,  9.76it/s]

{'loss': Array(0.57018214, dtype=float32), 'loss_reward': Array(0.00451598, dtype=float32), 'loss_cross_entropy': Array(0.5656662, dtype=float32)}


  3%|▎         | 27508/1000000 [1:25:26<172:29:30,  1.57it/s]

{'loss': Array(0.576703, dtype=float32), 'loss_reward': Array(0.00439611, dtype=float32), 'loss_cross_entropy': Array(0.5723069, dtype=float32)}


  3%|▎         | 27520/1000000 [1:25:28<68:41:10,  3.93it/s] 

{'loss': Array(0.5738751, dtype=float32), 'loss_reward': Array(0.00463521, dtype=float32), 'loss_cross_entropy': Array(0.5692399, dtype=float32)}


  3%|▎         | 27528/1000000 [1:25:30<47:55:00,  5.64it/s]

{'loss': Array(0.5728239, dtype=float32), 'loss_reward': Array(0.0045626, dtype=float32), 'loss_cross_entropy': Array(0.5682612, dtype=float32)}


  3%|▎         | 27539/1000000 [1:25:31<32:30:38,  8.31it/s]

{'loss': Array(0.5786074, dtype=float32), 'loss_reward': Array(0.00471001, dtype=float32), 'loss_cross_entropy': Array(0.57389736, dtype=float32)}


  3%|▎         | 27550/1000000 [1:25:33<27:33:28,  9.80it/s]

{'loss': Array(0.5823569, dtype=float32), 'loss_reward': Array(0.00451442, dtype=float32), 'loss_cross_entropy': Array(0.5778425, dtype=float32)}


  3%|▎         | 27560/1000000 [1:25:35<34:47:32,  7.76it/s]

{'loss': Array(0.5713976, dtype=float32), 'loss_reward': Array(0.00445931, dtype=float32), 'loss_cross_entropy': Array(0.56693834, dtype=float32)}


  3%|▎         | 27568/1000000 [1:25:36<35:15:17,  7.66it/s]

{'loss': Array(0.57241374, dtype=float32), 'loss_reward': Array(0.00453801, dtype=float32), 'loss_cross_entropy': Array(0.5678758, dtype=float32)}


  3%|▎         | 27580/1000000 [1:25:38<27:45:09,  9.73it/s]

{'loss': Array(0.5761802, dtype=float32), 'loss_reward': Array(0.00472768, dtype=float32), 'loss_cross_entropy': Array(0.57145256, dtype=float32)}


  3%|▎         | 27588/1000000 [1:25:39<33:18:52,  8.11it/s]

{'loss': Array(0.57871, dtype=float32), 'loss_reward': Array(0.00464597, dtype=float32), 'loss_cross_entropy': Array(0.5740641, dtype=float32)}


  3%|▎         | 27599/1000000 [1:25:41<35:03:41,  7.70it/s]

{'loss': Array(0.5758501, dtype=float32), 'loss_reward': Array(0.00455688, dtype=float32), 'loss_cross_entropy': Array(0.5712932, dtype=float32)}


  3%|▎         | 27610/1000000 [1:25:43<27:50:15,  9.70it/s]

{'loss': Array(0.56828916, dtype=float32), 'loss_reward': Array(0.00442986, dtype=float32), 'loss_cross_entropy': Array(0.5638593, dtype=float32)}


  3%|▎         | 27618/1000000 [1:25:44<33:41:23,  8.02it/s]

{'loss': Array(0.56984526, dtype=float32), 'loss_reward': Array(0.00457212, dtype=float32), 'loss_cross_entropy': Array(0.56527317, dtype=float32)}


  3%|▎         | 27630/1000000 [1:25:46<27:11:47,  9.93it/s]

{'loss': Array(0.5811995, dtype=float32), 'loss_reward': Array(0.00462723, dtype=float32), 'loss_cross_entropy': Array(0.57657224, dtype=float32)}


  3%|▎         | 27640/1000000 [1:25:48<31:23:38,  8.60it/s]

{'loss': Array(0.5737466, dtype=float32), 'loss_reward': Array(0.00453939, dtype=float32), 'loss_cross_entropy': Array(0.56920725, dtype=float32)}


  3%|▎         | 27648/1000000 [1:25:49<35:11:20,  7.68it/s]

{'loss': Array(0.5645321, dtype=float32), 'loss_reward': Array(0.00450454, dtype=float32), 'loss_cross_entropy': Array(0.56002754, dtype=float32)}


  3%|▎         | 27660/1000000 [1:25:51<28:14:26,  9.56it/s]

{'loss': Array(0.56388193, dtype=float32), 'loss_reward': Array(0.00452093, dtype=float32), 'loss_cross_entropy': Array(0.55936104, dtype=float32)}


  3%|▎         | 27668/1000000 [1:25:52<33:22:04,  8.09it/s]

{'loss': Array(0.56927764, dtype=float32), 'loss_reward': Array(0.00460725, dtype=float32), 'loss_cross_entropy': Array(0.5646704, dtype=float32)}


  3%|▎         | 27679/1000000 [1:25:54<33:04:07,  8.17it/s]

{'loss': Array(0.57228243, dtype=float32), 'loss_reward': Array(0.00464005, dtype=float32), 'loss_cross_entropy': Array(0.5676424, dtype=float32)}


  3%|▎         | 27690/1000000 [1:25:56<27:35:40,  9.79it/s]

{'loss': Array(0.57723635, dtype=float32), 'loss_reward': Array(0.00449842, dtype=float32), 'loss_cross_entropy': Array(0.57273793, dtype=float32)}


  3%|▎         | 27698/1000000 [1:25:57<33:15:20,  8.12it/s]

{'loss': Array(0.5671707, dtype=float32), 'loss_reward': Array(0.0045398, dtype=float32), 'loss_cross_entropy': Array(0.56263083, dtype=float32)}


  3%|▎         | 27709/1000000 [1:25:59<27:51:10,  9.70it/s]

{'loss': Array(0.5658788, dtype=float32), 'loss_reward': Array(0.00449241, dtype=float32), 'loss_cross_entropy': Array(0.56138635, dtype=float32)}


  3%|▎         | 27720/1000000 [1:26:01<29:56:57,  9.02it/s]

{'loss': Array(0.570668, dtype=float32), 'loss_reward': Array(0.00453785, dtype=float32), 'loss_cross_entropy': Array(0.56613016, dtype=float32)}


  3%|▎         | 27728/1000000 [1:26:02<33:26:54,  8.07it/s]

{'loss': Array(0.57242054, dtype=float32), 'loss_reward': Array(0.00467927, dtype=float32), 'loss_cross_entropy': Array(0.5677413, dtype=float32)}


  3%|▎         | 27740/1000000 [1:26:04<27:38:55,  9.77it/s]

{'loss': Array(0.5675847, dtype=float32), 'loss_reward': Array(0.00454836, dtype=float32), 'loss_cross_entropy': Array(0.5630364, dtype=float32)}


  3%|▎         | 27749/1000000 [1:26:06<38:38:09,  6.99it/s]

{'loss': Array(0.569584, dtype=float32), 'loss_reward': Array(0.00452075, dtype=float32), 'loss_cross_entropy': Array(0.56506324, dtype=float32)}


  3%|▎         | 27760/1000000 [1:26:07<29:19:27,  9.21it/s]

{'loss': Array(0.5735776, dtype=float32), 'loss_reward': Array(0.00465714, dtype=float32), 'loss_cross_entropy': Array(0.5689205, dtype=float32)}


  3%|▎         | 27768/1000000 [1:26:09<34:32:29,  7.82it/s]

{'loss': Array(0.56885034, dtype=float32), 'loss_reward': Array(0.00456727, dtype=float32), 'loss_cross_entropy': Array(0.564283, dtype=float32)}


  3%|▎         | 27779/1000000 [1:26:10<27:59:18,  9.65it/s]

{'loss': Array(0.5688229, dtype=float32), 'loss_reward': Array(0.00455254, dtype=float32), 'loss_cross_entropy': Array(0.56427044, dtype=float32)}


  3%|▎         | 27789/1000000 [1:26:12<35:10:24,  7.68it/s]

{'loss': Array(0.56961703, dtype=float32), 'loss_reward': Array(0.00468756, dtype=float32), 'loss_cross_entropy': Array(0.5649295, dtype=float32)}


  3%|▎         | 27799/1000000 [1:26:14<29:27:04,  9.17it/s]

{'loss': Array(0.5777973, dtype=float32), 'loss_reward': Array(0.00466932, dtype=float32), 'loss_cross_entropy': Array(0.5731279, dtype=float32)}


  3%|▎         | 27809/1000000 [1:26:15<28:34:03,  9.45it/s]

{'loss': Array(0.5782575, dtype=float32), 'loss_reward': Array(0.00478307, dtype=float32), 'loss_cross_entropy': Array(0.57347435, dtype=float32)}


  3%|▎         | 27819/1000000 [1:26:17<28:20:17,  9.53it/s]

{'loss': Array(0.5660252, dtype=float32), 'loss_reward': Array(0.00457931, dtype=float32), 'loss_cross_entropy': Array(0.5614459, dtype=float32)}


  3%|▎         | 27829/1000000 [1:26:19<32:32:33,  8.30it/s]

{'loss': Array(0.5730936, dtype=float32), 'loss_reward': Array(0.00453588, dtype=float32), 'loss_cross_entropy': Array(0.56855774, dtype=float32)}


  3%|▎         | 27840/1000000 [1:26:20<27:05:02,  9.97it/s]

{'loss': Array(0.5664762, dtype=float32), 'loss_reward': Array(0.0045574, dtype=float32), 'loss_cross_entropy': Array(0.56191885, dtype=float32)}


  3%|▎         | 27848/1000000 [1:26:22<32:53:18,  8.21it/s]

{'loss': Array(0.5667464, dtype=float32), 'loss_reward': Array(0.004493, dtype=float32), 'loss_cross_entropy': Array(0.5622534, dtype=float32)}


  3%|▎         | 27860/1000000 [1:26:23<26:58:06, 10.01it/s]

{'loss': Array(0.5608, dtype=float32), 'loss_reward': Array(0.00457182, dtype=float32), 'loss_cross_entropy': Array(0.5562282, dtype=float32)}


  3%|▎         | 27868/1000000 [1:26:25<36:49:54,  7.33it/s]

{'loss': Array(0.555754, dtype=float32), 'loss_reward': Array(0.00458448, dtype=float32), 'loss_cross_entropy': Array(0.5511696, dtype=float32)}


  3%|▎         | 27880/1000000 [1:26:27<27:42:57,  9.74it/s]

{'loss': Array(0.56347406, dtype=float32), 'loss_reward': Array(0.0045006, dtype=float32), 'loss_cross_entropy': Array(0.55897343, dtype=float32)}


  3%|▎         | 27888/1000000 [1:26:28<32:27:50,  8.32it/s]

{'loss': Array(0.57523745, dtype=float32), 'loss_reward': Array(0.00482499, dtype=float32), 'loss_cross_entropy': Array(0.57041246, dtype=float32)}


  3%|▎         | 27900/1000000 [1:26:30<36:34:53,  7.38it/s]

{'loss': Array(0.5687224, dtype=float32), 'loss_reward': Array(0.00445172, dtype=float32), 'loss_cross_entropy': Array(0.56427073, dtype=float32)}


  3%|▎         | 27908/1000000 [1:26:31<35:44:03,  7.56it/s]

{'loss': Array(0.56380934, dtype=float32), 'loss_reward': Array(0.00455664, dtype=float32), 'loss_cross_entropy': Array(0.5592527, dtype=float32)}


  3%|▎         | 27920/1000000 [1:26:33<27:51:59,  9.69it/s]

{'loss': Array(0.56149566, dtype=float32), 'loss_reward': Array(0.00453149, dtype=float32), 'loss_cross_entropy': Array(0.5569641, dtype=float32)}


  3%|▎         | 27928/1000000 [1:26:34<33:04:24,  8.16it/s]

{'loss': Array(0.55636996, dtype=float32), 'loss_reward': Array(0.00436786, dtype=float32), 'loss_cross_entropy': Array(0.55200213, dtype=float32)}


  3%|▎         | 27940/1000000 [1:26:37<34:55:53,  7.73it/s]

{'loss': Array(0.56328356, dtype=float32), 'loss_reward': Array(0.00448266, dtype=float32), 'loss_cross_entropy': Array(0.5588009, dtype=float32)}


  3%|▎         | 27948/1000000 [1:26:38<37:20:25,  7.23it/s]

{'loss': Array(0.5636532, dtype=float32), 'loss_reward': Array(0.00441888, dtype=float32), 'loss_cross_entropy': Array(0.5592343, dtype=float32)}


  3%|▎         | 27960/1000000 [1:26:40<28:12:53,  9.57it/s]

{'loss': Array(0.56596315, dtype=float32), 'loss_reward': Array(0.0044999, dtype=float32), 'loss_cross_entropy': Array(0.5614632, dtype=float32)}


  3%|▎         | 27968/1000000 [1:26:41<33:06:44,  8.15it/s]

{'loss': Array(0.5614185, dtype=float32), 'loss_reward': Array(0.00464076, dtype=float32), 'loss_cross_entropy': Array(0.55677766, dtype=float32)}


  3%|▎         | 27979/1000000 [1:26:43<35:39:41,  7.57it/s]

{'loss': Array(0.564379, dtype=float32), 'loss_reward': Array(0.00453473, dtype=float32), 'loss_cross_entropy': Array(0.5598442, dtype=float32)}


  3%|▎         | 27990/1000000 [1:26:45<28:26:37,  9.49it/s]

{'loss': Array(0.56275576, dtype=float32), 'loss_reward': Array(0.00464684, dtype=float32), 'loss_cross_entropy': Array(0.558109, dtype=float32)}


  3%|▎         | 27998/1000000 [1:26:46<33:59:48,  7.94it/s]

{'loss': Array(0.5525478, dtype=float32), 'loss_reward': Array(0.00452059, dtype=float32), 'loss_cross_entropy': Array(0.5480272, dtype=float32)}


  3%|▎         | 28010/1000000 [1:26:58<119:41:36,  2.26it/s]

{'loss': Array(0.5612225, dtype=float32), 'loss_reward': Array(0.00463191, dtype=float32), 'loss_cross_entropy': Array(0.5565905, dtype=float32)}


  3%|▎         | 28020/1000000 [1:26:59<56:05:24,  4.81it/s] 

{'loss': Array(0.56820005, dtype=float32), 'loss_reward': Array(0.00465239, dtype=float32), 'loss_cross_entropy': Array(0.5635476, dtype=float32)}


  3%|▎         | 28028/1000000 [1:27:01<42:30:21,  6.35it/s]

{'loss': Array(0.55767316, dtype=float32), 'loss_reward': Array(0.00439761, dtype=float32), 'loss_cross_entropy': Array(0.5532755, dtype=float32)}


  3%|▎         | 28040/1000000 [1:27:03<29:43:30,  9.08it/s]

{'loss': Array(0.56057835, dtype=float32), 'loss_reward': Array(0.00454423, dtype=float32), 'loss_cross_entropy': Array(0.55603415, dtype=float32)}


  3%|▎         | 28048/1000000 [1:27:04<33:39:10,  8.02it/s]

{'loss': Array(0.5604332, dtype=float32), 'loss_reward': Array(0.00444261, dtype=float32), 'loss_cross_entropy': Array(0.5559906, dtype=float32)}


  3%|▎         | 28059/1000000 [1:27:06<32:50:50,  8.22it/s]

{'loss': Array(0.56248873, dtype=float32), 'loss_reward': Array(0.00459838, dtype=float32), 'loss_cross_entropy': Array(0.55789036, dtype=float32)}


  3%|▎         | 28070/1000000 [1:27:08<27:46:49,  9.72it/s]

{'loss': Array(0.5604336, dtype=float32), 'loss_reward': Array(0.00458727, dtype=float32), 'loss_cross_entropy': Array(0.55584633, dtype=float32)}


  3%|▎         | 28078/1000000 [1:27:09<33:02:02,  8.17it/s]

{'loss': Array(0.5695976, dtype=float32), 'loss_reward': Array(0.00456252, dtype=float32), 'loss_cross_entropy': Array(0.56503505, dtype=float32)}


  3%|▎         | 28089/1000000 [1:27:11<39:25:04,  6.85it/s]

{'loss': Array(0.559332, dtype=float32), 'loss_reward': Array(0.00436891, dtype=float32), 'loss_cross_entropy': Array(0.5549631, dtype=float32)}


  3%|▎         | 28100/1000000 [1:27:13<29:26:39,  9.17it/s]

{'loss': Array(0.56105685, dtype=float32), 'loss_reward': Array(0.00448134, dtype=float32), 'loss_cross_entropy': Array(0.55657554, dtype=float32)}


  3%|▎         | 28108/1000000 [1:27:14<33:12:42,  8.13it/s]

{'loss': Array(0.56104046, dtype=float32), 'loss_reward': Array(0.00439728, dtype=float32), 'loss_cross_entropy': Array(0.5566432, dtype=float32)}


  3%|▎         | 28120/1000000 [1:27:16<27:07:13,  9.95it/s]

{'loss': Array(0.5587515, dtype=float32), 'loss_reward': Array(0.00447722, dtype=float32), 'loss_cross_entropy': Array(0.5542743, dtype=float32)}


  3%|▎         | 28130/1000000 [1:27:17<34:49:48,  7.75it/s]

{'loss': Array(0.5558223, dtype=float32), 'loss_reward': Array(0.0045555, dtype=float32), 'loss_cross_entropy': Array(0.55126685, dtype=float32)}


  3%|▎         | 28138/1000000 [1:27:19<35:13:19,  7.66it/s]

{'loss': Array(0.555425, dtype=float32), 'loss_reward': Array(0.0045155, dtype=float32), 'loss_cross_entropy': Array(0.55090946, dtype=float32)}


  3%|▎         | 28150/1000000 [1:27:20<27:04:25,  9.97it/s]

{'loss': Array(0.5616338, dtype=float32), 'loss_reward': Array(0.00450377, dtype=float32), 'loss_cross_entropy': Array(0.55713004, dtype=float32)}


  3%|▎         | 28158/1000000 [1:27:22<32:17:21,  8.36it/s]

{'loss': Array(0.55320233, dtype=float32), 'loss_reward': Array(0.00445438, dtype=float32), 'loss_cross_entropy': Array(0.54874796, dtype=float32)}


  3%|▎         | 28169/1000000 [1:27:24<35:24:10,  7.63it/s]

{'loss': Array(0.5502847, dtype=float32), 'loss_reward': Array(0.00460014, dtype=float32), 'loss_cross_entropy': Array(0.5456846, dtype=float32)}


  3%|▎         | 28180/1000000 [1:27:25<28:24:29,  9.50it/s]

{'loss': Array(0.5659339, dtype=float32), 'loss_reward': Array(0.00462658, dtype=float32), 'loss_cross_entropy': Array(0.5613073, dtype=float32)}


  3%|▎         | 28188/1000000 [1:27:27<32:50:42,  8.22it/s]

{'loss': Array(0.5630361, dtype=float32), 'loss_reward': Array(0.00465673, dtype=float32), 'loss_cross_entropy': Array(0.55837935, dtype=float32)}


  3%|▎         | 28200/1000000 [1:27:28<27:32:09,  9.80it/s]

{'loss': Array(0.5599006, dtype=float32), 'loss_reward': Array(0.0045448, dtype=float32), 'loss_cross_entropy': Array(0.5553557, dtype=float32)}


  3%|▎         | 28210/1000000 [1:27:30<33:22:21,  8.09it/s]

{'loss': Array(0.55632544, dtype=float32), 'loss_reward': Array(0.00441673, dtype=float32), 'loss_cross_entropy': Array(0.55190873, dtype=float32)}


  3%|▎         | 28218/1000000 [1:27:32<36:05:13,  7.48it/s]

{'loss': Array(0.55981094, dtype=float32), 'loss_reward': Array(0.00457328, dtype=float32), 'loss_cross_entropy': Array(0.55523765, dtype=float32)}


  3%|▎         | 28230/1000000 [1:27:34<28:38:02,  9.43it/s]

{'loss': Array(0.55940765, dtype=float32), 'loss_reward': Array(0.00448359, dtype=float32), 'loss_cross_entropy': Array(0.5549241, dtype=float32)}


  3%|▎         | 28238/1000000 [1:27:35<33:30:33,  8.06it/s]

{'loss': Array(0.55274135, dtype=float32), 'loss_reward': Array(0.00453716, dtype=float32), 'loss_cross_entropy': Array(0.5482042, dtype=float32)}


  3%|▎         | 28249/1000000 [1:27:37<33:28:23,  8.06it/s]

{'loss': Array(0.5577462, dtype=float32), 'loss_reward': Array(0.00435115, dtype=float32), 'loss_cross_entropy': Array(0.55339503, dtype=float32)}


  3%|▎         | 28260/1000000 [1:27:39<28:10:56,  9.58it/s]

{'loss': Array(0.55711246, dtype=float32), 'loss_reward': Array(0.0046382, dtype=float32), 'loss_cross_entropy': Array(0.5524743, dtype=float32)}


  3%|▎         | 28268/1000000 [1:27:40<33:54:29,  7.96it/s]

{'loss': Array(0.5553636, dtype=float32), 'loss_reward': Array(0.00453459, dtype=float32), 'loss_cross_entropy': Array(0.5508291, dtype=float32)}


  3%|▎         | 28279/1000000 [1:27:42<39:49:00,  6.78it/s]

{'loss': Array(0.55779743, dtype=float32), 'loss_reward': Array(0.00447474, dtype=float32), 'loss_cross_entropy': Array(0.5533227, dtype=float32)}


  3%|▎         | 28290/1000000 [1:27:44<29:18:18,  9.21it/s]

{'loss': Array(0.5499924, dtype=float32), 'loss_reward': Array(0.00449403, dtype=float32), 'loss_cross_entropy': Array(0.5454984, dtype=float32)}


  3%|▎         | 28298/1000000 [1:27:45<33:10:08,  8.14it/s]

{'loss': Array(0.5559273, dtype=float32), 'loss_reward': Array(0.00457692, dtype=float32), 'loss_cross_entropy': Array(0.55135036, dtype=float32)}


  3%|▎         | 28310/1000000 [1:27:47<26:51:02, 10.05it/s]

{'loss': Array(0.55865866, dtype=float32), 'loss_reward': Array(0.00450933, dtype=float32), 'loss_cross_entropy': Array(0.5541494, dtype=float32)}


  3%|▎         | 28320/1000000 [1:27:49<34:38:21,  7.79it/s]

{'loss': Array(0.55790603, dtype=float32), 'loss_reward': Array(0.00465348, dtype=float32), 'loss_cross_entropy': Array(0.5532525, dtype=float32)}


  3%|▎         | 28328/1000000 [1:27:50<35:36:55,  7.58it/s]

{'loss': Array(0.5502822, dtype=float32), 'loss_reward': Array(0.00441531, dtype=float32), 'loss_cross_entropy': Array(0.5458669, dtype=float32)}


  3%|▎         | 28340/1000000 [1:27:52<27:30:19,  9.81it/s]

{'loss': Array(0.5505678, dtype=float32), 'loss_reward': Array(0.00442011, dtype=float32), 'loss_cross_entropy': Array(0.54614764, dtype=float32)}


  3%|▎         | 28348/1000000 [1:27:53<32:58:54,  8.18it/s]

{'loss': Array(0.55599946, dtype=float32), 'loss_reward': Array(0.00461686, dtype=float32), 'loss_cross_entropy': Array(0.5513825, dtype=float32)}


  3%|▎         | 28359/1000000 [1:27:55<35:30:28,  7.60it/s]

{'loss': Array(0.54797584, dtype=float32), 'loss_reward': Array(0.00432995, dtype=float32), 'loss_cross_entropy': Array(0.5436458, dtype=float32)}


  3%|▎         | 28370/1000000 [1:27:57<28:30:25,  9.47it/s]

{'loss': Array(0.55377144, dtype=float32), 'loss_reward': Array(0.00438434, dtype=float32), 'loss_cross_entropy': Array(0.5493871, dtype=float32)}


  3%|▎         | 28378/1000000 [1:27:58<33:32:42,  8.05it/s]

{'loss': Array(0.55249363, dtype=float32), 'loss_reward': Array(0.00450697, dtype=float32), 'loss_cross_entropy': Array(0.5479867, dtype=float32)}


  3%|▎         | 28390/1000000 [1:28:00<27:27:17,  9.83it/s]

{'loss': Array(0.55432254, dtype=float32), 'loss_reward': Array(0.00461113, dtype=float32), 'loss_cross_entropy': Array(0.5497114, dtype=float32)}


  3%|▎         | 28400/1000000 [1:28:02<32:22:49,  8.33it/s]

{'loss': Array(0.561437, dtype=float32), 'loss_reward': Array(0.00439526, dtype=float32), 'loss_cross_entropy': Array(0.5570417, dtype=float32)}


  3%|▎         | 28408/1000000 [1:28:03<35:00:51,  7.71it/s]

{'loss': Array(0.55673236, dtype=float32), 'loss_reward': Array(0.00441736, dtype=float32), 'loss_cross_entropy': Array(0.552315, dtype=float32)}


  3%|▎         | 28419/1000000 [1:28:05<28:15:51,  9.55it/s]

{'loss': Array(0.55150825, dtype=float32), 'loss_reward': Array(0.00460702, dtype=float32), 'loss_cross_entropy': Array(0.5469012, dtype=float32)}


  3%|▎         | 28430/1000000 [1:28:06<26:16:58, 10.27it/s]

{'loss': Array(0.5536914, dtype=float32), 'loss_reward': Array(0.00453051, dtype=float32), 'loss_cross_entropy': Array(0.5491609, dtype=float32)}


  3%|▎         | 28438/1000000 [1:28:08<37:14:04,  7.25it/s]

{'loss': Array(0.5492604, dtype=float32), 'loss_reward': Array(0.00455316, dtype=float32), 'loss_cross_entropy': Array(0.54470724, dtype=float32)}


  3%|▎         | 28450/1000000 [1:28:10<27:52:14,  9.68it/s]

{'loss': Array(0.5524095, dtype=float32), 'loss_reward': Array(0.0043527, dtype=float32), 'loss_cross_entropy': Array(0.54805684, dtype=float32)}


  3%|▎         | 28458/1000000 [1:28:11<33:05:03,  8.16it/s]

{'loss': Array(0.5526593, dtype=float32), 'loss_reward': Array(0.0044185, dtype=float32), 'loss_cross_entropy': Array(0.5482408, dtype=float32)}


  3%|▎         | 28469/1000000 [1:28:13<39:14:36,  6.88it/s]

{'loss': Array(0.5482575, dtype=float32), 'loss_reward': Array(0.00433618, dtype=float32), 'loss_cross_entropy': Array(0.54392123, dtype=float32)}


  3%|▎         | 28480/1000000 [1:28:15<28:50:23,  9.36it/s]

{'loss': Array(0.549222, dtype=float32), 'loss_reward': Array(0.00428869, dtype=float32), 'loss_cross_entropy': Array(0.5449333, dtype=float32)}


  3%|▎         | 28488/1000000 [1:28:16<34:38:39,  7.79it/s]

{'loss': Array(0.54713005, dtype=float32), 'loss_reward': Array(0.00445844, dtype=float32), 'loss_cross_entropy': Array(0.54267156, dtype=float32)}


  3%|▎         | 28500/1000000 [1:28:18<28:18:18,  9.53it/s]

{'loss': Array(0.5443538, dtype=float32), 'loss_reward': Array(0.00432472, dtype=float32), 'loss_cross_entropy': Array(0.54002905, dtype=float32)}


  3%|▎         | 28510/1000000 [1:28:29<136:33:23,  1.98it/s]

{'loss': Array(0.5571744, dtype=float32), 'loss_reward': Array(0.00448924, dtype=float32), 'loss_cross_entropy': Array(0.5526852, dtype=float32)}


  3%|▎         | 28518/1000000 [1:28:31<69:11:37,  3.90it/s] 

{'loss': Array(0.55117095, dtype=float32), 'loss_reward': Array(0.0044038, dtype=float32), 'loss_cross_entropy': Array(0.5467672, dtype=float32)}


  3%|▎         | 28530/1000000 [1:28:33<35:23:40,  7.62it/s]

{'loss': Array(0.54038876, dtype=float32), 'loss_reward': Array(0.00434867, dtype=float32), 'loss_cross_entropy': Array(0.53604, dtype=float32)}


  3%|▎         | 28540/1000000 [1:28:34<29:27:50,  9.16it/s]

{'loss': Array(0.5553608, dtype=float32), 'loss_reward': Array(0.00442137, dtype=float32), 'loss_cross_entropy': Array(0.5509395, dtype=float32)}


  3%|▎         | 28548/1000000 [1:28:36<40:55:56,  6.59it/s]

{'loss': Array(0.55378926, dtype=float32), 'loss_reward': Array(0.00445574, dtype=float32), 'loss_cross_entropy': Array(0.5493335, dtype=float32)}


  3%|▎         | 28560/1000000 [1:28:37<28:41:32,  9.40it/s]

{'loss': Array(0.5458117, dtype=float32), 'loss_reward': Array(0.00438463, dtype=float32), 'loss_cross_entropy': Array(0.5414271, dtype=float32)}


  3%|▎         | 28568/1000000 [1:28:39<32:33:46,  8.29it/s]

{'loss': Array(0.5506119, dtype=float32), 'loss_reward': Array(0.0045296, dtype=float32), 'loss_cross_entropy': Array(0.5460824, dtype=float32)}


  3%|▎         | 28580/1000000 [1:28:41<27:15:52,  9.90it/s]

{'loss': Array(0.5513711, dtype=float32), 'loss_reward': Array(0.00455942, dtype=float32), 'loss_cross_entropy': Array(0.54681164, dtype=float32)}


  3%|▎         | 28590/1000000 [1:28:42<32:50:00,  8.22it/s]

{'loss': Array(0.5428013, dtype=float32), 'loss_reward': Array(0.00447368, dtype=float32), 'loss_cross_entropy': Array(0.53832763, dtype=float32)}


  3%|▎         | 28598/1000000 [1:28:44<35:31:15,  7.60it/s]

{'loss': Array(0.54530907, dtype=float32), 'loss_reward': Array(0.00441517, dtype=float32), 'loss_cross_entropy': Array(0.540894, dtype=float32)}


  3%|▎         | 28610/1000000 [1:28:46<28:21:17,  9.52it/s]

{'loss': Array(0.5451812, dtype=float32), 'loss_reward': Array(0.00445472, dtype=float32), 'loss_cross_entropy': Array(0.5407265, dtype=float32)}


  3%|▎         | 28618/1000000 [1:28:47<33:26:18,  8.07it/s]

{'loss': Array(0.54244566, dtype=float32), 'loss_reward': Array(0.0044402, dtype=float32), 'loss_cross_entropy': Array(0.53800553, dtype=float32)}


  3%|▎         | 28630/1000000 [1:28:49<30:27:01,  8.86it/s]

{'loss': Array(0.53806573, dtype=float32), 'loss_reward': Array(0.00449309, dtype=float32), 'loss_cross_entropy': Array(0.5335726, dtype=float32)}


  3%|▎         | 28638/1000000 [1:28:51<34:20:10,  7.86it/s]

{'loss': Array(0.544158, dtype=float32), 'loss_reward': Array(0.00442278, dtype=float32), 'loss_cross_entropy': Array(0.5397352, dtype=float32)}


  3%|▎         | 28650/1000000 [1:28:52<28:13:22,  9.56it/s]

{'loss': Array(0.54685897, dtype=float32), 'loss_reward': Array(0.00460147, dtype=float32), 'loss_cross_entropy': Array(0.5422574, dtype=float32)}


  3%|▎         | 28660/1000000 [1:28:54<39:06:01,  6.90it/s]

{'loss': Array(0.5470575, dtype=float32), 'loss_reward': Array(0.00448177, dtype=float32), 'loss_cross_entropy': Array(0.5425758, dtype=float32)}


  3%|▎         | 28668/1000000 [1:28:56<36:40:01,  7.36it/s]

{'loss': Array(0.5535874, dtype=float32), 'loss_reward': Array(0.00445323, dtype=float32), 'loss_cross_entropy': Array(0.5491342, dtype=float32)}


  3%|▎         | 28680/1000000 [1:28:57<27:35:19,  9.78it/s]

{'loss': Array(0.5501606, dtype=float32), 'loss_reward': Array(0.00445876, dtype=float32), 'loss_cross_entropy': Array(0.5457019, dtype=float32)}


  3%|▎         | 28688/1000000 [1:28:59<32:51:01,  8.21it/s]

{'loss': Array(0.5498162, dtype=float32), 'loss_reward': Array(0.00449093, dtype=float32), 'loss_cross_entropy': Array(0.5453253, dtype=float32)}


  3%|▎         | 28699/1000000 [1:29:00<35:28:12,  7.61it/s]

{'loss': Array(0.5457392, dtype=float32), 'loss_reward': Array(0.00444238, dtype=float32), 'loss_cross_entropy': Array(0.54129684, dtype=float32)}


  3%|▎         | 28710/1000000 [1:29:02<28:09:49,  9.58it/s]

{'loss': Array(0.5432667, dtype=float32), 'loss_reward': Array(0.00451418, dtype=float32), 'loss_cross_entropy': Array(0.53875256, dtype=float32)}


  3%|▎         | 28718/1000000 [1:29:03<32:27:38,  8.31it/s]

{'loss': Array(0.54294306, dtype=float32), 'loss_reward': Array(0.00449232, dtype=float32), 'loss_cross_entropy': Array(0.5384507, dtype=float32)}


  3%|▎         | 28730/1000000 [1:29:05<27:02:24,  9.98it/s]

{'loss': Array(0.5454811, dtype=float32), 'loss_reward': Array(0.00452798, dtype=float32), 'loss_cross_entropy': Array(0.5409531, dtype=float32)}


  3%|▎         | 28740/1000000 [1:29:07<32:25:58,  8.32it/s]

{'loss': Array(0.5435962, dtype=float32), 'loss_reward': Array(0.00441848, dtype=float32), 'loss_cross_entropy': Array(0.5391777, dtype=float32)}


  3%|▎         | 28748/1000000 [1:29:08<34:29:45,  7.82it/s]

{'loss': Array(0.54646415, dtype=float32), 'loss_reward': Array(0.0043994, dtype=float32), 'loss_cross_entropy': Array(0.5420647, dtype=float32)}


  3%|▎         | 28760/1000000 [1:29:10<27:01:28,  9.98it/s]

{'loss': Array(0.5391518, dtype=float32), 'loss_reward': Array(0.00441223, dtype=float32), 'loss_cross_entropy': Array(0.5347397, dtype=float32)}


  3%|▎         | 28768/1000000 [1:29:11<33:04:24,  8.16it/s]

{'loss': Array(0.54283965, dtype=float32), 'loss_reward': Array(0.00445058, dtype=float32), 'loss_cross_entropy': Array(0.5383891, dtype=float32)}


  3%|▎         | 28779/1000000 [1:29:13<33:49:06,  7.98it/s]

{'loss': Array(0.5465428, dtype=float32), 'loss_reward': Array(0.00455139, dtype=float32), 'loss_cross_entropy': Array(0.5419914, dtype=float32)}


  3%|▎         | 28790/1000000 [1:29:15<27:46:35,  9.71it/s]

{'loss': Array(0.5412113, dtype=float32), 'loss_reward': Array(0.00439768, dtype=float32), 'loss_cross_entropy': Array(0.5368136, dtype=float32)}


  3%|▎         | 28798/1000000 [1:29:16<34:33:27,  7.81it/s]

{'loss': Array(0.54609984, dtype=float32), 'loss_reward': Array(0.00446172, dtype=float32), 'loss_cross_entropy': Array(0.5416381, dtype=float32)}


  3%|▎         | 28810/1000000 [1:29:18<37:07:14,  7.27it/s]

{'loss': Array(0.5444124, dtype=float32), 'loss_reward': Array(0.00443278, dtype=float32), 'loss_cross_entropy': Array(0.53997964, dtype=float32)}


  3%|▎         | 28818/1000000 [1:29:20<36:10:31,  7.46it/s]

{'loss': Array(0.5439221, dtype=float32), 'loss_reward': Array(0.00451593, dtype=float32), 'loss_cross_entropy': Array(0.53940624, dtype=float32)}


  3%|▎         | 28830/1000000 [1:29:21<27:29:13,  9.81it/s]

{'loss': Array(0.54439527, dtype=float32), 'loss_reward': Array(0.00459791, dtype=float32), 'loss_cross_entropy': Array(0.5397974, dtype=float32)}


  3%|▎         | 28838/1000000 [1:29:23<32:23:38,  8.33it/s]

{'loss': Array(0.53784996, dtype=float32), 'loss_reward': Array(0.00425022, dtype=float32), 'loss_cross_entropy': Array(0.5335998, dtype=float32)}


  3%|▎         | 28849/1000000 [1:29:25<38:57:11,  6.93it/s]

{'loss': Array(0.5406633, dtype=float32), 'loss_reward': Array(0.00448079, dtype=float32), 'loss_cross_entropy': Array(0.5361825, dtype=float32)}


  3%|▎         | 28860/1000000 [1:29:26<29:48:17,  9.05it/s]

{'loss': Array(0.5353239, dtype=float32), 'loss_reward': Array(0.00450775, dtype=float32), 'loss_cross_entropy': Array(0.5308162, dtype=float32)}


  3%|▎         | 28868/1000000 [1:29:28<34:06:00,  7.91it/s]

{'loss': Array(0.538855, dtype=float32), 'loss_reward': Array(0.00428541, dtype=float32), 'loss_cross_entropy': Array(0.5345696, dtype=float32)}


  3%|▎         | 28880/1000000 [1:29:30<27:52:00,  9.68it/s]

{'loss': Array(0.5300561, dtype=float32), 'loss_reward': Array(0.00435545, dtype=float32), 'loss_cross_entropy': Array(0.5257006, dtype=float32)}


  3%|▎         | 28890/1000000 [1:29:31<35:41:23,  7.56it/s]

{'loss': Array(0.5400905, dtype=float32), 'loss_reward': Array(0.00454565, dtype=float32), 'loss_cross_entropy': Array(0.5355449, dtype=float32)}


  3%|▎         | 28900/1000000 [1:29:33<33:01:57,  8.17it/s]

{'loss': Array(0.5392092, dtype=float32), 'loss_reward': Array(0.00446266, dtype=float32), 'loss_cross_entropy': Array(0.53474647, dtype=float32)}


  3%|▎         | 28908/1000000 [1:29:35<35:03:54,  7.69it/s]

{'loss': Array(0.53889364, dtype=float32), 'loss_reward': Array(0.00437875, dtype=float32), 'loss_cross_entropy': Array(0.53451484, dtype=float32)}


  3%|▎         | 28920/1000000 [1:29:36<27:29:46,  9.81it/s]

{'loss': Array(0.53849053, dtype=float32), 'loss_reward': Array(0.00436177, dtype=float32), 'loss_cross_entropy': Array(0.5341288, dtype=float32)}


  3%|▎         | 28930/1000000 [1:29:38<32:05:28,  8.41it/s]

{'loss': Array(0.5403789, dtype=float32), 'loss_reward': Array(0.00437332, dtype=float32), 'loss_cross_entropy': Array(0.5360056, dtype=float32)}


  3%|▎         | 28938/1000000 [1:29:40<33:53:55,  7.96it/s]

{'loss': Array(0.5316954, dtype=float32), 'loss_reward': Array(0.0044108, dtype=float32), 'loss_cross_entropy': Array(0.5272846, dtype=float32)}


  3%|▎         | 28950/1000000 [1:29:41<27:19:00,  9.87it/s]

{'loss': Array(0.5442756, dtype=float32), 'loss_reward': Array(0.00443591, dtype=float32), 'loss_cross_entropy': Array(0.5398397, dtype=float32)}


  3%|▎         | 28958/1000000 [1:29:43<32:27:43,  8.31it/s]

{'loss': Array(0.53828305, dtype=float32), 'loss_reward': Array(0.00441455, dtype=float32), 'loss_cross_entropy': Array(0.5338685, dtype=float32)}


  3%|▎         | 28970/1000000 [1:29:44<29:54:46,  9.02it/s]

{'loss': Array(0.54163206, dtype=float32), 'loss_reward': Array(0.00446387, dtype=float32), 'loss_cross_entropy': Array(0.5371682, dtype=float32)}


  3%|▎         | 28978/1000000 [1:29:46<33:01:49,  8.17it/s]

{'loss': Array(0.5359815, dtype=float32), 'loss_reward': Array(0.00437381, dtype=float32), 'loss_cross_entropy': Array(0.5316076, dtype=float32)}


  3%|▎         | 28990/1000000 [1:29:48<27:03:39,  9.97it/s]

{'loss': Array(0.53688115, dtype=float32), 'loss_reward': Array(0.00432908, dtype=float32), 'loss_cross_entropy': Array(0.532552, dtype=float32)}


  3%|▎         | 29000/1000000 [1:29:49<38:46:09,  6.96it/s]

{'loss': Array(0.5343495, dtype=float32), 'loss_reward': Array(0.00451861, dtype=float32), 'loss_cross_entropy': Array(0.5298309, dtype=float32)}


  3%|▎         | 29008/1000000 [1:30:01<178:55:19,  1.51it/s]

{'loss': Array(0.5486856, dtype=float32), 'loss_reward': Array(0.00456888, dtype=float32), 'loss_cross_entropy': Array(0.54411674, dtype=float32)}


  3%|▎         | 29019/1000000 [1:30:02<63:44:32,  4.23it/s] 

{'loss': Array(0.5391402, dtype=float32), 'loss_reward': Array(0.00446266, dtype=float32), 'loss_cross_entropy': Array(0.5346775, dtype=float32)}


  3%|▎         | 29030/1000000 [1:30:04<34:56:23,  7.72it/s]

{'loss': Array(0.54600495, dtype=float32), 'loss_reward': Array(0.0043768, dtype=float32), 'loss_cross_entropy': Array(0.54162806, dtype=float32)}


  3%|▎         | 29040/1000000 [1:30:06<37:50:44,  7.13it/s]

{'loss': Array(0.5353495, dtype=float32), 'loss_reward': Array(0.0043297, dtype=float32), 'loss_cross_entropy': Array(0.5310198, dtype=float32)}


  3%|▎         | 29048/1000000 [1:30:07<38:22:28,  7.03it/s]

{'loss': Array(0.5386211, dtype=float32), 'loss_reward': Array(0.0044137, dtype=float32), 'loss_cross_entropy': Array(0.53420746, dtype=float32)}


  3%|▎         | 29060/1000000 [1:30:09<28:38:10,  9.42it/s]

{'loss': Array(0.54151624, dtype=float32), 'loss_reward': Array(0.00452339, dtype=float32), 'loss_cross_entropy': Array(0.53699285, dtype=float32)}


  3%|▎         | 29068/1000000 [1:30:10<32:55:26,  8.19it/s]

{'loss': Array(0.539664, dtype=float32), 'loss_reward': Array(0.00437246, dtype=float32), 'loss_cross_entropy': Array(0.53529155, dtype=float32)}


  3%|▎         | 29079/1000000 [1:30:12<35:09:51,  7.67it/s]

{'loss': Array(0.5415506, dtype=float32), 'loss_reward': Array(0.00453616, dtype=float32), 'loss_cross_entropy': Array(0.53701437, dtype=float32)}


  3%|▎         | 29090/1000000 [1:30:14<27:59:21,  9.64it/s]

{'loss': Array(0.53802663, dtype=float32), 'loss_reward': Array(0.00432442, dtype=float32), 'loss_cross_entropy': Array(0.5337022, dtype=float32)}


  3%|▎         | 29098/1000000 [1:30:15<34:07:07,  7.90it/s]

{'loss': Array(0.5364297, dtype=float32), 'loss_reward': Array(0.00440163, dtype=float32), 'loss_cross_entropy': Array(0.53202814, dtype=float32)}


  3%|▎         | 29110/1000000 [1:30:17<27:16:54,  9.89it/s]

{'loss': Array(0.5416486, dtype=float32), 'loss_reward': Array(0.0045426, dtype=float32), 'loss_cross_entropy': Array(0.53710604, dtype=float32)}


  3%|▎         | 29120/1000000 [1:30:19<32:00:52,  8.42it/s]

{'loss': Array(0.5396662, dtype=float32), 'loss_reward': Array(0.00441547, dtype=float32), 'loss_cross_entropy': Array(0.53525066, dtype=float32)}


  3%|▎         | 29128/1000000 [1:30:20<34:51:50,  7.74it/s]

{'loss': Array(0.5384802, dtype=float32), 'loss_reward': Array(0.00438925, dtype=float32), 'loss_cross_entropy': Array(0.53409094, dtype=float32)}


  3%|▎         | 29140/1000000 [1:30:22<28:02:03,  9.62it/s]

{'loss': Array(0.53845865, dtype=float32), 'loss_reward': Array(0.00451877, dtype=float32), 'loss_cross_entropy': Array(0.53393996, dtype=float32)}


  3%|▎         | 29148/1000000 [1:30:23<33:36:35,  8.02it/s]

{'loss': Array(0.5361106, dtype=float32), 'loss_reward': Array(0.00440051, dtype=float32), 'loss_cross_entropy': Array(0.53171015, dtype=float32)}


  3%|▎         | 29160/1000000 [1:30:25<31:05:24,  8.67it/s]

{'loss': Array(0.5380307, dtype=float32), 'loss_reward': Array(0.00447594, dtype=float32), 'loss_cross_entropy': Array(0.53355473, dtype=float32)}


  3%|▎         | 29168/1000000 [1:30:27<35:04:14,  7.69it/s]

{'loss': Array(0.5358018, dtype=float32), 'loss_reward': Array(0.00442264, dtype=float32), 'loss_cross_entropy': Array(0.53137916, dtype=float32)}


  3%|▎         | 29180/1000000 [1:30:29<27:04:58,  9.96it/s]

{'loss': Array(0.53482324, dtype=float32), 'loss_reward': Array(0.00435038, dtype=float32), 'loss_cross_entropy': Array(0.5304729, dtype=float32)}


  3%|▎         | 29190/1000000 [1:30:30<38:25:33,  7.02it/s]

{'loss': Array(0.5340932, dtype=float32), 'loss_reward': Array(0.00444314, dtype=float32), 'loss_cross_entropy': Array(0.52965015, dtype=float32)}


  3%|▎         | 29198/1000000 [1:30:32<37:27:48,  7.20it/s]

{'loss': Array(0.52022856, dtype=float32), 'loss_reward': Array(0.00424155, dtype=float32), 'loss_cross_entropy': Array(0.51598704, dtype=float32)}


  3%|▎         | 29210/1000000 [1:30:34<27:26:23,  9.83it/s]

{'loss': Array(0.5302295, dtype=float32), 'loss_reward': Array(0.00447068, dtype=float32), 'loss_cross_entropy': Array(0.52575886, dtype=float32)}


  3%|▎         | 29218/1000000 [1:30:35<32:41:37,  8.25it/s]

{'loss': Array(0.5330414, dtype=float32), 'loss_reward': Array(0.00439153, dtype=float32), 'loss_cross_entropy': Array(0.5286498, dtype=float32)}


  3%|▎         | 29229/1000000 [1:30:37<34:24:58,  7.84it/s]

{'loss': Array(0.5311166, dtype=float32), 'loss_reward': Array(0.00433614, dtype=float32), 'loss_cross_entropy': Array(0.5267804, dtype=float32)}


  3%|▎         | 29240/1000000 [1:30:38<28:02:22,  9.62it/s]

{'loss': Array(0.5264608, dtype=float32), 'loss_reward': Array(0.00440468, dtype=float32), 'loss_cross_entropy': Array(0.5220562, dtype=float32)}


  3%|▎         | 29248/1000000 [1:30:40<32:30:25,  8.30it/s]

{'loss': Array(0.5323262, dtype=float32), 'loss_reward': Array(0.0043312, dtype=float32), 'loss_cross_entropy': Array(0.52799505, dtype=float32)}


  3%|▎         | 29260/1000000 [1:30:41<27:02:03,  9.97it/s]

{'loss': Array(0.52571326, dtype=float32), 'loss_reward': Array(0.00450133, dtype=float32), 'loss_cross_entropy': Array(0.5212119, dtype=float32)}


  3%|▎         | 29268/1000000 [1:30:43<39:00:23,  6.91it/s]

{'loss': Array(0.5312783, dtype=float32), 'loss_reward': Array(0.00428165, dtype=float32), 'loss_cross_entropy': Array(0.5269967, dtype=float32)}


  3%|▎         | 29280/1000000 [1:30:45<28:18:54,  9.52it/s]

{'loss': Array(0.5418759, dtype=float32), 'loss_reward': Array(0.00448348, dtype=float32), 'loss_cross_entropy': Array(0.53739244, dtype=float32)}


  3%|▎         | 29290/1000000 [1:30:46<28:37:49,  9.42it/s]

{'loss': Array(0.532788, dtype=float32), 'loss_reward': Array(0.00425215, dtype=float32), 'loss_cross_entropy': Array(0.5285359, dtype=float32)}


  3%|▎         | 29298/1000000 [1:30:48<33:15:28,  8.11it/s]

{'loss': Array(0.53192335, dtype=float32), 'loss_reward': Array(0.00420904, dtype=float32), 'loss_cross_entropy': Array(0.5277143, dtype=float32)}


  3%|▎         | 29310/1000000 [1:30:50<31:34:57,  8.54it/s]

{'loss': Array(0.53370243, dtype=float32), 'loss_reward': Array(0.0044587, dtype=float32), 'loss_cross_entropy': Array(0.5292438, dtype=float32)}


  3%|▎         | 29318/1000000 [1:30:51<33:51:15,  7.96it/s]

{'loss': Array(0.53231615, dtype=float32), 'loss_reward': Array(0.00443513, dtype=float32), 'loss_cross_entropy': Array(0.5278811, dtype=float32)}


  3%|▎         | 29329/1000000 [1:30:53<28:27:43,  9.47it/s]

{'loss': Array(0.53050286, dtype=float32), 'loss_reward': Array(0.0042543, dtype=float32), 'loss_cross_entropy': Array(0.5262486, dtype=float32)}


  3%|▎         | 29340/1000000 [1:30:54<26:46:56, 10.07it/s]

{'loss': Array(0.5261937, dtype=float32), 'loss_reward': Array(0.00439807, dtype=float32), 'loss_cross_entropy': Array(0.5217957, dtype=float32)}


  3%|▎         | 29348/1000000 [1:30:56<37:14:57,  7.24it/s]

{'loss': Array(0.53738374, dtype=float32), 'loss_reward': Array(0.00431628, dtype=float32), 'loss_cross_entropy': Array(0.5330674, dtype=float32)}


  3%|▎         | 29360/1000000 [1:30:58<28:38:29,  9.41it/s]

{'loss': Array(0.52606106, dtype=float32), 'loss_reward': Array(0.00437981, dtype=float32), 'loss_cross_entropy': Array(0.52168125, dtype=float32)}


  3%|▎         | 29368/1000000 [1:30:59<34:12:40,  7.88it/s]

{'loss': Array(0.51973784, dtype=float32), 'loss_reward': Array(0.00432506, dtype=float32), 'loss_cross_entropy': Array(0.51541275, dtype=float32)}


  3%|▎         | 29379/1000000 [1:31:01<40:15:33,  6.70it/s]

{'loss': Array(0.52560145, dtype=float32), 'loss_reward': Array(0.00440946, dtype=float32), 'loss_cross_entropy': Array(0.521192, dtype=float32)}


  3%|▎         | 29390/1000000 [1:31:03<28:38:25,  9.41it/s]

{'loss': Array(0.5257177, dtype=float32), 'loss_reward': Array(0.00435869, dtype=float32), 'loss_cross_entropy': Array(0.52135897, dtype=float32)}


  3%|▎         | 29398/1000000 [1:31:04<32:46:42,  8.23it/s]

{'loss': Array(0.52474797, dtype=float32), 'loss_reward': Array(0.00442102, dtype=float32), 'loss_cross_entropy': Array(0.5203269, dtype=float32)}


  3%|▎         | 29410/1000000 [1:31:06<26:45:38, 10.07it/s]

{'loss': Array(0.53261656, dtype=float32), 'loss_reward': Array(0.00424739, dtype=float32), 'loss_cross_entropy': Array(0.5283691, dtype=float32)}


  3%|▎         | 29420/1000000 [1:31:08<35:34:42,  7.58it/s]

{'loss': Array(0.5334497, dtype=float32), 'loss_reward': Array(0.00431589, dtype=float32), 'loss_cross_entropy': Array(0.5291338, dtype=float32)}


  3%|▎         | 29428/1000000 [1:31:09<37:22:30,  7.21it/s]

{'loss': Array(0.52707916, dtype=float32), 'loss_reward': Array(0.00442937, dtype=float32), 'loss_cross_entropy': Array(0.5226498, dtype=float32)}


  3%|▎         | 29440/1000000 [1:31:11<28:06:48,  9.59it/s]

{'loss': Array(0.5284445, dtype=float32), 'loss_reward': Array(0.00434666, dtype=float32), 'loss_cross_entropy': Array(0.5240979, dtype=float32)}


  3%|▎         | 29448/1000000 [1:31:12<32:55:05,  8.19it/s]

{'loss': Array(0.5327161, dtype=float32), 'loss_reward': Array(0.0044164, dtype=float32), 'loss_cross_entropy': Array(0.52829975, dtype=float32)}


  3%|▎         | 29459/1000000 [1:31:14<35:44:44,  7.54it/s]

{'loss': Array(0.52629465, dtype=float32), 'loss_reward': Array(0.00441472, dtype=float32), 'loss_cross_entropy': Array(0.5218799, dtype=float32)}


  3%|▎         | 29470/1000000 [1:31:16<28:02:45,  9.61it/s]

{'loss': Array(0.5283905, dtype=float32), 'loss_reward': Array(0.00432359, dtype=float32), 'loss_cross_entropy': Array(0.52406687, dtype=float32)}


  3%|▎         | 29478/1000000 [1:31:17<33:15:39,  8.11it/s]

{'loss': Array(0.5305152, dtype=float32), 'loss_reward': Array(0.00440455, dtype=float32), 'loss_cross_entropy': Array(0.5261107, dtype=float32)}


  3%|▎         | 29489/1000000 [1:31:19<30:30:49,  8.83it/s]

{'loss': Array(0.52057916, dtype=float32), 'loss_reward': Array(0.00425777, dtype=float32), 'loss_cross_entropy': Array(0.5163214, dtype=float32)}


  3%|▎         | 29499/1000000 [1:31:21<33:44:30,  7.99it/s]

{'loss': Array(0.5365617, dtype=float32), 'loss_reward': Array(0.00438693, dtype=float32), 'loss_cross_entropy': Array(0.5321749, dtype=float32)}


  3%|▎         | 29510/1000000 [1:31:32<123:01:50,  2.19it/s]

{'loss': Array(0.5403358, dtype=float32), 'loss_reward': Array(0.00445057, dtype=float32), 'loss_cross_entropy': Array(0.53588516, dtype=float32)}


  3%|▎         | 29518/1000000 [1:31:34<65:46:47,  4.10it/s] 

{'loss': Array(0.53619504, dtype=float32), 'loss_reward': Array(0.00451866, dtype=float32), 'loss_cross_entropy': Array(0.5316765, dtype=float32)}


  3%|▎         | 29530/1000000 [1:31:35<34:42:09,  7.77it/s]

{'loss': Array(0.5265164, dtype=float32), 'loss_reward': Array(0.00427231, dtype=float32), 'loss_cross_entropy': Array(0.5222441, dtype=float32)}


  3%|▎         | 29538/1000000 [1:31:37<41:10:40,  6.55it/s]

{'loss': Array(0.53701514, dtype=float32), 'loss_reward': Array(0.00445563, dtype=float32), 'loss_cross_entropy': Array(0.5325596, dtype=float32)}


  3%|▎         | 29550/1000000 [1:31:39<29:43:24,  9.07it/s]

{'loss': Array(0.5243044, dtype=float32), 'loss_reward': Array(0.00428134, dtype=float32), 'loss_cross_entropy': Array(0.5200231, dtype=float32)}


  3%|▎         | 29558/1000000 [1:31:40<33:15:43,  8.10it/s]

{'loss': Array(0.53418845, dtype=float32), 'loss_reward': Array(0.00439483, dtype=float32), 'loss_cross_entropy': Array(0.5297936, dtype=float32)}


  3%|▎         | 29570/1000000 [1:31:42<36:17:20,  7.43it/s]

{'loss': Array(0.5305504, dtype=float32), 'loss_reward': Array(0.0043867, dtype=float32), 'loss_cross_entropy': Array(0.52616376, dtype=float32)}


  3%|▎         | 29580/1000000 [1:31:44<30:39:51,  8.79it/s]

{'loss': Array(0.5280701, dtype=float32), 'loss_reward': Array(0.00431934, dtype=float32), 'loss_cross_entropy': Array(0.5237507, dtype=float32)}


  3%|▎         | 29588/1000000 [1:31:45<33:56:17,  7.94it/s]

{'loss': Array(0.53076804, dtype=float32), 'loss_reward': Array(0.00439488, dtype=float32), 'loss_cross_entropy': Array(0.5263731, dtype=float32)}


  3%|▎         | 29600/1000000 [1:31:47<27:29:48,  9.80it/s]

{'loss': Array(0.5242238, dtype=float32), 'loss_reward': Array(0.00428674, dtype=float32), 'loss_cross_entropy': Array(0.51993716, dtype=float32)}


  3%|▎         | 29608/1000000 [1:31:49<43:09:11,  6.25it/s]

{'loss': Array(0.52670354, dtype=float32), 'loss_reward': Array(0.00426485, dtype=float32), 'loss_cross_entropy': Array(0.5224387, dtype=float32)}


  3%|▎         | 29620/1000000 [1:31:50<29:37:04,  9.10it/s]

{'loss': Array(0.5280046, dtype=float32), 'loss_reward': Array(0.00426984, dtype=float32), 'loss_cross_entropy': Array(0.5237347, dtype=float32)}


  3%|▎         | 29628/1000000 [1:31:52<34:27:36,  7.82it/s]

{'loss': Array(0.53566545, dtype=float32), 'loss_reward': Array(0.00443048, dtype=float32), 'loss_cross_entropy': Array(0.531235, dtype=float32)}


  3%|▎         | 29640/1000000 [1:31:53<27:53:16,  9.67it/s]

{'loss': Array(0.527888, dtype=float32), 'loss_reward': Array(0.00449483, dtype=float32), 'loss_cross_entropy': Array(0.5233932, dtype=float32)}


  3%|▎         | 29648/1000000 [1:31:55<41:04:30,  6.56it/s]

{'loss': Array(0.52149105, dtype=float32), 'loss_reward': Array(0.00426733, dtype=float32), 'loss_cross_entropy': Array(0.5172238, dtype=float32)}


  3%|▎         | 29660/1000000 [1:31:57<29:30:20,  9.14it/s]

{'loss': Array(0.5277316, dtype=float32), 'loss_reward': Array(0.00441946, dtype=float32), 'loss_cross_entropy': Array(0.5233123, dtype=float32)}


  3%|▎         | 29668/1000000 [1:31:58<33:24:17,  8.07it/s]

{'loss': Array(0.5295458, dtype=float32), 'loss_reward': Array(0.00444224, dtype=float32), 'loss_cross_entropy': Array(0.52510357, dtype=float32)}


  3%|▎         | 29680/1000000 [1:32:00<27:27:16,  9.82it/s]

{'loss': Array(0.5220063, dtype=float32), 'loss_reward': Array(0.00431481, dtype=float32), 'loss_cross_entropy': Array(0.51769155, dtype=float32)}


  3%|▎         | 29690/1000000 [1:32:02<32:07:44,  8.39it/s]

{'loss': Array(0.52519554, dtype=float32), 'loss_reward': Array(0.00425244, dtype=float32), 'loss_cross_entropy': Array(0.5209431, dtype=float32)}


  3%|▎         | 29698/1000000 [1:32:03<34:49:28,  7.74it/s]

{'loss': Array(0.5207907, dtype=float32), 'loss_reward': Array(0.00428311, dtype=float32), 'loss_cross_entropy': Array(0.51650757, dtype=float32)}


  3%|▎         | 29710/1000000 [1:32:05<27:31:16,  9.79it/s]

{'loss': Array(0.521686, dtype=float32), 'loss_reward': Array(0.00455249, dtype=float32), 'loss_cross_entropy': Array(0.5171336, dtype=float32)}


  3%|▎         | 29720/1000000 [1:32:07<39:47:47,  6.77it/s]

{'loss': Array(0.51877004, dtype=float32), 'loss_reward': Array(0.00429493, dtype=float32), 'loss_cross_entropy': Array(0.51447505, dtype=float32)}


  3%|▎         | 29728/1000000 [1:32:09<37:52:51,  7.11it/s]

{'loss': Array(0.5260398, dtype=float32), 'loss_reward': Array(0.00433835, dtype=float32), 'loss_cross_entropy': Array(0.52170146, dtype=float32)}


  3%|▎         | 29740/1000000 [1:32:10<28:54:15,  9.32it/s]

{'loss': Array(0.51752275, dtype=float32), 'loss_reward': Array(0.00454154, dtype=float32), 'loss_cross_entropy': Array(0.51298124, dtype=float32)}


  3%|▎         | 29748/1000000 [1:32:12<33:57:58,  7.93it/s]

{'loss': Array(0.52564675, dtype=float32), 'loss_reward': Array(0.00434753, dtype=float32), 'loss_cross_entropy': Array(0.52129924, dtype=float32)}


  3%|▎         | 29759/1000000 [1:32:14<40:10:49,  6.71it/s]

{'loss': Array(0.5225221, dtype=float32), 'loss_reward': Array(0.00419041, dtype=float32), 'loss_cross_entropy': Array(0.51833165, dtype=float32)}


  3%|▎         | 29770/1000000 [1:32:15<29:50:39,  9.03it/s]

{'loss': Array(0.52660125, dtype=float32), 'loss_reward': Array(0.00432077, dtype=float32), 'loss_cross_entropy': Array(0.52228045, dtype=float32)}


  3%|▎         | 29778/1000000 [1:32:17<34:12:20,  7.88it/s]

{'loss': Array(0.52502686, dtype=float32), 'loss_reward': Array(0.00438053, dtype=float32), 'loss_cross_entropy': Array(0.5206463, dtype=float32)}


  3%|▎         | 29790/1000000 [1:32:18<27:38:18,  9.75it/s]

{'loss': Array(0.52303237, dtype=float32), 'loss_reward': Array(0.00441236, dtype=float32), 'loss_cross_entropy': Array(0.51861995, dtype=float32)}


  3%|▎         | 29799/1000000 [1:32:20<37:02:05,  7.28it/s]

{'loss': Array(0.5241467, dtype=float32), 'loss_reward': Array(0.00439474, dtype=float32), 'loss_cross_entropy': Array(0.51975197, dtype=float32)}


  3%|▎         | 29810/1000000 [1:32:22<28:13:56,  9.55it/s]

{'loss': Array(0.5241738, dtype=float32), 'loss_reward': Array(0.00427769, dtype=float32), 'loss_cross_entropy': Array(0.5198961, dtype=float32)}


  3%|▎         | 29818/1000000 [1:32:23<32:59:31,  8.17it/s]

{'loss': Array(0.5256564, dtype=float32), 'loss_reward': Array(0.00428515, dtype=float32), 'loss_cross_entropy': Array(0.52137125, dtype=float32)}


  3%|▎         | 29830/1000000 [1:32:25<27:27:57,  9.81it/s]

{'loss': Array(0.51673406, dtype=float32), 'loss_reward': Array(0.0041491, dtype=float32), 'loss_cross_entropy': Array(0.5125849, dtype=float32)}


  3%|▎         | 29838/1000000 [1:32:27<39:17:36,  6.86it/s]

{'loss': Array(0.5230846, dtype=float32), 'loss_reward': Array(0.00432429, dtype=float32), 'loss_cross_entropy': Array(0.51876026, dtype=float32)}


  3%|▎         | 29849/1000000 [1:32:28<29:19:47,  9.19it/s]

{'loss': Array(0.52320546, dtype=float32), 'loss_reward': Array(0.00437933, dtype=float32), 'loss_cross_entropy': Array(0.5188261, dtype=float32)}


  3%|▎         | 29860/1000000 [1:32:30<27:25:00,  9.83it/s]

{'loss': Array(0.5191323, dtype=float32), 'loss_reward': Array(0.00435229, dtype=float32), 'loss_cross_entropy': Array(0.5147801, dtype=float32)}


  3%|▎         | 29868/1000000 [1:32:31<33:18:56,  8.09it/s]

{'loss': Array(0.52659184, dtype=float32), 'loss_reward': Array(0.00444219, dtype=float32), 'loss_cross_entropy': Array(0.52214974, dtype=float32)}


  3%|▎         | 29880/1000000 [1:32:33<30:55:01,  8.72it/s]

{'loss': Array(0.52400434, dtype=float32), 'loss_reward': Array(0.00427308, dtype=float32), 'loss_cross_entropy': Array(0.5197312, dtype=float32)}


  3%|▎         | 29888/1000000 [1:32:35<34:38:56,  7.78it/s]

{'loss': Array(0.5313912, dtype=float32), 'loss_reward': Array(0.00426654, dtype=float32), 'loss_cross_entropy': Array(0.52712464, dtype=float32)}


  3%|▎         | 29900/1000000 [1:32:37<28:07:59,  9.58it/s]

{'loss': Array(0.518266, dtype=float32), 'loss_reward': Array(0.00428593, dtype=float32), 'loss_cross_entropy': Array(0.5139801, dtype=float32)}


  3%|▎         | 29910/1000000 [1:32:38<39:50:54,  6.76it/s]

{'loss': Array(0.5298685, dtype=float32), 'loss_reward': Array(0.00437397, dtype=float32), 'loss_cross_entropy': Array(0.52549446, dtype=float32)}


  3%|▎         | 29918/1000000 [1:32:40<37:46:11,  7.13it/s]

{'loss': Array(0.52117836, dtype=float32), 'loss_reward': Array(0.00428285, dtype=float32), 'loss_cross_entropy': Array(0.5168955, dtype=float32)}


  3%|▎         | 29929/1000000 [1:32:42<30:11:16,  8.93it/s]

{'loss': Array(0.5202833, dtype=float32), 'loss_reward': Array(0.00446308, dtype=float32), 'loss_cross_entropy': Array(0.5158202, dtype=float32)}


  3%|▎         | 29940/1000000 [1:32:43<27:35:59,  9.76it/s]

{'loss': Array(0.5234688, dtype=float32), 'loss_reward': Array(0.00426551, dtype=float32), 'loss_cross_entropy': Array(0.51920325, dtype=float32)}


  3%|▎         | 29948/1000000 [1:32:45<42:56:14,  6.28it/s]

{'loss': Array(0.5191895, dtype=float32), 'loss_reward': Array(0.00431305, dtype=float32), 'loss_cross_entropy': Array(0.5148765, dtype=float32)}


  3%|▎         | 29960/1000000 [1:32:47<29:26:31,  9.15it/s]

{'loss': Array(0.5238268, dtype=float32), 'loss_reward': Array(0.00420006, dtype=float32), 'loss_cross_entropy': Array(0.5196268, dtype=float32)}


  3%|▎         | 29968/1000000 [1:32:48<33:43:15,  7.99it/s]

{'loss': Array(0.51881784, dtype=float32), 'loss_reward': Array(0.00434939, dtype=float32), 'loss_cross_entropy': Array(0.5144685, dtype=float32)}


  3%|▎         | 29979/1000000 [1:32:50<29:47:33,  9.04it/s]

{'loss': Array(0.532602, dtype=float32), 'loss_reward': Array(0.0043622, dtype=float32), 'loss_cross_entropy': Array(0.5282398, dtype=float32)}


  3%|▎         | 29989/1000000 [1:32:52<35:45:49,  7.53it/s]

{'loss': Array(0.5230439, dtype=float32), 'loss_reward': Array(0.00432407, dtype=float32), 'loss_cross_entropy': Array(0.51871985, dtype=float32)}


  3%|▎         | 30000/1000000 [1:32:53<28:23:40,  9.49it/s]

{'loss': Array(0.5209851, dtype=float32), 'loss_reward': Array(0.00422617, dtype=float32), 'loss_cross_entropy': Array(0.5167589, dtype=float32)}


  3%|▎         | 30008/1000000 [1:33:05<174:46:43,  1.54it/s]

{'loss': Array(0.52421975, dtype=float32), 'loss_reward': Array(0.0044563, dtype=float32), 'loss_cross_entropy': Array(0.5197635, dtype=float32)}


  3%|▎         | 30020/1000000 [1:33:06<59:35:30,  4.52it/s] 

{'loss': Array(0.52079004, dtype=float32), 'loss_reward': Array(0.00428874, dtype=float32), 'loss_cross_entropy': Array(0.5165012, dtype=float32)}


  3%|▎         | 30030/1000000 [1:33:08<40:44:28,  6.61it/s]

{'loss': Array(0.5210437, dtype=float32), 'loss_reward': Array(0.00419912, dtype=float32), 'loss_cross_entropy': Array(0.5168446, dtype=float32)}


  3%|▎         | 30038/1000000 [1:33:10<37:52:03,  7.12it/s]

{'loss': Array(0.5144516, dtype=float32), 'loss_reward': Array(0.00424741, dtype=float32), 'loss_cross_entropy': Array(0.51020426, dtype=float32)}


  3%|▎         | 30050/1000000 [1:33:11<28:32:06,  9.44it/s]

{'loss': Array(0.5199911, dtype=float32), 'loss_reward': Array(0.0042528, dtype=float32), 'loss_cross_entropy': Array(0.5157383, dtype=float32)}


  3%|▎         | 30058/1000000 [1:33:13<33:38:33,  8.01it/s]

{'loss': Array(0.52016157, dtype=float32), 'loss_reward': Array(0.00426481, dtype=float32), 'loss_cross_entropy': Array(0.51589674, dtype=float32)}


  3%|▎         | 30070/1000000 [1:33:15<30:24:23,  8.86it/s]

{'loss': Array(0.5144374, dtype=float32), 'loss_reward': Array(0.00421247, dtype=float32), 'loss_cross_entropy': Array(0.5102249, dtype=float32)}


  3%|▎         | 30080/1000000 [1:33:16<29:12:16,  9.23it/s]

{'loss': Array(0.52439725, dtype=float32), 'loss_reward': Array(0.0043709, dtype=float32), 'loss_cross_entropy': Array(0.5200263, dtype=float32)}


  3%|▎         | 30088/1000000 [1:33:18<34:36:17,  7.79it/s]

{'loss': Array(0.5233224, dtype=float32), 'loss_reward': Array(0.00432886, dtype=float32), 'loss_cross_entropy': Array(0.51899356, dtype=float32)}


  3%|▎         | 30100/1000000 [1:33:20<37:21:22,  7.21it/s]

{'loss': Array(0.5217225, dtype=float32), 'loss_reward': Array(0.00458072, dtype=float32), 'loss_cross_entropy': Array(0.5171418, dtype=float32)}


  3%|▎         | 30108/1000000 [1:33:21<36:23:45,  7.40it/s]

{'loss': Array(0.51520705, dtype=float32), 'loss_reward': Array(0.00420786, dtype=float32), 'loss_cross_entropy': Array(0.5109992, dtype=float32)}


  3%|▎         | 30120/1000000 [1:33:23<28:41:17,  9.39it/s]

{'loss': Array(0.5243298, dtype=float32), 'loss_reward': Array(0.00431053, dtype=float32), 'loss_cross_entropy': Array(0.52001923, dtype=float32)}


  3%|▎         | 30128/1000000 [1:33:25<32:53:47,  8.19it/s]

{'loss': Array(0.53303325, dtype=float32), 'loss_reward': Array(0.004399, dtype=float32), 'loss_cross_entropy': Array(0.52863413, dtype=float32)}


  3%|▎         | 30140/1000000 [1:33:27<34:08:58,  7.89it/s]

{'loss': Array(0.5237459, dtype=float32), 'loss_reward': Array(0.00439066, dtype=float32), 'loss_cross_entropy': Array(0.5193552, dtype=float32)}


  3%|▎         | 30150/1000000 [1:33:28<30:00:44,  8.98it/s]

{'loss': Array(0.5235734, dtype=float32), 'loss_reward': Array(0.00428898, dtype=float32), 'loss_cross_entropy': Array(0.5192844, dtype=float32)}


  3%|▎         | 30158/1000000 [1:33:30<34:36:21,  7.78it/s]

{'loss': Array(0.5207714, dtype=float32), 'loss_reward': Array(0.00429504, dtype=float32), 'loss_cross_entropy': Array(0.51647633, dtype=float32)}


  3%|▎         | 30170/1000000 [1:33:31<27:31:47,  9.79it/s]

{'loss': Array(0.52320784, dtype=float32), 'loss_reward': Array(0.00428193, dtype=float32), 'loss_cross_entropy': Array(0.51892585, dtype=float32)}


  3%|▎         | 30179/1000000 [1:33:33<38:09:18,  7.06it/s]

{'loss': Array(0.51301473, dtype=float32), 'loss_reward': Array(0.00420379, dtype=float32), 'loss_cross_entropy': Array(0.508811, dtype=float32)}


  3%|▎         | 30189/1000000 [1:33:35<30:20:37,  8.88it/s]

{'loss': Array(0.5190136, dtype=float32), 'loss_reward': Array(0.0042801, dtype=float32), 'loss_cross_entropy': Array(0.5147335, dtype=float32)}


  3%|▎         | 30200/1000000 [1:33:36<27:29:22,  9.80it/s]

{'loss': Array(0.5169989, dtype=float32), 'loss_reward': Array(0.00420472, dtype=float32), 'loss_cross_entropy': Array(0.51279414, dtype=float32)}


  3%|▎         | 30208/1000000 [1:33:38<33:06:42,  8.14it/s]

{'loss': Array(0.5108158, dtype=float32), 'loss_reward': Array(0.00433365, dtype=float32), 'loss_cross_entropy': Array(0.5064822, dtype=float32)}


  3%|▎         | 30219/1000000 [1:33:40<33:37:41,  8.01it/s]

{'loss': Array(0.51559275, dtype=float32), 'loss_reward': Array(0.00431934, dtype=float32), 'loss_cross_entropy': Array(0.51127344, dtype=float32)}


  3%|▎         | 30230/1000000 [1:33:41<28:29:10,  9.46it/s]

{'loss': Array(0.5080002, dtype=float32), 'loss_reward': Array(0.00432246, dtype=float32), 'loss_cross_entropy': Array(0.5036778, dtype=float32)}


  3%|▎         | 30238/1000000 [1:33:43<34:17:14,  7.86it/s]

{'loss': Array(0.51254594, dtype=float32), 'loss_reward': Array(0.00433028, dtype=float32), 'loss_cross_entropy': Array(0.5082156, dtype=float32)}


  3%|▎         | 30250/1000000 [1:33:44<27:23:21,  9.84it/s]

{'loss': Array(0.52190864, dtype=float32), 'loss_reward': Array(0.0043111, dtype=float32), 'loss_cross_entropy': Array(0.5175976, dtype=float32)}


  3%|▎         | 30258/1000000 [1:33:46<38:43:12,  6.96it/s]

{'loss': Array(0.5124536, dtype=float32), 'loss_reward': Array(0.00430148, dtype=float32), 'loss_cross_entropy': Array(0.5081522, dtype=float32)}


  3%|▎         | 30270/1000000 [1:33:48<29:39:21,  9.08it/s]

{'loss': Array(0.51137275, dtype=float32), 'loss_reward': Array(0.00439564, dtype=float32), 'loss_cross_entropy': Array(0.5069771, dtype=float32)}


  3%|▎         | 30278/1000000 [1:33:50<33:58:55,  7.93it/s]

{'loss': Array(0.5157207, dtype=float32), 'loss_reward': Array(0.0043321, dtype=float32), 'loss_cross_entropy': Array(0.51138866, dtype=float32)}


  3%|▎         | 30289/1000000 [1:33:51<38:43:08,  6.96it/s]

{'loss': Array(0.52234155, dtype=float32), 'loss_reward': Array(0.00428548, dtype=float32), 'loss_cross_entropy': Array(0.5180561, dtype=float32)}


  3%|▎         | 30299/1000000 [1:33:53<31:21:40,  8.59it/s]

{'loss': Array(0.50288963, dtype=float32), 'loss_reward': Array(0.00432513, dtype=float32), 'loss_cross_entropy': Array(0.49856454, dtype=float32)}


  3%|▎         | 30309/1000000 [1:33:55<28:57:30,  9.30it/s]

{'loss': Array(0.5122743, dtype=float32), 'loss_reward': Array(0.00431076, dtype=float32), 'loss_cross_entropy': Array(0.50796354, dtype=float32)}


  3%|▎         | 30320/1000000 [1:33:56<27:29:30,  9.80it/s]

{'loss': Array(0.5099306, dtype=float32), 'loss_reward': Array(0.00431445, dtype=float32), 'loss_cross_entropy': Array(0.5056162, dtype=float32)}


  3%|▎         | 30330/1000000 [1:33:58<35:23:31,  7.61it/s]

{'loss': Array(0.52325267, dtype=float32), 'loss_reward': Array(0.00436491, dtype=float32), 'loss_cross_entropy': Array(0.5188878, dtype=float32)}


  3%|▎         | 30338/1000000 [1:34:00<36:16:45,  7.42it/s]

{'loss': Array(0.5054826, dtype=float32), 'loss_reward': Array(0.00427341, dtype=float32), 'loss_cross_entropy': Array(0.5012092, dtype=float32)}


  3%|▎         | 30350/1000000 [1:34:01<28:34:07,  9.43it/s]

{'loss': Array(0.51188546, dtype=float32), 'loss_reward': Array(0.00413841, dtype=float32), 'loss_cross_entropy': Array(0.5077471, dtype=float32)}


  3%|▎         | 30358/1000000 [1:34:03<33:26:40,  8.05it/s]

{'loss': Array(0.5145236, dtype=float32), 'loss_reward': Array(0.00427344, dtype=float32), 'loss_cross_entropy': Array(0.51025015, dtype=float32)}


  3%|▎         | 30369/1000000 [1:34:05<36:10:48,  7.44it/s]

{'loss': Array(0.51987886, dtype=float32), 'loss_reward': Array(0.0043221, dtype=float32), 'loss_cross_entropy': Array(0.51555675, dtype=float32)}


  3%|▎         | 30380/1000000 [1:34:06<28:12:44,  9.55it/s]

{'loss': Array(0.51123697, dtype=float32), 'loss_reward': Array(0.00422374, dtype=float32), 'loss_cross_entropy': Array(0.5070132, dtype=float32)}


  3%|▎         | 30390/1000000 [1:34:08<26:55:59, 10.00it/s]

{'loss': Array(0.507828, dtype=float32), 'loss_reward': Array(0.00426057, dtype=float32), 'loss_cross_entropy': Array(0.5035674, dtype=float32)}


  3%|▎         | 30398/1000000 [1:34:09<32:29:12,  8.29it/s]

{'loss': Array(0.5036612, dtype=float32), 'loss_reward': Array(0.00437756, dtype=float32), 'loss_cross_entropy': Array(0.4992836, dtype=float32)}


  3%|▎         | 30410/1000000 [1:34:11<31:09:28,  8.64it/s]

{'loss': Array(0.5120661, dtype=float32), 'loss_reward': Array(0.00421373, dtype=float32), 'loss_cross_entropy': Array(0.50785244, dtype=float32)}


  3%|▎         | 30418/1000000 [1:34:13<34:06:29,  7.90it/s]

{'loss': Array(0.51185966, dtype=float32), 'loss_reward': Array(0.00428235, dtype=float32), 'loss_cross_entropy': Array(0.50757736, dtype=float32)}


  3%|▎         | 30430/1000000 [1:34:14<28:10:26,  9.56it/s]

{'loss': Array(0.5090441, dtype=float32), 'loss_reward': Array(0.00440689, dtype=float32), 'loss_cross_entropy': Array(0.50463724, dtype=float32)}


  3%|▎         | 30438/1000000 [1:34:16<33:18:05,  8.09it/s]

{'loss': Array(0.5049128, dtype=float32), 'loss_reward': Array(0.00416836, dtype=float32), 'loss_cross_entropy': Array(0.50074434, dtype=float32)}


  3%|▎         | 30450/1000000 [1:34:18<30:52:12,  8.72it/s]

{'loss': Array(0.5170924, dtype=float32), 'loss_reward': Array(0.00430628, dtype=float32), 'loss_cross_entropy': Array(0.5127861, dtype=float32)}


  3%|▎         | 30460/1000000 [1:34:19<28:47:28,  9.35it/s]

{'loss': Array(0.51818, dtype=float32), 'loss_reward': Array(0.00431492, dtype=float32), 'loss_cross_entropy': Array(0.5138651, dtype=float32)}


  3%|▎         | 30468/1000000 [1:34:21<33:04:12,  8.14it/s]

{'loss': Array(0.5061427, dtype=float32), 'loss_reward': Array(0.00413416, dtype=float32), 'loss_cross_entropy': Array(0.50200856, dtype=float32)}


  3%|▎         | 30479/1000000 [1:34:23<38:24:34,  7.01it/s]

{'loss': Array(0.50867915, dtype=float32), 'loss_reward': Array(0.0042473, dtype=float32), 'loss_cross_entropy': Array(0.5044319, dtype=float32)}


  3%|▎         | 30490/1000000 [1:34:24<29:41:35,  9.07it/s]

{'loss': Array(0.50698507, dtype=float32), 'loss_reward': Array(0.00415945, dtype=float32), 'loss_cross_entropy': Array(0.50282556, dtype=float32)}


  3%|▎         | 30498/1000000 [1:34:26<33:49:53,  7.96it/s]

{'loss': Array(0.5109907, dtype=float32), 'loss_reward': Array(0.00427686, dtype=float32), 'loss_cross_entropy': Array(0.50671387, dtype=float32)}


  3%|▎         | 30509/1000000 [1:34:37<121:44:15,  2.21it/s]

{'loss': Array(0.51074487, dtype=float32), 'loss_reward': Array(0.0043082, dtype=float32), 'loss_cross_entropy': Array(0.5064367, dtype=float32)}


  3%|▎         | 30520/1000000 [1:34:39<55:59:43,  4.81it/s] 

{'loss': Array(0.51153135, dtype=float32), 'loss_reward': Array(0.0042335, dtype=float32), 'loss_cross_entropy': Array(0.5072979, dtype=float32)}


  3%|▎         | 30528/1000000 [1:34:40<43:06:38,  6.25it/s]

{'loss': Array(0.50349677, dtype=float32), 'loss_reward': Array(0.00411571, dtype=float32), 'loss_cross_entropy': Array(0.49938098, dtype=float32)}


  3%|▎         | 30540/1000000 [1:34:42<29:38:12,  9.09it/s]

{'loss': Array(0.51494235, dtype=float32), 'loss_reward': Array(0.00422254, dtype=float32), 'loss_cross_entropy': Array(0.5107198, dtype=float32)}


  3%|▎         | 30548/1000000 [1:34:43<33:50:40,  7.96it/s]

{'loss': Array(0.5068307, dtype=float32), 'loss_reward': Array(0.00427269, dtype=float32), 'loss_cross_entropy': Array(0.502558, dtype=float32)}


  3%|▎         | 30560/1000000 [1:34:45<31:54:50,  8.44it/s]

{'loss': Array(0.51302993, dtype=float32), 'loss_reward': Array(0.00420139, dtype=float32), 'loss_cross_entropy': Array(0.5088286, dtype=float32)}


  3%|▎         | 30568/1000000 [1:34:47<34:26:47,  7.82it/s]

{'loss': Array(0.5082214, dtype=float32), 'loss_reward': Array(0.00424427, dtype=float32), 'loss_cross_entropy': Array(0.5039771, dtype=float32)}


  3%|▎         | 30580/1000000 [1:34:48<28:30:24,  9.45it/s]

{'loss': Array(0.5148925, dtype=float32), 'loss_reward': Array(0.00442324, dtype=float32), 'loss_cross_entropy': Array(0.51046926, dtype=float32)}


  3%|▎         | 30588/1000000 [1:34:50<33:50:23,  7.96it/s]

{'loss': Array(0.5101968, dtype=float32), 'loss_reward': Array(0.00440973, dtype=float32), 'loss_cross_entropy': Array(0.5057871, dtype=float32)}


  3%|▎         | 30599/1000000 [1:34:52<33:19:44,  8.08it/s]

{'loss': Array(0.50372773, dtype=float32), 'loss_reward': Array(0.00422994, dtype=float32), 'loss_cross_entropy': Array(0.4994977, dtype=float32)}


  3%|▎         | 30610/1000000 [1:34:54<28:10:40,  9.56it/s]

{'loss': Array(0.509036, dtype=float32), 'loss_reward': Array(0.00434902, dtype=float32), 'loss_cross_entropy': Array(0.504687, dtype=float32)}


  3%|▎         | 30618/1000000 [1:34:55<33:00:11,  8.16it/s]

{'loss': Array(0.5074684, dtype=float32), 'loss_reward': Array(0.00441557, dtype=float32), 'loss_cross_entropy': Array(0.5030528, dtype=float32)}


  3%|▎         | 30630/1000000 [1:34:57<37:00:11,  7.28it/s]

{'loss': Array(0.51376605, dtype=float32), 'loss_reward': Array(0.0041504, dtype=float32), 'loss_cross_entropy': Array(0.5096157, dtype=float32)}


  3%|▎         | 30638/1000000 [1:34:58<35:34:27,  7.57it/s]

{'loss': Array(0.51017696, dtype=float32), 'loss_reward': Array(0.00444196, dtype=float32), 'loss_cross_entropy': Array(0.5057349, dtype=float32)}


  3%|▎         | 30650/1000000 [1:35:00<28:06:16,  9.58it/s]

{'loss': Array(0.5055137, dtype=float32), 'loss_reward': Array(0.00426555, dtype=float32), 'loss_cross_entropy': Array(0.5012482, dtype=float32)}


  3%|▎         | 30658/1000000 [1:35:02<34:16:30,  7.86it/s]

{'loss': Array(0.5097026, dtype=float32), 'loss_reward': Array(0.00422333, dtype=float32), 'loss_cross_entropy': Array(0.5054793, dtype=float32)}


  3%|▎         | 30669/1000000 [1:35:04<40:18:46,  6.68it/s]

{'loss': Array(0.4988818, dtype=float32), 'loss_reward': Array(0.00404864, dtype=float32), 'loss_cross_entropy': Array(0.49483314, dtype=float32)}


  3%|▎         | 30680/1000000 [1:35:05<29:46:52,  9.04it/s]

{'loss': Array(0.50297517, dtype=float32), 'loss_reward': Array(0.00422785, dtype=float32), 'loss_cross_entropy': Array(0.49874732, dtype=float32)}


  3%|▎         | 30688/1000000 [1:35:07<34:03:53,  7.90it/s]

{'loss': Array(0.49947643, dtype=float32), 'loss_reward': Array(0.00429391, dtype=float32), 'loss_cross_entropy': Array(0.49518248, dtype=float32)}


  3%|▎         | 30700/1000000 [1:35:08<27:38:17,  9.74it/s]

{'loss': Array(0.50303155, dtype=float32), 'loss_reward': Array(0.00435132, dtype=float32), 'loss_cross_entropy': Array(0.4986802, dtype=float32)}


  3%|▎         | 30710/1000000 [1:35:10<35:15:34,  7.64it/s]

{'loss': Array(0.5061576, dtype=float32), 'loss_reward': Array(0.00430092, dtype=float32), 'loss_cross_entropy': Array(0.50185674, dtype=float32)}


  3%|▎         | 30718/1000000 [1:35:12<35:34:43,  7.57it/s]

{'loss': Array(0.5048143, dtype=float32), 'loss_reward': Array(0.00419058, dtype=float32), 'loss_cross_entropy': Array(0.5006237, dtype=float32)}


  3%|▎         | 30729/1000000 [1:35:13<28:46:58,  9.35it/s]

{'loss': Array(0.51065993, dtype=float32), 'loss_reward': Array(0.0043608, dtype=float32), 'loss_cross_entropy': Array(0.50629914, dtype=float32)}


  3%|▎         | 30740/1000000 [1:35:15<26:43:42, 10.07it/s]

{'loss': Array(0.50483197, dtype=float32), 'loss_reward': Array(0.00422832, dtype=float32), 'loss_cross_entropy': Array(0.5006036, dtype=float32)}


  3%|▎         | 30750/1000000 [1:35:17<33:39:27,  8.00it/s]

{'loss': Array(0.5027153, dtype=float32), 'loss_reward': Array(0.00430563, dtype=float32), 'loss_cross_entropy': Array(0.49840966, dtype=float32)}


  3%|▎         | 30758/1000000 [1:35:18<36:42:29,  7.33it/s]

{'loss': Array(0.50567764, dtype=float32), 'loss_reward': Array(0.00421999, dtype=float32), 'loss_cross_entropy': Array(0.5014577, dtype=float32)}


  3%|▎         | 30770/1000000 [1:35:20<28:04:38,  9.59it/s]

{'loss': Array(0.51598597, dtype=float32), 'loss_reward': Array(0.00433273, dtype=float32), 'loss_cross_entropy': Array(0.51165336, dtype=float32)}


  3%|▎         | 30778/1000000 [1:35:22<32:54:51,  8.18it/s]

{'loss': Array(0.50582665, dtype=float32), 'loss_reward': Array(0.00419602, dtype=float32), 'loss_cross_entropy': Array(0.5016307, dtype=float32)}


  3%|▎         | 30788/1000000 [1:35:23<35:59:46,  7.48it/s]

{'loss': Array(0.5085439, dtype=float32), 'loss_reward': Array(0.00422944, dtype=float32), 'loss_cross_entropy': Array(0.5043144, dtype=float32)}


  3%|▎         | 30800/1000000 [1:35:25<27:51:38,  9.66it/s]

{'loss': Array(0.50525296, dtype=float32), 'loss_reward': Array(0.00428408, dtype=float32), 'loss_cross_entropy': Array(0.5009688, dtype=float32)}


  3%|▎         | 30808/1000000 [1:35:26<32:21:14,  8.32it/s]

{'loss': Array(0.5060234, dtype=float32), 'loss_reward': Array(0.00410784, dtype=float32), 'loss_cross_entropy': Array(0.5019156, dtype=float32)}


  3%|▎         | 30820/1000000 [1:35:28<36:11:48,  7.44it/s]

{'loss': Array(0.5028353, dtype=float32), 'loss_reward': Array(0.00421473, dtype=float32), 'loss_cross_entropy': Array(0.49862057, dtype=float32)}


  3%|▎         | 30828/1000000 [1:35:30<35:49:07,  7.52it/s]

{'loss': Array(0.5033726, dtype=float32), 'loss_reward': Array(0.00410562, dtype=float32), 'loss_cross_entropy': Array(0.49926695, dtype=float32)}


  3%|▎         | 30840/1000000 [1:35:31<27:36:13,  9.75it/s]

{'loss': Array(0.49706274, dtype=float32), 'loss_reward': Array(0.00415784, dtype=float32), 'loss_cross_entropy': Array(0.4929049, dtype=float32)}


  3%|▎         | 30848/1000000 [1:35:33<32:43:20,  8.23it/s]

{'loss': Array(0.50368893, dtype=float32), 'loss_reward': Array(0.00408794, dtype=float32), 'loss_cross_entropy': Array(0.49960098, dtype=float32)}


  3%|▎         | 30860/1000000 [1:35:35<34:10:00,  7.88it/s]

{'loss': Array(0.5037279, dtype=float32), 'loss_reward': Array(0.00422906, dtype=float32), 'loss_cross_entropy': Array(0.4994989, dtype=float32)}


  3%|▎         | 30868/1000000 [1:35:36<34:57:34,  7.70it/s]

{'loss': Array(0.49340525, dtype=float32), 'loss_reward': Array(0.00398068, dtype=float32), 'loss_cross_entropy': Array(0.48942456, dtype=float32)}


  3%|▎         | 30880/1000000 [1:35:38<27:55:31,  9.64it/s]

{'loss': Array(0.5008163, dtype=float32), 'loss_reward': Array(0.00419397, dtype=float32), 'loss_cross_entropy': Array(0.4966223, dtype=float32)}


  3%|▎         | 30888/1000000 [1:35:40<34:28:24,  7.81it/s]

{'loss': Array(0.50350326, dtype=float32), 'loss_reward': Array(0.00423652, dtype=float32), 'loss_cross_entropy': Array(0.4992667, dtype=float32)}


  3%|▎         | 30900/1000000 [1:35:42<33:25:49,  8.05it/s]

{'loss': Array(0.50875753, dtype=float32), 'loss_reward': Array(0.00431593, dtype=float32), 'loss_cross_entropy': Array(0.5044417, dtype=float32)}


  3%|▎         | 30910/1000000 [1:35:43<29:20:53,  9.17it/s]

{'loss': Array(0.49969396, dtype=float32), 'loss_reward': Array(0.00423198, dtype=float32), 'loss_cross_entropy': Array(0.495462, dtype=float32)}


  3%|▎         | 30920/1000000 [1:35:45<29:06:26,  9.25it/s]

{'loss': Array(0.5052536, dtype=float32), 'loss_reward': Array(0.00428486, dtype=float32), 'loss_cross_entropy': Array(0.5009687, dtype=float32)}


  3%|▎         | 30928/1000000 [1:35:46<33:05:58,  8.13it/s]

{'loss': Array(0.5043782, dtype=float32), 'loss_reward': Array(0.00410621, dtype=float32), 'loss_cross_entropy': Array(0.500272, dtype=float32)}


  3%|▎         | 30939/1000000 [1:35:48<32:38:02,  8.25it/s]

{'loss': Array(0.49614578, dtype=float32), 'loss_reward': Array(0.00429769, dtype=float32), 'loss_cross_entropy': Array(0.49184805, dtype=float32)}


  3%|▎         | 30950/1000000 [1:35:50<27:28:07,  9.80it/s]

{'loss': Array(0.49502364, dtype=float32), 'loss_reward': Array(0.00423899, dtype=float32), 'loss_cross_entropy': Array(0.49078465, dtype=float32)}


  3%|▎         | 30958/1000000 [1:35:51<33:10:37,  8.11it/s]

{'loss': Array(0.50322384, dtype=float32), 'loss_reward': Array(0.00420694, dtype=float32), 'loss_cross_entropy': Array(0.4990169, dtype=float32)}


  3%|▎         | 30970/1000000 [1:35:53<27:02:00,  9.96it/s]

{'loss': Array(0.5043884, dtype=float32), 'loss_reward': Array(0.00422713, dtype=float32), 'loss_cross_entropy': Array(0.50016135, dtype=float32)}


  3%|▎         | 30978/1000000 [1:35:55<38:06:41,  7.06it/s]

{'loss': Array(0.4949921, dtype=float32), 'loss_reward': Array(0.00419838, dtype=float32), 'loss_cross_entropy': Array(0.49079368, dtype=float32)}


  3%|▎         | 30990/1000000 [1:35:56<28:21:59,  9.49it/s]

{'loss': Array(0.50819284, dtype=float32), 'loss_reward': Array(0.00438354, dtype=float32), 'loss_cross_entropy': Array(0.5038092, dtype=float32)}


  3%|▎         | 31000/1000000 [1:35:58<28:12:38,  9.54it/s]

{'loss': Array(0.5002422, dtype=float32), 'loss_reward': Array(0.00444587, dtype=float32), 'loss_cross_entropy': Array(0.4957962, dtype=float32)}


  3%|▎         | 31008/1000000 [1:36:09<177:50:29,  1.51it/s]

{'loss': Array(0.50467604, dtype=float32), 'loss_reward': Array(0.00430257, dtype=float32), 'loss_cross_entropy': Array(0.5003735, dtype=float32)}


  3%|▎         | 31020/1000000 [1:36:11<63:00:09,  4.27it/s] 

{'loss': Array(0.50757354, dtype=float32), 'loss_reward': Array(0.00432264, dtype=float32), 'loss_cross_entropy': Array(0.50325096, dtype=float32)}


  3%|▎         | 31030/1000000 [1:36:13<37:31:59,  7.17it/s]

{'loss': Array(0.49923953, dtype=float32), 'loss_reward': Array(0.00434135, dtype=float32), 'loss_cross_entropy': Array(0.49489817, dtype=float32)}


  3%|▎         | 31038/1000000 [1:36:14<35:41:22,  7.54it/s]

{'loss': Array(0.5056614, dtype=float32), 'loss_reward': Array(0.00435759, dtype=float32), 'loss_cross_entropy': Array(0.5013038, dtype=float32)}


  3%|▎         | 31049/1000000 [1:36:16<40:29:37,  6.65it/s]

{'loss': Array(0.49369726, dtype=float32), 'loss_reward': Array(0.00421005, dtype=float32), 'loss_cross_entropy': Array(0.48948717, dtype=float32)}


  3%|▎         | 31060/1000000 [1:36:18<29:51:25,  9.01it/s]

{'loss': Array(0.49383217, dtype=float32), 'loss_reward': Array(0.00425275, dtype=float32), 'loss_cross_entropy': Array(0.4895794, dtype=float32)}


  3%|▎         | 31068/1000000 [1:36:19<35:15:47,  7.63it/s]

{'loss': Array(0.49995604, dtype=float32), 'loss_reward': Array(0.00412243, dtype=float32), 'loss_cross_entropy': Array(0.4958336, dtype=float32)}


  3%|▎         | 31080/1000000 [1:36:21<27:55:54,  9.64it/s]

{'loss': Array(0.49994174, dtype=float32), 'loss_reward': Array(0.00425304, dtype=float32), 'loss_cross_entropy': Array(0.49568874, dtype=float32)}


  3%|▎         | 31090/1000000 [1:36:23<35:04:22,  7.67it/s]

{'loss': Array(0.5016522, dtype=float32), 'loss_reward': Array(0.00439775, dtype=float32), 'loss_cross_entropy': Array(0.49725437, dtype=float32)}


  3%|▎         | 31098/1000000 [1:36:24<36:02:03,  7.47it/s]

{'loss': Array(0.49364278, dtype=float32), 'loss_reward': Array(0.00416673, dtype=float32), 'loss_cross_entropy': Array(0.48947603, dtype=float32)}


  3%|▎         | 31109/1000000 [1:36:26<31:16:20,  8.61it/s]

{'loss': Array(0.4978567, dtype=float32), 'loss_reward': Array(0.00423083, dtype=float32), 'loss_cross_entropy': Array(0.49362588, dtype=float32)}


  3%|▎         | 31120/1000000 [1:36:28<26:58:19,  9.98it/s]

{'loss': Array(0.48900923, dtype=float32), 'loss_reward': Array(0.00420567, dtype=float32), 'loss_cross_entropy': Array(0.4848036, dtype=float32)}


  3%|▎         | 31130/1000000 [1:36:29<31:49:26,  8.46it/s]

{'loss': Array(0.501227, dtype=float32), 'loss_reward': Array(0.00418732, dtype=float32), 'loss_cross_entropy': Array(0.49703965, dtype=float32)}


  3%|▎         | 31138/1000000 [1:36:31<34:31:16,  7.80it/s]

{'loss': Array(0.49740273, dtype=float32), 'loss_reward': Array(0.00426954, dtype=float32), 'loss_cross_entropy': Array(0.49313316, dtype=float32)}


  3%|▎         | 31150/1000000 [1:36:33<27:29:09,  9.79it/s]

{'loss': Array(0.4907537, dtype=float32), 'loss_reward': Array(0.00415171, dtype=float32), 'loss_cross_entropy': Array(0.48660198, dtype=float32)}


  3%|▎         | 31158/1000000 [1:36:34<32:15:59,  8.34it/s]

{'loss': Array(0.4994708, dtype=float32), 'loss_reward': Array(0.00425083, dtype=float32), 'loss_cross_entropy': Array(0.49522, dtype=float32)}


  3%|▎         | 31169/1000000 [1:36:36<33:10:05,  8.11it/s]

{'loss': Array(0.49282035, dtype=float32), 'loss_reward': Array(0.00428477, dtype=float32), 'loss_cross_entropy': Array(0.4885356, dtype=float32)}


  3%|▎         | 31180/1000000 [1:36:38<28:28:42,  9.45it/s]

{'loss': Array(0.49714833, dtype=float32), 'loss_reward': Array(0.00424124, dtype=float32), 'loss_cross_entropy': Array(0.49290714, dtype=float32)}


  3%|▎         | 31190/1000000 [1:36:39<29:01:07,  9.27it/s]

{'loss': Array(0.5031636, dtype=float32), 'loss_reward': Array(0.00420699, dtype=float32), 'loss_cross_entropy': Array(0.4989566, dtype=float32)}


  3%|▎         | 31198/1000000 [1:36:41<42:56:51,  6.27it/s]

{'loss': Array(0.50214714, dtype=float32), 'loss_reward': Array(0.00438434, dtype=float32), 'loss_cross_entropy': Array(0.49776283, dtype=float32)}


  3%|▎         | 31210/1000000 [1:36:43<29:15:38,  9.20it/s]

{'loss': Array(0.5010061, dtype=float32), 'loss_reward': Array(0.0042322, dtype=float32), 'loss_cross_entropy': Array(0.49677393, dtype=float32)}


  3%|▎         | 31220/1000000 [1:36:44<28:44:54,  9.36it/s]

{'loss': Array(0.49162483, dtype=float32), 'loss_reward': Array(0.00406035, dtype=float32), 'loss_cross_entropy': Array(0.48756447, dtype=float32)}


  3%|▎         | 31228/1000000 [1:36:46<34:44:10,  7.75it/s]

{'loss': Array(0.5047297, dtype=float32), 'loss_reward': Array(0.00442535, dtype=float32), 'loss_cross_entropy': Array(0.50030434, dtype=float32)}


  3%|▎         | 31240/1000000 [1:36:48<35:18:22,  7.62it/s]

{'loss': Array(0.49541193, dtype=float32), 'loss_reward': Array(0.00418872, dtype=float32), 'loss_cross_entropy': Array(0.49122316, dtype=float32)}


  3%|▎         | 31248/1000000 [1:36:49<35:56:38,  7.49it/s]

{'loss': Array(0.49655518, dtype=float32), 'loss_reward': Array(0.00423405, dtype=float32), 'loss_cross_entropy': Array(0.4923211, dtype=float32)}


  3%|▎         | 31260/1000000 [1:36:51<27:41:18,  9.72it/s]

{'loss': Array(0.50087506, dtype=float32), 'loss_reward': Array(0.00419597, dtype=float32), 'loss_cross_entropy': Array(0.49667904, dtype=float32)}


  3%|▎         | 31268/1000000 [1:36:52<32:43:50,  8.22it/s]

{'loss': Array(0.4992497, dtype=float32), 'loss_reward': Array(0.00429352, dtype=float32), 'loss_cross_entropy': Array(0.49495623, dtype=float32)}


  3%|▎         | 31279/1000000 [1:36:54<33:31:39,  8.03it/s]

{'loss': Array(0.4970285, dtype=float32), 'loss_reward': Array(0.00434417, dtype=float32), 'loss_cross_entropy': Array(0.49268433, dtype=float32)}


  3%|▎         | 31290/1000000 [1:36:56<28:20:59,  9.49it/s]

{'loss': Array(0.49472007, dtype=float32), 'loss_reward': Array(0.00422537, dtype=float32), 'loss_cross_entropy': Array(0.49049473, dtype=float32)}


  3%|▎         | 31300/1000000 [1:36:57<27:36:11,  9.75it/s]

{'loss': Array(0.4949031, dtype=float32), 'loss_reward': Array(0.00396579, dtype=float32), 'loss_cross_entropy': Array(0.49093732, dtype=float32)}


  3%|▎         | 31308/1000000 [1:36:59<33:01:41,  8.15it/s]

{'loss': Array(0.4870201, dtype=float32), 'loss_reward': Array(0.00408069, dtype=float32), 'loss_cross_entropy': Array(0.48293945, dtype=float32)}


  3%|▎         | 31319/1000000 [1:37:01<32:12:05,  8.36it/s]

{'loss': Array(0.48711568, dtype=float32), 'loss_reward': Array(0.00416883, dtype=float32), 'loss_cross_entropy': Array(0.48294684, dtype=float32)}


  3%|▎         | 31330/1000000 [1:37:02<27:25:01,  9.81it/s]

{'loss': Array(0.49217668, dtype=float32), 'loss_reward': Array(0.00412135, dtype=float32), 'loss_cross_entropy': Array(0.48805532, dtype=float32)}


  3%|▎         | 31338/1000000 [1:37:04<33:06:12,  8.13it/s]

{'loss': Array(0.49321172, dtype=float32), 'loss_reward': Array(0.00420812, dtype=float32), 'loss_cross_entropy': Array(0.48900363, dtype=float32)}


  3%|▎         | 31350/1000000 [1:37:06<36:36:17,  7.35it/s]

{'loss': Array(0.4959766, dtype=float32), 'loss_reward': Array(0.0041481, dtype=float32), 'loss_cross_entropy': Array(0.4918285, dtype=float32)}


  3%|▎         | 31358/1000000 [1:37:07<37:01:33,  7.27it/s]

{'loss': Array(0.4974474, dtype=float32), 'loss_reward': Array(0.00409972, dtype=float32), 'loss_cross_entropy': Array(0.49334773, dtype=float32)}


  3%|▎         | 31370/1000000 [1:37:09<27:51:38,  9.66it/s]

{'loss': Array(0.49084744, dtype=float32), 'loss_reward': Array(0.0042001, dtype=float32), 'loss_cross_entropy': Array(0.48664734, dtype=float32)}


  3%|▎         | 31380/1000000 [1:37:10<29:17:18,  9.19it/s]

{'loss': Array(0.49482828, dtype=float32), 'loss_reward': Array(0.00424831, dtype=float32), 'loss_cross_entropy': Array(0.49057993, dtype=float32)}


  3%|▎         | 31390/1000000 [1:37:12<37:24:43,  7.19it/s]

{'loss': Array(0.49208608, dtype=float32), 'loss_reward': Array(0.004185, dtype=float32), 'loss_cross_entropy': Array(0.48790103, dtype=float32)}


  3%|▎         | 31400/1000000 [1:37:14<31:57:27,  8.42it/s]

{'loss': Array(0.48226625, dtype=float32), 'loss_reward': Array(0.00408529, dtype=float32), 'loss_cross_entropy': Array(0.47818094, dtype=float32)}


  3%|▎         | 31408/1000000 [1:37:15<34:59:41,  7.69it/s]

{'loss': Array(0.49393988, dtype=float32), 'loss_reward': Array(0.00422415, dtype=float32), 'loss_cross_entropy': Array(0.48971578, dtype=float32)}


  3%|▎         | 31420/1000000 [1:37:17<27:41:02,  9.72it/s]

{'loss': Array(0.49210525, dtype=float32), 'loss_reward': Array(0.00423101, dtype=float32), 'loss_cross_entropy': Array(0.48787424, dtype=float32)}


  3%|▎         | 31428/1000000 [1:37:19<39:36:09,  6.79it/s]

{'loss': Array(0.48667884, dtype=float32), 'loss_reward': Array(0.00420467, dtype=float32), 'loss_cross_entropy': Array(0.48247418, dtype=float32)}


  3%|▎         | 31440/1000000 [1:37:20<28:06:06,  9.57it/s]

{'loss': Array(0.48868352, dtype=float32), 'loss_reward': Array(0.00409413, dtype=float32), 'loss_cross_entropy': Array(0.4845894, dtype=float32)}


  3%|▎         | 31448/1000000 [1:37:22<32:25:53,  8.30it/s]

{'loss': Array(0.48946562, dtype=float32), 'loss_reward': Array(0.00409313, dtype=float32), 'loss_cross_entropy': Array(0.48537254, dtype=float32)}


  3%|▎         | 31460/1000000 [1:37:24<27:23:54,  9.82it/s]

{'loss': Array(0.49533063, dtype=float32), 'loss_reward': Array(0.00416342, dtype=float32), 'loss_cross_entropy': Array(0.49116728, dtype=float32)}


  3%|▎         | 31470/1000000 [1:37:26<32:53:24,  8.18it/s]

{'loss': Array(0.48792192, dtype=float32), 'loss_reward': Array(0.00414444, dtype=float32), 'loss_cross_entropy': Array(0.4837775, dtype=float32)}


  3%|▎         | 31478/1000000 [1:37:27<35:41:12,  7.54it/s]

{'loss': Array(0.48879418, dtype=float32), 'loss_reward': Array(0.00420653, dtype=float32), 'loss_cross_entropy': Array(0.48458767, dtype=float32)}


  3%|▎         | 31490/1000000 [1:37:29<27:50:25,  9.66it/s]

{'loss': Array(0.48034438, dtype=float32), 'loss_reward': Array(0.00413389, dtype=float32), 'loss_cross_entropy': Array(0.47621056, dtype=float32)}


  3%|▎         | 31498/1000000 [1:37:30<33:21:16,  8.07it/s]

{'loss': Array(0.48870572, dtype=float32), 'loss_reward': Array(0.00413402, dtype=float32), 'loss_cross_entropy': Array(0.48457175, dtype=float32)}


  3%|▎         | 31508/1000000 [1:37:42<136:02:28,  1.98it/s]

{'loss': Array(0.49356094, dtype=float32), 'loss_reward': Array(0.00426776, dtype=float32), 'loss_cross_entropy': Array(0.4892932, dtype=float32)}


  3%|▎         | 31520/1000000 [1:37:44<50:10:05,  5.36it/s] 

{'loss': Array(0.49124613, dtype=float32), 'loss_reward': Array(0.00422635, dtype=float32), 'loss_cross_entropy': Array(0.48701975, dtype=float32)}


  3%|▎         | 31528/1000000 [1:37:45<40:49:39,  6.59it/s]

{'loss': Array(0.4969894, dtype=float32), 'loss_reward': Array(0.00440647, dtype=float32), 'loss_cross_entropy': Array(0.49258295, dtype=float32)}


  3%|▎         | 31539/1000000 [1:37:47<40:58:42,  6.56it/s]

{'loss': Array(0.49750152, dtype=float32), 'loss_reward': Array(0.00422196, dtype=float32), 'loss_cross_entropy': Array(0.4932796, dtype=float32)}


  3%|▎         | 31550/1000000 [1:37:48<29:27:41,  9.13it/s]

{'loss': Array(0.49228677, dtype=float32), 'loss_reward': Array(0.00411317, dtype=float32), 'loss_cross_entropy': Array(0.48817357, dtype=float32)}


  3%|▎         | 31558/1000000 [1:37:50<34:35:10,  7.78it/s]

{'loss': Array(0.49587384, dtype=float32), 'loss_reward': Array(0.00417385, dtype=float32), 'loss_cross_entropy': Array(0.4917, dtype=float32)}


  3%|▎         | 31570/1000000 [1:37:52<27:04:37,  9.93it/s]

{'loss': Array(0.4932901, dtype=float32), 'loss_reward': Array(0.00420707, dtype=float32), 'loss_cross_entropy': Array(0.48908302, dtype=float32)}


  3%|▎         | 31580/1000000 [1:37:53<34:31:17,  7.79it/s]

{'loss': Array(0.48964873, dtype=float32), 'loss_reward': Array(0.00418361, dtype=float32), 'loss_cross_entropy': Array(0.4854651, dtype=float32)}


  3%|▎         | 31588/1000000 [1:37:55<35:29:50,  7.58it/s]

{'loss': Array(0.5003481, dtype=float32), 'loss_reward': Array(0.00422914, dtype=float32), 'loss_cross_entropy': Array(0.49611902, dtype=float32)}


  3%|▎         | 31600/1000000 [1:37:57<28:02:55,  9.59it/s]

{'loss': Array(0.49352342, dtype=float32), 'loss_reward': Array(0.00406888, dtype=float32), 'loss_cross_entropy': Array(0.4894545, dtype=float32)}


  3%|▎         | 31608/1000000 [1:37:58<33:46:55,  7.96it/s]

{'loss': Array(0.49575374, dtype=float32), 'loss_reward': Array(0.00426496, dtype=float32), 'loss_cross_entropy': Array(0.4914887, dtype=float32)}


  3%|▎         | 31619/1000000 [1:38:00<36:05:09,  7.45it/s]

{'loss': Array(0.4883894, dtype=float32), 'loss_reward': Array(0.0042331, dtype=float32), 'loss_cross_entropy': Array(0.48415628, dtype=float32)}


  3%|▎         | 31630/1000000 [1:38:02<28:17:11,  9.51it/s]

{'loss': Array(0.489432, dtype=float32), 'loss_reward': Array(0.00429537, dtype=float32), 'loss_cross_entropy': Array(0.48513666, dtype=float32)}


  3%|▎         | 31638/1000000 [1:38:03<33:23:15,  8.06it/s]

{'loss': Array(0.4811177, dtype=float32), 'loss_reward': Array(0.00416009, dtype=float32), 'loss_cross_entropy': Array(0.47695756, dtype=float32)}


  3%|▎         | 31649/1000000 [1:38:05<29:25:40,  9.14it/s]

{'loss': Array(0.4897259, dtype=float32), 'loss_reward': Array(0.00412126, dtype=float32), 'loss_cross_entropy': Array(0.4856046, dtype=float32)}


  3%|▎         | 31658/1000000 [1:38:06<34:47:11,  7.73it/s]

{'loss': Array(0.48500854, dtype=float32), 'loss_reward': Array(0.00418358, dtype=float32), 'loss_cross_entropy': Array(0.48082495, dtype=float32)}


  3%|▎         | 31670/1000000 [1:38:08<26:57:50,  9.98it/s]

{'loss': Array(0.48751768, dtype=float32), 'loss_reward': Array(0.0042821, dtype=float32), 'loss_cross_entropy': Array(0.4832356, dtype=float32)}


  3%|▎         | 31678/1000000 [1:38:09<32:19:41,  8.32it/s]

{'loss': Array(0.48685804, dtype=float32), 'loss_reward': Array(0.00402634, dtype=float32), 'loss_cross_entropy': Array(0.48283172, dtype=float32)}


  3%|▎         | 31690/1000000 [1:38:11<26:46:58, 10.04it/s]

{'loss': Array(0.4861637, dtype=float32), 'loss_reward': Array(0.00407934, dtype=float32), 'loss_cross_entropy': Array(0.48208442, dtype=float32)}


  3%|▎         | 31698/1000000 [1:38:13<37:23:33,  7.19it/s]

{'loss': Array(0.48697218, dtype=float32), 'loss_reward': Array(0.00409269, dtype=float32), 'loss_cross_entropy': Array(0.48287946, dtype=float32)}


  3%|▎         | 31710/1000000 [1:38:15<29:29:43,  9.12it/s]

{'loss': Array(0.48595196, dtype=float32), 'loss_reward': Array(0.004166, dtype=float32), 'loss_cross_entropy': Array(0.48178598, dtype=float32)}


  3%|▎         | 31718/1000000 [1:38:16<33:11:43,  8.10it/s]

{'loss': Array(0.49547789, dtype=float32), 'loss_reward': Array(0.00424007, dtype=float32), 'loss_cross_entropy': Array(0.49123785, dtype=float32)}


  3%|▎         | 31729/1000000 [1:38:18<38:59:42,  6.90it/s]

{'loss': Array(0.48270956, dtype=float32), 'loss_reward': Array(0.00420742, dtype=float32), 'loss_cross_entropy': Array(0.47850212, dtype=float32)}


  3%|▎         | 31740/1000000 [1:38:20<29:00:49,  9.27it/s]

{'loss': Array(0.48530054, dtype=float32), 'loss_reward': Array(0.00414205, dtype=float32), 'loss_cross_entropy': Array(0.48115847, dtype=float32)}


  3%|▎         | 31748/1000000 [1:38:21<33:47:05,  7.96it/s]

{'loss': Array(0.47844276, dtype=float32), 'loss_reward': Array(0.00400435, dtype=float32), 'loss_cross_entropy': Array(0.47443843, dtype=float32)}


  3%|▎         | 31760/1000000 [1:38:23<26:48:09, 10.03it/s]

{'loss': Array(0.49209052, dtype=float32), 'loss_reward': Array(0.00425809, dtype=float32), 'loss_cross_entropy': Array(0.48783246, dtype=float32)}


  3%|▎         | 31770/1000000 [1:38:24<34:03:53,  7.90it/s]

{'loss': Array(0.48384115, dtype=float32), 'loss_reward': Array(0.00407334, dtype=float32), 'loss_cross_entropy': Array(0.47976777, dtype=float32)}


  3%|▎         | 31778/1000000 [1:38:26<34:40:47,  7.76it/s]

{'loss': Array(0.48216096, dtype=float32), 'loss_reward': Array(0.00403952, dtype=float32), 'loss_cross_entropy': Array(0.47812143, dtype=float32)}


  3%|▎         | 31790/1000000 [1:38:27<27:19:44,  9.84it/s]

{'loss': Array(0.48244706, dtype=float32), 'loss_reward': Array(0.00408351, dtype=float32), 'loss_cross_entropy': Array(0.4783635, dtype=float32)}


  3%|▎         | 31798/1000000 [1:38:29<32:19:14,  8.32it/s]

{'loss': Array(0.48388815, dtype=float32), 'loss_reward': Array(0.00423937, dtype=float32), 'loss_cross_entropy': Array(0.4796488, dtype=float32)}


  3%|▎         | 31809/1000000 [1:38:31<34:57:06,  7.69it/s]

{'loss': Array(0.483739, dtype=float32), 'loss_reward': Array(0.00409007, dtype=float32), 'loss_cross_entropy': Array(0.4796489, dtype=float32)}


  3%|▎         | 31820/1000000 [1:38:32<28:20:18,  9.49it/s]

{'loss': Array(0.4853463, dtype=float32), 'loss_reward': Array(0.00398536, dtype=float32), 'loss_cross_entropy': Array(0.4813609, dtype=float32)}


  3%|▎         | 31828/1000000 [1:38:34<33:01:35,  8.14it/s]

{'loss': Array(0.48512384, dtype=float32), 'loss_reward': Array(0.00415551, dtype=float32), 'loss_cross_entropy': Array(0.48096833, dtype=float32)}


  3%|▎         | 31840/1000000 [1:38:35<27:07:28,  9.91it/s]

{'loss': Array(0.4822522, dtype=float32), 'loss_reward': Array(0.00412596, dtype=float32), 'loss_cross_entropy': Array(0.4781263, dtype=float32)}


  3%|▎         | 31850/1000000 [1:38:37<32:08:39,  8.37it/s]

{'loss': Array(0.4850184, dtype=float32), 'loss_reward': Array(0.0041307, dtype=float32), 'loss_cross_entropy': Array(0.4808877, dtype=float32)}


  3%|▎         | 31858/1000000 [1:38:39<34:35:17,  7.78it/s]

{'loss': Array(0.48045665, dtype=float32), 'loss_reward': Array(0.00421851, dtype=float32), 'loss_cross_entropy': Array(0.47623816, dtype=float32)}


  3%|▎         | 31870/1000000 [1:38:40<28:02:17,  9.59it/s]

{'loss': Array(0.4795619, dtype=float32), 'loss_reward': Array(0.00404458, dtype=float32), 'loss_cross_entropy': Array(0.47551733, dtype=float32)}


  3%|▎         | 31878/1000000 [1:38:42<32:40:02,  8.23it/s]

{'loss': Array(0.4777455, dtype=float32), 'loss_reward': Array(0.00408404, dtype=float32), 'loss_cross_entropy': Array(0.47366142, dtype=float32)}


  3%|▎         | 31889/1000000 [1:38:44<32:29:33,  8.28it/s]

{'loss': Array(0.48244628, dtype=float32), 'loss_reward': Array(0.0040253, dtype=float32), 'loss_cross_entropy': Array(0.47842097, dtype=float32)}


  3%|▎         | 31900/1000000 [1:38:45<27:09:04,  9.90it/s]

{'loss': Array(0.4781146, dtype=float32), 'loss_reward': Array(0.00419951, dtype=float32), 'loss_cross_entropy': Array(0.4739151, dtype=float32)}


  3%|▎         | 31908/1000000 [1:38:47<32:32:10,  8.27it/s]

{'loss': Array(0.4884533, dtype=float32), 'loss_reward': Array(0.00412643, dtype=float32), 'loss_cross_entropy': Array(0.48432684, dtype=float32)}


  3%|▎         | 31920/1000000 [1:38:49<35:35:02,  7.56it/s]

{'loss': Array(0.48624745, dtype=float32), 'loss_reward': Array(0.00403334, dtype=float32), 'loss_cross_entropy': Array(0.48221412, dtype=float32)}


  3%|▎         | 31928/1000000 [1:38:50<35:33:54,  7.56it/s]

{'loss': Array(0.4776132, dtype=float32), 'loss_reward': Array(0.00407244, dtype=float32), 'loss_cross_entropy': Array(0.47354075, dtype=float32)}


  3%|▎         | 31940/1000000 [1:38:52<27:33:45,  9.76it/s]

{'loss': Array(0.47784397, dtype=float32), 'loss_reward': Array(0.00411045, dtype=float32), 'loss_cross_entropy': Array(0.47373348, dtype=float32)}


  3%|▎         | 31948/1000000 [1:38:53<32:59:25,  8.15it/s]

{'loss': Array(0.47864094, dtype=float32), 'loss_reward': Array(0.00406683, dtype=float32), 'loss_cross_entropy': Array(0.4745741, dtype=float32)}


  3%|▎         | 31960/1000000 [1:38:55<33:50:24,  7.95it/s]

{'loss': Array(0.47833434, dtype=float32), 'loss_reward': Array(0.00414067, dtype=float32), 'loss_cross_entropy': Array(0.47419363, dtype=float32)}


  3%|▎         | 31968/1000000 [1:38:57<34:33:22,  7.78it/s]

{'loss': Array(0.4801081, dtype=float32), 'loss_reward': Array(0.00400276, dtype=float32), 'loss_cross_entropy': Array(0.47610542, dtype=float32)}


  3%|▎         | 31979/1000000 [1:38:58<30:04:33,  8.94it/s]

{'loss': Array(0.48404303, dtype=float32), 'loss_reward': Array(0.00417969, dtype=float32), 'loss_cross_entropy': Array(0.47986326, dtype=float32)}


  3%|▎         | 31990/1000000 [1:39:00<27:32:28,  9.76it/s]

{'loss': Array(0.4827859, dtype=float32), 'loss_reward': Array(0.00420085, dtype=float32), 'loss_cross_entropy': Array(0.47858506, dtype=float32)}


  3%|▎         | 32000/1000000 [1:39:02<34:56:02,  7.70it/s]

{'loss': Array(0.48347998, dtype=float32), 'loss_reward': Array(0.00428378, dtype=float32), 'loss_cross_entropy': Array(0.47919622, dtype=float32)}


  3%|▎         | 32008/1000000 [1:39:13<180:16:52,  1.49it/s]

{'loss': Array(0.48784408, dtype=float32), 'loss_reward': Array(0.00404734, dtype=float32), 'loss_cross_entropy': Array(0.48379675, dtype=float32)}


  3%|▎         | 32020/1000000 [1:39:15<59:39:23,  4.51it/s] 

{'loss': Array(0.47852978, dtype=float32), 'loss_reward': Array(0.00413365, dtype=float32), 'loss_cross_entropy': Array(0.47439614, dtype=float32)}


  3%|▎         | 32028/1000000 [1:39:16<45:13:50,  5.94it/s]

{'loss': Array(0.4884592, dtype=float32), 'loss_reward': Array(0.00420856, dtype=float32), 'loss_cross_entropy': Array(0.4842506, dtype=float32)}


  3%|▎         | 32039/1000000 [1:39:18<35:43:42,  7.53it/s]

{'loss': Array(0.4772737, dtype=float32), 'loss_reward': Array(0.00425874, dtype=float32), 'loss_cross_entropy': Array(0.47301492, dtype=float32)}


  3%|▎         | 32050/1000000 [1:39:20<28:00:53,  9.60it/s]

{'loss': Array(0.4756333, dtype=float32), 'loss_reward': Array(0.00410134, dtype=float32), 'loss_cross_entropy': Array(0.47153193, dtype=float32)}


  3%|▎         | 32058/1000000 [1:39:21<32:48:49,  8.19it/s]

{'loss': Array(0.47378, dtype=float32), 'loss_reward': Array(0.00410724, dtype=float32), 'loss_cross_entropy': Array(0.46967277, dtype=float32)}


  3%|▎         | 32070/1000000 [1:39:23<27:06:52,  9.92it/s]

{'loss': Array(0.47448665, dtype=float32), 'loss_reward': Array(0.00408765, dtype=float32), 'loss_cross_entropy': Array(0.47039896, dtype=float32)}


  3%|▎         | 32078/1000000 [1:39:25<37:06:33,  7.25it/s]

{'loss': Array(0.48596436, dtype=float32), 'loss_reward': Array(0.00423977, dtype=float32), 'loss_cross_entropy': Array(0.48172456, dtype=float32)}


  3%|▎         | 32089/1000000 [1:39:26<29:25:59,  9.13it/s]

{'loss': Array(0.48801383, dtype=float32), 'loss_reward': Array(0.00405558, dtype=float32), 'loss_cross_entropy': Array(0.4839583, dtype=float32)}


  3%|▎         | 32100/1000000 [1:39:28<26:25:02, 10.18it/s]

{'loss': Array(0.47811937, dtype=float32), 'loss_reward': Array(0.00419212, dtype=float32), 'loss_cross_entropy': Array(0.47392732, dtype=float32)}


  3%|▎         | 32110/1000000 [1:39:30<34:26:27,  7.81it/s]

{'loss': Array(0.4811518, dtype=float32), 'loss_reward': Array(0.00428898, dtype=float32), 'loss_cross_entropy': Array(0.47686282, dtype=float32)}


  3%|▎         | 32118/1000000 [1:39:31<34:47:37,  7.73it/s]

{'loss': Array(0.48202476, dtype=float32), 'loss_reward': Array(0.00409009, dtype=float32), 'loss_cross_entropy': Array(0.4779347, dtype=float32)}


  3%|▎         | 32130/1000000 [1:39:33<28:57:26,  9.28it/s]

{'loss': Array(0.474464, dtype=float32), 'loss_reward': Array(0.00400585, dtype=float32), 'loss_cross_entropy': Array(0.47045812, dtype=float32)}


  3%|▎         | 32138/1000000 [1:39:34<34:11:00,  7.86it/s]

{'loss': Array(0.48403844, dtype=float32), 'loss_reward': Array(0.00406357, dtype=float32), 'loss_cross_entropy': Array(0.47997484, dtype=float32)}


  3%|▎         | 32149/1000000 [1:39:36<35:59:57,  7.47it/s]

{'loss': Array(0.47795472, dtype=float32), 'loss_reward': Array(0.00406668, dtype=float32), 'loss_cross_entropy': Array(0.47388807, dtype=float32)}


  3%|▎         | 32159/1000000 [1:39:38<30:22:52,  8.85it/s]

{'loss': Array(0.4854953, dtype=float32), 'loss_reward': Array(0.00415285, dtype=float32), 'loss_cross_entropy': Array(0.48134246, dtype=float32)}


  3%|▎         | 32170/1000000 [1:39:39<26:52:46, 10.00it/s]

{'loss': Array(0.46681854, dtype=float32), 'loss_reward': Array(0.00398891, dtype=float32), 'loss_cross_entropy': Array(0.46282965, dtype=float32)}


  3%|▎         | 32180/1000000 [1:39:41<27:40:24,  9.71it/s]

{'loss': Array(0.47784114, dtype=float32), 'loss_reward': Array(0.00413564, dtype=float32), 'loss_cross_entropy': Array(0.47370553, dtype=float32)}


  3%|▎         | 32190/1000000 [1:39:43<32:22:14,  8.30it/s]

{'loss': Array(0.46784306, dtype=float32), 'loss_reward': Array(0.00406015, dtype=float32), 'loss_cross_entropy': Array(0.46378294, dtype=float32)}


  3%|▎         | 32198/1000000 [1:39:44<34:17:46,  7.84it/s]

{'loss': Array(0.4736913, dtype=float32), 'loss_reward': Array(0.00404425, dtype=float32), 'loss_cross_entropy': Array(0.46964708, dtype=float32)}


  3%|▎         | 32210/1000000 [1:39:46<27:27:37,  9.79it/s]

{'loss': Array(0.46997458, dtype=float32), 'loss_reward': Array(0.00409392, dtype=float32), 'loss_cross_entropy': Array(0.46588063, dtype=float32)}


  3%|▎         | 32218/1000000 [1:39:47<32:30:08,  8.27it/s]

{'loss': Array(0.47631976, dtype=float32), 'loss_reward': Array(0.00413345, dtype=float32), 'loss_cross_entropy': Array(0.4721863, dtype=float32)}


  3%|▎         | 32230/1000000 [1:39:49<30:24:45,  8.84it/s]

{'loss': Array(0.48296347, dtype=float32), 'loss_reward': Array(0.00421137, dtype=float32), 'loss_cross_entropy': Array(0.4787521, dtype=float32)}


  3%|▎         | 32238/1000000 [1:39:51<33:34:58,  8.00it/s]

{'loss': Array(0.48011023, dtype=float32), 'loss_reward': Array(0.0041349, dtype=float32), 'loss_cross_entropy': Array(0.47597533, dtype=float32)}


  3%|▎         | 32250/1000000 [1:39:52<27:12:12,  9.88it/s]

{'loss': Array(0.48353466, dtype=float32), 'loss_reward': Array(0.00407025, dtype=float32), 'loss_cross_entropy': Array(0.47946435, dtype=float32)}


  3%|▎         | 32260/1000000 [1:39:54<39:04:14,  6.88it/s]

{'loss': Array(0.4819928, dtype=float32), 'loss_reward': Array(0.00421506, dtype=float32), 'loss_cross_entropy': Array(0.47777778, dtype=float32)}


  3%|▎         | 32268/1000000 [1:39:56<36:15:27,  7.41it/s]

{'loss': Array(0.48033157, dtype=float32), 'loss_reward': Array(0.00406284, dtype=float32), 'loss_cross_entropy': Array(0.47626868, dtype=float32)}


  3%|▎         | 32280/1000000 [1:39:57<27:18:12,  9.85it/s]

{'loss': Array(0.47292295, dtype=float32), 'loss_reward': Array(0.00409565, dtype=float32), 'loss_cross_entropy': Array(0.4688273, dtype=float32)}


  3%|▎         | 32288/1000000 [1:39:59<33:00:49,  8.14it/s]

{'loss': Array(0.47784385, dtype=float32), 'loss_reward': Array(0.00427934, dtype=float32), 'loss_cross_entropy': Array(0.47356454, dtype=float32)}


  3%|▎         | 32300/1000000 [1:40:01<33:52:51,  7.93it/s]

{'loss': Array(0.4840407, dtype=float32), 'loss_reward': Array(0.00424617, dtype=float32), 'loss_cross_entropy': Array(0.47979456, dtype=float32)}


  3%|▎         | 32308/1000000 [1:40:02<34:56:29,  7.69it/s]

{'loss': Array(0.47316313, dtype=float32), 'loss_reward': Array(0.00419724, dtype=float32), 'loss_cross_entropy': Array(0.46896586, dtype=float32)}


  3%|▎         | 32320/1000000 [1:40:04<26:52:24, 10.00it/s]

{'loss': Array(0.47960287, dtype=float32), 'loss_reward': Array(0.00414502, dtype=float32), 'loss_cross_entropy': Array(0.47545776, dtype=float32)}


  3%|▎         | 32328/1000000 [1:40:05<32:53:38,  8.17it/s]

{'loss': Array(0.47577628, dtype=float32), 'loss_reward': Array(0.00405115, dtype=float32), 'loss_cross_entropy': Array(0.47172514, dtype=float32)}


  3%|▎         | 32339/1000000 [1:40:07<35:47:31,  7.51it/s]

{'loss': Array(0.47200438, dtype=float32), 'loss_reward': Array(0.00405833, dtype=float32), 'loss_cross_entropy': Array(0.46794605, dtype=float32)}


  3%|▎         | 32349/1000000 [1:40:09<31:37:02,  8.50it/s]

{'loss': Array(0.47521815, dtype=float32), 'loss_reward': Array(0.00429281, dtype=float32), 'loss_cross_entropy': Array(0.4709253, dtype=float32)}


  3%|▎         | 32360/1000000 [1:40:10<27:24:02,  9.81it/s]

{'loss': Array(0.46691808, dtype=float32), 'loss_reward': Array(0.00411017, dtype=float32), 'loss_cross_entropy': Array(0.46280798, dtype=float32)}


  3%|▎         | 32368/1000000 [1:40:12<32:37:37,  8.24it/s]

{'loss': Array(0.48274395, dtype=float32), 'loss_reward': Array(0.00412933, dtype=float32), 'loss_cross_entropy': Array(0.47861457, dtype=float32)}


  3%|▎         | 32380/1000000 [1:40:14<31:27:50,  8.54it/s]

{'loss': Array(0.46922603, dtype=float32), 'loss_reward': Array(0.00403387, dtype=float32), 'loss_cross_entropy': Array(0.46519214, dtype=float32)}


  3%|▎         | 32390/1000000 [1:40:15<29:06:03,  9.24it/s]

{'loss': Array(0.47855732, dtype=float32), 'loss_reward': Array(0.00407663, dtype=float32), 'loss_cross_entropy': Array(0.47448063, dtype=float32)}


  3%|▎         | 32400/1000000 [1:40:17<30:02:54,  8.94it/s]

{'loss': Array(0.47456962, dtype=float32), 'loss_reward': Array(0.00407442, dtype=float32), 'loss_cross_entropy': Array(0.4704952, dtype=float32)}


  3%|▎         | 32408/1000000 [1:40:18<34:13:41,  7.85it/s]

{'loss': Array(0.47157332, dtype=float32), 'loss_reward': Array(0.0041113, dtype=float32), 'loss_cross_entropy': Array(0.46746203, dtype=float32)}


  3%|▎         | 32420/1000000 [1:40:20<30:42:50,  8.75it/s]

{'loss': Array(0.47403502, dtype=float32), 'loss_reward': Array(0.00409417, dtype=float32), 'loss_cross_entropy': Array(0.46994087, dtype=float32)}


  3%|▎         | 32428/1000000 [1:40:22<33:51:28,  7.94it/s]

{'loss': Array(0.46745965, dtype=float32), 'loss_reward': Array(0.00399524, dtype=float32), 'loss_cross_entropy': Array(0.4634644, dtype=float32)}


  3%|▎         | 32440/1000000 [1:40:24<27:55:32,  9.62it/s]

{'loss': Array(0.47097906, dtype=float32), 'loss_reward': Array(0.00416883, dtype=float32), 'loss_cross_entropy': Array(0.46681023, dtype=float32)}


  3%|▎         | 32450/1000000 [1:40:25<39:13:56,  6.85it/s]

{'loss': Array(0.4759872, dtype=float32), 'loss_reward': Array(0.00400049, dtype=float32), 'loss_cross_entropy': Array(0.47198674, dtype=float32)}


  3%|▎         | 32458/1000000 [1:40:27<36:24:25,  7.38it/s]

{'loss': Array(0.47333413, dtype=float32), 'loss_reward': Array(0.00405861, dtype=float32), 'loss_cross_entropy': Array(0.46927547, dtype=float32)}


  3%|▎         | 32470/1000000 [1:40:29<27:55:34,  9.62it/s]

{'loss': Array(0.47081527, dtype=float32), 'loss_reward': Array(0.00415753, dtype=float32), 'loss_cross_entropy': Array(0.4666578, dtype=float32)}


  3%|▎         | 32478/1000000 [1:40:30<32:41:51,  8.22it/s]

{'loss': Array(0.46769795, dtype=float32), 'loss_reward': Array(0.00406021, dtype=float32), 'loss_cross_entropy': Array(0.4636378, dtype=float32)}


  3%|▎         | 32490/1000000 [1:40:32<33:41:08,  7.98it/s]

{'loss': Array(0.46769506, dtype=float32), 'loss_reward': Array(0.00415655, dtype=float32), 'loss_cross_entropy': Array(0.46353847, dtype=float32)}


  3%|▎         | 32498/1000000 [1:40:44<34:31:38,  7.78it/s]

{'loss': Array(0.47513023, dtype=float32), 'loss_reward': Array(0.00414939, dtype=float32), 'loss_cross_entropy': Array(0.4709808, dtype=float32)}


  3%|▎         | 32510/1000000 [1:40:45<120:23:15,  2.23it/s]

{'loss': Array(0.47312042, dtype=float32), 'loss_reward': Array(0.00415418, dtype=float32), 'loss_cross_entropy': Array(0.46896625, dtype=float32)}


  3%|▎         | 32519/1000000 [1:40:46<62:42:06,  4.29it/s] 

{'loss': Array(0.4850287, dtype=float32), 'loss_reward': Array(0.00423697, dtype=float32), 'loss_cross_entropy': Array(0.48079178, dtype=float32)}


  3%|▎         | 32529/1000000 [1:40:48<44:21:20,  6.06it/s]

{'loss': Array(0.47754246, dtype=float32), 'loss_reward': Array(0.00405376, dtype=float32), 'loss_cross_entropy': Array(0.47348866, dtype=float32)}


  3%|▎         | 32540/1000000 [1:40:50<30:08:16,  8.92it/s]

{'loss': Array(0.47421739, dtype=float32), 'loss_reward': Array(0.00397064, dtype=float32), 'loss_cross_entropy': Array(0.47024676, dtype=float32)}


  3%|▎         | 32548/1000000 [1:40:51<34:52:11,  7.71it/s]

{'loss': Array(0.46577284, dtype=float32), 'loss_reward': Array(0.00408475, dtype=float32), 'loss_cross_entropy': Array(0.46168804, dtype=float32)}


  3%|▎         | 32560/1000000 [1:40:53<27:47:10,  9.67it/s]

{'loss': Array(0.4611163, dtype=float32), 'loss_reward': Array(0.00405102, dtype=float32), 'loss_cross_entropy': Array(0.4570653, dtype=float32)}


  3%|▎         | 32570/1000000 [1:40:55<32:24:23,  8.29it/s]

{'loss': Array(0.47584367, dtype=float32), 'loss_reward': Array(0.00414271, dtype=float32), 'loss_cross_entropy': Array(0.47170097, dtype=float32)}


  3%|▎         | 32580/1000000 [1:40:56<29:20:51,  9.16it/s]

{'loss': Array(0.47278792, dtype=float32), 'loss_reward': Array(0.00398437, dtype=float32), 'loss_cross_entropy': Array(0.4688035, dtype=float32)}


  3%|▎         | 32588/1000000 [1:40:58<34:13:12,  7.85it/s]

{'loss': Array(0.4618145, dtype=float32), 'loss_reward': Array(0.00394515, dtype=float32), 'loss_cross_entropy': Array(0.45786938, dtype=float32)}


  3%|▎         | 32600/1000000 [1:41:00<37:28:51,  7.17it/s]

{'loss': Array(0.4661603, dtype=float32), 'loss_reward': Array(0.0040682, dtype=float32), 'loss_cross_entropy': Array(0.46209213, dtype=float32)}


  3%|▎         | 32608/1000000 [1:41:01<36:11:32,  7.42it/s]

{'loss': Array(0.45720744, dtype=float32), 'loss_reward': Array(0.00400805, dtype=float32), 'loss_cross_entropy': Array(0.45319936, dtype=float32)}


  3%|▎         | 32620/1000000 [1:41:03<28:24:24,  9.46it/s]

{'loss': Array(0.47072268, dtype=float32), 'loss_reward': Array(0.00417523, dtype=float32), 'loss_cross_entropy': Array(0.4665475, dtype=float32)}


  3%|▎         | 32628/1000000 [1:41:05<33:49:59,  7.94it/s]

{'loss': Array(0.46264228, dtype=float32), 'loss_reward': Array(0.00389927, dtype=float32), 'loss_cross_entropy': Array(0.458743, dtype=float32)}


  3%|▎         | 32638/1000000 [1:41:06<42:32:18,  6.32it/s]

{'loss': Array(0.47103953, dtype=float32), 'loss_reward': Array(0.00396985, dtype=float32), 'loss_cross_entropy': Array(0.46706972, dtype=float32)}


  3%|▎         | 32650/1000000 [1:41:08<28:44:29,  9.35it/s]

{'loss': Array(0.46349785, dtype=float32), 'loss_reward': Array(0.00397755, dtype=float32), 'loss_cross_entropy': Array(0.45952034, dtype=float32)}


  3%|▎         | 32660/1000000 [1:41:10<27:18:54,  9.84it/s]

{'loss': Array(0.46718535, dtype=float32), 'loss_reward': Array(0.00417476, dtype=float32), 'loss_cross_entropy': Array(0.46301064, dtype=float32)}


  3%|▎         | 32668/1000000 [1:41:11<32:37:19,  8.24it/s]

{'loss': Array(0.46003786, dtype=float32), 'loss_reward': Array(0.00412331, dtype=float32), 'loss_cross_entropy': Array(0.45591456, dtype=float32)}


  3%|▎         | 32679/1000000 [1:41:13<35:55:19,  7.48it/s]

{'loss': Array(0.46837735, dtype=float32), 'loss_reward': Array(0.00401608, dtype=float32), 'loss_cross_entropy': Array(0.46436128, dtype=float32)}


  3%|▎         | 32690/1000000 [1:41:15<28:40:33,  9.37it/s]

{'loss': Array(0.46377158, dtype=float32), 'loss_reward': Array(0.00406185, dtype=float32), 'loss_cross_entropy': Array(0.45970973, dtype=float32)}


  3%|▎         | 32698/1000000 [1:41:16<33:11:46,  8.09it/s]

{'loss': Array(0.47305164, dtype=float32), 'loss_reward': Array(0.00406397, dtype=float32), 'loss_cross_entropy': Array(0.4689877, dtype=float32)}


  3%|▎         | 32710/1000000 [1:41:18<27:09:54,  9.89it/s]

{'loss': Array(0.47221643, dtype=float32), 'loss_reward': Array(0.00416847, dtype=float32), 'loss_cross_entropy': Array(0.46804792, dtype=float32)}


  3%|▎         | 32720/1000000 [1:41:20<32:35:25,  8.24it/s]

{'loss': Array(0.4690958, dtype=float32), 'loss_reward': Array(0.0039047, dtype=float32), 'loss_cross_entropy': Array(0.4651911, dtype=float32)}


  3%|▎         | 32730/1000000 [1:41:21<28:50:08,  9.32it/s]

{'loss': Array(0.4661545, dtype=float32), 'loss_reward': Array(0.00412247, dtype=float32), 'loss_cross_entropy': Array(0.462032, dtype=float32)}


  3%|▎         | 32738/1000000 [1:41:23<33:34:25,  8.00it/s]

{'loss': Array(0.47148538, dtype=float32), 'loss_reward': Array(0.00408497, dtype=float32), 'loss_cross_entropy': Array(0.46740037, dtype=float32)}


  3%|▎         | 32750/1000000 [1:41:24<27:34:03,  9.75it/s]

{'loss': Array(0.46496734, dtype=float32), 'loss_reward': Array(0.00396518, dtype=float32), 'loss_cross_entropy': Array(0.4610022, dtype=float32)}


  3%|▎         | 32758/1000000 [1:41:26<38:14:42,  7.03it/s]

{'loss': Array(0.4622101, dtype=float32), 'loss_reward': Array(0.0042152, dtype=float32), 'loss_cross_entropy': Array(0.45799485, dtype=float32)}


  3%|▎         | 32770/1000000 [1:41:28<29:29:48,  9.11it/s]

{'loss': Array(0.46856508, dtype=float32), 'loss_reward': Array(0.00400307, dtype=float32), 'loss_cross_entropy': Array(0.46456203, dtype=float32)}


  3%|▎         | 32778/1000000 [1:41:29<34:04:56,  7.88it/s]

{'loss': Array(0.45370293, dtype=float32), 'loss_reward': Array(0.00392251, dtype=float32), 'loss_cross_entropy': Array(0.44978043, dtype=float32)}


  3%|▎         | 32790/1000000 [1:41:31<36:45:40,  7.31it/s]

{'loss': Array(0.4608315, dtype=float32), 'loss_reward': Array(0.00401466, dtype=float32), 'loss_cross_entropy': Array(0.45681676, dtype=float32)}


  3%|▎         | 32798/1000000 [1:41:33<36:10:51,  7.43it/s]

{'loss': Array(0.4557599, dtype=float32), 'loss_reward': Array(0.00392865, dtype=float32), 'loss_cross_entropy': Array(0.45183125, dtype=float32)}


  3%|▎         | 32810/1000000 [1:41:34<27:50:46,  9.65it/s]

{'loss': Array(0.46517774, dtype=float32), 'loss_reward': Array(0.00408015, dtype=float32), 'loss_cross_entropy': Array(0.46109754, dtype=float32)}


  3%|▎         | 32818/1000000 [1:41:36<33:28:22,  8.03it/s]

{'loss': Array(0.4659357, dtype=float32), 'loss_reward': Array(0.00403325, dtype=float32), 'loss_cross_entropy': Array(0.46190247, dtype=float32)}


  3%|▎         | 32830/1000000 [1:41:38<34:02:54,  7.89it/s]

{'loss': Array(0.46486607, dtype=float32), 'loss_reward': Array(0.00395958, dtype=float32), 'loss_cross_entropy': Array(0.4609065, dtype=float32)}


  3%|▎         | 32838/1000000 [1:41:39<35:00:53,  7.67it/s]

{'loss': Array(0.4611042, dtype=float32), 'loss_reward': Array(0.00400641, dtype=float32), 'loss_cross_entropy': Array(0.45709783, dtype=float32)}


  3%|▎         | 32850/1000000 [1:41:41<27:16:55,  9.85it/s]

{'loss': Array(0.4721058, dtype=float32), 'loss_reward': Array(0.00403874, dtype=float32), 'loss_cross_entropy': Array(0.46806702, dtype=float32)}


  3%|▎         | 32858/1000000 [1:41:42<33:44:43,  7.96it/s]

{'loss': Array(0.46676716, dtype=float32), 'loss_reward': Array(0.00410523, dtype=float32), 'loss_cross_entropy': Array(0.46266198, dtype=float32)}


  3%|▎         | 32869/1000000 [1:41:44<35:27:56,  7.57it/s]

{'loss': Array(0.46453276, dtype=float32), 'loss_reward': Array(0.00398839, dtype=float32), 'loss_cross_entropy': Array(0.46054435, dtype=float32)}


  3%|▎         | 32880/1000000 [1:41:46<28:36:32,  9.39it/s]

{'loss': Array(0.4686892, dtype=float32), 'loss_reward': Array(0.00398756, dtype=float32), 'loss_cross_entropy': Array(0.46470165, dtype=float32)}


  3%|▎         | 32890/1000000 [1:41:48<27:16:24,  9.85it/s]

{'loss': Array(0.45912385, dtype=float32), 'loss_reward': Array(0.00409414, dtype=float32), 'loss_cross_entropy': Array(0.45502973, dtype=float32)}


  3%|▎         | 32898/1000000 [1:41:49<33:34:03,  8.00it/s]

{'loss': Array(0.46935374, dtype=float32), 'loss_reward': Array(0.00410841, dtype=float32), 'loss_cross_entropy': Array(0.4652454, dtype=float32)}


  3%|▎         | 32909/1000000 [1:41:51<33:22:45,  8.05it/s]

{'loss': Array(0.46264124, dtype=float32), 'loss_reward': Array(0.00411802, dtype=float32), 'loss_cross_entropy': Array(0.45852327, dtype=float32)}


  3%|▎         | 32919/1000000 [1:41:53<29:57:21,  8.97it/s]

{'loss': Array(0.45628014, dtype=float32), 'loss_reward': Array(0.00396108, dtype=float32), 'loss_cross_entropy': Array(0.45231906, dtype=float32)}


  3%|▎         | 32930/1000000 [1:41:54<27:15:13,  9.86it/s]

{'loss': Array(0.45465803, dtype=float32), 'loss_reward': Array(0.00400562, dtype=float32), 'loss_cross_entropy': Array(0.45065242, dtype=float32)}


  3%|▎         | 32938/1000000 [1:41:56<32:36:26,  8.24it/s]

{'loss': Array(0.45940396, dtype=float32), 'loss_reward': Array(0.00412768, dtype=float32), 'loss_cross_entropy': Array(0.45527622, dtype=float32)}


  3%|▎         | 32949/1000000 [1:41:58<32:35:00,  8.24it/s]

{'loss': Array(0.45635286, dtype=float32), 'loss_reward': Array(0.00403951, dtype=float32), 'loss_cross_entropy': Array(0.4523134, dtype=float32)}


  3%|▎         | 32960/1000000 [1:41:59<27:37:51,  9.72it/s]

{'loss': Array(0.46547803, dtype=float32), 'loss_reward': Array(0.00393739, dtype=float32), 'loss_cross_entropy': Array(0.4615407, dtype=float32)}


  3%|▎         | 32968/1000000 [1:42:01<33:30:32,  8.02it/s]

{'loss': Array(0.46032807, dtype=float32), 'loss_reward': Array(0.00388898, dtype=float32), 'loss_cross_entropy': Array(0.45643917, dtype=float32)}


  3%|▎         | 32980/1000000 [1:42:03<36:21:23,  7.39it/s]

{'loss': Array(0.45927045, dtype=float32), 'loss_reward': Array(0.00412533, dtype=float32), 'loss_cross_entropy': Array(0.45514512, dtype=float32)}


  3%|▎         | 32988/1000000 [1:42:04<35:53:26,  7.48it/s]

{'loss': Array(0.46359357, dtype=float32), 'loss_reward': Array(0.004162, dtype=float32), 'loss_cross_entropy': Array(0.4594315, dtype=float32)}


  3%|▎         | 33000/1000000 [1:42:06<27:29:49,  9.77it/s]

{'loss': Array(0.45947343, dtype=float32), 'loss_reward': Array(0.00423881, dtype=float32), 'loss_cross_entropy': Array(0.45523462, dtype=float32)}


  3%|▎         | 33008/1000000 [1:42:17<168:51:21,  1.59it/s]

{'loss': Array(0.4593946, dtype=float32), 'loss_reward': Array(0.00400771, dtype=float32), 'loss_cross_entropy': Array(0.45538685, dtype=float32)}


  3%|▎         | 33019/1000000 [1:42:19<74:28:31,  3.61it/s] 

{'loss': Array(0.4718299, dtype=float32), 'loss_reward': Array(0.00406856, dtype=float32), 'loss_cross_entropy': Array(0.46776137, dtype=float32)}


  3%|▎         | 33030/1000000 [1:42:20<37:35:40,  7.14it/s]

{'loss': Array(0.46192947, dtype=float32), 'loss_reward': Array(0.00412687, dtype=float32), 'loss_cross_entropy': Array(0.4578026, dtype=float32)}


  3%|▎         | 33040/1000000 [1:42:22<30:14:37,  8.88it/s]

{'loss': Array(0.4628024, dtype=float32), 'loss_reward': Array(0.00395143, dtype=float32), 'loss_cross_entropy': Array(0.458851, dtype=float32)}


  3%|▎         | 33048/1000000 [1:42:23<33:19:59,  8.06it/s]

{'loss': Array(0.45848224, dtype=float32), 'loss_reward': Array(0.00405612, dtype=float32), 'loss_cross_entropy': Array(0.4544261, dtype=float32)}


  3%|▎         | 33059/1000000 [1:42:25<35:10:01,  7.64it/s]

{'loss': Array(0.45898643, dtype=float32), 'loss_reward': Array(0.00411216, dtype=float32), 'loss_cross_entropy': Array(0.45487428, dtype=float32)}


  3%|▎         | 33070/1000000 [1:42:27<28:43:57,  9.35it/s]

{'loss': Array(0.46524698, dtype=float32), 'loss_reward': Array(0.00407129, dtype=float32), 'loss_cross_entropy': Array(0.46117568, dtype=float32)}


  3%|▎         | 33080/1000000 [1:42:29<29:24:54,  9.13it/s]

{'loss': Array(0.46254054, dtype=float32), 'loss_reward': Array(0.00412818, dtype=float32), 'loss_cross_entropy': Array(0.45841238, dtype=float32)}


  3%|▎         | 33090/1000000 [1:42:30<28:03:19,  9.57it/s]

{'loss': Array(0.45489636, dtype=float32), 'loss_reward': Array(0.00399277, dtype=float32), 'loss_cross_entropy': Array(0.45090356, dtype=float32)}


  3%|▎         | 33100/1000000 [1:42:32<32:30:37,  8.26it/s]

{'loss': Array(0.4580012, dtype=float32), 'loss_reward': Array(0.00392396, dtype=float32), 'loss_cross_entropy': Array(0.45407724, dtype=float32)}


  3%|▎         | 33108/1000000 [1:42:33<34:18:12,  7.83it/s]

{'loss': Array(0.45861623, dtype=float32), 'loss_reward': Array(0.00413077, dtype=float32), 'loss_cross_entropy': Array(0.45448542, dtype=float32)}


  3%|▎         | 33120/1000000 [1:42:35<28:40:48,  9.36it/s]

{'loss': Array(0.46796593, dtype=float32), 'loss_reward': Array(0.00406949, dtype=float32), 'loss_cross_entropy': Array(0.46389642, dtype=float32)}


  3%|▎         | 33128/1000000 [1:42:37<33:02:28,  8.13it/s]

{'loss': Array(0.46686503, dtype=float32), 'loss_reward': Array(0.00392653, dtype=float32), 'loss_cross_entropy': Array(0.46293855, dtype=float32)}


  3%|▎         | 33139/1000000 [1:42:39<32:23:44,  8.29it/s]

{'loss': Array(0.45585576, dtype=float32), 'loss_reward': Array(0.00405696, dtype=float32), 'loss_cross_entropy': Array(0.45179874, dtype=float32)}


  3%|▎         | 33150/1000000 [1:42:40<27:41:37,  9.70it/s]

{'loss': Array(0.4565858, dtype=float32), 'loss_reward': Array(0.00398044, dtype=float32), 'loss_cross_entropy': Array(0.45260534, dtype=float32)}


  3%|▎         | 33158/1000000 [1:42:42<32:51:44,  8.17it/s]

{'loss': Array(0.47005817, dtype=float32), 'loss_reward': Array(0.00417794, dtype=float32), 'loss_cross_entropy': Array(0.46588022, dtype=float32)}


  3%|▎         | 33169/1000000 [1:42:44<39:17:13,  6.84it/s]

{'loss': Array(0.45282012, dtype=float32), 'loss_reward': Array(0.00399914, dtype=float32), 'loss_cross_entropy': Array(0.44882098, dtype=float32)}


  3%|▎         | 33180/1000000 [1:42:45<29:57:25,  8.96it/s]

{'loss': Array(0.46171004, dtype=float32), 'loss_reward': Array(0.00398406, dtype=float32), 'loss_cross_entropy': Array(0.4577259, dtype=float32)}


  3%|▎         | 33188/1000000 [1:42:47<34:22:49,  7.81it/s]

{'loss': Array(0.46175367, dtype=float32), 'loss_reward': Array(0.00408075, dtype=float32), 'loss_cross_entropy': Array(0.4576729, dtype=float32)}


  3%|▎         | 33200/1000000 [1:42:48<27:16:38,  9.85it/s]

{'loss': Array(0.46326208, dtype=float32), 'loss_reward': Array(0.0039915, dtype=float32), 'loss_cross_entropy': Array(0.45927063, dtype=float32)}


  3%|▎         | 33210/1000000 [1:42:50<34:46:33,  7.72it/s]

{'loss': Array(0.46123728, dtype=float32), 'loss_reward': Array(0.00404384, dtype=float32), 'loss_cross_entropy': Array(0.45719343, dtype=float32)}


  3%|▎         | 33218/1000000 [1:42:52<35:47:44,  7.50it/s]

{'loss': Array(0.458989, dtype=float32), 'loss_reward': Array(0.00404441, dtype=float32), 'loss_cross_entropy': Array(0.45494452, dtype=float32)}


  3%|▎         | 33230/1000000 [1:42:53<27:42:28,  9.69it/s]

{'loss': Array(0.46599865, dtype=float32), 'loss_reward': Array(0.00415304, dtype=float32), 'loss_cross_entropy': Array(0.46184564, dtype=float32)}


  3%|▎         | 33238/1000000 [1:42:55<32:08:44,  8.35it/s]

{'loss': Array(0.46447197, dtype=float32), 'loss_reward': Array(0.00408435, dtype=float32), 'loss_cross_entropy': Array(0.46038756, dtype=float32)}


  3%|▎         | 33250/1000000 [1:42:57<32:36:08,  8.24it/s]

{'loss': Array(0.45600995, dtype=float32), 'loss_reward': Array(0.00397766, dtype=float32), 'loss_cross_entropy': Array(0.45203224, dtype=float32)}


  3%|▎         | 33258/1000000 [1:42:58<34:42:51,  7.74it/s]

{'loss': Array(0.46059957, dtype=float32), 'loss_reward': Array(0.00407103, dtype=float32), 'loss_cross_entropy': Array(0.45652857, dtype=float32)}


  3%|▎         | 33270/1000000 [1:43:00<27:46:48,  9.67it/s]

{'loss': Array(0.45827037, dtype=float32), 'loss_reward': Array(0.00405226, dtype=float32), 'loss_cross_entropy': Array(0.45421806, dtype=float32)}


  3%|▎         | 33278/1000000 [1:43:01<32:37:07,  8.23it/s]

{'loss': Array(0.4562838, dtype=float32), 'loss_reward': Array(0.00418151, dtype=float32), 'loss_cross_entropy': Array(0.4521024, dtype=float32)}


  3%|▎         | 33289/1000000 [1:43:03<32:46:02,  8.20it/s]

{'loss': Array(0.45706597, dtype=float32), 'loss_reward': Array(0.00395003, dtype=float32), 'loss_cross_entropy': Array(0.453116, dtype=float32)}


  3%|▎         | 33300/1000000 [1:43:05<27:12:36,  9.87it/s]

{'loss': Array(0.46331516, dtype=float32), 'loss_reward': Array(0.00396406, dtype=float32), 'loss_cross_entropy': Array(0.45935112, dtype=float32)}


  3%|▎         | 33308/1000000 [1:43:06<32:40:19,  8.22it/s]

{'loss': Array(0.45452577, dtype=float32), 'loss_reward': Array(0.00391285, dtype=float32), 'loss_cross_entropy': Array(0.45061287, dtype=float32)}


  3%|▎         | 33320/1000000 [1:43:08<36:32:57,  7.35it/s]

{'loss': Array(0.46154213, dtype=float32), 'loss_reward': Array(0.00404539, dtype=float32), 'loss_cross_entropy': Array(0.4574967, dtype=float32)}


  3%|▎         | 33330/1000000 [1:43:10<29:44:00,  9.03it/s]

{'loss': Array(0.44988376, dtype=float32), 'loss_reward': Array(0.00386299, dtype=float32), 'loss_cross_entropy': Array(0.44602075, dtype=float32)}


  3%|▎         | 33338/1000000 [1:43:11<33:41:12,  7.97it/s]

{'loss': Array(0.46040717, dtype=float32), 'loss_reward': Array(0.00409001, dtype=float32), 'loss_cross_entropy': Array(0.45631716, dtype=float32)}


  3%|▎         | 33350/1000000 [1:43:13<27:25:27,  9.79it/s]

{'loss': Array(0.4597086, dtype=float32), 'loss_reward': Array(0.00390767, dtype=float32), 'loss_cross_entropy': Array(0.45580098, dtype=float32)}


  3%|▎         | 33358/1000000 [1:43:15<43:57:14,  6.11it/s]

{'loss': Array(0.45702082, dtype=float32), 'loss_reward': Array(0.00412419, dtype=float32), 'loss_cross_entropy': Array(0.45289668, dtype=float32)}


  3%|▎         | 33370/1000000 [1:43:16<29:59:02,  8.96it/s]

{'loss': Array(0.45432758, dtype=float32), 'loss_reward': Array(0.00400502, dtype=float32), 'loss_cross_entropy': Array(0.45032263, dtype=float32)}


  3%|▎         | 33380/1000000 [1:43:18<28:21:02,  9.47it/s]

{'loss': Array(0.45196986, dtype=float32), 'loss_reward': Array(0.00377123, dtype=float32), 'loss_cross_entropy': Array(0.44819865, dtype=float32)}


  3%|▎         | 33390/1000000 [1:43:19<28:21:15,  9.47it/s]

{'loss': Array(0.45264822, dtype=float32), 'loss_reward': Array(0.00393512, dtype=float32), 'loss_cross_entropy': Array(0.44871306, dtype=float32)}


  3%|▎         | 33398/1000000 [1:43:21<41:24:09,  6.49it/s]

{'loss': Array(0.44911107, dtype=float32), 'loss_reward': Array(0.00389601, dtype=float32), 'loss_cross_entropy': Array(0.44521508, dtype=float32)}


  3%|▎         | 33410/1000000 [1:43:23<28:52:38,  9.30it/s]

{'loss': Array(0.4460852, dtype=float32), 'loss_reward': Array(0.00399614, dtype=float32), 'loss_cross_entropy': Array(0.44208908, dtype=float32)}


  3%|▎         | 33418/1000000 [1:43:24<34:01:42,  7.89it/s]

{'loss': Array(0.4567782, dtype=float32), 'loss_reward': Array(0.00406416, dtype=float32), 'loss_cross_entropy': Array(0.45271406, dtype=float32)}


  3%|▎         | 33429/1000000 [1:43:26<29:29:14,  9.11it/s]

{'loss': Array(0.45401856, dtype=float32), 'loss_reward': Array(0.00400601, dtype=float32), 'loss_cross_entropy': Array(0.45001253, dtype=float32)}


  3%|▎         | 33439/1000000 [1:43:28<32:15:56,  8.32it/s]

{'loss': Array(0.45232162, dtype=float32), 'loss_reward': Array(0.00400239, dtype=float32), 'loss_cross_entropy': Array(0.4483193, dtype=float32)}


  3%|▎         | 33450/1000000 [1:43:29<27:21:24,  9.81it/s]

{'loss': Array(0.45129237, dtype=float32), 'loss_reward': Array(0.00382688, dtype=float32), 'loss_cross_entropy': Array(0.44746557, dtype=float32)}


  3%|▎         | 33458/1000000 [1:43:31<33:05:57,  8.11it/s]

{'loss': Array(0.45450673, dtype=float32), 'loss_reward': Array(0.00402035, dtype=float32), 'loss_cross_entropy': Array(0.4504864, dtype=float32)}


  3%|▎         | 33470/1000000 [1:43:33<27:57:58,  9.60it/s]

{'loss': Array(0.44867513, dtype=float32), 'loss_reward': Array(0.00398253, dtype=float32), 'loss_cross_entropy': Array(0.4446926, dtype=float32)}


  3%|▎         | 33480/1000000 [1:43:35<32:12:47,  8.33it/s]

{'loss': Array(0.44184318, dtype=float32), 'loss_reward': Array(0.00386658, dtype=float32), 'loss_cross_entropy': Array(0.43797666, dtype=float32)}


  3%|▎         | 33488/1000000 [1:43:36<35:36:11,  7.54it/s]

{'loss': Array(0.45820865, dtype=float32), 'loss_reward': Array(0.00400816, dtype=float32), 'loss_cross_entropy': Array(0.45420057, dtype=float32)}


  3%|▎         | 33500/1000000 [1:43:38<27:16:47,  9.84it/s]

{'loss': Array(0.44857445, dtype=float32), 'loss_reward': Array(0.0039213, dtype=float32), 'loss_cross_entropy': Array(0.44465312, dtype=float32)}


  3%|▎         | 33508/1000000 [1:43:49<171:22:18,  1.57it/s]

{'loss': Array(0.45432672, dtype=float32), 'loss_reward': Array(0.00405959, dtype=float32), 'loss_cross_entropy': Array(0.45026714, dtype=float32)}


  3%|▎         | 33520/1000000 [1:43:51<62:05:17,  4.32it/s] 

{'loss': Array(0.4544568, dtype=float32), 'loss_reward': Array(0.00402891, dtype=float32), 'loss_cross_entropy': Array(0.45042792, dtype=float32)}


  3%|▎         | 33528/1000000 [1:43:53<45:03:46,  5.96it/s]

{'loss': Array(0.45599458, dtype=float32), 'loss_reward': Array(0.00395829, dtype=float32), 'loss_cross_entropy': Array(0.4520363, dtype=float32)}


  3%|▎         | 33540/1000000 [1:43:54<30:05:00,  8.92it/s]

{'loss': Array(0.4431711, dtype=float32), 'loss_reward': Array(0.00388326, dtype=float32), 'loss_cross_entropy': Array(0.43928787, dtype=float32)}


  3%|▎         | 33550/1000000 [1:43:56<35:20:33,  7.60it/s]

{'loss': Array(0.455443, dtype=float32), 'loss_reward': Array(0.00387526, dtype=float32), 'loss_cross_entropy': Array(0.45156774, dtype=float32)}


  3%|▎         | 33558/1000000 [1:43:58<35:11:41,  7.63it/s]

{'loss': Array(0.44902763, dtype=float32), 'loss_reward': Array(0.00393276, dtype=float32), 'loss_cross_entropy': Array(0.44509482, dtype=float32)}


  3%|▎         | 33570/1000000 [1:43:59<27:28:17,  9.77it/s]

{'loss': Array(0.4444283, dtype=float32), 'loss_reward': Array(0.00389681, dtype=float32), 'loss_cross_entropy': Array(0.44053155, dtype=float32)}


  3%|▎         | 33578/1000000 [1:44:01<33:18:20,  8.06it/s]

{'loss': Array(0.455963, dtype=float32), 'loss_reward': Array(0.00404657, dtype=float32), 'loss_cross_entropy': Array(0.45191643, dtype=float32)}


  3%|▎         | 33588/1000000 [1:44:03<37:41:38,  7.12it/s]

{'loss': Array(0.45380265, dtype=float32), 'loss_reward': Array(0.00393333, dtype=float32), 'loss_cross_entropy': Array(0.44986925, dtype=float32)}


  3%|▎         | 33600/1000000 [1:44:04<28:10:10,  9.53it/s]

{'loss': Array(0.45106038, dtype=float32), 'loss_reward': Array(0.00390274, dtype=float32), 'loss_cross_entropy': Array(0.44715768, dtype=float32)}


  3%|▎         | 33610/1000000 [1:44:06<28:40:38,  9.36it/s]

{'loss': Array(0.4525631, dtype=float32), 'loss_reward': Array(0.00386207, dtype=float32), 'loss_cross_entropy': Array(0.448701, dtype=float32)}


  3%|▎         | 33618/1000000 [1:44:07<34:27:10,  7.79it/s]

{'loss': Array(0.45090523, dtype=float32), 'loss_reward': Array(0.0040021, dtype=float32), 'loss_cross_entropy': Array(0.44690314, dtype=float32)}


  3%|▎         | 33629/1000000 [1:44:09<32:52:56,  8.16it/s]

{'loss': Array(0.4532734, dtype=float32), 'loss_reward': Array(0.00392397, dtype=float32), 'loss_cross_entropy': Array(0.44934946, dtype=float32)}


  3%|▎         | 33640/1000000 [1:44:11<28:10:58,  9.52it/s]

{'loss': Array(0.455195, dtype=float32), 'loss_reward': Array(0.00399655, dtype=float32), 'loss_cross_entropy': Array(0.45119843, dtype=float32)}


  3%|▎         | 33648/1000000 [1:44:12<33:15:38,  8.07it/s]

{'loss': Array(0.45394927, dtype=float32), 'loss_reward': Array(0.00400483, dtype=float32), 'loss_cross_entropy': Array(0.4499445, dtype=float32)}


  3%|▎         | 33660/1000000 [1:44:14<27:40:03,  9.70it/s]

{'loss': Array(0.44966507, dtype=float32), 'loss_reward': Array(0.00398683, dtype=float32), 'loss_cross_entropy': Array(0.4456783, dtype=float32)}


  3%|▎         | 33668/1000000 [1:44:16<37:14:41,  7.21it/s]

{'loss': Array(0.45289952, dtype=float32), 'loss_reward': Array(0.00405532, dtype=float32), 'loss_cross_entropy': Array(0.4488442, dtype=float32)}


  3%|▎         | 33680/1000000 [1:44:17<28:24:23,  9.45it/s]

{'loss': Array(0.4580141, dtype=float32), 'loss_reward': Array(0.00394638, dtype=float32), 'loss_cross_entropy': Array(0.45406777, dtype=float32)}


  3%|▎         | 33688/1000000 [1:44:19<33:56:24,  7.91it/s]

{'loss': Array(0.45018527, dtype=float32), 'loss_reward': Array(0.00402942, dtype=float32), 'loss_cross_entropy': Array(0.44615588, dtype=float32)}


  3%|▎         | 33699/1000000 [1:44:21<39:51:46,  6.73it/s]

{'loss': Array(0.4490771, dtype=float32), 'loss_reward': Array(0.00401023, dtype=float32), 'loss_cross_entropy': Array(0.44506684, dtype=float32)}


  3%|▎         | 33710/1000000 [1:44:22<29:12:27,  9.19it/s]

{'loss': Array(0.45335108, dtype=float32), 'loss_reward': Array(0.00408112, dtype=float32), 'loss_cross_entropy': Array(0.44926998, dtype=float32)}


  3%|▎         | 33718/1000000 [1:44:24<33:07:06,  8.10it/s]

{'loss': Array(0.46131077, dtype=float32), 'loss_reward': Array(0.00396689, dtype=float32), 'loss_cross_entropy': Array(0.45734388, dtype=float32)}


  3%|▎         | 33730/1000000 [1:44:26<26:59:55,  9.94it/s]

{'loss': Array(0.4535117, dtype=float32), 'loss_reward': Array(0.0041856, dtype=float32), 'loss_cross_entropy': Array(0.4493261, dtype=float32)}


  3%|▎         | 33740/1000000 [1:44:27<34:39:32,  7.74it/s]

{'loss': Array(0.45155057, dtype=float32), 'loss_reward': Array(0.00405932, dtype=float32), 'loss_cross_entropy': Array(0.44749126, dtype=float32)}


  3%|▎         | 33750/1000000 [1:44:29<30:51:58,  8.70it/s]

{'loss': Array(0.44815627, dtype=float32), 'loss_reward': Array(0.00396143, dtype=float32), 'loss_cross_entropy': Array(0.44419485, dtype=float32)}


  3%|▎         | 33760/1000000 [1:44:31<29:06:59,  9.22it/s]

{'loss': Array(0.45368266, dtype=float32), 'loss_reward': Array(0.00402136, dtype=float32), 'loss_cross_entropy': Array(0.4496613, dtype=float32)}


  3%|▎         | 33768/1000000 [1:44:32<33:57:02,  7.91it/s]

{'loss': Array(0.4502937, dtype=float32), 'loss_reward': Array(0.00409757, dtype=float32), 'loss_cross_entropy': Array(0.44619608, dtype=float32)}


  3%|▎         | 33779/1000000 [1:44:34<36:00:22,  7.45it/s]

{'loss': Array(0.45142457, dtype=float32), 'loss_reward': Array(0.00389291, dtype=float32), 'loss_cross_entropy': Array(0.44753167, dtype=float32)}


  3%|▎         | 33790/1000000 [1:44:36<27:52:50,  9.63it/s]

{'loss': Array(0.45358798, dtype=float32), 'loss_reward': Array(0.00394233, dtype=float32), 'loss_cross_entropy': Array(0.4496456, dtype=float32)}


  3%|▎         | 33798/1000000 [1:44:37<34:15:39,  7.83it/s]

{'loss': Array(0.43863842, dtype=float32), 'loss_reward': Array(0.00396214, dtype=float32), 'loss_cross_entropy': Array(0.43467623, dtype=float32)}


  3%|▎         | 33810/1000000 [1:44:39<26:51:21,  9.99it/s]

{'loss': Array(0.4456389, dtype=float32), 'loss_reward': Array(0.00398355, dtype=float32), 'loss_cross_entropy': Array(0.44165537, dtype=float32)}


  3%|▎         | 33820/1000000 [1:44:41<31:25:49,  8.54it/s]

{'loss': Array(0.44943425, dtype=float32), 'loss_reward': Array(0.00388359, dtype=float32), 'loss_cross_entropy': Array(0.44555065, dtype=float32)}


  3%|▎         | 33830/1000000 [1:44:42<28:30:36,  9.41it/s]

{'loss': Array(0.44630417, dtype=float32), 'loss_reward': Array(0.0040688, dtype=float32), 'loss_cross_entropy': Array(0.44223544, dtype=float32)}


  3%|▎         | 33838/1000000 [1:44:43<32:55:16,  8.15it/s]

{'loss': Array(0.44436437, dtype=float32), 'loss_reward': Array(0.00393274, dtype=float32), 'loss_cross_entropy': Array(0.44043165, dtype=float32)}


  3%|▎         | 33849/1000000 [1:44:45<28:37:49,  9.37it/s]

{'loss': Array(0.44729623, dtype=float32), 'loss_reward': Array(0.00409004, dtype=float32), 'loss_cross_entropy': Array(0.44320622, dtype=float32)}


  3%|▎         | 33859/1000000 [1:44:47<32:22:48,  8.29it/s]

{'loss': Array(0.44552746, dtype=float32), 'loss_reward': Array(0.00394968, dtype=float32), 'loss_cross_entropy': Array(0.44157776, dtype=float32)}


  3%|▎         | 33870/1000000 [1:44:49<26:42:30, 10.05it/s]

{'loss': Array(0.4495991, dtype=float32), 'loss_reward': Array(0.00400195, dtype=float32), 'loss_cross_entropy': Array(0.44559714, dtype=float32)}


  3%|▎         | 33878/1000000 [1:44:50<32:17:07,  8.31it/s]

{'loss': Array(0.44570303, dtype=float32), 'loss_reward': Array(0.00401087, dtype=float32), 'loss_cross_entropy': Array(0.44169217, dtype=float32)}


  3%|▎         | 33889/1000000 [1:44:52<37:33:53,  7.14it/s]

{'loss': Array(0.44525677, dtype=float32), 'loss_reward': Array(0.0039807, dtype=float32), 'loss_cross_entropy': Array(0.44127604, dtype=float32)}


  3%|▎         | 33900/1000000 [1:44:53<28:12:35,  9.51it/s]

{'loss': Array(0.44309998, dtype=float32), 'loss_reward': Array(0.0038987, dtype=float32), 'loss_cross_entropy': Array(0.43920127, dtype=float32)}


  3%|▎         | 33908/1000000 [1:44:55<32:30:13,  8.26it/s]

{'loss': Array(0.44958296, dtype=float32), 'loss_reward': Array(0.00400223, dtype=float32), 'loss_cross_entropy': Array(0.4455807, dtype=float32)}


  3%|▎         | 33920/1000000 [1:44:56<26:50:56,  9.99it/s]

{'loss': Array(0.45082793, dtype=float32), 'loss_reward': Array(0.00393394, dtype=float32), 'loss_cross_entropy': Array(0.44689402, dtype=float32)}


  3%|▎         | 33930/1000000 [1:44:58<34:31:32,  7.77it/s]

{'loss': Array(0.44912606, dtype=float32), 'loss_reward': Array(0.00400916, dtype=float32), 'loss_cross_entropy': Array(0.4451169, dtype=float32)}


  3%|▎         | 33940/1000000 [1:45:00<28:41:10,  9.35it/s]

{'loss': Array(0.45017973, dtype=float32), 'loss_reward': Array(0.00396035, dtype=float32), 'loss_cross_entropy': Array(0.44621935, dtype=float32)}


  3%|▎         | 33948/1000000 [1:45:01<33:13:25,  8.08it/s]

{'loss': Array(0.4481423, dtype=float32), 'loss_reward': Array(0.00404748, dtype=float32), 'loss_cross_entropy': Array(0.44409487, dtype=float32)}


  3%|▎         | 33960/1000000 [1:45:03<26:40:39, 10.06it/s]

{'loss': Array(0.45494804, dtype=float32), 'loss_reward': Array(0.0040367, dtype=float32), 'loss_cross_entropy': Array(0.45091134, dtype=float32)}


  3%|▎         | 33968/1000000 [1:45:05<38:57:45,  6.89it/s]

{'loss': Array(0.44118482, dtype=float32), 'loss_reward': Array(0.004046, dtype=float32), 'loss_cross_entropy': Array(0.4371388, dtype=float32)}


  3%|▎         | 33980/1000000 [1:45:06<28:10:13,  9.53it/s]

{'loss': Array(0.444545, dtype=float32), 'loss_reward': Array(0.00400332, dtype=float32), 'loss_cross_entropy': Array(0.4405417, dtype=float32)}


  3%|▎         | 33988/1000000 [1:45:08<32:14:11,  8.32it/s]

{'loss': Array(0.45112753, dtype=float32), 'loss_reward': Array(0.00400942, dtype=float32), 'loss_cross_entropy': Array(0.44711813, dtype=float32)}


  3%|▎         | 33999/1000000 [1:45:09<28:27:57,  9.43it/s]

{'loss': Array(0.44491848, dtype=float32), 'loss_reward': Array(0.00389558, dtype=float32), 'loss_cross_entropy': Array(0.44102287, dtype=float32)}


  3%|▎         | 34009/1000000 [1:45:21<132:14:54,  2.03it/s]

{'loss': Array(0.446292, dtype=float32), 'loss_reward': Array(0.00395897, dtype=float32), 'loss_cross_entropy': Array(0.44233304, dtype=float32)}


  3%|▎         | 34020/1000000 [1:45:22<50:04:42,  5.36it/s] 

{'loss': Array(0.44736966, dtype=float32), 'loss_reward': Array(0.00401085, dtype=float32), 'loss_cross_entropy': Array(0.44335875, dtype=float32)}


  3%|▎         | 34028/1000000 [1:45:24<39:53:52,  6.73it/s]

{'loss': Array(0.45642123, dtype=float32), 'loss_reward': Array(0.00398145, dtype=float32), 'loss_cross_entropy': Array(0.45243979, dtype=float32)}


  3%|▎         | 34040/1000000 [1:45:26<29:14:28,  9.18it/s]

{'loss': Array(0.44476888, dtype=float32), 'loss_reward': Array(0.00378978, dtype=float32), 'loss_cross_entropy': Array(0.4409791, dtype=float32)}


  3%|▎         | 34048/1000000 [1:45:27<38:42:48,  6.93it/s]

{'loss': Array(0.44437358, dtype=float32), 'loss_reward': Array(0.00389996, dtype=float32), 'loss_cross_entropy': Array(0.44047356, dtype=float32)}


  3%|▎         | 34060/1000000 [1:45:29<28:43:18,  9.34it/s]

{'loss': Array(0.45189735, dtype=float32), 'loss_reward': Array(0.00394848, dtype=float32), 'loss_cross_entropy': Array(0.44794884, dtype=float32)}


  3%|▎         | 34068/1000000 [1:45:31<33:52:16,  7.92it/s]

{'loss': Array(0.4481588, dtype=float32), 'loss_reward': Array(0.00386983, dtype=float32), 'loss_cross_entropy': Array(0.44428894, dtype=float32)}


  3%|▎         | 34079/1000000 [1:45:32<39:40:29,  6.76it/s]

{'loss': Array(0.44830796, dtype=float32), 'loss_reward': Array(0.00403125, dtype=float32), 'loss_cross_entropy': Array(0.44427666, dtype=float32)}


  3%|▎         | 34090/1000000 [1:45:34<29:21:55,  9.14it/s]

{'loss': Array(0.44576976, dtype=float32), 'loss_reward': Array(0.00384939, dtype=float32), 'loss_cross_entropy': Array(0.44192037, dtype=float32)}


  3%|▎         | 34098/1000000 [1:45:36<34:21:12,  7.81it/s]

{'loss': Array(0.44164297, dtype=float32), 'loss_reward': Array(0.00384184, dtype=float32), 'loss_cross_entropy': Array(0.43780118, dtype=float32)}


  3%|▎         | 34110/1000000 [1:45:37<27:51:54,  9.63it/s]

{'loss': Array(0.4475828, dtype=float32), 'loss_reward': Array(0.0039807, dtype=float32), 'loss_cross_entropy': Array(0.44360214, dtype=float32)}


  3%|▎         | 34120/1000000 [1:45:39<34:49:05,  7.71it/s]

{'loss': Array(0.4456543, dtype=float32), 'loss_reward': Array(0.00393025, dtype=float32), 'loss_cross_entropy': Array(0.44172412, dtype=float32)}


  3%|▎         | 34128/1000000 [1:45:41<35:34:15,  7.54it/s]

{'loss': Array(0.4462742, dtype=float32), 'loss_reward': Array(0.00391759, dtype=float32), 'loss_cross_entropy': Array(0.4423566, dtype=float32)}


  3%|▎         | 34140/1000000 [1:45:42<27:28:57,  9.76it/s]

{'loss': Array(0.44677305, dtype=float32), 'loss_reward': Array(0.00399569, dtype=float32), 'loss_cross_entropy': Array(0.44277737, dtype=float32)}


  3%|▎         | 34148/1000000 [1:45:44<32:40:27,  8.21it/s]

{'loss': Array(0.4426745, dtype=float32), 'loss_reward': Array(0.00394066, dtype=float32), 'loss_cross_entropy': Array(0.43873382, dtype=float32)}


  3%|▎         | 34159/1000000 [1:45:46<35:19:18,  7.60it/s]

{'loss': Array(0.4444399, dtype=float32), 'loss_reward': Array(0.00382495, dtype=float32), 'loss_cross_entropy': Array(0.44061494, dtype=float32)}


  3%|▎         | 34170/1000000 [1:45:47<27:45:35,  9.66it/s]

{'loss': Array(0.45272475, dtype=float32), 'loss_reward': Array(0.00406076, dtype=float32), 'loss_cross_entropy': Array(0.448664, dtype=float32)}


  3%|▎         | 34178/1000000 [1:45:49<32:30:15,  8.25it/s]

{'loss': Array(0.4404431, dtype=float32), 'loss_reward': Array(0.00381786, dtype=float32), 'loss_cross_entropy': Array(0.43662524, dtype=float32)}


  3%|▎         | 34189/1000000 [1:45:50<29:00:39,  9.25it/s]

{'loss': Array(0.45512986, dtype=float32), 'loss_reward': Array(0.00396614, dtype=float32), 'loss_cross_entropy': Array(0.45116374, dtype=float32)}


  3%|▎         | 34199/1000000 [1:45:52<33:13:18,  8.08it/s]

{'loss': Array(0.44567296, dtype=float32), 'loss_reward': Array(0.00402269, dtype=float32), 'loss_cross_entropy': Array(0.44165024, dtype=float32)}


  3%|▎         | 34210/1000000 [1:45:54<27:29:29,  9.76it/s]

{'loss': Array(0.43761387, dtype=float32), 'loss_reward': Array(0.00393366, dtype=float32), 'loss_cross_entropy': Array(0.43368027, dtype=float32)}


  3%|▎         | 34218/1000000 [1:45:55<32:36:24,  8.23it/s]

{'loss': Array(0.45415124, dtype=float32), 'loss_reward': Array(0.00388651, dtype=float32), 'loss_cross_entropy': Array(0.45026478, dtype=float32)}


  3%|▎         | 34230/1000000 [1:45:57<27:28:32,  9.76it/s]

{'loss': Array(0.4472053, dtype=float32), 'loss_reward': Array(0.0039097, dtype=float32), 'loss_cross_entropy': Array(0.44329563, dtype=float32)}


  3%|▎         | 34238/1000000 [1:45:59<38:30:25,  6.97it/s]

{'loss': Array(0.43852472, dtype=float32), 'loss_reward': Array(0.00395119, dtype=float32), 'loss_cross_entropy': Array(0.43457356, dtype=float32)}


  3%|▎         | 34250/1000000 [1:46:00<28:20:54,  9.46it/s]

{'loss': Array(0.45095626, dtype=float32), 'loss_reward': Array(0.00396206, dtype=float32), 'loss_cross_entropy': Array(0.44699427, dtype=float32)}


  3%|▎         | 34258/1000000 [1:46:02<33:32:00,  8.00it/s]

{'loss': Array(0.43906784, dtype=float32), 'loss_reward': Array(0.00399353, dtype=float32), 'loss_cross_entropy': Array(0.43507433, dtype=float32)}


  3%|▎         | 34269/1000000 [1:46:04<37:44:34,  7.11it/s]

{'loss': Array(0.43411133, dtype=float32), 'loss_reward': Array(0.00377758, dtype=float32), 'loss_cross_entropy': Array(0.43033376, dtype=float32)}


  3%|▎         | 34280/1000000 [1:46:05<29:03:26,  9.23it/s]

{'loss': Array(0.44181186, dtype=float32), 'loss_reward': Array(0.00396604, dtype=float32), 'loss_cross_entropy': Array(0.43784577, dtype=float32)}


  3%|▎         | 34288/1000000 [1:46:07<32:47:52,  8.18it/s]

{'loss': Array(0.43814024, dtype=float32), 'loss_reward': Array(0.003874, dtype=float32), 'loss_cross_entropy': Array(0.43426624, dtype=float32)}


  3%|▎         | 34299/1000000 [1:46:08<28:16:58,  9.48it/s]

{'loss': Array(0.44067255, dtype=float32), 'loss_reward': Array(0.00406166, dtype=float32), 'loss_cross_entropy': Array(0.43661085, dtype=float32)}


  3%|▎         | 34309/1000000 [1:46:10<35:48:53,  7.49it/s]

{'loss': Array(0.44596392, dtype=float32), 'loss_reward': Array(0.00396583, dtype=float32), 'loss_cross_entropy': Array(0.4419981, dtype=float32)}


  3%|▎         | 34320/1000000 [1:46:12<28:34:07,  9.39it/s]

{'loss': Array(0.44133788, dtype=float32), 'loss_reward': Array(0.00380127, dtype=float32), 'loss_cross_entropy': Array(0.43753657, dtype=float32)}


  3%|▎         | 34328/1000000 [1:46:13<32:39:53,  8.21it/s]

{'loss': Array(0.44695434, dtype=float32), 'loss_reward': Array(0.00390146, dtype=float32), 'loss_cross_entropy': Array(0.44305292, dtype=float32)}


  3%|▎         | 34340/1000000 [1:46:15<26:51:34,  9.99it/s]

{'loss': Array(0.44840312, dtype=float32), 'loss_reward': Array(0.00400499, dtype=float32), 'loss_cross_entropy': Array(0.44439808, dtype=float32)}


  3%|▎         | 34348/1000000 [1:46:17<39:34:43,  6.78it/s]

{'loss': Array(0.435108, dtype=float32), 'loss_reward': Array(0.00391002, dtype=float32), 'loss_cross_entropy': Array(0.43119797, dtype=float32)}


  3%|▎         | 34359/1000000 [1:46:18<29:40:51,  9.04it/s]

{'loss': Array(0.44850978, dtype=float32), 'loss_reward': Array(0.00381032, dtype=float32), 'loss_cross_entropy': Array(0.44469953, dtype=float32)}


  3%|▎         | 34370/1000000 [1:46:20<27:22:42,  9.80it/s]

{'loss': Array(0.44371176, dtype=float32), 'loss_reward': Array(0.00397907, dtype=float32), 'loss_cross_entropy': Array(0.43973264, dtype=float32)}


  3%|▎         | 34378/1000000 [1:46:21<33:14:29,  8.07it/s]

{'loss': Array(0.44407368, dtype=float32), 'loss_reward': Array(0.00397233, dtype=float32), 'loss_cross_entropy': Array(0.44010136, dtype=float32)}


  3%|▎         | 34390/1000000 [1:46:23<31:33:50,  8.50it/s]

{'loss': Array(0.4443105, dtype=float32), 'loss_reward': Array(0.00385383, dtype=float32), 'loss_cross_entropy': Array(0.4404567, dtype=float32)}


  3%|▎         | 34398/1000000 [1:46:25<33:40:10,  7.97it/s]

{'loss': Array(0.439045, dtype=float32), 'loss_reward': Array(0.00390041, dtype=float32), 'loss_cross_entropy': Array(0.43514457, dtype=float32)}


  3%|▎         | 34410/1000000 [1:46:26<27:34:21,  9.73it/s]

{'loss': Array(0.44720697, dtype=float32), 'loss_reward': Array(0.00381606, dtype=float32), 'loss_cross_entropy': Array(0.4433909, dtype=float32)}


  3%|▎         | 34418/1000000 [1:46:28<32:42:05,  8.20it/s]

{'loss': Array(0.43541154, dtype=float32), 'loss_reward': Array(0.0038263, dtype=float32), 'loss_cross_entropy': Array(0.43158528, dtype=float32)}


  3%|▎         | 34430/1000000 [1:46:30<30:20:30,  8.84it/s]

{'loss': Array(0.43466327, dtype=float32), 'loss_reward': Array(0.00397824, dtype=float32), 'loss_cross_entropy': Array(0.43068504, dtype=float32)}


  3%|▎         | 34438/1000000 [1:46:31<34:01:36,  7.88it/s]

{'loss': Array(0.43587747, dtype=float32), 'loss_reward': Array(0.00387565, dtype=float32), 'loss_cross_entropy': Array(0.43200183, dtype=float32)}


  3%|▎         | 34449/1000000 [1:46:33<29:52:15,  8.98it/s]

{'loss': Array(0.43801433, dtype=float32), 'loss_reward': Array(0.00391991, dtype=float32), 'loss_cross_entropy': Array(0.4340944, dtype=float32)}


  3%|▎         | 34459/1000000 [1:46:35<40:07:47,  6.68it/s]

{'loss': Array(0.43272072, dtype=float32), 'loss_reward': Array(0.00366129, dtype=float32), 'loss_cross_entropy': Array(0.42905942, dtype=float32)}


  3%|▎         | 34470/1000000 [1:46:37<29:20:46,  9.14it/s]

{'loss': Array(0.43171558, dtype=float32), 'loss_reward': Array(0.00386872, dtype=float32), 'loss_cross_entropy': Array(0.42784688, dtype=float32)}


  3%|▎         | 34478/1000000 [1:46:38<33:58:06,  7.90it/s]

{'loss': Array(0.43775544, dtype=float32), 'loss_reward': Array(0.00383912, dtype=float32), 'loss_cross_entropy': Array(0.43391633, dtype=float32)}


  3%|▎         | 34490/1000000 [1:46:40<28:28:55,  9.42it/s]

{'loss': Array(0.42970982, dtype=float32), 'loss_reward': Array(0.00390483, dtype=float32), 'loss_cross_entropy': Array(0.425805, dtype=float32)}


  3%|▎         | 34500/1000000 [1:46:42<35:47:51,  7.49it/s]

{'loss': Array(0.42997238, dtype=float32), 'loss_reward': Array(0.00395178, dtype=float32), 'loss_cross_entropy': Array(0.42602062, dtype=float32)}


  3%|▎         | 34508/1000000 [1:46:53<174:31:28,  1.54it/s]

{'loss': Array(0.43936807, dtype=float32), 'loss_reward': Array(0.00370343, dtype=float32), 'loss_cross_entropy': Array(0.43566462, dtype=float32)}


  3%|▎         | 34520/1000000 [1:46:54<58:42:37,  4.57it/s] 

{'loss': Array(0.45086113, dtype=float32), 'loss_reward': Array(0.00393582, dtype=float32), 'loss_cross_entropy': Array(0.44692525, dtype=float32)}


  3%|▎         | 34528/1000000 [1:46:56<43:32:07,  6.16it/s]

{'loss': Array(0.43744427, dtype=float32), 'loss_reward': Array(0.00391639, dtype=float32), 'loss_cross_entropy': Array(0.43352786, dtype=float32)}


  3%|▎         | 34539/1000000 [1:46:58<37:53:16,  7.08it/s]

{'loss': Array(0.43873563, dtype=float32), 'loss_reward': Array(0.00397806, dtype=float32), 'loss_cross_entropy': Array(0.43475762, dtype=float32)}


  3%|▎         | 34550/1000000 [1:46:59<28:46:57,  9.32it/s]

{'loss': Array(0.4362906, dtype=float32), 'loss_reward': Array(0.00386525, dtype=float32), 'loss_cross_entropy': Array(0.43242535, dtype=float32)}


  3%|▎         | 34558/1000000 [1:47:01<32:52:58,  8.16it/s]

{'loss': Array(0.43510276, dtype=float32), 'loss_reward': Array(0.00389252, dtype=float32), 'loss_cross_entropy': Array(0.4312102, dtype=float32)}


  3%|▎         | 34570/1000000 [1:47:02<26:48:54, 10.00it/s]

{'loss': Array(0.43875256, dtype=float32), 'loss_reward': Array(0.00388284, dtype=float32), 'loss_cross_entropy': Array(0.43486974, dtype=float32)}


  3%|▎         | 34580/1000000 [1:47:04<32:05:57,  8.35it/s]

{'loss': Array(0.42855817, dtype=float32), 'loss_reward': Array(0.00391706, dtype=float32), 'loss_cross_entropy': Array(0.4246411, dtype=float32)}


  3%|▎         | 34588/1000000 [1:47:06<34:15:04,  7.83it/s]

{'loss': Array(0.43688416, dtype=float32), 'loss_reward': Array(0.00400469, dtype=float32), 'loss_cross_entropy': Array(0.43287945, dtype=float32)}


  3%|▎         | 34600/1000000 [1:47:07<27:19:37,  9.81it/s]

{'loss': Array(0.44623873, dtype=float32), 'loss_reward': Array(0.00386628, dtype=float32), 'loss_cross_entropy': Array(0.4423724, dtype=float32)}


  3%|▎         | 34608/1000000 [1:47:09<32:39:18,  8.21it/s]

{'loss': Array(0.4328799, dtype=float32), 'loss_reward': Array(0.00409996, dtype=float32), 'loss_cross_entropy': Array(0.4287799, dtype=float32)}


  3%|▎         | 34620/1000000 [1:47:11<30:26:14,  8.81it/s]

{'loss': Array(0.4384505, dtype=float32), 'loss_reward': Array(0.00382049, dtype=float32), 'loss_cross_entropy': Array(0.43463, dtype=float32)}


  3%|▎         | 34628/1000000 [1:47:12<34:19:30,  7.81it/s]

{'loss': Array(0.43998137, dtype=float32), 'loss_reward': Array(0.00389133, dtype=float32), 'loss_cross_entropy': Array(0.43609005, dtype=float32)}


  3%|▎         | 34640/1000000 [1:47:14<27:13:51,  9.85it/s]

{'loss': Array(0.4440274, dtype=float32), 'loss_reward': Array(0.00399809, dtype=float32), 'loss_cross_entropy': Array(0.4400293, dtype=float32)}


  3%|▎         | 34650/1000000 [1:47:16<39:01:14,  6.87it/s]

{'loss': Array(0.434108, dtype=float32), 'loss_reward': Array(0.00395945, dtype=float32), 'loss_cross_entropy': Array(0.4301485, dtype=float32)}


  3%|▎         | 34658/1000000 [1:47:17<36:58:24,  7.25it/s]

{'loss': Array(0.43157634, dtype=float32), 'loss_reward': Array(0.00391592, dtype=float32), 'loss_cross_entropy': Array(0.42766044, dtype=float32)}


  3%|▎         | 34670/1000000 [1:47:19<28:02:34,  9.56it/s]

{'loss': Array(0.4326627, dtype=float32), 'loss_reward': Array(0.00371497, dtype=float32), 'loss_cross_entropy': Array(0.4289477, dtype=float32)}


  3%|▎         | 34678/1000000 [1:47:20<32:35:15,  8.23it/s]

{'loss': Array(0.43627053, dtype=float32), 'loss_reward': Array(0.00386078, dtype=float32), 'loss_cross_entropy': Array(0.43240976, dtype=float32)}


  3%|▎         | 34690/1000000 [1:47:22<33:10:24,  8.08it/s]

{'loss': Array(0.43590888, dtype=float32), 'loss_reward': Array(0.0038504, dtype=float32), 'loss_cross_entropy': Array(0.43205842, dtype=float32)}


  3%|▎         | 34700/1000000 [1:47:24<29:11:12,  9.19it/s]

{'loss': Array(0.43142983, dtype=float32), 'loss_reward': Array(0.00402269, dtype=float32), 'loss_cross_entropy': Array(0.4274071, dtype=float32)}


  3%|▎         | 34710/1000000 [1:47:25<28:27:36,  9.42it/s]

{'loss': Array(0.43501535, dtype=float32), 'loss_reward': Array(0.00398662, dtype=float32), 'loss_cross_entropy': Array(0.4310287, dtype=float32)}


  3%|▎         | 34718/1000000 [1:47:27<33:08:10,  8.09it/s]

{'loss': Array(0.4328888, dtype=float32), 'loss_reward': Array(0.00385478, dtype=float32), 'loss_cross_entropy': Array(0.429034, dtype=float32)}


  3%|▎         | 34729/1000000 [1:47:29<34:50:50,  7.69it/s]

{'loss': Array(0.4328218, dtype=float32), 'loss_reward': Array(0.00382847, dtype=float32), 'loss_cross_entropy': Array(0.42899337, dtype=float32)}


  3%|▎         | 34740/1000000 [1:47:30<27:58:34,  9.58it/s]

{'loss': Array(0.43755817, dtype=float32), 'loss_reward': Array(0.00392906, dtype=float32), 'loss_cross_entropy': Array(0.43362913, dtype=float32)}


  3%|▎         | 34748/1000000 [1:47:32<32:44:22,  8.19it/s]

{'loss': Array(0.43225488, dtype=float32), 'loss_reward': Array(0.00385307, dtype=float32), 'loss_cross_entropy': Array(0.4284018, dtype=float32)}


  3%|▎         | 34760/1000000 [1:47:33<27:00:37,  9.93it/s]

{'loss': Array(0.43311676, dtype=float32), 'loss_reward': Array(0.00393629, dtype=float32), 'loss_cross_entropy': Array(0.42918047, dtype=float32)}


  3%|▎         | 34770/1000000 [1:47:35<32:23:07,  8.28it/s]

{'loss': Array(0.432314, dtype=float32), 'loss_reward': Array(0.00399031, dtype=float32), 'loss_cross_entropy': Array(0.42832372, dtype=float32)}


  3%|▎         | 34778/1000000 [1:47:37<34:54:01,  7.68it/s]

{'loss': Array(0.4338834, dtype=float32), 'loss_reward': Array(0.0040408, dtype=float32), 'loss_cross_entropy': Array(0.42984256, dtype=float32)}


  3%|▎         | 34789/1000000 [1:47:38<29:21:09,  9.13it/s]

{'loss': Array(0.4397447, dtype=float32), 'loss_reward': Array(0.00403329, dtype=float32), 'loss_cross_entropy': Array(0.43571147, dtype=float32)}


  3%|▎         | 34800/1000000 [1:47:40<26:36:25, 10.08it/s]

{'loss': Array(0.4360083, dtype=float32), 'loss_reward': Array(0.00390395, dtype=float32), 'loss_cross_entropy': Array(0.43210435, dtype=float32)}


  3%|▎         | 34808/1000000 [1:47:42<37:58:20,  7.06it/s]

{'loss': Array(0.4347984, dtype=float32), 'loss_reward': Array(0.00391175, dtype=float32), 'loss_cross_entropy': Array(0.43088666, dtype=float32)}


  3%|▎         | 34820/1000000 [1:47:43<27:42:41,  9.67it/s]

{'loss': Array(0.43391824, dtype=float32), 'loss_reward': Array(0.00371908, dtype=float32), 'loss_cross_entropy': Array(0.4301992, dtype=float32)}


  3%|▎         | 34828/1000000 [1:47:45<32:48:28,  8.17it/s]

{'loss': Array(0.43776032, dtype=float32), 'loss_reward': Array(0.00393202, dtype=float32), 'loss_cross_entropy': Array(0.43382832, dtype=float32)}


  3%|▎         | 34840/1000000 [1:47:47<36:47:19,  7.29it/s]

{'loss': Array(0.42954165, dtype=float32), 'loss_reward': Array(0.00390312, dtype=float32), 'loss_cross_entropy': Array(0.42563853, dtype=float32)}


  3%|▎         | 34848/1000000 [1:47:48<36:34:26,  7.33it/s]

{'loss': Array(0.4332405, dtype=float32), 'loss_reward': Array(0.00385042, dtype=float32), 'loss_cross_entropy': Array(0.4293901, dtype=float32)}


  3%|▎         | 34860/1000000 [1:47:50<28:10:03,  9.52it/s]

{'loss': Array(0.4384935, dtype=float32), 'loss_reward': Array(0.00397914, dtype=float32), 'loss_cross_entropy': Array(0.43451437, dtype=float32)}


  3%|▎         | 34868/1000000 [1:47:51<32:42:09,  8.20it/s]

{'loss': Array(0.43242607, dtype=float32), 'loss_reward': Array(0.00382551, dtype=float32), 'loss_cross_entropy': Array(0.42860052, dtype=float32)}


  3%|▎         | 34880/1000000 [1:47:53<33:34:43,  7.98it/s]

{'loss': Array(0.43642193, dtype=float32), 'loss_reward': Array(0.00390312, dtype=float32), 'loss_cross_entropy': Array(0.43251878, dtype=float32)}


  3%|▎         | 34888/1000000 [1:47:55<34:37:49,  7.74it/s]

{'loss': Array(0.42255098, dtype=float32), 'loss_reward': Array(0.0037799, dtype=float32), 'loss_cross_entropy': Array(0.4187711, dtype=float32)}


  3%|▎         | 34900/1000000 [1:47:56<27:19:26,  9.81it/s]

{'loss': Array(0.4364669, dtype=float32), 'loss_reward': Array(0.00381859, dtype=float32), 'loss_cross_entropy': Array(0.43264827, dtype=float32)}


  3%|▎         | 34908/1000000 [1:47:58<33:06:03,  8.10it/s]

{'loss': Array(0.43229395, dtype=float32), 'loss_reward': Array(0.00392473, dtype=float32), 'loss_cross_entropy': Array(0.4283692, dtype=float32)}


  3%|▎         | 34918/1000000 [1:48:00<39:12:20,  6.84it/s]

{'loss': Array(0.43088937, dtype=float32), 'loss_reward': Array(0.00387544, dtype=float32), 'loss_cross_entropy': Array(0.42701393, dtype=float32)}


  3%|▎         | 34929/1000000 [1:48:01<29:33:15,  9.07it/s]

{'loss': Array(0.42629796, dtype=float32), 'loss_reward': Array(0.00389229, dtype=float32), 'loss_cross_entropy': Array(0.4224057, dtype=float32)}


  3%|▎         | 34940/1000000 [1:48:03<26:26:19, 10.14it/s]

{'loss': Array(0.43439838, dtype=float32), 'loss_reward': Array(0.00381926, dtype=float32), 'loss_cross_entropy': Array(0.4305791, dtype=float32)}


  3%|▎         | 34948/1000000 [1:48:04<31:49:34,  8.42it/s]

{'loss': Array(0.42312837, dtype=float32), 'loss_reward': Array(0.00382316, dtype=float32), 'loss_cross_entropy': Array(0.41930524, dtype=float32)}


  3%|▎         | 34959/1000000 [1:48:06<32:31:19,  8.24it/s]

{'loss': Array(0.4317343, dtype=float32), 'loss_reward': Array(0.00383599, dtype=float32), 'loss_cross_entropy': Array(0.42789832, dtype=float32)}


  3%|▎         | 34970/1000000 [1:48:08<27:52:38,  9.62it/s]

{'loss': Array(0.4349521, dtype=float32), 'loss_reward': Array(0.00391726, dtype=float32), 'loss_cross_entropy': Array(0.43103486, dtype=float32)}


  3%|▎         | 34978/1000000 [1:48:09<32:53:13,  8.15it/s]

{'loss': Array(0.4313623, dtype=float32), 'loss_reward': Array(0.00394769, dtype=float32), 'loss_cross_entropy': Array(0.42741463, dtype=float32)}


  3%|▎         | 34990/1000000 [1:48:11<28:33:09,  9.39it/s]

{'loss': Array(0.43136945, dtype=float32), 'loss_reward': Array(0.00394503, dtype=float32), 'loss_cross_entropy': Array(0.42742443, dtype=float32)}


  4%|▎         | 35000/1000000 [1:48:13<31:07:48,  8.61it/s]

{'loss': Array(0.4295036, dtype=float32), 'loss_reward': Array(0.00386795, dtype=float32), 'loss_cross_entropy': Array(0.42563564, dtype=float32)}


  4%|▎         | 35008/1000000 [1:48:24<172:15:19,  1.56it/s]

{'loss': Array(0.42345762, dtype=float32), 'loss_reward': Array(0.00381549, dtype=float32), 'loss_cross_entropy': Array(0.41964218, dtype=float32)}


  4%|▎         | 35020/1000000 [1:48:26<58:05:11,  4.61it/s] 

{'loss': Array(0.43570176, dtype=float32), 'loss_reward': Array(0.0040039, dtype=float32), 'loss_cross_entropy': Array(0.43169785, dtype=float32)}


  4%|▎         | 35030/1000000 [1:48:28<48:14:04,  5.56it/s]

{'loss': Array(0.43478408, dtype=float32), 'loss_reward': Array(0.0038607, dtype=float32), 'loss_cross_entropy': Array(0.43092337, dtype=float32)}


  4%|▎         | 35038/1000000 [1:48:29<40:01:34,  6.70it/s]

{'loss': Array(0.43646643, dtype=float32), 'loss_reward': Array(0.00389185, dtype=float32), 'loss_cross_entropy': Array(0.4325745, dtype=float32)}


  4%|▎         | 35050/1000000 [1:48:31<28:26:22,  9.42it/s]

{'loss': Array(0.43466803, dtype=float32), 'loss_reward': Array(0.00384302, dtype=float32), 'loss_cross_entropy': Array(0.430825, dtype=float32)}


  4%|▎         | 35058/1000000 [1:48:32<34:04:55,  7.86it/s]

{'loss': Array(0.4214502, dtype=float32), 'loss_reward': Array(0.00372961, dtype=float32), 'loss_cross_entropy': Array(0.41772056, dtype=float32)}


  4%|▎         | 35070/1000000 [1:48:34<34:02:50,  7.87it/s]

{'loss': Array(0.43396997, dtype=float32), 'loss_reward': Array(0.00384033, dtype=float32), 'loss_cross_entropy': Array(0.43012974, dtype=float32)}


  4%|▎         | 35080/1000000 [1:48:36<29:06:10,  9.21it/s]

{'loss': Array(0.43453142, dtype=float32), 'loss_reward': Array(0.00396039, dtype=float32), 'loss_cross_entropy': Array(0.43057105, dtype=float32)}


  4%|▎         | 35088/1000000 [1:48:37<33:01:35,  8.12it/s]

{'loss': Array(0.43529803, dtype=float32), 'loss_reward': Array(0.0038425, dtype=float32), 'loss_cross_entropy': Array(0.43145552, dtype=float32)}


  4%|▎         | 35100/1000000 [1:48:39<26:34:30, 10.09it/s]

{'loss': Array(0.4270323, dtype=float32), 'loss_reward': Array(0.00397918, dtype=float32), 'loss_cross_entropy': Array(0.42305312, dtype=float32)}


  4%|▎         | 35110/1000000 [1:48:41<34:47:17,  7.70it/s]

{'loss': Array(0.43919286, dtype=float32), 'loss_reward': Array(0.00392239, dtype=float32), 'loss_cross_entropy': Array(0.43527052, dtype=float32)}


  4%|▎         | 35118/1000000 [1:48:42<35:21:02,  7.58it/s]

{'loss': Array(0.43599853, dtype=float32), 'loss_reward': Array(0.00399699, dtype=float32), 'loss_cross_entropy': Array(0.43200156, dtype=float32)}


  4%|▎         | 35129/1000000 [1:48:44<30:58:12,  8.65it/s]

{'loss': Array(0.43213877, dtype=float32), 'loss_reward': Array(0.00398718, dtype=float32), 'loss_cross_entropy': Array(0.42815158, dtype=float32)}


  4%|▎         | 35140/1000000 [1:48:45<27:07:22,  9.88it/s]

{'loss': Array(0.42909846, dtype=float32), 'loss_reward': Array(0.00389021, dtype=float32), 'loss_cross_entropy': Array(0.4252083, dtype=float32)}


  4%|▎         | 35150/1000000 [1:48:47<32:04:31,  8.36it/s]

{'loss': Array(0.4271505, dtype=float32), 'loss_reward': Array(0.00386424, dtype=float32), 'loss_cross_entropy': Array(0.42328626, dtype=float32)}


  4%|▎         | 35158/1000000 [1:48:49<34:53:41,  7.68it/s]

{'loss': Array(0.42560443, dtype=float32), 'loss_reward': Array(0.00379252, dtype=float32), 'loss_cross_entropy': Array(0.42181197, dtype=float32)}


  4%|▎         | 35170/1000000 [1:48:50<27:26:18,  9.77it/s]

{'loss': Array(0.42461345, dtype=float32), 'loss_reward': Array(0.00377265, dtype=float32), 'loss_cross_entropy': Array(0.42084074, dtype=float32)}


  4%|▎         | 35178/1000000 [1:48:52<32:08:40,  8.34it/s]

{'loss': Array(0.427517, dtype=float32), 'loss_reward': Array(0.00385619, dtype=float32), 'loss_cross_entropy': Array(0.4236608, dtype=float32)}


  4%|▎         | 35189/1000000 [1:48:54<32:49:14,  8.17it/s]

{'loss': Array(0.42896348, dtype=float32), 'loss_reward': Array(0.00386537, dtype=float32), 'loss_cross_entropy': Array(0.4250981, dtype=float32)}


  4%|▎         | 35200/1000000 [1:48:55<27:40:11,  9.69it/s]

{'loss': Array(0.41986987, dtype=float32), 'loss_reward': Array(0.00376667, dtype=float32), 'loss_cross_entropy': Array(0.41610312, dtype=float32)}


  4%|▎         | 35208/1000000 [1:48:57<33:01:55,  8.11it/s]

{'loss': Array(0.43226942, dtype=float32), 'loss_reward': Array(0.00385044, dtype=float32), 'loss_cross_entropy': Array(0.42841896, dtype=float32)}


  4%|▎         | 35219/1000000 [1:48:59<28:39:45,  9.35it/s]

{'loss': Array(0.43606326, dtype=float32), 'loss_reward': Array(0.00388474, dtype=float32), 'loss_cross_entropy': Array(0.4321785, dtype=float32)}


  4%|▎         | 35230/1000000 [1:49:00<30:30:41,  8.78it/s]

{'loss': Array(0.4373954, dtype=float32), 'loss_reward': Array(0.00391837, dtype=float32), 'loss_cross_entropy': Array(0.43347698, dtype=float32)}


  4%|▎         | 35238/1000000 [1:49:02<33:32:18,  7.99it/s]

{'loss': Array(0.42071638, dtype=float32), 'loss_reward': Array(0.00387184, dtype=float32), 'loss_cross_entropy': Array(0.41684452, dtype=float32)}


  4%|▎         | 35250/1000000 [1:49:04<27:26:46,  9.76it/s]

{'loss': Array(0.41614866, dtype=float32), 'loss_reward': Array(0.00369858, dtype=float32), 'loss_cross_entropy': Array(0.41245008, dtype=float32)}


  4%|▎         | 35258/1000000 [1:49:05<42:55:54,  6.24it/s]

{'loss': Array(0.42731714, dtype=float32), 'loss_reward': Array(0.0038312, dtype=float32), 'loss_cross_entropy': Array(0.42348596, dtype=float32)}


  4%|▎         | 35270/1000000 [1:49:07<29:07:25,  9.20it/s]

{'loss': Array(0.4257391, dtype=float32), 'loss_reward': Array(0.00391664, dtype=float32), 'loss_cross_entropy': Array(0.42182252, dtype=float32)}


  4%|▎         | 35278/1000000 [1:49:09<34:28:12,  7.77it/s]

{'loss': Array(0.43187147, dtype=float32), 'loss_reward': Array(0.00379583, dtype=float32), 'loss_cross_entropy': Array(0.42807564, dtype=float32)}


  4%|▎         | 35290/1000000 [1:49:10<27:21:55,  9.79it/s]

{'loss': Array(0.42311582, dtype=float32), 'loss_reward': Array(0.00380155, dtype=float32), 'loss_cross_entropy': Array(0.41931424, dtype=float32)}


  4%|▎         | 35300/1000000 [1:49:12<34:29:53,  7.77it/s]

{'loss': Array(0.43714055, dtype=float32), 'loss_reward': Array(0.0038746, dtype=float32), 'loss_cross_entropy': Array(0.43326598, dtype=float32)}


  4%|▎         | 35308/1000000 [1:49:13<35:43:43,  7.50it/s]

{'loss': Array(0.42083532, dtype=float32), 'loss_reward': Array(0.0039677, dtype=float32), 'loss_cross_entropy': Array(0.41686764, dtype=float32)}


  4%|▎         | 35320/1000000 [1:49:15<27:54:04,  9.60it/s]

{'loss': Array(0.4288411, dtype=float32), 'loss_reward': Array(0.0039275, dtype=float32), 'loss_cross_entropy': Array(0.42491361, dtype=float32)}


  4%|▎         | 35328/1000000 [1:49:17<32:48:09,  8.17it/s]

{'loss': Array(0.43020868, dtype=float32), 'loss_reward': Array(0.00385486, dtype=float32), 'loss_cross_entropy': Array(0.42635384, dtype=float32)}


  4%|▎         | 35340/1000000 [1:49:19<31:27:51,  8.52it/s]

{'loss': Array(0.4342464, dtype=float32), 'loss_reward': Array(0.00393157, dtype=float32), 'loss_cross_entropy': Array(0.43031484, dtype=float32)}


  4%|▎         | 35348/1000000 [1:49:20<35:07:30,  7.63it/s]

{'loss': Array(0.4262023, dtype=float32), 'loss_reward': Array(0.00373261, dtype=float32), 'loss_cross_entropy': Array(0.4224697, dtype=float32)}


  4%|▎         | 35360/1000000 [1:49:22<27:57:31,  9.58it/s]

{'loss': Array(0.42727566, dtype=float32), 'loss_reward': Array(0.00383141, dtype=float32), 'loss_cross_entropy': Array(0.42344424, dtype=float32)}


  4%|▎         | 35368/1000000 [1:49:23<32:50:38,  8.16it/s]

{'loss': Array(0.4318845, dtype=float32), 'loss_reward': Array(0.00381545, dtype=float32), 'loss_cross_entropy': Array(0.42806908, dtype=float32)}


  4%|▎         | 35379/1000000 [1:49:25<33:09:19,  8.08it/s]

{'loss': Array(0.42873403, dtype=float32), 'loss_reward': Array(0.00404705, dtype=float32), 'loss_cross_entropy': Array(0.42468697, dtype=float32)}


  4%|▎         | 35390/1000000 [1:49:27<28:51:53,  9.28it/s]

{'loss': Array(0.42144758, dtype=float32), 'loss_reward': Array(0.00380176, dtype=float32), 'loss_cross_entropy': Array(0.41764575, dtype=float32)}


  4%|▎         | 35398/1000000 [1:49:28<34:59:52,  7.66it/s]

{'loss': Array(0.42813, dtype=float32), 'loss_reward': Array(0.00385051, dtype=float32), 'loss_cross_entropy': Array(0.4242795, dtype=float32)}


  4%|▎         | 35409/1000000 [1:49:30<39:47:28,  6.73it/s]

{'loss': Array(0.41721034, dtype=float32), 'loss_reward': Array(0.00372503, dtype=float32), 'loss_cross_entropy': Array(0.41348535, dtype=float32)}


  4%|▎         | 35419/1000000 [1:49:32<30:55:05,  8.67it/s]

{'loss': Array(0.4220202, dtype=float32), 'loss_reward': Array(0.00389387, dtype=float32), 'loss_cross_entropy': Array(0.41812634, dtype=float32)}


  4%|▎         | 35430/1000000 [1:49:34<28:28:52,  9.41it/s]

{'loss': Array(0.4223797, dtype=float32), 'loss_reward': Array(0.00389709, dtype=float32), 'loss_cross_entropy': Array(0.41848263, dtype=float32)}


  4%|▎         | 35438/1000000 [1:49:35<33:14:24,  8.06it/s]

{'loss': Array(0.4320259, dtype=float32), 'loss_reward': Array(0.00389958, dtype=float32), 'loss_cross_entropy': Array(0.42812634, dtype=float32)}


  4%|▎         | 35450/1000000 [1:49:37<33:47:53,  7.93it/s]

{'loss': Array(0.42628342, dtype=float32), 'loss_reward': Array(0.00386244, dtype=float32), 'loss_cross_entropy': Array(0.42242095, dtype=float32)}


  4%|▎         | 35458/1000000 [1:49:39<35:40:36,  7.51it/s]

{'loss': Array(0.42235872, dtype=float32), 'loss_reward': Array(0.00378651, dtype=float32), 'loss_cross_entropy': Array(0.41857225, dtype=float32)}


  4%|▎         | 35470/1000000 [1:49:40<28:16:13,  9.48it/s]

{'loss': Array(0.4205366, dtype=float32), 'loss_reward': Array(0.00379083, dtype=float32), 'loss_cross_entropy': Array(0.41674575, dtype=float32)}


  4%|▎         | 35478/1000000 [1:49:42<33:50:08,  7.92it/s]

{'loss': Array(0.41671202, dtype=float32), 'loss_reward': Array(0.00391382, dtype=float32), 'loss_cross_entropy': Array(0.41279817, dtype=float32)}


  4%|▎         | 35490/1000000 [1:49:44<32:11:23,  8.32it/s]

{'loss': Array(0.42862377, dtype=float32), 'loss_reward': Array(0.00393666, dtype=float32), 'loss_cross_entropy': Array(0.42468712, dtype=float32)}


  4%|▎         | 35498/1000000 [1:49:45<34:50:56,  7.69it/s]

{'loss': Array(0.42717743, dtype=float32), 'loss_reward': Array(0.00388374, dtype=float32), 'loss_cross_entropy': Array(0.42329368, dtype=float32)}


  4%|▎         | 35510/1000000 [1:49:57<120:44:31,  2.22it/s]

{'loss': Array(0.42843515, dtype=float32), 'loss_reward': Array(0.00378825, dtype=float32), 'loss_cross_entropy': Array(0.42464685, dtype=float32)}


  4%|▎         | 35518/1000000 [1:49:58<65:38:32,  4.08it/s] 

{'loss': Array(0.42497388, dtype=float32), 'loss_reward': Array(0.00392225, dtype=float32), 'loss_cross_entropy': Array(0.4210516, dtype=float32)}


  4%|▎         | 35529/1000000 [1:50:00<40:37:14,  6.60it/s]

{'loss': Array(0.42016014, dtype=float32), 'loss_reward': Array(0.00392016, dtype=float32), 'loss_cross_entropy': Array(0.41623998, dtype=float32)}


  4%|▎         | 35540/1000000 [1:50:02<28:57:55,  9.25it/s]

{'loss': Array(0.43432662, dtype=float32), 'loss_reward': Array(0.00375459, dtype=float32), 'loss_cross_entropy': Array(0.430572, dtype=float32)}


  4%|▎         | 35548/1000000 [1:50:03<34:05:07,  7.86it/s]

{'loss': Array(0.43098432, dtype=float32), 'loss_reward': Array(0.00386929, dtype=float32), 'loss_cross_entropy': Array(0.42711502, dtype=float32)}


  4%|▎         | 35559/1000000 [1:50:05<39:22:08,  6.80it/s]

{'loss': Array(0.42522627, dtype=float32), 'loss_reward': Array(0.00385631, dtype=float32), 'loss_cross_entropy': Array(0.42137, dtype=float32)}


  4%|▎         | 35570/1000000 [1:50:07<29:13:53,  9.16it/s]

{'loss': Array(0.42274356, dtype=float32), 'loss_reward': Array(0.00385362, dtype=float32), 'loss_cross_entropy': Array(0.4188899, dtype=float32)}


  4%|▎         | 35578/1000000 [1:50:08<33:00:08,  8.12it/s]

{'loss': Array(0.42593247, dtype=float32), 'loss_reward': Array(0.00389793, dtype=float32), 'loss_cross_entropy': Array(0.4220346, dtype=float32)}


  4%|▎         | 35590/1000000 [1:50:10<26:38:09, 10.06it/s]

{'loss': Array(0.42502356, dtype=float32), 'loss_reward': Array(0.00389566, dtype=float32), 'loss_cross_entropy': Array(0.4211279, dtype=float32)}


  4%|▎         | 35600/1000000 [1:50:12<34:40:28,  7.73it/s]

{'loss': Array(0.4162756, dtype=float32), 'loss_reward': Array(0.00385792, dtype=float32), 'loss_cross_entropy': Array(0.41241765, dtype=float32)}


  4%|▎         | 35608/1000000 [1:50:13<35:01:15,  7.65it/s]

{'loss': Array(0.42102322, dtype=float32), 'loss_reward': Array(0.0038036, dtype=float32), 'loss_cross_entropy': Array(0.41721964, dtype=float32)}


  4%|▎         | 35620/1000000 [1:50:15<27:14:28,  9.83it/s]

{'loss': Array(0.41982633, dtype=float32), 'loss_reward': Array(0.00389107, dtype=float32), 'loss_cross_entropy': Array(0.41593525, dtype=float32)}


  4%|▎         | 35628/1000000 [1:50:16<32:50:33,  8.16it/s]

{'loss': Array(0.42017388, dtype=float32), 'loss_reward': Array(0.00379009, dtype=float32), 'loss_cross_entropy': Array(0.41638383, dtype=float32)}


  4%|▎         | 35638/1000000 [1:50:18<36:39:36,  7.31it/s]

{'loss': Array(0.41630316, dtype=float32), 'loss_reward': Array(0.00365855, dtype=float32), 'loss_cross_entropy': Array(0.4126446, dtype=float32)}


  4%|▎         | 35650/1000000 [1:50:20<27:47:29,  9.64it/s]

{'loss': Array(0.41480285, dtype=float32), 'loss_reward': Array(0.00368703, dtype=float32), 'loss_cross_entropy': Array(0.41111585, dtype=float32)}


  4%|▎         | 35658/1000000 [1:50:21<33:15:14,  8.06it/s]

{'loss': Array(0.4228509, dtype=float32), 'loss_reward': Array(0.00394693, dtype=float32), 'loss_cross_entropy': Array(0.41890398, dtype=float32)}


  4%|▎         | 35670/1000000 [1:50:23<27:24:50,  9.77it/s]

{'loss': Array(0.41029406, dtype=float32), 'loss_reward': Array(0.00378442, dtype=float32), 'loss_cross_entropy': Array(0.40650964, dtype=float32)}


  4%|▎         | 35680/1000000 [1:50:25<31:52:24,  8.40it/s]

{'loss': Array(0.42214295, dtype=float32), 'loss_reward': Array(0.00383141, dtype=float32), 'loss_cross_entropy': Array(0.4183115, dtype=float32)}


  4%|▎         | 35688/1000000 [1:50:26<34:11:02,  7.84it/s]

{'loss': Array(0.42457977, dtype=float32), 'loss_reward': Array(0.00381041, dtype=float32), 'loss_cross_entropy': Array(0.42076936, dtype=float32)}


  4%|▎         | 35700/1000000 [1:50:28<27:18:17,  9.81it/s]

{'loss': Array(0.42832652, dtype=float32), 'loss_reward': Array(0.00393909, dtype=float32), 'loss_cross_entropy': Array(0.42438743, dtype=float32)}


  4%|▎         | 35708/1000000 [1:50:29<33:01:35,  8.11it/s]

{'loss': Array(0.41414857, dtype=float32), 'loss_reward': Array(0.00368346, dtype=float32), 'loss_cross_entropy': Array(0.4104651, dtype=float32)}


  4%|▎         | 35720/1000000 [1:50:31<31:00:36,  8.64it/s]

{'loss': Array(0.41967234, dtype=float32), 'loss_reward': Array(0.00380717, dtype=float32), 'loss_cross_entropy': Array(0.41586524, dtype=float32)}


  4%|▎         | 35728/1000000 [1:50:33<33:54:51,  7.90it/s]

{'loss': Array(0.41712475, dtype=float32), 'loss_reward': Array(0.0036951, dtype=float32), 'loss_cross_entropy': Array(0.4134296, dtype=float32)}


  4%|▎         | 35739/1000000 [1:50:34<28:32:12,  9.39it/s]

{'loss': Array(0.41455823, dtype=float32), 'loss_reward': Array(0.00375029, dtype=float32), 'loss_cross_entropy': Array(0.41080794, dtype=float32)}


  4%|▎         | 35749/1000000 [1:50:36<39:30:37,  6.78it/s]

{'loss': Array(0.42039457, dtype=float32), 'loss_reward': Array(0.00376614, dtype=float32), 'loss_cross_entropy': Array(0.41662842, dtype=float32)}


  4%|▎         | 35760/1000000 [1:50:38<29:19:52,  9.13it/s]

{'loss': Array(0.42177233, dtype=float32), 'loss_reward': Array(0.00381418, dtype=float32), 'loss_cross_entropy': Array(0.41795817, dtype=float32)}


  4%|▎         | 35768/1000000 [1:50:39<34:26:40,  7.78it/s]

{'loss': Array(0.41310006, dtype=float32), 'loss_reward': Array(0.00372746, dtype=float32), 'loss_cross_entropy': Array(0.40937257, dtype=float32)}


  4%|▎         | 35780/1000000 [1:50:41<26:53:12,  9.96it/s]

{'loss': Array(0.41823325, dtype=float32), 'loss_reward': Array(0.00394483, dtype=float32), 'loss_cross_entropy': Array(0.41428843, dtype=float32)}


  4%|▎         | 35790/1000000 [1:50:43<36:00:42,  7.44it/s]

{'loss': Array(0.41545287, dtype=float32), 'loss_reward': Array(0.00391539, dtype=float32), 'loss_cross_entropy': Array(0.4115375, dtype=float32)}


  4%|▎         | 35800/1000000 [1:50:44<29:09:50,  9.18it/s]

{'loss': Array(0.41424084, dtype=float32), 'loss_reward': Array(0.00383625, dtype=float32), 'loss_cross_entropy': Array(0.41040453, dtype=float32)}


  4%|▎         | 35808/1000000 [1:50:46<32:45:48,  8.17it/s]

{'loss': Array(0.41329572, dtype=float32), 'loss_reward': Array(0.0037395, dtype=float32), 'loss_cross_entropy': Array(0.40955624, dtype=float32)}


  4%|▎         | 35820/1000000 [1:50:47<26:34:30, 10.08it/s]

{'loss': Array(0.4239628, dtype=float32), 'loss_reward': Array(0.00382541, dtype=float32), 'loss_cross_entropy': Array(0.42013738, dtype=float32)}


  4%|▎         | 35828/1000000 [1:50:49<39:36:24,  6.76it/s]

{'loss': Array(0.41542712, dtype=float32), 'loss_reward': Array(0.0038731, dtype=float32), 'loss_cross_entropy': Array(0.411554, dtype=float32)}


  4%|▎         | 35840/1000000 [1:50:51<28:21:46,  9.44it/s]

{'loss': Array(0.42903373, dtype=float32), 'loss_reward': Array(0.00384923, dtype=float32), 'loss_cross_entropy': Array(0.4251845, dtype=float32)}


  4%|▎         | 35850/1000000 [1:50:52<27:23:52,  9.78it/s]

{'loss': Array(0.4182372, dtype=float32), 'loss_reward': Array(0.00374748, dtype=float32), 'loss_cross_entropy': Array(0.41448972, dtype=float32)}


  4%|▎         | 35858/1000000 [1:50:54<32:05:32,  8.35it/s]

{'loss': Array(0.4208463, dtype=float32), 'loss_reward': Array(0.0038601, dtype=float32), 'loss_cross_entropy': Array(0.4169863, dtype=float32)}


  4%|▎         | 35869/1000000 [1:50:56<31:43:08,  8.44it/s]

{'loss': Array(0.42093754, dtype=float32), 'loss_reward': Array(0.00394691, dtype=float32), 'loss_cross_entropy': Array(0.4169906, dtype=float32)}


  4%|▎         | 35880/1000000 [1:50:57<27:36:39,  9.70it/s]

{'loss': Array(0.4153448, dtype=float32), 'loss_reward': Array(0.00372296, dtype=float32), 'loss_cross_entropy': Array(0.41162187, dtype=float32)}


  4%|▎         | 35888/1000000 [1:50:59<33:28:11,  8.00it/s]

{'loss': Array(0.4241178, dtype=float32), 'loss_reward': Array(0.00385238, dtype=float32), 'loss_cross_entropy': Array(0.4202654, dtype=float32)}


  4%|▎         | 35899/1000000 [1:51:00<28:13:31,  9.49it/s]

{'loss': Array(0.41990408, dtype=float32), 'loss_reward': Array(0.0038126, dtype=float32), 'loss_cross_entropy': Array(0.4160915, dtype=float32)}


  4%|▎         | 35910/1000000 [1:51:02<29:49:04,  8.98it/s]

{'loss': Array(0.41684875, dtype=float32), 'loss_reward': Array(0.00375194, dtype=float32), 'loss_cross_entropy': Array(0.41309676, dtype=float32)}


  4%|▎         | 35918/1000000 [1:51:04<33:06:32,  8.09it/s]

{'loss': Array(0.41533905, dtype=float32), 'loss_reward': Array(0.00381039, dtype=float32), 'loss_cross_entropy': Array(0.41152865, dtype=float32)}


  4%|▎         | 35930/1000000 [1:51:05<27:32:21,  9.72it/s]

{'loss': Array(0.41336808, dtype=float32), 'loss_reward': Array(0.00376725, dtype=float32), 'loss_cross_entropy': Array(0.40960088, dtype=float32)}


  4%|▎         | 35940/1000000 [1:51:07<38:04:22,  7.03it/s]

{'loss': Array(0.41062784, dtype=float32), 'loss_reward': Array(0.00364714, dtype=float32), 'loss_cross_entropy': Array(0.40698072, dtype=float32)}


  4%|▎         | 35948/1000000 [1:51:09<36:13:31,  7.39it/s]

{'loss': Array(0.42503744, dtype=float32), 'loss_reward': Array(0.00380668, dtype=float32), 'loss_cross_entropy': Array(0.4212307, dtype=float32)}


  4%|▎         | 35960/1000000 [1:51:10<27:38:07,  9.69it/s]

{'loss': Array(0.41507173, dtype=float32), 'loss_reward': Array(0.00375892, dtype=float32), 'loss_cross_entropy': Array(0.4113128, dtype=float32)}


  4%|▎         | 35968/1000000 [1:51:12<32:39:16,  8.20it/s]

{'loss': Array(0.41304922, dtype=float32), 'loss_reward': Array(0.00376471, dtype=float32), 'loss_cross_entropy': Array(0.4092845, dtype=float32)}


  4%|▎         | 35978/1000000 [1:51:14<38:49:13,  6.90it/s]

{'loss': Array(0.4125402, dtype=float32), 'loss_reward': Array(0.00379331, dtype=float32), 'loss_cross_entropy': Array(0.40874693, dtype=float32)}


  4%|▎         | 35990/1000000 [1:51:15<28:19:54,  9.45it/s]

{'loss': Array(0.40860674, dtype=float32), 'loss_reward': Array(0.00379789, dtype=float32), 'loss_cross_entropy': Array(0.40480885, dtype=float32)}


  4%|▎         | 35998/1000000 [1:51:17<33:53:11,  7.90it/s]

{'loss': Array(0.4150838, dtype=float32), 'loss_reward': Array(0.00379395, dtype=float32), 'loss_cross_entropy': Array(0.41128984, dtype=float32)}


  4%|▎         | 36010/1000000 [1:51:28<118:19:42,  2.26it/s]

{'loss': Array(0.4177304, dtype=float32), 'loss_reward': Array(0.00392934, dtype=float32), 'loss_cross_entropy': Array(0.413801, dtype=float32)}


  4%|▎         | 36018/1000000 [1:51:30<72:36:30,  3.69it/s] 

{'loss': Array(0.42950612, dtype=float32), 'loss_reward': Array(0.0039156, dtype=float32), 'loss_cross_entropy': Array(0.42559052, dtype=float32)}


  4%|▎         | 36029/1000000 [1:51:31<38:03:28,  7.04it/s]

{'loss': Array(0.42302057, dtype=float32), 'loss_reward': Array(0.0039059, dtype=float32), 'loss_cross_entropy': Array(0.41911468, dtype=float32)}


  4%|▎         | 36040/1000000 [1:51:33<28:39:26,  9.34it/s]

{'loss': Array(0.41858965, dtype=float32), 'loss_reward': Array(0.00376298, dtype=float32), 'loss_cross_entropy': Array(0.41482663, dtype=float32)}


  4%|▎         | 36050/1000000 [1:51:35<28:04:34,  9.54it/s]

{'loss': Array(0.41949445, dtype=float32), 'loss_reward': Array(0.00381275, dtype=float32), 'loss_cross_entropy': Array(0.4156817, dtype=float32)}


  4%|▎         | 36060/1000000 [1:51:36<32:54:26,  8.14it/s]

{'loss': Array(0.41891453, dtype=float32), 'loss_reward': Array(0.00384301, dtype=float32), 'loss_cross_entropy': Array(0.4150715, dtype=float32)}


  4%|▎         | 36068/1000000 [1:51:38<34:29:34,  7.76it/s]

{'loss': Array(0.42398578, dtype=float32), 'loss_reward': Array(0.00380777, dtype=float32), 'loss_cross_entropy': Array(0.420178, dtype=float32)}


  4%|▎         | 36080/1000000 [1:51:40<27:15:11,  9.82it/s]

{'loss': Array(0.4283844, dtype=float32), 'loss_reward': Array(0.00395647, dtype=float32), 'loss_cross_entropy': Array(0.42442796, dtype=float32)}


  4%|▎         | 36088/1000000 [1:51:41<31:59:48,  8.37it/s]

{'loss': Array(0.40956163, dtype=float32), 'loss_reward': Array(0.00375366, dtype=float32), 'loss_cross_entropy': Array(0.40580794, dtype=float32)}


  4%|▎         | 36099/1000000 [1:51:43<32:44:03,  8.18it/s]

{'loss': Array(0.42750126, dtype=float32), 'loss_reward': Array(0.00376896, dtype=float32), 'loss_cross_entropy': Array(0.42373228, dtype=float32)}


  4%|▎         | 36110/1000000 [1:51:44<26:38:21, 10.05it/s]

{'loss': Array(0.42635345, dtype=float32), 'loss_reward': Array(0.00387053, dtype=float32), 'loss_cross_entropy': Array(0.42248297, dtype=float32)}


  4%|▎         | 36118/1000000 [1:51:46<31:58:38,  8.37it/s]

{'loss': Array(0.42588806, dtype=float32), 'loss_reward': Array(0.00390368, dtype=float32), 'loss_cross_entropy': Array(0.42198443, dtype=float32)}


  4%|▎         | 36129/1000000 [1:51:48<39:39:37,  6.75it/s]

{'loss': Array(0.42299667, dtype=float32), 'loss_reward': Array(0.0037168, dtype=float32), 'loss_cross_entropy': Array(0.41927987, dtype=float32)}


  4%|▎         | 36140/1000000 [1:51:49<29:33:38,  9.06it/s]

{'loss': Array(0.40737483, dtype=float32), 'loss_reward': Array(0.00358656, dtype=float32), 'loss_cross_entropy': Array(0.40378833, dtype=float32)}


  4%|▎         | 36150/1000000 [1:51:51<28:28:33,  9.40it/s]

{'loss': Array(0.41478515, dtype=float32), 'loss_reward': Array(0.00374595, dtype=float32), 'loss_cross_entropy': Array(0.4110392, dtype=float32)}


  4%|▎         | 36158/1000000 [1:51:52<32:46:09,  8.17it/s]

{'loss': Array(0.42465636, dtype=float32), 'loss_reward': Array(0.0037294, dtype=float32), 'loss_cross_entropy': Array(0.4209269, dtype=float32)}


  4%|▎         | 36170/1000000 [1:51:55<34:42:56,  7.71it/s]

{'loss': Array(0.41386983, dtype=float32), 'loss_reward': Array(0.00391607, dtype=float32), 'loss_cross_entropy': Array(0.40995368, dtype=float32)}


  4%|▎         | 36178/1000000 [1:51:56<36:40:34,  7.30it/s]

{'loss': Array(0.4178621, dtype=float32), 'loss_reward': Array(0.00383936, dtype=float32), 'loss_cross_entropy': Array(0.41402274, dtype=float32)}


  4%|▎         | 36190/1000000 [1:51:58<27:59:26,  9.56it/s]

{'loss': Array(0.41089693, dtype=float32), 'loss_reward': Array(0.00380325, dtype=float32), 'loss_cross_entropy': Array(0.40709373, dtype=float32)}


  4%|▎         | 36198/1000000 [1:51:59<33:33:39,  7.98it/s]

{'loss': Array(0.40628558, dtype=float32), 'loss_reward': Array(0.00373248, dtype=float32), 'loss_cross_entropy': Array(0.40255314, dtype=float32)}


  4%|▎         | 36209/1000000 [1:52:01<35:09:22,  7.62it/s]

{'loss': Array(0.41496578, dtype=float32), 'loss_reward': Array(0.00379251, dtype=float32), 'loss_cross_entropy': Array(0.41117325, dtype=float32)}


  4%|▎         | 36220/1000000 [1:52:03<29:38:05,  9.03it/s]

{'loss': Array(0.41170016, dtype=float32), 'loss_reward': Array(0.00383876, dtype=float32), 'loss_cross_entropy': Array(0.40786144, dtype=float32)}


  4%|▎         | 36228/1000000 [1:52:04<34:09:34,  7.84it/s]

{'loss': Array(0.41529772, dtype=float32), 'loss_reward': Array(0.00370039, dtype=float32), 'loss_cross_entropy': Array(0.4115973, dtype=float32)}


  4%|▎         | 36240/1000000 [1:52:06<27:24:33,  9.77it/s]

{'loss': Array(0.4154614, dtype=float32), 'loss_reward': Array(0.00378015, dtype=float32), 'loss_cross_entropy': Array(0.41168126, dtype=float32)}


  4%|▎         | 36250/1000000 [1:52:08<32:17:31,  8.29it/s]

{'loss': Array(0.4040377, dtype=float32), 'loss_reward': Array(0.00366575, dtype=float32), 'loss_cross_entropy': Array(0.40037194, dtype=float32)}


  4%|▎         | 36258/1000000 [1:52:09<35:20:11,  7.58it/s]

{'loss': Array(0.40817448, dtype=float32), 'loss_reward': Array(0.00363241, dtype=float32), 'loss_cross_entropy': Array(0.40454206, dtype=float32)}


  4%|▎         | 36270/1000000 [1:52:11<28:01:36,  9.55it/s]

{'loss': Array(0.40630013, dtype=float32), 'loss_reward': Array(0.00363136, dtype=float32), 'loss_cross_entropy': Array(0.40266877, dtype=float32)}


  4%|▎         | 36278/1000000 [1:52:12<32:57:54,  8.12it/s]

{'loss': Array(0.41170046, dtype=float32), 'loss_reward': Array(0.00376214, dtype=float32), 'loss_cross_entropy': Array(0.4079383, dtype=float32)}


  4%|▎         | 36290/1000000 [1:52:14<29:49:57,  8.97it/s]

{'loss': Array(0.40881416, dtype=float32), 'loss_reward': Array(0.00374216, dtype=float32), 'loss_cross_entropy': Array(0.40507197, dtype=float32)}


  4%|▎         | 36298/1000000 [1:52:16<33:14:09,  8.05it/s]

{'loss': Array(0.40258494, dtype=float32), 'loss_reward': Array(0.00376384, dtype=float32), 'loss_cross_entropy': Array(0.39882115, dtype=float32)}


  4%|▎         | 36310/1000000 [1:52:17<26:47:28,  9.99it/s]

{'loss': Array(0.4067379, dtype=float32), 'loss_reward': Array(0.00382273, dtype=float32), 'loss_cross_entropy': Array(0.4029152, dtype=float32)}


  4%|▎         | 36320/1000000 [1:52:19<38:13:57,  7.00it/s]

{'loss': Array(0.41831478, dtype=float32), 'loss_reward': Array(0.00385123, dtype=float32), 'loss_cross_entropy': Array(0.41446358, dtype=float32)}


  4%|▎         | 36328/1000000 [1:52:21<36:03:25,  7.42it/s]

{'loss': Array(0.40687758, dtype=float32), 'loss_reward': Array(0.00377851, dtype=float32), 'loss_cross_entropy': Array(0.40309906, dtype=float32)}


  4%|▎         | 36340/1000000 [1:52:22<27:29:06,  9.74it/s]

{'loss': Array(0.40186897, dtype=float32), 'loss_reward': Array(0.00363212, dtype=float32), 'loss_cross_entropy': Array(0.3982368, dtype=float32)}


  4%|▎         | 36348/1000000 [1:52:24<31:46:10,  8.43it/s]

{'loss': Array(0.413565, dtype=float32), 'loss_reward': Array(0.00381222, dtype=float32), 'loss_cross_entropy': Array(0.40975285, dtype=float32)}


  4%|▎         | 36359/1000000 [1:52:26<34:38:46,  7.73it/s]

{'loss': Array(0.40858895, dtype=float32), 'loss_reward': Array(0.00367068, dtype=float32), 'loss_cross_entropy': Array(0.40491828, dtype=float32)}


  4%|▎         | 36370/1000000 [1:52:27<27:23:58,  9.77it/s]

{'loss': Array(0.40666577, dtype=float32), 'loss_reward': Array(0.0036898, dtype=float32), 'loss_cross_entropy': Array(0.40297595, dtype=float32)}


  4%|▎         | 36378/1000000 [1:52:29<32:13:39,  8.31it/s]

{'loss': Array(0.4081064, dtype=float32), 'loss_reward': Array(0.00369517, dtype=float32), 'loss_cross_entropy': Array(0.40441123, dtype=float32)}


  4%|▎         | 36390/1000000 [1:52:30<26:47:22,  9.99it/s]

{'loss': Array(0.4037757, dtype=float32), 'loss_reward': Array(0.00381887, dtype=float32), 'loss_cross_entropy': Array(0.39995682, dtype=float32)}


  4%|▎         | 36398/1000000 [1:52:32<39:26:03,  6.79it/s]

{'loss': Array(0.41136917, dtype=float32), 'loss_reward': Array(0.00378972, dtype=float32), 'loss_cross_entropy': Array(0.40757942, dtype=float32)}


  4%|▎         | 36410/1000000 [1:52:34<29:00:24,  9.23it/s]

{'loss': Array(0.41052127, dtype=float32), 'loss_reward': Array(0.00379249, dtype=float32), 'loss_cross_entropy': Array(0.4067288, dtype=float32)}


  4%|▎         | 36418/1000000 [1:52:35<35:15:43,  7.59it/s]

{'loss': Array(0.41033053, dtype=float32), 'loss_reward': Array(0.00384322, dtype=float32), 'loss_cross_entropy': Array(0.4064873, dtype=float32)}


  4%|▎         | 36429/1000000 [1:52:37<29:00:58,  9.22it/s]

{'loss': Array(0.40686637, dtype=float32), 'loss_reward': Array(0.00370196, dtype=float32), 'loss_cross_entropy': Array(0.4031644, dtype=float32)}


  4%|▎         | 36440/1000000 [1:52:39<31:17:36,  8.55it/s]

{'loss': Array(0.41715223, dtype=float32), 'loss_reward': Array(0.00390411, dtype=float32), 'loss_cross_entropy': Array(0.41324812, dtype=float32)}


  4%|▎         | 36448/1000000 [1:52:40<33:45:13,  7.93it/s]

{'loss': Array(0.40624723, dtype=float32), 'loss_reward': Array(0.00382924, dtype=float32), 'loss_cross_entropy': Array(0.402418, dtype=float32)}


  4%|▎         | 36460/1000000 [1:52:42<27:32:59,  9.72it/s]

{'loss': Array(0.4137965, dtype=float32), 'loss_reward': Array(0.00386564, dtype=float32), 'loss_cross_entropy': Array(0.40993086, dtype=float32)}


  4%|▎         | 36468/1000000 [1:52:43<32:22:38,  8.27it/s]

{'loss': Array(0.40644994, dtype=float32), 'loss_reward': Array(0.00381548, dtype=float32), 'loss_cross_entropy': Array(0.40263447, dtype=float32)}


  4%|▎         | 36480/1000000 [1:52:45<29:48:06,  8.98it/s]

{'loss': Array(0.40943298, dtype=float32), 'loss_reward': Array(0.00379002, dtype=float32), 'loss_cross_entropy': Array(0.4056429, dtype=float32)}


  4%|▎         | 36488/1000000 [1:52:47<33:01:45,  8.10it/s]

{'loss': Array(0.42094994, dtype=float32), 'loss_reward': Array(0.00371524, dtype=float32), 'loss_cross_entropy': Array(0.41723472, dtype=float32)}


  4%|▎         | 36499/1000000 [1:52:48<29:10:22,  9.17it/s]

{'loss': Array(0.402208, dtype=float32), 'loss_reward': Array(0.00365568, dtype=float32), 'loss_cross_entropy': Array(0.3985523, dtype=float32)}


  4%|▎         | 36509/1000000 [1:53:00<140:28:56,  1.91it/s]

{'loss': Array(0.41553602, dtype=float32), 'loss_reward': Array(0.00375285, dtype=float32), 'loss_cross_entropy': Array(0.41178322, dtype=float32)}


  4%|▎         | 36520/1000000 [1:53:02<51:45:07,  5.17it/s] 

{'loss': Array(0.4137779, dtype=float32), 'loss_reward': Array(0.00384298, dtype=float32), 'loss_cross_entropy': Array(0.4099349, dtype=float32)}


  4%|▎         | 36528/1000000 [1:53:03<41:54:26,  6.39it/s]

{'loss': Array(0.4128278, dtype=float32), 'loss_reward': Array(0.00380044, dtype=float32), 'loss_cross_entropy': Array(0.4090273, dtype=float32)}


  4%|▎         | 36540/1000000 [1:53:05<29:38:02,  9.03it/s]

{'loss': Array(0.40769053, dtype=float32), 'loss_reward': Array(0.0038606, dtype=float32), 'loss_cross_entropy': Array(0.4038299, dtype=float32)}


  4%|▎         | 36550/1000000 [1:53:07<35:15:18,  7.59it/s]

{'loss': Array(0.41900405, dtype=float32), 'loss_reward': Array(0.00379861, dtype=float32), 'loss_cross_entropy': Array(0.41520545, dtype=float32)}


  4%|▎         | 36558/1000000 [1:53:08<36:57:23,  7.24it/s]

{'loss': Array(0.4113333, dtype=float32), 'loss_reward': Array(0.00376775, dtype=float32), 'loss_cross_entropy': Array(0.4075655, dtype=float32)}


  4%|▎         | 36570/1000000 [1:53:10<27:35:09,  9.70it/s]

{'loss': Array(0.4082233, dtype=float32), 'loss_reward': Array(0.00370718, dtype=float32), 'loss_cross_entropy': Array(0.40451613, dtype=float32)}


  4%|▎         | 36578/1000000 [1:53:11<31:59:58,  8.36it/s]

{'loss': Array(0.41081378, dtype=float32), 'loss_reward': Array(0.00383349, dtype=float32), 'loss_cross_entropy': Array(0.40698028, dtype=float32)}


  4%|▎         | 36589/1000000 [1:53:13<35:14:48,  7.59it/s]

{'loss': Array(0.4106082, dtype=float32), 'loss_reward': Array(0.00375192, dtype=float32), 'loss_cross_entropy': Array(0.4068563, dtype=float32)}


  4%|▎         | 36600/1000000 [1:53:15<28:20:56,  9.44it/s]

{'loss': Array(0.41170463, dtype=float32), 'loss_reward': Array(0.00390257, dtype=float32), 'loss_cross_entropy': Array(0.40780208, dtype=float32)}


  4%|▎         | 36608/1000000 [1:53:16<33:50:19,  7.91it/s]

{'loss': Array(0.4102827, dtype=float32), 'loss_reward': Array(0.00369738, dtype=float32), 'loss_cross_entropy': Array(0.40658528, dtype=float32)}


  4%|▎         | 36620/1000000 [1:53:18<27:39:13,  9.68it/s]

{'loss': Array(0.40872255, dtype=float32), 'loss_reward': Array(0.00386628, dtype=float32), 'loss_cross_entropy': Array(0.4048563, dtype=float32)}


  4%|▎         | 36630/1000000 [1:53:20<32:04:41,  8.34it/s]

{'loss': Array(0.40785724, dtype=float32), 'loss_reward': Array(0.0037614, dtype=float32), 'loss_cross_entropy': Array(0.40409586, dtype=float32)}


  4%|▎         | 36638/1000000 [1:53:21<34:51:18,  7.68it/s]

{'loss': Array(0.4065268, dtype=float32), 'loss_reward': Array(0.00378178, dtype=float32), 'loss_cross_entropy': Array(0.40274507, dtype=float32)}


  4%|▎         | 36649/1000000 [1:53:23<28:52:48,  9.27it/s]

{'loss': Array(0.4131511, dtype=float32), 'loss_reward': Array(0.00372727, dtype=float32), 'loss_cross_entropy': Array(0.4094238, dtype=float32)}


  4%|▎         | 36660/1000000 [1:53:24<26:29:44, 10.10it/s]

{'loss': Array(0.40016127, dtype=float32), 'loss_reward': Array(0.00375495, dtype=float32), 'loss_cross_entropy': Array(0.39640626, dtype=float32)}


  4%|▎         | 36668/1000000 [1:53:26<36:38:42,  7.30it/s]

{'loss': Array(0.40901706, dtype=float32), 'loss_reward': Array(0.00391212, dtype=float32), 'loss_cross_entropy': Array(0.40510494, dtype=float32)}


  4%|▎         | 36680/1000000 [1:53:28<27:27:43,  9.74it/s]

{'loss': Array(0.42358208, dtype=float32), 'loss_reward': Array(0.00387404, dtype=float32), 'loss_cross_entropy': Array(0.41970807, dtype=float32)}


  4%|▎         | 36688/1000000 [1:53:29<33:39:42,  7.95it/s]

{'loss': Array(0.4088668, dtype=float32), 'loss_reward': Array(0.00369788, dtype=float32), 'loss_cross_entropy': Array(0.40516892, dtype=float32)}


  4%|▎         | 36699/1000000 [1:53:31<39:14:38,  6.82it/s]

{'loss': Array(0.41694972, dtype=float32), 'loss_reward': Array(0.00372308, dtype=float32), 'loss_cross_entropy': Array(0.41322666, dtype=float32)}


  4%|▎         | 36710/1000000 [1:53:33<28:51:08,  9.27it/s]

{'loss': Array(0.41826138, dtype=float32), 'loss_reward': Array(0.00382904, dtype=float32), 'loss_cross_entropy': Array(0.41443235, dtype=float32)}


  4%|▎         | 36718/1000000 [1:53:34<33:24:36,  8.01it/s]

{'loss': Array(0.40926543, dtype=float32), 'loss_reward': Array(0.0037873, dtype=float32), 'loss_cross_entropy': Array(0.40547806, dtype=float32)}


  4%|▎         | 36730/1000000 [1:53:36<27:47:56,  9.63it/s]

{'loss': Array(0.40966684, dtype=float32), 'loss_reward': Array(0.00380153, dtype=float32), 'loss_cross_entropy': Array(0.40586525, dtype=float32)}


  4%|▎         | 36740/1000000 [1:53:38<34:20:03,  7.79it/s]

{'loss': Array(0.400348, dtype=float32), 'loss_reward': Array(0.00369428, dtype=float32), 'loss_cross_entropy': Array(0.39665374, dtype=float32)}


  4%|▎         | 36748/1000000 [1:53:39<35:52:05,  7.46it/s]

{'loss': Array(0.40853748, dtype=float32), 'loss_reward': Array(0.0036486, dtype=float32), 'loss_cross_entropy': Array(0.40488893, dtype=float32)}


  4%|▎         | 36759/1000000 [1:53:41<29:18:07,  9.13it/s]

{'loss': Array(0.39496437, dtype=float32), 'loss_reward': Array(0.00361154, dtype=float32), 'loss_cross_entropy': Array(0.39135286, dtype=float32)}


  4%|▎         | 36769/1000000 [1:53:42<27:13:47,  9.83it/s]

{'loss': Array(0.40611148, dtype=float32), 'loss_reward': Array(0.00364824, dtype=float32), 'loss_cross_entropy': Array(0.40246326, dtype=float32)}


  4%|▎         | 36779/1000000 [1:53:44<34:57:59,  7.65it/s]

{'loss': Array(0.39878032, dtype=float32), 'loss_reward': Array(0.00370327, dtype=float32), 'loss_cross_entropy': Array(0.39507708, dtype=float32)}


  4%|▎         | 36790/1000000 [1:53:46<28:20:57,  9.44it/s]

{'loss': Array(0.39854938, dtype=float32), 'loss_reward': Array(0.00373908, dtype=float32), 'loss_cross_entropy': Array(0.3948103, dtype=float32)}


  4%|▎         | 36798/1000000 [1:53:47<34:20:55,  7.79it/s]

{'loss': Array(0.39905578, dtype=float32), 'loss_reward': Array(0.00371995, dtype=float32), 'loss_cross_entropy': Array(0.39533582, dtype=float32)}


  4%|▎         | 36810/1000000 [1:53:49<27:37:14,  9.69it/s]

{'loss': Array(0.4001213, dtype=float32), 'loss_reward': Array(0.00366142, dtype=float32), 'loss_cross_entropy': Array(0.3964599, dtype=float32)}


  4%|▎         | 36820/1000000 [1:53:51<31:51:38,  8.40it/s]

{'loss': Array(0.39255786, dtype=float32), 'loss_reward': Array(0.00367293, dtype=float32), 'loss_cross_entropy': Array(0.38888493, dtype=float32)}


  4%|▎         | 36828/1000000 [1:53:52<34:00:43,  7.87it/s]

{'loss': Array(0.40554762, dtype=float32), 'loss_reward': Array(0.00362024, dtype=float32), 'loss_cross_entropy': Array(0.40192738, dtype=float32)}


  4%|▎         | 36840/1000000 [1:53:54<28:23:00,  9.43it/s]

{'loss': Array(0.41227648, dtype=float32), 'loss_reward': Array(0.00377727, dtype=float32), 'loss_cross_entropy': Array(0.4084992, dtype=float32)}


  4%|▎         | 36848/1000000 [1:53:56<33:28:03,  7.99it/s]

{'loss': Array(0.41081604, dtype=float32), 'loss_reward': Array(0.00383544, dtype=float32), 'loss_cross_entropy': Array(0.40698057, dtype=float32)}


  4%|▎         | 36860/1000000 [1:53:57<30:15:51,  8.84it/s]

{'loss': Array(0.40475193, dtype=float32), 'loss_reward': Array(0.00370365, dtype=float32), 'loss_cross_entropy': Array(0.40104824, dtype=float32)}


  4%|▎         | 36868/1000000 [1:53:59<33:34:49,  7.97it/s]

{'loss': Array(0.403577, dtype=float32), 'loss_reward': Array(0.0037445, dtype=float32), 'loss_cross_entropy': Array(0.39983246, dtype=float32)}


  4%|▎         | 36880/1000000 [1:54:01<26:40:08, 10.03it/s]

{'loss': Array(0.40045586, dtype=float32), 'loss_reward': Array(0.00361311, dtype=float32), 'loss_cross_entropy': Array(0.39684278, dtype=float32)}


  4%|▎         | 36890/1000000 [1:54:02<38:25:57,  6.96it/s]

{'loss': Array(0.40504193, dtype=float32), 'loss_reward': Array(0.00373182, dtype=float32), 'loss_cross_entropy': Array(0.40131012, dtype=float32)}


  4%|▎         | 36899/1000000 [1:54:04<33:51:43,  7.90it/s]

{'loss': Array(0.4019928, dtype=float32), 'loss_reward': Array(0.00368598, dtype=float32), 'loss_cross_entropy': Array(0.3983068, dtype=float32)}


  4%|▎         | 36910/1000000 [1:54:06<27:31:16,  9.72it/s]

{'loss': Array(0.40164432, dtype=float32), 'loss_reward': Array(0.00368115, dtype=float32), 'loss_cross_entropy': Array(0.39796323, dtype=float32)}


  4%|▎         | 36918/1000000 [1:54:07<32:36:52,  8.20it/s]

{'loss': Array(0.40224418, dtype=float32), 'loss_reward': Array(0.00375589, dtype=float32), 'loss_cross_entropy': Array(0.3984883, dtype=float32)}


  4%|▎         | 36930/1000000 [1:54:09<33:18:47,  8.03it/s]

{'loss': Array(0.40377674, dtype=float32), 'loss_reward': Array(0.00373259, dtype=float32), 'loss_cross_entropy': Array(0.40004417, dtype=float32)}


  4%|▎         | 36938/1000000 [1:54:10<34:49:39,  7.68it/s]

{'loss': Array(0.41188794, dtype=float32), 'loss_reward': Array(0.00384033, dtype=float32), 'loss_cross_entropy': Array(0.4080476, dtype=float32)}


  4%|▎         | 36950/1000000 [1:54:12<27:27:51,  9.74it/s]

{'loss': Array(0.40662384, dtype=float32), 'loss_reward': Array(0.00367655, dtype=float32), 'loss_cross_entropy': Array(0.40294725, dtype=float32)}


  4%|▎         | 36958/1000000 [1:54:13<32:23:29,  8.26it/s]

{'loss': Array(0.4086032, dtype=float32), 'loss_reward': Array(0.00371417, dtype=float32), 'loss_cross_entropy': Array(0.40488896, dtype=float32)}


  4%|▎         | 36969/1000000 [1:54:15<34:44:07,  7.70it/s]

{'loss': Array(0.40329716, dtype=float32), 'loss_reward': Array(0.00364829, dtype=float32), 'loss_cross_entropy': Array(0.39964885, dtype=float32)}


  4%|▎         | 36980/1000000 [1:54:17<27:47:04,  9.63it/s]

{'loss': Array(0.41321397, dtype=float32), 'loss_reward': Array(0.00365068, dtype=float32), 'loss_cross_entropy': Array(0.4095633, dtype=float32)}


  4%|▎         | 36988/1000000 [1:54:18<32:00:54,  8.36it/s]

{'loss': Array(0.40380818, dtype=float32), 'loss_reward': Array(0.00372014, dtype=float32), 'loss_cross_entropy': Array(0.40008807, dtype=float32)}


  4%|▎         | 37000/1000000 [1:54:20<26:23:46, 10.13it/s]

{'loss': Array(0.4119464, dtype=float32), 'loss_reward': Array(0.00365522, dtype=float32), 'loss_cross_entropy': Array(0.4082911, dtype=float32)}


  4%|▎         | 37009/1000000 [1:54:32<138:25:12,  1.93it/s]

{'loss': Array(0.40671316, dtype=float32), 'loss_reward': Array(0.0037601, dtype=float32), 'loss_cross_entropy': Array(0.40295306, dtype=float32)}


  4%|▎         | 37019/1000000 [1:54:33<53:31:28,  5.00it/s] 

{'loss': Array(0.40546915, dtype=float32), 'loss_reward': Array(0.00383556, dtype=float32), 'loss_cross_entropy': Array(0.40163356, dtype=float32)}


  4%|▎         | 37030/1000000 [1:54:35<32:22:38,  8.26it/s]

{'loss': Array(0.4082232, dtype=float32), 'loss_reward': Array(0.00378741, dtype=float32), 'loss_cross_entropy': Array(0.40443584, dtype=float32)}


  4%|▎         | 37038/1000000 [1:54:36<35:17:15,  7.58it/s]

{'loss': Array(0.4058435, dtype=float32), 'loss_reward': Array(0.00379337, dtype=float32), 'loss_cross_entropy': Array(0.40205008, dtype=float32)}


  4%|▎         | 37049/1000000 [1:54:38<33:10:06,  8.06it/s]

{'loss': Array(0.40405193, dtype=float32), 'loss_reward': Array(0.00372982, dtype=float32), 'loss_cross_entropy': Array(0.4003221, dtype=float32)}


  4%|▎         | 37060/1000000 [1:54:40<27:17:01,  9.80it/s]

{'loss': Array(0.4048369, dtype=float32), 'loss_reward': Array(0.00369173, dtype=float32), 'loss_cross_entropy': Array(0.4011452, dtype=float32)}


  4%|▎         | 37068/1000000 [1:54:41<32:09:27,  8.32it/s]

{'loss': Array(0.40360162, dtype=float32), 'loss_reward': Array(0.00356401, dtype=float32), 'loss_cross_entropy': Array(0.4000376, dtype=float32)}


  4%|▎         | 37079/1000000 [1:54:43<38:38:21,  6.92it/s]

{'loss': Array(0.4057594, dtype=float32), 'loss_reward': Array(0.00372129, dtype=float32), 'loss_cross_entropy': Array(0.40203807, dtype=float32)}


  4%|▎         | 37090/1000000 [1:54:45<28:23:18,  9.42it/s]

{'loss': Array(0.40706578, dtype=float32), 'loss_reward': Array(0.00371342, dtype=float32), 'loss_cross_entropy': Array(0.40335235, dtype=float32)}


  4%|▎         | 37098/1000000 [1:54:46<34:39:47,  7.72it/s]

{'loss': Array(0.39818424, dtype=float32), 'loss_reward': Array(0.00368114, dtype=float32), 'loss_cross_entropy': Array(0.39450312, dtype=float32)}


  4%|▎         | 37110/1000000 [1:54:48<27:14:44,  9.82it/s]

{'loss': Array(0.40691993, dtype=float32), 'loss_reward': Array(0.00368468, dtype=float32), 'loss_cross_entropy': Array(0.40323526, dtype=float32)}


  4%|▎         | 37120/1000000 [1:54:50<34:31:07,  7.75it/s]

{'loss': Array(0.40098807, dtype=float32), 'loss_reward': Array(0.00375165, dtype=float32), 'loss_cross_entropy': Array(0.39723644, dtype=float32)}


  4%|▎         | 37130/1000000 [1:54:51<30:25:16,  8.79it/s]

{'loss': Array(0.40558743, dtype=float32), 'loss_reward': Array(0.00367997, dtype=float32), 'loss_cross_entropy': Array(0.40190744, dtype=float32)}


  4%|▎         | 37138/1000000 [1:54:53<33:16:37,  8.04it/s]

{'loss': Array(0.39633784, dtype=float32), 'loss_reward': Array(0.0038388, dtype=float32), 'loss_cross_entropy': Array(0.39249903, dtype=float32)}


  4%|▎         | 37150/1000000 [1:54:54<27:56:48,  9.57it/s]

{'loss': Array(0.40439853, dtype=float32), 'loss_reward': Array(0.00373207, dtype=float32), 'loss_cross_entropy': Array(0.40066648, dtype=float32)}


  4%|▎         | 37160/1000000 [1:54:56<33:16:26,  8.04it/s]

{'loss': Array(0.40214005, dtype=float32), 'loss_reward': Array(0.00380441, dtype=float32), 'loss_cross_entropy': Array(0.39833567, dtype=float32)}


  4%|▎         | 37170/1000000 [1:54:58<29:35:25,  9.04it/s]

{'loss': Array(0.40061283, dtype=float32), 'loss_reward': Array(0.00378951, dtype=float32), 'loss_cross_entropy': Array(0.39682335, dtype=float32)}


  4%|▎         | 37178/1000000 [1:54:59<32:41:57,  8.18it/s]

{'loss': Array(0.39866844, dtype=float32), 'loss_reward': Array(0.00360913, dtype=float32), 'loss_cross_entropy': Array(0.39505932, dtype=float32)}


  4%|▎         | 37190/1000000 [1:55:01<27:34:29,  9.70it/s]

{'loss': Array(0.4003663, dtype=float32), 'loss_reward': Array(0.00362497, dtype=float32), 'loss_cross_entropy': Array(0.3967413, dtype=float32)}


  4%|▎         | 37200/1000000 [1:55:03<32:24:40,  8.25it/s]

{'loss': Array(0.3924955, dtype=float32), 'loss_reward': Array(0.00371585, dtype=float32), 'loss_cross_entropy': Array(0.38877967, dtype=float32)}


  4%|▎         | 37210/1000000 [1:55:04<28:35:52,  9.35it/s]

{'loss': Array(0.40258265, dtype=float32), 'loss_reward': Array(0.0036794, dtype=float32), 'loss_cross_entropy': Array(0.39890325, dtype=float32)}


  4%|▎         | 37218/1000000 [1:55:06<33:22:09,  8.01it/s]

{'loss': Array(0.3943489, dtype=float32), 'loss_reward': Array(0.00368526, dtype=float32), 'loss_cross_entropy': Array(0.39066362, dtype=float32)}


  4%|▎         | 37230/1000000 [1:55:08<27:24:52,  9.76it/s]

{'loss': Array(0.3938176, dtype=float32), 'loss_reward': Array(0.00376216, dtype=float32), 'loss_cross_entropy': Array(0.39005548, dtype=float32)}


  4%|▎         | 37238/1000000 [1:55:09<37:17:11,  7.17it/s]

{'loss': Array(0.3978693, dtype=float32), 'loss_reward': Array(0.00382898, dtype=float32), 'loss_cross_entropy': Array(0.3940403, dtype=float32)}


  4%|▎         | 37250/1000000 [1:55:11<28:30:55,  9.38it/s]

{'loss': Array(0.40217152, dtype=float32), 'loss_reward': Array(0.00371814, dtype=float32), 'loss_cross_entropy': Array(0.39845335, dtype=float32)}


  4%|▎         | 37260/1000000 [1:55:13<28:23:05,  9.42it/s]

{'loss': Array(0.3960249, dtype=float32), 'loss_reward': Array(0.0036736, dtype=float32), 'loss_cross_entropy': Array(0.39235124, dtype=float32)}


  4%|▎         | 37270/1000000 [1:55:15<39:08:55,  6.83it/s]

{'loss': Array(0.3954097, dtype=float32), 'loss_reward': Array(0.00363936, dtype=float32), 'loss_cross_entropy': Array(0.39177036, dtype=float32)}


  4%|▎         | 37278/1000000 [1:55:16<35:58:56,  7.43it/s]

{'loss': Array(0.3976247, dtype=float32), 'loss_reward': Array(0.00370477, dtype=float32), 'loss_cross_entropy': Array(0.39391997, dtype=float32)}


  4%|▎         | 37290/1000000 [1:55:18<27:26:23,  9.75it/s]

{'loss': Array(0.40146923, dtype=float32), 'loss_reward': Array(0.00380411, dtype=float32), 'loss_cross_entropy': Array(0.39766517, dtype=float32)}


  4%|▎         | 37298/1000000 [1:55:19<33:02:55,  8.09it/s]

{'loss': Array(0.39455006, dtype=float32), 'loss_reward': Array(0.00368941, dtype=float32), 'loss_cross_entropy': Array(0.39086065, dtype=float32)}


  4%|▎         | 37310/1000000 [1:55:21<33:34:47,  7.96it/s]

{'loss': Array(0.40267253, dtype=float32), 'loss_reward': Array(0.00370931, dtype=float32), 'loss_cross_entropy': Array(0.3989632, dtype=float32)}


  4%|▎         | 37318/1000000 [1:55:22<34:50:09,  7.68it/s]

{'loss': Array(0.40000668, dtype=float32), 'loss_reward': Array(0.00370338, dtype=float32), 'loss_cross_entropy': Array(0.39630333, dtype=float32)}


  4%|▎         | 37330/1000000 [1:55:24<27:33:59,  9.70it/s]

{'loss': Array(0.39372322, dtype=float32), 'loss_reward': Array(0.0036516, dtype=float32), 'loss_cross_entropy': Array(0.3900716, dtype=float32)}


  4%|▎         | 37338/1000000 [1:55:26<33:16:06,  8.04it/s]

{'loss': Array(0.3857288, dtype=float32), 'loss_reward': Array(0.0036298, dtype=float32), 'loss_cross_entropy': Array(0.382099, dtype=float32)}


  4%|▎         | 37349/1000000 [1:55:28<35:41:34,  7.49it/s]

{'loss': Array(0.39992246, dtype=float32), 'loss_reward': Array(0.00367579, dtype=float32), 'loss_cross_entropy': Array(0.39624667, dtype=float32)}


  4%|▎         | 37359/1000000 [1:55:29<30:07:13,  8.88it/s]

{'loss': Array(0.39783695, dtype=float32), 'loss_reward': Array(0.0036464, dtype=float32), 'loss_cross_entropy': Array(0.39419058, dtype=float32)}


  4%|▎         | 37369/1000000 [1:55:31<28:12:03,  9.48it/s]

{'loss': Array(0.40079007, dtype=float32), 'loss_reward': Array(0.00374598, dtype=float32), 'loss_cross_entropy': Array(0.39704412, dtype=float32)}


  4%|▎         | 37380/1000000 [1:55:32<27:13:40,  9.82it/s]

{'loss': Array(0.398842, dtype=float32), 'loss_reward': Array(0.00358675, dtype=float32), 'loss_cross_entropy': Array(0.3952553, dtype=float32)}


  4%|▎         | 37390/1000000 [1:55:34<31:59:48,  8.36it/s]

{'loss': Array(0.39512834, dtype=float32), 'loss_reward': Array(0.00365007, dtype=float32), 'loss_cross_entropy': Array(0.39147827, dtype=float32)}


  4%|▎         | 37398/1000000 [1:55:36<34:01:08,  7.86it/s]

{'loss': Array(0.39849386, dtype=float32), 'loss_reward': Array(0.00370571, dtype=float32), 'loss_cross_entropy': Array(0.39478815, dtype=float32)}


  4%|▎         | 37410/1000000 [1:55:37<27:09:29,  9.85it/s]

{'loss': Array(0.40091953, dtype=float32), 'loss_reward': Array(0.00367645, dtype=float32), 'loss_cross_entropy': Array(0.39724302, dtype=float32)}


  4%|▎         | 37418/1000000 [1:55:39<32:24:12,  8.25it/s]

{'loss': Array(0.39581683, dtype=float32), 'loss_reward': Array(0.00371069, dtype=float32), 'loss_cross_entropy': Array(0.39210615, dtype=float32)}


  4%|▎         | 37430/1000000 [1:55:41<29:47:04,  8.98it/s]

{'loss': Array(0.38932928, dtype=float32), 'loss_reward': Array(0.00374394, dtype=float32), 'loss_cross_entropy': Array(0.38558534, dtype=float32)}


  4%|▎         | 37438/1000000 [1:55:42<34:15:32,  7.80it/s]

{'loss': Array(0.39833626, dtype=float32), 'loss_reward': Array(0.00369143, dtype=float32), 'loss_cross_entropy': Array(0.39464483, dtype=float32)}


  4%|▎         | 37450/1000000 [1:55:44<27:58:10,  9.56it/s]

{'loss': Array(0.40324458, dtype=float32), 'loss_reward': Array(0.00369586, dtype=float32), 'loss_cross_entropy': Array(0.39954874, dtype=float32)}


  4%|▎         | 37460/1000000 [1:55:46<39:43:47,  6.73it/s]

{'loss': Array(0.40214214, dtype=float32), 'loss_reward': Array(0.00381191, dtype=float32), 'loss_cross_entropy': Array(0.3983302, dtype=float32)}


  4%|▎         | 37468/1000000 [1:55:47<37:43:48,  7.09it/s]

{'loss': Array(0.39438498, dtype=float32), 'loss_reward': Array(0.00374111, dtype=float32), 'loss_cross_entropy': Array(0.39064392, dtype=float32)}


  4%|▎         | 37480/1000000 [1:55:49<27:58:24,  9.56it/s]

{'loss': Array(0.39194864, dtype=float32), 'loss_reward': Array(0.00367733, dtype=float32), 'loss_cross_entropy': Array(0.3882713, dtype=float32)}


  4%|▎         | 37488/1000000 [1:55:50<33:26:34,  7.99it/s]

{'loss': Array(0.39553925, dtype=float32), 'loss_reward': Array(0.00365314, dtype=float32), 'loss_cross_entropy': Array(0.39188612, dtype=float32)}


  4%|▎         | 37499/1000000 [1:55:52<35:36:53,  7.51it/s]

{'loss': Array(0.39388952, dtype=float32), 'loss_reward': Array(0.00363047, dtype=float32), 'loss_cross_entropy': Array(0.3902591, dtype=float32)}


  4%|▍         | 37510/1000000 [1:56:04<123:49:40,  2.16it/s]

{'loss': Array(0.3908711, dtype=float32), 'loss_reward': Array(0.00366736, dtype=float32), 'loss_cross_entropy': Array(0.3872037, dtype=float32)}


  4%|▍         | 37518/1000000 [1:56:05<66:38:50,  4.01it/s] 

{'loss': Array(0.40240374, dtype=float32), 'loss_reward': Array(0.00375762, dtype=float32), 'loss_cross_entropy': Array(0.3986461, dtype=float32)}


  4%|▍         | 37530/1000000 [1:56:07<34:26:15,  7.76it/s]

{'loss': Array(0.39457077, dtype=float32), 'loss_reward': Array(0.0035971, dtype=float32), 'loss_cross_entropy': Array(0.39097366, dtype=float32)}


  4%|▍         | 37538/1000000 [1:56:09<41:46:53,  6.40it/s]

{'loss': Array(0.40609714, dtype=float32), 'loss_reward': Array(0.00371184, dtype=float32), 'loss_cross_entropy': Array(0.4023853, dtype=float32)}


  4%|▍         | 37550/1000000 [1:56:10<28:46:15,  9.29it/s]

{'loss': Array(0.39732072, dtype=float32), 'loss_reward': Array(0.0036838, dtype=float32), 'loss_cross_entropy': Array(0.3936369, dtype=float32)}


  4%|▍         | 37558/1000000 [1:56:12<33:30:14,  7.98it/s]

{'loss': Array(0.39751545, dtype=float32), 'loss_reward': Array(0.00382243, dtype=float32), 'loss_cross_entropy': Array(0.393693, dtype=float32)}


  4%|▍         | 37570/1000000 [1:56:13<28:16:46,  9.45it/s]

{'loss': Array(0.40372345, dtype=float32), 'loss_reward': Array(0.00373929, dtype=float32), 'loss_cross_entropy': Array(0.39998412, dtype=float32)}


  4%|▍         | 37580/1000000 [1:56:15<32:26:54,  8.24it/s]

{'loss': Array(0.3950094, dtype=float32), 'loss_reward': Array(0.00377924, dtype=float32), 'loss_cross_entropy': Array(0.39123017, dtype=float32)}


  4%|▍         | 37588/1000000 [1:56:17<34:07:49,  7.83it/s]

{'loss': Array(0.40231052, dtype=float32), 'loss_reward': Array(0.00382513, dtype=float32), 'loss_cross_entropy': Array(0.3984854, dtype=float32)}


  4%|▍         | 37600/1000000 [1:56:18<27:08:40,  9.85it/s]

{'loss': Array(0.3971235, dtype=float32), 'loss_reward': Array(0.00368056, dtype=float32), 'loss_cross_entropy': Array(0.3934429, dtype=float32)}


  4%|▍         | 37608/1000000 [1:56:20<32:17:43,  8.28it/s]

{'loss': Array(0.40196, dtype=float32), 'loss_reward': Array(0.00363082, dtype=float32), 'loss_cross_entropy': Array(0.39832917, dtype=float32)}


  4%|▍         | 37619/1000000 [1:56:22<32:14:28,  8.29it/s]

{'loss': Array(0.39009246, dtype=float32), 'loss_reward': Array(0.00362088, dtype=float32), 'loss_cross_entropy': Array(0.38647157, dtype=float32)}


  4%|▍         | 37628/1000000 [1:56:23<30:45:10,  8.69it/s]

{'loss': Array(0.3967093, dtype=float32), 'loss_reward': Array(0.00377475, dtype=float32), 'loss_cross_entropy': Array(0.39293453, dtype=float32)}


  4%|▍         | 37640/1000000 [1:56:25<26:03:18, 10.26it/s]

{'loss': Array(0.38916704, dtype=float32), 'loss_reward': Array(0.00370792, dtype=float32), 'loss_cross_entropy': Array(0.38545915, dtype=float32)}


  4%|▍         | 37649/1000000 [1:56:27<40:22:31,  6.62it/s]

{'loss': Array(0.3980128, dtype=float32), 'loss_reward': Array(0.00370678, dtype=float32), 'loss_cross_entropy': Array(0.39430603, dtype=float32)}


  4%|▍         | 37660/1000000 [1:56:28<29:16:41,  9.13it/s]

{'loss': Array(0.39177713, dtype=float32), 'loss_reward': Array(0.00360846, dtype=float32), 'loss_cross_entropy': Array(0.38816866, dtype=float32)}


  4%|▍         | 37668/1000000 [1:56:30<34:20:52,  7.78it/s]

{'loss': Array(0.39975733, dtype=float32), 'loss_reward': Array(0.00375143, dtype=float32), 'loss_cross_entropy': Array(0.39600587, dtype=float32)}


  4%|▍         | 37680/1000000 [1:56:32<28:17:33,  9.45it/s]

{'loss': Array(0.39460263, dtype=float32), 'loss_reward': Array(0.00372887, dtype=float32), 'loss_cross_entropy': Array(0.39087373, dtype=float32)}


  4%|▍         | 37689/1000000 [1:56:33<38:14:40,  6.99it/s]

{'loss': Array(0.39600143, dtype=float32), 'loss_reward': Array(0.0036813, dtype=float32), 'loss_cross_entropy': Array(0.39232013, dtype=float32)}


  4%|▍         | 37700/1000000 [1:56:35<28:24:17,  9.41it/s]

{'loss': Array(0.39775518, dtype=float32), 'loss_reward': Array(0.00371435, dtype=float32), 'loss_cross_entropy': Array(0.39404082, dtype=float32)}


  4%|▍         | 37708/1000000 [1:56:37<33:39:19,  7.94it/s]

{'loss': Array(0.39696458, dtype=float32), 'loss_reward': Array(0.00370792, dtype=float32), 'loss_cross_entropy': Array(0.39325663, dtype=float32)}


  4%|▍         | 37719/1000000 [1:56:38<29:15:11,  9.14it/s]

{'loss': Array(0.3907901, dtype=float32), 'loss_reward': Array(0.00370021, dtype=float32), 'loss_cross_entropy': Array(0.38708985, dtype=float32)}


  4%|▍         | 37729/1000000 [1:56:40<34:53:52,  7.66it/s]

{'loss': Array(0.3912424, dtype=float32), 'loss_reward': Array(0.00359201, dtype=float32), 'loss_cross_entropy': Array(0.3876504, dtype=float32)}


  4%|▍         | 37740/1000000 [1:56:42<27:44:10,  9.64it/s]

{'loss': Array(0.3904707, dtype=float32), 'loss_reward': Array(0.00368386, dtype=float32), 'loss_cross_entropy': Array(0.38678688, dtype=float32)}


  4%|▍         | 37748/1000000 [1:56:43<32:44:28,  8.16it/s]

{'loss': Array(0.39935035, dtype=float32), 'loss_reward': Array(0.00382584, dtype=float32), 'loss_cross_entropy': Array(0.39552447, dtype=float32)}


  4%|▍         | 37760/1000000 [1:56:45<27:03:16,  9.88it/s]

{'loss': Array(0.38425738, dtype=float32), 'loss_reward': Array(0.00360148, dtype=float32), 'loss_cross_entropy': Array(0.38065588, dtype=float32)}


  4%|▍         | 37770/1000000 [1:56:47<32:45:54,  8.16it/s]

{'loss': Array(0.3850622, dtype=float32), 'loss_reward': Array(0.00373492, dtype=float32), 'loss_cross_entropy': Array(0.38132724, dtype=float32)}


  4%|▍         | 37780/1000000 [1:56:48<29:39:27,  9.01it/s]

{'loss': Array(0.39129496, dtype=float32), 'loss_reward': Array(0.00362903, dtype=float32), 'loss_cross_entropy': Array(0.38766596, dtype=float32)}


  4%|▍         | 37788/1000000 [1:56:50<33:05:34,  8.08it/s]

{'loss': Array(0.38265556, dtype=float32), 'loss_reward': Array(0.00356775, dtype=float32), 'loss_cross_entropy': Array(0.3790878, dtype=float32)}


  4%|▍         | 37800/1000000 [1:56:51<27:02:15,  9.89it/s]

{'loss': Array(0.39925575, dtype=float32), 'loss_reward': Array(0.00367298, dtype=float32), 'loss_cross_entropy': Array(0.39558274, dtype=float32)}


  4%|▍         | 37808/1000000 [1:56:53<37:27:00,  7.14it/s]

{'loss': Array(0.39203367, dtype=float32), 'loss_reward': Array(0.00361216, dtype=float32), 'loss_cross_entropy': Array(0.38842154, dtype=float32)}


  4%|▍         | 37820/1000000 [1:56:55<28:17:56,  9.44it/s]

{'loss': Array(0.40080872, dtype=float32), 'loss_reward': Array(0.00375905, dtype=float32), 'loss_cross_entropy': Array(0.39704967, dtype=float32)}


  4%|▍         | 37828/1000000 [1:56:56<33:39:44,  7.94it/s]

{'loss': Array(0.39093715, dtype=float32), 'loss_reward': Array(0.0035525, dtype=float32), 'loss_cross_entropy': Array(0.38738462, dtype=float32)}


  4%|▍         | 37839/1000000 [1:56:58<40:23:59,  6.62it/s]

{'loss': Array(0.38095626, dtype=float32), 'loss_reward': Array(0.00360941, dtype=float32), 'loss_cross_entropy': Array(0.37734684, dtype=float32)}


  4%|▍         | 37850/1000000 [1:57:00<28:52:02,  9.26it/s]

{'loss': Array(0.38707545, dtype=float32), 'loss_reward': Array(0.00376427, dtype=float32), 'loss_cross_entropy': Array(0.38331118, dtype=float32)}


  4%|▍         | 37858/1000000 [1:57:01<33:06:22,  8.07it/s]

{'loss': Array(0.39138627, dtype=float32), 'loss_reward': Array(0.00367637, dtype=float32), 'loss_cross_entropy': Array(0.3877099, dtype=float32)}


  4%|▍         | 37869/1000000 [1:57:03<28:10:35,  9.49it/s]

{'loss': Array(0.38755292, dtype=float32), 'loss_reward': Array(0.00363696, dtype=float32), 'loss_cross_entropy': Array(0.38391596, dtype=float32)}


  4%|▍         | 37880/1000000 [1:57:05<33:14:27,  8.04it/s]

{'loss': Array(0.39459434, dtype=float32), 'loss_reward': Array(0.00373871, dtype=float32), 'loss_cross_entropy': Array(0.39085564, dtype=float32)}


  4%|▍         | 37890/1000000 [1:57:06<28:46:35,  9.29it/s]

{'loss': Array(0.38255256, dtype=float32), 'loss_reward': Array(0.00353232, dtype=float32), 'loss_cross_entropy': Array(0.3790202, dtype=float32)}


  4%|▍         | 37898/1000000 [1:57:08<33:21:54,  8.01it/s]

{'loss': Array(0.39375088, dtype=float32), 'loss_reward': Array(0.00377756, dtype=float32), 'loss_cross_entropy': Array(0.38997328, dtype=float32)}


  4%|▍         | 37910/1000000 [1:57:09<26:39:33, 10.02it/s]

{'loss': Array(0.38530567, dtype=float32), 'loss_reward': Array(0.0036381, dtype=float32), 'loss_cross_entropy': Array(0.38166758, dtype=float32)}


  4%|▍         | 37918/1000000 [1:57:11<38:38:08,  6.92it/s]

{'loss': Array(0.3893069, dtype=float32), 'loss_reward': Array(0.00359456, dtype=float32), 'loss_cross_entropy': Array(0.38571236, dtype=float32)}


  4%|▍         | 37930/1000000 [1:57:13<28:39:52,  9.32it/s]

{'loss': Array(0.37976483, dtype=float32), 'loss_reward': Array(0.00363879, dtype=float32), 'loss_cross_entropy': Array(0.37612602, dtype=float32)}


  4%|▍         | 37938/1000000 [1:57:14<33:37:20,  7.95it/s]

{'loss': Array(0.3891299, dtype=float32), 'loss_reward': Array(0.00360068, dtype=float32), 'loss_cross_entropy': Array(0.38552922, dtype=float32)}


  4%|▍         | 37950/1000000 [1:57:16<27:35:55,  9.68it/s]

{'loss': Array(0.3872004, dtype=float32), 'loss_reward': Array(0.003568, dtype=float32), 'loss_cross_entropy': Array(0.38363236, dtype=float32)}


  4%|▍         | 37960/1000000 [1:57:18<31:25:31,  8.50it/s]

{'loss': Array(0.391798, dtype=float32), 'loss_reward': Array(0.00367388, dtype=float32), 'loss_cross_entropy': Array(0.38812408, dtype=float32)}


  4%|▍         | 37968/1000000 [1:57:19<33:59:06,  7.86it/s]

{'loss': Array(0.38790134, dtype=float32), 'loss_reward': Array(0.00348427, dtype=float32), 'loss_cross_entropy': Array(0.38441706, dtype=float32)}


  4%|▍         | 37980/1000000 [1:57:21<26:56:03,  9.92it/s]

{'loss': Array(0.3868658, dtype=float32), 'loss_reward': Array(0.00368178, dtype=float32), 'loss_cross_entropy': Array(0.38318402, dtype=float32)}


  4%|▍         | 37988/1000000 [1:57:22<34:00:50,  7.86it/s]

{'loss': Array(0.38216582, dtype=float32), 'loss_reward': Array(0.00365871, dtype=float32), 'loss_cross_entropy': Array(0.3785071, dtype=float32)}


  4%|▍         | 38000/1000000 [1:57:24<30:21:43,  8.80it/s]

{'loss': Array(0.39234146, dtype=float32), 'loss_reward': Array(0.00371476, dtype=float32), 'loss_cross_entropy': Array(0.38862666, dtype=float32)}


  4%|▍         | 38010/1000000 [1:57:35<123:47:37,  2.16it/s]

{'loss': Array(0.39309445, dtype=float32), 'loss_reward': Array(0.00365412, dtype=float32), 'loss_cross_entropy': Array(0.38944033, dtype=float32)}


  4%|▍         | 38019/1000000 [1:57:37<62:45:35,  4.26it/s] 

{'loss': Array(0.40591407, dtype=float32), 'loss_reward': Array(0.00369317, dtype=float32), 'loss_cross_entropy': Array(0.40222088, dtype=float32)}


  4%|▍         | 38029/1000000 [1:57:39<49:07:49,  5.44it/s]

{'loss': Array(0.39211616, dtype=float32), 'loss_reward': Array(0.00359065, dtype=float32), 'loss_cross_entropy': Array(0.3885255, dtype=float32)}


  4%|▍         | 38040/1000000 [1:57:41<31:11:06,  8.57it/s]

{'loss': Array(0.39609262, dtype=float32), 'loss_reward': Array(0.0036291, dtype=float32), 'loss_cross_entropy': Array(0.39246354, dtype=float32)}


  4%|▍         | 38050/1000000 [1:57:42<29:20:04,  9.11it/s]

{'loss': Array(0.39414385, dtype=float32), 'loss_reward': Array(0.00361256, dtype=float32), 'loss_cross_entropy': Array(0.39053133, dtype=float32)}


  4%|▍         | 38058/1000000 [1:57:43<33:15:11,  8.04it/s]

{'loss': Array(0.39622405, dtype=float32), 'loss_reward': Array(0.00367072, dtype=float32), 'loss_cross_entropy': Array(0.39255336, dtype=float32)}


  4%|▍         | 38069/1000000 [1:57:45<36:02:20,  7.41it/s]

{'loss': Array(0.39529034, dtype=float32), 'loss_reward': Array(0.0036535, dtype=float32), 'loss_cross_entropy': Array(0.39163685, dtype=float32)}


  4%|▍         | 38080/1000000 [1:57:47<29:36:47,  9.02it/s]

{'loss': Array(0.39156213, dtype=float32), 'loss_reward': Array(0.00367313, dtype=float32), 'loss_cross_entropy': Array(0.387889, dtype=float32)}


  4%|▍         | 38088/1000000 [1:57:49<32:51:52,  8.13it/s]

{'loss': Array(0.39576888, dtype=float32), 'loss_reward': Array(0.00363607, dtype=float32), 'loss_cross_entropy': Array(0.39213282, dtype=float32)}


  4%|▍         | 38100/1000000 [1:57:50<27:04:31,  9.87it/s]

{'loss': Array(0.38944346, dtype=float32), 'loss_reward': Array(0.00373087, dtype=float32), 'loss_cross_entropy': Array(0.3857126, dtype=float32)}


  4%|▍         | 38110/1000000 [1:57:52<31:55:36,  8.37it/s]

{'loss': Array(0.39561483, dtype=float32), 'loss_reward': Array(0.00367582, dtype=float32), 'loss_cross_entropy': Array(0.391939, dtype=float32)}


  4%|▍         | 38118/1000000 [1:57:54<33:40:03,  7.94it/s]

{'loss': Array(0.39478952, dtype=float32), 'loss_reward': Array(0.00350349, dtype=float32), 'loss_cross_entropy': Array(0.39128605, dtype=float32)}


  4%|▍         | 38130/1000000 [1:57:55<27:21:03,  9.77it/s]

{'loss': Array(0.38235387, dtype=float32), 'loss_reward': Array(0.00353475, dtype=float32), 'loss_cross_entropy': Array(0.37881914, dtype=float32)}


  4%|▍         | 38138/1000000 [1:57:57<32:26:59,  8.23it/s]

{'loss': Array(0.37494275, dtype=float32), 'loss_reward': Array(0.00353155, dtype=float32), 'loss_cross_entropy': Array(0.37141123, dtype=float32)}


  4%|▍         | 38149/1000000 [1:57:58<32:08:38,  8.31it/s]

{'loss': Array(0.3859754, dtype=float32), 'loss_reward': Array(0.00362137, dtype=float32), 'loss_cross_entropy': Array(0.38235402, dtype=float32)}


  4%|▍         | 38160/1000000 [1:58:00<27:33:26,  9.70it/s]

{'loss': Array(0.39512983, dtype=float32), 'loss_reward': Array(0.00359814, dtype=float32), 'loss_cross_entropy': Array(0.39153168, dtype=float32)}


  4%|▍         | 38168/1000000 [1:58:02<32:57:23,  8.11it/s]

{'loss': Array(0.3916149, dtype=float32), 'loss_reward': Array(0.00366292, dtype=float32), 'loss_cross_entropy': Array(0.387952, dtype=float32)}


  4%|▍         | 38180/1000000 [1:58:04<36:45:36,  7.27it/s]

{'loss': Array(0.38526553, dtype=float32), 'loss_reward': Array(0.00355769, dtype=float32), 'loss_cross_entropy': Array(0.3817078, dtype=float32)}


  4%|▍         | 38188/1000000 [1:58:05<35:28:02,  7.53it/s]

{'loss': Array(0.38947666, dtype=float32), 'loss_reward': Array(0.00364241, dtype=float32), 'loss_cross_entropy': Array(0.3858343, dtype=float32)}


  4%|▍         | 38200/1000000 [1:58:07<27:29:28,  9.72it/s]

{'loss': Array(0.39228988, dtype=float32), 'loss_reward': Array(0.00369819, dtype=float32), 'loss_cross_entropy': Array(0.38859174, dtype=float32)}


  4%|▍         | 38208/1000000 [1:58:08<32:16:23,  8.28it/s]

{'loss': Array(0.39217585, dtype=float32), 'loss_reward': Array(0.00360449, dtype=float32), 'loss_cross_entropy': Array(0.38857135, dtype=float32)}


  4%|▍         | 38219/1000000 [1:58:10<40:27:33,  6.60it/s]

{'loss': Array(0.40491754, dtype=float32), 'loss_reward': Array(0.00369504, dtype=float32), 'loss_cross_entropy': Array(0.40122247, dtype=float32)}


  4%|▍         | 38230/1000000 [1:58:12<29:29:44,  9.06it/s]

{'loss': Array(0.39215904, dtype=float32), 'loss_reward': Array(0.00360943, dtype=float32), 'loss_cross_entropy': Array(0.38854957, dtype=float32)}


  4%|▍         | 38238/1000000 [1:58:13<33:23:27,  8.00it/s]

{'loss': Array(0.39616466, dtype=float32), 'loss_reward': Array(0.00368864, dtype=float32), 'loss_cross_entropy': Array(0.392476, dtype=float32)}


  4%|▍         | 38250/1000000 [1:58:15<26:54:26,  9.93it/s]

{'loss': Array(0.38606665, dtype=float32), 'loss_reward': Array(0.00358628, dtype=float32), 'loss_cross_entropy': Array(0.38248035, dtype=float32)}


  4%|▍         | 38258/1000000 [1:58:17<39:31:33,  6.76it/s]

{'loss': Array(0.38633123, dtype=float32), 'loss_reward': Array(0.00357423, dtype=float32), 'loss_cross_entropy': Array(0.38275704, dtype=float32)}


  4%|▍         | 38268/1000000 [1:58:18<33:16:54,  8.03it/s]

{'loss': Array(0.39161286, dtype=float32), 'loss_reward': Array(0.00368937, dtype=float32), 'loss_cross_entropy': Array(0.38792348, dtype=float32)}


  4%|▍         | 38280/1000000 [1:58:20<26:47:12,  9.97it/s]

{'loss': Array(0.3967407, dtype=float32), 'loss_reward': Array(0.00366405, dtype=float32), 'loss_cross_entropy': Array(0.39307663, dtype=float32)}


  4%|▍         | 38288/1000000 [1:58:21<32:00:05,  8.35it/s]

{'loss': Array(0.38607138, dtype=float32), 'loss_reward': Array(0.00361235, dtype=float32), 'loss_cross_entropy': Array(0.38245898, dtype=float32)}


  4%|▍         | 38300/1000000 [1:58:23<31:44:45,  8.41it/s]

{'loss': Array(0.3942511, dtype=float32), 'loss_reward': Array(0.00358226, dtype=float32), 'loss_cross_entropy': Array(0.39066887, dtype=float32)}


  4%|▍         | 38308/1000000 [1:58:25<34:33:07,  7.73it/s]

{'loss': Array(0.38370246, dtype=float32), 'loss_reward': Array(0.00362769, dtype=float32), 'loss_cross_entropy': Array(0.38007474, dtype=float32)}


  4%|▍         | 38320/1000000 [1:58:26<27:28:45,  9.72it/s]

{'loss': Array(0.38997424, dtype=float32), 'loss_reward': Array(0.00353477, dtype=float32), 'loss_cross_entropy': Array(0.38643947, dtype=float32)}


  4%|▍         | 38328/1000000 [1:58:28<32:44:40,  8.16it/s]

{'loss': Array(0.3904899, dtype=float32), 'loss_reward': Array(0.00358109, dtype=float32), 'loss_cross_entropy': Array(0.38690883, dtype=float32)}


  4%|▍         | 38339/1000000 [1:58:30<32:41:41,  8.17it/s]

{'loss': Array(0.3806894, dtype=float32), 'loss_reward': Array(0.00357394, dtype=float32), 'loss_cross_entropy': Array(0.37711546, dtype=float32)}


  4%|▍         | 38350/1000000 [1:58:31<28:09:07,  9.49it/s]

{'loss': Array(0.39103627, dtype=float32), 'loss_reward': Array(0.00362677, dtype=float32), 'loss_cross_entropy': Array(0.3874095, dtype=float32)}


  4%|▍         | 38359/1000000 [1:58:33<29:54:06,  8.93it/s]

{'loss': Array(0.3872535, dtype=float32), 'loss_reward': Array(0.00370621, dtype=float32), 'loss_cross_entropy': Array(0.38354728, dtype=float32)}


  4%|▍         | 38370/1000000 [1:58:35<39:11:17,  6.82it/s]

{'loss': Array(0.3812686, dtype=float32), 'loss_reward': Array(0.00353761, dtype=float32), 'loss_cross_entropy': Array(0.37773097, dtype=float32)}


  4%|▍         | 38380/1000000 [1:58:36<28:44:30,  9.29it/s]

{'loss': Array(0.3849884, dtype=float32), 'loss_reward': Array(0.00361157, dtype=float32), 'loss_cross_entropy': Array(0.38137683, dtype=float32)}


  4%|▍         | 38388/1000000 [1:58:38<34:42:41,  7.70it/s]

{'loss': Array(0.3811457, dtype=float32), 'loss_reward': Array(0.00363652, dtype=float32), 'loss_cross_entropy': Array(0.37750915, dtype=float32)}


  4%|▍         | 38400/1000000 [1:58:40<28:14:44,  9.46it/s]

{'loss': Array(0.3870821, dtype=float32), 'loss_reward': Array(0.00369034, dtype=float32), 'loss_cross_entropy': Array(0.38339174, dtype=float32)}


  4%|▍         | 38410/1000000 [1:58:42<37:08:15,  7.19it/s]

{'loss': Array(0.38916665, dtype=float32), 'loss_reward': Array(0.00363695, dtype=float32), 'loss_cross_entropy': Array(0.3855297, dtype=float32)}


  4%|▍         | 38419/1000000 [1:58:43<31:48:29,  8.40it/s]

{'loss': Array(0.37438112, dtype=float32), 'loss_reward': Array(0.00360722, dtype=float32), 'loss_cross_entropy': Array(0.37077388, dtype=float32)}


  4%|▍         | 38430/1000000 [1:58:45<27:39:19,  9.66it/s]

{'loss': Array(0.37622663, dtype=float32), 'loss_reward': Array(0.00360125, dtype=float32), 'loss_cross_entropy': Array(0.3726254, dtype=float32)}


  4%|▍         | 38440/1000000 [1:58:47<26:44:28,  9.99it/s]

{'loss': Array(0.37653562, dtype=float32), 'loss_reward': Array(0.00368469, dtype=float32), 'loss_cross_entropy': Array(0.37285098, dtype=float32)}


  4%|▍         | 38449/1000000 [1:58:48<39:28:22,  6.77it/s]

{'loss': Array(0.37505373, dtype=float32), 'loss_reward': Array(0.00352382, dtype=float32), 'loss_cross_entropy': Array(0.3715299, dtype=float32)}


  4%|▍         | 38459/1000000 [1:58:50<29:16:50,  9.12it/s]

{'loss': Array(0.37768814, dtype=float32), 'loss_reward': Array(0.00357372, dtype=float32), 'loss_cross_entropy': Array(0.37411436, dtype=float32)}


  4%|▍         | 38469/1000000 [1:58:52<26:44:16,  9.99it/s]

{'loss': Array(0.37794256, dtype=float32), 'loss_reward': Array(0.00365963, dtype=float32), 'loss_cross_entropy': Array(0.37428296, dtype=float32)}


  4%|▍         | 38478/1000000 [1:58:53<29:22:57,  9.09it/s]

{'loss': Array(0.38471377, dtype=float32), 'loss_reward': Array(0.00369616, dtype=float32), 'loss_cross_entropy': Array(0.38101763, dtype=float32)}


  4%|▍         | 38488/1000000 [1:58:55<36:52:34,  7.24it/s]

{'loss': Array(0.3793305, dtype=float32), 'loss_reward': Array(0.00361082, dtype=float32), 'loss_cross_entropy': Array(0.37571967, dtype=float32)}


  4%|▍         | 38500/1000000 [1:58:57<25:58:18, 10.28it/s]

{'loss': Array(0.37932894, dtype=float32), 'loss_reward': Array(0.00351837, dtype=float32), 'loss_cross_entropy': Array(0.37581056, dtype=float32)}


  4%|▍         | 38508/1000000 [1:59:08<179:34:53,  1.49it/s]

{'loss': Array(0.37152836, dtype=float32), 'loss_reward': Array(0.00361671, dtype=float32), 'loss_cross_entropy': Array(0.36791164, dtype=float32)}


  4%|▍         | 38520/1000000 [1:59:10<53:31:43,  4.99it/s] 

{'loss': Array(0.3738919, dtype=float32), 'loss_reward': Array(0.00367324, dtype=float32), 'loss_cross_entropy': Array(0.37021866, dtype=float32)}


  4%|▍         | 38530/1000000 [1:59:12<35:35:20,  7.50it/s]

{'loss': Array(0.3870605, dtype=float32), 'loss_reward': Array(0.00365704, dtype=float32), 'loss_cross_entropy': Array(0.38340342, dtype=float32)}


  4%|▍         | 38538/1000000 [1:59:14<34:19:06,  7.78it/s]

{'loss': Array(0.38184708, dtype=float32), 'loss_reward': Array(0.00356196, dtype=float32), 'loss_cross_entropy': Array(0.37828514, dtype=float32)}


  4%|▍         | 38549/1000000 [1:59:15<26:56:27,  9.91it/s]

{'loss': Array(0.3869091, dtype=float32), 'loss_reward': Array(0.00366364, dtype=float32), 'loss_cross_entropy': Array(0.38324544, dtype=float32)}


  4%|▍         | 38559/1000000 [1:59:17<28:46:12,  9.28it/s]

{'loss': Array(0.38203278, dtype=float32), 'loss_reward': Array(0.00358127, dtype=float32), 'loss_cross_entropy': Array(0.37845156, dtype=float32)}


  4%|▍         | 38569/1000000 [1:59:19<30:29:43,  8.76it/s]

{'loss': Array(0.38482416, dtype=float32), 'loss_reward': Array(0.00362693, dtype=float32), 'loss_cross_entropy': Array(0.3811972, dtype=float32)}


  4%|▍         | 38579/1000000 [1:59:21<29:05:44,  9.18it/s]

{'loss': Array(0.3849208, dtype=float32), 'loss_reward': Array(0.0035641, dtype=float32), 'loss_cross_entropy': Array(0.38135672, dtype=float32)}


  4%|▍         | 38590/1000000 [1:59:22<26:43:26,  9.99it/s]

{'loss': Array(0.38581234, dtype=float32), 'loss_reward': Array(0.00361797, dtype=float32), 'loss_cross_entropy': Array(0.38219434, dtype=float32)}


  4%|▍         | 38600/1000000 [1:59:24<36:24:04,  7.34it/s]

{'loss': Array(0.38257983, dtype=float32), 'loss_reward': Array(0.00361654, dtype=float32), 'loss_cross_entropy': Array(0.37896332, dtype=float32)}


  4%|▍         | 38610/1000000 [1:59:26<28:04:07,  9.51it/s]

{'loss': Array(0.39067084, dtype=float32), 'loss_reward': Array(0.00363829, dtype=float32), 'loss_cross_entropy': Array(0.38703254, dtype=float32)}


  4%|▍         | 38618/1000000 [1:59:27<32:55:42,  8.11it/s]

{'loss': Array(0.38481876, dtype=float32), 'loss_reward': Array(0.00362971, dtype=float32), 'loss_cross_entropy': Array(0.38118905, dtype=float32)}


  4%|▍         | 38629/1000000 [1:59:29<26:54:44,  9.92it/s]

{'loss': Array(0.38653848, dtype=float32), 'loss_reward': Array(0.0035887, dtype=float32), 'loss_cross_entropy': Array(0.38294974, dtype=float32)}


  4%|▍         | 38639/1000000 [1:59:31<33:14:36,  8.03it/s]

{'loss': Array(0.3787096, dtype=float32), 'loss_reward': Array(0.00364963, dtype=float32), 'loss_cross_entropy': Array(0.37506, dtype=float32)}


  4%|▍         | 38650/1000000 [1:59:32<27:54:03,  9.57it/s]

{'loss': Array(0.38717732, dtype=float32), 'loss_reward': Array(0.00381718, dtype=float32), 'loss_cross_entropy': Array(0.38336012, dtype=float32)}


  4%|▍         | 38660/1000000 [1:59:34<26:33:16, 10.06it/s]

{'loss': Array(0.38417372, dtype=float32), 'loss_reward': Array(0.0035668, dtype=float32), 'loss_cross_entropy': Array(0.3806069, dtype=float32)}


  4%|▍         | 38670/1000000 [1:59:36<29:23:01,  9.09it/s]

{'loss': Array(0.38315094, dtype=float32), 'loss_reward': Array(0.00360778, dtype=float32), 'loss_cross_entropy': Array(0.37954316, dtype=float32)}


  4%|▍         | 38679/1000000 [1:59:38<36:27:30,  7.32it/s]

{'loss': Array(0.3839912, dtype=float32), 'loss_reward': Array(0.00365958, dtype=float32), 'loss_cross_entropy': Array(0.3803316, dtype=float32)}


  4%|▍         | 38690/1000000 [1:59:39<27:55:11,  9.56it/s]

{'loss': Array(0.38120127, dtype=float32), 'loss_reward': Array(0.00354275, dtype=float32), 'loss_cross_entropy': Array(0.37765858, dtype=float32)}


  4%|▍         | 38700/1000000 [1:59:41<29:59:48,  8.90it/s]

{'loss': Array(0.38383284, dtype=float32), 'loss_reward': Array(0.00361548, dtype=float32), 'loss_cross_entropy': Array(0.38021737, dtype=float32)}


  4%|▍         | 38704/1000000 [1:59:42<51:57:42,  5.14it/s]

  4%|▍         | 38710/1000000 [1:59:43<29:03:23,  9.19it/s]

{'loss': Array(0.37930393, dtype=float32), 'loss_reward': Array(0.00356761, dtype=float32), 'loss_cross_entropy': Array(0.37573633, dtype=float32)}


  4%|▍         | 38718/1000000 [1:59:44<39:21:01,  6.79it/s]

{'loss': Array(0.375394, dtype=float32), 'loss_reward': Array(0.00353182, dtype=float32), 'loss_cross_entropy': Array(0.37186217, dtype=float32)}


  4%|▍         | 38730/1000000 [1:59:46<28:39:49,  9.32it/s]

{'loss': Array(0.3847318, dtype=float32), 'loss_reward': Array(0.00360488, dtype=float32), 'loss_cross_entropy': Array(0.38112694, dtype=float32)}


  4%|▍         | 38740/1000000 [1:59:48<27:51:29,  9.58it/s]

{'loss': Array(0.37962636, dtype=float32), 'loss_reward': Array(0.00356489, dtype=float32), 'loss_cross_entropy': Array(0.37606147, dtype=float32)}


  4%|▍         | 38750/1000000 [1:59:50<42:31:54,  6.28it/s]

{'loss': Array(0.37354678, dtype=float32), 'loss_reward': Array(0.00364665, dtype=float32), 'loss_cross_entropy': Array(0.36990017, dtype=float32)}


  4%|▍         | 38760/1000000 [1:59:52<29:40:13,  9.00it/s]

{'loss': Array(0.3737183, dtype=float32), 'loss_reward': Array(0.00360904, dtype=float32), 'loss_cross_entropy': Array(0.37010923, dtype=float32)}


  4%|▍         | 38769/1000000 [1:59:53<31:44:28,  8.41it/s]

{'loss': Array(0.3752575, dtype=float32), 'loss_reward': Array(0.0035918, dtype=float32), 'loss_cross_entropy': Array(0.37166572, dtype=float32)}


  4%|▍         | 38779/1000000 [1:59:55<28:01:25,  9.53it/s]

{'loss': Array(0.3770681, dtype=float32), 'loss_reward': Array(0.00360791, dtype=float32), 'loss_cross_entropy': Array(0.3734602, dtype=float32)}


  4%|▍         | 38789/1000000 [1:59:57<37:37:33,  7.10it/s]

{'loss': Array(0.38475353, dtype=float32), 'loss_reward': Array(0.00361002, dtype=float32), 'loss_cross_entropy': Array(0.38114354, dtype=float32)}


  4%|▍         | 38799/1000000 [1:59:59<29:49:15,  8.95it/s]

{'loss': Array(0.37315032, dtype=float32), 'loss_reward': Array(0.00353415, dtype=float32), 'loss_cross_entropy': Array(0.36961615, dtype=float32)}


  4%|▍         | 38809/1000000 [2:00:00<27:27:38,  9.72it/s]

{'loss': Array(0.38050565, dtype=float32), 'loss_reward': Array(0.00358461, dtype=float32), 'loss_cross_entropy': Array(0.3769211, dtype=float32)}


  4%|▍         | 38819/1000000 [2:00:02<27:40:57,  9.64it/s]

{'loss': Array(0.37767223, dtype=float32), 'loss_reward': Array(0.00367223, dtype=float32), 'loss_cross_entropy': Array(0.37399998, dtype=float32)}


  4%|▍         | 38829/1000000 [2:00:04<35:42:30,  7.48it/s]

{'loss': Array(0.37603045, dtype=float32), 'loss_reward': Array(0.00341477, dtype=float32), 'loss_cross_entropy': Array(0.37261567, dtype=float32)}


  4%|▍         | 38840/1000000 [2:00:06<27:19:28,  9.77it/s]

{'loss': Array(0.37426767, dtype=float32), 'loss_reward': Array(0.00359734, dtype=float32), 'loss_cross_entropy': Array(0.37067032, dtype=float32)}


  4%|▍         | 38849/1000000 [2:00:07<30:30:32,  8.75it/s]

{'loss': Array(0.38129768, dtype=float32), 'loss_reward': Array(0.00357257, dtype=float32), 'loss_cross_entropy': Array(0.3777251, dtype=float32)}


  4%|▍         | 38859/1000000 [2:00:09<27:25:18,  9.74it/s]

{'loss': Array(0.38286802, dtype=float32), 'loss_reward': Array(0.00361145, dtype=float32), 'loss_cross_entropy': Array(0.37925658, dtype=float32)}


  4%|▍         | 38869/1000000 [2:00:11<33:27:28,  7.98it/s]

{'loss': Array(0.37863222, dtype=float32), 'loss_reward': Array(0.00349559, dtype=float32), 'loss_cross_entropy': Array(0.3751366, dtype=float32)}


  4%|▍         | 38880/1000000 [2:00:13<28:04:08,  9.51it/s]

{'loss': Array(0.36277872, dtype=float32), 'loss_reward': Array(0.00350964, dtype=float32), 'loss_cross_entropy': Array(0.35926908, dtype=float32)}


  4%|▍         | 38889/1000000 [2:00:14<29:45:12,  8.97it/s]

{'loss': Array(0.37468264, dtype=float32), 'loss_reward': Array(0.00353116, dtype=float32), 'loss_cross_entropy': Array(0.3711515, dtype=float32)}


  4%|▍         | 38900/1000000 [2:00:16<27:51:42,  9.58it/s]

{'loss': Array(0.36936843, dtype=float32), 'loss_reward': Array(0.00355078, dtype=float32), 'loss_cross_entropy': Array(0.36581767, dtype=float32)}


  4%|▍         | 38908/1000000 [2:00:18<38:40:17,  6.90it/s]

{'loss': Array(0.37385055, dtype=float32), 'loss_reward': Array(0.00355441, dtype=float32), 'loss_cross_entropy': Array(0.37029615, dtype=float32)}


  4%|▍         | 38919/1000000 [2:00:19<30:14:57,  8.83it/s]

{'loss': Array(0.3714063, dtype=float32), 'loss_reward': Array(0.00358732, dtype=float32), 'loss_cross_entropy': Array(0.36781898, dtype=float32)}


  4%|▍         | 38929/1000000 [2:00:21<29:55:36,  8.92it/s]

{'loss': Array(0.37520278, dtype=float32), 'loss_reward': Array(0.00353173, dtype=float32), 'loss_cross_entropy': Array(0.37167105, dtype=float32)}


  4%|▍         | 38939/1000000 [2:00:23<40:35:20,  6.58it/s]

{'loss': Array(0.37121007, dtype=float32), 'loss_reward': Array(0.00356418, dtype=float32), 'loss_cross_entropy': Array(0.36764592, dtype=float32)}


  4%|▍         | 38949/1000000 [2:00:25<30:30:01,  8.75it/s]

{'loss': Array(0.3749938, dtype=float32), 'loss_reward': Array(0.00366811, dtype=float32), 'loss_cross_entropy': Array(0.3713257, dtype=float32)}


  4%|▍         | 38959/1000000 [2:00:26<28:41:33,  9.30it/s]

{'loss': Array(0.37680537, dtype=float32), 'loss_reward': Array(0.0035563, dtype=float32), 'loss_cross_entropy': Array(0.3732491, dtype=float32)}


  4%|▍         | 38970/1000000 [2:00:28<27:08:35,  9.83it/s]

{'loss': Array(0.37170753, dtype=float32), 'loss_reward': Array(0.00367499, dtype=float32), 'loss_cross_entropy': Array(0.36803254, dtype=float32)}


  4%|▍         | 38979/1000000 [2:00:30<38:39:00,  6.91it/s]

{'loss': Array(0.370164, dtype=float32), 'loss_reward': Array(0.00351324, dtype=float32), 'loss_cross_entropy': Array(0.36665076, dtype=float32)}


  4%|▍         | 38989/1000000 [2:00:31<28:33:06,  9.35it/s]

{'loss': Array(0.38043004, dtype=float32), 'loss_reward': Array(0.00346951, dtype=float32), 'loss_cross_entropy': Array(0.37696052, dtype=float32)}


  4%|▍         | 39000/1000000 [2:00:33<27:43:49,  9.63it/s]

{'loss': Array(0.3811987, dtype=float32), 'loss_reward': Array(0.00355688, dtype=float32), 'loss_cross_entropy': Array(0.37764177, dtype=float32)}


  4%|▍         | 39009/1000000 [2:00:45<141:06:25,  1.89it/s]

{'loss': Array(0.37380672, dtype=float32), 'loss_reward': Array(0.00352193, dtype=float32), 'loss_cross_entropy': Array(0.3702848, dtype=float32)}


  4%|▍         | 39019/1000000 [2:00:47<62:56:27,  4.24it/s] 

{'loss': Array(0.37530872, dtype=float32), 'loss_reward': Array(0.00357343, dtype=float32), 'loss_cross_entropy': Array(0.37173527, dtype=float32)}


  4%|▍         | 39028/1000000 [2:00:48<39:07:41,  6.82it/s]

{'loss': Array(0.3696846, dtype=float32), 'loss_reward': Array(0.00360392, dtype=float32), 'loss_cross_entropy': Array(0.36608067, dtype=float32)}


  4%|▍         | 39040/1000000 [2:00:50<28:01:20,  9.53it/s]

{'loss': Array(0.38286087, dtype=float32), 'loss_reward': Array(0.00348618, dtype=float32), 'loss_cross_entropy': Array(0.37937465, dtype=float32)}


  4%|▍         | 39048/1000000 [2:00:51<33:41:20,  7.92it/s]

{'loss': Array(0.3785094, dtype=float32), 'loss_reward': Array(0.00357541, dtype=float32), 'loss_cross_entropy': Array(0.37493402, dtype=float32)}


  4%|▍         | 39059/1000000 [2:00:53<33:46:28,  7.90it/s]

{'loss': Array(0.38068983, dtype=float32), 'loss_reward': Array(0.00353544, dtype=float32), 'loss_cross_entropy': Array(0.37715438, dtype=float32)}


  4%|▍         | 39070/1000000 [2:00:55<28:53:19,  9.24it/s]

{'loss': Array(0.36699033, dtype=float32), 'loss_reward': Array(0.00350891, dtype=float32), 'loss_cross_entropy': Array(0.36348143, dtype=float32)}


  4%|▍         | 39078/1000000 [2:00:56<34:12:58,  7.80it/s]

{'loss': Array(0.37709826, dtype=float32), 'loss_reward': Array(0.00354985, dtype=float32), 'loss_cross_entropy': Array(0.37354845, dtype=float32)}


  4%|▍         | 39089/1000000 [2:00:58<29:05:42,  9.17it/s]

{'loss': Array(0.37105972, dtype=float32), 'loss_reward': Array(0.00342996, dtype=float32), 'loss_cross_entropy': Array(0.36762977, dtype=float32)}


  4%|▍         | 39100/1000000 [2:01:00<29:53:54,  8.93it/s]

{'loss': Array(0.37315646, dtype=float32), 'loss_reward': Array(0.00359807, dtype=float32), 'loss_cross_entropy': Array(0.36955842, dtype=float32)}


  4%|▍         | 39109/1000000 [2:01:02<31:51:02,  8.38it/s]

{'loss': Array(0.38827863, dtype=float32), 'loss_reward': Array(0.00362396, dtype=float32), 'loss_cross_entropy': Array(0.38465473, dtype=float32)}


  4%|▍         | 39120/1000000 [2:01:03<27:40:34,  9.64it/s]

{'loss': Array(0.3794093, dtype=float32), 'loss_reward': Array(0.00354784, dtype=float32), 'loss_cross_entropy': Array(0.3758615, dtype=float32)}


  4%|▍         | 39130/1000000 [2:01:05<38:57:53,  6.85it/s]

{'loss': Array(0.38249037, dtype=float32), 'loss_reward': Array(0.00350565, dtype=float32), 'loss_cross_entropy': Array(0.37898475, dtype=float32)}


  4%|▍         | 39138/1000000 [2:01:07<35:55:44,  7.43it/s]

{'loss': Array(0.38290522, dtype=float32), 'loss_reward': Array(0.00374688, dtype=float32), 'loss_cross_entropy': Array(0.37915832, dtype=float32)}


  4%|▍         | 39150/1000000 [2:01:08<27:18:39,  9.77it/s]

{'loss': Array(0.37187174, dtype=float32), 'loss_reward': Array(0.00364489, dtype=float32), 'loss_cross_entropy': Array(0.36822683, dtype=float32)}


  4%|▍         | 39160/1000000 [2:01:10<27:53:16,  9.57it/s]

{'loss': Array(0.37771046, dtype=float32), 'loss_reward': Array(0.0035292, dtype=float32), 'loss_cross_entropy': Array(0.3741813, dtype=float32)}


  4%|▍         | 39170/1000000 [2:01:12<35:56:12,  7.43it/s]

{'loss': Array(0.3773296, dtype=float32), 'loss_reward': Array(0.00357914, dtype=float32), 'loss_cross_entropy': Array(0.37375042, dtype=float32)}


  4%|▍         | 39178/1000000 [2:01:13<35:38:10,  7.49it/s]

{'loss': Array(0.36937997, dtype=float32), 'loss_reward': Array(0.00354438, dtype=float32), 'loss_cross_entropy': Array(0.36583558, dtype=float32)}


  4%|▍         | 39189/1000000 [2:01:15<29:18:12,  9.11it/s]

{'loss': Array(0.38055176, dtype=float32), 'loss_reward': Array(0.00365638, dtype=float32), 'loss_cross_entropy': Array(0.37689534, dtype=float32)}


  4%|▍         | 39200/1000000 [2:01:16<26:31:18, 10.06it/s]

{'loss': Array(0.3815239, dtype=float32), 'loss_reward': Array(0.00356437, dtype=float32), 'loss_cross_entropy': Array(0.37795952, dtype=float32)}


  4%|▍         | 39208/1000000 [2:01:18<39:05:51,  6.83it/s]

{'loss': Array(0.38448244, dtype=float32), 'loss_reward': Array(0.0036364, dtype=float32), 'loss_cross_entropy': Array(0.38084605, dtype=float32)}


  4%|▍         | 39220/1000000 [2:01:20<28:18:37,  9.43it/s]

{'loss': Array(0.37838027, dtype=float32), 'loss_reward': Array(0.00355734, dtype=float32), 'loss_cross_entropy': Array(0.37482294, dtype=float32)}


  4%|▍         | 39228/1000000 [2:01:21<33:44:59,  7.91it/s]

{'loss': Array(0.37763727, dtype=float32), 'loss_reward': Array(0.00352292, dtype=float32), 'loss_cross_entropy': Array(0.37411436, dtype=float32)}


  4%|▍         | 39240/1000000 [2:01:23<27:42:16,  9.63it/s]

{'loss': Array(0.3838199, dtype=float32), 'loss_reward': Array(0.00361826, dtype=float32), 'loss_cross_entropy': Array(0.38020164, dtype=float32)}


  4%|▍         | 39250/1000000 [2:01:25<31:47:27,  8.39it/s]

{'loss': Array(0.3737316, dtype=float32), 'loss_reward': Array(0.00355117, dtype=float32), 'loss_cross_entropy': Array(0.37018046, dtype=float32)}


  4%|▍         | 39258/1000000 [2:01:26<33:38:33,  7.93it/s]

{'loss': Array(0.37107977, dtype=float32), 'loss_reward': Array(0.00351485, dtype=float32), 'loss_cross_entropy': Array(0.36756492, dtype=float32)}


  4%|▍         | 39270/1000000 [2:01:28<26:56:12,  9.91it/s]

{'loss': Array(0.37815008, dtype=float32), 'loss_reward': Array(0.00360645, dtype=float32), 'loss_cross_entropy': Array(0.37454364, dtype=float32)}


  4%|▍         | 39280/1000000 [2:01:29<27:18:37,  9.77it/s]

{'loss': Array(0.3695731, dtype=float32), 'loss_reward': Array(0.0034716, dtype=float32), 'loss_cross_entropy': Array(0.36610153, dtype=float32)}


  4%|▍         | 39288/1000000 [2:01:31<37:04:29,  7.20it/s]

{'loss': Array(0.3722063, dtype=float32), 'loss_reward': Array(0.0034918, dtype=float32), 'loss_cross_entropy': Array(0.36871454, dtype=float32)}


  4%|▍         | 39300/1000000 [2:01:33<27:40:28,  9.64it/s]

{'loss': Array(0.3747389, dtype=float32), 'loss_reward': Array(0.00358308, dtype=float32), 'loss_cross_entropy': Array(0.37115583, dtype=float32)}


  4%|▍         | 39308/1000000 [2:01:34<32:07:58,  8.30it/s]

{'loss': Array(0.3791732, dtype=float32), 'loss_reward': Array(0.00363874, dtype=float32), 'loss_cross_entropy': Array(0.3755345, dtype=float32)}


  4%|▍         | 39319/1000000 [2:01:36<38:52:08,  6.87it/s]

{'loss': Array(0.36608276, dtype=float32), 'loss_reward': Array(0.00357124, dtype=float32), 'loss_cross_entropy': Array(0.3625115, dtype=float32)}


  4%|▍         | 39330/1000000 [2:01:38<28:38:52,  9.31it/s]

{'loss': Array(0.37525114, dtype=float32), 'loss_reward': Array(0.00363358, dtype=float32), 'loss_cross_entropy': Array(0.37161756, dtype=float32)}


  4%|▍         | 39340/1000000 [2:01:39<28:22:34,  9.40it/s]

{'loss': Array(0.37211475, dtype=float32), 'loss_reward': Array(0.00349417, dtype=float32), 'loss_cross_entropy': Array(0.3686206, dtype=float32)}


  4%|▍         | 39348/1000000 [2:01:41<33:05:33,  8.06it/s]

{'loss': Array(0.36782905, dtype=float32), 'loss_reward': Array(0.00353311, dtype=float32), 'loss_cross_entropy': Array(0.36429596, dtype=float32)}


  4%|▍         | 39359/1000000 [2:01:42<35:26:14,  7.53it/s]

{'loss': Array(0.36842465, dtype=float32), 'loss_reward': Array(0.00342947, dtype=float32), 'loss_cross_entropy': Array(0.3649952, dtype=float32)}


  4%|▍         | 39370/1000000 [2:01:44<29:14:12,  9.13it/s]

{'loss': Array(0.37544876, dtype=float32), 'loss_reward': Array(0.00355579, dtype=float32), 'loss_cross_entropy': Array(0.37189293, dtype=float32)}


  4%|▍         | 39378/1000000 [2:01:46<34:04:47,  7.83it/s]

{'loss': Array(0.37491462, dtype=float32), 'loss_reward': Array(0.00364126, dtype=float32), 'loss_cross_entropy': Array(0.37127337, dtype=float32)}


  4%|▍         | 39390/1000000 [2:01:47<27:21:40,  9.75it/s]

{'loss': Array(0.37332734, dtype=float32), 'loss_reward': Array(0.00352627, dtype=float32), 'loss_cross_entropy': Array(0.36980104, dtype=float32)}


  4%|▍         | 39398/1000000 [2:01:49<39:44:15,  6.71it/s]

{'loss': Array(0.3609839, dtype=float32), 'loss_reward': Array(0.00337781, dtype=float32), 'loss_cross_entropy': Array(0.3576061, dtype=float32)}


  4%|▍         | 39410/1000000 [2:01:51<28:05:34,  9.50it/s]

{'loss': Array(0.3712248, dtype=float32), 'loss_reward': Array(0.00352276, dtype=float32), 'loss_cross_entropy': Array(0.367702, dtype=float32)}


  4%|▍         | 39418/1000000 [2:01:52<32:15:23,  8.27it/s]

{'loss': Array(0.37050742, dtype=float32), 'loss_reward': Array(0.00356143, dtype=float32), 'loss_cross_entropy': Array(0.36694595, dtype=float32)}


  4%|▍         | 39430/1000000 [2:01:54<26:12:41, 10.18it/s]

{'loss': Array(0.3701361, dtype=float32), 'loss_reward': Array(0.00343656, dtype=float32), 'loss_cross_entropy': Array(0.36669955, dtype=float32)}


  4%|▍         | 39440/1000000 [2:01:56<31:19:19,  8.52it/s]

{'loss': Array(0.36750007, dtype=float32), 'loss_reward': Array(0.0034958, dtype=float32), 'loss_cross_entropy': Array(0.36400422, dtype=float32)}


  4%|▍         | 39448/1000000 [2:01:57<33:52:18,  7.88it/s]

{'loss': Array(0.36917517, dtype=float32), 'loss_reward': Array(0.00343375, dtype=float32), 'loss_cross_entropy': Array(0.3657414, dtype=float32)}


  4%|▍         | 39460/1000000 [2:01:59<27:09:26,  9.82it/s]

{'loss': Array(0.36189458, dtype=float32), 'loss_reward': Array(0.00349874, dtype=float32), 'loss_cross_entropy': Array(0.35839579, dtype=float32)}


  4%|▍         | 39470/1000000 [2:02:00<28:39:02,  9.31it/s]

{'loss': Array(0.37279946, dtype=float32), 'loss_reward': Array(0.00355411, dtype=float32), 'loss_cross_entropy': Array(0.36924535, dtype=float32)}


  4%|▍         | 39478/1000000 [2:02:02<39:13:14,  6.80it/s]

{'loss': Array(0.35955253, dtype=float32), 'loss_reward': Array(0.00344528, dtype=float32), 'loss_cross_entropy': Array(0.35610726, dtype=float32)}


  4%|▍         | 39490/1000000 [2:02:04<29:00:39,  9.20it/s]

{'loss': Array(0.3669224, dtype=float32), 'loss_reward': Array(0.00354929, dtype=float32), 'loss_cross_entropy': Array(0.36337313, dtype=float32)}


  4%|▍         | 39498/1000000 [2:02:05<33:25:27,  7.98it/s]

{'loss': Array(0.36539784, dtype=float32), 'loss_reward': Array(0.00340064, dtype=float32), 'loss_cross_entropy': Array(0.36199722, dtype=float32)}


  4%|▍         | 39509/1000000 [2:02:17<139:46:07,  1.91it/s]

{'loss': Array(0.37507656, dtype=float32), 'loss_reward': Array(0.0035236, dtype=float32), 'loss_cross_entropy': Array(0.37155294, dtype=float32)}


  4%|▍         | 39520/1000000 [2:02:18<51:38:16,  5.17it/s] 

{'loss': Array(0.36828983, dtype=float32), 'loss_reward': Array(0.00358943, dtype=float32), 'loss_cross_entropy': Array(0.3647004, dtype=float32)}


  4%|▍         | 39528/1000000 [2:02:20<41:38:44,  6.41it/s]

{'loss': Array(0.3723618, dtype=float32), 'loss_reward': Array(0.0036298, dtype=float32), 'loss_cross_entropy': Array(0.36873204, dtype=float32)}


  4%|▍         | 39539/1000000 [2:02:22<32:26:16,  8.22it/s]

{'loss': Array(0.36852333, dtype=float32), 'loss_reward': Array(0.00364799, dtype=float32), 'loss_cross_entropy': Array(0.3648753, dtype=float32)}


  4%|▍         | 39550/1000000 [2:02:24<34:38:18,  7.70it/s]

{'loss': Array(0.3639317, dtype=float32), 'loss_reward': Array(0.00349883, dtype=float32), 'loss_cross_entropy': Array(0.36043283, dtype=float32)}


  4%|▍         | 39558/1000000 [2:02:25<36:02:52,  7.40it/s]

{'loss': Array(0.37343764, dtype=float32), 'loss_reward': Array(0.00359475, dtype=float32), 'loss_cross_entropy': Array(0.36984292, dtype=float32)}


  4%|▍         | 39570/1000000 [2:02:27<27:28:06,  9.71it/s]

{'loss': Array(0.37500122, dtype=float32), 'loss_reward': Array(0.00347253, dtype=float32), 'loss_cross_entropy': Array(0.37152869, dtype=float32)}


  4%|▍         | 39580/1000000 [2:02:28<27:27:43,  9.71it/s]

{'loss': Array(0.37444505, dtype=float32), 'loss_reward': Array(0.00362191, dtype=float32), 'loss_cross_entropy': Array(0.37082317, dtype=float32)}


  4%|▍         | 39590/1000000 [2:02:30<33:18:40,  8.01it/s]

{'loss': Array(0.37483093, dtype=float32), 'loss_reward': Array(0.00347199, dtype=float32), 'loss_cross_entropy': Array(0.37135896, dtype=float32)}


  4%|▍         | 39598/1000000 [2:02:32<35:26:24,  7.53it/s]

{'loss': Array(0.35946542, dtype=float32), 'loss_reward': Array(0.0034566, dtype=float32), 'loss_cross_entropy': Array(0.35600883, dtype=float32)}


  4%|▍         | 39610/1000000 [2:02:33<27:56:35,  9.55it/s]

{'loss': Array(0.36935386, dtype=float32), 'loss_reward': Array(0.00346197, dtype=float32), 'loss_cross_entropy': Array(0.36589187, dtype=float32)}


  4%|▍         | 39618/1000000 [2:02:35<34:23:19,  7.76it/s]

{'loss': Array(0.3660826, dtype=float32), 'loss_reward': Array(0.00350241, dtype=float32), 'loss_cross_entropy': Array(0.3625802, dtype=float32)}


  4%|▍         | 39629/1000000 [2:02:37<33:09:51,  8.04it/s]

{'loss': Array(0.37135258, dtype=float32), 'loss_reward': Array(0.00356545, dtype=float32), 'loss_cross_entropy': Array(0.3677871, dtype=float32)}


  4%|▍         | 39640/1000000 [2:02:38<27:27:19,  9.72it/s]

{'loss': Array(0.36852694, dtype=float32), 'loss_reward': Array(0.00344964, dtype=float32), 'loss_cross_entropy': Array(0.36507732, dtype=float32)}


  4%|▍         | 39648/1000000 [2:02:40<33:10:54,  8.04it/s]

{'loss': Array(0.37487736, dtype=float32), 'loss_reward': Array(0.00354688, dtype=float32), 'loss_cross_entropy': Array(0.37133047, dtype=float32)}


  4%|▍         | 39660/1000000 [2:02:41<27:14:20,  9.79it/s]

{'loss': Array(0.37222362, dtype=float32), 'loss_reward': Array(0.00348723, dtype=float32), 'loss_cross_entropy': Array(0.36873642, dtype=float32)}


  4%|▍         | 39668/1000000 [2:02:43<37:10:02,  7.18it/s]

{'loss': Array(0.3675656, dtype=float32), 'loss_reward': Array(0.00355268, dtype=float32), 'loss_cross_entropy': Array(0.36401293, dtype=float32)}


  4%|▍         | 39680/1000000 [2:02:45<28:32:34,  9.35it/s]

{'loss': Array(0.36922178, dtype=float32), 'loss_reward': Array(0.00349801, dtype=float32), 'loss_cross_entropy': Array(0.3657238, dtype=float32)}


  4%|▍         | 39688/1000000 [2:02:46<34:19:47,  7.77it/s]

{'loss': Array(0.36258668, dtype=float32), 'loss_reward': Array(0.00350118, dtype=float32), 'loss_cross_entropy': Array(0.3590855, dtype=float32)}


  4%|▍         | 39699/1000000 [2:02:48<39:33:08,  6.74it/s]

{'loss': Array(0.37421945, dtype=float32), 'loss_reward': Array(0.00359965, dtype=float32), 'loss_cross_entropy': Array(0.37061977, dtype=float32)}


  4%|▍         | 39710/1000000 [2:02:50<29:05:38,  9.17it/s]

{'loss': Array(0.37270102, dtype=float32), 'loss_reward': Array(0.00350343, dtype=float32), 'loss_cross_entropy': Array(0.36919752, dtype=float32)}


  4%|▍         | 39718/1000000 [2:02:51<33:22:45,  7.99it/s]

{'loss': Array(0.37276423, dtype=float32), 'loss_reward': Array(0.00354924, dtype=float32), 'loss_cross_entropy': Array(0.36921498, dtype=float32)}


  4%|▍         | 39729/1000000 [2:02:53<29:00:16,  9.20it/s]

{'loss': Array(0.37015504, dtype=float32), 'loss_reward': Array(0.00357485, dtype=float32), 'loss_cross_entropy': Array(0.3665802, dtype=float32)}


  4%|▍         | 39740/1000000 [2:02:55<33:46:41,  7.90it/s]

{'loss': Array(0.36891824, dtype=float32), 'loss_reward': Array(0.00348344, dtype=float32), 'loss_cross_entropy': Array(0.36543483, dtype=float32)}


  4%|▍         | 39748/1000000 [2:02:56<34:10:41,  7.80it/s]

{'loss': Array(0.37136003, dtype=float32), 'loss_reward': Array(0.00360842, dtype=float32), 'loss_cross_entropy': Array(0.36775163, dtype=float32)}


  4%|▍         | 39760/1000000 [2:02:58<26:43:14,  9.98it/s]

{'loss': Array(0.35508755, dtype=float32), 'loss_reward': Array(0.00349606, dtype=float32), 'loss_cross_entropy': Array(0.3515915, dtype=float32)}


  4%|▍         | 39770/1000000 [2:03:00<27:06:10,  9.84it/s]

{'loss': Array(0.36764774, dtype=float32), 'loss_reward': Array(0.00350643, dtype=float32), 'loss_cross_entropy': Array(0.36414137, dtype=float32)}


  4%|▍         | 39780/1000000 [2:03:01<34:29:29,  7.73it/s]

{'loss': Array(0.3673927, dtype=float32), 'loss_reward': Array(0.00351084, dtype=float32), 'loss_cross_entropy': Array(0.3638819, dtype=float32)}


  4%|▍         | 39788/1000000 [2:03:03<35:04:23,  7.60it/s]

{'loss': Array(0.38178495, dtype=float32), 'loss_reward': Array(0.00365921, dtype=float32), 'loss_cross_entropy': Array(0.37812573, dtype=float32)}


  4%|▍         | 39800/1000000 [2:03:05<27:26:22,  9.72it/s]

{'loss': Array(0.36232385, dtype=float32), 'loss_reward': Array(0.00348694, dtype=float32), 'loss_cross_entropy': Array(0.3588369, dtype=float32)}


  4%|▍         | 39808/1000000 [2:03:06<33:20:25,  8.00it/s]

{'loss': Array(0.36749926, dtype=float32), 'loss_reward': Array(0.00345489, dtype=float32), 'loss_cross_entropy': Array(0.36404437, dtype=float32)}


  4%|▍         | 39818/1000000 [2:03:08<34:54:37,  7.64it/s]

{'loss': Array(0.36822143, dtype=float32), 'loss_reward': Array(0.00364743, dtype=float32), 'loss_cross_entropy': Array(0.364574, dtype=float32)}


  4%|▍         | 39830/1000000 [2:03:10<27:13:45,  9.80it/s]

{'loss': Array(0.38065535, dtype=float32), 'loss_reward': Array(0.0036461, dtype=float32), 'loss_cross_entropy': Array(0.37700927, dtype=float32)}


  4%|▍         | 39838/1000000 [2:03:11<33:17:16,  8.01it/s]

{'loss': Array(0.36981997, dtype=float32), 'loss_reward': Array(0.00360074, dtype=float32), 'loss_cross_entropy': Array(0.36621922, dtype=float32)}


  4%|▍         | 39850/1000000 [2:03:13<27:09:39,  9.82it/s]

{'loss': Array(0.36561176, dtype=float32), 'loss_reward': Array(0.00353924, dtype=float32), 'loss_cross_entropy': Array(0.36207256, dtype=float32)}


  4%|▍         | 39858/1000000 [2:03:15<37:47:32,  7.06it/s]

{'loss': Array(0.3645237, dtype=float32), 'loss_reward': Array(0.00359263, dtype=float32), 'loss_cross_entropy': Array(0.36093107, dtype=float32)}


  4%|▍         | 39870/1000000 [2:03:16<28:44:40,  9.28it/s]

{'loss': Array(0.36624885, dtype=float32), 'loss_reward': Array(0.00346228, dtype=float32), 'loss_cross_entropy': Array(0.36278662, dtype=float32)}


  4%|▍         | 39878/1000000 [2:03:18<33:46:03,  7.90it/s]

{'loss': Array(0.35921273, dtype=float32), 'loss_reward': Array(0.00339091, dtype=float32), 'loss_cross_entropy': Array(0.35582182, dtype=float32)}


  4%|▍         | 39889/1000000 [2:03:20<39:18:16,  6.79it/s]

{'loss': Array(0.3611978, dtype=float32), 'loss_reward': Array(0.00363098, dtype=float32), 'loss_cross_entropy': Array(0.35756683, dtype=float32)}


  4%|▍         | 39899/1000000 [2:03:21<30:12:23,  8.83it/s]

{'loss': Array(0.36311805, dtype=float32), 'loss_reward': Array(0.00344937, dtype=float32), 'loss_cross_entropy': Array(0.3596687, dtype=float32)}


  4%|▍         | 39909/1000000 [2:03:23<28:50:26,  9.25it/s]

{'loss': Array(0.36779198, dtype=float32), 'loss_reward': Array(0.00351881, dtype=float32), 'loss_cross_entropy': Array(0.36427316, dtype=float32)}


  4%|▍         | 39920/1000000 [2:03:24<27:25:00,  9.73it/s]

{'loss': Array(0.3580823, dtype=float32), 'loss_reward': Array(0.00348351, dtype=float32), 'loss_cross_entropy': Array(0.35459882, dtype=float32)}


  4%|▍         | 39930/1000000 [2:03:26<34:03:21,  7.83it/s]

{'loss': Array(0.36181697, dtype=float32), 'loss_reward': Array(0.00337968, dtype=float32), 'loss_cross_entropy': Array(0.35843727, dtype=float32)}


  4%|▍         | 39938/1000000 [2:03:28<35:31:50,  7.51it/s]

{'loss': Array(0.37279853, dtype=float32), 'loss_reward': Array(0.00370695, dtype=float32), 'loss_cross_entropy': Array(0.3690916, dtype=float32)}


  4%|▍         | 39950/1000000 [2:03:29<27:32:09,  9.68it/s]

{'loss': Array(0.37322888, dtype=float32), 'loss_reward': Array(0.00357336, dtype=float32), 'loss_cross_entropy': Array(0.36965552, dtype=float32)}


  4%|▍         | 39958/1000000 [2:03:31<32:27:04,  8.22it/s]

{'loss': Array(0.357895, dtype=float32), 'loss_reward': Array(0.00352208, dtype=float32), 'loss_cross_entropy': Array(0.3543729, dtype=float32)}


  4%|▍         | 39969/1000000 [2:03:33<34:58:52,  7.62it/s]

{'loss': Array(0.3701329, dtype=float32), 'loss_reward': Array(0.0035336, dtype=float32), 'loss_cross_entropy': Array(0.36659932, dtype=float32)}


  4%|▍         | 39979/1000000 [2:03:34<28:48:33,  9.26it/s]

{'loss': Array(0.36315298, dtype=float32), 'loss_reward': Array(0.00340779, dtype=float32), 'loss_cross_entropy': Array(0.35974517, dtype=float32)}


  4%|▍         | 39990/1000000 [2:03:36<26:19:37, 10.13it/s]

{'loss': Array(0.36019167, dtype=float32), 'loss_reward': Array(0.0035506, dtype=float32), 'loss_cross_entropy': Array(0.35664105, dtype=float32)}


  4%|▍         | 39998/1000000 [2:03:37<32:15:55,  8.26it/s]

{'loss': Array(0.36580977, dtype=float32), 'loss_reward': Array(0.00347241, dtype=float32), 'loss_cross_entropy': Array(0.36233732, dtype=float32)}


  4%|▍         | 40010/1000000 [2:03:49<123:53:03,  2.15it/s]

{'loss': Array(0.36107814, dtype=float32), 'loss_reward': Array(0.00357781, dtype=float32), 'loss_cross_entropy': Array(0.35750034, dtype=float32)}


  4%|▍         | 40018/1000000 [2:03:51<68:14:11,  3.91it/s] 

{'loss': Array(0.35802913, dtype=float32), 'loss_reward': Array(0.00339827, dtype=float32), 'loss_cross_entropy': Array(0.35463086, dtype=float32)}


  4%|▍         | 40029/1000000 [2:03:52<36:30:29,  7.30it/s]

{'loss': Array(0.35776913, dtype=float32), 'loss_reward': Array(0.00345505, dtype=float32), 'loss_cross_entropy': Array(0.35431406, dtype=float32)}


  4%|▍         | 40040/1000000 [2:03:54<27:54:13,  9.56it/s]

{'loss': Array(0.3630359, dtype=float32), 'loss_reward': Array(0.00342225, dtype=float32), 'loss_cross_entropy': Array(0.35961366, dtype=float32)}


  4%|▍         | 40050/1000000 [2:03:56<31:39:48,  8.42it/s]

{'loss': Array(0.36765575, dtype=float32), 'loss_reward': Array(0.00368879, dtype=float32), 'loss_cross_entropy': Array(0.36396694, dtype=float32)}


  4%|▍         | 40060/1000000 [2:03:57<29:33:36,  9.02it/s]

{'loss': Array(0.3600364, dtype=float32), 'loss_reward': Array(0.00346459, dtype=float32), 'loss_cross_entropy': Array(0.35657182, dtype=float32)}


  4%|▍         | 40068/1000000 [2:03:59<33:56:19,  7.86it/s]

{'loss': Array(0.3719915, dtype=float32), 'loss_reward': Array(0.00364443, dtype=float32), 'loss_cross_entropy': Array(0.36834708, dtype=float32)}


  4%|▍         | 40079/1000000 [2:04:01<39:52:02,  6.69it/s]

{'loss': Array(0.37138265, dtype=float32), 'loss_reward': Array(0.00345889, dtype=float32), 'loss_cross_entropy': Array(0.36792374, dtype=float32)}


  4%|▍         | 40089/1000000 [2:04:02<30:51:11,  8.64it/s]

{'loss': Array(0.3712364, dtype=float32), 'loss_reward': Array(0.0034195, dtype=float32), 'loss_cross_entropy': Array(0.36781687, dtype=float32)}


  4%|▍         | 40100/1000000 [2:04:04<27:25:20,  9.72it/s]

{'loss': Array(0.36480728, dtype=float32), 'loss_reward': Array(0.00353024, dtype=float32), 'loss_cross_entropy': Array(0.36127707, dtype=float32)}


  4%|▍         | 40108/1000000 [2:04:06<34:26:05,  7.74it/s]

{'loss': Array(0.36237293, dtype=float32), 'loss_reward': Array(0.00340245, dtype=float32), 'loss_cross_entropy': Array(0.3589705, dtype=float32)}


  4%|▍         | 40120/1000000 [2:04:07<33:38:13,  7.93it/s]

{'loss': Array(0.3677249, dtype=float32), 'loss_reward': Array(0.00343616, dtype=float32), 'loss_cross_entropy': Array(0.36428875, dtype=float32)}


  4%|▍         | 40128/1000000 [2:04:09<34:56:28,  7.63it/s]

{'loss': Array(0.3616074, dtype=float32), 'loss_reward': Array(0.00341599, dtype=float32), 'loss_cross_entropy': Array(0.3581914, dtype=float32)}


  4%|▍         | 40140/1000000 [2:04:11<27:30:22,  9.69it/s]

{'loss': Array(0.37149832, dtype=float32), 'loss_reward': Array(0.00350728, dtype=float32), 'loss_cross_entropy': Array(0.36799103, dtype=float32)}


  4%|▍         | 40150/1000000 [2:04:12<27:17:26,  9.77it/s]

{'loss': Array(0.3671731, dtype=float32), 'loss_reward': Array(0.00347591, dtype=float32), 'loss_cross_entropy': Array(0.36369714, dtype=float32)}


  4%|▍         | 40158/1000000 [2:04:14<41:22:55,  6.44it/s]

{'loss': Array(0.3639303, dtype=float32), 'loss_reward': Array(0.00354914, dtype=float32), 'loss_cross_entropy': Array(0.3603812, dtype=float32)}


  4%|▍         | 40169/1000000 [2:04:16<30:22:08,  8.78it/s]

{'loss': Array(0.35882103, dtype=float32), 'loss_reward': Array(0.00344475, dtype=float32), 'loss_cross_entropy': Array(0.35537627, dtype=float32)}


  4%|▍         | 40180/1000000 [2:04:17<27:01:04,  9.87it/s]

{'loss': Array(0.3611688, dtype=float32), 'loss_reward': Array(0.00344318, dtype=float32), 'loss_cross_entropy': Array(0.3577256, dtype=float32)}


  4%|▍         | 40188/1000000 [2:04:19<32:54:18,  8.10it/s]

{'loss': Array(0.36887679, dtype=float32), 'loss_reward': Array(0.00349879, dtype=float32), 'loss_cross_entropy': Array(0.365378, dtype=float32)}


  4%|▍         | 40199/1000000 [2:04:21<32:59:41,  8.08it/s]

{'loss': Array(0.35807398, dtype=float32), 'loss_reward': Array(0.00350057, dtype=float32), 'loss_cross_entropy': Array(0.35457343, dtype=float32)}


  4%|▍         | 40209/1000000 [2:04:22<28:53:21,  9.23it/s]

{'loss': Array(0.36442578, dtype=float32), 'loss_reward': Array(0.00357605, dtype=float32), 'loss_cross_entropy': Array(0.3608497, dtype=float32)}


  4%|▍         | 40220/1000000 [2:04:24<26:57:19,  9.89it/s]

{'loss': Array(0.36277565, dtype=float32), 'loss_reward': Array(0.00344306, dtype=float32), 'loss_cross_entropy': Array(0.35933256, dtype=float32)}


  4%|▍         | 40228/1000000 [2:04:25<32:57:41,  8.09it/s]

{'loss': Array(0.36111137, dtype=float32), 'loss_reward': Array(0.00350841, dtype=float32), 'loss_cross_entropy': Array(0.35760292, dtype=float32)}


  4%|▍         | 40239/1000000 [2:04:27<32:48:10,  8.13it/s]

{'loss': Array(0.36184752, dtype=float32), 'loss_reward': Array(0.0035999, dtype=float32), 'loss_cross_entropy': Array(0.35824764, dtype=float32)}


  4%|▍         | 40249/1000000 [2:04:29<29:46:35,  8.95it/s]

{'loss': Array(0.36427996, dtype=float32), 'loss_reward': Array(0.00357753, dtype=float32), 'loss_cross_entropy': Array(0.36070243, dtype=float32)}


  4%|▍         | 40259/1000000 [2:04:30<28:49:31,  9.25it/s]

{'loss': Array(0.3688352, dtype=float32), 'loss_reward': Array(0.0036266, dtype=float32), 'loss_cross_entropy': Array(0.3652086, dtype=float32)}


  4%|▍         | 40269/1000000 [2:04:32<40:23:23,  6.60it/s]

{'loss': Array(0.36608702, dtype=float32), 'loss_reward': Array(0.00349599, dtype=float32), 'loss_cross_entropy': Array(0.362591, dtype=float32)}


  4%|▍         | 40279/1000000 [2:04:34<31:01:18,  8.59it/s]

{'loss': Array(0.36057118, dtype=float32), 'loss_reward': Array(0.00346958, dtype=float32), 'loss_cross_entropy': Array(0.3571016, dtype=float32)}


  4%|▍         | 40289/1000000 [2:04:36<28:39:21,  9.30it/s]

{'loss': Array(0.35904005, dtype=float32), 'loss_reward': Array(0.00354792, dtype=float32), 'loss_cross_entropy': Array(0.35549217, dtype=float32)}


  4%|▍         | 40299/1000000 [2:04:37<28:42:17,  9.29it/s]

{'loss': Array(0.36276636, dtype=float32), 'loss_reward': Array(0.00361228, dtype=float32), 'loss_cross_entropy': Array(0.35915408, dtype=float32)}


  4%|▍         | 40310/1000000 [2:04:39<34:30:22,  7.73it/s]

{'loss': Array(0.3642685, dtype=float32), 'loss_reward': Array(0.00354348, dtype=float32), 'loss_cross_entropy': Array(0.36072505, dtype=float32)}


  4%|▍         | 40320/1000000 [2:04:41<29:48:11,  8.94it/s]

{'loss': Array(0.36039397, dtype=float32), 'loss_reward': Array(0.00349485, dtype=float32), 'loss_cross_entropy': Array(0.35689908, dtype=float32)}


  4%|▍         | 40328/1000000 [2:04:42<32:59:39,  8.08it/s]

{'loss': Array(0.36285385, dtype=float32), 'loss_reward': Array(0.00347956, dtype=float32), 'loss_cross_entropy': Array(0.35937425, dtype=float32)}


  4%|▍         | 40340/1000000 [2:04:44<26:57:12,  9.89it/s]

{'loss': Array(0.35990733, dtype=float32), 'loss_reward': Array(0.00347558, dtype=float32), 'loss_cross_entropy': Array(0.3564317, dtype=float32)}


  4%|▍         | 40349/1000000 [2:04:46<36:29:10,  7.31it/s]

{'loss': Array(0.3578858, dtype=float32), 'loss_reward': Array(0.00341301, dtype=float32), 'loss_cross_entropy': Array(0.35447282, dtype=float32)}


  4%|▍         | 40359/1000000 [2:04:47<30:33:22,  8.72it/s]

{'loss': Array(0.3598593, dtype=float32), 'loss_reward': Array(0.0034198, dtype=float32), 'loss_cross_entropy': Array(0.3564395, dtype=float32)}


  4%|▍         | 40370/1000000 [2:04:49<26:59:36,  9.88it/s]

{'loss': Array(0.36021203, dtype=float32), 'loss_reward': Array(0.00347408, dtype=float32), 'loss_cross_entropy': Array(0.3567379, dtype=float32)}


  4%|▍         | 40378/1000000 [2:04:50<32:59:45,  8.08it/s]

{'loss': Array(0.36179897, dtype=float32), 'loss_reward': Array(0.00350323, dtype=float32), 'loss_cross_entropy': Array(0.35829568, dtype=float32)}


  4%|▍         | 40389/1000000 [2:04:52<33:05:34,  8.05it/s]

{'loss': Array(0.3585082, dtype=float32), 'loss_reward': Array(0.00346228, dtype=float32), 'loss_cross_entropy': Array(0.3550459, dtype=float32)}


  4%|▍         | 40399/1000000 [2:04:54<28:59:27,  9.19it/s]

{'loss': Array(0.34650308, dtype=float32), 'loss_reward': Array(0.00337849, dtype=float32), 'loss_cross_entropy': Array(0.34312457, dtype=float32)}


  4%|▍         | 40409/1000000 [2:04:55<27:38:21,  9.64it/s]

{'loss': Array(0.35202637, dtype=float32), 'loss_reward': Array(0.00337556, dtype=float32), 'loss_cross_entropy': Array(0.34865078, dtype=float32)}


  4%|▍         | 40419/1000000 [2:04:57<29:10:18,  9.14it/s]

{'loss': Array(0.36530992, dtype=float32), 'loss_reward': Array(0.00363909, dtype=float32), 'loss_cross_entropy': Array(0.36167082, dtype=float32)}


  4%|▍         | 40429/1000000 [2:04:59<32:59:15,  8.08it/s]

{'loss': Array(0.35550725, dtype=float32), 'loss_reward': Array(0.00337025, dtype=float32), 'loss_cross_entropy': Array(0.35213703, dtype=float32)}


  4%|▍         | 40440/1000000 [2:05:01<27:51:56,  9.57it/s]

{'loss': Array(0.3557284, dtype=float32), 'loss_reward': Array(0.00350016, dtype=float32), 'loss_cross_entropy': Array(0.35222828, dtype=float32)}


  4%|▍         | 40450/1000000 [2:05:02<28:42:07,  9.29it/s]

{'loss': Array(0.36424372, dtype=float32), 'loss_reward': Array(0.00347848, dtype=float32), 'loss_cross_entropy': Array(0.3607653, dtype=float32)}


  4%|▍         | 40459/1000000 [2:05:04<42:00:09,  6.35it/s]

{'loss': Array(0.35755646, dtype=float32), 'loss_reward': Array(0.00346132, dtype=float32), 'loss_cross_entropy': Array(0.3540951, dtype=float32)}


  4%|▍         | 40470/1000000 [2:05:06<27:36:11,  9.66it/s]

{'loss': Array(0.36092347, dtype=float32), 'loss_reward': Array(0.00335147, dtype=float32), 'loss_cross_entropy': Array(0.357572, dtype=float32)}


  4%|▍         | 40478/1000000 [2:05:07<33:56:24,  7.85it/s]

{'loss': Array(0.36017787, dtype=float32), 'loss_reward': Array(0.00338897, dtype=float32), 'loss_cross_entropy': Array(0.35678887, dtype=float32)}


  4%|▍         | 40490/1000000 [2:05:09<27:06:44,  9.83it/s]

{'loss': Array(0.3622553, dtype=float32), 'loss_reward': Array(0.0034785, dtype=float32), 'loss_cross_entropy': Array(0.3587768, dtype=float32)}


  4%|▍         | 40500/1000000 [2:05:11<36:15:36,  7.35it/s]

{'loss': Array(0.36053815, dtype=float32), 'loss_reward': Array(0.00348608, dtype=float32), 'loss_cross_entropy': Array(0.3570521, dtype=float32)}


  4%|▍         | 40508/1000000 [2:05:22<182:15:06,  1.46it/s]

{'loss': Array(0.36174768, dtype=float32), 'loss_reward': Array(0.00337329, dtype=float32), 'loss_cross_entropy': Array(0.35837436, dtype=float32)}


  4%|▍         | 40518/1000000 [2:05:24<70:42:52,  3.77it/s] 

{'loss': Array(0.36035615, dtype=float32), 'loss_reward': Array(0.00349387, dtype=float32), 'loss_cross_entropy': Array(0.35686228, dtype=float32)}


  4%|▍         | 40528/1000000 [2:05:26<40:47:38,  6.53it/s]

{'loss': Array(0.36739707, dtype=float32), 'loss_reward': Array(0.00351001, dtype=float32), 'loss_cross_entropy': Array(0.36388707, dtype=float32)}


  4%|▍         | 40539/1000000 [2:05:28<37:38:59,  7.08it/s]

{'loss': Array(0.35994837, dtype=float32), 'loss_reward': Array(0.00355602, dtype=float32), 'loss_cross_entropy': Array(0.35639235, dtype=float32)}


  4%|▍         | 40549/1000000 [2:05:29<30:43:08,  8.68it/s]

{'loss': Array(0.36488977, dtype=float32), 'loss_reward': Array(0.00357038, dtype=float32), 'loss_cross_entropy': Array(0.3613194, dtype=float32)}


  4%|▍         | 40560/1000000 [2:05:31<27:00:22,  9.87it/s]

{'loss': Array(0.36860707, dtype=float32), 'loss_reward': Array(0.00359553, dtype=float32), 'loss_cross_entropy': Array(0.36501154, dtype=float32)}


  4%|▍         | 40570/1000000 [2:05:33<29:00:09,  9.19it/s]

{'loss': Array(0.36068407, dtype=float32), 'loss_reward': Array(0.00343409, dtype=float32), 'loss_cross_entropy': Array(0.35725003, dtype=float32)}


  4%|▍         | 40578/1000000 [2:05:35<39:11:39,  6.80it/s]

{'loss': Array(0.35939217, dtype=float32), 'loss_reward': Array(0.00338949, dtype=float32), 'loss_cross_entropy': Array(0.3560027, dtype=float32)}


  4%|▍         | 40589/1000000 [2:05:36<29:15:20,  9.11it/s]

{'loss': Array(0.35295025, dtype=float32), 'loss_reward': Array(0.00330656, dtype=float32), 'loss_cross_entropy': Array(0.34964368, dtype=float32)}


  4%|▍         | 40599/1000000 [2:05:38<28:51:51,  9.23it/s]

{'loss': Array(0.36351094, dtype=float32), 'loss_reward': Array(0.00349349, dtype=float32), 'loss_cross_entropy': Array(0.3600175, dtype=float32)}


  4%|▍         | 40609/1000000 [2:05:39<28:51:45,  9.23it/s]

{'loss': Array(0.3479334, dtype=float32), 'loss_reward': Array(0.00347761, dtype=float32), 'loss_cross_entropy': Array(0.34445578, dtype=float32)}


  4%|▍         | 40620/1000000 [2:05:41<27:44:19,  9.61it/s]

{'loss': Array(0.35612413, dtype=float32), 'loss_reward': Array(0.00348678, dtype=float32), 'loss_cross_entropy': Array(0.35263735, dtype=float32)}


  4%|▍         | 40630/1000000 [2:05:43<28:49:17,  9.25it/s]

{'loss': Array(0.35313776, dtype=float32), 'loss_reward': Array(0.00364723, dtype=float32), 'loss_cross_entropy': Array(0.34949055, dtype=float32)}


  4%|▍         | 40640/1000000 [2:05:45<28:33:31,  9.33it/s]

{'loss': Array(0.35568184, dtype=float32), 'loss_reward': Array(0.00350861, dtype=float32), 'loss_cross_entropy': Array(0.35217318, dtype=float32)}


  4%|▍         | 40649/1000000 [2:05:47<40:56:22,  6.51it/s]

{'loss': Array(0.35769883, dtype=float32), 'loss_reward': Array(0.00353693, dtype=float32), 'loss_cross_entropy': Array(0.35416195, dtype=float32)}


  4%|▍         | 40660/1000000 [2:05:48<28:20:08,  9.40it/s]

{'loss': Array(0.3634686, dtype=float32), 'loss_reward': Array(0.00355742, dtype=float32), 'loss_cross_entropy': Array(0.35991117, dtype=float32)}


  4%|▍         | 40670/1000000 [2:05:50<29:04:15,  9.17it/s]

{'loss': Array(0.35559794, dtype=float32), 'loss_reward': Array(0.00351058, dtype=float32), 'loss_cross_entropy': Array(0.35208738, dtype=float32)}


  4%|▍         | 40680/1000000 [2:05:51<27:15:05,  9.78it/s]

{'loss': Array(0.3606294, dtype=float32), 'loss_reward': Array(0.00343409, dtype=float32), 'loss_cross_entropy': Array(0.35719535, dtype=float32)}


  4%|▍         | 40690/1000000 [2:05:53<36:09:31,  7.37it/s]

{'loss': Array(0.35443515, dtype=float32), 'loss_reward': Array(0.00342584, dtype=float32), 'loss_cross_entropy': Array(0.35100934, dtype=float32)}


  4%|▍         | 40699/1000000 [2:05:55<33:11:22,  8.03it/s]

{'loss': Array(0.36121178, dtype=float32), 'loss_reward': Array(0.00343062, dtype=float32), 'loss_cross_entropy': Array(0.3577812, dtype=float32)}


  4%|▍         | 40709/1000000 [2:05:57<29:23:29,  9.07it/s]

{'loss': Array(0.35423315, dtype=float32), 'loss_reward': Array(0.00346042, dtype=float32), 'loss_cross_entropy': Array(0.3507727, dtype=float32)}


  4%|▍         | 40718/1000000 [2:05:58<29:43:11,  8.97it/s]

{'loss': Array(0.36007187, dtype=float32), 'loss_reward': Array(0.00352345, dtype=float32), 'loss_cross_entropy': Array(0.3565484, dtype=float32)}


  4%|▍         | 40730/1000000 [2:06:00<31:55:59,  8.34it/s]

{'loss': Array(0.35104284, dtype=float32), 'loss_reward': Array(0.00343108, dtype=float32), 'loss_cross_entropy': Array(0.34761176, dtype=float32)}


  4%|▍         | 40738/1000000 [2:06:02<35:53:25,  7.42it/s]

{'loss': Array(0.3551707, dtype=float32), 'loss_reward': Array(0.00342122, dtype=float32), 'loss_cross_entropy': Array(0.3517495, dtype=float32)}


  4%|▍         | 40750/1000000 [2:06:03<29:19:05,  9.09it/s]

{'loss': Array(0.3513239, dtype=float32), 'loss_reward': Array(0.00345631, dtype=float32), 'loss_cross_entropy': Array(0.34786758, dtype=float32)}


  4%|▍         | 40760/1000000 [2:06:05<29:14:30,  9.11it/s]

{'loss': Array(0.34976512, dtype=float32), 'loss_reward': Array(0.00341797, dtype=float32), 'loss_cross_entropy': Array(0.34634718, dtype=float32)}


  4%|▍         | 40769/1000000 [2:06:07<36:27:19,  7.31it/s]

{'loss': Array(0.3567846, dtype=float32), 'loss_reward': Array(0.00345037, dtype=float32), 'loss_cross_entropy': Array(0.35333425, dtype=float32)}


  4%|▍         | 40779/1000000 [2:06:09<30:55:59,  8.61it/s]

{'loss': Array(0.35448912, dtype=float32), 'loss_reward': Array(0.00344545, dtype=float32), 'loss_cross_entropy': Array(0.35104367, dtype=float32)}


  4%|▍         | 40789/1000000 [2:06:10<27:38:17,  9.64it/s]

{'loss': Array(0.36039692, dtype=float32), 'loss_reward': Array(0.00356721, dtype=float32), 'loss_cross_entropy': Array(0.3568297, dtype=float32)}


  4%|▍         | 40799/1000000 [2:06:12<28:24:17,  9.38it/s]

{'loss': Array(0.36597368, dtype=float32), 'loss_reward': Array(0.00356541, dtype=float32), 'loss_cross_entropy': Array(0.3624083, dtype=float32)}


  4%|▍         | 40808/1000000 [2:06:14<34:11:11,  7.79it/s]

{'loss': Array(0.35356545, dtype=float32), 'loss_reward': Array(0.00347095, dtype=float32), 'loss_cross_entropy': Array(0.35009447, dtype=float32)}


  4%|▍         | 40820/1000000 [2:06:15<27:21:50,  9.74it/s]

{'loss': Array(0.35648185, dtype=float32), 'loss_reward': Array(0.00347069, dtype=float32), 'loss_cross_entropy': Array(0.3530112, dtype=float32)}


  4%|▍         | 40829/1000000 [2:06:17<30:09:52,  8.83it/s]

{'loss': Array(0.3534786, dtype=float32), 'loss_reward': Array(0.00360975, dtype=float32), 'loss_cross_entropy': Array(0.3498688, dtype=float32)}


  4%|▍         | 40840/1000000 [2:06:19<38:05:01,  7.00it/s]

{'loss': Array(0.35575628, dtype=float32), 'loss_reward': Array(0.00347312, dtype=float32), 'loss_cross_entropy': Array(0.35228318, dtype=float32)}


  4%|▍         | 40848/1000000 [2:06:21<35:00:54,  7.61it/s]

{'loss': Array(0.3504374, dtype=float32), 'loss_reward': Array(0.00337954, dtype=float32), 'loss_cross_entropy': Array(0.3470579, dtype=float32)}


  4%|▍         | 40860/1000000 [2:06:22<25:59:01, 10.25it/s]

{'loss': Array(0.3579963, dtype=float32), 'loss_reward': Array(0.00342602, dtype=float32), 'loss_cross_entropy': Array(0.3545703, dtype=float32)}


  4%|▍         | 40868/1000000 [2:06:24<33:30:05,  7.95it/s]

{'loss': Array(0.34403455, dtype=float32), 'loss_reward': Array(0.00339247, dtype=float32), 'loss_cross_entropy': Array(0.34064206, dtype=float32)}


  4%|▍         | 40880/1000000 [2:06:26<33:20:33,  7.99it/s]

{'loss': Array(0.35541645, dtype=float32), 'loss_reward': Array(0.00340489, dtype=float32), 'loss_cross_entropy': Array(0.35201153, dtype=float32)}


  4%|▍         | 40889/1000000 [2:06:27<32:01:28,  8.32it/s]

{'loss': Array(0.35787913, dtype=float32), 'loss_reward': Array(0.0034504, dtype=float32), 'loss_cross_entropy': Array(0.35442874, dtype=float32)}


  4%|▍         | 40899/1000000 [2:06:29<28:35:38,  9.32it/s]

{'loss': Array(0.35008353, dtype=float32), 'loss_reward': Array(0.00329617, dtype=float32), 'loss_cross_entropy': Array(0.3467874, dtype=float32)}


  4%|▍         | 40910/1000000 [2:06:31<27:10:52,  9.80it/s]

{'loss': Array(0.3457133, dtype=float32), 'loss_reward': Array(0.00344401, dtype=float32), 'loss_cross_entropy': Array(0.34226927, dtype=float32)}


  4%|▍         | 40920/1000000 [2:06:33<35:26:55,  7.52it/s]

{'loss': Array(0.35168406, dtype=float32), 'loss_reward': Array(0.00341081, dtype=float32), 'loss_cross_entropy': Array(0.34827325, dtype=float32)}


  4%|▍         | 40929/1000000 [2:06:34<32:47:58,  8.12it/s]

{'loss': Array(0.35226277, dtype=float32), 'loss_reward': Array(0.00355559, dtype=float32), 'loss_cross_entropy': Array(0.34870717, dtype=float32)}


  4%|▍         | 40939/1000000 [2:06:36<28:57:31,  9.20it/s]

{'loss': Array(0.34965715, dtype=float32), 'loss_reward': Array(0.0033993, dtype=float32), 'loss_cross_entropy': Array(0.34625784, dtype=float32)}


  4%|▍         | 40950/1000000 [2:06:38<25:28:54, 10.45it/s]

{'loss': Array(0.35537976, dtype=float32), 'loss_reward': Array(0.00346924, dtype=float32), 'loss_cross_entropy': Array(0.35191047, dtype=float32)}


  4%|▍         | 40960/1000000 [2:06:40<32:41:57,  8.15it/s]

{'loss': Array(0.3635262, dtype=float32), 'loss_reward': Array(0.00344434, dtype=float32), 'loss_cross_entropy': Array(0.3600818, dtype=float32)}


  4%|▍         | 40970/1000000 [2:06:41<28:08:48,  9.46it/s]

{'loss': Array(0.35840067, dtype=float32), 'loss_reward': Array(0.00342817, dtype=float32), 'loss_cross_entropy': Array(0.3549725, dtype=float32)}


  4%|▍         | 40979/1000000 [2:06:43<31:15:16,  8.52it/s]

{'loss': Array(0.35154328, dtype=float32), 'loss_reward': Array(0.00351528, dtype=float32), 'loss_cross_entropy': Array(0.34802797, dtype=float32)}


  4%|▍         | 40989/1000000 [2:06:45<27:31:12,  9.68it/s]

{'loss': Array(0.34387872, dtype=float32), 'loss_reward': Array(0.00333582, dtype=float32), 'loss_cross_entropy': Array(0.3405429, dtype=float32)}


  4%|▍         | 40999/1000000 [2:06:47<33:15:04,  8.01it/s]

{'loss': Array(0.34816405, dtype=float32), 'loss_reward': Array(0.0033926, dtype=float32), 'loss_cross_entropy': Array(0.34477141, dtype=float32)}


  4%|▍         | 41009/1000000 [2:06:58<143:34:23,  1.86it/s]

{'loss': Array(0.35677072, dtype=float32), 'loss_reward': Array(0.0034203, dtype=float32), 'loss_cross_entropy': Array(0.35335046, dtype=float32)}


  4%|▍         | 41019/1000000 [2:07:00<51:01:48,  5.22it/s] 

{'loss': Array(0.36607704, dtype=float32), 'loss_reward': Array(0.0035071, dtype=float32), 'loss_cross_entropy': Array(0.36257, dtype=float32)}


  4%|▍         | 41029/1000000 [2:07:02<33:28:34,  7.96it/s]

{'loss': Array(0.3516702, dtype=float32), 'loss_reward': Array(0.00347361, dtype=float32), 'loss_cross_entropy': Array(0.3481966, dtype=float32)}


  4%|▍         | 41039/1000000 [2:07:04<33:06:39,  8.05it/s]

{'loss': Array(0.34938198, dtype=float32), 'loss_reward': Array(0.00337462, dtype=float32), 'loss_cross_entropy': Array(0.34600735, dtype=float32)}


  4%|▍         | 41050/1000000 [2:07:05<28:00:17,  9.51it/s]

{'loss': Array(0.34647056, dtype=float32), 'loss_reward': Array(0.00338562, dtype=float32), 'loss_cross_entropy': Array(0.34308493, dtype=float32)}


  4%|▍         | 41060/1000000 [2:07:07<27:03:12,  9.85it/s]

{'loss': Array(0.3550716, dtype=float32), 'loss_reward': Array(0.00351781, dtype=float32), 'loss_cross_entropy': Array(0.3515538, dtype=float32)}


  4%|▍         | 41069/1000000 [2:07:09<42:32:29,  6.26it/s]

{'loss': Array(0.35985893, dtype=float32), 'loss_reward': Array(0.003454, dtype=float32), 'loss_cross_entropy': Array(0.35640496, dtype=float32)}


  4%|▍         | 41080/1000000 [2:07:11<27:19:55,  9.75it/s]

{'loss': Array(0.366382, dtype=float32), 'loss_reward': Array(0.00349848, dtype=float32), 'loss_cross_entropy': Array(0.36288348, dtype=float32)}


  4%|▍         | 41089/1000000 [2:07:12<29:53:04,  8.91it/s]

{'loss': Array(0.35772672, dtype=float32), 'loss_reward': Array(0.00354193, dtype=float32), 'loss_cross_entropy': Array(0.35418478, dtype=float32)}


  4%|▍         | 41100/1000000 [2:07:14<26:32:55, 10.03it/s]

{'loss': Array(0.35233152, dtype=float32), 'loss_reward': Array(0.00350717, dtype=float32), 'loss_cross_entropy': Array(0.34882432, dtype=float32)}


  4%|▍         | 41110/1000000 [2:07:16<34:05:21,  7.81it/s]

{'loss': Array(0.35316724, dtype=float32), 'loss_reward': Array(0.00340179, dtype=float32), 'loss_cross_entropy': Array(0.34976545, dtype=float32)}


  4%|▍         | 41118/1000000 [2:07:17<35:52:18,  7.43it/s]

{'loss': Array(0.36154488, dtype=float32), 'loss_reward': Array(0.00355074, dtype=float32), 'loss_cross_entropy': Array(0.35799414, dtype=float32)}


  4%|▍         | 41129/1000000 [2:07:19<29:39:21,  8.98it/s]

{'loss': Array(0.3570334, dtype=float32), 'loss_reward': Array(0.00347715, dtype=float32), 'loss_cross_entropy': Array(0.35355625, dtype=float32)}


  4%|▍         | 41139/1000000 [2:07:21<28:22:39,  9.39it/s]

{'loss': Array(0.35169002, dtype=float32), 'loss_reward': Array(0.00340354, dtype=float32), 'loss_cross_entropy': Array(0.3482865, dtype=float32)}


  4%|▍         | 41149/1000000 [2:07:23<33:47:48,  7.88it/s]

{'loss': Array(0.34835753, dtype=float32), 'loss_reward': Array(0.00337178, dtype=float32), 'loss_cross_entropy': Array(0.34498578, dtype=float32)}


  4%|▍         | 41158/1000000 [2:07:24<31:09:14,  8.55it/s]

{'loss': Array(0.36313888, dtype=float32), 'loss_reward': Array(0.00351425, dtype=float32), 'loss_cross_entropy': Array(0.35962462, dtype=float32)}


  4%|▍         | 41170/1000000 [2:07:26<26:15:19, 10.14it/s]

{'loss': Array(0.34667596, dtype=float32), 'loss_reward': Array(0.00334917, dtype=float32), 'loss_cross_entropy': Array(0.3433268, dtype=float32)}


  4%|▍         | 41180/1000000 [2:07:28<39:08:17,  6.81it/s]

{'loss': Array(0.35472056, dtype=float32), 'loss_reward': Array(0.00340487, dtype=float32), 'loss_cross_entropy': Array(0.3513157, dtype=float32)}


  4%|▍         | 41190/1000000 [2:07:30<31:24:29,  8.48it/s]

{'loss': Array(0.35402992, dtype=float32), 'loss_reward': Array(0.00340417, dtype=float32), 'loss_cross_entropy': Array(0.35062575, dtype=float32)}


  4%|▍         | 41200/1000000 [2:07:31<29:11:25,  9.12it/s]

{'loss': Array(0.35882434, dtype=float32), 'loss_reward': Array(0.00339429, dtype=float32), 'loss_cross_entropy': Array(0.3554301, dtype=float32)}


  4%|▍         | 41210/1000000 [2:07:33<29:01:05,  9.18it/s]

{'loss': Array(0.36302802, dtype=float32), 'loss_reward': Array(0.00358055, dtype=float32), 'loss_cross_entropy': Array(0.35944745, dtype=float32)}


  4%|▍         | 41219/1000000 [2:07:35<41:55:09,  6.35it/s]

{'loss': Array(0.35080525, dtype=float32), 'loss_reward': Array(0.00340665, dtype=float32), 'loss_cross_entropy': Array(0.3473986, dtype=float32)}


  4%|▍         | 41228/1000000 [2:07:36<33:19:36,  7.99it/s]

{'loss': Array(0.35019735, dtype=float32), 'loss_reward': Array(0.00349439, dtype=float32), 'loss_cross_entropy': Array(0.34670293, dtype=float32)}


  4%|▍         | 41240/1000000 [2:07:38<25:14:33, 10.55it/s]

{'loss': Array(0.35273635, dtype=float32), 'loss_reward': Array(0.00340815, dtype=float32), 'loss_cross_entropy': Array(0.34932816, dtype=float32)}


  4%|▍         | 41250/1000000 [2:07:40<29:21:39,  9.07it/s]

{'loss': Array(0.36084148, dtype=float32), 'loss_reward': Array(0.00350291, dtype=float32), 'loss_cross_entropy': Array(0.3573386, dtype=float32)}


  4%|▍         | 41260/1000000 [2:07:42<33:50:20,  7.87it/s]

{'loss': Array(0.34635693, dtype=float32), 'loss_reward': Array(0.00336386, dtype=float32), 'loss_cross_entropy': Array(0.34299308, dtype=float32)}


  4%|▍         | 41269/1000000 [2:07:43<31:22:54,  8.49it/s]

{'loss': Array(0.35226575, dtype=float32), 'loss_reward': Array(0.00342902, dtype=float32), 'loss_cross_entropy': Array(0.34883672, dtype=float32)}


  4%|▍         | 41280/1000000 [2:07:45<27:49:29,  9.57it/s]

{'loss': Array(0.3586686, dtype=float32), 'loss_reward': Array(0.00345381, dtype=float32), 'loss_cross_entropy': Array(0.3552148, dtype=float32)}


  4%|▍         | 41288/1000000 [2:07:46<33:16:01,  8.01it/s]

{'loss': Array(0.34786323, dtype=float32), 'loss_reward': Array(0.00340913, dtype=float32), 'loss_cross_entropy': Array(0.3444541, dtype=float32)}


  4%|▍         | 41299/1000000 [2:07:48<33:47:49,  7.88it/s]

{'loss': Array(0.3495019, dtype=float32), 'loss_reward': Array(0.00343769, dtype=float32), 'loss_cross_entropy': Array(0.3460642, dtype=float32)}


  4%|▍         | 41309/1000000 [2:07:50<27:50:09,  9.57it/s]

{'loss': Array(0.34933084, dtype=float32), 'loss_reward': Array(0.00338591, dtype=float32), 'loss_cross_entropy': Array(0.3459449, dtype=float32)}


  4%|▍         | 41320/1000000 [2:07:52<25:48:11, 10.32it/s]

{'loss': Array(0.3457447, dtype=float32), 'loss_reward': Array(0.00347385, dtype=float32), 'loss_cross_entropy': Array(0.34227082, dtype=float32)}


  4%|▍         | 41328/1000000 [2:07:53<33:28:42,  7.95it/s]

{'loss': Array(0.34931198, dtype=float32), 'loss_reward': Array(0.00333759, dtype=float32), 'loss_cross_entropy': Array(0.34597442, dtype=float32)}


  4%|▍         | 41340/1000000 [2:07:55<30:07:20,  8.84it/s]

{'loss': Array(0.34829327, dtype=float32), 'loss_reward': Array(0.00346338, dtype=float32), 'loss_cross_entropy': Array(0.3448299, dtype=float32)}


  4%|▍         | 41348/1000000 [2:07:57<34:23:09,  7.74it/s]

{'loss': Array(0.34381673, dtype=float32), 'loss_reward': Array(0.00331812, dtype=float32), 'loss_cross_entropy': Array(0.34049863, dtype=float32)}


  4%|▍         | 41359/1000000 [2:07:58<29:03:33,  9.16it/s]

{'loss': Array(0.35678515, dtype=float32), 'loss_reward': Array(0.00339438, dtype=float32), 'loss_cross_entropy': Array(0.35339078, dtype=float32)}


  4%|▍         | 41370/1000000 [2:08:00<38:32:08,  6.91it/s]

{'loss': Array(0.35615882, dtype=float32), 'loss_reward': Array(0.00351949, dtype=float32), 'loss_cross_entropy': Array(0.35263935, dtype=float32)}


  4%|▍         | 41380/1000000 [2:08:02<31:29:53,  8.45it/s]

{'loss': Array(0.34418422, dtype=float32), 'loss_reward': Array(0.00326323, dtype=float32), 'loss_cross_entropy': Array(0.34092095, dtype=float32)}


  4%|▍         | 41389/1000000 [2:08:04<30:42:35,  8.67it/s]

{'loss': Array(0.3542106, dtype=float32), 'loss_reward': Array(0.00345208, dtype=float32), 'loss_cross_entropy': Array(0.35075846, dtype=float32)}


  4%|▍         | 41400/1000000 [2:08:05<27:46:56,  9.58it/s]

{'loss': Array(0.35117504, dtype=float32), 'loss_reward': Array(0.00342498, dtype=float32), 'loss_cross_entropy': Array(0.34775007, dtype=float32)}


  4%|▍         | 41410/1000000 [2:08:07<35:20:19,  7.53it/s]

{'loss': Array(0.35266182, dtype=float32), 'loss_reward': Array(0.0034633, dtype=float32), 'loss_cross_entropy': Array(0.3491985, dtype=float32)}


  4%|▍         | 41419/1000000 [2:08:09<33:08:37,  8.03it/s]

{'loss': Array(0.3481118, dtype=float32), 'loss_reward': Array(0.00342255, dtype=float32), 'loss_cross_entropy': Array(0.34468925, dtype=float32)}


  4%|▍         | 41430/1000000 [2:08:11<28:48:29,  9.24it/s]

{'loss': Array(0.34504527, dtype=float32), 'loss_reward': Array(0.00333173, dtype=float32), 'loss_cross_entropy': Array(0.34171352, dtype=float32)}


  4%|▍         | 41440/1000000 [2:08:12<27:22:35,  9.73it/s]

{'loss': Array(0.34588122, dtype=float32), 'loss_reward': Array(0.00338824, dtype=float32), 'loss_cross_entropy': Array(0.34249297, dtype=float32)}


  4%|▍         | 41449/1000000 [2:08:14<38:11:16,  6.97it/s]

{'loss': Array(0.3458315, dtype=float32), 'loss_reward': Array(0.00342468, dtype=float32), 'loss_cross_entropy': Array(0.34240684, dtype=float32)}


  4%|▍         | 41460/1000000 [2:08:16<28:59:33,  9.18it/s]

{'loss': Array(0.34738508, dtype=float32), 'loss_reward': Array(0.00333745, dtype=float32), 'loss_cross_entropy': Array(0.34404764, dtype=float32)}


  4%|▍         | 41469/1000000 [2:08:18<31:53:56,  8.35it/s]

{'loss': Array(0.3476703, dtype=float32), 'loss_reward': Array(0.00344994, dtype=float32), 'loss_cross_entropy': Array(0.3442203, dtype=float32)}


  4%|▍         | 41480/1000000 [2:08:19<27:25:23,  9.71it/s]

{'loss': Array(0.34833115, dtype=float32), 'loss_reward': Array(0.00337064, dtype=float32), 'loss_cross_entropy': Array(0.34496054, dtype=float32)}


  4%|▍         | 41490/1000000 [2:08:21<34:47:10,  7.65it/s]

{'loss': Array(0.353031, dtype=float32), 'loss_reward': Array(0.0033815, dtype=float32), 'loss_cross_entropy': Array(0.34964952, dtype=float32)}


  4%|▍         | 41499/1000000 [2:08:23<32:46:29,  8.12it/s]

{'loss': Array(0.34020233, dtype=float32), 'loss_reward': Array(0.0034394, dtype=float32), 'loss_cross_entropy': Array(0.3367629, dtype=float32)}


  4%|▍         | 41509/1000000 [2:08:35<140:56:18,  1.89it/s]

{'loss': Array(0.34823146, dtype=float32), 'loss_reward': Array(0.00339197, dtype=float32), 'loss_cross_entropy': Array(0.34483948, dtype=float32)}


  4%|▍         | 41519/1000000 [2:08:36<56:38:33,  4.70it/s] 

{'loss': Array(0.34873185, dtype=float32), 'loss_reward': Array(0.00342651, dtype=float32), 'loss_cross_entropy': Array(0.3453053, dtype=float32)}


  4%|▍         | 41529/1000000 [2:08:38<39:43:05,  6.70it/s]

{'loss': Array(0.347646, dtype=float32), 'loss_reward': Array(0.0033904, dtype=float32), 'loss_cross_entropy': Array(0.34425557, dtype=float32)}


  4%|▍         | 41540/1000000 [2:08:40<29:10:42,  9.12it/s]

{'loss': Array(0.35399553, dtype=float32), 'loss_reward': Array(0.00344356, dtype=float32), 'loss_cross_entropy': Array(0.35055202, dtype=float32)}


  4%|▍         | 41549/1000000 [2:08:42<30:07:15,  8.84it/s]

{'loss': Array(0.34742612, dtype=float32), 'loss_reward': Array(0.00346933, dtype=float32), 'loss_cross_entropy': Array(0.3439568, dtype=float32)}


  4%|▍         | 41559/1000000 [2:08:44<40:58:59,  6.50it/s]

{'loss': Array(0.34556815, dtype=float32), 'loss_reward': Array(0.00334273, dtype=float32), 'loss_cross_entropy': Array(0.3422254, dtype=float32)}


  4%|▍         | 41569/1000000 [2:08:45<29:36:46,  8.99it/s]

{'loss': Array(0.35188487, dtype=float32), 'loss_reward': Array(0.00340494, dtype=float32), 'loss_cross_entropy': Array(0.34847996, dtype=float32)}


  4%|▍         | 41579/1000000 [2:08:47<28:54:59,  9.21it/s]

{'loss': Array(0.34456453, dtype=float32), 'loss_reward': Array(0.0034774, dtype=float32), 'loss_cross_entropy': Array(0.34108716, dtype=float32)}


  4%|▍         | 41589/1000000 [2:08:48<28:57:39,  9.19it/s]

{'loss': Array(0.3458687, dtype=float32), 'loss_reward': Array(0.00337502, dtype=float32), 'loss_cross_entropy': Array(0.34249374, dtype=float32)}


  4%|▍         | 41599/1000000 [2:08:50<40:16:03,  6.61it/s]

{'loss': Array(0.3420074, dtype=float32), 'loss_reward': Array(0.00325616, dtype=float32), 'loss_cross_entropy': Array(0.33875126, dtype=float32)}


  4%|▍         | 41609/1000000 [2:08:52<29:16:40,  9.09it/s]

{'loss': Array(0.35132292, dtype=float32), 'loss_reward': Array(0.00340627, dtype=float32), 'loss_cross_entropy': Array(0.3479166, dtype=float32)}


  4%|▍         | 41618/1000000 [2:08:54<31:31:23,  8.45it/s]

{'loss': Array(0.3493665, dtype=float32), 'loss_reward': Array(0.00336353, dtype=float32), 'loss_cross_entropy': Array(0.34600294, dtype=float32)}


  4%|▍         | 41630/1000000 [2:08:55<25:11:50, 10.57it/s]

{'loss': Array(0.34283853, dtype=float32), 'loss_reward': Array(0.00338049, dtype=float32), 'loss_cross_entropy': Array(0.33945802, dtype=float32)}


  4%|▍         | 41640/1000000 [2:08:57<33:51:52,  7.86it/s]

{'loss': Array(0.34163854, dtype=float32), 'loss_reward': Array(0.00343081, dtype=float32), 'loss_cross_entropy': Array(0.3382077, dtype=float32)}


  4%|▍         | 41648/1000000 [2:08:59<35:41:35,  7.46it/s]

{'loss': Array(0.34605378, dtype=float32), 'loss_reward': Array(0.00339347, dtype=float32), 'loss_cross_entropy': Array(0.34266034, dtype=float32)}


  4%|▍         | 41659/1000000 [2:09:00<28:55:11,  9.20it/s]

{'loss': Array(0.34028324, dtype=float32), 'loss_reward': Array(0.00326341, dtype=float32), 'loss_cross_entropy': Array(0.33701983, dtype=float32)}


  4%|▍         | 41669/1000000 [2:09:02<27:42:22,  9.61it/s]

{'loss': Array(0.34880874, dtype=float32), 'loss_reward': Array(0.00349861, dtype=float32), 'loss_cross_entropy': Array(0.34531018, dtype=float32)}


  4%|▍         | 41680/1000000 [2:09:04<30:16:56,  8.79it/s]

{'loss': Array(0.34502178, dtype=float32), 'loss_reward': Array(0.00336702, dtype=float32), 'loss_cross_entropy': Array(0.34165475, dtype=float32)}


  4%|▍         | 41690/1000000 [2:09:06<29:23:35,  9.06it/s]

{'loss': Array(0.3431027, dtype=float32), 'loss_reward': Array(0.00326112, dtype=float32), 'loss_cross_entropy': Array(0.33984157, dtype=float32)}


  4%|▍         | 41699/1000000 [2:09:07<31:26:07,  8.47it/s]

{'loss': Array(0.34828094, dtype=float32), 'loss_reward': Array(0.00320662, dtype=float32), 'loss_cross_entropy': Array(0.34507433, dtype=float32)}


  4%|▍         | 41709/1000000 [2:09:09<29:22:38,  9.06it/s]

{'loss': Array(0.3378656, dtype=float32), 'loss_reward': Array(0.00329792, dtype=float32), 'loss_cross_entropy': Array(0.33456767, dtype=float32)}


  4%|▍         | 41720/1000000 [2:09:11<31:14:39,  8.52it/s]

{'loss': Array(0.35065413, dtype=float32), 'loss_reward': Array(0.00351784, dtype=float32), 'loss_cross_entropy': Array(0.34713626, dtype=float32)}


  4%|▍         | 41730/1000000 [2:09:13<27:55:46,  9.53it/s]

{'loss': Array(0.35352433, dtype=float32), 'loss_reward': Array(0.0034714, dtype=float32), 'loss_cross_entropy': Array(0.35005292, dtype=float32)}


  4%|▍         | 41738/1000000 [2:09:14<34:06:12,  7.81it/s]

{'loss': Array(0.33924907, dtype=float32), 'loss_reward': Array(0.00322787, dtype=float32), 'loss_cross_entropy': Array(0.33602118, dtype=float32)}


  4%|▍         | 41749/1000000 [2:09:16<39:21:30,  6.76it/s]

{'loss': Array(0.34520203, dtype=float32), 'loss_reward': Array(0.00343895, dtype=float32), 'loss_cross_entropy': Array(0.3417631, dtype=float32)}


  4%|▍         | 41759/1000000 [2:09:18<32:16:09,  8.25it/s]

{'loss': Array(0.3499531, dtype=float32), 'loss_reward': Array(0.00333945, dtype=float32), 'loss_cross_entropy': Array(0.34661365, dtype=float32)}


  4%|▍         | 41770/1000000 [2:09:19<27:49:54,  9.56it/s]

{'loss': Array(0.34291422, dtype=float32), 'loss_reward': Array(0.0032796, dtype=float32), 'loss_cross_entropy': Array(0.33963466, dtype=float32)}


  4%|▍         | 41780/1000000 [2:09:21<26:59:50,  9.86it/s]

{'loss': Array(0.33392763, dtype=float32), 'loss_reward': Array(0.00335637, dtype=float32), 'loss_cross_entropy': Array(0.3305713, dtype=float32)}


  4%|▍         | 41789/1000000 [2:09:23<39:03:49,  6.81it/s]

{'loss': Array(0.3472502, dtype=float32), 'loss_reward': Array(0.0034546, dtype=float32), 'loss_cross_entropy': Array(0.34379557, dtype=float32)}


  4%|▍         | 41799/1000000 [2:09:24<29:56:21,  8.89it/s]

{'loss': Array(0.34521842, dtype=float32), 'loss_reward': Array(0.00350857, dtype=float32), 'loss_cross_entropy': Array(0.34170985, dtype=float32)}


  4%|▍         | 41810/1000000 [2:09:26<27:06:36,  9.82it/s]

{'loss': Array(0.34271854, dtype=float32), 'loss_reward': Array(0.00336469, dtype=float32), 'loss_cross_entropy': Array(0.33935383, dtype=float32)}


  4%|▍         | 41818/1000000 [2:09:28<33:49:14,  7.87it/s]

{'loss': Array(0.33756134, dtype=float32), 'loss_reward': Array(0.00333471, dtype=float32), 'loss_cross_entropy': Array(0.3342266, dtype=float32)}


  4%|▍         | 41829/1000000 [2:09:30<34:20:34,  7.75it/s]

{'loss': Array(0.3495883, dtype=float32), 'loss_reward': Array(0.00348967, dtype=float32), 'loss_cross_entropy': Array(0.34609863, dtype=float32)}


  4%|▍         | 41839/1000000 [2:09:31<29:50:03,  8.92it/s]

{'loss': Array(0.3460975, dtype=float32), 'loss_reward': Array(0.00333551, dtype=float32), 'loss_cross_entropy': Array(0.342762, dtype=float32)}


  4%|▍         | 41849/1000000 [2:09:33<29:28:45,  9.03it/s]

{'loss': Array(0.3461823, dtype=float32), 'loss_reward': Array(0.00349767, dtype=float32), 'loss_cross_entropy': Array(0.3426846, dtype=float32)}


  4%|▍         | 41859/1000000 [2:09:35<27:31:33,  9.67it/s]

{'loss': Array(0.33990642, dtype=float32), 'loss_reward': Array(0.00337068, dtype=float32), 'loss_cross_entropy': Array(0.3365357, dtype=float32)}


  4%|▍         | 41869/1000000 [2:09:37<33:38:53,  7.91it/s]

{'loss': Array(0.33724698, dtype=float32), 'loss_reward': Array(0.00329396, dtype=float32), 'loss_cross_entropy': Array(0.333953, dtype=float32)}


  4%|▍         | 41879/1000000 [2:09:38<28:02:27,  9.49it/s]

{'loss': Array(0.33792654, dtype=float32), 'loss_reward': Array(0.00353345, dtype=float32), 'loss_cross_entropy': Array(0.33439308, dtype=float32)}


  4%|▍         | 41889/1000000 [2:09:40<28:19:22,  9.40it/s]

{'loss': Array(0.34193197, dtype=float32), 'loss_reward': Array(0.00336701, dtype=float32), 'loss_cross_entropy': Array(0.33856496, dtype=float32)}


  4%|▍         | 41898/1000000 [2:09:41<30:28:45,  8.73it/s]

{'loss': Array(0.33833966, dtype=float32), 'loss_reward': Array(0.00337992, dtype=float32), 'loss_cross_entropy': Array(0.33495972, dtype=float32)}


  4%|▍         | 41909/1000000 [2:09:43<33:08:20,  8.03it/s]

{'loss': Array(0.33877373, dtype=float32), 'loss_reward': Array(0.00330807, dtype=float32), 'loss_cross_entropy': Array(0.33546564, dtype=float32)}


  4%|▍         | 41919/1000000 [2:09:45<30:45:54,  8.65it/s]

{'loss': Array(0.3373257, dtype=float32), 'loss_reward': Array(0.00342858, dtype=float32), 'loss_cross_entropy': Array(0.33389708, dtype=float32)}


  4%|▍         | 41930/1000000 [2:09:47<26:31:48, 10.03it/s]

{'loss': Array(0.3399242, dtype=float32), 'loss_reward': Array(0.00338159, dtype=float32), 'loss_cross_entropy': Array(0.33654258, dtype=float32)}


  4%|▍         | 41939/1000000 [2:09:49<43:17:25,  6.15it/s]

{'loss': Array(0.3378554, dtype=float32), 'loss_reward': Array(0.00339922, dtype=float32), 'loss_cross_entropy': Array(0.33445612, dtype=float32)}


  4%|▍         | 41949/1000000 [2:09:50<29:56:04,  8.89it/s]

{'loss': Array(0.33559227, dtype=float32), 'loss_reward': Array(0.00324914, dtype=float32), 'loss_cross_entropy': Array(0.3323431, dtype=float32)}


  4%|▍         | 41958/1000000 [2:09:52<30:28:03,  8.73it/s]

{'loss': Array(0.3343723, dtype=float32), 'loss_reward': Array(0.0033095, dtype=float32), 'loss_cross_entropy': Array(0.3310628, dtype=float32)}


  4%|▍         | 41970/1000000 [2:09:54<27:26:13,  9.70it/s]

{'loss': Array(0.33684546, dtype=float32), 'loss_reward': Array(0.00344147, dtype=float32), 'loss_cross_entropy': Array(0.33340394, dtype=float32)}


  4%|▍         | 41979/1000000 [2:09:56<38:19:04,  6.94it/s]

{'loss': Array(0.34286878, dtype=float32), 'loss_reward': Array(0.00336003, dtype=float32), 'loss_cross_entropy': Array(0.3395087, dtype=float32)}


  4%|▍         | 41990/1000000 [2:09:57<28:38:05,  9.29it/s]

{'loss': Array(0.3394213, dtype=float32), 'loss_reward': Array(0.00327703, dtype=float32), 'loss_cross_entropy': Array(0.3361443, dtype=float32)}


  4%|▍         | 42000/1000000 [2:09:59<28:35:23,  9.31it/s]

{'loss': Array(0.3384976, dtype=float32), 'loss_reward': Array(0.00329042, dtype=float32), 'loss_cross_entropy': Array(0.3352072, dtype=float32)}


  4%|▍         | 42009/1000000 [2:10:11<155:16:29,  1.71it/s]

{'loss': Array(0.3460801, dtype=float32), 'loss_reward': Array(0.00341441, dtype=float32), 'loss_cross_entropy': Array(0.34266573, dtype=float32)}


  4%|▍         | 42019/1000000 [2:10:13<63:17:40,  4.20it/s] 

{'loss': Array(0.34572226, dtype=float32), 'loss_reward': Array(0.00339931, dtype=float32), 'loss_cross_entropy': Array(0.34232292, dtype=float32)}


  4%|▍         | 42029/1000000 [2:10:14<36:40:14,  7.26it/s]

{'loss': Array(0.35255176, dtype=float32), 'loss_reward': Array(0.00344321, dtype=float32), 'loss_cross_entropy': Array(0.34910852, dtype=float32)}


  4%|▍         | 42040/1000000 [2:10:16<27:52:41,  9.55it/s]

{'loss': Array(0.34988567, dtype=float32), 'loss_reward': Array(0.00342607, dtype=float32), 'loss_cross_entropy': Array(0.3464596, dtype=float32)}


  4%|▍         | 42050/1000000 [2:10:18<28:41:55,  9.27it/s]

{'loss': Array(0.35269383, dtype=float32), 'loss_reward': Array(0.00340986, dtype=float32), 'loss_cross_entropy': Array(0.34928396, dtype=float32)}


  4%|▍         | 42059/1000000 [2:10:20<36:03:12,  7.38it/s]

{'loss': Array(0.34374696, dtype=float32), 'loss_reward': Array(0.00350417, dtype=float32), 'loss_cross_entropy': Array(0.34024274, dtype=float32)}


  4%|▍         | 42070/1000000 [2:10:21<27:28:08,  9.69it/s]

{'loss': Array(0.34262452, dtype=float32), 'loss_reward': Array(0.00339375, dtype=float32), 'loss_cross_entropy': Array(0.33923078, dtype=float32)}


  4%|▍         | 42080/1000000 [2:10:23<28:08:40,  9.45it/s]

{'loss': Array(0.34336212, dtype=float32), 'loss_reward': Array(0.00344857, dtype=float32), 'loss_cross_entropy': Array(0.33991352, dtype=float32)}


  4%|▍         | 42090/1000000 [2:10:25<39:20:53,  6.76it/s]

{'loss': Array(0.3445801, dtype=float32), 'loss_reward': Array(0.0033287, dtype=float32), 'loss_cross_entropy': Array(0.34125137, dtype=float32)}


  4%|▍         | 42100/1000000 [2:10:26<30:46:11,  8.65it/s]

{'loss': Array(0.35028216, dtype=float32), 'loss_reward': Array(0.00341532, dtype=float32), 'loss_cross_entropy': Array(0.34686682, dtype=float32)}


  4%|▍         | 42110/1000000 [2:10:28<29:44:55,  8.94it/s]

{'loss': Array(0.34747887, dtype=float32), 'loss_reward': Array(0.00345472, dtype=float32), 'loss_cross_entropy': Array(0.34402415, dtype=float32)}


  4%|▍         | 42120/1000000 [2:10:30<29:17:05,  9.09it/s]

{'loss': Array(0.34889975, dtype=float32), 'loss_reward': Array(0.00346427, dtype=float32), 'loss_cross_entropy': Array(0.3454355, dtype=float32)}


  4%|▍         | 42130/1000000 [2:10:32<36:46:04,  7.24it/s]

{'loss': Array(0.33661535, dtype=float32), 'loss_reward': Array(0.00331308, dtype=float32), 'loss_cross_entropy': Array(0.33330226, dtype=float32)}


  4%|▍         | 42138/1000000 [2:10:33<34:27:52,  7.72it/s]

{'loss': Array(0.33976, dtype=float32), 'loss_reward': Array(0.00335577, dtype=float32), 'loss_cross_entropy': Array(0.3364042, dtype=float32)}


  4%|▍         | 42149/1000000 [2:10:35<29:08:20,  9.13it/s]

{'loss': Array(0.34236902, dtype=float32), 'loss_reward': Array(0.00335472, dtype=float32), 'loss_cross_entropy': Array(0.33901426, dtype=float32)}


  4%|▍         | 42160/1000000 [2:10:36<26:23:43, 10.08it/s]

{'loss': Array(0.34230325, dtype=float32), 'loss_reward': Array(0.00344094, dtype=float32), 'loss_cross_entropy': Array(0.33886227, dtype=float32)}


  4%|▍         | 42170/1000000 [2:10:38<36:15:13,  7.34it/s]

{'loss': Array(0.33885404, dtype=float32), 'loss_reward': Array(0.00329156, dtype=float32), 'loss_cross_entropy': Array(0.3355625, dtype=float32)}


  4%|▍         | 42179/1000000 [2:10:40<32:26:30,  8.20it/s]

{'loss': Array(0.3438767, dtype=float32), 'loss_reward': Array(0.00327064, dtype=float32), 'loss_cross_entropy': Array(0.34060606, dtype=float32)}


  4%|▍         | 42188/1000000 [2:10:42<31:55:44,  8.33it/s]

{'loss': Array(0.34074387, dtype=float32), 'loss_reward': Array(0.00337414, dtype=float32), 'loss_cross_entropy': Array(0.33736968, dtype=float32)}


  4%|▍         | 42198/1000000 [2:10:43<29:50:08,  8.92it/s]

{'loss': Array(0.3405105, dtype=float32), 'loss_reward': Array(0.00344455, dtype=float32), 'loss_cross_entropy': Array(0.33706596, dtype=float32)}


  4%|▍         | 42208/1000000 [2:10:45<36:09:06,  7.36it/s]

{'loss': Array(0.33846757, dtype=float32), 'loss_reward': Array(0.00336217, dtype=float32), 'loss_cross_entropy': Array(0.33510548, dtype=float32)}


  4%|▍         | 42218/1000000 [2:10:47<31:59:07,  8.32it/s]

{'loss': Array(0.3373792, dtype=float32), 'loss_reward': Array(0.00337416, dtype=float32), 'loss_cross_entropy': Array(0.3340051, dtype=float32)}


  4%|▍         | 42229/1000000 [2:10:48<28:13:43,  9.42it/s]

{'loss': Array(0.3322167, dtype=float32), 'loss_reward': Array(0.00333388, dtype=float32), 'loss_cross_entropy': Array(0.3288828, dtype=float32)}


  4%|▍         | 42240/1000000 [2:10:50<27:22:52,  9.72it/s]

{'loss': Array(0.34294316, dtype=float32), 'loss_reward': Array(0.00327911, dtype=float32), 'loss_cross_entropy': Array(0.33966407, dtype=float32)}


  4%|▍         | 42248/1000000 [2:10:52<38:05:10,  6.99it/s]

{'loss': Array(0.34599695, dtype=float32), 'loss_reward': Array(0.00332057, dtype=float32), 'loss_cross_entropy': Array(0.34267637, dtype=float32)}


  4%|▍         | 42260/1000000 [2:10:54<26:47:28,  9.93it/s]

{'loss': Array(0.33911973, dtype=float32), 'loss_reward': Array(0.00339923, dtype=float32), 'loss_cross_entropy': Array(0.3357205, dtype=float32)}


  4%|▍         | 42270/1000000 [2:10:55<28:21:46,  9.38it/s]

{'loss': Array(0.3471535, dtype=float32), 'loss_reward': Array(0.00342566, dtype=float32), 'loss_cross_entropy': Array(0.34372783, dtype=float32)}


  4%|▍         | 42280/1000000 [2:10:57<39:07:37,  6.80it/s]

{'loss': Array(0.34286842, dtype=float32), 'loss_reward': Array(0.003455, dtype=float32), 'loss_cross_entropy': Array(0.3394134, dtype=float32)}


  4%|▍         | 42288/1000000 [2:10:59<35:41:01,  7.46it/s]

{'loss': Array(0.34005696, dtype=float32), 'loss_reward': Array(0.00340575, dtype=float32), 'loss_cross_entropy': Array(0.33665118, dtype=float32)}


  4%|▍         | 42300/1000000 [2:11:01<27:49:43,  9.56it/s]

{'loss': Array(0.33869338, dtype=float32), 'loss_reward': Array(0.00344476, dtype=float32), 'loss_cross_entropy': Array(0.3352486, dtype=float32)}


  4%|▍         | 42310/1000000 [2:11:02<28:33:09,  9.32it/s]

{'loss': Array(0.33648738, dtype=float32), 'loss_reward': Array(0.00337028, dtype=float32), 'loss_cross_entropy': Array(0.3331171, dtype=float32)}


  4%|▍         | 42319/1000000 [2:11:04<40:36:57,  6.55it/s]

{'loss': Array(0.33185974, dtype=float32), 'loss_reward': Array(0.00329627, dtype=float32), 'loss_cross_entropy': Array(0.32856348, dtype=float32)}


  4%|▍         | 42330/1000000 [2:11:06<29:45:15,  8.94it/s]

{'loss': Array(0.34009227, dtype=float32), 'loss_reward': Array(0.00333911, dtype=float32), 'loss_cross_entropy': Array(0.33675322, dtype=float32)}


  4%|▍         | 42340/1000000 [2:11:08<29:19:10,  9.07it/s]

{'loss': Array(0.33381328, dtype=float32), 'loss_reward': Array(0.00328133, dtype=float32), 'loss_cross_entropy': Array(0.33053192, dtype=float32)}


  4%|▍         | 42350/1000000 [2:11:09<27:21:11,  9.73it/s]

{'loss': Array(0.33480665, dtype=float32), 'loss_reward': Array(0.00343039, dtype=float32), 'loss_cross_entropy': Array(0.33137628, dtype=float32)}


  4%|▍         | 42360/1000000 [2:11:11<33:38:54,  7.91it/s]

{'loss': Array(0.3305936, dtype=float32), 'loss_reward': Array(0.00331343, dtype=float32), 'loss_cross_entropy': Array(0.32728016, dtype=float32)}


  4%|▍         | 42370/1000000 [2:11:13<27:56:14,  9.52it/s]

{'loss': Array(0.33452073, dtype=float32), 'loss_reward': Array(0.00331624, dtype=float32), 'loss_cross_entropy': Array(0.3312045, dtype=float32)}


  4%|▍         | 42379/1000000 [2:11:14<32:21:01,  8.22it/s]

{'loss': Array(0.3373361, dtype=float32), 'loss_reward': Array(0.00332152, dtype=float32), 'loss_cross_entropy': Array(0.33401462, dtype=float32)}


  4%|▍         | 42389/1000000 [2:11:16<29:56:08,  8.89it/s]

{'loss': Array(0.3278443, dtype=float32), 'loss_reward': Array(0.00330775, dtype=float32), 'loss_cross_entropy': Array(0.32453653, dtype=float32)}


  4%|▍         | 42400/1000000 [2:11:18<30:08:11,  8.83it/s]

{'loss': Array(0.33911672, dtype=float32), 'loss_reward': Array(0.00333761, dtype=float32), 'loss_cross_entropy': Array(0.3357791, dtype=float32)}


  4%|▍         | 42409/1000000 [2:11:20<31:08:59,  8.54it/s]

{'loss': Array(0.33167312, dtype=float32), 'loss_reward': Array(0.00332451, dtype=float32), 'loss_cross_entropy': Array(0.3283486, dtype=float32)}


  4%|▍         | 42420/1000000 [2:11:21<26:19:04, 10.11it/s]

{'loss': Array(0.3339242, dtype=float32), 'loss_reward': Array(0.00329957, dtype=float32), 'loss_cross_entropy': Array(0.33062464, dtype=float32)}


  4%|▍         | 42430/1000000 [2:11:23<28:33:11,  9.32it/s]

{'loss': Array(0.34429234, dtype=float32), 'loss_reward': Array(0.00340742, dtype=float32), 'loss_cross_entropy': Array(0.34088492, dtype=float32)}


  4%|▍         | 42439/1000000 [2:11:25<34:09:25,  7.79it/s]

{'loss': Array(0.33634314, dtype=float32), 'loss_reward': Array(0.00331822, dtype=float32), 'loss_cross_entropy': Array(0.33302492, dtype=float32)}


  4%|▍         | 42450/1000000 [2:11:27<27:50:58,  9.55it/s]

{'loss': Array(0.34840542, dtype=float32), 'loss_reward': Array(0.00338757, dtype=float32), 'loss_cross_entropy': Array(0.34501788, dtype=float32)}


  4%|▍         | 42458/1000000 [2:11:28<32:46:09,  8.12it/s]

{'loss': Array(0.33050734, dtype=float32), 'loss_reward': Array(0.00331724, dtype=float32), 'loss_cross_entropy': Array(0.32719004, dtype=float32)}


  4%|▍         | 42469/1000000 [2:11:30<29:10:05,  9.12it/s]

{'loss': Array(0.33090487, dtype=float32), 'loss_reward': Array(0.00324303, dtype=float32), 'loss_cross_entropy': Array(0.32766184, dtype=float32)}


  4%|▍         | 42478/1000000 [2:11:32<33:35:51,  7.92it/s]

{'loss': Array(0.33566865, dtype=float32), 'loss_reward': Array(0.00335972, dtype=float32), 'loss_cross_entropy': Array(0.33230892, dtype=float32)}


  4%|▍         | 42489/1000000 [2:11:33<28:19:00,  9.39it/s]

{'loss': Array(0.33618435, dtype=float32), 'loss_reward': Array(0.00311953, dtype=float32), 'loss_cross_entropy': Array(0.33306482, dtype=float32)}


  4%|▍         | 42500/1000000 [2:11:35<26:26:02, 10.06it/s]

{'loss': Array(0.32814184, dtype=float32), 'loss_reward': Array(0.00329276, dtype=float32), 'loss_cross_entropy': Array(0.3248491, dtype=float32)}


  4%|▍         | 42510/1000000 [2:11:47<145:19:37,  1.83it/s]

{'loss': Array(0.34104034, dtype=float32), 'loss_reward': Array(0.00339462, dtype=float32), 'loss_cross_entropy': Array(0.33764574, dtype=float32)}


  4%|▍         | 42518/1000000 [2:11:48<72:16:54,  3.68it/s] 

{'loss': Array(0.3382322, dtype=float32), 'loss_reward': Array(0.00315365, dtype=float32), 'loss_cross_entropy': Array(0.33507857, dtype=float32)}


  4%|▍         | 42528/1000000 [2:11:50<41:03:35,  6.48it/s]

{'loss': Array(0.33382237, dtype=float32), 'loss_reward': Array(0.00337124, dtype=float32), 'loss_cross_entropy': Array(0.3304511, dtype=float32)}


  4%|▍         | 42539/1000000 [2:11:52<29:36:31,  8.98it/s]

{'loss': Array(0.33775365, dtype=float32), 'loss_reward': Array(0.00334458, dtype=float32), 'loss_cross_entropy': Array(0.3344091, dtype=float32)}


  4%|▍         | 42549/1000000 [2:11:54<36:36:31,  7.26it/s]

{'loss': Array(0.34980837, dtype=float32), 'loss_reward': Array(0.00356696, dtype=float32), 'loss_cross_entropy': Array(0.34624138, dtype=float32)}


  4%|▍         | 42560/1000000 [2:11:55<28:57:56,  9.18it/s]

{'loss': Array(0.3422577, dtype=float32), 'loss_reward': Array(0.00343663, dtype=float32), 'loss_cross_entropy': Array(0.33882102, dtype=float32)}


  4%|▍         | 42568/1000000 [2:11:57<32:30:06,  8.18it/s]

{'loss': Array(0.35173225, dtype=float32), 'loss_reward': Array(0.00344394, dtype=float32), 'loss_cross_entropy': Array(0.3482883, dtype=float32)}


  4%|▍         | 42580/1000000 [2:11:59<26:34:51, 10.01it/s]

{'loss': Array(0.34650633, dtype=float32), 'loss_reward': Array(0.00348886, dtype=float32), 'loss_cross_entropy': Array(0.3430175, dtype=float32)}


  4%|▍         | 42590/1000000 [2:12:01<31:06:35,  8.55it/s]

{'loss': Array(0.33150005, dtype=float32), 'loss_reward': Array(0.00338205, dtype=float32), 'loss_cross_entropy': Array(0.328118, dtype=float32)}


  4%|▍         | 42599/1000000 [2:12:02<30:20:33,  8.76it/s]

{'loss': Array(0.3389685, dtype=float32), 'loss_reward': Array(0.00335174, dtype=float32), 'loss_cross_entropy': Array(0.3356168, dtype=float32)}


  4%|▍         | 42608/1000000 [2:12:04<31:47:31,  8.37it/s]

{'loss': Array(0.3423762, dtype=float32), 'loss_reward': Array(0.00333997, dtype=float32), 'loss_cross_entropy': Array(0.3390362, dtype=float32)}


  4%|▍         | 42619/1000000 [2:12:05<28:21:02,  9.38it/s]

{'loss': Array(0.33737504, dtype=float32), 'loss_reward': Array(0.00334767, dtype=float32), 'loss_cross_entropy': Array(0.33402738, dtype=float32)}


  4%|▍         | 42629/1000000 [2:12:07<33:19:57,  7.98it/s]

{'loss': Array(0.33557525, dtype=float32), 'loss_reward': Array(0.00333024, dtype=float32), 'loss_cross_entropy': Array(0.33224505, dtype=float32)}


  4%|▍         | 42639/1000000 [2:12:09<30:45:58,  8.64it/s]

{'loss': Array(0.33355275, dtype=float32), 'loss_reward': Array(0.00324018, dtype=float32), 'loss_cross_entropy': Array(0.33031258, dtype=float32)}


  4%|▍         | 42650/1000000 [2:12:11<25:55:44, 10.26it/s]

{'loss': Array(0.33751702, dtype=float32), 'loss_reward': Array(0.00334539, dtype=float32), 'loss_cross_entropy': Array(0.33417162, dtype=float32)}


  4%|▍         | 42659/1000000 [2:12:13<42:45:33,  6.22it/s]

{'loss': Array(0.33580697, dtype=float32), 'loss_reward': Array(0.00330781, dtype=float32), 'loss_cross_entropy': Array(0.33249915, dtype=float32)}


  4%|▍         | 42669/1000000 [2:12:14<31:32:22,  8.43it/s]

{'loss': Array(0.33605847, dtype=float32), 'loss_reward': Array(0.00349663, dtype=float32), 'loss_cross_entropy': Array(0.33256185, dtype=float32)}


  4%|▍         | 42678/1000000 [2:12:16<30:55:13,  8.60it/s]

{'loss': Array(0.32767865, dtype=float32), 'loss_reward': Array(0.00325129, dtype=float32), 'loss_cross_entropy': Array(0.32442737, dtype=float32)}


  4%|▍         | 42689/1000000 [2:12:17<28:07:07,  9.46it/s]

{'loss': Array(0.34327704, dtype=float32), 'loss_reward': Array(0.00331202, dtype=float32), 'loss_cross_entropy': Array(0.339965, dtype=float32)}


  4%|▍         | 42698/1000000 [2:12:19<42:15:10,  6.29it/s]

{'loss': Array(0.34293696, dtype=float32), 'loss_reward': Array(0.00345186, dtype=float32), 'loss_cross_entropy': Array(0.33948514, dtype=float32)}


  4%|▍         | 42710/1000000 [2:12:21<28:05:47,  9.46it/s]

{'loss': Array(0.33283263, dtype=float32), 'loss_reward': Array(0.00333514, dtype=float32), 'loss_cross_entropy': Array(0.3294975, dtype=float32)}


  4%|▍         | 42719/1000000 [2:12:23<30:33:06,  8.70it/s]

{'loss': Array(0.33163026, dtype=float32), 'loss_reward': Array(0.00338366, dtype=float32), 'loss_cross_entropy': Array(0.32824656, dtype=float32)}


  4%|▍         | 42729/1000000 [2:12:24<28:39:53,  9.28it/s]

{'loss': Array(0.33111268, dtype=float32), 'loss_reward': Array(0.003334, dtype=float32), 'loss_cross_entropy': Array(0.32777867, dtype=float32)}


  4%|▍         | 42738/1000000 [2:12:26<37:27:16,  7.10it/s]

{'loss': Array(0.33399808, dtype=float32), 'loss_reward': Array(0.00334841, dtype=float32), 'loss_cross_entropy': Array(0.33064967, dtype=float32)}


  4%|▍         | 42750/1000000 [2:12:28<26:56:18,  9.87it/s]

{'loss': Array(0.3336542, dtype=float32), 'loss_reward': Array(0.00332137, dtype=float32), 'loss_cross_entropy': Array(0.33033282, dtype=float32)}


  4%|▍         | 42759/1000000 [2:12:29<30:36:57,  8.69it/s]

{'loss': Array(0.33411536, dtype=float32), 'loss_reward': Array(0.00328345, dtype=float32), 'loss_cross_entropy': Array(0.33083192, dtype=float32)}


  4%|▍         | 42770/1000000 [2:12:31<27:56:56,  9.51it/s]

{'loss': Array(0.3377014, dtype=float32), 'loss_reward': Array(0.0034126, dtype=float32), 'loss_cross_entropy': Array(0.33428875, dtype=float32)}


  4%|▍         | 42779/1000000 [2:12:33<34:17:19,  7.75it/s]

{'loss': Array(0.33032686, dtype=float32), 'loss_reward': Array(0.00330489, dtype=float32), 'loss_cross_entropy': Array(0.32702193, dtype=float32)}


  4%|▍         | 42788/1000000 [2:12:35<31:47:25,  8.36it/s]

{'loss': Array(0.34196198, dtype=float32), 'loss_reward': Array(0.00332011, dtype=float32), 'loss_cross_entropy': Array(0.33864185, dtype=float32)}


  4%|▍         | 42800/1000000 [2:12:36<26:09:26, 10.16it/s]

{'loss': Array(0.3362194, dtype=float32), 'loss_reward': Array(0.00344261, dtype=float32), 'loss_cross_entropy': Array(0.33277684, dtype=float32)}


  4%|▍         | 42810/1000000 [2:12:38<38:12:42,  6.96it/s]

{'loss': Array(0.33097076, dtype=float32), 'loss_reward': Array(0.00340724, dtype=float32), 'loss_cross_entropy': Array(0.32756352, dtype=float32)}


  4%|▍         | 42818/1000000 [2:12:40<36:51:35,  7.21it/s]

{'loss': Array(0.33122897, dtype=float32), 'loss_reward': Array(0.00337128, dtype=float32), 'loss_cross_entropy': Array(0.32785773, dtype=float32)}


  4%|▍         | 42829/1000000 [2:12:41<29:28:35,  9.02it/s]

{'loss': Array(0.33393663, dtype=float32), 'loss_reward': Array(0.00322345, dtype=float32), 'loss_cross_entropy': Array(0.33071315, dtype=float32)}


  4%|▍         | 42838/1000000 [2:12:43<30:22:15,  8.75it/s]

{'loss': Array(0.33986792, dtype=float32), 'loss_reward': Array(0.00329926, dtype=float32), 'loss_cross_entropy': Array(0.33656868, dtype=float32)}


  4%|▍         | 42849/1000000 [2:12:45<39:25:00,  6.75it/s]

{'loss': Array(0.33748004, dtype=float32), 'loss_reward': Array(0.00319936, dtype=float32), 'loss_cross_entropy': Array(0.3342807, dtype=float32)}


  4%|▍         | 42860/1000000 [2:12:46<29:01:31,  9.16it/s]

{'loss': Array(0.33674595, dtype=float32), 'loss_reward': Array(0.0033011, dtype=float32), 'loss_cross_entropy': Array(0.33344486, dtype=float32)}


  4%|▍         | 42868/1000000 [2:12:48<33:55:40,  7.84it/s]

{'loss': Array(0.33366367, dtype=float32), 'loss_reward': Array(0.00336065, dtype=float32), 'loss_cross_entropy': Array(0.330303, dtype=float32)}


  4%|▍         | 42878/1000000 [2:12:49<29:53:55,  8.89it/s]

{'loss': Array(0.3273768, dtype=float32), 'loss_reward': Array(0.00334378, dtype=float32), 'loss_cross_entropy': Array(0.32403305, dtype=float32)}


  4%|▍         | 42890/1000000 [2:12:52<33:04:54,  8.04it/s]

{'loss': Array(0.3322456, dtype=float32), 'loss_reward': Array(0.00325743, dtype=float32), 'loss_cross_entropy': Array(0.3289882, dtype=float32)}


  4%|▍         | 42900/1000000 [2:12:53<29:02:30,  9.15it/s]

{'loss': Array(0.32691976, dtype=float32), 'loss_reward': Array(0.00334293, dtype=float32), 'loss_cross_entropy': Array(0.32357678, dtype=float32)}


  4%|▍         | 42908/1000000 [2:12:55<32:40:35,  8.14it/s]

{'loss': Array(0.33434436, dtype=float32), 'loss_reward': Array(0.00338641, dtype=float32), 'loss_cross_entropy': Array(0.330958, dtype=float32)}


  4%|▍         | 42920/1000000 [2:12:56<26:57:35,  9.86it/s]

{'loss': Array(0.32410192, dtype=float32), 'loss_reward': Array(0.00315937, dtype=float32), 'loss_cross_entropy': Array(0.32094258, dtype=float32)}


  4%|▍         | 42930/1000000 [2:12:59<33:49:58,  7.86it/s]

{'loss': Array(0.3350327, dtype=float32), 'loss_reward': Array(0.00333809, dtype=float32), 'loss_cross_entropy': Array(0.33169463, dtype=float32)}


  4%|▍         | 42938/1000000 [2:13:00<35:24:55,  7.51it/s]

{'loss': Array(0.3312439, dtype=float32), 'loss_reward': Array(0.00333081, dtype=float32), 'loss_cross_entropy': Array(0.3279131, dtype=float32)}


  4%|▍         | 42948/1000000 [2:13:02<32:07:07,  8.28it/s]

{'loss': Array(0.33002302, dtype=float32), 'loss_reward': Array(0.00327533, dtype=float32), 'loss_cross_entropy': Array(0.32674763, dtype=float32)}


  4%|▍         | 42959/1000000 [2:13:03<27:56:18,  9.52it/s]

{'loss': Array(0.327935, dtype=float32), 'loss_reward': Array(0.00330183, dtype=float32), 'loss_cross_entropy': Array(0.32463318, dtype=float32)}


  4%|▍         | 42969/1000000 [2:13:05<30:40:03,  8.67it/s]

{'loss': Array(0.32517678, dtype=float32), 'loss_reward': Array(0.00324563, dtype=float32), 'loss_cross_entropy': Array(0.32193115, dtype=float32)}


  4%|▍         | 42980/1000000 [2:13:07<26:23:57, 10.07it/s]

{'loss': Array(0.3320549, dtype=float32), 'loss_reward': Array(0.00333953, dtype=float32), 'loss_cross_entropy': Array(0.32871535, dtype=float32)}


  4%|▍         | 42988/1000000 [2:13:08<34:09:18,  7.78it/s]

{'loss': Array(0.3353326, dtype=float32), 'loss_reward': Array(0.00334347, dtype=float32), 'loss_cross_entropy': Array(0.33198914, dtype=float32)}


  4%|▍         | 43000/1000000 [2:13:10<38:05:19,  6.98it/s]

{'loss': Array(0.3284361, dtype=float32), 'loss_reward': Array(0.003211, dtype=float32), 'loss_cross_entropy': Array(0.32522509, dtype=float32)}


  4%|▍         | 43009/1000000 [2:13:22<156:43:03,  1.70it/s]

{'loss': Array(0.33016953, dtype=float32), 'loss_reward': Array(0.00340729, dtype=float32), 'loss_cross_entropy': Array(0.3267622, dtype=float32)}


  4%|▍         | 43018/1000000 [2:13:24<61:59:17,  4.29it/s] 

{'loss': Array(0.34170544, dtype=float32), 'loss_reward': Array(0.00333567, dtype=float32), 'loss_cross_entropy': Array(0.3383698, dtype=float32)}


  4%|▍         | 43030/1000000 [2:13:25<30:22:36,  8.75it/s]

{'loss': Array(0.33416685, dtype=float32), 'loss_reward': Array(0.00332384, dtype=float32), 'loss_cross_entropy': Array(0.330843, dtype=float32)}


  4%|▍         | 43040/1000000 [2:13:27<36:54:55,  7.20it/s]

{'loss': Array(0.32992435, dtype=float32), 'loss_reward': Array(0.00325584, dtype=float32), 'loss_cross_entropy': Array(0.3266685, dtype=float32)}


  4%|▍         | 43050/1000000 [2:13:29<30:37:21,  8.68it/s]

{'loss': Array(0.32957554, dtype=float32), 'loss_reward': Array(0.00335891, dtype=float32), 'loss_cross_entropy': Array(0.3262166, dtype=float32)}


  4%|▍         | 43058/1000000 [2:13:31<33:08:41,  8.02it/s]

{'loss': Array(0.33827135, dtype=float32), 'loss_reward': Array(0.00333018, dtype=float32), 'loss_cross_entropy': Array(0.33494112, dtype=float32)}


  4%|▍         | 43069/1000000 [2:13:32<29:33:22,  8.99it/s]

{'loss': Array(0.33618772, dtype=float32), 'loss_reward': Array(0.00335503, dtype=float32), 'loss_cross_entropy': Array(0.3328327, dtype=float32)}


  4%|▍         | 43079/1000000 [2:13:34<36:27:57,  7.29it/s]

{'loss': Array(0.3291692, dtype=float32), 'loss_reward': Array(0.00331475, dtype=float32), 'loss_cross_entropy': Array(0.32585442, dtype=float32)}


  4%|▍         | 43089/1000000 [2:13:36<30:41:50,  8.66it/s]

{'loss': Array(0.33263573, dtype=float32), 'loss_reward': Array(0.00324679, dtype=float32), 'loss_cross_entropy': Array(0.32938895, dtype=float32)}


  4%|▍         | 43099/1000000 [2:13:38<27:43:54,  9.58it/s]

{'loss': Array(0.33384094, dtype=float32), 'loss_reward': Array(0.00349603, dtype=float32), 'loss_cross_entropy': Array(0.33034486, dtype=float32)}


  4%|▍         | 43110/1000000 [2:13:39<25:39:07, 10.36it/s]

{'loss': Array(0.33743253, dtype=float32), 'loss_reward': Array(0.00338221, dtype=float32), 'loss_cross_entropy': Array(0.33405036, dtype=float32)}


  4%|▍         | 43119/1000000 [2:13:41<36:18:03,  7.32it/s]

{'loss': Array(0.3427664, dtype=float32), 'loss_reward': Array(0.00330897, dtype=float32), 'loss_cross_entropy': Array(0.33945742, dtype=float32)}


  4%|▍         | 43129/1000000 [2:13:43<30:09:40,  8.81it/s]

{'loss': Array(0.32709855, dtype=float32), 'loss_reward': Array(0.00338197, dtype=float32), 'loss_cross_entropy': Array(0.3237166, dtype=float32)}


  4%|▍         | 43140/1000000 [2:13:44<25:30:21, 10.42it/s]

{'loss': Array(0.3309561, dtype=float32), 'loss_reward': Array(0.00339816, dtype=float32), 'loss_cross_entropy': Array(0.3275579, dtype=float32)}


  4%|▍         | 43150/1000000 [2:13:46<26:52:11,  9.89it/s]

{'loss': Array(0.33478418, dtype=float32), 'loss_reward': Array(0.00315377, dtype=float32), 'loss_cross_entropy': Array(0.33163044, dtype=float32)}


  4%|▍         | 43158/1000000 [2:13:48<39:55:43,  6.66it/s]

{'loss': Array(0.33396682, dtype=float32), 'loss_reward': Array(0.00336233, dtype=float32), 'loss_cross_entropy': Array(0.33060446, dtype=float32)}


  4%|▍         | 43169/1000000 [2:13:50<29:19:03,  9.07it/s]

{'loss': Array(0.33119053, dtype=float32), 'loss_reward': Array(0.003279, dtype=float32), 'loss_cross_entropy': Array(0.32791147, dtype=float32)}


  4%|▍         | 43180/1000000 [2:13:51<26:50:29,  9.90it/s]

{'loss': Array(0.3221096, dtype=float32), 'loss_reward': Array(0.00328081, dtype=float32), 'loss_cross_entropy': Array(0.3188288, dtype=float32)}


  4%|▍         | 43190/1000000 [2:13:53<39:20:28,  6.76it/s]

{'loss': Array(0.33088455, dtype=float32), 'loss_reward': Array(0.00327012, dtype=float32), 'loss_cross_entropy': Array(0.32761443, dtype=float32)}


  4%|▍         | 43199/1000000 [2:13:55<32:46:06,  8.11it/s]

{'loss': Array(0.3306531, dtype=float32), 'loss_reward': Array(0.00323982, dtype=float32), 'loss_cross_entropy': Array(0.3274133, dtype=float32)}


  4%|▍         | 43209/1000000 [2:13:56<29:13:53,  9.09it/s]

{'loss': Array(0.33169332, dtype=float32), 'loss_reward': Array(0.00324886, dtype=float32), 'loss_cross_entropy': Array(0.32844445, dtype=float32)}


  4%|▍         | 43219/1000000 [2:13:58<28:28:38,  9.33it/s]

{'loss': Array(0.32290715, dtype=float32), 'loss_reward': Array(0.00330878, dtype=float32), 'loss_cross_entropy': Array(0.31959832, dtype=float32)}


  4%|▍         | 43230/1000000 [2:14:00<34:39:06,  7.67it/s]

{'loss': Array(0.324259, dtype=float32), 'loss_reward': Array(0.00330657, dtype=float32), 'loss_cross_entropy': Array(0.32095245, dtype=float32)}


  4%|▍         | 43240/1000000 [2:14:01<30:23:07,  8.75it/s]

{'loss': Array(0.32218415, dtype=float32), 'loss_reward': Array(0.00320173, dtype=float32), 'loss_cross_entropy': Array(0.3189824, dtype=float32)}


  4%|▍         | 43250/1000000 [2:14:03<29:34:13,  8.99it/s]

{'loss': Array(0.3330739, dtype=float32), 'loss_reward': Array(0.00334009, dtype=float32), 'loss_cross_entropy': Array(0.32973382, dtype=float32)}


  4%|▍         | 43259/1000000 [2:14:05<31:32:04,  8.43it/s]

{'loss': Array(0.3299901, dtype=float32), 'loss_reward': Array(0.00328899, dtype=float32), 'loss_cross_entropy': Array(0.32670113, dtype=float32)}


  4%|▍         | 43268/1000000 [2:14:07<39:06:31,  6.80it/s]

{'loss': Array(0.33070406, dtype=float32), 'loss_reward': Array(0.0032694, dtype=float32), 'loss_cross_entropy': Array(0.3274347, dtype=float32)}


  4%|▍         | 43280/1000000 [2:14:08<26:42:59,  9.95it/s]

{'loss': Array(0.32892898, dtype=float32), 'loss_reward': Array(0.0033795, dtype=float32), 'loss_cross_entropy': Array(0.32554948, dtype=float32)}


  4%|▍         | 43290/1000000 [2:14:10<27:04:59,  9.81it/s]

{'loss': Array(0.320575, dtype=float32), 'loss_reward': Array(0.00330772, dtype=float32), 'loss_cross_entropy': Array(0.31726727, dtype=float32)}


  4%|▍         | 43298/1000000 [2:14:12<32:50:20,  8.09it/s]

{'loss': Array(0.32488874, dtype=float32), 'loss_reward': Array(0.0032353, dtype=float32), 'loss_cross_entropy': Array(0.32165346, dtype=float32)}


  4%|▍         | 43310/1000000 [2:14:14<32:17:45,  8.23it/s]

{'loss': Array(0.33629495, dtype=float32), 'loss_reward': Array(0.00341931, dtype=float32), 'loss_cross_entropy': Array(0.3328756, dtype=float32)}


  4%|▍         | 43319/1000000 [2:14:15<31:41:40,  8.38it/s]

{'loss': Array(0.33095917, dtype=float32), 'loss_reward': Array(0.00326528, dtype=float32), 'loss_cross_entropy': Array(0.3276939, dtype=float32)}


  4%|▍         | 43328/1000000 [2:14:17<31:09:13,  8.53it/s]

{'loss': Array(0.3215128, dtype=float32), 'loss_reward': Array(0.00331495, dtype=float32), 'loss_cross_entropy': Array(0.31819782, dtype=float32)}


  4%|▍         | 43340/1000000 [2:14:18<25:11:35, 10.55it/s]

{'loss': Array(0.32696697, dtype=float32), 'loss_reward': Array(0.00319251, dtype=float32), 'loss_cross_entropy': Array(0.32377452, dtype=float32)}


  4%|▍         | 43349/1000000 [2:14:20<34:27:03,  7.71it/s]

{'loss': Array(0.3362988, dtype=float32), 'loss_reward': Array(0.00322112, dtype=float32), 'loss_cross_entropy': Array(0.33307764, dtype=float32)}


  4%|▍         | 43360/1000000 [2:14:22<27:32:36,  9.65it/s]

{'loss': Array(0.328351, dtype=float32), 'loss_reward': Array(0.00328944, dtype=float32), 'loss_cross_entropy': Array(0.32506156, dtype=float32)}


  4%|▍         | 43368/1000000 [2:14:24<34:34:16,  7.69it/s]

{'loss': Array(0.32867986, dtype=float32), 'loss_reward': Array(0.00335309, dtype=float32), 'loss_cross_entropy': Array(0.32532677, dtype=float32)}


  4%|▍         | 43380/1000000 [2:14:26<38:07:56,  6.97it/s]

{'loss': Array(0.32492036, dtype=float32), 'loss_reward': Array(0.00328077, dtype=float32), 'loss_cross_entropy': Array(0.3216396, dtype=float32)}


  4%|▍         | 43390/1000000 [2:14:27<30:34:44,  8.69it/s]

{'loss': Array(0.3291868, dtype=float32), 'loss_reward': Array(0.00326894, dtype=float32), 'loss_cross_entropy': Array(0.32591784, dtype=float32)}


  4%|▍         | 43400/1000000 [2:14:29<29:11:15,  9.10it/s]

{'loss': Array(0.32423073, dtype=float32), 'loss_reward': Array(0.00336262, dtype=float32), 'loss_cross_entropy': Array(0.32086807, dtype=float32)}


  4%|▍         | 43409/1000000 [2:14:31<30:25:15,  8.73it/s]

{'loss': Array(0.31971058, dtype=float32), 'loss_reward': Array(0.00337371, dtype=float32), 'loss_cross_entropy': Array(0.31633684, dtype=float32)}


  4%|▍         | 43418/1000000 [2:14:33<42:46:37,  6.21it/s]

{'loss': Array(0.32887584, dtype=float32), 'loss_reward': Array(0.00337028, dtype=float32), 'loss_cross_entropy': Array(0.32550555, dtype=float32)}


  4%|▍         | 43429/1000000 [2:14:34<29:38:04,  8.97it/s]

{'loss': Array(0.3329938, dtype=float32), 'loss_reward': Array(0.00338733, dtype=float32), 'loss_cross_entropy': Array(0.3296065, dtype=float32)}


  4%|▍         | 43440/1000000 [2:14:36<25:18:10, 10.50it/s]

{'loss': Array(0.3403081, dtype=float32), 'loss_reward': Array(0.00336977, dtype=float32), 'loss_cross_entropy': Array(0.33693835, dtype=float32)}


  4%|▍         | 43450/1000000 [2:14:38<28:03:21,  9.47it/s]

{'loss': Array(0.3283456, dtype=float32), 'loss_reward': Array(0.00328664, dtype=float32), 'loss_cross_entropy': Array(0.32505897, dtype=float32)}


  4%|▍         | 43460/1000000 [2:14:40<35:08:28,  7.56it/s]

{'loss': Array(0.32744336, dtype=float32), 'loss_reward': Array(0.00332157, dtype=float32), 'loss_cross_entropy': Array(0.3241218, dtype=float32)}


  4%|▍         | 43469/1000000 [2:14:41<33:12:32,  8.00it/s]

{'loss': Array(0.32644728, dtype=float32), 'loss_reward': Array(0.0031915, dtype=float32), 'loss_cross_entropy': Array(0.32325578, dtype=float32)}


  4%|▍         | 43480/1000000 [2:14:43<28:03:34,  9.47it/s]

{'loss': Array(0.3284413, dtype=float32), 'loss_reward': Array(0.00333766, dtype=float32), 'loss_cross_entropy': Array(0.3251036, dtype=float32)}


  4%|▍         | 43489/1000000 [2:14:44<31:20:52,  8.48it/s]

{'loss': Array(0.32338497, dtype=float32), 'loss_reward': Array(0.00334542, dtype=float32), 'loss_cross_entropy': Array(0.3200395, dtype=float32)}


  4%|▍         | 43499/1000000 [2:14:56<34:38:37,  7.67it/s]

{'loss': Array(0.3328322, dtype=float32), 'loss_reward': Array(0.00324086, dtype=float32), 'loss_cross_entropy': Array(0.32959133, dtype=float32)}


  4%|▍         | 43510/1000000 [2:14:58<128:53:44,  2.06it/s]

{'loss': Array(0.32695982, dtype=float32), 'loss_reward': Array(0.00330447, dtype=float32), 'loss_cross_entropy': Array(0.32365537, dtype=float32)}


  4%|▍         | 43520/1000000 [2:15:00<49:16:15,  5.39it/s] 

{'loss': Array(0.33400962, dtype=float32), 'loss_reward': Array(0.00329015, dtype=float32), 'loss_cross_entropy': Array(0.33071944, dtype=float32)}


  4%|▍         | 43528/1000000 [2:15:01<40:18:13,  6.59it/s]

{'loss': Array(0.32051024, dtype=float32), 'loss_reward': Array(0.00328346, dtype=float32), 'loss_cross_entropy': Array(0.31722677, dtype=float32)}


  4%|▍         | 43540/1000000 [2:15:03<30:49:55,  8.62it/s]

{'loss': Array(0.32647252, dtype=float32), 'loss_reward': Array(0.00336299, dtype=float32), 'loss_cross_entropy': Array(0.32310954, dtype=float32)}


  4%|▍         | 43550/1000000 [2:15:05<27:27:02,  9.68it/s]

{'loss': Array(0.32085782, dtype=float32), 'loss_reward': Array(0.00313978, dtype=float32), 'loss_cross_entropy': Array(0.31771803, dtype=float32)}


  4%|▍         | 43559/1000000 [2:15:07<29:49:40,  8.91it/s]

{'loss': Array(0.32661277, dtype=float32), 'loss_reward': Array(0.00315611, dtype=float32), 'loss_cross_entropy': Array(0.32345667, dtype=float32)}


  4%|▍         | 43569/1000000 [2:15:08<28:37:13,  9.28it/s]

{'loss': Array(0.3284657, dtype=float32), 'loss_reward': Array(0.00321897, dtype=float32), 'loss_cross_entropy': Array(0.32524678, dtype=float32)}


  4%|▍         | 43580/1000000 [2:15:10<29:24:03,  9.04it/s]

{'loss': Array(0.32018492, dtype=float32), 'loss_reward': Array(0.00324959, dtype=float32), 'loss_cross_entropy': Array(0.31693533, dtype=float32)}


  4%|▍         | 43589/1000000 [2:15:12<30:39:31,  8.67it/s]

{'loss': Array(0.319525, dtype=float32), 'loss_reward': Array(0.00338305, dtype=float32), 'loss_cross_entropy': Array(0.31614196, dtype=float32)}


  4%|▍         | 43599/1000000 [2:15:14<29:31:41,  9.00it/s]

{'loss': Array(0.3243874, dtype=float32), 'loss_reward': Array(0.0032711, dtype=float32), 'loss_cross_entropy': Array(0.3211163, dtype=float32)}


  4%|▍         | 43609/1000000 [2:15:16<41:36:11,  6.39it/s]

{'loss': Array(0.3331467, dtype=float32), 'loss_reward': Array(0.00329288, dtype=float32), 'loss_cross_entropy': Array(0.32985386, dtype=float32)}


  4%|▍         | 43619/1000000 [2:15:17<29:25:11,  9.03it/s]

{'loss': Array(0.334115, dtype=float32), 'loss_reward': Array(0.00326346, dtype=float32), 'loss_cross_entropy': Array(0.33085153, dtype=float32)}


  4%|▍         | 43629/1000000 [2:15:19<27:25:16,  9.69it/s]

{'loss': Array(0.32518804, dtype=float32), 'loss_reward': Array(0.00324669, dtype=float32), 'loss_cross_entropy': Array(0.32194135, dtype=float32)}


  4%|▍         | 43639/1000000 [2:15:21<28:39:19,  9.27it/s]

{'loss': Array(0.32515642, dtype=float32), 'loss_reward': Array(0.00327871, dtype=float32), 'loss_cross_entropy': Array(0.32187772, dtype=float32)}


  4%|▍         | 43649/1000000 [2:15:23<36:16:16,  7.32it/s]

{'loss': Array(0.3253402, dtype=float32), 'loss_reward': Array(0.00326771, dtype=float32), 'loss_cross_entropy': Array(0.32207248, dtype=float32)}


  4%|▍         | 43660/1000000 [2:15:24<27:34:04,  9.64it/s]

{'loss': Array(0.32343426, dtype=float32), 'loss_reward': Array(0.00318731, dtype=float32), 'loss_cross_entropy': Array(0.32024696, dtype=float32)}


  4%|▍         | 43670/1000000 [2:15:26<27:01:40,  9.83it/s]

{'loss': Array(0.32291278, dtype=float32), 'loss_reward': Array(0.00330184, dtype=float32), 'loss_cross_entropy': Array(0.31961098, dtype=float32)}


  4%|▍         | 43680/1000000 [2:15:28<28:33:31,  9.30it/s]

{'loss': Array(0.3150776, dtype=float32), 'loss_reward': Array(0.00335158, dtype=float32), 'loss_cross_entropy': Array(0.31172603, dtype=float32)}


  4%|▍         | 43690/1000000 [2:15:30<33:20:11,  7.97it/s]

{'loss': Array(0.33103487, dtype=float32), 'loss_reward': Array(0.0032805, dtype=float32), 'loss_cross_entropy': Array(0.32775432, dtype=float32)}


  4%|▍         | 43700/1000000 [2:15:31<29:59:39,  8.86it/s]

{'loss': Array(0.3164181, dtype=float32), 'loss_reward': Array(0.003257, dtype=float32), 'loss_cross_entropy': Array(0.3131611, dtype=float32)}


  4%|▍         | 43710/1000000 [2:15:33<28:45:35,  9.24it/s]

{'loss': Array(0.32146502, dtype=float32), 'loss_reward': Array(0.00330445, dtype=float32), 'loss_cross_entropy': Array(0.31816056, dtype=float32)}


  4%|▍         | 43720/1000000 [2:15:35<39:41:42,  6.69it/s]

{'loss': Array(0.3232472, dtype=float32), 'loss_reward': Array(0.00317726, dtype=float32), 'loss_cross_entropy': Array(0.32006994, dtype=float32)}


  4%|▍         | 43730/1000000 [2:15:36<30:59:28,  8.57it/s]

{'loss': Array(0.328376, dtype=float32), 'loss_reward': Array(0.00327391, dtype=float32), 'loss_cross_entropy': Array(0.32510206, dtype=float32)}


  4%|▍         | 43738/1000000 [2:15:38<34:10:49,  7.77it/s]

{'loss': Array(0.31740713, dtype=float32), 'loss_reward': Array(0.00326484, dtype=float32), 'loss_cross_entropy': Array(0.3141423, dtype=float32)}


  4%|▍         | 43749/1000000 [2:15:39<29:15:22,  9.08it/s]

{'loss': Array(0.3272972, dtype=float32), 'loss_reward': Array(0.00325919, dtype=float32), 'loss_cross_entropy': Array(0.32403803, dtype=float32)}


  4%|▍         | 43759/1000000 [2:15:41<39:59:37,  6.64it/s]

{'loss': Array(0.33295807, dtype=float32), 'loss_reward': Array(0.00337841, dtype=float32), 'loss_cross_entropy': Array(0.32957965, dtype=float32)}


  4%|▍         | 43769/1000000 [2:15:43<28:59:39,  9.16it/s]

{'loss': Array(0.33086562, dtype=float32), 'loss_reward': Array(0.00336689, dtype=float32), 'loss_cross_entropy': Array(0.32749876, dtype=float32)}


  4%|▍         | 43780/1000000 [2:15:45<25:53:19, 10.26it/s]

{'loss': Array(0.31824356, dtype=float32), 'loss_reward': Array(0.0032541, dtype=float32), 'loss_cross_entropy': Array(0.31498945, dtype=float32)}


  4%|▍         | 43790/1000000 [2:15:46<29:04:55,  9.13it/s]

{'loss': Array(0.32670757, dtype=float32), 'loss_reward': Array(0.00338604, dtype=float32), 'loss_cross_entropy': Array(0.32332152, dtype=float32)}


  4%|▍         | 43798/1000000 [2:15:48<41:55:53,  6.33it/s]

{'loss': Array(0.32106376, dtype=float32), 'loss_reward': Array(0.00326902, dtype=float32), 'loss_cross_entropy': Array(0.31779477, dtype=float32)}


  4%|▍         | 43810/1000000 [2:15:50<28:23:36,  9.35it/s]

{'loss': Array(0.32941872, dtype=float32), 'loss_reward': Array(0.00322649, dtype=float32), 'loss_cross_entropy': Array(0.32619226, dtype=float32)}


  4%|▍         | 43820/1000000 [2:15:51<26:45:56,  9.92it/s]

{'loss': Array(0.31893837, dtype=float32), 'loss_reward': Array(0.00326552, dtype=float32), 'loss_cross_entropy': Array(0.31567287, dtype=float32)}


  4%|▍         | 43829/1000000 [2:15:53<31:14:06,  8.50it/s]

{'loss': Array(0.32690272, dtype=float32), 'loss_reward': Array(0.00333401, dtype=float32), 'loss_cross_entropy': Array(0.3235687, dtype=float32)}


  4%|▍         | 43840/1000000 [2:15:55<31:33:29,  8.42it/s]

{'loss': Array(0.3208954, dtype=float32), 'loss_reward': Array(0.00331941, dtype=float32), 'loss_cross_entropy': Array(0.317576, dtype=float32)}


  4%|▍         | 43849/1000000 [2:15:57<31:29:29,  8.43it/s]

{'loss': Array(0.3141403, dtype=float32), 'loss_reward': Array(0.00322705, dtype=float32), 'loss_cross_entropy': Array(0.31091323, dtype=float32)}


  4%|▍         | 43859/1000000 [2:15:58<27:38:06,  9.61it/s]

{'loss': Array(0.32635298, dtype=float32), 'loss_reward': Array(0.00322575, dtype=float32), 'loss_cross_entropy': Array(0.32312724, dtype=float32)}


  4%|▍         | 43869/1000000 [2:16:00<29:31:48,  8.99it/s]

{'loss': Array(0.3346602, dtype=float32), 'loss_reward': Array(0.00340611, dtype=float32), 'loss_cross_entropy': Array(0.3312541, dtype=float32)}


  4%|▍         | 43879/1000000 [2:16:02<32:04:53,  8.28it/s]

{'loss': Array(0.32402918, dtype=float32), 'loss_reward': Array(0.00333027, dtype=float32), 'loss_cross_entropy': Array(0.3206989, dtype=float32)}


  4%|▍         | 43890/1000000 [2:16:04<28:16:10,  9.39it/s]

{'loss': Array(0.33022282, dtype=float32), 'loss_reward': Array(0.00326015, dtype=float32), 'loss_cross_entropy': Array(0.32696268, dtype=float32)}


  4%|▍         | 43900/1000000 [2:16:05<29:21:28,  9.05it/s]

{'loss': Array(0.32164302, dtype=float32), 'loss_reward': Array(0.00338089, dtype=float32), 'loss_cross_entropy': Array(0.31826216, dtype=float32)}


  4%|▍         | 43909/1000000 [2:16:07<31:56:22,  8.32it/s]

{'loss': Array(0.3233163, dtype=float32), 'loss_reward': Array(0.00321406, dtype=float32), 'loss_cross_entropy': Array(0.32010224, dtype=float32)}


  4%|▍         | 43920/1000000 [2:16:09<31:30:03,  8.43it/s]

{'loss': Array(0.3253051, dtype=float32), 'loss_reward': Array(0.00336429, dtype=float32), 'loss_cross_entropy': Array(0.32194078, dtype=float32)}


  4%|▍         | 43922/1000000 [2:16:10<65:55:09,  4.03it/s]

In [12]:
sample

TrajectoryBufferSample(experience={'action': Array([[[1.32556781e-01, 7.96739519e-01, 5.36718592e-02, ...,
         3.91646661e-03, 4.48901858e-03, 9.91594553e-01],
        [3.49070907e-01, 4.57749265e-04, 4.38157976e-01, ...,
         7.23136306e-01, 1.23497941e-01, 1.53365776e-01],
        [6.12441264e-03, 2.50436477e-02, 1.35732419e-03, ...,
         3.82237613e-01, 5.98694921e-01, 1.90675538e-02],
        ...,
        [1.41329234e-04, 2.44877161e-03, 8.43136787e-01, ...,
         2.33344346e-01, 6.42170012e-01, 1.24485560e-01],
        [6.32655225e-04, 1.77795421e-02, 9.65278149e-01, ...,
         1.25269741e-02, 3.21629345e-01, 6.65843725e-01],
        [9.08881542e-04, 1.04175135e-01, 7.50824576e-04, ...,
         9.99683421e-03, 7.89827347e-01, 2.00175866e-01]],

       [[2.03237548e-01, 7.00179100e-01, 3.63819454e-05, ...,
         9.96583939e-01, 2.39940570e-03, 1.01662707e-03],
        [7.63220847e-01, 1.11325733e-01, 3.15520242e-02, ...,
         5.45369804e-01, 4.54322606e-0

In [15]:
sample = buffer_eval.sample(buffer_list_eval, subkey)
sample = reshape_sample(sample)


In [16]:
loss, (loss_crossentropy, loss_reward) = loss_fn_transformer(transformer, sample.experience)


AttributeError: 'RubikTransformer' object has no attribute 'state_mapping'

In [17]:
# save buffer, buffer_list
# in pickle 
import pickle

state_weight = nnx.state(transformer)

In [18]:
state_weight

State({
  'action_mapping': {
    'bias': VariableState(
      type=Param,
      value=Array([ 0.05804873,  0.03530794, -0.06344386,  0.11217742, -0.00204838,
              0.03259815,  0.00209457, -0.00875406, -0.02046519,  0.07442955,
              0.04382639, -0.06812178,  0.0239886 ,  0.03021448,  0.03324478,
             -0.00939911,  0.01842497, -0.02299442,  0.00597318, -0.05957094,
              0.05180154,  0.08517693, -0.0014244 ,  0.00738644, -0.06243268,
              0.14644375, -0.00069646,  0.01227313, -0.04554354, -0.0732253 ,
             -0.0509976 , -0.05631914,  0.03738089, -0.05132922,  0.05668032,
             -0.00109585, -0.15057902, -0.05047007, -0.04745406,  0.04642721,
              0.05568941,  0.05438963, -0.06343702,  0.07053652, -0.0265609 ,
              0.05900841,  0.027164  ,  0.07832118, -0.05330597,  0.05487438,
              0.14665098, -0.08351623, -0.09119233, -0.06400578, -0.0773109 ,
              0.05192997, -0.08234955, -0.04352584, -0.020047

In [19]:
# save state into pickle
with open('state_probainput_vscale4.pickle', 'wb') as handle:
    pickle.dump(state_weight, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [12]:
state_weight

State({})

In [13]:
nnx.state(transformer)

State({})

In [14]:

transformer

RubikTransformer()