<a href="https://colab.research.google.com/github/ADMoreau/MAMLGAN/blob/master/mamlgan.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import cv2
import torch
import requests
import numpy as np
import gc
import pandas as pd
from pycocotools.coco import COCO
import skimage.io as io
import matplotlib.pyplot as plt
import pylab
from PIL import Image
from IPython.display import display, HTML, clear_output
from ipywidgets import widgets, Layout
from io import BytesIO
import argparse
import os
import numpy as np
import math
import itertools
import datetime
import time
import sys
from tqdm import trange

from torch.distributions import Normal, Categorical
import torchvision
from torchvision import datasets, models, transforms
import torchvision.transforms as transforms
from torchvision.utils import save_image
from torch.utils.data import DataLoader
from torchvision import datasets
import torchvision.models as models
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch
from copy import deepcopy
import scipy
import scipy.misc
from torch.autograd import Variable
from torchvision.models.utils import load_state_dict_from_url


cuda = torch.cuda.is_available()

if cuda:
  device = torch.device("cuda:0")

In [0]:
!pip install --upgrade efficientnet-pytorch

Requirement already up-to-date: efficientnet-pytorch in /usr/local/lib/python3.6/dist-packages (0.6.1)


In [0]:
from efficientnet_pytorch.utils import (
    round_filters,
    round_repeats,
    drop_connect,
    get_same_padding_conv2d,
    get_model_params,
    efficientnet_params,
    load_pretrained_weights,
    Swish,
    MemoryEfficientSwish,
)

class MBConvBlock(nn.Module):
    """
    Mobile Inverted Residual Bottleneck Block
    Args:
        block_args (namedtuple): BlockArgs, see above
        global_params (namedtuple): GlobalParam, see above
    Attributes:
        has_se (bool): Whether the block contains a Squeeze and Excitation layer.
    """

    def __init__(self, block_args, global_params):
        super().__init__()
        self._block_args = block_args
        self._bn_mom = 1 - global_params.batch_norm_momentum
        self._bn_eps = global_params.batch_norm_epsilon
        self.has_se = (self._block_args.se_ratio is not None) and (0 < self._block_args.se_ratio <= 1)
        self.id_skip = block_args.id_skip  # skip connection and drop connect

        # Get static or dynamic convolution depending on image size
        Conv2d = get_same_padding_conv2d(image_size=global_params.image_size)

        # Expansion phase
        inp = self._block_args.input_filters  # number of input channels
        oup = self._block_args.input_filters * self._block_args.expand_ratio  # number of output channels
        if self._block_args.expand_ratio != 1:
            self._expand_conv = Conv2d(in_channels=inp, out_channels=oup, kernel_size=1, bias=False)
            self._bn0 = nn.BatchNorm2d(num_features=oup, momentum=self._bn_mom, eps=self._bn_eps)

        # Depthwise convolution phase
        k = self._block_args.kernel_size
        s = self._block_args.stride
        self._depthwise_conv = Conv2d(
            in_channels=oup, out_channels=oup, groups=oup,  # groups makes it depthwise
            kernel_size=k, stride=s, bias=False)
        self._bn1 = nn.BatchNorm2d(num_features=oup, momentum=self._bn_mom, eps=self._bn_eps)

        # Squeeze and Excitation layer, if desired
        if self.has_se:
            num_squeezed_channels = max(1, int(self._block_args.input_filters * self._block_args.se_ratio))
            self._se_reduce = Conv2d(in_channels=oup, out_channels=num_squeezed_channels, kernel_size=1)
            self._se_expand = Conv2d(in_channels=num_squeezed_channels, out_channels=oup, kernel_size=1)

        # Output phase
        final_oup = self._block_args.output_filters
        self._project_conv = Conv2d(in_channels=oup, out_channels=final_oup, kernel_size=1, bias=False)
        self._bn2 = nn.BatchNorm2d(num_features=final_oup, momentum=self._bn_mom, eps=self._bn_eps)
        self._swish = MemoryEfficientSwish()

    def forward(self, inputs, drop_connect_rate=None):
        """
        :param inputs: input tensor
        :param drop_connect_rate: drop connect rate (float, between 0 and 1)
        :return: output of block
        """

        # Expansion and Depthwise Convolution
        x = inputs
        if self._block_args.expand_ratio != 1:
            x = self._swish(self._bn0(self._expand_conv(inputs)))
        x = self._swish(self._bn1(self._depthwise_conv(x)))

        # Squeeze and Excitation
        if self.has_se:
            x_squeezed = F.adaptive_avg_pool2d(x, 1)
            x_squeezed = self._se_expand(self._swish(self._se_reduce(x_squeezed)))
            x = torch.sigmoid(x_squeezed) * x

        x = self._bn2(self._project_conv(x))

        # Skip connection and drop connect
        input_filters, output_filters = self._block_args.input_filters, self._block_args.output_filters
        if self.id_skip and self._block_args.stride == 1 and input_filters == output_filters:
            if drop_connect_rate:
                x = drop_connect(x, p=drop_connect_rate, training=self.training)
            x = x + inputs  # skip connection
        return x

    def set_swish(self, memory_efficient=True):
        """Sets swish function as memory efficient (for training) or standard (for export)"""
        self._swish = MemoryEfficientSwish() if memory_efficient else Swish()


class EfficientNet(nn.Module):
    """
    An EfficientNet model. Most easily loaded with the .from_name or .from_pretrained methods
    Args:
        blocks_args (list): A list of BlockArgs to construct blocks
        global_params (namedtuple): A set of GlobalParams shared between blocks
    Example:
        model = EfficientNet.from_pretrained('efficientnet-b0')
    """

    def __init__(self, blocks_args=None, global_params=None):
        super().__init__()
        assert isinstance(blocks_args, list), 'blocks_args should be a list'
        assert len(blocks_args) > 0, 'block args must be greater than 0'
        self._global_params = global_params
        self._blocks_args = blocks_args

        # Get static or dynamic convolution depending on image size
        Conv2d = get_same_padding_conv2d(image_size=global_params.image_size)

        # Batch norm parameters
        bn_mom = 1 - self._global_params.batch_norm_momentum
        bn_eps = self._global_params.batch_norm_epsilon

        # Stem
        in_channels = 3  # rgb
        out_channels = round_filters(32, self._global_params)  # number of output channels
        self._conv_stem = Conv2d(in_channels, out_channels, kernel_size=3, stride=2, bias=False)
        self._bn0 = nn.BatchNorm2d(num_features=out_channels, momentum=bn_mom, eps=bn_eps)

        # Build blocks
        self._blocks = nn.ModuleList([])
        for block_args in self._blocks_args:

            # Update block input and output filters based on depth multiplier.
            block_args = block_args._replace(
                input_filters=round_filters(block_args.input_filters, self._global_params),
                output_filters=round_filters(block_args.output_filters, self._global_params),
                num_repeat=round_repeats(block_args.num_repeat, self._global_params)
            )

            # The first block needs to take care of stride and filter size increase.
            self._blocks.append(MBConvBlock(block_args, self._global_params))
            if block_args.num_repeat > 1:
                block_args = block_args._replace(input_filters=block_args.output_filters, stride=1)
            for _ in range(block_args.num_repeat - 1):
                self._blocks.append(MBConvBlock(block_args, self._global_params))

        # Head
        in_channels = block_args.output_filters  # output of final block
        out_channels = round_filters(1280, self._global_params)
        self._conv_head = Conv2d(in_channels, out_channels, kernel_size=1, bias=False)
        self._bn1 = nn.BatchNorm2d(num_features=out_channels, momentum=bn_mom, eps=bn_eps)

        # Final linear layer
        self._avg_pooling = nn.AdaptiveAvgPool2d(1)
        self._dropout = nn.Dropout(self._global_params.dropout_rate)
        self._fc = nn.Linear(out_channels, self._global_params.num_classes)
        self._swish = MemoryEfficientSwish()

    def set_swish(self, memory_efficient=True):
        """Sets swish function as memory efficient (for training) or standard (for export)"""
        self._swish = MemoryEfficientSwish() if memory_efficient else Swish()
        for block in self._blocks:
            block.set_swish(memory_efficient)


    def extract_features(self, inputs):
        """ Returns output of the final convolution layer """

        # Stem
        x = self._swish(self._bn0(self._conv_stem(inputs)))

        # Blocks
        for idx, block in enumerate(self._blocks):
            drop_connect_rate = self._global_params.drop_connect_rate
            if drop_connect_rate:
                drop_connect_rate *= float(idx) / len(self._blocks)
            x = block(x, drop_connect_rate=drop_connect_rate)

        # Head
        x = self._swish(self._bn1(self._conv_head(x)))

        return x

    def classify(self, inputs):
        bs = inputs.size(0)
        # Pooling and final linear layer
        x = self._avg_pooling(inputs)
        x = x.view(bs, -1)
        x = self._dropout(x)
        x = self._fc(x)
        return x

    def forward(self, inputs):
        """ Calls extract_features to extract features, applies final linear layer, and returns logits. """
        # Convolution layers
        x = self.extract_features(inputs)
        x = self.classify(x)
        return x
        

    @classmethod
    def from_name(cls, model_name, override_params=None):
        cls._check_model_name_is_valid(model_name)
        blocks_args, global_params = get_model_params(model_name, override_params)
        return cls(blocks_args, global_params)

    @classmethod
    def from_pretrained(cls, model_name, advprop=False, num_classes=1000, in_channels=3):
        model = cls.from_name(model_name, override_params={'num_classes': num_classes})
        load_pretrained_weights(model, model_name, load_fc=(num_classes == 1000), advprop=advprop)
        if in_channels != 3:
            Conv2d = get_same_padding_conv2d(image_size = model._global_params.image_size)
            out_channels = round_filters(32, model._global_params)
            model._conv_stem = Conv2d(in_channels, out_channels, kernel_size=3, stride=2, bias=False)
        return model
    
    @classmethod
    def get_image_size(cls, model_name):
        cls._check_model_name_is_valid(model_name)
        _, _, res, _ = efficientnet_params(model_name)
        return res

    @classmethod
    def _check_model_name_is_valid(cls, model_name):
        """ Validates model name. """ 
        valid_models = ['efficientnet-b'+str(i) for i in range(9)]
        if model_name not in valid_models:
            raise ValueError('model_name should be one of: ' + ', '.join(valid_models))

In [0]:
efficientnet_transform=transforms.Compose([transforms.ToTensor(),
                              transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))])

efficientnet = EfficientNet.from_name('efficientnet-b5') 
out_channels = round_filters(1280, efficientnet._global_params)
efficientnet._fc = nn.Linear(out_channels, 10)
efficientnet = efficientnet.to(device)
efficientnet.load_state_dict(torch.load('/content/drive/My Drive/cifar10.pth'))
efficientnet.eval()

MUNIT Constructor

https://github.com/eriklindernoren/PyTorch-GAN/blob/master/implementations/munit

In [0]:
def weights_init_normal(m):
    classname = m.__class__.__name__
    if classname.find("Conv") != -1:
        torch.nn.init.normal_(m.weight.data, 0.0, 0.02)
    elif classname.find("BatchNorm2d") != -1:
        torch.nn.init.normal_(m.weight.data, 1.0, 0.02)
        torch.nn.init.constant_(m.bias.data, 0.0)


class LambdaLR:
    def __init__(self, n_epochs, offset, decay_start_epoch):
        assert (n_epochs - decay_start_epoch) > 0, "Decay must start before the training session ends!"
        self.n_epochs = n_epochs
        self.offset = offset
        self.decay_start_epoch = decay_start_epoch

    def step(self, epoch):
        return 1.0 - max(0, epoch + self.offset - self.decay_start_epoch) / (self.n_epochs - self.decay_start_epoch)


class Encoder(nn.Module):
    def __init__(self, in_channels=3, dim=64, n_residual=3, n_downsample=2, style_dim=8):
        super(Encoder, self).__init__()
        self.content_encoder = ContentEncoder(in_channels, dim, n_residual, n_downsample)
        self.style_encoder = StyleEncoder(in_channels, dim, n_downsample, style_dim)

    def forward(self, x):
        content_code = self.content_encoder(x)
        style_code = self.style_encoder(x)
        return content_code, style_code


class Decoder(nn.Module):
    def __init__(self, out_channels=3, dim=64, n_residual=3, n_upsample=2, style_dim=8):
        super(Decoder, self).__init__()

        layers = []
        dim = dim * 2 ** n_upsample
        # Residual blocks
        for _ in range(n_residual):
            layers += [ResidualBlock(dim, norm="adain")]

        # Upsampling
        for _ in range(n_upsample):
            layers += [
                nn.Upsample(scale_factor=2),
                nn.Conv2d(dim, dim // 2, 5, stride=1, padding=2),
                LayerNorm(dim // 2),
                nn.ReLU(inplace=True),
            ]
            dim = dim // 2

        # Output layer
        layers += [nn.ReflectionPad2d(3), nn.Conv2d(dim, out_channels, 7), nn.Tanh()]

        self.model = nn.Sequential(*layers)

        # Initiate mlp (predicts AdaIN parameters)
        num_adain_params = self.get_num_adain_params()
        self.mlp = MLP(style_dim, num_adain_params)

    def get_num_adain_params(self):
        """Return the number of AdaIN parameters needed by the model"""
        num_adain_params = 0
        for m in self.modules():
            if m.__class__.__name__ == "AdaptiveInstanceNorm2d":
                num_adain_params += 2 * m.num_features
        return num_adain_params

    def assign_adain_params(self, adain_params):
        """Assign the adain_params to the AdaIN layers in model"""
        for m in self.modules():
            if m.__class__.__name__ == "AdaptiveInstanceNorm2d":
                # Extract mean and std predictions
                mean = adain_params[:, : m.num_features]
                std = adain_params[:, m.num_features : 2 * m.num_features]
                # Update bias and weight
                m.bias = mean.contiguous().view(-1)
                m.weight = std.contiguous().view(-1)
                # Move pointer
                if adain_params.size(1) > 2 * m.num_features:
                    adain_params = adain_params[:, 2 * m.num_features :]

    def forward(self, content_code, style_code):
        # Update AdaIN parameters by MLP prediction based off style code
        self.assign_adain_params(self.mlp(style_code))
        img = self.model(content_code)
        return img


class ContentEncoder(nn.Module):
    def __init__(self, in_channels=3, dim=64, n_residual=3, n_downsample=2):
        super(ContentEncoder, self).__init__()

        # Initial convolution block
        layers = [
            nn.ReflectionPad2d(3),
            nn.Conv2d(in_channels, dim, 7),
            nn.InstanceNorm2d(dim),
            nn.ReLU(inplace=True),
        ]

        # Downsampling
        for _ in range(n_downsample):
            layers += [
                nn.Conv2d(dim, dim * 2, 4, stride=2, padding=1),
                nn.InstanceNorm2d(dim * 2),
                nn.ReLU(inplace=True),
            ]
            dim *= 2

        # Residual blocks
        for _ in range(n_residual):
            layers += [ResidualBlock(dim, norm="in")]

        self.model = nn.Sequential(*layers)

    def forward(self, x):
        return self.model(x)


class StyleEncoder(nn.Module):
    def __init__(self, in_channels=3, dim=64, n_downsample=2, style_dim=8):
        super(StyleEncoder, self).__init__()

        # Initial conv block
        layers = [nn.ReflectionPad2d(3), nn.Conv2d(in_channels, dim, 7), nn.ReLU(inplace=True)]

        # Downsampling
        for _ in range(2):
            layers += [nn.Conv2d(dim, dim * 2, 4, stride=2, padding=1), nn.ReLU(inplace=True)]
            dim *= 2

        # Downsampling with constant depth
        for _ in range(n_downsample - 2):
            layers += [nn.Conv2d(dim, dim, 4, stride=2, padding=1), nn.ReLU(inplace=True)]

        # Average pool and output layer
        layers += [nn.AdaptiveAvgPool2d(1), nn.Conv2d(dim, style_dim, 1, 1, 0)]

        self.model = nn.Sequential(*layers)

    def forward(self, x):
        return self.model(x)


class MLP(nn.Module):
    def __init__(self, input_dim, output_dim, dim=256, n_blk=3, activ="relu"):
        super(MLP, self).__init__()
        layers = [nn.Linear(input_dim, dim), nn.ReLU(inplace=True)]
        for _ in range(n_blk - 2):
            layers += [nn.Linear(dim, dim), nn.ReLU(inplace=True)]
        layers += [nn.Linear(dim, output_dim)]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        return self.model(x.view(x.size(0), -1))


class MultiDiscriminator(nn.Module):
    def __init__(self, in_channels=3):
        super(MultiDiscriminator, self).__init__()

        def discriminator_block(in_filters, out_filters, normalize=True):
            """Returns downsampling layers of each discriminator block"""
            layers = [nn.Conv2d(in_filters, out_filters, 4, stride=2, padding=1)]
            if normalize:
                layers.append(nn.InstanceNorm2d(out_filters))
            layers.append(nn.LeakyReLU(0.2, inplace=True))
            return layers

        # Extracts three discriminator models
        self.models = nn.ModuleList()
        for i in range(3):
            self.models.add_module(
                "disc_%d" % i,
                nn.Sequential(
                    *discriminator_block(in_channels, 64, normalize=False),
                    *discriminator_block(64, 128),
                    *discriminator_block(128, 256),
                    *discriminator_block(256, 512),
                    nn.Conv2d(512, 1, 3, padding=1)
                ),
            )

        self.downsample = nn.AvgPool2d(in_channels, stride=2, padding=[1, 1], count_include_pad=False)

    def compute_loss(self, x, gt):
        """Computes the MSE between model output and scalar gt"""
        loss = sum([torch.mean((out - gt) ** 2) for out in self.forward(x)])
        return loss

    def forward(self, x):
        outputs = []
        for m in self.models:
            outputs.append(m(x))
            x = self.downsample(x)
        return outputs


class ResidualBlock(nn.Module):
    def __init__(self, features, norm="in"):
        super(ResidualBlock, self).__init__()

        norm_layer = AdaptiveInstanceNorm2d if norm == "adain" else nn.InstanceNorm2d

        self.block = nn.Sequential(
            nn.ReflectionPad2d(1),
            nn.Conv2d(features, features, 3),
            norm_layer(features),
            nn.ReLU(inplace=True),
            nn.ReflectionPad2d(1),
            nn.Conv2d(features, features, 3),
            norm_layer(features),
        )

    def forward(self, x):
        return x + self.block(x)


class AdaptiveInstanceNorm2d(nn.Module):
    """Reference: https://github.com/NVlabs/MUNIT/blob/master/networks.py"""

    def __init__(self, num_features, eps=1e-5, momentum=0.1):
        super(AdaptiveInstanceNorm2d, self).__init__()
        self.num_features = num_features
        self.eps = eps
        self.momentum = momentum
        # weight and bias are dynamically assigned
        self.weight = None
        self.bias = None
        # just dummy buffers, not used
        self.register_buffer("running_mean", torch.zeros(num_features))
        self.register_buffer("running_var", torch.ones(num_features))

    def forward(self, x):
        assert (
            self.weight is not None and self.bias is not None
        ), "Please assign weight and bias before calling AdaIN!"
        b, c, h, w = x.size()
        running_mean = self.running_mean.repeat(b)
        running_var = self.running_var.repeat(b)

        # Apply instance norm
        x_reshaped = x.contiguous().view(1, b * c, h, w)

        out = F.batch_norm(
            x_reshaped, running_mean, running_var, self.weight, self.bias, True, self.momentum, self.eps
        )

        return out.view(b, c, h, w)

    def __repr__(self):
        return self.__class__.__name__ + "(" + str(self.num_features) + ")"


class LayerNorm(nn.Module):
    def __init__(self, num_features, eps=1e-5, affine=True):
        super(LayerNorm, self).__init__()
        self.num_features = num_features
        self.affine = affine
        self.eps = eps

        if self.affine:
            self.gamma = nn.Parameter(torch.Tensor(num_features).uniform_())
            self.beta = nn.Parameter(torch.zeros(num_features))

    def forward(self, x):
        shape = [-1] + [1] * (x.dim() - 1)
        mean = x.view(x.size(0), -1).mean(1).view(*shape)
        std = x.view(x.size(0), -1).std(1).view(*shape)
        x = (x - mean) / (std + self.eps)

        if self.affine:
            shape = [1, -1] + [1] * (x.dim() - 2)
            x = x * self.gamma.view(*shape) + self.beta.view(*shape)
        return x

In [0]:
def discount_rewards(r, gamma):
  discounted_r = torch.zeros_like(r)
  running_add = 0
  for t in reversed(range(0, r.size(-1))):
      running_add = running_add * gamma + r[t]
      discounted_r[t] = running_add
  return discounted_r

class StylePolicy(nn.Module):
  def __init__(self, latent_dim=512, img_size = 64, window_size = 16):
    super(StylePolicy, self).__init__()
    self.gamma = 0.99
    self.action_probs = []
    self.saved_log_probs = []
    self.rewards = []

    self.fc = nn.Linear(8192, 1024)
    self.ReLU1_1 = nn.ReLU(inplace=True)

    self.fc_2 = nn.Linear(1044, 512)
    self.ReLU1_2 = nn.ReLU(inplace=True)

    self.lfc = nn.Linear(latent_dim, 256)
    self.lat = nn.Linear(256, img_size - window_size)
    self.lon = nn.Linear(256, img_size - window_size)

    self.fc_a = nn.Linear(latent_dim, 256)
    self.mean_a = nn.Linear(256, 1)
    self.sigma_a = nn.Linear(256, 1) 

    self.fc_b = nn.Linear(latent_dim, 256)
    self.mean_b = nn.Linear(256, 1)
    self.sigma_b = nn.Linear(256, 1) 

    self.fc_c = nn.Linear(latent_dim, 256)
    self.mean_c = nn.Linear(256, 1)
    self.sigma_c = nn.Linear(256, 1) 

    self.fc_d = nn.Linear(latent_dim, 256)
    self.mean_d = nn.Linear(256, 1)
    self.sigma_d = nn.Linear(256, 1) 

    self.fc_e = nn.Linear(latent_dim, 256)
    self.mean_e = nn.Linear(256, 1)
    self.sigma_e = nn.Linear(256, 1) 

    self.fc_f = nn.Linear(latent_dim, 256)
    self.mean_f = nn.Linear(256, 1)
    self.sigma_f = nn.Linear(256, 1) 

    self.fc_g = nn.Linear(latent_dim, 256)
    self.mean_g = nn.Linear(256, 1)
    self.sigma_g = nn.Linear(256, 1) 

    self.fc_h = nn.Linear(latent_dim, 256)
    self.mean_h = nn.Linear(256, 1)
    self.sigma_h = nn.Linear(256, 1) 

  def forward(self, efficientnet_encoding, efficientnet_output, goal_vector):
    r = torch.flatten(efficientnet_encoding, 1)
    r = self.fc(r)
    r = self.ReLU1_1(r)

    z = torch.cat((r, efficientnet_output, goal_vector), 1)
    z = self.ReLU1_2(self.fc_2(z))

    # Decode the hidden state of the last time step
    lz = nn.ReLU()(self.lfc(z))
    lat = Categorical(nn.Softmax(dim=1)(self.lat(lz)))
    lon = Categorical(nn.Softmax(dim=1)(self.lon(lz)))

    z_a = nn.ReLU()(self.fc_a(z))
    mu_a = self.mean_a(z_a)
    sig_a = nn.Softplus()(self.sigma_a(z_a)) + 1e-5
    style_a = Normal(mu_a, sig_a)

    z_b = nn.ReLU()(self.fc_b(z))
    mu_b = self.mean_b(z_b)
    sig_b = nn.Softplus()(self.sigma_b(z_b)) + 1e-5
    style_b = Normal(mu_b, sig_b)

    z_c = nn.ReLU()(self.fc_c(z))
    mu_c = self.mean_c(z_c)
    sig_c = nn.Softplus()(self.sigma_c(z_c)) + 1e-5
    style_c = Normal(mu_c, sig_c)

    z_d = nn.ReLU()(self.fc_d(z))
    mu_d = self.mean_d(z_d)
    sig_d = nn.Softplus()(self.sigma_d(z_d)) + 1e-5
    style_d = Normal(mu_d, sig_d)

    z_e = nn.ReLU()(self.fc_e(z))
    mu_e = self.mean_e(z_e)
    sig_e = nn.Softplus()(self.sigma_e(z_e)) + 1e-5
    style_e = Normal(mu_e, sig_e)

    z_f = nn.ReLU()(self.fc_f(z))
    mu_f = self.mean_f(z_f)
    sig_f = nn.Softplus()(self.sigma_f(z_f)) + 1e-5
    style_f = Normal(mu_f, sig_f)

    z_g = nn.ReLU()(self.fc_g(z))
    mu_g = self.mean_g(z_g)
    sig_g = nn.Softplus()(self.sigma_g(z_g)) + 1e-5
    style_g = Normal(mu_g, sig_g)

    z_h = nn.ReLU()(self.fc_h(z))
    mu_h = self.mean_h(z_h)
    sig_h = nn.Softplus()(self.sigma_h(z_h)) + 1e-5
    style_h = Normal(mu_h, sig_h)
    
    return lat, lon, style_a, style_b, style_c, style_d, style_e, style_f, style_g, style_h

  def get_action(self, efficientnet_encoding, efficientnet_output, goal_vector):
    lat, lon, style_a, style_b, style_c, style_d, style_e, style_f, style_g, style_h = self.forward(efficientnet_encoding, efficientnet_output, goal_vector)
    lat_action = lat.sample() 
    lon_action = lon.sample()
    style_a_action = style_a.sample()
    style_b_action = style_b.sample()
    style_c_action = style_c.sample()
    style_d_action = style_d.sample()
    style_e_action = style_e.sample()
    style_f_action = style_f.sample()
    style_g_action = style_g.sample()
    style_h_action = style_h.sample()
    action_log_prob = lat.log_prob(lat_action) \
                      * lon.log_prob(lon_action) \
                      * style_a.log_prob(style_a_action) \
                      * style_b.log_prob(style_b_action) \
                      * style_c.log_prob(style_c_action) \
                      * style_d.log_prob(style_d_action) \
                      * style_e.log_prob(style_e_action) \
                      * style_f.log_prob(style_f_action) \
                      * style_g.log_prob(style_g_action) \
                      * style_h.log_prob(style_h_action) 
    self.action_probs.append(action_log_prob)
    return lat_action, lon_action, style_a_action, style_b_action, style_c_action, \
            style_d_action, style_e_action, style_f_action, style_g_action, style_h_action

  def episode_finished(self):
    #print("Self.action_probs : {}".format(self.action_probs))
    action_probs = torch.stack(self.action_probs, dim=0) \
            .to(device).squeeze(-1)
    rewards = torch.stack(self.rewards, dim=0).to(device).squeeze(-1)
    self.action_probs, self.rewards = [], []

    G = discount_rewards(rewards, self.gamma)
    #print("G : {}".format(G))
    G = (G - G.mean()) / (G.std() + 1e-9) #normalize discounted rewards
    loss = (-1 * action_probs * G.detach()).sum()
    #print("G : {}, Rewards : {}, Action_probs : {}".format(G, rewards, action_probs))
    sys.stdout.write("\rReinforce Loss : {} Reinforce Mean Reward : {}".format(loss, rewards.detach().mean()))
    sys.stdout.flush()
    return loss

  def store_outcome(self, reward):
    self.rewards.append(torch.Tensor([reward]))

params

In [0]:
dataset_name        = 'cifar10'
epoch               = 215           #epoch to start training from
n_epochs            = 2000         #number of epochs of training
batch_size          = 10         #size of the batches
lr                  = 0.0001      #adam: learning rate
b1                  = 0.5         #adam: decay of first order momentum of gradient
b2                  = 0.999       #adam: decay of first order momentum of gradient
decay_epoch         = 100         #epoch from which to start lr decay
img_height          = 64         #size of image height
img_width           = 64         #size of image width
channels            = 3           #number of image channels
sample_interval     = 400         #interval saving generator samples
n_downsample        = 5           #number downsampling layers in encoder
n_residual          = 2           #number of residual blocks in encoder / decoder
dim                 = 64          #number of filters in first encoder layer
style_dim           = 8        #dimensionality of the style code
reinforce_iterations= 64
inner_loop_batch_size=10          #how many iterations to use for training the discriminator and dec/enc
outerstepsize0       = 0.1 # stepsize of outer optimization, i.e., meta-optimization

In [0]:
from torchvision import datasets, transforms
train_dataset = datasets.CIFAR10(root="./data/",
                                transform=transforms.Compose([
                                    transforms.Resize(64),
                                    transforms.RandomCrop(16),
                                    transforms.Resize(64),
                                    transforms.ToTensor(),
                                    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]),
                                train=True,
                                download=True)

train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
    drop_last=True)

Files already downloaded and verified


In [0]:
# Create sample and checkpoint directories
if not os.path.isdir("/content/drive/My Drive/model"):
  os.makedirs("/content/drive/My Drive/model")
if not os.path.isdir("/content/drive/My Drive/model/images/"):
  os.makedirs("/content/drive/My Drive/model/images/")
if not os.path.isdir("/content/drive/My Drive/model/saved_models/"):
  os.makedirs("/content/drive/My Drive/model/saved_models/")
if not os.path.isdir("/content/drive/My Drive/model/saved_models/cifar10/"):
  os.makedirs("/content/drive/My Drive/model/saved_models/cifar10/")

In [0]:
CrossEntropy_criterion = nn.CrossEntropyLoss()
L1_criterion = torch.nn.L1Loss()

# Initialize encoders, generators and discriminators
StylePolicy = StylePolicy()

Enc = Encoder(dim=dim, n_downsample=n_downsample, n_residual=n_residual, style_dim=style_dim)
Dec = Decoder(dim=dim, n_upsample=n_downsample, n_residual=n_residual, style_dim=style_dim)

D = MultiDiscriminator()

In [0]:
if cuda:
  StylePolicy.to(device)
  Enc = Enc.to(device)
  Dec = Dec.to(device)
  D = D.to(device)

  CrossEntropy_criterion = CrossEntropy_criterion.to(device)
  L1_criterion = L1_criterion.to(device)

if epoch != 0:
# Load pretrained models
  StylePolicy.load_state_dict(torch.load("/content/drive/My Drive/model/saved_models/%s/Policy.pth" % (dataset_name)))
  Enc.load_state_dict(torch.load("/content/drive/My Drive/model/saved_models/%s/Enc.pth" % (dataset_name)))
  Dec.load_state_dict(torch.load("/content/drive/My Drive/model/saved_models/%s/Dec.pth" % (dataset_name)))
  D.load_state_dict(torch.load("/content/drive/My Drive/model/saved_models/%s/D.pth" % (dataset_name)))

else:
  # Initialize weights
  StylePolicy.apply(weights_init_normal)
  Enc.apply(weights_init_normal)
  Dec.apply(weights_init_normal)
  D.apply(weights_init_normal)

# Loss weights
lambda_gan = 5
lambda_style = 10
lambda_cont = 1

# Optimizers
optimizer_G = torch.optim.Adam(
    itertools.chain(Enc.parameters(), Dec.parameters()),
    lr=lr,
    betas=(b1, b2),
)
optimizer_D = torch.optim.Adam(D.parameters(), lr=lr, betas=(b1, b2))

# Learning rate update schedulers
lr_scheduler_G = torch.optim.lr_scheduler.LambdaLR(
    optimizer_G, lr_lambda=LambdaLR(n_epochs, epoch, decay_epoch).step
)
lr_scheduler_D = torch.optim.lr_scheduler.LambdaLR(
    optimizer_D, lr_lambda=LambdaLR(n_epochs, epoch, decay_epoch).step
)

reinforce_optimizer = torch.optim.SGD(StylePolicy.parameters(), lr=.0001, momentum=0.9, weight_decay=5e-4)

In [0]:
# Adversarial ground truths
valid = 1
fake = 0

log_file = open("/content/drive/My Drive/model/log.txt", 'a')

for epoch in trange(epoch, n_epochs, desc="Epoch"):
  torch.cuda.empty_cache()
  #save the initial weights for the REPTILE loop
  weights_original = deepcopy(StylePolicy.state_dict())
  new_weights = []
  #running_reward = 10

  for i in trange(batch_size, desc="Batch"):
    torch.cuda.empty_cache()
    #initialize empty tensors to save information to train the discriminators and generators
    content_code_stack = torch.empty(32, 2048, 2, 2)
    content_code_recovered_stack = torch.empty(32, 2048, 2, 2)
    style_code_stack = torch.empty(32, style_dim, 1, 1)
    style_code_recovered_stack = torch.empty(32, style_dim, 1, 1)
    X_stack = torch.empty(32, channels, img_height, img_width).cuda()

    #create random goal class one-hot encoded vector
    indices = torch.randint(0,10, size=(1,1))
    one_hot = torch.nn.functional.one_hot(indices, 10) # size=(4,7,n)
    goal = one_hot.squeeze(0).float().cuda()
    goal_reinforce = indices.long().cuda()[0]

    #generate gaussian noise image
    mu, sigma = 0, 1.0 # mean and standard deviation
    image = np.random.normal(mu, sigma, [img_height, img_width, channels])
    image = 255*(image - np.min(image))/np.ptp(image)
    X = image.astype(np.uint8)

    X = efficientnet_transform(transforms.ToPILImage()(X).convert("RGB"))
    X.unsqueeze_(0)
    X = Variable(X).to(device)

    efficientnet_features = efficientnet.extract_features(X)
    efficientnet_class = efficientnet.classify(efficientnet_features)
    policycode = StylePolicy.get_action(efficientnet_features, efficientnet_class, goal)
  
    #iterations of recurrent image development, 100 good enough? Potentially explore stacking samples for proper batching
    for reinforce_iteration in range(reinforce_iterations):
      X_focus = X[:, :, policycode[0]:policycode[0] + 16, policycode[1]:policycode[1] + 16]
      X_focus = torch.nn.functional.interpolate(X_focus, size=[img_height, img_width], mode = 'bilinear')
      
      content_code = efficientnet.extract_features(X_focus)
      style_code = torch.stack(policycode[2:]).unsqueeze(dim=0)

      # Reconstruct images
      X_hat = Dec(content_code, style_code)

      X[:, :, policycode[0]:policycode[0] + 16, policycode[1]:policycode[1] + 16] = \
                      torch.nn.functional.interpolate(X_hat, size=[16, 16], mode = 'bilinear')

      efficientnet_features = efficientnet.extract_features(X)
      Efficientnet_encoding = torch.nn.Softmax(dim=1)(efficientnet.classify(efficientnet_features))
      Efficientnet_encdoing = Efficientnet_encoding

      if reinforce_iteration == reinforce_iterations - 33:
        content_code_stack[reinforce_iteration, :, :, :] = content_code
        style_code_stack[reinforce_iteration, :, :, :] = style_code
        X_stack[reinforce_iteration, :, :, :] = X_hat

      #get the reward for each step taken during the inner model training for the REINFORCE algorithm
      reward = -1 * CrossEntropy_criterion(Efficientnet_encoding, goal_reinforce)

      StylePolicy.store_outcome(reward)

      policycode = StylePolicy.get_action(efficientnet_features, Efficientnet_encoding, goal)

      sys.stdout.write('\rEpoch %d Batch %d Iteration %d Reward %.2f%%' % (epoch, i, reinforce_iteration, reward))
      sys.stdout.flush()

      #save data if batch number is 0
      if i == 0:
        if not os.path.isdir("/content/drive/My Drive/model/images/{}".format(epoch)):
          os.makedirs("/content/drive/My Drive/model/images/{}".format(epoch))
        torchvision.utils.save_image(X, '/content/drive/My Drive/model/images/{}/iter_{}.jpg'.format(epoch, reinforce_iteration))
      line = "\r\rEpoch:{},Batch:{},Iter:{},Reward:{}".format(epoch, i, reinforce_iteration, reward)
      log_file.write(line)
      #print(line)

    # Cycle translation
    content_code_recovered_stack, style_code_recovered_stack = Enc(X_stack)

    # Losses
    loss_GAN = 5 * D.compute_loss(X_stack.detach(), valid)
    loss_s = 10 * L1_criterion(style_code_recovered_stack.cuda().detach(), style_code_stack.cuda().detach()) #.detach().item()
    loss_c = lambda_cont * L1_criterion(content_code_recovered_stack.cuda().detach(), content_code_stack.cuda().detach()) #.detach().item()
    
    optimizer_G.zero_grad()
    loss_G = loss_GAN + loss_s + loss_c
    loss_G.backward()
    optimizer_G.step()

    optimizer_D.zero_grad()
    true_image = next(iter(train_loader))[0]
    loss_D = D.compute_loss(true_image.cuda(), valid) + D.compute_loss(X_hat.detach(), fake)
    loss_D.backward()
    optimizer_D.step()

    #finish iteration - train REINFORCE algo
    reinforce_loss = StylePolicy.episode_finished()
    reinforce_optimizer.zero_grad()
    reinforce_loss.backward()
    reinforce_optimizer.step()
    log_file.write("Epoch:{},Batch:{},Reinforce_Loss:{}\n".format(epoch, i, reinforce_loss))

    #save trained weights & reload the weights from the original model
    new_weights.append(deepcopy(StylePolicy.state_dict()))
    StylePolicy.load_state_dict({ name: weights_original[name] for name in weights_original })

  #meta training step, REPTILE
  ws = len(new_weights)
  fweights = { name : new_weights[0][name]/float(ws) for name in new_weights[0] }
  for i in range(1, ws):
      #cur_weights = deepcopy(model.state_dict())
      for name in new_weights[i]:
          fweights[name] += new_weights[i][name]/float(ws)
  outerstepsize = outerstepsize0 * (1 - epoch / n_epochs) # linear schedule
  StylePolicy.load_state_dict({name : 
      weights_original[name] + ((fweights[name] - weights_original[name]) * outerstepsize) 
      for name in weights_original})
  
  # Update learning rates
  lr_scheduler_G.step()
  lr_scheduler_D.step()
  torch.save(StylePolicy.state_dict(), "/content/drive/My Drive/model/saved_models/%s/Policy.pth" % (dataset_name))
  torch.save(Enc.state_dict(), "/content/drive/My Drive/model/saved_models/%s/Enc.pth" % (dataset_name))
  torch.save(Dec.state_dict(), "/content/drive/My Drive/model/saved_models/%s/Dec.pth" % (dataset_name))
  torch.save(D.state_dict(), "/content/drive/My Drive/model/saved_models/%s/D.pth" % (dataset_name))


Epoch:   0%|          | 0/1785 [00:00<?, ?it/s]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Epoch 215 Batch 0 Iteration 0 Reward -2.43%

  "See the documentation of nn.Upsample for details.".format(mode))


Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.375192165374756


Batch:  10%|█         | 1/10 [00:24<03:43, 24.87s/it][A

Reinforce Loss : 0.015625 Reinforce Max Reward : -1.848824381828308


Batch:  20%|██        | 2/10 [00:40<02:57, 22.19s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.4218201637268066


Batch:  30%|███       | 3/10 [00:56<02:20, 20.14s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.1174798011779785


Batch:  40%|████      | 4/10 [01:11<01:52, 18.79s/it][A

Reinforce Loss : 0.015625 Reinforce Max Reward : -2.384948253631592


Batch:  50%|█████     | 5/10 [01:27<01:29, 17.93s/it][A

Reinforce Loss : 0.017578125 Reinforce Max Reward : -2.4234230518341064


Batch:  60%|██████    | 6/10 [01:43<01:09, 17.32s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.33429217338562


Batch:  70%|███████   | 7/10 [01:59<00:50, 16.81s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.291266441345215


Batch:  80%|████████  | 8/10 [02:15<00:33, 16.50s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.4446089267730713


Batch:  90%|█████████ | 9/10 [02:30<00:16, 16.25s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.3993353843688965


Batch: 100%|██████████| 10/10 [02:46<00:00, 16.15s/it][A
Epoch:   0%|          | 1/1785 [03:00<89:31:16, 180.65s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.017578125 Reinforce Max Reward : -2.4324231147766113


Batch:  10%|█         | 1/10 [00:21<03:15, 21.71s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.38482403755188


Batch:  20%|██        | 2/10 [00:37<02:39, 19.97s/it][A

Reinforce Loss : -0.0009765625 Reinforce Max Reward : -2.404481887817383


Batch:  30%|███       | 3/10 [00:53<02:10, 18.70s/it][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -2.239600896835327


Batch:  40%|████      | 4/10 [01:09<01:46, 17.79s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.252746105194092


Batch:  50%|█████     | 5/10 [01:24<01:26, 17.21s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.4361155033111572


Batch:  60%|██████    | 6/10 [01:40<01:07, 16.85s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -1.9546704292297363


Batch:  70%|███████   | 7/10 [01:56<00:49, 16.39s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.393582582473755


Batch:  80%|████████  | 8/10 [02:11<00:32, 16.20s/it][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -2.424652576446533


Batch:  90%|█████████ | 9/10 [02:27<00:16, 16.01s/it][A

Reinforce Loss : 0.01171875 Reinforce Max Reward : -2.4236345291137695


Batch: 100%|██████████| 10/10 [02:43<00:00, 15.91s/it][A
Epoch:   0%|          | 2/1785 [06:00<89:24:32, 180.52s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : 0.013671875 Reinforce Max Reward : -2.398489475250244


Batch:  10%|█         | 1/10 [00:18<02:46, 18.46s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.3650174140930176


Batch:  20%|██        | 2/10 [00:34<02:21, 17.64s/it][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -2.433488130569458


Batch:  30%|███       | 3/10 [00:49<01:59, 17.04s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.4204814434051514


Batch:  40%|████      | 4/10 [01:05<01:39, 16.64s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.4329404830932617


Batch:  50%|█████     | 5/10 [01:21<01:21, 16.30s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.4354090690612793


Batch:  60%|██████    | 6/10 [01:36<01:04, 16.09s/it][A

Reinforce Loss : -0.01171875 Reinforce Max Reward : -2.414299249649048


Batch:  70%|███████   | 7/10 [01:52<00:47, 15.93s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.418654203414917


Batch:  80%|████████  | 8/10 [02:07<00:31, 15.83s/it][A

Reinforce Loss : 0.005859375 Reinforce Max Reward : -2.0919175148010254


Batch:  90%|█████████ | 9/10 [02:23<00:15, 15.83s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.412412166595459


Batch: 100%|██████████| 10/10 [02:39<00:00, 15.76s/it][A
Epoch:   0%|          | 3/1785 [08:55<88:29:29, 178.77s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.389160633087158


Batch:  10%|█         | 1/10 [00:20<03:00, 20.10s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.0382847785949707


Batch:  20%|██        | 2/10 [00:35<02:30, 18.77s/it][A

Reinforce Loss : -0.0009765625 Reinforce Max Reward : -2.3208353519439697


Batch:  30%|███       | 3/10 [00:51<02:04, 17.86s/it][A

Reinforce Loss : -0.0087890625 Reinforce Max Reward : -2.449061870574951


Batch:  40%|████      | 4/10 [01:07<01:42, 17.16s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.42130708694458


Batch:  50%|█████     | 5/10 [01:22<01:23, 16.74s/it][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -2.108424425125122


Batch:  60%|██████    | 6/10 [01:38<01:05, 16.36s/it][A

Reinforce Loss : -0.0087890625 Reinforce Max Reward : -2.376765251159668


Batch:  70%|███████   | 7/10 [01:53<00:48, 16.17s/it][A

Reinforce Loss : 0.0087890625 Reinforce Max Reward : -2.4314520359039307


Batch:  80%|████████  | 8/10 [02:09<00:32, 16.01s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.3625364303588867


Batch:  90%|█████████ | 9/10 [02:25<00:15, 15.90s/it][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -2.4150803089141846


Batch: 100%|██████████| 10/10 [02:40<00:00, 15.81s/it][A
Epoch:   0%|          | 4/1785 [11:53<88:20:14, 178.56s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : 0.015625 Reinforce Max Reward : -2.4331531524658203


Batch:  10%|█         | 1/10 [00:20<03:06, 20.77s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.3923516273498535


Batch:  20%|██        | 2/10 [00:36<02:33, 19.20s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.4064221382141113


Batch:  30%|███       | 3/10 [00:52<02:07, 18.16s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.436652421951294


Batch:  40%|████      | 4/10 [01:07<01:44, 17.38s/it][A

Reinforce Loss : 0.0029296875 Reinforce Max Reward : -2.419875144958496


Batch:  50%|█████     | 5/10 [01:23<01:24, 16.84s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.4223380088806152


Batch:  60%|██████    | 6/10 [01:39<01:07, 16.78s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.408853530883789


Batch:  70%|███████   | 7/10 [01:56<00:49, 16.62s/it][A

Reinforce Loss : -0.0009765625 Reinforce Max Reward : -2.440159320831299


Batch:  80%|████████  | 8/10 [02:11<00:32, 16.38s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.4350671768188477


Batch:  90%|█████████ | 9/10 [02:27<00:16, 16.21s/it][A

Reinforce Loss : 0.01171875 Reinforce Max Reward : -2.437775135040283


Batch: 100%|██████████| 10/10 [02:43<00:00, 16.02s/it][A
Epoch:   0%|          | 5/1785 [14:49<87:56:02, 177.84s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.4131858348846436


Batch:  10%|█         | 1/10 [00:22<03:18, 22.10s/it][A

Reinforce Loss : 0.046875 Reinforce Max Reward : -2.322720527648926


Batch:  20%|██        | 2/10 [00:38<02:42, 20.25s/it][A

Reinforce Loss : 0.0068359375 Reinforce Max Reward : -2.436614990234375


Batch:  30%|███       | 3/10 [00:53<02:12, 18.89s/it][A

Reinforce Loss : 0.01171875 Reinforce Max Reward : -2.343397855758667


Batch:  40%|████      | 4/10 [01:09<01:47, 17.96s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.4263312816619873


Batch:  50%|█████     | 5/10 [01:25<01:26, 17.31s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.4072155952453613


Batch:  60%|██████    | 6/10 [01:41<01:07, 16.82s/it][A

Reinforce Loss : 0.005859375 Reinforce Max Reward : -2.357064723968506


Batch:  70%|███████   | 7/10 [01:56<00:49, 16.49s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.4063472747802734


Batch:  80%|████████  | 8/10 [02:12<00:32, 16.22s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.2857742309570312


Batch:  90%|█████████ | 9/10 [02:28<00:16, 16.10s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.4420242309570312


Batch: 100%|██████████| 10/10 [02:43<00:00, 15.92s/it][A
Epoch:   0%|          | 6/1785 [17:48<87:58:53, 178.04s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.01953125 Reinforce Max Reward : -2.418424606323242


Batch:  10%|█         | 1/10 [00:20<03:04, 20.48s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.123983144760132


Batch:  20%|██        | 2/10 [00:36<02:32, 19.05s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.341032028198242


Batch:  30%|███       | 3/10 [00:51<02:06, 18.01s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.428856372833252


Batch:  40%|████      | 4/10 [01:07<01:44, 17.34s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.299901008605957


Batch:  50%|█████     | 5/10 [01:23<01:24, 16.85s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.390477180480957


Batch:  60%|██████    | 6/10 [01:39<01:06, 16.52s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.431690216064453


Batch:  70%|███████   | 7/10 [01:54<00:48, 16.24s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.424732208251953


Batch:  80%|████████  | 8/10 [02:10<00:32, 16.10s/it][A

Reinforce Loss : 0.0234375 Reinforce Max Reward : -2.4400248527526855


Batch:  90%|█████████ | 9/10 [02:26<00:15, 15.98s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.329132556915283


Batch: 100%|██████████| 10/10 [02:41<00:00, 15.83s/it][A
Epoch:   0%|          | 7/1785 [20:45<87:52:15, 177.92s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.3712997436523438


Batch:  10%|█         | 1/10 [00:20<03:08, 20.95s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.4238710403442383


Batch:  20%|██        | 2/10 [00:36<02:35, 19.46s/it][A

Reinforce Loss : 0.0048828125 Reinforce Max Reward : -2.4504809379577637


Batch:  30%|███       | 3/10 [00:52<02:08, 18.30s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.4362192153930664


Batch:  40%|████      | 4/10 [01:08<01:44, 17.49s/it][A

Reinforce Loss : 0.01171875 Reinforce Max Reward : -2.429286479949951


Batch:  50%|█████     | 5/10 [01:23<01:24, 16.93s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -1.9046273231506348


Batch:  60%|██████    | 6/10 [01:39<01:06, 16.59s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.410691261291504


Batch:  70%|███████   | 7/10 [01:55<00:48, 16.32s/it][A

Reinforce Loss : -0.0009765625 Reinforce Max Reward : -2.3955323696136475


Batch:  80%|████████  | 8/10 [02:10<00:32, 16.10s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.4131879806518555


Batch:  90%|█████████ | 9/10 [02:26<00:16, 16.01s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.3701391220092773


Batch: 100%|██████████| 10/10 [02:42<00:00, 15.85s/it][A
Epoch:   0%|          | 8/1785 [23:43<87:48:36, 177.89s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.4420363903045654


Batch:  10%|█         | 1/10 [00:21<03:09, 21.03s/it][A

Reinforce Loss : -0.015625 Reinforce Max Reward : -2.1481618881225586


Batch:  20%|██        | 2/10 [00:36<02:35, 19.50s/it][A

Reinforce Loss : -0.0029296875 Reinforce Max Reward : -2.4012451171875


Batch:  30%|███       | 3/10 [00:52<02:08, 18.33s/it][A

Reinforce Loss : -0.013671875 Reinforce Max Reward : -2.440882682800293


Batch:  40%|████      | 4/10 [01:08<01:45, 17.54s/it][A

Reinforce Loss : 0.0029296875 Reinforce Max Reward : -2.432044267654419


Batch:  50%|█████     | 5/10 [01:23<01:24, 16.99s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.4368245601654053


Batch:  60%|██████    | 6/10 [01:39<01:06, 16.63s/it][A

Reinforce Loss : -0.0107421875 Reinforce Max Reward : -2.302730083465576


Batch:  70%|███████   | 7/10 [01:55<00:48, 16.31s/it][A

Reinforce Loss : 0.013671875 Reinforce Max Reward : -2.4245073795318604


Batch:  80%|████████  | 8/10 [02:10<00:32, 16.10s/it][A

Reinforce Loss : 0.0009765625 Reinforce Max Reward : -2.4073615074157715


Batch:  90%|█████████ | 9/10 [02:26<00:15, 15.98s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.104705333709717


Batch: 100%|██████████| 10/10 [02:42<00:00, 15.91s/it][A
Epoch:   1%|          | 9/1785 [26:40<87:34:04, 177.50s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : 0.005859375 Reinforce Max Reward : -2.328559398651123


Batch:  10%|█         | 1/10 [00:22<03:21, 22.39s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.4275524616241455


Batch:  20%|██        | 2/10 [00:38<02:43, 20.42s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.407785415649414


Batch:  30%|███       | 3/10 [00:53<02:12, 18.97s/it][A

Reinforce Loss : -0.0048828125 Reinforce Max Reward : -2.4408013820648193


Batch:  40%|████      | 4/10 [01:09<01:47, 17.96s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.1956467628479004


Batch:  50%|█████     | 5/10 [01:25<01:26, 17.26s/it][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -2.4004831314086914


Batch:  60%|██████    | 6/10 [01:40<01:07, 16.84s/it][A

Reinforce Loss : -0.0048828125 Reinforce Max Reward : -1.9879034757614136


Batch:  70%|███████   | 7/10 [01:56<00:49, 16.44s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.428640365600586


Batch:  80%|████████  | 8/10 [02:12<00:32, 16.21s/it][A

Reinforce Loss : 0.0009765625 Reinforce Max Reward : -2.1738815307617188


Batch:  90%|█████████ | 9/10 [02:27<00:16, 16.07s/it][A

Reinforce Loss : 0.005859375 Reinforce Max Reward : -2.25504732131958


Batch: 100%|██████████| 10/10 [02:43<00:00, 15.96s/it][A
Epoch:   1%|          | 10/1785 [29:37<87:27:34, 177.38s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : 0.005859375 Reinforce Max Reward : -2.206066608428955


Batch:  10%|█         | 1/10 [00:23<03:29, 23.24s/it][A

Reinforce Loss : 0.013671875 Reinforce Max Reward : -2.4312891960144043


Batch:  20%|██        | 2/10 [00:38<02:47, 20.98s/it][A

Reinforce Loss : -0.0029296875 Reinforce Max Reward : -2.420253276824951


Batch:  30%|███       | 3/10 [00:54<02:15, 19.37s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.416213274002075


Batch:  40%|████      | 4/10 [01:10<01:49, 18.31s/it][A

Reinforce Loss : 0.0087890625 Reinforce Max Reward : -1.731136679649353


Batch:  50%|█████     | 5/10 [01:26<01:27, 17.55s/it][A

Reinforce Loss : -0.009765625 Reinforce Max Reward : -2.425722122192383


Batch:  60%|██████    | 6/10 [01:41<01:08, 17.03s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.300166368484497


Batch:  70%|███████   | 7/10 [01:57<00:49, 16.59s/it][A

Reinforce Loss : -0.0029296875 Reinforce Max Reward : -1.9460182189941406


Batch:  80%|████████  | 8/10 [02:13<00:32, 16.33s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.376316785812378


Batch:  90%|█████████ | 9/10 [02:29<00:16, 16.17s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.4106106758117676


Batch: 100%|██████████| 10/10 [02:44<00:00, 16.09s/it][A
Epoch:   1%|          | 11/1785 [32:37<87:44:39, 178.06s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.393561601638794


Batch:  10%|█         | 1/10 [00:21<03:14, 21.66s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.357520580291748


Batch:  20%|██        | 2/10 [00:37<02:38, 19.85s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.385686159133911


Batch:  30%|███       | 3/10 [00:52<02:09, 18.55s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.4322378635406494


Batch:  40%|████      | 4/10 [01:08<01:46, 17.75s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.4241700172424316


Batch:  50%|█████     | 5/10 [01:24<01:25, 17.10s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.4223382472991943


Batch:  60%|██████    | 6/10 [01:40<01:06, 16.74s/it][A

Reinforce Loss : 0.01171875 Reinforce Max Reward : -2.2623443603515625


Batch:  70%|███████   | 7/10 [01:55<00:49, 16.42s/it][A

Reinforce Loss : 0.021484375 Reinforce Max Reward : -2.4248576164245605


Batch:  80%|████████  | 8/10 [02:11<00:32, 16.20s/it][A

Reinforce Loss : -0.0029296875 Reinforce Max Reward : -2.18902587890625


Batch:  90%|█████████ | 9/10 [02:27<00:16, 16.03s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -1.7377442121505737


Batch: 100%|██████████| 10/10 [02:42<00:00, 15.95s/it][A
Epoch:   1%|          | 12/1785 [35:37<88:06:46, 178.91s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.3806509971618652


Batch:  10%|█         | 1/10 [00:18<02:44, 18.31s/it][A

Reinforce Loss : -0.0029296875 Reinforce Max Reward : -2.356743812561035


Batch:  20%|██        | 2/10 [00:34<02:20, 17.55s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -1.9482030868530273


Batch:  30%|███       | 3/10 [00:49<01:59, 17.05s/it][A

Reinforce Loss : 0.0234375 Reinforce Max Reward : -2.2509121894836426


Batch:  40%|████      | 4/10 [01:05<01:39, 16.66s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.002223014831543


Batch:  50%|█████     | 5/10 [01:21<01:21, 16.38s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.4004149436950684


Batch:  60%|██████    | 6/10 [01:37<01:04, 16.15s/it][A

Reinforce Loss : 0.009765625 Reinforce Max Reward : -2.4332997798919678


Batch:  70%|███████   | 7/10 [01:52<00:47, 15.97s/it][A

Reinforce Loss : 0.0078125 Reinforce Max Reward : -2.3179993629455566


Batch:  80%|████████  | 8/10 [02:08<00:31, 15.91s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.431673049926758


Batch:  90%|█████████ | 9/10 [02:24<00:15, 15.84s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.2848079204559326


Batch: 100%|██████████| 10/10 [02:39<00:00, 15.79s/it][A
Epoch:   1%|          | 13/1785 [38:31<87:12:32, 177.17s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.406012773513794


Batch:  10%|█         | 1/10 [00:23<03:28, 23.22s/it][A

Reinforce Loss : -0.017578125 Reinforce Max Reward : -2.4215383529663086


Batch:  20%|██        | 2/10 [00:38<02:47, 20.98s/it][A

Reinforce Loss : 0.0009765625 Reinforce Max Reward : -2.4519548416137695


Batch:  30%|███       | 3/10 [00:54<02:15, 19.42s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.3439793586730957


Batch:  40%|████      | 4/10 [01:10<01:50, 18.34s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.155985116958618


Batch:  50%|█████     | 5/10 [01:26<01:27, 17.53s/it][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -2.3741164207458496


Batch:  60%|██████    | 6/10 [01:41<01:07, 16.94s/it][A

Reinforce Loss : -0.0048828125 Reinforce Max Reward : -2.4173507690429688


Batch:  70%|███████   | 7/10 [01:57<00:49, 16.57s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.329667091369629


Batch:  80%|████████  | 8/10 [02:13<00:32, 16.25s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.172365188598633


Batch:  90%|█████████ | 9/10 [02:28<00:16, 16.06s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.42557954788208


Batch: 100%|██████████| 10/10 [02:44<00:00, 15.92s/it][A
Epoch:   1%|          | 14/1785 [41:31<87:35:35, 178.06s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : 0.01171875 Reinforce Max Reward : -2.4445061683654785


Batch:  10%|█         | 1/10 [00:20<03:03, 20.41s/it][A

Reinforce Loss : 0.0009765625 Reinforce Max Reward : -2.4384682178497314


Batch:  20%|██        | 2/10 [00:36<02:31, 18.98s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -1.8865993022918701


Batch:  30%|███       | 3/10 [00:51<02:06, 18.00s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.0204355716705322


Batch:  40%|████      | 4/10 [01:07<01:43, 17.23s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.431812047958374


Batch:  50%|█████     | 5/10 [01:22<01:23, 16.73s/it][A

Reinforce Loss : 0.013671875 Reinforce Max Reward : -2.2717771530151367


Batch:  60%|██████    | 6/10 [01:38<01:05, 16.46s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.2447543144226074


Batch:  70%|███████   | 7/10 [01:54<00:48, 16.22s/it][A

Reinforce Loss : -0.021484375 Reinforce Max Reward : -2.4234886169433594


Batch:  80%|████████  | 8/10 [02:09<00:32, 16.04s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.428476333618164


Batch:  90%|█████████ | 9/10 [02:25<00:15, 15.87s/it][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -1.820874810218811


Batch: 100%|██████████| 10/10 [02:40<00:00, 15.78s/it][A
Epoch:   1%|          | 15/1785 [44:27<87:13:10, 177.40s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.01171875 Reinforce Max Reward : -2.428915023803711


Batch:  10%|█         | 1/10 [00:22<03:18, 22.01s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.428250789642334


Batch:  20%|██        | 2/10 [00:37<02:40, 20.08s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.3433775901794434


Batch:  30%|███       | 3/10 [00:53<02:11, 18.80s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.4006776809692383


Batch:  40%|████      | 4/10 [01:08<01:46, 17.81s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.186481475830078


Batch:  50%|█████     | 5/10 [01:24<01:25, 17.18s/it][A

Reinforce Loss : 0.0009765625 Reinforce Max Reward : -2.4189062118530273


Batch:  60%|██████    | 6/10 [01:40<01:06, 16.68s/it][A

Reinforce Loss : 0.009765625 Reinforce Max Reward : -2.0960116386413574


Batch:  70%|███████   | 7/10 [01:55<00:49, 16.38s/it][A

Reinforce Loss : 0.01171875 Reinforce Max Reward : -2.2887587547302246


Batch:  80%|████████  | 8/10 [02:11<00:32, 16.07s/it][A

Reinforce Loss : 0.005859375 Reinforce Max Reward : -2.321742296218872


Batch:  90%|█████████ | 9/10 [02:26<00:15, 15.93s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.4222257137298584


Batch: 100%|██████████| 10/10 [02:42<00:00, 15.86s/it][A
Epoch:   1%|          | 16/1785 [47:24<87:10:27, 177.40s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.03515625 Reinforce Max Reward : -2.425053358078003


Batch:  10%|█         | 1/10 [00:21<03:14, 21.63s/it][A

Reinforce Loss : -0.0029296875 Reinforce Max Reward : -2.4385197162628174


Batch:  20%|██        | 2/10 [00:37<02:38, 19.82s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -1.688536286354065


Batch:  30%|███       | 3/10 [00:53<02:10, 18.62s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.275829315185547


Batch:  40%|████      | 4/10 [01:08<01:46, 17.71s/it][A

Reinforce Loss : -0.015625 Reinforce Max Reward : -1.711172103881836


Batch:  50%|█████     | 5/10 [01:24<01:25, 17.03s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.352871894836426


Batch:  60%|██████    | 6/10 [01:39<01:06, 16.58s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.4119930267333984


Batch:  70%|███████   | 7/10 [01:55<00:48, 16.32s/it][A

Reinforce Loss : -0.013671875 Reinforce Max Reward : -2.426175594329834


Batch:  80%|████████  | 8/10 [02:10<00:32, 16.04s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.319709539413452


Batch:  90%|█████████ | 9/10 [02:26<00:15, 15.97s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.2844913005828857


Batch: 100%|██████████| 10/10 [02:42<00:00, 15.88s/it][A
Epoch:   1%|          | 17/1785 [50:25<87:38:10, 178.44s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.0068359375 Reinforce Max Reward : -2.3119707107543945


Batch:  10%|█         | 1/10 [00:20<03:00, 20.01s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -1.8244019746780396


Batch:  20%|██        | 2/10 [00:35<02:29, 18.68s/it][A

Reinforce Loss : 0.013671875 Reinforce Max Reward : -2.292095184326172


Batch:  30%|███       | 3/10 [00:51<02:04, 17.83s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -1.966522216796875


Batch:  40%|████      | 4/10 [01:06<01:42, 17.14s/it][A

Reinforce Loss : -0.02734375 Reinforce Max Reward : -2.4197487831115723


Batch:  50%|█████     | 5/10 [01:22<01:23, 16.68s/it][A

Reinforce Loss : 0.005859375 Reinforce Max Reward : -2.432598352432251


Batch:  60%|██████    | 6/10 [01:38<01:05, 16.35s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.326305389404297


Batch:  70%|███████   | 7/10 [01:53<00:48, 16.16s/it][A

Reinforce Loss : 0.0029296875 Reinforce Max Reward : -2.0107979774475098


Batch:  80%|████████  | 8/10 [02:09<00:32, 16.02s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.4263694286346436


Batch:  90%|█████████ | 9/10 [02:25<00:15, 15.92s/it][A

Reinforce Loss : 0.01171875 Reinforce Max Reward : -2.4385106563568115


Batch: 100%|██████████| 10/10 [02:41<00:00, 15.87s/it][A
Epoch:   1%|          | 18/1785 [53:23<87:28:30, 178.22s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : 0.0068359375 Reinforce Max Reward : -2.31762433052063


Batch:  10%|█         | 1/10 [00:18<02:46, 18.51s/it][A

Reinforce Loss : 0.01171875 Reinforce Max Reward : -2.3782958984375


Batch:  20%|██        | 2/10 [00:34<02:22, 17.79s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.290062189102173


Batch:  30%|███       | 3/10 [00:50<02:00, 17.18s/it][A

Reinforce Loss : 0.013671875 Reinforce Max Reward : -2.4182419776916504


Batch:  40%|████      | 4/10 [01:05<01:40, 16.71s/it][A

Reinforce Loss : 0.01171875 Reinforce Max Reward : -2.4273667335510254


Batch:  50%|█████     | 5/10 [01:21<01:22, 16.42s/it][A

Reinforce Loss : -0.009765625 Reinforce Max Reward : -2.4386043548583984


Batch:  60%|██████    | 6/10 [01:37<01:04, 16.24s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.087024688720703


Batch:  70%|███████   | 7/10 [01:53<00:48, 16.12s/it][A

Reinforce Loss : -0.0009765625 Reinforce Max Reward : -2.4047770500183105


Batch:  80%|████████  | 8/10 [02:09<00:31, 15.98s/it][A

Reinforce Loss : -0.0009765625 Reinforce Max Reward : -2.4290242195129395


Batch:  90%|█████████ | 9/10 [02:24<00:15, 15.90s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.429887294769287


Batch: 100%|██████████| 10/10 [02:40<00:00, 15.81s/it][A
Epoch:   1%|          | 19/1785 [56:17<86:55:13, 177.19s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.2546539306640625


Batch:  10%|█         | 1/10 [00:20<03:06, 20.68s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.43253231048584


Batch:  20%|██        | 2/10 [00:36<02:33, 19.17s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.1036500930786133


Batch:  30%|███       | 3/10 [00:51<02:06, 18.06s/it][A

Reinforce Loss : 0.0009765625 Reinforce Max Reward : -2.4356849193573


Batch:  40%|████      | 4/10 [01:07<01:44, 17.34s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.3675074577331543


Batch:  50%|█████     | 5/10 [01:23<01:24, 16.87s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.3976330757141113


Batch:  60%|██████    | 6/10 [01:38<01:05, 16.46s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.2308688163757324


Batch:  70%|███████   | 7/10 [01:54<00:48, 16.25s/it][A

Reinforce Loss : 0.0078125 Reinforce Max Reward : -2.4253921508789062


Batch:  80%|████████  | 8/10 [02:10<00:32, 16.08s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.1607723236083984


Batch:  90%|█████████ | 9/10 [02:25<00:15, 15.97s/it][A

Reinforce Loss : 0.021484375 Reinforce Max Reward : -2.4223625659942627


Batch: 100%|██████████| 10/10 [02:41<00:00, 15.84s/it][A
Epoch:   1%|          | 20/1785 [59:14<86:47:52, 177.04s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.0009765625 Reinforce Max Reward : -2.4464101791381836


Batch:  10%|█         | 1/10 [00:19<02:58, 19.79s/it][A

Reinforce Loss : -0.0009765625 Reinforce Max Reward : -2.3833508491516113


Batch:  20%|██        | 2/10 [00:35<02:28, 18.62s/it][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -2.4262475967407227


Batch:  30%|███       | 3/10 [00:51<02:04, 17.78s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.323805332183838


Batch:  40%|████      | 4/10 [01:07<01:43, 17.24s/it][A

Reinforce Loss : -0.0087890625 Reinforce Max Reward : -2.4357681274414062


Batch:  50%|█████     | 5/10 [01:23<01:23, 16.78s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -1.9744205474853516


Batch:  60%|██████    | 6/10 [01:39<01:05, 16.50s/it][A

Reinforce Loss : -0.0048828125 Reinforce Max Reward : -2.4333338737487793


Batch:  70%|███████   | 7/10 [01:54<00:48, 16.25s/it][A

Reinforce Loss : -0.0048828125 Reinforce Max Reward : -2.4362616539001465


Batch:  80%|████████  | 8/10 [02:10<00:32, 16.10s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.371426820755005


Batch:  90%|█████████ | 9/10 [02:26<00:16, 16.00s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.293996810913086


Batch: 100%|██████████| 10/10 [02:41<00:00, 15.87s/it][A
Epoch:   1%|          | 21/1785 [1:02:13<86:57:52, 177.48s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.0048828125 Reinforce Max Reward : -2.397808790206909


Batch:  10%|█         | 1/10 [00:19<02:56, 19.59s/it][A

Reinforce Loss : 0.009765625 Reinforce Max Reward : -1.987952470779419


Batch:  20%|██        | 2/10 [00:35<02:27, 18.47s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.11032772064209


Batch:  30%|███       | 3/10 [00:51<02:04, 17.72s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.4417927265167236


Batch:  40%|████      | 4/10 [01:07<01:43, 17.22s/it][A

Reinforce Loss : -0.013671875 Reinforce Max Reward : -2.424806833267212


Batch:  50%|█████     | 5/10 [01:23<01:23, 16.76s/it][A

Reinforce Loss : -0.0029296875 Reinforce Max Reward : -2.184898614883423


Batch:  60%|██████    | 6/10 [01:39<01:06, 16.52s/it][A

Reinforce Loss : 0.01171875 Reinforce Max Reward : -2.4210145473480225


Batch:  70%|███████   | 7/10 [01:54<00:48, 16.29s/it][A

Reinforce Loss : -0.017578125 Reinforce Max Reward : -2.438809394836426


Batch:  80%|████████  | 8/10 [02:10<00:32, 16.20s/it][A

Reinforce Loss : 0.013671875 Reinforce Max Reward : -2.4219751358032227


Batch:  90%|█████████ | 9/10 [02:26<00:16, 16.04s/it][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -2.311931610107422


Batch: 100%|██████████| 10/10 [02:42<00:00, 15.97s/it][A
Epoch:   1%|          | 22/1785 [1:05:12<87:09:28, 177.97s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : 0.0078125 Reinforce Max Reward : -2.3553361892700195


Batch:  10%|█         | 1/10 [00:20<03:02, 20.26s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.236896514892578


Batch:  20%|██        | 2/10 [00:36<02:31, 18.92s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.4362902641296387


Batch:  30%|███       | 3/10 [00:51<02:05, 17.97s/it][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -2.355203151702881


Batch:  40%|████      | 4/10 [01:07<01:44, 17.36s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.4485249519348145


Batch:  50%|█████     | 5/10 [01:23<01:24, 16.81s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.2927095890045166


Batch:  60%|██████    | 6/10 [01:38<01:05, 16.46s/it][A

Reinforce Loss : -0.009765625 Reinforce Max Reward : -2.424907684326172


Batch:  70%|███████   | 7/10 [01:54<00:48, 16.24s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -1.9748427867889404


Batch:  80%|████████  | 8/10 [02:10<00:32, 16.13s/it][A

Reinforce Loss : -0.0107421875 Reinforce Max Reward : -2.431887149810791


Batch:  90%|█████████ | 9/10 [02:26<00:15, 15.95s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.3370323181152344


Batch: 100%|██████████| 10/10 [02:41<00:00, 15.90s/it][A
Epoch:   1%|▏         | 23/1785 [1:08:09<87:02:56, 177.85s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.426577091217041


Batch:  10%|█         | 1/10 [00:19<02:57, 19.73s/it][A

Reinforce Loss : 0.009765625 Reinforce Max Reward : -2.416271686553955


Batch:  20%|██        | 2/10 [00:35<02:28, 18.62s/it][A

Reinforce Loss : 0.0107421875 Reinforce Max Reward : -2.286128044128418


Batch:  30%|███       | 3/10 [00:51<02:04, 17.78s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.4258484840393066


Batch:  40%|████      | 4/10 [01:07<01:43, 17.24s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.4370415210723877


Batch:  50%|█████     | 5/10 [01:23<01:23, 16.77s/it][A

Reinforce Loss : 0.015625 Reinforce Max Reward : -2.4265599250793457


Batch:  60%|██████    | 6/10 [01:39<01:05, 16.48s/it][A

Reinforce Loss : -0.009765625 Reinforce Max Reward : -2.261742353439331


Batch:  70%|███████   | 7/10 [01:54<00:48, 16.24s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.212923049926758


Batch:  80%|████████  | 8/10 [02:10<00:32, 16.13s/it][A

Reinforce Loss : -0.0029296875 Reinforce Max Reward : -2.052536964416504


Batch:  90%|█████████ | 9/10 [02:26<00:15, 15.99s/it][A

Reinforce Loss : 0.0048828125 Reinforce Max Reward : -2.4237825870513916


Batch: 100%|██████████| 10/10 [02:41<00:00, 15.89s/it][A
Epoch:   1%|▏         | 24/1785 [1:11:05<86:41:51, 177.24s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : 0.03125 Reinforce Max Reward : -2.426820993423462


Batch:  10%|█         | 1/10 [00:24<03:38, 24.23s/it][A

Reinforce Loss : 0.0078125 Reinforce Max Reward : -2.2003426551818848


Batch:  20%|██        | 2/10 [00:40<02:53, 21.72s/it][A

Reinforce Loss : 0.0078125 Reinforce Max Reward : -2.425959587097168


Batch:  30%|███       | 3/10 [00:55<02:19, 19.90s/it][A

Reinforce Loss : -0.0029296875 Reinforce Max Reward : -2.4287896156311035


Batch:  40%|████      | 4/10 [01:11<01:51, 18.65s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.431156635284424


Batch:  50%|█████     | 5/10 [01:27<01:28, 17.74s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -1.9981896877288818


Batch:  60%|██████    | 6/10 [01:42<01:08, 17.13s/it][A

Reinforce Loss : 0.0009765625 Reinforce Max Reward : -2.4273946285247803


Batch:  70%|███████   | 7/10 [01:58<00:49, 16.65s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.4277615547180176


Batch:  80%|████████  | 8/10 [02:14<00:32, 16.43s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.438361644744873


Batch:  90%|█████████ | 9/10 [02:29<00:16, 16.19s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.039639472961426


Batch: 100%|██████████| 10/10 [02:45<00:00, 16.12s/it][A
Epoch:   1%|▏         | 25/1785 [1:14:08<87:33:01, 179.08s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.257150411605835


Batch:  10%|█         | 1/10 [00:18<02:48, 18.76s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -1.948468804359436


Batch:  20%|██        | 2/10 [00:34<02:23, 17.90s/it][A

Reinforce Loss : -0.013671875 Reinforce Max Reward : -2.417476177215576


Batch:  30%|███       | 3/10 [00:50<02:00, 17.27s/it][A

Reinforce Loss : 0.01171875 Reinforce Max Reward : -2.1089019775390625


Batch:  40%|████      | 4/10 [01:06<01:41, 16.87s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.139059543609619


Batch:  50%|█████     | 5/10 [01:21<01:22, 16.49s/it][A

Reinforce Loss : 0.009765625 Reinforce Max Reward : -2.4299607276916504


Batch:  60%|██████    | 6/10 [01:38<01:05, 16.40s/it][A

Reinforce Loss : 0.0009765625 Reinforce Max Reward : -2.426693916320801


Batch:  70%|███████   | 7/10 [01:53<00:48, 16.19s/it][A

Reinforce Loss : -0.0048828125 Reinforce Max Reward : -2.22884464263916


Batch:  80%|████████  | 8/10 [02:09<00:32, 16.12s/it][A

Reinforce Loss : -0.0029296875 Reinforce Max Reward : -2.4134292602539062


Batch:  90%|█████████ | 9/10 [02:25<00:16, 16.05s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -1.593005895614624


Batch: 100%|██████████| 10/10 [02:41<00:00, 16.01s/it][A
Epoch:   1%|▏         | 26/1785 [1:17:06<87:14:06, 178.54s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.129917621612549


Batch:  10%|█         | 1/10 [00:20<03:00, 20.10s/it][A

Reinforce Loss : -0.0068359375 Reinforce Max Reward : -2.4316344261169434


Batch:  20%|██        | 2/10 [00:36<02:31, 18.89s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.317141532897949


Batch:  30%|███       | 3/10 [00:52<02:06, 18.04s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.4131245613098145


Batch:  40%|████      | 4/10 [01:08<01:44, 17.42s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.3985705375671387


Batch:  50%|█████     | 5/10 [01:23<01:24, 16.92s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.1230149269104004


Batch:  60%|██████    | 6/10 [01:39<01:06, 16.64s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -1.9292056560516357


Batch:  70%|███████   | 7/10 [01:55<00:49, 16.38s/it][A

Reinforce Loss : 0.015625 Reinforce Max Reward : -2.4325380325317383


Batch:  80%|████████  | 8/10 [02:11<00:32, 16.21s/it][A

Reinforce Loss : 0.0029296875 Reinforce Max Reward : -2.1737592220306396


Batch:  90%|█████████ | 9/10 [02:27<00:16, 16.07s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.3212451934814453


Batch: 100%|██████████| 10/10 [02:43<00:00, 15.99s/it][A
Epoch:   2%|▏         | 27/1785 [1:20:04<87:05:10, 178.33s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.01171875 Reinforce Max Reward : -2.4396116733551025


Batch:  10%|█         | 1/10 [00:20<03:07, 20.84s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.4248886108398438


Batch:  20%|██        | 2/10 [00:36<02:35, 19.39s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -1.8357840776443481


Batch:  30%|███       | 3/10 [00:52<02:08, 18.36s/it][A

Reinforce Loss : 0.0078125 Reinforce Max Reward : -2.0878355503082275


Batch:  40%|████      | 4/10 [01:08<01:45, 17.63s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.408278465270996


Batch:  50%|█████     | 5/10 [01:24<01:25, 17.06s/it][A

Reinforce Loss : 0.013671875 Reinforce Max Reward : -2.040398597717285


Batch:  60%|██████    | 6/10 [01:40<01:06, 16.71s/it][A

Reinforce Loss : -0.0009765625 Reinforce Max Reward : -2.1542515754699707


Batch:  70%|███████   | 7/10 [01:56<00:49, 16.45s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.4365017414093018


Batch:  80%|████████  | 8/10 [02:12<00:32, 16.27s/it][A

Reinforce Loss : 0.0009765625 Reinforce Max Reward : -2.4305708408355713


Batch:  90%|█████████ | 9/10 [02:27<00:16, 16.06s/it][A

Reinforce Loss : 0.015625 Reinforce Max Reward : -2.4261627197265625


Batch: 100%|██████████| 10/10 [02:43<00:00, 16.00s/it][A
Epoch:   2%|▏         | 28/1785 [1:23:03<87:08:16, 178.54s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : 0.1171875 Reinforce Max Reward : -2.423874855041504


Batch:  10%|█         | 1/10 [00:20<03:04, 20.48s/it][A

Reinforce Loss : 0.005859375 Reinforce Max Reward : -2.4191794395446777


Batch:  20%|██        | 2/10 [00:36<02:32, 19.03s/it][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -1.6940851211547852


Batch:  30%|███       | 3/10 [00:52<02:06, 18.09s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.2011871337890625


Batch:  40%|████      | 4/10 [01:07<01:44, 17.45s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.4197912216186523


Batch:  50%|█████     | 5/10 [01:23<01:24, 16.99s/it][A

Reinforce Loss : -0.009765625 Reinforce Max Reward : -2.4038734436035156


Batch:  60%|██████    | 6/10 [01:39<01:06, 16.63s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.146327018737793


Batch:  70%|███████   | 7/10 [01:55<00:49, 16.35s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.3343615531921387


Batch:  80%|████████  | 8/10 [02:11<00:32, 16.15s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -1.9136197566986084


Batch:  90%|█████████ | 9/10 [02:26<00:16, 16.02s/it][A

Reinforce Loss : 0.005859375 Reinforce Max Reward : -2.431185007095337


Batch: 100%|██████████| 10/10 [02:42<00:00, 15.96s/it][A
Epoch:   2%|▏         | 29/1785 [1:25:58<86:39:56, 177.67s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -2.432162046432495


Batch:  10%|█         | 1/10 [00:23<03:28, 23.20s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.42880916595459


Batch:  20%|██        | 2/10 [00:38<02:47, 20.92s/it][A

Reinforce Loss : -0.01171875 Reinforce Max Reward : -2.3019039630889893


Batch:  30%|███       | 3/10 [00:54<02:15, 19.37s/it][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -2.4394679069519043


Batch:  40%|████      | 4/10 [01:10<01:49, 18.30s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.430375099182129


Batch:  50%|█████     | 5/10 [01:26<01:27, 17.55s/it][A

Reinforce Loss : 0.0009765625 Reinforce Max Reward : -1.5912892818450928


Batch:  60%|██████    | 6/10 [01:41<01:08, 17.01s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.3473987579345703


Batch:  70%|███████   | 7/10 [01:57<00:49, 16.55s/it][A

Reinforce Loss : 0.0107421875 Reinforce Max Reward : -2.4256656169891357


Batch:  80%|████████  | 8/10 [02:13<00:32, 16.33s/it][A

Reinforce Loss : 0.005859375 Reinforce Max Reward : -2.430696487426758


Batch:  90%|█████████ | 9/10 [02:28<00:16, 16.15s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.23854923248291


Batch: 100%|██████████| 10/10 [02:44<00:00, 16.01s/it][A
Epoch:   2%|▏         | 30/1785 [1:28:57<86:43:46, 177.91s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.2512078285217285


Batch:  10%|█         | 1/10 [00:22<03:19, 22.21s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.396315813064575


Batch:  20%|██        | 2/10 [00:37<02:41, 20.22s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.3114635944366455


Batch:  30%|███       | 3/10 [00:53<02:12, 18.90s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.4105286598205566


Batch:  40%|████      | 4/10 [01:09<01:47, 17.98s/it][A

Reinforce Loss : 0.0078125 Reinforce Max Reward : -2.4157583713531494


Batch:  50%|█████     | 5/10 [01:25<01:26, 17.32s/it][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -2.343968391418457


Batch:  60%|██████    | 6/10 [01:40<01:07, 16.77s/it][A

Reinforce Loss : 0.005859375 Reinforce Max Reward : -2.079936981201172


Batch:  70%|███████   | 7/10 [01:56<00:49, 16.45s/it][A

Reinforce Loss : -0.0009765625 Reinforce Max Reward : -2.432197332382202


Batch:  80%|████████  | 8/10 [02:12<00:32, 16.26s/it][A

Reinforce Loss : 0.009765625 Reinforce Max Reward : -2.4347381591796875


Batch:  90%|█████████ | 9/10 [02:28<00:16, 16.22s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.3793234825134277


Batch: 100%|██████████| 10/10 [02:44<00:00, 16.09s/it][A
Epoch:   2%|▏         | 31/1785 [1:31:55<86:45:10, 178.06s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : 0.009765625 Reinforce Max Reward : -2.202437400817871


Batch:  10%|█         | 1/10 [00:22<03:18, 22.03s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.398516893386841


Batch:  20%|██        | 2/10 [00:37<02:40, 20.08s/it][A

Reinforce Loss : 0.0029296875 Reinforce Max Reward : -2.184218645095825


Batch:  30%|███       | 3/10 [00:53<02:11, 18.78s/it][A

Reinforce Loss : 0.021484375 Reinforce Max Reward : -2.441462755203247


Batch:  40%|████      | 4/10 [01:08<01:46, 17.83s/it][A

Reinforce Loss : -0.009765625 Reinforce Max Reward : -1.5567702054977417


Batch:  50%|█████     | 5/10 [01:24<01:25, 17.20s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.443286657333374


Batch:  60%|██████    | 6/10 [01:40<01:06, 16.72s/it][A

Reinforce Loss : 0.025390625 Reinforce Max Reward : -2.4357786178588867


Batch:  70%|███████   | 7/10 [01:55<00:49, 16.42s/it][A

Reinforce Loss : -0.009765625 Reinforce Max Reward : -1.5830695629119873


Batch:  80%|████████  | 8/10 [02:11<00:32, 16.26s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.2363805770874023


Batch:  90%|█████████ | 9/10 [02:27<00:16, 16.16s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.048304796218872


Batch: 100%|██████████| 10/10 [02:43<00:00, 15.99s/it][A
Epoch:   2%|▏         | 32/1785 [1:36:02<96:41:37, 198.57s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : 0.005859375 Reinforce Max Reward : -2.4012882709503174


Batch:  10%|█         | 1/10 [00:19<02:52, 19.21s/it][A

Reinforce Loss : -0.0009765625 Reinforce Max Reward : -2.41839599609375


Batch:  20%|██        | 2/10 [00:35<02:25, 18.20s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.206540584564209


Batch:  30%|███       | 3/10 [00:50<02:02, 17.46s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.277005910873413


Batch:  40%|████      | 4/10 [01:06<01:41, 16.90s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.4302291870117188


Batch:  50%|█████     | 5/10 [01:22<01:22, 16.58s/it][A

Reinforce Loss : -0.01171875 Reinforce Max Reward : -1.9633291959762573


Batch:  60%|██████    | 6/10 [01:37<01:05, 16.27s/it][A

Reinforce Loss : 0.013671875 Reinforce Max Reward : -2.4126930236816406


Batch:  70%|███████   | 7/10 [01:53<00:48, 16.09s/it][A

Reinforce Loss : 0.0068359375 Reinforce Max Reward : -2.4459176063537598


Batch:  80%|████████  | 8/10 [02:09<00:31, 16.00s/it][A

Reinforce Loss : -0.01171875 Reinforce Max Reward : -2.42751407623291


Batch:  90%|█████████ | 9/10 [02:25<00:15, 15.97s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.406397581100464


Batch: 100%|██████████| 10/10 [02:40<00:00, 15.90s/it][A
Epoch:   2%|▏         | 33/1785 [1:38:58<93:27:49, 192.05s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.01171875 Reinforce Max Reward : -2.427820920944214


Batch:  10%|█         | 1/10 [00:19<02:59, 19.96s/it][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -2.4056472778320312


Batch:  20%|██        | 2/10 [00:35<02:29, 18.72s/it][A

Reinforce Loss : 0.005859375 Reinforce Max Reward : -2.422118902206421


Batch:  30%|███       | 3/10 [00:51<02:04, 17.83s/it][A

Reinforce Loss : 0.0087890625 Reinforce Max Reward : -2.4075589179992676


Batch:  40%|████      | 4/10 [01:07<01:43, 17.18s/it][A

Reinforce Loss : -0.0048828125 Reinforce Max Reward : -2.420107364654541


Batch:  50%|█████     | 5/10 [01:23<01:23, 16.77s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.1878814697265625


Batch:  60%|██████    | 6/10 [01:38<01:05, 16.40s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.3945159912109375


Batch:  70%|███████   | 7/10 [01:54<00:48, 16.23s/it][A

Reinforce Loss : -0.0009765625 Reinforce Max Reward : -2.344029426574707


Batch:  80%|████████  | 8/10 [02:09<00:32, 16.04s/it][A

Reinforce Loss : 0.01171875 Reinforce Max Reward : -2.4237172603607178


Batch:  90%|█████████ | 9/10 [02:25<00:15, 15.98s/it][A

Reinforce Loss : -0.0029296875 Reinforce Max Reward : -2.3737831115722656


Batch: 100%|██████████| 10/10 [02:41<00:00, 15.95s/it][A
Epoch:   2%|▏         | 34/1785 [1:41:55<91:10:43, 187.46s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : 0.01953125 Reinforce Max Reward : -2.3485238552093506


Batch:  10%|█         | 1/10 [00:21<03:09, 21.03s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.338062286376953


Batch:  20%|██        | 2/10 [00:36<02:35, 19.43s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.442925453186035


Batch:  30%|███       | 3/10 [00:52<02:08, 18.29s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.426243782043457


Batch:  40%|████      | 4/10 [01:07<01:44, 17.49s/it][A

Reinforce Loss : 0.017578125 Reinforce Max Reward : -2.4318952560424805


Batch:  50%|█████     | 5/10 [01:23<01:24, 16.97s/it][A

Reinforce Loss : 0.0009765625 Reinforce Max Reward : -2.0830955505371094


Batch:  60%|██████    | 6/10 [01:39<01:06, 16.55s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.194013833999634


Batch:  70%|███████   | 7/10 [01:55<00:48, 16.31s/it][A

Reinforce Loss : 0.0068359375 Reinforce Max Reward : -2.4225516319274902


Batch:  80%|████████  | 8/10 [02:10<00:32, 16.12s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.4122507572174072


Batch:  90%|█████████ | 9/10 [02:26<00:16, 16.05s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.200794219970703


Batch: 100%|██████████| 10/10 [02:42<00:00, 15.88s/it][A
Epoch:   2%|▏         | 35/1785 [1:44:53<89:40:04, 184.46s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : 0.013671875 Reinforce Max Reward : -2.4316229820251465


Batch:  10%|█         | 1/10 [00:20<03:08, 20.97s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.355431318283081


Batch:  20%|██        | 2/10 [00:36<02:34, 19.37s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.407207489013672


Batch:  30%|███       | 3/10 [00:52<02:08, 18.29s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.396725654602051


Batch:  40%|████      | 4/10 [01:08<01:45, 17.53s/it][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -2.423917055130005


Batch:  50%|█████     | 5/10 [01:23<01:25, 17.01s/it][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -1.9662597179412842


Batch:  60%|██████    | 6/10 [01:39<01:06, 16.58s/it][A

Reinforce Loss : 0.0087890625 Reinforce Max Reward : -2.430138111114502


Batch:  70%|███████   | 7/10 [01:55<00:49, 16.38s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.3989148139953613


Batch:  80%|████████  | 8/10 [02:11<00:32, 16.17s/it][A

Reinforce Loss : -0.0146484375 Reinforce Max Reward : -2.4080357551574707


Batch:  90%|█████████ | 9/10 [02:26<00:16, 16.04s/it][A

Reinforce Loss : -0.0009765625 Reinforce Max Reward : -2.4237639904022217


Batch: 100%|██████████| 10/10 [02:42<00:00, 15.91s/it][A
Epoch:   2%|▏         | 36/1785 [1:47:49<88:29:56, 182.16s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.01171875 Reinforce Max Reward : -2.251211643218994


Batch:  10%|█         | 1/10 [00:22<03:19, 22.15s/it][A

Reinforce Loss : 0.0087890625 Reinforce Max Reward : -2.409156560897827


Batch:  20%|██        | 2/10 [00:38<02:42, 20.27s/it][A

Reinforce Loss : -0.013671875 Reinforce Max Reward : -2.193246364593506


Batch:  30%|███       | 3/10 [00:53<02:12, 18.91s/it][A

Reinforce Loss : 0.0048828125 Reinforce Max Reward : -2.3183789253234863


Batch:  40%|████      | 4/10 [01:09<01:47, 17.94s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.4279205799102783


Batch:  50%|█████     | 5/10 [01:25<01:26, 17.32s/it][A

Reinforce Loss : -0.009765625 Reinforce Max Reward : -2.3711211681365967


Batch:  60%|██████    | 6/10 [01:40<01:07, 16.80s/it][A

Reinforce Loss : -0.0009765625 Reinforce Max Reward : -2.4367849826812744


Batch:  70%|███████   | 7/10 [01:56<00:49, 16.49s/it][A

Reinforce Loss : -0.015625 Reinforce Max Reward : -2.423895835876465


Batch:  80%|████████  | 8/10 [02:12<00:32, 16.31s/it][A

Reinforce Loss : 0.0009765625 Reinforce Max Reward : -2.4192299842834473


Batch:  90%|█████████ | 9/10 [02:28<00:16, 16.18s/it][A

Reinforce Loss : 0.015625 Reinforce Max Reward : -2.1104812622070312


Batch: 100%|██████████| 10/10 [02:44<00:00, 16.00s/it][A
Epoch:   2%|▏         | 37/1785 [1:50:50<88:14:29, 181.73s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : 0.0078125 Reinforce Max Reward : -2.436558723449707


Batch:  10%|█         | 1/10 [00:20<03:01, 20.11s/it][A

Reinforce Loss : 0.0087890625 Reinforce Max Reward : -2.4219512939453125


Batch:  20%|██        | 2/10 [00:36<02:31, 18.90s/it][A

Reinforce Loss : -0.01953125 Reinforce Max Reward : -2.4300894737243652


Batch:  30%|███       | 3/10 [00:52<02:06, 18.01s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.4277749061584473


Batch:  40%|████      | 4/10 [01:08<01:44, 17.41s/it][A

Reinforce Loss : 0.0087890625 Reinforce Max Reward : -2.4298033714294434


Batch:  50%|█████     | 5/10 [01:23<01:24, 16.94s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.3426854610443115


Batch:  60%|██████    | 6/10 [01:39<01:06, 16.60s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.43434476852417


Batch:  70%|███████   | 7/10 [01:55<00:49, 16.40s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.3841099739074707


Batch:  80%|████████  | 8/10 [02:11<00:32, 16.21s/it][A

Reinforce Loss : 0.0087890625 Reinforce Max Reward : -2.4342427253723145


Batch:  90%|█████████ | 9/10 [02:27<00:16, 16.12s/it][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -2.439854621887207


Batch: 100%|██████████| 10/10 [02:43<00:00, 15.98s/it][A
Epoch:   2%|▏         | 38/1785 [1:53:46<87:24:07, 180.11s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.4363887310028076


Batch:  10%|█         | 1/10 [00:23<03:29, 23.33s/it][A

Reinforce Loss : -0.0087890625 Reinforce Max Reward : -2.1754379272460938


Batch:  20%|██        | 2/10 [00:39<02:48, 21.09s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.196106433868408


Batch:  30%|███       | 3/10 [00:55<02:16, 19.53s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.309014320373535


Batch:  40%|████      | 4/10 [01:10<01:50, 18.37s/it][A

Reinforce Loss : 0.0029296875 Reinforce Max Reward : -2.387683391571045


Batch:  50%|█████     | 5/10 [01:26<01:27, 17.57s/it][A

Reinforce Loss : -0.01171875 Reinforce Max Reward : -2.3070228099823


Batch:  60%|██████    | 6/10 [01:42<01:08, 17.04s/it][A

Reinforce Loss : -0.0107421875 Reinforce Max Reward : -2.434027910232544


Batch:  70%|███████   | 7/10 [01:58<00:49, 16.65s/it][A

Reinforce Loss : 0.0078125 Reinforce Max Reward : -2.431124210357666


Batch:  80%|████████  | 8/10 [02:13<00:32, 16.38s/it][A

Reinforce Loss : 0.015625 Reinforce Max Reward : -2.4062654972076416


Batch:  90%|█████████ | 9/10 [02:29<00:16, 16.22s/it][A

Reinforce Loss : -0.0029296875 Reinforce Max Reward : -2.4437971115112305


Batch: 100%|██████████| 10/10 [02:45<00:00, 16.05s/it][A
Epoch:   2%|▏         | 39/1785 [1:56:46<87:15:09, 179.90s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.2276899814605713


Batch:  10%|█         | 1/10 [00:22<03:22, 22.52s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.371746063232422


Batch:  20%|██        | 2/10 [00:38<02:43, 20.49s/it][A

Reinforce Loss : 0.009765625 Reinforce Max Reward : -2.4270057678222656


Batch:  30%|███       | 3/10 [00:54<02:13, 19.11s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.2965869903564453


Batch:  40%|████      | 4/10 [01:09<01:48, 18.06s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.361577272415161


Batch:  50%|█████     | 5/10 [01:25<01:27, 17.43s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.37221097946167


Batch:  60%|██████    | 6/10 [01:41<01:07, 16.97s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.3183319568634033


Batch:  70%|███████   | 7/10 [01:57<00:49, 16.64s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.410287618637085


Batch:  80%|████████  | 8/10 [02:13<00:32, 16.34s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.321382761001587


Batch:  90%|█████████ | 9/10 [02:28<00:16, 16.17s/it][A

Reinforce Loss : 0.0078125 Reinforce Max Reward : -2.291450262069702


Batch: 100%|██████████| 10/10 [02:44<00:00, 16.06s/it][A
Epoch:   2%|▏         | 40/1785 [1:59:46<87:12:12, 179.90s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.01171875 Reinforce Max Reward : -2.4407849311828613


Batch:  10%|█         | 1/10 [00:21<03:17, 21.97s/it][A

Reinforce Loss : 0.005859375 Reinforce Max Reward : -2.4313697814941406


Batch:  20%|██        | 2/10 [00:37<02:41, 20.14s/it][A

Reinforce Loss : -0.0029296875 Reinforce Max Reward : -2.1638388633728027


Batch:  30%|███       | 3/10 [00:53<02:11, 18.84s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.034780979156494


Batch:  40%|████      | 4/10 [01:09<01:47, 17.96s/it][A

Reinforce Loss : 0.0029296875 Reinforce Max Reward : -2.1369383335113525


Batch:  50%|█████     | 5/10 [01:25<01:26, 17.40s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -1.9352035522460938


Batch:  60%|██████    | 6/10 [01:41<01:07, 16.93s/it][A

Reinforce Loss : 0.0048828125 Reinforce Max Reward : -2.4093499183654785


Batch:  70%|███████   | 7/10 [01:57<00:49, 16.64s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.38531231880188


Batch:  80%|████████  | 8/10 [02:13<00:32, 16.36s/it][A

Reinforce Loss : -0.0009765625 Reinforce Max Reward : -2.4060263633728027


Batch:  90%|█████████ | 9/10 [02:29<00:16, 16.30s/it][A

Reinforce Loss : -0.0009765625 Reinforce Max Reward : -2.427626609802246


Batch: 100%|██████████| 10/10 [02:45<00:00, 16.14s/it][A
Epoch:   2%|▏         | 41/1785 [2:02:50<87:45:37, 181.16s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : 0.02734375 Reinforce Max Reward : -2.4105706214904785


Batch:  10%|█         | 1/10 [00:19<02:52, 19.21s/it][A

Reinforce Loss : 0.0029296875 Reinforce Max Reward : -2.4355666637420654


Batch:  20%|██        | 2/10 [00:35<02:25, 18.23s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.427964687347412


Batch:  30%|███       | 3/10 [00:51<02:03, 17.61s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.355384588241577


Batch:  40%|████      | 4/10 [01:07<01:42, 17.05s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.105229377746582


Batch:  50%|█████     | 5/10 [01:22<01:23, 16.65s/it][A

Reinforce Loss : 0.0009765625 Reinforce Max Reward : -2.407893180847168


Batch:  60%|██████    | 6/10 [01:38<01:05, 16.35s/it][A

Reinforce Loss : 0.015625 Reinforce Max Reward : -2.434140682220459


Batch:  70%|███████   | 7/10 [01:54<00:48, 16.25s/it][A

Reinforce Loss : 0.01953125 Reinforce Max Reward : -2.4313673973083496


Batch:  80%|████████  | 8/10 [02:10<00:32, 16.08s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.425717353820801


Batch:  90%|█████████ | 9/10 [02:25<00:15, 15.96s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.411025047302246


Batch: 100%|██████████| 10/10 [02:41<00:00, 15.86s/it][A
Epoch:   2%|▏         | 42/1785 [2:05:45<86:51:19, 179.39s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -2.177365303039551


Batch:  10%|█         | 1/10 [00:20<03:04, 20.55s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.3058981895446777


Batch:  20%|██        | 2/10 [00:36<02:32, 19.10s/it][A

Reinforce Loss : -0.0009765625 Reinforce Max Reward : -2.4225873947143555


Batch:  30%|███       | 3/10 [00:51<02:06, 18.01s/it][A

Reinforce Loss : -0.0048828125 Reinforce Max Reward : -2.432313919067383


Batch:  40%|████      | 4/10 [01:07<01:44, 17.37s/it][A

Reinforce Loss : 0.005859375 Reinforce Max Reward : -2.2926902770996094


Batch:  50%|█████     | 5/10 [01:23<01:24, 16.87s/it][A

Reinforce Loss : -0.0048828125 Reinforce Max Reward : -2.3453164100646973


Batch:  60%|██████    | 6/10 [01:39<01:06, 16.54s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.192718029022217


Batch:  70%|███████   | 7/10 [01:55<00:49, 16.40s/it][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -2.394136667251587


Batch:  80%|████████  | 8/10 [02:10<00:32, 16.15s/it][A

Reinforce Loss : 0.005859375 Reinforce Max Reward : -2.4236977100372314


Batch:  90%|█████████ | 9/10 [02:26<00:16, 16.04s/it][A

Reinforce Loss : 0.015625 Reinforce Max Reward : -2.431894302368164


Batch: 100%|██████████| 10/10 [02:42<00:00, 15.92s/it][A
Epoch:   2%|▏         | 43/1785 [2:08:45<86:55:36, 179.64s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.4083781242370605


Batch:  10%|█         | 1/10 [00:18<02:43, 18.19s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.431985378265381


Batch:  20%|██        | 2/10 [00:33<02:19, 17.42s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.3454275131225586


Batch:  30%|███       | 3/10 [00:49<01:57, 16.85s/it][A

Reinforce Loss : -0.0029296875 Reinforce Max Reward : -2.351595163345337


Batch:  40%|████      | 4/10 [01:05<01:39, 16.57s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.4489529132843018


Batch:  50%|█████     | 5/10 [01:20<01:21, 16.28s/it][A

Reinforce Loss : 0.0009765625 Reinforce Max Reward : -2.0806376934051514


Batch:  60%|██████    | 6/10 [01:36<01:04, 16.12s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.344609260559082


Batch:  70%|███████   | 7/10 [01:51<00:47, 15.90s/it][A

Reinforce Loss : 0.0048828125 Reinforce Max Reward : -2.427938938140869


Batch:  80%|████████  | 8/10 [02:07<00:31, 15.88s/it][A

Reinforce Loss : -0.01171875 Reinforce Max Reward : -2.339771270751953


Batch:  90%|█████████ | 9/10 [02:23<00:15, 15.86s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.200223922729492


Batch: 100%|██████████| 10/10 [02:39<00:00, 15.99s/it][A
Epoch:   2%|▏         | 44/1785 [2:11:43<86:38:10, 179.14s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : 0.0068359375 Reinforce Max Reward : -2.4336414337158203


Batch:  10%|█         | 1/10 [00:18<02:43, 18.15s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.2271952629089355


Batch:  20%|██        | 2/10 [00:34<02:19, 17.49s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.4271721839904785


Batch:  30%|███       | 3/10 [00:49<01:59, 17.00s/it][A

Reinforce Loss : -0.0048828125 Reinforce Max Reward : -2.3427228927612305


Batch:  40%|████      | 4/10 [01:06<01:40, 16.76s/it][A

Reinforce Loss : -0.009765625 Reinforce Max Reward : -2.403477668762207


Batch:  50%|█████     | 5/10 [01:21<01:22, 16.47s/it][A

Reinforce Loss : 0.0078125 Reinforce Max Reward : -1.9930565357208252


Batch:  60%|██████    | 6/10 [01:37<01:05, 16.28s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.232964038848877


Batch:  70%|███████   | 7/10 [01:53<00:48, 16.10s/it][A

Reinforce Loss : 0.01953125 Reinforce Max Reward : -2.2108376026153564


Batch:  80%|████████  | 8/10 [02:09<00:32, 16.05s/it][A

Reinforce Loss : -0.0009765625 Reinforce Max Reward : -2.1120548248291016


Batch:  90%|█████████ | 9/10 [02:25<00:15, 15.97s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.4357125759124756


Batch: 100%|██████████| 10/10 [02:41<00:00, 15.94s/it][A
Epoch:   3%|▎         | 45/1785 [2:14:41<86:26:07, 178.83s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.3931264877319336


Batch:  10%|█         | 1/10 [00:18<02:48, 18.69s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.190059185028076


Batch:  20%|██        | 2/10 [00:34<02:22, 17.85s/it][A

Reinforce Loss : 0.01171875 Reinforce Max Reward : -2.420348644256592


Batch:  30%|███       | 3/10 [00:50<02:00, 17.21s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.1648802757263184


Batch:  40%|████      | 4/10 [01:06<01:41, 16.90s/it][A

Reinforce Loss : 0.005859375 Reinforce Max Reward : -2.402900457382202


Batch:  50%|█████     | 5/10 [01:22<01:22, 16.53s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.420545816421509


Batch:  60%|██████    | 6/10 [01:37<01:05, 16.31s/it][A

Reinforce Loss : 0.0107421875 Reinforce Max Reward : -2.4432497024536133


Batch:  70%|███████   | 7/10 [01:53<00:48, 16.12s/it][A

Reinforce Loss : 0.0078125 Reinforce Max Reward : -2.4263923168182373


Batch:  80%|████████  | 8/10 [02:09<00:32, 16.07s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.404827356338501


Batch:  90%|█████████ | 9/10 [02:25<00:15, 15.92s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.4420976638793945


Batch: 100%|██████████| 10/10 [02:40<00:00, 15.87s/it][A
Epoch:   3%|▎         | 46/1785 [2:17:38<86:02:17, 178.11s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.434061050415039


Batch:  10%|█         | 1/10 [00:19<02:57, 19.69s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.2146124839782715


Batch:  20%|██        | 2/10 [00:35<02:27, 18.49s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.4186105728149414


Batch:  30%|███       | 3/10 [00:50<02:03, 17.58s/it][A

Reinforce Loss : -0.0029296875 Reinforce Max Reward : -2.4344000816345215


Batch:  40%|████      | 4/10 [01:06<01:42, 17.03s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.4217443466186523


Batch:  50%|█████     | 5/10 [01:22<01:23, 16.61s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.4235262870788574


Batch:  60%|██████    | 6/10 [01:37<01:05, 16.32s/it][A

Reinforce Loss : 0.0068359375 Reinforce Max Reward : -2.42311429977417


Batch:  70%|███████   | 7/10 [01:53<00:48, 16.09s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.4433979988098145


Batch:  80%|████████  | 8/10 [02:09<00:32, 16.08s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.4370481967926025


Batch:  90%|█████████ | 9/10 [02:25<00:15, 15.92s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.343918800354004


Batch: 100%|██████████| 10/10 [02:40<00:00, 15.81s/it][A
Epoch:   3%|▎         | 47/1785 [2:20:34<85:40:28, 177.46s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -1.9464795589447021


Batch:  10%|█         | 1/10 [00:19<02:53, 19.33s/it][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -2.004286289215088


Batch:  20%|██        | 2/10 [00:34<02:25, 18.21s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.18027663230896


Batch:  30%|███       | 3/10 [00:50<02:01, 17.42s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.353332042694092


Batch:  40%|████      | 4/10 [01:06<01:41, 16.89s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.4345598220825195


Batch:  50%|█████     | 5/10 [01:21<01:22, 16.51s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.420656681060791


Batch:  60%|██████    | 6/10 [01:37<01:04, 16.23s/it][A

Reinforce Loss : 0.005859375 Reinforce Max Reward : -2.4329872131347656


Batch:  70%|███████   | 7/10 [01:53<00:48, 16.09s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.428025245666504


Batch:  80%|████████  | 8/10 [02:08<00:31, 15.93s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.1855204105377197


Batch:  90%|█████████ | 9/10 [02:24<00:15, 15.95s/it][A

Reinforce Loss : 0.013671875 Reinforce Max Reward : -2.4350457191467285


Batch: 100%|██████████| 10/10 [02:40<00:00, 15.86s/it][A
Epoch:   3%|▎         | 48/1785 [2:23:30<85:29:39, 177.19s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.01171875 Reinforce Max Reward : -2.4218244552612305


Batch:  10%|█         | 1/10 [00:19<02:52, 19.20s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.4202988147735596


Batch:  20%|██        | 2/10 [00:34<02:24, 18.11s/it][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -2.4329991340637207


Batch:  30%|███       | 3/10 [00:50<02:01, 17.33s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -1.7249903678894043


Batch:  40%|████      | 4/10 [01:05<01:40, 16.77s/it][A

Reinforce Loss : 0.009765625 Reinforce Max Reward : -2.4142112731933594


Batch:  50%|█████     | 5/10 [01:21<01:22, 16.52s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.2633793354034424


Batch:  60%|██████    | 6/10 [01:37<01:04, 16.23s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.1836280822753906


Batch:  70%|███████   | 7/10 [01:52<00:48, 16.04s/it][A

Reinforce Loss : 0.005859375 Reinforce Max Reward : -2.433797836303711


Batch:  80%|████████  | 8/10 [02:08<00:31, 15.85s/it][A

Reinforce Loss : 0.0068359375 Reinforce Max Reward : -2.170653820037842


Batch:  90%|█████████ | 9/10 [02:24<00:15, 15.85s/it][A

Reinforce Loss : 0.009765625 Reinforce Max Reward : -2.2961668968200684


Batch: 100%|██████████| 10/10 [02:39<00:00, 15.77s/it][A
Epoch:   3%|▎         | 49/1785 [2:26:28<85:28:29, 177.25s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.218735456466675


Batch:  10%|█         | 1/10 [00:18<02:45, 18.36s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.421006441116333


Batch:  20%|██        | 2/10 [00:34<02:20, 17.62s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.373602867126465


Batch:  30%|███       | 3/10 [00:50<01:59, 17.06s/it][A

Reinforce Loss : -0.0009765625 Reinforce Max Reward : -2.311854362487793


Batch:  40%|████      | 4/10 [01:05<01:39, 16.58s/it][A

Reinforce Loss : -0.0029296875 Reinforce Max Reward : -2.311081886291504


Batch:  50%|█████     | 5/10 [01:21<01:21, 16.33s/it][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -2.416252613067627


Batch:  60%|██████    | 6/10 [01:36<01:04, 16.14s/it][A

Reinforce Loss : 0.0068359375 Reinforce Max Reward : -2.304058074951172


Batch:  70%|███████   | 7/10 [01:52<00:48, 16.04s/it][A

Reinforce Loss : -0.009765625 Reinforce Max Reward : -2.4332690238952637


Batch:  80%|████████  | 8/10 [02:08<00:31, 15.86s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.428529977798462


Batch:  90%|█████████ | 9/10 [02:24<00:15, 15.91s/it][A

Reinforce Loss : -0.0048828125 Reinforce Max Reward : -2.430049180984497


Batch: 100%|██████████| 10/10 [02:39<00:00, 15.82s/it][A
Epoch:   3%|▎         | 50/1785 [2:29:26<85:30:15, 177.42s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : 0.0009765625 Reinforce Max Reward : -2.4321999549865723


Batch:  10%|█         | 1/10 [00:18<02:42, 18.04s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.255570888519287


Batch:  20%|██        | 2/10 [00:34<02:19, 17.43s/it][A

Reinforce Loss : -0.0029296875 Reinforce Max Reward : -2.4216954708099365


Batch:  30%|███       | 3/10 [00:49<01:58, 16.94s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.4294276237487793


Batch:  40%|████      | 4/10 [01:05<01:39, 16.56s/it][A

Reinforce Loss : -0.0029296875 Reinforce Max Reward : -2.357423782348633


Batch:  50%|█████     | 5/10 [01:21<01:21, 16.28s/it][A

Reinforce Loss : -0.01171875 Reinforce Max Reward : -2.4431519508361816


Batch:  60%|██████    | 6/10 [01:37<01:04, 16.19s/it][A

Reinforce Loss : -0.0009765625 Reinforce Max Reward : -2.41027569770813


Batch:  70%|███████   | 7/10 [01:52<00:48, 16.08s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.3058857917785645


Batch:  80%|████████  | 8/10 [02:08<00:31, 15.95s/it][A

Reinforce Loss : 0.02734375 Reinforce Max Reward : -2.431680202484131


Batch:  90%|█████████ | 9/10 [02:24<00:15, 15.98s/it][A

Reinforce Loss : -0.0029296875 Reinforce Max Reward : -2.406342029571533


Batch: 100%|██████████| 10/10 [02:40<00:00, 16.05s/it][A
Epoch:   3%|▎         | 51/1785 [2:32:25<85:46:27, 178.08s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.015625 Reinforce Max Reward : -2.4034690856933594


Batch:  10%|█         | 1/10 [00:19<02:54, 19.37s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.233501434326172


Batch:  20%|██        | 2/10 [00:35<02:27, 18.40s/it][A

Reinforce Loss : 0.0048828125 Reinforce Max Reward : -2.4045815467834473


Batch:  30%|███       | 3/10 [00:51<02:04, 17.72s/it][A

Reinforce Loss : 0.005859375 Reinforce Max Reward : -2.438821315765381


Batch:  40%|████      | 4/10 [01:07<01:42, 17.10s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.4427859783172607


Batch:  50%|█████     | 5/10 [01:23<01:23, 16.70s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.3532862663269043


Batch:  60%|██████    | 6/10 [01:39<01:06, 16.56s/it][A

Reinforce Loss : -0.0048828125 Reinforce Max Reward : -2.436647891998291


Batch:  70%|███████   | 7/10 [01:55<00:49, 16.37s/it][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -2.3749184608459473


Batch:  80%|████████  | 8/10 [02:10<00:32, 16.17s/it][A

Reinforce Loss : -0.0009765625 Reinforce Max Reward : -2.375035285949707


Batch:  90%|█████████ | 9/10 [02:26<00:16, 16.07s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.42169189453125


Batch: 100%|██████████| 10/10 [02:42<00:00, 16.08s/it][A
Epoch:   3%|▎         | 52/1785 [2:35:26<86:05:08, 178.83s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.422163963317871


Batch:  10%|█         | 1/10 [00:19<02:51, 19.04s/it][A

Reinforce Loss : 0.0078125 Reinforce Max Reward : -2.4257168769836426


Batch:  20%|██        | 2/10 [00:35<02:25, 18.14s/it][A

Reinforce Loss : 0.009765625 Reinforce Max Reward : -2.396923065185547


Batch:  30%|███       | 3/10 [00:50<02:02, 17.44s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.3735244274139404


Batch:  40%|████      | 4/10 [01:06<01:41, 16.97s/it][A

Reinforce Loss : -0.03125 Reinforce Max Reward : -1.8284571170806885


Batch:  50%|█████     | 5/10 [01:22<01:23, 16.66s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.329960346221924


Batch:  60%|██████    | 6/10 [01:38<01:05, 16.47s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.369158983230591


Batch:  70%|███████   | 7/10 [01:54<00:48, 16.30s/it][A

Reinforce Loss : 0.01953125 Reinforce Max Reward : -2.3889577388763428


Batch:  80%|████████  | 8/10 [02:10<00:32, 16.11s/it][A

Reinforce Loss : -0.01171875 Reinforce Max Reward : -2.4373655319213867


Batch:  90%|█████████ | 9/10 [02:26<00:16, 16.05s/it][A

Reinforce Loss : 0.005859375 Reinforce Max Reward : -2.3716039657592773


Batch: 100%|██████████| 10/10 [02:42<00:00, 16.00s/it][A
Epoch:   3%|▎         | 53/1785 [2:38:23<85:49:30, 178.39s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.4443395137786865


Batch:  10%|█         | 1/10 [00:19<02:59, 19.93s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.4279823303222656


Batch:  20%|██        | 2/10 [00:35<02:29, 18.69s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -1.754830241203308


Batch:  30%|███       | 3/10 [00:51<02:04, 17.83s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.444150447845459


Batch:  40%|████      | 4/10 [01:07<01:43, 17.21s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.30808424949646


Batch:  50%|█████     | 5/10 [01:23<01:24, 16.82s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.00345516204834


Batch:  60%|██████    | 6/10 [01:38<01:06, 16.50s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.4404945373535156


Batch:  70%|███████   | 7/10 [01:54<00:48, 16.30s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.396739959716797


Batch:  80%|████████  | 8/10 [02:10<00:32, 16.13s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.18109393119812


Batch:  90%|█████████ | 9/10 [02:26<00:16, 16.03s/it][A

Reinforce Loss : 0.0009765625 Reinforce Max Reward : -2.374969244003296


Batch: 100%|██████████| 10/10 [02:42<00:00, 16.00s/it][A
Epoch:   3%|▎         | 54/1785 [2:41:23<86:03:16, 178.97s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.426471710205078


Batch:  10%|█         | 1/10 [00:18<02:45, 18.40s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.2822632789611816


Batch:  20%|██        | 2/10 [00:34<02:21, 17.63s/it][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -2.431154251098633


Batch:  30%|███       | 3/10 [00:50<01:59, 17.09s/it][A

Reinforce Loss : 0.0078125 Reinforce Max Reward : -2.4020566940307617


Batch:  40%|████      | 4/10 [01:05<01:39, 16.66s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.4242146015167236


Batch:  50%|█████     | 5/10 [01:21<01:22, 16.41s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.41601300239563


Batch:  60%|██████    | 6/10 [01:37<01:04, 16.15s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.413769245147705


Batch:  70%|███████   | 7/10 [01:52<00:47, 15.98s/it][A

Reinforce Loss : 0.02734375 Reinforce Max Reward : -2.4225730895996094


Batch:  80%|████████  | 8/10 [02:08<00:31, 15.86s/it][A

Reinforce Loss : 0.005859375 Reinforce Max Reward : -2.203042507171631


Batch:  90%|█████████ | 9/10 [02:24<00:15, 15.88s/it][A

Reinforce Loss : 0.005859375 Reinforce Max Reward : -2.3640637397766113


Batch: 100%|██████████| 10/10 [02:39<00:00, 15.75s/it][A
Epoch:   3%|▎         | 55/1785 [2:44:22<85:54:56, 178.78s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : 0.009765625 Reinforce Max Reward : -2.209937572479248


Batch:  10%|█         | 1/10 [00:17<02:37, 17.46s/it][A

Reinforce Loss : -0.0048828125 Reinforce Max Reward : -1.9565346240997314


Batch:  20%|██        | 2/10 [00:33<02:16, 17.11s/it][A

Reinforce Loss : -0.01171875 Reinforce Max Reward : -2.024925708770752


Batch:  30%|███       | 3/10 [00:49<01:57, 16.72s/it][A

Reinforce Loss : -0.009765625 Reinforce Max Reward : -2.4247143268585205


Batch:  40%|████      | 4/10 [01:05<01:38, 16.49s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.0448715686798096


Batch:  50%|█████     | 5/10 [01:21<01:21, 16.31s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.423093795776367


Batch:  60%|██████    | 6/10 [01:37<01:04, 16.13s/it][A

Reinforce Loss : -0.0048828125 Reinforce Max Reward : -1.9359267950057983


Batch:  70%|███████   | 7/10 [01:52<00:48, 16.01s/it][A

Reinforce Loss : 0.0048828125 Reinforce Max Reward : -2.403963327407837


Batch:  80%|████████  | 8/10 [02:08<00:31, 15.90s/it][A

Reinforce Loss : 0.0009765625 Reinforce Max Reward : -2.254390001296997


Batch:  90%|█████████ | 9/10 [02:24<00:15, 15.89s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -1.8165638446807861


Batch: 100%|██████████| 10/10 [02:40<00:00, 15.85s/it][A
Epoch:   3%|▎         | 56/1785 [2:47:19<85:38:26, 178.31s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : 0.0048828125 Reinforce Max Reward : -2.41123104095459


Batch:  10%|█         | 1/10 [00:18<02:46, 18.49s/it][A

Reinforce Loss : 0.0029296875 Reinforce Max Reward : -2.396749973297119


Batch:  20%|██        | 2/10 [00:34<02:21, 17.68s/it][A

Reinforce Loss : 0.0029296875 Reinforce Max Reward : -2.422006130218506


Batch:  30%|███       | 3/10 [00:50<01:59, 17.11s/it][A

Reinforce Loss : 0.02734375 Reinforce Max Reward : -2.430187463760376


Batch:  40%|████      | 4/10 [01:05<01:39, 16.66s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.448540687561035


Batch:  50%|█████     | 5/10 [01:21<01:21, 16.37s/it][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -2.307915210723877


Batch:  60%|██████    | 6/10 [01:37<01:04, 16.18s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.4120519161224365


Batch:  70%|███████   | 7/10 [01:52<00:48, 16.07s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.4186835289001465


Batch:  80%|████████  | 8/10 [02:08<00:31, 15.92s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.4150800704956055


Batch:  90%|█████████ | 9/10 [02:24<00:15, 15.84s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.423184633255005


Batch: 100%|██████████| 10/10 [02:39<00:00, 15.74s/it][A
Epoch:   3%|▎         | 57/1785 [2:50:17<85:33:37, 178.25s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : 0.005859375 Reinforce Max Reward : -2.398935079574585


Batch:  10%|█         | 1/10 [00:18<02:44, 18.31s/it][A

Reinforce Loss : -0.0029296875 Reinforce Max Reward : -2.4401068687438965


Batch:  20%|██        | 2/10 [00:33<02:19, 17.50s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.3409423828125


Batch:  30%|███       | 3/10 [00:49<01:58, 16.95s/it][A

Reinforce Loss : -0.017578125 Reinforce Max Reward : -2.423318862915039


Batch:  40%|████      | 4/10 [01:05<01:39, 16.56s/it][A

Reinforce Loss : -0.0009765625 Reinforce Max Reward : -2.1605262756347656


Batch:  50%|█████     | 5/10 [01:20<01:21, 16.22s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.3904929161071777


Batch:  60%|██████    | 6/10 [01:36<01:04, 16.07s/it][A

Reinforce Loss : -0.017578125 Reinforce Max Reward : -2.4221692085266113


Batch:  70%|███████   | 7/10 [01:51<00:47, 15.93s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -1.9786303043365479


Batch:  80%|████████  | 8/10 [02:07<00:31, 15.84s/it][A

Reinforce Loss : -0.017578125 Reinforce Max Reward : -2.272418260574341


Batch:  90%|█████████ | 9/10 [02:23<00:15, 15.80s/it][A

Reinforce Loss : 0.0029296875 Reinforce Max Reward : -1.8132457733154297


Batch: 100%|██████████| 10/10 [02:38<00:00, 15.69s/it][A
Epoch:   3%|▎         | 58/1785 [2:53:14<85:18:19, 177.82s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.0603456497192383


Batch:  10%|█         | 1/10 [00:18<02:44, 18.23s/it][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -2.3685789108276367


Batch:  20%|██        | 2/10 [00:33<02:19, 17.45s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.080378532409668


Batch:  30%|███       | 3/10 [00:49<01:59, 17.00s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.3920350074768066


Batch:  40%|████      | 4/10 [01:05<01:40, 16.69s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.411017894744873


Batch:  50%|█████     | 5/10 [01:21<01:22, 16.43s/it][A

Reinforce Loss : -0.013671875 Reinforce Max Reward : -2.4408822059631348


Batch:  60%|██████    | 6/10 [01:37<01:04, 16.25s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.4221396446228027


Batch:  70%|███████   | 7/10 [01:53<00:48, 16.14s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.4367446899414062


Batch:  80%|████████  | 8/10 [02:09<00:32, 16.10s/it][A

Reinforce Loss : 0.005859375 Reinforce Max Reward : -2.40114164352417


Batch:  90%|█████████ | 9/10 [02:24<00:15, 15.96s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.375460624694824


Batch: 100%|██████████| 10/10 [02:40<00:00, 15.96s/it][A
Epoch:   3%|▎         | 59/1785 [2:56:13<85:24:59, 178.16s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.009765625 Reinforce Max Reward : -2.4309258460998535


Batch:  10%|█         | 1/10 [00:18<02:45, 18.44s/it][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -2.422809600830078


Batch:  20%|██        | 2/10 [00:34<02:21, 17.69s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.4329895973205566


Batch:  30%|███       | 3/10 [00:50<01:59, 17.10s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.4229965209960938


Batch:  40%|████      | 4/10 [01:05<01:40, 16.74s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.42927885055542


Batch:  50%|█████     | 5/10 [01:21<01:21, 16.37s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -1.773529291152954


Batch:  60%|██████    | 6/10 [01:37<01:04, 16.18s/it][A

Reinforce Loss : 0.017578125 Reinforce Max Reward : -2.237109661102295


Batch:  70%|███████   | 7/10 [01:52<00:48, 16.01s/it][A

Reinforce Loss : 0.0048828125 Reinforce Max Reward : -1.9257858991622925


Batch:  80%|████████  | 8/10 [02:08<00:31, 15.99s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.293292999267578


Batch:  90%|█████████ | 9/10 [02:24<00:15, 15.87s/it][A

Reinforce Loss : -0.0009765625 Reinforce Max Reward : -1.7451331615447998


Batch: 100%|██████████| 10/10 [02:40<00:00, 15.85s/it][A
Epoch:   3%|▎         | 60/1785 [2:59:11<85:21:26, 178.14s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.4318346977233887


Batch:  10%|█         | 1/10 [00:19<02:51, 19.03s/it][A

Reinforce Loss : -0.01171875 Reinforce Max Reward : -2.3976640701293945


Batch:  20%|██        | 2/10 [00:34<02:24, 18.07s/it][A

Reinforce Loss : 0.0009765625 Reinforce Max Reward : -2.3659300804138184


Batch:  30%|███       | 3/10 [00:50<02:01, 17.33s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.239863634109497


Batch:  40%|████      | 4/10 [01:06<01:41, 16.84s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.3543295860290527


Batch:  50%|█████     | 5/10 [01:21<01:22, 16.48s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.123084306716919


Batch:  60%|██████    | 6/10 [01:37<01:04, 16.21s/it][A

Reinforce Loss : 0.01171875 Reinforce Max Reward : -1.7389165163040161


Batch:  70%|███████   | 7/10 [01:53<00:48, 16.06s/it][A

Reinforce Loss : 0.01171875 Reinforce Max Reward : -2.28053879737854


Batch:  80%|████████  | 8/10 [02:08<00:31, 15.96s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.3799827098846436


Batch:  90%|█████████ | 9/10 [02:24<00:15, 15.85s/it][A

Reinforce Loss : -0.0087890625 Reinforce Max Reward : -2.4457614421844482


Batch: 100%|██████████| 10/10 [02:40<00:00, 15.78s/it][A
Epoch:   3%|▎         | 61/1785 [3:02:08<85:10:40, 177.87s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : 0.0078125 Reinforce Max Reward : -2.4253649711608887


Batch:  10%|█         | 1/10 [00:19<02:54, 19.37s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.4293675422668457


Batch:  20%|██        | 2/10 [00:35<02:26, 18.32s/it][A

Reinforce Loss : 0.01953125 Reinforce Max Reward : -2.431669235229492


Batch:  30%|███       | 3/10 [00:50<02:02, 17.53s/it][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -2.447815418243408


Batch:  40%|████      | 4/10 [01:06<01:42, 17.02s/it][A

Reinforce Loss : -0.009765625 Reinforce Max Reward : -2.424685478210449


Batch:  50%|█████     | 5/10 [01:22<01:22, 16.59s/it][A

Reinforce Loss : 0.0048828125 Reinforce Max Reward : -2.392533302307129


Batch:  60%|██████    | 6/10 [01:38<01:05, 16.39s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.428737163543701


Batch:  70%|███████   | 7/10 [01:53<00:48, 16.19s/it][A

Reinforce Loss : 0.0068359375 Reinforce Max Reward : -2.4347879886627197


Batch:  80%|████████  | 8/10 [02:09<00:32, 16.11s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.3929009437561035


Batch:  90%|█████████ | 9/10 [02:25<00:16, 16.02s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.24861478805542


Batch: 100%|██████████| 10/10 [02:41<00:00, 15.90s/it][A
Epoch:   3%|▎         | 62/1785 [3:05:05<85:01:53, 177.66s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.429792881011963


Batch:  10%|█         | 1/10 [00:18<02:49, 18.87s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -1.7791962623596191


Batch:  20%|██        | 2/10 [00:34<02:23, 17.96s/it][A

Reinforce Loss : -0.0009765625 Reinforce Max Reward : -1.7392494678497314


Batch:  30%|███       | 3/10 [00:50<02:00, 17.27s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.1729681491851807


Batch:  40%|████      | 4/10 [01:06<01:40, 16.82s/it][A

Reinforce Loss : -0.021484375 Reinforce Max Reward : -2.373122215270996


Batch:  50%|█████     | 5/10 [01:21<01:22, 16.51s/it][A

Reinforce Loss : 0.0048828125 Reinforce Max Reward : -2.375248432159424


Batch:  60%|██████    | 6/10 [01:37<01:05, 16.28s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.37642502784729


Batch:  70%|███████   | 7/10 [01:53<00:48, 16.13s/it][A

Reinforce Loss : -0.0068359375 Reinforce Max Reward : -2.4116153717041016


Batch:  80%|████████  | 8/10 [02:09<00:32, 16.02s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.1839709281921387


Batch:  90%|█████████ | 9/10 [02:24<00:15, 15.90s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.410430669784546


Batch: 100%|██████████| 10/10 [02:40<00:00, 15.89s/it][A
Epoch:   4%|▎         | 63/1785 [3:08:01<84:44:58, 177.18s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.0029296875 Reinforce Max Reward : -2.2215542793273926


Batch:  10%|█         | 1/10 [00:19<02:56, 19.62s/it][A

Reinforce Loss : -0.013671875 Reinforce Max Reward : -2.435746431350708


Batch:  20%|██        | 2/10 [00:35<02:27, 18.42s/it][A

Reinforce Loss : -0.013671875 Reinforce Max Reward : -2.4333910942077637


Batch:  30%|███       | 3/10 [00:51<02:03, 17.65s/it][A

Reinforce Loss : 0.0078125 Reinforce Max Reward : -2.435809850692749


Batch:  40%|████      | 4/10 [01:07<01:43, 17.17s/it][A

Reinforce Loss : 0.009765625 Reinforce Max Reward : -1.895572304725647


Batch:  50%|█████     | 5/10 [01:22<01:23, 16.77s/it][A

Reinforce Loss : 0.0078125 Reinforce Max Reward : -2.4419381618499756


Batch:  60%|██████    | 6/10 [01:39<01:06, 16.55s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.4250993728637695


Batch:  70%|███████   | 7/10 [01:54<00:48, 16.33s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.3743133544921875


Batch:  80%|████████  | 8/10 [02:10<00:32, 16.25s/it][A

Reinforce Loss : 0.0078125 Reinforce Max Reward : -2.4196176528930664


Batch:  90%|█████████ | 9/10 [02:26<00:16, 16.14s/it][A

Reinforce Loss : 0.0107421875 Reinforce Max Reward : -2.427048921585083


Batch: 100%|██████████| 10/10 [02:42<00:00, 16.07s/it][A
Epoch:   4%|▎         | 64/1785 [3:11:03<85:16:34, 178.38s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : 0.005859375 Reinforce Max Reward : -2.3816022872924805


Batch:  10%|█         | 1/10 [00:18<02:50, 18.94s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.4315762519836426


Batch:  20%|██        | 2/10 [00:34<02:24, 18.00s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.4144904613494873


Batch:  30%|███       | 3/10 [00:50<02:02, 17.43s/it][A

Reinforce Loss : -0.0029296875 Reinforce Max Reward : -2.0543155670166016


Batch:  40%|████      | 4/10 [01:06<01:41, 16.92s/it][A

Reinforce Loss : 0.009765625 Reinforce Max Reward : -2.447087049484253


Batch:  50%|█████     | 5/10 [01:22<01:22, 16.57s/it][A

Reinforce Loss : 0.0068359375 Reinforce Max Reward : -2.163846492767334


Batch:  60%|██████    | 6/10 [01:38<01:05, 16.32s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.429508686065674


Batch:  70%|███████   | 7/10 [01:53<00:48, 16.05s/it][A

Reinforce Loss : -0.015625 Reinforce Max Reward : -2.3935818672180176


Batch:  80%|████████  | 8/10 [02:09<00:32, 16.04s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -1.6514389514923096


Batch:  90%|█████████ | 9/10 [02:25<00:15, 15.95s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.435483455657959


Batch: 100%|██████████| 10/10 [02:41<00:00, 15.91s/it][A
Epoch:   4%|▎         | 65/1785 [3:13:58<84:50:36, 177.58s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.0009765625 Reinforce Max Reward : -2.4225821495056152


Batch:  10%|█         | 1/10 [00:20<03:01, 20.19s/it][A

Reinforce Loss : 0.0048828125 Reinforce Max Reward : -2.4261765480041504


Batch:  20%|██        | 2/10 [00:36<02:31, 18.88s/it][A

Reinforce Loss : 0.005859375 Reinforce Max Reward : -2.426520347595215


Batch:  30%|███       | 3/10 [00:52<02:06, 18.07s/it][A

Reinforce Loss : -0.01171875 Reinforce Max Reward : -2.4168701171875


Batch:  40%|████      | 4/10 [01:08<01:44, 17.42s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.293694496154785


Batch:  50%|█████     | 5/10 [01:23<01:24, 16.96s/it][A

Reinforce Loss : 0.0078125 Reinforce Max Reward : -2.102236032485962


Batch:  60%|██████    | 6/10 [01:39<01:06, 16.58s/it][A

Reinforce Loss : -0.015625 Reinforce Max Reward : -1.950042486190796


Batch:  70%|███████   | 7/10 [01:55<00:49, 16.39s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.3790082931518555


Batch:  80%|████████  | 8/10 [02:11<00:32, 16.24s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.4275150299072266


Batch:  90%|█████████ | 9/10 [02:27<00:16, 16.16s/it][A

Reinforce Loss : -0.0068359375 Reinforce Max Reward : -2.3821067810058594


Batch: 100%|██████████| 10/10 [02:43<00:00, 16.05s/it][A
Epoch:   4%|▎         | 66/1785 [3:17:00<85:26:22, 178.93s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.4244141578674316


Batch:  10%|█         | 1/10 [00:18<02:43, 18.15s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.4074273109436035


Batch:  20%|██        | 2/10 [00:34<02:19, 17.46s/it][A

Reinforce Loss : 0.0078125 Reinforce Max Reward : -2.441112995147705


Batch:  30%|███       | 3/10 [00:50<01:59, 17.13s/it][A

Reinforce Loss : -0.013671875 Reinforce Max Reward : -2.3367090225219727


Batch:  40%|████      | 4/10 [01:06<01:40, 16.69s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.2710914611816406


Batch:  50%|█████     | 5/10 [01:21<01:22, 16.43s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.3740735054016113


Batch:  60%|██████    | 6/10 [01:37<01:04, 16.22s/it][A

Reinforce Loss : -0.0068359375 Reinforce Max Reward : -2.429434299468994


Batch:  70%|███████   | 7/10 [01:53<00:48, 16.05s/it][A

Reinforce Loss : -0.0234375 Reinforce Max Reward : -2.428116798400879


Batch:  80%|████████  | 8/10 [02:09<00:31, 15.96s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.0541114807128906


Batch:  90%|█████████ | 9/10 [02:24<00:15, 15.90s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.397230625152588


Batch: 100%|██████████| 10/10 [02:40<00:00, 15.84s/it][A
Epoch:   4%|▍         | 67/1785 [3:19:56<84:52:15, 177.84s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.394477367401123


Batch:  10%|█         | 1/10 [00:20<03:06, 20.77s/it][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -2.43884539604187


Batch:  20%|██        | 2/10 [00:36<02:33, 19.23s/it][A

Reinforce Loss : -0.009765625 Reinforce Max Reward : -2.4421677589416504


Batch:  30%|███       | 3/10 [00:52<02:07, 18.20s/it][A

Reinforce Loss : -0.0048828125 Reinforce Max Reward : -2.4256138801574707


Batch:  40%|████      | 4/10 [01:07<01:44, 17.45s/it][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -2.2799670696258545


Batch:  50%|█████     | 5/10 [01:23<01:24, 16.95s/it][A

Reinforce Loss : 0.005859375 Reinforce Max Reward : -2.4072372913360596


Batch:  60%|██████    | 6/10 [01:39<01:06, 16.55s/it][A

Reinforce Loss : -0.0068359375 Reinforce Max Reward : -2.4433774948120117


Batch:  70%|███████   | 7/10 [01:55<00:49, 16.33s/it][A

Reinforce Loss : 0.009765625 Reinforce Max Reward : -2.4282355308532715


Batch:  80%|████████  | 8/10 [02:10<00:32, 16.15s/it][A

Reinforce Loss : 0.005859375 Reinforce Max Reward : -2.4373536109924316


Batch:  90%|█████████ | 9/10 [02:26<00:16, 16.01s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.380204200744629


Batch: 100%|██████████| 10/10 [02:42<00:00, 15.91s/it][A
Epoch:   4%|▍         | 68/1785 [3:22:54<84:53:27, 177.99s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -2.231701135635376


Batch:  10%|█         | 1/10 [00:19<02:52, 19.14s/it][A

Reinforce Loss : -0.0234375 Reinforce Max Reward : -2.430738925933838


Batch:  20%|██        | 2/10 [00:35<02:26, 18.28s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.422469139099121


Batch:  30%|███       | 3/10 [00:51<02:02, 17.53s/it][A

Reinforce Loss : 0.0107421875 Reinforce Max Reward : -2.3722050189971924


Batch:  40%|████      | 4/10 [01:07<01:42, 17.06s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.247753620147705


Batch:  50%|█████     | 5/10 [01:23<01:23, 16.71s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.4277420043945312


Batch:  60%|██████    | 6/10 [01:38<01:05, 16.43s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.402078151702881


Batch:  70%|███████   | 7/10 [01:54<00:48, 16.21s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.307788848876953


Batch:  80%|████████  | 8/10 [02:10<00:32, 16.14s/it][A

Reinforce Loss : -0.015625 Reinforce Max Reward : -2.4223861694335938


Batch:  90%|█████████ | 9/10 [02:26<00:16, 16.03s/it][A

Reinforce Loss : 0.0029296875 Reinforce Max Reward : -2.4102208614349365


Batch: 100%|██████████| 10/10 [02:41<00:00, 15.89s/it][A
Epoch:   4%|▍         | 69/1785 [3:25:49<84:24:40, 177.09s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.3812170028686523


Batch:  10%|█         | 1/10 [00:22<03:19, 22.17s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.2877860069274902


Batch:  20%|██        | 2/10 [00:37<02:42, 20.26s/it][A

Reinforce Loss : -0.0029296875 Reinforce Max Reward : -1.703292727470398


Batch:  30%|███       | 3/10 [00:53<02:12, 18.92s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.434767484664917


Batch:  40%|████      | 4/10 [01:09<01:47, 17.95s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.1814322471618652


Batch:  50%|█████     | 5/10 [01:25<01:26, 17.31s/it][A

Reinforce Loss : 0.005859375 Reinforce Max Reward : -2.402060031890869


Batch:  60%|██████    | 6/10 [01:40<01:07, 16.77s/it][A

Reinforce Loss : -0.0009765625 Reinforce Max Reward : -2.3946990966796875


Batch:  70%|███████   | 7/10 [01:56<00:49, 16.46s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.396162509918213


Batch:  80%|████████  | 8/10 [02:12<00:32, 16.30s/it][A

Reinforce Loss : 0.009765625 Reinforce Max Reward : -2.427309989929199


Batch:  90%|█████████ | 9/10 [02:28<00:16, 16.17s/it][A

Reinforce Loss : -0.0029296875 Reinforce Max Reward : -1.8736991882324219


Batch: 100%|██████████| 10/10 [02:43<00:00, 15.99s/it][A
Epoch:   4%|▍         | 70/1785 [3:28:49<84:42:27, 177.81s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -2.4090394973754883


Batch:  10%|█         | 1/10 [00:20<03:00, 20.09s/it][A

Reinforce Loss : 0.0625 Reinforce Max Reward : -2.427065372467041


Batch:  20%|██        | 2/10 [00:36<02:30, 18.85s/it][A

Reinforce Loss : 0.005859375 Reinforce Max Reward : -2.326730489730835


Batch:  30%|███       | 3/10 [00:51<02:05, 17.94s/it][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -2.413054943084717


Batch:  40%|████      | 4/10 [01:07<01:43, 17.23s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.3793163299560547


Batch:  50%|█████     | 5/10 [01:23<01:24, 16.85s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.414371967315674


Batch:  60%|██████    | 6/10 [01:39<01:06, 16.63s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.096989631652832


Batch:  70%|███████   | 7/10 [01:55<00:49, 16.39s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.3287644386291504


Batch:  80%|████████  | 8/10 [02:11<00:32, 16.20s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.199021100997925


Batch:  90%|█████████ | 9/10 [02:26<00:16, 16.10s/it][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -2.3827054500579834


Batch: 100%|██████████| 10/10 [02:42<00:00, 15.96s/it][A
Epoch:   4%|▍         | 71/1785 [3:31:49<85:00:11, 178.54s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -2.3590126037597656


Batch:  10%|█         | 1/10 [00:18<02:46, 18.51s/it][A

Reinforce Loss : 0.009765625 Reinforce Max Reward : -2.038219451904297


Batch:  20%|██        | 2/10 [00:34<02:21, 17.71s/it][A

Reinforce Loss : -0.009765625 Reinforce Max Reward : -2.4340615272521973


Batch:  30%|███       | 3/10 [00:49<01:59, 17.06s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.429377794265747


Batch:  40%|████      | 4/10 [01:05<01:39, 16.65s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.42724347114563


Batch:  50%|█████     | 5/10 [01:21<01:21, 16.39s/it][A

Reinforce Loss : -0.0009765625 Reinforce Max Reward : -2.416049003601074


Batch:  60%|██████    | 6/10 [01:37<01:04, 16.22s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.1741068363189697


Batch:  70%|███████   | 7/10 [01:52<00:48, 16.08s/it][A

Reinforce Loss : -0.0048828125 Reinforce Max Reward : -2.2877490520477295


Batch:  80%|████████  | 8/10 [02:08<00:31, 15.98s/it][A

Reinforce Loss : 0.0107421875 Reinforce Max Reward : -2.4338297843933105


Batch:  90%|█████████ | 9/10 [02:24<00:15, 15.89s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.177077293395996


Batch: 100%|██████████| 10/10 [02:40<00:00, 15.82s/it][A
Epoch:   4%|▍         | 72/1785 [3:34:45<84:37:53, 177.86s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : 0.0078125 Reinforce Max Reward : -2.445967197418213


Batch:  10%|█         | 1/10 [00:20<03:05, 20.61s/it][A

Reinforce Loss : -0.015625 Reinforce Max Reward : -2.42733097076416


Batch:  20%|██        | 2/10 [00:36<02:33, 19.21s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.4116013050079346


Batch:  30%|███       | 3/10 [00:52<02:06, 18.14s/it][A

Reinforce Loss : -0.015625 Reinforce Max Reward : -2.1496500968933105


Batch:  40%|████      | 4/10 [01:08<01:44, 17.49s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.260748863220215


Batch:  50%|█████     | 5/10 [01:23<01:24, 16.98s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.446317672729492


Batch:  60%|██████    | 6/10 [01:39<01:06, 16.59s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.3946352005004883


Batch:  70%|███████   | 7/10 [01:55<00:49, 16.35s/it][A

Reinforce Loss : -0.01171875 Reinforce Max Reward : -2.407724380493164


Batch:  80%|████████  | 8/10 [02:10<00:32, 16.09s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.1620168685913086


Batch:  90%|█████████ | 9/10 [02:26<00:16, 16.01s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.377110004425049


Batch: 100%|██████████| 10/10 [02:42<00:00, 15.91s/it][A
Epoch:   4%|▍         | 73/1785 [3:37:41<84:15:06, 177.16s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.401759386062622


Batch:  10%|█         | 1/10 [00:23<03:30, 23.43s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.434499502182007


Batch:  20%|██        | 2/10 [00:39<02:49, 21.16s/it][A

Reinforce Loss : -0.0009765625 Reinforce Max Reward : -2.4241538047790527


Batch:  30%|███       | 3/10 [00:55<02:16, 19.55s/it][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -2.106219530105591


Batch:  40%|████      | 4/10 [01:10<01:50, 18.41s/it][A

Reinforce Loss : -0.015625 Reinforce Max Reward : -2.282658576965332


Batch:  50%|█████     | 5/10 [01:26<01:28, 17.62s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.32193660736084


Batch:  60%|██████    | 6/10 [01:42<01:08, 17.04s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.4125287532806396


Batch:  70%|███████   | 7/10 [01:57<00:49, 16.59s/it][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -2.246312141418457


Batch:  80%|████████  | 8/10 [02:13<00:32, 16.37s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.4337105751037598


Batch:  90%|█████████ | 9/10 [02:29<00:16, 16.18s/it][A

Reinforce Loss : -0.015625 Reinforce Max Reward : -2.4286112785339355


Batch: 100%|██████████| 10/10 [02:45<00:00, 16.10s/it][A
Epoch:   4%|▍         | 74/1785 [3:40:44<85:04:13, 178.99s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.245072603225708


Batch:  10%|█         | 1/10 [00:18<02:42, 18.05s/it][A

Reinforce Loss : 0.0078125 Reinforce Max Reward : -1.58392333984375


Batch:  20%|██        | 2/10 [00:34<02:19, 17.44s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.4187378883361816


Batch:  30%|███       | 3/10 [00:49<01:58, 16.91s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.42708158493042


Batch:  40%|████      | 4/10 [01:05<01:39, 16.55s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.330577850341797


Batch:  50%|█████     | 5/10 [01:21<01:21, 16.32s/it][A

Reinforce Loss : -0.013671875 Reinforce Max Reward : -2.405121326446533


Batch:  60%|██████    | 6/10 [01:36<01:04, 16.12s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.424402952194214


Batch:  70%|███████   | 7/10 [01:52<00:47, 15.95s/it][A

Reinforce Loss : -0.017578125 Reinforce Max Reward : -2.447328805923462


Batch:  80%|████████  | 8/10 [02:08<00:31, 15.88s/it][A

Reinforce Loss : -0.0029296875 Reinforce Max Reward : -2.3848838806152344


Batch:  90%|█████████ | 9/10 [02:23<00:15, 15.84s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.447986125946045


Batch: 100%|██████████| 10/10 [02:39<00:00, 15.82s/it][A
Epoch:   4%|▍         | 75/1785 [3:43:42<84:50:57, 178.63s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.322587013244629


Batch:  10%|█         | 1/10 [00:18<02:46, 18.54s/it][A

Reinforce Loss : -0.0048828125 Reinforce Max Reward : -2.4220879077911377


Batch:  20%|██        | 2/10 [00:34<02:21, 17.73s/it][A

Reinforce Loss : 0.005859375 Reinforce Max Reward : -2.202033519744873


Batch:  30%|███       | 3/10 [00:50<01:59, 17.13s/it][A

Reinforce Loss : -0.0234375 Reinforce Max Reward : -2.437242269515991


Batch:  40%|████      | 4/10 [01:06<01:40, 16.76s/it][A

Reinforce Loss : -0.0029296875 Reinforce Max Reward : -2.097682476043701


Batch:  50%|█████     | 5/10 [01:21<01:22, 16.48s/it][A

Reinforce Loss : 0.00048828125 Reinforce Max Reward : -2.36441707611084


Batch:  60%|██████    | 6/10 [01:37<01:05, 16.35s/it][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -2.4270248413085938


Batch:  70%|███████   | 7/10 [01:53<00:48, 16.19s/it][A

Reinforce Loss : 0.0078125 Reinforce Max Reward : -2.434508800506592


Batch:  80%|████████  | 8/10 [02:09<00:32, 16.16s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.39347505569458


Batch:  90%|█████████ | 9/10 [02:25<00:16, 16.09s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.421752691268921


Batch: 100%|██████████| 10/10 [02:41<00:00, 16.10s/it][A
Epoch:   4%|▍         | 76/1785 [3:46:38<84:30:48, 178.03s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.0009765625 Reinforce Max Reward : -2.421710968017578


Batch:  10%|█         | 1/10 [00:19<02:57, 19.74s/it][A

Reinforce Loss : 0.005859375 Reinforce Max Reward : -2.179328680038452


Batch:  20%|██        | 2/10 [00:35<02:28, 18.57s/it][A

Reinforce Loss : 0.0087890625 Reinforce Max Reward : -2.4120025634765625


Batch:  30%|███       | 3/10 [00:51<02:03, 17.71s/it][A

Reinforce Loss : -0.009765625 Reinforce Max Reward : -2.226600408554077


Batch:  40%|████      | 4/10 [01:07<01:42, 17.12s/it][A

Reinforce Loss : -0.0048828125 Reinforce Max Reward : -2.399989604949951


Batch:  50%|█████     | 5/10 [01:22<01:23, 16.71s/it][A

Reinforce Loss : -0.009765625 Reinforce Max Reward : -2.4276037216186523


Batch:  60%|██████    | 6/10 [01:38<01:05, 16.43s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -1.6591585874557495


Batch:  70%|███████   | 7/10 [01:54<00:48, 16.18s/it][A

Reinforce Loss : 0.005859375 Reinforce Max Reward : -2.4349417686462402


Batch:  80%|████████  | 8/10 [02:09<00:32, 16.04s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.1868343353271484


Batch:  90%|█████████ | 9/10 [02:25<00:15, 15.94s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -1.6382280588150024


Batch: 100%|██████████| 10/10 [02:41<00:00, 15.89s/it][A
Epoch:   4%|▍         | 77/1785 [3:49:34<84:11:39, 177.46s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.015625 Reinforce Max Reward : -2.413341760635376


Batch:  10%|█         | 1/10 [00:19<02:58, 19.83s/it][A

Reinforce Loss : -0.0048828125 Reinforce Max Reward : -2.4140849113464355


Batch:  20%|██        | 2/10 [00:35<02:29, 18.67s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.357696056365967


Batch:  30%|███       | 3/10 [00:51<02:04, 17.84s/it][A

Reinforce Loss : 0.0048828125 Reinforce Max Reward : -2.4367446899414062


Batch:  40%|████      | 4/10 [01:07<01:43, 17.23s/it][A

Reinforce Loss : -0.0009765625 Reinforce Max Reward : -2.425827980041504


Batch:  50%|█████     | 5/10 [01:23<01:24, 16.95s/it][A

Reinforce Loss : 0.0009765625 Reinforce Max Reward : -2.423478603363037


Batch:  60%|██████    | 6/10 [01:39<01:06, 16.60s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.4412145614624023


Batch:  70%|███████   | 7/10 [01:55<00:48, 16.32s/it][A

Reinforce Loss : -0.01171875 Reinforce Max Reward : -2.4012646675109863


Batch:  80%|████████  | 8/10 [02:11<00:32, 16.17s/it][A

Reinforce Loss : -0.0068359375 Reinforce Max Reward : -2.4349136352539062


Batch:  90%|█████████ | 9/10 [02:26<00:16, 16.04s/it][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -2.3346712589263916


Batch: 100%|██████████| 10/10 [02:42<00:00, 15.95s/it][A
Epoch:   4%|▍         | 78/1785 [3:52:32<84:07:49, 177.43s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.264626979827881


Batch:  10%|█         | 1/10 [00:20<03:03, 20.43s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.3491644859313965


Batch:  20%|██        | 2/10 [00:36<02:31, 18.98s/it][A

Reinforce Loss : -0.0068359375 Reinforce Max Reward : -2.434201240539551


Batch:  30%|███       | 3/10 [00:51<02:05, 17.98s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.4181742668151855


Batch:  40%|████      | 4/10 [01:07<01:43, 17.21s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.2498466968536377


Batch:  50%|█████     | 5/10 [01:22<01:23, 16.75s/it][A

Reinforce Loss : 0.013671875 Reinforce Max Reward : -2.137885093688965


Batch:  60%|██████    | 6/10 [01:38<01:05, 16.40s/it][A

Reinforce Loss : -0.0048828125 Reinforce Max Reward : -2.3959898948669434


Batch:  70%|███████   | 7/10 [01:53<00:48, 16.16s/it][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -2.3398752212524414


Batch:  80%|████████  | 8/10 [02:09<00:32, 16.02s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.2773125171661377


Batch:  90%|█████████ | 9/10 [02:25<00:15, 15.90s/it][A

Reinforce Loss : 0.009765625 Reinforce Max Reward : -1.6811001300811768


Batch: 100%|██████████| 10/10 [02:40<00:00, 15.79s/it][A
Epoch:   4%|▍         | 79/1785 [3:55:26<83:36:03, 176.41s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.0029296875 Reinforce Max Reward : -2.3898282051086426


Batch:  10%|█         | 1/10 [00:22<03:18, 22.00s/it][A

Reinforce Loss : -0.009765625 Reinforce Max Reward : -2.0586166381835938


Batch:  20%|██        | 2/10 [00:37<02:41, 20.16s/it][A

Reinforce Loss : 0.01953125 Reinforce Max Reward : -2.3694067001342773


Batch:  30%|███       | 3/10 [00:53<02:11, 18.85s/it][A

Reinforce Loss : 0.03125 Reinforce Max Reward : -2.4245734214782715


Batch:  40%|████      | 4/10 [01:09<01:47, 17.88s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.2936391830444336


Batch:  50%|█████     | 5/10 [01:24<01:26, 17.22s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -1.9684767723083496


Batch:  60%|██████    | 6/10 [01:40<01:07, 16.76s/it][A

Reinforce Loss : -0.0048828125 Reinforce Max Reward : -2.3983449935913086


Batch:  70%|███████   | 7/10 [01:56<00:49, 16.47s/it][A

Reinforce Loss : -0.015625 Reinforce Max Reward : -2.3854854106903076


Batch:  80%|████████  | 8/10 [02:11<00:32, 16.18s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.358593463897705


Batch:  90%|█████████ | 9/10 [02:27<00:16, 16.05s/it][A

Reinforce Loss : -0.03125 Reinforce Max Reward : -2.4231460094451904


Batch: 100%|██████████| 10/10 [02:43<00:00, 15.91s/it][A
Epoch:   4%|▍         | 80/1785 [3:58:23<83:43:14, 176.77s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : 0.009765625 Reinforce Max Reward : -2.4239144325256348


Batch:  10%|█         | 1/10 [00:21<03:10, 21.18s/it][A

Reinforce Loss : 0.0146484375 Reinforce Max Reward : -2.4425549507141113


Batch:  20%|██        | 2/10 [00:36<02:36, 19.55s/it][A

Reinforce Loss : 0.005859375 Reinforce Max Reward : -2.4342637062072754


Batch:  30%|███       | 3/10 [00:52<02:09, 18.46s/it][A

Reinforce Loss : -0.033203125 Reinforce Max Reward : -2.416099786758423


Batch:  40%|████      | 4/10 [01:08<01:45, 17.64s/it][A

Reinforce Loss : -0.009765625 Reinforce Max Reward : -2.417320728302002


Batch:  50%|█████     | 5/10 [01:24<01:25, 17.12s/it][A

Reinforce Loss : 0.005859375 Reinforce Max Reward : -2.4043045043945312


Batch:  60%|██████    | 6/10 [01:40<01:07, 16.80s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.308516502380371


Batch:  70%|███████   | 7/10 [01:56<00:49, 16.55s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.4083290100097656


Batch:  80%|████████  | 8/10 [02:12<00:32, 16.31s/it][A

Reinforce Loss : -0.01171875 Reinforce Max Reward : -2.434601306915283


Batch:  90%|█████████ | 9/10 [02:28<00:16, 16.19s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.4260125160217285


Batch: 100%|██████████| 10/10 [02:44<00:00, 16.11s/it][A
Epoch:   5%|▍         | 81/1785 [4:01:24<84:10:18, 177.83s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.4310007095336914


Batch:  10%|█         | 1/10 [00:20<03:01, 20.11s/it][A

Reinforce Loss : -0.009765625 Reinforce Max Reward : -2.1517796516418457


Batch:  20%|██        | 2/10 [00:36<02:31, 18.89s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.1829864978790283


Batch:  30%|███       | 3/10 [00:52<02:06, 18.02s/it][A

Reinforce Loss : 0.005859375 Reinforce Max Reward : -2.375415802001953


Batch:  40%|████      | 4/10 [01:08<01:44, 17.42s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.4332122802734375


Batch:  50%|█████     | 5/10 [01:24<01:24, 17.00s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.3368468284606934


Batch:  60%|██████    | 6/10 [01:40<01:06, 16.66s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.4252662658691406


Batch:  70%|███████   | 7/10 [01:56<00:49, 16.47s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.3770766258239746


Batch:  80%|████████  | 8/10 [02:11<00:32, 16.26s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.4264683723449707


Batch:  90%|█████████ | 9/10 [02:27<00:16, 16.11s/it][A

Reinforce Loss : 0.0029296875 Reinforce Max Reward : -2.4387664794921875


Batch: 100%|██████████| 10/10 [02:43<00:00, 16.04s/it][A
Epoch:   5%|▍         | 82/1785 [4:04:23<84:17:50, 178.20s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.01953125 Reinforce Max Reward : -2.415189743041992


Batch:  10%|█         | 1/10 [00:21<03:09, 21.10s/it][A

Reinforce Loss : -0.0234375 Reinforce Max Reward : -2.4529378414154053


Batch:  20%|██        | 2/10 [00:36<02:35, 19.47s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.429947853088379


Batch:  30%|███       | 3/10 [00:52<02:08, 18.30s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.38112211227417


Batch:  40%|████      | 4/10 [01:07<01:45, 17.50s/it][A

Reinforce Loss : -0.0009765625 Reinforce Max Reward : -2.419891834259033


Batch:  50%|█████     | 5/10 [01:23<01:24, 16.99s/it][A

Reinforce Loss : 0.0009765625 Reinforce Max Reward : -2.374481678009033


Batch:  60%|██████    | 6/10 [01:39<01:06, 16.60s/it][A

Reinforce Loss : -0.01171875 Reinforce Max Reward : -2.425480365753174


Batch:  70%|███████   | 7/10 [01:55<00:49, 16.40s/it][A

Reinforce Loss : 0.0078125 Reinforce Max Reward : -2.297636032104492


Batch:  80%|████████  | 8/10 [02:11<00:32, 16.28s/it][A

Reinforce Loss : 0.0029296875 Reinforce Max Reward : -2.414590358734131


Batch:  90%|█████████ | 9/10 [02:27<00:16, 16.19s/it][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -2.33351993560791


Batch: 100%|██████████| 10/10 [02:43<00:00, 16.05s/it][A
Epoch:   5%|▍         | 83/1785 [4:07:22<84:23:07, 178.49s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : 0.005859375 Reinforce Max Reward : -2.338935375213623


Batch:  10%|█         | 1/10 [00:20<03:07, 20.78s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.4284400939941406


Batch:  20%|██        | 2/10 [00:36<02:34, 19.33s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.370859146118164


Batch:  30%|███       | 3/10 [00:53<02:08, 18.42s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.4268038272857666


Batch:  40%|████      | 4/10 [01:08<01:45, 17.58s/it][A

Reinforce Loss : -0.0068359375 Reinforce Max Reward : -2.422262191772461


Batch:  50%|█████     | 5/10 [01:24<01:25, 17.05s/it][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -1.9661998748779297


Batch:  60%|██████    | 6/10 [01:40<01:07, 16.76s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -1.8447667360305786


Batch:  70%|███████   | 7/10 [01:56<00:49, 16.52s/it][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -2.3802952766418457


Batch:  80%|████████  | 8/10 [02:12<00:32, 16.32s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.4075183868408203


Batch:  90%|█████████ | 9/10 [02:28<00:16, 16.24s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.433065176010132


Batch: 100%|██████████| 10/10 [02:44<00:00, 16.23s/it][A
Epoch:   5%|▍         | 84/1785 [4:10:25<84:55:10, 179.72s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.231813669204712


Batch:  10%|█         | 1/10 [00:18<02:50, 18.95s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.1201562881469727


Batch:  20%|██        | 2/10 [00:34<02:24, 18.03s/it][A

Reinforce Loss : 0.005859375 Reinforce Max Reward : -2.424886465072632


Batch:  30%|███       | 3/10 [00:51<02:02, 17.48s/it][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -2.4106011390686035


Batch:  40%|████      | 4/10 [01:06<01:41, 16.97s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.1686153411865234


Batch:  50%|█████     | 5/10 [01:22<01:23, 16.68s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.0393550395965576


Batch:  60%|██████    | 6/10 [01:38<01:05, 16.48s/it][A

Reinforce Loss : -0.015625 Reinforce Max Reward : -2.2487430572509766


Batch:  70%|███████   | 7/10 [01:54<00:49, 16.35s/it][A

Reinforce Loss : -0.009765625 Reinforce Max Reward : -2.398853302001953


Batch:  80%|████████  | 8/10 [02:10<00:32, 16.16s/it][A

Reinforce Loss : -0.0029296875 Reinforce Max Reward : -2.433101177215576


Batch:  90%|█████████ | 9/10 [02:26<00:16, 16.04s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.4225339889526367


Batch: 100%|██████████| 10/10 [02:42<00:00, 16.06s/it][A
Epoch:   5%|▍         | 85/1785 [4:13:21<84:21:49, 178.65s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.0009765625 Reinforce Max Reward : -2.082972526550293


Batch:  10%|█         | 1/10 [00:24<03:39, 24.42s/it][A

Reinforce Loss : 0.01171875 Reinforce Max Reward : -2.3260302543640137


Batch:  20%|██        | 2/10 [00:40<02:54, 21.86s/it][A

Reinforce Loss : -0.0068359375 Reinforce Max Reward : -1.7483415603637695


Batch:  30%|███       | 3/10 [00:56<02:21, 20.15s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.381739616394043


Batch:  40%|████      | 4/10 [01:12<01:53, 18.84s/it][A

Reinforce Loss : 0.02734375 Reinforce Max Reward : -2.3013453483581543


Batch:  50%|█████     | 5/10 [01:27<01:29, 17.90s/it][A

Reinforce Loss : 0.0029296875 Reinforce Max Reward : -2.4257707595825195


Batch:  60%|██████    | 6/10 [01:43<01:09, 17.28s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.2401490211486816


Batch:  70%|███████   | 7/10 [01:59<00:50, 16.93s/it][A

Reinforce Loss : 0.015625 Reinforce Max Reward : -2.432276964187622


Batch:  80%|████████  | 8/10 [02:15<00:33, 16.56s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.406630754470825


Batch:  90%|█████████ | 9/10 [02:31<00:16, 16.35s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.4413199424743652


Batch: 100%|██████████| 10/10 [02:47<00:00, 16.35s/it][A
Epoch:   5%|▍         | 86/1785 [4:16:25<85:05:36, 180.30s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.4252214431762695


Batch:  10%|█         | 1/10 [00:20<03:06, 20.75s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.4075770378112793


Batch:  20%|██        | 2/10 [00:36<02:34, 19.31s/it][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -2.442812919616699


Batch:  30%|███       | 3/10 [00:52<02:08, 18.38s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.2033987045288086


Batch:  40%|████      | 4/10 [01:08<01:45, 17.65s/it][A

Reinforce Loss : -0.0107421875 Reinforce Max Reward : -2.1943697929382324


Batch:  50%|█████     | 5/10 [01:24<01:25, 17.09s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.369832754135132


Batch:  60%|██████    | 6/10 [01:40<01:06, 16.71s/it][A

Reinforce Loss : -0.009765625 Reinforce Max Reward : -2.2373952865600586


Batch:  70%|███████   | 7/10 [01:56<00:49, 16.51s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.447159767150879


Batch:  80%|████████  | 8/10 [02:12<00:32, 16.26s/it][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -2.3188135623931885


Batch:  90%|█████████ | 9/10 [02:28<00:16, 16.14s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.401123523712158


Batch: 100%|██████████| 10/10 [02:44<00:00, 16.14s/it][A
Epoch:   5%|▍         | 87/1785 [4:19:24<84:52:58, 179.96s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.021484375 Reinforce Max Reward : -2.4240057468414307


Batch:  10%|█         | 1/10 [00:21<03:09, 21.03s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.431093215942383


Batch:  20%|██        | 2/10 [00:36<02:35, 19.47s/it][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -2.326702117919922


Batch:  30%|███       | 3/10 [00:53<02:09, 18.49s/it][A

Reinforce Loss : 0.009765625 Reinforce Max Reward : -2.3915581703186035


Batch:  40%|████      | 4/10 [01:08<01:45, 17.66s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.2955613136291504


Batch:  50%|█████     | 5/10 [01:24<01:25, 17.14s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.447768449783325


Batch:  60%|██████    | 6/10 [01:40<01:06, 16.72s/it][A

Reinforce Loss : 0.0078125 Reinforce Max Reward : -2.399115800857544


Batch:  70%|███████   | 7/10 [01:56<00:49, 16.56s/it][A

Reinforce Loss : 0.0078125 Reinforce Max Reward : -2.3745076656341553


Batch:  80%|████████  | 8/10 [02:12<00:32, 16.36s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.401871681213379


Batch:  90%|█████████ | 9/10 [02:28<00:16, 16.30s/it][A

Reinforce Loss : 0.0078125 Reinforce Max Reward : -2.331796407699585


Batch: 100%|██████████| 10/10 [02:45<00:00, 16.32s/it][A
Epoch:   5%|▍         | 88/1785 [4:22:26<85:07:07, 180.57s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.009765625 Reinforce Max Reward : -2.1543922424316406


Batch:  10%|█         | 1/10 [00:18<02:48, 18.78s/it][A

Reinforce Loss : -0.01171875 Reinforce Max Reward : -2.3417885303497314


Batch:  20%|██        | 2/10 [00:34<02:23, 17.98s/it][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -2.4315290451049805


Batch:  30%|███       | 3/10 [00:51<02:01, 17.42s/it][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -2.3747713565826416


Batch:  40%|████      | 4/10 [01:06<01:41, 16.90s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.3222999572753906


Batch:  50%|█████     | 5/10 [01:22<01:22, 16.57s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.430367946624756


Batch:  60%|██████    | 6/10 [01:38<01:05, 16.40s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.4202871322631836


Batch:  70%|███████   | 7/10 [01:54<00:49, 16.40s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.3564696311950684


Batch:  80%|████████  | 8/10 [02:10<00:32, 16.20s/it][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -2.4244747161865234


Batch:  90%|█████████ | 9/10 [02:26<00:16, 16.15s/it][A

Reinforce Loss : 0.0078125 Reinforce Max Reward : -2.4293198585510254


Batch: 100%|██████████| 10/10 [02:42<00:00, 16.11s/it][A
Epoch:   5%|▍         | 89/1785 [4:25:25<84:49:14, 180.04s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -1.9906678199768066


Batch:  10%|█         | 1/10 [00:20<03:02, 20.22s/it][A

Reinforce Loss : 0.0078125 Reinforce Max Reward : -2.318991184234619


Batch:  20%|██        | 2/10 [00:36<02:31, 18.98s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.420910596847534


Batch:  30%|███       | 3/10 [00:52<02:07, 18.23s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.4244585037231445


Batch:  40%|████      | 4/10 [01:08<01:45, 17.54s/it][A

Reinforce Loss : -0.009765625 Reinforce Max Reward : -2.4306979179382324


Batch:  50%|█████     | 5/10 [01:24<01:25, 17.10s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.429335594177246


Batch:  60%|██████    | 6/10 [01:40<01:07, 16.81s/it][A

Reinforce Loss : -0.009765625 Reinforce Max Reward : -2.3747363090515137


Batch:  70%|███████   | 7/10 [01:57<00:50, 16.73s/it][A

Reinforce Loss : -0.0068359375 Reinforce Max Reward : -2.3640966415405273


Batch:  80%|████████  | 8/10 [02:13<00:32, 16.46s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.4280669689178467


Batch:  90%|█████████ | 9/10 [02:29<00:16, 16.29s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.4047446250915527


Batch: 100%|██████████| 10/10 [02:45<00:00, 16.26s/it][A
Epoch:   5%|▌         | 90/1785 [4:28:27<85:01:30, 180.58s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.4033002853393555


Batch:  10%|█         | 1/10 [00:19<02:56, 19.64s/it][A

Reinforce Loss : 0.0078125 Reinforce Max Reward : -2.420816421508789


Batch:  20%|██        | 2/10 [00:35<02:28, 18.51s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.3178162574768066


Batch:  30%|███       | 3/10 [00:51<02:04, 17.79s/it][A

Reinforce Loss : -0.013671875 Reinforce Max Reward : -2.4214987754821777


Batch:  40%|████      | 4/10 [01:07<01:43, 17.20s/it][A

Reinforce Loss : -0.01953125 Reinforce Max Reward : -2.3971729278564453


Batch:  50%|█████     | 5/10 [01:23<01:23, 16.80s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.4399404525756836


Batch:  60%|██████    | 6/10 [01:39<01:06, 16.51s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.357762336730957


Batch:  70%|███████   | 7/10 [01:55<00:49, 16.40s/it][A

Reinforce Loss : 0.0009765625 Reinforce Max Reward : -2.2431321144104004


Batch:  80%|████████  | 8/10 [02:11<00:32, 16.19s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.2381434440612793


Batch:  90%|█████████ | 9/10 [02:26<00:16, 16.05s/it][A

Reinforce Loss : 0.005859375 Reinforce Max Reward : -2.4468398094177246


Batch: 100%|██████████| 10/10 [02:42<00:00, 15.97s/it][A
Epoch:   5%|▌         | 91/1785 [4:31:24<84:29:31, 179.56s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.340761423110962


Batch:  10%|█         | 1/10 [00:20<03:02, 20.33s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.448240280151367


Batch:  20%|██        | 2/10 [00:35<02:31, 18.91s/it][A

Reinforce Loss : -0.0009765625 Reinforce Max Reward : -1.9316303730010986


Batch:  30%|███       | 3/10 [00:52<02:07, 18.15s/it][A

Reinforce Loss : -0.01171875 Reinforce Max Reward : -2.439354658126831


Batch:  40%|████      | 4/10 [01:07<01:44, 17.40s/it][A

Reinforce Loss : -0.01171875 Reinforce Max Reward : -2.35652494430542


Batch:  50%|█████     | 5/10 [01:23<01:24, 16.87s/it][A

Reinforce Loss : 0.0009765625 Reinforce Max Reward : -2.4351789951324463


Batch:  60%|██████    | 6/10 [01:39<01:06, 16.52s/it][A

Reinforce Loss : 0.0068359375 Reinforce Max Reward : -2.410686492919922


Batch:  70%|███████   | 7/10 [01:55<00:48, 16.32s/it][A

Reinforce Loss : -0.009765625 Reinforce Max Reward : -2.436486005783081


Batch:  80%|████████  | 8/10 [02:10<00:32, 16.13s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.4211721420288086


Batch:  90%|█████████ | 9/10 [02:26<00:15, 15.97s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.4251537322998047


Batch: 100%|██████████| 10/10 [02:42<00:00, 15.88s/it][A
Epoch:   5%|▌         | 92/1785 [4:34:19<83:51:41, 178.32s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : 0.01171875 Reinforce Max Reward : -2.4207351207733154


Batch:  10%|█         | 1/10 [00:22<03:18, 22.04s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.442436695098877


Batch:  20%|██        | 2/10 [00:37<02:41, 20.21s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.385310649871826


Batch:  30%|███       | 3/10 [00:53<02:12, 18.94s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.4422411918640137


Batch:  40%|████      | 4/10 [01:09<01:47, 17.92s/it][A

Reinforce Loss : 0.005859375 Reinforce Max Reward : -2.353795051574707


Batch:  50%|█████     | 5/10 [01:25<01:26, 17.31s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.4275593757629395


Batch:  60%|██████    | 6/10 [01:41<01:07, 16.82s/it][A

Reinforce Loss : -0.009765625 Reinforce Max Reward : -2.4245471954345703


Batch:  70%|███████   | 7/10 [01:57<00:49, 16.65s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.4491262435913086


Batch:  80%|████████  | 8/10 [02:12<00:32, 16.34s/it][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -2.2412800788879395


Batch:  90%|█████████ | 9/10 [02:28<00:16, 16.16s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.3821394443511963


Batch: 100%|██████████| 10/10 [02:44<00:00, 15.98s/it][A
Epoch:   5%|▌         | 93/1785 [4:37:19<84:03:25, 178.84s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.1804542541503906


Batch:  10%|█         | 1/10 [00:21<03:17, 21.93s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.434574604034424


Batch:  20%|██        | 2/10 [00:37<02:40, 20.07s/it][A

Reinforce Loss : 0.0068359375 Reinforce Max Reward : -2.36171293258667


Batch:  30%|███       | 3/10 [00:53<02:11, 18.78s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.4325239658355713


Batch:  40%|████      | 4/10 [01:09<01:47, 17.87s/it][A

Reinforce Loss : -0.017578125 Reinforce Max Reward : -2.2915525436401367


Batch:  50%|█████     | 5/10 [01:24<01:26, 17.24s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -1.7052390575408936


Batch:  60%|██████    | 6/10 [01:40<01:07, 16.79s/it][A

Reinforce Loss : 0.01171875 Reinforce Max Reward : -2.2384605407714844


Batch:  70%|███████   | 7/10 [01:56<00:49, 16.53s/it][A

Reinforce Loss : 0.009765625 Reinforce Max Reward : -2.428852081298828


Batch:  80%|████████  | 8/10 [02:12<00:32, 16.24s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.2447805404663086


Batch:  90%|█████████ | 9/10 [02:27<00:16, 16.08s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.427135944366455


Batch: 100%|██████████| 10/10 [02:43<00:00, 15.94s/it][A
Epoch:   5%|▌         | 94/1785 [4:40:19<84:04:50, 179.00s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.4336130619049072


Batch:  10%|█         | 1/10 [00:20<03:06, 20.68s/it][A

Reinforce Loss : -0.01171875 Reinforce Max Reward : -2.146679401397705


Batch:  20%|██        | 2/10 [00:36<02:33, 19.16s/it][A

Reinforce Loss : -0.0048828125 Reinforce Max Reward : -2.323329210281372


Batch:  30%|███       | 3/10 [00:52<02:07, 18.15s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -1.996181607246399


Batch:  40%|████      | 4/10 [01:08<01:45, 17.53s/it][A

Reinforce Loss : 0.0078125 Reinforce Max Reward : -2.431889057159424


Batch:  50%|█████     | 5/10 [01:23<01:24, 16.98s/it][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -2.30570125579834


Batch:  60%|██████    | 6/10 [01:39<01:06, 16.56s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.4318933486938477


Batch:  70%|███████   | 7/10 [01:55<00:49, 16.39s/it][A

Reinforce Loss : -0.009765625 Reinforce Max Reward : -2.42960262298584


Batch:  80%|████████  | 8/10 [02:11<00:32, 16.22s/it][A

Reinforce Loss : -0.013671875 Reinforce Max Reward : -2.4183084964752197


Batch:  90%|█████████ | 9/10 [02:27<00:16, 16.16s/it][A

Reinforce Loss : -0.0048828125 Reinforce Max Reward : -2.420100212097168


Batch: 100%|██████████| 10/10 [02:43<00:00, 16.04s/it][A
Epoch:   5%|▌         | 95/1785 [4:43:18<84:04:07, 179.08s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.0048828125 Reinforce Max Reward : -2.4133450984954834


Batch:  10%|█         | 1/10 [00:20<03:04, 20.48s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.305889844894409


Batch:  20%|██        | 2/10 [00:36<02:32, 19.04s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.4266316890716553


Batch:  30%|███       | 3/10 [00:51<02:06, 18.02s/it][A

Reinforce Loss : -0.0048828125 Reinforce Max Reward : -2.4195804595947266


Batch:  40%|████      | 4/10 [01:07<01:44, 17.45s/it][A

Reinforce Loss : 0.0078125 Reinforce Max Reward : -2.387498617172241


Batch:  50%|█████     | 5/10 [01:23<01:24, 16.93s/it][A

Reinforce Loss : 0.005859375 Reinforce Max Reward : -2.4286389350891113


Batch:  60%|██████    | 6/10 [01:39<01:06, 16.55s/it][A

Reinforce Loss : -0.0126953125 Reinforce Max Reward : -2.431522846221924


Batch:  70%|███████   | 7/10 [01:55<00:49, 16.34s/it][A

Reinforce Loss : 0.005859375 Reinforce Max Reward : -2.1467113494873047


Batch:  80%|████████  | 8/10 [02:11<00:32, 16.22s/it][A

Reinforce Loss : 0.0009765625 Reinforce Max Reward : -2.3079819679260254


Batch:  90%|█████████ | 9/10 [02:26<00:16, 16.08s/it][A

Reinforce Loss : 0.0078125 Reinforce Max Reward : -2.4262354373931885


Batch: 100%|██████████| 10/10 [02:42<00:00, 15.91s/it][A
Epoch:   5%|▌         | 96/1785 [4:46:15<83:43:18, 178.45s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : 0.0029296875 Reinforce Max Reward : -2.1221296787261963


Batch:  10%|█         | 1/10 [00:22<03:21, 22.36s/it][A

Reinforce Loss : 0.0087890625 Reinforce Max Reward : -2.374809980392456


Batch:  20%|██        | 2/10 [00:38<02:42, 20.37s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.4334402084350586


Batch:  30%|███       | 3/10 [00:53<02:12, 18.97s/it][A

Reinforce Loss : -0.015625 Reinforce Max Reward : -2.4379091262817383


Batch:  40%|████      | 4/10 [01:09<01:48, 18.05s/it][A

Reinforce Loss : 0.0234375 Reinforce Max Reward : -2.429823875427246


Batch:  50%|█████     | 5/10 [01:25<01:26, 17.34s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.4267022609710693


Batch:  60%|██████    | 6/10 [01:41<01:07, 16.85s/it][A

Reinforce Loss : 0.0087890625 Reinforce Max Reward : -2.43270206451416


Batch:  70%|███████   | 7/10 [01:57<00:49, 16.57s/it][A

Reinforce Loss : -0.013671875 Reinforce Max Reward : -2.4398388862609863


Batch:  80%|████████  | 8/10 [02:13<00:33, 16.55s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.439385414123535


Batch:  90%|█████████ | 9/10 [02:29<00:16, 16.28s/it][A

Reinforce Loss : 0.005859375 Reinforce Max Reward : -2.394664764404297


Batch: 100%|██████████| 10/10 [02:45<00:00, 16.18s/it][A
Epoch:   5%|▌         | 97/1785 [4:49:16<84:02:07, 179.22s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.0009765625 Reinforce Max Reward : -2.3244309425354004


Batch:  10%|█         | 1/10 [00:20<03:08, 20.97s/it][A

Reinforce Loss : 0.0078125 Reinforce Max Reward : -2.2378597259521484


Batch:  20%|██        | 2/10 [00:36<02:35, 19.41s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.2868447303771973


Batch:  30%|███       | 3/10 [00:52<02:08, 18.41s/it][A

Reinforce Loss : -0.015625 Reinforce Max Reward : -2.1807870864868164


Batch:  40%|████      | 4/10 [01:09<01:46, 17.82s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.4245097637176514


Batch:  50%|█████     | 5/10 [01:25<01:26, 17.22s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.4252233505249023


Batch:  60%|██████    | 6/10 [01:42<01:08, 17.23s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.406165599822998


Batch:  70%|███████   | 7/10 [01:58<00:50, 16.95s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.4362680912017822


Batch:  80%|████████  | 8/10 [02:15<00:33, 16.79s/it][A

Reinforce Loss : 0.0078125 Reinforce Max Reward : -2.1308400630950928


Batch:  90%|█████████ | 9/10 [02:30<00:16, 16.50s/it][A

Reinforce Loss : -0.017578125 Reinforce Max Reward : -2.041020154953003


Batch: 100%|██████████| 10/10 [02:46<00:00, 16.32s/it][A
Epoch:   5%|▌         | 98/1785 [4:52:19<84:31:00, 180.36s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : 0.01953125 Reinforce Max Reward : -2.3439698219299316


Batch:  10%|█         | 1/10 [00:19<02:58, 19.78s/it][A

Reinforce Loss : 0.0087890625 Reinforce Max Reward : -2.420196533203125


Batch:  20%|██        | 2/10 [00:35<02:28, 18.57s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.359192371368408


Batch:  30%|███       | 3/10 [00:51<02:04, 17.82s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.356679916381836


Batch:  40%|████      | 4/10 [01:07<01:44, 17.34s/it][A

Reinforce Loss : 0.01171875 Reinforce Max Reward : -2.0653786659240723


Batch:  50%|█████     | 5/10 [01:23<01:24, 16.85s/it][A

Reinforce Loss : 0.0048828125 Reinforce Max Reward : -2.4231436252593994


Batch:  60%|██████    | 6/10 [01:39<01:06, 16.60s/it][A

Reinforce Loss : -0.0009765625 Reinforce Max Reward : -2.3866095542907715


Batch:  70%|███████   | 7/10 [01:55<00:49, 16.39s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -1.954766869544983


Batch:  80%|████████  | 8/10 [02:11<00:32, 16.30s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.260403871536255


Batch:  90%|█████████ | 9/10 [02:27<00:16, 16.06s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.241260528564453


Batch: 100%|██████████| 10/10 [02:43<00:00, 16.04s/it][A
Epoch:   6%|▌         | 99/1785 [4:55:15<83:52:45, 179.10s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : 0.013671875 Reinforce Max Reward : -2.4366915225982666


Batch:  10%|█         | 1/10 [00:22<03:26, 22.97s/it][A

Reinforce Loss : -0.0068359375 Reinforce Max Reward : -2.353447914123535


Batch:  20%|██        | 2/10 [00:38<02:45, 20.74s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.4309487342834473


Batch:  30%|███       | 3/10 [00:54<02:15, 19.30s/it][A

Reinforce Loss : -0.013671875 Reinforce Max Reward : -2.437284469604492


Batch:  40%|████      | 4/10 [01:10<01:50, 18.40s/it][A

Reinforce Loss : 0.01171875 Reinforce Max Reward : -2.43296480178833


Batch:  50%|█████     | 5/10 [01:26<01:27, 17.54s/it][A

Reinforce Loss : 0.005859375 Reinforce Max Reward : -1.946180820465088


Batch:  60%|██████    | 6/10 [01:41<01:07, 16.98s/it][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -2.3812313079833984


Batch:  70%|███████   | 7/10 [01:57<00:49, 16.61s/it][A

Reinforce Loss : 0.0009765625 Reinforce Max Reward : -2.386368751525879


Batch:  80%|████████  | 8/10 [02:13<00:32, 16.44s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.2745814323425293


Batch:  90%|█████████ | 9/10 [02:29<00:16, 16.18s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.040541887283325


Batch: 100%|██████████| 10/10 [02:45<00:00, 16.04s/it][A
Epoch:   6%|▌         | 100/1785 [4:58:15<83:56:19, 179.33s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : 0.005859375 Reinforce Max Reward : -2.4345431327819824


Batch:  10%|█         | 1/10 [00:21<03:15, 21.69s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.422506809234619


Batch:  20%|██        | 2/10 [00:37<02:38, 19.86s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.2838616371154785


Batch:  30%|███       | 3/10 [00:53<02:10, 18.71s/it][A

Reinforce Loss : -0.0234375 Reinforce Max Reward : -2.407339096069336


Batch:  40%|████      | 4/10 [01:09<01:47, 17.88s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.4334311485290527


Batch:  50%|█████     | 5/10 [01:24<01:26, 17.23s/it][A

Reinforce Loss : 0.009765625 Reinforce Max Reward : -2.3756532669067383


Batch:  60%|██████    | 6/10 [01:40<01:06, 16.72s/it][A

Reinforce Loss : 0.005859375 Reinforce Max Reward : -2.2412190437316895


Batch:  70%|███████   | 7/10 [01:56<00:49, 16.48s/it][A

Reinforce Loss : 0.0078125 Reinforce Max Reward : -2.3898887634277344


Batch:  80%|████████  | 8/10 [02:12<00:32, 16.44s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.325045108795166


Batch:  90%|█████████ | 9/10 [02:28<00:16, 16.20s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.397695541381836


Batch: 100%|██████████| 10/10 [02:44<00:00, 16.12s/it][A
Epoch:   6%|▌         | 101/1785 [5:01:15<83:58:29, 179.52s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.009765625 Reinforce Max Reward : -2.4347453117370605


Batch:  10%|█         | 1/10 [00:21<03:15, 21.67s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.421037197113037


Batch:  20%|██        | 2/10 [00:37<02:39, 19.88s/it][A

Reinforce Loss : 0.0029296875 Reinforce Max Reward : -2.3535728454589844


Batch:  30%|███       | 3/10 [00:53<02:10, 18.69s/it][A

Reinforce Loss : -0.01953125 Reinforce Max Reward : -2.4459545612335205


Batch:  40%|████      | 4/10 [01:09<01:47, 17.88s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.22910213470459


Batch:  50%|█████     | 5/10 [01:25<01:26, 17.36s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.421471118927002


Batch:  60%|██████    | 6/10 [01:41<01:07, 16.93s/it][A

Reinforce Loss : 0.013671875 Reinforce Max Reward : -2.4295620918273926


Batch:  70%|███████   | 7/10 [01:57<00:50, 16.69s/it][A

Reinforce Loss : -0.015625 Reinforce Max Reward : -2.4119625091552734


Batch:  80%|████████  | 8/10 [02:14<00:33, 16.67s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.4241647720336914


Batch:  90%|█████████ | 9/10 [02:29<00:16, 16.38s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.4405298233032227


Batch: 100%|██████████| 10/10 [02:45<00:00, 16.29s/it][A
Epoch:   6%|▌         | 102/1785 [5:04:19<84:29:44, 180.74s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -2.44282865524292


Batch:  10%|█         | 1/10 [00:18<02:50, 18.96s/it][A

Reinforce Loss : -0.009765625 Reinforce Max Reward : -2.427279472351074


Batch:  20%|██        | 2/10 [00:34<02:23, 17.99s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.439213275909424


Batch:  30%|███       | 3/10 [00:50<02:01, 17.40s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.205441951751709


Batch:  40%|████      | 4/10 [01:06<01:42, 17.05s/it][A

Reinforce Loss : 0.0048828125 Reinforce Max Reward : -2.4072556495666504


Batch:  50%|█████     | 5/10 [01:22<01:23, 16.69s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.2255053520202637


Batch:  60%|██████    | 6/10 [01:38<01:06, 16.51s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.3123161792755127


Batch:  70%|███████   | 7/10 [01:54<00:49, 16.35s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.3216052055358887


Batch:  80%|████████  | 8/10 [02:11<00:32, 16.36s/it][A

Reinforce Loss : -0.009765625 Reinforce Max Reward : -2.405205249786377


Batch:  90%|█████████ | 9/10 [02:27<00:16, 16.21s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.430877923965454


Batch: 100%|██████████| 10/10 [02:43<00:00, 16.23s/it][A
Epoch:   6%|▌         | 103/1785 [5:07:20<84:35:32, 181.05s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.420340061187744


Batch:  10%|█         | 1/10 [00:19<02:52, 19.14s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.3120031356811523


Batch:  20%|██        | 2/10 [00:35<02:25, 18.19s/it][A

Reinforce Loss : 0.025390625 Reinforce Max Reward : -2.4275736808776855


Batch:  30%|███       | 3/10 [00:51<02:03, 17.68s/it][A

Reinforce Loss : -0.0009765625 Reinforce Max Reward : -2.4288289546966553


Batch:  40%|████      | 4/10 [01:08<01:44, 17.42s/it][A

Reinforce Loss : 0.005859375 Reinforce Max Reward : -2.11991548538208


Batch:  50%|█████     | 5/10 [01:24<01:25, 17.02s/it][A

Reinforce Loss : 0.009765625 Reinforce Max Reward : -2.327641010284424


Batch:  60%|██████    | 6/10 [01:40<01:07, 16.80s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.432654857635498


Batch:  70%|███████   | 7/10 [01:57<00:49, 16.64s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.4111409187316895


Batch:  80%|████████  | 8/10 [02:13<00:33, 16.57s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.4275407791137695


Batch:  90%|█████████ | 9/10 [02:29<00:16, 16.48s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.3691775798797607


Batch: 100%|██████████| 10/10 [02:46<00:00, 16.45s/it][A
Epoch:   6%|▌         | 104/1785 [5:10:23<84:42:24, 181.41s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : 0.015625 Reinforce Max Reward : -2.4376745223999023


Batch:  10%|█         | 1/10 [00:22<03:18, 22.02s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.4002063274383545


Batch:  20%|██        | 2/10 [00:38<02:42, 20.36s/it][A

Reinforce Loss : 0.005859375 Reinforce Max Reward : -2.1975598335266113


Batch:  30%|███       | 3/10 [00:54<02:13, 19.12s/it][A

Reinforce Loss : 0.009765625 Reinforce Max Reward : -2.430361747741699


Batch:  40%|████      | 4/10 [01:11<01:49, 18.32s/it][A

Reinforce Loss : -0.0029296875 Reinforce Max Reward : -2.431936740875244


Batch:  50%|█████     | 5/10 [01:27<01:28, 17.61s/it][A

Reinforce Loss : -0.013671875 Reinforce Max Reward : -2.423893928527832


Batch:  60%|██████    | 6/10 [01:43<01:08, 17.19s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.425750494003296


Batch:  70%|███████   | 7/10 [01:59<00:50, 16.91s/it][A

Reinforce Loss : 0.005859375 Reinforce Max Reward : -2.0695953369140625


Batch:  80%|████████  | 8/10 [02:16<00:33, 16.77s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.4099698066711426


Batch:  90%|█████████ | 9/10 [02:31<00:16, 16.49s/it][A

Reinforce Loss : -0.009765625 Reinforce Max Reward : -2.403069496154785


Batch: 100%|██████████| 10/10 [02:48<00:00, 16.43s/it][A
Epoch:   6%|▌         | 105/1785 [5:13:26<84:53:13, 181.90s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.149317502975464


Batch:  10%|█         | 1/10 [00:20<03:03, 20.35s/it][A

Reinforce Loss : 0.013671875 Reinforce Max Reward : -2.426884412765503


Batch:  20%|██        | 2/10 [00:36<02:32, 19.12s/it][A

Reinforce Loss : 0.01171875 Reinforce Max Reward : -2.4220125675201416


Batch:  30%|███       | 3/10 [00:52<02:07, 18.21s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.28587007522583


Batch:  40%|████      | 4/10 [01:08<01:45, 17.60s/it][A

Reinforce Loss : 0.0029296875 Reinforce Max Reward : -2.445594549179077


Batch:  50%|█████     | 5/10 [01:24<01:25, 17.04s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.4445533752441406


Batch:  60%|██████    | 6/10 [01:40<01:07, 16.79s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.149250030517578


Batch:  70%|███████   | 7/10 [01:56<00:49, 16.57s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.349557876586914


Batch:  80%|████████  | 8/10 [02:13<00:32, 16.47s/it][A

Reinforce Loss : 0.013671875 Reinforce Max Reward : -2.432534694671631


Batch:  90%|█████████ | 9/10 [02:28<00:16, 16.29s/it][A

Reinforce Loss : 0.015625 Reinforce Max Reward : -2.324209451675415


Batch: 100%|██████████| 10/10 [02:45<00:00, 16.27s/it][A
Epoch:   6%|▌         | 106/1785 [5:16:29<85:02:38, 182.35s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : 0.0078125 Reinforce Max Reward : -2.311159133911133


Batch:  10%|█         | 1/10 [00:19<02:59, 19.98s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.2827463150024414


Batch:  20%|██        | 2/10 [00:36<02:30, 18.80s/it][A

Reinforce Loss : 0.0078125 Reinforce Max Reward : -2.3484182357788086


Batch:  30%|███       | 3/10 [00:52<02:06, 18.00s/it][A

Reinforce Loss : 0.021484375 Reinforce Max Reward : -2.4347023963928223


Batch:  40%|████      | 4/10 [01:08<01:45, 17.53s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.4377388954162598


Batch:  50%|█████     | 5/10 [01:24<01:25, 17.11s/it][A

Reinforce Loss : 0.0048828125 Reinforce Max Reward : -2.3962693214416504


Batch:  60%|██████    | 6/10 [01:41<01:07, 16.90s/it][A

Reinforce Loss : 0.021484375 Reinforce Max Reward : -2.4219884872436523


Batch:  70%|███████   | 7/10 [01:57<00:50, 16.78s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.4066691398620605


Batch:  80%|████████  | 8/10 [02:13<00:33, 16.64s/it][A

Reinforce Loss : -0.01171875 Reinforce Max Reward : -2.431629180908203


Batch:  90%|█████████ | 9/10 [02:30<00:16, 16.52s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.424879550933838


Batch: 100%|██████████| 10/10 [02:46<00:00, 16.45s/it][A
Epoch:   6%|▌         | 107/1785 [5:19:33<85:12:45, 182.82s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.438446044921875


Batch:  10%|█         | 1/10 [00:20<03:08, 20.94s/it][A

Reinforce Loss : 0.0009765625 Reinforce Max Reward : -1.9722321033477783


Batch:  20%|██        | 2/10 [00:37<02:36, 19.54s/it][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -2.434988260269165


Batch:  30%|███       | 3/10 [00:53<02:10, 18.65s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.1481451988220215


Batch:  40%|████      | 4/10 [01:10<01:47, 17.94s/it][A

Reinforce Loss : 0.0048828125 Reinforce Max Reward : -2.4023866653442383


Batch:  50%|█████     | 5/10 [01:26<01:27, 17.44s/it][A

Reinforce Loss : -0.00048828125 Reinforce Max Reward : -2.432166576385498


Batch:  60%|██████    | 6/10 [01:42<01:08, 17.04s/it][A

Reinforce Loss : -0.0009765625 Reinforce Max Reward : -2.417825698852539


Batch:  70%|███████   | 7/10 [01:58<00:50, 16.81s/it][A

Reinforce Loss : 0.025390625 Reinforce Max Reward : -2.4232280254364014


Batch:  80%|████████  | 8/10 [02:14<00:33, 16.64s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.116776943206787


Batch:  90%|█████████ | 9/10 [02:31<00:16, 16.51s/it][A

Reinforce Loss : 0.0068359375 Reinforce Max Reward : -2.3708560466766357


Batch: 100%|██████████| 10/10 [02:47<00:00, 16.43s/it][A
Epoch:   6%|▌         | 108/1785 [5:22:34<84:54:27, 182.27s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : 0.005859375 Reinforce Max Reward : -2.238260269165039


Batch:  10%|█         | 1/10 [00:22<03:20, 22.23s/it][A

Reinforce Loss : 0.01171875 Reinforce Max Reward : -2.3501758575439453


Batch:  20%|██        | 2/10 [00:38<02:43, 20.49s/it][A

Reinforce Loss : -0.0009765625 Reinforce Max Reward : -2.13608980178833


Batch:  30%|███       | 3/10 [00:54<02:14, 19.17s/it][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -2.415454626083374


Batch:  40%|████      | 4/10 [01:10<01:49, 18.27s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.276538372039795


Batch:  50%|█████     | 5/10 [01:27<01:28, 17.68s/it][A

Reinforce Loss : 0.009765625 Reinforce Max Reward : -2.3083884716033936


Batch:  60%|██████    | 6/10 [01:43<01:08, 17.23s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.427649974822998


Batch:  70%|███████   | 7/10 [01:59<00:50, 16.97s/it][A

Reinforce Loss : 0.0078125 Reinforce Max Reward : -2.429917335510254


Batch:  80%|████████  | 8/10 [02:15<00:33, 16.66s/it][A

Reinforce Loss : 0.0078125 Reinforce Max Reward : -2.4232349395751953


Batch:  90%|█████████ | 9/10 [02:31<00:16, 16.50s/it][A

Reinforce Loss : 0.0234375 Reinforce Max Reward : -2.3866443634033203


Batch: 100%|██████████| 10/10 [02:47<00:00, 16.39s/it][A
Epoch:   6%|▌         | 109/1785 [5:25:38<85:07:36, 182.85s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.021484375 Reinforce Max Reward : -2.4208123683929443


Batch:  10%|█         | 1/10 [00:20<03:03, 20.43s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.355433702468872


Batch:  20%|██        | 2/10 [00:36<02:33, 19.13s/it][A

Reinforce Loss : -0.0068359375 Reinforce Max Reward : -2.040297508239746


Batch:  30%|███       | 3/10 [00:52<02:07, 18.24s/it][A

Reinforce Loss : -0.0009765625 Reinforce Max Reward : -2.4290947914123535


Batch:  40%|████      | 4/10 [01:08<01:45, 17.59s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.4292407035827637


Batch:  50%|█████     | 5/10 [01:24<01:25, 17.17s/it][A

Reinforce Loss : -0.015625 Reinforce Max Reward : -2.4386181831359863


Batch:  60%|██████    | 6/10 [01:41<01:07, 16.86s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.3857250213623047


Batch:  70%|███████   | 7/10 [01:57<00:49, 16.64s/it][A

Reinforce Loss : 0.013671875 Reinforce Max Reward : -2.212874412536621


Batch:  80%|████████  | 8/10 [02:13<00:32, 16.47s/it][A

Reinforce Loss : -0.0009765625 Reinforce Max Reward : -2.398590564727783


Batch:  90%|█████████ | 9/10 [02:29<00:16, 16.43s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.3928470611572266


Batch: 100%|██████████| 10/10 [02:45<00:00, 16.31s/it][A
Epoch:   6%|▌         | 110/1785 [5:28:41<85:06:35, 182.92s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.307586669921875


Batch:  10%|█         | 1/10 [00:19<02:56, 19.66s/it][A

Reinforce Loss : 0.009765625 Reinforce Max Reward : -2.0475196838378906


Batch:  20%|██        | 2/10 [00:36<02:30, 18.84s/it][A

Reinforce Loss : -0.009765625 Reinforce Max Reward : -2.4280550479888916


Batch:  30%|███       | 3/10 [00:52<02:06, 18.01s/it][A

Reinforce Loss : 0.01171875 Reinforce Max Reward : -2.4459404945373535


Batch:  40%|████      | 4/10 [01:08<01:44, 17.46s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.3718442916870117


Batch:  50%|█████     | 5/10 [01:25<01:25, 17.13s/it][A

Reinforce Loss : 0.005859375 Reinforce Max Reward : -2.1643171310424805


Batch:  60%|██████    | 6/10 [01:41<01:07, 16.84s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.3942275047302246


Batch:  70%|███████   | 7/10 [01:57<00:50, 16.74s/it][A

Reinforce Loss : -0.009765625 Reinforce Max Reward : -2.4199728965759277


Batch:  80%|████████  | 8/10 [02:13<00:32, 16.43s/it][A

Reinforce Loss : 0.0078125 Reinforce Max Reward : -2.2615504264831543


Batch:  90%|█████████ | 9/10 [02:30<00:16, 16.48s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.4204320907592773


Batch: 100%|██████████| 10/10 [02:46<00:00, 16.45s/it][A
Epoch:   6%|▌         | 111/1785 [5:31:44<85:03:07, 182.91s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.01171875 Reinforce Max Reward : -2.4268717765808105


Batch:  10%|█         | 1/10 [00:20<03:02, 20.25s/it][A

Reinforce Loss : 0.009765625 Reinforce Max Reward : -2.088197708129883


Batch:  20%|██        | 2/10 [00:36<02:32, 19.07s/it][A

Reinforce Loss : 0.0009765625 Reinforce Max Reward : -2.408395767211914


Batch:  30%|███       | 3/10 [00:53<02:08, 18.36s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.411912441253662


Batch:  40%|████      | 4/10 [01:09<01:46, 17.73s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -2.4373650550842285


Batch:  50%|█████     | 5/10 [01:26<01:26, 17.37s/it][A

Reinforce Loss : 0.005859375 Reinforce Max Reward : -2.2472379207611084


Batch:  60%|██████    | 6/10 [01:42<01:08, 17.17s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.383697032928467


Batch:  70%|███████   | 7/10 [01:59<00:50, 16.93s/it][A

Reinforce Loss : -0.0048828125 Reinforce Max Reward : -2.3499081134796143


Batch:  80%|████████  | 8/10 [02:15<00:33, 16.78s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.389374256134033


Batch:  90%|█████████ | 9/10 [02:31<00:16, 16.67s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.4101083278656006


Batch: 100%|██████████| 10/10 [02:48<00:00, 16.66s/it][A
Epoch:   6%|▋         | 112/1785 [5:34:48<85:11:24, 183.31s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.4287610054016113


Batch:  10%|█         | 1/10 [00:20<03:05, 20.57s/it][A

Reinforce Loss : -0.0048828125 Reinforce Max Reward : -2.289313316345215


Batch:  20%|██        | 2/10 [00:37<02:35, 19.40s/it][A

Reinforce Loss : 0.0078125 Reinforce Max Reward : -2.106978416442871


Batch:  30%|███       | 3/10 [00:53<02:09, 18.51s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -1.916501760482788


Batch:  40%|████      | 4/10 [01:10<01:47, 17.86s/it][A

Reinforce Loss : -0.01171875 Reinforce Max Reward : -2.3684372901916504


Batch:  50%|█████     | 5/10 [01:26<01:26, 17.38s/it][A

Reinforce Loss : 0.0009765625 Reinforce Max Reward : -2.4272303581237793


Batch:  60%|██████    | 6/10 [01:42<01:08, 17.08s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.432542324066162


Batch:  70%|███████   | 7/10 [01:59<00:50, 16.89s/it][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -2.366408586502075


Batch:  80%|████████  | 8/10 [02:15<00:33, 16.72s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.44329833984375


Batch:  90%|█████████ | 9/10 [02:31<00:16, 16.59s/it][A

Reinforce Loss : 0.005859375 Reinforce Max Reward : -2.2266907691955566


Batch: 100%|██████████| 10/10 [02:48<00:00, 16.51s/it][A
Epoch:   6%|▋         | 113/1785 [5:37:51<85:04:55, 183.19s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.0281052589416504


Batch:  10%|█         | 1/10 [00:21<03:15, 21.67s/it][A

Reinforce Loss : 0.005859375 Reinforce Max Reward : -2.4049887657165527


Batch:  20%|██        | 2/10 [00:38<02:41, 20.19s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.319932222366333


Batch:  30%|███       | 3/10 [00:54<02:13, 19.05s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.3310651779174805


Batch:  40%|████      | 4/10 [01:11<01:49, 18.27s/it][A

Reinforce Loss : 0.01171875 Reinforce Max Reward : -2.434605121612549


Batch:  50%|█████     | 5/10 [01:27<01:28, 17.70s/it][A

Reinforce Loss : 0.0 Reinforce Max Reward : -2.207273244857788


Batch:  60%|██████    | 6/10 [01:43<01:09, 17.30s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.440826654434204


Batch:  70%|███████   | 7/10 [02:00<00:51, 17.10s/it][A

Reinforce Loss : -0.0078125 Reinforce Max Reward : -2.4469211101531982


Batch:  80%|████████  | 8/10 [02:16<00:33, 16.86s/it][A

Reinforce Loss : 0.0048828125 Reinforce Max Reward : -1.6816482543945312


Batch:  90%|█████████ | 9/10 [02:33<00:16, 16.68s/it][A

Reinforce Loss : -0.00390625 Reinforce Max Reward : -1.7397053241729736


Batch: 100%|██████████| 10/10 [02:49<00:00, 16.63s/it][A
Epoch:   6%|▋         | 114/1785 [5:40:57<85:19:56, 183.84s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Reinforce Loss : -0.01171875 Reinforce Max Reward : -2.4041907787323


Batch:  10%|█         | 1/10 [00:21<03:12, 21.40s/it][A

Reinforce Loss : 0.009765625 Reinforce Max Reward : -2.4240102767944336


Batch:  20%|██        | 2/10 [00:37<02:39, 19.90s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.3982181549072266


Batch:  30%|███       | 3/10 [00:54<02:12, 18.96s/it][A

Reinforce Loss : -0.01171875 Reinforce Max Reward : -2.425541639328003


Batch:  40%|████      | 4/10 [01:11<01:49, 18.26s/it][A

Reinforce Loss : 0.001953125 Reinforce Max Reward : -2.3115897178649902


Batch:  50%|█████     | 5/10 [01:27<01:28, 17.74s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.3903050422668457


Batch:  60%|██████    | 6/10 [01:44<01:09, 17.39s/it][A

Reinforce Loss : -0.005859375 Reinforce Max Reward : -2.4069056510925293


Batch:  70%|███████   | 7/10 [02:01<00:52, 17.35s/it][A

Reinforce Loss : -0.001953125 Reinforce Max Reward : -2.2984795570373535


Batch:  80%|████████  | 8/10 [02:18<00:34, 17.20s/it][A

Reinforce Loss : 0.00390625 Reinforce Max Reward : -2.4234485626220703


Batch:  90%|█████████ | 9/10 [02:34<00:16, 16.95s/it][A

Reinforce Loss : 0.0087890625 Reinforce Max Reward : -2.4233551025390625


Batch: 100%|██████████| 10/10 [02:51<00:00, 16.76s/it][A
Epoch:   6%|▋         | 115/1785 [5:44:04<85:45:05, 184.85s/it]
Batch:   0%|          | 0/10 [00:00<?, ?it/s][A

Epoch 330 Batch 0 Iteration 2 Reward -2.44%

In [0]:
#del Policy
del Enc
del D
del Dec
torch.cuda.empty_cache()

In [0]:
test = next(iter(train_loader))[0]
test.shape

In [0]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [0]:
torch.cuda.empty_cache()