In [189]:
import numpy as np
import matplotlib.pyplot as plt
import PIL

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.autograd import Variable
use_cuda = torch.cuda.is_available()

In [190]:
GPU_NUM = 0 # 원하는 GPU 번호 입력
device = torch.device(f'cuda:{GPU_NUM}' if torch.cuda.is_available() else 'cpu')
torch.cuda.set_device(device) # change allocation of current GPU
print ('Current cuda device ', torch.cuda.current_device()) # check

# Additional Infos
if device.type == 'cuda':
    print(torch.cuda.get_device_name(GPU_NUM))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(GPU_NUM)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_cached(GPU_NUM)/1024**3,1), 'GB')

Current cuda device  0
GeForce GTX 1660 Ti
Memory Usage:
Allocated: 0.1 GB
Cached:    0.5 GB




In [191]:
###################################### hyperparameters
class HParams():
    def __init__(self):
        self.data_location = 'cat.npz'
        self.enc_hidden_size = 256
        self.dec_hidden_size = 512
        self.Nz = 128
        self.M = 20
        self.dropout = 0.9
        self.batch_size = 100
        self.eta_min = 0.01
        self.R = 0.99995
        self.KL_min = 0.2
        self.wKL = 0.5
        self.lr = 0.001
        self.lr_decay = 0.9999
        self.min_lr = 0.00001
        self.grad_clip = 1.
        self.temperature = 0.4
        self.max_seq_length = 200

hp = HParams()

In [192]:
################################# load and prepare data
def max_size(data):
    """larger sequence length in the data set"""
    sizes = [len(seq) for seq in data]
#     len(seq)는 각 image의 row개수를 알려줌
# 그게 sequence (펜 움직인) 길이니까
# number of points
    return max(sizes)

def purify(strokes):
    """removes to small or too long sequences + removes large gaps"""
    data = []
    for seq in strokes:
#         70000개중 1개의 image가 seq이고, seq.shape[0]은 e.g. 51 (51x3)에서 나옴
        if seq.shape[0] <= hp.max_seq_length and seq.shape[0] > 10:
            seq = np.minimum(seq, 1000)
            seq = np.maximum(seq, -1000)
            seq = np.array(seq, dtype=np.float32)
            data.append(seq)
    return data

def calculate_normalizing_scale_factor(strokes):
    """Calculate the normalizing factor explained in appendix of sketch-rnn."""
    data = []
#     각 image의 x,y를 일렬로 쫙 data에 append. 총 len(data)는 70000 x 2 x (각 image의 row 개수)
    for i in range(len(strokes)):
        for j in range(len(strokes[i])):
            data.append(strokes[i][j, 0])
            data.append(strokes[i][j, 1])
    data = np.array(data)
    return np.std(data)

def normalize(strokes):
    """Normalize entire dataset (delta_x, delta_y) by the scaling factor."""
    data = []
    scale_factor = calculate_normalizing_scale_factor(strokes)
    for seq in strokes:
#         seq[:, 0:2]에서 첫번째 :이거는 모든 row를 다 포함시키겠다는거고
#         0:2는 그 row들의 첫번째 두번째 column을 indexing
        seq[:, 0:2] /= scale_factor
        data.append(seq)
    return data

In [193]:
data = []
for i in range(3):
    for j in range(4):
        data.append(4)
        data.append(1)
data = np.array(data)
print(np.std(data))

1.5


In [194]:
x = np.random.randn(5,5)
print(x)
print(x[:, 0:2])

[[-0.84288706 -0.81007502  0.23138612 -1.09344471  0.31905611]
 [-0.77993514 -0.48063765 -0.43322573 -0.85017802  1.23934203]
 [ 0.56073433 -0.16472515 -0.02395987 -0.70838435 -1.24048544]
 [-0.52897517 -1.55743127  0.53950104  0.70320162 -1.508241  ]
 [-0.35762015 -1.88266864  0.22042051 -0.85006095 -0.20699523]]
[[-0.84288706 -0.81007502]
 [-0.77993514 -0.48063765]
 [ 0.56073433 -0.16472515]
 [-0.52897517 -1.55743127]
 [-0.35762015 -1.88266864]]


In [195]:
x = np.random.randn(2,51,3)
print(x)
print("\n")
for i in x:
    print(i)
    print(i.shape[0])
    i = np.array(i, dtype=np.float32)
    print("\n")
    print(i)

[[[-2.10051127 -0.50021532 -0.29542511]
  [-0.77425718  0.33104448 -0.98908593]
  [-0.12055953 -0.24447894  1.00026517]
  [ 2.41124226 -1.35535783  0.85709503]
  [-0.52561922  1.57178625  0.16251373]
  [-0.06160139  0.60660753 -0.35287663]
  [-1.16274725  1.55443184 -0.11896821]
  [ 0.06108278  0.89401555  1.75379064]
  [-1.49949254 -1.40487934  0.0432748 ]
  [ 0.52984642 -1.28709861 -0.64841151]
  [-0.17892715 -1.639133   -0.44761373]
  [ 1.349245   -1.78496027 -0.25253838]
  [ 0.30086323  0.25019935  1.12035212]
  [ 0.84931648 -0.43235498  0.88208517]
  [ 1.17909964 -0.01538708 -0.50485062]
  [-0.70695009 -0.55172929  0.83845811]
  [ 0.97970715  0.32219204  0.4191813 ]
  [-0.27032153 -0.30811184  1.42830217]
  [ 1.27782949 -1.51466847 -1.5796183 ]
  [ 0.02296061  0.36609193  0.54293638]
  [ 0.09657662 -1.6435781   0.23927777]
  [-0.13816438 -0.69500382  0.04534715]
  [-0.3948403  -0.46959244  0.24788216]
  [-0.13672501  0.97554805 -0.36563871]
  [ 1.45266074  1.65725406  0.56381502]


In [196]:
dataset = np.load(hp.data_location, encoding='latin1', allow_pickle=True)
# print(dataset)
# print(dataset.keys())

data = dataset['train']
# for i in train_set:
#     print(train_set.shape)
print(len(data[1]))
#70000개의 channel, 각 channel이 한 image를뜻하고, 각 image는 2-d like 51x3

# calculate_normalizing_scale_factor(data)

valid_set = dataset['valid']
test_set = dataset['test']
# print(valid_set.shape)

data = purify(data)
# print(train_set)
data = normalize(data)
Nmax = max_size(data)
print(Nmax)

81
129


In [197]:
# import numpy as np
# # # save np.load
# # np_load_old = np.load
# np.load.__defaults__=(None, True, True, 'ASCII')

# # modify the default parameters of np.load
# # np.load = lambda *a,**k: np_load_old(*a, allow_pickle=True, **k)

# dataset = np.load(hp.data_location, encoding='latin1')

# data = dataset['train']
# data = purify(data)
# data = normalize(data)
# Nmax = max_size(data)
# # restore np.load for future normal usage
# # np.load = np_load_old
# np.load.__defaults__=(None, False, True, 'ASCII')
x=np.zeros((Nmax,5))
print(x.shape)

(129, 5)


In [198]:
############################## function to generate a batch:
def make_batch(batch_size):
    batch_idx = np.random.choice(len(data),batch_size)
#     batch_size개수만큼의 70000이하의 숫자들을 생성
    batch_sequences = [data[idx] for idx in batch_idx]
#     각 image의 2d array가 list로 저장, len(batch_sequences)=batch_size
    strokes = []
    lengths = []
    indice = 0
    for seq in batch_sequences:
        len_seq = len(seq[:,0])
        new_seq = np.zeros((Nmax,5))
        new_seq[:len_seq,:2] = seq[:,:2]
        new_seq[:len_seq-1,2] = 1-seq[:-1,2]
        new_seq[:len_seq,3] = seq[:,2]
        new_seq[(len_seq-1):,4] = 1
        new_seq[len_seq-1,2:4] = 0
        lengths.append(len(seq[:,0]))
        strokes.append(new_seq)
        indice += 1

#         np.stack(smt, 1)이면 row대로 (밑으로) 붙인다
    if use_cuda:
        batch = Variable(torch.from_numpy(np.stack(strokes,1)).cuda().float())
    else:
        batch = Variable(torch.from_numpy(np.stack(strokes,1)).float())
    return batch, lengths

In [199]:
################################ adaptive lr
def lr_decay(optimizer):
    """Decay learning rate by a factor of lr_decay"""
    for param_group in optimizer.param_groups:
        if param_group['lr']>hp.min_lr:
            param_group['lr'] *= hp.lr_decay
    return optimizer

In [200]:
a = torch.arange(10).reshape(1,5,2)
print(a)
print(torch.split(a,1))

tensor([[[0, 1],
         [2, 3],
         [4, 5],
         [6, 7],
         [8, 9]]])
(tensor([[[0, 1],
         [2, 3],
         [4, 5],
         [6, 7],
         [8, 9]]]),)


In [201]:
################################# encoder and decoder modules
class EncoderRNN(nn.Module):
    def __init__(self):
        super(EncoderRNN, self).__init__()
        # bidirectional lstm:
        self.lstm = nn.LSTM(5, hp.enc_hidden_size, \
            dropout=hp.dropout, bidirectional=True)
        # create mu and sigma from lstm's last output:
        self.fc_mu = nn.Linear(2*hp.enc_hidden_size, hp.Nz)
        self.fc_sigma = nn.Linear(2*hp.enc_hidden_size, hp.Nz)
        # active dropout:
        self.train()

    def forward(self, inputs, batch_size, hidden_cell=None):
        if hidden_cell is None:
            # then must init with zeros
            if use_cuda:
                hidden = torch.zeros(2, batch_size, hp.enc_hidden_size).cuda()
                cell = torch.zeros(2, batch_size, hp.enc_hidden_size).cuda()
            else:
                hidden = torch.zeros(2, batch_size, hp.enc_hidden_size)
                cell = torch.zeros(2, batch_size, hp.enc_hidden_size)
            hidden_cell = (hidden, cell)
        _, (hidden,cell) = self.lstm(inputs.float(), hidden_cell)
        # hidden is (2, batch_size, hidden_size), we want (batch_size, 2*hidden_size):
        hidden_forward, hidden_backward = torch.split(hidden,1,0)
        hidden_cat = torch.cat([hidden_forward.squeeze(0), hidden_backward.squeeze(0)],1)
        # mu and sigma:
        mu = self.fc_mu(hidden_cat)
        sigma_hat = self.fc_sigma(hidden_cat)
        sigma = torch.exp(sigma_hat/2.)
        # N ~ N(0,1)
        z_size = mu.size()
        if use_cuda:
            N = torch.normal(torch.zeros(z_size),torch.ones(z_size)).cuda()
        else:
            N = torch.normal(torch.zeros(z_size),torch.ones(z_size))
        z = mu + sigma*N
        # mu and sigma_hat are needed for LKL loss
        return z, mu, sigma_hat

In [202]:
x = torch.tensor([1, 2, 3, 4])
print(x.shape)
print(x.unsqueeze(0).shape)

torch.Size([4])
torch.Size([1, 4])


In [203]:
class DecoderRNN(nn.Module):
    def __init__(self):
        super(DecoderRNN, self).__init__()
        # to init hidden and cell from z:
        self.fc_hc = nn.Linear(hp.Nz, 2*hp.dec_hidden_size)
        # unidirectional lstm:
        self.lstm = nn.LSTM(hp.Nz+5, hp.dec_hidden_size, dropout=hp.dropout)
        # create proba distribution parameters from hiddens:
        self.fc_params = nn.Linear(hp.dec_hidden_size,6*hp.M+3)

    def forward(self, inputs, z, hidden_cell=None):
        if hidden_cell is None:
            # then we must init from z
            hidden,cell = torch.split(F.tanh(self.fc_hc(z)),hp.dec_hidden_size,1)
            hidden_cell = (hidden.unsqueeze(0).contiguous(), cell.unsqueeze(0).contiguous())
        outputs,(hidden,cell) = self.lstm(inputs, hidden_cell)
        # in training we feed the lstm with the whole input in one shot
        # and use all outputs contained in 'outputs', while in generate
        # mode we just feed with the last generated sample:
        if self.training:
            y = self.fc_params(outputs.view(-1, hp.dec_hidden_size))
        else:
            y = self.fc_params(hidden.view(-1, hp.dec_hidden_size))
        # separate pen and mixture params:
        params = torch.split(y,6,1)
        params_mixture = torch.stack(params[:-1]) # trajectory
        params_pen = params[-1] # pen up/down
        # identify mixture params:
        pi,mu_x,mu_y,sigma_x,sigma_y,rho_xy = torch.split(params_mixture,1,2)
        # preprocess params::
        if self.training:
            len_out = Nmax+1
        else:
            len_out = 1
                                   
        pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
        sigma_x = torch.exp(sigma_x.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
        sigma_y = torch.exp(sigma_y.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
        rho_xy = torch.tanh(rho_xy.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
        mu_x = mu_x.transpose(0,1).squeeze().contiguous().view(len_out,-1,hp.M)
        mu_y = mu_y.transpose(0,1).squeeze().contiguous().view(len_out,-1,hp.M)
        q = F.softmax(params_pen).view(len_out,-1,3)
        return pi,mu_x,mu_y,sigma_x,sigma_y,rho_xy,q,hidden,cell

In [204]:
torch.stack([torch.Tensor([0,0,1,0,0])]*hp.batch_size)

tensor([[0., 0., 1., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0

In [205]:
os = Variable(torch.Tensor([0,0,1,0,0]).view(1,1,-1))
print(os.shape)

torch.Size([1, 1, 5])


In [206]:
x = torch.ones(2,3,3)
idx = [2]
print(x.data[0,0,idx])


tensor([1.])


In [207]:
class Model():
    def __init__(self):
        if use_cuda:
            self.encoder = EncoderRNN().cuda()
            self.decoder = DecoderRNN().cuda()
        else:
            self.encoder = EncoderRNN()
            self.decoder = DecoderRNN()
        self.encoder_optimizer = optim.Adam(self.encoder.parameters(), hp.lr)
        self.decoder_optimizer = optim.Adam(self.decoder.parameters(), hp.lr)
        self.eta_step = hp.eta_min

    def make_target(self, batch, lengths):
        if use_cuda:
            eos = torch.stack([torch.Tensor([0,0,0,0,1])]*batch.size()[1]).cuda().unsqueeze(0)
        else:
            eos = torch.stack([torch.Tensor([0,0,0,0,1])]*batch.size()[1]).unsqueeze(0)
        batch = torch.cat([batch, eos], 0)
        mask = torch.zeros(Nmax+1, batch.size()[1])
        for indice,length in enumerate(lengths):
            mask[:length,indice] = 1
        if use_cuda:
            mask = mask.cuda()
        dx = torch.stack([batch.data[:,:,0]]*hp.M,2)
        dy = torch.stack([batch.data[:,:,1]]*hp.M,2)
        p1 = batch.data[:,:,2]
        p2 = batch.data[:,:,3]
        p3 = batch.data[:,:,4]
        p = torch.stack([p1,p2,p3],2)
        return mask,dx,dy,p

    def train(self, epoch):
        self.encoder.train()
        self.decoder.train()
        batch, lengths = make_batch(hp.batch_size)
        # encode:
#                                             input, batch_size
        z, self.mu, self.sigma = self.encoder(batch, hp.batch_size)
        # create start of sequence:
        if use_cuda:
            sos = torch.stack([torch.Tensor([0,0,1,0,0])]*hp.batch_size).cuda().unsqueeze(0)
        else:
            sos = torch.stack([torch.Tensor([0,0,1,0,0])]*hp.batch_size).unsqueeze(0)
        # had sos at the begining of the batch:
        batch_init = torch.cat([sos, batch],0)
        # expend z to be ready to concatenate with inputs:
        z_stack = torch.stack([z]*(Nmax+1))
        # inputs is concatenation of z and batch_inputs
        inputs = torch.cat([batch_init, z_stack],2)
        # decode:
        self.pi, self.mu_x, self.mu_y, self.sigma_x, self.sigma_y, \
            self.rho_xy, self.q, _, _ = self.decoder(inputs, z)
        # prepare targets:
        mask,dx,dy,p = self.make_target(batch, lengths)
        # prepare optimizers:
        self.encoder_optimizer.zero_grad()
        self.decoder_optimizer.zero_grad()
        # update eta for LKL:
        self.eta_step = 1-(1-hp.eta_min)*hp.R
        # compute losses:
        LKL = self.kullback_leibler_loss()
        LR = self.reconstruction_loss(mask,dx,dy,p,epoch)
        loss = LR + LKL
        # gradient step
        loss.backward()
        # gradient cliping
        nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
        nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)
        # optim step
        self.encoder_optimizer.step()
        self.decoder_optimizer.step()
        # some print and save:
        if epoch%1==0:
#             import pdb; pdb.set_trace()
            print('epoch',epoch,'loss',loss.item(),'LR',LR.item(),'LKL',LKL.item())
#             print('epoch',epoch,'loss',loss.data[0],'LR',LR.data[0],'LKL',LKL.data[0])
            self.encoder_optimizer = lr_decay(self.encoder_optimizer)
            self.decoder_optimizer = lr_decay(self.decoder_optimizer)
        if epoch%100==0:
            #self.save(epoch)
            self.conditional_generation(epoch)
#             self.reconstruct()
    
    def bivariate_normal_pdf(self, dx, dy):
        z_x = ((dx-self.mu_x)/self.sigma_x)**2
        z_y = ((dy-self.mu_y)/self.sigma_y)**2
        z_xy = (dx-self.mu_x)*(dy-self.mu_y)/(self.sigma_x*self.sigma_y)
        z = z_x + z_y -2*self.rho_xy*z_xy
        exp = torch.exp(-z/(2*(1-self.rho_xy**2)))
        norm = 2*np.pi*self.sigma_x*self.sigma_y*torch.sqrt(1-self.rho_xy**2)
        return exp/norm

    def reconstruction_loss(self, mask, dx, dy, p, epoch):
        pdf = self.bivariate_normal_pdf(dx, dy)
        LS = -torch.sum(mask*torch.log(1e-5+torch.sum(self.pi * pdf, 2)))\
            /float(Nmax*hp.batch_size)
        LP = -torch.sum(p*torch.log(self.q))/float(Nmax*hp.batch_size)
        return LS+LP

    def kullback_leibler_loss(self):
        LKL = -0.5*torch.sum(1+self.sigma-self.mu**2-torch.exp(self.sigma))\
            /float(hp.Nz*hp.batch_size)
        if use_cuda:
            KL_min = Variable(torch.Tensor([hp.KL_min]).cuda()).detach()
        else:
            KL_min = Variable(torch.Tensor([hp.KL_min])).detach()
        return hp.wKL*self.eta_step * torch.max(LKL,KL_min)

    def save(self, epoch):
        sel = np.random.rand()
        torch.save(self.encoder.state_dict(), \
            'encoderRNN_sel_%3f_epoch_%d.pth' % (sel,epoch))
        torch.save(self.decoder.state_dict(), \
            'decoderRNN_sel_%3f_epoch_%d.pth' % (sel,epoch))

    def load(self, encoder_name, decoder_name):
        saved_encoder = torch.load(encoder_name)
        saved_decoder = torch.load(decoder_name)
        self.encoder.load_state_dict(saved_encoder)
        self.decoder.load_state_dict(saved_decoder)

    def conditional_generation(self, epoch):
        batch,lengths = make_batch(1)
        # should remove dropouts:
        self.encoder.train(False)
        self.decoder.train(False)
        # encode:
        z, _, _ = self.encoder(batch, 1)
        if use_cuda:
            sos = Variable(torch.Tensor([0,0,1,0,0]).view(1,1,-1).cuda())
        else:
            sos = Variable(torch.Tensor([0,0,1,0,0]).view(1,1,-1))
        s = sos
        seq_x = []
        seq_y = []
        seq_z = []
        hidden_cell = None
        for i in range(Nmax):
            input = torch.cat([s,z.unsqueeze(0)],2)
            # decode:
            self.pi, self.mu_x, self.mu_y, self.sigma_x, self.sigma_y, \
                self.rho_xy, self.q, hidden, cell = \
                    self.decoder(input, z, hidden_cell)
            hidden_cell = (hidden, cell)
            # sample from parameters:
            s, dx, dy, pen_down, eos = self.sample_next_state()
#             s, dx, dy, pen_down, eos = self.sample_next()

            #------
            seq_x.append(dx)
            seq_y.append(dy)
            seq_z.append(pen_down)
            if eos:
                print(i)
                break
        # visualize result:
        x_sample = np.cumsum(seq_x, 0)
        y_sample = np.cumsum(seq_y, 0)
        z_sample = np.array(seq_z)
        sequence = np.stack([x_sample,y_sample,z_sample]).T
        make_image(sequence, epoch)

    def sample_next(self, pi, mu_x, mu_y, sigma_x, sigma_y, rho_xy, q):
        pi, mu_x, mu_y, sigma_x, sigma_y, rho_xy, q =\
            pi[0, 0, :], mu_x[0, 0, :], mu_y[0, 0, :], sigma_x[0,
                                                               0, :], sigma_y[0, 0, :], rho_xy[0, 0, :], q[0, 0, :]
        mu_x, mu_y, sigma_x, sigma_y, rho_xy =\
            mu_x.cpu().numpy(), mu_y.cpu().numpy(), sigma_x.cpu(
            ).numpy(), sigma_y.cpu().numpy(), rho_xy.cpu().numpy()
        M = pi.shape[0]
        # offset
        idx = np.random.choice(M, p=pi.cpu().numpy())
        mean = [mu_x[idx], mu_y[idx]]
        cov = [[sigma_x[idx] * sigma_x[idx], rho_xy[idx] * sigma_x[idx]*sigma_y[idx]],
               [rho_xy[idx] * sigma_x[idx]*sigma_y[idx], sigma_y[idx] * sigma_y[idx]]]
        xy = np.random.multivariate_normal(mean, cov, 1)
        xy = torch.from_numpy(xy).float().to(device)

        # pen
        p = torch.tensor([0, 0, 0], device=device, dtype=torch.float)
        idx = np.random.choice(3, p=q.cpu().numpy())
        p[idx] = 1.0
        p = p.unsqueeze(0)

        return torch.cat([xy, p], dim=1).unsqueeze(0)
    
    def sample_next_state(self):

        def adjust_temp(pi_pdf):
            pi_pdf = np.log(pi_pdf)/hp.temperature
            pi_pdf -= pi_pdf.max()
            pi_pdf = np.exp(pi_pdf)
            pi_pdf /= pi_pdf.sum()
            return pi_pdf

        # get mixture indice:
        pi = self.pi.data[0,0,:].cpu().numpy()
        pi = adjust_temp(pi)
        pi_idx = np.random.choice(hp.M, p=pi)
        # get pen state:
        q = self.q.data[0,0,:].cpu().numpy()
        q = adjust_temp(q)
        q_idx = np.random.choice(3, p=q)
        # get mixture params:
#                                 len(pi_idx)가 들어가게됨
        mu_x = self.mu_x.data[0,0,pi_idx]
        mu_y = self.mu_y.data[0,0,pi_idx]
        sigma_x = self.sigma_x.data[0,0,pi_idx]
        sigma_y = self.sigma_y.data[0,0,pi_idx]
        rho_xy = self.rho_xy.data[0,0,pi_idx]
        
        mean = [mu_x, mu_y]
        mean = np.array([mu_x, mu_y], dtype=np.float64)
        sigma_x = sigma_x * np.sqrt(hp.temperature)
        sigma_y = sigma_y * np.sqrt(hp.temperature)
#         cov=[[1,1],[1,1]]
        cov = [[sigma_x * sigma_x, rho_xy * sigma_x * sigma_y],\
            [rho_xy * sigma_x * sigma_y, sigma_y * sigma_y]]
        cov = np.array([[sigma_x * sigma_x, rho_xy * sigma_x * sigma_y],[rho_xy * sigma_x * sigma_y, sigma_y * sigma_y]], dtype=np.float64)
        import pdb; pdb.set_trace
        xy = np.random.multivariate_normal(mean, cov, 1)
        x=xy[0][0] 
        y=xy[0][1] 
#         x,y = sample_bivariate_normal(mu_x,mu_y,sigma_x,sigma_y,rho_xy,greedy=False)
        next_state = torch.zeros(5)
        next_state[0] = x
        next_state[1] = y
        next_state[q_idx+2] = 1
        if use_cuda:
            return Variable(next_state.cuda()).view(1,1,-1),x,y,q_idx==1,q_idx==2
        else:
            return Variable(next_state).view(1,1,-1),x,y,q_idx==1,q_idx==2
#         return 1,2,3,4,5

# def sample_bivariate_normal(mu_x,mu_y,sigma_x,sigma_y,rho_xy, greedy=False):
#     # inputs must be floats
#     if greedy:
#         return mu_x,mu_y
#     mean = [mu_x, mu_y]
#     sigma_x = sigma_x * np.sqrt(hp.temperature)
#     sigma_y = sigma_y * np.sqrt(hp.temperature)
#     cov = [[sigma_x * sigma_x, rho_xy * sigma_x * sigma_y],\
#         [rho_xy * sigma_x * sigma_y, sigma_y * sigma_y]]
#     import pdb; pdb.set_trace
#     x = np.random.multivariate_normal(mean, cov, 1)
#     x = torch.from_numpy(xy).float().to(device)

#     return x[0][0], x[0][1]
    def reconstruct(self, S):
        self.encoder.eval()
        self.decoder.eval()
        with torch.no_grad():
            Nmax = S.shape[0]
            batch_size = S.shape[1]
            s_i = torch.stack(
                [torch.tensor([0, 0, 1, 0, 0], device=device, dtype=torch.float)] * batch_size, dim=0).unsqueeze(0)
            output = s_i  # dummy
            z, _, _ = self.encoder(S)
            h0, c0 = torch.split(torch.tanh(self.decoder.fc_hc(z)),
                                 self.decoder.dec_hidden_size, 1)
            hidden_cell = (h0.unsqueeze(0).contiguous(),
                           c0.unsqueeze(0).contiguous())
            for i in range(Nmax):
                (pi, mu_x, mu_y, sigma_x, sigma_y,
                 rho_xy, q), hidden_cell = self.decoder(s_i, z, hidden_cell)
                s_i = self.sample_next(
                    pi, mu_x, mu_y, sigma_x, sigma_y, rho_xy, q)
                output = torch.cat([output, s_i], dim=0)
                if output[-1, 0, 4] == 1:
                    break

            output = output[1:, :, :]  # remove dummy
            return output

    def sample_next(self, pi, mu_x, mu_y, sigma_x, sigma_y, rho_xy, q):
        pi, mu_x, mu_y, sigma_x, sigma_y, rho_xy, q =\
            pi[0, 0, :], mu_x[0, 0, :], mu_y[0, 0, :], sigma_x[0,
                                                               0, :], sigma_y[0, 0, :], rho_xy[0, 0, :], q[0, 0, :]
        mu_x, mu_y, sigma_x, sigma_y, rho_xy =\
            mu_x.cpu().numpy(), mu_y.cpu().numpy(), sigma_x.cpu(
            ).numpy(), sigma_y.cpu().numpy(), rho_xy.cpu().numpy()
        M = pi.shape[0]
        # offset
        idx = np.random.choice(M, p=pi.cpu().numpy())
        mean = [mu_x[idx], mu_y[idx]]
        cov = [[sigma_x[idx] * sigma_x[idx], rho_xy[idx] * sigma_x[idx]*sigma_y[idx]],
               [rho_xy[idx] * sigma_x[idx]*sigma_y[idx], sigma_y[idx] * sigma_y[idx]]]
        xy = np.random.multivariate_normal(mean, cov, 1)
        xy = torch.from_numpy(xy).float().to(device)

        # pen
        p = torch.tensor([0, 0, 0], device=device, dtype=torch.float)
        idx = np.random.choice(3, p=q.cpu().numpy())
        p[idx] = 1.0
        p = p.unsqueeze(0)

        return torch.cat([xy, p], dim=1).unsqueeze(0)


In [208]:
def make_image(sequence, epoch, name='_output_'):
    """plot drawing with separated strokes"""
    strokes = np.split(sequence, np.where(sequence[:,2]>0)[0]+1)
    fig = plt.figure()
    ax1 = fig.add_subplot(111)
    for s in strokes:
        plt.plot(s[:,0],-s[:,1])
    canvas = plt.get_current_fig_manager().canvas
    canvas.draw()
    pil_image = PIL.Image.frombytes('RGB', canvas.get_width_height(),
                 canvas.tostring_rgb())
    name = str(epoch)+name+'.jpg'
    pil_image.save(name,"JPEG")
    plt.close("all")

In [209]:
arrays = [np.random.randn(1, 1) for _ in range(10)]
print(arrays)

[array([[-0.51001199]]), array([[-1.32880626]]), array([[-0.79570227]]), array([[0.023427]]), array([[0.28846899]]), array([[-0.1577076]]), array([[0.05367381]]), array([[1.42525504]]), array([[-1.02237308]]), array([[-0.52167998]])]


In [210]:
x=np.random.randn(3,4)
print(x)
y=np.random.randn(3,4)
print(y)
np.stack([x,y])

[[-0.40716304 -1.46409236 -0.02488944  0.07599259]
 [ 1.65806582 -1.35603363 -1.10371984  1.18437166]
 [-0.73965437 -1.10535307  0.65729672 -0.0373229 ]]
[[-0.95683687  2.22552821  0.90234392 -1.03772584]
 [ 0.52092805  1.05588582  0.12533905  0.95103017]
 [-0.21520836 -0.6900452   1.02141197 -1.25694769]]


array([[[-0.40716304, -1.46409236, -0.02488944,  0.07599259],
        [ 1.65806582, -1.35603363, -1.10371984,  1.18437166],
        [-0.73965437, -1.10535307,  0.65729672, -0.0373229 ]],

       [[-0.95683687,  2.22552821,  0.90234392, -1.03772584],
        [ 0.52092805,  1.05588582,  0.12533905,  0.95103017],
        [-0.21520836, -0.6900452 ,  1.02141197, -1.25694769]]])

In [211]:
mean = [1, 2]
cov = [[1, 0], [0, 1]]  
x = np.random.multivariate_normal(mean, cov, 1)
print(x)
x = torch.from_numpy(x).float().to(device)
print(x.shape)

p = torch.tensor([0, 0, 0], device=device, dtype=torch.float)
idx = np.random.choice(3, 5)
print(idx)
p[idx] = 1.0
print(p)
p = p.unsqueeze(0)
print(p.shape)
z=torch.cat([x, p], dim=1).unsqueeze(0)
print(z.shape)
# print(x[0])
# print(x.shape)

[[2.82709951 2.35873629]]
torch.Size([1, 2])
[1 2 0 1 2]
tensor([1., 1., 1.], device='cuda:0')
torch.Size([1, 3])
torch.Size([1, 1, 5])


In [212]:
import numpy as np

b = np.zeros(1)
c = np.zeros(1)
c = c/2**63

b += c

In [213]:
model=Model()



In [214]:
print("Model's state_dict:")
for param_tensor in model.state_dict():
    print(param_tensor, "\t", model.state_dict()[param_tensor].size())

Model's state_dict:


AttributeError: 'Model' object has no attribute 'state_dict'

In [215]:
model.encoder.state_dict()

OrderedDict([('lstm.weight_ih_l0',
              tensor([[ 6.1180e-02, -2.3263e-02,  3.4519e-02, -5.1125e-02,  2.1832e-02],
                      [-3.4036e-02, -2.0473e-03, -2.5708e-02,  2.9035e-02,  8.6183e-03],
                      [ 2.0172e-02, -1.3311e-02, -1.7053e-03,  5.2037e-02,  3.2584e-02],
                      ...,
                      [-5.6572e-02,  1.7812e-02,  5.5317e-02,  6.8635e-04, -1.1889e-02],
                      [-4.9143e-02, -2.5645e-05, -1.2556e-02, -2.2418e-02, -1.1055e-02],
                      [-3.2239e-02,  2.7249e-02, -2.8720e-02,  1.7094e-02,  5.5566e-02]],
                     device='cuda:0')),
             ('lstm.weight_hh_l0',
              tensor([[-0.0480,  0.0164,  0.0454,  ...,  0.0047, -0.0370,  0.0519],
                      [ 0.0460, -0.0259, -0.0150,  ...,  0.0167, -0.0238, -0.0143],
                      [-0.0565,  0.0078,  0.0478,  ...,  0.0414,  0.0349, -0.0567],
                      ...,
                      [ 0.0576, -0.0615, -0.0182,

In [216]:
if __name__=="__main__":
    model = Model()
    for epoch in range(50001):
        model.train(epoch)

  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)
  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)


epoch 0 loss 2.419813394546509 LR 2.4188084602355957 LKL 0.0010049500269815326
1
epoch 1 loss 2.381244421005249 LR 2.380239486694336 LKL 0.0010049500269815326


  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 2 loss 2.497189521789551 LR 2.4961845874786377 LKL 0.0010049500269815326
epoch 3 loss 2.3125431537628174 LR 2.3115382194519043 LKL 0.0010049500269815326
epoch 4 loss 2.2314395904541016 LR 2.2304346561431885 LKL 0.0010049500269815326
epoch 5 loss 2.247342348098755 LR 2.246337413787842 LKL 0.0010049500269815326
epoch 6 loss 1.9871671199798584 LR 1.9861621856689453 LKL 0.0010049500269815326
epoch 7 loss 2.259406566619873 LR 2.25840163230896 LKL 0.0010049500269815326
epoch 8 loss 2.0399041175842285 LR 2.0388991832733154 LKL 0.0010049500269815326
epoch 9 loss 1.9737485647201538 LR 1.9727436304092407 LKL 0.0010049500269815326
epoch 10 loss 1.8675637245178223 LR 1.8665587902069092 LKL 0.0010049500269815326
epoch 11 loss 1.841461420059204 LR 1.840456485748291 LKL 0.0010049500269815326
epoch 12 loss 1.8234825134277344 LR 1.8224775791168213 LKL 0.0010049500269815326
epoch 13 loss 1.6655797958374023 LR 1.6645748615264893 LKL 0.0010049500269815326
epoch 14 loss 1.7003731727600098 LR 1.699368

  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 101 loss 1.0172175168991089 LR 1.0114325284957886 LKL 0.005784943699836731
epoch 102 loss 1.0731767416000366 LR 1.0672293901443481 LKL 0.0059473104774951935
epoch 103 loss 1.0031085014343262 LR 0.9973317384719849 LKL 0.005776780191808939
epoch 104 loss 0.9759146571159363 LR 0.9705591797828674 LKL 0.005355500150471926
epoch 105 loss 1.0174201726913452 LR 1.0123562812805176 LKL 0.005063948221504688
epoch 106 loss 1.0076501369476318 LR 1.00277841091156 LKL 0.004871757701039314
epoch 107 loss 0.9405450820922852 LR 0.9356914758682251 LKL 0.004853579681366682
epoch 108 loss 1.0546796321868896 LR 1.0497515201568604 LKL 0.004928108770400286
epoch 109 loss 0.990035355091095 LR 0.9850033521652222 LKL 0.005031979642808437
epoch 110 loss 0.9455693364143372 LR 0.9404424428939819 LKL 0.0051268781535327435
epoch 111 loss 0.9752423763275146 LR 0.9699993133544922 LKL 0.00524304760619998
epoch 112 loss 0.978911280632019 LR 0.973432183265686 LKL 0.005479100160300732
epoch 113 loss 1.015003204345703

  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 201 loss 0.742763876914978 LR 0.7356678247451782 LKL 0.007096069399267435
epoch 202 loss 0.698342502117157 LR 0.6912093758583069 LKL 0.007133155595511198
epoch 203 loss 0.7225954532623291 LR 0.7154748439788818 LKL 0.0071206227876245975
epoch 204 loss 0.7199904918670654 LR 0.7129373550415039 LKL 0.007053157780319452
epoch 205 loss 0.6560202836990356 LR 0.6490031480789185 LKL 0.007017114665359259
epoch 206 loss 0.7633732557296753 LR 0.7561545968055725 LKL 0.0072186789475381374
epoch 207 loss 0.7097523212432861 LR 0.7025655508041382 LKL 0.007186777889728546
epoch 208 loss 0.7508443593978882 LR 0.7435572147369385 LKL 0.007287128362804651
epoch 209 loss 0.6587114334106445 LR 0.6515054702758789 LKL 0.007205944508314133
epoch 210 loss 0.6981282830238342 LR 0.6908867359161377 LKL 0.007241550832986832
epoch 211 loss 0.7482343912124634 LR 0.740865170955658 LKL 0.007369208615273237
epoch 212 loss 0.7293669581413269 LR 0.721966564655304 LKL 0.007400388829410076
epoch 213 loss 0.6898690462112

  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 302 loss 0.6761140823364258 LR 0.6676578521728516 LKL 0.008456237614154816
epoch 303 loss 0.5377587080001831 LR 0.5292519330978394 LKL 0.00850679911673069
epoch 304 loss 0.599770188331604 LR 0.5913013219833374 LKL 0.008468883112072945
epoch 305 loss 0.6018341779708862 LR 0.5932945609092712 LKL 0.00853959284722805
epoch 306 loss 0.5290051698684692 LR 0.5203880071640015 LKL 0.008617151528596878
epoch 307 loss 0.6426783800125122 LR 0.6339429616928101 LKL 0.008735439740121365
epoch 308 loss 0.5950220227241516 LR 0.5862736701965332 LKL 0.008748351596295834
epoch 309 loss 0.6010993719100952 LR 0.5923542976379395 LKL 0.008745046332478523
epoch 310 loss 0.5671884417533875 LR 0.5582984685897827 LKL 0.008889985270798206
epoch 311 loss 0.5957151055335999 LR 0.5867166519165039 LKL 0.008998431265354156
epoch 312 loss 0.5370175838470459 LR 0.5280291438102722 LKL 0.0089884577319026
epoch 313 loss 0.539950966835022 LR 0.5309523940086365 LKL 0.00899857934564352
epoch 314 loss 0.5905118584632874 L

  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 401 loss 0.5378593802452087 LR 0.5283402800559998 LKL 0.009519116021692753
epoch 402 loss 0.493839293718338 LR 0.4842202961444855 LKL 0.009618986397981644
epoch 403 loss 0.5368442535400391 LR 0.5272591710090637 LKL 0.009585065767168999
epoch 404 loss 0.47347965836524963 LR 0.46384644508361816 LKL 0.009633224457502365
epoch 405 loss 0.5234001278877258 LR 0.5138256549835205 LKL 0.00957450084388256
epoch 406 loss 0.5226023197174072 LR 0.5128605961799622 LKL 0.009741741232573986
epoch 407 loss 0.49661222100257874 LR 0.48691868782043457 LKL 0.009693530388176441
epoch 408 loss 0.49840521812438965 LR 0.48845523595809937 LKL 0.009949996136128902
epoch 409 loss 0.5170519351959229 LR 0.5070813894271851 LKL 0.009970548562705517
epoch 410 loss 0.5679851174354553 LR 0.5581241846084595 LKL 0.009860954247415066
epoch 411 loss 0.5428215861320496 LR 0.5328736305236816 LKL 0.009947978891432285
epoch 412 loss 0.4651778042316437 LR 0.45521074533462524 LKL 0.009967057034373283
epoch 413 loss 0.526986

  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 501 loss 0.4877796471118927 LR 0.4772075414657593 LKL 0.010572115890681744
epoch 502 loss 0.4496305286884308 LR 0.43915683031082153 LKL 0.010473699308931828
epoch 503 loss 0.5235605239868164 LR 0.5130626559257507 LKL 0.010497886687517166
epoch 504 loss 0.5227636098861694 LR 0.5123236775398254 LKL 0.010439926758408546
epoch 505 loss 0.43815067410469055 LR 0.42763662338256836 LKL 0.01051404234021902
epoch 506 loss 0.42310619354248047 LR 0.41262882947921753 LKL 0.010477360337972641
epoch 507 loss 0.5184443593025208 LR 0.5080239176750183 LKL 0.010420466773211956
epoch 508 loss 0.45252835750579834 LR 0.44214683771133423 LKL 0.010381520725786686
epoch 509 loss 0.4761897921562195 LR 0.4658466577529907 LKL 0.010343139991164207
epoch 510 loss 0.4639701247215271 LR 0.45355647802352905 LKL 0.010413646697998047
epoch 511 loss 0.4607090353965759 LR 0.4504016041755676 LKL 0.010307427495718002
epoch 512 loss 0.4586143493652344 LR 0.44834381341934204 LKL 0.010270546190440655
epoch 513 loss 0.497

  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 601 loss 0.3818858563899994 LR 0.37039774656295776 LKL 0.011488107964396477
epoch 602 loss 0.36269673705101013 LR 0.351201593875885 LKL 0.01149514876306057
epoch 603 loss 0.3690311014652252 LR 0.35751014947891235 LKL 0.011520951986312866
epoch 604 loss 0.4574318826198578 LR 0.44589269161224365 LKL 0.011539185419678688
epoch 605 loss 0.41526222229003906 LR 0.4036332666873932 LKL 0.011628969572484493
epoch 606 loss 0.42962968349456787 LR 0.4180682897567749 LKL 0.011561404913663864
epoch 607 loss 0.5033950209617615 LR 0.49183720350265503 LKL 0.01155783236026764
epoch 608 loss 0.41151443123817444 LR 0.39998793601989746 LKL 0.011526482179760933
epoch 609 loss 0.3993349075317383 LR 0.3878716826438904 LKL 0.011463210918009281
epoch 610 loss 0.43265220522880554 LR 0.4211966395378113 LKL 0.011455575935542583
epoch 611 loss 0.43850916624069214 LR 0.4271005690097809 LKL 0.011408585123717785
epoch 612 loss 0.4249081313610077 LR 0.4135361611843109 LKL 0.011371980421245098
epoch 613 loss 0.440

  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 702 loss 0.3461597263813019 LR 0.3340587019920349 LKL 0.012101029977202415
epoch 703 loss 0.3530774414539337 LR 0.34108081459999084 LKL 0.01199664082378149
epoch 704 loss 0.3786203861236572 LR 0.3665209710597992 LKL 0.01209941040724516
epoch 705 loss 0.3868606388568878 LR 0.3747241795063019 LKL 0.012136473320424557
epoch 706 loss 0.3449537456035614 LR 0.3329026699066162 LKL 0.012051088735461235
epoch 707 loss 0.44105958938598633 LR 0.4289311468601227 LKL 0.012128429487347603
epoch 708 loss 0.34999018907546997 LR 0.33790943026542664 LKL 0.012080765329301357
epoch 709 loss 0.39026904106140137 LR 0.37811389565467834 LKL 0.012155154719948769
epoch 710 loss 0.39084509015083313 LR 0.37875503301620483 LKL 0.012090058997273445
epoch 711 loss 0.36923646926879883 LR 0.35723578929901123 LKL 0.012000670656561852
epoch 712 loss 0.375635027885437 LR 0.36356666684150696 LKL 0.012068361975252628
epoch 713 loss 0.39067724347114563 LR 0.37863409519195557 LKL 0.012043156661093235
epoch 714 loss 0.3

  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 801 loss 0.3969365656375885 LR 0.38516661524772644 LKL 0.011769945733249187
epoch 802 loss 0.36659136414527893 LR 0.354870080947876 LKL 0.01172127015888691
epoch 803 loss 0.3159893751144409 LR 0.30415594577789307 LKL 0.011833437718451023
epoch 804 loss 0.44837528467178345 LR 0.436532199382782 LKL 0.011843081563711166
epoch 805 loss 0.3257419466972351 LR 0.3138572573661804 LKL 0.011884702369570732
epoch 806 loss 0.36868956685066223 LR 0.35685765743255615 LKL 0.01183190569281578
epoch 807 loss 0.40467554330825806 LR 0.39278706908226013 LKL 0.011888474225997925
epoch 808 loss 0.4225536286830902 LR 0.4107898473739624 LKL 0.01176377385854721
epoch 809 loss 0.4072401821613312 LR 0.39551377296447754 LKL 0.011726396158337593
epoch 810 loss 0.39266401529312134 LR 0.3808574676513672 LKL 0.011806538328528404
epoch 811 loss 0.3562747538089752 LR 0.3445245921611786 LKL 0.01175016351044178
epoch 812 loss 0.39144253730773926 LR 0.3796554505825043 LKL 0.011787091381847858
epoch 813 loss 0.358263

  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 901 loss 0.3399695158004761 LR 0.3275251090526581 LKL 0.012444416992366314
epoch 902 loss 0.33964189887046814 LR 0.3272295594215393 LKL 0.01241233479231596
epoch 903 loss 0.384274959564209 LR 0.37187659740448 LKL 0.01239834725856781
epoch 904 loss 0.2644011080265045 LR 0.25199687480926514 LKL 0.01240423135459423
epoch 905 loss 0.3244468569755554 LR 0.3120749592781067 LKL 0.01237188559025526
epoch 906 loss 0.32393279671669006 LR 0.31153446435928345 LKL 0.012398331426084042
epoch 907 loss 0.32766830921173096 LR 0.3153141140937805 LKL 0.012354194186627865
epoch 908 loss 0.32890400290489197 LR 0.3165009021759033 LKL 0.012403114698827267
epoch 909 loss 0.36325275897979736 LR 0.35095393657684326 LKL 0.012298816815018654
epoch 910 loss 0.27087926864624023 LR 0.25849804282188416 LKL 0.012381238862872124
epoch 911 loss 0.3309594392776489 LR 0.3186261057853699 LKL 0.012333340011537075
epoch 912 loss 0.31382453441619873 LR 0.3014301061630249 LKL 0.012394415214657784
epoch 913 loss 0.3705743

  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 1001 loss 0.3196072578430176 LR 0.30643510818481445 LKL 0.013172155246138573
epoch 1002 loss 0.33874204754829407 LR 0.3254701793193817 LKL 0.013271864503622055
epoch 1003 loss 0.3420602083206177 LR 0.32879626750946045 LKL 0.013263939879834652
epoch 1004 loss 0.30372196435928345 LR 0.29052770137786865 LKL 0.013194250874221325
epoch 1005 loss 0.2688780128955841 LR 0.25562620162963867 LKL 0.01325180847197771
epoch 1006 loss 0.2756267786026001 LR 0.2623656690120697 LKL 0.013261121697723866
epoch 1007 loss 0.3742363154888153 LR 0.361023485660553 LKL 0.013212834484875202
epoch 1008 loss 0.3100859820842743 LR 0.29676735401153564 LKL 0.013318638317286968
epoch 1009 loss 0.30797111988067627 LR 0.2947629392147064 LKL 0.013208165764808655
epoch 1010 loss 0.28507962822914124 LR 0.271828830242157 LKL 0.0132508035749197
epoch 1011 loss 0.22813169658184052 LR 0.21491417288780212 LKL 0.013217520900070667
epoch 1012 loss 0.30348899960517883 LR 0.2902489900588989 LKL 0.013240017928183079
epoch 101

  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 1101 loss 0.24104353785514832 LR 0.22811061143875122 LKL 0.012932931073009968
epoch 1102 loss 0.27864596247673035 LR 0.2656398117542267 LKL 0.013006137683987617
epoch 1103 loss 0.3697570860385895 LR 0.35690218210220337 LKL 0.012854903936386108
epoch 1104 loss 0.27198052406311035 LR 0.2590036988258362 LKL 0.012976826168596745
epoch 1105 loss 0.30336901545524597 LR 0.2904987633228302 LKL 0.01287024561315775
epoch 1106 loss 0.31683507561683655 LR 0.3038334846496582 LKL 0.013001601211726665
epoch 1107 loss 0.25143110752105713 LR 0.2383885383605957 LKL 0.013042577542364597
epoch 1108 loss 0.34155556559562683 LR 0.3285065293312073 LKL 0.013049039989709854
epoch 1109 loss 0.3437422811985016 LR 0.33079996705055237 LKL 0.012942302040755749
epoch 1110 loss 0.3181594908237457 LR 0.30507543683052063 LKL 0.013084065169095993
epoch 1111 loss 0.3052864074707031 LR 0.2922323942184448 LKL 0.013054007664322853
epoch 1112 loss 0.307523638010025 LR 0.2943820357322693 LKL 0.013141607865691185
epoch 1

  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 1201 loss 0.2367974817752838 LR 0.22338008880615234 LKL 0.013417395763099194
epoch 1202 loss 0.34764212369918823 LR 0.3342677354812622 LKL 0.013374391943216324
epoch 1203 loss 0.30247098207473755 LR 0.28895655274391174 LKL 0.013514439575374126
epoch 1204 loss 0.24179711937904358 LR 0.22842219471931458 LKL 0.013374927453696728
epoch 1205 loss 0.2985644042491913 LR 0.28520703315734863 LKL 0.01335737481713295
epoch 1206 loss 0.3357340097427368 LR 0.32237064838409424 LKL 0.013363366015255451
epoch 1207 loss 0.30063530802726746 LR 0.2873435318470001 LKL 0.013291766867041588
epoch 1208 loss 0.22945505380630493 LR 0.21603599190711975 LKL 0.013419069349765778
epoch 1209 loss 0.34200170636177063 LR 0.3286859393119812 LKL 0.01331575307995081
epoch 1210 loss 0.2972122132778168 LR 0.2839418351650238 LKL 0.013270387426018715
epoch 1211 loss 0.32062655687332153 LR 0.30739304423332214 LKL 0.013233514502644539
epoch 1212 loss 0.2668071687221527 LR 0.2535594701766968 LKL 0.01324769388884306
epoch

  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 1301 loss 0.22796574234962463 LR 0.21435415744781494 LKL 0.013611582107841969
epoch 1302 loss 0.33103275299072266 LR 0.31737229228019714 LKL 0.013660474680364132
epoch 1303 loss 0.2549339532852173 LR 0.24125242233276367 LKL 0.013681525364518166
epoch 1304 loss 0.29609808325767517 LR 0.2823578715324402 LKL 0.013740204274654388
epoch 1305 loss 0.3179624676704407 LR 0.30419135093688965 LKL 0.013771126978099346
epoch 1306 loss 0.28193455934524536 LR 0.26828035712242126 LKL 0.0136541947722435
epoch 1307 loss 0.27546435594558716 LR 0.2617608606815338 LKL 0.013703503645956516
epoch 1308 loss 0.2202245593070984 LR 0.20657673478126526 LKL 0.013647819869220257
epoch 1309 loss 0.2798420190811157 LR 0.2661099135875702 LKL 0.013732091523706913
epoch 1310 loss 0.28497108817100525 LR 0.27130675315856934 LKL 0.013664325699210167
epoch 1311 loss 0.28614091873168945 LR 0.2725738286972046 LKL 0.01356708537787199
epoch 1312 loss 0.30262520909309387 LR 0.28903621435165405 LKL 0.013588991016149521
epo

  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 1401 loss 0.21191063523292542 LR 0.1980876475572586 LKL 0.013822986744344234
epoch 1402 loss 0.2806199789047241 LR 0.2668435573577881 LKL 0.013776407577097416
epoch 1403 loss 0.24746091663837433 LR 0.2337600290775299 LKL 0.013700892217457294
epoch 1404 loss 0.2670336961746216 LR 0.25324106216430664 LKL 0.013792645186185837
epoch 1405 loss 0.24967925250530243 LR 0.23592007160186768 LKL 0.013759185560047626
epoch 1406 loss 0.2827138304710388 LR 0.2689543664455414 LKL 0.013759472407400608
epoch 1407 loss 0.25137051939964294 LR 0.2375616729259491 LKL 0.013808849267661572
epoch 1408 loss 0.31891515851020813 LR 0.3051564395427704 LKL 0.0137587059289217
epoch 1409 loss 0.3319772779941559 LR 0.3181638717651367 LKL 0.013813400641083717
epoch 1410 loss 0.20817933976650238 LR 0.1944006383419037 LKL 0.013778703287243843
epoch 1411 loss 0.3070320785045624 LR 0.29318705201148987 LKL 0.013845017179846764
epoch 1412 loss 0.24436460435390472 LR 0.2304774820804596 LKL 0.01388712227344513
epoch 141

epoch 1500 loss 0.26241636276245117 LR 0.24796253442764282 LKL 0.014453822746872902


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 1501 loss 0.20723426342010498 LR 0.19280049204826355 LKL 0.014433768577873707
epoch 1502 loss 0.2714478671550751 LR 0.256972998380661 LKL 0.014474868774414062
epoch 1503 loss 0.2108115553855896 LR 0.19630573689937592 LKL 0.014505812898278236
epoch 1504 loss 0.2741549015045166 LR 0.25963935256004333 LKL 0.014515537768602371
epoch 1505 loss 0.31102412939071655 LR 0.2964552044868469 LKL 0.014568934217095375
epoch 1506 loss 0.30888593196868896 LR 0.2945035994052887 LKL 0.014382327906787395
epoch 1507 loss 0.34785372018814087 LR 0.33346283435821533 LKL 0.014390872791409492
epoch 1508 loss 0.2631285488605499 LR 0.2487248331308365 LKL 0.014403719455003738
epoch 1509 loss 0.2969791293144226 LR 0.2825690805912018 LKL 0.014410041272640228
epoch 1510 loss 0.3093841075897217 LR 0.29502683877944946 LKL 0.01435725949704647
epoch 1511 loss 0.26377326250076294 LR 0.24944616854190826 LKL 0.014327085576951504
epoch 1512 loss 0.29727351665496826 LR 0.2829779386520386 LKL 0.014295563101768494
epoch 

epoch 1600 loss 0.28958529233932495 LR 0.27485451102256775 LKL 0.01473077293485403


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 1601 loss 0.2672078311443329 LR 0.2524911165237427 LKL 0.014716721139848232
epoch 1602 loss 0.2307816743850708 LR 0.2160934954881668 LKL 0.01468818262219429
epoch 1603 loss 0.24136267602443695 LR 0.226590096950531 LKL 0.014772574417293072
epoch 1604 loss 0.24435290694236755 LR 0.22966009378433228 LKL 0.014692816883325577
epoch 1605 loss 0.2939102053642273 LR 0.279285192489624 LKL 0.014625024050474167
epoch 1606 loss 0.19450268149375916 LR 0.17975586652755737 LKL 0.014746812172234058
epoch 1607 loss 0.22153685986995697 LR 0.2068234235048294 LKL 0.014713440090417862
epoch 1608 loss 0.18687796592712402 LR 0.1722419261932373 LKL 0.014636034145951271
epoch 1609 loss 0.2161453813314438 LR 0.2014782428741455 LKL 0.014667141251266003
epoch 1610 loss 0.24769265949726105 LR 0.23297464847564697 LKL 0.014718016609549522
epoch 1611 loss 0.16391225159168243 LR 0.14920957386493683 LKL 0.014702671207487583
epoch 1612 loss 0.20539675652980804 LR 0.19067791104316711 LKL 0.01471884548664093
epoch 1

epoch 1700 loss 0.2024623602628708 LR 0.1877898871898651 LKL 0.014672476798295975


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 1701 loss 0.250394731760025 LR 0.23577363789081573 LKL 0.014621094800531864
epoch 1702 loss 0.20519474148750305 LR 0.1904364824295044 LKL 0.01475826557725668
epoch 1703 loss 0.2794140875339508 LR 0.26477527618408203 LKL 0.014638821594417095
epoch 1704 loss 0.27296698093414307 LR 0.2583222985267639 LKL 0.014644690789282322
epoch 1705 loss 0.14763055741786957 LR 0.13301558792591095 LKL 0.014614969491958618
epoch 1706 loss 0.24469883739948273 LR 0.23010705411434174 LKL 0.014591777697205544
epoch 1707 loss 0.24027201533317566 LR 0.22569206357002258 LKL 0.014579949900507927
epoch 1708 loss 0.25727519392967224 LR 0.2427259385585785 LKL 0.0145492535084486
epoch 1709 loss 0.18659137189388275 LR 0.17203281819820404 LKL 0.014558556489646435
epoch 1710 loss 0.21875587105751038 LR 0.20423440635204315 LKL 0.014521459117531776
epoch 1711 loss 0.285165399312973 LR 0.2707441449165344 LKL 0.014421259053051472
epoch 1712 loss 0.2645852863788605 LR 0.25008147954940796 LKL 0.01450379379093647
epoch 

epoch 1800 loss 0.24709245562553406 LR 0.2320956438779831 LKL 0.01499681081622839


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 1801 loss 0.18265265226364136 LR 0.16762864589691162 LKL 0.015024002641439438
epoch 1802 loss 0.22634270787239075 LR 0.21134965121746063 LKL 0.014993056654930115
epoch 1803 loss 0.25520405173301697 LR 0.24012872576713562 LKL 0.015075317583978176
epoch 1804 loss 0.16619877517223358 LR 0.1511019468307495 LKL 0.015096830204129219
epoch 1805 loss 0.18296034634113312 LR 0.1679060012102127 LKL 0.015054349787533283
epoch 1806 loss 0.1554613709449768 LR 0.14044399559497833 LKL 0.015017382800579071
epoch 1807 loss 0.19561246037483215 LR 0.18063472211360931 LKL 0.014977742917835712
epoch 1808 loss 0.17514251172542572 LR 0.16008299589157104 LKL 0.015059513039886951
epoch 1809 loss 0.21647527813911438 LR 0.20137161016464233 LKL 0.015103671699762344
epoch 1810 loss 0.19486619532108307 LR 0.17965905368328094 LKL 0.01520714070647955
epoch 1811 loss 0.20325258374214172 LR 0.18821367621421814 LKL 0.01503890659660101
epoch 1812 loss 0.24223504960536957 LR 0.22717349231243134 LKL 0.0150615517050027

epoch 1900 loss 0.19984929263591766 LR 0.18462629616260529 LKL 0.015222991816699505
77


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 1901 loss 0.24102330207824707 LR 0.22574394941329956 LKL 0.015279359184205532
epoch 1902 loss 0.21757526695728302 LR 0.20230776071548462 LKL 0.01526750810444355
epoch 1903 loss 0.17917880415916443 LR 0.16391824185848236 LKL 0.01526056881994009
epoch 1904 loss 0.2031426876783371 LR 0.18794527649879456 LKL 0.015197409316897392
epoch 1905 loss 0.22114251554012299 LR 0.2059488296508789 LKL 0.015193684957921505
epoch 1906 loss 0.2134019285440445 LR 0.19820906221866608 LKL 0.01519286073744297
epoch 1907 loss 0.16756144165992737 LR 0.1523500382900238 LKL 0.015211409889161587
epoch 1908 loss 0.19728557765483856 LR 0.18206487596035004 LKL 0.015220705419778824
epoch 1909 loss 0.18437306582927704 LR 0.16909661889076233 LKL 0.015276450663805008
epoch 1910 loss 0.16600586473941803 LR 0.15072934329509735 LKL 0.015276527032256126
epoch 1911 loss 0.21489085257053375 LR 0.19974751770496368 LKL 0.015143328346312046
epoch 1912 loss 0.17093947529792786 LR 0.1557871252298355 LKL 0.015152346342802048


epoch 2000 loss 0.19799967110157013 LR 0.18276210129261017 LKL 0.015237570740282536


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 2001 loss 0.24403692781925201 LR 0.22879798710346222 LKL 0.015238937921822071
epoch 2002 loss 0.17926697432994843 LR 0.1639007329940796 LKL 0.015366245061159134
epoch 2003 loss 0.15029948949813843 LR 0.13493850827217102 LKL 0.015360982157289982
epoch 2004 loss 0.21149463951587677 LR 0.1961803436279297 LKL 0.015314298681914806
epoch 2005 loss 0.21363888680934906 LR 0.19818639755249023 LKL 0.01545249205082655
epoch 2006 loss 0.24761226773262024 LR 0.23220151662826538 LKL 0.015410754829645157
epoch 2007 loss 0.2003927230834961 LR 0.18493720889091492 LKL 0.015455521643161774
epoch 2008 loss 0.26412418484687805 LR 0.24859021604061127 LKL 0.015533956699073315
epoch 2009 loss 0.18514883518218994 LR 0.1696508377790451 LKL 0.015497993677854538
epoch 2010 loss 0.22346653044223785 LR 0.20789487659931183 LKL 0.01557165291160345
epoch 2011 loss 0.26677218079566956 LR 0.25135400891304016 LKL 0.01541818119585514
epoch 2012 loss 0.1826210469007492 LR 0.16706427931785583 LKL 0.015556774102151394


epoch 2100 loss 0.119285449385643 LR 0.1033056378364563 LKL 0.015979811549186707


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 2101 loss 0.20492954552173615 LR 0.18910734355449677 LKL 0.01582220382988453
epoch 2102 loss 0.137380912899971 LR 0.1215207576751709 LKL 0.01586015522480011
epoch 2103 loss 0.19444639980793 LR 0.17864176630973816 LKL 0.015804633498191833
epoch 2104 loss 0.1774516999721527 LR 0.16152697801589966 LKL 0.015924720093607903
epoch 2105 loss 0.21542812883853912 LR 0.19951170682907104 LKL 0.01591642014682293
epoch 2106 loss 0.18534228205680847 LR 0.1693880558013916 LKL 0.015954231843352318
epoch 2107 loss 0.20752054452896118 LR 0.19159725308418274 LKL 0.015923289582133293
epoch 2108 loss 0.18245448172092438 LR 0.16639702022075653 LKL 0.016057459637522697
epoch 2109 loss 0.2308281660079956 LR 0.21487820148468018 LKL 0.015949958935379982
epoch 2110 loss 0.14642758667469025 LR 0.13036304712295532 LKL 0.016064541414380074
epoch 2111 loss 0.24513733386993408 LR 0.22912545502185822 LKL 0.016011882573366165
epoch 2112 loss 0.21716360747814178 LR 0.20122787356376648 LKL 0.015935737639665604
epoc

epoch 2200 loss 0.1635057032108307 LR 0.14740638434886932 LKL 0.016099322587251663
54


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 2201 loss 0.1448560208082199 LR 0.12868668138980865 LKL 0.016169343143701553
epoch 2202 loss 0.12703529000282288 LR 0.11096712201833725 LKL 0.016068175435066223
epoch 2203 loss 0.19648253917694092 LR 0.18041718006134033 LKL 0.016065364703536034
epoch 2204 loss 0.1767040640115738 LR 0.16062811017036438 LKL 0.016075950115919113
epoch 2205 loss 0.2004617601633072 LR 0.18449856340885162 LKL 0.015963194891810417
epoch 2206 loss 0.16842137277126312 LR 0.15244808793067932 LKL 0.015973281115293503
epoch 2207 loss 0.1741102635860443 LR 0.15817581117153168 LKL 0.01593445986509323
epoch 2208 loss 0.1242968812584877 LR 0.1082412451505661 LKL 0.0160556361079216
epoch 2209 loss 0.17901206016540527 LR 0.16304241120815277 LKL 0.01596965081989765
epoch 2210 loss 0.2076997458934784 LR 0.1917821168899536 LKL 0.015917623415589333
epoch 2211 loss 0.1861165165901184 LR 0.17026589810848236 LKL 0.015850620344281197
epoch 2212 loss 0.18232892453670502 LR 0.16645891964435577 LKL 0.015870004892349243
epoch

epoch 2300 loss 0.19295156002044678 LR 0.17692089080810547 LKL 0.016030674800276756
42


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 2301 loss 0.15922293066978455 LR 0.1432485431432724 LKL 0.0159743819385767
epoch 2302 loss 0.1292731612920761 LR 0.11325238645076752 LKL 0.016020772978663445
epoch 2303 loss 0.16634513437747955 LR 0.15038374066352844 LKL 0.01596139185130596
epoch 2304 loss 0.11774024367332458 LR 0.10173098742961884 LKL 0.01600925624370575
epoch 2305 loss 0.11353466659784317 LR 0.09742894023656845 LKL 0.01610572449862957
epoch 2306 loss 0.14208006858825684 LR 0.12604984641075134 LKL 0.01603022962808609
epoch 2307 loss 0.1783481240272522 LR 0.162288099527359 LKL 0.016060031950473785
epoch 2308 loss 0.1234801635146141 LR 0.10745660960674286 LKL 0.016023552045226097
epoch 2309 loss 0.1954747587442398 LR 0.17941193282604218 LKL 0.016062825918197632
epoch 2310 loss 0.23018065094947815 LR 0.21408522129058838 LKL 0.01609543152153492
epoch 2311 loss 0.16004250943660736 LR 0.14385411143302917 LKL 0.016188403591513634
epoch 2312 loss 0.21795400977134705 LR 0.2018805593252182 LKL 0.016073448583483696
epoch 2

81
epoch 2401 loss 0.13342516124248505 LR 0.11699523776769638 LKL 0.016429927200078964


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 2402 loss 0.2075263261795044 LR 0.1912718564271927 LKL 0.016254466027021408
epoch 2403 loss 0.15515539050102234 LR 0.13875943422317505 LKL 0.01639596000313759
epoch 2404 loss 0.22191950678825378 LR 0.20559658110141754 LKL 0.01632292941212654
epoch 2405 loss 0.124582439661026 LR 0.10826341807842255 LKL 0.016319017857313156
epoch 2406 loss 0.0912502110004425 LR 0.07486218214035034 LKL 0.016388028860092163
epoch 2407 loss 0.2718889117240906 LR 0.2556120753288269 LKL 0.016276834532618523
epoch 2408 loss 0.1185673177242279 LR 0.10220403969287872 LKL 0.016363274306058884
epoch 2409 loss 0.12618671357631683 LR 0.10989662259817123 LKL 0.01629009284079075
epoch 2410 loss 0.15329359471797943 LR 0.13700535893440247 LKL 0.016288241371512413
epoch 2411 loss 0.16777373850345612 LR 0.15151505172252655 LKL 0.016258684918284416
epoch 2412 loss 0.15065377950668335 LR 0.1343006193637848 LKL 0.01635316200554371
epoch 2413 loss 0.18451420962810516 LR 0.16816234588623047 LKL 0.016351858153939247
epoch

68
epoch 2501 loss 0.06539943069219589 LR 0.049195729196071625 LKL 0.016203703358769417


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 2502 loss 0.11308139562606812 LR 0.0969935730099678 LKL 0.016087820753455162
epoch 2503 loss 0.11155413836240768 LR 0.09528757631778717 LKL 0.016266563907265663
epoch 2504 loss 0.15543386340141296 LR 0.13926945626735687 LKL 0.016164414584636688
epoch 2505 loss 0.1654791533946991 LR 0.14918000996112823 LKL 0.016299141570925713
epoch 2506 loss 0.139547199010849 LR 0.12345181405544281 LKL 0.01609537936747074
epoch 2507 loss 0.1064089983701706 LR 0.09024190902709961 LKL 0.016167087480425835
epoch 2508 loss 0.17081856727600098 LR 0.1546991914510727 LKL 0.016119377687573433
epoch 2509 loss 0.15570972859859467 LR 0.13966195285320282 LKL 0.016047781333327293
epoch 2510 loss 0.11726908385753632 LR 0.10100585222244263 LKL 0.01626322790980339
epoch 2511 loss 0.12473442405462265 LR 0.10841333866119385 LKL 0.016321083530783653
epoch 2512 loss 0.16744549572467804 LR 0.15126478672027588 LKL 0.01618070714175701
epoch 2513 loss 0.09463120996952057 LR 0.07843282073736191 LKL 0.01619839295744896
ep

47
epoch 2601 loss 0.11721460521221161 LR 0.10113850235939026 LKL 0.0160761009901762


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 2602 loss 0.11723509430885315 LR 0.10109458863735199 LKL 0.01614050567150116
epoch 2603 loss 0.16952382028102875 LR 0.15346798300743103 LKL 0.016055842861533165
epoch 2604 loss 0.1801830232143402 LR 0.1641625314950943 LKL 0.01602049358189106
epoch 2605 loss 0.19588671624660492 LR 0.1797151118516922 LKL 0.01617160066962242
epoch 2606 loss 0.14268222451210022 LR 0.1265157014131546 LKL 0.016166524961590767
epoch 2607 loss 0.13817428052425385 LR 0.12188230454921722 LKL 0.01629198156297207
epoch 2608 loss 0.1780073493719101 LR 0.16169863939285278 LKL 0.016308708116412163
epoch 2609 loss 0.19715577363967896 LR 0.18087297677993774 LKL 0.01628279685974121
epoch 2610 loss 0.18221426010131836 LR 0.16590863466262817 LKL 0.01630561798810959
epoch 2611 loss 0.1027296707034111 LR 0.08629653602838516 LKL 0.01643313467502594
epoch 2612 loss 0.0706532821059227 LR 0.05424797534942627 LKL 0.01640530489385128
epoch 2613 loss 0.14243833720684052 LR 0.12616005539894104 LKL 0.016278281807899475
epoch 2

88
epoch 2701 loss 0.11603012681007385 LR 0.09938511252403259 LKL 0.016645018011331558


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 2702 loss 0.16516733169555664 LR 0.14854878187179565 LKL 0.01661854423582554
epoch 2703 loss 0.17759516835212708 LR 0.16097573935985565 LKL 0.01661943458020687
epoch 2704 loss 0.1426106095314026 LR 0.12600891292095184 LKL 0.016601696610450745
epoch 2705 loss 0.1286698579788208 LR 0.11203695833683014 LKL 0.01663290336728096
epoch 2706 loss 0.2084604650735855 LR 0.191819965839386 LKL 0.016640497371554375
epoch 2707 loss 0.10958994179964066 LR 0.09293067455291748 LKL 0.016659265384078026
epoch 2708 loss 0.12443389743566513 LR 0.1077234074473381 LKL 0.016710491850972176
epoch 2709 loss 0.1606188267469406 LR 0.14397038519382477 LKL 0.016648437827825546
epoch 2710 loss 0.1528460681438446 LR 0.13616439700126648 LKL 0.016681669279932976
epoch 2711 loss 0.11291497945785522 LR 0.09632348269224167 LKL 0.016591496765613556
epoch 2712 loss 0.19701451063156128 LR 0.18040427565574646 LKL 0.01661023497581482
epoch 2713 loss 0.0863024890422821 LR 0.069670170545578 LKL 0.016632314771413803
epoch 2

73
epoch 2801 loss 0.13784389197826385 LR 0.12131045013666153 LKL 0.016533447429537773


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 2802 loss 0.09692579507827759 LR 0.08031497150659561 LKL 0.016610827296972275
epoch 2803 loss 0.13318628072738647 LR 0.11662793904542923 LKL 0.016558337956666946
epoch 2804 loss 0.1730509251356125 LR 0.15657265484333038 LKL 0.016478264704346657
epoch 2805 loss 0.08222702145576477 LR 0.06564339995384216 LKL 0.01658361777663231
epoch 2806 loss 0.05344144254922867 LR 0.03681337088346481 LKL 0.016628073528409004
epoch 2807 loss 0.08260659873485565 LR 0.06598541140556335 LKL 0.016621189191937447
epoch 2808 loss 0.10284528136253357 LR 0.08628822863101959 LKL 0.016557050868868828
epoch 2809 loss 0.07592456042766571 LR 0.0594811886548996 LKL 0.016443375498056412
epoch 2810 loss 0.11679300665855408 LR 0.10027773678302765 LKL 0.016515273600816727
epoch 2811 loss 0.07756362855434418 LR 0.060971736907958984 LKL 0.016591889783740044
epoch 2812 loss 0.08260426670312881 LR 0.0660170167684555 LKL 0.01658724807202816
epoch 2813 loss 0.07805677503347397 LR 0.061424657702445984 LKL 0.01663211919367

  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 2901 loss 0.08899548649787903 LR 0.07197000831365585 LKL 0.017025480046868324
epoch 2902 loss 0.07056015729904175 LR 0.0535215362906456 LKL 0.017038619145751
epoch 2903 loss 0.1041790023446083 LR 0.08718068152666092 LKL 0.01699831895530224
epoch 2904 loss 0.0826057493686676 LR 0.06568626314401627 LKL 0.016919488087296486
epoch 2905 loss 0.03953317180275917 LR 0.022620253264904022 LKL 0.01691291853785515
epoch 2906 loss 0.059810616075992584 LR 0.04292472451925278 LKL 0.016885889694094658
epoch 2907 loss 0.12865190207958221 LR 0.11164601147174835 LKL 0.01700589619576931
epoch 2908 loss 0.16925950348377228 LR 0.1522827297449112 LKL 0.016976771876215935
epoch 2909 loss 0.1924646496772766 LR 0.17549248039722443 LKL 0.016972161829471588
epoch 2910 loss 0.10433387756347656 LR 0.08743977546691895 LKL 0.016894103959202766
epoch 2911 loss 0.15702849626541138 LR 0.14020229876041412 LKL 0.016826201230287552
epoch 2912 loss 0.17728134989738464 LR 0.16037137806415558 LKL 0.016909975558519363
e

epoch 3000 loss 0.15358644723892212 LR 0.13649149239063263 LKL 0.017094962298870087
71


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 3001 loss 0.0909840315580368 LR 0.07382168620824814 LKL 0.017162347212433815
epoch 3002 loss 0.11752994358539581 LR 0.10040348023176193 LKL 0.01712646335363388
epoch 3003 loss 0.14744731783866882 LR 0.13029032945632935 LKL 0.017156995832920074
epoch 3004 loss 0.11395213007926941 LR 0.09685778617858887 LKL 0.017094340175390244
epoch 3005 loss 0.11067990213632584 LR 0.09351614117622375 LKL 0.017163759097456932
epoch 3006 loss 0.1508553922176361 LR 0.1336863785982132 LKL 0.017169013619422913
epoch 3007 loss 0.1271277368068695 LR 0.10996919870376587 LKL 0.01715853624045849
epoch 3008 loss 0.14246079325675964 LR 0.12534093856811523 LKL 0.01711985282599926
epoch 3009 loss 0.08486904948949814 LR 0.06775851547718048 LKL 0.017110535874962807
epoch 3010 loss 0.0879153162240982 LR 0.07073377072811127 LKL 0.017181547358632088
epoch 3011 loss 0.09071984887123108 LR 0.07353844493627548 LKL 0.017181402072310448
epoch 3012 loss 0.18487060070037842 LR 0.167768657207489 LKL 0.017101941630244255
ep

epoch 3100 loss 0.12251194566488266 LR 0.10523712635040283 LKL 0.01727481745183468
50


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 3101 loss 0.10661137104034424 LR 0.08921003341674805 LKL 0.01740133948624134
epoch 3102 loss 0.08560344576835632 LR 0.06833714991807938 LKL 0.017266299575567245
epoch 3103 loss 0.10876189917325974 LR 0.09148930013179779 LKL 0.017272600904107094
epoch 3104 loss 0.12461447715759277 LR 0.10735297203063965 LKL 0.017261508852243423
epoch 3105 loss 0.1388934999704361 LR 0.12156566232442856 LKL 0.01732783205807209
epoch 3106 loss 0.11832553893327713 LR 0.10103143006563187 LKL 0.017294107005000114
epoch 3107 loss 0.13849548995494843 LR 0.12120678275823593 LKL 0.017288705334067345
epoch 3108 loss 0.0990917980670929 LR 0.08178290724754333 LKL 0.01730888895690441
epoch 3109 loss 0.10970060527324677 LR 0.09246683120727539 LKL 0.017233775928616524
epoch 3110 loss 0.0867030993103981 LR 0.06953684240579605 LKL 0.0171662587672472
epoch 3111 loss 0.14458584785461426 LR 0.12732239067554474 LKL 0.01726345904171467
epoch 3112 loss 0.04329629987478256 LR 0.025983214378356934 LKL 0.01731308363378048
e

epoch 3200 loss 0.1025877296924591 LR 0.08532346785068512 LKL 0.017264263704419136
51


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 3201 loss 0.07054592669010162 LR 0.05319084972143173 LKL 0.017355075106024742
epoch 3202 loss 0.14131851494312286 LR 0.12414834648370743 LKL 0.017170162871479988
epoch 3203 loss 0.13279160857200623 LR 0.11554348468780518 LKL 0.017248116433620453
epoch 3204 loss 0.12718848884105682 LR 0.10995277762413025 LKL 0.017235713079571724
epoch 3205 loss 0.05569075047969818 LR 0.03838939219713211 LKL 0.01730135828256607
epoch 3206 loss 0.12897221744060516 LR 0.11165723204612732 LKL 0.017314989119768143
epoch 3207 loss 0.12468776851892471 LR 0.10738249123096466 LKL 0.017305275425314903
epoch 3208 loss 0.12178726494312286 LR 0.10450710356235504 LKL 0.017280161380767822
epoch 3209 loss 0.12016718834638596 LR 0.10293714702129364 LKL 0.017230043187737465
epoch 3210 loss 0.08181685954332352 LR 0.06462515145540237 LKL 0.017191706225275993
epoch 3211 loss 0.09377340227365494 LR 0.07643253356218338 LKL 0.01734086684882641
epoch 3212 loss 0.1062285304069519 LR 0.08885717391967773 LKL 0.01737135276198

60
epoch 3301 loss 0.12945546209812164 LR 0.11177045106887817 LKL 0.01768500916659832


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 3302 loss 0.11844475567340851 LR 0.10079555213451385 LKL 0.017649203538894653
epoch 3303 loss 0.03778688609600067 LR 0.020206689834594727 LKL 0.017580196261405945
epoch 3304 loss 0.029454305768013 LR 0.011760801076889038 LKL 0.017693504691123962
epoch 3305 loss 0.07125695049762726 LR 0.053625501692295074 LKL 0.017631450667977333
epoch 3306 loss 0.11793650686740875 LR 0.10021297633647919 LKL 0.017723532393574715
epoch 3307 loss 0.1103607714176178 LR 0.09266862273216248 LKL 0.017692148685455322
epoch 3308 loss 0.19432327151298523 LR 0.17677326500415802 LKL 0.01755000837147236
epoch 3309 loss 0.06706036627292633 LR 0.04942212998867035 LKL 0.017638232558965683
epoch 3310 loss 0.09320725500583649 LR 0.0757351964712143 LKL 0.01747206039726734
epoch 3311 loss 0.1378181129693985 LR 0.12016956508159637 LKL 0.017648544162511826
epoch 3312 loss 0.12702031433582306 LR 0.10940498113632202 LKL 0.017615333199501038
epoch 3313 loss 0.10364890098571777 LR 0.0861421674489975 LKL 0.0175067298114299

  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 3401 loss 0.09273828566074371 LR 0.07506094872951508 LKL 0.01767733320593834
epoch 3402 loss 0.17197087407112122 LR 0.15435518324375153 LKL 0.017615685239434242
epoch 3403 loss 0.08729434758424759 LR 0.06972400099039078 LKL 0.017570344731211662
epoch 3404 loss 0.03594652935862541 LR 0.01817530393600464 LKL 0.017771225422620773
epoch 3405 loss 0.11153139919042587 LR 0.09401378035545349 LKL 0.017517616972327232
epoch 3406 loss 0.07226647436618805 LR 0.0545523464679718 LKL 0.01771412417292595
epoch 3407 loss 0.06244231015443802 LR 0.04463814198970795 LKL 0.017804166302084923
epoch 3408 loss 0.12002071738243103 LR 0.10246530175209045 LKL 0.017555413767695427
epoch 3409 loss 0.028163215145468712 LR 0.010461896657943726 LKL 0.017701318487524986
epoch 3410 loss 0.05430477857589722 LR 0.03656384348869324 LKL 0.01774093694984913
epoch 3411 loss 0.11454842984676361 LR 0.09684638679027557 LKL 0.017702044919133186
epoch 3412 loss 0.09278350323438644 LR 0.07515320181846619 LKL 0.0176303032785

epoch 3500 loss 0.1319936066865921 LR 0.1145835891366005 LKL 0.01741001568734646


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 3501 loss 0.059867821633815765 LR 0.04241597652435303 LKL 0.017451846972107887
epoch 3502 loss 0.08166646957397461 LR 0.06415536254644394 LKL 0.01751110330224037
epoch 3503 loss 0.08287833631038666 LR 0.06535407155752182 LKL 0.01752426289021969
epoch 3504 loss 0.08550409227609634 LR 0.06798593699932098 LKL 0.01751815341413021
epoch 3505 loss 0.0659102126955986 LR 0.04840036481618881 LKL 0.01750984787940979
epoch 3506 loss 0.09298522025346756 LR 0.07543270289897919 LKL 0.017552519217133522
epoch 3507 loss 0.14066429436206818 LR 0.12320540845394135 LKL 0.017458880320191383
epoch 3508 loss 0.09770673513412476 LR 0.08022399246692657 LKL 0.017482740804553032
epoch 3509 loss 0.06763896346092224 LR 0.0501164048910141 LKL 0.01752256229519844
epoch 3510 loss 0.06640848517417908 LR 0.04893682152032852 LKL 0.017471665516495705
epoch 3511 loss 0.044580161571502686 LR 0.027023546397686005 LKL 0.01755661331117153
epoch 3512 loss 0.041817232966423035 LR 0.024319544434547424 LKL 0.01749768666923

epoch 3600 loss 0.053540851920843124 LR 0.035775065422058105 LKL 0.01776578649878502
77


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 3601 loss 0.08104237914085388 LR 0.06337932497262955 LKL 0.017663052305579185
epoch 3602 loss 0.10261186212301254 LR 0.08498503267765045 LKL 0.01762682944536209
epoch 3603 loss 0.11200186610221863 LR 0.09431733936071396 LKL 0.01768452301621437
epoch 3604 loss 0.05022828280925751 LR 0.03238671272993088 LKL 0.01784157194197178
epoch 3605 loss 0.10532431304454803 LR 0.0875905305147171 LKL 0.017733778804540634
epoch 3606 loss 0.08156543970108032 LR 0.06397871673107147 LKL 0.01758672297000885
epoch 3607 loss 0.0978960171341896 LR 0.08013483881950378 LKL 0.01776117831468582
epoch 3608 loss 0.1077924519777298 LR 0.0901019498705864 LKL 0.017690500244498253
epoch 3609 loss 0.03371189534664154 LR 0.016045764088630676 LKL 0.017666129395365715
epoch 3610 loss 0.0930773913860321 LR 0.07535025477409363 LKL 0.017727138474583626
epoch 3611 loss 0.11062397807836533 LR 0.09292298555374146 LKL 0.01770099066197872
epoch 3612 loss 0.067051962018013 LR 0.04926559329032898 LKL 0.017786366865038872
epoc

epoch 3700 loss 0.0873754546046257 LR 0.06934844702482224 LKL 0.018027009442448616


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 3701 loss 0.07364198565483093 LR 0.05563443899154663 LKL 0.018007544800639153
epoch 3702 loss 0.09510109573602676 LR 0.07709970325231552 LKL 0.018001390621066093
epoch 3703 loss 0.035662535578012466 LR 0.017542503774166107 LKL 0.01812003180384636
epoch 3704 loss 0.055731795728206635 LR 0.03773459792137146 LKL 0.017997197806835175
epoch 3705 loss 0.08101555705070496 LR 0.0630539208650589 LKL 0.017961638048291206
epoch 3706 loss 0.05214148387312889 LR 0.0341699942946434 LKL 0.01797148957848549
epoch 3707 loss 0.012945132330060005 LR -0.005067512392997742 LKL 0.018012644723057747
epoch 3708 loss 0.024716444313526154 LR 0.0066832974553108215 LKL 0.018033146858215332
epoch 3709 loss 0.04749248921871185 LR 0.029426082968711853 LKL 0.01806640438735485
epoch 3710 loss 0.08948910981416702 LR 0.07141232490539551 LKL 0.018076783046126366
epoch 3711 loss 0.04197161644697189 LR 0.02385273575782776 LKL 0.018118878826498985
epoch 3712 loss 0.0770416334271431 LR 0.0590139701962471 LKL 0.01802766

epoch 3800 loss 0.030501123517751694 LR 0.012454308569431305 LKL 0.01804681494832039
78
epoch 3801 loss 0.11559244245290756 LR 0.09751205146312714 LKL 0.018080390989780426
epoch

  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


 3802 loss -0.004175277426838875 LR -0.02224542200565338 LKL 0.018070144578814507
epoch 3803 loss 0.07987430691719055 LR 0.06180621683597565 LKL 0.018068091943860054
epoch 3804 loss -0.010471140965819359 LR -0.028431817889213562 LKL 0.017960676923394203
epoch 3805 loss 0.09701195359230042 LR 0.07887659221887589 LKL 0.01813536323606968
epoch 3806 loss 0.08897732198238373 LR 0.07098053395748138 LKL 0.017996788024902344
epoch 3807 loss 0.06458517163991928 LR 0.046520039439201355 LKL 0.018065134063363075
epoch 3808 loss -0.04029368609189987 LR -0.05841733515262604 LKL 0.018123650923371315
epoch 3809 loss 0.08876176178455353 LR 0.07081004232168198 LKL 0.01795172318816185
epoch 3810 loss 0.10286585986614227 LR 0.08489464223384857 LKL 0.017971221357584
epoch 3811 loss 0.04312621057033539 LR 0.025033600628376007 LKL 0.01809260994195938
epoch 3812 loss 0.047707460820674896 LR 0.029745958745479584 LKL 0.017961502075195312
epoch 3813 loss 0.07400573790073395 LR 0.05599994957447052 LKL 0.018005790

112
epoch 3901 loss 0.07232075184583664 LR 0.05415169894695282 LKL 0.01816905103623867


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 3902 loss 0.017133988440036774 LR -0.0009661838412284851 LKL 0.01810017228126526
epoch 3903 loss 0.054076164960861206 LR 0.03587797284126282 LKL 0.01819819211959839
epoch 3904 loss 0.14323152601718903 LR 0.12520459294319153 LKL 0.018026936799287796
epoch 3905 loss 0.09607649594545364 LR 0.07793745398521423 LKL 0.01813904382288456
epoch 3906 loss 0.037478480488061905 LR 0.019293569028377533 LKL 0.018184911459684372
epoch 3907 loss 0.08349145948886871 LR 0.06540112942457199 LKL 0.018090328201651573
epoch 3908 loss 0.060860954225063324 LR 0.04271918535232544 LKL 0.018141767010092735
epoch 3909 loss 0.04110153764486313 LR 0.02293478697538376 LKL 0.01816675066947937
epoch 3910 loss 0.14741301536560059 LR 0.1293376386165619 LKL 0.018075373023748398
epoch 3911 loss 0.089018814265728 LR 0.07093201577663422 LKL 0.01808679848909378
epoch 3912 loss 0.10073743015527725 LR 0.08258673548698425 LKL 0.01815069653093815
epoch 3913 loss 0.06128210574388504 LR 0.04309159517288208 LKL 0.018190510571

epoch 4000 loss 0.04775372892618179 LR 0.02933754026889801 LKL 0.018416188657283783
40


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 4001 loss 0.048074278980493546 LR 0.029682151973247528 LKL 0.018392127007246017
epoch 4002 loss 0.02914469875395298 LR 0.010747775435447693 LKL 0.018396923318505287
epoch 4003 loss 0.005704348906874657 LR -0.01274089515209198 LKL 0.018445244058966637
epoch 4004 loss 0.03480400890111923 LR 0.016374677419662476 LKL 0.018429333344101906
epoch 4005 loss 0.03376470506191254 LR 0.01531771570444107 LKL 0.018446989357471466
epoch 4006 loss 0.03825081139802933 LR 0.01986575871706009 LKL 0.01838505268096924
epoch 4007 loss 0.042799606919288635 LR 0.0244036465883255 LKL 0.018395958468317986
epoch 4008 loss 0.04081352800130844 LR 0.022393308579921722 LKL 0.018420221284031868
epoch 4009 loss 0.10363423824310303 LR 0.0852476954460144 LKL 0.018386544659733772
epoch 4010 loss 0.04930555820465088 LR 0.03092809021472931 LKL 0.01837746798992157
epoch 4011 loss 0.112137071788311 LR 0.09367135167121887 LKL 0.018465720117092133
epoch 4012 loss 0.05097072571516037 LR 0.03239759802818298 LKL 0.018573127

epoch 4099 loss 0.05128735676407814 LR 0.03288165479898453 LKL 0.018405701965093613
epoch 4100 loss 0.059477850794792175 LR 0.041143812239170074 LKL 0.01833404041826725


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 4101 loss 0.018675753846764565 LR 0.00038911402225494385 LKL 0.01828663982450962
epoch 4102 loss 0.02579193003475666 LR 0.007362857460975647 LKL 0.018429072573781013
epoch 4103 loss 0.02476838044822216 LR 0.0064067840576171875 LKL 0.018361596390604973
epoch 4104 loss 0.061233215034008026 LR 0.04286075383424759 LKL 0.018372459337115288
epoch 4105 loss 0.020454376935958862 LR 0.002137094736099243 LKL 0.01831728219985962
epoch 4106 loss 0.06367596238851547 LR 0.045401498675346375 LKL 0.018274463713169098
epoch 4107 loss 0.06902126222848892 LR 0.050528258085250854 LKL 0.018493004143238068
epoch 4108 loss 0.07718731462955475 LR 0.05888994038105011 LKL 0.01829737424850464
epoch 4109 loss 0.06959544867277145 LR 0.051180943846702576 LKL 0.01841450296342373
epoch 4110 loss 0.02902250550687313 LR 0.010565578937530518 LKL 0.018456926569342613
epoch 4111 loss 0.03727201744914055 LR 0.018788397312164307 LKL 0.018483620136976242
epoch 4112 loss 0.07667147368192673 LR 0.05820678174495697 LKL 0.

epoch 4200 loss 0.10108155012130737 LR 0.08254964649677277 LKL 0.01853189989924431
88
epoch 4201 loss 0.019383031874895096 LR 0.0009295344352722168 LKL 0.01845349743962288


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 4202 loss -0.006175292655825615 LR -0.02463001012802124 LKL 0.018454717472195625
epoch 4203 loss 0.051907651126384735 LR 0.0334334671497345 LKL 0.01847418211400509
epoch 4204 loss 0.11058340966701508 LR 0.09218502044677734 LKL 0.018398385494947433
epoch 4205 loss -0.03309246152639389 LR -0.051552996039390564 LKL 0.018460532650351524
epoch 4206 loss 0.034056708216667175 LR 0.015562497079372406 LKL 0.01849420927464962
epoch 4207 loss 0.034315936267375946 LR 0.01579069346189499 LKL 0.018525240942835808
epoch 4208 loss 0.07898905128240585 LR 0.060583338141441345 LKL 0.018405715003609657
epoch 4209 loss -0.003153029829263687 LR -0.021714672446250916 LKL 0.01856164261698723
epoch 4210 loss 0.036603059619665146 LR 0.01809859275817871 LKL 0.018504466861486435
epoch 4211 loss 0.040713563561439514 LR 0.022133715450763702 LKL 0.01857984997332096
epoch 4212 loss 0.07057727128267288 LR 0.05210695415735245 LKL 0.018470315262675285
epoch 4213 loss 0.0671190470457077 LR 0.04869091510772705 LKL 0

epoch 4298 loss 0.05634971708059311 LR 0.03763454407453537 LKL 0.01871517300605774
epoch 4299 loss 0.030664535239338875 LR 0.0119963139295578 LKL 0.018668221309781075
epoch 4300 loss 0.07940039038658142 LR 0.0607713907957077 LKL 0.018629001453518867
107
epoch 4301 loss 0.0303809754550457 LR 0.011751271784305573 LKL 0.018629703670740128


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 4302 loss 0.034821946173906326 LR 0.01619786024093628 LKL 0.018624085932970047
epoch 4303 loss -0.018676763400435448 LR -0.037437304854393005 LKL 0.018760541453957558
epoch 4304 loss 0.050613343715667725 LR 0.03206106275320053 LKL 0.018552279099822044
epoch 4305 loss 0.04605896770954132 LR 0.02733083814382553 LKL 0.01872812770307064
epoch 4306 loss 0.07409286499023438 LR 0.05543772876262665 LKL 0.018655134364962578
epoch 4307 loss 0.04408235475420952 LR 0.02540101855993271 LKL 0.01868133619427681
epoch 4308 loss 0.03030702844262123 LR 0.011562906205654144 LKL 0.018744122236967087
epoch 4309 loss 0.037083737552165985 LR 0.018434956669807434 LKL 0.018648779019713402
epoch 4310 loss 0.05135352164506912 LR 0.032673709094524384 LKL 0.018679814413189888
epoch 4311 loss 0.009205363690853119 LR -0.009505391120910645 LKL 0.018710754811763763
epoch 4312 loss 0.05788448080420494 LR 0.03927543759346008 LKL 0.018609043210744858
epoch 4313 loss -0.02892678789794445 LR -0.04778747260570526 LKL 

74
epoch 4401 loss 0.03528856113553047 LR 0.016443490982055664 LKL 0.018845070153474808


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 4402 loss 0.0909564346075058 LR 0.0721084401011467 LKL 0.018847990781068802
epoch 4403 loss 0.08104641735553741 LR 0.06212453544139862 LKL 0.018921878188848495
epoch 4404 loss 0.036135483533144 LR 0.01729719340801239 LKL 0.018838290125131607
epoch 4405 loss 0.03309941291809082 LR 0.014306068420410156 LKL 0.018793344497680664
epoch 4406 loss 0.02809438481926918 LR 0.009350590407848358 LKL 0.018743794411420822
epoch 4407 loss 0.03210723027586937 LR 0.013312242925167084 LKL 0.018794987350702286
epoch 4408 loss 0.014291185885667801 LR -0.00455842912197113 LKL 0.01884961500763893
epoch 4409 loss 0.04533141106367111 LR 0.026439383625984192 LKL 0.01889202557504177
epoch 4410 loss 0.11347281187772751 LR 0.09456540644168854 LKL 0.01890740543603897
epoch 4411 loss 0.07081540673971176 LR 0.05199743062257767 LKL 0.018817974254488945
epoch 4412 loss 0.09720733016729355 LR 0.07833180576562881 LKL 0.018875526264309883
epoch 4413 loss 0.10955711454153061 LR 0.09082146733999252 LKL 0.018735647201

epoch 4499 loss 0.09778623282909393 LR 0.07908782362937927 LKL 0.01869840733706951
epoch 4500 loss 0.0003050919622182846 LR -0.018440738320350647 LKL 0.01874583028256893
109
epoch 4501 loss 0.10465477406978607 LR 0.08600348979234695 LKL 0.018651282414793968


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 4502 loss 0.0042226966470479965 LR -0.014547772705554962 LKL 0.01877046935260296
epoch 4503 loss 0.04079534858465195 LR 0.022264406085014343 LKL 0.018530940636992455
epoch 4504 loss 0.04418967664241791 LR 0.02535349130630493 LKL 0.018836183473467827
epoch 4505 loss 0.017531249672174454 LR -0.0013190656900405884 LKL 0.018850315362215042
epoch 4506 loss 0.01736922189593315 LR -0.001398131251335144 LKL 0.018767353147268295
epoch 4507 loss 0.11959999054670334 LR 0.10097182542085648 LKL 0.018628165125846863
epoch 4508 loss 0.025236165151000023 LR 0.006376802921295166 LKL 0.018859362229704857
epoch 4509 loss 0.04330095648765564 LR 0.024598903954029083 LKL 0.018702050670981407
epoch 4510 loss 0.03302803263068199 LR 0.01414625346660614 LKL 0.01888177916407585
epoch 4511 loss 0.014235338196158409 LR -0.00451311469078064 LKL 0.01874845288693905
epoch 4512 loss 0.07469044625759125 LR 0.055993758141994476 LKL 0.01869669184088707
epoch 4513 loss 0.07694104313850403 LR 0.05812659114599228 LKL 

epoch 4600 loss 0.062210388481616974 LR 0.04325626790523529 LKL 0.018954122439026833
94
epoch 4601 loss 0.036918140947818756 LR 0.017853736877441406 LKL 0.0190644059330225


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 4602 loss 0.024806203320622444 LR 0.005800746381282806 LKL 0.019005456939339638
epoch 4603 loss 0.048156023025512695 LR 0.02917422354221344 LKL 0.018981797620654106
epoch 4604 loss 0.07992161810398102 LR 0.060977816581726074 LKL 0.018943803384900093
epoch 4605 loss -0.0014727097004652023 LR -0.020535975694656372 LKL 0.01906326599419117
epoch 4606 loss 0.02109035663306713 LR 0.00205056369304657 LKL 0.01903979294002056
epoch 4607 loss 0.060674309730529785 LR 0.04157666116952896 LKL 0.019097648561000824
epoch 4608 loss 0.004930669441819191 LR -0.014162257313728333 LKL 0.019092926755547523
epoch 4609 loss 0.00962911918759346 LR -0.009447798132896423 LKL 0.019076917320489883
epoch 4610 loss 0.04110438749194145 LR 0.022091947495937347 LKL 0.019012439996004105
epoch 4611 loss -0.02990790084004402 LR -0.04900921881198883 LKL 0.01910131797194481
epoch 4612 loss 0.03626985102891922 LR 0.01719418168067932 LKL 0.0190756693482399
epoch 4613 loss 0.09901107847690582 LR 0.0799613669514656 LKL 0

epoch 4700 loss 0.012332923710346222 LR -0.006781250238418579 LKL 0.0191141739487648
49


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 4701 loss 0.04741045832633972 LR 0.028339914977550507 LKL 0.019070545211434364
epoch 4702 loss 0.03129681944847107 LR 0.01228092610836029 LKL 0.01901589334011078
epoch 4703 loss 0.015781307592988014 LR -0.003256574273109436 LKL 0.01903788186609745
epoch 4704 loss -0.01319466345012188 LR -0.03230941295623779 LKL 0.019114749506115913
epoch 4705 loss 0.01660051941871643 LR -0.002597704529762268 LKL 0.0191982239484787
epoch 4706 loss 0.03455536812543869 LR 0.01540975272655487 LKL 0.01914561353623867
epoch 4707 loss 0.010982627049088478 LR -0.00808270275592804 LKL 0.019065329805016518
epoch 4708 loss 0.05097861588001251 LR 0.031837962567806244 LKL 0.019140655174851418
epoch 4709 loss 0.04488486051559448 LR 0.025661692023277283 LKL 0.01922317035496235
epoch 4710 loss 0.054624319076538086 LR 0.03548078238964081 LKL 0.01914353482425213
epoch 4711 loss 0.034763313829898834 LR 0.015609808266162872 LKL 0.01915350742638111
epoch 4712 loss -0.004001857712864876 LR -0.023096099495887756 LKL 0.

epoch 4799 loss 0.015421044081449509 LR -0.0036238431930541992 LKL 0.019044887274503708
epoch 4800 loss 0.022674545645713806 LR 0.0036396384239196777 LKL 0.01903490722179413
46
epoch 4801 loss 0.06658174097537994 LR 0.04757428914308548 LKL 0.019007449969649315


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 4802 loss 0.019391903653740883 LR 0.0003680586814880371 LKL 0.019023844972252846
epoch 4803 loss 0.05774891376495361 LR 0.03870052844285965 LKL 0.019048385322093964
epoch 4804 loss -0.015772923827171326 LR -0.03486628830432892 LKL 0.019093364477157593
epoch 4805 loss 0.003287799656391144 LR -0.015684321522712708 LKL 0.01897212117910385
epoch 4806 loss 0.04785268381237984 LR 0.028825528919696808 LKL 0.01902715489268303
epoch 4807 loss 0.027273867279291153 LR 0.008201271295547485 LKL 0.019072595983743668
epoch 4808 loss -0.029095912352204323 LR -0.048216402530670166 LKL 0.019120490178465843
epoch 4809 loss 0.014292646199464798 LR -0.004706099629402161 LKL 0.01899874582886696
epoch 4810 loss -0.03174414113163948 LR -0.05079898238182068 LKL 0.019054841250181198
epoch 4811 loss 0.042093414813280106 LR 0.022945940494537354 LKL 0.019147474318742752
epoch 4812 loss -0.012770634144544601 LR -0.0319797545671463 LKL 0.0192091204226017
epoch 4813 loss 0.03977368772029877 LR 0.020668148994445

epoch 4899 loss -0.0030763838440179825 LR -0.022345587611198425 LKL 0.019269203767180443
epoch 4900 loss 0.03880206495523453 LR 0.019692398607730865 LKL 0.019109666347503662
62
epoch 4901 loss -0.021186405792832375 LR -0.04041571915149689 LKL 0.019229313358664513


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 4902 loss 0.03966805338859558 LR 0.020443782210350037 LKL 0.019224271178245544
epoch 4903 loss -0.02556472085416317 LR -0.04490771144628525 LKL 0.019342990592122078
epoch 4904 loss 0.03487417846918106 LR 0.01575355976819992 LKL 0.01912062056362629
epoch 4905 loss -0.03516940027475357 LR -0.054548315703868866 LKL 0.019378913566470146
epoch 4906 loss 0.03986818343400955 LR 0.020576968789100647 LKL 0.019291216507554054
epoch 4907 loss -0.045135706663131714 LR -0.06439628452062607 LKL 0.019260575994849205
epoch 4908 loss 0.009701520204544067 LR -0.009648256003856659 LKL 0.019349776208400726
epoch 4909 loss -0.06071220710873604 LR -0.0800255611538887 LKL 0.019313354045152664
epoch 4910 loss 0.021606529131531715 LR 0.002338394522666931 LKL 0.019268134608864784
epoch 4911 loss -0.01788264513015747 LR -0.03714418411254883 LKL 0.019261538982391357
epoch 4912 loss 0.0015048962086439133 LR -0.01780782639980316 LKL 0.019312722608447075
epoch 4913 loss 0.011851580813527107 LR -0.0074494928121

epoch 5000 loss -0.021028637886047363 LR -0.04048612713813782 LKL 0.019457489252090454
69


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 5001 loss -0.0020211264491081238 LR -0.021423369646072388 LKL 0.019402243196964264
epoch 5002 loss -0.05843386426568031 LR -0.07799561321735382 LKL 0.019561748951673508
epoch 5003 loss 0.00484807975590229 LR -0.014534331858158112 LKL 0.019382411614060402
epoch 5004 loss 0.03869384899735451 LR 0.01916123926639557 LKL 0.01953260973095894
epoch 5005 loss 0.10310611128807068 LR 0.08377517014741898 LKL 0.019330944865942
epoch 5006 loss 0.005478313192725182 LR -0.01404540240764618 LKL 0.01952371560037136
epoch 5007 loss 0.035664062947034836 LR 0.016163796186447144 LKL 0.019500266760587692
epoch 5008 loss 0.10391673445701599 LR 0.08451535552740097 LKL 0.01940137892961502
epoch 5009 loss -0.01865703985095024 LR -0.038034334778785706 LKL 0.019377294927835464
epoch 5010 loss -0.04701883718371391 LR -0.06654965877532959 LKL 0.019530821591615677
epoch 5011 loss 0.0032679103314876556 LR -0.016198143362998962 LKL 0.019466053694486618
epoch 5012 loss 0.003512805327773094 LR -0.01602339744567871

epoch 5097 loss -0.000310700386762619 LR -0.019831925630569458 LKL 0.01952122524380684
epoch 5098 loss 0.009090447798371315 LR -0.010410085320472717 LKL 0.019500533118844032
epoch 5099 loss 0.020112628117203712 LR 0.0005819052457809448 LKL 0.019530722871422768
epoch 5100 loss 0.02055090293288231 LR 0.001000821590423584 LKL 0.019550081342458725
62


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 5101 loss 0.06656587868928909 LR 0.04708394408226013 LKL 0.01948193646967411
epoch 5102 loss 0.048537611961364746 LR 0.028980515897274017 LKL 0.019557097926735878
epoch 5103 loss 0.004054080694913864 LR -0.015575051307678223 LKL 0.019629132002592087
epoch 5104 loss 0.021358482539653778 LR 0.0017964988946914673 LKL 0.01956198364496231
epoch 5105 loss -0.02690110355615616 LR -0.04643002152442932 LKL 0.019528917968273163
epoch 5106 loss 0.018922708928585052 LR -0.0005974769592285156 LKL 0.019520185887813568
epoch 5107 loss -0.06254372000694275 LR -0.08208621293306351 LKL 0.019542496651411057
epoch 5108 loss -0.03206519037485123 LR -0.05156128108501434 LKL 0.019496088847517967
epoch 5109 loss 0.041080839931964874 LR 0.021537266671657562 LKL 0.01954357512295246
epoch 5110 loss 0.02977887913584709 LR 0.010346077382564545 LKL 0.019432801753282547
epoch 5111 loss -0.027973169460892677 LR -0.047348685562610626 LKL 0.01937551610171795
epoch 5112 loss 0.008022438734769821 LR -0.011446177959

epoch 5197 loss -0.022249285131692886 LR -0.04195259511470795 LKL 0.01970330998301506
epoch 5198 loss -0.0082755908370018 LR -0.027799978852272034 LKL 0.019524388015270233
epoch 5199 loss -0.028263522312045097 LR -0.04799865186214447 LKL 0.019735129550099373
epoch 5200 loss -0.05346420034766197 LR -0.07318313419818878 LKL 0.01971893385052681
54


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 5201 loss -0.013252489268779755 LR -0.03288261592388153 LKL 0.019630126655101776
epoch 5202 loss -0.014494480565190315 LR -0.03419870883226395 LKL 0.01970422826707363
epoch 5203 loss -0.047131508588790894 LR -0.0667504221200943 LKL 0.019618913531303406
epoch 5204 loss -0.027802105993032455 LR -0.04733097553253174 LKL 0.019528869539499283
epoch 5205 loss -0.004435742273926735 LR -0.02403251826763153 LKL 0.019596775993704796
epoch 5206 loss -0.04239477217197418 LR -0.06206943839788437 LKL 0.019674668088555336
epoch 5207 loss -0.011706296354532242 LR -0.03131057322025299 LKL 0.01960427686572075
epoch 5208 loss 0.029985452070832253 LR 0.010361261665821075 LKL 0.019624190405011177
epoch 5209 loss 0.03472882881760597 LR 0.015087097883224487 LKL 0.019641730934381485
epoch 5210 loss 0.022249959409236908 LR 0.0025954395532608032 LKL 0.019654519855976105
epoch 5211 loss -0.006362026557326317 LR -0.025959447026252747 LKL 0.01959742046892643
epoch 5212 loss 0.04468586668372154 LR 0.025167144

epoch 5299 loss -0.05268729478120804 LR -0.07235405594110489 LKL 0.01966676115989685
epoch 5300 loss 0.034025657922029495 LR 0.014438077807426453 LKL 0.019587580114603043
108
epoch 5301 loss -0.0438082292675972 LR -0.06344938278198242 LKL 0.019641151651740074


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 5302 loss -0.06706570833921432 LR -0.08678002655506134 LKL 0.019714320078492165
epoch 5303 loss -0.039098650217056274 LR -0.058745793998241425 LKL 0.01964714191854
epoch 5304 loss -0.011393388733267784 LR -0.030998587608337402 LKL 0.019605198875069618
epoch 5305 loss 0.009710505604743958 LR -0.009790629148483276 LKL 0.019501134753227234
epoch 5306 loss 0.08604338020086288 LR 0.06655624508857727 LKL 0.019487135112285614
epoch 5307 loss 0.0011151768267154694 LR -0.018424883484840393 LKL 0.019540060311555862
epoch 5308 loss 0.0005735158920288086 LR -0.01904265582561493 LKL 0.019616171717643738
epoch 5309 loss -0.015465779229998589 LR -0.03512564301490784 LKL 0.01965986378490925
epoch 5310 loss 0.03857877850532532 LR 0.018987886607646942 LKL 0.019590893760323524
epoch 5311 loss -0.03954697027802467 LR -0.05917719006538391 LKL 0.019630219787359238
epoch 5312 loss 0.0569828525185585 LR 0.03730485588312149 LKL 0.01967799849808216
epoch 5313 loss 0.00573396123945713 LR -0.013868436217308

epoch 5400 loss 0.030804110690951347 LR 0.011036485433578491 LKL 0.019767625257372856
53


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 5401 loss 0.013917079195380211 LR -0.005792528390884399 LKL 0.01970960758626461
epoch 5402 loss 0.06289980560541153 LR 0.043110594153404236 LKL 0.019789213314652443
epoch 5403 loss 0.022197190672159195 LR 0.0023742467164993286 LKL 0.019822943955659866
epoch 5404 loss -0.0028138943016529083 LR -0.02263374626636505 LKL 0.019819851964712143
epoch 5405 loss 0.011170390993356705 LR -0.008523315191268921 LKL 0.019693706184625626
epoch 5406 loss 0.002204807475209236 LR -0.017592594027519226 LKL 0.019797401502728462
epoch 5407 loss 0.024966120719909668 LR 0.005175404250621796 LKL 0.019790716469287872
epoch 5408 loss 0.019496245309710503 LR -0.0001802891492843628 LKL 0.019676534458994865
epoch 5409 loss 0.038726601749658585 LR 0.018946319818496704 LKL 0.01978028193116188
epoch 5410 loss -0.030314354225993156 LR -0.05023287236690521 LKL 0.019918518140912056
epoch 5411 loss 0.051357559859752655 LR 0.03155823051929474 LKL 0.019799331203103065
epoch 5412 loss 0.08590001612901688 LR 0.06619516

epoch 5497 loss -0.015649059787392616 LR -0.03542460501194 LKL 0.019775545224547386
epoch 5498 loss 0.018075918778777122 LR -0.0017314702272415161 LKL 0.01980738900601864
epoch 5499 loss 0.011622006073594093 LR -0.00822051614522934 LKL 0.019842522218823433
epoch 5500 loss -0.02157691679894924 LR -0.041422873735427856 LKL 0.019845956936478615
65


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 5501 loss 0.007416924461722374 LR -0.01221168041229248 LKL 0.019628604874014854
epoch 5502 loss -0.006785700097680092 LR -0.02656872570514679 LKL 0.019783025607466698
epoch 5503 loss 0.039298780262470245 LR 0.019506700336933136 LKL 0.01979207806289196
epoch 5504 loss 0.022192800417542458 LR 0.0023843199014663696 LKL 0.019808480516076088
epoch 5505 loss 0.04919353127479553 LR 0.029454566538333893 LKL 0.01973896287381649
epoch 5506 loss 0.012573273852467537 LR -0.007265359163284302 LKL 0.01983863301575184
epoch 5507 loss 0.046632200479507446 LR 0.026958949863910675 LKL 0.019673248752951622
epoch 5508 loss 0.025464091449975967 LR 0.005564481019973755 LKL 0.019899610430002213
epoch 5509 loss -0.007583806291222572 LR -0.02736644446849823 LKL 0.019782638177275658
epoch 5510 loss -0.006891077384352684 LR -0.02674172818660736 LKL 0.019850650802254677
epoch 5511 loss 0.054765552282333374 LR 0.03488483279943466 LKL 0.01988072134554386
epoch 5512 loss 0.022852696478366852 LR 0.0029458627104

epoch 5599 loss -0.010987848043441772 LR -0.03089802712202072 LKL 0.01991017907857895
epoch 5600 loss 0.02900470420718193 LR 0.009089827537536621 LKL 0.01991487666964531
51
epoch 5601 loss -0.0022116005420684814 LR -0.02205921709537506 LKL 0.01984761655330658


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 5602 loss -0.02413402497768402 LR -0.044000864028930664 LKL 0.019866839051246643
epoch 5603 loss -0.015397151932120323 LR -0.035236626863479614 LKL 0.01983947493135929
epoch 5604 loss -0.06253331154584885 LR -0.08239885419607162 LKL 0.01986554078757763
epoch 5605 loss 0.013797221705317497 LR -0.005939945578575134 LKL 0.01973716728389263
epoch 5606 loss 0.036792367696762085 LR 0.016856461763381958 LKL 0.019935905933380127
epoch 5607 loss -0.02388562448322773 LR -0.04372270405292511 LKL 0.01983707956969738
epoch 5608 loss 0.0016855616122484207 LR -0.018277868628501892 LKL 0.019963430240750313
epoch 5609 loss 0.004157273098826408 LR -0.015671074390411377 LKL 0.019828347489237785
epoch 5610 loss -0.017789732664823532 LR -0.037729665637016296 LKL 0.019939932972192764
epoch 5611 loss 0.0016103815287351608 LR -0.018328607082366943 LKL 0.019938988611102104
epoch 5612 loss -0.02616862952709198 LR -0.045949049293994904 LKL 0.019780419766902924
epoch 5613 loss -0.07671461999416351 LR -0.096

epoch 5698 loss 0.048610541969537735 LR 0.028807759284973145 LKL 0.01980278268456459
epoch 5699 loss 0.03427524119615555 LR 0.014459580183029175 LKL 0.019815661013126373
epoch 5700 loss 0.006889607757329941 LR -0.012939438223838806 LKL 0.019829045981168747
80
epoch 5701 loss 0.01239015907049179 LR -0.007577031850814819 LKL 0.01996719092130661


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 5702 loss -0.015036813914775848 LR -0.0349167138338089 LKL 0.01987989991903305
epoch 5703 loss 0.011982524767518044 LR -0.007847979664802551 LKL 0.019830504432320595
epoch 5704 loss 0.012934796512126923 LR -0.006898865103721619 LKL 0.01983366161584854
epoch 5705 loss -0.05623236298561096 LR -0.0760810524225235 LKL 0.019848687574267387
epoch 5706 loss -0.03875037655234337 LR -0.058664292097091675 LKL 0.019913915544748306
epoch 5707 loss -0.04403320699930191 LR -0.06393168866634369 LKL 0.01989848166704178
epoch 5708 loss -0.01865386962890625 LR -0.03860899806022644 LKL 0.01995512843132019
epoch 5709 loss 0.02751949615776539 LR 0.007601030170917511 LKL 0.019918465986847878
epoch 5710 loss -0.02085702493786812 LR -0.04067198932170868 LKL 0.01981496438384056
epoch 5711 loss -0.021520249545574188 LR -0.041474372148513794 LKL 0.019954122602939606
epoch 5712 loss -0.022212037816643715 LR -0.042187198996543884 LKL 0.01997516117990017
epoch 5713 loss -0.01996106654405594 LR -0.039832994341

epoch 5800 loss -0.02246839739382267 LR -0.042488157749176025 LKL 0.020019760355353355
45


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 5801 loss -0.05794675648212433 LR -0.07801040261983871 LKL 0.020063644275069237
epoch 5802 loss -0.006649406626820564 LR -0.026689201593399048 LKL 0.020039794966578484
epoch 5803 loss -0.08391088992357254 LR -0.10394009947776794 LKL 0.020029209554195404
epoch 5804 loss -0.02739931456744671 LR -0.047439754009246826 LKL 0.020040439441800117
epoch 5805 loss -0.031437188386917114 LR -0.05147071182727814 LKL 0.020033523440361023
epoch 5806 loss -0.01226341724395752 LR -0.03225485980510712 LKL 0.019991442561149597
epoch 5807 loss 0.044400788843631744 LR 0.024436071515083313 LKL 0.01996471919119358
epoch 5808 loss -0.023085404187440872 LR -0.04317212104797363 LKL 0.02008671686053276
epoch 5809 loss -0.007096355780959129 LR -0.02717166393995285 LKL 0.02007530815899372
epoch 5810 loss -0.05049007385969162 LR -0.0706903487443924 LKL 0.020200276747345924
epoch 5811 loss -0.019812636077404022 LR -0.03982393443584442 LKL 0.0200112983584404
epoch 5812 loss -0.007863340899348259 LR -0.027890861

epoch 5897 loss -0.0048740096390247345 LR -0.025067850947380066 LKL 0.02019384130835533
epoch 5898 loss -0.014243271201848984 LR -0.03437887132167816 LKL 0.020135600119829178
epoch 5899 loss -0.028015464544296265 LR -0.04816603660583496 LKL 0.020150572061538696
epoch 5900 loss -0.014716288074851036 LR -0.034904301166534424 LKL 0.020188013091683388
55


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 5901 loss 0.012549646198749542 LR -0.00765565037727356 LKL 0.020205296576023102
epoch 5902 loss -0.018374184146523476 LR -0.03859226405620575 LKL 0.020218079909682274
epoch 5903 loss -0.05133206024765968 LR -0.07157649099826813 LKL 0.020244430750608444
epoch 5904 loss -0.027155999094247818 LR -0.047419577836990356 LKL 0.02026357874274254
epoch 5905 loss -0.0285064484924078 LR -0.04876406490802765 LKL 0.02025761641561985
epoch 5906 loss -0.048511654138565063 LR -0.06880680471658707 LKL 0.020295150578022003
epoch 5907 loss -0.011044757440686226 LR -0.03125976026058197 LKL 0.020215002819895744
epoch 5908 loss -0.024032611399888992 LR -0.04432475566864014 LKL 0.020292144268751144
epoch 5909 loss -0.011354580521583557 LR -0.03159026801586151 LKL 0.020235687494277954
epoch 5910 loss -0.02766776829957962 LR -0.04789990931749344 LKL 0.02023214101791382
epoch 5911 loss -0.00758701004087925 LR -0.027837589383125305 LKL 0.020250579342246056
epoch 5912 loss -0.033651988953351974 LR -0.053823

epoch 5997 loss -0.0408780574798584 LR -0.06107092648744583 LKL 0.020192870870232582
epoch 5998 loss -0.008447226136922836 LR -0.02858199179172516 LKL 0.020134765654802322
epoch 5999 loss -0.0933602973818779 LR -0.11361423134803772 LKL 0.02025393396615982
epoch 6000 loss -0.04600483924150467 LR -0.06619492173194885 LKL 0.020190084353089333
47


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 6001 loss -0.044561635702848434 LR -0.06475163996219635 LKL 0.020190004259347916
epoch 6002 loss -0.03796052932739258 LR -0.05823926627635956 LKL 0.02027873881161213
epoch 6003 loss -0.04277852177619934 LR -0.06294864416122437 LKL 0.020170122385025024
epoch 6004 loss -0.034286972135305405 LR -0.05442348122596741 LKL 0.020136509090662003
epoch 6005 loss 0.019264794886112213 LR -0.0009811073541641235 LKL 0.020245902240276337
epoch 6006 loss -0.022225812077522278 LR -0.04236103594303131 LKL 0.020135223865509033
epoch 6007 loss -0.04352729022502899 LR -0.06366860121488571 LKL 0.02014130912721157
epoch 6008 loss 0.007194662466645241 LR -0.012942895293235779 LKL 0.02013755775988102
epoch 6009 loss -0.04605567455291748 LR -0.06623419374227524 LKL 0.020178521052002907
epoch 6010 loss -0.07217629253864288 LR -0.09230557084083557 LKL 0.020129280164837837
epoch 6011 loss -0.02594955451786518 LR -0.046053074300289154 LKL 0.020103519782423973
epoch 6012 loss -0.0952981635928154 LR -0.11548441

epoch 6099 loss 0.030819300562143326 LR 0.010470367968082428 LKL 0.020348932594060898
epoch 6100 loss -0.01961078681051731 LR -0.04001244157552719 LKL 0.02040165476500988
60
epoch 6101 loss 0.042556896805763245 LR 0.022076815366744995 LKL 0.0204800833016634


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 6102 loss 0.020688781514763832 LR 0.0002037137746810913 LKL 0.02048506774008274
epoch 6103 loss 0.0063474345952272415 LR -0.014032706618309021 LKL 0.020380141213536263
epoch 6104 loss -0.023158730939030647 LR -0.04359288513660431 LKL 0.020434154197573662
epoch 6105 loss -0.059606537222862244 LR -0.07996557652950287 LKL 0.020359041169285774
epoch 6106 loss 0.01001107506453991 LR -0.010336875915527344 LKL 0.020347950980067253
epoch 6107 loss -0.09084003418684006 LR -0.1112947165966034 LKL 0.020454680547118187
epoch 6108 loss -0.00382392480969429 LR -0.024292409420013428 LKL 0.020468484610319138
epoch 6109 loss -0.0253865048289299 LR -0.04585752636194229 LKL 0.02047102153301239
epoch 6110 loss -0.11882932484149933 LR -0.13936659693717957 LKL 0.020537275820970535
epoch 6111 loss -0.05228785425424576 LR -0.07270115613937378 LKL 0.02041330188512802
epoch 6112 loss -0.03877139091491699 LR -0.059279173612594604 LKL 0.020507780835032463
epoch 6113 loss -0.08512140810489655 LR -0.105601444

epoch 6200 loss -0.03773108497262001 LR -0.058176785707473755 LKL 0.020445700734853745
54


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 6201 loss -0.04398857802152634 LR -0.06441854685544968 LKL 0.02042996697127819
epoch 6202 loss -0.05322795361280441 LR -0.07367171347141266 LKL 0.020443759858608246
epoch 6203 loss -0.03220156580209732 LR -0.05268217623233795 LKL 0.02048061043024063
epoch 6204 loss -0.07377849519252777 LR -0.09421901404857635 LKL 0.020440520718693733
epoch 6205 loss -0.07528217881917953 LR -0.0956764742732048 LKL 0.020394297316670418
epoch 6206 loss -0.061254799365997314 LR -0.08177004009485245 LKL 0.020515238866209984
epoch 6207 loss -0.034262340515851974 LR -0.05471870303153992 LKL 0.020456362515687943
epoch 6208 loss -0.05483604222536087 LR -0.0752117782831192 LKL 0.02037573792040348
epoch 6209 loss -0.0038774777203798294 LR -0.02420705556869507 LKL 0.02032957784831524
epoch 6210 loss -0.05845256149768829 LR -0.07888716459274292 LKL 0.020434601232409477
epoch 6211 loss -0.002510812133550644 LR -0.022788599133491516 LKL 0.020277786999940872
epoch 6212 loss -0.033868301659822464 LR -0.0543155968

epoch 6298 loss -0.020839056000113487 LR -0.04137255251407623 LKL 0.020533496513962746
epoch 6299 loss -0.020480075851082802 LR -0.04106926918029785 LKL 0.02058919332921505
epoch 6300 loss -0.047237128019332886 LR -0.06779679656028748 LKL 0.02055967040359974
46
epoch 6301 loss -0.055410273373126984 LR -0.07602427899837494 LKL 0.020614003762602806


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 6302 loss -0.0019038189202547073 LR -0.022427216172218323 LKL 0.020523397251963615
epoch 6303 loss -0.04415653645992279 LR -0.06463117897510529 LKL 0.020474644377827644
epoch 6304 loss -0.00824238732457161 LR -0.028790593147277832 LKL 0.020548205822706223
epoch 6305 loss -0.057323940098285675 LR -0.07788059115409851 LKL 0.020556651055812836
epoch 6306 loss -0.0010005850344896317 LR -0.021560773253440857 LKL 0.020560188218951225
epoch 6307 loss -0.0772707462310791 LR -0.09793426841497421 LKL 0.02066352218389511
epoch 6308 loss -0.03014514595270157 LR -0.050684645771980286 LKL 0.020539499819278717
epoch 6309 loss 0.017826678231358528 LR -0.002670004963874817 LKL 0.020496683195233345
epoch 6310 loss -0.024035463109612465 LR -0.04436337202787399 LKL 0.020327908918261528
epoch 6311 loss -0.043856486678123474 LR -0.064469113945961 LKL 0.020612627267837524
epoch 6312 loss -0.05848575383424759 LR -0.07895642518997192 LKL 0.020470669493079185
epoch 6313 loss 0.02354942448437214 LR 0.00304

epoch 6398 loss -0.044658273458480835 LR -0.0648379847407341 LKL 0.020179711282253265
epoch 6399 loss -0.09246368706226349 LR -0.11274164915084839 LKL 0.0202779620885849
epoch 6400 loss -0.020761732012033463 LR -0.04092508554458618 LKL 0.02016335353255272
52
epoch 6401 loss -0.038980595767498016 LR -0.05916722118854523 LKL 0.02018662355840206


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 6402 loss -0.027730125933885574 LR -0.04797618091106415 LKL 0.020246054977178574
epoch 6403 loss -0.08946166187524796 LR -0.10969983786344528 LKL 0.020238174125552177
epoch 6404 loss -0.04227866977453232 LR -0.06261847913265228 LKL 0.020339809358119965
epoch 6405 loss -0.029758501797914505 LR -0.050036221742630005 LKL 0.0202777199447155
epoch 6406 loss -0.004340833052992821 LR -0.02453327178955078 LKL 0.02019243873655796
epoch 6407 loss -0.042233906686306 LR -0.062492772936820984 LKL 0.020258868113160133
epoch 6408 loss -0.04707913100719452 LR -0.06738738715648651 LKL 0.02030825801193714
epoch 6409 loss 0.014838406816124916 LR -0.005362004041671753 LKL 0.02020041085779667
epoch 6410 loss -0.043927595019340515 LR -0.06413562595844269 LKL 0.020208030939102173
epoch 6411 loss -0.049682971090078354 LR -0.06995180994272232 LKL 0.020268838852643967
epoch 6412 loss -0.019030895084142685 LR -0.03921151161193848 LKL 0.02018061652779579
epoch 6413 loss -0.04788791388273239 LR -0.0681437030

epoch 6498 loss -0.0998804047703743 LR -0.12058243155479431 LKL 0.020702028647065163
epoch 6499 loss -0.0575389601290226 LR -0.07816201448440552 LKL 0.02062305435538292
epoch 6500 loss -0.04175000637769699 LR -0.06238913536071777 LKL 0.020639128983020782
72
epoch 6501 loss -0.08152342587709427 LR -0.10218130052089691 LKL 0.020657872781157494


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 6502 loss -0.06796566396951675 LR -0.08869744837284088 LKL 0.020731784403324127
epoch 6503 loss -0.048621468245983124 LR -0.0693119466304779 LKL 0.020690476521849632
epoch 6504 loss -0.06991052627563477 LR -0.09067872911691666 LKL 0.02076820284128189
epoch 6505 loss 0.006886959075927734 LR -0.013652220368385315 LKL 0.02053917944431305
epoch 6506 loss -0.0401027575135231 LR -0.060700297355651855 LKL 0.020597541704773903
epoch 6507 loss -0.026033751666545868 LR -0.04670970141887665 LKL 0.02067594975233078
epoch 6508 loss -0.11890828609466553 LR -0.13960367441177368 LKL 0.020695388317108154
epoch 6509 loss -0.07664481550455093 LR -0.09735298901796341 LKL 0.020708171650767326
epoch 6510 loss -0.023255666717886925 LR -0.04380801320075989 LKL 0.020552346482872963
epoch 6511 loss -0.033641450107097626 LR -0.05430781841278076 LKL 0.020666366443037987
epoch 6512 loss -0.08809558302164078 LR -0.1087411418557167 LKL 0.020645560696721077
epoch 6513 loss 0.03957666456699371 LR 0.0189966037869

epoch 6600 loss -0.10900530219078064 LR -0.12952253222465515 LKL 0.020517226308584213
70


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 6601 loss -0.04986545443534851 LR -0.07036155462265015 LKL 0.020496102049946785
epoch 6602 loss -0.057902559638023376 LR -0.07839792221784592 LKL 0.02049536071717739
epoch 6603 loss -0.018270613625645638 LR -0.03875403851270676 LKL 0.02048342488706112
epoch 6604 loss -0.033142298460006714 LR -0.05361753702163696 LKL 0.0204752366989851
epoch 6605 loss -0.059706248342990875 LR -0.08014117926359177 LKL 0.02043493278324604
epoch 6606 loss -0.04258398711681366 LR -0.06306927651166916 LKL 0.02048528753221035
epoch 6607 loss -0.05064135044813156 LR -0.07115636020898819 LKL 0.02051500789821148
epoch 6608 loss 0.026375383138656616 LR 0.005821302533149719 LKL 0.020554080605506897
epoch 6609 loss -0.03606478124856949 LR -0.056541137397289276 LKL 0.020476358011364937
epoch 6610 loss -0.020527753978967667 LR -0.04095347225666046 LKL 0.020425718277692795
epoch 6611 loss -0.032922036945819855 LR -0.05345243215560913 LKL 0.020530395209789276
epoch 6612 loss -0.03557339683175087 LR -0.05610400438

epoch 6699 loss -0.1019839495420456 LR -0.12251995503902435 LKL 0.02053600363433361
epoch 6700 loss -0.07264604419469833 LR -0.09330053627490997 LKL 0.02065449394285679
44
epoch 6701 loss -0.06549756228923798 LR -0.08614706993103027 LKL 0.020649505779147148


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 6702 loss -0.10621756315231323 LR -0.12688913941383362 LKL 0.020671576261520386
epoch 6703 loss -0.049802571535110474 LR -0.07034435868263245 LKL 0.020541789010167122
epoch 6704 loss -0.006190599873661995 LR -0.026685059070587158 LKL 0.020494459196925163
epoch 6705 loss -0.060496293008327484 LR -0.08106060326099396 LKL 0.020564312115311623
epoch 6706 loss -0.06491014361381531 LR -0.08556388318538666 LKL 0.02065373584628105
epoch 6707 loss 0.040317125618457794 LR 0.019878938794136047 LKL 0.020438186824321747
epoch 6708 loss -0.059447161853313446 LR -0.08005630970001221 LKL 0.020609145984053612
epoch 6709 loss -0.09050147235393524 LR -0.11105447262525558 LKL 0.020552998408675194
epoch 6710 loss -0.02244943007826805 LR -0.04306507110595703 LKL 0.02061564102768898
epoch 6711 loss -0.04791560769081116 LR -0.068519726395607 LKL 0.020604116842150688
epoch 6712 loss 0.012877721339464188 LR -0.00760379433631897 LKL 0.020481515675783157
epoch 6713 loss -0.06977139413356781 LR -0.0903723761

epoch 6798 loss -0.09685467928647995 LR -0.11777469515800476 LKL 0.02092001587152481
epoch 6799 loss -0.05414404720067978 LR -0.07494135200977325 LKL 0.020797306671738625
epoch 6800 loss -0.07615842670202255 LR -0.0969991534948349 LKL 0.020840724930167198
48
epoch 6801 loss -0.003939813002943993 LR -0.024777576327323914 LKL 0.02083776332437992


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 6802 loss -0.002498473972082138 LR -0.023307785391807556 LKL 0.020809311419725418
epoch 6803 loss -0.0698426216840744 LR -0.09066919982433319 LKL 0.020826581865549088
epoch 6804 loss -0.08009324967861176 LR -0.10088242590427399 LKL 0.02078917622566223
epoch 6805 loss -0.05685178190469742 LR -0.07774914056062698 LKL 0.020897356793284416
epoch 6806 loss -0.08968013525009155 LR -0.1105935126543045 LKL 0.020913375541567802
epoch 6807 loss -0.146291121840477 LR -0.16720813512802124 LKL 0.020917018875479698
epoch 6808 loss -0.07632201164960861 LR -0.09721079468727112 LKL 0.020888781175017357
epoch 6809 loss -0.040175795555114746 LR -0.061064787209033966 LKL 0.02088898979127407
epoch 6810 loss -0.0805681049823761 LR -0.10136092454195023 LKL 0.020792819559574127
epoch 6811 loss 0.000844394788146019 LR -0.020043790340423584 LKL 0.020888185128569603
epoch 6812 loss -0.10844360291957855 LR -0.12927038967609406 LKL 0.020826783031225204
epoch 6813 loss -0.09737114608287811 LR -0.1182983815670

epoch 6898 loss -0.07560142874717712 LR -0.09661737084388733 LKL 0.021015943959355354
epoch 6899 loss -0.07263742387294769 LR -0.09363359212875366 LKL 0.02099616453051567
epoch 6900 loss -0.09464254975318909 LR -0.1157560795545578 LKL 0.021113531664013863
47
epoch 6901 loss -0.0676458328962326 LR -0.08867613971233368 LKL 0.021030304953455925


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 6902 loss -0.06686214357614517 LR -0.08788397163152695 LKL 0.021021826192736626
epoch 6903 loss -0.12019232660531998 LR -0.14120078086853027 LKL 0.021008456125855446
epoch 6904 loss -0.04942658543586731 LR -0.07038614898920059 LKL 0.020959561690688133
epoch 6905 loss -0.02798346057534218 LR -0.048983439803123474 LKL 0.020999979227781296
epoch 6906 loss -0.03628313168883324 LR -0.05725005269050598 LKL 0.020966921001672745
epoch 6907 loss -0.1026441752910614 LR -0.12359236925840378 LKL 0.020948197692632675
epoch 6908 loss -0.13180860877037048 LR -0.1529209017753601 LKL 0.021112291142344475
epoch 6909 loss -0.0445452481508255 LR -0.06554462015628815 LKL 0.020999370142817497
epoch 6910 loss -0.052528753876686096 LR -0.07369682192802429 LKL 0.021168068051338196
epoch 6911 loss -0.061408933252096176 LR -0.08250252157449722 LKL 0.021093588322401047
epoch 6912 loss -0.07053770124912262 LR -0.09155065566301346 LKL 0.021012956276535988
epoch 6913 loss -0.019233914092183113 LR -0.0400897860

epoch 6998 loss -0.09584669023752213 LR -0.11689428985118866 LKL 0.021047597751021385
epoch 6999 loss -0.09803138673305511 LR -0.1191193088889122 LKL 0.021087918430566788
epoch 7000 loss -0.015711847692728043 LR -0.0367833748459816 LKL 0.021071527153253555
84
epoch 7001 loss -0.03156999871134758 LR -0.05264018476009369 LKL 0.02107018604874611
epoch

  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


 7002 loss -0.048881545662879944 LR -0.06987209618091583 LKL 0.02099055051803589
epoch 7003 loss -0.058256834745407104 LR -0.0791466012597084 LKL 0.0208897665143013
epoch 7004 loss -0.08901739120483398 LR -0.11016112565994263 LKL 0.021143736317753792
epoch 7005 loss 0.026546576991677284 LR 0.005535334348678589 LKL 0.021011242642998695
epoch 7006 loss -0.08831758797168732 LR -0.10942436009645462 LKL 0.021106775850057602
epoch 7007 loss -0.07526670396327972 LR -0.0961979553103447 LKL 0.020931249484419823
epoch 7008 loss -0.04616263136267662 LR -0.06725111603736877 LKL 0.021088484674692154
epoch 7009 loss -0.05541651323437691 LR -0.07645518332719803 LKL 0.02103867009282112
epoch 7010 loss -0.080868661403656 LR -0.10186009854078293 LKL 0.020991438999772072
epoch 7011 loss -0.0501164048910141 LR -0.07115401327610016 LKL 0.02103761024773121
epoch 7012 loss -0.0657031312584877 LR -0.08675447106361389 LKL 0.02105133794248104
epoch 7013 loss -0.09363941848278046 LR -0.11465365439653397 LKL 0.02

epoch 7098 loss -0.07561147212982178 LR -0.09663444757461548 LKL 0.0210229754447937
epoch 7099 loss -0.08644161373376846 LR -0.1074923425912857 LKL 0.021050726994872093
epoch 7100 loss -0.1604290008544922 LR -0.1815377026796341 LKL 0.02110869623720646
101
epoch 7101 loss -0.04667574167251587 LR -0.06769533455371857 LKL 0.021019594743847847


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 7102 loss -0.10100579261779785 LR -0.12210877984762192 LKL 0.021102985367178917
epoch 7103 loss -0.10531996190547943 LR -0.1263521909713745 LKL 0.02103222906589508
epoch 7104 loss -0.04536377638578415 LR -0.0662461444735527 LKL 0.020882369950413704
epoch 7105 loss -0.08878455311059952 LR -0.10988040268421173 LKL 0.021095849573612213
epoch 7106 loss -0.0940018743276596 LR -0.11510512977838516 LKL 0.021103255450725555
epoch 7107 loss -0.09072399139404297 LR -0.11172903329133987 LKL 0.021005041897296906
epoch 7108 loss -0.03240791708230972 LR -0.05355067551136017 LKL 0.021142756566405296
epoch 7109 loss -0.08729008585214615 LR -0.10819864273071289 LKL 0.020908555015921593
epoch 7110 loss -0.12245006859302521 LR -0.14358867704868317 LKL 0.02113860659301281
epoch 7111 loss -0.05838180333375931 LR -0.0794217512011528 LKL 0.021039949730038643
epoch 7112 loss -0.06503556668758392 LR -0.08615916967391968 LKL 0.021123604848980904
epoch 7113 loss -0.12414925545454025 LR -0.1451282501220703 

epoch 7198 loss -0.06956105679273605 LR -0.09072761237621307 LKL 0.02116655372083187
epoch 7199 loss 0.011016862466931343 LR -0.010187432169914246 LKL 0.02120429463684559
epoch 7200 loss 0.009203925728797913 LR -0.01183195412158966 LKL 0.021035879850387573
114
epoch 7201 loss -0.09756223857402802 LR -0.11877040565013885 LKL 0.02120816893875599


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 7202 loss -0.10765019059181213 LR -0.1287914216518402 LKL 0.021141227334737778
epoch 7203 loss -0.02821597456932068 LR -0.04937709867954254 LKL 0.021161124110221863
epoch 7204 loss -0.08818493783473969 LR -0.10946458578109741 LKL 0.021279647946357727
epoch 7205 loss -0.054411113262176514 LR -0.07556942105293274 LKL 0.021158305928111076
epoch 7206 loss -0.010297724977135658 LR -0.03151319921016693 LKL 0.021215474233031273
epoch 7207 loss -0.04598063975572586 LR -0.06709881871938705 LKL 0.021118177101016045
epoch 7208 loss -0.08230393379926682 LR -0.10339990258216858 LKL 0.021095968782901764
epoch 7209 loss -0.052143897861242294 LR -0.07338307797908783 LKL 0.021239180117845535
epoch 7210 loss -0.02687126398086548 LR -0.04800819605588913 LKL 0.02113693207502365
epoch 7211 loss -0.07568822801113129 LR -0.09684561938047409 LKL 0.021157391369342804
epoch 7212 loss -0.08520655333995819 LR -0.10643407702445984 LKL 0.021227525547146797
epoch 7213 loss -0.09167850762605667 LR -0.1128637045

epoch 7300 loss -0.05690751224756241 LR -0.07823894917964935 LKL 0.021331435069441795
88
epoch 7301 loss -0.08876403421163559 LR -0.11012588441371918 LKL 0.02136184833943844


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 7302 loss -0.07832875847816467 LR -0.09975438565015793 LKL 0.021425623446702957
epoch 7303 loss -0.08886303007602692 LR -0.11031539738178253 LKL 0.021452367305755615
epoch 7304 loss -0.04131922125816345 LR -0.06267371028661728 LKL 0.021354489028453827
epoch 7305 loss -0.12701238691806793 LR -0.14841428399085999 LKL 0.021401893347501755
epoch 7306 loss -0.036442555487155914 LR -0.05778922140598297 LKL 0.021346665918827057
epoch 7307 loss -0.08911468088626862 LR -0.1105339303612709 LKL 0.02141924574971199
epoch 7308 loss -0.07465863227844238 LR -0.09611565619707108 LKL 0.021457025781273842
epoch 7309 loss -0.07085062563419342 LR -0.09227892756462097 LKL 0.021428298205137253
epoch 7310 loss -0.045188769698143005 LR -0.06645238399505615 LKL 0.021263616159558296
epoch 7311 loss -0.005557091906666756 LR -0.02693389356136322 LKL 0.021376801654696465
epoch 7312 loss -0.03182884305715561 LR -0.05317069590091705 LKL 0.021341850981116295
epoch 7313 loss -0.0870077833533287 LR -0.10839878767

epoch 7400 loss 0.005213836207985878 LR -0.016062825918197632 LKL 0.02127666212618351
57


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 7401 loss 0.004275524988770485 LR -0.01690034568309784 LKL 0.021175870671868324
epoch 7402 loss -0.023396488279104233 LR -0.044559963047504425 LKL 0.021163474768400192
epoch 7403 loss -0.05968085676431656 LR -0.08096402883529663 LKL 0.021283170208334923
epoch 7404 loss 0.026886804029345512 LR 0.005674481391906738 LKL 0.021212322637438774
epoch 7405 loss -0.04797319322824478 LR -0.06920426338911057 LKL 0.021231070160865784
epoch 7406 loss -0.07059451192617416 LR -0.09196612238883972 LKL 0.02137160860002041
epoch 7407 loss -0.09508290886878967 LR -0.11646716296672821 LKL 0.02138425037264824
epoch 7408 loss -0.07149630039930344 LR -0.09282997250556946 LKL 0.02133367210626602
epoch 7409 loss -0.0410432405769825 LR -0.06229014694690704 LKL 0.021246906369924545
epoch 7410 loss -0.03800652176141739 LR -0.05925900489091873 LKL 0.021252483129501343
epoch 7411 loss -0.11484131217002869 LR -0.1361321359872818 LKL 0.02129082754254341
epoch 7412 loss -0.07673271000385284 LR -0.098076462745666

epoch 7497 loss -0.12681561708450317 LR -0.14834082126617432 LKL 0.021525198593735695
epoch 7498 loss -0.09284143894910812 LR -0.11441460251808167 LKL 0.021573161706328392
epoch 7499 loss -0.14134660363197327 LR -0.16290222108364105 LKL 0.021555621176958084
epoch 7500 loss -0.10281991213560104 LR -0.12419945746660233 LKL 0.021379543468356133


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 7501 loss -0.06951875984668732 LR -0.09092400223016739 LKL 0.021405240520834923
epoch 7502 loss -0.10633444786071777 LR -0.12774039804935455 LKL 0.02140595205128193
epoch 7503 loss -0.07740364968776703 LR -0.09886032342910767 LKL 0.021456677466630936
epoch 7504 loss -0.07737167179584503 LR -0.09885500371456146 LKL 0.02148333564400673
epoch 7505 loss -0.04426151141524315 LR -0.06566566973924637 LKL 0.02140415832400322
epoch 7506 loss -0.018572263419628143 LR -0.04004666209220886 LKL 0.02147439867258072
epoch 7507 loss -0.09588687121868134 LR -0.11734466254711151 LKL 0.021457787603139877
epoch 7508 loss -0.044628918170928955 LR -0.06602312624454498 LKL 0.021394209936261177
epoch 7509 loss -0.12936602532863617 LR -0.15079966187477112 LKL 0.021433638408780098
epoch 7510 loss -0.05814315378665924 LR -0.07955722510814667 LKL 0.021414069458842278
epoch 7511 loss -0.07660321891307831 LR -0.09802117198705673 LKL 0.021417949348688126
epoch 7512 loss -0.10055352002382278 LR -0.1220043748617

epoch 7597 loss -0.08172240853309631 LR -0.10334920883178711 LKL 0.021626800298690796
epoch 7598 loss -0.018373696133494377 LR -0.039799705147743225 LKL 0.021426009014248848
epoch 7599 loss -0.07846323400735855 LR -0.09995942562818527 LKL 0.02149619348347187
epoch 7600 loss -0.07837719470262527 LR -0.10001493990421295 LKL 0.021637743338942528
56


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 7601 loss -0.09838491678237915 LR -0.11999659240245819 LKL 0.02161167748272419
epoch 7602 loss -0.10135471820831299 LR -0.1229977235198021 LKL 0.021643009036779404
epoch 7603 loss -0.07190993428230286 LR -0.09341488033533096 LKL 0.021504942327737808
epoch 7604 loss -0.08929985016584396 LR -0.11088436841964722 LKL 0.021584516391158104
epoch 7605 loss -0.05999064818024635 LR -0.0815306156873703 LKL 0.021539967507123947
epoch 7606 loss -0.08466632664203644 LR -0.10617769509553909 LKL 0.021511364728212357
epoch 7607 loss -0.04277089238166809 LR -0.06420448422431946 LKL 0.021433593705296516
epoch 7608 loss -0.06737149506807327 LR -0.08880463242530823 LKL 0.021433139219880104
epoch 7609 loss -0.07269002497196198 LR -0.09427187591791153 LKL 0.021581849083304405
epoch 7610 loss -0.10643433779478073 LR -0.12791581451892853 LKL 0.021481476724147797
epoch 7611 loss -0.028699547052383423 LR -0.05018080770969391 LKL 0.021481260657310486
epoch 7612 loss -0.08185454457998276 LR -0.1033233031630

epoch 7698 loss -0.13532650470733643 LR -0.15699824690818787 LKL 0.021671734750270844
epoch 7699 loss -0.09082956612110138 LR -0.11245216429233551 LKL 0.02162259630858898
epoch 7700 loss -0.0765657126903534 LR -0.09809420257806778 LKL 0.021528489887714386
59
epoch 7701 loss -0.07160218805074692 LR -0.09318284690380096 LKL 0.021580658853054047


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 7702 loss -0.01769775152206421 LR -0.039292529225349426 LKL 0.021594777703285217
epoch 7703 loss -0.06923339515924454 LR -0.09080676734447479 LKL 0.021573374047875404
epoch 7704 loss -0.08854077011346817 LR -0.11010013520717621 LKL 0.02155936509370804
epoch 7705 loss -0.11409124732017517 LR -0.1357957124710083 LKL 0.02170446328818798
epoch 7706 loss -0.0941423624753952 LR -0.11579763144254684 LKL 0.021655268967151642
epoch 7707 loss -0.12993738055229187 LR -0.15170463919639587 LKL 0.021767262369394302
epoch 7708 loss -0.1396687924861908 LR -0.16127976775169373 LKL 0.02161097526550293
epoch 7709 loss -0.06020435690879822 LR -0.08177749812602997 LKL 0.0215731393545866
epoch 7710 loss -0.046847812831401825 LR -0.06855510175228119 LKL 0.021707287058234215
epoch 7711 loss -0.10877379029989243 LR -0.13045451045036316 LKL 0.021680720150470734
epoch 7712 loss -0.04754107445478439 LR -0.06912861764431 LKL 0.021587545052170753
epoch 7713 loss -0.11235865205526352 LR -0.13407893478870392 LK

epoch 7799 loss -0.11156786233186722 LR -0.13335196673870087 LKL 0.02178410440683365
epoch 7800 loss -0.08166000992059708 LR -0.1034168228507042 LKL 0.021756814792752266
61
epoch 7801 loss -0.1029028445482254 LR -0.12459507584571838 LKL 0.02169223502278328


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 7802 loss -0.10281126201152802 LR -0.12451215833425522 LKL 0.021700898185372353
epoch 7803 loss -0.11280468106269836 LR -0.13445675373077393 LKL 0.021652068942785263
epoch 7804 loss -0.06964756548404694 LR -0.09144303947687149 LKL 0.021795470267534256
epoch 7805 loss -0.0573163777589798 LR -0.07894250750541687 LKL 0.021626129746437073
epoch 7806 loss -0.12026449292898178 LR -0.14206358790397644 LKL 0.02179909311234951
epoch 7807 loss -0.07067883014678955 LR -0.09235753864049911 LKL 0.021678712218999863
epoch 7808 loss -0.0264704842120409 LR -0.04827128350734711 LKL 0.021800799295306206
epoch 7809 loss -0.11036549508571625 LR -0.13216441869735718 LKL 0.02179892361164093
epoch 7810 loss -0.07091246545314789 LR -0.09263685345649719 LKL 0.021724391728639603
epoch 7811 loss -0.03579963743686676 LR -0.05756857991218567 LKL 0.021768944337964058
epoch 7812 loss -0.07211177051067352 LR -0.0938621461391449 LKL 0.021750377491116524
epoch 7813 loss -0.11339408904314041 LR -0.1351838409900665

epoch 7900 loss -0.03921199589967728 LR -0.06101815402507782 LKL 0.021806156262755394
53


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 7901 loss -0.050954774022102356 LR -0.0728895366191864 LKL 0.021934764459729195
epoch 7902 loss -0.05259844660758972 LR -0.07448231428861618 LKL 0.02188386768102646
epoch 7903 loss -0.10348770767450333 LR -0.12541168928146362 LKL 0.021923979744315147
epoch 7904 loss -0.07677245140075684 LR -0.09870891273021698 LKL 0.021936465054750443
epoch 7905 loss -0.09256040304899216 LR -0.1145273745059967 LKL 0.021966973319649696
epoch 7906 loss -0.0626455694437027 LR -0.08453749120235443 LKL 0.021891923621296883
epoch 7907 loss -0.1397446095943451 LR -0.16173210740089417 LKL 0.02198750153183937
epoch 7908 loss -0.15889663994312286 LR -0.18086019158363342 LKL 0.02196354605257511
epoch 7909 loss -0.0948554128408432 LR -0.11684805899858475 LKL 0.021992644295096397
epoch 7910 loss -0.1432201862335205 LR -0.16514909267425537 LKL 0.021928898990154266
epoch 7911 loss -0.1562318652868271 LR -0.17830702662467957 LKL 0.022075166925787926
epoch 7912 loss -0.09489578008651733 LR -0.11682362109422684 LK

epoch 7999 loss -0.05722280591726303 LR -0.07925169169902802 LKL 0.022028883919119835
epoch 8000 loss -0.03613022714853287 LR -0.05804973095655441 LKL 0.021919503808021545
44
epoch 8001 loss -0.08347444236278534 LR -0.10539931058883667 LKL 0.02192487008869648


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 8002 loss -0.04055353254079819 LR -0.062425993382930756 LKL 0.02187245897948742
epoch 8003 loss 0.03343472257256508 LR 0.011478006839752197 LKL 0.02195671573281288
epoch 8004 loss -0.09148944914340973 LR -0.11340805888175964 LKL 0.021918607875704765
epoch 8005 loss -0.10269046574831009 LR -0.12455594539642334 LKL 0.02186547964811325
epoch 8006 loss -0.09027715027332306 LR -0.11210688948631287 LKL 0.021829737350344658
epoch 8007 loss -0.06791940331459045 LR -0.08983167260885239 LKL 0.02191227115690708
epoch 8008 loss -0.16234371066093445 LR -0.1844235360622406 LKL 0.022079817950725555
epoch 8009 loss -0.11691319942474365 LR -0.1388971507549286 LKL 0.021983949467539787
epoch 8010 loss -0.07717112451791763 LR -0.09909415245056152 LKL 0.02192302606999874
epoch 8011 loss -0.10302602499723434 LR -0.12495257705450058 LKL 0.021926552057266235
epoch 8012 loss -0.062325380742549896 LR -0.08409525454044342 LKL 0.021769873797893524
epoch 8013 loss -0.13128802180290222 LR -0.15328896045684814

epoch 8100 loss -0.10574696958065033 LR -0.12791085243225098 LKL 0.022163886576890945
68


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 8101 loss -0.12954933941364288 LR -0.15149474143981934 LKL 0.0219454076141119
epoch 8102 loss -0.1307905614376068 LR -0.152754008769989 LKL 0.021963445469737053
epoch 8103 loss -0.09074229001998901 LR -0.11284022033214569 LKL 0.022097932174801826
epoch 8104 loss -0.07427239418029785 LR -0.09620068967342377 LKL 0.021928295493125916
epoch 8105 loss -0.09807904809713364 LR -0.12003742903470993 LKL 0.021958382800221443
epoch 8106 loss -0.07778189331293106 LR -0.09984081983566284 LKL 0.02205892652273178
epoch 8107 loss -0.12591557204723358 LR -0.14802435040473938 LKL 0.022108783945441246
epoch 8108 loss -0.10323858261108398 LR -0.12525154650211334 LKL 0.02201296016573906
epoch 8109 loss -0.06041289493441582 LR -0.08246763050556183 LKL 0.02205473557114601
epoch 8110 loss -0.0535912849009037 LR -0.07562980055809021 LKL 0.022038515657186508
epoch 8111 loss -0.10555590689182281 LR -0.12759852409362793 LKL 0.022042617201805115
epoch 8112 loss -0.09680316597223282 LR -0.1189093291759491 LKL

epoch 8200 loss -0.1372162401676178 LR -0.1594340056180954 LKL 0.0222177617251873
67


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 8201 loss -0.10515610128641129 LR -0.12744443118572235 LKL 0.022288328036665916
epoch 8202 loss -0.04917659983038902 LR -0.07133445143699646 LKL 0.022157851606607437
epoch 8203 loss -0.09683837741613388 LR -0.11895069479942322 LKL 0.022112319245934486
epoch 8204 loss -0.08118878304958344 LR -0.1034446433186531 LKL 0.02225586213171482
epoch 8205 loss -0.09370392560958862 LR -0.11581350862979889 LKL 0.022109579294919968
epoch 8206 loss -0.05949798971414566 LR -0.08170443028211594 LKL 0.022206442430615425
epoch 8207 loss -0.07507723569869995 LR -0.09720654785633087 LKL 0.02212931029498577
epoch 8208 loss -0.11803820729255676 LR -0.1402757167816162 LKL 0.02223750576376915
epoch 8209 loss -0.1979362666606903 LR -0.2201014757156372 LKL 0.0221652053296566
epoch 8210 loss -0.14258117973804474 LR -0.16478732228279114 LKL 0.022206146270036697
epoch 8211 loss -0.11681374162435532 LR -0.13909706473350525 LKL 0.022283323109149933
epoch 8212 loss -0.06916218996047974 LR -0.09138347208499908 LK

epoch 8299 loss -0.11853700876235962 LR -0.14074593782424927 LKL 0.02220892533659935
epoch 8300 loss -0.16510821878910065 LR -0.18733331561088562 LKL 0.022225094959139824


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 8301 loss -0.10240296274423599 LR -0.12467242777347565 LKL 0.022269465029239655
epoch 8302 loss -0.09480655193328857 LR -0.11715847998857498 LKL 0.022351931780576706
epoch 8303 loss -0.081845223903656 LR -0.10396640747785568 LKL 0.022121181711554527
epoch 8304 loss -0.08790580183267593 LR -0.11031171679496765 LKL 0.022405914962291718
epoch 8305 loss -0.056541845202445984 LR -0.07878522574901581 LKL 0.022243382409214973
epoch 8306 loss -0.10716315358877182 LR -0.12953715026378632 LKL 0.022373994812369347
epoch 8307 loss -0.0351896770298481 LR -0.05745616555213928 LKL 0.022266488522291183
epoch 8308 loss -0.053804777562618256 LR -0.07609434425830841 LKL 0.022289566695690155
epoch 8309 loss -0.10526435077190399 LR -0.1275692582130432 LKL 0.02230491116642952
epoch 8310 loss -0.10835379362106323 LR -0.1306735873222351 LKL 0.022319791838526726
epoch 8311 loss -0.12972380220890045 LR -0.15204069018363953 LKL 0.022316887974739075
epoch 8312 loss -0.12361910939216614 LR -0.145983457565307

epoch 8398 loss -0.15290915966033936 LR -0.17512246966362 LKL 0.02221331000328064
epoch 8399 loss -0.15224164724349976 LR -0.17458868026733398 LKL 0.022347036749124527
epoch 8400 loss -0.13913188874721527 LR -0.1612829566001892 LKL 0.022151073440909386
82
epoch 8401 loss -0.12310908734798431 LR -0.14534342288970947 LKL 0.022234339267015457


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 8402 loss -0.07119744271039963 LR -0.09331826120615005 LKL 0.022120820358395576
epoch 8403 loss -0.08450783044099808 LR -0.10666337609291077 LKL 0.02215554378926754
epoch 8404 loss -0.10720222443342209 LR -0.1295124590396881 LKL 0.02231023460626602
epoch 8405 loss -0.14349590241909027 LR -0.16580069065093994 LKL 0.02230478636920452
epoch 8406 loss -0.052370503544807434 LR -0.07451167702674866 LKL 0.022141173481941223
epoch 8407 loss -0.07638055086135864 LR -0.09861241281032562 LKL 0.02223186008632183
epoch 8408 loss -0.06336559355258942 LR -0.08551676571369171 LKL 0.022151172161102295
epoch 8409 loss -0.12743768095970154 LR -0.14973744750022888 LKL 0.022299768403172493
epoch 8410 loss -0.11728917062282562 LR -0.13952183723449707 LKL 0.0222326647490263
epoch 8411 loss -0.11010022461414337 LR -0.1323680579662323 LKL 0.022267835214734077
epoch 8412 loss -0.1413988173007965 LR -0.16370561718940735 LKL 0.02230680175125599
epoch 8413 loss -0.057417597621679306 LR -0.07958891242742538 L

124
epoch 8501 loss -0.14653311669826508 LR -0.1688510626554489 LKL 0.02231794223189354


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 8502 loss -0.15224148333072662 LR -0.17450600862503052 LKL 0.022264521569013596
epoch 8503 loss -0.10601826012134552 LR -0.12819838523864746 LKL 0.02218012325465679
epoch 8504 loss -0.14406703412532806 LR -0.1665179282426834 LKL 0.022450899705290794
epoch 8505 loss -0.15529809892177582 LR -0.17758280038833618 LKL 0.02228470705449581
epoch 8506 loss -0.11856253445148468 LR -0.14084696769714355 LKL 0.022284429520368576
epoch 8507 loss -0.12135130167007446 LR -0.14366655051708221 LKL 0.022315246984362602
epoch 8508 loss -0.1106717437505722 LR -0.13289310038089752 LKL 0.022221354767680168
epoch 8509 loss -0.0986214354634285 LR -0.12091271579265594 LKL 0.022291282191872597
epoch 8510 loss -0.18589268624782562 LR -0.20841693878173828 LKL 0.02252424880862236
epoch 8511 loss -0.07086405158042908 LR -0.09299857914447784 LKL 0.022134531289339066
epoch 8512 loss -0.08351881057024002 LR -0.10594119131565094 LKL 0.02242238260805607
epoch 8513 loss -0.08341621607542038 LR -0.10570889711380005 

epoch 8598 loss -0.10394211113452911 LR -0.12616893649101257 LKL 0.02222682349383831
epoch 8599 loss -0.12302854657173157 LR -0.14517131447792053 LKL 0.022142771631479263
epoch 8600 loss -0.10320582985877991 LR -0.12551818788051605 LKL 0.022312354296445847
60
epoch 8601 loss -0.07028642296791077 LR -0.09246062487363815 LKL 0.022174201905727386


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 8602 loss -0.05796551704406738 LR -0.08014611899852753 LKL 0.022180601954460144
epoch 8603 loss -0.0405908077955246 LR -0.06267598271369934 LKL 0.022085176780819893
epoch 8604 loss -0.08284934610128403 LR -0.10491378605365753 LKL 0.022064438089728355
epoch 8605 loss -0.054720304906368256 LR -0.07685017585754395 LKL 0.02212987095117569
epoch 8606 loss -0.04042347893118858 LR -0.06249183416366577 LKL 0.022068355232477188
epoch 8607 loss -0.14466339349746704 LR -0.16691666841506958 LKL 0.02225327491760254
epoch 8608 loss -0.0892004668712616 LR -0.11149556189775467 LKL 0.022295091301202774
epoch 8609 loss -0.10495134443044662 LR -0.1272064447402954 LKL 0.022255102172493935
epoch 8610 loss -0.13935011625289917 LR -0.1615399271249771 LKL 0.02218981646001339
epoch 8611 loss -0.07888232916593552 LR -0.1010296642780304 LKL 0.02214733324944973
epoch 8612 loss -0.12635169923305511 LR -0.14861445128917694 LKL 0.022262750193476677
epoch 8613 loss -0.12460307776927948 LR -0.14683452248573303 L

69
epoch 8701 loss -0.10486412793397903 LR -0.1271669864654541 LKL 0.022302856668829918


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 8702 loss -0.05964277312159538 LR -0.08189558982849121 LKL 0.022252816706895828
epoch 8703 loss -0.1059398427605629 LR -0.12813282012939453 LKL 0.022192979231476784
epoch 8704 loss -0.12845900654792786 LR -0.1508139967918396 LKL 0.02235499583184719
epoch 8705 loss -0.07879027724266052 LR -0.10088169574737549 LKL 0.022091420367360115
epoch 8706 loss -0.12434698641300201 LR -0.1466163992881775 LKL 0.022269416600465775
epoch 8707 loss -0.1265994906425476 LR -0.14890842139720917 LKL 0.022308938205242157
epoch 8708 loss -0.15128713846206665 LR -0.17351368069648743 LKL 0.022226538509130478
epoch 8709 loss -0.034492991864681244 LR -0.05666717141866684 LKL 0.022174181416630745
epoch 8710 loss -0.09046770632266998 LR -0.11274165660142899 LKL 0.022273952141404152
epoch 8711 loss -0.12781044840812683 LR -0.15010148286819458 LKL 0.02229103073477745
epoch 8712 loss -0.10867441445589066 LR -0.13097867369651794 LKL 0.02230425924062729
epoch 8713 loss -0.14596207439899445 LR -0.16832119226455688

epoch 8800 loss -0.1570998728275299 LR -0.17958219349384308 LKL 0.02248232439160347
125
epoch 8801 loss -0.1822114735841751 LR -0.20458367466926575 LKL 0.022372201085090637


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 8802 loss -0.10281580686569214 LR -0.12526491284370422 LKL 0.022449104115366936
epoch 8803 loss -0.11855866760015488 LR -0.14081500470638275 LKL 0.022256338968873024
epoch 8804 loss -0.11513440310955048 LR -0.13760823011398315 LKL 0.02247382514178753
epoch 8805 loss -0.13417957723140717 LR -0.15651029348373413 LKL 0.022330710664391518
epoch 8806 loss -0.13313084840774536 LR -0.15554237365722656 LKL 0.0224115289747715
epoch 8807 loss -0.10624665021896362 LR -0.1287144422531128 LKL 0.02246779389679432
epoch 8808 loss -0.08164197206497192 LR -0.1040801852941513 LKL 0.022438211366534233
epoch 8809 loss -0.039172470569610596 LR -0.061444297432899475 LKL 0.02227182872593403
epoch 8810 loss -0.15239697694778442 LR -0.1748250126838684 LKL 0.022428030148148537
epoch 8811 loss -0.15726476907730103 LR -0.17964130640029907 LKL 0.022376539185643196
epoch 8812 loss -0.05935267359018326 LR -0.0818696916103363 LKL 0.022517016157507896
epoch 8813 loss -0.08745356649160385 LR -0.10980603843927383 

epoch 8900 loss -0.1432264745235443 LR -0.16572345793247223 LKL 0.022496988996863365
41


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 8901 loss -0.19349242746829987 LR -0.2160935401916504 LKL 0.022601112723350525
epoch 8902 loss -0.16245411336421967 LR -0.18516427278518677 LKL 0.0227101631462574
epoch 8903 loss -0.07736896723508835 LR -0.09978451579809189 LKL 0.02241554670035839
epoch 8904 loss -0.12336532771587372 LR -0.14591974020004272 LKL 0.022554416209459305
epoch 8905 loss -0.1606837511062622 LR -0.18331535160541534 LKL 0.022631604224443436
epoch 8906 loss -0.08323615044355392 LR -0.10576561838388443 LKL 0.022529467940330505
epoch 8907 loss -0.1187547892332077 LR -0.141316756606102 LKL 0.022561969235539436
epoch 8908 loss -0.11228621006011963 LR -0.13477285206317902 LKL 0.022486645728349686
epoch 8909 loss -0.16169509291648865 LR -0.18409785628318787 LKL 0.02240276336669922
epoch 8910 loss -0.15895047783851624 LR -0.18168073892593384 LKL 0.022730257362127304
epoch 8911 loss -0.13142450153827667 LR -0.15385878086090088 LKL 0.022434277459979057
epoch 8912 loss -0.09791101515293121 LR -0.12045581638813019 LK

epoch 9000 loss -0.10953695327043533 LR -0.1322154700756073 LKL 0.022678516805171967
85
epoch 9001 loss -0.02955157868564129 LR -0.05221916735172272 LKL 0.02266758866608143


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 9002 loss -0.08362308144569397 LR -0.10624763369560242 LKL 0.022624554112553596
epoch 9003 loss -0.08600552380084991 LR -0.10865408927202225 LKL 0.02264856919646263
epoch 9004 loss -0.06473366916179657 LR -0.0874115377664566 LKL 0.022677870467305183
epoch 9005 loss -0.08740802109241486 LR -0.11010310053825378 LKL 0.022695079445838928
epoch 9006 loss -0.11281409859657288 LR -0.13547632098197937 LKL 0.022662222385406494
epoch 9007 loss -0.14286288619041443 LR -0.16558901965618134 LKL 0.022726135328412056
epoch 9008 loss -0.12374648451805115 LR -0.1465226411819458 LKL 0.022776152938604355
epoch 9009 loss -0.09304217994213104 LR -0.11579091101884842 LKL 0.022748729214072227
epoch 9010 loss -0.04484019801020622 LR -0.06745520234107971 LKL 0.02261500433087349
epoch 9011 loss -0.10041757673025131 LR -0.12311384826898575 LKL 0.022696271538734436
epoch 9012 loss -0.05806972086429596 LR -0.0807858482003212 LKL 0.022716129198670387
epoch 9013 loss -0.14638695120811462 LR -0.1691662967205047

epoch 9099 loss -0.1436464786529541 LR -0.1663632094860077 LKL 0.022716723382472992
epoch 9100 loss -0.11800464987754822 LR -0.14060822129249573 LKL 0.02260356955230236
59
epoch 9101 loss -0.11929959058761597 LR -0.142024427652359 LKL 0.02272484079003334


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 9102 loss -0.13820891082286835 LR -0.16080355644226074 LKL 0.022594643756747246
epoch 9103 loss -0.10684765875339508 LR -0.1294041872024536 LKL 0.022556530311703682
epoch 9104 loss -0.16516569256782532 LR -0.18792223930358887 LKL 0.022756552323698997
epoch 9105 loss -0.10006542503833771 LR -0.12270962446928024 LKL 0.022644199430942535
epoch 9106 loss -0.07609028369188309 LR -0.09881319105625153 LKL 0.022722909227013588
epoch 9107 loss -0.11981776356697083 LR -0.14253519475460052 LKL 0.0227174274623394
epoch 9108 loss -0.053767699748277664 LR -0.0765591412782669 LKL 0.022791441529989243
epoch 9109 loss -0.0748579353094101 LR -0.09756685793399811 LKL 0.022708924487233162
epoch 9110 loss -0.10410398244857788 LR -0.126846581697464 LKL 0.02274259738624096
epoch 9111 loss -0.11914575099945068 LR -0.14178338646888733 LKL 0.022637639194726944
epoch 9112 loss -0.07144252210855484 LR -0.09405827522277832 LKL 0.02261575125157833
epoch 9113 loss -0.0771588683128357 LR -0.09992045164108276 LK

97
epoch 9201 loss -0.17577806115150452 LR -0.19859951734542847 LKL 0.02282145991921425


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 9202 loss -0.17527179419994354 LR -0.1981906294822693 LKL 0.022918831557035446
epoch 9203 loss -0.13984808325767517 LR -0.16273094713687897 LKL 0.0228828564286232
epoch 9204 loss -0.16315867006778717 LR -0.18613767623901367 LKL 0.02297900803387165
epoch 9205 loss -0.14486567676067352 LR -0.1676628440618515 LKL 0.022797172889113426
epoch 9206 loss -0.15006132423877716 LR -0.1729905605316162 LKL 0.022929241880774498
epoch 9207 loss -0.10327612608671188 LR -0.1260172426700592 LKL 0.02274111472070217
epoch 9208 loss -0.1795204132795334 LR -0.2024495154619217 LKL 0.022929100319743156
epoch 9209 loss -0.19725027680397034 LR -0.22023418545722961 LKL 0.02298390492796898
epoch 9210 loss -0.15045593678951263 LR -0.17337960004806519 LKL 0.02292366698384285
epoch 9211 loss -0.17909134924411774 LR -0.20201259851455688 LKL 0.022921249270439148
epoch 9212 loss -0.1780504733324051 LR -0.20091167092323303 LKL 0.02286120131611824
epoch 9213 loss -0.1656874120235443 LR -0.18860474228858948 LKL 0.02

epoch 9300 loss -0.12494239211082458 LR -0.1479095220565796 LKL 0.022967129945755005
40


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 9301 loss -0.03361310064792633 LR -0.056356921792030334 LKL 0.022743821144104004
epoch 9302 loss -0.22302494943141937 LR -0.2460448145866394 LKL 0.02301986888051033
epoch 9303 loss -0.08966889977455139 LR -0.11261723190546036 LKL 0.022948328405618668
epoch 9304 loss -0.11848184466362 LR -0.1412719190120697 LKL 0.022790072485804558
epoch 9305 loss -0.16275447607040405 LR -0.1858319342136383 LKL 0.023077458143234253
epoch 9306 loss -0.18976075947284698 LR -0.2126058042049408 LKL 0.022845041006803513
epoch 9307 loss -0.1561712622642517 LR -0.17895516753196716 LKL 0.022783908993005753
epoch 9308 loss -0.13085007667541504 LR -0.15373572707176208 LKL 0.022885644808411598
epoch 9309 loss -0.13902872800827026 LR -0.16188953816890717 LKL 0.0228608138859272
epoch 9310 loss -0.11838914453983307 LR -0.1412883698940277 LKL 0.022899221628904343
epoch 9311 loss -0.16501203179359436 LR -0.18780022859573364 LKL 0.022788194939494133
epoch 9312 loss -0.15634965896606445 LR -0.1791703850030899 LKL 0

epoch 9399 loss -0.15445713698863983 LR -0.1773451864719391 LKL 0.022888053208589554
epoch 9400 loss -0.14463841915130615 LR -0.16752108931541443 LKL 0.022882672026753426
108
epoch 9401 loss -0.1344570517539978 LR -0.15718279778957367 LKL 0.02272573858499527


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 9402 loss -0.15447770059108734 LR -0.177362859249115 LKL 0.0228851567953825
epoch 9403 loss -0.10504452139139175 LR -0.1278819590806961 LKL 0.0228374395519495
epoch 9404 loss -0.14622914791107178 LR -0.16907799243927002 LKL 0.022848844528198242
epoch 9405 loss -0.05161554366350174 LR -0.07437887787818909 LKL 0.022763334214687347
epoch 9406 loss -0.09421633183956146 LR -0.11710933595895767 LKL 0.02289300039410591
epoch 9407 loss -0.1238097995519638 LR -0.14666292071342468 LKL 0.022853117436170578
epoch 9408 loss -0.15954165160655975 LR -0.1826055496931076 LKL 0.023063892498612404
epoch 9409 loss -0.04478686302900314 LR -0.06750667095184326 LKL 0.022719809785485268
epoch 9410 loss -0.10555265098810196 LR -0.12832123041152954 LKL 0.022768577560782433
epoch 9411 loss -0.06281968206167221 LR -0.08553577959537506 LKL 0.0227160956710577
epoch 9412 loss -0.06244516372680664 LR -0.0852094292640686 LKL 0.022764267399907112
epoch 9413 loss -0.18698561191558838 LR -0.2098630964756012 LKL 0.0

59
epoch 9501 loss -0.13241073489189148 LR -0.15532013773918152 LKL 0.022909406572580338


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 9502 loss -0.12698912620544434 LR -0.14976176619529724 LKL 0.022772636264562607
epoch 9503 loss -0.10284998267889023 LR -0.12556469440460205 LKL 0.022714709863066673
epoch 9504 loss -0.11535048484802246 LR -0.13819143176078796 LKL 0.022840943187475204
epoch 9505 loss -0.08957017213106155 LR -0.11227857321500778 LKL 0.02270839922130108
epoch 9506 loss -0.1973961591720581 LR -0.22023969888687134 LKL 0.022843539714813232
epoch 9507 loss -0.0910225585103035 LR -0.11372262239456177 LKL 0.02270006388425827
epoch 9508 loss -0.16057626903057098 LR -0.18346905708312988 LKL 0.022892789915204048
epoch 9509 loss -0.18574243783950806 LR -0.20864638686180115 LKL 0.02290395088493824
epoch 9510 loss -0.09801660478115082 LR -0.12079060822725296 LKL 0.022774003446102142
epoch 9511 loss -0.10686085373163223 LR -0.12966299057006836 LKL 0.022802136838436127
epoch 9512 loss -0.11323251575231552 LR -0.13598453998565674 LKL 0.022752026095986366
epoch 9513 loss -0.1491936594247818 LR -0.17217570543289185

epoch 9600 loss -0.08649593591690063 LR -0.10957814007997513 LKL 0.023082202300429344
66


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 9601 loss -0.1585635095834732 LR -0.1817721426486969 LKL 0.023208629339933395
epoch 9602 loss -0.12059763073921204 LR -0.14364466071128845 LKL 0.023047031834721565
epoch 9603 loss -0.10707349330186844 LR -0.13025426864624023 LKL 0.023180773481726646
epoch 9604 loss -0.12438876926898956 LR -0.14753979444503784 LKL 0.023151027038693428
epoch 9605 loss -0.10800058394670486 LR -0.13108396530151367 LKL 0.023083379492163658
epoch 9606 loss -0.1277918964624405 LR -0.15092898905277252 LKL 0.02313709259033203
epoch 9607 loss -0.1219010129570961 LR -0.14506253600120544 LKL 0.023161523044109344
epoch 9608 loss -0.15635894238948822 LR -0.17945191264152527 LKL 0.023092973977327347
epoch 9609 loss -0.13411569595336914 LR -0.15722870826721191 LKL 0.023113006725907326
epoch 9610 loss -0.12215778231620789 LR -0.14534267783164978 LKL 0.023184899240732193
epoch 9611 loss -0.167290598154068 LR -0.19058775901794434 LKL 0.023297155275940895
epoch 9612 loss -0.13189515471458435 LR -0.1551666110754013 L

epoch 9698 loss -0.13697199523448944 LR -0.15993845462799072 LKL 0.022966457530856133
epoch 9699 loss -0.11913646012544632 LR -0.14230120182037354 LKL 0.023164739832282066
epoch 9700 loss -0.18609869480133057 LR -0.20942270755767822 LKL 0.02332400530576706
53
epoch 9701 loss -0.10503845661878586 LR -0.1281164288520813 LKL 0.02307797409594059


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 9702 loss -0.14356648921966553 LR -0.16681267321109772 LKL 0.02324618212878704
epoch 9703 loss -0.1773655116558075 LR -0.2006160318851471 LKL 0.02325052209198475
epoch 9704 loss -0.12189600616693497 LR -0.1450245976448059 LKL 0.02312859334051609
epoch 9705 loss -0.1035485565662384 LR -0.12662586569786072 LKL 0.023077307268977165
epoch 9706 loss -0.13939039409160614 LR -0.16269107162952423 LKL 0.02330067567527294
epoch 9707 loss -0.14060606062412262 LR -0.16403023898601532 LKL 0.0234241746366024
epoch 9708 loss -0.16625897586345673 LR -0.18960143625736237 LKL 0.02334246225655079
epoch 9709 loss -0.13126429915428162 LR -0.15451210737228394 LKL 0.02324780635535717
epoch 9710 loss -0.07844376564025879 LR -0.1016254872083664 LKL 0.023181717842817307
epoch 9711 loss -0.1348658949136734 LR -0.1579127460718155 LKL 0.02304684929549694
epoch 9712 loss -0.16505591571331024 LR -0.18813422322273254 LKL 0.023078301921486855
epoch 9713 loss -0.11904091387987137 LR -0.14218996465206146 LKL 0.023

epoch 9800 loss -0.1574346274137497 LR -0.18081310391426086 LKL 0.023378480225801468
119
epoch 9801 loss -0.15390534698963165 LR -0.1772121787071228 LKL 0.02330682799220085
epoch 9802 loss -0.15412935614585876 LR -0.17749738693237305

  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


 LKL 0.02336803637444973
epoch 9803 loss -0.12635385990142822 LR -0.14964672923088074 LKL 0.023292867466807365
epoch 9804 loss -0.17438821494579315 LR -0.19782748818397522 LKL 0.02343927137553692
epoch 9805 loss -0.20932568609714508 LR -0.23261553049087524 LKL 0.02328984998166561
epoch 9806 loss -0.14915435016155243 LR -0.17248141765594482 LKL 0.023327073082327843
epoch 9807 loss -0.09208018332719803 LR -0.11535099893808365 LKL 0.02327081747353077
epoch 9808 loss -0.12658889591693878 LR -0.14985501766204834 LKL 0.02326611988246441
epoch 9809 loss -0.13847830891609192 LR -0.16192114353179932 LKL 0.0234428271651268
epoch 9810 loss -0.1448703557252884 LR -0.1682182252407074 LKL 0.02334786392748356
epoch 9811 loss -0.1374765783548355 LR -0.1607116460800171 LKL 0.023235071450471878
epoch 9812 loss -0.22204837203025818 LR -0.2455449104309082 LKL 0.02349654585123062
epoch 9813 loss -0.13459433615207672 LR -0.15778207778930664 LKL 0.02318774163722992
epoch 9814 loss -0.15856224298477173 LR -0.

57
epoch 9901 loss -0.1667415201663971 LR -0.19008880853652954 LKL 0.023347282782197


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 9902 loss -0.14371713995933533 LR -0.16680988669395447 LKL 0.023092743009328842
epoch 9903 loss -0.1039988249540329 LR -0.12718437612056732 LKL 0.023185549303889275
epoch 9904 loss -0.11180977523326874 LR -0.13499465584754944 LKL 0.02318488247692585
epoch 9905 loss -0.13024383783340454 LR -0.15347114205360413 LKL 0.023227298632264137
epoch 9906 loss -0.1317882239818573 LR -0.15501439571380615 LKL 0.02322617545723915
epoch 9907 loss -0.11897220462560654 LR -0.14218220114707947 LKL 0.02320999465882778
epoch 9908 loss -0.14486747980117798 LR -0.16792601346969604 LKL 0.023058531805872917
epoch 9909 loss -0.09920642524957657 LR -0.12234349548816681 LKL 0.02313707210123539
epoch 9910 loss -0.13943995535373688 LR -0.1626095473766327 LKL 0.023169586434960365
epoch 9911 loss -0.14739705622196198 LR -0.17033180594444275 LKL 0.022934744134545326
epoch 9912 loss -0.14042691886425018 LR -0.16371533274650574 LKL 0.023288417607545853
epoch 9913 loss -0.12256628274917603 LR -0.14581400156021118 

epoch 9999 loss -0.18893156945705414 LR -0.2121564745903015 LKL 0.023224903270602226
epoch 10000 loss -0.18824292719364166 LR -0.2115093469619751 LKL 0.023266416043043137
37


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 10001 loss -0.17626285552978516 LR -0.1996312439441681 LKL 0.023368390277028084
epoch 10002 loss -0.15188543498516083 LR -0.17505258321762085 LKL 0.02316715382039547
epoch 10003 loss -0.16789336502552032 LR -0.1911850869655609 LKL 0.02329172007739544
epoch 10004 loss -0.11568734794855118 LR -0.1390589475631714 LKL 0.02337159775197506
epoch 10005 loss -0.0694526806473732 LR -0.09256100654602051 LKL 0.02310832589864731
epoch 10006 loss -0.18094193935394287 LR -0.20413032174110413 LKL 0.023188387975096703
epoch 10007 loss -0.08906983584165573 LR -0.11223219335079193 LKL 0.02316235937178135
epoch 10008 loss -0.11484683305025101 LR -0.13812832534313202 LKL 0.02328149415552616
epoch 10009 loss -0.11514872312545776 LR -0.13837021589279175 LKL 0.023221489042043686
epoch 10010 loss -0.15209361910820007 LR -0.17536373436450958 LKL 0.02327011525630951
epoch 10011 loss -0.13769976794719696 LR -0.16093626618385315 LKL 0.02323649823665619
epoch 10012 loss -0.12133243680000305 LR -0.14452713727

epoch 10097 loss -0.12859536707401276 LR -0.15200191736221313 LKL 0.023406552150845528
epoch 10098 loss -0.08537208288908005 LR -0.1087394580245018 LKL 0.023367376998066902
epoch 10099 loss -0.1515829712152481 LR -0.17494872212409973 LKL 0.023365749046206474
epoch 10100 loss -0.11614201217889786 LR -0.139535591006279 LKL 0.023393576964735985
50


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 10101 loss -0.19523653388023376 LR -0.2185477316379547 LKL 0.023311201483011246
epoch 10102 loss -0.17652250826358795 LR -0.19991463422775269 LKL 0.023392129689455032
epoch 10103 loss -0.1819431036710739 LR -0.20529770851135254 LKL 0.023354604840278625
epoch 10104 loss -0.1745750457048416 LR -0.19787031412124634 LKL 0.023295262828469276
epoch 10105 loss -0.1713506281375885 LR -0.19478698074817657 LKL 0.02343636006116867
epoch 10106 loss -0.19366446137428284 LR -0.21719500422477722 LKL 0.023530546575784683
epoch 10107 loss -0.15434986352920532 LR -0.17772060632705688 LKL 0.02337075024843216
epoch 10108 loss -0.19954046607017517 LR -0.22305166721343994 LKL 0.023511197417974472
epoch 10109 loss -0.17859292030334473 LR -0.2020881325006485 LKL 0.023495212197303772
epoch 10110 loss -0.14502711594104767 LR -0.16837677359580994 LKL 0.023349661380052567
epoch 10111 loss -0.11754922568798065 LR -0.14076054096221924 LKL 0.023211313411593437
epoch 10112 loss -0.16331540048122406 LR -0.186572

epoch 10198 loss -0.1634412258863449 LR -0.18683820962905884 LKL 0.02339698188006878
epoch 10199 loss -0.18129079043865204 LR -0.2048591673374176 LKL 0.023568371310830116
epoch 10200 loss -0.14602820575237274 LR -0.1694299727678299 LKL 0.023401763290166855
48
epoch 10201 loss -0.14166748523712158 LR -0.16517844796180725 LKL 0.02351095713675022


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 10202 loss -0.18498258292675018 LR -0.20852810144424438 LKL 0.0235455185174942
epoch 10203 loss -0.1592847853899002 LR -0.1827986240386963 LKL 0.02351384237408638
epoch 10204 loss -0.10009127855300903 LR -0.12352573126554489 LKL 0.02343445084989071
epoch 10205 loss -0.18300923705101013 LR -0.20660175383090973 LKL 0.023592524230480194
epoch 10206 loss -0.11258751153945923 LR -0.13597160577774048 LKL 0.0233840961009264
epoch 10207 loss -0.18430514633655548 LR -0.20784905552864075 LKL 0.023543909192085266
epoch 10208 loss -0.14620280265808105 LR -0.16950419545173645 LKL 0.023301389068365097
epoch 10209 loss -0.18310832977294922 LR -0.20658347010612488 LKL 0.023475144058465958
epoch 10210 loss -0.1533108800649643 LR -0.1767464578151703 LKL 0.023435579612851143
epoch 10211 loss -0.1283521205186844 LR -0.15177655220031738 LKL 0.023424426093697548
epoch 10212 loss -0.1871166080236435 LR -0.2106214165687561 LKL 0.02350480481982231
epoch 10213 loss -0.20594459772109985 LR -0.2295808941125

epoch 10299 loss -0.16469082236289978 LR -0.18806372582912445 LKL 0.02337290719151497
epoch 10300 loss -0.13303861021995544 LR -0.1564607322216034 LKL 0.0234221201390028
77
epoch 10301 loss -0.10996392369270325 LR -0.13325300812721252 LKL 0.023289086297154427


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 10302 loss -0.17343103885650635 LR -0.19684788584709167 LKL 0.02341683954000473
epoch 10303 loss -0.13601985573768616 LR -0.15946915745735168 LKL 0.023449301719665527
epoch 10304 loss -0.11401151865720749 LR -0.13741210103034973 LKL 0.023400580510497093
epoch 10305 loss -0.15879705548286438 LR -0.1823199987411499 LKL 0.02352294698357582
epoch 10306 loss -0.17874248325824738 LR -0.2021523118019104 LKL 0.023409830406308174
epoch 10307 loss -0.2091929167509079 LR -0.23270899057388306 LKL 0.02351607196033001
epoch 10308 loss -0.1633617877960205 LR -0.1869201511144638 LKL 0.02355836145579815
epoch 10309 loss -0.0983634814620018 LR -0.12152472138404846 LKL 0.023161238059401512
epoch 10310 loss -0.19435206055641174 LR -0.21784707903862 LKL 0.0234950240701437
epoch 10311 loss -0.16053864359855652 LR -0.18411068618297577 LKL 0.023572048172354698
epoch 10312 loss -0.1448832005262375 LR -0.1683046668767929 LKL 0.02342146448791027
epoch 10313 loss -0.15933452546596527 LR -0.18290527164936066

epoch 10398 loss -0.07502278685569763 LR -0.0986529290676117 LKL 0.02363014593720436
epoch 10399 loss -0.021118685603141785 LR -0.04461434483528137 LKL 0.023495659232139587
epoch 10400 loss -0.09064978361129761 LR -0.11419261991977692 LKL 0.02354283630847931
71
epoch 10401 loss -0.1558893620967865 LR -0.1794252097606659 LKL 0.023535845801234245


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 10402 loss -0.10956396162509918 LR -0.13324317336082458 LKL 0.023679209873080254
epoch 10403 loss -0.1665704846382141 LR -0.19019225239753723 LKL 0.02362176403403282
epoch 10404 loss -0.15566611289978027 LR -0.17921429872512817 LKL 0.0235481858253479
epoch 10405 loss -0.10369880497455597 LR -0.12720468640327454 LKL 0.023505879566073418
epoch 10406 loss -0.20607401430606842 LR -0.22980305552482605 LKL 0.02372903563082218
epoch 10407 loss -0.09831996262073517 LR -0.1219312772154808 LKL 0.023611312732100487
epoch 10408 loss -0.0696909949183464 LR -0.09324097633361816 LKL 0.02354998141527176
epoch 10409 loss -0.19069825112819672 LR -0.21437355875968933 LKL 0.023675307631492615
epoch 10410 loss -0.1393100619316101 LR -0.16289681196212769 LKL 0.02358674444258213
epoch 10411 loss -0.1571132391691208 LR -0.1807701587677002 LKL 0.023656921461224556
epoch 10412 loss -0.1286352574825287 LR -0.15216654539108276 LKL 0.02353128232061863
epoch 10413 loss -0.1141417920589447 LR -0.13764506578445

epoch 10498 loss -0.16499586403369904 LR -0.18864284455776215 LKL 0.02364698424935341
epoch 10499 loss -0.12821908295154572 LR -0.1517360955476761 LKL 0.023517008870840073
epoch 10500 loss -0.14759713411331177 LR -0.17118212580680847 LKL 0.023584984242916107
48
epoch 10501 loss -0.13107657432556152 LR -0.15456870198249817 LKL 0.023492123931646347


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 10502 loss -0.18405377864837646 LR -0.20777177810668945 LKL 0.023718003183603287
epoch 10503 loss -0.13658711314201355 LR -0.16030216217041016 LKL 0.023715052753686905
epoch 10504 loss -0.1390487253665924 LR -0.1627580225467682 LKL 0.023709291592240334
epoch 10505 loss -0.1777622401714325 LR -0.20139798521995544 LKL 0.02363574504852295
epoch 10506 loss -0.16689130663871765 LR -0.19056211411952972 LKL 0.023670805618166924
epoch 10507 loss -0.10339277237653732 LR -0.12684369087219238 LKL 0.02345092035830021
epoch 10508 loss -0.1452672779560089 LR -0.16884976625442505 LKL 0.023582495748996735
epoch 10509 loss -0.1311483532190323 LR -0.15477438271045685 LKL 0.02362602762877941
epoch 10510 loss -0.1418103724718094 LR -0.16535593569278717 LKL 0.023545557633042336
epoch 10511 loss -0.17861157655715942 LR -0.2021799385547638 LKL 0.023568367585539818
epoch 10512 loss -0.1724642515182495 LR -0.1961907148361206 LKL 0.02372646890580654
epoch 10513 loss -0.1642524003982544 LR -0.1878958344459

epoch 10598 loss -0.19439055025577545 LR -0.2179175615310669 LKL 0.023527007550001144
epoch 10599 loss -0.1758001148700714 LR -0.19927385449409485 LKL 0.02347373589873314
epoch 10600 loss -0.12859667837619781 LR -0.15213987231254578 LKL 0.02354319579899311
40
epoch 10601 loss -0.13630574941635132 LR -0.15975847840309143 LKL 0.023452728986740112


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 10602 loss -0.10918019711971283 LR -0.1327713578939438 LKL 0.023591164499521255
epoch 10603 loss -0.14354658126831055 LR -0.16712594032287598 LKL 0.023579362779855728
epoch 10604 loss -0.1746567189693451 LR -0.19809985160827637 LKL 0.023443128913640976
epoch 10605 loss -0.22349971532821655 LR -0.24699172377586365 LKL 0.023492010310292244
epoch 10606 loss -0.1930520236492157 LR -0.21657408773899078 LKL 0.023522062227129936
epoch 10607 loss -0.13805101811885834 LR -0.1617024838924408 LKL 0.023651469498872757
epoch 10608 loss -0.1752864122390747 LR -0.19892346858978271 LKL 0.02363704890012741
epoch 10609 loss -0.1218005120754242 LR -0.14533820748329163 LKL 0.023537693545222282
epoch 10610 loss -0.21417684853076935 LR -0.2377285361289978 LKL 0.023551689460873604
epoch 10611 loss -0.18234272301197052 LR -0.20599283277988434 LKL 0.023650115355849266
epoch 10612 loss -0.06378743797540665 LR -0.08723365515470505 LKL 0.0234462171792984
epoch 10613 loss -0.14953595399856567 LR -0.172985360

epoch 10698 loss -0.10177892446517944 LR -0.12550416588783264 LKL 0.023725241422653198
epoch 10699 loss -0.17282599210739136 LR -0.19649386405944824 LKL 0.023667870089411736
epoch 10700 loss -0.12359803915023804 LR -0.14703649282455444 LKL 0.023438453674316406
49
epoch 10701 loss -0.15634571015834808 LR -0.17991900444030762 LKL 0.023573296144604683


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 10702 loss -0.13605256378650665 LR -0.15967515110969543 LKL 0.02362259104847908
epoch 10703 loss -0.13262063264846802 LR -0.15616941452026367 LKL 0.023548778146505356
epoch 10704 loss -0.11416205763816833 LR -0.13761503994464874 LKL 0.02345297858119011
epoch 10705 loss -0.16840144991874695 LR -0.19217991828918457 LKL 0.02377847395837307
epoch 10706 loss -0.14474545419216156 LR -0.16828730702400208 LKL 0.023541850969195366
epoch 10707 loss -0.12042661011219025 LR -0.14392916858196259 LKL 0.02350255846977234
epoch 10708 loss -0.09052862226963043 LR -0.11413340270519257 LKL 0.023604776710271835
epoch 10709 loss -0.14903119206428528 LR -0.17287734150886536 LKL 0.02384614199399948
epoch 10710 loss -0.14858967065811157 LR -0.17223265767097473 LKL 0.023642990738153458
epoch 10711 loss -0.08264564722776413 LR -0.10631691664457321 LKL 0.023671269416809082
epoch 10712 loss -0.13411778211593628 LR -0.15769252181053162 LKL 0.023574737831950188
epoch 10713 loss -0.11039349436759949 LR -0.1339

epoch 10799 loss -0.1696016639471054 LR -0.19326695799827576 LKL 0.023665297776460648
epoch 10800 loss -0.18857981264591217 LR -0.21230429410934448 LKL 0.02372448518872261
91


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 10801 loss -0.20272983610630035 LR -0.22647792100906372 LKL 0.023748084902763367
epoch 10802 loss -0.1494060754776001 LR -0.1731206476688385 LKL 0.023714572191238403
epoch 10803 loss -0.12796707451343536 LR -0.1515858769416809 LKL 0.023618798702955246
epoch 10804 loss -0.13722074031829834 LR -0.1607750654220581 LKL 0.023554326966404915
epoch 10805 loss -0.16417253017425537 LR -0.18781210482120514 LKL 0.023639578372240067
epoch 10806 loss -0.07767409831285477 LR -0.10128496587276459 LKL 0.02361086942255497
epoch 10807 loss -0.1927679479122162 LR -0.21643111109733582 LKL 0.023663166910409927
epoch 10808 loss -0.17854754626750946 LR -0.2020428627729416 LKL 0.02349531278014183
epoch 10809 loss -0.17928460240364075 LR -0.2028580605983734 LKL 0.023573454469442368
epoch 10810 loss -0.14702889323234558 LR -0.17067992687225342 LKL 0.02365102805197239
epoch 10811 loss -0.18332014977931976 LR -0.20704546570777893 LKL 0.02372531220316887
epoch 10812 loss -0.1837550848722458 LR -0.20739437639

epoch 10899 loss -0.22457125782966614 LR -0.24833902716636658 LKL 0.02376776933670044
epoch 10900 loss -0.0855250433087349 LR -0.10928486287593842 LKL 0.02375982142984867
53
epoch 10901 loss -0.17780978977680206 LR -0.2016288787126541 LKL 0.0238190945237875


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 10902 loss -0.18138465285301208 LR -0.2051527053117752 LKL 0.02376805804669857
epoch 10903 loss -0.21366150677204132 LR -0.2374267578125 LKL 0.02376525104045868
epoch 10904 loss -0.1504119336605072 LR -0.17421436309814453 LKL 0.02380242384970188
epoch 10905 loss -0.14094184339046478 LR -0.164703369140625 LKL 0.02376152016222477
epoch 10906 loss -0.15621903538703918 LR -0.17985853552818298 LKL 0.023639505729079247
epoch 10907 loss -0.2012445479631424 LR -0.22510215640068054 LKL 0.023857606574892998
epoch 10908 loss -0.10857751965522766 LR -0.13226158916950226 LKL 0.023684067651629448
epoch 10909 loss -0.17587852478027344 LR -0.19967791438102722 LKL 0.023799395188689232
epoch 10910 loss -0.19051972031593323 LR -0.2143615037202835 LKL 0.02384178712964058
epoch 10911 loss -0.15841475129127502 LR -0.18214452266693115 LKL 0.02372976578772068
epoch 10912 loss -0.21211940050125122 LR -0.23592591285705566 LKL 0.023806516081094742
epoch 10913 loss -0.21745817363262177 LR -0.241375625133514

epoch 10998 loss -0.20779454708099365 LR -0.23180824518203735 LKL 0.024013705551624298
epoch 10999 loss -0.16998812556266785 LR -0.19386893510818481 LKL 0.023880809545516968
epoch 11000 loss -0.10719329863786697 LR -0.131073459982872 LKL 0.023880161345005035
58
epoch 11001 loss -0.17415136098861694 LR -0.19803491234779358 LKL 0.023883547633886337


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 11002 loss -0.16067086160182953 LR -0.18447327613830566 LKL 0.023802408948540688
epoch 11003 loss -0.11483228951692581 LR -0.1387459933757782 LKL 0.023913705721497536
epoch 11004 loss -0.23203808069229126 LR -0.25607648491859436 LKL 0.024038400501012802
epoch 11005 loss -0.17932848632335663 LR -0.20336824655532837 LKL 0.024039756506681442
epoch 11006 loss -0.17053259909152985 LR -0.19457629323005676 LKL 0.024043697863817215
epoch 11007 loss -0.14045356214046478 LR -0.1644115447998047 LKL 0.023957978934049606
epoch 11008 loss -0.1819261908531189 LR -0.20599785447120667 LKL 0.024071671068668365
epoch 11009 loss -0.1741783618927002 LR -0.19817762076854706 LKL 0.02399926260113716
epoch 11010 loss -0.17701862752437592 LR -0.2009207010269165 LKL 0.023902077227830887
epoch 11011 loss -0.20654703676700592 LR -0.2304854393005371 LKL 0.023938408121466637
epoch 11012 loss -0.11333277821540833 LR -0.13704676926136017 LKL 0.023713989183306694
epoch 11013 loss -0.16654062271118164 LR -0.190346

epoch 11098 loss -0.21367886662483215 LR -0.2377394139766693 LKL 0.024060552939772606
epoch 11099 loss -0.16391508281230927 LR -0.1879146546125412 LKL 0.023999573662877083
epoch 11100 loss -0.2714356780052185 LR -0.2954190969467163 LKL 0.023983405902981758
53
epoch 11101 loss -0.31442171335220337 LR -0.3384453058242798 LKL 0.02402358129620552


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 11102 loss -0.23125110566616058 LR -0.25523218512535095 LKL 0.02398107573390007
epoch 11103 loss -0.16442525386810303 LR -0.1884084939956665 LKL 0.023983240127563477
epoch 11104 loss -0.1728912591934204 LR -0.19652937352657318 LKL 0.023638106882572174
epoch 11105 loss -0.21321266889572144 LR -0.23725950717926025 LKL 0.02404683828353882
epoch 11106 loss -0.15207910537719727 LR -0.17607493698596954 LKL 0.023995837196707726
epoch 11107 loss -0.2015925645828247 LR -0.2254987508058548 LKL 0.023906180635094643
epoch 11108 loss -0.17474883794784546 LR -0.198792964220047 LKL 0.02404412440955639
epoch 11109 loss -0.26814982295036316 LR -0.2921941578388214 LKL 0.024044327437877655
epoch 11110 loss -0.2173253446817398 LR -0.24126416444778442 LKL 0.023938823491334915
epoch 11111 loss -0.1911163330078125 LR -0.21506783366203308 LKL 0.023951508104801178
epoch 11112 loss -0.13574667274951935 LR -0.15957924723625183 LKL 0.023832572624087334
epoch 11113 loss -0.17825260758399963 LR -0.20225283503

epoch 11199 loss -0.2230975329875946 LR -0.247091606259346 LKL 0.023994071409106255
epoch 11200 loss -0.19458553194999695 LR -0.21862997114658356 LKL 0.02404443733394146
118
epoch 11201 loss -0.16943980753421783 LR -0.19343982636928558 LKL 0.024000020697712898


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 11202 loss -0.181248277425766 LR -0.20512717962265015 LKL 0.023878896608948708
epoch 11203 loss -0.10037105530500412 LR -0.12423180043697357 LKL 0.023860743269324303
epoch 11204 loss -0.17880059778690338 LR -0.20279589295387268 LKL 0.023995291441679
epoch 11205 loss -0.1544637382030487 LR -0.17850181460380554 LKL 0.02403806895017624
epoch 11206 loss -0.1833634227514267 LR -0.20738425850868225 LKL 0.024020839482545853
epoch 11207 loss -0.15856580436229706 LR -0.18248124420642853 LKL 0.02391543611884117
epoch 11208 loss -0.17576515674591064 LR -0.1997099220752716 LKL 0.02394476719200611
epoch 11209 loss -0.18183180689811707 LR -0.20587724447250366 LKL 0.024045445024967194
epoch 11210 loss -0.16269518435001373 LR -0.18673157691955566 LKL 0.02403639629483223
epoch 11211 loss -0.205607071518898 LR -0.2296988070011139 LKL 0.024091731756925583
epoch 11212 loss -0.184029221534729 LR -0.20800118148326874 LKL 0.02397196739912033
epoch 11213 loss -0.15574215352535248 LR -0.17966094613075256

epoch 11299 loss -0.17497412860393524 LR -0.19902829825878143 LKL 0.024054164066910744
epoch 11300 loss -0.13805174827575684 LR -0.16210448741912842 LKL 0.02405274473130703
98
epoch 11301 loss -0.12475937604904175 LR -0.14867839217185974 LKL 0.02391901984810829


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 11302 loss -0.1239408627152443 LR -0.14784134924411774 LKL 0.023900486528873444
epoch 11303 loss -0.2041589766740799 LR -0.22807064652442932 LKL 0.02391166426241398
epoch 11304 loss -0.18865139782428741 LR -0.21276330947875977 LKL 0.024111906066536903
epoch 11305 loss -0.13230732083320618 LR -0.15615078806877136 LKL 0.023843469098210335
epoch 11306 loss -0.16833586990833282 LR -0.19243600964546204 LKL 0.024100137874484062
epoch 11307 loss -0.14831188321113586 LR -0.17220750451087952 LKL 0.023895615711808205
epoch 11308 loss -0.18744467198848724 LR -0.21152690052986145 LKL 0.024082230404019356
epoch 11309 loss -0.14801809191703796 LR -0.17197445034980774 LKL 0.023956356570124626
epoch 11310 loss -0.15771085023880005 LR -0.18165141344070435 LKL 0.023940566927194595
epoch 11311 loss -0.1611337959766388 LR -0.18495036661624908 LKL 0.023816566914319992
epoch 11312 loss -0.2337392121553421 LR -0.25765350461006165 LKL 0.02391429804265499
epoch 11313 loss -0.19769887626171112 LR -0.22150

epoch 11397 loss -0.16295164823532104 LR -0.18685269355773926 LKL 0.023901037871837616
epoch 11398 loss -0.214302659034729 LR -0.23850497603416443 LKL 0.02420232445001602
epoch 11399 loss -0.1691572666168213 LR -0.19322064518928528 LKL 0.02406337670981884
epoch 11400 loss -0.1257491409778595 LR -0.14975231885910034 LKL 0.024003179743885994
67


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 11401 loss -0.14984019100666046 LR -0.17392902076244354 LKL 0.02408883534371853
epoch 11402 loss -0.18761733174324036 LR -0.21158882975578308 LKL 0.023971494287252426
epoch 11403 loss -0.16836673021316528 LR -0.19253064692020416 LKL 0.02416391670703888
epoch 11404 loss -0.17566783726215363 LR -0.19969330728054047 LKL 0.02402547188103199
epoch 11405 loss -0.24493862688541412 LR -0.2691870331764221 LKL 0.024248400703072548
epoch 11406 loss -0.16073377430438995 LR -0.18484771251678467 LKL 0.024113934487104416
epoch 11407 loss -0.18378184735774994 LR -0.2078465223312378 LKL 0.024064674973487854
epoch 11408 loss -0.23947632312774658 LR -0.2634882628917694 LKL 0.02401193603873253
epoch 11409 loss -0.2002452164888382 LR -0.22440853714942932 LKL 0.024163315072655678
epoch 11410 loss -0.1700342744588852 LR -0.19406959414482117 LKL 0.024035319685935974
epoch 11411 loss -0.1679346114397049 LR -0.19211852550506592 LKL 0.02418391965329647
epoch 11412 loss -0.18342357873916626 LR -0.2073046863

epoch 11499 loss -0.1715143471956253 LR -0.19571006298065186 LKL 0.02419571578502655
epoch 11500 loss -0.15175023674964905 LR -0.1756291687488556 LKL 0.023878930136561394
74
epoch 11501 loss -0.22285686433315277 LR -0.24695080518722534 LKL 0.02409394085407257


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 11502 loss -0.16557562351226807 LR -0.1896647810935974 LKL 0.024089151993393898
epoch 11503 loss -0.19099700450897217 LR -0.21503019332885742 LKL 0.02403319627046585
epoch 11504 loss -0.189411923289299 LR -0.21373111009597778 LKL 0.024319181218743324
epoch 11505 loss -0.127061128616333 LR -0.1511061042547226 LKL 0.02404497005045414
epoch 11506 loss -0.18816067278385162 LR -0.2123418152332306 LKL 0.02418113686144352
epoch 11507 loss -0.17161230742931366 LR -0.19580334424972534 LKL 0.024191034957766533
epoch 11508 loss -0.22061479091644287 LR -0.24470630288124084 LKL 0.024091510102152824
epoch 11509 loss -0.19466787576675415 LR -0.21875731647014618 LKL 0.02408943697810173
epoch 11510 loss -0.18599897623062134 LR -0.21004538238048553 LKL 0.0240463986992836
epoch 11511 loss -0.1952601969242096 LR -0.21940287947654724 LKL 0.024142678827047348
epoch 11512 loss -0.15304118394851685 LR -0.17690029740333557 LKL 0.02385910600423813
epoch 11513 loss -0.202928826212883 LR -0.2269854247570037

epoch 11598 loss -0.1910904049873352 LR -0.21519535779953003 LKL 0.024104958400130272
epoch 11599 loss -0.18192802369594574 LR -0.20606601238250732 LKL 0.024137992411851883
epoch 11600 loss -0.17458176612854004 LR -0.19863496720790863 LKL 0.024053193628787994
66
epoch 11601 loss -0.15147598087787628 LR -0.17545779049396515 LKL 0.023981807753443718


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 11602 loss -0.14889758825302124 LR -0.17303046584129333 LKL 0.024132872000336647
epoch 11603 loss -0.19247865676879883 LR -0.2166065275669098 LKL 0.02412787824869156
epoch 11604 loss -0.1756887435913086 LR -0.1997349113225937 LKL 0.0240461602807045
epoch 11605 loss -0.12590152025222778 LR -0.15001516044139862 LKL 0.024113645777106285
epoch 11606 loss -0.1702151894569397 LR -0.19416376948356628 LKL 0.02394857443869114
epoch 11607 loss -0.1549372524023056 LR -0.17906510829925537 LKL 0.02412785217165947
epoch 11608 loss -0.20074713230133057 LR -0.22479644417762756 LKL 0.024049311876296997
epoch 11609 loss -0.17511653900146484 LR -0.19926735758781433 LKL 0.024150824174284935
epoch 11610 loss -0.1465325504541397 LR -0.17057007551193237 LKL 0.02403753064572811
epoch 11611 loss -0.18785880506038666 LR -0.2118198275566101 LKL 0.02396101877093315
epoch 11612 loss -0.19169549643993378 LR -0.21577557921409607 LKL 0.02408008836209774
epoch 11613 loss -0.18265710771083832 LR -0.20682558417320

epoch 11700 loss -0.1571165770292282 LR -0.1812049299478531 LKL 0.024088358506560326
41


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 11701 loss -0.1600612998008728 LR -0.1840967983007431 LKL 0.0240354984998703
epoch 11702 loss -0.18689146637916565 LR -0.2109251618385315 LKL 0.024033701047301292
epoch 11703 loss -0.23449218273162842 LR -0.2586972117424011 LKL 0.024205029010772705
epoch 11704 loss -0.2654513716697693 LR -0.2896292209625244 LKL 0.02417784184217453
epoch 11705 loss -0.1855415254831314 LR -0.2096431404352188 LKL 0.0241016186773777
epoch 11706 loss -0.16854830086231232 LR -0.19280530512332916 LKL 0.024257002398371696
epoch 11707 loss -0.17160086333751678 LR -0.19587811827659607 LKL 0.024277251213788986
epoch 11708 loss -0.21151259541511536 LR -0.23578432202339172 LKL 0.024271730333566666
epoch 11709 loss -0.22991067171096802 LR -0.25425243377685547 LKL 0.024341754615306854
epoch 11710 loss -0.21724683046340942 LR -0.2414693832397461 LKL 0.02422255650162697
epoch 11711 loss -0.1745121031999588 LR -0.19870054721832275 LKL 0.024188438430428505
epoch 11712 loss -0.2004648745059967 LR -0.2246247231960296

epoch 11797 loss -0.10884599387645721 LR -0.13296079635620117 LKL 0.024114806205034256
epoch 11798 loss -0.13563457131385803 LR -0.16018053889274597 LKL 0.02454596571624279
epoch 11799 loss -0.23812344670295715 LR -0.2626721262931824 LKL 0.02454868145287037
epoch 11800 loss -0.12361333519220352 LR -0.14789661765098572 LKL 0.024283284321427345
104
epoch 11801 loss -0.15313652157783508 LR -0.17737090587615967 LKL 0.024234382435679436


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 11802 loss -0.18307752907276154 LR -0.2072739601135254 LKL 0.024196425452828407
epoch 11803 loss -0.20482231676578522 LR -0.2291310429573059 LKL 0.02430872432887554
epoch 11804 loss -0.22420638799667358 LR -0.2485412359237671 LKL 0.024334844201803207
epoch 11805 loss -0.1438894122838974 LR -0.16815216839313507 LKL 0.02426275610923767
epoch 11806 loss -0.14130322635173798 LR -0.16552646458148956 LKL 0.024223236367106438
epoch 11807 loss -0.1678175926208496 LR -0.19202861189842224 LKL 0.024211017414927483
epoch 11808 loss -0.13810758292675018 LR -0.162267804145813 LKL 0.024160226806998253
epoch 11809 loss -0.21030813455581665 LR -0.23479193449020386 LKL 0.024483801797032356
epoch 11810 loss -0.12065944075584412 LR -0.14489781856536865 LKL 0.024238375946879387
epoch 11811 loss -0.20280469954013824 LR -0.2271963357925415 LKL 0.02439163625240326
epoch 11812 loss -0.18260574340820312 LR -0.20688748359680176 LKL 0.024281742051243782
epoch 11813 loss -0.1741383671760559 LR -0.19832590222

epoch 11898 loss -0.23875358700752258 LR -0.26291021704673767 LKL 0.024156637489795685
epoch 11899 loss -0.19624239206314087 LR -0.2204361855983734 LKL 0.024193791672587395
epoch 11900 loss -0.24772904813289642 LR -0.2719917595386505 LKL 0.02426270581781864
83
epoch 11901 loss -0.2389373779296875 LR -0.2632802724838257 LKL 0.024342898279428482


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 11902 loss -0.2017459273338318 LR -0.22591683268547058 LKL 0.024170897901058197
epoch 11903 loss -0.20562852919101715 LR -0.2298736274242401 LKL 0.024245094507932663
epoch 11904 loss -0.135700985789299 LR -0.1599183976650238 LKL 0.024217406287789345
epoch 11905 loss -0.21913844347000122 LR -0.24356576800346375 LKL 0.024427317082881927
epoch 11906 loss -0.14138686656951904 LR -0.16555067896842957 LKL 0.024163808673620224
epoch 11907 loss -0.22173479199409485 LR -0.24608875811100006 LKL 0.02435397356748581
epoch 11908 loss -0.182878315448761 LR -0.20715288817882538 LKL 0.024274572730064392
epoch 11909 loss -0.15793779492378235 LR -0.18222957849502563 LKL 0.02429177798330784
epoch 11910 loss -0.2315693497657776 LR -0.25599271059036255 LKL 0.02442336641252041
epoch 11911 loss -0.1684151589870453 LR -0.19271810352802277 LKL 0.02430294081568718
epoch 11912 loss -0.14379025995731354 LR -0.16808924078941345 LKL 0.024298984557390213
epoch 11913 loss -0.12602129578590393 LR -0.150264188647

epoch 11998 loss -0.1904086023569107 LR -0.21461978554725647 LKL 0.024211177602410316
epoch 11999 loss -0.17314784228801727 LR -0.19722247123718262 LKL 0.024074632674455643
epoch 12000 loss -0.168302521109581 LR -0.19238954782485962 LKL 0.024087026715278625
65
epoch 12001 loss -0.13933244347572327 LR -0.16356903314590454 LKL 0.024236593395471573


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 12002 loss -0.17478437721729279 LR -0.19909700751304626 LKL 0.02431262843310833
epoch 12003 loss -0.1373436152935028 LR -0.16150736808776855 LKL 0.024163760244846344
epoch 12004 loss -0.14307613670825958 LR -0.16734211146831512 LKL 0.02426597848534584
epoch 12005 loss -0.16244089603424072 LR -0.1867145299911499 LKL 0.024273626506328583
epoch 12006 loss -0.18598288297653198 LR -0.21010702848434448 LKL 0.024124139919877052
epoch 12007 loss -0.2060733288526535 LR -0.23032698035240173 LKL 0.02425365149974823
epoch 12008 loss -0.13304345309734344 LR -0.15729603171348572 LKL 0.024252580478787422
epoch 12009 loss -0.18588952720165253 LR -0.2102178931236267 LKL 0.02432836778461933
epoch 12010 loss -0.24946469068527222 LR -0.273769348859787 LKL 0.024304665625095367
epoch 12011 loss -0.19591273367404938 LR -0.22018155455589294 LKL 0.02426881529390812
epoch 12012 loss -0.1397722214460373 LR -0.16384930908679962 LKL 0.02407708391547203
epoch 12013 loss -0.2007802426815033 LR -0.2250335812568

epoch 12100 loss -0.19076602160930634 LR -0.21507564187049866 LKL 0.02430962398648262
60


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 12101 loss -0.07170739769935608 LR -0.09603451192378998 LKL 0.024327116087079048
epoch 12102 loss -0.2174903303384781 LR -0.24203312397003174 LKL 0.0245427954941988
epoch 12103 loss -0.21863514184951782 LR -0.24307814240455627 LKL 0.024442996829748154
epoch 12104 loss -0.1731492429971695 LR -0.197567418217659 LKL 0.02441817708313465
epoch 12105 loss -0.31023088097572327 LR -0.33476722240448 LKL 0.02453633025288582
epoch 12106 loss -0.18468505144119263 LR -0.2090688943862915 LKL 0.024383848533034325
epoch 12107 loss -0.23318421840667725 LR -0.2576010823249817 LKL 0.02441686950623989
epoch 12108 loss -0.1790534257888794 LR -0.203443706035614 LKL 0.02439027652144432
epoch 12109 loss -0.1853426694869995 LR -0.20985686779022217 LKL 0.02451419271528721
epoch 12110 loss -0.2157207876443863 LR -0.24016118049621582 LKL 0.02444038726389408
epoch 12111 loss -0.16106943786144257 LR -0.18543383479118347 LKL 0.024364393204450607
epoch 12112 loss -0.2354540228843689 LR -0.25998684763908386 LKL 

epoch 12197 loss -0.2801133692264557 LR -0.3047056794166565 LKL 0.024592315778136253
epoch 12198 loss -0.21594735980033875 LR -0.24039945006370544 LKL 0.024452095851302147
epoch 12199 loss -0.1578080803155899 LR -0.18236196041107178 LKL 0.02455388568341732
epoch 12200 loss -0.24671930074691772 LR -0.27129918336868286 LKL 0.024579890072345734
50


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 12201 loss -0.2019810825586319 LR -0.2264510691165924 LKL 0.024469982832670212
epoch 12202 loss -0.14362257719039917 LR -0.1679365187883377 LKL 0.024313945323228836
epoch 12203 loss -0.21092857420444489 LR -0.23544850945472717 LKL 0.02451993152499199
epoch 12204 loss -0.2515517771244049 LR -0.2760288715362549 LKL 0.024477099999785423
epoch 12205 loss -0.1371983289718628 LR -0.16151635348796844 LKL 0.0243180263787508
epoch 12206 loss -0.18595802783966064 LR -0.2103244811296463 LKL 0.024366457015275955
epoch 12207 loss -0.22213494777679443 LR -0.2465583086013794 LKL 0.024423368275165558
epoch 12208 loss -0.2148006558418274 LR -0.2392079085111618 LKL 0.02440726011991501
epoch 12209 loss -0.22034595906734467 LR -0.2448761761188507 LKL 0.024530217051506042
epoch 12210 loss -0.16305120289325714 LR -0.1873883306980133 LKL 0.024337131530046463
epoch 12211 loss -0.2237049788236618 LR -0.24830317497253418 LKL 0.024598198011517525
epoch 12212 loss -0.21149945259094238 LR -0.2359775900840759

epoch 12298 loss -0.20450234413146973 LR -0.22900348901748657 LKL 0.024501152336597443
epoch 12299 loss -0.21082450449466705 LR -0.2354639172554016 LKL 0.02463940903544426
epoch 12300 loss -0.15677636861801147 LR -0.18148016929626465 LKL 0.024703798815608025
70
epoch 12301 loss -0.22796186804771423 LR -0.25264212489128113 LKL 0.024680258706212044


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 12302 loss -0.18157023191452026 LR -0.2061755508184433 LKL 0.024605317041277885
epoch 12303 loss -0.2546229660511017 LR -0.2793188989162445 LKL 0.024695921689271927
epoch 12304 loss -0.17228810489177704 LR -0.19669181108474731 LKL 0.024403702467679977
epoch 12305 loss -0.17213337123394012 LR -0.19660991430282593 LKL 0.024476544931530952
epoch 12306 loss -0.22770600020885468 LR -0.2523317039012909 LKL 0.024625707417726517
epoch 12307 loss -0.19647952914237976 LR -0.22109851241111755 LKL 0.024618981406092644
epoch 12308 loss -0.22576391696929932 LR -0.25049901008605957 LKL 0.02473510056734085
epoch 12309 loss -0.2074350118637085 LR -0.23206722736358643 LKL 0.024632208049297333
epoch 12310 loss -0.19152991473674774 LR -0.21609553694725037 LKL 0.024565627798438072
epoch 12311 loss -0.14139026403427124 LR -0.1659715622663498 LKL 0.0245813000947237
epoch 12312 loss -0.17092618346214294 LR -0.19552494585514069 LKL 0.024598754942417145
epoch 12313 loss -0.2318955957889557 LR -0.256623834

epoch 12400 loss -0.09701117873191833 LR -0.12172245979309082 LKL 0.024711277335882187
71


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 12401 loss -0.15694589912891388 LR -0.18183360993862152 LKL 0.024887705221772194
epoch 12402 loss -0.1393081545829773 LR -0.16403935849666595 LKL 0.024731207638978958
epoch 12403 loss -0.2027595192193985 LR -0.22761467099189758 LKL 0.024855151772499084
epoch 12404 loss -0.11991684883832932 LR -0.14462020993232727 LKL 0.024703362956643105
epoch 12405 loss -0.16582974791526794 LR -0.19041474163532257 LKL 0.024584999307990074
epoch 12406 loss -0.18326415121555328 LR -0.20818662643432617 LKL 0.024922480806708336
epoch 12407 loss -0.17886976897716522 LR -0.20359420776367188 LKL 0.02472444251179695
epoch 12408 loss -0.16243498027324677 LR -0.18728825449943542 LKL 0.024853277951478958
epoch 12409 loss -0.2166799157857895 LR -0.2413264513015747 LKL 0.024646537378430367
epoch 12410 loss -0.18392835557460785 LR -0.20874185860157013 LKL 0.02481350675225258
epoch 12411 loss -0.1437465250492096 LR -0.16844326257705688 LKL 0.024696743115782738
epoch 12412 loss -0.2121095359325409 LR -0.2369838

epoch 12498 loss -0.17493468523025513 LR -0.19986769556999207 LKL 0.02493300288915634
epoch 12499 loss -0.16678524017333984 LR -0.19173792004585266 LKL 0.024952678009867668
epoch 12500 loss -0.17641973495483398 LR -0.20120933651924133 LKL 0.02478959411382675
57
epoch 12501 loss -0.23494458198547363 LR -0.2597515285015106 LKL 0.024806946516036987


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 12502 loss -0.15980300307273865 LR -0.1846083104610443 LKL 0.02480531483888626
epoch 12503 loss -0.1819525510072708 LR -0.2067922055721283 LKL 0.024839648976922035
epoch 12504 loss -0.12686963379383087 LR -0.15168291330337524 LKL 0.024813281372189522
epoch 12505 loss -0.1317552626132965 LR -0.1565951704978943 LKL 0.024839913472533226
epoch 12506 loss -0.13506710529327393 LR -0.15979808568954468 LKL 0.0247309859842062
epoch 12507 loss -0.17627698183059692 LR -0.2010548710823059 LKL 0.024777885526418686
epoch 12508 loss -0.22823481261730194 LR -0.2531604468822479 LKL 0.02492563985288143
epoch 12509 loss -0.19379937648773193 LR -0.2185715287923813 LKL 0.024772144854068756
epoch 12510 loss -0.18052944540977478 LR -0.20554891228675842 LKL 0.025019468739628792
epoch 12511 loss -0.19550153613090515 LR -0.22039583325386047 LKL 0.02489430457353592
epoch 12512 loss -0.14177581667900085 LR -0.16668041050434113 LKL 0.024904586374759674
epoch 12513 loss -0.2078133523464203 LR -0.2325856983661

epoch 12600 loss -0.2399376481771469 LR -0.2647615075111389 LKL 0.024823859333992004
67


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 12601 loss -0.22279657423496246 LR -0.2476777881383896 LKL 0.024881215766072273
epoch 12602 loss -0.1945130079984665 LR -0.21921294927597046 LKL 0.02469993755221367
epoch 12603 loss -0.23281437158584595 LR -0.2577151358127594 LKL 0.0249007660895586
epoch 12604 loss -0.10756631195545197 LR -0.1322040855884552 LKL 0.024637777358293533
epoch 12605 loss -0.21502362191677094 LR -0.23983050882816315 LKL 0.02480689249932766
epoch 12606 loss -0.2274579405784607 LR -0.25231796503067017 LKL 0.024860020726919174
epoch 12607 loss -0.20719236135482788 LR -0.23209848999977112 LKL 0.02490612119436264
epoch 12608 loss -0.21146509051322937 LR -0.23641879856586456 LKL 0.024953706189990044
epoch 12609 loss -0.2642568349838257 LR -0.28909003734588623 LKL 0.024833206087350845
epoch 12610 loss -0.19003921747207642 LR -0.2148808091878891 LKL 0.024841593578457832
epoch 12611 loss -0.18106523156166077 LR -0.20583480596542358 LKL 0.024769581854343414
epoch 12612 loss -0.1485554575920105 LR -0.173341110348

epoch 12697 loss -0.15035295486450195 LR -0.17515072226524353 LKL 0.02479776367545128
epoch 12698 loss -0.20890049636363983 LR -0.2337937355041504 LKL 0.02489323727786541
epoch 12699 loss -0.23376935720443726 LR -0.2587119936943054 LKL 0.02494264394044876
epoch 12700 loss -0.137396901845932 LR -0.1621505320072174 LKL 0.02475363202393055
46


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 12701 loss -0.18980447947978973 LR -0.21467575430870056 LKL 0.024871278554201126
epoch 12702 loss -0.22949928045272827 LR -0.25439852476119995 LKL 0.024899251759052277
epoch 12703 loss -0.1376831978559494 LR -0.16254554688930511 LKL 0.024862349033355713
epoch 12704 loss -0.2009776532649994 LR -0.2259637713432312 LKL 0.02498612552881241
epoch 12705 loss -0.19390393793582916 LR -0.218870609998703 LKL 0.024966666474938393
epoch 12706 loss -0.22124721109867096 LR -0.2462354302406311 LKL 0.024988224729895592
epoch 12707 loss -0.2647789418697357 LR -0.28971803188323975 LKL 0.024939103052020073
epoch 12708 loss -0.17210112512111664 LR -0.19691041111946106 LKL 0.024809280410408974
epoch 12709 loss -0.1277788281440735 LR -0.15245421230793 LKL 0.024675382301211357
epoch 12710 loss -0.2523617148399353 LR -0.2774173617362976 LKL 0.025055641308426857
epoch 12711 loss -0.11372265219688416 LR -0.13860517740249634 LKL 0.02488252893090248
epoch 12712 loss -0.18312673270702362 LR -0.20790553092956

epoch 12797 loss -0.18542423844337463 LR -0.21031317114830017 LKL 0.024888930842280388
epoch 12798 loss -0.14459975063800812 LR -0.16946274042129517 LKL 0.0248629879206419
epoch 12799 loss -0.21708475053310394 LR -0.24200886487960815 LKL 0.02492411807179451
epoch 12800 loss -0.20733512938022614 LR -0.23224890232086182 LKL 0.02491377666592598
55


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 12801 loss -0.15311439335346222 LR -0.17811425030231476 LKL 0.02499985694885254
epoch 12802 loss -0.22024504840373993 LR -0.2452211230993271 LKL 0.024976080283522606
epoch 12803 loss -0.15451134741306305 LR -0.17933735251426697 LKL 0.02482600510120392
epoch 12804 loss -0.22760097682476044 LR -0.25248390436172485 LKL 0.024882923811674118
epoch 12805 loss -0.25171294808387756 LR -0.27673473954200745 LKL 0.025021802634000778
epoch 12806 loss -0.21563376486301422 LR -0.24064227938652039 LKL 0.025008514523506165
epoch 12807 loss -0.22149580717086792 LR -0.2463589608669281 LKL 0.02486315369606018
epoch 12808 loss -0.23389291763305664 LR -0.25875669717788696 LKL 0.02486378140747547
epoch 12809 loss -0.19090105593204498 LR -0.21565067768096924 LKL 0.024749625474214554
epoch 12810 loss -0.19258251786231995 LR -0.2174336463212967 LKL 0.024851126596331596
epoch 12811 loss -0.19507069885730743 LR -0.22011688351631165 LKL 0.025046182796359062
epoch 12812 loss -0.29711148142814636 LR -0.322174

epoch 12898 loss -0.23329293727874756 LR -0.2581915855407715 LKL 0.02489864081144333
epoch 12899 loss -0.20447486639022827 LR -0.22948497533798218 LKL 0.025010105222463608
epoch 12900 loss -0.18390507996082306 LR -0.2087491750717163 LKL 0.0248440932482481
70
epoch 12901 loss -0.17017439007759094 LR -0.19516751170158386 LKL 0.02499312162399292


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 12902 loss -0.2572236955165863 LR -0.2821747362613678 LKL 0.024951040744781494
epoch 12903 loss -0.18012341856956482 LR -0.20501232147216797 LKL 0.0248888973146677
epoch 12904 loss -0.21240626275539398 LR -0.2373596727848053 LKL 0.024953408166766167
epoch 12905 loss -0.21784141659736633 LR -0.24281971156597137 LKL 0.02497829869389534
epoch 12906 loss -0.19039450585842133 LR -0.21525675058364868 LKL 0.024862239137291908
epoch 12907 loss -0.15582482516765594 LR -0.1805858016014099 LKL 0.024760974571108818
epoch 12908 loss -0.1835685521364212 LR -0.20841121673583984 LKL 0.02484266646206379
epoch 12909 loss -0.22481125593185425 LR -0.24970175325870514 LKL 0.02489049732685089
epoch 12910 loss -0.2395309954881668 LR -0.26441532373428345 LKL 0.02488432265818119
epoch 12911 loss -0.23766282200813293 LR -0.2626122534275055 LKL 0.024949438869953156
epoch 12912 loss -0.19417482614517212 LR -0.21900007128715515 LKL 0.02482524886727333
epoch 12913 loss -0.15275691449642181 LR -0.1774644404649

epoch 12998 loss -0.21829627454280853 LR -0.24345889687538147 LKL 0.025162622332572937
epoch 12999 loss -0.183987095952034 LR -0.20898506045341492 LKL 0.02499796263873577
epoch 13000 loss -0.2645236849784851 LR -0.28964880108833313 LKL 0.025125103071331978
48


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 13001 loss -0.14437417685985565 LR -0.1693199723958969 LKL 0.024945799261331558
epoch 13002 loss -0.1939096599817276 LR -0.2189595252275467 LKL 0.025049859657883644
epoch 13003 loss -0.2671020030975342 LR -0.2922704815864563 LKL 0.025168465450406075
epoch 13004 loss -0.24662744998931885 LR -0.2717445194721222 LKL 0.025117063894867897
epoch 13005 loss -0.26170286536216736 LR -0.2868485152721405 LKL 0.0251456405967474
epoch 13006 loss -0.20352616906166077 LR -0.22859619557857513 LKL 0.02507002279162407
epoch 13007 loss -0.224028542637825 LR -0.2490917146205902 LKL 0.02506316639482975
epoch 13008 loss -0.25390875339508057 LR -0.27901124954223633 LKL 0.02510250173509121
epoch 13009 loss -0.2948306202888489 LR -0.32004961371421814 LKL 0.025218980386853218
epoch 13010 loss -0.23820871114730835 LR -0.2633519470691681 LKL 0.02514323778450489
epoch 13011 loss -0.21169209480285645 LR -0.2367735505104065 LKL 0.025081459432840347
epoch 13012 loss -0.16563697159290314 LR -0.19053694605827332 

epoch 13099 loss -0.21314437687397003 LR -0.2381531298160553 LKL 0.025008751079440117
epoch 13100 loss -0.24551844596862793 LR -0.270620733499527 LKL 0.025102291256189346
46
epoch 13101 loss -0.20808620750904083 LR -0.233122318983078 LKL 0.02503611333668232


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 13102 loss -0.22107836604118347 LR -0.24613332748413086 LKL 0.02505495771765709
epoch 13103 loss -0.2831862270832062 LR -0.3082426190376282 LKL 0.025056393817067146
epoch 13104 loss -0.18425726890563965 LR -0.2093524932861328 LKL 0.025095220655202866
epoch 13105 loss -0.21816407144069672 LR -0.2430720329284668 LKL 0.02490796148777008
epoch 13106 loss -0.22632111608982086 LR -0.2512194514274597 LKL 0.024898329749703407
epoch 13107 loss -0.27531322836875916 LR -0.3004380464553833 LKL 0.0251248087733984
epoch 13108 loss -0.25141096115112305 LR -0.27640005946159363 LKL 0.024989092722535133
epoch 13109 loss -0.23555946350097656 LR -0.2605286240577698 LKL 0.024969162419438362
epoch 13110 loss -0.18086197972297668 LR -0.20581263303756714 LKL 0.024950655177235603
epoch 13111 loss -0.23558130860328674 LR -0.26070868968963623 LKL 0.02512737736105919
epoch 13112 loss -0.19794705510139465 LR -0.22299814224243164 LKL 0.025051094591617584
epoch 13113 loss -0.22643591463565826 LR -0.25147783756

epoch 13199 loss -0.2419225424528122 LR -0.26709359884262085 LKL 0.025171060115098953
epoch 13200 loss -0.2257925271987915 LR -0.25092947483062744 LKL 0.02513694390654564
122
epoch 13201 loss -0.2275153249502182 LR -0.25257816910743713 LKL 0.025062838569283485


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 13202 loss -0.19358375668525696 LR -0.21867746114730835 LKL 0.02509370632469654
epoch 13203 loss -0.23262998461723328 LR -0.25785475969314575 LKL 0.025224773213267326
epoch 13204 loss -0.2146073877811432 LR -0.2396826148033142 LKL 0.02507522702217102
epoch 13205 loss -0.18492713570594788 LR -0.2099514752626419 LKL 0.02502433769404888
epoch 13206 loss -0.264227956533432 LR -0.2894236147403717 LKL 0.025195667520165443
epoch 13207 loss -0.22156940400600433 LR -0.24672028422355652 LKL 0.025150885805487633
epoch 13208 loss -0.12837974727153778 LR -0.1534421145915985 LKL 0.025062372907996178
epoch 13209 loss -0.21794013679027557 LR -0.24318119883537292 LKL 0.025241056457161903
epoch 13210 loss -0.2205999195575714 LR -0.24555274844169617 LKL 0.024952836334705353
epoch 13211 loss -0.2550833821296692 LR -0.28024759888648987 LKL 0.025164222344756126
epoch 13212 loss -0.23050591349601746 LR -0.25568944215774536 LKL 0.025183534249663353
epoch 13213 loss -0.18474750220775604 LR -0.20977133512

epoch 13300 loss -0.20494431257247925 LR -0.23036392033100128 LKL 0.02541961334645748
86
epoch 13301 loss -0.2020324319601059 LR -0.22719940543174744 LKL 0.02516697719693184
epoch 13302 loss -0.14984828233718872 LR -0.17486527562141418 LKL 0.02501700073480606


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 13303 loss -0.19297200441360474 LR -0.21818460524082184 LKL 0.025212598964571953
epoch 13304 loss -0.21452733874320984 LR -0.23963677883148193 LKL 0.025109438225626945
epoch 13305 loss -0.1481502652168274 LR -0.17346879839897156 LKL 0.025318527594208717
epoch 13306 loss -0.2638716399669647 LR -0.2891819179058075 LKL 0.02531028538942337
epoch 13307 loss -0.21936474740505219 LR -0.24463899433612823 LKL 0.02527425065636635
epoch 13308 loss -0.19083832204341888 LR -0.2161387801170349 LKL 0.025300459936261177
epoch 13309 loss -0.18084952235221863 LR -0.20602793991565704 LKL 0.02517841011285782
epoch 13310 loss -0.27293872833251953 LR -0.29814451932907104 LKL 0.02520580217242241
epoch 13311 loss -0.24171939492225647 LR -0.26708340644836426 LKL 0.025364011526107788
epoch 13312 loss -0.2652020752429962 LR -0.29045552015304565 LKL 0.025253446772694588
epoch 13313 loss -0.23295851051807404 LR -0.25819873809814453 LKL 0.025240229442715645
epoch 13314 loss -0.23805665969848633 LR -0.26332676

epoch 13399 loss -0.18687976896762848 LR -0.21210192143917084 LKL 0.025222154334187508
epoch 13400 loss -0.1906576305627823 LR -0.21574363112449646 LKL 0.02508600242435932
61
epoch 13401 loss -0.23673778772354126 LR -0.2620112895965576 LKL 0.025273503735661507


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 13402 loss -0.2063155323266983 LR -0.2313874065876007 LKL 0.025071868672966957
epoch 13403 loss -0.21568982303142548 LR -0.24097424745559692 LKL 0.025284424424171448
epoch 13404 loss -0.16921600699424744 LR -0.1943761110305786 LKL 0.025160107761621475
epoch 13405 loss -0.2201732099056244 LR -0.24554313719272614 LKL 0.025369921699166298
epoch 13406 loss -0.29311779141426086 LR -0.3183828294277191 LKL 0.0252650398761034
epoch 13407 loss -0.13188475370407104 LR -0.15697786211967468 LKL 0.02509310096502304
epoch 13408 loss -0.11214302480220795 LR -0.1372210681438446 LKL 0.025078045204281807
epoch 13409 loss -0.16929271817207336 LR -0.19444787502288818 LKL 0.025155160576105118
epoch 13410 loss -0.16761833429336548 LR -0.19280092418193817 LKL 0.02518259361386299
epoch 13411 loss -0.2129754275083542 LR -0.23818597197532654 LKL 0.02521054819226265
epoch 13412 loss -0.17331330478191376 LR -0.1985301375389099 LKL 0.025216832756996155
epoch 13413 loss -0.192793071269989 LR -0.21793110668659

epoch 13498 loss -0.18996915221214294 LR -0.21521615982055664 LKL 0.025247005745768547
epoch 13499 loss -0.22628426551818848 LR -0.2514764070510864 LKL 0.025192148983478546
epoch 13500 loss -0.21733465790748596 LR -0.2425864338874817 LKL 0.02525176852941513
70
epoch 13501 loss -0.2755661606788635 LR -0.3008582592010498 LKL 0.025292085483670235


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 13502 loss -0.21934065222740173 LR -0.2445196807384491 LKL 0.02517903596162796
epoch 13503 loss -0.17014990746974945 LR -0.19529667496681213 LKL 0.025146767497062683
epoch 13504 loss -0.26111721992492676 LR -0.2864236533641815 LKL 0.02530643530189991
epoch 13505 loss -0.17543722689151764 LR -0.20065414905548096 LKL 0.025216925889253616
epoch 13506 loss -0.2589385211467743 LR -0.28411078453063965 LKL 0.025172259658575058
epoch 13507 loss -0.21402819454669952 LR -0.23933075368404388 LKL 0.02530256099998951
epoch 13508 loss -0.2057473361492157 LR -0.23110342025756836 LKL 0.02535608783364296
epoch 13509 loss -0.21396790444850922 LR -0.23916572332382202 LKL 0.025197813287377357
epoch 13510 loss -0.2824310064315796 LR -0.3078116774559021 LKL 0.025380657985806465
epoch 13511 loss -0.24910515546798706 LR -0.2743861973285675 LKL 0.025281036272644997
epoch 13512 loss -0.21580761671066284 LR -0.24095547199249268 LKL 0.025147859007120132
epoch 13513 loss -0.22676688432693481 LR -0.2519547939

epoch 13599 loss -0.14835509657859802 LR -0.17339801788330078 LKL 0.02504291571676731
epoch 13600 loss -0.17405366897583008 LR -0.19910505414009094 LKL 0.025051381438970566
50
epoch 13601 loss -0.2164635807275772 LR -0.24170546233654022 LKL 0.025241883471608162


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 13602 loss -0.253063827753067 LR -0.2782299518585205 LKL 0.02516612596809864
epoch 13603 loss -0.23382766544818878 LR -0.25891315937042236 LKL 0.02508549764752388
epoch 13604 loss -0.24111275374889374 LR -0.2662131190299988 LKL 0.02510036900639534
epoch 13605 loss -0.29870089888572693 LR -0.3239424228668213 LKL 0.02524152398109436
epoch 13606 loss -0.22446231544017792 LR -0.24956703186035156 LKL 0.025104718282818794
epoch 13607 loss -0.21849671006202698 LR -0.2436203956604004 LKL 0.02512368932366371
epoch 13608 loss -0.1875239610671997 LR -0.21255266666412354 LKL 0.02502870000898838
epoch 13609 loss -0.14774109423160553 LR -0.1727566421031952 LKL 0.025015544146299362
epoch 13610 loss -0.22283174097537994 LR -0.24789683520793915 LKL 0.025065094232559204
epoch 13611 loss -0.19546259939670563 LR -0.2205430567264557 LKL 0.025080455467104912
epoch 13612 loss -0.19372254610061646 LR -0.21887816488742828 LKL 0.02515561319887638
epoch 13613 loss -0.20815537869930267 LR -0.233258858323097

epoch 13700 loss -0.21942269802093506 LR -0.24456708133220673 LKL 0.025144390761852264
65
epoch 13701 loss -0.19092093408107758 LR -0.2160845398902893 LKL 0.025163602083921432


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 13702 loss -0.1773347109556198 LR -0.20245185494422913 LKL 0.025117138400673866
epoch 13703 loss -0.18245290219783783 LR -0.2075071930885315 LKL 0.025054290890693665
epoch 13704 loss -0.18935158848762512 LR -0.2145155668258667 LKL 0.025163980200886726
epoch 13705 loss -0.2863144278526306 LR -0.31153756380081177 LKL 0.02522313967347145
epoch 13706 loss -0.1822800189256668 LR -0.20736396312713623 LKL 0.025083940476179123
epoch 13707 loss -0.2577325701713562 LR -0.2830277681350708 LKL 0.025295203551650047
epoch 13708 loss -0.2221050262451172 LR -0.24733011424541473 LKL 0.025225088000297546
epoch 13709 loss -0.24093109369277954 LR -0.2662166357040405 LKL 0.025285542011260986
epoch 13710 loss -0.2389824539422989 LR -0.26427000761032104 LKL 0.025287553668022156
epoch 13711 loss -0.22102758288383484 LR -0.24624818563461304 LKL 0.025220602750778198
epoch 13712 loss -0.2056632936000824 LR -0.23086225986480713 LKL 0.025198958814144135
epoch 13713 loss -0.26406237483024597 LR -0.28929436206

epoch 13798 loss -0.22550995647907257 LR -0.2508554458618164 LKL 0.025345493108034134
epoch 13799 loss -0.24151870608329773 LR -0.26677870750427246 LKL 0.025259993970394135
epoch 13800 loss -0.25945597887039185 LR -0.28477922081947327 LKL 0.025323238223791122
57
epoch 13801 loss -0.2302723079919815 LR -0.2557337284088135 LKL 0.02546142414212227


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 13802 loss -0.24155376851558685 LR -0.2669084370136261 LKL 0.025354672223329544
epoch 13803 loss -0.2697303295135498 LR -0.29536372423171997 LKL 0.025633402168750763
epoch 13804 loss -0.1807994693517685 LR -0.20618745684623718 LKL 0.025387991219758987
epoch 13805 loss -0.25474900007247925 LR -0.28013110160827637 LKL 0.025382108986377716
epoch 13806 loss -0.2349965125322342 LR -0.26042574644088745 LKL 0.02542923204600811
epoch 13807 loss -0.17279846966266632 LR -0.19808639585971832 LKL 0.025287922471761703
epoch 13808 loss -0.2448349893093109 LR -0.2702455520629883 LKL 0.02541055530309677
epoch 13809 loss -0.23747043311595917 LR -0.26292699575424194 LKL 0.025456566363573074
epoch 13810 loss -0.2669260501861572 LR -0.29245927929878235 LKL 0.025533217936754227
epoch 13811 loss -0.2331325113773346 LR -0.25850242376327515 LKL 0.02536991983652115
epoch 13812 loss -0.22404374182224274 LR -0.24941110610961914 LKL 0.025367366150021553
epoch 13813 loss -0.2280731499195099 LR -0.25342744588

epoch 13898 loss -0.23889456689357758 LR -0.2641952633857727 LKL 0.02530069649219513
epoch 13899 loss -0.2218274176120758 LR -0.24705901741981506 LKL 0.025231603533029556
epoch 13900 loss -0.1513368934392929 LR -0.17654858529567719 LKL 0.02521168813109398
99
epoch 13901 loss -0.1387510597705841 LR -0.16387996077537537 LKL 0.025128906592726707


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 13902 loss -0.18005207180976868 LR -0.20531237125396729 LKL 0.025260305032134056
epoch 13903 loss -0.2348642647266388 LR -0.2601337432861328 LKL 0.02526947297155857
epoch 13904 loss -0.26512616872787476 LR -0.2905714511871338 LKL 0.025445276871323586
epoch 13905 loss -0.23622412979602814 LR -0.2616267800331116 LKL 0.025402655825018883
epoch 13906 loss -0.2808206081390381 LR -0.3060489594936371 LKL 0.025228362530469894
epoch 13907 loss -0.23427237570285797 LR -0.259785920381546 LKL 0.02551354467868805
epoch 13908 loss -0.20490482449531555 LR -0.23011979460716248 LKL 0.025214968249201775
epoch 13909 loss -0.23611010611057281 LR -0.26133573055267334 LKL 0.025225626304745674
epoch 13910 loss -0.26090332865715027 LR -0.28626325726509094 LKL 0.02535993605852127
epoch 13911 loss -0.22043761610984802 LR -0.24562250077724457 LKL 0.025184892117977142
epoch 13912 loss -0.20637653768062592 LR -0.23173165321350098 LKL 0.02535511553287506
epoch 13913 loss -0.2143000364303589 LR -0.239683330059

epoch 13998 loss -0.25020724534988403 LR -0.27560555934906006 LKL 0.02539830282330513
epoch 13999 loss -0.24186274409294128 LR -0.2673892080783844 LKL 0.025526462122797966
epoch 14000 loss -0.24140755832195282 LR -0.2670304477214813 LKL 0.02562289498746395
62
epoch 14001 loss -0.16557857394218445 LR -0.1908837854862213 LKL 0.025305213406682014


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 14002 loss -0.20482419431209564 LR -0.23022353649139404 LKL 0.025399338454008102
epoch 14003 loss -0.19393391907215118 LR -0.21931025385856628 LKL 0.025376340374350548
epoch 14004 loss -0.23564639687538147 LR -0.26125726103782654 LKL 0.025610871613025665
epoch 14005 loss -0.2109706699848175 LR -0.23634035885334015 LKL 0.025369683280587196
epoch 14006 loss -0.22558142244815826 LR -0.25113993883132935 LKL 0.025558510795235634
epoch 14007 loss -0.2671443819999695 LR -0.29254719614982605 LKL 0.025402812287211418
epoch 14008 loss -0.20806829631328583 LR -0.2336113601922989 LKL 0.02554306574165821
epoch 14009 loss -0.22971662878990173 LR -0.25522029399871826 LKL 0.025503672659397125
epoch 14010 loss -0.26524224877357483 LR -0.29071640968322754 LKL 0.025474168360233307
epoch 14011 loss -0.19059833884239197 LR -0.21578693389892578 LKL 0.02518860064446926
epoch 14012 loss -0.19494782388210297 LR -0.22022908926010132 LKL 0.02528126910328865
epoch 14013 loss -0.1922498643398285 LR -0.217779

epoch 14098 loss -0.23630714416503906 LR -0.26183411478996277 LKL 0.025526968762278557
epoch 14099 loss -0.25871407985687256 LR -0.28428852558135986 LKL 0.02557443454861641
epoch 14100 loss -0.22894901037216187 LR -0.25442513823509216 LKL 0.02547612600028515
74
epoch 14101 loss -0.2036256194114685 LR -0.22905860841274261 LKL 0.025432996451854706


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 14102 loss -0.24346619844436646 LR -0.26921260356903076 LKL 0.025746412575244904
epoch 14103 loss -0.23428264260292053 LR -0.2596653401851654 LKL 0.025382690131664276
epoch 14104 loss -0.26926374435424805 LR -0.29492419958114624 LKL 0.025660449638962746
epoch 14105 loss -0.17272494733333588 LR -0.19811618328094482 LKL 0.02539123222231865
epoch 14106 loss -0.2683001756668091 LR -0.29402297735214233 LKL 0.02572280541062355
epoch 14107 loss -0.16080959141254425 LR -0.18624886870384216 LKL 0.025439277291297913
epoch 14108 loss -0.20056012272834778 LR -0.2260989546775818 LKL 0.025538824498653412
epoch 14109 loss -0.24044689536094666 LR -0.26596230268478394 LKL 0.025515403598546982
epoch 14110 loss -0.19807936251163483 LR -0.22342240810394287 LKL 0.025343043729662895
epoch 14111 loss -0.23045675456523895 LR -0.25612086057662964 LKL 0.025664111599326134
epoch 14112 loss -0.2695111334323883 LR -0.2950724959373474 LKL 0.02556135691702366
epoch 14113 loss -0.21469004452228546 LR -0.2402216

epoch 14197 loss -0.2903693616390228 LR -0.31594207882881165 LKL 0.02557271346449852
epoch 14198 loss -0.2439422756433487 LR -0.2695019543170929 LKL 0.02555968053638935
epoch 14199 loss -0.24680982530117035 LR -0.2724221348762512 LKL 0.02561231516301632
epoch 14200 loss -0.16509976983070374 LR -0.190449059009552 LKL 0.025349292904138565
36


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 14201 loss -0.19444382190704346 LR -0.21993514895439148 LKL 0.02549132890999317
epoch 14202 loss -0.25802287459373474 LR -0.2835843861103058 LKL 0.025561517104506493
epoch 14203 loss -0.2682638168334961 LR -0.2938563823699951 LKL 0.025592567399144173
epoch 14204 loss -0.2470657378435135 LR -0.27278783917427063 LKL 0.025722099468111992
epoch 14205 loss -0.26323121786117554 LR -0.2888921797275543 LKL 0.02566097490489483
epoch 14206 loss -0.21478113532066345 LR -0.24050931632518768 LKL 0.02572818659245968
epoch 14207 loss -0.17631623148918152 LR -0.20177564024925232 LKL 0.0254594124853611
epoch 14208 loss -0.2232004851102829 LR -0.24880895018577576 LKL 0.02560846135020256
epoch 14209 loss -0.27139046788215637 LR -0.29704660177230835 LKL 0.02565612830221653
epoch 14210 loss -0.2661104202270508 LR -0.29170066118240356 LKL 0.025590233504772186
epoch 14211 loss -0.2311478555202484 LR -0.2568216919898987 LKL 0.02567382901906967
epoch 14212 loss -0.2824167013168335 LR -0.30827170610427856

epoch 14297 loss -0.23808467388153076 LR -0.26362866163253784 LKL 0.025543995201587677
epoch 14298 loss -0.27337220311164856 LR -0.2991141974925995 LKL 0.02574198506772518
epoch 14299 loss -0.23927316069602966 LR -0.2649546265602112 LKL 0.02568145841360092
epoch 14300 loss -0.13790678977966309 LR -0.1633424311876297 LKL 0.02543564885854721
58


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 14301 loss -0.21226266026496887 LR -0.2380073368549347 LKL 0.02574468031525612
epoch 14302 loss -0.19177255034446716 LR -0.21741467714309692 LKL 0.02564212866127491
epoch 14303 loss -0.21304239332675934 LR -0.23878806829452515 LKL 0.02574567124247551
epoch 14304 loss -0.1895226091146469 LR -0.2151031196117401 LKL 0.02558050863444805
epoch 14305 loss -0.19934093952178955 LR -0.2248736321926117 LKL 0.025532694533467293
epoch 14306 loss -0.2790200412273407 LR -0.3048371970653534 LKL 0.025817157700657845
epoch 14307 loss -0.25267934799194336 LR -0.27831172943115234 LKL 0.025632383301854134
epoch 14308 loss -0.25217822194099426 LR -0.2778623402118683 LKL 0.02568412572145462
epoch 14309 loss -0.2555099427700043 LR -0.28123748302459717 LKL 0.025727538391947746
epoch 14310 loss -0.22858348488807678 LR -0.2541261613368988 LKL 0.02554268203675747
epoch 14311 loss -0.23765362799167633 LR -0.26343464851379395 LKL 0.025781024247407913
epoch 14312 loss -0.28255122900009155 LR -0.30835211277008

epoch 14397 loss -0.2743292450904846 LR -0.30011576414108276 LKL 0.025786515325307846
epoch 14398 loss -0.1943487673997879 LR -0.21997511386871338 LKL 0.025626342743635178
epoch 14399 loss -0.1979629546403885 LR -0.22352176904678345 LKL 0.02555881068110466
epoch 14400 loss -0.2598276734352112 LR -0.28556200861930847 LKL 0.025734320282936096
53


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 14401 loss -0.20239239931106567 LR -0.22810664772987366 LKL 0.02571425586938858
epoch 14402 loss -0.3289787471294403 LR -0.35470542311668396 LKL 0.025726687163114548
epoch 14403 loss -0.24391399323940277 LR -0.26965150237083435 LKL 0.025737514719367027
epoch 14404 loss -0.2905130088329315 LR -0.31627458333969116 LKL 0.025761570781469345
epoch 14405 loss -0.2242065668106079 LR -0.2500069737434387 LKL 0.02580040693283081
epoch 14406 loss -0.23063337802886963 LR -0.2563757300376892 LKL 0.02574235014617443
epoch 14407 loss -0.2096126526594162 LR -0.23532292246818542 LKL 0.025710275396704674
epoch 14408 loss -0.19013066589832306 LR -0.2158568799495697 LKL 0.025726212188601494
epoch 14409 loss -0.24743548035621643 LR -0.27319368720054626 LKL 0.025758208706974983
epoch 14410 loss -0.1442943513393402 LR -0.1699504405260086 LKL 0.025656087324023247
epoch 14411 loss -0.2469572275876999 LR -0.2726547420024872 LKL 0.02569752000272274
epoch 14412 loss -0.25993406772613525 LR -0.28560578823089

epoch 14498 loss -0.24712242186069489 LR -0.27296656370162964 LKL 0.0258441474288702
epoch 14499 loss -0.28411102294921875 LR -0.30983471870422363 LKL 0.025723684579133987
epoch 14500 loss -0.250299334526062 LR -0.27597683668136597 LKL 0.02567751519382
79
epoch 14501 loss -0.23052652180194855 LR -0.2559814155101776 LKL 0.025454893708229065


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 14502 loss -0.22163745760917664 LR -0.24718829989433289 LKL 0.02555083855986595
epoch 14503 loss -0.20860609412193298 LR -0.23430553078651428 LKL 0.025699432939291
epoch 14504 loss -0.2784106731414795 LR -0.30422940850257874 LKL 0.02581874281167984
epoch 14505 loss -0.25663653016090393 LR -0.28230345249176025 LKL 0.025666914880275726
epoch 14506 loss -0.2579982280731201 LR -0.28367385268211365 LKL 0.025675633922219276
epoch 14507 loss -0.23201844096183777 LR -0.25782379508018494 LKL 0.02580535039305687
epoch 14508 loss -0.2705698013305664 LR -0.296273797750473 LKL 0.025703992694616318
epoch 14509 loss -0.19623181223869324 LR -0.22192536294460297 LKL 0.02569354884326458
epoch 14510 loss -0.1996058076620102 LR -0.22529569268226624 LKL 0.02568988688290119
epoch 14511 loss -0.23401892185211182 LR -0.25978994369506836 LKL 0.02577102743089199
epoch 14512 loss -0.26375526189804077 LR -0.2895811200141907 LKL 0.025825846940279007
epoch 14513 loss -0.266169011592865 LR -0.29197317361831665

epoch 14600 loss -0.2337677776813507 LR -0.25943464040756226 LKL 0.0256668571382761
71


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 14601 loss -0.29835665225982666 LR -0.3242103159427643 LKL 0.025853658095002174
epoch 14602 loss -0.20486146211624146 LR -0.23046019673347473 LKL 0.025598734617233276
epoch 14603 loss -0.26895081996917725 LR -0.29483965039253235 LKL 0.0258888378739357
epoch 14604 loss -0.218703493475914 LR -0.24441517889499664 LKL 0.02571168914437294
epoch 14605 loss -0.24844186007976532 LR -0.2742137312889099 LKL 0.02577187493443489
epoch 14606 loss -0.2710215747356415 LR -0.2967984676361084 LKL 0.025776904076337814
epoch 14607 loss -0.2751924395561218 LR -0.3010002076625824 LKL 0.02580777183175087
epoch 14608 loss -0.28933537006378174 LR -0.31505149602890015 LKL 0.025716135278344154
epoch 14609 loss -0.21375203132629395 LR -0.23952800035476685 LKL 0.025775961577892303
epoch 14610 loss -0.2963138818740845 LR -0.3221416771411896 LKL 0.025827808305621147
epoch 14611 loss -0.32473769783973694 LR -0.3505348861217499 LKL 0.025797199457883835
epoch 14612 loss -0.2695067524909973 LR -0.2953832149505615

epoch 14697 loss -0.19716855883598328 LR -0.22298434376716614 LKL 0.02581578679382801
epoch 14698 loss -0.25945207476615906 LR -0.2853163480758667 LKL 0.025864267721772194
epoch 14699 loss -0.26425325870513916 LR -0.29033076763153076 LKL 0.026077523827552795
epoch 14700 loss -0.24922846257686615 LR -0.2751389145851135 LKL 0.025910457596182823
54


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 14701 loss -0.2191728800535202 LR -0.24504370987415314 LKL 0.025870824232697487
epoch 14702 loss -0.2413027137517929 LR -0.2671436667442322 LKL 0.02584095112979412
epoch 14703 loss -0.23241904377937317 LR -0.25831952691078186 LKL 0.02590048685669899
epoch 14704 loss -0.18502968549728394 LR -0.2108376920223236 LKL 0.02580801025032997
epoch 14705 loss -0.2954622805118561 LR -0.3213226795196533 LKL 0.02586040087044239
epoch 14706 loss -0.19777067005634308 LR -0.22358602285385132 LKL 0.02581534907221794
epoch 14707 loss -0.25331008434295654 LR -0.279244601726532 LKL 0.02593451738357544
epoch 14708 loss -0.19800885021686554 LR -0.22364777326583862 LKL 0.025638923048973083
epoch 14709 loss -0.24868091940879822 LR -0.27453428506851196 LKL 0.025853358209133148
epoch 14710 loss -0.20733454823493958 LR -0.2331477850675583 LKL 0.025813240557909012
epoch 14711 loss -0.175588458776474 LR -0.20137226581573486 LKL 0.025783803313970566
epoch 14712 loss -0.2873621881008148 LR -0.3133804202079773 

epoch 14797 loss -0.21866972744464874 LR -0.24448078870773315 LKL 0.025811059400439262
epoch 14798 loss -0.1991300880908966 LR -0.22490286827087402 LKL 0.025772783905267715
epoch 14799 loss -0.23945483565330505 LR -0.26524269580841064 LKL 0.02578785829246044
epoch 14800 loss -0.19493943452835083 LR -0.2207481563091278 LKL 0.025808725506067276
66


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 14801 loss -0.22974148392677307 LR -0.25543737411499023 LKL 0.025695882737636566
epoch 14802 loss -0.29059791564941406 LR -0.31634506583213806 LKL 0.025747137144207954
epoch 14803 loss -0.24358180165290833 LR -0.2693629264831543 LKL 0.02578113041818142
epoch 14804 loss -0.2730185389518738 LR -0.2988782227039337 LKL 0.025859681889414787
epoch 14805 loss -0.2268294095993042 LR -0.2526339292526245 LKL 0.025804517790675163
epoch 14806 loss -0.29174429178237915 LR -0.31772151589393616 LKL 0.025977233424782753
epoch 14807 loss -0.21732036769390106 LR -0.2430132031440735 LKL 0.025692837312817574
epoch 14808 loss -0.29198598861694336 LR -0.3179803490638733 LKL 0.025994356721639633
epoch 14809 loss -0.2289372980594635 LR -0.2548280358314514 LKL 0.025890734046697617
epoch 14810 loss -0.292791485786438 LR -0.31877848505973816 LKL 0.025987008586525917
epoch 14811 loss -0.24210034310817719 LR -0.2678443193435669 LKL 0.02574397809803486
epoch 14812 loss -0.24451768398284912 LR -0.2703445851802

epoch 14897 loss -0.2434694766998291 LR -0.2693193852901459 LKL 0.025849904865026474
epoch 14898 loss -0.28042924404144287 LR -0.30638357996940613 LKL 0.0259543489664793
epoch 14899 loss -0.2425302118062973 LR -0.26830947399139404 LKL 0.02577926032245159
epoch 14900 loss -0.3010626435279846 LR -0.32704660296440125 LKL 0.02598397061228752
71


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 14901 loss -0.1951206624507904 LR -0.22082015872001648 LKL 0.025699492543935776
epoch 14902 loss -0.24703653156757355 LR -0.27283865213394165 LKL 0.025802114978432655
epoch 14903 loss -0.27628055214881897 LR -0.3021177649497986 LKL 0.025837216526269913
epoch 14904 loss -0.23006241023540497 LR -0.2559729814529419 LKL 0.025910569354891777
epoch 14905 loss -0.21497173607349396 LR -0.24083465337753296 LKL 0.02586292289197445
epoch 14906 loss -0.29704123735427856 LR -0.3229146897792816 LKL 0.025873443111777306
epoch 14907 loss -0.25100401043891907 LR -0.2768539488315582 LKL 0.025849932804703712
epoch 14908 loss -0.2005203664302826 LR -0.22626110911369324 LKL 0.025740735232830048
epoch 14909 loss -0.2430892288684845 LR -0.2687974274158478 LKL 0.025708194822072983
epoch 14910 loss -0.22937974333763123 LR -0.2550945580005646 LKL 0.025714822113513947
epoch 14911 loss -0.2041105180978775 LR -0.2298366278409958 LKL 0.025726111605763435
epoch 14912 loss -0.2038792073726654 LR -0.229721218347

epoch 14999 loss -0.25792428851127625 LR -0.283719539642334 LKL 0.025795243680477142
epoch 15000 loss -0.2646183669567108 LR -0.2905888259410858 LKL 0.0259704552590847
81
epoch 15001 loss -0.26788073778152466 LR -0.2938443720340729 LKL 0.025963645428419113


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 15002 loss -0.2608015537261963 LR -0.28667593002319336 LKL 0.025874372571706772
epoch 15003 loss -0.26588118076324463 LR -0.291685551404953 LKL 0.025804363191127777
epoch 15004 loss -0.2327931523323059 LR -0.25852304697036743 LKL 0.025729898363351822
epoch 15005 loss -0.26440104842185974 LR -0.2903929054737091 LKL 0.02599184960126877
epoch 15006 loss -0.25279104709625244 LR -0.27855366468429565 LKL 0.025762619450688362
epoch 15007 loss -0.2344980239868164 LR -0.260355681180954 LKL 0.02585766091942787
epoch 15008 loss -0.31010228395462036 LR -0.33602869510650635 LKL 0.02592640183866024
epoch 15009 loss -0.2642652094364166 LR -0.29030197858810425 LKL 0.026036769151687622
epoch 15010 loss -0.2888616919517517 LR -0.31496691703796387 LKL 0.026105238124728203
epoch 15011 loss -0.26879242062568665 LR -0.29466572403907776 LKL 0.025873295962810516
epoch 15012 loss -0.24275371432304382 LR -0.2686775326728821 LKL 0.025923816487193108
epoch 15013 loss -0.2827816903591156 LR -0.30869340896606

epoch 15098 loss -0.27024638652801514 LR -0.29607093334198 LKL 0.02582455798983574
epoch 15099 loss -0.31972843408584595 LR -0.34564343094825745 LKL 0.025915000587701797
epoch 15100 loss -0.22277063131332397 LR -0.24874073266983032 LKL 0.02597009390592575
122
epoch 15101 loss -0.22590596973896027 LR -0.25158849358558655 LKL 0.02568252943456173


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 15102 loss -0.22034981846809387 LR -0.24614669382572174 LKL 0.025796882808208466
epoch 15103 loss -0.22757983207702637 LR -0.25326746702194214 LKL 0.02568764053285122
epoch 15104 loss -0.21724510192871094 LR -0.2431163787841797 LKL 0.0258712787181139
epoch 15105 loss -0.2124699503183365 LR -0.23827514052391052 LKL 0.025805193930864334
epoch 15106 loss -0.20310816168785095 LR -0.2289685308933258 LKL 0.025860371068120003
epoch 15107 loss -0.24650833010673523 LR -0.27241432666778564 LKL 0.025905989110469818
epoch 15108 loss -0.20162460207939148 LR -0.22742514312267303 LKL 0.025800542905926704
epoch 15109 loss -0.24481186270713806 LR -0.27074435353279114 LKL 0.025932498276233673
epoch 15110 loss -0.29982808232307434 LR -0.32580187916755676 LKL 0.02597380243241787
epoch 15111 loss -0.23545946180820465 LR -0.26127326488494873 LKL 0.025813797488808632
epoch 15112 loss -0.2681300640106201 LR -0.2940901815891266 LKL 0.025960130617022514
epoch 15113 loss -0.2089846283197403 LR -0.234928101

epoch 15198 loss -0.17408646643161774 LR -0.19987793266773224 LKL 0.02579147182404995
epoch 15199 loss -0.23340800404548645 LR -0.25920629501342773 LKL 0.02579829841852188
epoch 15200 loss -0.29439422488212585 LR -0.32052895426750183 LKL 0.026134729385375977
122
epoch 15201 loss -0.25558707118034363 LR -0.28159141540527344 LKL 0.026004357263445854


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 15202 loss -0.24591228365898132 LR -0.2718789875507355 LKL 0.025966698303818703
epoch 15203 loss -0.26484307646751404 LR -0.2907989025115967 LKL 0.025955813005566597
epoch 15204 loss -0.22358253598213196 LR -0.2495463639497757 LKL 0.02596382051706314
epoch 15205 loss -0.21453900635242462 LR -0.24050846695899963 LKL 0.025969460606575012
epoch 15206 loss -0.33773326873779297 LR -0.3638954162597656 LKL 0.026162143796682358
epoch 15207 loss -0.2758963406085968 LR -0.3018643260002136 LKL 0.02596798911690712
epoch 15208 loss -0.3036402761936188 LR -0.3296873867511749 LKL 0.026047099381685257
epoch 15209 loss -0.3150499165058136 LR -0.3410528898239136 LKL 0.02600296400487423
epoch 15210 loss -0.28052154183387756 LR -0.30654555559158325 LKL 0.026024019345641136
epoch 15211 loss -0.27744168043136597 LR -0.3032691478729248 LKL 0.025827471166849136
epoch 15212 loss -0.21669499576091766 LR -0.2426081895828247 LKL 0.025913190096616745
epoch 15213 loss -0.23284170031547546 LR -0.25892817974090

epoch 15299 loss -0.24553684890270233 LR -0.27168989181518555 LKL 0.026153046637773514
epoch 15300 loss -0.23014859855175018 LR -0.2561900019645691 LKL 0.026041407138109207
87
epoch 15301 loss -0.2500257194042206 LR -0.276273250579834 LKL 0.026247527450323105


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 15302 loss -0.24947279691696167 LR -0.27545246481895447 LKL 0.025979669764637947
epoch 15303 loss -0.1853339523077011 LR -0.2112892121076584 LKL 0.025955254212021828
epoch 15304 loss -0.21199582517147064 LR -0.23804724216461182 LKL 0.026051415130496025
epoch 15305 loss -0.20180732011795044 LR -0.22788338363170624 LKL 0.026076070964336395
epoch 15306 loss -0.26238834857940674 LR -0.28862297534942627 LKL 0.02623462677001953
epoch 15307 loss -0.22793860733509064 LR -0.253975510597229 LKL 0.02603689767420292
epoch 15308 loss -0.20717449486255646 LR -0.2331119030714035 LKL 0.025937411934137344
epoch 15309 loss -0.25941702723503113 LR -0.28554409742355347 LKL 0.026127057150006294
epoch 15310 loss -0.21085941791534424 LR -0.2367163896560669 LKL 0.025856973603367805
epoch 15311 loss -0.28061172366142273 LR -0.3065914511680603 LKL 0.025979720056056976
epoch 15312 loss -0.2542975842952728 LR -0.28033244609832764 LKL 0.026034871116280556
epoch 15313 loss -0.23609255254268646 LR -0.262206107

epoch 15400 loss -0.24715998768806458 LR -0.273233562707901 LKL 0.026073582470417023
124
epoch 15401 loss -0.24065345525741577 LR -0.2667732834815979 LKL 0.02611982263624668


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 15402 loss -0.1489856094121933 LR -0.17484214901924133 LKL 0.025856545194983482
epoch 15403 loss -0.27271002531051636 LR -0.2988881468772888 LKL 0.026178117841482162
epoch 15404 loss -0.24252569675445557 LR -0.2686563730239868 LKL 0.02613067254424095
epoch 15405 loss -0.2129080891609192 LR -0.23876865208148956 LKL 0.025860562920570374
epoch 15406 loss -0.22202861309051514 LR -0.24809738993644714 LKL 0.026068778708577156
epoch 15407 loss -0.2647784352302551 LR -0.29069042205810547 LKL 0.025911975651979446
epoch 15408 loss -0.2762821912765503 LR -0.3024357557296753 LKL 0.026153570041060448
epoch 15409 loss -0.2664414048194885 LR -0.2925013303756714 LKL 0.026059940457344055
epoch 15410 loss -0.2210196703672409 LR -0.24699561297893524 LKL 0.025975942611694336
epoch 15411 loss -0.2546459138393402 LR -0.2807140350341797 LKL 0.026068126782774925
epoch 15412 loss -0.21033671498298645 LR -0.23636682331562042 LKL 0.026030104607343674
epoch 15413 loss -0.22218400239944458 LR -0.248070225119

epoch 15500 loss -0.20416949689388275 LR -0.23037409782409668 LKL 0.02620459720492363
79


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 15501 loss -0.20105934143066406 LR -0.22723418474197388 LKL 0.026174841448664665
epoch 15502 loss -0.24736647307872772 LR -0.2735852897167206 LKL 0.026218818500638008
epoch 15503 loss -0.2203017771244049 LR -0.24633052945137024 LKL 0.02602875977754593
epoch 15504 loss -0.2671816051006317 LR -0.29338258504867554 LKL 0.026200978085398674
epoch 15505 loss -0.253762423992157 LR -0.2800057828426361 LKL 0.026243360713124275
epoch 15506 loss -0.20719364285469055 LR -0.23325017094612122 LKL 0.026056524366140366
epoch 15507 loss -0.21356722712516785 LR -0.23971283435821533 LKL 0.026145610958337784
epoch 15508 loss -0.2716660797595978 LR -0.29770877957344055 LKL 0.026042696088552475
epoch 15509 loss -0.18540562689304352 LR -0.21160343289375305 LKL 0.026197802275419235
epoch 15510 loss -0.1518324315547943 LR -0.1778678297996521 LKL 0.02603539079427719
epoch 15511 loss -0.23212705552577972 LR -0.25837838649749756 LKL 0.026251325383782387
epoch 15512 loss -0.28198277950286865 LR -0.3081845045

epoch 15597 loss -0.2375321090221405 LR -0.2637902498245239 LKL 0.026258137077093124
epoch 15598 loss -0.202712744474411 LR -0.22853747010231018 LKL 0.02582472935318947
epoch 15599 loss -0.280148446559906 LR -0.30630505084991455 LKL 0.026156606152653694
epoch 15600 loss -0.21241658926010132 LR -0.23841829597949982 LKL 0.0260016992688179
82


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 15601 loss -0.25738242268562317 LR -0.28356310725212097 LKL 0.026180677115917206
epoch 15602 loss -0.1565716415643692 LR -0.18265172839164734 LKL 0.02608008310198784
epoch 15603 loss -0.21872808039188385 LR -0.2449134886264801 LKL 0.0261854100972414
epoch 15604 loss -0.30908921360969543 LR -0.3352629840373993 LKL 0.026173759251832962
epoch 15605 loss -0.2808820605278015 LR -0.3070811629295349 LKL 0.0261990949511528
epoch 15606 loss -0.19446295499801636 LR -0.22047275304794312 LKL 0.02600979246199131
epoch 15607 loss -0.22292380034923553 LR -0.24912238121032715 LKL 0.026198578998446465
epoch 15608 loss -0.29108166694641113 LR -0.3173671364784241 LKL 0.02628546953201294
epoch 15609 loss -0.24952170252799988 LR -0.2756744623184204 LKL 0.02615276724100113
epoch 15610 loss -0.2576863169670105 LR -0.2838136553764343 LKL 0.026127329096198082
epoch 15611 loss -0.2773260176181793 LR -0.30371206998825073 LKL 0.026386065408587456
epoch 15612 loss -0.27475014328956604 LR -0.3009113371372223 

epoch 15697 loss -0.21139052510261536 LR -0.23750053346157074 LKL 0.026110012084245682
epoch 15698 loss -0.16784490644931793 LR -0.19385117292404175 LKL 0.026006260886788368
epoch 15699 loss -0.19321389496326447 LR -0.21913868188858032 LKL 0.025924790650606155
epoch 15700 loss -0.2061927318572998 LR -0.23234489560127258 LKL 0.02615215629339218
51


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 15701 loss -0.21598027646541595 LR -0.24214786291122437 LKL 0.02616759203374386
epoch 15702 loss -0.29231682419776917 LR -0.3184894025325775 LKL 0.026172565296292305
epoch 15703 loss -0.2524569034576416 LR -0.2785930633544922 LKL 0.026136169210076332
epoch 15704 loss -0.21988500654697418 LR -0.24613556265830994 LKL 0.026250550523400307
epoch 15705 loss -0.24853232502937317 LR -0.2746571898460388 LKL 0.02612486481666565
epoch 15706 loss -0.29437705874443054 LR -0.3204304873943329 LKL 0.02605341747403145
epoch 15707 loss -0.24465182423591614 LR -0.27073365449905396 LKL 0.026081837713718414
epoch 15708 loss -0.23414656519889832 LR -0.26024261116981506 LKL 0.026096053421497345
epoch 15709 loss -0.2767144739627838 LR -0.30292022228240967 LKL 0.02620573900640011
epoch 15710 loss -0.3366270661354065 LR -0.3628568649291992 LKL 0.02622981369495392
epoch 15711 loss -0.24972012639045715 LR -0.27588629722595215 LKL 0.026166163384914398
epoch 15712 loss -0.26269686222076416 LR -0.288894653320

epoch 15799 loss -0.2146139144897461 LR -0.24098727107048035 LKL 0.02637336030602455
epoch 15800 loss -0.28937292098999023 LR -0.31568050384521484 LKL 0.026307569816708565
41


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 15801 loss -0.2710689902305603 LR -0.29737183451652527 LKL 0.026302848011255264
epoch 15802 loss -0.2980392575263977 LR -0.32438215613365173 LKL 0.026342887431383133
epoch 15803 loss -0.2294144183397293 LR -0.25547635555267334 LKL 0.02606194093823433
epoch 15804 loss -0.21961301565170288 LR -0.24585117399692535 LKL 0.026238158345222473
epoch 15805 loss -0.1325145959854126 LR -0.1586289405822754 LKL 0.026114337146282196
epoch 15806 loss -0.23163072764873505 LR -0.2578323483467102 LKL 0.02620162069797516
epoch 15807 loss -0.26486122608184814 LR -0.2912611961364746 LKL 0.02639998123049736
epoch 15808 loss -0.21158558130264282 LR -0.23774895071983337 LKL 0.02616337314248085
epoch 15809 loss -0.2425229847431183 LR -0.26869773864746094 LKL 0.0261747557669878
epoch 15810 loss -0.24987035989761353 LR -0.27619466185569763 LKL 0.026324309408664703
epoch 15811 loss -0.24093350768089294 LR -0.26724499464035034 LKL 0.0263114832341671
epoch 15812 loss -0.2423698604106903 LR -0.2686298787593841

epoch 15899 loss -0.2254028469324112 LR -0.25163763761520386 LKL 0.026234792545437813
epoch 15900 loss -0.2731437683105469 LR -0.29956427216529846 LKL 0.02642049826681614
48
epoch 15901 loss -0.2460368126630783 LR -0.27226781845092773 LKL 0.026231011375784874


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 15902 loss -0.17762479186058044 LR -0.20384569466114044 LKL 0.0262208953499794
epoch 15903 loss -0.18880076706409454 LR -0.2150348722934723 LKL 0.026234107092022896
epoch 15904 loss -0.2553311288356781 LR -0.28158438205718994 LKL 0.026253245770931244
epoch 15905 loss -0.164606511592865 LR -0.1906956434249878 LKL 0.026089129969477654
epoch 15906 loss -0.24010154604911804 LR -0.2663590908050537 LKL 0.026257552206516266
epoch 15907 loss -0.3205896317958832 LR -0.3468243479728699 LKL 0.026234712451696396
epoch 15908 loss -0.2836352288722992 LR -0.30995234847068787 LKL 0.026317114010453224
epoch 15909 loss -0.16781441867351532 LR -0.1938728392124176 LKL 0.026058418676257133
epoch 15910 loss -0.3140924274921417 LR -0.34057289361953735 LKL 0.02648046240210533
epoch 15911 loss -0.25403374433517456 LR -0.2803192138671875 LKL 0.02628546953201294
epoch 15912 loss -0.2646184265613556 LR -0.2910737991333008 LKL 0.026455365121364594
epoch 15913 loss -0.23290446400642395 LR -0.25916507840156555

epoch 16000 loss -0.2859030067920685 LR -0.31218427419662476 LKL 0.026281261816620827
94
epoch 16001 loss -0.26941508054733276 LR -0.2957139313220978 LKL 0.02629886381328106


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 16002 loss -0.2923014461994171 LR -0.318591833114624 LKL 0.026290399953722954
epoch 16003 loss -0.29954254627227783 LR -0.32597815990448 LKL 0.026435604318976402
epoch 16004 loss -0.2561679184436798 LR -0.28245845437049866 LKL 0.026290547102689743
epoch 16005 loss -0.2719006836414337 LR -0.2984098494052887 LKL 0.026509154587984085
epoch 16006 loss -0.28332754969596863 LR -0.3095022141933441 LKL 0.026174668222665787
epoch 16007 loss -0.24495618045330048 LR -0.2711132764816284 LKL 0.026157090440392494
epoch 16008 loss -0.2649693787097931 LR -0.2913144826889038 LKL 0.026345107704401016
epoch 16009 loss -0.2724539041519165 LR -0.29868245124816895 LKL 0.026228532195091248
epoch 16010 loss -0.279491662979126 LR -0.30592313408851624 LKL 0.02643147110939026
epoch 16011 loss -0.2672177255153656 LR -0.2935090959072113 LKL 0.02629137597978115
epoch 16012 loss -0.32158151268959045 LR -0.3480077087879181 LKL 0.026426203548908234
epoch 16013 loss -0.21848608553409576 LR -0.24470919370651245 LK

epoch 16098 loss -0.3338753879070282 LR -0.3603650629520416 LKL 0.026489686220884323
epoch 16099 loss -0.23700985312461853 LR -0.2634041905403137 LKL 0.02639433927834034
epoch 16100 loss -0.26208463311195374 LR -0.2884971499443054 LKL 0.026412522420287132
66
epoch 16101 loss -0.23524034023284912 LR -0.26148176193237305 LKL 0.026241429150104523


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 16102 loss -0.2715749442577362 LR -0.2979467809200287 LKL 0.026371842250227928
epoch 16103 loss -0.30256351828575134 LR -0.328936368227005 LKL 0.026372838765382767
epoch 16104 loss -0.24795231223106384 LR -0.2741774022579193 LKL 0.02622508630156517
epoch 16105 loss -0.2295721173286438 LR -0.25586357712745667 LKL 0.026291457936167717
epoch 16106 loss -0.27083879709243774 LR -0.2972338795661926 LKL 0.02639506757259369
epoch 16107 loss -0.2608643174171448 LR -0.28725147247314453 LKL 0.026387155055999756
epoch 16108 loss -0.33794546127319336 LR -0.364378422498703 LKL 0.026432951912283897
epoch 16109 loss -0.27911630272865295 LR -0.3054606318473816 LKL 0.02634432166814804
epoch 16110 loss -0.286472886800766 LR -0.31295299530029297 LKL 0.02648010477423668
epoch 16111 loss -0.25806644558906555 LR -0.28437185287475586 LKL 0.026305416598916054
epoch 16112 loss -0.24039269983768463 LR -0.26673829555511475 LKL 0.026345599442720413
epoch 16113 loss -0.34688037633895874 LR -0.3733535706996918

epoch 16200 loss -0.27908647060394287 LR -0.3055172562599182 LKL 0.026430796831846237
103
epoch 16201 loss -0.2495061606168747 LR -0.2759087085723877 LKL 0.026402542367577553


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 16202 loss -0.3238512873649597 LR -0.3502841293811798 LKL 0.026432840153574944
epoch 16203 loss -0.3118932843208313 LR -0.3383628726005554 LKL 0.026469578966498375
epoch 16204 loss -0.2732588052749634 LR -0.29970216751098633 LKL 0.02644335851073265
epoch 16205 loss -0.23010937869548798 LR -0.2563926577568054 LKL 0.026283277198672295
epoch 16206 loss -0.27173304557800293 LR -0.29808032512664795 LKL 0.02634727954864502
epoch 16207 loss -0.29886144399642944 LR -0.32537972927093506 LKL 0.026518287137150764
epoch 16208 loss -0.18485936522483826 LR -0.21117201447486877 LKL 0.026312654837965965
epoch 16209 loss -0.25548726320266724 LR -0.2817603349685669 LKL 0.026273073628544807
epoch 16210 loss -0.28940916061401367 LR -0.31577563285827637 LKL 0.02636648528277874
epoch 16211 loss -0.14871124923229218 LR -0.17494258284568787 LKL 0.02623133547604084
epoch 16212 loss -0.2842086851596832 LR -0.3105318546295166 LKL 0.026323160156607628
epoch 16213 loss -0.2910428047180176 LR -0.3174570500850

epoch 16298 loss -0.2670481204986572 LR -0.29366183280944824 LKL 0.02661370113492012
epoch 16299 loss -0.30889928340911865 LR -0.3355863392353058 LKL 0.02668706327676773
epoch 16300 loss -0.2639067769050598 LR -0.29049021005630493 LKL 0.026583421975374222
80
epoch 16301 loss -0.23551033437252045 LR -0.26197177171707153 LKL 0.026461441069841385


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 16302 loss -0.2296505719423294 LR -0.2562045156955719 LKL 0.026553941890597343
epoch 16303 loss -0.2680998146533966 LR -0.29462093114852905 LKL 0.0265211034566164
epoch 16304 loss -0.29827022552490234 LR -0.3247799873352051 LKL 0.02650974690914154
epoch 16305 loss -0.2823779881000519 LR -0.308927983045578 LKL 0.02655000425875187
epoch 16306 loss -0.2050202339887619 LR -0.23119859397411346 LKL 0.026178359985351562
epoch 16307 loss -0.2716647982597351 LR -0.29802098870277405 LKL 0.026356201618909836
epoch 16308 loss -0.26472318172454834 LR -0.29136720299720764 LKL 0.026644010096788406
epoch 16309 loss -0.19830653071403503 LR -0.2245999574661255 LKL 0.026293426752090454
epoch 16310 loss -0.24625039100646973 LR -0.27273643016815186 LKL 0.026486042886972427
epoch 16311 loss -0.26574569940567017 LR -0.29218778014183044 LKL 0.026442088186740875
epoch 16312 loss -0.25099021196365356 LR -0.2774593234062195 LKL 0.02646910771727562
epoch 16313 loss -0.2671186923980713 LR -0.293586403131485 

epoch 16398 loss -0.2782076895236969 LR -0.3046734035015106 LKL 0.02646571956574917
epoch 16399 loss -0.2858186960220337 LR -0.31231924891471863 LKL 0.02650054544210434
epoch 16400 loss -0.274549275636673 LR -0.30119025707244873 LKL 0.0266409944742918
91
epoch 16401 loss -0.19780850410461426 LR -0.22426879405975342 LKL 0.026460282504558563


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 16402 loss -0.21793457865715027 LR -0.24443311989307404 LKL 0.026498539373278618
epoch 16403 loss -0.2510797083377838 LR -0.2775737941265106 LKL 0.026494095101952553
epoch 16404 loss -0.1986069530248642 LR -0.2250029593706131 LKL 0.0263960063457489
epoch 16405 loss -0.22760547697544098 LR -0.25406938791275024 LKL 0.026463912799954414
epoch 16406 loss -0.264821857213974 LR -0.29118311405181885 LKL 0.026361245661973953
epoch 16407 loss -0.28113695979118347 LR -0.3075082004070282 LKL 0.026371238753199577
epoch 16408 loss -0.2549818456172943 LR -0.28140386939048767 LKL 0.02642202191054821
epoch 16409 loss -0.27861231565475464 LR -0.3050936460494995 LKL 0.026481321081519127
epoch 16410 loss -0.30048200488090515 LR -0.3272136449813843 LKL 0.026731645688414574
epoch 16411 loss -0.27929162979125977 LR -0.305767685174942 LKL 0.026476062834262848
epoch 16412 loss -0.25615546107292175 LR -0.28270137310028076 LKL 0.026545913890004158
epoch 16413 loss -0.23928245902061462 LR -0.26567888259887

epoch 16498 loss -0.26070520281791687 LR -0.2872684597969055 LKL 0.02656325325369835
epoch 16499 loss -0.24372056126594543 LR -0.2701168656349182 LKL 0.02639630064368248
epoch 16500 loss -0.2642568349838257 LR -0.2908322811126709 LKL 0.02657543681561947
39
epoch 16501 loss -0.23403902351856232 LR -0.2607618570327759 LKL 0.02672283723950386


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 16502 loss -0.22572198510169983 LR -0.25223085284233093 LKL 0.026508860290050507
epoch 16503 loss -0.23676235973834991 LR -0.26334062218666077 LKL 0.026578264310956
epoch 16504 loss -0.25182899832725525 LR -0.27830439805984497 LKL 0.026475397869944572
epoch 16505 loss -0.23624621331691742 LR -0.262978196144104 LKL 0.026731980964541435
epoch 16506 loss -0.25404101610183716 LR -0.2808569073677063 LKL 0.02681589126586914
epoch 16507 loss -0.2667095959186554 LR -0.2932908833026886 LKL 0.026581300422549248
epoch 16508 loss -0.2411680817604065 LR -0.26778554916381836 LKL 0.026617469266057014
epoch 16509 loss -0.266694039106369 LR -0.2934577465057373 LKL 0.026763711124658585
epoch 16510 loss -0.24250023066997528 LR -0.2691143751144409 LKL 0.02661413885653019
epoch 16511 loss -0.2696373462677002 LR -0.29652172327041626 LKL 0.02688436768949032
epoch 16512 loss -0.25003498792648315 LR -0.2767278552055359 LKL 0.026692859828472137
epoch 16513 loss -0.2551060914993286 LR -0.28179219365119934 

epoch 16598 loss -0.283344030380249 LR -0.31000131368637085 LKL 0.02665727399289608
epoch 16599 loss -0.28139370679855347 LR -0.30812370777130127 LKL 0.026729987934231758
epoch 16600 loss -0.1902928352355957 LR -0.21680742502212524 LKL 0.026514587923884392
38
epoch 16601 loss -0.2704829275608063 LR -0.2973251938819885 LKL 0.026842277497053146


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 16602 loss -0.13764317333698273 LR -0.1641923487186432 LKL 0.02654918096959591
epoch 16603 loss -0.23342908918857574 LR -0.2599560022354126 LKL 0.026526911184191704
epoch 16604 loss -0.21671034395694733 LR -0.2434971034526825 LKL 0.026786765083670616
epoch 16605 loss -0.30281561613082886 LR -0.32945412397384644 LKL 0.02663850411772728
epoch 16606 loss -0.2333698570728302 LR -0.260076642036438 LKL 0.02670677751302719
epoch 16607 loss -0.299029678106308 LR -0.3256388306617737 LKL 0.026609152555465698
epoch 16608 loss -0.28521275520324707 LR -0.3118833303451538 LKL 0.026670563966035843
epoch 16609 loss -0.3323403596878052 LR -0.35909605026245117 LKL 0.026755692437291145
epoch 16610 loss -0.29968661069869995 LR -0.3264729976654053 LKL 0.026786398142576218
epoch 16611 loss -0.2771205008029938 LR -0.3038991689682007 LKL 0.02677866443991661
epoch 16612 loss -0.2892592251300812 LR -0.3160429894924164 LKL 0.0267837755382061
epoch 16613 loss -0.2912715673446655 LR -0.31813347339630127 LKL 

epoch 16700 loss -0.2203744351863861 LR -0.24712827801704407 LKL 0.02675383910536766
43


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 16701 loss -0.2755376994609833 LR -0.3023131787776947 LKL 0.026775486767292023
epoch 16702 loss -0.25759297609329224 LR -0.2844192385673523 LKL 0.026826266199350357
epoch 16703 loss -0.24452978372573853 LR -0.2711373567581177 LKL 0.0266075748950243
epoch 16704 loss -0.3084242343902588 LR -0.3352685868740082 LKL 0.02684435062110424
epoch 16705 loss -0.24980641901493073 LR -0.2764631509780884 LKL 0.0266567375510931
epoch 16706 loss -0.24588024616241455 LR -0.2725204527378082 LKL 0.026640208438038826
epoch 16707 loss -0.28610530495643616 LR -0.31296929717063904 LKL 0.02686399035155773
epoch 16708 loss -0.26078861951828003 LR -0.2874830365180969 LKL 0.026694409549236298
epoch 16709 loss -0.25460559129714966 LR -0.28127521276474 LKL 0.026669634506106377
epoch 16710 loss -0.2671819031238556 LR -0.2940630316734314 LKL 0.02688112109899521
epoch 16711 loss -0.2800430953502655 LR -0.30690550804138184 LKL 0.02686241827905178
epoch 16712 loss -0.2148057520389557 LR -0.2415580302476883 LKL 0.

epoch 16797 loss -0.3209742605686188 LR -0.34806686639785767 LKL 0.027092594653367996
epoch 16798 loss -0.2700822055339813 LR -0.2969072759151459 LKL 0.0268250685185194
epoch 16799 loss -0.28033211827278137 LR -0.3071037530899048 LKL 0.02677164599299431
epoch 16800 loss -0.2425801157951355 LR -0.26943010091781616 LKL 0.026849990710616112
78


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 16801 loss -0.32932227849960327 LR -0.35622403025627136 LKL 0.026901762932538986
epoch 16802 loss -0.25690367817878723 LR -0.2838384211063385 LKL 0.026934755966067314
epoch 16803 loss -0.2894369065761566 LR -0.3161270320415497 LKL 0.026690121740102768
epoch 16804 loss -0.2636696994304657 LR -0.2903401851654053 LKL 0.026670483872294426
epoch 16805 loss -0.2471524029970169 LR -0.2738431692123413 LKL 0.026690762490034103
epoch 16806 loss -0.2620604634284973 LR -0.2888096570968628 LKL 0.026749201118946075
epoch 16807 loss -0.28028619289398193 LR -0.3071981370449066 LKL 0.02691195160150528
epoch 16808 loss -0.27027034759521484 LR -0.29713383316993713 LKL 0.026863498613238335
epoch 16809 loss -0.2789757251739502 LR -0.3056780695915222 LKL 0.026702331379055977
epoch 16810 loss -0.32768651843070984 LR -0.35469287633895874 LKL 0.027006367221474648
epoch 16811 loss -0.19684144854545593 LR -0.22354193031787872 LKL 0.026700476184487343
epoch 16812 loss -0.2681448459625244 LR -0.2950647473335

epoch 16899 loss -0.1959018111228943 LR -0.22258591651916504 LKL 0.0266841072589159
epoch 16900 loss -0.24544833600521088 LR -0.27237004041671753 LKL 0.02692170813679695
98
epoch 16901 loss -0.2092605084180832 LR -0.23594211041927338 LKL 0.026681600138545036


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 16902 loss -0.3096597492694855 LR -0.33653688430786133 LKL 0.02687714248895645
epoch 16903 loss -0.2842835485935211 LR -0.31120002269744873 LKL 0.02691647969186306
epoch 16904 loss -0.2547864317893982 LR -0.28159236907958984 LKL 0.02680594101548195
epoch 16905 loss -0.3143688440322876 LR -0.3412012755870819 LKL 0.026832420378923416
epoch 16906 loss -0.224031001329422 LR -0.250751256942749 LKL 0.02672024816274643
epoch 16907 loss -0.3061884343624115 LR -0.33313679695129395 LKL 0.02694835513830185
epoch 16908 loss -0.27395641803741455 LR -0.30092114210128784 LKL 0.026964712888002396
epoch 16909 loss -0.2729908525943756 LR -0.29983794689178467 LKL 0.026847105473279953
epoch 16910 loss -0.2686077654361725 LR -0.29548054933547974 LKL 0.026872795075178146
epoch 16911 loss -0.26946207880973816 LR -0.29643747210502625 LKL 0.026975391432642937
epoch 16912 loss -0.22306276857852936 LR -0.24988260865211487 LKL 0.02681984007358551
epoch 16913 loss -0.24521976709365845 LR -0.27191951870918274

epoch 17000 loss -0.2841222286224365 LR -0.31098687648773193 LKL 0.02686464786529541
46


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 17001 loss -0.3127364218235016 LR -0.33983156085014343 LKL 0.0270951297134161
epoch 17002 loss -0.33590802550315857 LR -0.36273449659347534 LKL 0.02682647481560707
epoch 17003 loss -0.2086440771818161 LR -0.23550358414649963 LKL 0.02685951255261898
epoch 17004 loss -0.2901815176010132 LR -0.31704697012901306 LKL 0.026865456253290176
epoch 17005 loss -0.3059026896953583 LR -0.3327760398387909 LKL 0.026873338967561722
epoch 17006 loss -0.19808094203472137 LR -0.2247585505247116 LKL 0.026677606627345085
epoch 17007 loss -0.32446712255477905 LR -0.3513222336769104 LKL 0.02685510367155075
epoch 17008 loss -0.23067453503608704 LR -0.2576935589313507 LKL 0.027019020169973373
epoch 17009 loss -0.3097551167011261 LR -0.33669933676719666 LKL 0.026944220066070557
epoch 17010 loss -0.12953977286815643 LR -0.15642669796943665 LKL 0.026886919513344765
epoch 17011 loss -0.20989781618118286 LR -0.2368128001689911 LKL 0.026914985850453377
epoch 17012 loss -0.29328808188438416 LR -0.32032266259193

epoch 17100 loss -0.3066163957118988 LR -0.33361315727233887 LKL 0.026996759697794914
44


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 17101 loss -0.2364622801542282 LR -0.26334020495414734 LKL 0.02687792293727398
epoch 17102 loss -0.2752494513988495 LR -0.3022006154060364 LKL 0.026951171457767487
epoch 17103 loss -0.19240222871303558 LR -0.21917341649532318 LKL 0.0267711840569973
epoch 17104 loss -0.24423307180404663 LR -0.27097469568252563 LKL 0.02674162946641445
epoch 17105 loss -0.19407761096954346 LR -0.22094428539276123 LKL 0.026866678148508072
epoch 17106 loss -0.2696020007133484 LR -0.2965489625930786 LKL 0.026946965605020523
epoch 17107 loss -0.28103724122047424 LR -0.30807042121887207 LKL 0.02703317068517208
epoch 17108 loss -0.2627204358577728 LR -0.2896307408809662 LKL 0.02691029943525791
epoch 17109 loss -0.23523090779781342 LR -0.26220571994781494 LKL 0.026974814012646675
epoch 17110 loss -0.2557157278060913 LR -0.28256601095199585 LKL 0.026850294321775436
epoch 17111 loss -0.2692749500274658 LR -0.29622381925582886 LKL 0.026948878541588783
epoch 17112 loss -0.32077476382255554 LR -0.34795194864273

epoch 17199 loss -0.29648587107658386 LR -0.32357925176620483 LKL 0.027093390002846718
epoch 17200 loss -0.25649094581604004 LR -0.28338050842285156 LKL 0.026889555156230927
41


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 17201 loss -0.26043426990509033 LR -0.2875077724456787 LKL 0.027073495090007782
epoch 17202 loss -0.3076722323894501 LR -0.334954172372818 LKL 0.027281949296593666
epoch 17203 loss -0.2790846824645996 LR -0.3059133291244507 LKL 0.026828644797205925
epoch 17204 loss -0.24826817214488983 LR -0.27527305483818054 LKL 0.027004888281226158
epoch 17205 loss -0.3078417479991913 LR -0.3350236117839813 LKL 0.027181871235370636
epoch 17206 loss -0.26571211218833923 LR -0.2927481532096863 LKL 0.027036027982831
epoch 17207 loss -0.273128479719162 LR -0.3001670241355896 LKL 0.027038538828492165
epoch 17208 loss -0.22010588645935059 LR -0.24696558713912964 LKL 0.0268597062677145
epoch 17209 loss -0.28490322828292847 LR -0.3118487596511841 LKL 0.026945535093545914
epoch 17210 loss -0.24650301039218903 LR -0.2734640836715698 LKL 0.026961078867316246
epoch 17211 loss -0.2980603873729706 LR -0.32519152760505676 LKL 0.02713114023208618
epoch 17212 loss -0.3520911931991577 LR -0.37912142276763916 LKL

epoch 17299 loss -0.2802197337150574 LR -0.30719131231307983 LKL 0.02697158232331276
epoch 17300 loss -0.2971021234989166 LR -0.3239557147026062 LKL 0.02685358189046383
42
epoch 17301 loss -0.2598269581794739 LR -0.28674304485321045 LKL 0.02691609039902687


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 17302 loss -0.3523022532463074 LR -0.37936291098594666 LKL 0.027060657739639282
epoch 17303 loss -0.33374515175819397 LR -0.36062249541282654 LKL 0.026877351105213165
epoch 17304 loss -0.23205715417861938 LR -0.25887826085090637 LKL 0.026821104809641838
epoch 17305 loss -0.25614285469055176 LR -0.283049076795578 LKL 0.02690623328089714
epoch 17306 loss -0.28125250339508057 LR -0.30828720331192017 LKL 0.027034705504775047
epoch 17307 loss -0.3511083424091339 LR -0.37813594937324524 LKL 0.02702760323882103
epoch 17308 loss -0.26311057806015015 LR -0.28998517990112305 LKL 0.026874590665102005
epoch 17309 loss -0.2429777979850769 LR -0.26987364888191223 LKL 0.026895852759480476
epoch 17310 loss -0.2532748878002167 LR -0.28030797839164734 LKL 0.027033090591430664
epoch 17311 loss -0.2843392789363861 LR -0.3111799955368042 LKL 0.02684072218835354
epoch 17312 loss -0.23022505640983582 LR -0.25703614950180054 LKL 0.02681109495460987
epoch 17313 loss -0.34716087579727173 LR -0.37412011623

epoch 17398 loss -0.3283953070640564 LR -0.35534974932670593 LKL 0.026954427361488342
epoch 17399 loss -0.3014087677001953 LR -0.32862961292266846 LKL 0.02722083032131195
epoch 17400 loss -0.3384968042373657 LR -0.3657388687133789 LKL 0.027242062613368034
44
epoch 17401 loss -0.20487374067306519 LR -0.23181778192520142 LKL 0.026944033801555634


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 17402 loss -0.34090226888656616 LR -0.36801353096961975 LKL 0.027111273258924484
epoch 17403 loss -0.2102176398038864 LR -0.23723706603050232 LKL 0.027019426226615906
epoch 17404 loss -0.3295491635799408 LR -0.3567374348640442 LKL 0.027188267558813095
epoch 17405 loss -0.263317346572876 LR -0.2902475893497467 LKL 0.026930242776870728
epoch 17406 loss -0.2917155921459198 LR -0.3188472390174866 LKL 0.027131637558341026
epoch 17407 loss -0.3148668110370636 LR -0.3418363928794861 LKL 0.026969589293003082
epoch 17408 loss -0.3306255638599396 LR -0.35785534977912903 LKL 0.027229787781834602
epoch 17409 loss -0.31718558073043823 LR -0.3442371189594269 LKL 0.027051547542214394
epoch 17410 loss -0.27374133467674255 LR -0.3006562292575836 LKL 0.026914900168776512
epoch 17411 loss -0.3118365705013275 LR -0.3389514982700348 LKL 0.02711494080722332
epoch 17412 loss -0.24954894185066223 LR -0.2765665352344513 LKL 0.02701759710907936
epoch 17413 loss -0.28620976209640503 LR -0.3132927417755127 

epoch 17498 loss -0.31885793805122375 LR -0.34595027565956116 LKL 0.027092326432466507
epoch 17499 loss -0.274362176656723 LR -0.3013993501663208 LKL 0.027037177234888077
epoch 17500 loss -0.2773914337158203 LR -0.30449697375297546 LKL 0.027105532586574554
44
epoch 17501 loss -0.23601369559764862 LR -0.26297473907470703 LKL 0.026961039751768112


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 17502 loss -0.3085781931877136 LR -0.3357629179954529 LKL 0.02718471549451351
epoch 17503 loss -0.28209638595581055 LR -0.3092217743396759 LKL 0.027125384658575058
epoch 17504 loss -0.3365693688392639 LR -0.3636125326156616 LKL 0.02704314887523651
epoch 17505 loss -0.24983038008213043 LR -0.2768325209617615 LKL 0.027002139016985893
epoch 17506 loss -0.22190342843532562 LR -0.2487243413925171 LKL 0.02682090736925602
epoch 17507 loss -0.2916618883609772 LR -0.3186294436454773 LKL 0.026967549696564674
epoch 17508 loss -0.31298568844795227 LR -0.3400976359844208 LKL 0.027111953124403954
epoch 17509 loss -0.2728285789489746 LR -0.2997742295265198 LKL 0.02694566547870636
epoch 17510 loss -0.18865329027175903 LR -0.21540896594524384 LKL 0.0267556831240654
epoch 17511 loss -0.29018858075141907 LR -0.3172847330570221 LKL 0.027096150442957878
epoch 17512 loss -0.20734219253063202 LR -0.2343238890171051 LKL 0.02698170207440853
epoch 17513 loss -0.2520998418331146 LR -0.2790061831474304 LKL 

39
epoch 17601 loss -0.2894071638584137 LR -0.3164929747581482 LKL 0.027085810899734497


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 17602 loss -0.2828335165977478 LR -0.31007319688796997 LKL 0.027239670976996422
epoch 17603 loss -0.2951045036315918 LR -0.32203906774520874 LKL 0.026934579014778137
epoch 17604 loss -0.3309009075164795 LR -0.3579596281051636 LKL 0.027058713138103485
epoch 17605 loss -0.2574048340320587 LR -0.28430959582328796 LKL 0.026904774829745293
epoch 17606 loss -0.2605423331260681 LR -0.2875056266784668 LKL 0.026963280513882637
epoch 17607 loss -0.32982727885246277 LR -0.3569127023220062 LKL 0.0270854365080595
epoch 17608 loss -0.3130955100059509 LR -0.34010523557662964 LKL 0.02700972557067871
epoch 17609 loss -0.30990132689476013 LR -0.3369152843952179 LKL 0.027013948187232018
epoch 17610 loss -0.2831707000732422 LR -0.31030234694480896 LKL 0.027131637558341026
epoch 17611 loss -0.30142492055892944 LR -0.32856643199920654 LKL 0.027141515165567398
epoch 17612 loss -0.28464406728744507 LR -0.31183069944381714 LKL 0.02718663215637207
epoch 17613 loss -0.2661355137825012 LR -0.293166816234588

epoch 17698 loss -0.30906572937965393 LR -0.33623290061950684 LKL 0.027167178690433502
epoch 17699 loss -0.28226977586746216 LR -0.30952611565589905 LKL 0.027256347239017487
epoch 17700 loss -0.23209206759929657 LR -0.2591545283794403 LKL 0.02706245891749859
55
epoch 17701 loss -0.27690163254737854 LR -0.30412858724594116 LKL 0.027226941660046577


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 17702 loss -0.26247626543045044 LR -0.28971201181411743 LKL 0.027235737070441246
epoch 17703 loss -0.31283700466156006 LR -0.3401053249835968 LKL 0.027268333360552788
epoch 17704 loss -0.23739656805992126 LR -0.2644873559474945 LKL 0.02709079161286354
epoch 17705 loss -0.2625074088573456 LR -0.28963997960090637 LKL 0.027132561430335045
epoch 17706 loss -0.2861788868904114 LR -0.31328436732292175 LKL 0.027105489745736122
epoch 17707 loss -0.2948671281337738 LR -0.32210347056388855 LKL 0.027236338704824448
epoch 17708 loss -0.2709672749042511 LR -0.29805096983909607 LKL 0.02708369493484497
epoch 17709 loss -0.31824833154678345 LR -0.345538467168808 LKL 0.02729015052318573
epoch 17710 loss -0.233701691031456 LR -0.26069343090057373 LKL 0.026991745457053185
epoch 17711 loss -0.2574993669986725 LR -0.2846158742904663 LKL 0.027116503566503525
epoch 17712 loss -0.23758593201637268 LR -0.2645095884799957 LKL 0.02692364901304245
epoch 17713 loss -0.2507133185863495 LR -0.2777770161628723 

epoch 17800 loss -0.23236526548862457 LR -0.25934699177742004 LKL 0.02698172628879547
54


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 17801 loss -0.29769304394721985 LR -0.3250024914741516 LKL 0.027309449389576912
epoch 17802 loss -0.3380817174911499 LR -0.3654640316963196 LKL 0.027382321655750275
epoch 17803 loss -0.2837257981300354 LR -0.3108598291873932 LKL 0.02713402546942234
epoch 17804 loss -0.3229614198207855 LR -0.3501706123352051 LKL 0.027209199965000153
epoch 17805 loss -0.2962671220302582 LR -0.3234120011329651 LKL 0.027144884690642357
epoch 17806 loss -0.2802996039390564 LR -0.3074677288532257 LKL 0.027168115600943565
epoch 17807 loss -0.2757720947265625 LR -0.303093284368515 LKL 0.027321193367242813
epoch 17808 loss -0.3107457160949707 LR -0.3379167318344116 LKL 0.02717101201415062
epoch 17809 loss -0.2911103665828705 LR -0.3181689977645874 LKL 0.02705862931907177
epoch 17810 loss -0.3225770592689514 LR -0.3496372699737549 LKL 0.027060214430093765
epoch 17811 loss -0.25404635071754456 LR -0.2809564173221588 LKL 0.02691006101667881
epoch 17812 loss -0.3269680142402649 LR -0.3542674779891968 LKL 0.02

epoch 17897 loss -0.2804611325263977 LR -0.30772799253463745 LKL 0.02726684883236885
epoch 17898 loss -0.2289283126592636 LR -0.25603750348091125 LKL 0.027109190821647644
epoch 17899 loss -0.3117378354072571 LR -0.33890414237976074 LKL 0.027166292071342468
epoch 17900 loss -0.298662930727005 LR -0.3260709047317505 LKL 0.02740797959268093
107
epoch 17901 loss -0.3005131483078003 LR -0.3277818560600281 LKL 0.02726869471371174


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 17902 loss -0.29928287863731384 LR -0.3266550898551941 LKL 0.027372203767299652
epoch 17903 loss -0.21950070559978485 LR -0.2466745376586914 LKL 0.027173830196261406
epoch 17904 loss -0.2954680025577545 LR -0.32278284430503845 LKL 0.02731485106050968
epoch 17905 loss -0.2783709168434143 LR -0.3056142032146454 LKL 0.027243301272392273
epoch 17906 loss -0.27472278475761414 LR -0.3019307553768158 LKL 0.027207981795072556
epoch 17907 loss -0.23635411262512207 LR -0.2635147273540497 LKL 0.02716061845421791
epoch 17908 loss -0.2552252411842346 LR -0.2824673652648926 LKL 0.027242112904787064
epoch 17909 loss -0.2741946876049042 LR -0.3015018701553345 LKL 0.027307195588946342
epoch 17910 loss -0.28023743629455566 LR -0.3075456917285919 LKL 0.02730826660990715
epoch 17911 loss -0.25592607259750366 LR -0.2831016182899475 LKL 0.0271755363792181
epoch 17912 loss -0.30524805188179016 LR -0.3323872983455658 LKL 0.027139248326420784
epoch 17913 loss -0.3614456057548523 LR -0.38872212171554565 L

epoch 17998 loss -0.31408369541168213 LR -0.34154483675956726 LKL 0.027461139485239983
epoch 17999 loss -0.22372907400131226 LR -0.2511414587497711 LKL 0.02741239219903946
epoch 18000 loss -0.2844317853450775 LR -0.3116888999938965 LKL 0.027257125824689865
74
epoch 18001 loss -0.23041315376758575 LR -0.2578083276748657 LKL 0.027395177632570267


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 18002 loss -0.28859901428222656 LR -0.31589919328689575 LKL 0.027300186455249786
epoch 18003 loss -0.32253381609916687 LR -0.3499075174331665 LKL 0.027373701333999634
epoch 18004 loss -0.2792215943336487 LR -0.3065304756164551 LKL 0.027308883145451546
epoch 18005 loss -0.21300911903381348 LR -0.24022743105888367 LKL 0.02721831202507019
epoch 18006 loss -0.23909802734851837 LR -0.26634466648101807 LKL 0.027246640995144844
epoch 18007 loss -0.3057139813899994 LR -0.33301326632499695 LKL 0.027299288660287857
epoch 18008 loss -0.3103254437446594 LR -0.3376263976097107 LKL 0.027300961315631866
epoch 18009 loss -0.35403016209602356 LR -0.38146454095840454 LKL 0.02743438445031643
epoch 18010 loss -0.33797675371170044 LR -0.3653717041015625 LKL 0.027394959703087807
epoch 18011 loss -0.2539827227592468 LR -0.28125128149986267 LKL 0.027268553152680397
epoch 18012 loss -0.22682520747184753 LR -0.2539863884449005 LKL 0.02716118097305298
epoch 18013 loss -0.3049527406692505 LR -0.332221865653

epoch 18099 loss -0.304627001285553 LR -0.3321002125740051 LKL 0.027473222464323044
epoch 18100 loss -0.3472065329551697 LR -0.37486451864242554 LKL 0.02765798754990101
39


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 18101 loss -0.31176891922950745 LR -0.33914482593536377 LKL 0.02737589366734028
epoch 18102 loss -0.30425649881362915 LR -0.3314799666404724 LKL 0.027223477140069008
epoch 18103 loss -0.282614141702652 LR -0.3098824918270111 LKL 0.02726835571229458
epoch 18104 loss -0.23847773671150208 LR -0.26580584049224854 LKL 0.02732810005545616
epoch 18105 loss -0.28178703784942627 LR -0.3090624213218689 LKL 0.02727537974715233
epoch 18106 loss -0.2675333023071289 LR -0.2947339415550232 LKL 0.02720065228641033
epoch 18107 loss -0.2251129150390625 LR -0.2524102032184601 LKL 0.027297290042042732
epoch 18108 loss -0.2913948893547058 LR -0.31866392493247986 LKL 0.027269020676612854
epoch 18109 loss -0.3454417288303375 LR -0.37289151549339294 LKL 0.027449781075119972
epoch 18110 loss -0.23432578146457672 LR -0.2614695429801941 LKL 0.02714376337826252
epoch 18111 loss -0.2686708867549896 LR -0.2960370182991028 LKL 0.027366124093532562
epoch 18112 loss -0.32977527379989624 LR -0.357210636138916 LKL

epoch 18199 loss -0.24312125146389008 LR -0.2704375982284546 LKL 0.027316341176629066
epoch 18200 loss -0.29614514112472534 LR -0.32363760471343994 LKL 0.0274924635887146
67
epoch 18201 loss -0.3287706673145294 LR -0.35624077916145325 LKL 0.027470117434859276


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 18202 loss -0.30113255977630615 LR -0.3285676836967468 LKL 0.02743513323366642
epoch 18203 loss -0.2876597046852112 LR -0.31499943137168884 LKL 0.027339735999703407
epoch 18204 loss -0.3077937662601471 LR -0.33519765734672546 LKL 0.027403894811868668
epoch 18205 loss -0.26077142357826233 LR -0.288006067276001 LKL 0.02723463997244835
epoch 18206 loss -0.2793740928173065 LR -0.3065871000289917 LKL 0.02721300721168518
epoch 18207 loss -0.3441896438598633 LR -0.37159955501556396 LKL 0.02740991860628128
epoch 18208 loss -0.309871643781662 LR -0.33709999918937683 LKL 0.027228355407714844
epoch 18209 loss -0.3312170207500458 LR -0.3586653769016266 LKL 0.027448365464806557
epoch 18210 loss -0.2715862989425659 LR -0.2990027070045471 LKL 0.02741640992462635
epoch 18211 loss -0.3316425383090973 LR -0.35913848876953125 LKL 0.027495939284563065
epoch 18212 loss -0.3784607946872711 LR -0.4059414565563202 LKL 0.027480660006403923
epoch 18213 loss -0.2962801158428192 LR -0.3235965073108673 LKL 0

epoch 18298 loss -0.36855348944664 LR -0.39616379141807556 LKL 0.027610307559370995
epoch 18299 loss -0.2565195858478546 LR -0.2839062809944153 LKL 0.02738669514656067
epoch 18300 loss -0.32853153347969055 LR -0.35596901178359985 LKL 0.02743748016655445
64
epoch 18301 loss -0.332613080739975 LR -0.360138475894928 LKL 0.02752540446817875


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 18302 loss -0.22482863068580627 LR -0.25211620330810547 LKL 0.027287574484944344
epoch 18303 loss -0.3063007593154907 LR -0.33372849225997925 LKL 0.02742774784564972
epoch 18304 loss -0.2781343460083008 LR -0.30544406175613403 LKL 0.027309728786349297
epoch 18305 loss -0.3368147015571594 LR -0.3642963767051697 LKL 0.02748166397213936
epoch 18306 loss -0.27218174934387207 LR -0.2994227409362793 LKL 0.027240993455052376
epoch 18307 loss -0.2713526487350464 LR -0.2986968457698822 LKL 0.02734420634806156
epoch 18308 loss -0.2947886288166046 LR -0.3221721947193146 LKL 0.02738356776535511
epoch 18309 loss -0.29469436407089233 LR -0.32214707136154175 LKL 0.02745269425213337
epoch 18310 loss -0.29475805163383484 LR -0.3221072554588318 LKL 0.027349207550287247
epoch 18311 loss -0.3132903575897217 LR -0.34064027667045593 LKL 0.027349907904863358
epoch 18312 loss -0.22132018208503723 LR -0.24871604144573212 LKL 0.027395861223340034
epoch 18313 loss -0.3105073869228363 LR -0.3379774093627929

epoch 18398 loss -0.32621490955352783 LR -0.3536998927593231 LKL 0.027484972029924393
epoch 18399 loss -0.2994470000267029 LR -0.3268130123615265 LKL 0.027365999296307564
epoch 18400 loss -0.2623138129711151 LR -0.2897800803184509 LKL 0.02746625617146492
85
epoch 18401 loss -0.24518154561519623 LR -0.272649884223938 LKL 0.027468333020806313


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 18402 loss -0.3284235894680023 LR -0.3559149205684662 LKL 0.027491318061947823
epoch 18403 loss -0.25626522302627563 LR -0.283475399017334 LKL 0.027210183441638947
epoch 18404 loss -0.30327996611595154 LR -0.3307482600212097 LKL 0.027468303218483925
epoch 18405 loss -0.2876318097114563 LR -0.3149334490299225 LKL 0.027301648631691933
epoch 18406 loss -0.26844486594200134 LR -0.29588475823402405 LKL 0.027439888566732407
epoch 18407 loss -0.23671004176139832 LR -0.26424020528793335 LKL 0.027530169114470482
epoch 18408 loss -0.2810482084751129 LR -0.30842748284339905 LKL 0.027379261329770088
epoch 18409 loss -0.23362241685390472 LR -0.260869562625885 LKL 0.027247147634625435
epoch 18410 loss -0.24916817247867584 LR -0.27657997608184814 LKL 0.0274118073284626
epoch 18411 loss -0.3390500843524933 LR -0.36633217334747314 LKL 0.027282090857625008
epoch 18412 loss -0.32810136675834656 LR -0.35556772351264954 LKL 0.02746635302901268
epoch 18413 loss -0.22352544963359833 LR -0.2505961656570

epoch 18498 loss -0.2611294388771057 LR -0.28854045271873474 LKL 0.027411021292209625
epoch 18499 loss -0.27777352929115295 LR -0.30498963594436646 LKL 0.027216104790568352
epoch 18500 loss -0.24314194917678833 LR -0.27038419246673584 LKL 0.02724224328994751
54
epoch 18501 loss -0.2670571804046631 LR -0.29426074028015137 LKL 0.027203546836972237


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 18502 loss -0.25102633237838745 LR -0.2784104645252228 LKL 0.02738412655889988
epoch 18503 loss -0.3041037917137146 LR -0.3314718008041382 LKL 0.027368012815713882
epoch 18504 loss -0.261132150888443 LR -0.28857558965682983 LKL 0.02744343876838684
epoch 18505 loss -0.2637653350830078 LR -0.29106616973876953 LKL 0.027300838381052017
epoch 18506 loss -0.19340862333774567 LR -0.22049656510353088 LKL 0.02708793804049492
epoch 18507 loss -0.2943435311317444 LR -0.32185378670692444 LKL 0.0275102648884058
epoch 18508 loss -0.23333534598350525 LR -0.2606133818626404 LKL 0.02727803960442543
epoch 18509 loss -0.28269389271736145 LR -0.31005939841270447 LKL 0.027365509420633316
epoch 18510 loss -0.28983166813850403 LR -0.31730565428733826 LKL 0.027473997324705124
epoch 18511 loss -0.29130780696868896 LR -0.3186347484588623 LKL 0.02732694521546364
epoch 18512 loss -0.23769575357437134 LR -0.2651882469654083 LKL 0.027492500841617584
epoch 18513 loss -0.29854923486709595 LR -0.3258084952831268

epoch 18598 loss -0.28641268610954285 LR -0.31378403306007385 LKL 0.027371348813176155
epoch 18599 loss -0.22984935343265533 LR -0.25716447830200195 LKL 0.027315126731991768
epoch 18600 loss -0.2448720932006836 LR -0.2724369168281555 LKL 0.027564823627471924
37
epoch 18601 loss -0.2600018382072449 LR -0.2874278426170349 LKL 0.027425993233919144


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 18602 loss -0.15937204658985138 LR -0.1866099089384079 LKL 0.02723786048591137
epoch 18603 loss -0.3006351888179779 LR -0.3282710313796997 LKL 0.027635853737592697
epoch 18604 loss -0.24336910247802734 LR -0.2706677317619324 LKL 0.027298636734485626
epoch 18605 loss -0.30673378705978394 LR -0.33419084548950195 LKL 0.02745705656707287
epoch 18606 loss -0.23879460990428925 LR -0.2660773694515228 LKL 0.02728276140987873
epoch 18607 loss -0.30749034881591797 LR -0.3350917100906372 LKL 0.027601370587944984
epoch 18608 loss -0.3027759790420532 LR -0.3301846385002136 LKL 0.027408665046095848
epoch 18609 loss -0.22721140086650848 LR -0.2547204792499542 LKL 0.027509082108736038
epoch 18610 loss -0.30435827374458313 LR -0.3317907452583313 LKL 0.027432478964328766
epoch 18611 loss -0.2801704406738281 LR -0.3074822723865509 LKL 0.027311820536851883
epoch 18612 loss -0.24378138780593872 LR -0.2712346017360687 LKL 0.027453215792775154
epoch 18613 loss -0.25632283091545105 LR -0.283554732799530

epoch 18700 loss -0.20361614227294922 LR -0.2310396134853363 LKL 0.027423463761806488
99
epoch 18701 loss -0.268624484539032 LR -0.29609525203704834 LKL 0.027470776811242104


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 18702 loss -0.2844820022583008 LR -0.31176814436912537 LKL 0.027286149561405182
epoch 18703 loss -0.3647693395614624 LR -0.392471581697464 LKL 0.02770225703716278
epoch 18704 loss -0.2563316822052002 LR -0.2839699387550354 LKL 0.027638258412480354
epoch 18705 loss -0.3121407628059387 LR -0.3396751880645752 LKL 0.027534428983926773
epoch 18706 loss -0.23909321427345276 LR -0.2664789855480194 LKL 0.027385778725147247
epoch 18707 loss -0.3620508313179016 LR -0.38967108726501465 LKL 0.027620261535048485
epoch 18708 loss -0.3107185661792755 LR -0.3382570147514343 LKL 0.02753843553364277
epoch 18709 loss -0.31149131059646606 LR -0.33890578150749207 LKL 0.027414470911026
epoch 18710 loss -0.33094263076782227 LR -0.35846954584121704 LKL 0.027526918798685074
epoch 18711 loss -0.3097713887691498 LR -0.33718210458755493 LKL 0.02741072326898575
epoch 18712 loss -0.32650479674339294 LR -0.35397276282310486 LKL 0.02746795490384102
epoch 18713 loss -0.293849915266037 LR -0.32148274779319763 LKL

epoch 18800 loss -0.29644718766212463 LR -0.323917031288147 LKL 0.027469832450151443
45


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 18801 loss -0.3468432128429413 LR -0.3743780553340912 LKL 0.0275348499417305
epoch 18802 loss -0.3065713047981262 LR -0.3340027332305908 LKL 0.027431441470980644
epoch 18803 loss -0.33895325660705566 LR -0.3667631447315216 LKL 0.027809886261820793
epoch 18804 loss -0.348429411649704 LR -0.3761337101459503 LKL 0.027704287320375443
epoch 18805 loss -0.31347978115081787 LR -0.34101513028144836 LKL 0.027535339817404747
epoch 18806 loss -0.24900603294372559 LR -0.2763860821723938 LKL 0.02738005667924881
epoch 18807 loss -0.3026581406593323 LR -0.33025968074798584 LKL 0.02760152891278267
epoch 18808 loss -0.31244751811027527 LR -0.3400832414627075 LKL 0.0276357289403677
epoch 18809 loss -0.2822151184082031 LR -0.3097323775291443 LKL 0.02751726098358631
epoch 18810 loss -0.23800575733184814 LR -0.26537540555000305 LKL 0.027369650080800056
epoch 18811 loss -0.3133474886417389 LR -0.3408487141132355 LKL 0.027501212432980537
epoch 18812 loss -0.3360702395439148 LR -0.363631933927536 LKL 0.

epoch 18898 loss -0.2419641613960266 LR -0.26945191621780396 LKL 0.02748776040971279
epoch 18899 loss -0.3147331476211548 LR -0.3422277867794037 LKL 0.0274946428835392
epoch 18900 loss -0.3326340615749359 LR -0.3602316379547119 LKL 0.027597567066550255
57
epoch 18901 loss -0.30551156401634216 LR -0.33320802450180054 LKL 0.02769644744694233


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 18902 loss -0.28630468249320984 LR -0.3139002323150635 LKL 0.027595555409789085
epoch 18903 loss -0.23961187899112701 LR -0.26701945066452026 LKL 0.027407575398683548
epoch 18904 loss -0.31287702918052673 LR -0.3405572175979614 LKL 0.027680180966854095
epoch 18905 loss -0.32489439845085144 LR -0.3523966073989868 LKL 0.027502212673425674
epoch 18906 loss -0.3273433744907379 LR -0.35478049516677856 LKL 0.027437128126621246
epoch 18907 loss -0.1782108098268509 LR -0.2055966556072235 LKL 0.02738584764301777
epoch 18908 loss -0.3395858705043793 LR -0.3670903742313385 LKL 0.02750450000166893
epoch 18909 loss -0.3448665738105774 LR -0.3724125325679779 LKL 0.027545953169465065
epoch 18910 loss -0.29256394505500793 LR -0.32022684812545776 LKL 0.027662891894578934
epoch 18911 loss -0.20483334362506866 LR -0.23222443461418152 LKL 0.027391092851758003
epoch 18912 loss -0.270198792219162 LR -0.29758161306381226 LKL 0.027382832020521164
epoch 18913 loss -0.31325769424438477 LR -0.3410069048404

epoch 18999 loss -0.30902811884880066 LR -0.33675944805145264 LKL 0.027731342241168022
epoch 19000 loss -0.30451297760009766 LR -0.3324008584022522 LKL 0.02788788266479969
66
epoch 19001 loss -0.38752102851867676 LR -0.4155104160308838 LKL 0.027989376336336136


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 19002 loss -0.32342997193336487 LR -0.3509241044521332 LKL 0.027494128793478012
epoch 19003 loss -0.3382718563079834 LR -0.3659566044807434 LKL 0.027684757485985756
epoch 19004 loss -0.297807514667511 LR -0.32543542981147766 LKL 0.02762792259454727
epoch 19005 loss -0.25428512692451477 LR -0.2818976640701294 LKL 0.027612535282969475
epoch 19006 loss -0.3595370054244995 LR -0.3872566521167755 LKL 0.027719642966985703
epoch 19007 loss -0.3025820255279541 LR -0.3301814794540405 LKL 0.027599455788731575
epoch 19008 loss -0.23141156136989594 LR -0.2590578496456146 LKL 0.027646292001008987
epoch 19009 loss -0.2688974142074585 LR -0.29643264412879944 LKL 0.027535228058695793
epoch 19010 loss -0.2541514039039612 LR -0.2816537022590637 LKL 0.027502290904521942
epoch 19011 loss -0.29649320244789124 LR -0.32403913140296936 LKL 0.02754591964185238
epoch 19012 loss -0.3143274188041687 LR -0.342011958360672 LKL 0.027684547007083893
epoch 19013 loss -0.32155200839042664 LR -0.34921592473983765 

epoch 19100 loss -0.277723491191864 LR -0.3053540289402008 LKL 0.027630522847175598
67


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 19101 loss -0.314286470413208 LR -0.3420681953430176 LKL 0.02778172306716442
epoch 19102 loss -0.32291942834854126 LR -0.35055050253868103 LKL 0.027631068602204323
epoch 19103 loss -0.3510778844356537 LR -0.3787171244621277 LKL 0.027639251202344894
epoch 19104 loss -0.375772088766098 LR -0.4036087095737457 LKL 0.02783663012087345
epoch 19105 loss -0.32154980301856995 LR -0.34924620389938354 LKL 0.027696387842297554
epoch 19106 loss -0.28660154342651367 LR -0.3141981065273285 LKL 0.027596576139330864
epoch 19107 loss -0.24062387645244598 LR -0.2680662274360657 LKL 0.02744235470890999
epoch 19108 loss -0.2857246398925781 LR -0.3132169842720032 LKL 0.027492357417941093
epoch 19109 loss -0.36306270956993103 LR -0.3908965587615967 LKL 0.027833862230181694
epoch 19110 loss -0.32346317172050476 LR -0.3511659502983093 LKL 0.027702784165740013
epoch 19111 loss -0.3393635153770447 LR -0.36696764826774597 LKL 0.027604147791862488
epoch 19112 loss -0.29491138458251953 LR -0.32239627838134766

epoch 19197 loss -0.31757810711860657 LR -0.3452270030975342 LKL 0.027648886665701866
epoch 19198 loss -0.2643485963344574 LR -0.2919396460056305 LKL 0.02759104035794735
epoch 19199 loss -0.21424682438373566 LR -0.24170702695846558 LKL 0.027460208162665367
epoch 19200 loss -0.23207053542137146 LR -0.2597782015800476 LKL 0.027707668021321297
103
epoch 19201 loss -0.3053208589553833 LR -0.3329809904098511 LKL 0.02766011655330658


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 19202 loss -0.31431975960731506 LR -0.34219974279403687 LKL 0.027879975736141205
epoch 19203 loss -0.3259635269641876 LR -0.35358914732933044 LKL 0.027625611051917076
epoch 19204 loss -0.2810169458389282 LR -0.30878376960754395 LKL 0.027766810730099678
epoch 19205 loss -0.2675667703151703 LR -0.29535165429115295 LKL 0.027784883975982666
epoch 19206 loss -0.2650078237056732 LR -0.2926000654697418 LKL 0.02759222872555256
epoch 19207 loss -0.35426729917526245 LR -0.38214144110679626 LKL 0.027874136343598366
epoch 19208 loss -0.30402421951293945 LR -0.33165422081947327 LKL 0.027629990130662918
epoch 19209 loss -0.32384246587753296 LR -0.35158205032348633 LKL 0.027739588171243668
epoch 19210 loss -0.26858243346214294 LR -0.29608115553855896 LKL 0.027498716488480568
epoch 19211 loss -0.3862363398075104 LR -0.4141959547996521 LKL 0.02795962244272232
epoch 19212 loss -0.28598499298095703 LR -0.31362050771713257 LKL 0.02763551101088524
epoch 19213 loss -0.335201233625412 LR -0.36301609873

epoch 19298 loss -0.30963626503944397 LR -0.33732372522354126 LKL 0.027687454596161842
epoch 19299 loss -0.32370302081108093 LR -0.351346492767334 LKL 0.027643471956253052
epoch 19300 loss -0.27119624614715576 LR -0.29888758063316345 LKL 0.027691340073943138
53
epoch 19301 loss -0.32902929186820984 LR -0.3568567931652069 LKL 0.027827512472867966


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 19302 loss -0.2861683964729309 LR -0.313839316368103 LKL 0.027670925483107567
epoch 19303 loss -0.3098629415035248 LR -0.3377220332622528 LKL 0.027859080582857132
epoch 19304 loss -0.29991278052330017 LR -0.32758578658103943 LKL 0.027673019096255302
epoch 19305 loss -0.2971841096878052 LR -0.3247816860675812 LKL 0.027597574517130852
epoch 19306 loss -0.34804174304008484 LR -0.3757552206516266 LKL 0.027713468298316002
epoch 19307 loss -0.2438097447156906 LR -0.2714707851409912 LKL 0.0276610367000103
epoch 19308 loss -0.32780778408050537 LR -0.35559073090553284 LKL 0.02778293378651142
epoch 19309 loss -0.31687164306640625 LR -0.344588041305542 LKL 0.027716413140296936
epoch 19310 loss -0.2657003700733185 LR -0.2934718728065491 LKL 0.027771495282649994
epoch 19311 loss -0.4078138470649719 LR -0.43577733635902405 LKL 0.027963504195213318
epoch 19312 loss -0.3301590085029602 LR -0.3579595685005188 LKL 0.027800550684332848
epoch 19313 loss -0.24707423150539398 LR -0.2745587229728699 LK

epoch 19398 loss -0.284685343503952 LR -0.3124130368232727 LKL 0.02772769331932068
epoch 19399 loss -0.24204906821250916 LR -0.2696998119354248 LKL 0.02765073999762535
epoch 19400 loss -0.2595893144607544 LR -0.28725725412368774 LKL 0.027667945250868797
68
epoch 19401 loss -0.29898935556411743 LR -0.326748251914978 LKL 0.027758896350860596


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 19402 loss -0.3253612816333771 LR -0.35326945781707764 LKL 0.027908170595765114
epoch 19403 loss -0.3106057345867157 LR -0.33852171897888184 LKL 0.027915989980101585
epoch 19404 loss -0.26761314272880554 LR -0.29545024037361145 LKL 0.02783709391951561
epoch 19405 loss -0.32087352871894836 LR -0.34873855113983154 LKL 0.027865024283528328
epoch 19406 loss -0.3718165457248688 LR -0.39980167150497437 LKL 0.02798512578010559
epoch 19407 loss -0.23927797377109528 LR -0.26693451404571533 LKL 0.027656543999910355
epoch 19408 loss -0.23877614736557007 LR -0.26654189825057983 LKL 0.027765747159719467
epoch 19409 loss -0.2943936586380005 LR -0.3221740126609802 LKL 0.027780352160334587
epoch 19410 loss -0.2739604711532593 LR -0.301763653755188 LKL 0.027803175151348114
epoch 19411 loss -0.20201148092746735 LR -0.2297818660736084 LKL 0.027770385146141052
epoch 19412 loss -0.30948346853256226 LR -0.33727601170539856 LKL 0.0277925506234169
epoch 19413 loss -0.25571149587631226 LR -0.283242255449

epoch 19500 loss -0.33154046535491943 LR -0.35949909687042236 LKL 0.027958635240793228
76


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 19501 loss -0.25402167439460754 LR -0.2819996476173401 LKL 0.02797798067331314
epoch 19502 loss -0.2808724343776703 LR -0.308740496635437 LKL 0.027868064120411873
epoch 19503 loss -0.30487677454948425 LR -0.3326438367366791 LKL 0.02776707336306572
epoch 19504 loss -0.27286791801452637 LR -0.3005776107311249 LKL 0.027709685266017914
epoch 19505 loss -0.2985924482345581 LR -0.32639849185943604 LKL 0.02780603989958763
epoch 19506 loss -0.32579976320266724 LR -0.35359957814216614 LKL 0.027799800038337708
epoch 19507 loss -0.3015901744365692 LR -0.32936105132102966 LKL 0.027770880609750748
epoch 19508 loss -0.33107033371925354 LR -0.35908523201942444 LKL 0.0280148908495903
epoch 19509 loss -0.34916234016418457 LR -0.3770444989204407 LKL 0.027882162481546402
epoch 19510 loss -0.2780478894710541 LR -0.30574408173561096 LKL 0.027696184813976288
epoch 19511 loss -0.32027867436408997 LR -0.3482212424278259 LKL 0.027942564338445663
epoch 19512 loss -0.29328206181526184 LR -0.321020126342773

epoch 19597 loss -0.38303107023239136 LR -0.4110080301761627 LKL 0.027976956218481064
epoch 19598 loss -0.3867760896682739 LR -0.41478514671325684 LKL 0.028009066358208656
epoch 19599 loss -0.23890528082847595 LR -0.26673173904418945 LKL 0.027826454490423203
epoch 19600 loss -0.2361697554588318 LR -0.26404282450675964 LKL 0.02787306345999241
40


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 19601 loss -0.28722891211509705 LR -0.3149958848953247 LKL 0.027766961604356766
epoch 19602 loss -0.32220062613487244 LR -0.35027414560317993 LKL 0.028073513880372047
epoch 19603 loss -0.2964739203453064 LR -0.32422682642936707 LKL 0.027752898633480072
epoch 19604 loss -0.2284487783908844 LR -0.25615087151527405 LKL 0.02770208567380905
epoch 19605 loss -0.3024459481239319 LR -0.3303191661834717 LKL 0.02787320502102375
epoch 19606 loss -0.28289806842803955 LR -0.3108266294002533 LKL 0.02792857028543949
epoch 19607 loss -0.31220296025276184 LR -0.34005939960479736 LKL 0.027856439352035522
epoch 19608 loss -0.3453337848186493 LR -0.3730955421924591 LKL 0.02776176854968071
epoch 19609 loss -0.3014991283416748 LR -0.3294847011566162 LKL 0.02798556722700596
epoch 19610 loss -0.32945752143859863 LR -0.35748735070228577 LKL 0.028029827401041985
epoch 19611 loss -0.3177209794521332 LR -0.34561866521835327 LKL 0.027897698804736137
epoch 19612 loss -0.26049262285232544 LR -0.288396149873733

epoch 19699 loss -0.3473210036754608 LR -0.375240296125412 LKL 0.027919286862015724
epoch 19700 loss -0.24081793427467346 LR -0.26849493384361267 LKL 0.027676992118358612
49
epoch 19701 loss -0.2694663405418396 LR -0.2972913682460785 LKL 0.027825037017464638


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 19702 loss -0.2862710952758789 LR -0.3138953745365143 LKL 0.027624277397990227
epoch 19703 loss -0.29428553581237793 LR -0.3221708834171295 LKL 0.02788536250591278
epoch 19704 loss -0.32981470227241516 LR -0.35774242877960205 LKL 0.027927737683057785
epoch 19705 loss -0.30611151456832886 LR -0.3340637683868408 LKL 0.027952266857028008
epoch 19706 loss -0.33095401525497437 LR -0.35881128907203674 LKL 0.02785726822912693
epoch 19707 loss -0.26541468501091003 LR -0.29304492473602295 LKL 0.027630247175693512
epoch 19708 loss -0.3592720925807953 LR -0.3871802091598511 LKL 0.027908124029636383
epoch 19709 loss -0.31771397590637207 LR -0.34549838304519653 LKL 0.02778439410030842
epoch 19710 loss -0.28825435042381287 LR -0.3161344826221466 LKL 0.027880128473043442
epoch 19711 loss -0.3008047640323639 LR -0.32879677414894104 LKL 0.027992013841867447
epoch 19712 loss -0.30212000012397766 LR -0.3300110101699829 LKL 0.027891011908650398
epoch 19713 loss -0.28734493255615234 LR -0.31542557477

epoch 19800 loss -0.2599509656429291 LR -0.2877761423587799 LKL 0.027825171127915382
53


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 19801 loss -0.3170333504676819 LR -0.34505170583724976 LKL 0.028018340468406677
epoch 19802 loss -0.22478744387626648 LR -0.25259077548980713 LKL 0.02780332788825035
epoch 19803 loss -0.31928011775016785 LR -0.346988320350647 LKL 0.027708204463124275
epoch 19804 loss -0.3151319921016693 LR -0.3429136872291565 LKL 0.02778170071542263
epoch 19805 loss -0.30769994854927063 LR -0.33561068773269653 LKL 0.027910742908716202
epoch 19806 loss -0.29668623208999634 LR -0.32441645860671997 LKL 0.02773023396730423
epoch 19807 loss -0.27635636925697327 LR -0.3041192293167114 LKL 0.027762852609157562
epoch 19808 loss -0.2678324580192566 LR -0.29560601711273193 LKL 0.027773557230830193
epoch 19809 loss -0.3551679253578186 LR -0.38317838311195374 LKL 0.028010457754135132
epoch 19810 loss -0.31797629594802856 LR -0.3459489941596985 LKL 0.027972692623734474
epoch 19811 loss -0.33730652928352356 LR -0.3653407394886017 LKL 0.028034213930368423
epoch 19812 loss -0.3057229816913605 LR -0.3337637186050

epoch 19897 loss -0.26714056730270386 LR -0.2951279282569885 LKL 0.027987368404865265
epoch 19898 loss -0.2902225852012634 LR -0.3182031214237213 LKL 0.027980521321296692
epoch 19899 loss -0.3335564136505127 LR -0.36144500970840454 LKL 0.027888597920536995
epoch 19900 loss -0.3273949921131134 LR -0.3553653955459595 LKL 0.02797039970755577
100
epoch 19901 loss -0.30482131242752075 LR -0.3327740430831909 LKL 0.027952726930379868


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 19902 loss -0.28626829385757446 LR -0.31405216455459595 LKL 0.02778388001024723
epoch 19903 loss -0.2837979197502136 LR -0.3116344213485718 LKL 0.02783648855984211
epoch 19904 loss -0.3476557433605194 LR -0.3757009506225586 LKL 0.028045212849974632
epoch 19905 loss -0.29623863101005554 LR -0.32421496510505676 LKL 0.027976321056485176
epoch 19906 loss -0.35870712995529175 LR -0.3866981565952301 LKL 0.027991026639938354
epoch 19907 loss -0.29735633730888367 LR -0.3252686560153961 LKL 0.027912311255931854
epoch 19908 loss -0.3199305832386017 LR -0.34800875186920166 LKL 0.028078177943825722
epoch 19909 loss -0.2995063066482544 LR -0.32736703753471375 LKL 0.027860719710588455
epoch 19910 loss -0.34001410007476807 LR -0.367935448884964 LKL 0.02792135439813137
epoch 19911 loss -0.38216495513916016 LR -0.4101537764072418 LKL 0.027988817542791367
epoch 19912 loss -0.3494805097579956 LR -0.37729012966156006 LKL 0.027809614315629005
epoch 19913 loss -0.33613431453704834 LR -0.36380124092102

epoch 19998 loss -0.3004048466682434 LR -0.32838138937950134 LKL 0.02797655761241913
epoch 19999 loss -0.3337557911872864 LR -0.3617326021194458 LKL 0.027976814657449722
epoch 20000 loss -0.3301084339618683 LR -0.3582133650779724 LKL 0.02810492552816868
82
epoch 20001 loss -0.308981716632843 LR -0.3371817171573639 LKL 0.028200000524520874


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 20002 loss -0.32721149921417236 LR -0.35528773069381714 LKL 0.02807624079287052
epoch 20003 loss -0.31291449069976807 LR -0.34094810485839844 LKL 0.028033606708049774
epoch 20004 loss -0.28964969515800476 LR -0.3175762891769409 LKL 0.027926603332161903
epoch 20005 loss -0.33744940161705017 LR -0.36556968092918396 LKL 0.028120292350649834
epoch 20006 loss -0.36252692341804504 LR -0.3907244801521301 LKL 0.02819756045937538
epoch 20007 loss -0.32546132802963257 LR -0.35359475016593933 LKL 0.028133420273661613
epoch 20008 loss -0.3152908980846405 LR -0.3433164358139038 LKL 0.028025534003973007
epoch 20009 loss -0.3393505811691284 LR -0.3675425052642822 LKL 0.028191937133669853
epoch 20010 loss -0.2933172285556793 LR -0.32128670811653137 LKL 0.0279694851487875
epoch 20011 loss -0.3511795401573181 LR -0.3792976140975952 LKL 0.0281180739402771
epoch 20012 loss -0.28539377450942993 LR -0.31316980719566345 LKL 0.027776043862104416
epoch 20013 loss -0.25201040506362915 LR -0.27982831001281

77
epoch 20101 loss -0.32539504766464233 LR -0.35341545939445496 LKL 0.02802041918039322


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 20102 loss -0.33986085653305054 LR -0.3679066300392151 LKL 0.028045782819390297
epoch 20103 loss -0.2809692323207855 LR -0.30881041288375854 LKL 0.027841176837682724
epoch 20104 loss -0.33985409140586853 LR -0.3679131865501404 LKL 0.028059108182787895
epoch 20105 loss -0.35365909337997437 LR -0.3817426860332489 LKL 0.02808358334004879
epoch 20106 loss -0.3157350420951843 LR -0.34388092160224915 LKL 0.028145868331193924
epoch 20107 loss -0.2765576243400574 LR -0.30460524559020996 LKL 0.02804761566221714
epoch 20108 loss -0.30024513602256775 LR -0.3281475901603699 LKL 0.027902446687221527
epoch 20109 loss -0.3492067754268646 LR -0.37723106145858765 LKL 0.02802429534494877
epoch 20110 loss -0.3552732765674591 LR -0.38354185223579407 LKL 0.02826857753098011
epoch 20111 loss -0.313716858625412 LR -0.34185999631881714 LKL 0.028143148869276047
epoch 20112 loss -0.3256952464580536 LR -0.35381990671157837 LKL 0.02812466397881508
epoch 20113 loss -0.30351534485816956 LR -0.3314882814884186

epoch 20198 loss -0.2698014974594116 LR -0.29797065258026123 LKL 0.028169166296720505
epoch 20199 loss -0.2887052595615387 LR -0.3165538012981415 LKL 0.02784852869808674
epoch 20200 loss -0.3053882122039795 LR -0.3334773778915405 LKL 0.028089163824915886
72
epoch 20201 loss -0.2932794690132141 LR -0.3214739263057709 LKL 0.028194459155201912


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 20202 loss -0.2855035960674286 LR -0.3134678900241852 LKL 0.02796429954469204
epoch 20203 loss -0.3177732825279236 LR -0.3457532227039337 LKL 0.027979930862784386
epoch 20204 loss -0.3173989951610565 LR -0.34564077854156494 LKL 0.028241774067282677
epoch 20205 loss -0.31146854162216187 LR -0.33966386318206787 LKL 0.028195329010486603
epoch 20206 loss -0.27212291955947876 LR -0.3001902103424072 LKL 0.02806730568408966
epoch 20207 loss -0.2951989471912384 LR -0.3233484923839569 LKL 0.02814953401684761
epoch 20208 loss -0.2976718246936798 LR -0.325661301612854 LKL 0.027989475056529045
epoch 20209 loss -0.2853254973888397 LR -0.31347453594207764 LKL 0.028149044141173363
epoch 20210 loss -0.3382117450237274 LR -0.3662399649620056 LKL 0.028028227388858795
epoch 20211 loss -0.24377818405628204 LR -0.2716291844844818 LKL 0.027851002290844917
epoch 20212 loss -0.2725750207901001 LR -0.3006722629070282 LKL 0.028097251430153847
epoch 20213 loss -0.28704145550727844 LR -0.31499695777893066 L

epoch 20298 loss -0.3466317653656006 LR -0.3748646080493927 LKL 0.02823285385966301
epoch 20299 loss -0.3194236159324646 LR -0.34749850630760193 LKL 0.028074875473976135
epoch 20300 loss -0.2980290949344635 LR -0.32614561915397644 LKL 0.028116516768932343
62
epoch 20301 loss -0.2683495879173279 LR -0.296406090259552 LKL 0.028056517243385315


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 20302 loss -0.30144888162612915 LR -0.3295362591743469 LKL 0.02808736264705658
epoch 20303 loss -0.3386912941932678 LR -0.3667771816253662 LKL 0.028085893020033836
epoch 20304 loss -0.3688340187072754 LR -0.39727479219436646 LKL 0.02844078280031681
epoch 20305 loss -0.28806445002555847 LR -0.3162606656551361 LKL 0.028196224942803383
epoch 20306 loss -0.2948269248008728 LR -0.32295626401901245 LKL 0.028129346668720245
epoch 20307 loss -0.20254671573638916 LR -0.23051416873931885 LKL 0.027967454865574837
epoch 20308 loss -0.33613988757133484 LR -0.3643662929534912 LKL 0.02822640910744667
epoch 20309 loss -0.26926371455192566 LR -0.2973300814628601 LKL 0.028066353872418404
epoch 20310 loss -0.29555320739746094 LR -0.3236406445503235 LKL 0.02808743715286255
epoch 20311 loss -0.34454843401908875 LR -0.3726499378681183 LKL 0.028101513162255287
epoch 20312 loss -0.24538908898830414 LR -0.2732917070388794 LKL 0.027902621775865555
epoch 20313 loss -0.32353487610816956 LR -0.35170426964759

epoch 20398 loss -0.3081603944301605 LR -0.33644333481788635 LKL 0.02828294225037098
epoch 20399 loss -0.32951590418815613 LR -0.35760408639907837 LKL 0.028088169172406197
epoch 20400 loss -0.359463095664978 LR -0.3877541422843933 LKL 0.02829103358089924
37
epoch 20401 loss -0.29888075590133667 LR -0.3269904851913452 LKL 0.028109727427363396


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 20402 loss -0.2792832553386688 LR -0.3072315454483032 LKL 0.027948277071118355
epoch 20403 loss -0.28963878750801086 LR -0.3176240921020508 LKL 0.027985312044620514
epoch 20404 loss -0.2844656705856323 LR -0.3126165270805359 LKL 0.02815084345638752
epoch 20405 loss -0.28338637948036194 LR -0.3114972412586212 LKL 0.028110867366194725
epoch 20406 loss -0.39307647943496704 LR -0.4213852882385254 LKL 0.028308799490332603
epoch 20407 loss -0.2853737771511078 LR -0.31347113847732544 LKL 0.028097353875637054
epoch 20408 loss -0.29110193252563477 LR -0.3192773461341858 LKL 0.02817542478442192
epoch 20409 loss -0.3520776927471161 LR -0.3803595304489136 LKL 0.02828182652592659
epoch 20410 loss -0.2897456884384155 LR -0.31784704327583313 LKL 0.028101354837417603
epoch 20411 loss -0.3570736050605774 LR -0.385532408952713 LKL 0.02845880016684532
epoch 20412 loss -0.3244342803955078 LR -0.3526611924171448 LKL 0.028226904571056366
epoch 20413 loss -0.2670844495296478 LR -0.2952340841293335 LKL 

74
epoch 20501 loss -0.31882143020629883 LR -0.34708088636398315 LKL 0.028259456157684326


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 20502 loss -0.3316250741481781 LR -0.3599422872066498 LKL 0.028317224234342575
epoch 20503 loss -0.29160428047180176 LR -0.31992465257644653 LKL 0.02832038514316082
epoch 20504 loss -0.31490373611450195 LR -0.3430413007736206 LKL 0.0281375702470541
epoch 20505 loss -0.30131539702415466 LR -0.32944247126579285 LKL 0.028127074241638184
epoch 20506 loss -0.2641673982143402 LR -0.29234862327575684 LKL 0.02818121574819088
epoch 20507 loss -0.2898275554180145 LR -0.3179202377796173 LKL 0.02809269167482853
epoch 20508 loss -0.27087339758872986 LR -0.2992091774940491 LKL 0.028335774317383766
epoch 20509 loss -0.3533015847206116 LR -0.38149186968803406 LKL 0.02819029428064823
epoch 20510 loss -0.25984054803848267 LR -0.28798770904541016 LKL 0.028147170320153236
epoch 20511 loss -0.3027532696723938 LR -0.33060434460639954 LKL 0.027851060032844543
epoch 20512 loss -0.30567142367362976 LR -0.33377373218536377 LKL 0.028102297335863113
epoch 20513 loss -0.3576255440711975 LR -0.385740458965301

epoch 20598 loss -0.2957499325275421 LR -0.3240290880203247 LKL 0.02827916108071804
epoch 20599 loss -0.3165077865123749 LR -0.3444827198982239 LKL 0.027974935248494148
epoch 20600 loss -0.28398919105529785 LR -0.3122987747192383 LKL 0.028309576213359833
68
epoch 20601 loss -0.28633221983909607 LR -0.31446373462677 LKL 0.028131509199738503


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 20602 loss -0.31732726097106934 LR -0.3455418348312378 LKL 0.02821457013487816
epoch 20603 loss -0.33478835225105286 LR -0.3629704415798187 LKL 0.028182080015540123
epoch 20604 loss -0.3055804371833801 LR -0.3337496221065521 LKL 0.02816919982433319
epoch 20605 loss -0.26941221952438354 LR -0.2974739670753479 LKL 0.02806173823773861
epoch 20606 loss -0.33970752358436584 LR -0.36795347929000854 LKL 0.02824595384299755
epoch 20607 loss -0.2696510851383209 LR -0.29779157042503357 LKL 0.028140490874648094
epoch 20608 loss -0.3167225122451782 LR -0.3447551727294922 LKL 0.028032656759023666
epoch 20609 loss -0.2603530287742615 LR -0.28836777806282043 LKL 0.028014739975333214
epoch 20610 loss -0.3327171504497528 LR -0.36077240109443665 LKL 0.028055256232619286
epoch 20611 loss -0.3535078465938568 LR -0.38189202547073364 LKL 0.028384167701005936
epoch 20612 loss -0.2981826663017273 LR -0.3262622654438019 LKL 0.028079597279429436
epoch 20613 loss -0.3489561080932617 LR -0.37719088792800903

epoch 20698 loss -0.3944234549999237 LR -0.4226580262184143 LKL 0.028234582394361496
epoch 20699 loss -0.3321206867694855 LR -0.3604271113872528 LKL 0.028306422755122185
epoch 20700 loss -0.3397974371910095 LR -0.3679315149784088 LKL 0.028134077787399292
85
epoch 20701 loss -0.3208012580871582 LR -0.3488638997077942 LKL 0.028062650933861732


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 20702 loss -0.2888626754283905 LR -0.31699615716934204 LKL 0.028133483603596687
epoch 20703 loss -0.2844550609588623 LR -0.3126389980316162 LKL 0.028183935210108757
epoch 20704 loss -0.3265115022659302 LR -0.3545887768268585 LKL 0.0280772615224123
epoch 20705 loss -0.30571040511131287 LR -0.3337550759315491 LKL 0.02804466523230076
epoch 20706 loss -0.3379560112953186 LR -0.36619991064071655 LKL 0.028243912383913994
epoch 20707 loss -0.3618755042552948 LR -0.39022478461265564 LKL 0.02834928408265114
epoch 20708 loss -0.31940242648124695 LR -0.34750422835350037 LKL 0.028101807460188866
epoch 20709 loss -0.2834498882293701 LR -0.3117421269416809 LKL 0.028292253613471985
epoch 20710 loss -0.321956604719162 LR -0.35024064779281616 LKL 0.028284041211009026
epoch 20711 loss -0.3458220958709717 LR -0.3740319609642029 LKL 0.028209878131747246
epoch 20712 loss -0.2904619872570038 LR -0.31870585680007935 LKL 0.028243878856301308
epoch 20713 loss -0.39301687479019165 LR -0.4212740361690521 L

epoch 20800 loss -0.31394606828689575 LR -0.34217777848243713 LKL 0.028231708332896233
93
epoch 20801 loss -0.30781644582748413 LR -0.3361295163631439 LKL 0.028313081711530685


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 20802 loss -0.3255171477794647 LR -0.35374119877815247 LKL 0.028224050998687744
epoch 20803 loss -0.3087616264820099 LR -0.3368567228317261 LKL 0.028095096349716187
epoch 20804 loss -0.3151232898235321 LR -0.3432593047618866 LKL 0.028136003762483597
epoch 20805 loss -0.3147011399269104 LR -0.3430229425430298 LKL 0.028321802616119385
epoch 20806 loss -0.3170638978481293 LR -0.3451785147190094 LKL 0.028114626184105873
epoch 20807 loss -0.32018446922302246 LR -0.3484728932380676 LKL 0.028288424015045166
epoch 20808 loss -0.32765406370162964 LR -0.3558856248855591 LKL 0.02823156677186489
epoch 20809 loss -0.2745046019554138 LR -0.3026049733161926 LKL 0.028100380674004555
epoch 20810 loss -0.3658978044986725 LR -0.394111692905426 LKL 0.028213899582624435
epoch 20811 loss -0.3348018527030945 LR -0.3630969524383545 LKL 0.02829509973526001
epoch 20812 loss -0.37288400530815125 LR -0.4011106789112091 LKL 0.028226666152477264
epoch 20813 loss -0.3427007496356964 LR -0.3709181547164917 LKL 

epoch 20898 loss -0.288503497838974 LR -0.31672054529190063 LKL 0.02821703627705574
epoch 20899 loss -0.30048736929893494 LR -0.3285619020462036 LKL 0.02807452529668808
epoch 20900 loss -0.32561421394348145 LR -0.35390645265579224 LKL 0.02829224057495594
71
epoch 20901 loss -0.3235419988632202 LR -0.3516809940338135 LKL 0.028138985857367516


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 20902 loss -0.3026546239852905 LR -0.3308814465999603 LKL 0.028226817026734352
epoch 20903 loss -0.2967284023761749 LR -0.32506507635116577 LKL 0.02833668142557144
epoch 20904 loss -0.2978576421737671 LR -0.3261116147041321 LKL 0.028253987431526184
epoch 20905 loss -0.34004485607147217 LR -0.3681134581565857 LKL 0.02806858718395233
epoch 20906 loss -0.37479332089424133 LR -0.4031664729118347 LKL 0.028373152017593384
epoch 20907 loss -0.349663108587265 LR -0.37808704376220703 LKL 0.028423938900232315
epoch 20908 loss -0.2804039418697357 LR -0.30870321393013 LKL 0.02829926647245884
epoch 20909 loss -0.2886696457862854 LR -0.31684431433677673 LKL 0.028174666687846184
epoch 20910 loss -0.25220048427581787 LR -0.2804120182991028 LKL 0.028211547061800957
epoch 20911 loss -0.3441043198108673 LR -0.37229451537132263 LKL 0.028190208598971367
epoch 20912 loss -0.35458144545555115 LR -0.38308191299438477 LKL 0.028500476852059364
epoch 20913 loss -0.34458276629447937 LR -0.37310460209846497 

epoch 21000 loss -0.3298610746860504 LR -0.35808247327804565 LKL 0.028221406042575836
96
epoch 21001 loss -0.32900992035865784 LR -0.3574700355529785 LKL 0.028460128232836723


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 21002 loss -0.27473244071006775 LR -0.30291858315467834 LKL 0.02818615362048149
epoch 21003 loss -0.305869996547699 LR -0.33421093225479126 LKL 0.02834094874560833
epoch 21004 loss -0.3677385747432709 LR -0.39608243107795715 LKL 0.028343861922621727
epoch 21005 loss -0.299265593290329 LR -0.32752177119255066 LKL 0.028256168588995934
epoch 21006 loss -0.3269560933113098 LR -0.3553321957588196 LKL 0.02837611548602581
epoch 21007 loss -0.3676925003528595 LR -0.3963332176208496 LKL 0.02864072099328041
epoch 21008 loss -0.31456825137138367 LR -0.3431037664413452 LKL 0.0285355057567358
epoch 21009 loss -0.3038271367549896 LR -0.3321741819381714 LKL 0.028347034007310867
epoch 21010 loss -0.3744445741176605 LR -0.40296614170074463 LKL 0.028521565720438957
epoch 21011 loss -0.2714831829071045 LR -0.2999381124973297 LKL 0.028454938903450966
epoch 21012 loss -0.19479307532310486 LR -0.22288015484809875 LKL 0.028087077662348747
epoch 21013 loss -0.3051935136318207 LR -0.3336740732192993 LKL 

76
epoch 21101 loss -0.2913116216659546 LR -0.3196702301502228 LKL 0.02835860103368759


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 21102 loss -0.29935500025749207 LR -0.3277871906757355 LKL 0.028432199731469154
epoch 21103 loss -0.326388955116272 LR -0.354872465133667 LKL 0.02848351001739502
epoch 21104 loss -0.29952630400657654 LR -0.3279355764389038 LKL 0.02840927243232727
epoch 21105 loss -0.3324362635612488 LR -0.36090561747550964 LKL 0.028469359502196312
epoch 21106 loss -0.35135170817375183 LR -0.37983959913253784 LKL 0.028487883508205414
epoch 21107 loss -0.3334965407848358 LR -0.361834317445755 LKL 0.028337784111499786
epoch 21108 loss -0.3434188663959503 LR -0.37181293964385986 LKL 0.0283940602093935
epoch 21109 loss -0.28292569518089294 LR -0.3113478720188141 LKL 0.02842218428850174
epoch 21110 loss -0.3594099283218384 LR -0.3879324793815613 LKL 0.028522538021206856
epoch 21111 loss -0.3647976219654083 LR -0.3932473957538605 LKL 0.028449762612581253
epoch 21112 loss -0.3015984892845154 LR -0.33012276887893677 LKL 0.02852429263293743
epoch 21113 loss -0.27315983176231384 LR -0.30140024423599243 LKL 

epoch 21198 loss -0.3460727334022522 LR -0.37465110421180725 LKL 0.02857835777103901
epoch 21199 loss -0.35458850860595703 LR -0.38305437564849854 LKL 0.02846585400402546
epoch 21200 loss -0.3790357708930969 LR -0.40750306844711304 LKL 0.028467310592532158
69
epoch 21201 loss -0.3418925106525421 LR -0.3703041076660156 LKL 0.028411591425538063


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 21202 loss -0.32748445868492126 LR -0.35589906573295593 LKL 0.02841460146009922
epoch 21203 loss -0.398345023393631 LR -0.4267478883266449 LKL 0.02840285748243332
epoch 21204 loss -0.2980230450630188 LR -0.3262985348701477 LKL 0.02827550284564495
epoch 21205 loss -0.34050554037094116 LR -0.36906707286834717 LKL 0.028561528772115707
epoch 21206 loss -0.35957059264183044 LR -0.3881414532661438 LKL 0.0285708736628294
epoch 21207 loss -0.2859307825565338 LR -0.31437259912490845 LKL 0.028441810980439186
epoch 21208 loss -0.307902991771698 LR -0.3363022208213806 LKL 0.028399214148521423
epoch 21209 loss -0.2775881290435791 LR -0.30577465891838074 LKL 0.028186539188027382
epoch 21210 loss -0.31166255474090576 LR -0.3399978578090668 LKL 0.028335291892290115
epoch 21211 loss -0.29312750697135925 LR -0.32158178091049194 LKL 0.028454279527068138
epoch 21212 loss -0.34771499037742615 LR -0.37605273723602295 LKL 0.028337759897112846
epoch 21213 loss -0.243087500333786 LR -0.2714809775352478 L

52
epoch 21301 loss -0.3116393983364105 LR -0.3402263820171356 LKL 0.028586994856595993


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 21302 loss -0.34824803471565247 LR -0.37678447365760803 LKL 0.02853643335402012
epoch 21303 loss -0.3069133758544922 LR -0.33541032671928406 LKL 0.02849694900214672
epoch 21304 loss -0.3340994119644165 LR -0.3627735376358032 LKL 0.02867412567138672
epoch 21305 loss -0.3562091886997223 LR -0.3847588896751404 LKL 0.028549712151288986
epoch 21306 loss -0.39482924342155457 LR -0.42359238862991333 LKL 0.02876313403248787
epoch 21307 loss -0.31279298663139343 LR -0.34125399589538574 LKL 0.028460999950766563
epoch 21308 loss -0.3951139748096466 LR -0.42362117767333984 LKL 0.028507212176918983
epoch 21309 loss -0.2818610668182373 LR -0.31022554636001587 LKL 0.028364472091197968
epoch 21310 loss -0.37938031554222107 LR -0.4080537259578705 LKL 0.028673406690359116
epoch 21311 loss -0.3076757788658142 LR -0.3363063931465149 LKL 0.02863061986863613
epoch 21312 loss -0.262664258480072 LR -0.291120707988739 LKL 0.02845645509660244
epoch 21313 loss -0.30555668473243713 LR -0.33400487899780273 L

epoch 21400 loss -0.3181082606315613 LR -0.34670543670654297 LKL 0.028597179800271988
70


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 21401 loss -0.371223121881485 LR -0.4000037908554077 LKL 0.028780682012438774
epoch 21402 loss -0.3588985800743103 LR -0.38755306601524353 LKL 0.028654497116804123
epoch 21403 loss -0.34704092144966125 LR -0.3755187392234802 LKL 0.028477808460593224
epoch 21404 loss -0.30629169940948486 LR -0.33497023582458496 LKL 0.02867855131626129
epoch 21405 loss -0.3779802918434143 LR -0.40649157762527466 LKL 0.028511296957731247
epoch 21406 loss -0.3453178405761719 LR -0.37392309308052063 LKL 0.02860524132847786
epoch 21407 loss -0.25940483808517456 LR -0.28796806931495667 LKL 0.028563242405653
epoch 21408 loss -0.35511454939842224 LR -0.38385486602783203 LKL 0.028740307316184044
epoch 21409 loss -0.2863669991493225 LR -0.3149627447128296 LKL 0.02859574742615223
epoch 21410 loss -0.26320940256118774 LR -0.29164573550224304 LKL 0.028436340391635895
epoch 21411 loss -0.27993711829185486 LR -0.30831605195999146 LKL 0.028378920629620552
epoch 21412 loss -0.2695583999156952 LR -0.297928810119628

epoch 21499 loss -0.27775105834007263 LR -0.3063109219074249 LKL 0.028559867292642593
epoch 21500 loss -0.37056151032447815 LR -0.39919185638427734 LKL 0.028630351647734642
61
epoch 21501 loss -0.34191009402275085 LR -0.3706054985523224 LKL 0.02869539149105549


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 21502 loss -0.3451666831970215 LR -0.373803973197937 LKL 0.028637290000915527
epoch 21503 loss -0.28922295570373535 LR -0.31780681014060974 LKL 0.028583861887454987
epoch 21504 loss -0.3187516927719116 LR -0.3475973308086395 LKL 0.028845645487308502
epoch 21505 loss -0.3211851119995117 LR -0.34993550181388855 LKL 0.028750380501151085
epoch 21506 loss -0.285203218460083 LR -0.3137356638908386 LKL 0.02853243798017502
epoch 21507 loss -0.31536221504211426 LR -0.3440830111503601 LKL 0.028720809146761894
epoch 21508 loss -0.3293491303920746 LR -0.357888787984848 LKL 0.02853964827954769
epoch 21509 loss -0.31651797890663147 LR -0.34530922770500183 LKL 0.02879125066101551
epoch 21510 loss -0.3031594157218933 LR -0.3315030336380005 LKL 0.02834361046552658
epoch 21511 loss -0.25267329812049866 LR -0.28125280141830444 LKL 0.02857951447367668
epoch 21512 loss -0.26711514592170715 LR -0.2958328127861023 LKL 0.02871767058968544
epoch 21513 loss -0.2913232445716858 LR -0.31980541348457336 LKL 

epoch 21598 loss -0.32083648443222046 LR -0.34956812858581543 LKL 0.02873164415359497
epoch 21599 loss -0.26842865347862244 LR -0.2972228527069092 LKL 0.028794195502996445
epoch 21600 loss -0.3422631025314331 LR -0.3708912432193756 LKL 0.02862812578678131
68
epoch 21601 loss -0.3521314859390259 LR -0.3807815909385681 LKL 0.02865009568631649


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 21602 loss -0.3392016887664795 LR -0.36771321296691895 LKL 0.0285115297883749
epoch 21603 loss -0.33718976378440857 LR -0.3658531904220581 LKL 0.02866343781352043
epoch 21604 loss -0.36588579416275024 LR -0.3947274684906006 LKL 0.028841665014624596
epoch 21605 loss -0.3289085626602173 LR -0.3577152192592621 LKL 0.0288066565990448
epoch 21606 loss -0.3245040476322174 LR -0.35317134857177734 LKL 0.028667306527495384
epoch 21607 loss -0.3521968424320221 LR -0.38100335001945496 LKL 0.02880651131272316
epoch 21608 loss -0.3370702266693115 LR -0.36588943004608154 LKL 0.028819192200899124
epoch 21609 loss -0.29110825061798096 LR -0.31971392035484314 LKL 0.028605680912733078
epoch 21610 loss -0.3318459987640381 LR -0.3605634272098541 LKL 0.028717419132590294
epoch 21611 loss -0.3195907175540924 LR -0.3483424484729767 LKL 0.028751719743013382
epoch 21612 loss -0.2364935725927353 LR -0.2649896740913391 LKL 0.028496095910668373
epoch 21613 loss -0.26912254095077515 LR -0.29774659872055054 L

epoch 21698 loss -0.25585487484931946 LR -0.2845156192779541 LKL 0.028660744428634644
epoch 21699 loss -0.3357430696487427 LR -0.36441829800605774 LKL 0.028675232082605362
epoch 21700 loss -0.28001725673675537 LR -0.30881819128990173 LKL 0.028800923377275467
60
epoch 21701 loss -0.34778836369514465 LR -0.3765029311180115 LKL 0.02871456742286682


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 21702 loss -0.39226847887039185 LR -0.4210124611854553 LKL 0.028743986040353775
epoch 21703 loss -0.3454217314720154 LR -0.3741905093193054 LKL 0.028768785297870636
epoch 21704 loss -0.3307438790798187 LR -0.35958388447761536 LKL 0.028839994221925735
epoch 21705 loss -0.3323972821235657 LR -0.3611055016517639 LKL 0.028708210214972496
epoch 21706 loss -0.35838648676872253 LR -0.38715580105781555 LKL 0.028769303113222122
epoch 21707 loss -0.37021705508232117 LR -0.3988487720489502 LKL 0.028631726279854774
epoch 21708 loss -0.3500765562057495 LR -0.37902218103408813 LKL 0.02894563227891922
epoch 21709 loss -0.32086607813835144 LR -0.34963613748550415 LKL 0.028770064935088158
epoch 21710 loss -0.3536180853843689 LR -0.3823856711387634 LKL 0.028767600655555725
epoch 21711 loss -0.3128231465816498 LR -0.341439425945282 LKL 0.0286162868142128
epoch 21712 loss -0.3446132242679596 LR -0.3732776641845703 LKL 0.028664445504546165
epoch 21713 loss -0.3611976206302643 LR -0.3899952471256256 L

63
epoch 21801 loss -0.3662303388118744 LR -0.3949449956417084 LKL 0.028714662417769432


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 21802 loss -0.27102720737457275 LR -0.2996802031993866 LKL 0.028652997687458992
epoch 21803 loss -0.34448036551475525 LR -0.3733077645301819 LKL 0.028827404603362083
epoch 21804 loss -0.378394216299057 LR -0.4072335362434387 LKL 0.02883933298289776
epoch 21805 loss -0.3193381726741791 LR -0.34820568561553955 LKL 0.028867512941360474
epoch 21806 loss -0.28575342893600464 LR -0.3144300580024719 LKL 0.02867661789059639
epoch 21807 loss -0.3584815561771393 LR -0.3872542381286621 LKL 0.028772683814167976
epoch 21808 loss -0.30845963954925537 LR -0.33713480830192566 LKL 0.02867516502737999
epoch 21809 loss -0.248038649559021 LR -0.2765488922595978 LKL 0.028510235249996185
epoch 21810 loss -0.38266268372535706 LR -0.4115501046180725 LKL 0.02888742834329605
epoch 21811 loss -0.3835439383983612 LR -0.41237813234329224 LKL 0.028834190219640732
epoch 21812 loss -0.406381756067276 LR -0.43524011969566345 LKL 0.02885836735367775
epoch 21813 loss -0.396538645029068 LR -0.42538946866989136 LKL 

54
epoch 21901 loss -0.37971681356430054 LR -0.4086042642593384 LKL 0.028887443244457245


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 21902 loss -0.3389435410499573 LR -0.367777556180954 LKL 0.028834015130996704
epoch 21903 loss -0.3815109431743622 LR -0.4102928638458252 LKL 0.028781913220882416
epoch 21904 loss -0.3125973343849182 LR -0.34141814708709717 LKL 0.0288208220154047
epoch 21905 loss -0.32030683755874634 LR -0.3488987684249878 LKL 0.0285919439047575
epoch 21906 loss -0.3719775974750519 LR -0.4008569121360779 LKL 0.028879309073090553
epoch 21907 loss -0.34853145480155945 LR -0.3774470090866089 LKL 0.028915544971823692
epoch 21908 loss -0.27137812972068787 LR -0.29978838562965393 LKL 0.028410250321030617
epoch 21909 loss -0.3754901587963104 LR -0.40428563952445984 LKL 0.028795484453439713
epoch 21910 loss -0.33493906259536743 LR -0.36362797021865845 LKL 0.028688915073871613
epoch 21911 loss -0.3132147789001465 LR -0.3420151174068451 LKL 0.028800329193472862
epoch 21912 loss -0.3137362599372864 LR -0.3424101173877716 LKL 0.028673844411969185
epoch 21913 loss -0.3662828505039215 LR -0.39512893557548523 L

57
epoch 22001 loss -0.3705063760280609 LR -0.3993951678276062 LKL 0.02888878434896469


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 22002 loss -0.27831384539604187 LR -0.306962251663208 LKL 0.028648393228650093
epoch 22003 loss -0.323386549949646 LR -0.35224294662475586 LKL 0.028856387361884117
epoch 22004 loss -0.22996839880943298 LR -0.2585468888282776 LKL 0.028578484430909157
epoch 22005 loss -0.3197439908981323 LR -0.34853872656822205 LKL 0.028794731944799423
epoch 22006 loss -0.3295920789241791 LR -0.3584895133972168 LKL 0.02889743633568287
epoch 22007 loss -0.33860790729522705 LR -0.3674596846103668 LKL 0.028851768001914024
epoch 22008 loss -0.26432839035987854 LR -0.29292547702789307 LKL 0.028597086668014526
epoch 22009 loss -0.3233259618282318 LR -0.3522431254386902 LKL 0.028917165473103523
epoch 22010 loss -0.33240947127342224 LR -0.3610636293888092 LKL 0.02865416184067726
epoch 22011 loss -0.2768974304199219 LR -0.30539563298225403 LKL 0.028498200699687004
epoch 22012 loss -0.35761958360671997 LR -0.3864375352859497 LKL 0.028817949816584587
epoch 22013 loss -0.3074161410331726 LR -0.3361018896102905

epoch 22098 loss -0.3161097466945648 LR -0.3446965217590332 LKL 0.02858678065240383
epoch 22099 loss -0.28755325078964233 LR -0.3162934482097626 LKL 0.02874019555747509
epoch 22100 loss -0.3387071490287781 LR -0.36757969856262207 LKL 0.028872564435005188
44
epoch 22101 loss -0.3339519500732422 LR -0.36265382170677185 LKL 0.028701864182949066


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 22102 loss -0.3118007779121399 LR -0.3406359553337097 LKL 0.02883518673479557
epoch 22103 loss -0.35983943939208984 LR -0.3886585533618927 LKL 0.028819099068641663
epoch 22104 loss -0.3594933748245239 LR -0.3882344961166382 LKL 0.028741125017404556
epoch 22105 loss -0.3082738220691681 LR -0.33704906702041626 LKL 0.028775235638022423
epoch 22106 loss -0.32775184512138367 LR -0.35635316371917725 LKL 0.028601326048374176
epoch 22107 loss -0.3093906044960022 LR -0.33809220790863037 LKL 0.028701607137918472
epoch 22108 loss -0.27359122037887573 LR -0.3023136854171753 LKL 0.02872246690094471
epoch 22109 loss -0.34043607115745544 LR -0.3692692816257477 LKL 0.02883322350680828
epoch 22110 loss -0.2976180613040924 LR -0.32625675201416016 LKL 0.02863868698477745
epoch 22111 loss -0.3306295871734619 LR -0.35948270559310913 LKL 0.028853124007582664
epoch 22112 loss -0.3130430281162262 LR -0.34185031056404114 LKL 0.028807293623685837
epoch 22113 loss -0.34657472372055054 LR -0.375327527523040

epoch 22198 loss -0.2784426212310791 LR -0.30719661712646484 LKL 0.02875400520861149
epoch 22199 loss -0.35416123270988464 LR -0.3829328715801239 LKL 0.028771651908755302
epoch 22200 loss -0.42364513874053955 LR -0.4523835778236389 LKL 0.028738446533679962
79
epoch 22201 loss -0.4013330638408661 LR -0.4301513433456421 LKL 0.028818268328905106


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 22202 loss -0.3221485912799835 LR -0.3509728014469147 LKL 0.028824200853705406
epoch 22203 loss -0.33459579944610596 LR -0.36353668570518494 LKL 0.028940871357917786
epoch 22204 loss -0.304708868265152 LR -0.3335094451904297 LKL 0.028800569474697113
epoch 22205 loss -0.31810513138771057 LR -0.34688976407051086 LKL 0.028784632682800293
epoch 22206 loss -0.2105487883090973 LR -0.23923182487487793 LKL 0.028683029115200043
epoch 22207 loss -0.36650916934013367 LR -0.39509129524230957 LKL 0.0285821333527565
epoch 22208 loss -0.34841611981391907 LR -0.3771654963493347 LKL 0.0287493746727705
epoch 22209 loss -0.394942045211792 LR -0.4238305687904358 LKL 0.028888508677482605
epoch 22210 loss -0.3454875648021698 LR -0.3744314908981323 LKL 0.02894393727183342
epoch 22211 loss -0.37745219469070435 LR -0.40634843707084656 LKL 0.028896242380142212
epoch 22212 loss -0.35209041833877563 LR -0.381002813577652 LKL 0.028912408277392387
epoch 22213 loss -0.3307669758796692 LR -0.359666109085083 LKL

epoch 22299 loss -0.2481241524219513 LR -0.2768424451351166 LKL 0.028718292713165283
epoch 22300 loss -0.3003888428211212 LR -0.32941630482673645 LKL 0.029027454555034637
48
epoch 22301 loss -0.25062093138694763 LR -0.27928584814071655 LKL 0.028664924204349518


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 22302 loss -0.26748377084732056 LR -0.29612287878990173 LKL 0.028639107942581177
epoch 22303 loss -0.3092372417449951 LR -0.3379438519477844 LKL 0.02870660461485386
epoch 22304 loss -0.31086334586143494 LR -0.33948469161987305 LKL 0.028621336445212364
epoch 22305 loss -0.3159419298171997 LR -0.34474265575408936 LKL 0.028800714761018753
epoch 22306 loss -0.33380693197250366 LR -0.3627229928970337 LKL 0.028916046023368835
epoch 22307 loss -0.35600900650024414 LR -0.3850032091140747 LKL 0.028994211927056313
epoch 22308 loss -0.37838345766067505 LR -0.4073370099067688 LKL 0.028953567147254944
epoch 22309 loss -0.30571454763412476 LR -0.334460973739624 LKL 0.028746431693434715
epoch 22310 loss -0.20692016184329987 LR -0.23562970757484436 LKL 0.028709543868899345
epoch 22311 loss -0.35355138778686523 LR -0.38241344690322876 LKL 0.02886207215487957
epoch 22312 loss -0.35535305738449097 LR -0.3840526044368744 LKL 0.02869953215122223
epoch 22313 loss -0.353625625371933 LR -0.3822614550590

epoch 22398 loss -0.2703118920326233 LR -0.2991999089717865 LKL 0.028888022527098656
epoch 22399 loss -0.33631420135498047 LR -0.3652050495147705 LKL 0.028890836983919144
epoch 22400 loss -0.34865790605545044 LR -0.377556174993515 LKL 0.02889825776219368
114
epoch 22401 loss -0.26149269938468933 LR -0.2903284430503845 LKL 0.02883574366569519


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 22402 loss -0.3493001163005829 LR -0.37821948528289795 LKL 0.028919367119669914
epoch 22403 loss -0.3010113537311554 LR -0.32967421412467957 LKL 0.028662871569395065
epoch 22404 loss -0.34672585129737854 LR -0.3755800724029541 LKL 0.028854215517640114
epoch 22405 loss -0.42952385544776917 LR -0.45859014987945557 LKL 0.029066288843750954
epoch 22406 loss -0.28703707456588745 LR -0.31567031145095825 LKL 0.028633248060941696
epoch 22407 loss -0.2919183671474457 LR -0.3207106590270996 LKL 0.028792288154363632
epoch 22408 loss -0.41002780199050903 LR -0.43903958797454834 LKL 0.029011771082878113
epoch 22409 loss -0.331251859664917 LR -0.35998600721359253 LKL 0.028734132647514343
epoch 22410 loss -0.2905174493789673 LR -0.3193463087081909 LKL 0.02882884442806244
epoch 22411 loss -0.3390924334526062 LR -0.3679145574569702 LKL 0.02882211096584797
epoch 22412 loss -0.29754766821861267 LR -0.3263949751853943 LKL 0.028847310692071915
epoch 22413 loss -0.3439072370529175 LR -0.37270760536193

epoch 22498 loss -0.33991628885269165 LR -0.3689064383506775 LKL 0.02899014949798584
epoch 22499 loss -0.3122652769088745 LR -0.34109753370285034 LKL 0.028832266107201576
epoch 22500 loss -0.3057559132575989 LR -0.33460357785224915 LKL 0.028847649693489075
42
epoch 22501 loss -0.2949651777744293 LR -0.3237764835357666 LKL 0.02881130576133728


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 22502 loss -0.33120644092559814 LR -0.35998666286468506 LKL 0.028780218213796616
epoch 22503 loss -0.34098953008651733 LR -0.36995866894721985 LKL 0.02896912954747677
epoch 22504 loss -0.4046229422092438 LR -0.4335821866989136 LKL 0.028959250077605247
epoch 22505 loss -0.29907211661338806 LR -0.32773926854133606 LKL 0.028667140752077103
epoch 22506 loss -0.3160783648490906 LR -0.3449249267578125 LKL 0.028846576809883118
epoch 22507 loss -0.35894039273262024 LR -0.3877863585948944 LKL 0.02884596958756447
epoch 22508 loss -0.31576016545295715 LR -0.34445327520370483 LKL 0.028693119063973427
epoch 22509 loss -0.2619185149669647 LR -0.29058727622032166 LKL 0.028668761253356934
epoch 22510 loss -0.3324306905269623 LR -0.36117544770240784 LKL 0.028744744136929512
epoch 22511 loss -0.3214074969291687 LR -0.3504738211631775 LKL 0.029066337272524834
epoch 22512 loss -0.37772369384765625 LR -0.4066835939884186 LKL 0.02895989641547203
epoch 22513 loss -0.3164061903953552 LR -0.3452657163143

epoch 22598 loss -0.2880542278289795 LR -0.3168158233165741 LKL 0.02876160852611065
epoch 22599 loss -0.305642694234848 LR -0.33455783128738403 LKL 0.028915148228406906
epoch 22600 loss -0.3886089026927948 LR -0.4174945056438446 LKL 0.028885595500469208
56
epoch 22601 loss -0.32340800762176514 LR -0.3521484136581421 LKL 0.028740398585796356


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 22602 loss -0.24729715287685394 LR -0.27598837018013 LKL 0.028691211715340614
epoch 22603 loss -0.27542561292648315 LR -0.30413907766342163 LKL 0.02871345728635788
epoch 22604 loss -0.3296973705291748 LR -0.3584640622138977 LKL 0.028766697272658348
epoch 22605 loss -0.36468440294265747 LR -0.3936466872692108 LKL 0.02896229177713394
epoch 22606 loss -0.32315438985824585 LR -0.351726233959198 LKL 0.028571859002113342
epoch 22607 loss -0.34930381178855896 LR -0.3782579004764557 LKL 0.02895408682525158
epoch 22608 loss -0.3388914465904236 LR -0.36769652366638184 LKL 0.028805075213313103
epoch 22609 loss -0.26617857813835144 LR -0.2949243187904358 LKL 0.028745729476213455
epoch 22610 loss -0.25671353936195374 LR -0.28545939922332764 LKL 0.028745869174599648
epoch 22611 loss -0.3726329207420349 LR -0.4016132354736328 LKL 0.02898031286895275
epoch 22612 loss -0.2899009585380554 LR -0.3185247480869293 LKL 0.028623780235648155
epoch 22613 loss -0.3078424632549286 LR -0.3365882337093353 LK

epoch 22700 loss -0.24488885700702667 LR -0.27347663044929504 LKL 0.02858777716755867
86
epoch 22701 loss -0.3059690296649933 LR -0.33458060026168823 LKL 0.028611570596694946


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 22702 loss -0.3478713631629944 LR -0.37665241956710815 LKL 0.028781048953533173
epoch 22703 loss -0.3261848986148834 LR -0.35508793592453003 LKL 0.028903044760227203
epoch 22704 loss -0.278626948595047 LR -0.3072735667228699 LKL 0.028646616265177727
epoch 22705 loss -0.38054147362709045 LR -0.4094589948654175 LKL 0.028917530551552773
epoch 22706 loss -0.32897427678108215 LR -0.3578612804412842 LKL 0.028887011110782623
epoch 22707 loss -0.38903066515922546 LR -0.4178503155708313 LKL 0.028819642961025238
epoch 22708 loss -0.3547111451625824 LR -0.3835848271846771 LKL 0.02887367457151413
epoch 22709 loss -0.32177066802978516 LR -0.3505968451499939 LKL 0.028826192021369934
epoch 22710 loss -0.37746691703796387 LR -0.4063405990600586 LKL 0.02887367270886898
epoch 22711 loss -0.29509133100509644 LR -0.3240131735801697 LKL 0.028921831399202347
epoch 22712 loss -0.2870109975337982 LR -0.3158299922943115 LKL 0.028818994760513306
epoch 22713 loss -0.3622879683971405 LR -0.3911155164241791 

74
epoch 22801 loss -0.34975913166999817 LR -0.37864112854003906 LKL 0.02888200618326664


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 22802 loss -0.29263147711753845 LR -0.3214820325374603 LKL 0.028850562870502472
epoch 22803 loss -0.34196147322654724 LR -0.37071821093559265 LKL 0.028756730258464813
epoch 22804 loss -0.32724303007125854 LR -0.35602277517318726 LKL 0.028779741376638412
epoch 22805 loss -0.27438682317733765 LR -0.30326318740844727 LKL 0.02887636423110962
epoch 22806 loss -0.31355294585227966 LR -0.3424355983734131 LKL 0.02888265624642372
epoch 22807 loss -0.33830615878105164 LR -0.3669741749763489 LKL 0.028668029233813286
epoch 22808 loss -0.329294890165329 LR -0.3582305312156677 LKL 0.028935642912983894
epoch 22809 loss -0.3621414005756378 LR -0.39099419116973877 LKL 0.02885279431939125
epoch 22810 loss -0.32308006286621094 LR -0.351955384016037 LKL 0.028875313699245453
epoch 22811 loss -0.34594571590423584 LR -0.37508147954940796 LKL 0.029135774821043015
epoch 22812 loss -0.31033244729042053 LR -0.339128702878952 LKL 0.02879626862704754
epoch 22813 loss -0.3717857599258423 LR -0.400651425123214

epoch 22898 loss -0.3875153064727783 LR -0.41640543937683105 LKL 0.028890132904052734
epoch 22899 loss -0.3164825439453125 LR -0.3452659845352173 LKL 0.02878342568874359
epoch 22900 loss -0.3527623414993286 LR -0.38184791803359985 LKL 0.029085570946335793
41
epoch 22901 loss -0.3528527617454529 LR -0.38183021545410156 LKL 0.028977451846003532


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 22902 loss -0.29031771421432495 LR -0.3190779685974121 LKL 0.028760261833667755
epoch 22903 loss -0.3425804376602173 LR -0.37145161628723145 LKL 0.028871193528175354
epoch 22904 loss -0.3668436110019684 LR -0.3956117331981659 LKL 0.028768135234713554
epoch 22905 loss -0.32053518295288086 LR -0.34950536489486694 LKL 0.028970180079340935
epoch 22906 loss -0.3745703399181366 LR -0.4035191237926483 LKL 0.02894877828657627
epoch 22907 loss -0.393023282289505 LR -0.42206814885139465 LKL 0.029044857248663902
epoch 22908 loss -0.30583953857421875 LR -0.33445385098457336 LKL 0.028614316135644913
epoch 22909 loss -0.34120678901672363 LR -0.37007462978363037 LKL 0.02886783704161644
epoch 22910 loss -0.2762369215488434 LR -0.30488064885139465 LKL 0.02864372357726097
epoch 22911 loss -0.43682461977005005 LR -0.465935617685318 LKL 0.02911100909113884
epoch 22912 loss -0.32256853580474854 LR -0.3515116572380066 LKL 0.028943117707967758
epoch 22913 loss -0.3498987555503845 LR -0.3787392377853393

epoch 23000 loss -0.3710315227508545 LR -0.40004345774650574 LKL 0.029011942446231842
100
epoch 23001 loss -0.3283698260784149 LR -0.35744860768318176 LKL 0.02907879278063774


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 23002 loss -0.33909857273101807 LR -0.36823558807373047 LKL 0.02913702465593815
epoch 23003 loss -0.3531583845615387 LR -0.3820776343345642 LKL 0.028919249773025513
epoch 23004 loss -0.39348965883255005 LR -0.42252591252326965 LKL 0.02903626300394535
epoch 23005 loss -0.32781368494033813 LR -0.35677435994148254 LKL 0.028960682451725006
epoch 23006 loss -0.38863903284072876 LR -0.4176563322544098 LKL 0.02901730313897133
epoch 23007 loss -0.3732999563217163 LR -0.402268648147583 LKL 0.028968706727027893
epoch 23008 loss -0.30861982703208923 LR -0.33734309673309326 LKL 0.028723271563649178
epoch 23009 loss -0.3389892876148224 LR -0.36807411909103394 LKL 0.029084835201501846
epoch 23010 loss -0.3396192491054535 LR -0.3685309886932373 LKL 0.028911733999848366
epoch 23011 loss -0.3058355450630188 LR -0.33470502495765686 LKL 0.02886948734521866
epoch 23012 loss -0.266259104013443 LR -0.29518744349479675 LKL 0.028928352519869804
epoch 23013 loss -0.40298402309417725 LR -0.432140350341796

epoch 23100 loss -0.3480622470378876 LR -0.3771384358406067 LKL 0.02907618321478367
50


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 23101 loss -0.3553343415260315 LR -0.3842984139919281 LKL 0.028964059427380562
epoch 23102 loss -0.25778675079345703 LR -0.28664106130599976 LKL 0.028854306787252426
epoch 23103 loss -0.4366450607776642 LR -0.4657645523548126 LKL 0.029119499027729034
epoch 23104 loss -0.3634002208709717 LR -0.39242881536483765 LKL 0.02902858331799507
epoch 23105 loss -0.3491882085800171 LR -0.3783140778541565 LKL 0.029125861823558807
epoch 23106 loss -0.3138834834098816 LR -0.3427692949771881 LKL 0.02888580411672592
epoch 23107 loss -0.34291601181030273 LR -0.37196844816207886 LKL 0.029052425175905228
epoch 23108 loss -0.22936135530471802 LR -0.25824910402297974 LKL 0.028887754306197166
epoch 23109 loss -0.3190053701400757 LR -0.3481298089027405 LKL 0.029124446213245392
epoch 23110 loss -0.27832889556884766 LR -0.30728650093078613 LKL 0.028957614675164223
epoch 23111 loss -0.3036056458950043 LR -0.33280906081199646 LKL 0.02920341119170189
epoch 23112 loss -0.24527086317539215 LR -0.27425992488861

epoch 23198 loss -0.27053767442703247 LR -0.29959219694137573 LKL 0.02905452623963356
epoch 23199 loss -0.33606722950935364 LR -0.36518797278404236 LKL 0.029120754450559616
epoch 23200 loss -0.319186806678772 LR -0.34810763597488403 LKL 0.028920838609337807
59


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 23201 loss -0.36900046467781067 LR -0.39813232421875 LKL 0.029131855815649033
epoch 23202 loss -0.24506011605262756 LR -0.27417123317718506 LKL 0.029111124575138092
epoch 23203 loss -0.3282460570335388 LR -0.35737359523773193 LKL 0.029127534478902817
epoch 23204 loss -0.2436002492904663 LR -0.27256783843040466 LKL 0.028967594727873802
epoch 23205 loss -0.3106655478477478 LR -0.3397250771522522 LKL 0.02905954234302044
epoch 23206 loss -0.4266228675842285 LR -0.4559170603752136 LKL 0.029294203966856003
epoch 23207 loss -0.34113624691963196 LR -0.37001949548721313 LKL 0.028883254155516624
epoch 23208 loss -0.35965245962142944 LR -0.3888170123100281 LKL 0.029164565727114677
epoch 23209 loss -0.27923280000686646 LR -0.3081979751586914 LKL 0.028965162113308907
epoch 23210 loss -0.3078290522098541 LR -0.33685824275016785 LKL 0.029029203578829765
epoch 23211 loss -0.25366324186325073 LR -0.282667338848114 LKL 0.02900410257279873
epoch 23212 loss -0.32956957817077637 LR -0.358660697937011

epoch 23298 loss -0.34280064702033997 LR -0.3720288872718811 LKL 0.029228243976831436
epoch 23299 loss -0.38733184337615967 LR -0.4165528416633606 LKL 0.029221011325716972
epoch 23300 loss -0.3371775150299072 LR -0.36629343032836914 LKL 0.029115917161107063
70
epoch 23301 loss -0.38348066806793213 LR -0.4127819240093231 LKL 0.029301244765520096
epoch

  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


 23302 loss -0.34559717774391174 LR -0.3748258948326111 LKL 0.02922871522605419
epoch 23303 loss -0.3444899916648865 LR -0.3737931251525879 LKL 0.02930312789976597
epoch 23304 loss -0.3863656520843506 LR -0.4156040549278259 LKL 0.029238412156701088
epoch 23305 loss -0.30963364243507385 LR -0.3386456370353699 LKL 0.029011990875005722
epoch 23306 loss -0.33323758840560913 LR -0.3623580038547516 LKL 0.02912040986120701
epoch 23307 loss -0.3694992661476135 LR -0.3988298177719116 LKL 0.029330546036362648
epoch 23308 loss -0.2854160666465759 LR -0.31444698572158813 LKL 0.02903091348707676
epoch 23309 loss -0.3267596662044525 LR -0.3557686507701874 LKL 0.029008973389863968
epoch 23310 loss -0.3814391493797302 LR -0.4105873703956604 LKL 0.02914823405444622
epoch 23311 loss -0.39880719780921936 LR -0.4279748797416687 LKL 0.029167672619223595
epoch 23312 loss -0.25688570737838745 LR -0.2859010398387909 LKL 0.02901533804833889
epoch 23313 loss -0.3237271010875702 LR -0.3528648614883423 LKL 0.0291

128
epoch 23401 loss -0.3414663076400757 LR -0.3707430362701416 LKL 0.02927672304213047


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 23402 loss -0.36816373467445374 LR -0.3974541425704956 LKL 0.02929041162133217
epoch 23403 loss -0.32571274042129517 LR -0.3548893332481384 LKL 0.029176583513617516
epoch 23404 loss -0.3473820686340332 LR -0.37649253010749817 LKL 0.029110463336110115
epoch 23405 loss -0.29267618060112 LR -0.3218868374824524 LKL 0.029210664331912994
epoch 23406 loss -0.3988264799118042 LR -0.42813512682914734 LKL 0.029308650642633438
epoch 23407 loss -0.34808823466300964 LR -0.37718772888183594 LKL 0.02909948118031025
epoch 23408 loss -0.3838256001472473 LR -0.41310495138168335 LKL 0.02927936054766178
epoch 23409 loss -0.35015153884887695 LR -0.3794488310813904 LKL 0.029297303408384323
epoch 23410 loss -0.3274688720703125 LR -0.3566347360610962 LKL 0.02916586399078369
epoch 23411 loss -0.33783355355262756 LR -0.3670801520347595 LKL 0.029246589168906212
epoch 23412 loss -0.354987770318985 LR -0.38401925563812256 LKL 0.02903147228062153
epoch 23413 loss -0.36013075709342957 LR -0.38934746384620667 L

epoch 23498 loss -0.3496726453304291 LR -0.3789498209953308 LKL 0.02927718311548233
epoch 23499 loss -0.33798494935035706 LR -0.3671000301837921 LKL 0.02911507524549961
epoch 23500 loss -0.34557223320007324 LR -0.3748248815536499 LKL 0.029252657666802406
46
epoch 23501 loss -0.31996360421180725 LR -0.3490891456604004 LKL 0.029125547036528587


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 23502 loss -0.3819367587566376 LR -0.41116613149642944 LKL 0.02922937087714672
epoch 23503 loss -0.36817097663879395 LR -0.39746928215026855 LKL 0.029298318549990654
epoch 23504 loss -0.3101363778114319 LR -0.3393588066101074 LKL 0.029222438111901283
epoch 23505 loss -0.45448631048202515 LR -0.48409074544906616 LKL 0.029604429379105568
epoch 23506 loss -0.3779461085796356 LR -0.4074046313762665 LKL 0.029458533972501755
epoch 23507 loss -0.355430543422699 LR -0.3845922350883484 LKL 0.02916170097887516
epoch 23508 loss -0.3235682249069214 LR -0.35291093587875366 LKL 0.02934270352125168
epoch 23509 loss -0.3333939015865326 LR -0.3625454306602478 LKL 0.029151516035199165
epoch 23510 loss -0.34821614623069763 LR -0.37752652168273926 LKL 0.029310373589396477
epoch 23511 loss -0.33624282479286194 LR -0.3656390905380249 LKL 0.02939627692103386
epoch 23512 loss -0.30543065071105957 LR -0.334648460149765 LKL 0.02921782061457634
epoch 23513 loss -0.3653925657272339 LR -0.39468318223953247 L

66
epoch 23601 loss -0.29764458537101746 LR -0.32679516077041626 LKL 0.029150571674108505


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 23602 loss -0.32060039043426514 LR -0.34974050521850586 LKL 0.02914012409746647
epoch 23603 loss -0.36636433005332947 LR -0.3955552875995636 LKL 0.029190970584750175
epoch 23604 loss -0.3733249008655548 LR -0.40226131677627563 LKL 0.028936423361301422
epoch 23605 loss -0.3510734438896179 LR -0.3803229331970215 LKL 0.029249481856822968
epoch 23606 loss -0.3465242385864258 LR -0.375842422246933 LKL 0.029318183660507202
epoch 23607 loss -0.3712227940559387 LR -0.40057313442230225 LKL 0.02935035154223442
epoch 23608 loss -0.35094356536865234 LR -0.3802716135978699 LKL 0.02932805009186268
epoch 23609 loss -0.26281091570854187 LR -0.2919589579105377 LKL 0.029148036614060402
epoch 23610 loss -0.360552579164505 LR -0.3900459408760071 LKL 0.02949335426092148
epoch 23611 loss -0.31971773505210876 LR -0.3489686846733093 LKL 0.029250940307974815
epoch 23612 loss -0.3705636262893677 LR -0.3997192978858948 LKL 0.029155679047107697
epoch 23613 loss -0.34106114506721497 LR -0.3703407347202301 LK

52
epoch 23701 loss -0.24352788925170898 LR -0.2725578844547272 LKL 0.02902998775243759


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 23702 loss -0.3312189280986786 LR -0.3605148494243622 LKL 0.0292959101498127
epoch 23703 loss -0.3252943754196167 LR -0.35455837845802307 LKL 0.029264016076922417
epoch 23704 loss -0.32352855801582336 LR -0.35277822613716125 LKL 0.02924966812133789
epoch 23705 loss -0.3055722415447235 LR -0.33469003438949585 LKL 0.02911779284477234
epoch 23706 loss -0.40482455492019653 LR -0.4342036247253418 LKL 0.029379073530435562
epoch 23707 loss -0.30198976397514343 LR -0.3311535120010376 LKL 0.029163740575313568
epoch 23708 loss -0.29910650849342346 LR -0.3283165395259857 LKL 0.0292100440710783
epoch 23709 loss -0.3050299882888794 LR -0.33422255516052246 LKL 0.029192574322223663
epoch 23710 loss -0.31893929839134216 LR -0.34839630126953125 LKL 0.029457012191414833
epoch 23711 loss -0.3255697786808014 LR -0.35464319586753845 LKL 0.02907342091202736
epoch 23712 loss -0.37518754601478577 LR -0.4046385586261749 LKL 0.02945101261138916
epoch 23713 loss -0.324094295501709 LR -0.3534952402114868 LK

epoch 23798 loss -0.2886298596858978 LR -0.3179943561553955 LKL 0.02936450205743313
epoch 23799 loss -0.41101452708244324 LR -0.44055891036987305 LKL 0.02954438328742981
epoch 23800 loss -0.31825149059295654 LR -0.3474675416946411 LKL 0.029216062277555466
82
epoch 23801 loss -0.36371687054634094 LR -0.3930054306983948 LKL 0.029288550838828087


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 23802 loss -0.35230255126953125 LR -0.3816627860069275 LKL 0.029360231012105942
epoch 23803 loss -0.3533308804035187 LR -0.38271117210388184 LKL 0.029380297288298607
epoch 23804 loss -0.3405922055244446 LR -0.3698521852493286 LKL 0.02925998345017433
epoch 23805 loss -0.3661155700683594 LR -0.3955250680446625 LKL 0.029409483075141907
epoch 23806 loss -0.37603694200515747 LR -0.4055060148239136 LKL 0.02946907840669155
epoch 23807 loss -0.31691136956214905 LR -0.34628844261169434 LKL 0.02937706746160984
epoch 23808 loss -0.3762529194355011 LR -0.4055895209312439 LKL 0.029336590319871902
epoch 23809 loss -0.3317544460296631 LR -0.361112505197525 LKL 0.029358074069023132
epoch 23810 loss -0.3428887128829956 LR -0.37234431505203247 LKL 0.02945558913052082
epoch 23811 loss -0.3430367410182953 LR -0.3724335730075836 LKL 0.029396820813417435
epoch 23812 loss -0.30225974321365356 LR -0.33145248889923096 LKL 0.02919274941086769
epoch 23813 loss -0.3965694010257721 LR -0.42592760920524597 LK

epoch 23898 loss -0.29714611172676086 LR -0.3265380561351776 LKL 0.029391948133707047
epoch 23899 loss -0.3589244484901428 LR -0.3883665204048157 LKL 0.029442062601447105
epoch 23900 loss -0.3358801603317261 LR -0.3652364909648895 LKL 0.029356317594647408
62
epoch 23901 loss -0.3806932270526886 LR -0.41017135977745056 LKL 0.02947814017534256


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 23902 loss -0.3127615749835968 LR -0.34216463565826416 LKL 0.029403064399957657
epoch 23903 loss -0.33686015009880066 LR -0.3663766384124756 LKL 0.029516488313674927
epoch 23904 loss -0.35007673501968384 LR -0.3794971704483032 LKL 0.02942044287919998
epoch 23905 loss -0.2914903461933136 LR -0.3205859661102295 LKL 0.02909562550485134
epoch 23906 loss -0.33897387981414795 LR -0.36824744939804077 LKL 0.029273556545376778
epoch 23907 loss -0.2971267104148865 LR -0.32662567496299744 LKL 0.029498953372240067
epoch 23908 loss -0.34240230917930603 LR -0.37179097533226013 LKL 0.029388656839728355
epoch 23909 loss -0.3213719427585602 LR -0.3507649898529053 LKL 0.029393058270215988
epoch 23910 loss -0.3115704357624054 LR -0.3407074809074402 LKL 0.029137035831809044
epoch 23911 loss -0.3874482214450836 LR -0.4167487919330597 LKL 0.02930055931210518
epoch 23912 loss -0.3483472168445587 LR -0.3777512311935425 LKL 0.029404019936919212
epoch 23913 loss -0.3500868082046509 LR -0.3794271945953369 

49
epoch 24001 loss -0.39677858352661133 LR -0.42629677057266235 LKL 0.02951817214488983


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 24002 loss -0.3623609244823456 LR -0.39171063899993896 LKL 0.029349714517593384
epoch 24003 loss -0.3770567774772644 LR -0.40640780329704285 LKL 0.02935102954506874
epoch 24004 loss -0.3689166307449341 LR -0.3983306884765625 LKL 0.029414044693112373
epoch 24005 loss -0.3611697554588318 LR -0.39072293043136597 LKL 0.029553169384598732
epoch 24006 loss -0.3964717388153076 LR -0.42607545852661133 LKL 0.029603730887174606
epoch 24007 loss -0.34513533115386963 LR -0.37465900182724 LKL 0.029523679986596107
epoch 24008 loss -0.36640626192092896 LR -0.3957750201225281 LKL 0.029368773102760315
epoch 24009 loss -0.33705413341522217 LR -0.3664082884788513 LKL 0.029354140162467957
epoch 24010 loss -0.37540122866630554 LR -0.4047755002975464 LKL 0.02937428094446659
epoch 24011 loss -0.36639639735221863 LR -0.395717054605484 LKL 0.029320670291781425
epoch 24012 loss -0.3428381085395813 LR -0.3719640076160431 LKL 0.02912590093910694
epoch 24013 loss -0.37848830223083496 LR -0.40778350830078125 

epoch 24099 loss -0.2543187737464905 LR -0.2836785316467285 LKL 0.029359765350818634
epoch 24100 loss -0.37478774785995483 LR -0.40432488918304443 LKL 0.029537132009863853
70
epoch 24101 loss -0.34382984042167664 LR -0.3731904923915863 LKL 0.029360659420490265


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 24102 loss -0.3449980318546295 LR -0.374430388212204 LKL 0.02943234331905842
epoch 24103 loss -0.2976917624473572 LR -0.32692205905914307 LKL 0.029230311512947083
epoch 24104 loss -0.39379188418388367 LR -0.4233422577381134 LKL 0.029550382867455482
epoch 24105 loss -0.34092074632644653 LR -0.3702365756034851 LKL 0.02931581623852253
epoch 24106 loss -0.2443593591451645 LR -0.2736714482307434 LKL 0.029312090948224068
epoch 24107 loss -0.3615984320640564 LR -0.39091598987579346 LKL 0.029317567124962807
epoch 24108 loss -0.37620821595191956 LR -0.4058934450149536 LKL 0.029685242101550102
epoch 24109 loss -0.2994116544723511 LR -0.3288848400115967 LKL 0.02947317063808441
epoch 24110 loss -0.32488179206848145 LR -0.35451146960258484 LKL 0.029629675671458244
epoch 24111 loss -0.3441903293132782 LR -0.3733864426612854 LKL 0.029196107760071754
epoch 24112 loss -0.40382564067840576 LR -0.43333280086517334 LKL 0.02950715459883213
epoch 24113 loss -0.33794069290161133 LR -0.367377907037735 L

45
epoch 24201 loss -0.332876980304718 LR -0.3621748983860016 LKL 0.02929791435599327


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 24202 loss -0.3732103407382965 LR -0.4028305113315582 LKL 0.02962016500532627
epoch 24203 loss -0.2756637930870056 LR -0.3049636483192444 LKL 0.029299842193722725
epoch 24204 loss -0.3952604830265045 LR -0.42474600672721863 LKL 0.029485533013939857
epoch 24205 loss -0.38780316710472107 LR -0.4172689914703369 LKL 0.02946583181619644
epoch 24206 loss -0.2932785749435425 LR -0.32275456190109253 LKL 0.029475979506969452
epoch 24207 loss -0.376762330532074 LR -0.40642744302749634 LKL 0.029665116220712662
epoch 24208 loss -0.4250001311302185 LR -0.45454782247543335 LKL 0.029547693207859993
epoch 24209 loss -0.3450658619403839 LR -0.3746834695339203 LKL 0.029617615044116974
epoch 24210 loss -0.3254103362560272 LR -0.3548342287540436 LKL 0.02942388504743576
epoch 24211 loss -0.35690054297447205 LR -0.38627785444259644 LKL 0.02937730960547924
epoch 24212 loss -0.30573713779449463 LR -0.33498328924179077 LKL 0.029246164485812187
epoch 24213 loss -0.35958951711654663 LR -0.38904619216918945

epoch 24298 loss -0.34828105568885803 LR -0.37790602445602417 LKL 0.029624972492456436
epoch 24299 loss -0.3678481876850128 LR -0.3973773419857025 LKL 0.0295291505753994
epoch 24300 loss -0.34345391392707825 LR -0.37316054105758667 LKL 0.029706617817282677
80
epoch 24301 loss -0.386921226978302 LR -0.41649216413497925 LKL 0.02957092598080635


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 24302 loss -0.32150983810424805 LR -0.350972056388855 LKL 0.029462220147252083
epoch 24303 loss -0.3456217050552368 LR -0.37495672702789307 LKL 0.029335027560591698
epoch 24304 loss -0.3461994528770447 LR -0.37568801641464233 LKL 0.029488569125533104
epoch 24305 loss -0.316494882106781 LR -0.3457929790019989 LKL 0.02929811179637909
epoch 24306 loss -0.37837207317352295 LR -0.4080173969268799 LKL 0.02964532934129238
epoch 24307 loss -0.342576801776886 LR -0.37213435769081116 LKL 0.029557568952441216
epoch 24308 loss -0.34105661511421204 LR -0.3705887496471405 LKL 0.02953212708234787
epoch 24309 loss -0.34022271633148193 LR -0.3696979284286499 LKL 0.029475219547748566
epoch 24310 loss -0.32067131996154785 LR -0.3500072658061981 LKL 0.029335949569940567
epoch 24311 loss -0.37892183661460876 LR -0.4085545539855957 LKL 0.029632721096277237
epoch 24312 loss -0.4090014100074768 LR -0.43846848607063293 LKL 0.029467064887285233
epoch 24313 loss -0.34905558824539185 LR -0.3783608078956604 

75
epoch 24401 loss -0.3824177384376526 LR -0.41206154227256775 LKL 0.029643816873431206


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 24402 loss -0.4323975741863251 LR -0.462147057056427 LKL 0.029749492183327675
epoch 24403 loss -0.26129716634750366 LR -0.29057422280311584 LKL 0.02927706390619278
epoch 24404 loss -0.3045254945755005 LR -0.33384716510772705 LKL 0.029321683570742607
epoch 24405 loss -0.3147028386592865 LR -0.34402742981910706 LKL 0.029324578121304512
epoch 24406 loss -0.3718283772468567 LR -0.4013287127017975 LKL 0.02950032241642475
epoch 24407 loss -0.4056899845600128 LR -0.43531331419944763 LKL 0.029623329639434814
epoch 24408 loss -0.3045403063297272 LR -0.33395254611968994 LKL 0.02941223606467247
epoch 24409 loss -0.3455304503440857 LR -0.37518569827079773 LKL 0.02965523675084114
epoch 24410 loss -0.3473673462867737 LR -0.37670475244522095 LKL 0.029337404295802116
epoch 24411 loss -0.30741146206855774 LR -0.3369182050228119 LKL 0.029506750404834747
epoch 24412 loss -0.3346974849700928 LR -0.3640511929988861 LKL 0.029353708028793335
epoch 24413 loss -0.3683759868144989 LR -0.39796558022499084 

epoch 24498 loss -0.35771897435188293 LR -0.3873240351676941 LKL 0.029605073854327202
epoch 24499 loss -0.4023427367210388 LR -0.4320608675479889 LKL 0.029718128964304924
epoch 24500 loss -0.3606201708316803 LR -0.3899785578250885 LKL 0.029358381405472755
40
epoch 24501 loss -0.4268234670162201 LR -0.45665502548217773 LKL 0.02983156032860279


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 24502 loss -0.34794723987579346 LR -0.3775525689125061 LKL 0.029605314135551453
epoch 24503 loss -0.3847670555114746 LR -0.41427627205848694 LKL 0.02950921654701233
epoch 24504 loss -0.3582151532173157 LR -0.3877220153808594 LKL 0.029506869614124298
epoch 24505 loss -0.41314953565597534 LR -0.4428531527519226 LKL 0.02970360405743122
epoch 24506 loss -0.36456769704818726 LR -0.39410796761512756 LKL 0.02954026684165001
epoch 24507 loss -0.31250008940696716 LR -0.34204328060150146 LKL 0.02954319305717945
epoch 24508 loss -0.3375697433948517 LR -0.3672146201133728 LKL 0.029644887894392014
epoch 24509 loss -0.2848977744579315 LR -0.3141859173774719 LKL 0.02928813360631466
epoch 24510 loss -0.38009265065193176 LR -0.40963125228881836 LKL 0.029538612812757492
epoch 24511 loss -0.3435870110988617 LR -0.37322768568992615 LKL 0.02964068204164505
epoch 24512 loss -0.4084021747112274 LR -0.43802669644355774 LKL 0.02962452545762062
epoch 24513 loss -0.26847970485687256 LR -0.2979353666305542 

epoch 24600 loss -0.3440956771373749 LR -0.37370091676712036 LKL 0.02960525080561638
85


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 24601 loss -0.29052722454071045 LR -0.3200882375240326 LKL 0.02956101857125759
epoch 24602 loss -0.35712385177612305 LR -0.386729896068573 LKL 0.029606042429804802
epoch 24603 loss -0.35819536447525024 LR -0.38772857189178467 LKL 0.02953322045505047
epoch 24604 loss -0.3966033458709717 LR -0.4262962341308594 LKL 0.0296928733587265
epoch 24605 loss -0.365383118391037 LR -0.39503979682922363 LKL 0.029656672850251198
epoch 24606 loss -0.3180367350578308 LR -0.3474992513656616 LKL 0.029462501406669617
epoch 24607 loss -0.274565726518631 LR -0.3041074275970459 LKL 0.029541688039898872
epoch 24608 loss -0.32140249013900757 LR -0.3508470058441162 LKL 0.029444517567753792
epoch 24609 loss -0.2892441153526306 LR -0.318766713142395 LKL 0.02952260710299015
epoch 24610 loss -0.252081960439682 LR -0.2815738916397095 LKL 0.029491933062672615
epoch 24611 loss -0.36716893315315247 LR -0.3966447412967682 LKL 0.029475808143615723
epoch 24612 loss -0.359453409910202 LR -0.3888918161392212 LKL 0.029

epoch 24699 loss -0.3066808879375458 LR -0.33622246980667114 LKL 0.029541583731770515
epoch 24700 loss -0.34780827164649963 LR -0.37752753496170044 LKL 0.029719272628426552
65
epoch 24701 loss -0.3749963939189911 LR -0.40480029582977295 LKL 0.029803907498717308


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 24702 loss -0.35828250646591187 LR -0.3879300355911255 LKL 0.02964751422405243
epoch 24703 loss -0.28885889053344727 LR -0.3185092806816101 LKL 0.029650399461388588
epoch 24704 loss -0.3647953271865845 LR -0.3945782780647278 LKL 0.029782962054014206
epoch 24705 loss -0.3596647083759308 LR -0.3893160820007324 LKL 0.02965136058628559
epoch 24706 loss -0.3954330086708069 LR -0.4250549077987671 LKL 0.029621906578540802
epoch 24707 loss -0.3299722969532013 LR -0.3594828248023987 LKL 0.02951052412390709
epoch 24708 loss -0.3300114572048187 LR -0.35959237813949585 LKL 0.029580920934677124
epoch 24709 loss -0.3877609968185425 LR -0.4174725413322449 LKL 0.029711537063121796
epoch 24710 loss -0.3152216076850891 LR -0.34464704990386963 LKL 0.029425444081425667
epoch 24711 loss -0.30520978569984436 LR -0.3349822461605072 LKL 0.029772460460662842
epoch 24712 loss -0.3901097774505615 LR -0.4198833405971527 LKL 0.029773548245429993
epoch 24713 loss -0.3434564173221588 LR -0.37309443950653076 LK

epoch 24798 loss -0.3095342218875885 LR -0.3389379382133484 LKL 0.029403716325759888
epoch 24799 loss -0.34935280680656433 LR -0.3790808618068695 LKL 0.029728058725595474
epoch 24800 loss -0.38635557889938354 LR -0.4161348342895508 LKL 0.029779264703392982
74
epoch 24801 loss -0.25321164727211 LR -0.2825686037540436 LKL 0.0293569453060627


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 24802 loss -0.34712252020835876 LR -0.37686431407928467 LKL 0.029741782695055008
epoch 24803 loss -0.32032251358032227 LR -0.349917471408844 LKL 0.029594961553812027
epoch 24804 loss -0.3223460912704468 LR -0.35184013843536377 LKL 0.029494060203433037
epoch 24805 loss -0.3224405348300934 LR -0.3520579934120178 LKL 0.02961745299398899
epoch 24806 loss -0.3860480487346649 LR -0.4155460000038147 LKL 0.029497964307665825
epoch 24807 loss -0.3389577269554138 LR -0.36844953894615173 LKL 0.02949180081486702
epoch 24808 loss -0.396892786026001 LR -0.4266766607761383 LKL 0.029783865436911583
epoch 24809 loss -0.36812570691108704 LR -0.39771509170532227 LKL 0.02958938106894493
epoch 24810 loss -0.2899147570133209 LR -0.31916970014572144 LKL 0.029254941269755363
epoch 24811 loss -0.33563894033432007 LR -0.36530232429504395 LKL 0.02966337651014328
epoch 24812 loss -0.33145028352737427 LR -0.36093980073928833 LKL 0.02948950231075287
epoch 24813 loss -0.3220389485359192 LR -0.3514576554298401 

epoch 24900 loss -0.41257524490356445 LR -0.44231683015823364 LKL 0.02974158152937889
103
epoch 24901 loss -0.3516959547996521 LR -0.381367951631546 LKL 0.02967199683189392


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 24902 loss -0.3553469479084015 LR -0.3850601613521576 LKL 0.02971320040524006
epoch 24903 loss -0.346981018781662 LR -0.37655261158943176 LKL 0.02957160212099552
epoch 24904 loss -0.36977100372314453 LR -0.3995162844657898 LKL 0.029745275154709816
epoch 24905 loss -0.339179128408432 LR -0.36879852414131165 LKL 0.029619403183460236
epoch 24906 loss -0.420870304107666 LR -0.4507366716861725 LKL 0.029866378754377365
epoch 24907 loss -0.3710535168647766 LR -0.40068602561950684 LKL 0.029632510617375374
epoch 24908 loss -0.4040467143058777 LR -0.4337928593158722 LKL 0.029746130108833313
epoch 24909 loss -0.3909069001674652 LR -0.4207913875579834 LKL 0.02988448552787304
epoch 24910 loss -0.302772581577301 LR -0.33230605721473694 LKL 0.02953348308801651
epoch 24911 loss -0.267265260219574 LR -0.29675477743148804 LKL 0.029489507898688316
epoch 24912 loss -0.32193443179130554 LR -0.3516066074371338 LKL 0.02967218868434429
epoch 24913 loss -0.39972901344299316 LR -0.4294719696044922 LKL 0.0

epoch 24998 loss -0.3593551814556122 LR -0.38910335302352905 LKL 0.02974816784262657
epoch 24999 loss -0.3176921308040619 LR -0.3471657633781433 LKL 0.029473645612597466
epoch 25000 loss -0.37640705704689026 LR -0.40621423721313477 LKL 0.029807183891534805
59
epoch 25001 loss -0.3258565068244934 LR -0.35548433661460876 LKL 0.029627831652760506


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 25002 loss -0.35283076763153076 LR -0.3822941482067108 LKL 0.02946336567401886
epoch 25003 loss -0.3146851062774658 LR -0.3443841338157654 LKL 0.029699023813009262
epoch 25004 loss -0.3227722644805908 LR -0.35239583253860474 LKL 0.02962355501949787
epoch 25005 loss -0.38171425461769104 LR -0.41140109300613403 LKL 0.029686838388442993
epoch 25006 loss -0.3543526232242584 LR -0.3841213285923004 LKL 0.02976871468126774
epoch 25007 loss -0.3905109465122223 LR -0.4202688932418823 LKL 0.029757944867014885
epoch 25008 loss -0.37226971983909607 LR -0.4020503759384155 LKL 0.029780667275190353
epoch 25009 loss -0.3411214053630829 LR -0.37091970443725586 LKL 0.029798297211527824
epoch 25010 loss -0.3037398159503937 LR -0.3334171772003174 LKL 0.02967735566198826
epoch 25011 loss -0.3569866418838501 LR -0.3865971565246582 LKL 0.02961052767932415
epoch 25012 loss -0.3604840934276581 LR -0.3902200758457184 LKL 0.02973598800599575
epoch 25013 loss -0.2748170495033264 LR -0.3043491840362549 LKL 0

epoch 25100 loss -0.3578595221042633 LR -0.3876255750656128 LKL 0.029766062274575233
74


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 25101 loss -0.40654057264328003 LR -0.436509907245636 LKL 0.029969334602355957
epoch 25102 loss -0.3291563391685486 LR -0.35880061984062195 LKL 0.02964426763355732
epoch 25103 loss -0.3242426812648773 LR -0.353873074054718 LKL 0.02963038720190525
epoch 25104 loss -0.3825722634792328 LR -0.4124651551246643 LKL 0.029892878606915474
epoch 25105 loss -0.37193581461906433 LR -0.4016594886779785 LKL 0.02972368709743023
epoch 25106 loss -0.34506142139434814 LR -0.3748067319393158 LKL 0.02974531427025795
epoch 25107 loss -0.4065374433994293 LR -0.4364467263221741 LKL 0.0299092885106802
epoch 25108 loss -0.3596815764904022 LR -0.3896019458770752 LKL 0.029920371249318123
epoch 25109 loss -0.3412776589393616 LR -0.37090909481048584 LKL 0.029631424695253372
epoch 25110 loss -0.34073781967163086 LR -0.37063848972320557 LKL 0.029900670051574707
epoch 25111 loss -0.3433920741081238 LR -0.3731137216091156 LKL 0.029721640050411224
epoch 25112 loss -0.33191701769828796 LR -0.36164581775665283 LKL 

epoch 25197 loss -0.303417444229126 LR -0.33291319012641907 LKL 0.02949574775993824
epoch 25198 loss -0.411830335855484 LR -0.4418568015098572 LKL 0.030026452615857124
epoch 25199 loss -0.2977786064147949 LR -0.32765597105026245 LKL 0.029877357184886932
epoch 25200 loss -0.32429999113082886 LR -0.35414981842041016 LKL 0.029849819839000702
60


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 25201 loss -0.28019821643829346 LR -0.3098733425140381 LKL 0.029675133526325226
epoch 25202 loss -0.3733898997306824 LR -0.40303289890289307 LKL 0.02964298613369465
epoch 25203 loss -0.34757208824157715 LR -0.3772276043891907 LKL 0.029655512422323227
epoch 25204 loss -0.39345869421958923 LR -0.42343229055404663 LKL 0.0299735888838768
epoch 25205 loss -0.3442281484603882 LR -0.37381988763809204 LKL 0.029591724276542664
epoch 25206 loss -0.3810074031352997 LR -0.4105096757411957 LKL 0.02950226701796055
epoch 25207 loss -0.3680558204650879 LR -0.3978567123413086 LKL 0.029800886288285255
epoch 25208 loss -0.3727123737335205 LR -0.4027213752269745 LKL 0.030008992180228233
epoch 25209 loss -0.3813048005104065 LR -0.41106104850769043 LKL 0.029756242409348488
epoch 25210 loss -0.31354013085365295 LR -0.3433486223220825 LKL 0.029808498919010162
epoch 25211 loss -0.3460516929626465 LR -0.37577491998672485 LKL 0.029723234474658966
epoch 25212 loss -0.3693807125091553 LR -0.3991757929325104 

epoch 25299 loss -0.3805055022239685 LR -0.4102996587753296 LKL 0.029794152826070786
epoch 25300 loss -0.421082466840744 LR -0.4510733187198639 LKL 0.02999086119234562
95
epoch 25301 loss -0.37515297532081604 LR -0.404989093542099 LKL 0.02983611263334751


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 25302 loss -0.3341064155101776 LR -0.3640749752521515 LKL 0.02996857278048992
epoch 25303 loss -0.36832940578460693 LR -0.3982810378074646 LKL 0.029951637610793114
epoch 25304 loss -0.3552318215370178 LR -0.38492855429649353 LKL 0.02969673089683056
epoch 25305 loss -0.38301992416381836 LR -0.4129698872566223 LKL 0.02994997426867485
epoch 25306 loss -0.321349561214447 LR -0.3511761426925659 LKL 0.029826590791344643
epoch 25307 loss -0.3336188793182373 LR -0.36342668533325195 LKL 0.029807815328240395
epoch 25308 loss -0.41795769333839417 LR -0.44784778356552124 LKL 0.02989007718861103
epoch 25309 loss -0.3801136910915375 LR -0.4101051092147827 LKL 0.02999141253530979
epoch 25310 loss -0.3353702127933502 LR -0.36502605676651 LKL 0.02965584024786949
epoch 25311 loss -0.37557685375213623 LR -0.40524816513061523 LKL 0.0296713188290596
epoch 25312 loss -0.3852878212928772 LR -0.41505542397499084 LKL 0.0297675933688879
epoch 25313 loss -0.36153072118759155 LR -0.39140135049819946 LKL 0.0

53
epoch 25401 loss -0.3864248991012573 LR -0.41626617312431335 LKL 0.02984127774834633


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 25402 loss -0.37346094846725464 LR -0.4033781886100769 LKL 0.029917225241661072
epoch 25403 loss -0.421821266412735 LR -0.451790988445282 LKL 0.02996971644461155
epoch 25404 loss -0.32055428624153137 LR -0.35040098428726196 LKL 0.02984669804573059
epoch 25405 loss -0.3486199378967285 LR -0.3783103823661804 LKL 0.02969043329358101
epoch 25406 loss -0.4055151343345642 LR -0.43541011214256287 LKL 0.029894966632127762
epoch 25407 loss -0.36442795395851135 LR -0.3941974341869354 LKL 0.029769472777843475
epoch 25408 loss -0.3207782208919525 LR -0.35049813985824585 LKL 0.02971990965306759
epoch 25409 loss -0.37546390295028687 LR -0.4052411913871765 LKL 0.0297772865742445
epoch 25410 loss -0.3464641571044922 LR -0.37622958421707153 LKL 0.0297654177993536
epoch 25411 loss -0.3898622393608093 LR -0.4197995066642761 LKL 0.0299372635781765
epoch 25412 loss -0.429849773645401 LR -0.4597910940647125 LKL 0.029941312968730927
epoch 25413 loss -0.39067864418029785 LR -0.42061305046081543 LKL 0.02

epoch 25500 loss -0.37647098302841187 LR -0.40643274784088135 LKL 0.02996177412569523
69


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 25501 loss -0.34292665123939514 LR -0.37255430221557617 LKL 0.029627639800310135
epoch 25502 loss -0.35263603925704956 LR -0.38220950961112976 LKL 0.029573485255241394
epoch 25503 loss -0.3712800443172455 LR -0.4010981023311615 LKL 0.02981804497539997
epoch 25504 loss -0.34192395210266113 LR -0.3719331622123718 LKL 0.030009200796484947
epoch 25505 loss -0.3892454206943512 LR -0.4190725088119507 LKL 0.02982707880437374
epoch 25506 loss -0.3826296031475067 LR -0.41239631175994873 LKL 0.029766708612442017
epoch 25507 loss -0.3220134377479553 LR -0.35176748037338257 LKL 0.029754050076007843
epoch 25508 loss -0.3900406062602997 LR -0.41998302936553955 LKL 0.029942423105239868
epoch 25509 loss -0.4114122688770294 LR -0.44139543175697327 LKL 0.029983151704072952
epoch 25510 loss -0.3560875356197357 LR -0.38595977425575256 LKL 0.029872240498661995
epoch 25511 loss -0.3202229142189026 LR -0.3500165045261383 LKL 0.029793575406074524
epoch 25512 loss -0.37005794048309326 LR -0.3999286890029

epoch 25598 loss -0.40169402956962585 LR -0.4314314126968384 LKL 0.029737381264567375
epoch 25599 loss -0.4160293936729431 LR -0.4459942579269409 LKL 0.029964866116642952
epoch 25600 loss -0.2831856608390808 LR -0.31293225288391113 LKL 0.029746590182185173
100


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 25601 loss -0.3597971498966217 LR -0.3894590735435486 LKL 0.029661932960152626
epoch 25602 loss -0.3123927116394043 LR -0.3421405255794525 LKL 0.029747800901532173
epoch 25603 loss -0.34520602226257324 LR -0.3751535713672638 LKL 0.029947536066174507
epoch 25604 loss -0.3415166735649109 LR -0.37146708369255066 LKL 0.029950402677059174
epoch 25605 loss -0.37530940771102905 LR -0.405140221118927 LKL 0.029830817133188248
epoch 25606 loss -0.2653835415840149 LR -0.2949478030204773 LKL 0.029564255848526955
epoch 25607 loss -0.347526490688324 LR -0.3774091303348541 LKL 0.029882624745368958
epoch 25608 loss -0.39261341094970703 LR -0.42254704236984253 LKL 0.029933633282780647
epoch 25609 loss -0.4280787706375122 LR -0.4581415057182312 LKL 0.030062740668654442
epoch 25610 loss -0.35017770528793335 LR -0.38014236092567444 LKL 0.029964657500386238
epoch 25611 loss -0.3138626515865326 LR -0.3435894548892975 LKL 0.02972680889070034
epoch 25612 loss -0.31366515159606934 LR -0.3433786630630493 

epoch 25700 loss -0.4107324182987213 LR -0.4407598376274109 LKL 0.03002740629017353
57


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 25701 loss -0.4253198206424713 LR -0.4553493857383728 LKL 0.03002955950796604
epoch 25702 loss -0.42133113741874695 LR -0.4513034224510193 LKL 0.029972296208143234
epoch 25703 loss -0.34263187646865845 LR -0.37251588702201843 LKL 0.02988399565219879
epoch 25704 loss -0.3852076232433319 LR -0.4152235686779022 LKL 0.030015945434570312
epoch 25705 loss -0.36832183599472046 LR -0.39824622869491577 LKL 0.029924392700195312
epoch 25706 loss -0.33987852931022644 LR -0.3697008490562439 LKL 0.02982230670750141
epoch 25707 loss -0.39299073815345764 LR -0.423089861869812 LKL 0.03009912185370922
epoch 25708 loss -0.34000253677368164 LR -0.3698081374168396 LKL 0.02980559691786766
epoch 25709 loss -0.42150750756263733 LR -0.4515771269798279 LKL 0.030069625005126
epoch 25710 loss -0.421933650970459 LR -0.45191723108291626 LKL 0.02998359315097332
epoch 25711 loss -0.41093453764915466 LR -0.44077858328819275 LKL 0.029844049364328384
epoch 25712 loss -0.425042986869812 LR -0.45509231090545654 LKL 

epoch 25798 loss -0.3409440517425537 LR -0.3707645833492279 LKL 0.02982054464519024
epoch 25799 loss -0.3879263997077942 LR -0.4177788197994232 LKL 0.02985241636633873
epoch 25800 loss -0.3016129732131958 LR -0.33137163519859314 LKL 0.029758650809526443
101
epoch 25801 loss -0.2871536314487457 LR -0.3171629011631012 LKL 0.030009262263774872


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 25802 loss -0.21200844645500183 LR -0.24164694547653198 LKL 0.029638495296239853
epoch 25803 loss -0.37010058760643005 LR -0.3999488651752472 LKL 0.02984827756881714
epoch 25804 loss -0.34631842374801636 LR -0.3760695457458496 LKL 0.029751112684607506
epoch 25805 loss -0.329728901386261 LR -0.3595370650291443 LKL 0.02980816550552845
epoch 25806 loss -0.33599236607551575 LR -0.3660128116607666 LKL 0.030020439997315407
epoch 25807 loss -0.38317713141441345 LR -0.4132091999053955 LKL 0.03003207966685295
epoch 25808 loss -0.3702371120452881 LR -0.4000837206840515 LKL 0.02984662353992462
epoch 25809 loss -0.35426196455955505 LR -0.3842225670814514 LKL 0.029960593208670616
epoch 25810 loss -0.39426687359809875 LR -0.42429107427597046 LKL 0.030024196952581406
epoch 25811 loss -0.37823253870010376 LR -0.4082196354866028 LKL 0.02998710796236992
epoch 25812 loss -0.35496819019317627 LR -0.38508403301239014 LKL 0.030115848407149315
epoch 25813 loss -0.37060683965682983 LR -0.400699734687805

42
epoch 25901 loss -0.35719186067581177 LR -0.38717418909072876 LKL 0.029982322826981544


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 25902 loss -0.38930419087409973 LR -0.4192604124546051 LKL 0.029956210404634476
epoch 25903 loss -0.37514880299568176 LR -0.4050969183444977 LKL 0.02994810976088047
epoch 25904 loss -0.404460072517395 LR -0.4343542158603668 LKL 0.029894141480326653
epoch 25905 loss -0.3507499098777771 LR -0.38062942028045654 LKL 0.02987951599061489
epoch 25906 loss -0.4292013943195343 LR -0.45922747254371643 LKL 0.030026080086827278
epoch 25907 loss -0.36842185258865356 LR -0.39857760071754456 LKL 0.030155761167407036
epoch 25908 loss -0.3519553542137146 LR -0.3819105625152588 LKL 0.029955197125673294
epoch 25909 loss -0.3192237615585327 LR -0.3492385745048523 LKL 0.030014799907803535
epoch 25910 loss -0.37816715240478516 LR -0.40815269947052 LKL 0.029985550791025162
epoch 25911 loss -0.29289743304252625 LR -0.322898268699646 LKL 0.0300008412450552
epoch 25912 loss -0.32981741428375244 LR -0.3596876859664917 LKL 0.029870271682739258
epoch 25913 loss -0.3891381025314331 LR -0.41917282342910767 LKL

56
epoch 26001 loss -0.35259413719177246 LR -0.38253381848335266 LKL 0.029939692467451096
epoch

  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


 26002 loss -0.30900198221206665 LR -0.3390072286128998 LKL 0.030005259439349174
epoch 26003 loss -0.3673863708972931 LR -0.39730560779571533 LKL 0.029919244349002838
epoch 26004 loss -0.47774529457092285 LR -0.5081190466880798 LKL 0.030373739078640938
epoch 26005 loss -0.3606951832771301 LR -0.39086729288101196 LKL 0.030172115191817284
epoch 26006 loss -0.38702884316444397 LR -0.4171367883682251 LKL 0.030107932165265083
epoch 26007 loss -0.30110377073287964 LR -0.3308551013469696 LKL 0.02975134551525116
epoch 26008 loss -0.30976369976997375 LR -0.3396582007408142 LKL 0.02989448793232441
epoch 26009 loss -0.39130881428718567 LR -0.42127758264541626 LKL 0.029968775808811188
epoch 26010 loss -0.31121042370796204 LR -0.34109577536582947 LKL 0.02988535165786743
epoch 26011 loss -0.37623876333236694 LR -0.40632352232933044 LKL 0.030084772035479546
epoch 26012 loss -0.31259065866470337 LR -0.3427032232284546 LKL 0.03011256642639637
epoch 26013 loss -0.3645813465118408 LR -0.39474451541900635

epoch 26098 loss -0.3911294639110565 LR -0.42109420895576477 LKL 0.029964733868837357
epoch 26099 loss -0.39960870146751404 LR -0.42980819940567017 LKL 0.03019949048757553
epoch 26100 loss -0.3759537935256958 LR -0.40598806738853455 LKL 0.030034275725483894
47
epoch 26101 loss -0.3411601483821869 LR -0.3712628483772278 LKL 0.030102703720331192


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 26102 loss -0.3995368778705597 LR -0.4296960234642029 LKL 0.030159158632159233
epoch 26103 loss -0.3598581850528717 LR -0.3899308145046234 LKL 0.030072618275880814
epoch 26104 loss -0.3530900478363037 LR -0.38325804471969604 LKL 0.030167987570166588
epoch 26105 loss -0.39270225167274475 LR -0.4229358434677124 LKL 0.03023359179496765
epoch 26106 loss -0.3279573321342468 LR -0.3580477833747864 LKL 0.030090436339378357
epoch 26107 loss -0.34005191922187805 LR -0.3699246346950531 LKL 0.0298727136105299
epoch 26108 loss -0.2980542778968811 LR -0.3280320167541504 LKL 0.02997773513197899
epoch 26109 loss -0.38469094038009644 LR -0.41471147537231445 LKL 0.030020521953701973
epoch 26110 loss -0.31573379039764404 LR -0.3457801938056946 LKL 0.030046403408050537
epoch 26111 loss -0.33397209644317627 LR -0.3640957772731781 LKL 0.030123673379421234
epoch 26112 loss -0.3736211657524109 LR -0.4036144018173218 LKL 0.029993243515491486
epoch 26113 loss -0.33775970339775085 LR -0.3677998185157776 L

epoch 26198 loss -0.40000882744789124 LR -0.4302438199520111 LKL 0.030234985053539276
epoch 26199 loss -0.4231519401073456 LR -0.4533922076225281 LKL 0.03024025820195675
epoch 26200 loss -0.32529065012931824 LR -0.35521015524864197 LKL 0.029919512569904327
48
epoch 26201 loss -0.3141791820526123 LR -0.3439621925354004 LKL 0.029783006757497787


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 26202 loss -0.34417998790740967 LR -0.3744981288909912 LKL 0.030318155884742737
epoch 26203 loss -0.3631785809993744 LR -0.39329248666763306 LKL 0.030113903805613518
epoch 26204 loss -0.3623984456062317 LR -0.3923541009426117 LKL 0.02995564043521881
epoch 26205 loss -0.35380885004997253 LR -0.3838462233543396 LKL 0.03003736026585102
epoch 26206 loss -0.3940785825252533 LR -0.42425623536109924 LKL 0.03017764911055565
epoch 26207 loss -0.3879299461841583 LR -0.4180671274662018 LKL 0.030137183144688606
epoch 26208 loss -0.3615853488445282 LR -0.39158740639686584 LKL 0.030002055689692497
epoch 26209 loss -0.34214159846305847 LR -0.37226468324661255 LKL 0.030123092234134674
epoch 26210 loss -0.3555997312068939 LR -0.3855193853378296 LKL 0.029919655993580818
epoch 26211 loss -0.36671486496925354 LR -0.39681217074394226 LKL 0.030097315087914467
epoch 26212 loss -0.37298721075057983 LR -0.40298622846603394 LKL 0.029999032616615295
epoch 26213 loss -0.3697911202907562 LR -0.39981952309608

58
epoch 26301 loss -0.3802347481250763 LR -0.41056716442108154 LKL 0.030332423746585846


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 26302 loss -0.2911483943462372 LR -0.3211105763912201 LKL 0.029962176457047462
epoch 26303 loss -0.3770933151245117 LR -0.4073915481567383 LKL 0.030298247933387756
epoch 26304 loss -0.37579575181007385 LR -0.4060276448726654 LKL 0.030231885612010956
epoch 26305 loss -0.38209691643714905 LR -0.4121288061141968 LKL 0.030031895264983177
epoch 26306 loss -0.3797090947628021 LR -0.40973740816116333 LKL 0.03002832643687725
epoch 26307 loss -0.35885411500930786 LR -0.3891901969909668 LKL 0.03033609502017498
epoch 26308 loss -0.37255266308784485 LR -0.40275657176971436 LKL 0.03020390309393406
epoch 26309 loss -0.41260796785354614 LR -0.44271066784858704 LKL 0.0301026850938797
epoch 26310 loss -0.386673241853714 LR -0.4169684648513794 LKL 0.030295226722955704
epoch 26311 loss -0.3368433117866516 LR -0.36675092577934265 LKL 0.029907627031207085
epoch 26312 loss -0.3672829270362854 LR -0.3972080945968628 LKL 0.02992517501115799
epoch 26313 loss -0.3684690296649933 LR -0.3985210359096527 LKL

89
epoch 26401 loss -0.3404153287410736 LR -0.3702448010444641 LKL 0.02982945926487446


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 26402 loss -0.35202017426490784 LR -0.3817930817604065 LKL 0.02977289818227291
epoch 26403 loss -0.2826607823371887 LR -0.3126413822174072 LKL 0.02998058870434761
epoch 26404 loss -0.3580762445926666 LR -0.3881211280822754 LKL 0.030044881626963615
epoch 26405 loss -0.3767898976802826 LR -0.4069150686264038 LKL 0.03012516163289547
epoch 26406 loss -0.40637314319610596 LR -0.4365704357624054 LKL 0.03019729256629944
epoch 26407 loss -0.36377593874931335 LR -0.3939315676689148 LKL 0.030155625194311142
epoch 26408 loss -0.3474375605583191 LR -0.37741541862487793 LKL 0.029977865517139435
epoch 26409 loss -0.4010562300682068 LR -0.4312489628791809 LKL 0.03019273839890957
epoch 26410 loss -0.4402991831302643 LR -0.4704728424549103 LKL 0.03017364628612995
epoch 26411 loss -0.3721216917037964 LR -0.4021785855293274 LKL 0.030056899413466454
epoch 26412 loss -0.42221036553382874 LR -0.452581524848938 LKL 0.030371153727173805
epoch 26413 loss -0.4173746407032013 LR -0.447553813457489 LKL 0.03

epoch 26498 loss -0.3011952042579651 LR -0.3312196135520935 LKL 0.03002440370619297
epoch 26499 loss -0.3256306052207947 LR -0.35567599534988403 LKL 0.03004538081586361
epoch 26500 loss -0.3570694029331207 LR -0.38718676567077637 LKL 0.03011736087501049
47
epoch 26501 loss -0.3383457064628601 LR -0.3685387372970581 LKL 0.030193015933036804


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 26502 loss -0.34167689085006714 LR -0.3719426691532135 LKL 0.03026578761637211
epoch 26503 loss -0.3439784348011017 LR -0.3740476369857788 LKL 0.030069194734096527
epoch 26504 loss -0.3300609290599823 LR -0.3602151572704315 LKL 0.030154230073094368
epoch 26505 loss -0.41711586713790894 LR -0.4474143981933594 LKL 0.03029852733016014
epoch 26506 loss -0.33558574318885803 LR -0.3655397593975067 LKL 0.02995402179658413
epoch 26507 loss -0.38644516468048096 LR -0.41641563177108765 LKL 0.029970472678542137
epoch 26508 loss -0.38555869460105896 LR -0.4156869947910309 LKL 0.03012831322848797
epoch 26509 loss -0.33865678310394287 LR -0.36877012252807617 LKL 0.030113352462649345
epoch 26510 loss -0.3388564884662628 LR -0.3690621554851532 LKL 0.030205676332116127
epoch 26511 loss -0.37559956312179565 LR -0.40570393204689026 LKL 0.030104361474514008
epoch 26512 loss -0.33790087699890137 LR -0.3678790330886841 LKL 0.029978150501847267
epoch 26513 loss -0.43681812286376953 LR -0.46737965941429

73
epoch 26601 loss -0.39434996247291565 LR -0.4246792197227478 LKL 0.030329257249832153


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 26602 loss -0.42717957496643066 LR -0.45751291513442993 LKL 0.030333353206515312
epoch 26603 loss -0.3675802946090698 LR -0.3976341485977173 LKL 0.03005385957658291
epoch 26604 loss -0.29961296916007996 LR -0.3294494152069092 LKL 0.02983645722270012
epoch 26605 loss -0.41965359449386597 LR -0.44991761445999146 LKL 0.030264021828770638
epoch 26606 loss -0.3440171480178833 LR -0.3741147816181183 LKL 0.03009762242436409
epoch 26607 loss -0.3253318965435028 LR -0.35558077692985535 LKL 0.030248871073126793
epoch 26608 loss -0.3709202706813812 LR -0.40092313289642334 LKL 0.030002858489751816
epoch 26609 loss -0.4392131268978119 LR -0.4695773422718048 LKL 0.030364202335476875
epoch 26610 loss -0.3342072367668152 LR -0.3644341826438904 LKL 0.030226944014430046
epoch 26611 loss -0.32460638880729675 LR -0.3547860085964203 LKL 0.030179619789123535
epoch 26612 loss -0.3032280206680298 LR -0.3332372009754181 LKL 0.030009182170033455
epoch 26613 loss -0.34473299980163574 LR -0.3750016093254089

epoch 26698 loss -0.3522593379020691 LR -0.38237348198890686 LKL 0.030114131048321724
epoch 26699 loss -0.38112184405326843 LR -0.4112798571586609 LKL 0.030158018693327904
epoch 26700 loss -0.3748188018798828 LR -0.40496858954429626 LKL 0.030149785801768303
63
epoch 26701 loss -0.3803257942199707 LR -0.4106697738170624 LKL 0.03034396842122078


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 26702 loss -0.3940858840942383 LR -0.4243503212928772 LKL 0.030264442786574364
epoch 26703 loss -0.3987799882888794 LR -0.42899399995803833 LKL 0.03021400049328804
epoch 26704 loss -0.3876631259918213 LR -0.41799741983413696 LKL 0.03033428266644478
epoch 26705 loss -0.3596459627151489 LR -0.3898445665836334 LKL 0.0301985964179039
epoch 26706 loss -0.430216521024704 LR -0.460448682308197 LKL 0.030232161283493042
epoch 26707 loss -0.3482770025730133 LR -0.3783746659755707 LKL 0.03009767457842827
epoch 26708 loss -0.3785609304904938 LR -0.40865781903266907 LKL 0.030096881091594696
epoch 26709 loss -0.40499135851860046 LR -0.43533605337142944 LKL 0.030344689264893532
epoch 26710 loss -0.3645954728126526 LR -0.39475706219673157 LKL 0.030161578208208084
epoch 26711 loss -0.4378116726875305 LR -0.4682292342185974 LKL 0.030417561531066895
epoch 26712 loss -0.3764759600162506 LR -0.4067283272743225 LKL 0.030252356082201004
epoch 26713 loss -0.4031962752342224 LR -0.4334169030189514 LKL 0.

64
epoch 26801 loss -0.38247150182724 LR -0.4126848876476288 LKL 0.03021339699625969


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 26802 loss -0.32317885756492615 LR -0.3534584045410156 LKL 0.030279552564024925
epoch 26803 loss -0.3494073450565338 LR -0.3793860673904419 LKL 0.02997872233390808
epoch 26804 loss -0.41369232535362244 LR -0.44389328360557556 LKL 0.030200958251953125
epoch 26805 loss -0.38306906819343567 LR -0.41332027316093445 LKL 0.03025120124220848
epoch 26806 loss -0.37141740322113037 LR -0.4017191529273987 LKL 0.030301755294203758
epoch 26807 loss -0.3375798761844635 LR -0.36791563034057617 LKL 0.030335746705532074
epoch 26808 loss -0.36065706610679626 LR -0.3909246027469635 LKL 0.030267536640167236
epoch 26809 loss -0.3365892767906189 LR -0.3669036030769348 LKL 0.030314313247799873
epoch 26810 loss -0.3395330607891083 LR -0.3697039783000946 LKL 0.03017091006040573
epoch 26811 loss -0.3391537070274353 LR -0.3693610429763794 LKL 0.030207330361008644
epoch 26812 loss -0.33488497138023376 LR -0.3650679290294647 LKL 0.03018294647336006
epoch 26813 loss -0.3567420244216919 LR -0.38683339953422546

70
epoch 26901 loss -0.3333311378955841 LR -0.3633873462677002 LKL 0.030056221410632133


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 26902 loss -0.4235410690307617 LR -0.4538276195526123 LKL 0.030286550521850586
epoch 26903 loss -0.39765989780426025 LR -0.427999883890152 LKL 0.03033999353647232
epoch 26904 loss -0.43697354197502136 LR -0.4673860967159271 LKL 0.030412567779421806
epoch 26905 loss -0.4375651180744171 LR -0.46800297498703003 LKL 0.030437856912612915
epoch 26906 loss -0.37831273674964905 LR -0.4083995223045349 LKL 0.03008679486811161
epoch 26907 loss -0.3525716960430145 LR -0.3827609419822693 LKL 0.030189258977770805
epoch 26908 loss -0.37614983320236206 LR -0.4064587652683258 LKL 0.030308932065963745
epoch 26909 loss -0.360502153635025 LR -0.39063212275505066 LKL 0.030129974707961082
epoch 26910 loss -0.3716178834438324 LR -0.40161705017089844 LKL 0.02999916672706604
epoch 26911 loss -0.3854486644268036 LR -0.41564831137657166 LKL 0.030199633911252022
epoch 26912 loss -0.37732991576194763 LR -0.4075220227241516 LKL 0.030192095786333084
epoch 26913 loss -0.34407955408096313 LR -0.374305784702301 L

epoch 26998 loss -0.3796181082725525 LR -0.4099128544330597 LKL 0.030294759199023247
epoch 26999 loss -0.43994760513305664 LR -0.47036924958229065 LKL 0.030421646311879158
epoch 27000 loss -0.43790140748023987 LR -0.46831318736076355 LKL 0.03041178360581398
59


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 27001 loss -0.430123507976532 LR -0.460599809885025 LKL 0.03047630377113819
epoch 27002 loss -0.3628731966018677 LR -0.39314574003219604 LKL 0.03027254156768322
epoch 27003 loss -0.37389039993286133 LR -0.4041031301021576 LKL 0.03021271526813507
epoch 27004 loss -0.42521509528160095 LR -0.45564502477645874 LKL 0.030429931357502937
epoch 27005 loss -0.4135587513446808 LR -0.443803608417511 LKL 0.03024485893547535
epoch 27006 loss -0.3828389048576355 LR -0.412979394197464 LKL 0.030140498653054237
epoch 27007 loss -0.3918832838535309 LR -0.42211806774139404 LKL 0.030234776437282562
epoch 27008 loss -0.32846197485923767 LR -0.35852864384651184 LKL 0.030066680163145065
epoch 27009 loss -0.3863879442214966 LR -0.41647273302078247 LKL 0.030084801837801933
epoch 27010 loss -0.47149133682250977 LR -0.5019707679748535 LKL 0.03047942742705345
epoch 27011 loss -0.3741207718849182 LR -0.40429461002349854 LKL 0.030173836275935173
epoch 27012 loss -0.3923865854740143 LR -0.4226737916469574 LKL 

epoch 27097 loss -0.3872808516025543 LR -0.417479008436203 LKL 0.030198169872164726
epoch 27098 loss -0.3592790961265564 LR -0.38954290747642517 LKL 0.030263813212513924
epoch 27099 loss -0.3627507984638214 LR -0.3929106891155243 LKL 0.030159881338477135
epoch 27100 loss -0.35128337144851685 LR -0.3814568817615509 LKL 0.030173517763614655
53


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 27101 loss -0.35603541135787964 LR -0.3863072693347931 LKL 0.030271850526332855
epoch 27102 loss -0.3799760043621063 LR -0.41040337085723877 LKL 0.03042737953364849
epoch 27103 loss -0.42664679884910583 LR -0.4571104347705841 LKL 0.03046363592147827
epoch 27104 loss -0.3781067430973053 LR -0.408692330121994 LKL 0.030585573986172676
epoch 27105 loss -0.3745422959327698 LR -0.4047236144542694 LKL 0.03018133156001568
epoch 27106 loss -0.38357022404670715 LR -0.41400617361068726 LKL 0.030435949563980103
epoch 27107 loss -0.3880836069583893 LR -0.41823625564575195 LKL 0.030152639374136925
epoch 27108 loss -0.4616425633430481 LR -0.4921720325946808 LKL 0.03052947297692299
epoch 27109 loss -0.3382645845413208 LR -0.3684365749359131 LKL 0.030171994119882584
epoch 27110 loss -0.400357186794281 LR -0.4307602047920227 LKL 0.0304030179977417
epoch 27111 loss -0.32705608010292053 LR -0.3573475778102875 LKL 0.0302914846688509
epoch 27112 loss -0.31291407346725464 LR -0.34324660897254944 LKL 0.

epoch 27197 loss -0.3726806640625 LR -0.4030013084411621 LKL 0.030320629477500916
epoch 27198 loss -0.33139845728874207 LR -0.3615615963935852 LKL 0.030163144692778587
epoch 27199 loss -0.40053749084472656 LR -0.4308338761329651 LKL 0.03029637038707733
epoch 27200 loss -0.36275047063827515 LR -0.39319324493408203 LKL 0.03044278174638748
43


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 27201 loss -0.4420970380306244 LR -0.4725414216518402 LKL 0.030444396659731865
epoch 27202 loss -0.42486727237701416 LR -0.4554347097873688 LKL 0.030567435547709465
epoch 27203 loss -0.42806732654571533 LR -0.4584500789642334 LKL 0.030382761731743813
epoch 27204 loss -0.3054017722606659 LR -0.3355274200439453 LKL 0.030125651508569717
epoch 27205 loss -0.3017292022705078 LR -0.33200523257255554 LKL 0.030276037752628326
epoch 27206 loss -0.38496002554893494 LR -0.4153526723384857 LKL 0.030392633751034737
epoch 27207 loss -0.385052889585495 LR -0.4155234694480896 LKL 0.03047056682407856
epoch 27208 loss -0.387859970331192 LR -0.4183146357536316 LKL 0.03045465797185898
epoch 27209 loss -0.4316721558570862 LR -0.4620618522167206 LKL 0.030389681458473206
epoch 27210 loss -0.3307316303253174 LR -0.360883891582489 LKL 0.030152272433042526
epoch 27211 loss -0.3566511869430542 LR -0.3869010806083679 LKL 0.030249906703829765
epoch 27212 loss -0.36985284090042114 LR -0.40019434690475464 LKL 

epoch 27297 loss -0.36489027738571167 LR -0.3951120674610138 LKL 0.030221791937947273
epoch 27298 loss -0.4128856062889099 LR -0.44343268871307373 LKL 0.030547069385647774
epoch 27299 loss -0.28098246455192566 LR -0.31103888154029846 LKL 0.030056428164243698
epoch 27300 loss -0.3903374671936035 LR -0.420602023601532 LKL 0.030264560133218765
47


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 27301 loss -0.4385136365890503 LR -0.4690651595592499 LKL 0.030551522970199585
epoch 27302 loss -0.41395628452301025 LR -0.44414782524108887 LKL 0.03019152581691742
epoch 27303 loss -0.3391674757003784 LR -0.36935123801231384 LKL 0.03018374927341938
epoch 27304 loss -0.3971711993217468 LR -0.42757734656333923 LKL 0.030406150966882706
epoch 27305 loss -0.4317774772644043 LR -0.4622936248779297 LKL 0.03051615320146084
epoch 27306 loss -0.3831773102283478 LR -0.4134221076965332 LKL 0.030244793742895126
epoch 27307 loss -0.3689485788345337 LR -0.39935967326164246 LKL 0.03041108138859272
epoch 27308 loss -0.37025266885757446 LR -0.40064433217048645 LKL 0.030391670763492584
epoch 27309 loss -0.3659916818141937 LR -0.3961106836795807 LKL 0.03011901304125786
epoch 27310 loss -0.37455669045448303 LR -0.40495720505714417 LKL 0.030400527641177177
epoch 27311 loss -0.36376720666885376 LR -0.39419662952423096 LKL 0.03042941354215145
epoch 27312 loss -0.3504151701927185 LR -0.3807269036769867 

epoch 27397 loss -0.4188039302825928 LR -0.44917407631874084 LKL 0.030370140448212624
epoch 27398 loss -0.3427969515323639 LR -0.3729029893875122 LKL 0.03010602667927742
epoch 27399 loss -0.3309151530265808 LR -0.36116135120391846 LKL 0.030246209353208542
epoch 27400 loss -0.3030032515525818 LR -0.3333158493041992 LKL 0.030312588438391685
80


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 27401 loss -0.3819506764411926 LR -0.412023663520813 LKL 0.030072998255491257
epoch 27402 loss -0.39101630449295044 LR -0.42154139280319214 LKL 0.030525078997015953
epoch 27403 loss -0.2902524471282959 LR -0.32032454013824463 LKL 0.03007209300994873
epoch 27404 loss -0.4028153121471405 LR -0.4333474040031433 LKL 0.030532091856002808
epoch 27405 loss -0.43929731845855713 LR -0.4697577953338623 LKL 0.03046046569943428
epoch 27406 loss -0.4208392798900604 LR -0.451236754655838 LKL 0.030397487804293633
epoch 27407 loss -0.3732829689979553 LR -0.40352100133895874 LKL 0.03023802489042282
epoch 27408 loss -0.4038170874118805 LR -0.43425217270851135 LKL 0.030435090884566307
epoch 27409 loss -0.3481954038143158 LR -0.37855222821235657 LKL 0.030356815084815025
epoch 27410 loss -0.4225764274597168 LR -0.4529551565647125 LKL 0.030378714203834534
epoch 27411 loss -0.32854628562927246 LR -0.3587237000465393 LKL 0.030177408829331398
epoch 27412 loss -0.3309614360332489 LR -0.36120232939720154 L

epoch 27500 loss -0.4189909100532532 LR -0.4494873583316803 LKL 0.030496444553136826
114
epoch 27501 loss -0.3732891380786896 LR -0.4038257896900177 LKL 0.030536655336618423


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 27502 loss -0.37065568566322327 LR -0.40101537108421326 LKL 0.03035968914628029
epoch 27503 loss -0.41407373547554016 LR -0.4443444013595581 LKL 0.03027067519724369
epoch 27504 loss -0.3265332579612732 LR -0.3569294512271881 LKL 0.03039620630443096
epoch 27505 loss -0.31255167722702026 LR -0.3428676128387451 LKL 0.030315930023789406
epoch 27506 loss -0.3826383352279663 LR -0.41315245628356934 LKL 0.030514128506183624
epoch 27507 loss -0.33402305841445923 LR -0.364501953125 LKL 0.03047890029847622
epoch 27508 loss -0.4224209487438202 LR -0.4528535008430481 LKL 0.03043256141245365
epoch 27509 loss -0.37576931715011597 LR -0.4063850939273834 LKL 0.0306157898157835
epoch 27510 loss -0.37419310212135315 LR -0.4044601321220398 LKL 0.030267026275396347
epoch 27511 loss -0.4082435369491577 LR -0.4386599659919739 LKL 0.03041643649339676
epoch 27512 loss -0.37224280834198 LR -0.40270158648490906 LKL 0.03045877255499363
epoch 27513 loss -0.3632144033908844 LR -0.39325809478759766 LKL 0.0300

61
epoch 27601 loss -0.4343620240688324 LR -0.4647549092769623 LKL 0.030392896384000778


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 27602 loss -0.32243797183036804 LR -0.35284873843193054 LKL 0.030410759150981903
epoch 27603 loss -0.4496646523475647 LR -0.480266273021698 LKL 0.030601628124713898
epoch 27604 loss -0.4350641369819641 LR -0.46555888652801514 LKL 0.03049474023282528
epoch 27605 loss -0.4038994312286377 LR -0.4344474673271179 LKL 0.030548030510544777
epoch 27606 loss -0.3811020255088806 LR -0.4114927053451538 LKL 0.030390694737434387
epoch 27607 loss -0.34145158529281616 LR -0.3717152178287506 LKL 0.030263632535934448
epoch 27608 loss -0.3268981873989105 LR -0.35719481110572815 LKL 0.030296633020043373
epoch 27609 loss -0.3332040309906006 LR -0.3633454442024231 LKL 0.030141400173306465
epoch 27610 loss -0.3672800064086914 LR -0.3975992500782013 LKL 0.03031923994421959
epoch 27611 loss -0.37597358226776123 LR -0.406440794467926 LKL 0.0304672010242939
epoch 27612 loss -0.3350107967853546 LR -0.36540722846984863 LKL 0.03039642609655857
epoch 27613 loss -0.38716691732406616 LR -0.4176286458969116 LKL 

epoch 27700 loss -0.42223358154296875 LR -0.4528304934501648 LKL 0.030596911907196045
59


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 27701 loss -0.41313880681991577 LR -0.4436424970626831 LKL 0.03050367720425129
epoch 27702 loss -0.33026713132858276 LR -0.3607025146484375 LKL 0.030435392633080482
epoch 27703 loss -0.34118151664733887 LR -0.37151145935058594 LKL 0.03032994642853737
epoch 27704 loss -0.36705002188682556 LR -0.3973628580570221 LKL 0.03031282313168049
epoch 27705 loss -0.4190025329589844 LR -0.44964897632598877 LKL 0.030646435916423798
epoch 27706 loss -0.3686833679676056 LR -0.39895331859588623 LKL 0.030269939452409744
epoch 27707 loss -0.34442800283432007 LR -0.37471139430999756 LKL 0.030283378437161446
epoch 27708 loss -0.33610138297080994 LR -0.3664909601211548 LKL 0.030389579012989998
epoch 27709 loss -0.4127240777015686 LR -0.44324079155921936 LKL 0.030516719445586205
epoch 27710 loss -0.37586095929145813 LR -0.4062458872795105 LKL 0.030384914949536324
epoch 27711 loss -0.4031822383403778 LR -0.4334070384502411 LKL 0.030224813148379326
epoch 27712 loss -0.3248315751552582 LR -0.3548877835273

epoch 27797 loss -0.4029102623462677 LR -0.43326902389526367 LKL 0.03035876899957657
epoch 27798 loss -0.40346625447273254 LR -0.43387308716773987 LKL 0.030406827107071877
epoch 27799 loss -0.2963455021381378 LR -0.32663244009017944 LKL 0.03028692863881588
epoch 27800 loss -0.3835424780845642 LR -0.4138509929180145 LKL 0.030308522284030914
80


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 27801 loss -0.38404762744903564 LR -0.4144882261753082 LKL 0.03044060990214348
epoch 27802 loss -0.3078445792198181 LR -0.33805814385414124 LKL 0.030213559046387672
epoch 27803 loss -0.38334593176841736 LR -0.4138714671134949 LKL 0.03052552603185177
epoch 27804 loss -0.37082958221435547 LR -0.4012252688407898 LKL 0.030395682901144028
epoch 27805 loss -0.34345704317092896 LR -0.37379422783851624 LKL 0.03033718839287758
epoch 27806 loss -0.3598576784133911 LR -0.39025285840034485 LKL 0.030395178124308586
epoch 27807 loss -0.32724207639694214 LR -0.35760051012039185 LKL 0.030358437448740005
epoch 27808 loss -0.36713042855262756 LR -0.3974813222885132 LKL 0.03035089001059532
epoch 27809 loss -0.45290321111679077 LR -0.483529269695282 LKL 0.030626052990555763
epoch 27810 loss -0.418645977973938 LR -0.44884687662124634 LKL 0.030200891196727753
epoch 27811 loss -0.3762846291065216 LR -0.40665820240974426 LKL 0.030373575165867805
epoch 27812 loss -0.3588877022266388 LR -0.389209389686584

epoch 27897 loss -0.33666300773620605 LR -0.36683881282806396 LKL 0.030175816267728806
epoch 27898 loss -0.36351925134658813 LR -0.3939222991466522 LKL 0.03040306083858013
epoch 27899 loss -0.3727668225765228 LR -0.40318000316619873 LKL 0.030413180589675903
epoch 27900 loss -0.4140111804008484 LR -0.4444316625595093 LKL 0.03042047843337059
104
epoch 27901 loss -0.40979987382888794 LR -0.4402572512626648 LKL 0.030457377433776855


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 27902 loss -0.42841753363609314 LR -0.45896318554878235 LKL 0.03054564818739891
epoch 27903 loss -0.42143863439559937 LR -0.4519757330417633 LKL 0.030537104234099388
epoch 27904 loss -0.37410447001457214 LR -0.4046313464641571 LKL 0.03052687458693981
epoch 27905 loss -0.4466364085674286 LR -0.47724705934524536 LKL 0.030610643327236176
epoch 27906 loss -0.4133460223674774 LR -0.4439895749092102 LKL 0.030643561854958534
epoch 27907 loss -0.39027026295661926 LR -0.42071911692619324 LKL 0.03044884279370308
epoch 27908 loss -0.4177892804145813 LR -0.44819414615631104 LKL 0.03040487878024578
epoch 27909 loss -0.34455907344818115 LR -0.3750211000442505 LKL 0.030462011694908142
epoch 27910 loss -0.3294619917869568 LR -0.35976046323776245 LKL 0.03029845654964447
epoch 27911 loss -0.34911105036735535 LR -0.37969276309013367 LKL 0.03058171644806862
epoch 27912 loss -0.429029643535614 LR -0.45948314666748047 LKL 0.030453497543931007
epoch 27913 loss -0.4028750956058502 LR -0.4332634210586548

epoch 27998 loss -0.3548034727573395 LR -0.3852039575576782 LKL 0.030400484800338745
epoch 27999 loss -0.372189998626709 LR -0.4026723802089691 LKL 0.030482370406389236
epoch 28000 loss -0.38904720544815063 LR -0.4194369912147522 LKL 0.03038979321718216
92
epoch 28001 loss -0.4551667869091034 LR -0.4858319163322449 LKL 0.030665138736367226


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 28002 loss -0.34182819724082947 LR -0.372123658657074 LKL 0.030295461416244507
epoch 28003 loss -0.3676365911960602 LR -0.3981064558029175 LKL 0.0304698683321476
epoch 28004 loss -0.3984735608100891 LR -0.4289906620979309 LKL 0.030517101287841797
epoch 28005 loss -0.3559354245662689 LR -0.38638797402381897 LKL 0.030452551320195198
epoch 28006 loss -0.28573498129844666 LR -0.3160915970802307 LKL 0.030356621369719505
epoch 28007 loss -0.38136160373687744 LR -0.4117381274700165 LKL 0.030376536771655083
epoch 28008 loss -0.35161951184272766 LR -0.38205525279045105 LKL 0.030435753986239433
epoch 28009 loss -0.38102078437805176 LR -0.41150572896003723 LKL 0.030484935268759727
epoch 28010 loss -0.4328930377960205 LR -0.463417112827301 LKL 0.03052406944334507
epoch 28011 loss -0.38255858421325684 LR -0.41295096278190613 LKL 0.030392369255423546
epoch 28012 loss -0.39762577414512634 LR -0.4281306266784668 LKL 0.030504856258630753
epoch 28013 loss -0.4132630527019501 LR -0.4438471794128418

epoch 28100 loss -0.38344913721084595 LR -0.41388005018234253 LKL 0.030430924147367477
67


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 28101 loss -0.3949621319770813 LR -0.4254918396472931 LKL 0.030529702082276344
epoch 28102 loss -0.40879595279693604 LR -0.4393743872642517 LKL 0.030578434467315674
epoch 28103 loss -0.39586806297302246 LR -0.42640379071235657 LKL 0.030535735189914703
epoch 28104 loss -0.3588663637638092 LR -0.38926514983177185 LKL 0.030398797243833542
epoch 28105 loss -0.4309271574020386 LR -0.4614203870296478 LKL 0.030493244528770447
epoch 28106 loss -0.3909190893173218 LR -0.42144715785980225 LKL 0.030528057366609573
epoch 28107 loss -0.4214482009410858 LR -0.4520000219345093 LKL 0.030551813542842865
epoch 28108 loss -0.42887601256370544 LR -0.4591977000236511 LKL 0.030321693047881126
epoch 28109 loss -0.42498406767845154 LR -0.4556032717227936 LKL 0.030619200319051743
epoch 28110 loss -0.41933950781822205 LR -0.4500238597393036 LKL 0.03068436123430729
epoch 28111 loss -0.35671675205230713 LR -0.38705122470855713 LKL 0.030334463343024254
epoch 28112 loss -0.3727090060710907 LR -0.4033382833003

epoch 28199 loss -0.38490161299705505 LR -0.4154798686504364 LKL 0.030578242614865303
epoch 28200 loss -0.3822709918022156 LR -0.41280221939086914 LKL 0.03053123690187931
56
epoch 28201 loss -0.40755653381347656 LR -0.4383510947227478 LKL 0.030794570222496986


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 28202 loss -0.38148388266563416 LR -0.4120558500289917 LKL 0.03057195618748665
epoch 28203 loss -0.3654017150402069 LR -0.3956661820411682 LKL 0.030264459550380707
epoch 28204 loss -0.39961758255958557 LR -0.4301367998123169 LKL 0.030519209802150726
epoch 28205 loss -0.361747145652771 LR -0.39209794998168945 LKL 0.030350791290402412
epoch 28206 loss -0.4167180359363556 LR -0.4471980333328247 LKL 0.030480001121759415
epoch 28207 loss -0.4204086363315582 LR -0.4510379433631897 LKL 0.030629295855760574
epoch 28208 loss -0.3962825536727905 LR -0.4267951250076294 LKL 0.03051256574690342
epoch 28209 loss -0.34690654277801514 LR -0.37744149565696716 LKL 0.03053496591746807
epoch 28210 loss -0.4165227711200714 LR -0.44706594944000244 LKL 0.03054318204522133
epoch 28211 loss -0.31478530168533325 LR -0.34525054693222046 LKL 0.03046523779630661
epoch 28212 loss -0.33167895674705505 LR -0.3619474470615387 LKL 0.03026849962770939
epoch 28213 loss -0.40376192331314087 LR -0.4342363476753235 LK

65
epoch 28301 loss -0.3610125482082367 LR -0.391576886177063 LKL 0.030564334243535995


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 28302 loss -0.4793998599052429 LR -0.5101324915885925 LKL 0.03073263354599476
epoch 28303 loss -0.40155962109565735 LR -0.4320234954357147 LKL 0.03046388179063797
epoch 28304 loss -0.3477444052696228 LR -0.3784010112285614 LKL 0.030656596645712852
epoch 28305 loss -0.3989722728729248 LR -0.4295158386230469 LKL 0.030543578788638115
epoch 28306 loss -0.39868780970573425 LR -0.4292271137237549 LKL 0.03053930401802063
epoch 28307 loss -0.3954141438007355 LR -0.42595845460891724 LKL 0.03054432012140751
epoch 28308 loss -0.38689857721328735 LR -0.4174821972846985 LKL 0.03058362379670143
epoch 28309 loss -0.3387153744697571 LR -0.36932164430618286 LKL 0.03060627542436123
epoch 28310 loss -0.41106733679771423 LR -0.4415983557701111 LKL 0.030531015247106552
epoch 28311 loss -0.4087473750114441 LR -0.4394320845603943 LKL 0.030684713274240494
epoch 28312 loss -0.39368119835853577 LR -0.4243265986442566 LKL 0.030645407736301422
epoch 28313 loss -0.3867959678173065 LR -0.41729018092155457 LKL

epoch 28398 loss -0.3230895698070526 LR -0.3534158766269684 LKL 0.030326293781399727
epoch 28399 loss -0.37630102038383484 LR -0.4067768454551697 LKL 0.030475830659270287
epoch 28400 loss -0.40455129742622375 LR -0.43517208099365234 LKL 0.030620796605944633
62
epoch 28401 loss -0.27402880787849426 LR -0.3042090833187103 LKL 0.030180277302861214


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 28402 loss -0.4095102846622467 LR -0.4403682351112366 LKL 0.030857941135764122
epoch 28403 loss -0.3771112561225891 LR -0.4076204299926758 LKL 0.030509179458022118
epoch 28404 loss -0.3712848424911499 LR -0.4018239974975586 LKL 0.030539140105247498
epoch 28405 loss -0.3306501507759094 LR -0.3612135052680969 LKL 0.030563348904252052
epoch 28406 loss -0.4068880081176758 LR -0.4376318156719208 LKL 0.030743807554244995
epoch 28407 loss -0.3719576895236969 LR -0.4023645520210266 LKL 0.030406875535845757
epoch 28408 loss -0.4008583426475525 LR -0.4315361976623535 LKL 0.03067784011363983
epoch 28409 loss -0.3258015215396881 LR -0.3563407063484192 LKL 0.030539197847247124
epoch 28410 loss -0.41556960344314575 LR -0.4461710751056671 LKL 0.030601486563682556
epoch 28411 loss -0.42702361941337585 LR -0.45761436223983765 LKL 0.030590740963816643
epoch 28412 loss -0.3087247312068939 LR -0.33907267451286316 LKL 0.030347952619194984
epoch 28413 loss -0.3901885747909546 LR -0.42080122232437134 L

epoch 28500 loss -0.39426496624946594 LR -0.4248896539211273 LKL 0.030624687671661377
83


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 28501 loss -0.3712175786495209 LR -0.40185049176216125 LKL 0.030632905662059784
epoch 28502 loss -0.4102111756801605 LR -0.44101279973983765 LKL 0.030801624059677124
epoch 28503 loss -0.4201829433441162 LR -0.4507431983947754 LKL 0.030560247600078583
epoch 28504 loss -0.39440321922302246 LR -0.4250969886779785 LKL 0.03069378063082695
epoch 28505 loss -0.39828234910964966 LR -0.42893561720848083 LKL 0.030653273686766624
epoch 28506 loss -0.30518031120300293 LR -0.3354637622833252 LKL 0.03028344362974167
epoch 28507 loss -0.36788874864578247 LR -0.39848044514656067 LKL 0.030591703951358795
epoch 28508 loss -0.38593459129333496 LR -0.41665029525756836 LKL 0.0307156965136528
epoch 28509 loss -0.3428157866001129 LR -0.37341445684432983 LKL 0.030598675832152367
epoch 28510 loss -0.37169280648231506 LR -0.4022241532802582 LKL 0.030531346797943115
epoch 28511 loss -0.38117843866348267 LR -0.41192981600761414 LKL 0.03075137734413147
epoch 28512 loss -0.36269596219062805 LR -0.393085211515

epoch 28599 loss -0.3836933672428131 LR -0.41436830163002014 LKL 0.03067493997514248
epoch 28600 loss -0.35241085290908813 LR -0.38292646408081055 LKL 0.030515607446432114
47
epoch 28601 loss -0.37341126799583435 LR -0.4039507508277893 LKL 0.03053947165608406


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 28602 loss -0.4305962324142456 LR -0.4614315927028656 LKL 0.030835360288619995
epoch 28603 loss -0.45617178082466125 LR -0.4867655634880066 LKL 0.03059379570186138
epoch 28604 loss -0.36229658126831055 LR -0.3929339051246643 LKL 0.030637331306934357
epoch 28605 loss -0.3673416078090668 LR -0.39800113439559937 LKL 0.030659524723887444
epoch 28606 loss -0.37394243478775024 LR -0.40454772114753723 LKL 0.030605299398303032
epoch 28607 loss -0.4331252872943878 LR -0.46375077962875366 LKL 0.03062549978494644
epoch 28608 loss -0.39887380599975586 LR -0.42930924892425537 LKL 0.030435428023338318
epoch 28609 loss -0.42563512921333313 LR -0.45640528202056885 LKL 0.030770139768719673
epoch 28610 loss -0.3775896430015564 LR -0.4082169234752655 LKL 0.030627278611063957
epoch 28611 loss -0.34315311908721924 LR -0.37368884682655334 LKL 0.03053572215139866
epoch 28612 loss -0.44753074645996094 LR -0.47821924090385437 LKL 0.030688485130667686
epoch 28613 loss -0.43691906332969666 LR -0.4676846265

epoch 28698 loss -0.36308059096336365 LR -0.3935595750808716 LKL 0.030478976666927338
epoch 28699 loss -0.34646785259246826 LR -0.3770878314971924 LKL 0.030619990080595016
epoch 28700 loss -0.3160882592201233 LR -0.346407413482666 LKL 0.03031914308667183
86
epoch 28701 loss -0.4471653997898102 LR -0.47787076234817505 LKL 0.03070535697042942


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 28702 loss -0.40593457221984863 LR -0.43676334619522095 LKL 0.03082878701388836
epoch 28703 loss -0.33572590351104736 LR -0.3664194941520691 LKL 0.03069359064102173
epoch 28704 loss -0.3680182993412018 LR -0.39850884675979614 LKL 0.030490556731820107
epoch 28705 loss -0.4377262592315674 LR -0.468354195356369 LKL 0.030627937987446785
epoch 28706 loss -0.3653624355792999 LR -0.3958161473274231 LKL 0.030453698709607124
epoch 28707 loss -0.4014928340911865 LR -0.43226826190948486 LKL 0.03077542595565319
epoch 28708 loss -0.4228900074958801 LR -0.45357370376586914 LKL 0.03068368323147297
epoch 28709 loss -0.357022225856781 LR -0.387845516204834 LKL 0.030823297798633575
epoch 28710 loss -0.331260621547699 LR -0.3616761863231659 LKL 0.03041556105017662
epoch 28711 loss -0.33896565437316895 LR -0.3696039617061615 LKL 0.0306383203715086
epoch 28712 loss -0.35247567296028137 LR -0.3828876316547394 LKL 0.030411947518587112
epoch 28713 loss -0.45435652136802673 LR -0.4852639436721802 LKL 0.0

epoch 28798 loss -0.40034162998199463 LR -0.4309372007846832 LKL 0.030595559626817703
epoch 28799 loss -0.35410112142562866 LR -0.38460981845855713 LKL 0.030508682131767273
epoch 28800 loss -0.35199910402297974 LR -0.38248932361602783 LKL 0.0304902121424675
39
epoch 28801 loss -0.38736578822135925 LR -0.4179772138595581 LKL 0.0306114349514246


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 28802 loss -0.39079999923706055 LR -0.4215162694454193 LKL 0.030716266483068466
epoch 28803 loss -0.4157165586948395 LR -0.4465712904930115 LKL 0.03085472248494625
epoch 28804 loss -0.36446553468704224 LR -0.3950767517089844 LKL 0.03061121329665184
epoch 28805 loss -0.4083814024925232 LR -0.43887433409690857 LKL 0.03049292415380478
epoch 28806 loss -0.3202318549156189 LR -0.3506969213485718 LKL 0.030465077608823776
epoch 28807 loss -0.40094366669654846 LR -0.4316022992134094 LKL 0.030658645555377007
epoch 28808 loss -0.3889046311378479 LR -0.4195129871368408 LKL 0.030608367174863815
epoch 28809 loss -0.4005436897277832 LR -0.431212842464447 LKL 0.030669154599308968
epoch 28810 loss -0.3721674084663391 LR -0.40272265672683716 LKL 0.03055523708462715
epoch 28811 loss -0.39826664328575134 LR -0.42881083488464355 LKL 0.03054419532418251
epoch 28812 loss -0.3991336226463318 LR -0.4297478199005127 LKL 0.030614204704761505
epoch 28813 loss -0.41888391971588135 LR -0.4495222866535187 LKL

epoch 28898 loss -0.41931799054145813 LR -0.4501868486404419 LKL 0.03086886741220951
epoch 28899 loss -0.30145707726478577 LR -0.3319891095161438 LKL 0.030532026663422585
epoch 28900 loss -0.369981586933136 LR -0.400836706161499 LKL 0.030855128541588783
54
epoch 28901 loss -0.37107226252555847 LR -0.40150362253189087 LKL 0.03043135441839695


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 28902 loss -0.3896120488643646 LR -0.4203561246395111 LKL 0.03074408881366253
epoch 28903 loss -0.46753260493278503 LR -0.4983375370502472 LKL 0.030804922804236412
epoch 28904 loss -0.39561542868614197 LR -0.4262293577194214 LKL 0.03061392903327942
epoch 28905 loss -0.48579904437065125 LR -0.5167100429534912 LKL 0.03091099113225937
epoch 28906 loss -0.3909941613674164 LR -0.4218689203262329 LKL 0.030874768272042274
epoch 28907 loss -0.34959676861763 LR -0.38041791319847107 LKL 0.030821148306131363
epoch 28908 loss -0.4226776659488678 LR -0.45348286628723145 LKL 0.03080519661307335
epoch 28909 loss -0.35292187333106995 LR -0.38358819484710693 LKL 0.030666330829262733
epoch 28910 loss -0.4110509753227234 LR -0.4417792856693268 LKL 0.030728301033377647
epoch 28911 loss -0.3785613179206848 LR -0.40922266244888306 LKL 0.030661338940262794
epoch 28912 loss -0.4272765517234802 LR -0.45808595418930054 LKL 0.03080941177904606
epoch 28913 loss -0.35269588232040405 LR -0.3833999037742615 LK

epoch 28998 loss -0.44972050189971924 LR -0.4806020259857178 LKL 0.030881527811288834
epoch 28999 loss -0.36977753043174744 LR -0.40057122707366943 LKL 0.030793704092502594
epoch 29000 loss -0.38384225964546204 LR -0.4144006669521332 LKL 0.03055841661989689
68
epoch 29001 loss -0.34372854232788086 LR -0.3741726279258728 LKL 0.030444078147411346


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 29002 loss -0.33304867148399353 LR -0.3637513518333435 LKL 0.030702685937285423
epoch 29003 loss -0.36433908343315125 LR -0.3949659764766693 LKL 0.030626900494098663
epoch 29004 loss -0.37151002883911133 LR -0.4021961987018585 LKL 0.030686168000102043
epoch 29005 loss -0.3837697207927704 LR -0.41441816091537476 LKL 0.030648445710539818
epoch 29006 loss -0.427766352891922 LR -0.45854616165161133 LKL 0.03077981248497963
epoch 29007 loss -0.3669489622116089 LR -0.39772775769233704 LKL 0.030778788030147552
epoch 29008 loss -0.3603900969028473 LR -0.39096975326538086 LKL 0.030579645186662674
epoch 29009 loss -0.36141350865364075 LR -0.39192697405815125 LKL 0.0305134579539299
epoch 29010 loss -0.393925279378891 LR -0.42467570304870605 LKL 0.03075043112039566
epoch 29011 loss -0.3451095223426819 LR -0.37588316202163696 LKL 0.03077363595366478
epoch 29012 loss -0.44783729314804077 LR -0.4785953462123871 LKL 0.03075805865228176
epoch 29013 loss -0.40783125162124634 LR -0.4384753704071045 

epoch 29098 loss -0.40108123421669006 LR -0.4317227900028229 LKL 0.030641552060842514
epoch 29099 loss -0.37542983889579773 LR -0.40614527463912964 LKL 0.030715439468622208
epoch 29100 loss -0.3788567781448364 LR -0.40946164727211 LKL 0.03060486540198326
73
epoch 29101 loss -0.41235435009002686 LR -0.4430946111679077 LKL 0.03074025548994541


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 29102 loss -0.3691575527191162 LR -0.3997874855995178 LKL 0.030629929155111313
epoch 29103 loss -0.42527660727500916 LR -0.45608970522880554 LKL 0.03081309236586094
epoch 29104 loss -0.454151451587677 LR -0.4850251376628876 LKL 0.030873700976371765
epoch 29105 loss -0.3674811124801636 LR -0.3983854651451111 LKL 0.030904360115528107
epoch 29106 loss -0.4253743886947632 LR -0.45613813400268555 LKL 0.030763749033212662
epoch 29107 loss -0.3285518288612366 LR -0.3590081036090851 LKL 0.030456284061074257
epoch 29108 loss -0.37057098746299744 LR -0.40120190382003784 LKL 0.0306309275329113
epoch 29109 loss -0.439557820558548 LR -0.4701900780200958 LKL 0.030632244423031807
epoch 29110 loss -0.45170819759368896 LR -0.4825124144554138 LKL 0.030804218724370003
epoch 29111 loss -0.39332374930381775 LR -0.424041211605072 LKL 0.030717462301254272
epoch 29112 loss -0.4265816807746887 LR -0.45727846026420593 LKL 0.030696792528033257
epoch 29113 loss -0.37653228640556335 LR -0.4072774648666382 LK

epoch 29198 loss -0.3945043087005615 LR -0.4253208041191101 LKL 0.030816487967967987
epoch 29199 loss -0.45987996459007263 LR -0.49100834131240845 LKL 0.03112836740911007
epoch 29200 loss -0.3852401077747345 LR -0.416040301322937 LKL 0.03080020658671856
47
epoch 29201 loss -0.3605121970176697 LR -0.3911963701248169 LKL 0.030684176832437515


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 29202 loss -0.3471553325653076 LR -0.3778395652770996 LKL 0.030684225261211395
epoch 29203 loss -0.3837144672870636 LR -0.4147059917449951 LKL 0.030991535633802414
epoch 29204 loss -0.47211769223213196 LR -0.5030652284622192 LKL 0.03094753623008728
epoch 29205 loss -0.39608022570610046 LR -0.4269508719444275 LKL 0.03087064065039158
epoch 29206 loss -0.35998788475990295 LR -0.3906412422657013 LKL 0.03065336123108864
epoch 29207 loss -0.4568149447441101 LR -0.48790276050567627 LKL 0.03108782321214676
epoch 29208 loss -0.32579898834228516 LR -0.356437087059021 LKL 0.030638093128800392
epoch 29209 loss -0.35887157917022705 LR -0.3896082937717438 LKL 0.030736707150936127
epoch 29210 loss -0.437331885099411 LR -0.46823495626449585 LKL 0.030903061851859093
epoch 29211 loss -0.3434727191925049 LR -0.3740975260734558 LKL 0.03062482178211212
epoch 29212 loss -0.46439746022224426 LR -0.4954086244106293 LKL 0.03101116232573986
epoch 29213 loss -0.4152819514274597 LR -0.44594162702560425 LKL 

epoch 29300 loss -0.3858844041824341 LR -0.4167691767215729 LKL 0.03088478557765484
42


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 29301 loss -0.41825228929519653 LR -0.4490281343460083 LKL 0.030775830149650574
epoch 29302 loss -0.401070237159729 LR -0.43186354637145996 LKL 0.03079330176115036
epoch 29303 loss -0.4156513810157776 LR -0.4463544189929962 LKL 0.030703026801347733
epoch 29304 loss -0.3764236569404602 LR -0.4071539640426636 LKL 0.030730318278074265
epoch 29305 loss -0.38490182161331177 LR -0.4156433045864105 LKL 0.030741477385163307
epoch 29306 loss -0.3895512819290161 LR -0.4203766882419586 LKL 0.03082541562616825
epoch 29307 loss -0.3283608853816986 LR -0.35899248719215393 LKL 0.030631596222519875
epoch 29308 loss -0.30643710494041443 LR -0.3372719883918762 LKL 0.030834879726171494
epoch 29309 loss -0.36345377564430237 LR -0.39415982365608215 LKL 0.030706048011779785
epoch 29310 loss -0.3959910273551941 LR -0.4268985688686371 LKL 0.030907539650797844
epoch 29311 loss -0.3971831202507019 LR -0.4278283715248108 LKL 0.030645247548818588
epoch 29312 loss -0.3389005661010742 LR -0.3696351647377014 L

epoch 29397 loss -0.402532160282135 LR -0.43331313133239746 LKL 0.0307809729129076
epoch 29398 loss -0.32744696736335754 LR -0.3582114577293396 LKL 0.030764499679207802
epoch 29399 loss -0.36103999614715576 LR -0.39204898476600647 LKL 0.031009001657366753
epoch 29400 loss -0.4210335612297058 LR -0.4519581198692322 LKL 0.03092454932630062
80


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 29401 loss -0.3747241795063019 LR -0.405506432056427 LKL 0.03078225441277027
epoch 29402 loss -0.43799176812171936 LR -0.46899300813674927 LKL 0.031001253053545952
epoch 29403 loss -0.3915541172027588 LR -0.42258715629577637 LKL 0.03103303164243698
epoch 29404 loss -0.4140373170375824 LR -0.44520580768585205 LKL 0.03116847760975361
epoch 29405 loss -0.39282891154289246 LR -0.4236128032207489 LKL 0.0307838786393404
epoch 29406 loss -0.3782171905040741 LR -0.4092963933944702 LKL 0.031079202890396118
epoch 29407 loss -0.44498497247695923 LR -0.47596311569213867 LKL 0.03097815439105034
epoch 29408 loss -0.37308594584465027 LR -0.4038226008415222 LKL 0.030736660584807396
epoch 29409 loss -0.3912953734397888 LR -0.422203004360199 LKL 0.03090761788189411
epoch 29410 loss -0.42795827984809875 LR -0.45919355750083923 LKL 0.031235285103321075
epoch 29411 loss -0.40413233637809753 LR -0.43505212664604187 LKL 0.03091980330646038
epoch 29412 loss -0.39543765783309937 LR -0.42637282609939575 L

epoch 29500 loss -0.4270601272583008 LR -0.4578356146812439 LKL 0.030775485560297966
95
epoch 29501 loss -0.33843371272087097 LR -0.36933720111846924 LKL 0.030903493985533714


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 29502 loss -0.4100513458251953 LR -0.44110825657844543 LKL 0.031056907027959824
epoch 29503 loss -0.38957077264785767 LR -0.42037636041641235 LKL 0.030805574730038643
epoch 29504 loss -0.3874850273132324 LR -0.41852107644081116 LKL 0.03103603795170784
epoch 29505 loss -0.4510936737060547 LR -0.4820844233036041 LKL 0.030990738421678543
epoch 29506 loss -0.39233753085136414 LR -0.4234022796154022 LKL 0.031064758077263832
epoch 29507 loss -0.39491206407546997 LR -0.42563480138778687 LKL 0.03072272427380085
epoch 29508 loss -0.35792139172554016 LR -0.3889371454715729 LKL 0.03101574257016182
epoch 29509 loss -0.3395226299762726 LR -0.3703215420246124 LKL 0.03079892508685589
epoch 29510 loss -0.40499237179756165 LR -0.43579310178756714 LKL 0.03080073371529579
epoch 29511 loss -0.4367743730545044 LR -0.46773266792297363 LKL 0.03095829300582409
epoch 29512 loss -0.43274611234664917 LR -0.4637254476547241 LKL 0.030979342758655548
epoch 29513 loss -0.4083537459373474 LR -0.4395667314529419

epoch 29599 loss -0.38784924149513245 LR -0.41864678263664246 LKL 0.03079753741621971
epoch 29600 loss -0.46451297402381897 LR -0.4956340789794922 LKL 0.031121091917157173
42


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 29601 loss -0.39717674255371094 LR -0.4280548095703125 LKL 0.030878055840730667
epoch 29602 loss -0.4684988558292389 LR -0.4995541572570801 LKL 0.031055288389325142
epoch 29603 loss -0.389455646276474 LR -0.420367568731308 LKL 0.03091193176805973
epoch 29604 loss -0.4322362244129181 LR -0.4632880687713623 LKL 0.031051848083734512
epoch 29605 loss -0.3639181852340698 LR -0.3948826491832733 LKL 0.030964473262429237
epoch 29606 loss -0.3709832727909088 LR -0.40196937322616577 LKL 0.030986113473773003
epoch 29607 loss -0.43748489022254944 LR -0.4685858488082886 LKL 0.031100967898964882
epoch 29608 loss -0.4243707060813904 LR -0.45532065629959106 LKL 0.030949950218200684
epoch 29609 loss -0.4057174026966095 LR -0.4368341863155365 LKL 0.03111678920686245
epoch 29610 loss -0.4115256369113922 LR -0.4423995316028595 LKL 0.03087390586733818
epoch 29611 loss -0.38490936160087585 LR -0.4158003330230713 LKL 0.03089096024632454
epoch 29612 loss -0.3319956958293915 LR -0.3627210557460785 LKL 0.

epoch 29697 loss -0.39023903012275696 LR -0.4210471212863922 LKL 0.030808094888925552
epoch 29698 loss -0.3721674680709839 LR -0.40288734436035156 LKL 0.030719880014657974
epoch 29699 loss -0.38973361253738403 LR -0.42064520716667175 LKL 0.03091159462928772
epoch 29700 loss -0.3317718207836151 LR -0.36246442794799805 LKL 0.030692603439092636
67


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 29701 loss -0.4361090660095215 LR -0.46714529395103455 LKL 0.03103623352944851
epoch 29702 loss -0.4072835445404053 LR -0.438274621963501 LKL 0.03099106438457966
epoch 29703 loss -0.39552971720695496 LR -0.42650604248046875 LKL 0.03097633644938469
epoch 29704 loss -0.3508795499801636 LR -0.38178953528404236 LKL 0.030909990891814232
epoch 29705 loss -0.3740294575691223 LR -0.4046545624732971 LKL 0.03062509000301361
epoch 29706 loss -0.38126370310783386 LR -0.41189366579055786 LKL 0.030629962682724
epoch 29707 loss -0.3956609070301056 LR -0.4264580309391022 LKL 0.030797110870480537
epoch 29708 loss -0.3559122383594513 LR -0.3866786062717438 LKL 0.03076636604964733
epoch 29709 loss -0.42761778831481934 LR -0.4585067331790924 LKL 0.030888954177498817
epoch 29710 loss -0.3654565215110779 LR -0.3962564468383789 LKL 0.030799925327301025
epoch 29711 loss -0.4558544158935547 LR -0.4869307279586792 LKL 0.03107631579041481
epoch 29712 loss -0.3955542743206024 LR -0.42654526233673096 LKL 0.0

epoch 29799 loss -0.4031842350959778 LR -0.43433988094329834 LKL 0.031155630946159363
epoch 29800 loss -0.3541530668735504 LR -0.3850482702255249 LKL 0.030895205214619637
83
epoch 29801 loss -0.35165077447891235 LR -0.38266652822494507 LKL 0.031015751883387566


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 29802 loss -0.34654802083969116 LR -0.37732139229774475 LKL 0.030773360282182693
epoch 29803 loss -0.4257723391056061 LR -0.45674484968185425 LKL 0.030972512438893318
epoch 29804 loss -0.3902091383934021 LR -0.4211589992046356 LKL 0.030949870124459267
epoch 29805 loss -0.40072527527809143 LR -0.4317024350166321 LKL 0.030977146700024605
epoch 29806 loss -0.39287200570106506 LR -0.4237983822822571 LKL 0.030926378443837166
epoch 29807 loss -0.409249871969223 LR -0.4402170181274414 LKL 0.03096715547144413
epoch 29808 loss -0.3630637228488922 LR -0.39386168122291565 LKL 0.030797963961958885
epoch 29809 loss -0.4241037368774414 LR -0.4550909996032715 LKL 0.030987247824668884
epoch 29810 loss -0.42682841420173645 LR -0.45789527893066406 LKL 0.031066864728927612
epoch 29811 loss -0.4042002558708191 LR -0.43526965379714966 LKL 0.03106938861310482
epoch 29812 loss -0.47245126962661743 LR -0.5035175681114197 LKL 0.031066305935382843
epoch 29813 loss -0.33059629797935486 LR -0.36138278245925

epoch 29898 loss -0.3100574016571045 LR -0.3407505750656128 LKL 0.030693164095282555
epoch 29899 loss -0.4599478542804718 LR -0.49098220467567444 LKL 0.03103434294462204
epoch 29900 loss -0.40898311138153076 LR -0.44004738330841064 LKL 0.03106427565217018
76
epoch 29901 loss -0.32381558418273926 LR -0.3547601103782654 LKL 0.03094451315701008


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 29902 loss -0.39720577001571655 LR -0.428151398897171 LKL 0.030945615842938423
epoch 29903 loss -0.36620238423347473 LR -0.3969120979309082 LKL 0.030709708109498024
epoch 29904 loss -0.38939663767814636 LR -0.42023831605911255 LKL 0.030841682106256485
epoch 29905 loss -0.38700732588768005 LR -0.4179043769836426 LKL 0.030897054821252823
epoch 29906 loss -0.44867053627967834 LR -0.47978097200393677 LKL 0.031110424548387527
epoch 29907 loss -0.3641100227832794 LR -0.39495939016342163 LKL 0.030849363654851913
epoch 29908 loss -0.37961897253990173 LR -0.4106161594390869 LKL 0.03099718876183033
epoch 29909 loss -0.3808438777923584 LR -0.4115864336490631 LKL 0.030742544680833817
epoch 29910 loss -0.39439502358436584 LR -0.4253646731376648 LKL 0.030969662591814995
epoch 29911 loss -0.4293239414691925 LR -0.4599735736846924 LKL 0.03064962476491928
epoch 29912 loss -0.3423708379268646 LR -0.3730337917804718 LKL 0.03066294640302658
epoch 29913 loss -0.35387513041496277 LR -0.384721100330352

epoch 29998 loss -0.34962597489356995 LR -0.380474716424942 LKL 0.030848734080791473
epoch 29999 loss -0.42308947443962097 LR -0.45405447483062744 LKL 0.030964991077780724
epoch 30000 loss -0.42442265152931213 LR -0.4555148184299469 LKL 0.031092161312699318
86
epoch 30001 loss -0.3958035707473755 LR -0.42669692635536194 LKL 0.030893364921212196


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 30002 loss -0.40140682458877563 LR -0.4324570298194885 LKL 0.031050199642777443
epoch 30003 loss -0.38024285435676575 LR -0.41123175621032715 LKL 0.030988898128271103
epoch 30004 loss -0.3399749994277954 LR -0.3709886074066162 LKL 0.03101360984146595
epoch 30005 loss -0.43685808777809143 LR -0.46777594089508057 LKL 0.030917854979634285
epoch 30006 loss -0.43265578150749207 LR -0.46351391077041626 LKL 0.0308581180870533
epoch 30007 loss -0.393766850233078 LR -0.4246363043785095 LKL 0.030869457870721817
epoch 30008 loss -0.3296739161014557 LR -0.3605233132839203 LKL 0.0308493971824646
epoch 30009 loss -0.4394485354423523 LR -0.4704514741897583 LKL 0.03100292943418026
epoch 30010 loss -0.4254721403121948 LR -0.45638102293014526 LKL 0.03090888448059559
epoch 30011 loss -0.41922250390052795 LR -0.4503992795944214 LKL 0.031176766380667686
epoch 30012 loss -0.3291718661785126 LR -0.35993123054504395 LKL 0.030759353190660477
epoch 30013 loss -0.38374826312065125 LR -0.41481342911720276 L

epoch 30098 loss -0.4124561548233032 LR -0.44344642758369446 LKL 0.03099025972187519
epoch 30099 loss -0.4166406989097595 LR -0.447592556476593 LKL 0.030951859429478645
epoch 30100 loss -0.4105415940284729 LR -0.4416739046573639 LKL 0.031132305040955544
57
epoch 30101 loss -0.47026708722114563 LR -0.5013929009437561 LKL 0.03112581931054592


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 30102 loss -0.3280702829360962 LR -0.358878493309021 LKL 0.03080822341144085
epoch 30103 loss -0.35212695598602295 LR -0.3829719126224518 LKL 0.030844952911138535
epoch 30104 loss -0.3960537016391754 LR -0.42692601680755615 LKL 0.030872317031025887
epoch 30105 loss -0.3635875880718231 LR -0.39446118474006653 LKL 0.03087359108030796
epoch 30106 loss -0.3944427967071533 LR -0.4253925681114197 LKL 0.030949784442782402
epoch 30107 loss -0.453561395406723 LR -0.4844781756401062 LKL 0.030916783958673477
epoch 30108 loss -0.3870934545993805 LR -0.41794687509536743 LKL 0.030853422358632088
epoch 30109 loss -0.39994949102401733 LR -0.43094316124916077 LKL 0.03099367395043373
epoch 30110 loss -0.39778968691825867 LR -0.4289502501487732 LKL 0.031160566955804825
epoch 30111 loss -0.4117347002029419 LR -0.44260749220848083 LKL 0.030872803181409836
epoch 30112 loss -0.34437280893325806 LR -0.37520724534988403 LKL 0.030834445729851723
epoch 30113 loss -0.506000280380249 LR -0.5372483730316162 L

36
epoch 30201 loss -0.4000178575515747 LR -0.431166410446167 LKL 0.03114856407046318


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 30202 loss -0.41317012906074524 LR -0.4444441795349121 LKL 0.03127404674887657
epoch 30203 loss -0.41403794288635254 LR -0.44518038630485535 LKL 0.031142450869083405
epoch 30204 loss -0.4094369411468506 LR -0.4406191408634186 LKL 0.031182201579213142
epoch 30205 loss -0.39975228905677795 LR -0.43075260519981384 LKL 0.031000325456261635
epoch 30206 loss -0.38655710220336914 LR -0.4176188111305237 LKL 0.03106171265244484
epoch 30207 loss -0.35902485251426697 LR -0.3899174928665161 LKL 0.03089265339076519
epoch 30208 loss -0.3234770894050598 LR -0.3542928695678711 LKL 0.030815767124295235
epoch 30209 loss -0.41693541407585144 LR -0.44783127307891846 LKL 0.030895862728357315
epoch 30210 loss -0.3592848777770996 LR -0.3903147578239441 LKL 0.03102988936007023
epoch 30211 loss -0.3415161371231079 LR -0.372507244348526 LKL 0.030991120263934135
epoch 30212 loss -0.35519981384277344 LR -0.386199414730072 LKL 0.03099958784878254
epoch 30213 loss -0.4134267270565033 LR -0.4442625343799591 LK

65
epoch 30301 loss -0.4053638279438019 LR -0.4365348815917969 LKL 0.03117106296122074


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 30302 loss -0.3731982111930847 LR -0.40442895889282227 LKL 0.031230749562382698
epoch 30303 loss -0.4410560727119446 LR -0.4722174406051636 LKL 0.03116137906908989
epoch 30304 loss -0.4206944406032562 LR -0.45180994272232056 LKL 0.03111550770699978
epoch 30305 loss -0.35241398215293884 LR -0.38324594497680664 LKL 0.030831966549158096
epoch 30306 loss -0.34925195574760437 LR -0.380199670791626 LKL 0.030947726219892502
epoch 30307 loss -0.37549465894699097 LR -0.4067239761352539 LKL 0.031229304149746895
epoch 30308 loss -0.4472225308418274 LR -0.4782744348049164 LKL 0.03105189837515354
epoch 30309 loss -0.41626298427581787 LR -0.44725650548934937 LKL 0.03099353052675724
epoch 30310 loss -0.29023608565330505 LR -0.32110604643821716 LKL 0.030869951471686363
epoch 30311 loss -0.3638422191143036 LR -0.39454948902130127 LKL 0.030707281082868576
epoch 30312 loss -0.4138515591621399 LR -0.44479817152023315 LKL 0.03094661608338356
epoch 30313 loss -0.43328264355659485 LR -0.464477986097335

epoch 30400 loss -0.3884633779525757 LR -0.41957205533981323 LKL 0.031108662486076355
39


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 30401 loss -0.42771509289741516 LR -0.4589080810546875 LKL 0.031192978844046593
epoch 30402 loss -0.45565715432167053 LR -0.4868222177028656 LKL 0.03116505779325962
epoch 30403 loss -0.3472617268562317 LR -0.37829825282096863 LKL 0.031036514788866043
epoch 30404 loss -0.42249035835266113 LR -0.45354875922203064 LKL 0.0310584157705307
epoch 30405 loss -0.4274015426635742 LR -0.4584019184112549 LKL 0.03100036270916462
epoch 30406 loss -0.37931177020072937 LR -0.4103439748287201 LKL 0.03103220835328102
epoch 30407 loss -0.39020729064941406 LR -0.42141228914260864 LKL 0.031205011531710625
epoch 30408 loss -0.40572062134742737 LR -0.4369205832481384 LKL 0.031199965626001358
epoch 30409 loss -0.4383958578109741 LR -0.4695308208465576 LKL 0.031134970486164093
epoch 30410 loss -0.37669825553894043 LR -0.4077664613723755 LKL 0.031068220734596252
epoch 30411 loss -0.4528125822544098 LR -0.484089732170105 LKL 0.03127715364098549
epoch 30412 loss -0.47597572207450867 LR -0.5072836875915527 L

epoch 30497 loss -0.4544108510017395 LR -0.4857882857322693 LKL 0.03137742355465889
epoch 30498 loss -0.4371229112148285 LR -0.4683542251586914 LKL 0.031231315806508064
epoch 30499 loss -0.37934261560440063 LR -0.4103151261806488 LKL 0.030972497537732124
epoch 30500 loss -0.4151586890220642 LR -0.4463465213775635 LKL 0.03118784725666046
54


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 30501 loss -0.4186899960041046 LR -0.44971954822540283 LKL 0.031029554083943367
epoch 30502 loss -0.46327492594718933 LR -0.4944748878479004 LKL 0.031199954450130463
epoch 30503 loss -0.3426809310913086 LR -0.3738422393798828 LKL 0.031161297112703323
epoch 30504 loss -0.4613039791584015 LR -0.49243223667144775 LKL 0.03112824447453022
epoch 30505 loss -0.41207355260849 LR -0.4431125819683075 LKL 0.03103901818394661
epoch 30506 loss -0.40998804569244385 LR -0.441206693649292 LKL 0.03121866099536419
epoch 30507 loss -0.3062555491924286 LR -0.3373035788536072 LKL 0.031048022210597992
epoch 30508 loss -0.38283807039260864 LR -0.4141254425048828 LKL 0.031287383288145065
epoch 30509 loss -0.4232324957847595 LR -0.4544984996318817 LKL 0.031266000121831894
epoch 30510 loss -0.3742033541202545 LR -0.4051045775413513 LKL 0.030901214107871056
epoch 30511 loss -0.41204774379730225 LR -0.44320982694625854 LKL 0.031162090599536896
epoch 30512 loss -0.34319424629211426 LR -0.37431710958480835 LK

epoch 30600 loss -0.37009909749031067 LR -0.401002436876297 LKL 0.030903352424502373
80


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 30601 loss -0.4435664117336273 LR -0.4748063087463379 LKL 0.031239893287420273
epoch 30602 loss -0.4221920073032379 LR -0.4531879425048828 LKL 0.030995937064290047
epoch 30603 loss -0.4103860855102539 LR -0.4413643777370453 LKL 0.030978277325630188
epoch 30604 loss -0.4655866324901581 LR -0.4966791868209839 LKL 0.031092558056116104
epoch 30605 loss -0.3346835672855377 LR -0.3657470941543579 LKL 0.031063513830304146
epoch 30606 loss -0.42812636494636536 LR -0.45929160714149475 LKL 0.03116525337100029
epoch 30607 loss -0.46140730381011963 LR -0.49270445108413696 LKL 0.031297147274017334
epoch 30608 loss -0.3851373791694641 LR -0.41609394550323486 LKL 0.030956555157899857
epoch 30609 loss -0.4384443759918213 LR -0.46974509954452515 LKL 0.03130073472857475
epoch 30610 loss -0.40836334228515625 LR -0.43932807445526123 LKL 0.030964728444814682
epoch 30611 loss -0.388309121131897 LR -0.4194753170013428 LKL 0.0311661995947361
epoch 30612 loss -0.3915766179561615 LR -0.42259782552719116 L

epoch 30700 loss -0.3994901776313782 LR -0.43078622221946716 LKL 0.03129603713750839
79


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 30701 loss -0.32721543312072754 LR -0.35818415880203247 LKL 0.03096873313188553
epoch 30702 loss -0.3390749990940094 LR -0.3699258863925934 LKL 0.03085089661180973
epoch 30703 loss -0.4267878532409668 LR -0.4582056701183319 LKL 0.03141782432794571
epoch 30704 loss -0.3924057185649872 LR -0.4234776198863983 LKL 0.031071899458765984
epoch 30705 loss -0.3770414888858795 LR -0.4081248641014099 LKL 0.031083369627594948
epoch 30706 loss -0.3849557638168335 LR -0.4161358177661896 LKL 0.031180063262581825
epoch 30707 loss -0.3945987820625305 LR -0.425594687461853 LKL 0.030995910987257957
epoch 30708 loss -0.35437658429145813 LR -0.38527506589889526 LKL 0.03089846856892109
epoch 30709 loss -0.43841058015823364 LR -0.46957579255104065 LKL 0.03116520307958126
epoch 30710 loss -0.4479029178619385 LR -0.47906991839408875 LKL 0.031166985630989075
epoch 30711 loss -0.45731544494628906 LR -0.48873987793922424 LKL 0.03142442926764488
epoch 30712 loss -0.389880895614624 LR -0.42092567682266235 LKL

epoch 30797 loss -0.36937305331230164 LR -0.4005484879016876 LKL 0.03117542527616024
epoch 30798 loss -0.3877599239349365 LR -0.41874754428863525 LKL 0.030987616628408432
epoch 30799 loss -0.46603864431381226 LR -0.4972275495529175 LKL 0.031188905239105225
epoch 30800 loss -0.4495287835597992 LR -0.4805910885334015 LKL 0.03106231614947319
122
epoch 30801 loss -0.3304257094860077 LR -0.36141425371170044 LKL 0.030988536775112152


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 30802 loss -0.4061639904975891 LR -0.43738457560539246 LKL 0.0312205757945776
epoch 30803 loss -0.4155982434749603 LR -0.44687461853027344 LKL 0.03127637505531311
epoch 30804 loss -0.4700605869293213 LR -0.5013898611068726 LKL 0.03132928907871246
epoch 30805 loss -0.4220277965068817 LR -0.4531349539756775 LKL 0.031107168644666672
epoch 30806 loss -0.43652617931365967 LR -0.46778035163879395 LKL 0.03125418350100517
epoch 30807 loss -0.48319581151008606 LR -0.5143784284591675 LKL 0.031182607635855675
epoch 30808 loss -0.3052580654621124 LR -0.3363752067089081 LKL 0.031117135658860207
epoch 30809 loss -0.3545941710472107 LR -0.3856621980667114 LKL 0.031068023294210434
epoch 30810 loss -0.4075884222984314 LR -0.43895649909973145 LKL 0.03136807307600975
epoch 30811 loss -0.4194074273109436 LR -0.4508073329925537 LKL 0.031399913132190704
epoch 30812 loss -0.3930768370628357 LR -0.42433005571365356 LKL 0.03125321492552757
epoch 30813 loss -0.41925814747810364 LR -0.45027098059654236 LKL

epoch 30900 loss -0.42740803956985474 LR -0.458578884601593 LKL 0.031170830130577087
55


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 30901 loss -0.28797250986099243 LR -0.31877440214157104 LKL 0.03080190345644951
epoch 30902 loss -0.43758538365364075 LR -0.4687299132347107 LKL 0.03114454261958599
epoch 30903 loss -0.4872850775718689 LR -0.5185745358467102 LKL 0.03128945827484131
epoch 30904 loss -0.4169590175151825 LR -0.44829627871513367 LKL 0.03133725747466087
epoch 30905 loss -0.37717968225479126 LR -0.4087381362915039 LKL 0.03155844658613205
epoch 30906 loss -0.37870413064956665 LR -0.40983763337135315 LKL 0.03113350085914135
epoch 30907 loss -0.4307881295681 LR -0.4618018865585327 LKL 0.03101375512778759
epoch 30908 loss -0.3716675937175751 LR -0.4027749001979828 LKL 0.031107310205698013
epoch 30909 loss -0.4165498912334442 LR -0.4479276239871979 LKL 0.03137774020433426
epoch 30910 loss -0.43670856952667236 LR -0.46782535314559937 LKL 0.031116798520088196
epoch 30911 loss -0.4176878333091736 LR -0.44881337881088257 LKL 0.031125551089644432
epoch 30912 loss -0.4320972263813019 LR -0.4633435904979706 LKL 0.

epoch 30998 loss -0.46213120222091675 LR -0.4932301938533783 LKL 0.03109898418188095
epoch 30999 loss -0.4511861205101013 LR -0.48252302408218384 LKL 0.03133689984679222
epoch 31000 loss -0.4169459342956543 LR -0.4483102262020111 LKL 0.03136430308222771
56
epoch 31001 loss -0.39025750756263733 LR -0.42151838541030884 LKL 0.03126086667180061


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 31002 loss -0.3911031484603882 LR -0.4222417175769806 LKL 0.03113856352865696
epoch 31003 loss -0.3739178478717804 LR -0.40498268604278564 LKL 0.03106483444571495
epoch 31004 loss -0.37270137667655945 LR -0.40393972396850586 LKL 0.03123834729194641
epoch 31005 loss -0.35441717505455017 LR -0.385424941778183 LKL 0.031007766723632812
epoch 31006 loss -0.40217408537864685 LR -0.43338239192962646 LKL 0.031208297237753868
epoch 31007 loss -0.378582626581192 LR -0.40973758697509766 LKL 0.03115496225655079
epoch 31008 loss -0.38443827629089355 LR -0.415565550327301 LKL 0.03112727217376232
epoch 31009 loss -0.44002971053123474 LR -0.47127920389175415 LKL 0.031249502673745155
epoch 31010 loss -0.36239373683929443 LR -0.39359888434410095 LKL 0.031205160543322563
epoch 31011 loss -0.4356890022754669 LR -0.46697598695755005 LKL 0.03128697723150253
epoch 31012 loss -0.3824370503425598 LR -0.4135638177394867 LKL 0.031126756221055984
epoch 31013 loss -0.4318712651729584 LR -0.4630206823348999 L

73
epoch 31101 loss -0.419057160615921 LR -0.45051485300064087 LKL 0.031457699835300446


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 31102 loss -0.517494261264801 LR -0.5488749146461487 LKL 0.03138064220547676
epoch 31103 loss -0.37482309341430664 LR -0.40583348274230957 LKL 0.03101038746535778
epoch 31104 loss -0.4196075201034546 LR -0.4507265090942383 LKL 0.03111898899078369
epoch 31105 loss -0.43176013231277466 LR -0.46298128366470337 LKL 0.031221145763993263
epoch 31106 loss -0.4012954831123352 LR -0.43264129757881165 LKL 0.031345807015895844
epoch 31107 loss -0.43717509508132935 LR -0.46848058700561523 LKL 0.03130548447370529
epoch 31108 loss -0.4187227487564087 LR -0.45005032420158386 LKL 0.031327590346336365
epoch 31109 loss -0.39726176857948303 LR -0.4285171627998352 LKL 0.031255390495061874
epoch 31110 loss -0.44080543518066406 LR -0.47218233346939087 LKL 0.03137689456343651
epoch 31111 loss -0.3787792921066284 LR -0.41007280349731445 LKL 0.03129350394010544
epoch 31112 loss -0.48676803708076477 LR -0.5182239413261414 LKL 0.03145589679479599
epoch 31113 loss -0.3868912160396576 LR -0.4181612432003021 

71
epoch 31201 loss -0.3733881413936615 LR -0.4046589732170105 LKL 0.0312708355486393


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 31202 loss -0.3812602758407593 LR -0.4124979078769684 LKL 0.031237643212080002
epoch 31203 loss -0.44034692645072937 LR -0.47156277298927307 LKL 0.031215839087963104
epoch 31204 loss -0.40204307436943054 LR -0.43331295251846313 LKL 0.03126988932490349
epoch 31205 loss -0.43586039543151855 LR -0.4669383466243744 LKL 0.031077954918146133
epoch 31206 loss -0.4059222936630249 LR -0.4370301067829132 LKL 0.031107818707823753
epoch 31207 loss -0.4073728024959564 LR -0.4385847747325897 LKL 0.031211981549859047
epoch 31208 loss -0.36300671100616455 LR -0.3942938446998596 LKL 0.03128713741898537
epoch 31209 loss -0.32853397727012634 LR -0.3598286509513855 LKL 0.031294673681259155
epoch 31210 loss -0.39247313141822815 LR -0.4238068759441376 LKL 0.03133373707532883
epoch 31211 loss -0.3878370225429535 LR -0.41904282569885254 LKL 0.031205816194415092
epoch 31212 loss -0.4458644986152649 LR -0.4771849513053894 LKL 0.03132043778896332
epoch 31213 loss -0.44867822527885437 LR -0.4800305068492889

74
epoch 31301 loss -0.3786764144897461 LR -0.4099108576774597 LKL 0.03123443014919758


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 31302 loss -0.45941489934921265 LR -0.4910062551498413 LKL 0.031591352075338364
epoch 31303 loss -0.31690526008605957 LR -0.3479306101799011 LKL 0.03102535754442215
epoch 31304 loss -0.3626767098903656 LR -0.3939017355442047 LKL 0.031225036829710007
epoch 31305 loss -0.37005266547203064 LR -0.40119755268096924 LKL 0.031144889071583748
epoch 31306 loss -0.37253040075302124 LR -0.4038458466529846 LKL 0.03131544589996338
epoch 31307 loss -0.35326457023620605 LR -0.3844261169433594 LKL 0.031161531805992126
epoch 31308 loss -0.42444881796836853 LR -0.45552536845207214 LKL 0.031076541170477867
epoch 31309 loss -0.36333319544792175 LR -0.39450839161872864 LKL 0.03117518685758114
epoch 31310 loss -0.38538047671318054 LR -0.4164876937866211 LKL 0.031107222661376
epoch 31311 loss -0.3850441873073578 LR -0.4160933196544647 LKL 0.03104911930859089
epoch 31312 loss -0.3809521198272705 LR -0.41192173957824707 LKL 0.030969634652137756
epoch 31313 loss -0.3684462606906891 LR -0.3995019793510437 

epoch 31400 loss -0.3615554869174957 LR -0.3925837278366089 LKL 0.03102824091911316
98
epoch 31401 loss -0.41702842712402344 LR -0.44826406240463257 LKL 0.031235625967383385


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 31402 loss -0.42545169591903687 LR -0.4569113254547119 LKL 0.03145961835980415
epoch 31403 loss -0.4185405373573303 LR -0.4497382342815399 LKL 0.0311976820230484
epoch 31404 loss -0.3903474509716034 LR -0.42158037424087524 LKL 0.031232932582497597
epoch 31405 loss -0.40268176794052124 LR -0.433993935585022 LKL 0.031312182545661926
epoch 31406 loss -0.4554658532142639 LR -0.48683005571365356 LKL 0.03136420622467995
epoch 31407 loss -0.43903401494026184 LR -0.4704374670982361 LKL 0.03140345215797424
epoch 31408 loss -0.42283061146736145 LR -0.45419541001319885 LKL 0.031364791095256805
epoch 31409 loss -0.4229583144187927 LR -0.4541762173175812 LKL 0.031217914074659348
epoch 31410 loss -0.4623968303203583 LR -0.4938216507434845 LKL 0.031424831598997116
epoch 31411 loss -0.39379459619522095 LR -0.4250814914703369 LKL 0.03128691017627716
epoch 31412 loss -0.4115462899208069 LR -0.44275176525115967 LKL 0.03120548650622368
epoch 31413 loss -0.44576865434646606 LR -0.4771971106529236 LKL

109
epoch 31501 loss -0.4262670874595642 LR -0.4579365849494934 LKL 0.0316694900393486


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 31502 loss -0.36421844363212585 LR -0.3955439329147339 LKL 0.03132549300789833
epoch 31503 loss -0.4489307105541229 LR -0.48039737343788147 LKL 0.03146666660904884
epoch 31504 loss -0.41224104166030884 LR -0.44339704513549805 LKL 0.031155994161963463
epoch 31505 loss -0.34505343437194824 LR -0.3760504722595215 LKL 0.030997024849057198
epoch 31506 loss -0.45698070526123047 LR -0.48838698863983154 LKL 0.03140628710389137
epoch 31507 loss -0.3428581953048706 LR -0.3741205334663391 LKL 0.03126232698559761
epoch 31508 loss -0.40488919615745544 LR -0.43609321117401123 LKL 0.031204020604491234
epoch 31509 loss -0.4351646304130554 LR -0.46661072969436646 LKL 0.03144609183073044
epoch 31510 loss -0.44136106967926025 LR -0.4727151095867157 LKL 0.03135405480861664
epoch 31511 loss -0.5262556672096252 LR -0.5575558543205261 LKL 0.03130018711090088
epoch 31512 loss -0.43347305059432983 LR -0.4648028016090393 LKL 0.03132973611354828
epoch 31513 loss -0.4254574179649353 LR -0.45659351348876953 

61
epoch 31601 loss -0.4059753715991974 LR -0.43745777010917664 LKL 0.03148238733410835


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 31602 loss -0.4209357798099518 LR -0.4522480070590973 LKL 0.031312230974435806
epoch 31603 loss -0.34770554304122925 LR -0.3789081573486328 LKL 0.031202618032693863
epoch 31604 loss -0.38764262199401855 LR -0.4189514219760895 LKL 0.031308792531490326
epoch 31605 loss -0.3646170198917389 LR -0.39570292830467224 LKL 0.031085921451449394
epoch 31606 loss -0.3587337136268616 LR -0.38980457186698914 LKL 0.031070873141288757
epoch 31607 loss -0.4368848204612732 LR -0.46819841861724854 LKL 0.031313613057136536
epoch 31608 loss -0.3510707914829254 LR -0.3822934031486511 LKL 0.031222624704241753
epoch 31609 loss -0.3039571940898895 LR -0.3349635601043701 LKL 0.031006360426545143
epoch 31610 loss -0.37800103425979614 LR -0.409197062253952 LKL 0.031196042895317078
epoch 31611 loss -0.3067304491996765 LR -0.33763962984085083 LKL 0.030909167602658272
epoch 31612 loss -0.34935134649276733 LR -0.38042035698890686 LKL 0.03106902353465557
epoch 31613 loss -0.35354307293891907 LR -0.38465961813926

58
epoch 31701 loss -0.41508781909942627 LR -0.4461827278137207 LKL 0.031094904989004135


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 31702 loss -0.40381377935409546 LR -0.4351710081100464 LKL 0.03135722503066063
epoch 31703 loss -0.39549770951271057 LR -0.4269707202911377 LKL 0.03147300332784653
epoch 31704 loss -0.40379342436790466 LR -0.4351729154586792 LKL 0.03137949854135513
epoch 31705 loss -0.39393243193626404 LR -0.42527180910110474 LKL 0.0313393659889698
epoch 31706 loss -0.42340102791786194 LR -0.4548302888870239 LKL 0.031429268419742584
epoch 31707 loss -0.37344038486480713 LR -0.40473198890686035 LKL 0.031291600316762924
epoch 31708 loss -0.40561163425445557 LR -0.4368579387664795 LKL 0.031246306374669075
epoch 31709 loss -0.3978307843208313 LR -0.42902234196662903 LKL 0.031191561371088028
epoch 31710 loss -0.3939996361732483 LR -0.4253483712673187 LKL 0.03134874999523163
epoch 31711 loss -0.3069016933441162 LR -0.3380894958972931 LKL 0.031187808141112328
epoch 31712 loss -0.4044996500015259 LR -0.43580853939056396 LKL 0.031308893114328384
epoch 31713 loss -0.3686144948005676 LR -0.3998035192489624 

epoch 31799 loss -0.41362813115119934 LR -0.4451234042644501 LKL 0.031495269387960434
epoch 31800 loss -0.41344061493873596 LR -0.44493427872657776 LKL 0.031493667513132095
56
epoch 31801 loss -0.36157241463661194 LR -0.39284953474998474 LKL 0.031277112662792206


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 31802 loss -0.34533724188804626 LR -0.37671053409576416 LKL 0.031373292207717896
epoch 31803 loss -0.40234073996543884 LR -0.43384799361228943 LKL 0.03150726482272148
epoch 31804 loss -0.3524976074695587 LR -0.3837915360927582 LKL 0.03129393234848976
epoch 31805 loss -0.44333207607269287 LR -0.4747229814529419 LKL 0.03139089420437813
epoch 31806 loss -0.3752900958061218 LR -0.4066053032875061 LKL 0.03131521865725517
epoch 31807 loss -0.35138970613479614 LR -0.3826485872268677 LKL 0.03125886991620064
epoch 31808 loss -0.4278309941291809 LR -0.45905473828315735 LKL 0.031223731115460396
epoch 31809 loss -0.3834977447986603 LR -0.4149700105190277 LKL 0.03147226572036743
epoch 31810 loss -0.4062623381614685 LR -0.4376237094402313 LKL 0.03136138245463371
epoch 31811 loss -0.40836963057518005 LR -0.4395367205142975 LKL 0.031167080625891685
epoch 31812 loss -0.4107505977153778 LR -0.44200655817985535 LKL 0.03125595673918724
epoch 31813 loss -0.4318540692329407 LR -0.46338701248168945 LKL

67
epoch 31901 loss -0.3628371059894562 LR -0.3941478133201599 LKL 0.03131071478128433


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 31902 loss -0.4599760174751282 LR -0.4913863241672516 LKL 0.031410302966833115
epoch 31903 loss -0.4271610975265503 LR -0.45841699838638306 LKL 0.03125591203570366
epoch 31904 loss -0.41133344173431396 LR -0.4426644444465637 LKL 0.03133099898695946
epoch 31905 loss -0.39124616980552673 LR -0.4225686490535736 LKL 0.03132248297333717
epoch 31906 loss -0.40624165534973145 LR -0.43769562244415283 LKL 0.03145395591855049
epoch 31907 loss -0.41809290647506714 LR -0.4494711458683014 LKL 0.03137824311852455
epoch 31908 loss -0.4215320348739624 LR -0.4528941214084625 LKL 0.03136207163333893
epoch 31909 loss -0.408270925283432 LR -0.43963131308555603 LKL 0.031360380351543427
epoch 31910 loss -0.42232778668403625 LR -0.4538916051387787 LKL 0.03156381472945213
epoch 31911 loss -0.5551775097846985 LR -0.5868734121322632 LKL 0.0316958986222744
epoch 31912 loss -0.42028555274009705 LR -0.45167797803878784 LKL 0.0313924178481102
epoch 31913 loss -0.4372272491455078 LR -0.468555212020874 LKL 0.03

109
epoch 32001 loss -0.4392620623111725 LR -0.4706833064556122 LKL 0.031421247869729996


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 32002 loss -0.44157445430755615 LR -0.4729451537132263 LKL 0.03137071058154106
epoch 32003 loss -0.3918524384498596 LR -0.423362135887146 LKL 0.03150968626141548
epoch 32004 loss -0.4540728032588959 LR -0.4854986071586609 LKL 0.03142581507563591
epoch 32005 loss -0.36229872703552246 LR -0.39352649450302124 LKL 0.031227782368659973
epoch 32006 loss -0.4123774766921997 LR -0.44377416372299194 LKL 0.03139667212963104
epoch 32007 loss -0.34492722153663635 LR -0.37620094418525696 LKL 0.0312737300992012
epoch 32008 loss -0.4070010185241699 LR -0.43836158514022827 LKL 0.031360555440187454
epoch 32009 loss -0.46515849232673645 LR -0.49667948484420776 LKL 0.03152099624276161
epoch 32010 loss -0.4266829788684845 LR -0.45816516876220703 LKL 0.031482186168432236
epoch 32011 loss -0.4364820122718811 LR -0.46796154975891113 LKL 0.03147953376173973
epoch 32012 loss -0.4395160973072052 LR -0.4709450900554657 LKL 0.03142900392413139
epoch 32013 loss -0.4006604850292206 LR -0.43214309215545654 LKL

98
epoch 32101 loss -0.3655069172382355 LR -0.3968181610107422 LKL 0.03131123632192612


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 32102 loss -0.3385104238986969 LR -0.3700195848941803 LKL 0.0315091498196125
epoch 32103 loss -0.39485207200050354 LR -0.42631328105926514 LKL 0.031461216509342194
epoch 32104 loss -0.45381513237953186 LR -0.4855591654777527 LKL 0.031744033098220825
epoch 32105 loss -0.383954793214798 LR -0.41544264554977417 LKL 0.031487852334976196
epoch 32106 loss -0.43615156412124634 LR -0.46751365065574646 LKL 0.031362101435661316
epoch 32107 loss -0.4265456199645996 LR -0.45797452330589294 LKL 0.03142888844013214
epoch 32108 loss -0.37151920795440674 LR -0.4029339849948883 LKL 0.031414762139320374
epoch 32109 loss -0.382320761680603 LR -0.41391026973724365 LKL 0.03158951923251152
epoch 32110 loss -0.37193623185157776 LR -0.40328240394592285 LKL 0.03134617954492569
epoch 32111 loss -0.45197954773902893 LR -0.4836377501487732 LKL 0.031658194959163666
epoch 32112 loss -0.4602254331111908 LR -0.491809606552124 LKL 0.03158416971564293
epoch 32113 loss -0.43312886357307434 LR -0.46461939811706543 

118
epoch 32201 loss -0.48153549432754517 LR -0.5132842063903809 LKL 0.0317487008869648


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 32202 loss -0.36762359738349915 LR -0.3989464044570923 LKL 0.03132279962301254
epoch 32203 loss -0.42088744044303894 LR -0.4524126648902893 LKL 0.03152521327137947
epoch 32204 loss -0.38536345958709717 LR -0.41678106784820557 LKL 0.0314176082611084
epoch 32205 loss -0.3767194151878357 LR -0.4080318212509155 LKL 0.03131241723895073
epoch 32206 loss -0.44286012649536133 LR -0.47431787848472595 LKL 0.03145775943994522
epoch 32207 loss -0.4779003858566284 LR -0.5093416571617126 LKL 0.03144126385450363
epoch 32208 loss -0.3339870274066925 LR -0.3654225766658783 LKL 0.03143554553389549
epoch 32209 loss -0.389117956161499 LR -0.42033088207244873 LKL 0.031212935224175453
epoch 32210 loss -0.4585966169834137 LR -0.4903104305267334 LKL 0.031713806092739105
epoch 32211 loss -0.38473281264305115 LR -0.4161161184310913 LKL 0.031383316963911057
epoch 32212 loss -0.3395976424217224 LR -0.37102946639060974 LKL 0.03143183887004852
epoch 32213 loss -0.42548221349716187 LR -0.456956148147583 LKL 0.

epoch 32300 loss -0.3159557580947876 LR -0.34758469462394714 LKL 0.03162893280386925
81


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 32301 loss -0.41656267642974854 LR -0.4481097459793091 LKL 0.03154706209897995
epoch 32302 loss -0.3981171250343323 LR -0.42970994114875793 LKL 0.031592801213264465
epoch 32303 loss -0.3825245499610901 LR -0.41387978196144104 LKL 0.031355228275060654
epoch 32304 loss -0.3809444308280945 LR -0.412304162979126 LKL 0.03135973587632179
epoch 32305 loss -0.3963206708431244 LR -0.427823543548584 LKL 0.03150288015604019
epoch 32306 loss -0.37695595622062683 LR -0.4083619713783264 LKL 0.031406011432409286
epoch 32307 loss -0.4426039457321167 LR -0.4743642210960388 LKL 0.03176029026508331
epoch 32308 loss -0.37879085540771484 LR -0.4101831912994385 LKL 0.031392332166433334
epoch 32309 loss -0.4943493902683258 LR -0.526024580001831 LKL 0.031675197184085846
epoch 32310 loss -0.3230191171169281 LR -0.3543645739555359 LKL 0.03134545311331749
epoch 32311 loss -0.4843783378601074 LR -0.5161556005477905 LKL 0.031777266412973404
epoch 32312 loss -0.37425801157951355 LR -0.40562063455581665 LKL 0.

epoch 32400 loss -0.3602541983127594 LR -0.3915656805038452 LKL 0.031311485916376114
110
epoch 32401 loss -0.4323510229587555 LR -0.4638849198818207 LKL 0.03153388574719429


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 32402 loss -0.42629924416542053 LR -0.45778393745422363 LKL 0.0314846895635128
epoch 32403 loss -0.4106261730194092 LR -0.4421139657497406 LKL 0.03148777782917023
epoch 32404 loss -0.3593509793281555 LR -0.39063307642936707 LKL 0.031282100826501846
epoch 32405 loss -0.3680688440799713 LR -0.39949148893356323 LKL 0.03142264485359192
epoch 32406 loss -0.3910461366176605 LR -0.4224753975868225 LKL 0.03142925351858139
epoch 32407 loss -0.46892809867858887 LR -0.5007280707359314 LKL 0.03179998695850372
epoch 32408 loss -0.3825385868549347 LR -0.41420605778694153 LKL 0.03166746720671654
epoch 32409 loss -0.3981325924396515 LR -0.4297162890434265 LKL 0.03158370405435562
epoch 32410 loss -0.324623167514801 LR -0.3557557463645935 LKL 0.031132593750953674
epoch 32411 loss -0.3086966276168823 LR -0.34005382657051086 LKL 0.031357210129499435
epoch 32412 loss -0.3705506920814514 LR -0.40197405219078064 LKL 0.03142336383461952
epoch 32413 loss -0.4078357219696045 LR -0.439349502325058 LKL 0.03

39
epoch 32501 loss -0.3981878161430359 LR -0.4295152723789215 LKL 0.031327445060014725


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 32502 loss -0.38957467675209045 LR -0.4210807979106903 LKL 0.03150611370801926
epoch 32503 loss -0.34843745827674866 LR -0.3797602653503418 LKL 0.03132281079888344
epoch 32504 loss -0.41575145721435547 LR -0.4471933841705322 LKL 0.03144192695617676
epoch 32505 loss -0.32720983028411865 LR -0.35863226652145386 LKL 0.0314224436879158
epoch 32506 loss -0.42596375942230225 LR -0.4576263427734375 LKL 0.031662579625844955
epoch 32507 loss -0.36373576521873474 LR -0.39523065090179443 LKL 0.0314948745071888
epoch 32508 loss -0.45122620463371277 LR -0.48283830285072327 LKL 0.0316120944917202
epoch 32509 loss -0.32809898257255554 LR -0.3594408333301544 LKL 0.03134183958172798
epoch 32510 loss -0.3866732120513916 LR -0.41825053095817566 LKL 0.03157730773091316
epoch 32511 loss -0.3825569748878479 LR -0.41415947675704956 LKL 0.03160250186920166
epoch 32512 loss -0.38304129242897034 LR -0.41462844610214233 LKL 0.0315871462225914
epoch 32513 loss -0.330793559551239 LR -0.36229896545410156 LKL 

49
epoch 32601 loss -0.46785032749176025 LR -0.4994993209838867 LKL 0.03164900466799736


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 32602 loss -0.3981591761112213 LR -0.42944347858428955 LKL 0.03128429502248764
epoch 32603 loss -0.38965561985969543 LR -0.42124056816101074 LKL 0.03158494085073471
epoch 32604 loss -0.394436776638031 LR -0.42603132128715515 LKL 0.03159455582499504
epoch 32605 loss -0.42560186982154846 LR -0.4572266936302185 LKL 0.03162483125925064
epoch 32606 loss -0.46748360991477966 LR -0.4992240369319916 LKL 0.031740423291921616
epoch 32607 loss -0.433376282453537 LR -0.46478691697120667 LKL 0.03141064569354057
epoch 32608 loss -0.43237417936325073 LR -0.46373432874679565 LKL 0.03136016055941582
epoch 32609 loss -0.4354124069213867 LR -0.46712684631347656 LKL 0.03171444311738014
epoch 32610 loss -0.40310320258140564 LR -0.4347987174987793 LKL 0.03169551491737366
epoch 32611 loss -0.2706136107444763 LR -0.30191218852996826 LKL 0.03129858151078224
epoch 32612 loss -0.3751623034477234 LR -0.40663760900497437 LKL 0.03147531673312187
epoch 32613 loss -0.4354530870914459 LR -0.4669940769672394 LKL 

73
epoch 32701 loss -0.43568935990333557 LR -0.46735823154449463 LKL 0.03166886419057846


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 32702 loss -0.4438406825065613 LR -0.47565412521362305 LKL 0.03181343525648117
epoch 32703 loss -0.34827327728271484 LR -0.3795918822288513 LKL 0.031318604946136475
epoch 32704 loss -0.40619564056396484 LR -0.4378151595592499 LKL 0.03161952644586563
epoch 32705 loss -0.4319012463092804 LR -0.4634159803390503 LKL 0.031514722853899
epoch 32706 loss -0.4380479156970978 LR -0.46970343589782715 LKL 0.03165552392601967
epoch 32707 loss -0.41561204195022583 LR -0.4472990930080414 LKL 0.03168705478310585
epoch 32708 loss -0.4422526955604553 LR -0.474082350730896 LKL 0.031829655170440674
epoch 32709 loss -0.40819239616394043 LR -0.4398276209831238 LKL 0.03163523226976395
epoch 32710 loss -0.39434659481048584 LR -0.42596757411956787 LKL 0.03162098675966263
epoch 32711 loss -0.4653588831424713 LR -0.49715834856033325 LKL 0.03179945796728134
epoch 32712 loss -0.4082021415233612 LR -0.4397052228450775 LKL 0.031503092497587204
epoch 32713 loss -0.349322646856308 LR -0.3809049427509308 LKL 0.03

epoch 32799 loss -0.39183509349823 LR -0.42338693141937256 LKL 0.031551845371723175
epoch 32800 loss -0.3437531292438507 LR -0.37515830993652344 LKL 0.031405191868543625
44


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 32801 loss -0.48934024572372437 LR -0.5211482644081116 LKL 0.03180801495909691
epoch 32802 loss -0.39325428009033203 LR -0.4249128997325897 LKL 0.03165862336754799
epoch 32803 loss -0.4005601406097412 LR -0.4321323037147522 LKL 0.03157217055559158
epoch 32804 loss -0.36251863837242126 LR -0.3940025568008423 LKL 0.03148391470313072
epoch 32805 loss -0.4463980793952942 LR -0.4780897796154022 LKL 0.031691692769527435
epoch 32806 loss -0.40473127365112305 LR -0.43635058403015137 LKL 0.03161931037902832
epoch 32807 loss -0.37587520480155945 LR -0.4073387682437897 LKL 0.031463563442230225
epoch 32808 loss -0.4336206614971161 LR -0.4652208685874939 LKL 0.03160019963979721
epoch 32809 loss -0.4600849747657776 LR -0.4919988512992859 LKL 0.0319138802587986
epoch 32810 loss -0.31455355882644653 LR -0.34597688913345337 LKL 0.031423334032297134
epoch 32811 loss -0.4205310344696045 LR -0.45204490423202515 LKL 0.03151388466358185
epoch 32812 loss -0.40457189083099365 LR -0.43603992462158203 LKL

epoch 32900 loss -0.3739062249660492 LR -0.4055025279521942 LKL 0.031596310436725616
68


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 32901 loss -0.40809839963912964 LR -0.4398839473724365 LKL 0.03178555145859718
epoch 32902 loss -0.3786170184612274 LR -0.4103281497955322 LKL 0.031711120158433914
epoch 32903 loss -0.31514599919319153 LR -0.3464190661907196 LKL 0.031273066997528076
epoch 32904 loss -0.3379032611846924 LR -0.3694944381713867 LKL 0.03159118443727493
epoch 32905 loss -0.3914026916027069 LR -0.42301297187805176 LKL 0.031610287725925446
epoch 32906 loss -0.34627774357795715 LR -0.3776949942111969 LKL 0.03141726180911064
epoch 32907 loss -0.4257447123527527 LR -0.4575676918029785 LKL 0.03182297572493553
epoch 32908 loss -0.35250088572502136 LR -0.3839832544326782 LKL 0.03148237615823746
epoch 32909 loss -0.3863053321838379 LR -0.41779839992523193 LKL 0.03149308264255524
epoch 32910 loss -0.4140319526195526 LR -0.4459257423877716 LKL 0.03189379721879959
epoch 32911 loss -0.4088747799396515 LR -0.4406868815422058 LKL 0.03181210532784462
epoch 32912 loss -0.3272266685962677 LR -0.3587201237678528 LKL 0.0

epoch 33000 loss -0.41004154086112976 LR -0.4416721761226654 LKL 0.031630631536245346
45


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 33001 loss -0.3690533936023712 LR -0.40067532658576965 LKL 0.03162193298339844
epoch 33002 loss -0.46454665064811707 LR -0.4962283968925476 LKL 0.03168173506855965
epoch 33003 loss -0.3315126895904541 LR -0.3630659878253937 LKL 0.03155331313610077
epoch 33004 loss -0.4002186357975006 LR -0.43187856674194336 LKL 0.031659919768571854
epoch 33005 loss -0.3932109475135803 LR -0.42475077509880066 LKL 0.03153982758522034
epoch 33006 loss -0.4205813705921173 LR -0.4521729350090027 LKL 0.03159155324101448
epoch 33007 loss -0.38797444105148315 LR -0.4196443259716034 LKL 0.03166988492012024
epoch 33008 loss -0.46924811601638794 LR -0.500990092754364 LKL 0.03174198046326637
epoch 33009 loss -0.41325968503952026 LR -0.4448244273662567 LKL 0.031564727425575256
epoch 33010 loss -0.4419754445552826 LR -0.4735754728317261 LKL 0.03160003945231438
epoch 33011 loss -0.37328916788101196 LR -0.40485697984695435 LKL 0.03156781569123268
epoch 33012 loss -0.4123864769935608 LR -0.4439392685890198 LKL 0.

epoch 33100 loss -0.3930973708629608 LR -0.42482271790504456 LKL 0.03172533959150314
63


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 33101 loss -0.4354364275932312 LR -0.46732640266418457 LKL 0.03188998997211456
epoch 33102 loss -0.41174623370170593 LR -0.44322389364242554 LKL 0.03147765249013901
epoch 33103 loss -0.37969857454299927 LR -0.41140708327293396 LKL 0.0317084938287735
epoch 33104 loss -0.41643208265304565 LR -0.4479678273200989 LKL 0.03153575584292412
epoch 33105 loss -0.4770985245704651 LR -0.508918046951294 LKL 0.031819526106119156
epoch 33106 loss -0.4770803153514862 LR -0.5089747905731201 LKL 0.031894467771053314
epoch 33107 loss -0.4115809500217438 LR -0.44323867559432983 LKL 0.031657714396715164
epoch 33108 loss -0.43656474351882935 LR -0.46844014525413513 LKL 0.03187539801001549
epoch 33109 loss -0.4465760290622711 LR -0.47839462757110596 LKL 0.031818609684705734
epoch 33110 loss -0.39728906750679016 LR -0.4289401173591614 LKL 0.031651049852371216
epoch 33111 loss -0.42173078656196594 LR -0.45327427983283997 LKL 0.03154349699616432
epoch 33112 loss -0.35842815041542053 LR -0.3901863992214203

epoch 33200 loss -0.38528090715408325 LR -0.4169154167175293 LKL 0.03163451701402664
64


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 33201 loss -0.44103848934173584 LR -0.47281017899513245 LKL 0.0317717045545578
epoch 33202 loss -0.4160767197608948 LR -0.4479014277458191 LKL 0.031824707984924316
epoch 33203 loss -0.47324877977371216 LR -0.5051059722900391 LKL 0.031857188791036606
epoch 33204 loss -0.44577258825302124 LR -0.4774540662765503 LKL 0.03168149292469025
epoch 33205 loss -0.35937169194221497 LR -0.3908480107784271 LKL 0.03147630766034126
epoch 33206 loss -0.5172685980796814 LR -0.5493214726448059 LKL 0.03205285593867302
epoch 33207 loss -0.38537147641181946 LR -0.41678375005722046 LKL 0.0314122773706913
epoch 33208 loss -0.42170777916908264 LR -0.4534355700016022 LKL 0.031727783381938934
epoch 33209 loss -0.33507972955703735 LR -0.36664751172065735 LKL 0.03156778961420059
epoch 33210 loss -0.40921711921691895 LR -0.4407557547092438 LKL 0.03153863549232483
epoch 33211 loss -0.4167013168334961 LR -0.44850364327430725 LKL 0.03180232644081116
epoch 33212 loss -0.42289435863494873 LR -0.454561710357666 LKL

epoch 33300 loss -0.4020290672779083 LR -0.4337259531021118 LKL 0.03169689700007439
49


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 33301 loss -0.40483707189559937 LR -0.4365203380584717 LKL 0.03168325498700142
epoch 33302 loss -0.3745366632938385 LR -0.4061456620693207 LKL 0.031609006226062775
epoch 33303 loss -0.4496041536331177 LR -0.48156970739364624 LKL 0.031965553760528564
epoch 33304 loss -0.41150975227355957 LR -0.4432273805141449 LKL 0.03171762451529503
epoch 33305 loss -0.47601568698883057 LR -0.5079493522644043 LKL 0.031933680176734924
epoch 33306 loss -0.4159276485443115 LR -0.4475502669811249 LKL 0.03162261098623276
epoch 33307 loss -0.37226495146751404 LR -0.40401384234428406 LKL 0.031748902052640915
epoch 33308 loss -0.45269763469696045 LR -0.48436203598976135 LKL 0.031664397567510605
epoch 33309 loss -0.3942338526248932 LR -0.42579662799835205 LKL 0.03156276419758797
epoch 33310 loss -0.4125957787036896 LR -0.4444914758205414 LKL 0.031895700842142105
epoch 33311 loss -0.42687028646469116 LR -0.45866289734840393 LKL 0.03179261460900307
epoch 33312 loss -0.38095909357070923 LR -0.412578791379928

epoch 33399 loss -0.4213859736919403 LR -0.4532551169395447 LKL 0.03186913952231407
epoch 33400 loss -0.36399686336517334 LR -0.39563480019569397 LKL 0.031637925654649734
69
epoch 33401 loss -0.3802628517150879 LR -0.41178810596466064 LKL 0.03152526170015335


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 33402 loss -0.4219230115413666 LR -0.45360660552978516 LKL 0.03168358653783798
epoch 33403 loss -0.4099923074245453 LR -0.4419328570365906 LKL 0.031940557062625885
epoch 33404 loss -0.44608476758003235 LR -0.4778570234775543 LKL 0.03177226334810257
epoch 33405 loss -0.4622279405593872 LR -0.4940364956855774 LKL 0.03180856257677078
epoch 33406 loss -0.4057059586048126 LR -0.4374029338359833 LKL 0.031696971505880356
epoch 33407 loss -0.3986395597457886 LR -0.4300975203514099 LKL 0.03145797550678253
epoch 33408 loss -0.37938302755355835 LR -0.4109993278980255 LKL 0.03161628916859627
epoch 33409 loss -0.39120379090309143 LR -0.42288634181022644 LKL 0.03168254718184471
epoch 33410 loss -0.4227191209793091 LR -0.45450109243392944 LKL 0.03178197145462036
epoch 33411 loss -0.37698063254356384 LR -0.4085858166217804 LKL 0.03160518407821655
epoch 33412 loss -0.4507535696029663 LR -0.4827146530151367 LKL 0.031961098313331604
epoch 33413 loss -0.410084068775177 LR -0.4417378604412079 LKL 0.0

69
epoch 33501 loss -0.46728643774986267 LR -0.49926963448524475 LKL 0.03198318928480148


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 33502 loss -0.41268157958984375 LR -0.44442522525787354 LKL 0.03174363449215889
epoch 33503 loss -0.40895891189575195 LR -0.44066670536994934 LKL 0.031707800924777985
epoch 33504 loss -0.44286391139030457 LR -0.47484874725341797 LKL 0.031984832137823105
epoch 33505 loss -0.42153000831604004 LR -0.4531394839286804 LKL 0.03160947933793068
epoch 33506 loss -0.432386189699173 LR -0.4642891585826874 LKL 0.031902965158224106
epoch 33507 loss -0.384317547082901 LR -0.4159386157989502 LKL 0.03162107616662979
epoch 33508 loss -0.44110289216041565 LR -0.47283726930618286 LKL 0.03173438459634781
epoch 33509 loss -0.36263731122016907 LR -0.3943080008029938 LKL 0.03167068585753441
epoch 33510 loss -0.4214307963848114 LR -0.45316845178604126 LKL 0.031737662851810455
epoch 33511 loss -0.4080955386161804 LR -0.43976524472236633 LKL 0.03166971728205681
epoch 33512 loss -0.3952065706253052 LR -0.42696964740753174 LKL 0.03176308423280716
epoch 33513 loss -0.42939695715904236 LR -0.46114858984947205

86
epoch 33601 loss -0.3834843933582306 LR -0.4150564670562744 LKL 0.031572066247463226


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 33602 loss -0.39718297123908997 LR -0.4289492070674896 LKL 0.03176623210310936
epoch 33603 loss -0.4264078438282013 LR -0.45827242732048035 LKL 0.03186457231640816
epoch 33604 loss -0.4500071108341217 LR -0.48184478282928467 LKL 0.03183768317103386
epoch 33605 loss -0.41705670952796936 LR -0.4487062990665436 LKL 0.031649600714445114
epoch 33606 loss -0.37907207012176514 LR -0.41085463762283325 LKL 0.03178258240222931
epoch 33607 loss -0.39237141609191895 LR -0.42416322231292725 LKL 0.0317918062210083
epoch 33608 loss -0.3870393633842468 LR -0.4187503159046173 LKL 0.03171095624566078
epoch 33609 loss -0.43248069286346436 LR -0.464357852935791 LKL 0.031877148896455765
epoch 33610 loss -0.42124608159065247 LR -0.4528697431087494 LKL 0.03162367269396782
epoch 33611 loss -0.3964248299598694 LR -0.4281713366508484 LKL 0.03174649178981781
epoch 33612 loss -0.38492551445961 LR -0.4167570471763611 LKL 0.031831543892621994
epoch 33613 loss -0.423113077878952 LR -0.45490145683288574 LKL 0.0

epoch 33700 loss -0.41818907856941223 LR -0.4499986171722412 LKL 0.03180953860282898
58


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 33701 loss -0.37939807772636414 LR -0.4112395942211151 LKL 0.03184151649475098
epoch 33702 loss -0.39321208000183105 LR -0.4249734878540039 LKL 0.03176139295101166
epoch 33703 loss -0.4304124116897583 LR -0.4623515009880066 LKL 0.03193909302353859
epoch 33704 loss -0.42252323031425476 LR -0.454401433467865 LKL 0.031878210604190826
epoch 33705 loss -0.46826669573783875 LR -0.5001156330108643 LKL 0.03184894844889641
epoch 33706 loss -0.37583285570144653 LR -0.4075474441051483 LKL 0.031714580953121185
epoch 33707 loss -0.48516517877578735 LR -0.5169702768325806 LKL 0.03180508315563202
epoch 33708 loss -0.42079201340675354 LR -0.45248422026634216 LKL 0.03169221803545952
epoch 33709 loss -0.4812972843647003 LR -0.5131067037582397 LKL 0.031809430569410324
epoch 33710 loss -0.3862740993499756 LR -0.417874813079834 LKL 0.0316007025539875
epoch 33711 loss -0.3917153775691986 LR -0.42346763610839844 LKL 0.03175225853919983
epoch 33712 loss -0.4166457951068878 LR -0.44832324981689453 LKL 0.

epoch 33800 loss -0.4162771701812744 LR -0.4480207562446594 LKL 0.03174357861280441
62


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 33801 loss -0.44372719526290894 LR -0.47550928592681885 LKL 0.03178207948803902
epoch 33802 loss -0.4332929849624634 LR -0.46508902311325073 LKL 0.03179604187607765
epoch 33803 loss -0.43309295177459717 LR -0.46489667892456055 LKL 0.031803734600543976
epoch 33804 loss -0.48562270402908325 LR -0.5175172090530396 LKL 0.0318944938480854
epoch 33805 loss -0.37423816323280334 LR -0.4057410657405853 LKL 0.03150290623307228
epoch 33806 loss -0.4045334756374359 LR -0.4364144802093506 LKL 0.03188100457191467
epoch 33807 loss -0.42644667625427246 LR -0.4583393335342407 LKL 0.03189266100525856
epoch 33808 loss -0.386217325925827 LR -0.41791224479675293 LKL 0.0316949225962162
epoch 33809 loss -0.4157528281211853 LR -0.4475453495979309 LKL 0.03179250657558441
epoch 33810 loss -0.46251019835472107 LR -0.49434003233909607 LKL 0.031829845160245895
epoch 33811 loss -0.4687734842300415 LR -0.5009387731552124 LKL 0.032165296375751495
epoch 33812 loss -0.44266748428344727 LR -0.4744672477245331 LKL 

epoch 33900 loss -0.4687158167362213 LR -0.5006682276725769 LKL 0.03195241838693619
76


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 33901 loss -0.4517432749271393 LR -0.4835774898529053 LKL 0.031834203749895096
epoch 33902 loss -0.4495652914047241 LR -0.48154690861701965 LKL 0.03198162466287613
epoch 33903 loss -0.39071476459503174 LR -0.4225883483886719 LKL 0.03187357261776924
epoch 33904 loss -0.43909794092178345 LR -0.4708474278450012 LKL 0.03174949437379837
epoch 33905 loss -0.3633977174758911 LR -0.39490580558776855 LKL 0.031508076936006546
epoch 33906 loss -0.36699122190475464 LR -0.3988162577152252 LKL 0.031825046986341476
epoch 33907 loss -0.3920406103134155 LR -0.4239545464515686 LKL 0.03191393241286278
epoch 33908 loss -0.41919049620628357 LR -0.4509614408016205 LKL 0.03177093714475632
epoch 33909 loss -0.39915570616722107 LR -0.43098899722099304 LKL 0.031833283603191376
epoch 33910 loss -0.4352211356163025 LR -0.46705710887908936 LKL 0.03183596953749657
epoch 33911 loss -0.4232029616832733 LR -0.45521843433380127 LKL 0.03201548382639885
epoch 33912 loss -0.48200565576553345 LR -0.5137131810188293 L

epoch 33998 loss -0.4225322902202606 LR -0.45442456007003784 LKL 0.031892262399196625
epoch 33999 loss -0.4997909367084503 LR -0.5316430330276489 LKL 0.03185209259390831
epoch 34000 loss -0.37719395756721497 LR -0.4088726043701172 LKL 0.031678635627031326
46
epoch 34001 loss -0.4423767328262329 LR -0.47433096170425415 LKL 0.031954240053892136


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 34002 loss -0.4576527774333954 LR -0.48939353227615356 LKL 0.03174075856804848
epoch 34003 loss -0.4779256582260132 LR -0.5101038217544556 LKL 0.03217814862728119
epoch 34004 loss -0.42379030585289 LR -0.45558610558509827 LKL 0.03179580345749855
epoch 34005 loss -0.43685182929039 LR -0.4688040018081665 LKL 0.031952179968357086
epoch 34006 loss -0.44854941964149475 LR -0.48053908348083496 LKL 0.03198966383934021
epoch 34007 loss -0.403076708316803 LR -0.43492281436920166 LKL 0.03184610605239868
epoch 34008 loss -0.4279647171497345 LR -0.45971331000328064 LKL 0.03174859657883644
epoch 34009 loss -0.4470314383506775 LR -0.4788644015789032 LKL 0.03183296322822571
epoch 34010 loss -0.437713623046875 LR -0.46964195370674133 LKL 0.031928326934576035
epoch 34011 loss -0.38297444581985474 LR -0.4145057499408722 LKL 0.03153129667043686
epoch 34012 loss -0.4258301258087158 LR -0.4577605128288269 LKL 0.03193037956953049
epoch 34013 loss -0.45634984970092773 LR -0.4881725311279297 LKL 0.03182

epoch 34100 loss -0.44768026471138 LR -0.479749470949173 LKL 0.03206920251250267
115
epoch 34101 loss -0.3690657913684845 LR -0.4006884694099426 LKL 0.03162268176674843


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 34102 loss -0.42637497186660767 LR -0.4583548903465271 LKL 0.03197990357875824
epoch 34103 loss -0.39786237478256226 LR -0.42966175079345703 LKL 0.03179936110973358
epoch 34104 loss -0.4132247269153595 LR -0.4452289044857025 LKL 0.03200416639447212
epoch 34105 loss -0.4025994837284088 LR -0.4344443380832672 LKL 0.0318448469042778
epoch 34106 loss -0.43507084250450134 LR -0.46709001064300537 LKL 0.03201916813850403
epoch 34107 loss -0.3780650794506073 LR -0.4097951054573059 LKL 0.03173002973198891
epoch 34108 loss -0.45382705330848694 LR -0.48568376898765564 LKL 0.0318567119538784
epoch 34109 loss -0.4114331603050232 LR -0.44312626123428345 LKL 0.03169311583042145
epoch 34110 loss -0.39968088269233704 LR -0.4314590394496918 LKL 0.03177816420793533
epoch 34111 loss -0.46999937295913696 LR -0.5018701553344727 LKL 0.03187078610062599
epoch 34112 loss -0.3963792622089386 LR -0.4283910095691681 LKL 0.03201175481081009
epoch 34113 loss -0.4289151132106781 LR -0.4609135389328003 LKL 0.03

epoch 34200 loss -0.3867533802986145 LR -0.41871315240859985 LKL 0.03195977583527565
70


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 34201 loss -0.40178680419921875 LR -0.4337655007839203 LKL 0.031978681683540344
epoch 34202 loss -0.4202667474746704 LR -0.452143132686615 LKL 0.03187638521194458
epoch 34203 loss -0.4318696856498718 LR -0.4638676345348358 LKL 0.03199796378612518
epoch 34204 loss -0.4031720459461212 LR -0.43510034680366516 LKL 0.03192831203341484
epoch 34205 loss -0.34606650471687317 LR -0.3780309855937958 LKL 0.0319644920527935
epoch 34206 loss -0.4244203567504883 LR -0.4565289318561554 LKL 0.03210856392979622
epoch 34207 loss -0.4756712019443512 LR -0.5077521800994873 LKL 0.03208097442984581
epoch 34208 loss -0.4309523403644562 LR -0.4629133343696594 LKL 0.03196099027991295
epoch 34209 loss -0.4487325847148895 LR -0.4807339310646057 LKL 0.03200133517384529
epoch 34210 loss -0.3790653347969055 LR -0.41100171208381653 LKL 0.03193637356162071
epoch 34211 loss -0.442426472902298 LR -0.4744400382041931 LKL 0.03201356902718544
epoch 34212 loss -0.4826667606830597 LR -0.5146993398666382 LKL 0.03203257

epoch 34300 loss -0.3083358705043793 LR -0.3399498164653778 LKL 0.031613945960998535
57


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 34301 loss -0.4378378391265869 LR -0.46970662474632263 LKL 0.031868789345026016
epoch 34302 loss -0.4380311369895935 LR -0.470071017742157 LKL 0.03203986585140228
epoch 34303 loss -0.39865627884864807 LR -0.43053004145622253 LKL 0.03187376633286476
epoch 34304 loss -0.4195396900177002 LR -0.45140114426612854 LKL 0.03186146914958954
epoch 34305 loss -0.3831723928451538 LR -0.415114164352417 LKL 0.03194176405668259
epoch 34306 loss -0.4500226676464081 LR -0.4820586144924164 LKL 0.032035943120718
epoch 34307 loss -0.43026432394981384 LR -0.46210548281669617 LKL 0.03184117004275322
epoch 34308 loss -0.4612744450569153 LR -0.49334463477134705 LKL 0.032070182263851166
epoch 34309 loss -0.4868345856666565 LR -0.519084095954895 LKL 0.03224949911236763
epoch 34310 loss -0.4575708508491516 LR -0.48960715532302856 LKL 0.03203631937503815
epoch 34311 loss -0.40876200795173645 LR -0.4407402276992798 LKL 0.03197821229696274
epoch 34312 loss -0.37191081047058105 LR -0.40362587571144104 LKL 0.03

epoch 34400 loss -0.42749133706092834 LR -0.45958301424980164 LKL 0.03209167718887329
52


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 34401 loss -0.40883687138557434 LR -0.4406261742115021 LKL 0.03178929537534714
epoch 34402 loss -0.46982455253601074 LR -0.5017957091331482 LKL 0.03197116032242775
epoch 34403 loss -0.42678388953208923 LR -0.45861709117889404 LKL 0.03183319419622421
epoch 34404 loss -0.41393426060676575 LR -0.4457506835460663 LKL 0.03181642293930054
epoch 34405 loss -0.5254462361335754 LR -0.5575947165489197 LKL 0.032148487865924835
epoch 34406 loss -0.410552442073822 LR -0.4425489604473114 LKL 0.03199652582406998
epoch 34407 loss -0.47566285729408264 LR -0.5078485012054443 LKL 0.0321856327354908
epoch 34408 loss -0.45514434576034546 LR -0.4870598614215851 LKL 0.03191550448536873
epoch 34409 loss -0.5258637070655823 LR -0.5579560995101929 LKL 0.0320923887193203
epoch 34410 loss -0.44766783714294434 LR -0.4795333743095398 LKL 0.031865548342466354
epoch 34411 loss -0.45035621523857117 LR -0.4824157953262329 LKL 0.032059576362371445
epoch 34412 loss -0.44496968388557434 LR -0.47696617245674133 LKL 0

epoch 34499 loss -0.3720032870769501 LR -0.40403926372528076 LKL 0.03203596547245979
epoch 34500 loss -0.4433628022670746 LR -0.4751822352409363 LKL 0.03181943669915199
44
epoch 34501 loss -0.3867850601673126 LR -0.4186541736125946 LKL 0.03186911344528198


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 34502 loss -0.4572640061378479 LR -0.4893830418586731 LKL 0.032119039446115494
epoch 34503 loss -0.4359195828437805 LR -0.46801143884658813 LKL 0.03209187090396881
epoch 34504 loss -0.41174551844596863 LR -0.4437398612499237 LKL 0.03199433535337448
epoch 34505 loss -0.38596558570861816 LR -0.4179152548313141 LKL 0.03194967657327652
epoch 34506 loss -0.46843209862709045 LR -0.5004898309707642 LKL 0.032057732343673706
epoch 34507 loss -0.4132258892059326 LR -0.445124089717865 LKL 0.03189821168780327
epoch 34508 loss -0.4088026285171509 LR -0.4406389594078064 LKL 0.03183632716536522
epoch 34509 loss -0.4119713008403778 LR -0.44417044520378113 LKL 0.032199155539274216
epoch 34510 loss -0.43074026703834534 LR -0.46288880705833435 LKL 0.03214852884411812
epoch 34511 loss -0.436688095331192 LR -0.46871882677078247 LKL 0.03203073516488075
epoch 34512 loss -0.43564480543136597 LR -0.46760788559913635 LKL 0.03196306526660919
epoch 34513 loss -0.4250212609767914 LR -0.45695018768310547 LKL 

51
epoch 34601 loss -0.43956077098846436 LR -0.47179821133613586 LKL 0.03223743662238121


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 34602 loss -0.3858853280544281 LR -0.41768115758895874 LKL 0.03179582580924034
epoch 34603 loss -0.46044254302978516 LR -0.49260544776916504 LKL 0.03216290473937988
epoch 34604 loss -0.5003544688224792 LR -0.5325658917427063 LKL 0.032211437821388245
epoch 34605 loss -0.4334720969200134 LR -0.4654792547225952 LKL 0.03200715780258179
epoch 34606 loss -0.41743090748786926 LR -0.44937410950660706 LKL 0.03194320201873779
epoch 34607 loss -0.3850744366645813 LR -0.41705793142318726 LKL 0.03198350593447685
epoch 34608 loss -0.37288159132003784 LR -0.40475279092788696 LKL 0.031871192157268524
epoch 34609 loss -0.40536412596702576 LR -0.43733733892440796 LKL 0.0319732204079628
epoch 34610 loss -0.38248786330223083 LR -0.4143385887145996 LKL 0.031850721687078476
epoch 34611 loss -0.47039464116096497 LR -0.5023261308670044 LKL 0.03193148598074913
epoch 34612 loss -0.42594483494758606 LR -0.45810839533805847 LKL 0.03216356784105301
epoch 34613 loss -0.38495320081710815 LR -0.4168570637702942

epoch 34700 loss -0.38525670766830444 LR -0.4173346161842346 LKL 0.03207790479063988
65


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 34701 loss -0.41397756338119507 LR -0.4461410641670227 LKL 0.03216349706053734
epoch 34702 loss -0.39578837156295776 LR -0.427883118391037 LKL 0.032094743102788925
epoch 34703 loss -0.5093347430229187 LR -0.5416110754013062 LKL 0.03227635473012924
epoch 34704 loss -0.4038068652153015 LR -0.43568146228790283 LKL 0.03187460079789162
epoch 34705 loss -0.40845948457717896 LR -0.4403190016746521 LKL 0.03185952454805374
epoch 34706 loss -0.40183225274086 LR -0.43361198902130127 LKL 0.031779736280441284
epoch 34707 loss -0.45718520879745483 LR -0.48928728699684143 LKL 0.032102085649967194
epoch 34708 loss -0.46302908658981323 LR -0.4949982762336731 LKL 0.03196920081973076
epoch 34709 loss -0.35694020986557007 LR -0.38900044560432434 LKL 0.032060228288173676
epoch 34710 loss -0.44289320707321167 LR -0.47495508193969727 LKL 0.0320618636906147
epoch 34711 loss -0.4220316410064697 LR -0.45415157079696655 LKL 0.03211991488933563
epoch 34712 loss -0.3670560121536255 LR -0.3990151286125183 LKL

epoch 34799 loss -0.36900925636291504 LR -0.4007419943809509 LKL 0.031732723116874695
epoch 34800 loss -0.4580584466457367 LR -0.4901958405971527 LKL 0.03213740140199661
89
epoch 34801 loss -0.3998970091342926 LR -0.43192416429519653 LKL 0.032027166336774826


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 34802 loss -0.4173260033130646 LR -0.4493286609649658 LKL 0.032002661377191544
epoch 34803 loss -0.42868804931640625 LR -0.460593581199646 LKL 0.031905535608530045
epoch 34804 loss -0.4715466797351837 LR -0.5038691163063049 LKL 0.03232244402170181
epoch 34805 loss -0.47347602248191833 LR -0.5054720640182495 LKL 0.03199603408575058
epoch 34806 loss -0.4832850694656372 LR -0.5153772830963135 LKL 0.032092221081256866
epoch 34807 loss -0.5092591047286987 LR -0.541363000869751 LKL 0.03210387006402016
epoch 34808 loss -0.4262775480747223 LR -0.45828238129615784 LKL 0.03200482577085495
epoch 34809 loss -0.4612778425216675 LR -0.49327147006988525 LKL 0.03199361637234688
epoch 34810 loss -0.39606156945228577 LR -0.42800799012184143 LKL 0.03194642439484596
epoch 34811 loss -0.4681375026702881 LR -0.5003853440284729 LKL 0.032247837632894516
epoch 34812 loss -0.4133065342903137 LR -0.4452015459537506 LKL 0.03189501538872719
epoch 34813 loss -0.37293341755867004 LR -0.40484702587127686 LKL 0.

51
epoch 34901 loss -0.4641694724559784 LR -0.4962625503540039 LKL 0.032093074172735214


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 34902 loss -0.4807511568069458 LR -0.5129255056381226 LKL 0.03217436373233795
epoch 34903 loss -0.4309578835964203 LR -0.4629313349723816 LKL 0.031973451375961304
epoch 34904 loss -0.4800027012825012 LR -0.5119761824607849 LKL 0.03197348490357399
epoch 34905 loss -0.4660694897174835 LR -0.49844813346862793 LKL 0.03237864747643471
epoch 34906 loss -0.4780158996582031 LR -0.5101063251495361 LKL 0.032090410590171814
epoch 34907 loss -0.39298567175865173 LR -0.4249730408191681 LKL 0.031987372785806656
epoch 34908 loss -0.3908796012401581 LR -0.42262500524520874 LKL 0.031745415180921555
epoch 34909 loss -0.48183009028434753 LR -0.5140277147293091 LKL 0.03219762071967125
epoch 34910 loss -0.4327939450740814 LR -0.46495577692985535 LKL 0.03216182813048363
epoch 34911 loss -0.4134199023246765 LR -0.4453500211238861 LKL 0.0319301076233387
epoch 34912 loss -0.36441463232040405 LR -0.3962940573692322 LKL 0.03187943622469902
epoch 34913 loss -0.39771875739097595 LR -0.42982858419418335 LKL 0

68
epoch 35001 loss -0.46644529700279236 LR -0.49854952096939087 LKL 0.03210423141717911


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 35002 loss -0.42651569843292236 LR -0.4587090015411377 LKL 0.03219331428408623
epoch 35003 loss -0.43244895339012146 LR -0.4643438160419464 LKL 0.03189486265182495
epoch 35004 loss -0.39819636940956116 LR -0.4301894009113312 LKL 0.031993020325899124
epoch 35005 loss -0.38876429200172424 LR -0.42079436779022217 LKL 0.032030075788497925
epoch 35006 loss -0.45462706685066223 LR -0.48678478598594666 LKL 0.03215772286057472
epoch 35007 loss -0.42515280842781067 LR -0.45729777216911316 LKL 0.03214495629072189
epoch 35008 loss -0.3486190438270569 LR -0.38039958477020264 LKL 0.03178054839372635
epoch 35009 loss -0.2637017071247101 LR -0.29544180631637573 LKL 0.03174009174108505
epoch 35010 loss -0.39678752422332764 LR -0.42890483140945435 LKL 0.03211730718612671
epoch 35011 loss -0.4483901858329773 LR -0.4804849922657013 LKL 0.0320947989821434
epoch 35012 loss -0.4815431237220764 LR -0.5136479735374451 LKL 0.032104842364788055
epoch 35013 loss -0.3984248638153076 LR -0.4304577708244324 L

epoch 35100 loss -0.3798207938671112 LR -0.41155365109443665 LKL 0.031732864677906036
62


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 35101 loss -0.38526660203933716 LR -0.4173295497894287 LKL 0.03206295147538185
epoch 35102 loss -0.39887621998786926 LR -0.43085455894470215 LKL 0.03197834640741348
epoch 35103 loss -0.4230898916721344 LR -0.4553166627883911 LKL 0.03222677484154701
epoch 35104 loss -0.41872984170913696 LR -0.45081809163093567 LKL 0.0320882573723793
epoch 35105 loss -0.4036135673522949 LR -0.43572431802749634 LKL 0.032110754400491714
epoch 35106 loss -0.43630677461624146 LR -0.4683855473995209 LKL 0.03207877650856972
epoch 35107 loss -0.4473469853401184 LR -0.47948622703552246 LKL 0.03213925659656525
epoch 35108 loss -0.44507190585136414 LR -0.47694873809814453 LKL 0.031876835972070694
epoch 35109 loss -0.4395458996295929 LR -0.47154924273490906 LKL 0.03200335055589676
epoch 35110 loss -0.43476197123527527 LR -0.4669782519340515 LKL 0.03221626952290535
epoch 35111 loss -0.40963491797447205 LR -0.44175440073013306 LKL 0.03211947903037071
epoch 35112 loss -0.393322229385376 LR -0.42537933588027954 L

epoch 35200 loss -0.38147810101509094 LR -0.4131928086280823 LKL 0.03171471878886223
54


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 35201 loss -0.4307914972305298 LR -0.46304649114608765 LKL 0.032254986464977264
epoch 35202 loss -0.44849684834480286 LR -0.4805075526237488 LKL 0.032010696828365326
epoch 35203 loss -0.4374145567417145 LR -0.46956396102905273 LKL 0.03214940056204796
epoch 35204 loss -0.481900155544281 LR -0.5141184329986572 LKL 0.03221826255321503
epoch 35205 loss -0.44885075092315674 LR -0.48084715008735657 LKL 0.03199639916419983
epoch 35206 loss -0.4472031593322754 LR -0.4794623851776123 LKL 0.03225923702120781
epoch 35207 loss -0.41392114758491516 LR -0.44608479738235474 LKL 0.03216364234685898
epoch 35208 loss -0.3935420513153076 LR -0.425475537776947 LKL 0.03193347156047821
epoch 35209 loss -0.371920108795166 LR -0.4038655161857605 LKL 0.03194541111588478
epoch 35210 loss -0.5213083028793335 LR -0.5535975098609924 LKL 0.032289229333400726
epoch 35211 loss -0.4109896421432495 LR -0.44315439462661743 LKL 0.03216475248336792
epoch 35212 loss -0.4950949549674988 LR -0.5271070003509521 LKL 0.03

epoch 35300 loss -0.39924344420433044 LR -0.4313808083534241 LKL 0.03213736042380333
61


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 35301 loss -0.49118751287460327 LR -0.5233447551727295 LKL 0.03215724602341652
epoch 35302 loss -0.41720446944236755 LR -0.44925302267074585 LKL 0.0320485420525074
epoch 35303 loss -0.47383689880371094 LR -0.5057790279388428 LKL 0.03194211423397064
epoch 35304 loss -0.46548908948898315 LR -0.49794986844062805 LKL 0.0324607715010643
epoch 35305 loss -0.4223845601081848 LR -0.45447033643722534 LKL 0.03208579123020172
epoch 35306 loss -0.4364032447338104 LR -0.4684843122959137 LKL 0.03208106383681297
epoch 35307 loss -0.4823639988899231 LR -0.5143567323684692 LKL 0.03199274465441704
epoch 35308 loss -0.44009268283843994 LR -0.4722103774547577 LKL 0.03211768716573715
epoch 35309 loss -0.4253619313240051 LR -0.45746910572052 LKL 0.03210717812180519
epoch 35310 loss -0.39787375926971436 LR -0.4298310875892639 LKL 0.03195733577013016
epoch 35311 loss -0.4281425178050995 LR -0.4601895809173584 LKL 0.03204707056283951
epoch 35312 loss -0.39152708649635315 LR -0.42343032360076904 LKL 0.031

epoch 35399 loss -0.49781718850135803 LR -0.5299005508422852 LKL 0.03208337351679802
epoch 35400 loss -0.4831492602825165 LR -0.5155229568481445 LKL 0.03237369656562805
40


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 35401 loss -0.40792420506477356 LR -0.4398267865180969 LKL 0.03190259262919426
epoch 35402 loss -0.4218583405017853 LR -0.45391297340393066 LKL 0.03205462545156479
epoch 35403 loss -0.4407176971435547 LR -0.4729081988334656 LKL 0.03219050168991089
epoch 35404 loss -0.4204619526863098 LR -0.45258522033691406 LKL 0.032123252749443054
epoch 35405 loss -0.45688295364379883 LR -0.4891364276409149 LKL 0.032253462821245193
epoch 35406 loss -0.4230926036834717 LR -0.4550960659980774 LKL 0.03200344741344452
epoch 35407 loss -0.404923677444458 LR -0.4368065297603607 LKL 0.03188285231590271
epoch 35408 loss -0.4028812050819397 LR -0.4350051283836365 LKL 0.03212391957640648
epoch 35409 loss -0.4133068323135376 LR -0.44535842537879944 LKL 0.032051581889390945
epoch 35410 loss -0.37964361906051636 LR -0.41174957156181335 LKL 0.032105956226587296
epoch 35411 loss -0.4407274127006531 LR -0.4728417992591858 LKL 0.03211439400911331
epoch 35412 loss -0.41338327527046204 LR -0.44521886110305786 LKL 

epoch 35500 loss -0.41644006967544556 LR -0.4486124515533447 LKL 0.03217238932847977
122
epoch 35501 loss -0.4092754125595093 LR -0.44127732515335083 LKL 0.03200192376971245


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 35502 loss -0.44110527634620667 LR -0.4732632040977478 LKL 0.03215792775154114
epoch 35503 loss -0.4090290069580078 LR -0.4409855306148529 LKL 0.03195653855800629
epoch 35504 loss -0.42110925912857056 LR -0.45322245359420776 LKL 0.0321132093667984
epoch 35505 loss -0.4633285403251648 LR -0.4954129457473755 LKL 0.03208441659808159
epoch 35506 loss -0.40341848134994507 LR -0.4354042410850525 LKL 0.03198574483394623
epoch 35507 loss -0.42337557673454285 LR -0.45548999309539795 LKL 0.0321144200861454
epoch 35508 loss -0.4349755644798279 LR -0.46699684858322144 LKL 0.032021284103393555
epoch 35509 loss -0.4597437083721161 LR -0.49203842878341675 LKL 0.03229472041130066
epoch 35510 loss -0.4601306915283203 LR -0.49246811866760254 LKL 0.03233744204044342
epoch 35511 loss -0.43138375878334045 LR -0.463567316532135 LKL 0.03218355402350426
epoch 35512 loss -0.3960542678833008 LR -0.42818135023117065 LKL 0.03212708234786987
epoch 35513 loss -0.45909953117370605 LR -0.49093663692474365 LKL 0

epoch 35599 loss -0.4051668047904968 LR -0.43717098236083984 LKL 0.032004162669181824
epoch 35600 loss -0.5151979923248291 LR -0.5475162863731384 LKL 0.032318320125341415
65
epoch 35601 loss -0.3860401511192322 LR -0.4181307852268219 LKL 0.032090626657009125


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 35602 loss -0.4713784456253052 LR -0.5034592747688293 LKL 0.032080814242362976
epoch 35603 loss -0.40145403146743774 LR -0.43339282274246216 LKL 0.03193880617618561
epoch 35604 loss -0.4285068213939667 LR -0.4605315625667572 LKL 0.03202474117279053
epoch 35605 loss -0.43810510635375977 LR -0.4700508117675781 LKL 0.03194569796323776
epoch 35606 loss -0.41161632537841797 LR -0.443727046251297 LKL 0.03211072087287903
epoch 35607 loss -0.414420485496521 LR -0.44637829065322876 LKL 0.03195781260728836
epoch 35608 loss -0.38740113377571106 LR -0.41946911811828613 LKL 0.03206799551844597
epoch 35609 loss -0.40933895111083984 LR -0.441395103931427 LKL 0.03205616772174835
epoch 35610 loss -0.4499267041683197 LR -0.48220205307006836 LKL 0.03227533772587776
epoch 35611 loss -0.43257033824920654 LR -0.4645880460739136 LKL 0.03201770782470703
epoch 35612 loss -0.38173216581344604 LR -0.4139691889286041 LKL 0.032237034291028976
epoch 35613 loss -0.35909759998321533 LR -0.390849769115448 LKL 0.

89
epoch 35701 loss -0.4386506974697113 LR -0.47058796882629395 LKL 0.03193726763129234


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 35702 loss -0.38974279165267944 LR -0.4218485951423645 LKL 0.03210579603910446
epoch 35703 loss -0.40927058458328247 LR -0.4414345324039459 LKL 0.03216393664479256
epoch 35704 loss -0.4408034384250641 LR -0.4729643166065216 LKL 0.03216088190674782
epoch 35705 loss -0.44570937752723694 LR -0.47790759801864624 LKL 0.032198209315538406
epoch 35706 loss -0.44941574335098267 LR -0.48148584365844727 LKL 0.0320700965821743
epoch 35707 loss -0.4271945059299469 LR -0.4593201279640198 LKL 0.03212561830878258
epoch 35708 loss -0.4364340305328369 LR -0.46831634640693665 LKL 0.031882330775260925
epoch 35709 loss -0.3541167378425598 LR -0.38614511489868164 LKL 0.03202836960554123
epoch 35710 loss -0.5143023729324341 LR -0.5467041730880737 LKL 0.03240180388092995
epoch 35711 loss -0.4359617531299591 LR -0.4683019518852234 LKL 0.03234020620584488
epoch 35712 loss -0.4582061469554901 LR -0.49058282375335693 LKL 0.03237667679786682
epoch 35713 loss -0.3934290111064911 LR -0.4254625141620636 LKL 0.

72
epoch 35801 loss -0.4116259813308716 LR -0.44377386569976807 LKL 0.032147884368896484


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 35802 loss -0.44862112402915955 LR -0.4807218313217163 LKL 0.032100703567266464
epoch 35803 loss -0.48453956842422485 LR -0.5169658660888672 LKL 0.03242629021406174
epoch 35804 loss -0.413578063249588 LR -0.44592297077178955 LKL 0.03234490379691124
epoch 35805 loss -0.4099094271659851 LR -0.4418093264102936 LKL 0.031899888068437576
epoch 35806 loss -0.528179943561554 LR -0.5604457855224609 LKL 0.03226584196090698
epoch 35807 loss -0.33941876888275146 LR -0.37125131487846375 LKL 0.031832531094551086
epoch 35808 loss -0.4958037734031677 LR -0.528230607509613 LKL 0.032426849007606506
epoch 35809 loss -0.4140298068523407 LR -0.4461657404899597 LKL 0.032135944813489914
epoch 35810 loss -0.36737120151519775 LR -0.3995024263858795 LKL 0.03213120996952057
epoch 35811 loss -0.42194727063179016 LR -0.45401811599731445 LKL 0.03207085281610489
epoch 35812 loss -0.4721951484680176 LR -0.5042136311531067 LKL 0.03201846778392792
epoch 35813 loss -0.38673368096351624 LR -0.4187261462211609 LKL 0

epoch 35900 loss -0.449032187461853 LR -0.48112648725509644 LKL 0.03209429606795311
82
epoch 35901 loss -0.45573556423187256 LR -0.4877401888370514 LKL 0.03200460970401764


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 35902 loss -0.4246698021888733 LR -0.4571041762828827 LKL 0.03243438899517059
epoch 35903 loss -0.4091908931732178 LR -0.44113850593566895 LKL 0.031947601586580276
epoch 35904 loss -0.38872507214546204 LR -0.42080259323120117 LKL 0.03207752853631973
epoch 35905 loss -0.40890973806381226 LR -0.44109684228897095 LKL 0.03218710795044899
epoch 35906 loss -0.4427553415298462 LR -0.4748745560646057 LKL 0.03211919963359833
epoch 35907 loss -0.4442659318447113 LR -0.4765075743198395 LKL 0.032241638749837875
epoch 35908 loss -0.366191565990448 LR -0.3982493281364441 LKL 0.03205776587128639
epoch 35909 loss -0.41942623257637024 LR -0.45162084698677063 LKL 0.03219461813569069
epoch 35910 loss -0.4671350419521332 LR -0.499309778213501 LKL 0.0321747250854969
epoch 35911 loss -0.3967060148715973 LR -0.42883729934692383 LKL 0.03213127329945564
epoch 35912 loss -0.3941013514995575 LR -0.4260769486427307 LKL 0.03197560831904411
epoch 35913 loss -0.4650496542453766 LR -0.49721938371658325 LKL 0.03

70
epoch 36001 loss -0.42804354429244995 LR -0.4600098729133606 LKL 0.03196633234620094


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 36002 loss -0.4437272548675537 LR -0.4760008454322815 LKL 0.032273586839437485
epoch 36003 loss -0.4305887520313263 LR -0.4627600908279419 LKL 0.032171327620744705
epoch 36004 loss -0.417208731174469 LR -0.4495799243450165 LKL 0.03237118199467659
epoch 36005 loss -0.38884419202804565 LR -0.42100095748901367 LKL 0.032156769186258316
epoch 36006 loss -0.49642395973205566 LR -0.5287132263183594 LKL 0.032289259135723114
epoch 36007 loss -0.4836747944355011 LR -0.5158472061157227 LKL 0.03217240795493126
epoch 36008 loss -0.4401698410511017 LR -0.4723183512687683 LKL 0.03214849904179573
epoch 36009 loss -0.4540095627307892 LR -0.4862203001976013 LKL 0.03221072629094124
epoch 36010 loss -0.445361465215683 LR -0.4774795174598694 LKL 0.0321180522441864
epoch 36011 loss -0.3568483293056488 LR -0.3888041377067566 LKL 0.03195579722523689
epoch 36012 loss -0.3774145841598511 LR -0.4095647633075714 LKL 0.03215017914772034
epoch 36013 loss -0.44245317578315735 LR -0.4746122360229492 LKL 0.03215

epoch 36100 loss -0.3877621293067932 LR -0.41973787546157837 LKL 0.031975749880075455
127
epoch 36101 loss -0.43285349011421204 LR -0.4651239514350891 LKL 0.032270461320877075


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 36102 loss -0.4031457304954529 LR -0.43533632159233093 LKL 0.03219059482216835
epoch 36103 loss -0.3765909969806671 LR -0.4086906611919403 LKL 0.032099660485982895
epoch 36104 loss -0.31966477632522583 LR -0.3517688512802124 LKL 0.03210407868027687
epoch 36105 loss -0.3541978895664215 LR -0.38625261187553406 LKL 0.03205471858382225
epoch 36106 loss -0.40116092562675476 LR -0.433363139629364 LKL 0.032202210277318954
epoch 36107 loss -0.4824109375476837 LR -0.5148084163665771 LKL 0.03239746764302254
epoch 36108 loss -0.45468243956565857 LR -0.4870852530002594 LKL 0.03240280970931053
epoch 36109 loss -0.3159577250480652 LR -0.3477301001548767 LKL 0.03177236393094063
epoch 36110 loss -0.43715789914131165 LR -0.46922987699508667 LKL 0.03207197040319443
epoch 36111 loss -0.4325941801071167 LR -0.46481579542160034 LKL 0.03222161903977394
epoch 36112 loss -0.4237935245037079 LR -0.45594072341918945 LKL 0.03214721009135246
epoch 36113 loss -0.4495431184768677 LR -0.4817037582397461 LKL 0.

115
epoch 36201 loss -0.362287700176239 LR -0.3945215046405792 LKL 0.032233793288469315


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 36202 loss -0.42569246888160706 LR -0.4580082893371582 LKL 0.03231583163142204
epoch 36203 loss -0.32278871536254883 LR -0.3547270894050598 LKL 0.03193836286664009
epoch 36204 loss -0.42246079444885254 LR -0.45475876331329346 LKL 0.032297953963279724
epoch 36205 loss -0.4388931393623352 LR -0.4709612727165222 LKL 0.032068148255348206
epoch 36206 loss -0.4192839562892914 LR -0.4515606164932251 LKL 0.032276660203933716
epoch 36207 loss -0.37562045454978943 LR -0.40778160095214844 LKL 0.032161157578229904
epoch 36208 loss -0.4626089334487915 LR -0.4949631989002228 LKL 0.03235427290201187
epoch 36209 loss -0.43892163038253784 LR -0.4710467457771301 LKL 0.03212510794401169
epoch 36210 loss -0.47342121601104736 LR -0.50576251745224 LKL 0.03234129399061203
epoch 36211 loss -0.36165377497673035 LR -0.3937133252620697 LKL 0.032059550285339355
epoch 36212 loss -0.41721680760383606 LR -0.4491627514362335 LKL 0.031945955008268356
epoch 36213 loss -0.40073198080062866 LR -0.4327651262283325 L

74
epoch 36301 loss -0.4735602140426636 LR -0.5059027671813965 LKL 0.032342568039894104


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 36302 loss -0.4135684370994568 LR -0.44585686922073364 LKL 0.03228844702243805
epoch 36303 loss -0.4343482255935669 LR -0.46666717529296875 LKL 0.03231894597411156
epoch 36304 loss -0.444044291973114 LR -0.4762057363986969 LKL 0.03216145560145378
epoch 36305 loss -0.4620506465435028 LR -0.49446824193000793 LKL 0.03241758421063423
epoch 36306 loss -0.3915059566497803 LR -0.42366427183151245 LKL 0.03215831518173218
epoch 36307 loss -0.47839635610580444 LR -0.5108255743980408 LKL 0.03242923319339752
epoch 36308 loss -0.3724306523799896 LR -0.40430065989494324 LKL 0.03186999633908272
epoch 36309 loss -0.3674485981464386 LR -0.39957189559936523 LKL 0.03212330862879753
epoch 36310 loss -0.393131285905838 LR -0.42515063285827637 LKL 0.032019346952438354
epoch 36311 loss -0.4732033610343933 LR -0.5056037306785583 LKL 0.032400358468294144
epoch 36312 loss -0.4297444224357605 LR -0.46174418926239014 LKL 0.031999774277210236
epoch 36313 loss -0.4174022078514099 LR -0.44958826899528503 LKL 0

56
epoch 36401 loss -0.37426719069480896 LR -0.4064966142177582 LKL 0.032229434698820114


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 36402 loss -0.4660499393939972 LR -0.4985377788543701 LKL 0.03248784318566322
epoch 36403 loss -0.4679489731788635 LR -0.5003010630607605 LKL 0.03235208988189697
epoch 36404 loss -0.4499094486236572 LR -0.4825093448162079 LKL 0.03259989246726036
epoch 36405 loss -0.4396725594997406 LR -0.47194766998291016 LKL 0.03227510675787926
epoch 36406 loss -0.36912766098976135 LR -0.4012022614479065 LKL 0.03207461163401604
epoch 36407 loss -0.4814668893814087 LR -0.5138683915138245 LKL 0.032401494681835175
epoch 36408 loss -0.42941510677337646 LR -0.46171340346336365 LKL 0.03229830414056778
epoch 36409 loss -0.4138108491897583 LR -0.4460810422897339 LKL 0.03227017819881439
epoch 36410 loss -0.40707623958587646 LR -0.43923255801200867 LKL 0.0321563296020031
epoch 36411 loss -0.4887654185295105 LR -0.5211960077285767 LKL 0.03243059664964676
epoch 36412 loss -0.43772000074386597 LR -0.47009241580963135 LKL 0.03237242251634598
epoch 36413 loss -0.4382128119468689 LR -0.47059088945388794 LKL 0.0

epoch 36500 loss -0.46374207735061646 LR -0.4961540699005127 LKL 0.03241199627518654
53


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 36501 loss -0.44117650389671326 LR -0.4733958840370178 LKL 0.03221938759088516
epoch 36502 loss -0.3860602378845215 LR -0.41814449429512024 LKL 0.03208427131175995
epoch 36503 loss -0.4846091568470001 LR -0.5168329477310181 LKL 0.03222380205988884
epoch 36504 loss -0.4394538402557373 LR -0.4719759523868561 LKL 0.03252209722995758
epoch 36505 loss -0.496744841337204 LR -0.5289871692657471 LKL 0.03224233165383339
epoch 36506 loss -0.42352861166000366 LR -0.45593780279159546 LKL 0.0324091874063015
epoch 36507 loss -0.43702006340026855 LR -0.46940794587135315 LKL 0.032387882471084595
epoch 36508 loss -0.43648090958595276 LR -0.4687751829624176 LKL 0.03229426592588425
epoch 36509 loss -0.4759644567966461 LR -0.5080686807632446 LKL 0.03210423141717911
epoch 36510 loss -0.3527127504348755 LR -0.38458460569381714 LKL 0.031871870160102844
epoch 36511 loss -0.45814740657806396 LR -0.49034643173217773 LKL 0.032199036329984665
epoch 36512 loss -0.36538973450660706 LR -0.3974558115005493 LKL 

epoch 36600 loss -0.42019617557525635 LR -0.452394962310791 LKL 0.03219878673553467
50


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 36601 loss -0.47908589243888855 LR -0.5115416049957275 LKL 0.03245571255683899
epoch 36602 loss -0.36438947916030884 LR -0.3967134356498718 LKL 0.03232395276427269
epoch 36603 loss -0.4166973829269409 LR -0.4489039182662964 LKL 0.03220652416348457
epoch 36604 loss -0.419788122177124 LR -0.4522409439086914 LKL 0.03245282173156738
epoch 36605 loss -0.4428201913833618 LR -0.4753636121749878 LKL 0.03254340589046478
epoch 36606 loss -0.458633154630661 LR -0.490893691778183 LKL 0.03226054087281227
epoch 36607 loss -0.39568984508514404 LR -0.42792826890945435 LKL 0.0322384238243103
epoch 36608 loss -0.34244245290756226 LR -0.37447720766067505 LKL 0.03203476965427399
epoch 36609 loss -0.44764813780784607 LR -0.48013806343078613 LKL 0.032489925622940063
epoch 36610 loss -0.408619225025177 LR -0.4408380091190338 LKL 0.032218798995018005
epoch 36611 loss -0.43239256739616394 LR -0.4647369682788849 LKL 0.032344408333301544
epoch 36612 loss -0.43200555443763733 LR -0.46434807777404785 LKL 0.0

epoch 36700 loss -0.4379158914089203 LR -0.4700391888618469 LKL 0.03212329000234604
40


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 36701 loss -0.40384846925735474 LR -0.4359472990036011 LKL 0.03209884464740753
epoch 36702 loss -0.4376765787601471 LR -0.46999478340148926 LKL 0.03231819346547127
epoch 36703 loss -0.46312421560287476 LR -0.495429128408432 LKL 0.03230491280555725
epoch 36704 loss -0.40079793334007263 LR -0.43276381492614746 LKL 0.03196587786078453
epoch 36705 loss -0.4338974952697754 LR -0.4662455916404724 LKL 0.032348085194826126
epoch 36706 loss -0.41271311044692993 LR -0.44503650069236755 LKL 0.03232337534427643
epoch 36707 loss -0.4517461359500885 LR -0.4843176603317261 LKL 0.032571520656347275
epoch 36708 loss -0.5091703534126282 LR -0.5413792133331299 LKL 0.0322088859975338
epoch 36709 loss -0.40542396903038025 LR -0.4377630352973938 LKL 0.032339077442884445
epoch 36710 loss -0.4279929995536804 LR -0.46028807759284973 LKL 0.03229506313800812
epoch 36711 loss -0.3747740685939789 LR -0.40698403120040894 LKL 0.03220996633172035
epoch 36712 loss -0.4640813171863556 LR -0.4965626001358032 LKL 0

epoch 36800 loss -0.4095611870288849 LR -0.44176775217056274 LKL 0.03220657631754875
56


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 36801 loss -0.4262685477733612 LR -0.45876839756965637 LKL 0.03249984234571457
epoch 36802 loss -0.37833136320114136 LR -0.41041383147239685 LKL 0.032082464545965195
epoch 36803 loss -0.41365835070610046 LR -0.4456673860549927 LKL 0.03200903907418251
epoch 36804 loss -0.446672648191452 LR -0.47889208793640137 LKL 0.03221943601965904
epoch 36805 loss -0.38898855447769165 LR -0.4211357831954956 LKL 0.03214721754193306
epoch 36806 loss -0.34187036752700806 LR -0.37397730350494385 LKL 0.032106950879096985
epoch 36807 loss -0.45864346623420715 LR -0.4911002516746521 LKL 0.03245679661631584
epoch 36808 loss -0.4888147711753845 LR -0.521283745765686 LKL 0.03246896713972092
epoch 36809 loss -0.45948463678359985 LR -0.49189263582229614 LKL 0.03240801393985748
epoch 36810 loss -0.46895039081573486 LR -0.5014394521713257 LKL 0.03248905390501022
epoch 36811 loss -0.4289153218269348 LR -0.4612470269203186 LKL 0.032331716269254684
epoch 36812 loss -0.42278993129730225 LR -0.4550071954727173 LK

epoch 36900 loss -0.3387462794780731 LR -0.37083345651626587 LKL 0.03208716958761215
68


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 36901 loss -0.4031308889389038 LR -0.43540769815444946 LKL 0.03227679803967476
epoch 36902 loss -0.4116630554199219 LR -0.44403329491615295 LKL 0.03237023949623108
epoch 36903 loss -0.4375726580619812 LR -0.4701034426689148 LKL 0.03253079950809479
epoch 36904 loss -0.3974878787994385 LR -0.42954427003860474 LKL 0.03205638378858566
epoch 36905 loss -0.4515934884548187 LR -0.4838927388191223 LKL 0.03229925036430359
epoch 36906 loss -0.4140574336051941 LR -0.4464096426963806 LKL 0.03235219791531563
epoch 36907 loss -0.4452563524246216 LR -0.4775271415710449 LKL 0.03227079659700394
epoch 36908 loss -0.4167000651359558 LR -0.4491705298423767 LKL 0.0324704684317112
epoch 36909 loss -0.4496103823184967 LR -0.48198479413986206 LKL 0.032374415546655655
epoch 36910 loss -0.3767109811306 LR -0.4090072512626648 LKL 0.03229627013206482
epoch 36911 loss -0.3677977919578552 LR -0.4000106751918793 LKL 0.03221289440989494
epoch 36912 loss -0.4359886646270752 LR -0.4684334695339203 LKL 0.032444790

epoch 37000 loss -0.4407059848308563 LR -0.47330203652381897 LKL 0.03259606286883354
70


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 37001 loss -0.4062747657299042 LR -0.4383343756198883 LKL 0.032059598714113235
epoch 37002 loss -0.47888872027397156 LR -0.5113994479179382 LKL 0.032510723918676376
epoch 37003 loss -0.4234469532966614 LR -0.45590031147003174 LKL 0.03245336189866066
epoch 37004 loss -0.4514310657978058 LR -0.4838773012161255 LKL 0.0324462354183197
epoch 37005 loss -0.40864503383636475 LR -0.44089341163635254 LKL 0.032248370349407196
epoch 37006 loss -0.40792062878608704 LR -0.44006991386413574 LKL 0.03214928135275841
epoch 37007 loss -0.39861536026000977 LR -0.4309643805027008 LKL 0.03234901279211044
epoch 37008 loss -0.44964075088500977 LR -0.48211905360221863 LKL 0.03247831389307976
epoch 37009 loss -0.3890424370765686 LR -0.4214251935482025 LKL 0.03238276392221451
epoch 37010 loss -0.4640950560569763 LR -0.49642616510391235 LKL 0.032331112772226334
epoch 37011 loss -0.4728592336177826 LR -0.5053383111953735 LKL 0.03247908875346184
epoch 37012 loss -0.41212356090545654 LR -0.4446084797382355 LK

epoch 37100 loss -0.39860799908638 LR -0.43101128935813904 LKL 0.032403286546468735
45


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 37101 loss -0.39938998222351074 LR -0.4316297769546509 LKL 0.03223977982997894
epoch 37102 loss -0.38499414920806885 LR -0.41741999983787537 LKL 0.032425835728645325
epoch 37103 loss -0.423339307308197 LR -0.45575058460235596 LKL 0.03241126611828804
epoch 37104 loss -0.39995020627975464 LR -0.43231791257858276 LKL 0.032367706298828125
epoch 37105 loss -0.48675641417503357 LR -0.5189691185951233 LKL 0.03221270814538002
epoch 37106 loss -0.4349075257778168 LR -0.46732497215270996 LKL 0.03241745010018349
epoch 37107 loss -0.3892325758934021 LR -0.4216253459453583 LKL 0.03239278122782707
epoch 37108 loss -0.44153010845184326 LR -0.47374463081359863 LKL 0.032214514911174774
epoch 37109 loss -0.439734548330307 LR -0.47206634283065796 LKL 0.032331790775060654
epoch 37110 loss -0.4306609034538269 LR -0.46303364634513855 LKL 0.03237275779247284
epoch 37111 loss -0.4004000425338745 LR -0.43257826566696167 LKL 0.032178208231925964
epoch 37112 loss -0.4097951650619507 LR -0.44199860095977783

epoch 37200 loss -0.4061688482761383 LR -0.43851497769355774 LKL 0.03234613314270973
79


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 37201 loss -0.49832892417907715 LR -0.5310369729995728 LKL 0.03270803764462471
epoch 37202 loss -0.4013468325138092 LR -0.43366849422454834 LKL 0.03232167288661003
epoch 37203 loss -0.3639516532421112 LR -0.3960643410682678 LKL 0.032112687826156616
epoch 37204 loss -0.36938029527664185 LR -0.4017963409423828 LKL 0.032416053116321564
epoch 37205 loss -0.45470958948135376 LR -0.48736950755119324 LKL 0.032659921795129776
epoch 37206 loss -0.37681907415390015 LR -0.40896573662757874 LKL 0.03214665874838829
epoch 37207 loss -0.46182066202163696 LR -0.49417778849601746 LKL 0.032357119023799896
epoch 37208 loss -0.33530497550964355 LR -0.3673447072505951 LKL 0.03203973546624184
epoch 37209 loss -0.3959904909133911 LR -0.42832931876182556 LKL 0.03233882039785385
epoch 37210 loss -0.392422080039978 LR -0.42461851239204407 LKL 0.032196447253227234
epoch 37211 loss -0.4800429344177246 LR -0.5124872326850891 LKL 0.03244428709149361
epoch 37212 loss -0.44107872247695923 LR -0.4734189212322235

epoch 37299 loss -0.4320298433303833 LR -0.4644775092601776 LKL 0.03244766965508461
epoch 37300 loss -0.36967575550079346 LR -0.40201860666275024 LKL 0.03234283998608589
55
epoch 37301 loss -0.45585882663726807 LR -0.48821309208869934 LKL 0.03235427662730217


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 37302 loss -0.43709859251976013 LR -0.4696320593357086 LKL 0.03253346309065819
epoch 37303 loss -0.39295676350593567 LR -0.42544233798980713 LKL 0.03248557075858116
epoch 37304 loss -0.42423203587532043 LR -0.45676809549331665 LKL 0.03253607079386711
epoch 37305 loss -0.380778431892395 LR -0.4128971993923187 LKL 0.03211876004934311
epoch 37306 loss -0.48409634828567505 LR -0.5166308879852295 LKL 0.03253452852368355
epoch 37307 loss -0.47179654240608215 LR -0.5040615797042847 LKL 0.03226502612233162
epoch 37308 loss -0.45852044224739075 LR -0.49086281657218933 LKL 0.032342374324798584
epoch 37309 loss -0.40485113859176636 LR -0.4371461272239685 LKL 0.03229497745633125
epoch 37310 loss -0.4417041540145874 LR -0.4742220640182495 LKL 0.032517895102500916
epoch 37311 loss -0.3865605592727661 LR -0.41874366998672485 LKL 0.03218310698866844
epoch 37312 loss -0.32828933000564575 LR -0.36044129729270935 LKL 0.032151952385902405
epoch 37313 loss -0.4205356538295746 LR -0.45255523920059204 

66
epoch 37401 loss -0.3820027709007263 LR -0.4142799973487854 LKL 0.03227723389863968


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 37402 loss -0.4191955626010895 LR -0.45131662487983704 LKL 0.03212106600403786
epoch 37403 loss -0.34070587158203125 LR -0.37296396493911743 LKL 0.03225808963179588
epoch 37404 loss -0.42111077904701233 LR -0.4537152647972107 LKL 0.03260448947548866
epoch 37405 loss -0.4437655806541443 LR -0.4763641357421875 LKL 0.032598547637462616
epoch 37406 loss -0.40291669964790344 LR -0.43514284491539 LKL 0.032226141542196274
epoch 37407 loss -0.4410538077354431 LR -0.47357577085494995 LKL 0.032521963119506836
epoch 37408 loss -0.4819139540195465 LR -0.5144283771514893 LKL 0.03251441940665245
epoch 37409 loss -0.44021695852279663 LR -0.4725611209869385 LKL 0.032344166189432144
epoch 37410 loss -0.45922136306762695 LR -0.49193626642227173 LKL 0.03271491825580597
epoch 37411 loss -0.40497058629989624 LR -0.4374539852142334 LKL 0.03248341381549835
epoch 37412 loss -0.398027628660202 LR -0.43039214611053467 LKL 0.032364509999752045
epoch 37413 loss -0.38078993558883667 LR -0.4129781126976013 LK

48
epoch 37501 loss -0.4091438055038452 LR -0.44145306944847107 LKL 0.03230925276875496


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 37502 loss -0.3671894967556 LR -0.39944255352020264 LKL 0.032253067940473557
epoch 37503 loss -0.40778517723083496 LR -0.4401978552341461 LKL 0.032412681728601456
epoch 37504 loss -0.38535135984420776 LR -0.41783607006073 LKL 0.032484717667102814
epoch 37505 loss -0.43369507789611816 LR -0.466156542301178 LKL 0.03246146813035011
epoch 37506 loss -0.4435473680496216 LR -0.4762669801712036 LKL 0.03271959722042084
epoch 37507 loss -0.447925329208374 LR -0.4803870916366577 LKL 0.032461751252412796
epoch 37508 loss -0.4749571979045868 LR -0.5074971318244934 LKL 0.03253992274403572
epoch 37509 loss -0.43056172132492065 LR -0.4629899561405182 LKL 0.03242824599146843
epoch 37510 loss -0.4419950246810913 LR -0.47456347942352295 LKL 0.03256845846772194
epoch 37511 loss -0.37154754996299744 LR -0.4039257764816284 LKL 0.03237823024392128
epoch 37512 loss -0.3764595091342926 LR -0.40859347581863403 LKL 0.032133977860212326
epoch 37513 loss -0.3818739652633667 LR -0.41423171758651733 LKL 0.032

39
epoch 37601 loss -0.4042852222919464 LR -0.4364462196826935 LKL 0.03216098994016647


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 37602 loss -0.4317910969257355 LR -0.46404939889907837 LKL 0.032258305698633194
epoch 37603 loss -0.4265090227127075 LR -0.4589490294456482 LKL 0.03244001790881157
epoch 37604 loss -0.4263840913772583 LR -0.4591720402240753 LKL 0.03278793394565582
epoch 37605 loss -0.38395795226097107 LR -0.4166291356086731 LKL 0.03267117962241173
epoch 37606 loss -0.47965797781944275 LR -0.5120676755905151 LKL 0.03240969032049179
epoch 37607 loss -0.41944047808647156 LR -0.45187973976135254 LKL 0.03243926540017128
epoch 37608 loss -0.48557838797569275 LR -0.518096923828125 LKL 0.03251853212714195
epoch 37609 loss -0.39802443981170654 LR -0.43048015236854553 LKL 0.03245572745800018
epoch 37610 loss -0.4084775149822235 LR -0.44090536236763 LKL 0.0324278362095356
epoch 37611 loss -0.46990081667900085 LR -0.5023426413536072 LKL 0.03244181349873543
epoch 37612 loss -0.44036024808883667 LR -0.472621887922287 LKL 0.03226165473461151
epoch 37613 loss -0.4430360198020935 LR -0.47546082735061646 LKL 0.032

39
epoch 37701 loss -0.4106123447418213 LR -0.442963570356369 LKL 0.03235122561454773


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 37702 loss -0.45214688777923584 LR -0.4847462773323059 LKL 0.03259940445423126
epoch 37703 loss -0.44306886196136475 LR -0.47564011812210083 LKL 0.03257124125957489
epoch 37704 loss -0.3559531569480896 LR -0.3880855441093445 LKL 0.03213240206241608
epoch 37705 loss -0.4517330825328827 LR -0.48431867361068726 LKL 0.03258558735251427
epoch 37706 loss -0.41264113783836365 LR -0.4450671076774597 LKL 0.032425977289676666
epoch 37707 loss -0.5006505250930786 LR -0.533236026763916 LKL 0.032585509121418
epoch 37708 loss -0.47445210814476013 LR -0.506873369216919 LKL 0.03242126852273941
epoch 37709 loss -0.4094122350215912 LR -0.44204211235046387 LKL 0.032629866153001785
epoch 37710 loss -0.4068285822868347 LR -0.4393828511238098 LKL 0.03255428001284599
epoch 37711 loss -0.43256378173828125 LR -0.46528360247612 LKL 0.03271980583667755
epoch 37712 loss -0.41006505489349365 LR -0.44268056750297546 LKL 0.03261550888419151
epoch 37713 loss -0.3498978614807129 LR -0.38201791048049927 LKL 0.032

epoch 37800 loss -0.388367235660553 LR -0.42058658599853516 LKL 0.032219335436820984
56


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 37801 loss -0.4286952614784241 LR -0.46099790930747986 LKL 0.03230264410376549
epoch 37802 loss -0.4108703136444092 LR -0.4431925117969513 LKL 0.032322198152542114
epoch 37803 loss -0.4435499310493469 LR -0.47616419196128845 LKL 0.03261424973607063
epoch 37804 loss -0.36134403944015503 LR -0.3935675024986267 LKL 0.032223451882600784
epoch 37805 loss -0.40586453676223755 LR -0.4380930960178375 LKL 0.03222855180501938
epoch 37806 loss -0.4283999800682068 LR -0.4609512686729431 LKL 0.03255128115415573
epoch 37807 loss -0.46922263503074646 LR -0.5016283392906189 LKL 0.03240571543574333
epoch 37808 loss -0.39009782671928406 LR -0.4224892854690552 LKL 0.03239145129919052
epoch 37809 loss -0.443766713142395 LR -0.4763496518135071 LKL 0.03258293867111206
epoch 37810 loss -0.43274930119514465 LR -0.4654579162597656 LKL 0.03270862251520157
epoch 37811 loss -0.4220503270626068 LR -0.45429524779319763 LKL 0.03224491328001022
epoch 37812 loss -0.44737276434898376 LR -0.4798542261123657 LKL 0.

epoch 37900 loss -0.38529372215270996 LR -0.41781508922576904 LKL 0.03252135589718819
109
epoch 37901 loss -0.4975849390029907 LR -0.5303034782409668 LKL 0.03271854668855667


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 37902 loss -0.38875871896743774 LR -0.4212055206298828 LKL 0.03244679421186447
epoch 37903 loss -0.47995051741600037 LR -0.5126392841339111 LKL 0.032688774168491364
epoch 37904 loss -0.4184541404247284 LR -0.4508660137653351 LKL 0.032411862164735794
epoch 37905 loss -0.3881429433822632 LR -0.4205329716205597 LKL 0.03239001706242561
epoch 37906 loss -0.4498613774776459 LR -0.48221397399902344 LKL 0.032352592796087265
epoch 37907 loss -0.4500274658203125 LR -0.48272019624710083 LKL 0.03269273042678833
epoch 37908 loss -0.4560091495513916 LR -0.4884639084339142 LKL 0.032454755157232285
epoch 37909 loss -0.502798318862915 LR -0.535429060459137 LKL 0.032630737870931625
epoch 37910 loss -0.5408385396003723 LR -0.5736006498336792 LKL 0.03276211768388748
epoch 37911 loss -0.43441906571388245 LR -0.46687862277030945 LKL 0.032459549605846405
epoch 37912 loss -0.3794679045677185 LR -0.41182905435562134 LKL 0.032361146062612534
epoch 37913 loss -0.39903751015663147 LR -0.4314892292022705 LKL

56
epoch 38001 loss -0.44243529438972473 LR -0.4749866724014282 LKL 0.03255137801170349


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 38002 loss -0.3887172341346741 LR -0.42128056287765503 LKL 0.032563336193561554
epoch 38003 loss -0.5145702362060547 LR -0.5473788976669312 LKL 0.032808635383844376
epoch 38004 loss -0.4138162136077881 LR -0.44618043303489685 LKL 0.032364215701818466
epoch 38005 loss -0.4853629171848297 LR -0.5181460976600647 LKL 0.03278319165110588
epoch 38006 loss -0.4608546793460846 LR -0.493621826171875 LKL 0.03276714310050011
epoch 38007 loss -0.40678635239601135 LR -0.43915021419525146 LKL 0.03236385062336922
epoch 38008 loss -0.39092326164245605 LR -0.42316389083862305 LKL 0.03224063292145729
epoch 38009 loss -0.45705509185791016 LR -0.48952192068099976 LKL 0.0324668325483799
epoch 38010 loss -0.4735216796398163 LR -0.5062382221221924 LKL 0.0327165424823761
epoch 38011 loss -0.467068612575531 LR -0.49944716691970825 LKL 0.03237854316830635
epoch 38012 loss -0.4290216863155365 LR -0.46119219064712524 LKL 0.03217051178216934
epoch 38013 loss -0.4108542203903198 LR -0.44326838850975037 LKL 0.

epoch 38100 loss -0.4318545162677765 LR -0.4642297923564911 LKL 0.032375264912843704
56


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 38101 loss -0.4308784604072571 LR -0.46349555253982544 LKL 0.032617080956697464
epoch 38102 loss -0.47842302918434143 LR -0.5110018253326416 LKL 0.032578807324171066
epoch 38103 loss -0.4676603674888611 LR -0.5002478361129761 LKL 0.03258747607469559
epoch 38104 loss -0.31918641924858093 LR -0.3515905439853668 LKL 0.03240412101149559
epoch 38105 loss -0.42805466055870056 LR -0.46070408821105957 LKL 0.03264941647648811
epoch 38106 loss -0.40577536821365356 LR -0.43837445974349976 LKL 0.032599076628685
epoch 38107 loss -0.40540611743927 LR -0.43794041872024536 LKL 0.032534316182136536
epoch 38108 loss -0.453721284866333 LR -0.486409068107605 LKL 0.032687775790691376
epoch 38109 loss -0.43309950828552246 LR -0.46568673849105835 LKL 0.03258723020553589
epoch 38110 loss -0.41216564178466797 LR -0.4444921016693115 LKL 0.03232644498348236
epoch 38111 loss -0.4537004828453064 LR -0.48627665638923645 LKL 0.03257618844509125
epoch 38112 loss -0.4734407365322113 LR -0.5061531066894531 LKL 0.

epoch 38200 loss -0.43178701400756836 LR -0.46432968974113464 LKL 0.03254268318414688
72


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 38201 loss -0.4589143395423889 LR -0.4913138449192047 LKL 0.032399509102106094
epoch 38202 loss -0.39359050989151 LR -0.42592865228652954 LKL 0.032338134944438934
epoch 38203 loss -0.3931339979171753 LR -0.4256153106689453 LKL 0.03248131275177002
epoch 38204 loss -0.34842678904533386 LR -0.3809315264225006 LKL 0.03250472992658615
epoch 38205 loss -0.3639850616455078 LR -0.39655160903930664 LKL 0.032566558569669724
epoch 38206 loss -0.44848766922950745 LR -0.4810529053211212 LKL 0.03256523236632347
epoch 38207 loss -0.4456925690174103 LR -0.47823187708854675 LKL 0.03253930062055588
epoch 38208 loss -0.43644753098487854 LR -0.4689001441001892 LKL 0.032452624291181564
epoch 38209 loss -0.5066095590591431 LR -0.5394107699394226 LKL 0.03280118107795715
epoch 38210 loss -0.45495787262916565 LR -0.48772406578063965 LKL 0.032766200602054596
epoch 38211 loss -0.41501012444496155 LR -0.44749635457992554 LKL 0.032486237585544586
epoch 38212 loss -0.461904376745224 LR -0.49452054500579834 LK

epoch 38299 loss -0.38506102561950684 LR -0.4175876975059509 LKL 0.032526686787605286
epoch 38300 loss -0.46583518385887146 LR -0.498514860868454 LKL 0.03267967328429222
51
epoch 38301 loss -0.4130330979824066 LR -0.4455035328865051 LKL 0.032470446079969406


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 38302 loss -0.45896202325820923 LR -0.4913565516471863 LKL 0.032394517213106155
epoch 38303 loss -0.4019143879413605 LR -0.43455857038497925 LKL 0.03264417126774788
epoch 38304 loss -0.4197010099887848 LR -0.4520478844642639 LKL 0.032346874475479126
epoch 38305 loss -0.4148412346839905 LR -0.4471661150455475 LKL 0.03232486918568611
epoch 38306 loss -0.45460718870162964 LR -0.4872174859046936 LKL 0.032610297203063965
epoch 38307 loss -0.4936293363571167 LR -0.5264196991920471 LKL 0.03279036656022072
epoch 38308 loss -0.4919929802417755 LR -0.5247922539710999 LKL 0.032799266278743744
epoch 38309 loss -0.40167924761772156 LR -0.4340471029281616 LKL 0.03236786648631096
epoch 38310 loss -0.44771629571914673 LR -0.48037323355674744 LKL 0.032656945288181305
epoch 38311 loss -0.4398196339607239 LR -0.472348153591156 LKL 0.03252851963043213
epoch 38312 loss -0.45078176259994507 LR -0.48333683609962463 LKL 0.03255505859851837
epoch 38313 loss -0.44292378425598145 LR -0.47554779052734375 LK

80
epoch 38401 loss -0.4169432520866394 LR -0.44950515031814575 LKL 0.03256191313266754


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 38402 loss -0.42863234877586365 LR -0.4610934853553772 LKL 0.03246113285422325
epoch 38403 loss -0.45347535610198975 LR -0.4861525595188141 LKL 0.03267720341682434
epoch 38404 loss -0.4755135774612427 LR -0.5081908106803894 LKL 0.03267722576856613
epoch 38405 loss -0.42666536569595337 LR -0.45927488803863525 LKL 0.032609518617391586
epoch 38406 loss -0.4001828730106354 LR -0.4326527714729309 LKL 0.03246990591287613
epoch 38407 loss -0.43590956926345825 LR -0.46859490871429443 LKL 0.03268533572554588
epoch 38408 loss -0.4648366868495941 LR -0.4973110556602478 LKL 0.03247436136007309
epoch 38409 loss -0.48291370272636414 LR -0.5156068801879883 LKL 0.032693177461624146
epoch 38410 loss -0.398331880569458 LR -0.4305918216705322 LKL 0.03225993737578392
epoch 38411 loss -0.43815743923187256 LR -0.47071537375450134 LKL 0.03255794197320938
epoch 38412 loss -0.4688848853111267 LR -0.5016065835952759 LKL 0.032721683382987976
epoch 38413 loss -0.42180705070495605 LR -0.4542878568172455 LKL 

99
epoch 38501 loss -0.4360446333885193 LR -0.46859797835350037 LKL 0.03255333751440048


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 38502 loss -0.46503061056137085 LR -0.497639000415802 LKL 0.03260837867856026
epoch 38503 loss -0.37240153551101685 LR -0.4048200845718384 LKL 0.03241854906082153
epoch 38504 loss -0.5508925318717957 LR -0.5836004614830017 LKL 0.03270794823765755
epoch 38505 loss -0.4729171395301819 LR -0.5057394504547119 LKL 0.03282232582569122
epoch 38506 loss -0.420750230550766 LR -0.45311349630355835 LKL 0.032363273203372955
epoch 38507 loss -0.4754217863082886 LR -0.5079175233840942 LKL 0.032495733350515366
epoch 38508 loss -0.4721265733242035 LR -0.5049089789390564 LKL 0.03278239443898201
epoch 38509 loss -0.37850528955459595 LR -0.4109324514865875 LKL 0.03242714703083038
epoch 38510 loss -0.417026162147522 LR -0.4496821463108063 LKL 0.0326559878885746
epoch 38511 loss -0.43755224347114563 LR -0.4701426327228546 LKL 0.032590385526418686
epoch 38512 loss -0.4610922932624817 LR -0.4936688542366028 LKL 0.0325765460729599
epoch 38513 loss -0.45011118054389954 LR -0.4827728569507599 LKL 0.032661

85
epoch 38601 loss -0.40112459659576416 LR -0.4335671067237854 LKL 0.03244251757860184


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 38602 loss -0.43003058433532715 LR -0.46260619163513184 LKL 0.03257561847567558
epoch 38603 loss -0.519640326499939 LR -0.5523388981819153 LKL 0.032698582857847214
epoch 38604 loss -0.41221436858177185 LR -0.4448806047439575 LKL 0.032666243612766266
epoch 38605 loss -0.4311582148075104 LR -0.46376705169677734 LKL 0.03260883688926697
epoch 38606 loss -0.4558488130569458 LR -0.48854878544807434 LKL 0.032699957489967346
epoch 38607 loss -0.40559878945350647 LR -0.4378490746021271 LKL 0.03225028142333031
epoch 38608 loss -0.4263498783111572 LR -0.45890048146247864 LKL 0.03255060687661171
epoch 38609 loss -0.4597959518432617 LR -0.49241402745246887 LKL 0.03261809051036835
epoch 38610 loss -0.4808579981327057 LR -0.5134570598602295 LKL 0.03259905055165291
epoch 38611 loss -0.43323221802711487 LR -0.46577703952789307 LKL 0.0325448215007782
epoch 38612 loss -0.42969417572021484 LR -0.4625101089477539 LKL 0.032815948128700256
epoch 38613 loss -0.45749878883361816 LR -0.4901180565357208 LK

128
epoch 38701 loss -0.39683976769447327 LR -0.4292813539505005 LKL 0.03244159743189812
epoch

  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


 38702 loss -0.41757315397262573 LR -0.45014965534210205 LKL 0.03257651627063751
epoch 38703 loss -0.44095924496650696 LR -0.4736178517341614 LKL 0.032658617943525314
epoch 38704 loss -0.4775632917881012 LR -0.5103290677070618 LKL 0.03276577219367027
epoch 38705 loss -0.5068325400352478 LR -0.5394682884216309 LKL 0.03263574838638306
epoch 38706 loss -0.4350757300853729 LR -0.4676611125469208 LKL 0.03258538991212845
epoch 38707 loss -0.3726796507835388 LR -0.4053078889846802 LKL 0.03262822702527046
epoch 38708 loss -0.4603535234928131 LR -0.493237167596817 LKL 0.0328836552798748
epoch 38709 loss -0.4236087203025818 LR -0.4561771750450134 LKL 0.032568447291851044
epoch 38710 loss -0.5020062327384949 LR -0.534940779209137 LKL 0.03293453902006149
epoch 38711 loss -0.3489849865436554 LR -0.38142815232276917 LKL 0.03244316950440407
epoch 38712 loss -0.41554173827171326 LR -0.44848763942718506 LKL 0.0329459123313427
epoch 38713 loss -0.45929813385009766 LR -0.49206990003585815 LKL 0.032771766

58
epoch 38801 loss -0.3880685567855835 LR -0.42036131024360657 LKL 0.03229273855686188


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 38802 loss -0.29107666015625 LR -0.3235672414302826 LKL 0.0324905663728714
epoch 38803 loss -0.4546152949333191 LR -0.4871949553489685 LKL 0.03257967159152031
epoch 38804 loss -0.5015227794647217 LR -0.5340844988822937 LKL 0.03256171569228172
epoch 38805 loss -0.43401238322257996 LR -0.4668118357658386 LKL 0.03279946371912956
epoch 38806 loss -0.4215810000896454 LR -0.4541466534137726 LKL 0.032565660774707794
epoch 38807 loss -0.46897393465042114 LR -0.5016875267028809 LKL 0.03271360322833061
epoch 38808 loss -0.38581565022468567 LR -0.4181221127510071 LKL 0.032306455075740814
epoch 38809 loss -0.3821776509284973 LR -0.4147510230541229 LKL 0.032573360949754715
epoch 38810 loss -0.4686228632926941 LR -0.501604437828064 LKL 0.03298158198595047
epoch 38811 loss -0.4818129241466522 LR -0.514593780040741 LKL 0.03278086706995964
epoch 38812 loss -0.4243957996368408 LR -0.45696768164634705 LKL 0.032571882009506226
epoch 38813 loss -0.4139632284641266 LR -0.44644463062286377 LKL 0.032481

epoch 38900 loss -0.4402367174625397 LR -0.47269511222839355 LKL 0.03245839849114418
62
epoch 38901 loss -0.49527955055236816 LR -0.528083860874176 LKL 0.032804299145936966


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 38902 loss -0.47014927864074707 LR -0.5027576088905334 LKL 0.03260832652449608
epoch 38903 loss -0.48568350076675415 LR -0.5182721018791199 LKL 0.032588597387075424
epoch 38904 loss -0.4034559428691864 LR -0.4360221326351166 LKL 0.03256619721651077
epoch 38905 loss -0.40126675367355347 LR -0.4338894188404083 LKL 0.032622672617435455
epoch 38906 loss -0.4789372384548187 LR -0.5118024945259094 LKL 0.032865259796381
epoch 38907 loss -0.402351051568985 LR -0.4349561333656311 LKL 0.03260508179664612
epoch 38908 loss -0.48268795013427734 LR -0.5155605673789978 LKL 0.03287261351943016
epoch 38909 loss -0.44254669547080994 LR -0.4752027988433838 LKL 0.03265609219670296
epoch 38910 loss -0.520314633846283 LR -0.5534045696258545 LKL 0.033089909702539444
epoch 38911 loss -0.385271281003952 LR -0.4178663194179535 LKL 0.03259504586458206
epoch 38912 loss -0.36587557196617126 LR -0.3982548415660858 LKL 0.03237927332520485
epoch 38913 loss -0.42363986372947693 LR -0.4563314616680145 LKL 0.03269

77
epoch 39001 loss -0.43968185782432556 LR -0.4724368751049042 LKL 0.03275502100586891


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 39002 loss -0.354652464389801 LR -0.38720712065696716 LKL 0.032554641366004944
epoch 39003 loss -0.40242114663124084 LR -0.4348568916320801 LKL 0.03243574872612953
epoch 39004 loss -0.4915035665035248 LR -0.5243653059005737 LKL 0.03286173194646835
epoch 39005 loss -0.38838592171669006 LR -0.4208489656448364 LKL 0.03246305510401726
epoch 39006 loss -0.4349195957183838 LR -0.46758484840393066 LKL 0.03266526758670807
epoch 39007 loss -0.42013320326805115 LR -0.4527831971645355 LKL 0.03265000134706497
epoch 39008 loss -0.36002594232559204 LR -0.39254069328308105 LKL 0.032514747232198715
epoch 39009 loss -0.4371040463447571 LR -0.4697667062282562 LKL 0.03266267105937004
epoch 39010 loss -0.494280070066452 LR -0.5269399881362915 LKL 0.032659925520420074
epoch 39011 loss -0.4300719201564789 LR -0.46278637647628784 LKL 0.03271444886922836
epoch 39012 loss -0.46122264862060547 LR -0.49418506026268005 LKL 0.032962411642074585
epoch 39013 loss -0.4854598939418793 LR -0.5183647274971008 LKL 

59
epoch 39101 loss -0.43960216641426086 LR -0.4723975956439972 LKL 0.03279542177915573


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 39102 loss -0.36446064710617065 LR -0.3971230089664459 LKL 0.03266235813498497
epoch 39103 loss -0.41045141220092773 LR -0.44300273060798645 LKL 0.03255132585763931
epoch 39104 loss -0.4244385063648224 LR -0.4571705758571625 LKL 0.03273206949234009
epoch 39105 loss -0.4157399535179138 LR -0.4482484459877014 LKL 0.03250850737094879
epoch 39106 loss -0.36657199263572693 LR -0.399069607257843 LKL 0.03249761089682579
epoch 39107 loss -0.42042726278305054 LR -0.4532688558101654 LKL 0.03284158557653427
epoch 39108 loss -0.4556887447834015 LR -0.4885188341140747 LKL 0.03283007815480232
epoch 39109 loss -0.43628302216529846 LR -0.4690457284450531 LKL 0.03276270255446434
epoch 39110 loss -0.4929523766040802 LR -0.525920033454895 LKL 0.03296764940023422
epoch 39111 loss -0.3582347631454468 LR -0.3906313180923462 LKL 0.032396554946899414
epoch 39112 loss -0.4383229613304138 LR -0.470964252948761 LKL 0.03264128789305687
epoch 39113 loss -0.5045123100280762 LR -0.5371864438056946 LKL 0.032674

120
epoch 39201 loss -0.42309141159057617 LR -0.45579999685287476 LKL 0.03270859643816948


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 39202 loss -0.3995758891105652 LR -0.4322250187397003 LKL 0.032649122178554535
epoch 39203 loss -0.45214277505874634 LR -0.4849485158920288 LKL 0.03280574455857277
epoch 39204 loss -0.45184123516082764 LR -0.48449018597602844 LKL 0.03264893591403961
epoch 39205 loss -0.4384048581123352 LR -0.4711841940879822 LKL 0.032779332250356674
epoch 39206 loss -0.3935549855232239 LR -0.42629146575927734 LKL 0.03273647278547287
epoch 39207 loss -0.35667070746421814 LR -0.3891066610813141 LKL 0.032435961067676544
epoch 39208 loss -0.4060041308403015 LR -0.43857353925704956 LKL 0.03256940096616745
epoch 39209 loss -0.4127437174320221 LR -0.445517897605896 LKL 0.032774168998003006
epoch 39210 loss -0.4412417709827423 LR -0.47389882802963257 LKL 0.03265704587101936
epoch 39211 loss -0.4223156273365021 LR -0.454688161611557 LKL 0.03237253800034523
epoch 39212 loss -0.45250824093818665 LR -0.48515385389328003 LKL 0.03264562413096428
epoch 39213 loss -0.4343281090259552 LR -0.46718665957450867 LKL 

56
epoch 39301 loss -0.4584736227989197 LR -0.4910954236984253 LKL 0.03262179344892502


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 39302 loss -0.4522063136100769 LR -0.48487722873687744 LKL 0.03267091140151024
epoch 39303 loss -0.42315673828125 LR -0.4558594226837158 LKL 0.03270269185304642
epoch 39304 loss -0.4764357805252075 LR -0.5093069672584534 LKL 0.03287117928266525
epoch 39305 loss -0.5236915349960327 LR -0.5565779209136963 LKL 0.03288641571998596
epoch 39306 loss -0.39173221588134766 LR -0.4244375228881836 LKL 0.032705310732126236
epoch 39307 loss -0.42455604672431946 LR -0.45749905705451965 LKL 0.0329429991543293
epoch 39308 loss -0.424441933631897 LR -0.45736008882522583 LKL 0.03291815146803856
epoch 39309 loss -0.39089497923851013 LR -0.4236069321632385 LKL 0.03271196037530899
epoch 39310 loss -0.37901848554611206 LR -0.4118778705596924 LKL 0.03285938501358032
epoch 39311 loss -0.4275128245353699 LR -0.4604048430919647 LKL 0.032892029732465744
epoch 39312 loss -0.46602416038513184 LR -0.49873849749565125 LKL 0.0327143520116806
epoch 39313 loss -0.44573697447776794 LR -0.4784018397331238 LKL 0.032

43
epoch 39401 loss -0.3958982527256012 LR -0.4285215139389038 LKL 0.03262326121330261


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 39402 loss -0.38891345262527466 LR -0.4215337932109833 LKL 0.03262032940983772
epoch 39403 loss -0.4023342728614807 LR -0.43496108055114746 LKL 0.03262679651379585
epoch 39404 loss -0.4890769422054291 LR -0.5218787789344788 LKL 0.032801833003759384
epoch 39405 loss -0.42958590388298035 LR -0.46203848719596863 LKL 0.03245258331298828
epoch 39406 loss -0.36820298433303833 LR -0.40084919333457947 LKL 0.032646194100379944
epoch 39407 loss -0.4684526026248932 LR -0.5012437701225281 LKL 0.03279117867350578
epoch 39408 loss -0.4156981110572815 LR -0.4486825466156006 LKL 0.03298443183302879
epoch 39409 loss -0.4064888060092926 LR -0.43931204080581665 LKL 0.03282322734594345
epoch 39410 loss -0.4238758385181427 LR -0.45669007301330566 LKL 0.03281422704458237
epoch 39411 loss -0.47230738401412964 LR -0.5048353672027588 LKL 0.03252797573804855
epoch 39412 loss -0.43665415048599243 LR -0.46954721212387085 LKL 0.03289305046200752
epoch 39413 loss -0.46316537261009216 LR -0.4959763288497925 LK

44
epoch 39501 loss -0.4735983610153198 LR -0.5062986612319946 LKL 0.032700296491384506


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 39502 loss -0.440216988325119 LR -0.4729316532611847 LKL 0.032714664936065674
epoch 39503 loss -0.4381217360496521 LR -0.4707579016685486 LKL 0.032636165618896484
epoch 39504 loss -0.4760807752609253 LR -0.5089948177337646 LKL 0.032914046198129654
epoch 39505 loss -0.42599982023239136 LR -0.45876580476760864 LKL 0.03276597335934639
epoch 39506 loss -0.4966718256473541 LR -0.5295939445495605 LKL 0.03292211890220642
epoch 39507 loss -0.4555812478065491 LR -0.48860085010528564 LKL 0.033019594848155975
epoch 39508 loss -0.5021893382072449 LR -0.5352146625518799 LKL 0.03302531689405441
epoch 39509 loss -0.4252393841743469 LR -0.45804911851882935 LKL 0.03280973061919212
epoch 39510 loss -0.4642109274864197 LR -0.49692976474761963 LKL 0.032718852162361145
epoch 39511 loss -0.45730242133140564 LR -0.48997098207473755 LKL 0.03266856446862221
epoch 39512 loss -0.4550453722476959 LR -0.48776301741600037 LKL 0.03271764516830444
epoch 39513 loss -0.3341742753982544 LR -0.3667447865009308 LKL 

49
epoch 39601 loss -0.4294215440750122 LR -0.4620583653450012 LKL 0.03263682872056961


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 39602 loss -0.4637313485145569 LR -0.4965224266052246 LKL 0.03279106691479683
epoch 39603 loss -0.4950442612171173 LR -0.5279025435447693 LKL 0.03285827487707138
epoch 39604 loss -0.4390854239463806 LR -0.4718824028968811 LKL 0.03279697522521019
epoch 39605 loss -0.4505613446235657 LR -0.48349031805992126 LKL 0.03292897343635559
epoch 39606 loss -0.5022254586219788 LR -0.5353211164474487 LKL 0.03309566155076027
epoch 39607 loss -0.49529221653938293 LR -0.5280731916427612 LKL 0.03278098627924919
epoch 39608 loss -0.4164614677429199 LR -0.44906777143478394 LKL 0.03260630741715431
epoch 39609 loss -0.44890186190605164 LR -0.4816528558731079 LKL 0.03275100514292717
epoch 39610 loss -0.36921244859695435 LR -0.40203070640563965 LKL 0.0328182652592659
epoch 39611 loss -0.45129650831222534 LR -0.4839652478694916 LKL 0.032668743282556534
epoch 39612 loss -0.4385833442211151 LR -0.4715780019760132 LKL 0.03299466520547867
epoch 39613 loss -0.4691392183303833 LR -0.5019950270652771 LKL 0.032

56
epoch 39701 loss -0.40907999873161316 LR -0.44177621603012085 LKL 0.032696209847927094
epoch

  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


 39702 loss -0.4357396364212036 LR -0.46841105818748474 LKL 0.03267143666744232
epoch 39703 loss -0.4285679757595062 LR -0.4613743722438812 LKL 0.032806385308504105
epoch 39704 loss -0.46339133381843567 LR -0.496050089597702 LKL 0.03265875205397606
epoch 39705 loss -0.4746426045894623 LR -0.5075895190238953 LKL 0.03294692188501358
epoch 39706 loss -0.40366673469543457 LR -0.4361286759376526 LKL 0.032461948692798615
epoch 39707 loss -0.4027738869190216 LR -0.43560895323753357 LKL 0.03283507004380226
epoch 39708 loss -0.4267372488975525 LR -0.45955270528793335 LKL 0.032815463840961456
epoch 39709 loss -0.4352598786354065 LR -0.46791666746139526 LKL 0.03265680372714996
epoch 39710 loss -0.5244126319885254 LR -0.5573716163635254 LKL 0.03295901045203209
epoch 39711 loss -0.41115885972976685 LR -0.4437919557094574 LKL 0.032633088529109955
epoch 39712 loss -0.4006361663341522 LR -0.43314242362976074 LKL 0.032506249845027924
epoch 39713 loss -0.43346092104911804 LR -0.46617406606674194 LKL 0.0

epoch 39800 loss -0.4435514807701111 LR -0.4766619801521301 LKL 0.03311050310730934
41


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 39801 loss -0.4864756464958191 LR -0.5193580389022827 LKL 0.03288240730762482
epoch 39802 loss -0.5089142322540283 LR -0.5418314933776855 LKL 0.032917290925979614
epoch 39803 loss -0.46720296144485474 LR -0.4998849630355835 LKL 0.032682016491889954
epoch 39804 loss -0.4435381293296814 LR -0.47648710012435913 LKL 0.03294896334409714
epoch 39805 loss -0.506278932094574 LR -0.5390733480453491 LKL 0.03279440104961395
epoch 39806 loss -0.44816380739212036 LR -0.48090824484825134 LKL 0.03274444863200188
epoch 39807 loss -0.4449999928474426 LR -0.4778216481208801 LKL 0.0328216478228569
epoch 39808 loss -0.42829951643943787 LR -0.4610227942466736 LKL 0.03272327780723572
epoch 39809 loss -0.4889310300350189 LR -0.5219103693962097 LKL 0.0329793356359005
epoch 39810 loss -0.4684722423553467 LR -0.5012124180793762 LKL 0.03274017572402954
epoch 39811 loss -0.4329086244106293 LR -0.4657425880432129 LKL 0.03283396735787392
epoch 39812 loss -0.4565773606300354 LR -0.48938465118408203 LKL 0.03280

epoch 39900 loss -0.45265141129493713 LR -0.4857180714607239 LKL 0.03306666389107704
97
epoch 39901 loss -0.43079960346221924 LR -0.4636198878288269 LKL 0.032820284366607666


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 39902 loss -0.43118178844451904 LR -0.46418455243110657 LKL 0.03300277516245842
epoch 39903 loss -0.45388323068618774 LR -0.4867279827594757 LKL 0.03284473717212677
epoch 39904 loss -0.34441208839416504 LR -0.376772940158844 LKL 0.03236086666584015
epoch 39905 loss -0.33203017711639404 LR -0.36469966173171997 LKL 0.03266949579119682
epoch 39906 loss -0.4577208459377289 LR -0.49065837264060974 LKL 0.032937534153461456
epoch 39907 loss -0.491525799036026 LR -0.5246043801307678 LKL 0.03307858854532242
epoch 39908 loss -0.4441925883293152 LR -0.47711730003356934 LKL 0.03292470425367355
epoch 39909 loss -0.38412803411483765 LR -0.41663801670074463 LKL 0.032509975135326385
epoch 39910 loss -0.4292248487472534 LR -0.4619559943675995 LKL 0.03273114189505577
epoch 39911 loss -0.4368720054626465 LR -0.4696827530860901 LKL 0.0328107625246048
epoch 39912 loss -0.4262012541294098 LR -0.4589540362358093 LKL 0.032752782106399536
epoch 39913 loss -0.45001161098480225 LR -0.4828891158103943 LKL 0

77
epoch 40001 loss -0.4718777537345886 LR -0.5046547055244446 LKL 0.03277693688869476


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 40002 loss -0.41411906480789185 LR -0.4467832148075104 LKL 0.03266415372490883
epoch 40003 loss -0.4933243989944458 LR -0.5263720750808716 LKL 0.03304768353700638
epoch 40004 loss -0.40852606296539307 LR -0.44124889373779297 LKL 0.0327228382229805
epoch 40005 loss -0.4405052065849304 LR -0.4733324944972992 LKL 0.03282730281352997
epoch 40006 loss -0.4165319800376892 LR -0.4494098424911499 LKL 0.032877858728170395
epoch 40007 loss -0.4673987329006195 LR -0.5003222823143005 LKL 0.03292354568839073
epoch 40008 loss -0.47586342692375183 LR -0.5087347626686096 LKL 0.03287133201956749
epoch 40009 loss -0.4244682192802429 LR -0.4571816027164459 LKL 0.0327133871614933
epoch 40010 loss -0.4604499340057373 LR -0.49332547187805176 LKL 0.032875534147024155
epoch 40011 loss -0.42340126633644104 LR -0.45638740062713623 LKL 0.03298613429069519
epoch 40012 loss -0.40482524037361145 LR -0.4374299943447113 LKL 0.03260476142168045
epoch 40013 loss -0.41084134578704834 LR -0.4437665045261383 LKL 0.0

78
epoch 40101 loss -0.4762132465839386 LR -0.5092882513999939 LKL 0.033075001090765
epoch

  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


 40102 loss -0.45644235610961914 LR -0.48906707763671875 LKL 0.032624728977680206
epoch 40103 loss -0.4725133776664734 LR -0.5053938627243042 LKL 0.032880477607250214
epoch 40104 loss -0.4154101014137268 LR -0.4480493664741516 LKL 0.032639279961586
epoch 40105 loss -0.3972683250904083 LR -0.43017464876174927 LKL 0.03290631249547005
epoch 40106 loss -0.4501132369041443 LR -0.48301416635513306 LKL 0.032900940626859665
epoch 40107 loss -0.34885984659194946 LR -0.38146114349365234 LKL 0.03260130062699318
epoch 40108 loss -0.476344496011734 LR -0.5092759728431702 LKL 0.03293146938085556
epoch 40109 loss -0.4520408809185028 LR -0.48513635993003845 LKL 0.03309548273682594
epoch 40110 loss -0.4838552176952362 LR -0.5167854428291321 LKL 0.032930225133895874
epoch 40111 loss -0.4034891724586487 LR -0.43640318512916565 LKL 0.03291401267051697
epoch 40112 loss -0.44352734088897705 LR -0.4764192998409271 LKL 0.032891955226659775
epoch 40113 loss -0.4139425754547119 LR -0.4468189775943756 LKL 0.0328

117
epoch 40201 loss -0.3424510657787323 LR -0.374927818775177 LKL 0.0324767641723156


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 40202 loss -0.49528181552886963 LR -0.5283231139183044 LKL 0.03304129093885422
epoch 40203 loss -0.43450257182121277 LR -0.4670608639717102 LKL 0.03255829960107803
epoch 40204 loss -0.44461584091186523 LR -0.4776514172554016 LKL 0.03303556144237518
epoch 40205 loss -0.42348068952560425 LR -0.4565381109714508 LKL 0.03305741026997566
epoch 40206 loss -0.40247902274131775 LR -0.435228168964386 LKL 0.03274914622306824
epoch 40207 loss -0.40317633748054504 LR -0.43601295351982117 LKL 0.032836608588695526
epoch 40208 loss -0.4688849449157715 LR -0.501916766166687 LKL 0.03303183615207672
epoch 40209 loss -0.43553534150123596 LR -0.4683572053909302 LKL 0.03282187134027481
epoch 40210 loss -0.4654570519924164 LR -0.4983115792274475 LKL 0.03285452350974083
epoch 40211 loss -0.3714081943035126 LR -0.40415945649147034 LKL 0.03275126963853836
epoch 40212 loss -0.43646061420440674 LR -0.46918806433677673 LKL 0.0327274352312088
epoch 40213 loss -0.3299427032470703 LR -0.3625374436378479 LKL 0.0

48
epoch 40301 loss -0.4243937134742737 LR -0.457346111536026 LKL 0.03295241296291351


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 40302 loss -0.4302561581134796 LR -0.46314936876296997 LKL 0.03289322182536125
epoch 40303 loss -0.41571590304374695 LR -0.4485587477684021 LKL 0.032842837274074554
epoch 40304 loss -0.44902992248535156 LR -0.4819106161594391 LKL 0.03288070484995842
epoch 40305 loss -0.46234169602394104 LR -0.4952951967716217 LKL 0.032953500747680664
epoch 40306 loss -0.42482882738113403 LR -0.45784085988998413 LKL 0.033012039959430695
epoch 40307 loss -0.41178205609321594 LR -0.44438067078590393 LKL 0.03259861096739769
epoch 40308 loss -0.41596806049346924 LR -0.44879305362701416 LKL 0.03282500430941582
epoch 40309 loss -0.3823133111000061 LR -0.415162593126297 LKL 0.032849278301000595
epoch 40310 loss -0.3812311589717865 LR -0.414078027009964 LKL 0.03284686058759689
epoch 40311 loss -0.4172011911869049 LR -0.449998140335083 LKL 0.0327969528734684
epoch 40312 loss -0.44991159439086914 LR -0.48272904753685 LKL 0.03281744197010994
epoch 40313 loss -0.39933541417121887 LR -0.4320701062679291 LKL 0.

epoch 40400 loss -0.5134371519088745 LR -0.5465246438980103 LKL 0.033087484538555145
44


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 40401 loss -0.3968815803527832 LR -0.42973917722702026 LKL 0.03285759687423706
epoch 40402 loss -0.41130226850509644 LR -0.44389480352401733 LKL 0.032592546194791794
epoch 40403 loss -0.41149643063545227 LR -0.4441637694835663 LKL 0.03266735002398491
epoch 40404 loss -0.3912447690963745 LR -0.4240955412387848 LKL 0.03285077586770058
epoch 40405 loss -0.440685510635376 LR -0.4736616909503937 LKL 0.0329761765897274
epoch 40406 loss -0.4625626504421234 LR -0.4955739676952362 LKL 0.03301131725311279
epoch 40407 loss -0.4420146346092224 LR -0.47486957907676697 LKL 0.032854944467544556
epoch 40408 loss -0.40584057569503784 LR -0.438733845949173 LKL 0.032893259078264236
epoch 40409 loss -0.41567355394363403 LR -0.4487072825431824 LKL 0.033033717423677444
epoch 40410 loss -0.4586564302444458 LR -0.4914650321006775 LKL 0.032808609306812286
epoch 40411 loss -0.41537222266197205 LR -0.4482724666595459 LKL 0.032900236546993256
epoch 40412 loss -0.4941585659980774 LR -0.5273811221122742 LKL 0

epoch 40500 loss -0.43775251507759094 LR -0.4705636501312256 LKL 0.03281114250421524
53


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 40501 loss -0.459526926279068 LR -0.49271509051322937 LKL 0.03318816050887108
epoch 40502 loss -0.5087997913360596 LR -0.541851818561554 LKL 0.03305203467607498
epoch 40503 loss -0.40416377782821655 LR -0.43696895241737366 LKL 0.032805185765028
epoch 40504 loss -0.46747568249702454 LR -0.5004944801330566 LKL 0.0330188013613224
epoch 40505 loss -0.4415859878063202 LR -0.4746864140033722 LKL 0.033100422471761703
epoch 40506 loss -0.47262874245643616 LR -0.505617082118988 LKL 0.03298834711313248
epoch 40507 loss -0.41032853722572327 LR -0.4433600902557373 LKL 0.03303154185414314
epoch 40508 loss -0.5070481896400452 LR -0.5401993989944458 LKL 0.03315121307969093
epoch 40509 loss -0.4373517632484436 LR -0.4700654149055481 LKL 0.0327136404812336
epoch 40510 loss -0.45820721983909607 LR -0.4912247955799103 LKL 0.033017586916685104
epoch 40511 loss -0.43829426169395447 LR -0.4710909426212311 LKL 0.032796673476696014
epoch 40512 loss -0.4573250412940979 LR -0.49032464623451233 LKL 0.03299

epoch 40600 loss -0.43316560983657837 LR -0.46608734130859375 LKL 0.032921724021434784
57


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 40601 loss -0.47107335925102234 LR -0.5039985179901123 LKL 0.03292515128850937
epoch 40602 loss -0.46559664607048035 LR -0.4987144470214844 LKL 0.033117808401584625
epoch 40603 loss -0.43053436279296875 LR -0.4635477662086487 LKL 0.033013392239809036
epoch 40604 loss -0.4804776608943939 LR -0.5135433077812195 LKL 0.033065635710954666
epoch 40605 loss -0.5190702080726624 LR -0.552148699760437 LKL 0.03307848796248436
epoch 40606 loss -0.46524912118911743 LR -0.4980927109718323 LKL 0.03284357860684395
epoch 40607 loss -0.4195261001586914 LR -0.45214831829071045 LKL 0.032622214406728745
epoch 40608 loss -0.4029300808906555 LR -0.4357961118221283 LKL 0.03286601975560188
epoch 40609 loss -0.40304630994796753 LR -0.43578121066093445 LKL 0.032734889537096024
epoch 40610 loss -0.49769577383995056 LR -0.5305966138839722 LKL 0.03290082886815071
epoch 40611 loss -0.43424558639526367 LR -0.46703824400901794 LKL 0.03279266133904457
epoch 40612 loss -0.5005472898483276 LR -0.5335718393325806 LK

epoch 40700 loss -0.43857479095458984 LR -0.4714953303337097 LKL 0.03292052820324898
41


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 40701 loss -0.4478788375854492 LR -0.480707049369812 LKL 0.03282821923494339
epoch 40702 loss -0.3420858681201935 LR -0.3748207688331604 LKL 0.032734911888837814
epoch 40703 loss -0.5111095905303955 LR -0.5440787076950073 LKL 0.032969094812870026
epoch 40704 loss -0.4119819104671478 LR -0.4446519911289215 LKL 0.03267008811235428
epoch 40705 loss -0.3932681679725647 LR -0.4262184500694275 LKL 0.03295028582215309
epoch 40706 loss -0.40669625997543335 LR -0.43953168392181396 LKL 0.03283542022109032
epoch 40707 loss -0.4711192548274994 LR -0.5038519501686096 LKL 0.03273268789052963
epoch 40708 loss -0.4846627712249756 LR -0.5176595449447632 LKL 0.03299678489565849
epoch 40709 loss -0.4983515441417694 LR -0.5312546491622925 LKL 0.03290311619639397
epoch 40710 loss -0.3866068124771118 LR -0.4194030463695526 LKL 0.03279624879360199
epoch 40711 loss -0.4848839044570923 LR -0.5176830887794495 LKL 0.03279918059706688
epoch 40712 loss -0.5001938343048096 LR -0.5332227349281311 LKL 0.0330288

epoch 40799 loss -0.4575471878051758 LR -0.49040547013282776 LKL 0.03285829350352287
epoch 40800 loss -0.44848939776420593 LR -0.4814128279685974 LKL 0.032923441380262375
74
epoch 40801 loss -0.42286163568496704 LR -0.45565247535705566 LKL 0.03279082849621773


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 40802 loss -0.4475806951522827 LR -0.4806499481201172 LKL 0.03306923806667328
epoch 40803 loss -0.4474427103996277 LR -0.48028773069381714 LKL 0.03284502029418945
epoch 40804 loss -0.4289073348045349 LR -0.4618856906890869 LKL 0.03297834098339081
epoch 40805 loss -0.4166843295097351 LR -0.4494066834449768 LKL 0.032722365111112595
epoch 40806 loss -0.4821665585041046 LR -0.5152180790901184 LKL 0.0330515094101429
epoch 40807 loss -0.4411441683769226 LR -0.4742288589477539 LKL 0.0330846942961216
epoch 40808 loss -0.47217127680778503 LR -0.5051851272583008 LKL 0.03301384299993515
epoch 40809 loss -0.3997997045516968 LR -0.43260976672172546 LKL 0.03281007707118988
epoch 40810 loss -0.39568251371383667 LR -0.42862433195114136 LKL 0.03294181451201439
epoch 40811 loss -0.4462885558605194 LR -0.4791540205478668 LKL 0.03286546841263771
epoch 40812 loss -0.45226725935935974 LR -0.4852699637413025 LKL 0.03300270065665245
epoch 40813 loss -0.4758289158344269 LR -0.5090805292129517 LKL 0.03325

58
epoch 40901 loss -0.4896431565284729 LR -0.5227965712547302 LKL 0.03315342962741852


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 40902 loss -0.45265403389930725 LR -0.4856141209602356 LKL 0.03296007961034775
epoch 40903 loss -0.41680389642715454 LR -0.4499149024486542 LKL 0.033111006021499634
epoch 40904 loss -0.4975970983505249 LR -0.5304445624351501 LKL 0.03284744918346405
epoch 40905 loss -0.46410873532295227 LR -0.49711740016937256 LKL 0.033008672297000885
epoch 40906 loss -0.5090594291687012 LR -0.5420527458190918 LKL 0.03299331292510033
epoch 40907 loss -0.3880540132522583 LR -0.4208361506462097 LKL 0.03278213366866112
epoch 40908 loss -0.4360441565513611 LR -0.4691612720489502 LKL 0.033117104321718216
epoch 40909 loss -0.3749484121799469 LR -0.4077877402305603 LKL 0.032839335501194
epoch 40910 loss -0.4473404288291931 LR -0.4801471531391144 LKL 0.03280672803521156
epoch 40911 loss -0.3681037127971649 LR -0.4008829891681671 LKL 0.032779283821582794
epoch 40912 loss -0.4293249845504761 LR -0.46214765310287476 LKL 0.032822661101818085
epoch 40913 loss -0.4438512623310089 LR -0.47680649161338806 LKL 0.0

epoch 41000 loss -0.47862064838409424 LR -0.511738657951355 LKL 0.03311800956726074
57


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 41001 loss -0.4974023699760437 LR -0.5305340886116028 LKL 0.03313172981142998
epoch 41002 loss -0.4168059527873993 LR -0.4496966302394867 LKL 0.0328906886279583
epoch 41003 loss -0.47795912623405457 LR -0.5111209750175476 LKL 0.033161841332912445
epoch 41004 loss -0.4484807252883911 LR -0.48141664266586304 LKL 0.03293590992689133
epoch 41005 loss -0.3992738127708435 LR -0.4321568012237549 LKL 0.03288298100233078
epoch 41006 loss -0.4505414366722107 LR -0.48357954621315 LKL 0.033038120716810226
epoch 41007 loss -0.4549393057823181 LR -0.487913578748703 LKL 0.032974276691675186
epoch 41008 loss -0.49975666403770447 LR -0.5327090620994568 LKL 0.03295240178704262
epoch 41009 loss -0.40742355585098267 LR -0.440527081489563 LKL 0.03310352563858032
epoch 41010 loss -0.4111012816429138 LR -0.444122314453125 LKL 0.033021025359630585
epoch 41011 loss -0.3795149624347687 LR -0.4121277928352356 LKL 0.03261282294988632
epoch 41012 loss -0.44318071007728577 LR -0.4760643541812897 LKL 0.0328836

epoch 41099 loss -0.41836488246917725 LR -0.4513653814792633 LKL 0.033000484108924866
epoch 41100 loss -0.43888890743255615 LR -0.4719695448875427 LKL 0.033080633729696274
80
epoch 41101 loss -0.46247562766075134 LR -0.49543139338493347 LKL 0.03295576572418213


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 41102 loss -0.4678539037704468 LR -0.501056432723999 LKL 0.03320251777768135
epoch 41103 loss -0.38694998621940613 LR -0.41992664337158203 LKL 0.032976653426885605
epoch 41104 loss -0.43129292130470276 LR -0.46413683891296387 LKL 0.03284391760826111
epoch 41105 loss -0.44686925411224365 LR -0.4800490140914917 LKL 0.03317975625395775
epoch 41106 loss -0.4562658965587616 LR -0.48928672075271606 LKL 0.033020831644535065
epoch 41107 loss -0.4034843146800995 LR -0.4364868402481079 LKL 0.03300251439213753
epoch 41108 loss -0.4610357880592346 LR -0.4938221871852875 LKL 0.03278641402721405
epoch 41109 loss -0.43123021721839905 LR -0.46395546197891235 LKL 0.032725244760513306
epoch 41110 loss -0.38801512122154236 LR -0.42114752531051636 LKL 0.033132415264844894
epoch 41111 loss -0.41021162271499634 LR -0.4430098533630371 LKL 0.03279823809862137
epoch 41112 loss -0.436027467250824 LR -0.46905604004859924 LKL 0.03302857279777527
epoch 41113 loss -0.4078255295753479 LR -0.4407660961151123 LK

epoch 41200 loss -0.4301969110965729 LR -0.46292489767074585 LKL 0.032727986574172974
99
epoch 41201 loss -0.39286425709724426 LR -0.42572882771492004 LKL 0.032864559441804886


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 41202 loss -0.43115749955177307 LR -0.4640987813472748 LKL 0.03294127434492111
epoch 41203 loss -0.4715389907360077 LR -0.5044877529144287 LKL 0.03294876962900162
epoch 41204 loss -0.4719645082950592 LR -0.5049082040786743 LKL 0.03294370695948601
epoch 41205 loss -0.48870009183883667 LR -0.5218639969825745 LKL 0.033163897693157196
epoch 41206 loss -0.4533480405807495 LR -0.48612672090530396 LKL 0.03277866542339325
epoch 41207 loss -0.44905227422714233 LR -0.48202094435691833 LKL 0.032968658953905106
epoch 41208 loss -0.4732119143009186 LR -0.5063768029212952 LKL 0.03316488489508629
epoch 41209 loss -0.43004393577575684 LR -0.46304136514663696 LKL 0.032997433096170425
epoch 41210 loss -0.4874994158744812 LR -0.5206976532936096 LKL 0.03319824859499931
epoch 41211 loss -0.4319780468940735 LR -0.4649660289287567 LKL 0.03298799693584442
epoch 41212 loss -0.4596620202064514 LR -0.49260851740837097 LKL 0.032946497201919556
epoch 41213 loss -0.4839099049568176 LR -0.5169528126716614 LKL 

58
epoch 41301 loss -0.43673303723335266 LR -0.4696088433265686 LKL 0.03287581726908684


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 41302 loss -0.4820522964000702 LR -0.515237033367157 LKL 0.033184729516506195
epoch 41303 loss -0.45829641819000244 LR -0.49122586846351624 LKL 0.0329294353723526
epoch 41304 loss -0.45055997371673584 LR -0.483401894569397 LKL 0.03284190967679024
epoch 41305 loss -0.4270973801612854 LR -0.46002331376075745 LKL 0.03292594850063324
epoch 41306 loss -0.44863876700401306 LR -0.4814973473548889 LKL 0.03285856917500496
epoch 41307 loss -0.4080721139907837 LR -0.44099897146224976 LKL 0.03292685002088547
epoch 41308 loss -0.5017651915550232 LR -0.534910261631012 LKL 0.03314509242773056
epoch 41309 loss -0.442241907119751 LR -0.47504299879074097 LKL 0.032801106572151184
epoch 41310 loss -0.44790953397750854 LR -0.48099285364151 LKL 0.03308332711458206
epoch 41311 loss -0.4388597905635834 LR -0.4719577431678772 LKL 0.03309796005487442
epoch 41312 loss -0.504608690738678 LR -0.5377844572067261 LKL 0.03317577764391899
epoch 41313 loss -0.46042534708976746 LR -0.49353229999542236 LKL 0.033106

105
epoch 41401 loss -0.40502306818962097 LR -0.43782109022140503 LKL 0.032798025757074356


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 41402 loss -0.4342367649078369 LR -0.4670374393463135 LKL 0.03280066326260567
epoch 41403 loss -0.4368009865283966 LR -0.46997004747390747 LKL 0.03316905349493027
epoch 41404 loss -0.46383512020111084 LR -0.4969828426837921 LKL 0.03314773365855217
epoch 41405 loss -0.43852493166923523 LR -0.47149115800857544 LKL 0.03296623378992081
epoch 41406 loss -0.41469570994377136 LR -0.4478255808353424 LKL 0.033129867166280746
epoch 41407 loss -0.42734187841415405 LR -0.4603821337223053 LKL 0.03304027020931244
epoch 41408 loss -0.4775291979312897 LR -0.5105974078178406 LKL 0.033068202435970306
epoch 41409 loss -0.42074254155158997 LR -0.45366451144218445 LKL 0.032921966165304184
epoch 41410 loss -0.4611893892288208 LR -0.49415314197540283 LKL 0.03296375274658203
epoch 41411 loss -0.43811625242233276 LR -0.4710868000984192 LKL 0.03297053277492523
epoch 41412 loss -0.46291178464889526 LR -0.49611273407936096 LKL 0.0332009382545948
epoch 41413 loss -0.4057160019874573 LR -0.4387231469154358 LK

epoch 41500 loss -0.4649192988872528 LR -0.49800747632980347 LKL 0.03308817744255066
50


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 41501 loss -0.48989972472190857 LR -0.5228678584098816 LKL 0.03296814113855362
epoch 41502 loss -0.4449392557144165 LR -0.4778396785259247 LKL 0.03290041163563728
epoch 41503 loss -0.4060390293598175 LR -0.43873775005340576 LKL 0.03269873186945915
epoch 41504 loss -0.5007295608520508 LR -0.5338000059127808 LKL 0.03307047113776207
epoch 41505 loss -0.4334114193916321 LR -0.4663291573524475 LKL 0.032917726784944534
epoch 41506 loss -0.4585219621658325 LR -0.49163818359375 LKL 0.03311621770262718
epoch 41507 loss -0.4501824676990509 LR -0.48331648111343384 LKL 0.03313400223851204
epoch 41508 loss -0.409391313791275 LR -0.44237828254699707 LKL 0.032986968755722046
epoch 41509 loss -0.41486090421676636 LR -0.4479828476905823 LKL 0.03312193229794502
epoch 41510 loss -0.4629276692867279 LR -0.49607205390930176 LKL 0.033144377171993256
epoch 41511 loss -0.4759224057197571 LR -0.5089406967163086 LKL 0.03301829844713211
epoch 41512 loss -0.4431246817111969 LR -0.4761699438095093 LKL 0.0330

epoch 41600 loss -0.4311057925224304 LR -0.46397748589515686 LKL 0.03287167847156525
122
epoch 41601 loss -0.5223813056945801 LR -0.5556209683418274 LKL 0.03323964774608612


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 41602 loss -0.43563657999038696 LR -0.4684961438179016 LKL 0.03285956382751465
epoch 41603 loss -0.4782535135746002 LR -0.5114036202430725 LKL 0.03315009921789169
epoch 41604 loss -0.38743430376052856 LR -0.4201204776763916 LKL 0.03268616273999214
epoch 41605 loss -0.3809811770915985 LR -0.41383326053619385 LKL 0.032852090895175934
epoch 41606 loss -0.4414830207824707 LR -0.4744334816932678 LKL 0.03295046463608742
epoch 41607 loss -0.4159044325351715 LR -0.4488529562950134 LKL 0.03294852375984192
epoch 41608 loss -0.4882885217666626 LR -0.5214430093765259 LKL 0.03315448760986328
epoch 41609 loss -0.4720073342323303 LR -0.5047422647476196 LKL 0.03273492306470871
epoch 41610 loss -0.4289970099925995 LR -0.4618716239929199 LKL 0.03287462145090103
epoch 41611 loss -0.3838024139404297 LR -0.4166319966316223 LKL 0.03282956779003143
epoch 41612 loss -0.4207226634025574 LR -0.45364877581596375 LKL 0.032926108688116074
epoch 41613 loss -0.4596134424209595 LR -0.49265220761299133 LKL 0.033

epoch 41700 loss -0.4960724115371704 LR -0.5291116237640381 LKL 0.03303920477628708
61


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 41701 loss -0.4949961006641388 LR -0.5279444456100464 LKL 0.032948337495326996
epoch 41702 loss -0.40370211005210876 LR -0.4366540312767029 LKL 0.03295191377401352
epoch 41703 loss -0.3441351056098938 LR -0.37710413336753845 LKL 0.03296901285648346
epoch 41704 loss -0.4122469127178192 LR -0.4451993703842163 LKL 0.032952457666397095
epoch 41705 loss -0.5419570207595825 LR -0.5751150250434875 LKL 0.03315800055861473
epoch 41706 loss -0.3895554840564728 LR -0.42227277159690857 LKL 0.032717276364564896
epoch 41707 loss -0.4433519244194031 LR -0.47635871171951294 LKL 0.033006783574819565
epoch 41708 loss -0.4399825632572174 LR -0.4729880392551422 LKL 0.0330054834485054
epoch 41709 loss -0.44508057832717896 LR -0.478261798620224 LKL 0.03318121284246445
epoch 41710 loss -0.4592466354370117 LR -0.49239304661750793 LKL 0.03314640745520592
epoch 41711 loss -0.43811285495758057 LR -0.4710477590560913 LKL 0.03293491154909134
epoch 41712 loss -0.47160089015960693 LR -0.5045605897903442 LKL 0.

epoch 41799 loss -0.4746324419975281 LR -0.5077367424964905 LKL 0.03310428559780121
epoch 41800 loss -0.5059950351715088 LR -0.5391947627067566 LKL 0.033199746161699295
57
epoch 41801 loss -0.4930362105369568 LR -0.5262695550918579 LKL 0.03323335945606232


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 41802 loss -0.4731042981147766 LR -0.5061949491500854 LKL 0.03309066221117973
epoch 41803 loss -0.48305660486221313 LR -0.5161434412002563 LKL 0.033086828887462616
epoch 41804 loss -0.4601535201072693 LR -0.4933474361896515 LKL 0.03319390490651131
epoch 41805 loss -0.43399688601493835 LR -0.46685439348220825 LKL 0.032857514917850494
epoch 41806 loss -0.3840789794921875 LR -0.4169541597366333 LKL 0.0328751839697361
epoch 41807 loss -0.44123169779777527 LR -0.47431522607803345 LKL 0.03308351710438728
epoch 41808 loss -0.4529627859592438 LR -0.4861429035663605 LKL 0.033180121332407
epoch 41809 loss -0.4315955340862274 LR -0.4647097885608673 LKL 0.03311425819993019
epoch 41810 loss -0.3806971311569214 LR -0.41360312700271606 LKL 0.03290598839521408
epoch 41811 loss -0.40625032782554626 LR -0.4390738904476166 LKL 0.03282356262207031
epoch 41812 loss -0.4557856619358063 LR -0.4889141321182251 LKL 0.03312845900654793
epoch 41813 loss -0.48048821091651917 LR -0.5136702060699463 LKL 0.033

87
epoch 41901 loss -0.4928848147392273 LR -0.5261833071708679 LKL 0.033298492431640625


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 41902 loss -0.46554070711135864 LR -0.49850839376449585 LKL 0.03296767920255661
epoch 41903 loss -0.3865985870361328 LR -0.41945886611938477 LKL 0.032860275357961655
epoch 41904 loss -0.39497724175453186 LR -0.4279237985610962 LKL 0.03294655680656433
epoch 41905 loss -0.5126381516456604 LR -0.5458272695541382 LKL 0.03318913280963898
epoch 41906 loss -0.40072742104530334 LR -0.43357813358306885 LKL 0.032850705087184906
epoch 41907 loss -0.39272502064704895 LR -0.4257543683052063 LKL 0.03302934020757675
epoch 41908 loss -0.4417847990989685 LR -0.4747924506664276 LKL 0.03300766274333
epoch 41909 loss -0.3694964051246643 LR -0.4026191234588623 LKL 0.033122703433036804
epoch 41910 loss -0.4372239112854004 LR -0.4703822433948517 LKL 0.0331583172082901
epoch 41911 loss -0.4936460256576538 LR -0.5268242359161377 LKL 0.03317819908261299
epoch 41912 loss -0.5024347305297852 LR -0.5355587601661682 LKL 0.03312401473522186
epoch 41913 loss -0.4578262269496918 LR -0.4909377098083496 LKL 0.0331

74
epoch 42001 loss -0.4620617926120758 LR -0.4953058660030365 LKL 0.03324408084154129


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 42002 loss -0.4658968448638916 LR -0.49883630871772766 LKL 0.032939448952674866
epoch 42003 loss -0.41663044691085815 LR -0.4496370255947113 LKL 0.033006563782691956
epoch 42004 loss -0.40994617342948914 LR -0.4428318738937378 LKL 0.03288571164011955
epoch 42005 loss -0.44139015674591064 LR -0.47431471943855286 LKL 0.03292457014322281
epoch 42006 loss -0.4815365672111511 LR -0.5145406126976013 LKL 0.03300405666232109
epoch 42007 loss -0.47611695528030396 LR -0.5093580484390259 LKL 0.03324110433459282
epoch 42008 loss -0.468366414308548 LR -0.5013433694839478 LKL 0.03297695145010948
epoch 42009 loss -0.4865916669368744 LR -0.5197122097015381 LKL 0.033120546489953995
epoch 42010 loss -0.35770171880722046 LR -0.3905121386051178 LKL 0.032810430973768234
epoch 42011 loss -0.4601369798183441 LR -0.493111789226532 LKL 0.032974809408187866
epoch 42012 loss -0.5252349376678467 LR -0.5586304664611816 LKL 0.03339549899101257
epoch 42013 loss -0.4502020478248596 LR -0.48343372344970703 LKL 0

epoch 42100 loss -0.4209415912628174 LR -0.45415031909942627 LKL 0.03320873901247978
62


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 42101 loss -0.40441662073135376 LR -0.4374975562095642 LKL 0.03308093547821045
epoch 42102 loss -0.4687108099460602 LR -0.5018705129623413 LKL 0.03315971419215202
epoch 42103 loss -0.43962013721466064 LR -0.4727180600166321 LKL 0.033097922801971436
epoch 42104 loss -0.4403128921985626 LR -0.4732493758201599 LKL 0.03293648734688759
epoch 42105 loss -0.3924373388290405 LR -0.4256502389907837 LKL 0.033212900161743164
epoch 42106 loss -0.47768765687942505 LR -0.5108446478843689 LKL 0.033156994730234146
epoch 42107 loss -0.3978753387928009 LR -0.4309976100921631 LKL 0.03312227502465248
epoch 42108 loss -0.4105529189109802 LR -0.4436751902103424 LKL 0.033122267574071884
epoch 42109 loss -0.46678319573402405 LR -0.4998238682746887 LKL 0.03304068371653557
epoch 42110 loss -0.4014971852302551 LR -0.43435990810394287 LKL 0.03286273404955864
epoch 42111 loss -0.40303534269332886 LR -0.43610048294067383 LKL 0.03306514769792557
epoch 42112 loss -0.5075212717056274 LR -0.5408101081848145 LKL 0

epoch 42199 loss -0.39246514439582825 LR -0.42530640959739685 LKL 0.032841261476278305
epoch 42200 loss -0.493917316198349 LR -0.5267856121063232 LKL 0.03286830335855484
69
epoch 42201 loss -0.47635912895202637 LR -0.5093128681182861 LKL 0.03295375034213066


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 42202 loss -0.514141321182251 LR -0.5475063323974609 LKL 0.03336498513817787
epoch 42203 loss -0.44007039070129395 LR -0.4733109474182129 LKL 0.033240560442209244
epoch 42204 loss -0.37247419357299805 LR -0.4053698778152466 LKL 0.03289569169282913
epoch 42205 loss -0.37715238332748413 LR -0.41010650992393494 LKL 0.03295411542057991
epoch 42206 loss -0.382548451423645 LR -0.41528239846229553 LKL 0.03273396193981171
epoch 42207 loss -0.46726518869400024 LR -0.5004764199256897 LKL 0.03321123868227005
epoch 42208 loss -0.37249550223350525 LR -0.4054301679134369 LKL 0.03293466195464134
epoch 42209 loss -0.48959657549858093 LR -0.5228149890899658 LKL 0.033218421041965485
epoch 42210 loss -0.478787899017334 LR -0.5121040344238281 LKL 0.03331613913178444
epoch 42211 loss -0.4251122772693634 LR -0.45810747146606445 LKL 0.03299519419670105
epoch 42212 loss -0.4266982674598694 LR -0.459857314825058 LKL 0.033159058541059494
epoch 42213 loss -0.4777318835258484 LR -0.5107462406158447 LKL 0.03

85
epoch 42301 loss -0.4383983016014099 LR -0.4714794158935547 LKL 0.03308112174272537


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 42302 loss -0.506559431552887 LR -0.5396998524665833 LKL 0.03314042463898659
epoch 42303 loss -0.4990774989128113 LR -0.5324988961219788 LKL 0.033421412110328674
epoch 42304 loss -0.4569458067417145 LR -0.4900268018245697 LKL 0.03308098390698433
epoch 42305 loss -0.40502792596817017 LR -0.4380025267601013 LKL 0.032974597066640854
epoch 42306 loss -0.45265594124794006 LR -0.4855833947658539 LKL 0.03292744234204292
epoch 42307 loss -0.45822885632514954 LR -0.4913376569747925 LKL 0.03310881182551384
epoch 42308 loss -0.4851994514465332 LR -0.518386721611023 LKL 0.033187273889780045
epoch 42309 loss -0.5155026912689209 LR -0.5489147901535034 LKL 0.03341208025813103
epoch 42310 loss -0.4311249256134033 LR -0.4640364646911621 LKL 0.032911527901887894
epoch 42311 loss -0.36744099855422974 LR -0.4001981019973755 LKL 0.03275711461901665
epoch 42312 loss -0.43983304500579834 LR -0.4729447066783905 LKL 0.03311166539788246
epoch 42313 loss -0.47818535566329956 LR -0.5115455389022827 LKL 0.03

39
epoch 42401 loss -0.47388553619384766 LR -0.5070714354515076 LKL 0.03318590298295021


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 42402 loss -0.49966564774513245 LR -0.5328999757766724 LKL 0.03323432803153992
epoch 42403 loss -0.39931175112724304 LR -0.43222981691360474 LKL 0.0329180546104908
epoch 42404 loss -0.42505475878715515 LR -0.45808514952659607 LKL 0.03303040191531181
epoch 42405 loss -0.37506216764450073 LR -0.4080601930618286 LKL 0.03299802169203758
epoch 42406 loss -0.39921802282333374 LR -0.432081401348114 LKL 0.03286337852478027
epoch 42407 loss -0.3758280575275421 LR -0.40874502062797546 LKL 0.03291695564985275
epoch 42408 loss -0.4245198667049408 LR -0.45775488018989563 LKL 0.03323502093553543
epoch 42409 loss -0.4735501706600189 LR -0.5064579844474792 LKL 0.03290782496333122
epoch 42410 loss -0.5301536321640015 LR -0.5635279417037964 LKL 0.03337429463863373
epoch 42411 loss -0.43170687556266785 LR -0.4646899104118347 LKL 0.032983046025037766
epoch 42412 loss -0.4742315709590912 LR -0.5073432326316833 LKL 0.03311165049672127
epoch 42413 loss -0.48218730092048645 LR -0.5153264999389648 LKL 0.

78
epoch 42501 loss -0.468461275100708 LR -0.5015959739685059 LKL 0.033134687691926956


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 42502 loss -0.5217763185501099 LR -0.5550137162208557 LKL 0.03323740512132645
epoch 42503 loss -0.471693217754364 LR -0.5048716068267822 LKL 0.033178385347127914
epoch 42504 loss -0.4541900157928467 LR -0.48724058270454407 LKL 0.03305056691169739
epoch 42505 loss -0.48324400186538696 LR -0.5162538886070251 LKL 0.03300989419221878
epoch 42506 loss -0.3938957452774048 LR -0.4268708825111389 LKL 0.03297513350844383
epoch 42507 loss -0.3988509178161621 LR -0.43192195892333984 LKL 0.03307103365659714
epoch 42508 loss -0.47708645462989807 LR -0.5104461312294006 LKL 0.03335966914892197
epoch 42509 loss -0.45310816168785095 LR -0.48639580607414246 LKL 0.0332876481115818
epoch 42510 loss -0.36029940843582153 LR -0.3931158185005188 LKL 0.03281639888882637
epoch 42511 loss -0.380354642868042 LR -0.41319698095321655 LKL 0.03284233435988426
epoch 42512 loss -0.4518376886844635 LR -0.4851246476173401 LKL 0.033286962658166885
epoch 42513 loss -0.4706486463546753 LR -0.5037842988967896 LKL 0.033

73
epoch 42601 loss -0.3008544147014618 LR -0.3337215781211853 LKL 0.03286716714501381


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 42602 loss -0.4626990556716919 LR -0.4958192706108093 LKL 0.03312020003795624
epoch 42603 loss -0.4774457514286041 LR -0.5107406377792358 LKL 0.03329489380121231
epoch 42604 loss -0.37609589099884033 LR -0.40914636850357056 LKL 0.03305049240589142
epoch 42605 loss -0.4148453176021576 LR -0.44798755645751953 LKL 0.03314223140478134
epoch 42606 loss -0.4355263411998749 LR -0.4686849117279053 LKL 0.03315858170390129
epoch 42607 loss -0.4367401897907257 LR -0.46978551149368286 LKL 0.033045317977666855
epoch 42608 loss -0.44064590334892273 LR -0.47373658418655396 LKL 0.03309067711234093
epoch 42609 loss -0.44459980726242065 LR -0.47761476039886475 LKL 0.03301494941115379
epoch 42610 loss -0.4008086025714874 LR -0.4341636598110199 LKL 0.033355068415403366
epoch 42611 loss -0.4669750928878784 LR -0.5000125765800476 LKL 0.033037472516298294
epoch 42612 loss -0.44322770833969116 LR -0.4763975143432617 LKL 0.033169813454151154
epoch 42613 loss -0.38152259588241577 LR -0.4145837426185608 LK

51
epoch 42701 loss -0.39283403754234314 LR -0.4259667992591858 LKL 0.03313275799155235


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 42702 loss -0.44271084666252136 LR -0.4757598042488098 LKL 0.03304895758628845
epoch 42703 loss -0.4156090319156647 LR -0.4486842453479767 LKL 0.033075205981731415
epoch 42704 loss -0.48145532608032227 LR -0.5146172046661377 LKL 0.033161893486976624
epoch 42705 loss -0.4449540972709656 LR -0.4781677722930908 LKL 0.03321366384625435
epoch 42706 loss -0.4821421205997467 LR -0.5154181718826294 LKL 0.03327604755759239
epoch 42707 loss -0.43990010023117065 LR -0.4730754792690277 LKL 0.03317536413669586
epoch 42708 loss -0.46708372235298157 LR -0.5002536773681641 LKL 0.03316996246576309
epoch 42709 loss -0.419026643037796 LR -0.4520489275455475 LKL 0.033022284507751465
epoch 42710 loss -0.4305965006351471 LR -0.4636395573616028 LKL 0.033043064177036285
epoch 42711 loss -0.48420706391334534 LR -0.5173521637916565 LKL 0.03314511105418205
epoch 42712 loss -0.4152947664260864 LR -0.4484905004501343 LKL 0.03319574519991875
epoch 42713 loss -0.4928974509239197 LR -0.526129424571991 LKL 0.033

42
epoch 42801 loss -0.4406086504459381 LR -0.4736413061618805 LKL 0.033032648265361786


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 42802 loss -0.5155275464057922 LR -0.5490320920944214 LKL 0.03350454941391945
epoch 42803 loss -0.4417511820793152 LR -0.47475412487983704 LKL 0.03300294280052185
epoch 42804 loss -0.43831634521484375 LR -0.47121548652648926 LKL 0.03289914131164551
epoch 42805 loss -0.46870705485343933 LR -0.5017992258071899 LKL 0.03309217467904091
epoch 42806 loss -0.42325809597969055 LR -0.45628222823143005 LKL 0.0330241434276104
epoch 42807 loss -0.5178101062774658 LR -0.551218569278717 LKL 0.03340848535299301
epoch 42808 loss -0.4295784533023834 LR -0.4627757668495178 LKL 0.033197324723005295
epoch 42809 loss -0.48882579803466797 LR -0.5220858454704285 LKL 0.0332600399851799
epoch 42810 loss -0.43341439962387085 LR -0.4666401147842407 LKL 0.03322572261095047
epoch 42811 loss -0.4320092499256134 LR -0.46529147028923035 LKL 0.03328220918774605
epoch 42812 loss -0.5045111775398254 LR -0.5377270579338074 LKL 0.03321588411927223
epoch 42813 loss -0.4796871542930603 LR -0.5132251977920532 LKL 0.033

epoch 42900 loss -0.3709217607975006 LR -0.4039353132247925 LKL 0.03301355242729187
57


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 42901 loss -0.4734264314174652 LR -0.5065559148788452 LKL 0.033129479736089706
epoch 42902 loss -0.4202909469604492 LR -0.4531562030315399 LKL 0.032865267246961594
epoch 42903 loss -0.46827489137649536 LR -0.5015891790390015 LKL 0.0333142951130867
epoch 42904 loss -0.4264843761920929 LR -0.4595834016799927 LKL 0.03309902176260948
epoch 42905 loss -0.47820159792900085 LR -0.5110728144645691 LKL 0.03287120908498764
epoch 42906 loss -0.4156299829483032 LR -0.4488322138786316 LKL 0.03320222347974777
epoch 42907 loss -0.42969322204589844 LR -0.4629749655723572 LKL 0.03328174352645874
epoch 42908 loss -0.42249876260757446 LR -0.45581114292144775 LKL 0.033312372863292694
epoch 42909 loss -0.4158265292644501 LR -0.44897860288619995 LKL 0.03315206617116928
epoch 42910 loss -0.4235401749610901 LR -0.45642727613449097 LKL 0.03288710117340088
epoch 42911 loss -0.43947550654411316 LR -0.47255367040634155 LKL 0.03307817503809929
epoch 42912 loss -0.48724621534347534 LR -0.5204904675483704 LKL 

epoch 43000 loss -0.5017589330673218 LR -0.5349915027618408 LKL 0.033232565969228745
64


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 43001 loss -0.45201992988586426 LR -0.4852060079574585 LKL 0.033186089247465134
epoch 43002 loss -0.43958309292793274 LR -0.47281235456466675 LKL 0.03322925046086311
epoch 43003 loss -0.42629849910736084 LR -0.45951899886131287 LKL 0.03322048857808113
epoch 43004 loss -0.4790567457675934 LR -0.5123006105422974 LKL 0.03324385732412338
epoch 43005 loss -0.41892969608306885 LR -0.45188817381858826 LKL 0.03295847401022911
epoch 43006 loss -0.493266761302948 LR -0.5264536142349243 LKL 0.033186864107847214
epoch 43007 loss -0.45771509408950806 LR -0.4910433292388916 LKL 0.03332824632525444
epoch 43008 loss -0.4380451738834381 LR -0.4710889160633087 LKL 0.03304373472929001
epoch 43009 loss -0.40258607268333435 LR -0.43542689085006714 LKL 0.03284081816673279
epoch 43010 loss -0.4449722170829773 LR -0.47821134328842163 LKL 0.03323912248015404
epoch 43011 loss -0.5322649478912354 LR -0.5657269358634949 LKL 0.033461980521678925
epoch 43012 loss -0.4710838496685028 LR -0.5044618248939514 LKL

epoch 43099 loss -0.45336979627609253 LR -0.48662084341049194 LKL 0.03325106203556061
epoch 43100 loss -0.42914602160453796 LR -0.4621956944465637 LKL 0.03304968401789665
73
epoch 43101 loss -0.49826329946517944 LR -0.5315946340560913 LKL 0.033331334590911865


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 43102 loss -0.50938880443573 LR -0.542718768119812 LKL 0.03332993760704994
epoch 43103 loss -0.4789098799228668 LR -0.5121414661407471 LKL 0.03323158249258995
epoch 43104 loss -0.4144880771636963 LR -0.4477779269218445 LKL 0.03328985348343849
epoch 43105 loss -0.46888014674186707 LR -0.5022050142288208 LKL 0.033324871212244034
epoch 43106 loss -0.38297367095947266 LR -0.41595521569252014 LKL 0.03298152983188629
epoch 43107 loss -0.49023884534835815 LR -0.5235004425048828 LKL 0.033261608332395554
epoch 43108 loss -0.39782559871673584 LR -0.4310332238674164 LKL 0.03320763632655144
epoch 43109 loss -0.46692022681236267 LR -0.5001379251480103 LKL 0.03321769833564758
epoch 43110 loss -0.5191815495491028 LR -0.5525704026222229 LKL 0.03338887542486191
epoch 43111 loss -0.4012490212917328 LR -0.4345470070838928 LKL 0.03329797461628914
epoch 43112 loss -0.38291797041893005 LR -0.4160704016685486 LKL 0.03315242752432823
epoch 43113 loss -0.43412870168685913 LR -0.46746826171875 LKL 0.03333

epoch 43199 loss -0.47553524374961853 LR -0.5088164210319519 LKL 0.033281177282333374
epoch 43200 loss -0.46605345606803894 LR -0.4994916319847107 LKL 0.03343817591667175
62
epoch 43201 loss -0.4566528797149658 LR -0.4899904131889343 LKL 0.033337533473968506


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 43202 loss -0.5097017884254456 LR -0.542954683303833 LKL 0.03325287997722626
epoch 43203 loss -0.4500214457511902 LR -0.48309487104415894 LKL 0.033073410391807556
epoch 43204 loss -0.39472880959510803 LR -0.42782801389694214 LKL 0.03309919685125351
epoch 43205 loss -0.38165318965911865 LR -0.4149223864078522 LKL 0.033269185572862625
epoch 43206 loss -0.5723026990890503 LR -0.6057415008544922 LKL 0.03343877196311951
epoch 43207 loss -0.46990904211997986 LR -0.5032012462615967 LKL 0.03329221159219742
epoch 43208 loss -0.46952641010284424 LR -0.502572238445282 LKL 0.033045824617147446
epoch 43209 loss -0.5060485005378723 LR -0.5394352674484253 LKL 0.03338678926229477
epoch 43210 loss -0.40788689255714417 LR -0.441139817237854 LKL 0.033252913504838943
epoch 43211 loss -0.4980115294456482 LR -0.531497597694397 LKL 0.033486057072877884
epoch 43212 loss -0.42184266448020935 LR -0.4550588130950928 LKL 0.03321615606546402
epoch 43213 loss -0.45525863766670227 LR -0.48843443393707275 LKL 0

epoch 43300 loss -0.5356886386871338 LR -0.5689072012901306 LKL 0.033218562602996826
80


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 43301 loss -0.48749372363090515 LR -0.5209699273109436 LKL 0.03347620740532875
epoch 43302 loss -0.4325381815433502 LR -0.4655744135379791 LKL 0.03303622081875801
epoch 43303 loss -0.4848659038543701 LR -0.5181922316551208 LKL 0.03332633152604103
epoch 43304 loss -0.4383094310760498 LR -0.471576988697052 LKL 0.0332675576210022
epoch 43305 loss -0.4208989441394806 LR -0.4541274607181549 LKL 0.03322850540280342
epoch 43306 loss -0.46119946241378784 LR -0.4943999946117401 LKL 0.03320053592324257
epoch 43307 loss -0.40210258960723877 LR -0.4351929724216461 LKL 0.033090390264987946
epoch 43308 loss -0.4922192692756653 LR -0.5256506204605103 LKL 0.033431362360715866
epoch 43309 loss -0.45885786414146423 LR -0.4921092092990875 LKL 0.033251356333494186
epoch 43310 loss -0.4245830476284027 LR -0.4578397274017334 LKL 0.03325667604804039
epoch 43311 loss -0.4606282114982605 LR -0.4940645098686218 LKL 0.03343629837036133
epoch 43312 loss -0.43265780806541443 LR -0.4657563865184784 LKL 0.0330

epoch 43400 loss -0.47691330313682556 LR -0.5103543996810913 LKL 0.03344108536839485
43


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 43401 loss -0.4085621237754822 LR -0.4419289231300354 LKL 0.033366795629262924
epoch 43402 loss -0.4814438819885254 LR -0.5146906971931458 LKL 0.03324681147933006
epoch 43403 loss -0.409645140171051 LR -0.44280651211738586 LKL 0.033161379396915436
epoch 43404 loss -0.4456660747528076 LR -0.4789593815803528 LKL 0.033293310552835464
epoch 43405 loss -0.39763307571411133 LR -0.43098360300064087 LKL 0.03335051238536835
epoch 43406 loss -0.42596903443336487 LR -0.45921066403388977 LKL 0.0332416333258152
epoch 43407 loss -0.4509432315826416 LR -0.48425278067588806 LKL 0.033309537917375565
epoch 43408 loss -0.4039163589477539 LR -0.4373352527618408 LKL 0.033418893814086914
epoch 43409 loss -0.3857308328151703 LR -0.41883647441864014 LKL 0.03310563415288925
epoch 43410 loss -0.47849616408348083 LR -0.5118032097816467 LKL 0.03330705687403679
epoch 43411 loss -0.41378387808799744 LR -0.4470478296279907 LKL 0.03326394036412239
epoch 43412 loss -0.48380377888679504 LR -0.5172345042228699 LKL

epoch 43500 loss -0.32658374309539795 LR -0.3597750663757324 LKL 0.03319133445620537
44


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 43501 loss -0.4024493992328644 LR -0.43559709191322327 LKL 0.033147700130939484
epoch 43502 loss -0.42742788791656494 LR -0.46060094237327576 LKL 0.03317306935787201
epoch 43503 loss -0.39375773072242737 LR -0.42683732509613037 LKL 0.033079586923122406
epoch 43504 loss -0.4402926564216614 LR -0.4739212989807129 LKL 0.033628638833761215
epoch 43505 loss -0.42285996675491333 LR -0.45602676272392273 LKL 0.0331667885184288
epoch 43506 loss -0.4428706467151642 LR -0.47615620493888855 LKL 0.03328555449843407
epoch 43507 loss -0.47020310163497925 LR -0.5035709738731384 LKL 0.03336787968873978
epoch 43508 loss -0.41419321298599243 LR -0.4474605619907379 LKL 0.03326736390590668
epoch 43509 loss -0.4049989879131317 LR -0.4382178783416748 LKL 0.033218879252672195
epoch 43510 loss -0.44082823395729065 LR -0.473935604095459 LKL 0.03310738131403923
epoch 43511 loss -0.488105446100235 LR -0.5215247869491577 LKL 0.03341934457421303
epoch 43512 loss -0.5088967084884644 LR -0.5424124598503113 LKL 

epoch 43600 loss -0.44913485646247864 LR -0.4824781119823456 LKL 0.03334325924515724
104
epoch 43601 loss -0.422876238822937 LR -0.4562760591506958 LKL 0.03339981660246849


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 43602 loss -0.46557745337486267 LR -0.4987427890300751 LKL 0.033165328204631805
epoch 43603 loss -0.4406147301197052 LR -0.47368335723876953 LKL 0.03306862339377403
epoch 43604 loss -0.42023423314094543 LR -0.4534881114959717 LKL 0.03325388953089714
epoch 43605 loss -0.4474567472934723 LR -0.4807748794555664 LKL 0.03331812843680382
epoch 43606 loss -0.45005881786346436 LR -0.48319220542907715 LKL 0.03313340246677399
epoch 43607 loss -0.4214657247066498 LR -0.4548436403274536 LKL 0.033377911895513535
epoch 43608 loss -0.4538852870464325 LR -0.48712703585624695 LKL 0.033241741359233856
epoch 43609 loss -0.4498191773891449 LR -0.4831818640232086 LKL 0.033362697809934616
epoch 43610 loss -0.44911909103393555 LR -0.4824973940849304 LKL 0.03337831795215607
epoch 43611 loss -0.48833540081977844 LR -0.5216827392578125 LKL 0.033347342163324356
epoch 43612 loss -0.45664194226264954 LR -0.49006542563438416 LKL 0.033423494547605515
epoch 43613 loss -0.4250406324863434 LR -0.45839622616767883

epoch 43700 loss -0.41893523931503296 LR -0.4521264433860779 LKL 0.03319120034575462
49


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 43701 loss -0.3717260956764221 LR -0.40472865104675293 LKL 0.033002566546201706
epoch 43702 loss -0.45986732840538025 LR -0.49318188428878784 LKL 0.03331456705927849
epoch 43703 loss -0.42035725712776184 LR -0.45353326201438904 LKL 0.033176008611917496
epoch 43704 loss -0.5051949620246887 LR -0.5385816097259521 LKL 0.03338664397597313
epoch 43705 loss -0.4928538203239441 LR -0.5263750553131104 LKL 0.03352123871445656
epoch 43706 loss -0.454159140586853 LR -0.48716405034065247 LKL 0.03300490975379944
epoch 43707 loss -0.451392263174057 LR -0.48459765315055847 LKL 0.03320539370179176
epoch 43708 loss -0.48276710510253906 LR -0.5159984230995178 LKL 0.033231303095817566
epoch 43709 loss -0.4515502452850342 LR -0.48472654819488525 LKL 0.03317629545927048
epoch 43710 loss -0.4759729504585266 LR -0.5090782642364502 LKL 0.03310529887676239
epoch 43711 loss -0.4434913396835327 LR -0.4767574071884155 LKL 0.033266082406044006
epoch 43712 loss -0.5524992942810059 LR -0.5860098600387573 LKL 0

epoch 43800 loss -0.44594255089759827 LR -0.47908878326416016 LKL 0.03314623236656189
82


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 43801 loss -0.3764152228832245 LR -0.4095070958137512 LKL 0.033091865479946136
epoch 43802 loss -0.42486372590065 LR -0.4579780697822571 LKL 0.033114343881607056
epoch 43803 loss -0.46043822169303894 LR -0.4936791956424713 LKL 0.03324098512530327
epoch 43804 loss -0.4268019497394562 LR -0.45987993478775024 LKL 0.03307797759771347
epoch 43805 loss -0.4849507808685303 LR -0.5181477069854736 LKL 0.033196933567523956
epoch 43806 loss -0.4819094240665436 LR -0.5152864456176758 LKL 0.0333770215511322
epoch 43807 loss -0.4096958041191101 LR -0.4429960548877716 LKL 0.033300239592790604
epoch 43808 loss -0.4277728497982025 LR -0.4609926640987396 LKL 0.03321981057524681
epoch 43809 loss -0.41199401021003723 LR -0.44526955485343933 LKL 0.0332755409181118
epoch 43810 loss -0.46561235189437866 LR -0.49895933270454407 LKL 0.033346984535455704
epoch 43811 loss -0.41230157017707825 LR -0.4455340504646301 LKL 0.03323248773813248
epoch 43812 loss -0.42602792382240295 LR -0.4591113328933716 LKL 0.0

epoch 43900 loss -0.45026829838752747 LR -0.4836762845516205 LKL 0.03340797871351242
51


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 43901 loss -0.49774813652038574 LR -0.5309255719184875 LKL 0.03317742422223091
epoch 43902 loss -0.49456244707107544 LR -0.5280203819274902 LKL 0.03345794975757599
epoch 43903 loss -0.47043377161026 LR -0.5037546157836914 LKL 0.0333208367228508
epoch 43904 loss -0.41643568873405457 LR -0.4496704339981079 LKL 0.03323474898934364
epoch 43905 loss -0.45796042680740356 LR -0.4911857545375824 LKL 0.033225324004888535
epoch 43906 loss -0.4476652145385742 LR -0.4809491038322449 LKL 0.033283889293670654
epoch 43907 loss -0.4491429328918457 LR -0.482300341129303 LKL 0.03315740451216698
epoch 43908 loss -0.4458727240562439 LR -0.47940343618392944 LKL 0.03353070467710495
epoch 43909 loss -0.4176202118396759 LR -0.45091041922569275 LKL 0.03329019993543625
epoch 43910 loss -0.5101175308227539 LR -0.5436814427375793 LKL 0.033563900738954544
epoch 43911 loss -0.4900645911693573 LR -0.5234338641166687 LKL 0.0333692692220211
epoch 43912 loss -0.41603633761405945 LR -0.44929227232933044 LKL 0.0332

epoch 43998 loss -0.44053930044174194 LR -0.47389182448387146 LKL 0.033352527767419815
epoch 43999 loss -0.42823854088783264 LR -0.4617322087287903 LKL 0.03349367156624794
epoch 44000 loss -0.47853943705558777 LR -0.5119597911834717 LKL 0.03342035040259361
62
epoch 44001 loss -0.4863571226596832 LR -0.5197855830192566 LKL 0.03342844918370247


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 44002 loss -0.41767239570617676 LR -0.45096150040626526 LKL 0.0332891084253788
epoch 44003 loss -0.41849732398986816 LR -0.45166435837745667 LKL 0.0331670343875885
epoch 44004 loss -0.4836137592792511 LR -0.5169651508331299 LKL 0.03335138410329819
epoch 44005 loss -0.4460335373878479 LR -0.4794715642929077 LKL 0.03343804180622101
epoch 44006 loss -0.49751120805740356 LR -0.5307038426399231 LKL 0.03319263830780983
epoch 44007 loss -0.475886732339859 LR -0.5092459321022034 LKL 0.033359210938215256
epoch 44008 loss -0.4182985723018646 LR -0.4514796733856201 LKL 0.033181093633174896
epoch 44009 loss -0.4354260265827179 LR -0.4684622883796692 LKL 0.0330362543463707
epoch 44010 loss -0.4153231382369995 LR -0.4485563635826111 LKL 0.033233240246772766
epoch 44011 loss -0.4649049639701843 LR -0.49818897247314453 LKL 0.03328399732708931
epoch 44012 loss -0.49841442704200745 LR -0.5316840410232544 LKL 0.03326961398124695
epoch 44013 loss -0.4495776295661926 LR -0.4828316867351532 LKL 0.0332

46
epoch 44101 loss -0.45838847756385803 LR -0.4916790723800659 LKL 0.033290598541498184


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 44102 loss -0.4308461546897888 LR -0.4639953076839447 LKL 0.03314914181828499
epoch 44103 loss -0.45211076736450195 LR -0.485268235206604 LKL 0.033157479017972946
epoch 44104 loss -0.4279206097126007 LR -0.4612155258655548 LKL 0.033294927328825
epoch 44105 loss -0.3648722171783447 LR -0.39801737666130066 LKL 0.03314514830708504
epoch 44106 loss -0.49488624930381775 LR -0.5284103751182556 LKL 0.03352411463856697
epoch 44107 loss -0.38582009077072144 LR -0.41883584856987 LKL 0.033015746623277664
epoch 44108 loss -0.46530571579933167 LR -0.49858927726745605 LKL 0.033283550292253494
epoch 44109 loss -0.5262218713760376 LR -0.5597725510597229 LKL 0.033550672233104706
epoch 44110 loss -0.4753562808036804 LR -0.5085654258728027 LKL 0.03320915624499321
epoch 44111 loss -0.432591050863266 LR -0.46591776609420776 LKL 0.033326707780361176
epoch 44112 loss -0.4707593619823456 LR -0.5041108727455139 LKL 0.03335151821374893
epoch 44113 loss -0.4512113332748413 LR -0.4846039414405823 LKL 0.0333

42
epoch 44201 loss -0.4308074414730072 LR -0.46444612741470337 LKL 0.03363868221640587


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 44202 loss -0.3997463583946228 LR -0.4329802095890045 LKL 0.03323386237025261
epoch 44203 loss -0.4029068350791931 LR -0.4361727833747864 LKL 0.033265937119722366
epoch 44204 loss -0.48011714220046997 LR -0.5134392380714417 LKL 0.03332209959626198
epoch 44205 loss -0.4154830873012543 LR -0.4487733244895935 LKL 0.03329024463891983
epoch 44206 loss -0.4763624966144562 LR -0.5097271800041199 LKL 0.033364687114953995
epoch 44207 loss -0.5306234359741211 LR -0.5640118718147278 LKL 0.033388420939445496
epoch 44208 loss -0.531893253326416 LR -0.565398633480072 LKL 0.033505357801914215
epoch 44209 loss -0.4525732696056366 LR -0.4857841432094574 LKL 0.033210866153240204
epoch 44210 loss -0.45759421586990356 LR -0.4909319281578064 LKL 0.033337704837322235
epoch 44211 loss -0.4782760739326477 LR -0.5117499232292175 LKL 0.033473849296569824
epoch 44212 loss -0.4536930024623871 LR -0.4871208965778351 LKL 0.0334278829395771
epoch 44213 loss -0.4024789035320282 LR -0.4357141852378845 LKL 0.0332

91
epoch 44301 loss -0.4542914628982544 LR -0.48766762018203735 LKL 0.03337615728378296


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 44302 loss -0.4770298898220062 LR -0.510308563709259 LKL 0.0332786850631237
epoch 44303 loss -0.436224102973938 LR -0.4696847200393677 LKL 0.03346061706542969
epoch 44304 loss -0.45867300033569336 LR -0.49223095178604126 LKL 0.033557966351509094
epoch 44305 loss -0.4109230041503906 LR -0.44412940740585327 LKL 0.03320639580488205
epoch 44306 loss -0.4824124574661255 LR -0.5158510804176331 LKL 0.03343862295150757
epoch 44307 loss -0.48609545826911926 LR -0.5195319056510925 LKL 0.03343645855784416
epoch 44308 loss -0.4201171398162842 LR -0.4530811011791229 LKL 0.03296395018696785
epoch 44309 loss -0.40613412857055664 LR -0.4393611550331116 LKL 0.033227041363716125
epoch 44310 loss -0.5027872920036316 LR -0.5362552404403687 LKL 0.03346792981028557
epoch 44311 loss -0.4862912595272064 LR -0.5196244120597839 LKL 0.033333152532577515
epoch 44312 loss -0.5106257796287537 LR -0.5442301034927368 LKL 0.03360431268811226
epoch 44313 loss -0.49918320775032043 LR -0.532426655292511 LKL 0.03324

54
epoch 44401 loss -0.4389142692089081 LR -0.4722270965576172 LKL 0.033312827348709106


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 44402 loss -0.4795832335948944 LR -0.5131630897521973 LKL 0.033579856157302856
epoch 44403 loss -0.48079022765159607 LR -0.5142217874526978 LKL 0.03343157097697258
epoch 44404 loss -0.45196300745010376 LR -0.48517686128616333 LKL 0.03321384638547897
epoch 44405 loss -0.40464213490486145 LR -0.4377979636192322 LKL 0.03315582498908043
epoch 44406 loss -0.4310014843940735 LR -0.46414411067962646 LKL 0.03314261510968208
epoch 44407 loss -0.4032912254333496 LR -0.4366500973701477 LKL 0.0333588644862175
epoch 44408 loss -0.4208519756793976 LR -0.4540639817714691 LKL 0.03321200609207153
epoch 44409 loss -0.4538140296936035 LR -0.48708847165107727 LKL 0.03327444940805435
epoch 44410 loss -0.4838447570800781 LR -0.5172594785690308 LKL 0.03341473639011383
epoch 44411 loss -0.4303668439388275 LR -0.46362200379371643 LKL 0.03325517103075981
epoch 44412 loss -0.4942585825920105 LR -0.5278522968292236 LKL 0.033593710511922836
epoch 44413 loss -0.41269803047180176 LR -0.4459366798400879 LKL 0.0

epoch 44500 loss -0.4570336937904358 LR -0.4903758764266968 LKL 0.033342182636260986
61


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 44501 loss -0.39865681529045105 LR -0.43194878101348877 LKL 0.03329196944832802
epoch 44502 loss -0.47608423233032227 LR -0.5095428228378296 LKL 0.03345860168337822
epoch 44503 loss -0.48205119371414185 LR -0.5155944228172302 LKL 0.03354322165250778
epoch 44504 loss -0.46359938383102417 LR -0.4967533051967621 LKL 0.033153921365737915
epoch 44505 loss -0.4587532877922058 LR -0.4918776750564575 LKL 0.03312438726425171
epoch 44506 loss -0.38425618410110474 LR -0.41731151938438416 LKL 0.03305535018444061
epoch 44507 loss -0.474092036485672 LR -0.5073353052139282 LKL 0.033243272453546524
epoch 44508 loss -0.44149842858314514 LR -0.474961519241333 LKL 0.03346309810876846
epoch 44509 loss -0.45859554409980774 LR -0.49181067943573 LKL 0.033215127885341644
epoch 44510 loss -0.48466765880584717 LR -0.517844021320343 LKL 0.03317636996507645
epoch 44511 loss -0.5178194046020508 LR -0.551438570022583 LKL 0.033619169145822525
epoch 44512 loss -0.4626915454864502 LR -0.4962162971496582 LKL 0.03

epoch 44600 loss -0.4605916142463684 LR -0.4940934181213379 LKL 0.03350178897380829
57


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 44601 loss -0.4379946291446686 LR -0.4712274968624115 LKL 0.033232878893613815
epoch 44602 loss -0.4413480758666992 LR -0.4746857285499573 LKL 0.03333766385912895
epoch 44603 loss -0.4807743430137634 LR -0.5140751004219055 LKL 0.03330076113343239
epoch 44604 loss -0.4399532675743103 LR -0.47310030460357666 LKL 0.03314705193042755
epoch 44605 loss -0.42220139503479004 LR -0.4555971026420593 LKL 0.03339569270610809
epoch 44606 loss -0.5007179975509644 LR -0.5341543555259705 LKL 0.033436357975006104
epoch 44607 loss -0.49622422456741333 LR -0.5296955108642578 LKL 0.03347128629684448
epoch 44608 loss -0.3894862234592438 LR -0.4228741526603699 LKL 0.0333879180252552
epoch 44609 loss -0.43132713437080383 LR -0.4647166132926941 LKL 0.03338947519659996
epoch 44610 loss -0.4264856278896332 LR -0.4598982334136963 LKL 0.03341260179877281
epoch 44611 loss -0.5008203387260437 LR -0.5342811346054077 LKL 0.033460792154073715
epoch 44612 loss -0.45117977261543274 LR -0.48435643315315247 LKL 0.03

epoch 44700 loss -0.3352544903755188 LR -0.3680969476699829 LKL 0.03284246101975441
65


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 44701 loss -0.4514683485031128 LR -0.48499923944473267 LKL 0.033530887216329575
epoch 44702 loss -0.40818268060684204 LR -0.4415404200553894 LKL 0.03335772454738617
epoch 44703 loss -0.5055519342422485 LR -0.5390159487724304 LKL 0.03346399962902069
epoch 44704 loss -0.4979476034641266 LR -0.5313977599143982 LKL 0.0334501676261425
epoch 44705 loss -0.44621002674102783 LR -0.47967296838760376 LKL 0.033462926745414734
epoch 44706 loss -0.4969518780708313 LR -0.5303993225097656 LKL 0.03344745561480522
epoch 44707 loss -0.4926317036151886 LR -0.5261269211769104 LKL 0.0334952175617218
epoch 44708 loss -0.4853493869304657 LR -0.5188166499137878 LKL 0.03346725553274155
epoch 44709 loss -0.39392805099487305 LR -0.4271030128002167 LKL 0.03317496180534363
epoch 44710 loss -0.44925805926322937 LR -0.48276036977767944 LKL 0.03350231423974037
epoch 44711 loss -0.437839150428772 LR -0.4710610508918762 LKL 0.03322191536426544
epoch 44712 loss -0.46805208921432495 LR -0.5013340711593628 LKL 0.033

epoch 44800 loss -0.5044002532958984 LR -0.5378305315971375 LKL 0.033430248498916626
51


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 44801 loss -0.45924293994903564 LR -0.4923582077026367 LKL 0.033115267753601074
epoch 44802 loss -0.5109080076217651 LR -0.5442776083946228 LKL 0.033369626849889755
epoch 44803 loss -0.4100084900856018 LR -0.44326871633529663 LKL 0.03326024115085602
epoch 44804 loss -0.4462108612060547 LR -0.47957515716552734 LKL 0.03336430713534355
epoch 44805 loss -0.45799344778060913 LR -0.4915791153907776 LKL 0.03358565270900726
epoch 44806 loss -0.4683188199996948 LR -0.5017274618148804 LKL 0.033408645540475845
epoch 44807 loss -0.46777665615081787 LR -0.5012133121490479 LKL 0.03343665227293968
epoch 44808 loss -0.37995100021362305 LR -0.4130471348762512 LKL 0.03309614211320877
epoch 44809 loss -0.43969011306762695 LR -0.4727866053581238 LKL 0.033096492290496826
epoch 44810 loss -0.393768310546875 LR -0.4270479679107666 LKL 0.0332796573638916
epoch 44811 loss -0.40546971559524536 LR -0.43873047828674316 LKL 0.0332607701420784
epoch 44812 loss -0.45186445116996765 LR -0.4854118227958679 LKL 0

epoch 44899 loss -0.46676576137542725 LR -0.5001001954078674 LKL 0.03333444893360138
epoch 44900 loss -0.41373422741889954 LR -0.44690901041030884 LKL 0.0331747904419899
68
epoch 44901 loss -0.41191861033439636 LR -0.44533970952033997 LKL 0.033421099185943604


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 44902 loss -0.3812958300113678 LR -0.41472578048706055 LKL 0.033429957926273346
epoch 44903 loss -0.4932309091091156 LR -0.526812732219696 LKL 0.03358182683587074
epoch 44904 loss -0.39927902817726135 LR -0.432458758354187 LKL 0.03317972645163536
epoch 44905 loss -0.3612842857837677 LR -0.39466020464897156 LKL 0.033375926315784454
epoch 44906 loss -0.45713213086128235 LR -0.4905085861682892 LKL 0.03337646275758743
epoch 44907 loss -0.48176613450050354 LR -0.5152562260627747 LKL 0.03349008783698082
epoch 44908 loss -0.47197598218917847 LR -0.5055248141288757 LKL 0.033548831939697266
epoch 44909 loss -0.33010244369506836 LR -0.36333656311035156 LKL 0.0332341194152832
epoch 44910 loss -0.37730836868286133 LR -0.4104239046573639 LKL 0.03311553969979286
epoch 44911 loss -0.4180464744567871 LR -0.45113691687583923 LKL 0.03309044614434242
epoch 44912 loss -0.43042463064193726 LR -0.4637795090675354 LKL 0.033354878425598145
epoch 44913 loss -0.455644428730011 LR -0.48916810750961304 LKL 

45
epoch 45001 loss -0.4567064046859741 LR -0.4900568723678589 LKL 0.03335046395659447


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 45002 loss -0.5152797102928162 LR -0.5489462018013 LKL 0.03366650640964508
epoch 45003 loss -0.40779536962509155 LR -0.44114482402801514 LKL 0.03334946930408478
epoch 45004 loss -0.5450884103775024 LR -0.5785598754882812 LKL 0.03347143903374672
epoch 45005 loss -0.47271618247032166 LR -0.5061198472976685 LKL 0.033403657376766205
epoch 45006 loss -0.43679338693618774 LR -0.46989884972572327 LKL 0.033105455338954926
epoch 45007 loss -0.4892476201057434 LR -0.522712230682373 LKL 0.03346461430191994
epoch 45008 loss -0.405680775642395 LR -0.43887683749198914 LKL 0.03319606930017471
epoch 45009 loss -0.43676427006721497 LR -0.47008198499679565 LKL 0.03331771120429039
epoch 45010 loss -0.4412843883037567 LR -0.474648654460907 LKL 0.03336426988244057
epoch 45011 loss -0.47190162539482117 LR -0.5051636099815369 LKL 0.033261992037296295
epoch 45012 loss -0.43857184052467346 LR -0.4717522859573364 LKL 0.03318045660853386
epoch 45013 loss -0.4469948410987854 LR -0.48025888204574585 LKL 0.03

epoch 45100 loss -0.49030202627182007 LR -0.523969292640686 LKL 0.03366727754473686
94
epoch 45101 loss -0.38962677121162415 LR -0.42292726039886475 LKL 0.033300478011369705


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 45102 loss -0.4243491590023041 LR -0.45752403140068054 LKL 0.033174868673086166
epoch 45103 loss -0.49409380555152893 LR -0.5277137160301208 LKL 0.03361990675330162
epoch 45104 loss -0.4393523335456848 LR -0.4727703332901001 LKL 0.033417992293834686
epoch 45105 loss -0.4738447964191437 LR -0.5073888301849365 LKL 0.03354402631521225
epoch 45106 loss -0.43555089831352234 LR -0.46898895502090454 LKL 0.033438049256801605
epoch 45107 loss -0.42206525802612305 LR -0.4554125964641571 LKL 0.03334732726216316
epoch 45108 loss -0.404755175113678 LR -0.4380488395690918 LKL 0.033293649554252625
epoch 45109 loss -0.4493257403373718 LR -0.48285913467407227 LKL 0.03353339806199074
epoch 45110 loss -0.357700914144516 LR -0.3908025622367859 LKL 0.0331016480922699
epoch 45111 loss -0.44623738527297974 LR -0.4797934889793396 LKL 0.03355610743165016
epoch 45112 loss -0.450899213552475 LR -0.48442304134368896 LKL 0.033523816615343094
epoch 45113 loss -0.4631539583206177 LR -0.4965638816356659 LKL 0.0

55
epoch 45201 loss -0.4292714297771454 LR -0.4624742269515991 LKL 0.033202797174453735


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 45202 loss -0.4283060133457184 LR -0.4617103040218353 LKL 0.03340429440140724
epoch 45203 loss -0.48360544443130493 LR -0.5171689987182617 LKL 0.033563561737537384
epoch 45204 loss -0.46738922595977783 LR -0.500742495059967 LKL 0.03335326537489891
epoch 45205 loss -0.5112980604171753 LR -0.5447728633880615 LKL 0.03347479924559593
epoch 45206 loss -0.44569727778434753 LR -0.4789029061794281 LKL 0.03320563584566116
epoch 45207 loss -0.487781286239624 LR -0.5212184190750122 LKL 0.033437132835388184
epoch 45208 loss -0.45318803191185 LR -0.4866628646850586 LKL 0.033474843949079514
epoch 45209 loss -0.4288928806781769 LR -0.4622615575790405 LKL 0.03336867317557335
epoch 45210 loss -0.4639824628829956 LR -0.4972275495529175 LKL 0.03324507921934128
epoch 45211 loss -0.4201696515083313 LR -0.45341724157333374 LKL 0.033247582614421844
epoch 45212 loss -0.41237759590148926 LR -0.44589763879776 LKL 0.03352002799510956
epoch 45213 loss -0.38807666301727295 LR -0.42114728689193726 LKL 0.03307

119
epoch 45301 loss -0.4267505407333374 LR -0.4597845673561096 LKL 0.033034030348062515


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 45302 loss -0.5105438232421875 LR -0.5441964864730835 LKL 0.033652663230895996
epoch 45303 loss -0.44008147716522217 LR -0.4737301170825958 LKL 0.03364865481853485
epoch 45304 loss -0.44441041350364685 LR -0.477713942527771 LKL 0.03330354019999504
epoch 45305 loss -0.3528408706188202 LR -0.385921835899353 LKL 0.033080972731113434
epoch 45306 loss -0.42923012375831604 LR -0.46256935596466064 LKL 0.033339232206344604
epoch 45307 loss -0.4260883927345276 LR -0.4594819247722626 LKL 0.033393532037734985
epoch 45308 loss -0.41608625650405884 LR -0.4494595229625702 LKL 0.03337325155735016
epoch 45309 loss -0.47648748755455017 LR -0.5098411440849304 LKL 0.03335366025567055
epoch 45310 loss -0.4270324110984802 LR -0.4605984389781952 LKL 0.03356602415442467
epoch 45311 loss -0.5145220160484314 LR -0.5479397773742676 LKL 0.033417753875255585
epoch 45312 loss -0.4482925534248352 LR -0.4816211462020874 LKL 0.03332860767841339
epoch 45313 loss -0.48805761337280273 LR -0.5214880704879761 LKL 0.

48
epoch 45401 loss -0.4549517035484314 LR -0.48844975233078003 LKL 0.033498041331768036


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 45402 loss -0.40530920028686523 LR -0.43868735432624817 LKL 0.03337815776467323
epoch 45403 loss -0.44812098145484924 LR -0.48166346549987793 LKL 0.03354247659444809
epoch 45404 loss -0.48394832015037537 LR -0.517492949962616 LKL 0.033544618636369705
epoch 45405 loss -0.407087504863739 LR -0.44029173254966736 LKL 0.03320423886179924
epoch 45406 loss -0.4522656798362732 LR -0.4856586158275604 LKL 0.03339293226599693
epoch 45407 loss -0.4342818260192871 LR -0.46775758266448975 LKL 0.03347574919462204
epoch 45408 loss -0.389570951461792 LR -0.4229235053062439 LKL 0.03335254639387131
epoch 45409 loss -0.43611472845077515 LR -0.46961984038352966 LKL 0.033505119383335114
epoch 45410 loss -0.42300161719322205 LR -0.45640552043914795 LKL 0.03340389207005501
epoch 45411 loss -0.42065879702568054 LR -0.4538998007774353 LKL 0.03324100747704506
epoch 45412 loss -0.44131314754486084 LR -0.4743935167789459 LKL 0.03308036923408508
epoch 45413 loss -0.48230305314064026 LR -0.5157658457756042 LKL

epoch 45500 loss -0.45431727170944214 LR -0.48772647976875305 LKL 0.03340922296047211
49


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 45501 loss -0.41016703844070435 LR -0.4433002173900604 LKL 0.03313317149877548
epoch 45502 loss -0.4518570899963379 LR -0.48536229133605957 LKL 0.033505216240882874
epoch 45503 loss -0.43230611085891724 LR -0.46551787853240967 LKL 0.03321175277233124
epoch 45504 loss -0.40880653262138367 LR -0.4423094391822815 LKL 0.033502910286188126
epoch 45505 loss -0.4791185259819031 LR -0.5125143527984619 LKL 0.03339581936597824
epoch 45506 loss -0.4254779517650604 LR -0.45879679918289185 LKL 0.033318836241960526
epoch 45507 loss -0.44411343336105347 LR -0.4772760272026062 LKL 0.03316258266568184
epoch 45508 loss -0.470729798078537 LR -0.5043637156486511 LKL 0.033633921295404434
epoch 45509 loss -0.4402058720588684 LR -0.4735521078109741 LKL 0.03334623575210571
epoch 45510 loss -0.5275303721427917 LR -0.5612908601760864 LKL 0.03376051038503647
epoch 45511 loss -0.4098607003688812 LR -0.44324690103530884 LKL 0.033386196941137314
epoch 45512 loss -0.5127952098846436 LR -0.5462387204170227 LKL 

epoch 45600 loss -0.5003626346588135 LR -0.5338876247406006 LKL 0.03352496773004532
55


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 45601 loss -0.47832608222961426 LR -0.5119104385375977 LKL 0.0335843600332737
epoch 45602 loss -0.41155460476875305 LR -0.44504982233047485 LKL 0.0334952138364315
epoch 45603 loss -0.4174100458621979 LR -0.4507201611995697 LKL 0.03331012278795242
epoch 45604 loss -0.4868837296962738 LR -0.5205035209655762 LKL 0.033619798719882965
epoch 45605 loss -0.4898897409439087 LR -0.523242175579071 LKL 0.03335242345929146
epoch 45606 loss -0.41842055320739746 LR -0.45201388001441956 LKL 0.033593323081731796
epoch 45607 loss -0.4097960591316223 LR -0.44308552145957947 LKL 0.033289454877376556
epoch 45608 loss -0.43533238768577576 LR -0.46867579221725464 LKL 0.03334341198205948
epoch 45609 loss -0.4510301947593689 LR -0.4847221076488495 LKL 0.03369190916419029
epoch 45610 loss -0.396078884601593 LR -0.4295221269130707 LKL 0.033443253487348557
epoch 45611 loss -0.41198796033859253 LR -0.44529205560684204 LKL 0.03330409899353981
epoch 45612 loss -0.547770619392395 LR -0.5816075801849365 LKL 0.0

epoch 45700 loss -0.4623311460018158 LR -0.4958655536174774 LKL 0.03353441506624222
65


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 45701 loss -0.43477943539619446 LR -0.4680871069431305 LKL 0.033307675272226334
epoch 45702 loss -0.47621431946754456 LR -0.5095832347869873 LKL 0.03336890786886215
epoch 45703 loss -0.4635757803916931 LR -0.4972139000892639 LKL 0.0336381159722805
epoch 45704 loss -0.4348851442337036 LR -0.4684462547302246 LKL 0.03356112539768219
epoch 45705 loss -0.45609214901924133 LR -0.4893631935119629 LKL 0.03327103704214096
epoch 45706 loss -0.4083825945854187 LR -0.44171279668807983 LKL 0.03333021700382233
epoch 45707 loss -0.5012831687927246 LR -0.534815788269043 LKL 0.03353261947631836
epoch 45708 loss -0.39215824007987976 LR -0.42543739080429077 LKL 0.03327915817499161
epoch 45709 loss -0.49948033690452576 LR -0.5330137014389038 LKL 0.03353337571024895
epoch 45710 loss -0.5012981295585632 LR -0.5348675847053528 LKL 0.03356945514678955
epoch 45711 loss -0.4510694742202759 LR -0.4843589961528778 LKL 0.033289529383182526
epoch 45712 loss -0.49892938137054443 LR -0.5324167609214783 LKL 0.03

epoch 45800 loss -0.4672490954399109 LR -0.5006037950515747 LKL 0.03335469588637352
81
epoch 45801 loss -0.4772563576698303 LR -0.5105273127555847 LKL 0.0332709439098835


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 45802 loss -0.4202410876750946 LR -0.4537159502506256 LKL 0.033474866300821304
epoch 45803 loss -0.44727787375450134 LR -0.4807649850845337 LKL 0.03348710015416145
epoch 45804 loss -0.4089498817920685 LR -0.442297101020813 LKL 0.03334721177816391
epoch 45805 loss -0.4605293869972229 LR -0.49395114183425903 LKL 0.03342174366116524
epoch 45806 loss -0.4354069232940674 LR -0.4688204526901245 LKL 0.03341352194547653
epoch 45807 loss -0.5069189667701721 LR -0.5405898690223694 LKL 0.033670905977487564
epoch 45808 loss -0.43874916434288025 LR -0.4723023772239685 LKL 0.03355320170521736
epoch 45809 loss -0.446065753698349 LR -0.4795120656490326 LKL 0.033446308225393295
epoch 45810 loss -0.45301535725593567 LR -0.48658907413482666 LKL 0.033573709428310394
epoch 45811 loss -0.5069628357887268 LR -0.5405500531196594 LKL 0.03358723595738411
epoch 45812 loss -0.4820745885372162 LR -0.5155519247055054 LKL 0.033477332442998886
epoch 45813 loss -0.5580946207046509 LR -0.5918015241622925 LKL 0.03

epoch 45900 loss -0.4843255877494812 LR -0.5176779627799988 LKL 0.03335237130522728
64


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 45901 loss -0.5152648687362671 LR -0.5487711429595947 LKL 0.03350627049803734
epoch 45902 loss -0.4300691485404968 LR -0.4631587266921997 LKL 0.03308958187699318
epoch 45903 loss -0.46717631816864014 LR -0.5007504820823669 LKL 0.03357416018843651
epoch 45904 loss -0.4262990355491638 LR -0.4595562219619751 LKL 0.033257171511650085
epoch 45905 loss -0.49182188510894775 LR -0.5254776477813721 LKL 0.03365577757358551
epoch 45906 loss -0.3589988648891449 LR -0.3921407163143158 LKL 0.0331418439745903
epoch 45907 loss -0.47449949383735657 LR -0.5079164505004883 LKL 0.033416956663131714
epoch 45908 loss -0.46797388792037964 LR -0.5016754865646362 LKL 0.03370160609483719
epoch 45909 loss -0.42861300706863403 LR -0.4621542692184448 LKL 0.033541273325681686
epoch 45910 loss -0.44166457653045654 LR -0.47515273094177246 LKL 0.03348814696073532
epoch 45911 loss -0.38224703073501587 LR -0.41534221172332764 LKL 0.03309516981244087
epoch 45912 loss -0.44143328070640564 LR -0.4748157858848572 LKL 

epoch 46000 loss -0.420890212059021 LR -0.4541710615158081 LKL 0.033280860632658005
80


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 46001 loss -0.47433382272720337 LR -0.507798433303833 LKL 0.03346461057662964
epoch 46002 loss -0.4041553735733032 LR -0.4374368190765381 LKL 0.03328144550323486
epoch 46003 loss -0.49209779500961304 LR -0.5258086919784546 LKL 0.03371091187000275
epoch 46004 loss -0.40455251932144165 LR -0.438016802072525 LKL 0.033464282751083374
epoch 46005 loss -0.5308826565742493 LR -0.5644915103912354 LKL 0.03360884636640549
epoch 46006 loss -0.46884098649024963 LR -0.5022813677787781 LKL 0.033440377563238144
epoch 46007 loss -0.47396180033683777 LR -0.5072838068008423 LKL 0.03332200273871422
epoch 46008 loss -0.5315262079238892 LR -0.565326452255249 LKL 0.03380022570490837
epoch 46009 loss -0.44756442308425903 LR -0.4812099039554596 LKL 0.03364549204707146
epoch 46010 loss -0.49293482303619385 LR -0.5264322757720947 LKL 0.033497463911771774
epoch 46011 loss -0.46671637892723083 LR -0.5001310110092163 LKL 0.03341462090611458
epoch 46012 loss -0.4702247381210327 LR -0.5038853883743286 LKL 0.03

epoch 46099 loss -0.43666061758995056 LR -0.4700741767883301 LKL 0.03341354802250862
epoch 46100 loss -0.48512083292007446 LR -0.5187238454818726 LKL 0.0336030013859272
84
epoch 46101 loss -0.515157163143158 LR -0.5489124059677124 LKL 0.03375525027513504


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 46102 loss -0.43149587512016296 LR -0.4647783041000366 LKL 0.03328241780400276
epoch 46103 loss -0.4221791923046112 LR -0.45565688610076904 LKL 0.03347770497202873
epoch 46104 loss -0.43005967140197754 LR -0.4636530578136444 LKL 0.03359338641166687
epoch 46105 loss -0.49534815549850464 LR -0.5290094017982483 LKL 0.03366123512387276
epoch 46106 loss -0.4977646470069885 LR -0.5314279198646545 LKL 0.03366328403353691
epoch 46107 loss -0.45674946904182434 LR -0.49025964736938477 LKL 0.033510178327560425
epoch 46108 loss -0.4450012147426605 LR -0.4784789979457855 LKL 0.033477772027254105
epoch 46109 loss -0.42057347297668457 LR -0.45393866300582886 LKL 0.033365197479724884
epoch 46110 loss -0.4590164124965668 LR -0.492423415184021 LKL 0.033407002687454224
epoch 46111 loss -0.52583247423172 LR -0.5593674778938293 LKL 0.03353502228856087
epoch 46112 loss -0.5270047187805176 LR -0.5609411001205444 LKL 0.03393639251589775
epoch 46113 loss -0.48014721274375916 LR -0.5136844515800476 LKL 0.

epoch 46200 loss -0.5184261798858643 LR -0.5520036816596985 LKL 0.03357750549912453
57


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 46201 loss -0.39298638701438904 LR -0.4263600707054138 LKL 0.03337367624044418
epoch 46202 loss -0.4378111958503723 LR -0.4710742235183716 LKL 0.03326302021741867
epoch 46203 loss -0.4200066328048706 LR -0.45336705446243286 LKL 0.03336040675640106
epoch 46204 loss -0.4568396806716919 LR -0.49039971828460693 LKL 0.033560026437044144
epoch 46205 loss -0.43549180030822754 LR -0.46893852949142456 LKL 0.03344673663377762
epoch 46206 loss -0.4359169900417328 LR -0.4693564772605896 LKL 0.03343949466943741
epoch 46207 loss -0.39802733063697815 LR -0.43142834305763245 LKL 0.0334010049700737
epoch 46208 loss -0.44495144486427307 LR -0.47853413224220276 LKL 0.03358267992734909
epoch 46209 loss -0.47046151757240295 LR -0.5042110681533813 LKL 0.03374956175684929
epoch 46210 loss -0.4700008034706116 LR -0.503368616104126 LKL 0.033367808908224106
epoch 46211 loss -0.44220927357673645 LR -0.4757537245750427 LKL 0.03354443982243538
epoch 46212 loss -0.5246670842170715 LR -0.5584102869033813 LKL 0

epoch 46298 loss -0.5164058804512024 LR -0.5501405000686646 LKL 0.03373461961746216
epoch 46299 loss -0.38526809215545654 LR -0.41864198446273804 LKL 0.033373892307281494
epoch 46300 loss -0.4531603157520294 LR -0.48680150508880615 LKL 0.03364119306206703
55
epoch 46301 loss -0.4844277501106262 LR -0.5178815722465515 LKL 0.0334538109600544


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 46302 loss -0.49146077036857605 LR -0.5249667167663574 LKL 0.03350595384836197
epoch 46303 loss -0.42552006244659424 LR -0.4589623510837555 LKL 0.03344228118658066
epoch 46304 loss -0.43227922916412354 LR -0.46560966968536377 LKL 0.033330440521240234
epoch 46305 loss -0.45809561014175415 LR -0.4914095997810364 LKL 0.033313997089862823
epoch 46306 loss -0.4031292796134949 LR -0.43632566928863525 LKL 0.033196378499269485
epoch 46307 loss -0.4362364709377289 LR -0.4695591330528259 LKL 0.03332267329096794
epoch 46308 loss -0.41118040680885315 LR -0.4445194602012634 LKL 0.03333904966711998
epoch 46309 loss -0.38715940713882446 LR -0.42055827379226685 LKL 0.033398859202861786
epoch 46310 loss -0.520565927028656 LR -0.5542595386505127 LKL 0.03369363397359848
epoch 46311 loss -0.39612817764282227 LR -0.4295390248298645 LKL 0.03341085463762283
epoch 46312 loss -0.4687057435512543 LR -0.5022373795509338 LKL 0.03353162854909897
epoch 46313 loss -0.4035182595252991 LR -0.4368791878223419 LKL

epoch 46400 loss -0.4254019856452942 LR -0.45891594886779785 LKL 0.033513955771923065
56


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 46401 loss -0.48359864950180054 LR -0.5172634124755859 LKL 0.03366474807262421
epoch 46402 loss -0.47401607036590576 LR -0.5075284242630005 LKL 0.033512361347675323
epoch 46403 loss -0.42178115248680115 LR -0.4552074670791626 LKL 0.03342631831765175
epoch 46404 loss -0.4532787501811981 LR -0.486902117729187 LKL 0.03362337127327919
epoch 46405 loss -0.5033484697341919 LR -0.5368903279304504 LKL 0.03354187309741974
epoch 46406 loss -0.5214196443557739 LR -0.5551696419715881 LKL 0.03374999761581421
epoch 46407 loss -0.470100075006485 LR -0.50369793176651 LKL 0.03359784930944443
epoch 46408 loss -0.46770399808883667 LR -0.5012924075126648 LKL 0.03358840569853783
epoch 46409 loss -0.44568488001823425 LR -0.47926974296569824 LKL 0.03358485549688339
epoch 46410 loss -0.46863827109336853 LR -0.5021161437034607 LKL 0.033477868884801865
epoch 46411 loss -0.5151932239532471 LR -0.5489873290061951 LKL 0.033794090151786804
epoch 46412 loss -0.3954368531703949 LR -0.4286918044090271 LKL 0.0332

epoch 46499 loss -0.47608625888824463 LR -0.5099075436592102 LKL 0.03382129222154617
epoch 46500 loss -0.5002236366271973 LR -0.5337774753570557 LKL 0.03355380892753601
61
epoch 46501 loss -0.4481503963470459 LR -0.48179101943969727 LKL 0.03364061191678047


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 46502 loss -0.49058520793914795 LR -0.5242862701416016 LKL 0.03370106592774391
epoch 46503 loss -0.44892776012420654 LR -0.4824577271938324 LKL 0.03352997079491615
epoch 46504 loss -0.4052988886833191 LR -0.4388352334499359 LKL 0.03353635221719742
epoch 46505 loss -0.4445623457431793 LR -0.4778442680835724 LKL 0.033281922340393066
epoch 46506 loss -0.5031733512878418 LR -0.5369127988815308 LKL 0.033739469945430756
epoch 46507 loss -0.46112415194511414 LR -0.4948359727859497 LKL 0.03371182084083557
epoch 46508 loss -0.41920068860054016 LR -0.45281782746315 LKL 0.03361714258790016
epoch 46509 loss -0.3822855055332184 LR -0.4156312346458435 LKL 0.03334571793675423
epoch 46510 loss -0.4808224141597748 LR -0.5144196152687073 LKL 0.0335971936583519
epoch 46511 loss -0.46168214082717896 LR -0.49539580941200256 LKL 0.0337136834859848
epoch 46512 loss -0.4451824426651001 LR -0.4788120985031128 LKL 0.0336296409368515
epoch 46513 loss -0.396648108959198 LR -0.4299575090408325 LKL 0.03330940

66
epoch 46601 loss -0.4396882951259613 LR -0.4730428457260132 LKL 0.03335454687476158


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 46602 loss -0.4936489760875702 LR -0.5272352695465088 LKL 0.033586286008358
epoch 46603 loss -0.42363306879997253 LR -0.45706045627593994 LKL 0.033427394926548004
epoch 46604 loss -0.4274527132511139 LR -0.4608187973499298 LKL 0.03336607292294502
epoch 46605 loss -0.4599047005176544 LR -0.49345701932907104 LKL 0.033552318811416626
epoch 46606 loss -0.4072789251804352 LR -0.4406694769859314 LKL 0.03339056298136711
epoch 46607 loss -0.49787256121635437 LR -0.531417965888977 LKL 0.03354541212320328
epoch 46608 loss -0.4191276431083679 LR -0.45254722237586975 LKL 0.03341956436634064
epoch 46609 loss -0.4065963327884674 LR -0.43992388248443604 LKL 0.03332754224538803
epoch 46610 loss -0.47624415159225464 LR -0.5095416307449341 LKL 0.03329746425151825
epoch 46611 loss -0.4647815525531769 LR -0.4984405040740967 LKL 0.0336589477956295
epoch 46612 loss -0.48922738432884216 LR -0.5230302214622498 LKL 0.033802829682826996
epoch 46613 loss -0.4308524429798126 LR -0.46397972106933594 LKL 0.03

epoch 46700 loss -0.45695167779922485 LR -0.49064528942108154 LKL 0.033693600445985794
49


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 46701 loss -0.47259271144866943 LR -0.5062288045883179 LKL 0.033636096864938736
epoch 46702 loss -0.47801750898361206 LR -0.5116677284240723 LKL 0.03365020453929901
epoch 46703 loss -0.43503814935684204 LR -0.468458354473114 LKL 0.033420201390981674
epoch 46704 loss -0.44698160886764526 LR -0.4805619716644287 LKL 0.03358037769794464
epoch 46705 loss -0.43830162286758423 LR -0.4718132019042969 LKL 0.033511579036712646
epoch 46706 loss -0.5009925961494446 LR -0.5347006320953369 LKL 0.03370806202292442
epoch 46707 loss -0.4732303023338318 LR -0.5067830085754395 LKL 0.033552706241607666
epoch 46708 loss -0.42424386739730835 LR -0.45794355869293213 LKL 0.03369969129562378
epoch 46709 loss -0.42862412333488464 LR -0.4618767201900482 LKL 0.033252596855163574
epoch 46710 loss -0.46342167258262634 LR -0.497112512588501 LKL 0.03369084745645523
epoch 46711 loss -0.48549947142601013 LR -0.5191579461097717 LKL 0.033658482134342194
epoch 46712 loss -0.5111332535743713 LR -0.544924795627594 LKL

epoch 46799 loss -0.43480148911476135 LR -0.4680793285369873 LKL 0.03327785059809685
epoch 46800 loss -0.36207544803619385 LR -0.39546671509742737 LKL 0.03339125216007233
53
epoch 46801 loss -0.4869152903556824 LR -0.5204175710678101 LKL 0.03350226953625679


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 46802 loss -0.45312944054603577 LR -0.4866738021373749 LKL 0.03354437276721001
epoch 46803 loss -0.39121222496032715 LR -0.4247017502784729 LKL 0.03348953276872635
epoch 46804 loss -0.4530733823776245 LR -0.4868490993976593 LKL 0.033775728195905685
epoch 46805 loss -0.4436505436897278 LR -0.477236807346344 LKL 0.03358626738190651
epoch 46806 loss -0.4240453243255615 LR -0.45759183168411255 LKL 0.03354651853442192
epoch 46807 loss -0.490721732378006 LR -0.5240709781646729 LKL 0.03334924951195717
epoch 46808 loss -0.4609372019767761 LR -0.49440625309944153 LKL 0.03346903994679451
epoch 46809 loss -0.4770706295967102 LR -0.5104570984840393 LKL 0.033386483788490295
epoch 46810 loss -0.4615268111228943 LR -0.49504292011260986 LKL 0.03351609408855438
epoch 46811 loss -0.4383259117603302 LR -0.47171080112457275 LKL 0.03338489681482315
epoch 46812 loss -0.46463313698768616 LR -0.49818629026412964 LKL 0.033553145825862885
epoch 46813 loss -0.41624730825424194 LR -0.44975703954696655 LKL 0

67
epoch 46901 loss -0.3895566761493683 LR -0.4231654703617096 LKL 0.03360879048705101


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 46902 loss -0.4495387673377991 LR -0.4830188751220703 LKL 0.033480092883110046
epoch 46903 loss -0.3738296627998352 LR -0.40739762783050537 LKL 0.03356795012950897
epoch 46904 loss -0.43659818172454834 LR -0.4704163074493408 LKL 0.03381813317537308
epoch 46905 loss -0.593252420425415 LR -0.6272287368774414 LKL 0.03397631272673607
epoch 46906 loss -0.5076851844787598 LR -0.5414127707481384 LKL 0.03372757509350777
epoch 46907 loss -0.4525476396083832 LR -0.48614242672920227 LKL 0.03359479457139969
epoch 46908 loss -0.4375162124633789 LR -0.4711689352989197 LKL 0.03365272283554077
epoch 46909 loss -0.38841763138771057 LR -0.4217585027217865 LKL 0.03334088250994682
epoch 46910 loss -0.430233359336853 LR -0.46373915672302246 LKL 0.033505793660879135
epoch 46911 loss -0.31911006569862366 LR -0.35247987508773804 LKL 0.033369820564985275
epoch 46912 loss -0.46098777651786804 LR -0.4945356547832489 LKL 0.03354788199067116
epoch 46913 loss -0.48658305406570435 LR -0.5200425982475281 LKL 0.

52
epoch 47001 loss -0.4578191936016083 LR -0.49138033390045166 LKL 0.03356115147471428


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 47002 loss -0.44286447763442993 LR -0.4762846827507019 LKL 0.033420197665691376
epoch 47003 loss -0.5021651983261108 LR -0.5357891321182251 LKL 0.03362390398979187
epoch 47004 loss -0.465146541595459 LR -0.4987431466579437 LKL 0.033596619963645935
epoch 47005 loss -0.4065304398536682 LR -0.43985044956207275 LKL 0.03332000970840454
epoch 47006 loss -0.46304336190223694 LR -0.49662166833877563 LKL 0.0335782952606678
epoch 47007 loss -0.4346885681152344 LR -0.468411922454834 LKL 0.0337233692407608
epoch 47008 loss -0.416756272315979 LR -0.450379341840744 LKL 0.03362307325005531
epoch 47009 loss -0.4285650849342346 LR -0.46212494373321533 LKL 0.03355986997485161
epoch 47010 loss -0.3806053400039673 LR -0.4138513505458832 LKL 0.03324601799249649
epoch 47011 loss -0.43020182847976685 LR -0.46371880173683167 LKL 0.033516958355903625
epoch 47012 loss -0.4498424232006073 LR -0.4833296537399292 LKL 0.033487219363451004
epoch 47013 loss -0.5117497444152832 LR -0.5455929040908813 LKL 0.03384

85
epoch 47101 loss -0.4657216966152191 LR -0.4992508888244629 LKL 0.033529188483953476


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 47102 loss -0.4819747805595398 LR -0.5156167149543762 LKL 0.03364192321896553
epoch 47103 loss -0.4059678912162781 LR -0.4392648935317993 LKL 0.03329699486494064
epoch 47104 loss -0.40963825583457947 LR -0.4429492950439453 LKL 0.03331104293465614
epoch 47105 loss -0.5063328742980957 LR -0.5398163199424744 LKL 0.033483438193798065
epoch 47106 loss -0.44506487250328064 LR -0.47886618971824646 LKL 0.033801328390836716
epoch 47107 loss -0.4739961326122284 LR -0.5073424577713013 LKL 0.03334633260965347
epoch 47108 loss -0.39868631958961487 LR -0.43227601051330566 LKL 0.0335896834731102
epoch 47109 loss -0.4652264416217804 LR -0.4989938735961914 LKL 0.03376743569970131
epoch 47110 loss -0.42130181193351746 LR -0.4548724591732025 LKL 0.03357064723968506
epoch 47111 loss -0.4320423901081085 LR -0.4655134379863739 LKL 0.033471040427684784
epoch 47112 loss -0.4442020058631897 LR -0.477664589881897 LKL 0.03346259891986847
epoch 47113 loss -0.46023210883140564 LR -0.49389851093292236 LKL 0.0

epoch 47200 loss -0.5155176520347595 LR -0.5490556359291077 LKL 0.03353800252079964
70


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 47201 loss -0.48082584142684937 LR -0.5143634080886841 LKL 0.03353755176067352
epoch 47202 loss -0.39982569217681885 LR -0.43302521109580994 LKL 0.03319951891899109
epoch 47203 loss -0.3686561584472656 LR -0.4021366238594055 LKL 0.03348047286272049
epoch 47204 loss -0.48105621337890625 LR -0.5148776173591614 LKL 0.033821411430835724
epoch 47205 loss -0.41868558526039124 LR -0.45221051573753357 LKL 0.03352491930127144
epoch 47206 loss -0.481101930141449 LR -0.514652669429779 LKL 0.03355072811245918
epoch 47207 loss -0.4282580614089966 LR -0.46176430583000183 LKL 0.03350624814629555
epoch 47208 loss -0.3998342752456665 LR -0.4333435595035553 LKL 0.03350929543375969
epoch 47209 loss -0.47485077381134033 LR -0.508180558681488 LKL 0.0333297923207283
epoch 47210 loss -0.4773619472980499 LR -0.5108321309089661 LKL 0.03347017988562584
epoch 47211 loss -0.470706582069397 LR -0.5041793584823608 LKL 0.033472783863544464
epoch 47212 loss -0.44516968727111816 LR -0.47889891266822815 LKL 0.033

epoch 47300 loss -0.48128220438957214 LR -0.5149938464164734 LKL 0.03371163085103035
109
epoch 47301 loss -0.45547229051589966 LR -0.48920273780822754 LKL 0.033730436116456985


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 47302 loss -0.43280288577079773 LR -0.4660326838493347 LKL 0.033229801803827286
epoch 47303 loss -0.4104068875312805 LR -0.4436580240726471 LKL 0.03325114771723747
epoch 47304 loss -0.3457745313644409 LR -0.3790937662124634 LKL 0.03331923484802246
epoch 47305 loss -0.4686894714832306 LR -0.5022740364074707 LKL 0.03358457610011101
epoch 47306 loss -0.38703712821006775 LR -0.4205956757068634 LKL 0.033558543771505356
epoch 47307 loss -0.47155332565307617 LR -0.5050044655799866 LKL 0.0334511436522007
epoch 47308 loss -0.41225820779800415 LR -0.44574737548828125 LKL 0.033489178866147995
epoch 47309 loss -0.5046273469924927 LR -0.5383729934692383 LKL 0.033745672553777695
epoch 47310 loss -0.5116007924079895 LR -0.5453897714614868 LKL 0.03378896787762642
epoch 47311 loss -0.4992296099662781 LR -0.5329891443252563 LKL 0.033759523183107376
epoch 47312 loss -0.40142637491226196 LR -0.4348374307155609 LKL 0.03341106325387955
epoch 47313 loss -0.4463924467563629 LR -0.4800686240196228 LKL 0.

51
epoch 47401 loss -0.4634215533733368 LR -0.49710312485694885 LKL 0.03368156775832176


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 47402 loss -0.43419092893600464 LR -0.4677082300186157 LKL 0.03351730853319168
epoch 47403 loss -0.5014128088951111 LR -0.5351709127426147 LKL 0.03375811129808426
epoch 47404 loss -0.4238239824771881 LR -0.4574945867061615 LKL 0.03367059677839279
epoch 47405 loss -0.4578606188297272 LR -0.49140027165412903 LKL 0.033539656549692154
epoch 47406 loss -0.4657319188117981 LR -0.49926817417144775 LKL 0.03353627026081085
epoch 47407 loss -0.4841338098049164 LR -0.5180374383926392 LKL 0.033903639763593674
epoch 47408 loss -0.3550431430339813 LR -0.38813287019729614 LKL 0.03308972343802452
epoch 47409 loss -0.45472797751426697 LR -0.48856663703918457 LKL 0.0338386707007885
epoch 47410 loss -0.41287460923194885 LR -0.446356862783432 LKL 0.033482249826192856
epoch 47411 loss -0.4526635706424713 LR -0.48615196347236633 LKL 0.033488400280475616
epoch 47412 loss -0.46200239658355713 LR -0.49569278955459595 LKL 0.03369040787220001
epoch 47413 loss -0.5056077241897583 LR -0.5393372774124146 LKL 

82
epoch 47501 loss -0.46146222949028015 LR -0.4950213134288788 LKL 0.03355908393859863


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 47502 loss -0.41404032707214355 LR -0.4476076364517212 LKL 0.03356730937957764
epoch 47503 loss -0.4618532061576843 LR -0.4957715570926666 LKL 0.0339183583855629
epoch 47504 loss -0.4291166663169861 LR -0.46297523379325867 LKL 0.03385858237743378
epoch 47505 loss -0.46235141158103943 LR -0.4960865378379822 LKL 0.03373512998223305
epoch 47506 loss -0.47103986144065857 LR -0.5046082139015198 LKL 0.03356834873557091
epoch 47507 loss -0.5195959806442261 LR -0.5534418225288391 LKL 0.03384583815932274
epoch 47508 loss -0.4743017256259918 LR -0.5078321099281311 LKL 0.033530380576848984
epoch 47509 loss -0.45291244983673096 LR -0.4866252839565277 LKL 0.03371282294392586
epoch 47510 loss -0.4125448167324066 LR -0.44588902592658997 LKL 0.033344220370054245
epoch 47511 loss -0.4701795279979706 LR -0.5040174722671509 LKL 0.03383794054389
epoch 47512 loss -0.3860974907875061 LR -0.4195173382759094 LKL 0.03341985121369362
epoch 47513 loss -0.40948283672332764 LR -0.44302695989608765 LKL 0.0335

epoch 47599 loss -0.46288394927978516 LR -0.4966646432876587 LKL 0.03378068283200264
epoch 47600 loss -0.43078529834747314 LR -0.4643169641494751 LKL 0.03353166952729225
50
epoch 47601 loss -0.3541468679904938 LR -0.3875750005245209 LKL 0.0334281399846077


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 47602 loss -0.5570794939994812 LR -0.5910021066665649 LKL 0.033922627568244934
epoch 47603 loss -0.43713200092315674 LR -0.4706309735774994 LKL 0.033498961478471756
epoch 47604 loss -0.41924044489860535 LR -0.4529905319213867 LKL 0.03375009819865227
epoch 47605 loss -0.5260212421417236 LR -0.5599350929260254 LKL 0.033913854509592056
epoch 47606 loss -0.42095252871513367 LR -0.4545481204986572 LKL 0.03359559178352356
epoch 47607 loss -0.5028437376022339 LR -0.5365893244743347 LKL 0.03374555706977844
epoch 47608 loss -0.3561371862888336 LR -0.38963133096694946 LKL 0.03349415212869644
epoch 47609 loss -0.46096113324165344 LR -0.49462246894836426 LKL 0.03366133198142052
epoch 47610 loss -0.43645527958869934 LR -0.4698837399482727 LKL 0.03342846781015396
epoch 47611 loss -0.4592077434062958 LR -0.4928257465362549 LKL 0.033618003129959106
epoch 47612 loss -0.4448471665382385 LR -0.4784766733646393 LKL 0.03362952172756195
epoch 47613 loss -0.4176301658153534 LR -0.450960248708725 LKL 0.

55
epoch 47701 loss -0.4479876756668091 LR -0.48145368695259094 LKL 0.033465996384620667


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 47702 loss -0.37140730023384094 LR -0.40467432141304016 LKL 0.03326701372861862
epoch 47703 loss -0.43175435066223145 LR -0.46529075503349304 LKL 0.033536411821842194
epoch 47704 loss -0.4610196352005005 LR -0.49456986784935 LKL 0.03355022147297859
epoch 47705 loss -0.4009019732475281 LR -0.43453603982925415 LKL 0.03363407775759697
epoch 47706 loss -0.4922831952571869 LR -0.5258437991142273 LKL 0.03356059268116951
epoch 47707 loss -0.44572913646698 LR -0.47934165596961975 LKL 0.03361252322793007
epoch 47708 loss -0.4675913453102112 LR -0.5013204216957092 LKL 0.03372907266020775
epoch 47709 loss -0.45458459854125977 LR -0.48796677589416504 LKL 0.03338218852877617
epoch 47710 loss -0.2927168011665344 LR -0.32607758045196533 LKL 0.033360790461301804
epoch 47711 loss -0.4409269690513611 LR -0.4745144844055176 LKL 0.03358752653002739
epoch 47712 loss -0.44481706619262695 LR -0.4783182740211487 LKL 0.03350120782852173
epoch 47713 loss -0.4871823191642761 LR -0.5207670331001282 LKL 0.03

36
epoch 47801 loss -0.4553093910217285 LR -0.4888576865196228 LKL 0.03354829549789429


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 47802 loss -0.4778721332550049 LR -0.5116568803787231 LKL 0.033784762024879456
epoch 47803 loss -0.4569469690322876 LR -0.4904666244983673 LKL 0.03351966291666031
epoch 47804 loss -0.4576728045940399 LR -0.49125444889068604 LKL 0.03358164057135582
epoch 47805 loss -0.5506201386451721 LR -0.5845240354537964 LKL 0.03390388563275337
epoch 47806 loss -0.3701382875442505 LR -0.4036743640899658 LKL 0.03353606164455414
epoch 47807 loss -0.47659236192703247 LR -0.5101330280303955 LKL 0.033540673553943634
epoch 47808 loss -0.40220993757247925 LR -0.43579843640327454 LKL 0.03358849138021469
epoch 47809 loss -0.425045907497406 LR -0.45858117938041687 LKL 0.03353525698184967
epoch 47810 loss -0.43549931049346924 LR -0.46921148896217346 LKL 0.03371216729283333
epoch 47811 loss -0.40860599279403687 LR -0.44194623827934265 LKL 0.033340245485305786
epoch 47812 loss -0.5083436369895935 LR -0.5421504378318787 LKL 0.03380680829286575
epoch 47813 loss -0.46255454421043396 LR -0.49637454748153687 LKL

83
epoch 47901 loss -0.4661695659160614 LR -0.4997652769088745 LKL 0.033595722168684006


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 47902 loss -0.46656930446624756 LR -0.5000573992729187 LKL 0.033488109707832336
epoch 47903 loss -0.486407071352005 LR -0.5200064778327942 LKL 0.03359939530491829
epoch 47904 loss -0.45014244318008423 LR -0.4837903678417206 LKL 0.03364793211221695
epoch 47905 loss -0.44499868154525757 LR -0.47860682010650635 LKL 0.03360813856124878
epoch 47906 loss -0.4568328559398651 LR -0.4904116988182068 LKL 0.03357883542776108
epoch 47907 loss -0.43984749913215637 LR -0.4735375642776489 LKL 0.033690061420202255
epoch 47908 loss -0.4803606867790222 LR -0.5139964818954468 LKL 0.03363579884171486
epoch 47909 loss -0.4378751516342163 LR -0.4714491665363312 LKL 0.033574000000953674
epoch 47910 loss -0.49768170714378357 LR -0.5314969420433044 LKL 0.033815234899520874
epoch 47911 loss -0.49806109070777893 LR -0.5317996144294739 LKL 0.033738527446985245
epoch 47912 loss -0.41674819588661194 LR -0.4501943588256836 LKL 0.03344617411494255
epoch 47913 loss -0.5068809986114502 LR -0.5405956506729126 LKL 

39
epoch 48001 loss -0.46615374088287354 LR -0.5000847578048706 LKL 0.03393101692199707


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 48002 loss -0.5202570557594299 LR -0.55394446849823 LKL 0.03368739038705826
epoch 48003 loss -0.4974852204322815 LR -0.5313873291015625 LKL 0.0339021198451519
epoch 48004 loss -0.4570907950401306 LR -0.49078819155693054 LKL 0.03369738161563873
epoch 48005 loss -0.3307355046272278 LR -0.3642619848251343 LKL 0.03352648764848709
epoch 48006 loss -0.44996437430381775 LR -0.48347902297973633 LKL 0.033514637500047684
epoch 48007 loss -0.3813777565956116 LR -0.4146101772785187 LKL 0.033232420682907104
epoch 48008 loss -0.4786864221096039 LR -0.5123075246810913 LKL 0.03362111374735832
epoch 48009 loss -0.40322357416152954 LR -0.4366123080253601 LKL 0.03338872268795967
epoch 48010 loss -0.47996342182159424 LR -0.5137666463851929 LKL 0.033803217113018036
epoch 48011 loss -0.45515093207359314 LR -0.48881709575653076 LKL 0.033666159957647324
epoch 48012 loss -0.44605952501296997 LR -0.4795134961605072 LKL 0.03345397859811783
epoch 48013 loss -0.4728759825229645 LR -0.506730318069458 LKL 0.03

76
epoch 48101 loss -0.39490148425102234 LR -0.42852282524108887 LKL 0.03362134099006653


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 48102 loss -0.4645785093307495 LR -0.4982166886329651 LKL 0.03363818675279617
epoch 48103 loss -0.49475914239883423 LR -0.5283465385437012 LKL 0.033587388694286346
epoch 48104 loss -0.4579750895500183 LR -0.4916183650493622 LKL 0.03364328294992447
epoch 48105 loss -0.4193684160709381 LR -0.45305776596069336 LKL 0.033689357340335846
epoch 48106 loss -0.49980953335762024 LR -0.5335975885391235 LKL 0.033788055181503296
epoch 48107 loss -0.42108744382858276 LR -0.45468494296073914 LKL 0.03359748423099518
epoch 48108 loss -0.46989279985427856 LR -0.5036128163337708 LKL 0.033720020204782486
epoch 48109 loss -0.5526158213615417 LR -0.5863627195358276 LKL 0.0337468720972538
epoch 48110 loss -0.43766069412231445 LR -0.47125178575515747 LKL 0.033591076731681824
epoch 48111 loss -0.4085421562194824 LR -0.4419400990009308 LKL 0.03339793160557747
epoch 48112 loss -0.4909581243991852 LR -0.5248733162879944 LKL 0.0339151993393898
epoch 48113 loss -0.47822558879852295 LR -0.5120285749435425 LKL 

60
epoch 48201 loss -0.4105078876018524 LR -0.444099485874176 LKL 0.033591605722904205


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 48202 loss -0.43287163972854614 LR -0.4665991961956024 LKL 0.03372756391763687
epoch 48203 loss -0.3969598412513733 LR -0.43031591176986694 LKL 0.03335605934262276
epoch 48204 loss -0.43332362174987793 LR -0.4671107530593872 LKL 0.03378712385892868
epoch 48205 loss -0.4259105324745178 LR -0.4593936502933502 LKL 0.0334831103682518
epoch 48206 loss -0.4805350601673126 LR -0.5143294930458069 LKL 0.03379443287849426
epoch 48207 loss -0.46314793825149536 LR -0.4969598352909088 LKL 0.03381189703941345
epoch 48208 loss -0.42188066244125366 LR -0.45539504289627075 LKL 0.033514395356178284
epoch 48209 loss -0.47396185994148254 LR -0.5073906183242798 LKL 0.03342876955866814
epoch 48210 loss -0.34867793321609497 LR -0.3818439841270447 LKL 0.0331660658121109
epoch 48211 loss -0.45684191584587097 LR -0.49041810631752014 LKL 0.03357619419693947
epoch 48212 loss -0.39977365732192993 LR -0.43329399824142456 LKL 0.033520352095365524
epoch 48213 loss -0.4147341549396515 LR -0.4482949674129486 LKL 

epoch 48300 loss -0.4023137390613556 LR -0.4359983205795288 LKL 0.03368459269404411
48


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 48301 loss -0.4614335298538208 LR -0.49497121572494507 LKL 0.033537693321704865
epoch 48302 loss -0.4864348769187927 LR -0.5201183557510376 LKL 0.03368346765637398
epoch 48303 loss -0.4902153015136719 LR -0.5238510966300964 LKL 0.033635787665843964
epoch 48304 loss -0.4175822138786316 LR -0.45107921957969666 LKL 0.033497001975774765
epoch 48305 loss -0.40866318345069885 LR -0.44224750995635986 LKL 0.033584319055080414
epoch 48306 loss -0.4437659680843353 LR -0.47740593552589417 LKL 0.03363995626568794
epoch 48307 loss -0.4104624092578888 LR -0.4439792335033417 LKL 0.03351682424545288
epoch 48308 loss -0.4615459144115448 LR -0.49528512358665466 LKL 0.03373920917510986
epoch 48309 loss -0.41784948110580444 LR -0.451570987701416 LKL 0.03372151777148247
epoch 48310 loss -0.3914971947669983 LR -0.42517971992492676 LKL 0.03368251398205757
epoch 48311 loss -0.4659416675567627 LR -0.49933719635009766 LKL 0.033395517617464066
epoch 48312 loss -0.36408731341362 LR -0.39744192361831665 LKL 

epoch 48400 loss -0.5179437398910522 LR -0.5517361164093018 LKL 0.03379235789179802
86


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 48401 loss -0.46817004680633545 LR -0.5019413232803345 LKL 0.03377126157283783
epoch 48402 loss -0.44994157552719116 LR -0.48364901542663574 LKL 0.03370743617415428
epoch 48403 loss -0.4414268434047699 LR -0.47511255741119385 LKL 0.033685702830553055
epoch 48404 loss -0.4749462604522705 LR -0.5088273882865906 LKL 0.033881135284900665
epoch 48405 loss -0.4513789117336273 LR -0.48513859510421753 LKL 0.03375968709588051
epoch 48406 loss -0.4334365725517273 LR -0.4670180380344391 LKL 0.033581458032131195
epoch 48407 loss -0.48834121227264404 LR -0.5221092104911804 LKL 0.03376798331737518
epoch 48408 loss -0.4608548581600189 LR -0.49445557594299316 LKL 0.033600714057683945
epoch 48409 loss -0.4666561484336853 LR -0.5002923607826233 LKL 0.03363621607422829
epoch 48410 loss -0.4504908323287964 LR -0.48417428135871887 LKL 0.03368344530463219
epoch 48411 loss -0.4362596869468689 LR -0.4699482023715973 LKL 0.0336885042488575
epoch 48412 loss -0.4702112674713135 LR -0.5038986206054688 LKL 0

epoch 48500 loss -0.5161954164505005 LR -0.5499840378761292 LKL 0.03378860279917717
44


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 48501 loss -0.45077764987945557 LR -0.4845092296600342 LKL 0.03373159468173981
epoch 48502 loss -0.4556939899921417 LR -0.4893479347229004 LKL 0.033653948456048965
epoch 48503 loss -0.5170133709907532 LR -0.5505457520484924 LKL 0.033532366156578064
epoch 48504 loss -0.45135730504989624 LR -0.4848642349243164 LKL 0.03350691497325897
epoch 48505 loss -0.3777659237384796 LR -0.4113275110721588 LKL 0.0335615836083889
epoch 48506 loss -0.4691598415374756 LR -0.5029696822166443 LKL 0.033809829503297806
epoch 48507 loss -0.4616507887840271 LR -0.4953823685646057 LKL 0.03373157978057861
epoch 48508 loss -0.4336456060409546 LR -0.46720919013023376 LKL 0.03356359526515007
epoch 48509 loss -0.4025999903678894 LR -0.43619465827941895 LKL 0.033594682812690735
epoch 48510 loss -0.4113817512989044 LR -0.4448575973510742 LKL 0.0334758423268795
epoch 48511 loss -0.48997530341148376 LR -0.5237074494361877 LKL 0.033732157200574875
epoch 48512 loss -0.4757519066333771 LR -0.5092588067054749 LKL 0.03

epoch 48600 loss -0.3962711989879608 LR -0.4297744035720825 LKL 0.033503200858831406
67


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 48601 loss -0.49025657773017883 LR -0.5239225625991821 LKL 0.0336659774184227
epoch 48602 loss -0.48062360286712646 LR -0.5143080949783325 LKL 0.03368450701236725
epoch 48603 loss -0.5149365067481995 LR -0.5485192537307739 LKL 0.033582769334316254
epoch 48604 loss -0.4385654032230377 LR -0.47208505868911743 LKL 0.03351965174078941
epoch 48605 loss -0.519454300403595 LR -0.5532835721969604 LKL 0.03382929041981697
epoch 48606 loss -0.4538414180278778 LR -0.4875873327255249 LKL 0.0337459035217762
epoch 48607 loss -0.4460057318210602 LR -0.47971290349960327 LKL 0.033707182854413986
epoch 48608 loss -0.4710887670516968 LR -0.5049835443496704 LKL 0.033894769847393036
epoch 48609 loss -0.47837838530540466 LR -0.512008786201477 LKL 0.033630408346652985
epoch 48610 loss -0.40214037895202637 LR -0.43590471148490906 LKL 0.03376433625817299
epoch 48611 loss -0.43949928879737854 LR -0.4732542037963867 LKL 0.03375491499900818
epoch 48612 loss -0.42760908603668213 LR -0.4609794318675995 LKL 0.0

epoch 48699 loss -0.42690595984458923 LR -0.46056637167930603 LKL 0.0336604118347168
epoch 48700 loss -0.4620247185230255 LR -0.4958304166793823 LKL 0.03380570933222771
78
epoch 48701 loss -0.4715612828731537 LR -0.5050805807113647 LKL 0.03351930156350136


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 48702 loss -0.5032440423965454 LR -0.5371609926223755 LKL 0.03391696885228157
epoch 48703 loss -0.41664960980415344 LR -0.4502386748790741 LKL 0.033589065074920654
epoch 48704 loss -0.44136810302734375 LR -0.47510480880737305 LKL 0.033736702054739
epoch 48705 loss -0.509351372718811 LR -0.5433418154716492 LKL 0.033990420401096344
epoch 48706 loss -0.4164433479309082 LR -0.45004287362098694 LKL 0.03359951451420784
epoch 48707 loss -0.41150176525115967 LR -0.4451338052749634 LKL 0.03363204747438431
epoch 48708 loss -0.5004138350486755 LR -0.5341450572013855 LKL 0.03373120352625847
epoch 48709 loss -0.40373215079307556 LR -0.43733465671539307 LKL 0.033602502197027206
epoch 48710 loss -0.4185488820075989 LR -0.45217424631118774 LKL 0.03362537547945976
epoch 48711 loss -0.38529813289642334 LR -0.4187106192111969 LKL 0.03341247886419296
epoch 48712 loss -0.43685609102249146 LR -0.4702224135398865 LKL 0.033366307616233826
epoch 48713 loss -0.4474605321884155 LR -0.48101893067359924 LKL 

77
epoch 48801 loss -0.4688955843448639 LR -0.5024529695510864 LKL 0.03355739638209343


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 48802 loss -0.4791080951690674 LR -0.5129122734069824 LKL 0.03380417078733444
epoch 48803 loss -0.46420377492904663 LR -0.4978724420070648 LKL 0.03366865590214729
epoch 48804 loss -0.4934675693511963 LR -0.5270354747772217 LKL 0.03356790170073509
epoch 48805 loss -0.5061401724815369 LR -0.5399153232574463 LKL 0.03377513214945793
epoch 48806 loss -0.41582033038139343 LR -0.4494055211544037 LKL 0.03358519822359085
epoch 48807 loss -0.4162696599960327 LR -0.44989439845085144 LKL 0.03362472355365753
epoch 48808 loss -0.4378575086593628 LR -0.4715301990509033 LKL 0.03367270529270172
epoch 48809 loss -0.4905899465084076 LR -0.5243955850601196 LKL 0.03380562737584114
epoch 48810 loss -0.36440521478652954 LR -0.39787399768829346 LKL 0.033468782901763916
epoch 48811 loss -0.47701945900917053 LR -0.5110141038894653 LKL 0.0339946448802948
epoch 48812 loss -0.5493470430374146 LR -0.5831603407859802 LKL 0.03381328284740448
epoch 48813 loss -0.5258806943893433 LR -0.5598793029785156 LKL 0.0339

36
epoch 48901 loss -0.4437106251716614 LR -0.4774016737937927 LKL 0.03369106352329254


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 48902 loss -0.4436027705669403 LR -0.47716024518013 LKL 0.0335574634373188
epoch 48903 loss -0.4194431006908417 LR -0.45324546098709106 LKL 0.033802371472120285
epoch 48904 loss -0.4781249761581421 LR -0.5117226243019104 LKL 0.033597640693187714
epoch 48905 loss -0.4503948390483856 LR -0.48411840200424194 LKL 0.033723555505275726
epoch 48906 loss -0.48244139552116394 LR -0.5161128044128418 LKL 0.03367139771580696
epoch 48907 loss -0.4910742938518524 LR -0.5248342752456665 LKL 0.03375997394323349
epoch 48908 loss -0.5284519791603088 LR -0.5621028542518616 LKL 0.03365086391568184
epoch 48909 loss -0.47911036014556885 LR -0.5127951502799988 LKL 0.03368477523326874
epoch 48910 loss -0.5077412128448486 LR -0.5414448380470276 LKL 0.033703599125146866
epoch 48911 loss -0.5121064186096191 LR -0.5460453629493713 LKL 0.033938951790332794
epoch 48912 loss -0.4115479290485382 LR -0.4449384808540344 LKL 0.03339055925607681
epoch 48913 loss -0.40881451964378357 LR -0.4423540532588959 LKL 0.033

epoch 49000 loss -0.44157153367996216 LR -0.4752804636955261 LKL 0.03370892256498337
48


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 49001 loss -0.4245144724845886 LR -0.45849812030792236 LKL 0.033983636647462845
epoch 49002 loss -0.4257180392742157 LR -0.4592316746711731 LKL 0.033513639122247696
epoch 49003 loss -0.450258731842041 LR -0.4838827848434448 LKL 0.03362404927611351
epoch 49004 loss -0.4756440222263336 LR -0.5095050930976868 LKL 0.033861082047224045
epoch 49005 loss -0.5055956244468689 LR -0.5394449830055237 LKL 0.03384936973452568
epoch 49006 loss -0.5139367580413818 LR -0.5477975606918335 LKL 0.03386080265045166
epoch 49007 loss -0.4428751766681671 LR -0.47644758224487305 LKL 0.03357241675257683
epoch 49008 loss -0.44687771797180176 LR -0.480680912733078 LKL 0.03380318358540535
epoch 49009 loss -0.45310845971107483 LR -0.48657795786857605 LKL 0.033469486981630325
epoch 49010 loss -0.427626371383667 LR -0.4612691402435303 LKL 0.033642757683992386
epoch 49011 loss -0.40128326416015625 LR -0.4349204897880554 LKL 0.03363723307847977
epoch 49012 loss -0.3838966190814972 LR -0.4174008369445801 LKL 0.03

epoch 49100 loss -0.5149446129798889 LR -0.5488210320472717 LKL 0.033876437693834305
51


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 49101 loss -0.38444170355796814 LR -0.4180174469947815 LKL 0.03357575461268425
epoch 49102 loss -0.43401825428009033 LR -0.46761149168014526 LKL 0.033593229949474335
epoch 49103 loss -0.4966469705104828 LR -0.530368447303772 LKL 0.033721473067998886
epoch 49104 loss -0.47659236192703247 LR -0.5103369951248169 LKL 0.03374462202191353
epoch 49105 loss -0.4598771035671234 LR -0.4936197102069855 LKL 0.033742595463991165
epoch 49106 loss -0.4163806438446045 LR -0.4501410126686096 LKL 0.03376035764813423
epoch 49107 loss -0.46109193563461304 LR -0.4950260519981384 LKL 0.03393411263823509
epoch 49108 loss -0.39724254608154297 LR -0.43077152967453003 LKL 0.033528976142406464
epoch 49109 loss -0.5128516554832458 LR -0.546737790107727 LKL 0.03388611599802971
epoch 49110 loss -0.3532746732234955 LR -0.38680803775787354 LKL 0.03353337571024895
epoch 49111 loss -0.5261058807373047 LR -0.5599549412727356 LKL 0.033849041908979416
epoch 49112 loss -0.4245387315750122 LR -0.4580978751182556 LKL 0

epoch 49200 loss -0.4487781226634979 LR -0.4826160669326782 LKL 0.03383794054389
76


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 49201 loss -0.48887595534324646 LR -0.5227299332618713 LKL 0.033853985369205475
epoch 49202 loss -0.519856870174408 LR -0.5536775588989258 LKL 0.03382071107625961
epoch 49203 loss -0.4419673681259155 LR -0.47571590542793274 LKL 0.03374854475259781
epoch 49204 loss -0.43685805797576904 LR -0.4705808162689209 LKL 0.03372275456786156
epoch 49205 loss -0.4473962187767029 LR -0.4810398817062378 LKL 0.033643677830696106
epoch 49206 loss -0.35682499408721924 LR -0.3904115557670593 LKL 0.03358657285571098
epoch 49207 loss -0.45997607707977295 LR -0.4938427209854126 LKL 0.03386664763092995
epoch 49208 loss -0.44101792573928833 LR -0.4746953248977661 LKL 0.03367739915847778
epoch 49209 loss -0.4523690640926361 LR -0.48605653643608093 LKL 0.03368747979402542
epoch 49210 loss -0.5028982162475586 LR -0.5366936326026917 LKL 0.033795442432165146
epoch 49211 loss -0.4671240746974945 LR -0.5010847449302673 LKL 0.033960677683353424
epoch 49212 loss -0.5195772647857666 LR -0.553471565246582 LKL 0.0

epoch 49299 loss -0.44836729764938354 LR -0.4820663332939148 LKL 0.03369903564453125
epoch 49300 loss -0.46927550435066223 LR -0.5030969381332397 LKL 0.033821433782577515
73
epoch 49301 loss -0.46560433506965637 LR -0.49944084882736206 LKL 0.03383651748299599


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 49302 loss -0.44496795535087585 LR -0.4785116910934448 LKL 0.033543724566698074
epoch 49303 loss -0.42370903491973877 LR -0.4575217366218567 LKL 0.03381270542740822
epoch 49304 loss -0.5164783000946045 LR -0.5502864122390747 LKL 0.03380810469388962
epoch 49305 loss -0.4510417580604553 LR -0.4848116636276245 LKL 0.033769913017749786
epoch 49306 loss -0.48916223645210266 LR -0.5228285789489746 LKL 0.033666349947452545
epoch 49307 loss -0.4387349784374237 LR -0.4724130630493164 LKL 0.0336780846118927
epoch 49308 loss -0.46351319551467896 LR -0.4974253177642822 LKL 0.03391213342547417
epoch 49309 loss -0.43775954842567444 LR -0.4718032479286194 LKL 0.03404369205236435
epoch 49310 loss -0.4066963493824005 LR -0.4400660991668701 LKL 0.03336974233388901
epoch 49311 loss -0.4358806312084198 LR -0.46949970722198486 LKL 0.033619076013565063
epoch 49312 loss -0.4503958523273468 LR -0.48405376076698303 LKL 0.03365791216492653
epoch 49313 loss -0.4544030427932739 LR -0.4883595108985901 LKL 0.

85
epoch 49401 loss -0.4555380940437317 LR -0.48922470211982727 LKL 0.03368660435080528


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 49402 loss -0.48025089502334595 LR -0.5140901803970337 LKL 0.03383929654955864
epoch 49403 loss -0.42646074295043945 LR -0.46009913086891174 LKL 0.033638399094343185
epoch 49404 loss -0.5132320523262024 LR -0.54708331823349 LKL 0.033851251006126404
epoch 49405 loss -0.36728009581565857 LR -0.40094342827796936 LKL 0.03366333618760109
epoch 49406 loss -0.4953213930130005 LR -0.5290948152542114 LKL 0.033773407340049744
epoch 49407 loss -0.3752448856830597 LR -0.40876322984695435 LKL 0.03351834416389465
epoch 49408 loss -0.488879531621933 LR -0.5225837230682373 LKL 0.03370420262217522
epoch 49409 loss -0.43655508756637573 LR -0.4703487753868103 LKL 0.033793698996305466
epoch 49410 loss -0.4534705579280853 LR -0.4871808886528015 LKL 0.033710334450006485
epoch 49411 loss -0.43156611919403076 LR -0.465410441160202 LKL 0.03384433314204216
epoch 49412 loss -0.5227121114730835 LR -0.5567267537117004 LKL 0.03401462361216545
epoch 49413 loss -0.360404372215271 LR -0.3940890431404114 LKL 0.03

101
epoch 49501 loss -0.4691973328590393 LR -0.503000795841217 LKL 0.033803459256887436


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 49502 loss -0.4480384886264801 LR -0.48165690898895264 LKL 0.03361842781305313
epoch 49503 loss -0.4691554009914398 LR -0.5031471252441406 LKL 0.0339917354285717
epoch 49504 loss -0.5035924911499023 LR -0.5372445583343506 LKL 0.03365204110741615
epoch 49505 loss -0.4251587390899658 LR -0.45886316895484924 LKL 0.03370443359017372
epoch 49506 loss -0.48424631357192993 LR -0.5180162787437439 LKL 0.03376997262239456
epoch 49507 loss -0.48888352513313293 LR -0.5226572155952454 LKL 0.03377367928624153
epoch 49508 loss -0.4939475655555725 LR -0.5278891324996948 LKL 0.03394155204296112
epoch 49509 loss -0.5052497982978821 LR -0.5391495823860168 LKL 0.033899787813425064
epoch 49510 loss -0.5020061731338501 LR -0.5358684659004211 LKL 0.033862289041280746
epoch 49511 loss -0.41577833890914917 LR -0.4493767321109772 LKL 0.03359838202595711
epoch 49512 loss -0.41796326637268066 LR -0.45180022716522217 LKL 0.03383695334196091
epoch 49513 loss -0.48885583877563477 LR -0.5226923227310181 LKL 0.0

67
epoch 49601 loss -0.48645517230033875 LR -0.5200396180152893 LKL 0.03358445316553116


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 49602 loss -0.4911225736141205 LR -0.5249143838882446 LKL 0.033791810274124146
epoch 49603 loss -0.4636053442955017 LR -0.49729201197624207 LKL 0.033686667680740356
epoch 49604 loss -0.47716858983039856 LR -0.5111523866653442 LKL 0.03398380056023598
epoch 49605 loss -0.47067025303840637 LR -0.5043355822563171 LKL 0.03366531804203987
epoch 49606 loss -0.4691002368927002 LR -0.5032253265380859 LKL 0.03412509337067604
epoch 49607 loss -0.4926977753639221 LR -0.5265715718269348 LKL 0.03387381136417389
epoch 49608 loss -0.481601357460022 LR -0.5154454112052917 LKL 0.033844057470560074
epoch 49609 loss -0.5527726411819458 LR -0.5866143703460693 LKL 0.03384172543883324
epoch 49610 loss -0.4537500739097595 LR -0.48752233386039734 LKL 0.033772267401218414
epoch 49611 loss -0.41855838894844055 LR -0.4520883560180664 LKL 0.03352997079491615
epoch 49612 loss -0.44738873839378357 LR -0.4809839725494385 LKL 0.03359522297978401
epoch 49613 loss -0.45032376050949097 LR -0.4839688241481781 LKL 0.

68
epoch 49701 loss -0.4489627182483673 LR -0.4826376736164093 LKL 0.033674947917461395


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 49702 loss -0.44204074144363403 LR -0.47577810287475586 LKL 0.03373735398054123
epoch 49703 loss -0.4580223262310028 LR -0.4913703203201294 LKL 0.03334799408912659
epoch 49704 loss -0.4999355673789978 LR -0.5338190793991089 LKL 0.033883508294820786
epoch 49705 loss -0.4652099311351776 LR -0.4988137185573578 LKL 0.033603787422180176
epoch 49706 loss -0.4865669310092926 LR -0.5203536748886108 LKL 0.033786751329898834
epoch 49707 loss -0.5389607548713684 LR -0.5730310082435608 LKL 0.03407027944922447
epoch 49708 loss -0.46852344274520874 LR -0.5021971464157104 LKL 0.03367369994521141
epoch 49709 loss -0.5089664459228516 LR -0.5431618094444275 LKL 0.03419538959860802
epoch 49710 loss -0.47193384170532227 LR -0.5057188868522644 LKL 0.03378506004810333
epoch 49711 loss -0.47119051218032837 LR -0.5049211382865906 LKL 0.03373061865568161
epoch 49712 loss -0.5016278028488159 LR -0.5354453325271606 LKL 0.033817511051893234
epoch 49713 loss -0.3716093897819519 LR -0.40512967109680176 LKL 0.

epoch 49800 loss -0.4183913767337799 LR -0.4521747827529907 LKL 0.03378341346979141
65


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 49801 loss -0.44663095474243164 LR -0.48034894466400146 LKL 0.033717986196279526
epoch 49802 loss -0.4635687470436096 LR -0.4973106384277344 LKL 0.03374190255999565
epoch 49803 loss -0.3972683548927307 LR -0.43089210987091064 LKL 0.03362376242876053
epoch 49804 loss -0.4469848573207855 LR -0.4805446267127991 LKL 0.03355976566672325
epoch 49805 loss -0.4542991518974304 LR -0.48827075958251953 LKL 0.03397160768508911
epoch 49806 loss -0.45387566089630127 LR -0.48762696981430054 LKL 0.03375132009387016
epoch 49807 loss -0.47314321994781494 LR -0.5068097710609436 LKL 0.033666566014289856
epoch 49808 loss -0.4128991961479187 LR -0.44657760858535767 LKL 0.03367839753627777
epoch 49809 loss -0.44467857480049133 LR -0.478574275970459 LKL 0.03389570862054825
epoch 49810 loss -0.5038924813270569 LR -0.5377979278564453 LKL 0.033905431628227234
epoch 49811 loss -0.5015909075737 LR -0.5353348255157471 LKL 0.03374394029378891
epoch 49812 loss -0.44617900252342224 LR -0.4800277352333069 LKL 0.0

epoch 49900 loss -0.41923820972442627 LR -0.4528416097164154 LKL 0.03360338881611824
61


  pi = F.softmax(pi.transpose(0,1).squeeze()).view(len_out,-1,hp.M)
  q = F.softmax(params_pen).view(len_out,-1,3)
  nn.utils.clip_grad_norm(self.encoder.parameters(), hp.grad_clip)
  nn.utils.clip_grad_norm(self.decoder.parameters(), hp.grad_clip)


epoch 49901 loss -0.5244176387786865 LR -0.5584223866462708 LKL 0.034004777669906616
epoch 49902 loss -0.4436076581478119 LR -0.4774344563484192 LKL 0.0338268056511879
epoch 49903 loss -0.4476613402366638 LR -0.4813843071460724 LKL 0.033722952008247375
epoch 49904 loss -0.42076075077056885 LR -0.454571396112442 LKL 0.03381063789129257
epoch 49905 loss -0.4884796738624573 LR -0.5221737623214722 LKL 0.033694103360176086
epoch 49906 loss -0.4526996612548828 LR -0.4863950312137604 LKL 0.03369535505771637
epoch 49907 loss -0.43051058053970337 LR -0.46429088711738586 LKL 0.03378032147884369
epoch 49908 loss -0.39591073989868164 LR -0.42952919006347656 LKL 0.03361843526363373
epoch 49909 loss -0.4558570683002472 LR -0.48976564407348633 LKL 0.03390856459736824
epoch 49910 loss -0.43187782168388367 LR -0.4654976427555084 LKL 0.03361980989575386
epoch 49911 loss -0.4777476489543915 LR -0.5115693211555481 LKL 0.03382168337702751
epoch 49912 loss -0.48154935240745544 LR -0.5155132412910461 LKL 0.0

epoch 49999 loss -0.4561496078968048 LR -0.4897915720939636 LKL 0.033641964197158813
epoch 50000 loss -0.4569929242134094 LR -0.49086010456085205 LKL 0.03386716917157173
46
