In [12]:
import math
from typing import Tuple
import pandas as pd
import numpy as np
import torch
from torch import nn, Tensor
import torch.nn.functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import dataset
from itertools import product
import copy
import time

In [13]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [14]:
def generate_square_subsequent_mask(sz: int) -> Tensor:
    """Generates an upper-triangular matrix of -inf, with zeros on diag."""
    return torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1)
class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)
def standardization(data):
    mu = np.mean(data, axis=0)
    sigma = np.std(data, axis=0)
    return (data - mu) / sigma
def Cor_Loss_func(X,Y):
    the_coef=torch.corrcoef(torch.cat([X.reshape([1,-1]),Y.reshape([1,-1])]))[0,1]
    return -the_coef
def Cor_Loss(X,Y):
    cor=[]
    for i in range(X.shape[0]):
        cor.append(Cor_Loss_func(X[i,:],Y[i,:]).detach().numpy())
    return torch.tensor(np.mean(cor),requires_grad=True)

In [15]:
class TransformerModel(nn.Module):

    def __init__(self, ntoken: int, d_model: int, nhead: int, d_hid: int,
                 nlayers: int, dropout: float = 0.5):
        super().__init__()
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        encoder_layers = TransformerEncoderLayer(d_model, nhead, d_hid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Linear(ntoken,d_model)
        self.d_model = d_model
        self.decoder = nn.Linear(d_model, ntoken)


    def init_weights(self) -> None:
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src: Tensor, src_mask: Tensor) -> Tensor:
        """
        Args:
            src: Tensor, shape [seq_len, batch_size]
            src_mask: Tensor, shape [seq_len, seq_len]
        Returns:
            output Tensor of shape [seq_len, batch_size, ntoken]
        """
        src=src.reshape(-1,433,1)
        src = self.encoder(src.cuda()) * math.sqrt(self.d_model)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, src_mask)
        output = self.decoder(output)
        return output

In [16]:
data_csv=pd.read_csv("industry_ret_data_1500.csv")
dataset = data_csv.values
dataset=np.delete(dataset,0,axis=1)
dataset = dataset.astype('float32')
dataset=standardization(dataset)
vari=[]
# for i in range(433):
#     svar=np.var(np.array(dataset)[1000:1200,i])
#     print(i,svar)
#     vari.append(svar)
# print(np.mean(vari))
train_data = torch.from_numpy(np.array(dataset[:1000]))
val_data = torch.from_numpy(np.array(dataset[1000:1200]))
test_data = torch.from_numpy(np.array(dataset[1200:]))
epochs = 5
ntokens=1
bptt =24
def get_batch(source: Tensor, i: int) -> Tuple[Tensor, Tensor]:
    """
    Args:
        source: Tensor, shape [full_seq_len, batch_size]
        i: int
    Returns:
        tuple (data, target), where data has shape [seq_len, batch_size] and
        target has shape [seq_len * batch_size]
    """
    seq_len = min(bptt, len(source) - 1 - i)
    data = source[i:i+seq_len]
    target = source[i+1:i+1+seq_len].reshape(-1)
    return data, target

In [17]:
def train(model: nn.Module) -> None:
    model.train()  # turn on train mode
    total_loss = 0.
    log_interval = 200
    start_time = time.time()
    src_mask = generate_square_subsequent_mask(bptt).to(device)

    num_batches = len(train_data) // bptt
    for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
        data, targets = get_batch(train_data, i)
        #print(data,torch.reshape(targets.cpu(), data.size()))
        batch_size = data.size(0)
        if batch_size != bptt:  # only on last batch
            src_mask = src_mask[:batch_size, :batch_size]
        loss_function = torch.nn.MSELoss()  # 正确
        output = model(data, src_mask)
        output = torch.mean(output, 2)

        loss = loss_function(torch.reshape(output.cuda(), targets.size()), targets.cuda())
        cor=Cor_Loss(output.cpu(), torch.reshape(targets.cpu(), output.size()))

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        if batch % log_interval == 0 and batch > 0:
            lr = scheduler.get_last_lr()[0]
            ms_per_batch = (time.time() - start_time) * 1000 / log_interval
            cur_loss = total_loss / log_interval
            ppl = math.exp(cur_loss)
            print(f'| epoch {epoch:3d} | {batch:5d}/{num_batches:5d} batches | '
                  f'lr {lr:02.2f} | ms/batch {ms_per_batch:5.2f} | '
                  f'loss {cur_loss:5.2f} | ppl {ppl:8.2f} | 'f'cor {cor:.5f}')
            total_loss = 0
            start_time = time.time()
def evaluate(model: nn.Module, eval_data: Tensor) :
    model.eval()  # turn on evaluation mode
    total_loss = 0.
    total_var=0.
    src_mask = generate_square_subsequent_mask(bptt).to(device)
    with torch.no_grad():
        for i in range(0, eval_data.size(0) - 1, bptt):
            data, targets = get_batch(eval_data, i)
            batch_size = data.size(0)
            if batch_size != bptt:
                src_mask = src_mask[:batch_size, :batch_size]
            output = model(data, src_mask)
            output_flat = output.view(ntokens, -1)[0]

            loss_function = torch.nn.MSELoss()  # 正确
            output1 = torch.mean(output, 2)
            v_cor=Cor_Loss(output.cpu(), torch.reshape(targets.cpu(),output.size()))
            total_loss += batch_size * loss_function(output_flat.cuda(), targets.cuda()).item()
            total_var+=batch_size*np.var(np.array(targets))
            #print(targets,output_flat)

   # print("var:",total_var/(len(eval_data)-1))
    return total_loss / (len(eval_data) - 1),v_cor

In [18]:
parameters=dict(
    emsize=[8,32,64,128,256],
    d_hid=[8,32,64,128,256],
    nlayers=[2,4,6,8],
    nhead=[2,4,8],
    dropout=[0.2,0.5],
    lr=[0.1,0.01,0.005,0.001]
)
param_values=[v for v in parameters.values()]
param=list(product(*param_values))
print(param)

[(8, 8, 2, 2, 0.2, 0.1), (8, 8, 2, 2, 0.2, 0.01), (8, 8, 2, 2, 0.2, 0.005), (8, 8, 2, 2, 0.2, 0.001), (8, 8, 2, 2, 0.5, 0.1), (8, 8, 2, 2, 0.5, 0.01), (8, 8, 2, 2, 0.5, 0.005), (8, 8, 2, 2, 0.5, 0.001), (8, 8, 2, 4, 0.2, 0.1), (8, 8, 2, 4, 0.2, 0.01), (8, 8, 2, 4, 0.2, 0.005), (8, 8, 2, 4, 0.2, 0.001), (8, 8, 2, 4, 0.5, 0.1), (8, 8, 2, 4, 0.5, 0.01), (8, 8, 2, 4, 0.5, 0.005), (8, 8, 2, 4, 0.5, 0.001), (8, 8, 2, 8, 0.2, 0.1), (8, 8, 2, 8, 0.2, 0.01), (8, 8, 2, 8, 0.2, 0.005), (8, 8, 2, 8, 0.2, 0.001), (8, 8, 2, 8, 0.5, 0.1), (8, 8, 2, 8, 0.5, 0.01), (8, 8, 2, 8, 0.5, 0.005), (8, 8, 2, 8, 0.5, 0.001), (8, 8, 4, 2, 0.2, 0.1), (8, 8, 4, 2, 0.2, 0.01), (8, 8, 4, 2, 0.2, 0.005), (8, 8, 4, 2, 0.2, 0.001), (8, 8, 4, 2, 0.5, 0.1), (8, 8, 4, 2, 0.5, 0.01), (8, 8, 4, 2, 0.5, 0.005), (8, 8, 4, 2, 0.5, 0.001), (8, 8, 4, 4, 0.2, 0.1), (8, 8, 4, 4, 0.2, 0.01), (8, 8, 4, 4, 0.2, 0.005), (8, 8, 4, 4, 0.2, 0.001), (8, 8, 4, 4, 0.5, 0.1), (8, 8, 4, 4, 0.5, 0.01), (8, 8, 4, 4, 0.5, 0.005), (8, 8, 4, 4, 0.

In [19]:
test_mse=[]
for emsize, d_hid,nlayers, nhead, dropout,lr in param:
    print(emsize, d_hid,nlayers, nhead, dropout,lr)
    model = TransformerModel(ntokens, emsize, nhead, d_hid, nlayers, dropout).to(device)
    criterion = torch.nn.MSELoss()

    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)
    best_val_loss = float('inf')
    best_model = None
    for epoch in range(1, epochs + 1):
        epoch_start_time = time.time()
        train(model)
        val_loss,v_cor = evaluate(model, val_data)
        val_ppl = math.exp(val_loss)
        elapsed = time.time() - epoch_start_time
       # print('-' * 89)
       # print(f'| end of epoch {epoch:3d} | time: {elapsed:5.2f}s | '
         #     f'valid loss {val_loss:.5f} | valid ppl {val_ppl:8.2f}| 'f'V_cor {v_cor:.5f}')
        #print('-' * 89)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model = copy.deepcopy(model)

        scheduler.step()


    test_loss,t_cor= evaluate(best_model, test_data)
    test_mse.append([emsize, d_hid,nlayers, nhead, dropout,lr,test_loss])
    test_ppl = math.exp(test_loss)
    print('=' * 89)
    print(f'| End of training | test loss {test_loss:.5f} | '
          f'test ppl {test_ppl:.5f}|'f'test cor {t_cor:.5f}')
    print('=' * 89)

8 8 2 2 0.2 0.1
| End of training | test loss 1.15884 | test ppl 3.18622|test cor 0.09621
8 8 2 2 0.2 0.01
| End of training | test loss 1.17225 | test ppl 3.22925|test cor 0.15771
8 8 2 2 0.2 0.005
| End of training | test loss 1.21274 | test ppl 3.36267|test cor -0.04338
8 8 2 2 0.2 0.001
| End of training | test loss 1.29595 | test ppl 3.65447|test cor -0.03868
8 8 2 2 0.5 0.1
| End of training | test loss 1.15868 | test ppl 3.18572|test cor 0.01304
8 8 2 2 0.5 0.01
| End of training | test loss 1.15953 | test ppl 3.18842|test cor 0.04286
8 8 2 2 0.5 0.005
| End of training | test loss 1.17703 | test ppl 3.24473|test cor -0.06827
8 8 2 2 0.5 0.001
| End of training | test loss 1.45594 | test ppl 4.28853|test cor -0.02964
8 8 2 4 0.2 0.1
| End of training | test loss 1.15651 | test ppl 3.17881|test cor 0.07304
8 8 2 4 0.2 0.01
| End of training | test loss 1.16498 | test ppl 3.20586|test cor -0.01077
8 8 2 4 0.2 0.005
| End of training | test loss 1.17073 | test ppl 3.22435|test cor 

| End of training | test loss 1.55656 | test ppl 4.74248|test cor -0.09546
8 8 4 4 0.2 0.1
| End of training | test loss 1.15603 | test ppl 3.17729|test cor 0.08083
8 8 4 4 0.2 0.01
| End of training | test loss 1.15564 | test ppl 3.17604|test cor 0.09604
8 8 4 4 0.2 0.005
| End of training | test loss 1.17473 | test ppl 3.23726|test cor -0.05403
8 8 4 4 0.2 0.001
| End of training | test loss 1.24998 | test ppl 3.49026|test cor 0.08382
8 8 4 4 0.5 0.1
| End of training | test loss 1.15976 | test ppl 3.18918|test cor -0.06313
8 8 4 4 0.5 0.01
| End of training | test loss 1.15753 | test ppl 3.18207|test cor 0.13344
8 8 4 4 0.5 0.005
| End of training | test loss 1.23204 | test ppl 3.42822|test cor -0.00958
8 8 4 4 0.5 0.001
| End of training | test loss 1.68803 | test ppl 5.40879|test cor 0.18123
8 8 4 8 0.2 0.1
| End of training | test loss 1.16872 | test ppl 3.21787|test cor -0.00723
8 8 4 8 0.2 0.01
| End of training | test loss 1.16356 | test ppl 3.20132|test cor 0.14096
8 8 4 8 0.

| End of training | test loss 1.16729 | test ppl 3.21328|test cor -0.02822
8 8 6 4 0.5 0.001
| End of training | test loss 1.17481 | test ppl 3.23752|test cor -0.21998
8 8 6 8 0.2 0.1
| End of training | test loss 1.15874 | test ppl 3.18590|test cor 0.04906
8 8 6 8 0.2 0.01
| End of training | test loss 1.16533 | test ppl 3.20699|test cor -0.04941
8 8 6 8 0.2 0.005
| End of training | test loss 1.16154 | test ppl 3.19484|test cor 0.03064
8 8 6 8 0.2 0.001
| End of training | test loss 1.24390 | test ppl 3.46911|test cor 0.06965
8 8 6 8 0.5 0.1
| End of training | test loss 1.16068 | test ppl 3.19212|test cor -0.08271
8 8 6 8 0.5 0.01
| End of training | test loss 1.16998 | test ppl 3.22192|test cor -0.01113
8 8 6 8 0.5 0.005
| End of training | test loss 1.17649 | test ppl 3.24299|test cor 0.04289
8 8 6 8 0.5 0.001
| End of training | test loss 1.19572 | test ppl 3.30595|test cor 0.10328
8 8 8 2 0.2 0.1
| End of training | test loss 1.16027 | test ppl 3.19079|test cor -0.16504
8 8 8 2 

| End of training | test loss 1.16232 | test ppl 3.19735|test cor -0.12906
8 8 8 8 0.5 0.005
| End of training | test loss 1.16831 | test ppl 3.21655|test cor -0.06509
8 8 8 8 0.5 0.001
| End of training | test loss 1.18867 | test ppl 3.28270|test cor -0.04290
8 32 2 2 0.2 0.1
| End of training | test loss 1.15600 | test ppl 3.17721|test cor 0.02757
8 32 2 2 0.2 0.01
| End of training | test loss 1.15980 | test ppl 3.18930|test cor -0.05136
8 32 2 2 0.2 0.005
| End of training | test loss 1.18879 | test ppl 3.28310|test cor -0.11545
8 32 2 2 0.2 0.001
| End of training | test loss 1.42982 | test ppl 4.17793|test cor -0.03265
8 32 2 2 0.5 0.1
| End of training | test loss 1.16058 | test ppl 3.19179|test cor -0.03515
8 32 2 2 0.5 0.01
| End of training | test loss 1.15835 | test ppl 3.18469|test cor 0.09649
8 32 2 2 0.5 0.005
| End of training | test loss 1.16053 | test ppl 3.19163|test cor 0.20939
8 32 2 2 0.5 0.001
| End of training | test loss 1.35955 | test ppl 3.89446|test cor -0.00

| End of training | test loss 1.16087 | test ppl 3.19271|test cor 0.04547
8 32 4 2 0.5 0.01
| End of training | test loss 1.16359 | test ppl 3.20142|test cor 0.02578
8 32 4 2 0.5 0.005
| End of training | test loss 1.16388 | test ppl 3.20235|test cor 0.02458
8 32 4 2 0.5 0.001
| End of training | test loss 1.20957 | test ppl 3.35203|test cor -0.01699
8 32 4 4 0.2 0.1
| End of training | test loss 1.15598 | test ppl 3.17713|test cor 0.06907
8 32 4 4 0.2 0.01
| End of training | test loss 1.16138 | test ppl 3.19434|test cor 0.00011
8 32 4 4 0.2 0.005
| End of training | test loss 1.16188 | test ppl 3.19593|test cor -0.15623
8 32 4 4 0.2 0.001
| End of training | test loss 1.24501 | test ppl 3.47296|test cor -0.14117
8 32 4 4 0.5 0.1
| End of training | test loss 1.15885 | test ppl 3.18628|test cor -0.02636
8 32 4 4 0.5 0.01
| End of training | test loss 1.15872 | test ppl 3.18586|test cor -0.01573
8 32 4 4 0.5 0.005
| End of training | test loss 1.20461 | test ppl 3.33547|test cor 0.2057

| End of training | test loss 1.17189 | test ppl 3.22807|test cor 0.04876
8 32 6 4 0.5 0.1
| End of training | test loss 1.15964 | test ppl 3.18877|test cor -0.01929
8 32 6 4 0.5 0.01
| End of training | test loss 1.16307 | test ppl 3.19973|test cor -0.15139
8 32 6 4 0.5 0.005
| End of training | test loss 1.17491 | test ppl 3.23786|test cor -0.04088
8 32 6 4 0.5 0.001
| End of training | test loss 1.33526 | test ppl 3.80099|test cor -0.09541
8 32 6 8 0.2 0.1
| End of training | test loss 1.16019 | test ppl 3.19055|test cor -0.11681
8 32 6 8 0.2 0.01
| End of training | test loss 1.16086 | test ppl 3.19267|test cor -0.00111
8 32 6 8 0.2 0.005
| End of training | test loss 1.16362 | test ppl 3.20150|test cor 0.13114
8 32 6 8 0.2 0.001
| End of training | test loss 1.22767 | test ppl 3.41327|test cor -0.13676
8 32 6 8 0.5 0.1
| End of training | test loss 1.16085 | test ppl 3.19265|test cor -0.00279
8 32 6 8 0.5 0.01
| End of training | test loss 1.16297 | test ppl 3.19943|test cor -0.17

| End of training | test loss 1.17484 | test ppl 3.23762|test cor 0.00257
8 32 8 8 0.2 0.001
| End of training | test loss 1.28373 | test ppl 3.61008|test cor -0.16589
8 32 8 8 0.5 0.1
| End of training | test loss 1.15893 | test ppl 3.18653|test cor 0.01189
8 32 8 8 0.5 0.01
| End of training | test loss 1.15938 | test ppl 3.18794|test cor -0.00865
8 32 8 8 0.5 0.005
| End of training | test loss 1.16603 | test ppl 3.20922|test cor 0.11398
8 32 8 8 0.5 0.001
| End of training | test loss 1.20777 | test ppl 3.34603|test cor 0.17961
8 64 2 2 0.2 0.1
| End of training | test loss 1.15983 | test ppl 3.18940|test cor 0.05200
8 64 2 2 0.2 0.01
| End of training | test loss 1.17008 | test ppl 3.22226|test cor -0.13425
8 64 2 2 0.2 0.005
| End of training | test loss 1.16846 | test ppl 3.21705|test cor -0.05725
8 64 2 2 0.2 0.001
| End of training | test loss 1.31241 | test ppl 3.71511|test cor -0.04565
8 64 2 2 0.5 0.1
| End of training | test loss 1.16189 | test ppl 3.19597|test cor 0.00901

| End of training | test loss 1.16418 | test ppl 3.20330|test cor 0.11249
8 64 4 2 0.2 0.005
| End of training | test loss 1.15945 | test ppl 3.18818|test cor -0.12962
8 64 4 2 0.2 0.001
| End of training | test loss 1.18342 | test ppl 3.26553|test cor 0.14815
8 64 4 2 0.5 0.1
| End of training | test loss 1.16308 | test ppl 3.19978|test cor -0.07528
8 64 4 2 0.5 0.01
| End of training | test loss 1.16628 | test ppl 3.21002|test cor -0.11650
8 64 4 2 0.5 0.005
| End of training | test loss 1.16723 | test ppl 3.21309|test cor 0.08740
8 64 4 2 0.5 0.001
| End of training | test loss 1.25076 | test ppl 3.49300|test cor -0.17526
8 64 4 4 0.2 0.1
| End of training | test loss 1.16553 | test ppl 3.20763|test cor 0.20114
8 64 4 4 0.2 0.01
| End of training | test loss 1.15966 | test ppl 3.18885|test cor -0.04701
8 64 4 4 0.2 0.005
| End of training | test loss 1.16585 | test ppl 3.20866|test cor 0.02323
8 64 4 4 0.2 0.001
| End of training | test loss 1.16943 | test ppl 3.22016|test cor -0.00

| End of training | test loss 1.16202 | test ppl 3.19640|test cor 0.08308
8 64 6 4 0.2 0.01
| End of training | test loss 1.16403 | test ppl 3.20280|test cor 0.12079
8 64 6 4 0.2 0.005
| End of training | test loss 1.18220 | test ppl 3.26153|test cor 0.20594
8 64 6 4 0.2 0.001
| End of training | test loss 1.69627 | test ppl 5.45357|test cor 0.24477
8 64 6 4 0.5 0.1
| End of training | test loss 1.16192 | test ppl 3.19607|test cor 0.07003
8 64 6 4 0.5 0.01
| End of training | test loss 1.17099 | test ppl 3.22518|test cor -0.15143
8 64 6 4 0.5 0.005
| End of training | test loss 1.16207 | test ppl 3.19653|test cor -0.13360
8 64 6 4 0.5 0.001
| End of training | test loss 1.64811 | test ppl 5.19715|test cor -0.04042
8 64 6 8 0.2 0.1
| End of training | test loss 1.16112 | test ppl 3.19350|test cor 0.21001
8 64 6 8 0.2 0.01
| End of training | test loss 1.15470 | test ppl 3.17307|test cor -0.04654
8 64 6 8 0.2 0.005
| End of training | test loss 1.17600 | test ppl 3.24140|test cor 0.14094

| End of training | test loss 1.29230 | test ppl 3.64115|test cor -0.10393
8 64 8 8 0.2 0.1
| End of training | test loss 1.16040 | test ppl 3.19121|test cor -0.00021
8 64 8 8 0.2 0.01
| End of training | test loss 1.16072 | test ppl 3.19224|test cor -0.12774
8 64 8 8 0.2 0.005
| End of training | test loss 1.15937 | test ppl 3.18793|test cor 0.25383
8 64 8 8 0.2 0.001
| End of training | test loss 1.29417 | test ppl 3.64797|test cor 0.12605
8 64 8 8 0.5 0.1
| End of training | test loss 1.16215 | test ppl 3.19678|test cor -0.09396
8 64 8 8 0.5 0.01
| End of training | test loss 1.16599 | test ppl 3.20911|test cor 0.00047
8 64 8 8 0.5 0.005
| End of training | test loss 1.17928 | test ppl 3.25203|test cor 0.06283
8 64 8 8 0.5 0.001
| End of training | test loss 1.74802 | test ppl 5.74320|test cor -0.13001
8 128 2 2 0.2 0.1
| End of training | test loss 1.15641 | test ppl 3.17851|test cor 0.09844
8 128 2 2 0.2 0.01
| End of training | test loss 1.16222 | test ppl 3.19704|test cor -0.069

| End of training | test loss 1.16756 | test ppl 3.21414|test cor 0.08575
8 128 2 8 0.5 0.005
| End of training | test loss 1.18272 | test ppl 3.26324|test cor 0.01903
8 128 2 8 0.5 0.001
| End of training | test loss 1.21811 | test ppl 3.38078|test cor 0.07558
8 128 4 2 0.2 0.1
| End of training | test loss 1.16210 | test ppl 3.19665|test cor -0.06372
8 128 4 2 0.2 0.01
| End of training | test loss 1.16468 | test ppl 3.20490|test cor 0.02003
8 128 4 2 0.2 0.005
| End of training | test loss 1.16308 | test ppl 3.19979|test cor -0.10665
8 128 4 2 0.2 0.001
| End of training | test loss 1.18065 | test ppl 3.25649|test cor -0.26876
8 128 4 2 0.5 0.1
| End of training | test loss 1.15827 | test ppl 3.18443|test cor -0.01169
8 128 4 2 0.5 0.01
| End of training | test loss 1.16662 | test ppl 3.21111|test cor 0.06999
8 128 4 2 0.5 0.005
| End of training | test loss 1.16358 | test ppl 3.20138|test cor -0.11133
8 128 4 2 0.5 0.001
| End of training | test loss 1.48506 | test ppl 4.41525|test

| End of training | test loss 1.22504 | test ppl 3.40431|test cor 0.07278
8 128 6 2 0.5 0.1
| End of training | test loss 1.15990 | test ppl 3.18962|test cor -0.10958
8 128 6 2 0.5 0.01
| End of training | test loss 1.16219 | test ppl 3.19692|test cor 0.02822
8 128 6 2 0.5 0.005
| End of training | test loss 1.24118 | test ppl 3.45969|test cor 0.05251
8 128 6 2 0.5 0.001
| End of training | test loss 1.17711 | test ppl 3.24497|test cor -0.08811
8 128 6 4 0.2 0.1
| End of training | test loss 1.15937 | test ppl 3.18792|test cor 0.16583
8 128 6 4 0.2 0.01
| End of training | test loss 1.16304 | test ppl 3.19964|test cor 0.00442
8 128 6 4 0.2 0.005
| End of training | test loss 1.16272 | test ppl 3.19863|test cor 0.13824
8 128 6 4 0.2 0.001
| End of training | test loss 1.18279 | test ppl 3.26347|test cor -0.24954
8 128 6 4 0.5 0.1
| End of training | test loss 1.15921 | test ppl 3.18742|test cor 0.15177
8 128 6 4 0.5 0.01
| End of training | test loss 1.15982 | test ppl 3.18935|test cor 

| End of training | test loss 1.17406 | test ppl 3.23509|test cor -0.03724
8 128 8 4 0.2 0.005
| End of training | test loss 1.16404 | test ppl 3.20284|test cor -0.08970
8 128 8 4 0.2 0.001
| End of training | test loss 1.29521 | test ppl 3.65175|test cor 0.05960
8 128 8 4 0.5 0.1
| End of training | test loss 1.16107 | test ppl 3.19336|test cor 0.05677
8 128 8 4 0.5 0.01
| End of training | test loss 1.16224 | test ppl 3.19707|test cor -0.12804
8 128 8 4 0.5 0.005
| End of training | test loss 1.16991 | test ppl 3.22171|test cor 0.13175
8 128 8 4 0.5 0.001
| End of training | test loss 1.36540 | test ppl 3.91728|test cor 0.10185
8 128 8 8 0.2 0.1
| End of training | test loss 1.16124 | test ppl 3.19390|test cor -0.05973
8 128 8 8 0.2 0.01
| End of training | test loss 1.16162 | test ppl 3.19511|test cor 0.19616
8 128 8 8 0.2 0.005
| End of training | test loss 1.16123 | test ppl 3.19387|test cor -0.08624
8 128 8 8 0.2 0.001
| End of training | test loss 1.33299 | test ppl 3.79236|test

| End of training | test loss 1.70899 | test ppl 5.52336|test cor 0.02573
8 256 2 8 0.2 0.1
| End of training | test loss 1.16399 | test ppl 3.20269|test cor 0.04905
8 256 2 8 0.2 0.01
| End of training | test loss 1.15697 | test ppl 3.18029|test cor 0.00716
8 256 2 8 0.2 0.005
| End of training | test loss 1.16214 | test ppl 3.19677|test cor 0.26176
8 256 2 8 0.2 0.001
| End of training | test loss 1.19221 | test ppl 3.29435|test cor -0.17094
8 256 2 8 0.5 0.1
| End of training | test loss 1.16040 | test ppl 3.19121|test cor -0.01735
8 256 2 8 0.5 0.01
| End of training | test loss 1.16885 | test ppl 3.21830|test cor -0.08162
8 256 2 8 0.5 0.005
| End of training | test loss 1.15983 | test ppl 3.18939|test cor 0.12559
8 256 2 8 0.5 0.001
| End of training | test loss 1.34464 | test ppl 3.83681|test cor -0.05808
8 256 4 2 0.2 0.1
| End of training | test loss 1.16017 | test ppl 3.19048|test cor -0.05885
8 256 4 2 0.2 0.01
| End of training | test loss 1.15804 | test ppl 3.18367|test co

| End of training | test loss 1.15957 | test ppl 3.18856|test cor 0.04880
8 256 4 8 0.5 0.005
| End of training | test loss 1.16117 | test ppl 3.19368|test cor -0.02013
8 256 4 8 0.5 0.001
| End of training | test loss 1.24554 | test ppl 3.47480|test cor 0.02295
8 256 6 2 0.2 0.1
| End of training | test loss 1.16184 | test ppl 3.19581|test cor -0.11046
8 256 6 2 0.2 0.01
| End of training | test loss 1.16057 | test ppl 3.19175|test cor -0.21292
8 256 6 2 0.2 0.005
| End of training | test loss 1.16145 | test ppl 3.19457|test cor -0.03442
8 256 6 2 0.2 0.001
| End of training | test loss 1.16858 | test ppl 3.21743|test cor -0.04557
8 256 6 2 0.5 0.1
| End of training | test loss 1.16128 | test ppl 3.19402|test cor 0.02570
8 256 6 2 0.5 0.01
| End of training | test loss 1.16354 | test ppl 3.20125|test cor -0.14965
8 256 6 2 0.5 0.005
| End of training | test loss 1.16732 | test ppl 3.21335|test cor 0.15923
8 256 6 2 0.5 0.001
| End of training | test loss 1.18431 | test ppl 3.26844|tes

| End of training | test loss 1.45977 | test ppl 4.30499|test cor 0.05360
8 256 8 2 0.5 0.1
| End of training | test loss 1.16156 | test ppl 3.19491|test cor -0.03842
8 256 8 2 0.5 0.01
| End of training | test loss 1.16099 | test ppl 3.19308|test cor -0.09944
8 256 8 2 0.5 0.005
| End of training | test loss 1.15912 | test ppl 3.18713|test cor 0.03403
8 256 8 2 0.5 0.001
| End of training | test loss 1.18338 | test ppl 3.26541|test cor -0.05917
8 256 8 4 0.2 0.1
| End of training | test loss 1.16123 | test ppl 3.19385|test cor -0.18474
8 256 8 4 0.2 0.01
| End of training | test loss 1.16652 | test ppl 3.21081|test cor 0.08066
8 256 8 4 0.2 0.005
| End of training | test loss 1.15962 | test ppl 3.18872|test cor 0.01794
8 256 8 4 0.2 0.001
| End of training | test loss 1.16714 | test ppl 3.21278|test cor 0.12635
8 256 8 4 0.5 0.1
| End of training | test loss 1.16271 | test ppl 3.19860|test cor -0.04145
8 256 8 4 0.5 0.01
| End of training | test loss 1.16388 | test ppl 3.20234|test co

| End of training | test loss 1.16154 | test ppl 3.19485|test cor 0.12475
32 8 2 4 0.2 0.005
| End of training | test loss 1.15627 | test ppl 3.17807|test cor 0.14628
32 8 2 4 0.2 0.001
| End of training | test loss 1.21984 | test ppl 3.38665|test cor 0.00373
32 8 2 4 0.5 0.1
| End of training | test loss 1.16344 | test ppl 3.20091|test cor -0.01074
32 8 2 4 0.5 0.01
| End of training | test loss 1.16606 | test ppl 3.20933|test cor 0.04425
32 8 2 4 0.5 0.005
| End of training | test loss 1.16169 | test ppl 3.19533|test cor 0.09176
32 8 2 4 0.5 0.001
| End of training | test loss 1.22407 | test ppl 3.40099|test cor -0.08129
32 8 2 8 0.2 0.1
| End of training | test loss 1.16278 | test ppl 3.19881|test cor 0.05980
32 8 2 8 0.2 0.01
| End of training | test loss 1.15720 | test ppl 3.18100|test cor 0.09364
32 8 2 8 0.2 0.005
| End of training | test loss 1.16332 | test ppl 3.20055|test cor 0.10303
32 8 2 8 0.2 0.001
| End of training | test loss 1.18475 | test ppl 3.26986|test cor 0.07283


| End of training | test loss 1.16263 | test ppl 3.19833|test cor 0.10826
32 8 4 8 0.2 0.01
| End of training | test loss 1.16270 | test ppl 3.19855|test cor -0.01011
32 8 4 8 0.2 0.005
| End of training | test loss 1.16532 | test ppl 3.20695|test cor 0.16137
32 8 4 8 0.2 0.001
| End of training | test loss 1.30929 | test ppl 3.70356|test cor 0.06000
32 8 4 8 0.5 0.1
| End of training | test loss 1.17076 | test ppl 3.22443|test cor 0.12768
32 8 4 8 0.5 0.01
| End of training | test loss 1.15903 | test ppl 3.18685|test cor -0.04379
32 8 4 8 0.5 0.005
| End of training | test loss 1.16662 | test ppl 3.21111|test cor 0.18771
32 8 4 8 0.5 0.001
| End of training | test loss 1.26174 | test ppl 3.53155|test cor -0.06798
32 8 6 2 0.2 0.1
| End of training | test loss 1.15964 | test ppl 3.18879|test cor 0.18563
32 8 6 2 0.2 0.01
| End of training | test loss 1.16534 | test ppl 3.20701|test cor 0.25177
32 8 6 2 0.2 0.005
| End of training | test loss 1.15871 | test ppl 3.18582|test cor 0.08679


| End of training | test loss 1.18104 | test ppl 3.25775|test cor 0.01001
32 8 8 2 0.2 0.1
| End of training | test loss 1.16649 | test ppl 3.21070|test cor 0.14400
32 8 8 2 0.2 0.01
| End of training | test loss 1.16928 | test ppl 3.21967|test cor 0.26101
32 8 8 2 0.2 0.005
| End of training | test loss 1.15887 | test ppl 3.18634|test cor -0.22902
32 8 8 2 0.2 0.001
| End of training | test loss 1.18678 | test ppl 3.27652|test cor -0.18514
32 8 8 2 0.5 0.1
| End of training | test loss 1.16245 | test ppl 3.19775|test cor 0.14511
32 8 8 2 0.5 0.01
| End of training | test loss 1.16207 | test ppl 3.19655|test cor 0.02481
32 8 8 2 0.5 0.005
| End of training | test loss 1.16150 | test ppl 3.19472|test cor -0.10686
32 8 8 2 0.5 0.001
| End of training | test loss 1.33972 | test ppl 3.81798|test cor -0.20998
32 8 8 4 0.2 0.1
| End of training | test loss 1.16915 | test ppl 3.21927|test cor 0.15951
32 8 8 4 0.2 0.01
| End of training | test loss 1.16484 | test ppl 3.20540|test cor -0.08773


| End of training | test loss 1.16931 | test ppl 3.21976|test cor 0.07962
32 32 2 2 0.5 0.001
| End of training | test loss 1.34553 | test ppl 3.84022|test cor -0.04892
32 32 2 4 0.2 0.1
| End of training | test loss 1.17058 | test ppl 3.22387|test cor 0.00525
32 32 2 4 0.2 0.01
| End of training | test loss 1.16088 | test ppl 3.19274|test cor 0.10696
32 32 2 4 0.2 0.005
| End of training | test loss 1.16408 | test ppl 3.20299|test cor -0.04028
32 32 2 4 0.2 0.001
| End of training | test loss 1.19901 | test ppl 3.31684|test cor 0.10410
32 32 2 4 0.5 0.1
| End of training | test loss 1.15989 | test ppl 3.18958|test cor -0.00907
32 32 2 4 0.5 0.01
| End of training | test loss 1.16711 | test ppl 3.21268|test cor 0.01311
32 32 2 4 0.5 0.005
| End of training | test loss 1.16202 | test ppl 3.19638|test cor 0.07333
32 32 2 4 0.5 0.001
| End of training | test loss 1.20240 | test ppl 3.32809|test cor -0.02118
32 32 2 8 0.2 0.1
| End of training | test loss 1.16096 | test ppl 3.19299|test co

| End of training | test loss 1.15996 | test ppl 3.18980|test cor 0.03970
32 32 4 4 0.5 0.01
| End of training | test loss 1.16546 | test ppl 3.20741|test cor 0.00743
32 32 4 4 0.5 0.005
| End of training | test loss 1.16336 | test ppl 3.20067|test cor -0.03090
32 32 4 4 0.5 0.001
| End of training | test loss 1.40829 | test ppl 4.08894|test cor 0.16104
32 32 4 8 0.2 0.1
| End of training | test loss 1.15656 | test ppl 3.17897|test cor 0.08062
32 32 4 8 0.2 0.01
| End of training | test loss 1.16080 | test ppl 3.19249|test cor 0.14236
32 32 4 8 0.2 0.005
| End of training | test loss 1.15974 | test ppl 3.18909|test cor 0.15058
32 32 4 8 0.2 0.001
| End of training | test loss 1.55415 | test ppl 4.73108|test cor -0.04641
32 32 4 8 0.5 0.1
| End of training | test loss 1.16320 | test ppl 3.20016|test cor 0.13376
32 32 4 8 0.5 0.01
| End of training | test loss 1.16420 | test ppl 3.20335|test cor -0.15837
32 32 4 8 0.5 0.005
| End of training | test loss 1.16960 | test ppl 3.22069|test co

| End of training | test loss 1.17469 | test ppl 3.23714|test cor 0.08522
32 32 6 8 0.2 0.001
| End of training | test loss 1.17012 | test ppl 3.22239|test cor 0.10454
32 32 6 8 0.5 0.1
| End of training | test loss 1.15935 | test ppl 3.18784|test cor 0.13659
32 32 6 8 0.5 0.01
| End of training | test loss 1.16551 | test ppl 3.20755|test cor 0.00131
32 32 6 8 0.5 0.005
| End of training | test loss 1.17409 | test ppl 3.23520|test cor 0.05840
32 32 6 8 0.5 0.001
| End of training | test loss 1.18911 | test ppl 3.28416|test cor 0.04399
32 32 8 2 0.2 0.1
| End of training | test loss 1.16819 | test ppl 3.21617|test cor 0.22137
32 32 8 2 0.2 0.01
| End of training | test loss 1.16027 | test ppl 3.19080|test cor 0.09840
32 32 8 2 0.2 0.005
| End of training | test loss 1.17073 | test ppl 3.22436|test cor -0.11225
32 32 8 2 0.2 0.001
| End of training | test loss 1.16470 | test ppl 3.20495|test cor -0.09562
32 32 8 2 0.5 0.1
| End of training | test loss 1.16271 | test ppl 3.19859|test cor 

| End of training | test loss 1.18259 | test ppl 3.26283|test cor 0.07725
32 64 2 2 0.2 0.01
| End of training | test loss 1.16381 | test ppl 3.20211|test cor -0.06266
32 64 2 2 0.2 0.005
| End of training | test loss 1.16292 | test ppl 3.19926|test cor -0.10187
32 64 2 2 0.2 0.001
| End of training | test loss 1.33407 | test ppl 3.79646|test cor -0.01921
32 64 2 2 0.5 0.1
| End of training | test loss 1.16029 | test ppl 3.19086|test cor -0.01844
32 64 2 2 0.5 0.01
| End of training | test loss 1.15628 | test ppl 3.17808|test cor 0.03474
32 64 2 2 0.5 0.005
| End of training | test loss 1.18881 | test ppl 3.28316|test cor 0.01050
32 64 2 2 0.5 0.001
| End of training | test loss 1.17882 | test ppl 3.25055|test cor 0.10660
32 64 2 4 0.2 0.1
| End of training | test loss 1.15644 | test ppl 3.17859|test cor 0.04798
32 64 2 4 0.2 0.01
| End of training | test loss 1.16144 | test ppl 3.19454|test cor 0.02451
32 64 2 4 0.2 0.005
| End of training | test loss 1.16490 | test ppl 3.20560|test c

| End of training | test loss 1.16972 | test ppl 3.22109|test cor -0.17601
32 64 4 2 0.5 0.001
| End of training | test loss 1.50457 | test ppl 4.50223|test cor -0.05158
32 64 4 4 0.2 0.1
| End of training | test loss 1.17502 | test ppl 3.23821|test cor 0.17238
32 64 4 4 0.2 0.01
| End of training | test loss 1.15850 | test ppl 3.18514|test cor 0.15894
32 64 4 4 0.2 0.005
| End of training | test loss 1.16322 | test ppl 3.20023|test cor 0.15369
32 64 4 4 0.2 0.001
| End of training | test loss 1.24351 | test ppl 3.46777|test cor -0.00178
32 64 4 4 0.5 0.1
| End of training | test loss 1.15691 | test ppl 3.18009|test cor 0.08442
32 64 4 4 0.5 0.01
| End of training | test loss 1.16316 | test ppl 3.20003|test cor 0.07690
32 64 4 4 0.5 0.005
| End of training | test loss 1.16700 | test ppl 3.21235|test cor -0.14279
32 64 4 4 0.5 0.001
| End of training | test loss 1.25554 | test ppl 3.50974|test cor 0.00162
32 64 4 8 0.2 0.1
| End of training | test loss 1.16189 | test ppl 3.19596|test co

| End of training | test loss 1.16026 | test ppl 3.19075|test cor -0.03535
32 64 6 4 0.5 0.01
| End of training | test loss 1.16521 | test ppl 3.20661|test cor -0.10411
32 64 6 4 0.5 0.005
| End of training | test loss 1.16618 | test ppl 3.20972|test cor 0.18790
32 64 6 4 0.5 0.001
| End of training | test loss 1.23053 | test ppl 3.42306|test cor 0.18700
32 64 6 8 0.2 0.1
| End of training | test loss 1.16011 | test ppl 3.19029|test cor 0.19078
32 64 6 8 0.2 0.01
| End of training | test loss 1.15839 | test ppl 3.18480|test cor -0.08840
32 64 6 8 0.2 0.005
| End of training | test loss 1.16824 | test ppl 3.21634|test cor 0.02288
32 64 6 8 0.2 0.001
| End of training | test loss 1.20346 | test ppl 3.33163|test cor 0.09520
32 64 6 8 0.5 0.1
| End of training | test loss 1.17407 | test ppl 3.23515|test cor 0.08515
32 64 6 8 0.5 0.01
| End of training | test loss 1.16053 | test ppl 3.19161|test cor 0.16816
32 64 6 8 0.5 0.005
| End of training | test loss 1.17144 | test ppl 3.22663|test co

| End of training | test loss 1.16815 | test ppl 3.21605|test cor -0.05426
32 64 8 8 0.2 0.001
| End of training | test loss 1.18029 | test ppl 3.25530|test cor 0.05815
32 64 8 8 0.5 0.1
| End of training | test loss 1.16513 | test ppl 3.20634|test cor 0.04105
32 64 8 8 0.5 0.01
| End of training | test loss 1.16241 | test ppl 3.19763|test cor 0.14707
32 64 8 8 0.5 0.005
| End of training | test loss 1.16926 | test ppl 3.21961|test cor 0.15505
32 64 8 8 0.5 0.001
| End of training | test loss 1.18203 | test ppl 3.26098|test cor 0.24855
32 128 2 2 0.2 0.1
| End of training | test loss 1.16033 | test ppl 3.19098|test cor 0.08102
32 128 2 2 0.2 0.01
| End of training | test loss 1.15821 | test ppl 3.18424|test cor 0.19777
32 128 2 2 0.2 0.005
| End of training | test loss 1.16193 | test ppl 3.19609|test cor -0.06587
32 128 2 2 0.2 0.001
| End of training | test loss 1.33300 | test ppl 3.79240|test cor -0.05120
32 128 2 2 0.5 0.1
| End of training | test loss 1.16721 | test ppl 3.21300|tes

| End of training | test loss 1.16110 | test ppl 3.19345|test cor 0.15953
32 128 4 2 0.2 0.01
| End of training | test loss 1.16191 | test ppl 3.19604|test cor 0.11657
32 128 4 2 0.2 0.005
| End of training | test loss 1.16269 | test ppl 3.19852|test cor -0.00374
32 128 4 2 0.2 0.001
| End of training | test loss 1.24274 | test ppl 3.46510|test cor -0.07015
32 128 4 2 0.5 0.1
| End of training | test loss 1.15973 | test ppl 3.18906|test cor -0.02310
32 128 4 2 0.5 0.01
| End of training | test loss 1.16762 | test ppl 3.21434|test cor -0.05407
32 128 4 2 0.5 0.005
| End of training | test loss 1.16650 | test ppl 3.21074|test cor -0.02004
32 128 4 2 0.5 0.001
| End of training | test loss 1.17207 | test ppl 3.22868|test cor -0.13291
32 128 4 4 0.2 0.1
| End of training | test loss 1.15832 | test ppl 3.18458|test cor 0.19301
32 128 4 4 0.2 0.01
| End of training | test loss 1.15717 | test ppl 3.18092|test cor 0.06587
32 128 4 4 0.2 0.005
| End of training | test loss 1.17205 | test ppl 3.

| End of training | test loss 1.18040 | test ppl 3.25569|test cor 0.08774
32 128 6 2 0.5 0.001
| End of training | test loss 1.17237 | test ppl 3.22964|test cor 0.00863
32 128 6 4 0.2 0.1
| End of training | test loss 1.17030 | test ppl 3.22296|test cor 0.15988
32 128 6 4 0.2 0.01
| End of training | test loss 1.16833 | test ppl 3.21662|test cor -0.03734
32 128 6 4 0.2 0.005
| End of training | test loss 1.16340 | test ppl 3.20080|test cor -0.05591
32 128 6 4 0.2 0.001
| End of training | test loss 1.16623 | test ppl 3.20988|test cor -0.02600
32 128 6 4 0.5 0.1
| End of training | test loss 1.16351 | test ppl 3.20116|test cor 0.09709
32 128 6 4 0.5 0.01
| End of training | test loss 1.16172 | test ppl 3.19543|test cor 0.04059
32 128 6 4 0.5 0.005
| End of training | test loss 1.19882 | test ppl 3.31622|test cor -0.07965
32 128 6 4 0.5 0.001
| End of training | test loss 1.18371 | test ppl 3.26647|test cor 0.01440
32 128 6 8 0.2 0.1
| End of training | test loss 1.16944 | test ppl 3.220

| End of training | test loss 1.16230 | test ppl 3.19728|test cor 0.21297
32 128 8 4 0.5 0.01
| End of training | test loss 1.16385 | test ppl 3.20223|test cor -0.08767
32 128 8 4 0.5 0.005
| End of training | test loss 1.17623 | test ppl 3.24212|test cor -0.19571
32 128 8 4 0.5 0.001
| End of training | test loss 1.16315 | test ppl 3.20001|test cor -0.12529
32 128 8 8 0.2 0.1
| End of training | test loss 1.17199 | test ppl 3.22840|test cor 0.02856
32 128 8 8 0.2 0.01
| End of training | test loss 1.16367 | test ppl 3.20166|test cor 0.06569
32 128 8 8 0.2 0.005
| End of training | test loss 1.15924 | test ppl 3.18750|test cor 0.19382
32 128 8 8 0.2 0.001
| End of training | test loss 1.20656 | test ppl 3.34195|test cor 0.14037
32 128 8 8 0.5 0.1
| End of training | test loss 1.16257 | test ppl 3.19815|test cor 0.16069
32 128 8 8 0.5 0.01
| End of training | test loss 1.16918 | test ppl 3.21936|test cor 0.07296
32 128 8 8 0.5 0.005
| End of training | test loss 1.16499 | test ppl 3.205

| End of training | test loss 1.16019 | test ppl 3.19055|test cor 0.17423
32 256 2 8 0.2 0.001
| End of training | test loss 1.19037 | test ppl 3.28831|test cor 0.00668
32 256 2 8 0.5 0.1
| End of training | test loss 1.16341 | test ppl 3.20081|test cor 0.01829
32 256 2 8 0.5 0.01
| End of training | test loss 1.17677 | test ppl 3.24389|test cor 0.04069
32 256 2 8 0.5 0.005
| End of training | test loss 1.16821 | test ppl 3.21623|test cor 0.06609
32 256 2 8 0.5 0.001
| End of training | test loss 1.26507 | test ppl 3.54336|test cor -0.08410
32 256 4 2 0.2 0.1
| End of training | test loss 1.17323 | test ppl 3.23240|test cor 0.17633
32 256 4 2 0.2 0.01
| End of training | test loss 1.16491 | test ppl 3.20563|test cor 0.17406
32 256 4 2 0.2 0.005
| End of training | test loss 1.15421 | test ppl 3.17153|test cor 0.06598
32 256 4 2 0.2 0.001
| End of training | test loss 1.26397 | test ppl 3.53943|test cor -0.13655
32 256 4 2 0.5 0.1
| End of training | test loss 1.16017 | test ppl 3.19049

| End of training | test loss 1.17170 | test ppl 3.22748|test cor -0.02781
32 256 6 2 0.2 0.01
| End of training | test loss 1.16075 | test ppl 3.19231|test cor -0.03515
32 256 6 2 0.2 0.005
| End of training | test loss 1.16810 | test ppl 3.21589|test cor 0.10104
32 256 6 2 0.2 0.001
| End of training | test loss 1.20156 | test ppl 3.32530|test cor -0.14400
32 256 6 2 0.5 0.1
| End of training | test loss 1.16941 | test ppl 3.22010|test cor 0.19901
32 256 6 2 0.5 0.01
| End of training | test loss 1.16271 | test ppl 3.19859|test cor 0.01910
32 256 6 2 0.5 0.005
| End of training | test loss 1.16231 | test ppl 3.19730|test cor -0.05707
32 256 6 2 0.5 0.001
| End of training | test loss 1.27507 | test ppl 3.57895|test cor 0.10867
32 256 6 4 0.2 0.1
| End of training | test loss 1.16689 | test ppl 3.21200|test cor 0.15419
32 256 6 4 0.2 0.01
| End of training | test loss 1.16063 | test ppl 3.19194|test cor 0.17499
32 256 6 4 0.2 0.005
| End of training | test loss 1.16732 | test ppl 3.21

| End of training | test loss 1.17070 | test ppl 3.22426|test cor -0.17026
32 256 8 2 0.5 0.001
| End of training | test loss 1.20969 | test ppl 3.35246|test cor -0.10232
32 256 8 4 0.2 0.1
| End of training | test loss 1.16422 | test ppl 3.20343|test cor 0.16240
32 256 8 4 0.2 0.01
| End of training | test loss 1.16269 | test ppl 3.19852|test cor 0.18539
32 256 8 4 0.2 0.005
| End of training | test loss 1.17095 | test ppl 3.22506|test cor 0.18976
32 256 8 4 0.2 0.001
| End of training | test loss 1.19293 | test ppl 3.29672|test cor 0.21173
32 256 8 4 0.5 0.1
| End of training | test loss 1.16153 | test ppl 3.19483|test cor 0.15836
32 256 8 4 0.5 0.01
| End of training | test loss 1.16393 | test ppl 3.20249|test cor -0.08210
32 256 8 4 0.5 0.005
| End of training | test loss 1.17055 | test ppl 3.22375|test cor 0.06010
32 256 8 4 0.5 0.001
| End of training | test loss 1.34335 | test ppl 3.83187|test cor 0.06749
32 256 8 8 0.2 0.1
| End of training | test loss 1.18294 | test ppl 3.2639

| End of training | test loss 1.17118 | test ppl 3.22581|test cor 0.02838
64 8 2 4 0.5 0.01
| End of training | test loss 1.16796 | test ppl 3.21541|test cor 0.15407
64 8 2 4 0.5 0.005
| End of training | test loss 1.16793 | test ppl 3.21532|test cor 0.04005
64 8 2 4 0.5 0.001
| End of training | test loss 1.33872 | test ppl 3.81415|test cor 0.01225
64 8 2 8 0.2 0.1
| End of training | test loss 1.16699 | test ppl 3.21230|test cor 0.08525
64 8 2 8 0.2 0.01
| End of training | test loss 1.16430 | test ppl 3.20369|test cor 0.08763
64 8 2 8 0.2 0.005
| End of training | test loss 1.15810 | test ppl 3.18388|test cor 0.06308
64 8 2 8 0.2 0.001
| End of training | test loss 1.28644 | test ppl 3.61986|test cor 0.05134
64 8 2 8 0.5 0.1
| End of training | test loss 1.17591 | test ppl 3.24110|test cor 0.00013
64 8 2 8 0.5 0.01
| End of training | test loss 1.16256 | test ppl 3.19810|test cor -0.04127
64 8 2 8 0.5 0.005
| End of training | test loss 1.19164 | test ppl 3.29247|test cor 0.02258
64

| End of training | test loss 1.17872 | test ppl 3.25020|test cor 0.10851
64 8 4 8 0.5 0.1
| End of training | test loss 1.17090 | test ppl 3.22488|test cor 0.15395
64 8 4 8 0.5 0.01
| End of training | test loss 1.16524 | test ppl 3.20670|test cor 0.11297
64 8 4 8 0.5 0.005
| End of training | test loss 1.17512 | test ppl 3.23852|test cor 0.15655
64 8 4 8 0.5 0.001
| End of training | test loss 1.19396 | test ppl 3.30012|test cor 0.10394
64 8 6 2 0.2 0.1
| End of training | test loss 1.17767 | test ppl 3.24679|test cor 0.20630
64 8 6 2 0.2 0.01
| End of training | test loss 1.16258 | test ppl 3.19818|test cor -0.04961
64 8 6 2 0.2 0.005
| End of training | test loss 1.16602 | test ppl 3.20920|test cor -0.22121
64 8 6 2 0.2 0.001
| End of training | test loss 1.16421 | test ppl 3.20338|test cor -0.23413
64 8 6 2 0.5 0.1
| End of training | test loss 1.16425 | test ppl 3.20352|test cor 0.14249
64 8 6 2 0.5 0.01
| End of training | test loss 1.16733 | test ppl 3.21340|test cor -0.07340
6

| End of training | test loss 1.16453 | test ppl 3.20440|test cor 0.07119
64 8 8 2 0.2 0.001
| End of training | test loss 1.17290 | test ppl 3.23134|test cor 0.19142
64 8 8 2 0.5 0.1
| End of training | test loss 1.16621 | test ppl 3.20982|test cor 0.05968
64 8 8 2 0.5 0.01
| End of training | test loss 1.16426 | test ppl 3.20354|test cor -0.15205
64 8 8 2 0.5 0.005
| End of training | test loss 1.16327 | test ppl 3.20037|test cor -0.17355
64 8 8 2 0.5 0.001
| End of training | test loss 1.19570 | test ppl 3.30588|test cor -0.15675
64 8 8 4 0.2 0.1
| End of training | test loss 1.17878 | test ppl 3.25040|test cor 0.23601
64 8 8 4 0.2 0.01
| End of training | test loss 1.16762 | test ppl 3.21434|test cor 0.07112
64 8 8 4 0.2 0.005
| End of training | test loss 1.16480 | test ppl 3.20529|test cor -0.02177
64 8 8 4 0.2 0.001
| End of training | test loss 1.17190 | test ppl 3.22811|test cor 0.09051
64 8 8 4 0.5 0.1
| End of training | test loss 1.17563 | test ppl 3.24018|test cor 0.15683


| End of training | test loss 1.15995 | test ppl 3.18978|test cor 0.11162
64 32 2 4 0.2 0.005
| End of training | test loss 1.17376 | test ppl 3.23412|test cor -0.25654
64 32 2 4 0.2 0.001
| End of training | test loss 1.16615 | test ppl 3.20961|test cor 0.15430
64 32 2 4 0.5 0.1
| End of training | test loss 1.18123 | test ppl 3.25837|test cor -0.00086
64 32 2 4 0.5 0.01
| End of training | test loss 1.16129 | test ppl 3.19406|test cor 0.01972
64 32 2 4 0.5 0.005
| End of training | test loss 1.18711 | test ppl 3.27758|test cor -0.00562
64 32 2 4 0.5 0.001
| End of training | test loss 1.25906 | test ppl 3.52212|test cor -0.03387
64 32 2 8 0.2 0.1
| End of training | test loss 1.16094 | test ppl 3.19292|test cor 0.10727
64 32 2 8 0.2 0.01
| End of training | test loss 1.16893 | test ppl 3.21856|test cor 0.04321
64 32 2 8 0.2 0.005
| End of training | test loss 1.17383 | test ppl 3.23436|test cor 0.08443
64 32 2 8 0.2 0.001
| End of training | test loss 1.19300 | test ppl 3.29696|test 

| End of training | test loss 1.24320 | test ppl 3.46669|test cor -0.11002
64 32 4 8 0.2 0.1
| End of training | test loss 1.18639 | test ppl 3.27524|test cor 0.16392
64 32 4 8 0.2 0.01
| End of training | test loss 1.16039 | test ppl 3.19116|test cor -0.01537
64 32 4 8 0.2 0.005
| End of training | test loss 1.16393 | test ppl 3.20249|test cor -0.09113
64 32 4 8 0.2 0.001
| End of training | test loss 1.19106 | test ppl 3.29055|test cor -0.04105
64 32 4 8 0.5 0.1
| End of training | test loss 1.16027 | test ppl 3.19080|test cor 0.04590
64 32 4 8 0.5 0.01
| End of training | test loss 1.16313 | test ppl 3.19994|test cor 0.18050
64 32 4 8 0.5 0.005
| End of training | test loss 1.17334 | test ppl 3.23277|test cor -0.07103
64 32 4 8 0.5 0.001
| End of training | test loss 1.17206 | test ppl 3.22863|test cor 0.14444
64 32 6 2 0.2 0.1
| End of training | test loss 1.18347 | test ppl 3.26568|test cor -0.05910
64 32 6 2 0.2 0.01
| End of training | test loss 1.16487 | test ppl 3.20550|test c

| End of training | test loss 1.16443 | test ppl 3.20411|test cor -0.08306
64 32 6 8 0.5 0.005
| End of training | test loss 1.18048 | test ppl 3.25595|test cor 0.12062
64 32 6 8 0.5 0.001
| End of training | test loss 1.17197 | test ppl 3.22833|test cor 0.20406
64 32 8 2 0.2 0.1
| End of training | test loss 1.17636 | test ppl 3.24254|test cor 0.18989
64 32 8 2 0.2 0.01
| End of training | test loss 1.16289 | test ppl 3.19916|test cor -0.15708
64 32 8 2 0.2 0.005
| End of training | test loss 1.16168 | test ppl 3.19530|test cor -0.13732
64 32 8 2 0.2 0.001
| End of training | test loss 1.17936 | test ppl 3.25230|test cor -0.05317
64 32 8 2 0.5 0.1
| End of training | test loss 1.17335 | test ppl 3.23281|test cor 0.19007
64 32 8 2 0.5 0.01
| End of training | test loss 1.16311 | test ppl 3.19988|test cor 0.11259
64 32 8 2 0.5 0.005
| End of training | test loss 1.16105 | test ppl 3.19329|test cor -0.04064
64 32 8 2 0.5 0.001
| End of training | test loss 1.17094 | test ppl 3.22502|test

| End of training | test loss 1.16993 | test ppl 3.22178|test cor -0.01587
64 64 2 2 0.5 0.1
| End of training | test loss 1.17560 | test ppl 3.24009|test cor 0.03110
64 64 2 2 0.5 0.01
| End of training | test loss 1.15921 | test ppl 3.18742|test cor 0.10344
64 64 2 2 0.5 0.005
| End of training | test loss 1.16455 | test ppl 3.20447|test cor 0.03623
64 64 2 2 0.5 0.001
| End of training | test loss 1.23014 | test ppl 3.42172|test cor 0.11379
64 64 2 4 0.2 0.1
| End of training | test loss 1.16659 | test ppl 3.21104|test cor 0.05769
64 64 2 4 0.2 0.01
| End of training | test loss 1.15697 | test ppl 3.18029|test cor 0.12128
64 64 2 4 0.2 0.005
| End of training | test loss 1.16563 | test ppl 3.20796|test cor -0.07083
64 64 2 4 0.2 0.001
| End of training | test loss 1.22590 | test ppl 3.40723|test cor -0.15894
64 64 2 4 0.5 0.1
| End of training | test loss 1.16693 | test ppl 3.21210|test cor 0.00889
64 64 2 4 0.5 0.01
| End of training | test loss 1.16278 | test ppl 3.19882|test cor 

| End of training | test loss 1.15968 | test ppl 3.18891|test cor 0.04473
64 64 4 4 0.2 0.005
| End of training | test loss 1.17230 | test ppl 3.22940|test cor -0.01350
64 64 4 4 0.2 0.001
| End of training | test loss 1.17100 | test ppl 3.22523|test cor 0.06823
64 64 4 4 0.5 0.1
| End of training | test loss 1.16104 | test ppl 3.19325|test cor 0.02001
64 64 4 4 0.5 0.01
| End of training | test loss 1.16071 | test ppl 3.19221|test cor 0.05195
64 64 4 4 0.5 0.005
| End of training | test loss 1.17424 | test ppl 3.23569|test cor 0.04312
64 64 4 4 0.5 0.001
| End of training | test loss 1.23547 | test ppl 3.43999|test cor 0.10574
64 64 4 8 0.2 0.1
| End of training | test loss 1.16257 | test ppl 3.19813|test cor 0.13077
64 64 4 8 0.2 0.01
| End of training | test loss 1.16127 | test ppl 3.19397|test cor 0.19433
64 64 4 8 0.2 0.005
| End of training | test loss 1.15945 | test ppl 3.18820|test cor 0.05451
64 64 4 8 0.2 0.001
| End of training | test loss 1.17120 | test ppl 3.22585|test cor

| End of training | test loss 1.20614 | test ppl 3.34055|test cor 0.09098
64 64 6 8 0.2 0.1
| End of training | test loss 1.18016 | test ppl 3.25488|test cor 0.10005
64 64 6 8 0.2 0.01
| End of training | test loss 1.16103 | test ppl 3.19323|test cor 0.17946
64 64 6 8 0.2 0.005
| End of training | test loss 1.16997 | test ppl 3.22189|test cor -0.12245
64 64 6 8 0.2 0.001
| End of training | test loss 1.18079 | test ppl 3.25693|test cor -0.17144
64 64 6 8 0.5 0.1
| End of training | test loss 1.16053 | test ppl 3.19161|test cor 0.19273
64 64 6 8 0.5 0.01
| End of training | test loss 1.16570 | test ppl 3.20817|test cor -0.04962
64 64 6 8 0.5 0.005
| End of training | test loss 1.19080 | test ppl 3.28972|test cor -0.06753
64 64 6 8 0.5 0.001
| End of training | test loss 1.26706 | test ppl 3.55038|test cor -0.07271
64 64 8 2 0.2 0.1
| End of training | test loss 1.19552 | test ppl 3.30527|test cor 0.14452
64 64 8 2 0.2 0.01
| End of training | test loss 1.16438 | test ppl 3.20394|test co

| End of training | test loss 1.16312 | test ppl 3.19989|test cor 0.08670
64 64 8 8 0.5 0.005
| End of training | test loss 1.15985 | test ppl 3.18946|test cor 0.03710
64 64 8 8 0.5 0.001
| End of training | test loss 1.19812 | test ppl 3.31389|test cor -0.10348
64 128 2 2 0.2 0.1
| End of training | test loss 1.17201 | test ppl 3.22846|test cor 0.10023
64 128 2 2 0.2 0.01
| End of training | test loss 1.16091 | test ppl 3.19283|test cor 0.12323
64 128 2 2 0.2 0.005
| End of training | test loss 1.16508 | test ppl 3.20618|test cor 0.11367
64 128 2 2 0.2 0.001
| End of training | test loss 1.24867 | test ppl 3.48571|test cor 0.12507
64 128 2 2 0.5 0.1
| End of training | test loss 1.17867 | test ppl 3.25005|test cor -0.01483
64 128 2 2 0.5 0.01
| End of training | test loss 1.16347 | test ppl 3.20101|test cor -0.00809
64 128 2 2 0.5 0.005
| End of training | test loss 1.17170 | test ppl 3.22747|test cor 0.22271
64 128 2 2 0.5 0.001
| End of training | test loss 1.19676 | test ppl 3.3093

| End of training | test loss 1.16560 | test ppl 3.20784|test cor 0.23824
64 128 4 2 0.5 0.1
| End of training | test loss 1.16163 | test ppl 3.19513|test cor 0.10534
64 128 4 2 0.5 0.01
| End of training | test loss 1.15973 | test ppl 3.18909|test cor 0.12418
64 128 4 2 0.5 0.005
| End of training | test loss 1.16217 | test ppl 3.19687|test cor -0.13949
64 128 4 2 0.5 0.001
| End of training | test loss 1.20717 | test ppl 3.34399|test cor 0.01842
64 128 4 4 0.2 0.1
| End of training | test loss 1.18507 | test ppl 3.27092|test cor 0.12669
64 128 4 4 0.2 0.01
| End of training | test loss 1.16354 | test ppl 3.20125|test cor -0.12089
64 128 4 4 0.2 0.005
| End of training | test loss 1.15556 | test ppl 3.17580|test cor 0.03467
64 128 4 4 0.2 0.001
| End of training | test loss 1.54269 | test ppl 4.67717|test cor -0.10848
64 128 4 4 0.5 0.1
| End of training | test loss 1.16812 | test ppl 3.21594|test cor 0.08147
64 128 4 4 0.5 0.01
| End of training | test loss 1.16541 | test ppl 3.20723

| End of training | test loss 1.16138 | test ppl 3.19433|test cor 0.12166
64 128 6 4 0.2 0.005
| End of training | test loss 1.16575 | test ppl 3.20831|test cor 0.15308
64 128 6 4 0.2 0.001
| End of training | test loss 1.18504 | test ppl 3.27083|test cor 0.06386
64 128 6 4 0.5 0.1
| End of training | test loss 1.17183 | test ppl 3.22789|test cor 0.14931
64 128 6 4 0.5 0.01
| End of training | test loss 1.16568 | test ppl 3.20811|test cor 0.09364
64 128 6 4 0.5 0.005
| End of training | test loss 1.16712 | test ppl 3.21273|test cor -0.15759
64 128 6 4 0.5 0.001
| End of training | test loss 1.20871 | test ppl 3.34917|test cor 0.18362
64 128 6 8 0.2 0.1
| End of training | test loss 1.16550 | test ppl 3.20752|test cor 0.02377
64 128 6 8 0.2 0.01
| End of training | test loss 1.16267 | test ppl 3.19846|test cor 0.11553
64 128 6 8 0.2 0.005
| End of training | test loss 1.17258 | test ppl 3.23033|test cor -0.16276
64 128 6 8 0.2 0.001
| End of training | test loss 1.16679 | test ppl 3.211

| End of training | test loss 1.21081 | test ppl 3.35619|test cor 0.20772
64 128 8 8 0.2 0.1
| End of training | test loss 1.19154 | test ppl 3.29214|test cor 0.13856
64 128 8 8 0.2 0.01
| End of training | test loss 1.16477 | test ppl 3.20518|test cor -0.20250
64 128 8 8 0.2 0.005
| End of training | test loss 1.16030 | test ppl 3.19090|test cor -0.03899
64 128 8 8 0.2 0.001
| End of training | test loss 1.17442 | test ppl 3.23626|test cor -0.02702
64 128 8 8 0.5 0.1
| End of training | test loss 1.16386 | test ppl 3.20229|test cor 0.11458
64 128 8 8 0.5 0.01
| End of training | test loss 1.15900 | test ppl 3.18675|test cor 0.07715
64 128 8 8 0.5 0.005
| End of training | test loss 1.16311 | test ppl 3.19987|test cor 0.18748
64 128 8 8 0.5 0.001
| End of training | test loss 1.19276 | test ppl 3.29618|test cor 0.20288
64 256 2 2 0.2 0.1
| End of training | test loss 1.16695 | test ppl 3.21220|test cor 0.05233
64 256 2 2 0.2 0.01
| End of training | test loss 1.15908 | test ppl 3.18702

| End of training | test loss 1.15623 | test ppl 3.17792|test cor -0.01902
64 256 2 8 0.5 0.005
| End of training | test loss 1.17499 | test ppl 3.23809|test cor 0.05982
64 256 2 8 0.5 0.001
| End of training | test loss 1.20874 | test ppl 3.34927|test cor 0.09580
64 256 4 2 0.2 0.1
| End of training | test loss 1.18889 | test ppl 3.28342|test cor 0.13628
64 256 4 2 0.2 0.01
| End of training | test loss 1.16017 | test ppl 3.19048|test cor 0.23152
64 256 4 2 0.2 0.005
| End of training | test loss 1.16508 | test ppl 3.20619|test cor 0.12675
64 256 4 2 0.2 0.001
| End of training | test loss 1.24903 | test ppl 3.48698|test cor 0.08745
64 256 4 2 0.5 0.1
| End of training | test loss 1.16660 | test ppl 3.21107|test cor 0.15506
64 256 4 2 0.5 0.01
| End of training | test loss 1.16807 | test ppl 3.21578|test cor 0.22736
64 256 4 2 0.5 0.005
| End of training | test loss 1.17225 | test ppl 3.22924|test cor -0.07270
64 256 4 2 0.5 0.001
| End of training | test loss 1.18053 | test ppl 3.256

| End of training | test loss 1.19544 | test ppl 3.30500|test cor -0.02738
64 256 6 2 0.5 0.1
| End of training | test loss 1.16809 | test ppl 3.21585|test cor 0.22300
64 256 6 2 0.5 0.01
| End of training | test loss 1.16496 | test ppl 3.20578|test cor 0.15659
64 256 6 2 0.5 0.005
| End of training | test loss 1.16722 | test ppl 3.21306|test cor -0.15959
64 256 6 2 0.5 0.001
| End of training | test loss 1.17311 | test ppl 3.23203|test cor 0.22458
64 256 6 4 0.2 0.1
| End of training | test loss 1.16558 | test ppl 3.20779|test cor 0.10670
64 256 6 4 0.2 0.01
| End of training | test loss 1.16833 | test ppl 3.21663|test cor 0.22687
64 256 6 4 0.2 0.005
| End of training | test loss 1.15592 | test ppl 3.17695|test cor -0.10714
64 256 6 4 0.2 0.001
| End of training | test loss 1.19450 | test ppl 3.30190|test cor 0.20729
64 256 6 4 0.5 0.1
| End of training | test loss 1.17366 | test ppl 3.23382|test cor 0.10166
64 256 6 4 0.5 0.01
| End of training | test loss 1.15767 | test ppl 3.18251

| End of training | test loss 1.16098 | test ppl 3.19306|test cor 0.20971
64 256 8 4 0.2 0.005
| End of training | test loss 1.16483 | test ppl 3.20537|test cor -0.14315
64 256 8 4 0.2 0.001
| End of training | test loss 1.18783 | test ppl 3.27995|test cor -0.00100
64 256 8 4 0.5 0.1
| End of training | test loss 1.17489 | test ppl 3.23779|test cor 0.15486
64 256 8 4 0.5 0.01
| End of training | test loss 1.16278 | test ppl 3.19881|test cor -0.13246
64 256 8 4 0.5 0.005
| End of training | test loss 1.16372 | test ppl 3.20182|test cor -0.22406
64 256 8 4 0.5 0.001
| End of training | test loss 1.30438 | test ppl 3.68541|test cor -0.12948
64 256 8 8 0.2 0.1
| End of training | test loss 1.19064 | test ppl 3.28918|test cor 0.03542
64 256 8 8 0.2 0.01
| End of training | test loss 1.16259 | test ppl 3.19822|test cor -0.14658
64 256 8 8 0.2 0.005
| End of training | test loss 1.16610 | test ppl 3.20947|test cor -0.06283
64 256 8 8 0.2 0.001
| End of training | test loss 1.18609 | test ppl 

| End of training | test loss 1.29053 | test ppl 3.63470|test cor 0.03687
128 8 2 8 0.2 0.1
| End of training | test loss 1.21026 | test ppl 3.35435|test cor 0.14795
128 8 2 8 0.2 0.01
| End of training | test loss 1.15708 | test ppl 3.18063|test cor 0.11236
128 8 2 8 0.2 0.005
| End of training | test loss 1.16825 | test ppl 3.21635|test cor 0.06946
128 8 2 8 0.2 0.001
| End of training | test loss 1.22410 | test ppl 3.40112|test cor 0.08401
128 8 2 8 0.5 0.1
| End of training | test loss 1.17151 | test ppl 3.22686|test cor 0.04953
128 8 2 8 0.5 0.01
| End of training | test loss 1.16367 | test ppl 3.20167|test cor 0.15212
128 8 2 8 0.5 0.005
| End of training | test loss 1.16084 | test ppl 3.19261|test cor 0.05442
128 8 2 8 0.5 0.001
| End of training | test loss 1.29664 | test ppl 3.65700|test cor 0.01634
128 8 4 2 0.2 0.1
| End of training | test loss 1.21299 | test ppl 3.36352|test cor 0.12681
128 8 4 2 0.2 0.01
| End of training | test loss 1.16097 | test ppl 3.19304|test cor -0.

| End of training | test loss 1.16955 | test ppl 3.22055|test cor 0.20131
128 8 4 8 0.5 0.005
| End of training | test loss 1.16536 | test ppl 3.20709|test cor 0.05384
128 8 4 8 0.5 0.001
| End of training | test loss 1.18137 | test ppl 3.25883|test cor 0.17571
128 8 6 2 0.2 0.1
| End of training | test loss 1.16545 | test ppl 3.20737|test cor 0.10759
128 8 6 2 0.2 0.01
| End of training | test loss 1.16444 | test ppl 3.20413|test cor -0.02087
128 8 6 2 0.2 0.005
| End of training | test loss 1.16332 | test ppl 3.20053|test cor -0.11685
128 8 6 2 0.2 0.001
| End of training | test loss 1.18008 | test ppl 3.25463|test cor -0.11168
128 8 6 2 0.5 0.1
| End of training | test loss 1.26170 | test ppl 3.53143|test cor 0.18158
128 8 6 2 0.5 0.01
| End of training | test loss 1.16506 | test ppl 3.20611|test cor 0.13130
128 8 6 2 0.5 0.005
| End of training | test loss 1.15558 | test ppl 3.17585|test cor 0.07731
128 8 6 2 0.5 0.001
| End of training | test loss 1.19128 | test ppl 3.29130|test c

| End of training | test loss 1.16889 | test ppl 3.21840|test cor 0.05848
128 8 8 2 0.5 0.1
| End of training | test loss 1.20402 | test ppl 3.33348|test cor 0.16643
128 8 8 2 0.5 0.01
| End of training | test loss 1.16240 | test ppl 3.19761|test cor 0.20460
128 8 8 2 0.5 0.005
| End of training | test loss 1.16938 | test ppl 3.22001|test cor 0.19306
128 8 8 2 0.5 0.001
| End of training | test loss 1.18207 | test ppl 3.26111|test cor -0.09171
128 8 8 4 0.2 0.1
| End of training | test loss 1.16553 | test ppl 3.20763|test cor 0.15508
128 8 8 4 0.2 0.01
| End of training | test loss 1.16653 | test ppl 3.21085|test cor -0.11931
128 8 8 4 0.2 0.005
| End of training | test loss 1.17002 | test ppl 3.22205|test cor -0.11702
128 8 8 4 0.2 0.001
| End of training | test loss 1.19827 | test ppl 3.31436|test cor -0.20095
128 8 8 4 0.5 0.1
| End of training | test loss 1.20471 | test ppl 3.33579|test cor 0.04669
128 8 8 4 0.5 0.01
| End of training | test loss 1.17523 | test ppl 3.23889|test cor

| End of training | test loss 1.15842 | test ppl 3.18489|test cor 0.00327
128 32 2 4 0.2 0.005
| End of training | test loss 1.16175 | test ppl 3.19553|test cor 0.03127
128 32 2 4 0.2 0.001
| End of training | test loss 1.25621 | test ppl 3.51209|test cor 0.04671
128 32 2 4 0.5 0.1
| End of training | test loss 1.21564 | test ppl 3.37245|test cor 0.00675
128 32 2 4 0.5 0.01
| End of training | test loss 1.15897 | test ppl 3.18666|test cor 0.05745
128 32 2 4 0.5 0.005
| End of training | test loss 1.16264 | test ppl 3.19837|test cor -0.04430
128 32 2 4 0.5 0.001
| End of training | test loss 1.17631 | test ppl 3.24237|test cor 0.01449
128 32 2 8 0.2 0.1
| End of training | test loss 1.17623 | test ppl 3.24214|test cor 0.05097
128 32 2 8 0.2 0.01
| End of training | test loss 1.16259 | test ppl 3.19821|test cor 0.14669
128 32 2 8 0.2 0.005
| End of training | test loss 1.16484 | test ppl 3.20540|test cor 0.09573
128 32 2 8 0.2 0.001
| End of training | test loss 1.17143 | test ppl 3.2266

| End of training | test loss 1.25315 | test ppl 3.50134|test cor -0.02461
128 32 4 8 0.2 0.1
| End of training | test loss 1.19980 | test ppl 3.31945|test cor 0.14496
128 32 4 8 0.2 0.01
| End of training | test loss 1.15658 | test ppl 3.17904|test cor 0.18977
128 32 4 8 0.2 0.005
| End of training | test loss 1.16119 | test ppl 3.19372|test cor -0.09737
128 32 4 8 0.2 0.001
| End of training | test loss 1.21469 | test ppl 3.36925|test cor 0.06100
128 32 4 8 0.5 0.1
| End of training | test loss 1.18089 | test ppl 3.25728|test cor 0.15137
128 32 4 8 0.5 0.01
| End of training | test loss 1.16285 | test ppl 3.19903|test cor 0.02256
128 32 4 8 0.5 0.005
| End of training | test loss 1.15799 | test ppl 3.18353|test cor 0.16491
128 32 4 8 0.5 0.001
| End of training | test loss 1.21226 | test ppl 3.36107|test cor 0.17822
128 32 6 2 0.2 0.1
| End of training | test loss 1.21449 | test ppl 3.36857|test cor 0.19182
128 32 6 2 0.2 0.01
| End of training | test loss 1.16324 | test ppl 3.20028|

| End of training | test loss 1.15872 | test ppl 3.18586|test cor 0.11606
128 32 6 8 0.5 0.005
| End of training | test loss 1.18977 | test ppl 3.28634|test cor 0.08217
128 32 6 8 0.5 0.001
| End of training | test loss 1.17225 | test ppl 3.22926|test cor 0.09352
128 32 8 2 0.2 0.1
| End of training | test loss 1.17034 | test ppl 3.22308|test cor 0.15432
128 32 8 2 0.2 0.01
| End of training | test loss 1.16562 | test ppl 3.20792|test cor 0.05721
128 32 8 2 0.2 0.005
| End of training | test loss 1.17635 | test ppl 3.24253|test cor -0.22365
128 32 8 2 0.2 0.001
| End of training | test loss 1.17523 | test ppl 3.23890|test cor -0.28112
128 32 8 2 0.5 0.1
| End of training | test loss 1.17012 | test ppl 3.22238|test cor 0.18680
128 32 8 2 0.5 0.01
| End of training | test loss 1.16525 | test ppl 3.20673|test cor 0.13295
128 32 8 2 0.5 0.005
| End of training | test loss 1.19094 | test ppl 3.29019|test cor 0.19064
128 32 8 2 0.5 0.001
| End of training | test loss 1.16560 | test ppl 3.207

| End of training | test loss 1.18080 | test ppl 3.25699|test cor 0.02269
128 64 2 2 0.5 0.1
| End of training | test loss 1.18949 | test ppl 3.28542|test cor -0.00870
128 64 2 2 0.5 0.01
| End of training | test loss 1.16091 | test ppl 3.19285|test cor 0.05521
128 64 2 2 0.5 0.005
| End of training | test loss 1.16433 | test ppl 3.20378|test cor 0.06793
128 64 2 2 0.5 0.001
| End of training | test loss 1.24152 | test ppl 3.46088|test cor -0.02381
128 64 2 4 0.2 0.1
| End of training | test loss 1.18368 | test ppl 3.26637|test cor 0.07075
128 64 2 4 0.2 0.01
| End of training | test loss 1.16833 | test ppl 3.21661|test cor 0.13622
128 64 2 4 0.2 0.005
| End of training | test loss 1.15779 | test ppl 3.18289|test cor -0.01466
128 64 2 4 0.2 0.001
| End of training | test loss 1.16658 | test ppl 3.21098|test cor -0.02675
128 64 2 4 0.5 0.1
| End of training | test loss 1.20656 | test ppl 3.34198|test cor 0.02610
128 64 2 4 0.5 0.01
| End of training | test loss 1.16301 | test ppl 3.1995

| End of training | test loss 1.17254 | test ppl 3.23020|test cor -0.04354
128 64 4 4 0.2 0.005
| End of training | test loss 1.17345 | test ppl 3.23313|test cor 0.08905
128 64 4 4 0.2 0.001
| End of training | test loss 1.16913 | test ppl 3.21918|test cor -0.10837
128 64 4 4 0.5 0.1
| End of training | test loss 1.21472 | test ppl 3.36936|test cor 0.09260
128 64 4 4 0.5 0.01
| End of training | test loss 1.17245 | test ppl 3.22991|test cor 0.20227
128 64 4 4 0.5 0.005
| End of training | test loss 1.16920 | test ppl 3.21942|test cor 0.18018
128 64 4 4 0.5 0.001
| End of training | test loss 1.28518 | test ppl 3.61533|test cor -0.03896
128 64 4 8 0.2 0.1
| End of training | test loss 1.19404 | test ppl 3.30039|test cor 0.15831
128 64 4 8 0.2 0.01
| End of training | test loss 1.16331 | test ppl 3.20051|test cor 0.08111
128 64 4 8 0.2 0.005
| End of training | test loss 1.17043 | test ppl 3.22338|test cor -0.00008
128 64 4 8 0.2 0.001
| End of training | test loss 1.19623 | test ppl 3.3

| End of training | test loss 1.28462 | test ppl 3.61328|test cor 0.16507
128 64 6 8 0.2 0.1
| End of training | test loss 1.19202 | test ppl 3.29373|test cor 0.10831
128 64 6 8 0.2 0.01
| End of training | test loss 1.15698 | test ppl 3.18032|test cor 0.18918
128 64 6 8 0.2 0.005
| End of training | test loss 1.16493 | test ppl 3.20570|test cor 0.15082
128 64 6 8 0.2 0.001
| End of training | test loss 1.17246 | test ppl 3.22992|test cor -0.20072
128 64 6 8 0.5 0.1
| End of training | test loss 1.18653 | test ppl 3.27570|test cor 0.20351
128 64 6 8 0.5 0.01
| End of training | test loss 1.16437 | test ppl 3.20392|test cor -0.05696
128 64 6 8 0.5 0.005
| End of training | test loss 1.16388 | test ppl 3.20232|test cor 0.11636
128 64 6 8 0.5 0.001
| End of training | test loss 1.19627 | test ppl 3.30774|test cor -0.00455
128 64 8 2 0.2 0.1
| End of training | test loss 1.16497 | test ppl 3.20584|test cor 0.25837
128 64 8 2 0.2 0.01
| End of training | test loss 1.17019 | test ppl 3.22260

| End of training | test loss 1.16762 | test ppl 3.21434|test cor 0.08747
128 64 8 8 0.5 0.005
| End of training | test loss 1.16803 | test ppl 3.21564|test cor -0.16419
128 64 8 8 0.5 0.001
| End of training | test loss 1.26144 | test ppl 3.53051|test cor -0.02373
128 128 2 2 0.2 0.1
| End of training | test loss 1.17782 | test ppl 3.24730|test cor 0.05596
128 128 2 2 0.2 0.01
| End of training | test loss 1.16053 | test ppl 3.19162|test cor 0.06324
128 128 2 2 0.2 0.005
| End of training | test loss 1.15815 | test ppl 3.18403|test cor 0.05832
128 128 2 2 0.2 0.001
| End of training | test loss 1.17748 | test ppl 3.24618|test cor 0.15878
128 128 2 2 0.5 0.1
| End of training | test loss 1.18725 | test ppl 3.27806|test cor 0.02573
128 128 2 2 0.5 0.01
| End of training | test loss 1.16065 | test ppl 3.19199|test cor -0.00791
128 128 2 2 0.5 0.005
| End of training | test loss 1.16077 | test ppl 3.19240|test cor 0.00508
128 128 2 2 0.5 0.001
| End of training | test loss 1.20744 | test 

| End of training | test loss 1.20494 | test ppl 3.33655|test cor -0.11924
128 128 4 2 0.5 0.1
| End of training | test loss 1.20680 | test ppl 3.34278|test cor 0.07478
128 128 4 2 0.5 0.01
| End of training | test loss 1.16707 | test ppl 3.21257|test cor 0.18746
128 128 4 2 0.5 0.005
| End of training | test loss 1.18154 | test ppl 3.25938|test cor -0.08845
128 128 4 2 0.5 0.001
| End of training | test loss 1.26679 | test ppl 3.54943|test cor 0.12755
128 128 4 4 0.2 0.1
| End of training | test loss 1.20579 | test ppl 3.33941|test cor 0.15432
128 128 4 4 0.2 0.01
| End of training | test loss 1.15906 | test ppl 3.18694|test cor -0.09066
128 128 4 4 0.2 0.005
| End of training | test loss 1.16273 | test ppl 3.19865|test cor -0.15215
128 128 4 4 0.2 0.001
| End of training | test loss 1.17922 | test ppl 3.25183|test cor -0.03477
128 128 4 4 0.5 0.1
| End of training | test loss 1.17383 | test ppl 3.23437|test cor 0.14434
128 128 4 4 0.5 0.01
| End of training | test loss 1.16261 | test

| End of training | test loss 1.16649 | test ppl 3.21070|test cor 0.07006
128 128 6 4 0.2 0.005
| End of training | test loss 1.16738 | test ppl 3.21358|test cor 0.15573
128 128 6 4 0.2 0.001
| End of training | test loss 1.17680 | test ppl 3.24399|test cor -0.14288
128 128 6 4 0.5 0.1
| End of training | test loss 1.18173 | test ppl 3.25999|test cor 0.11950
128 128 6 4 0.5 0.01
| End of training | test loss 1.16932 | test ppl 3.21980|test cor 0.16327
128 128 6 4 0.5 0.005
| End of training | test loss 1.17219 | test ppl 3.22907|test cor -0.09690
128 128 6 4 0.5 0.001
| End of training | test loss 1.16696 | test ppl 3.21220|test cor 0.17218
128 128 6 8 0.2 0.1
| End of training | test loss 1.18639 | test ppl 3.27525|test cor 0.10143
128 128 6 8 0.2 0.01
| End of training | test loss 1.17066 | test ppl 3.22411|test cor 0.15995
128 128 6 8 0.2 0.005
| End of training | test loss 1.16334 | test ppl 3.20060|test cor 0.06454
128 128 6 8 0.2 0.001
| End of training | test loss 1.23489 | test

| End of training | test loss 1.20535 | test ppl 3.33793|test cor -0.14850
128 128 8 8 0.2 0.1
| End of training | test loss 1.23161 | test ppl 3.42676|test cor 0.21391
128 128 8 8 0.2 0.01
| End of training | test loss 1.16664 | test ppl 3.21118|test cor 0.19052
128 128 8 8 0.2 0.005
| End of training | test loss 1.15728 | test ppl 3.18127|test cor -0.20249
128 128 8 8 0.2 0.001
| End of training | test loss 1.21455 | test ppl 3.36876|test cor 0.11735
128 128 8 8 0.5 0.1
| End of training | test loss 1.19059 | test ppl 3.28903|test cor 0.18917
128 128 8 8 0.5 0.01
| End of training | test loss 1.16012 | test ppl 3.19031|test cor 0.11569
128 128 8 8 0.5 0.005
| End of training | test loss 1.16873 | test ppl 3.21791|test cor 0.07011
128 128 8 8 0.5 0.001
| End of training | test loss 1.18404 | test ppl 3.26754|test cor -0.01359
128 256 2 2 0.2 0.1
| End of training | test loss 1.18526 | test ppl 3.27155|test cor 0.04972
128 256 2 2 0.2 0.01
| End of training | test loss 1.16046 | test p

| End of training | test loss 1.16261 | test ppl 3.19826|test cor -0.04771
128 256 2 8 0.5 0.005
| End of training | test loss 1.18225 | test ppl 3.26169|test cor 0.13795
128 256 2 8 0.5 0.001
| End of training | test loss 1.17305 | test ppl 3.23183|test cor 0.05019
128 256 4 2 0.2 0.1
| End of training | test loss 1.18890 | test ppl 3.28347|test cor 0.12821
128 256 4 2 0.2 0.01
| End of training | test loss 1.16139 | test ppl 3.19436|test cor 0.07610
128 256 4 2 0.2 0.005
| End of training | test loss 1.17029 | test ppl 3.22293|test cor 0.05654
128 256 4 2 0.2 0.001
| End of training | test loss 1.17367 | test ppl 3.23384|test cor -0.12203
128 256 4 2 0.5 0.1
| End of training | test loss 1.17843 | test ppl 3.24927|test cor 0.08476
128 256 4 2 0.5 0.01
| End of training | test loss 1.16137 | test ppl 3.19431|test cor 0.10737
128 256 4 2 0.5 0.005
| End of training | test loss 1.17708 | test ppl 3.24489|test cor -0.01579
128 256 4 2 0.5 0.001
| End of training | test loss 1.23323 | tes

| End of training | test loss 1.18342 | test ppl 3.26553|test cor 0.05041
128 256 6 2 0.5 0.1
| End of training | test loss 1.20717 | test ppl 3.34401|test cor 0.14919
128 256 6 2 0.5 0.01
| End of training | test loss 1.16542 | test ppl 3.20727|test cor 0.14383
128 256 6 2 0.5 0.005
| End of training | test loss 1.17853 | test ppl 3.24958|test cor 0.21313
128 256 6 2 0.5 0.001
| End of training | test loss 1.17967 | test ppl 3.25329|test cor -0.12213
128 256 6 4 0.2 0.1
| End of training | test loss 1.19664 | test ppl 3.30899|test cor 0.14995
128 256 6 4 0.2 0.01
| End of training | test loss 1.16554 | test ppl 3.20766|test cor 0.05847
128 256 6 4 0.2 0.005
| End of training | test loss 1.17826 | test ppl 3.24873|test cor 0.18431
128 256 6 4 0.2 0.001
| End of training | test loss 1.18290 | test ppl 3.26384|test cor 0.20028
128 256 6 4 0.5 0.1
| End of training | test loss 1.16147 | test ppl 3.19462|test cor 0.16653
128 256 6 4 0.5 0.01
| End of training | test loss 1.16251 | test ppl

| End of training | test loss 1.16362 | test ppl 3.20151|test cor 0.08360
128 256 8 4 0.2 0.005
| End of training | test loss 1.16069 | test ppl 3.19212|test cor 0.04846
128 256 8 4 0.2 0.001
| End of training | test loss 1.16187 | test ppl 3.19589|test cor -0.11481
128 256 8 4 0.5 0.1
| End of training | test loss 1.18855 | test ppl 3.28233|test cor 0.18264
128 256 8 4 0.5 0.01
| End of training | test loss 1.16779 | test ppl 3.21488|test cor 0.15308
128 256 8 4 0.5 0.005
| End of training | test loss 1.17308 | test ppl 3.23194|test cor 0.17748
128 256 8 4 0.5 0.001
| End of training | test loss 1.17914 | test ppl 3.25159|test cor -0.24857
128 256 8 8 0.2 0.1
| End of training | test loss 1.18402 | test ppl 3.26749|test cor 0.14200
128 256 8 8 0.2 0.01
| End of training | test loss 1.17101 | test ppl 3.22524|test cor -0.06628
128 256 8 8 0.2 0.005
| End of training | test loss 1.15685 | test ppl 3.17989|test cor -0.03194
128 256 8 8 0.2 0.001
| End of training | test loss 1.16926 | te

| End of training | test loss 1.20060 | test ppl 3.32211|test cor 0.11053
256 8 2 8 0.2 0.1
| End of training | test loss 1.23334 | test ppl 3.43266|test cor 0.05468
256 8 2 8 0.2 0.01
| End of training | test loss 1.16320 | test ppl 3.20016|test cor 0.10427
256 8 2 8 0.2 0.005
| End of training | test loss 1.17449 | test ppl 3.23651|test cor 0.06561
256 8 2 8 0.2 0.001
| End of training | test loss 1.17797 | test ppl 3.24777|test cor 0.04415
256 8 2 8 0.5 0.1
| End of training | test loss 1.20967 | test ppl 3.35238|test cor 0.15607
256 8 2 8 0.5 0.01
| End of training | test loss 1.17012 | test ppl 3.22239|test cor 0.05171
256 8 2 8 0.5 0.005
| End of training | test loss 1.17983 | test ppl 3.25381|test cor 0.07555
256 8 2 8 0.5 0.001
| End of training | test loss 1.16962 | test ppl 3.22078|test cor 0.01943
256 8 4 2 0.2 0.1
| End of training | test loss 1.36806 | test ppl 3.92773|test cor 0.10702
256 8 4 2 0.2 0.01
| End of training | test loss 1.17014 | test ppl 3.22243|test cor 0.1

| End of training | test loss 1.15776 | test ppl 3.18280|test cor -0.04972
256 8 4 8 0.5 0.005
| End of training | test loss 1.15914 | test ppl 3.18718|test cor -0.09681
256 8 4 8 0.5 0.001
| End of training | test loss 1.28436 | test ppl 3.61237|test cor 0.11172
256 8 6 2 0.2 0.1
| End of training | test loss 1.18361 | test ppl 3.26614|test cor 0.20131
256 8 6 2 0.2 0.01
| End of training | test loss 1.16087 | test ppl 3.19272|test cor -0.05571
256 8 6 2 0.2 0.005
| End of training | test loss 1.16484 | test ppl 3.20542|test cor -0.01377
256 8 6 2 0.2 0.001
| End of training | test loss 1.19011 | test ppl 3.28744|test cor -0.18309
256 8 6 2 0.5 0.1
| End of training | test loss 1.15762 | test ppl 3.18234|test cor 0.20146
256 8 6 2 0.5 0.01
| End of training | test loss 1.17048 | test ppl 3.22354|test cor 0.15866
256 8 6 2 0.5 0.005
| End of training | test loss 1.16479 | test ppl 3.20526|test cor 0.04400
256 8 6 2 0.5 0.001
| End of training | test loss 1.18576 | test ppl 3.27316|test

| End of training | test loss 1.17475 | test ppl 3.23734|test cor 0.07221
256 8 8 2 0.5 0.1
| End of training | test loss 1.20480 | test ppl 3.33608|test cor 0.18147
256 8 8 2 0.5 0.01
| End of training | test loss 1.16157 | test ppl 3.19494|test cor 0.19066
256 8 8 2 0.5 0.005
| End of training | test loss 1.16051 | test ppl 3.19155|test cor 0.07945
256 8 8 2 0.5 0.001
| End of training | test loss 1.23026 | test ppl 3.42212|test cor 0.18819
256 8 8 4 0.2 0.1
| End of training | test loss 1.18896 | test ppl 3.28366|test cor 0.21760
256 8 8 4 0.2 0.01
| End of training | test loss 1.16664 | test ppl 3.21119|test cor -0.03733
256 8 8 4 0.2 0.005
| End of training | test loss 1.17101 | test ppl 3.22526|test cor 0.19196
256 8 8 4 0.2 0.001
| End of training | test loss 1.17675 | test ppl 3.24382|test cor -0.07038
256 8 8 4 0.5 0.1
| End of training | test loss 1.20924 | test ppl 3.35093|test cor 0.15118
256 8 8 4 0.5 0.01
| End of training | test loss 1.16588 | test ppl 3.20874|test cor 0

| End of training | test loss 1.15780 | test ppl 3.18293|test cor 0.05031
256 32 2 4 0.2 0.005
| End of training | test loss 1.15995 | test ppl 3.18978|test cor 0.05395
256 32 2 4 0.2 0.001
| End of training | test loss 1.16993 | test ppl 3.22178|test cor -0.07890
256 32 2 4 0.5 0.1
| End of training | test loss 1.25186 | test ppl 3.49683|test cor 0.01511
256 32 2 4 0.5 0.01
| End of training | test loss 1.16038 | test ppl 3.19115|test cor 0.08089
256 32 2 4 0.5 0.005
| End of training | test loss 1.16547 | test ppl 3.20744|test cor 0.11452
256 32 2 4 0.5 0.001
| End of training | test loss 1.17654 | test ppl 3.24312|test cor -0.05591
256 32 2 8 0.2 0.1
| End of training | test loss 1.21412 | test ppl 3.36733|test cor 0.03341
256 32 2 8 0.2 0.01
| End of training | test loss 1.16181 | test ppl 3.19570|test cor 0.11096
256 32 2 8 0.2 0.005
| End of training | test loss 1.15894 | test ppl 3.18655|test cor 0.09495
256 32 2 8 0.2 0.001
| End of training | test loss 1.17252 | test ppl 3.230

| End of training | test loss 1.25837 | test ppl 3.51967|test cor 0.16845
256 32 4 8 0.2 0.1
| End of training | test loss 1.26723 | test ppl 3.55102|test cor 0.15630
256 32 4 8 0.2 0.01
| End of training | test loss 1.16303 | test ppl 3.19960|test cor 0.06941
256 32 4 8 0.2 0.005
| End of training | test loss 1.16805 | test ppl 3.21573|test cor -0.01302
256 32 4 8 0.2 0.001
| End of training | test loss 1.16724 | test ppl 3.21312|test cor 0.13084
256 32 4 8 0.5 0.1
| End of training | test loss 1.26427 | test ppl 3.54051|test cor 0.19252
256 32 4 8 0.5 0.01
| End of training | test loss 1.16258 | test ppl 3.19817|test cor 0.02801
256 32 4 8 0.5 0.005
| End of training | test loss 1.18436 | test ppl 3.26861|test cor 0.16604
256 32 4 8 0.5 0.001
| End of training | test loss 1.21512 | test ppl 3.37071|test cor -0.18279
256 32 6 2 0.2 0.1
| End of training | test loss 1.26228 | test ppl 3.53346|test cor 0.07710
256 32 6 2 0.2 0.01
| End of training | test loss 1.17413 | test ppl 3.23532|

| End of training | test loss 1.17309 | test ppl 3.23195|test cor 0.13843
256 32 6 8 0.5 0.005
| End of training | test loss 1.17655 | test ppl 3.24315|test cor -0.07958
256 32 6 8 0.5 0.001
| End of training | test loss 1.24454 | test ppl 3.47134|test cor 0.07627
256 32 8 2 0.2 0.1
| End of training | test loss 1.18208 | test ppl 3.26114|test cor 0.25894
256 32 8 2 0.2 0.01
| End of training | test loss 1.16301 | test ppl 3.19954|test cor 0.23981
256 32 8 2 0.2 0.005
| End of training | test loss 1.16392 | test ppl 3.20248|test cor 0.16444
256 32 8 2 0.2 0.001
| End of training | test loss 1.17506 | test ppl 3.23833|test cor 0.21681
256 32 8 2 0.5 0.1
| End of training | test loss 1.21602 | test ppl 3.37372|test cor 0.12243
256 32 8 2 0.5 0.01
| End of training | test loss 1.16428 | test ppl 3.20361|test cor 0.13507
256 32 8 2 0.5 0.005
| End of training | test loss 1.18402 | test ppl 3.26748|test cor -0.05844
256 32 8 2 0.5 0.001
| End of training | test loss 1.17443 | test ppl 3.236

| End of training | test loss 1.16726 | test ppl 3.21317|test cor -0.01276
256 64 2 2 0.5 0.1
| End of training | test loss 1.23930 | test ppl 3.45320|test cor 0.04282
256 64 2 2 0.5 0.01
| End of training | test loss 1.16088 | test ppl 3.19275|test cor 0.04539
256 64 2 2 0.5 0.005
| End of training | test loss 1.15911 | test ppl 3.18710|test cor 0.04983
256 64 2 2 0.5 0.001
| End of training | test loss 1.16096 | test ppl 3.19300|test cor 0.13708
256 64 2 4 0.2 0.1
| End of training | test loss 1.23267 | test ppl 3.43037|test cor 0.04618
256 64 2 4 0.2 0.01
| End of training | test loss 1.16340 | test ppl 3.20080|test cor 0.13074
256 64 2 4 0.2 0.005
| End of training | test loss 1.16735 | test ppl 3.21347|test cor 0.08570
256 64 2 4 0.2 0.001
| End of training | test loss 1.20479 | test ppl 3.33605|test cor -0.00636
256 64 2 4 0.5 0.1
| End of training | test loss 1.21824 | test ppl 3.38123|test cor -0.02645
256 64 2 4 0.5 0.01
| End of training | test loss 1.16068 | test ppl 3.19209

| End of training | test loss 1.16378 | test ppl 3.20201|test cor 0.10455
256 64 4 4 0.2 0.005
| End of training | test loss 1.16421 | test ppl 3.20338|test cor 0.09933
256 64 4 4 0.2 0.001
| End of training | test loss 1.17422 | test ppl 3.23563|test cor 0.07184
256 64 4 4 0.5 0.1
| End of training | test loss 1.24704 | test ppl 3.48001|test cor 0.20861
256 64 4 4 0.5 0.01
| End of training | test loss 1.16015 | test ppl 3.19042|test cor 0.11892
256 64 4 4 0.5 0.005
| End of training | test loss 1.16784 | test ppl 3.21503|test cor 0.17872
256 64 4 4 0.5 0.001
| End of training | test loss 1.22936 | test ppl 3.41904|test cor 0.11140
256 64 4 8 0.2 0.1
| End of training | test loss 1.31621 | test ppl 3.72928|test cor 0.12339
256 64 4 8 0.2 0.01
| End of training | test loss 1.16068 | test ppl 3.19211|test cor -0.10197
256 64 4 8 0.2 0.005
| End of training | test loss 1.16369 | test ppl 3.20173|test cor 0.13109
256 64 4 8 0.2 0.001
| End of training | test loss 1.18068 | test ppl 3.2565

| End of training | test loss 1.27833 | test ppl 3.59065|test cor 0.05984
256 64 6 8 0.2 0.1
| End of training | test loss 1.19111 | test ppl 3.29074|test cor 0.04606
256 64 6 8 0.2 0.01
| End of training | test loss 1.16539 | test ppl 3.20718|test cor 0.14145
256 64 6 8 0.2 0.005
| End of training | test loss 1.17306 | test ppl 3.23186|test cor -0.08697
256 64 6 8 0.2 0.001
| End of training | test loss 1.18990 | test ppl 3.28675|test cor 0.18281
256 64 6 8 0.5 0.1
| End of training | test loss 1.17908 | test ppl 3.25139|test cor 0.16236
256 64 6 8 0.5 0.01
| End of training | test loss 1.16708 | test ppl 3.21260|test cor 0.08730
256 64 6 8 0.5 0.005
| End of training | test loss 1.18556 | test ppl 3.27251|test cor 0.01838
256 64 6 8 0.5 0.001
| End of training | test loss 1.20256 | test ppl 3.32864|test cor -0.11181
256 64 8 2 0.2 0.1
| End of training | test loss 1.26451 | test ppl 3.54136|test cor 0.21544
256 64 8 2 0.2 0.01
| End of training | test loss 1.16599 | test ppl 3.20909|

| End of training | test loss 1.16449 | test ppl 3.20428|test cor 0.21661
256 64 8 8 0.5 0.005
| End of training | test loss 1.17410 | test ppl 3.23523|test cor -0.19782
256 64 8 8 0.5 0.001
| End of training | test loss 1.17613 | test ppl 3.24179|test cor 0.08456
256 128 2 2 0.2 0.1
| End of training | test loss 1.28754 | test ppl 3.62385|test cor 0.04791
256 128 2 2 0.2 0.01
| End of training | test loss 1.15908 | test ppl 3.18699|test cor -0.12489
256 128 2 2 0.2 0.005
| End of training | test loss 1.15841 | test ppl 3.18485|test cor -0.09090
256 128 2 2 0.2 0.001
| End of training | test loss 1.16356 | test ppl 3.20130|test cor -0.07205
256 128 2 2 0.5 0.1
| End of training | test loss 1.21962 | test ppl 3.38591|test cor 0.02010
256 128 2 2 0.5 0.01
| End of training | test loss 1.16133 | test ppl 3.19418|test cor 0.03377
256 128 2 2 0.5 0.005
| End of training | test loss 1.18151 | test ppl 3.25931|test cor -0.17651
256 128 2 2 0.5 0.001
| End of training | test loss 1.16090 | tes

| End of training | test loss 1.17080 | test ppl 3.22457|test cor 0.17332
256 128 4 2 0.5 0.1
| End of training | test loss 1.22139 | test ppl 3.39190|test cor 0.13621
256 128 4 2 0.5 0.01
| End of training | test loss 1.16921 | test ppl 3.21944|test cor 0.17942
256 128 4 2 0.5 0.005
| End of training | test loss 1.16467 | test ppl 3.20486|test cor 0.18819
256 128 4 2 0.5 0.001
| End of training | test loss 1.16961 | test ppl 3.22075|test cor -0.08258
256 128 4 4 0.2 0.1
| End of training | test loss 1.19450 | test ppl 3.30191|test cor 0.09866
256 128 4 4 0.2 0.01
| End of training | test loss 1.16520 | test ppl 3.20658|test cor -0.08962
256 128 4 4 0.2 0.005
| End of training | test loss 1.15949 | test ppl 3.18829|test cor -0.06518
256 128 4 4 0.2 0.001
| End of training | test loss 1.22306 | test ppl 3.39757|test cor 0.04009
256 128 4 4 0.5 0.1
| End of training | test loss 1.26664 | test ppl 3.54890|test cor 0.18681
256 128 4 4 0.5 0.01
| End of training | test loss 1.16980 | test p

| End of training | test loss 1.17162 | test ppl 3.22722|test cor 0.10977
256 128 6 4 0.2 0.005
| End of training | test loss 1.16650 | test ppl 3.21074|test cor 0.17979
256 128 6 4 0.2 0.001
| End of training | test loss 1.16818 | test ppl 3.21613|test cor 0.15045
256 128 6 4 0.5 0.1
| End of training | test loss 1.16686 | test ppl 3.21189|test cor 0.12392
256 128 6 4 0.5 0.01
| End of training | test loss 1.16478 | test ppl 3.20522|test cor 0.19398
256 128 6 4 0.5 0.005
| End of training | test loss 1.18059 | test ppl 3.25629|test cor 0.01952
256 128 6 4 0.5 0.001
| End of training | test loss 1.27654 | test ppl 3.58421|test cor 0.14867
256 128 6 8 0.2 0.1
| End of training | test loss 1.18483 | test ppl 3.27014|test cor 0.12968
256 128 6 8 0.2 0.01
| End of training | test loss 1.16321 | test ppl 3.20019|test cor 0.15288
256 128 6 8 0.2 0.005
| End of training | test loss 1.17656 | test ppl 3.24320|test cor 0.14831
256 128 6 8 0.2 0.001
| End of training | test loss 1.21350 | test p

| End of training | test loss 1.20907 | test ppl 3.35038|test cor -0.21757
256 128 8 8 0.2 0.1
| End of training | test loss 1.19467 | test ppl 3.30245|test cor 0.23917
256 128 8 8 0.2 0.01
| End of training | test loss 1.16550 | test ppl 3.20752|test cor 0.12852
256 128 8 8 0.2 0.005
| End of training | test loss 1.17353 | test ppl 3.23338|test cor -0.15784
256 128 8 8 0.2 0.001
| End of training | test loss 1.19914 | test ppl 3.31728|test cor 0.14034
256 128 8 8 0.5 0.1
| End of training | test loss 1.19557 | test ppl 3.30545|test cor 0.03622
256 128 8 8 0.5 0.01
| End of training | test loss 1.17794 | test ppl 3.24769|test cor 0.21891
256 128 8 8 0.5 0.005
| End of training | test loss 1.16320 | test ppl 3.20015|test cor 0.03484
256 128 8 8 0.5 0.001
| End of training | test loss 1.19634 | test ppl 3.30800|test cor -0.18154
256 256 2 2 0.2 0.1
| End of training | test loss 1.21799 | test ppl 3.38040|test cor 0.04966
256 256 2 2 0.2 0.01
| End of training | test loss 1.15814 | test p

| End of training | test loss 1.15823 | test ppl 3.18428|test cor 0.09576
256 256 2 8 0.5 0.005
| End of training | test loss 1.16466 | test ppl 3.20482|test cor 0.01677
256 256 2 8 0.5 0.001
| End of training | test loss 1.23103 | test ppl 3.42474|test cor 0.17782
256 256 4 2 0.2 0.1
| End of training | test loss 1.16771 | test ppl 3.21462|test cor 0.19044
256 256 4 2 0.2 0.01
| End of training | test loss 1.16793 | test ppl 3.21532|test cor 0.10018
256 256 4 2 0.2 0.005
| End of training | test loss 1.16200 | test ppl 3.19633|test cor 0.09731
256 256 4 2 0.2 0.001
| End of training | test loss 1.17691 | test ppl 3.24435|test cor 0.08745
256 256 4 2 0.5 0.1
| End of training | test loss 1.26456 | test ppl 3.54155|test cor 0.14482
256 256 4 2 0.5 0.01
| End of training | test loss 1.15928 | test ppl 3.18762|test cor 0.01236
256 256 4 2 0.5 0.005
| End of training | test loss 1.16183 | test ppl 3.19577|test cor -0.07693
256 256 4 2 0.5 0.001
| End of training | test loss 1.24807 | test 

| End of training | test loss 1.17728 | test ppl 3.24554|test cor -0.08474
256 256 6 2 0.5 0.1
| End of training | test loss 1.17758 | test ppl 3.24652|test cor 0.16027
256 256 6 2 0.5 0.01
| End of training | test loss 1.16800 | test ppl 3.21555|test cor -0.06253
256 256 6 2 0.5 0.005
| End of training | test loss 1.16897 | test ppl 3.21867|test cor 0.04285
256 256 6 2 0.5 0.001
| End of training | test loss 1.25175 | test ppl 3.49646|test cor -0.12423
256 256 6 4 0.2 0.1
| End of training | test loss 1.17689 | test ppl 3.24426|test cor 0.16589
256 256 6 4 0.2 0.01
| End of training | test loss 1.16129 | test ppl 3.19406|test cor 0.22591
256 256 6 4 0.2 0.005
| End of training | test loss 1.16752 | test ppl 3.21401|test cor 0.16029
256 256 6 4 0.2 0.001
| End of training | test loss 1.22042 | test ppl 3.38860|test cor 0.03404
256 256 6 4 0.5 0.1
| End of training | test loss 1.29851 | test ppl 3.66382|test cor 0.20779
256 256 6 4 0.5 0.01
| End of training | test loss 1.16636 | test p

| End of training | test loss 1.17775 | test ppl 3.24706|test cor -0.01188
256 256 8 4 0.2 0.005
| End of training | test loss 1.17302 | test ppl 3.23172|test cor 0.15213
256 256 8 4 0.2 0.001
| End of training | test loss 1.16862 | test ppl 3.21755|test cor -0.17611
256 256 8 4 0.5 0.1
| End of training | test loss 1.17124 | test ppl 3.22598|test cor 0.12071
256 256 8 4 0.5 0.01
| End of training | test loss 1.16350 | test ppl 3.20113|test cor 0.09215
256 256 8 4 0.5 0.005
| End of training | test loss 1.19392 | test ppl 3.30000|test cor 0.15594
256 256 8 4 0.5 0.001
| End of training | test loss 1.20934 | test ppl 3.35129|test cor 0.13047
256 256 8 8 0.2 0.1
| End of training | test loss 1.17873 | test ppl 3.25024|test cor 0.21407
256 256 8 8 0.2 0.01
| End of training | test loss 1.15656 | test ppl 3.17898|test cor 0.12842
256 256 8 8 0.2 0.005
| End of training | test loss 1.16066 | test ppl 3.19205|test cor 0.04053
256 256 8 8 0.2 0.001
| End of training | test loss 1.18184 | test

In [22]:
print(test_mse[np.argmax(np.array(test_mse)[:,-1])])

[8, 8, 4, 2, 0.2, 0.001, 1.9330300315518245]


In [23]:
print(test_mse[np.argmin(np.array(test_mse)[:,-1])])

[8, 32, 2, 8, 0.2, 0.01, 1.1523562502638203]
