# Music 103 diffusion version

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import numpy as np
import copy
import pandas as pd
from tqdm import tqdm
from os.path import exists
from os import remove, chdir
import pickle

In [2]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)
        
    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask == 0, -1e9)
        attn_probs = torch.softmax(attn_scores, dim=-1)
        output = torch.matmul(attn_probs, V)
        return output
        
    def split_heads(self, x):
        batch_size, seq_length, d_model = x.size()
        return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)
        
    def combine_heads(self, x):
        batch_size, _, seq_length, d_k = x.size()
        return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)
        
    def forward(self, Q, K, V, mask=None):
        Q = self.split_heads(self.W_q(Q))
        K = self.split_heads(self.W_k(K))
        V = self.split_heads(self.W_v(V))
        
        attn_output = self.scaled_dot_product_attention(Q, K, V, mask)
        output = self.W_o(self.combine_heads(attn_output))
        return output
    
class PositionWiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super(PositionWiseFeedForward, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))
    
class PositionalEncoding(nn.Module):
    def __init__(self, d_model):
        super(PositionalEncoding, self).__init__()
        self.d_model = d_model

    def forward(self, x):
        max_len = x.size(1)
        pe = torch.zeros(max_len, self.d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, self.d_model, 2).float() * -(math.log(10000.0) / self.d_model))
        
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        
        pe = pe.unsqueeze(0).to(x.device)
        return x + pe


class DecoderPositionalEncoding(nn.Module):
    def __init__(self, d_model):
        super().__init__()
        self.d_model = d_model

    def forward(self, x, tgt):
        # max_len = x.size(1)
        tgt_one_hot = tgt[:, :, 12:]
        tgt_class = torch.argmax(tgt_one_hot, dim=-1)
        pe = torch.zeros_like(x)
        position = torch.cumsum(tgt_class, dim=1).unsqueeze(-1)
        div_term = torch.exp(torch.arange(0, self.d_model, 2).float() * -(math.log(10000.0) / self.d_model)).to(position.device)
        
        pe[:, :, 0::2] = torch.sin(position * div_term)
        pe[:, :, 1::2] = torch.cos(position * div_term)
        return x + pe

    
class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x, mask):
        attn_output = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout(ff_output))
        return x

class EmbedHead(nn.Module):
    def __init__(
        self,
        input_dim,
        inner_dim_1,
        inner_dim_2,
        out_dim
    ):
        super().__init__()
        self.linear1 = nn.Linear(input_dim, inner_dim_1)
        self.linear2 = nn.Linear(inner_dim_1, inner_dim_2)
        self.linear3 = nn.Linear(inner_dim_2, out_dim)
        self.activation_fn = nn.functional.gelu

    def forward(self, x):
        x = self.linear1(x)
        x = self.activation_fn(x)
        x = self.linear2(x)
        x = self.activation_fn(x)
        x = self.linear3(x)
        return x
    

class EmbedFC(nn.Module):
    def __init__(self, input_dim, emb_dim):
        super(EmbedFC, self).__init__()
        self.input_dim = input_dim
        layers = [
            nn.Linear(input_dim, emb_dim),
            nn.GELU(),
            nn.Linear(emb_dim, emb_dim),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        x = x.view(-1, self.input_dim)
        return self.model(x)


class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout):
        super().__init__()
        self.encoder_embedding = EmbedHead(src_vocab_size, d_model, d_model, d_model)
        self.decoder_embedding = EmbedHead(tgt_vocab_size, d_model, d_model, d_model)
        self.positional_encoding = PositionalEncoding(d_model)
        self.time_embeddings = nn.ModuleList([EmbedFC(1, d_model) for _ in range(num_layers)])
        self.decoder_positional_encoding = PositionalEncoding(d_model)

        self.encoder_layers_src = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.encoder_layers_noise = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.encoder_layers_tgt = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.output_layer = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, x, time):
        src_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(src)))
        x_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(x)))

        enc_src = src_embedded
        for enc_layer in self.encoder_layers_src:
            enc_src = enc_layer(enc_src, None)

        enc_noise = x_embedded
        for enc_layer in self.encoder_layers_noise:
            enc_noise = enc_layer(enc_noise, None)

        enc_tgt = enc_noise
        for i, enc_layer in enumerate(self.encoder_layers_tgt):
            time_embedding = self.time_embeddings[i](time).unsqueeze(1)
            enc_tgt = enc_tgt + enc_src + time_embedding
            enc_tgt = enc_layer(enc_tgt, None)
        
        output = self.output_layer(enc_tgt)
        
        return output
    


In [3]:
class VQVAE(nn.Module):
    def __init__(self, vocab_size, d_model, num_heads, num_layers, d_ff, dropout, codebook_size, d_codebook):
        super().__init__()
        self.encoder_embedding = EmbedHead(vocab_size, d_model, d_model, d_model)
        self.positional_encoding = PositionalEncoding(d_model)
        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.encoder_output = nn.Linear(d_model, d_codebook)
        self.codebook = nn.Embedding(codebook_size, d_codebook)
        self.codebook.weight.data.uniform_(-1/d_codebook, 1/d_codebook)
        self.decoder_embedding = EmbedHead(d_codebook, d_model, d_model, d_model)
        self.decoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_output = nn.Linear(d_model, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def encode(self, x):
        embedding = self.dropout(self.positional_encoding(self.encoder_embedding(x)))
        for i, enc_layer in enumerate(self.encoder_layers):
            embedding = enc_layer(embedding, None)
        return self.encoder_output(embedding)
    
    def vq(self, z):
        # z: [batch_size, seq_length, d_codebook]
        distance = (z.unsqueeze(2) - self.codebook.weight.unsqueeze(0).unsqueeze(0)).pow(2).mean(dim=-1)
        _, indices = torch.min(distance, dim=-1)
        # print(indices)
        return self.codebook(indices)

    def decode(self, z):
        embedding = self.dropout(self.positional_encoding(self.decoder_embedding(z)))
        for i, dec_layer in enumerate(self.decoder_layers):
            embedding = dec_layer(embedding, None)
        return torch.sigmoid(self.decoder_output(embedding))
    
    def forward(self, x):
        # x: [batch_size, seq_length, vocab_size]
        z = self.encode(x)
        z_vq = self.vq(z)
        z_straight_through = (z_vq - z).detach() + z
        x_recon = self.decode(z_straight_through)
        recon_loss = nn.functional.binary_cross_entropy(x_recon, x)
        embed_loss = nn.functional.mse_loss(z_vq, z.detach())
        commit_loss = nn.functional.mse_loss(z, z_vq.detach())
        return x_recon, recon_loss, embed_loss, commit_loss


# **1. DDPM**


# a. Building Blocks

# b. DDPM Schedules

In [4]:
def ddpm_schedules(beta1, beta2, T):
    assert beta1 < beta2 < 1.0, "beta1 and beta2 must be in (0, 1)"

    ##################
    ### Problem 1 (a): Implement ddpm_schedules()
    beta_t = torch.linspace(beta1, beta2, T).float()

    alpha_t = 1 - beta_t
    oneover_sqrta = 1 / torch.sqrt(alpha_t)
    sqrt_beta_t = torch.sqrt(beta_t)
    alphabar_t = torch.cumprod(alpha_t, dim=0)
    sqrtab = torch.sqrt(alphabar_t)
    sqrtmab = torch.sqrt(1 - alphabar_t)
    mab_over_sqrtmab_inv = (1 - alpha_t) / torch.sqrt(1 - alphabar_t)
    ##################
    ##################

    return {
        "alpha_t": alpha_t,  # \alpha_t
        "oneover_sqrta": oneover_sqrta,  # 1/\sqrt{\alpha_t}
        "sqrt_beta_t": sqrt_beta_t,  # \sqrt{\beta_t}
        "alphabar_t": alphabar_t,  # \bar{\alpha_t}
        "sqrtab": sqrtab,  # \sqrt{\bar{\alpha_t}}
        "sqrtmab": sqrtmab,  # \sqrt{1-\bar{\alpha_t}}
        "mab_over_sqrtmab": mab_over_sqrtmab_inv,  # (1-\alpha_t)/\sqrt{1-\bar{\alpha_t}}
    }

# c. DDPM Main Module



Here the noise $\sigma_t^2=\beta_t$

In [5]:
class DDPM(nn.Module):
    def __init__(self, nn_model, betas, n_T, device, n_inference=None, drop_prob=0.1):
        super(DDPM, self).__init__()
        self.nn_model = nn_model.to(device)

        for k, v in ddpm_schedules(betas[0], betas[1], n_T).items():
            self.register_buffer(k, v)
        
        self.n_T = n_T
        self.n_inference = n_inference if n_inference else n_T 
        
        for k, v in ddpm_schedules(betas[0], betas[1], self.n_inference).items():
            self.register_buffer(k+'_KAIMING', v)

        self.device = device
        self.drop_prob = drop_prob
        self.loss_mse = nn.MSELoss()

    def forward(self, src, tgt):
        ##################
        ### Problem 1 (b): Implement forward()
        t = torch.randint(0, self.n_T, (tgt.size(0),), device=self.device)
        sqrtab_t, sqrtmab_t = self.sqrtab[t].view(-1, 1, 1), \
            self.sqrtmab[t].view(-1, 1, 1)

        noise = torch.randn_like(tgt).to(self.device)  # Define noise tensor
        x_t = sqrtab_t * tgt + sqrtmab_t * noise

        # mask out with probability
        context_mask = torch.bernoulli(torch.zeros(src.shape[0])+1 - self.drop_prob).unsqueeze(-1).unsqueeze(-1).to(self.device)

        pred_noise = self.nn_model(src * context_mask, x_t, t / (self.n_T - 1))
        loss = self.loss_mse(pred_noise, noise) 
        ##################
        ##################

        return loss

    @torch.no_grad()
    def sample(self, src, guide_w=0.0):
        n_sample = src.shape[0]
        x_i = torch.randn(*src.shape).to(self.device)
        c_i = src.to(self.device).clone()
        c_i = c_i.repeat(2, 1, 1)
        context_mask = torch.zeros_like(c_i).to(self.device)
        context_mask[:n_sample] = 1.0  # second half context-free

        for i in range(int(self.n_inference), 0, -1):

            ##################
            ### Problem 1 (c): Implement sample()
            t = torch.full((n_sample,), (i - 1) / (self.n_inference - 1)).to(self.device).float()
            t_i = t.view(-1, 1, 1)

            # double batch
            x_i = x_i.repeat(2, 1, 1)
            t_i = t_i.repeat(2, 1, 1)

            z = torch.randn(*src.shape).to(self.device) if i > 1 else 0 # if last step, no noise
            # classifier-free guidance
            pred_full = self.nn_model(c_i * context_mask, x_i, t_i)
            pred_1, pred_2 = pred_full[:n_sample], pred_full[n_sample:]
            pred_noise = (1 + guide_w) * pred_1 - guide_w * pred_2
            x_i = x_i[:n_sample]
            x_i = self.oneover_sqrta_KAIMING[i - 1] * (x_i - pred_noise * self.mab_over_sqrtmab_KAIMING[i - 1])\
                + self.sqrt_beta_t_KAIMING[i - 1] * z
        return x_i

# c. Training Function

In [6]:
from tqdm import tqdm

def train_main_loop(ddpm, vqvae, optim, trainset, validset, lr, n_epoch, device, guide_w, patience):
    wait = 0
    min_valid_loss = float('inf')
    for ep in tqdm(range(n_epoch)):
        ddpm.train()

        # linear lrate decay
        optim.param_groups[0]['lr'] = lr*(1-ep/n_epoch)
        loss_ema = None
        # train
        for idx, src, tgt in trainset:
            optim.zero_grad()
            tgt = tgt.to(device)
            src = src.to(device)
            tgt_enc = vqvae.encode(tgt)
            loss = ddpm(src, tgt_enc)
            loss.backward()
            if loss_ema is None:
                loss_ema = loss.item()
            else:
                loss_ema = 0.95 * loss_ema + 0.05 * loss.item()
            optim.step()
            
        # validation
        ddpm.eval()
        total_loss = 0
        with torch.no_grad():
            for idx, src, tgt in validset:
                tgt = tgt.to(device)
                src = src.to(device)
                tgt_enc = vqvae.encode(tgt)
                loss = ddpm(src, tgt_enc)
                total_loss += loss.item()
        avg_valid_loss = total_loss / len(validset)

        # early stopping
        if avg_valid_loss < min_valid_loss:
            min_valid_loss = avg_valid_loss
            torch.save(ddpm.nn_model.state_dict(), f"model_best_diffusion.pt")
            print(f'epoch {ep}, train_loss: {loss_ema:.4f}, valid loss: {avg_valid_loss:.4f}')
            wait = 0
        else:
            print(f'epoch {ep}, train_loss: {loss_ema:.4f}, valid loss: {avg_valid_loss:.4f}, min_valid_loss: {min_valid_loss:.4f}, wait: {wait} / {patience}')
            wait += 1
        if wait >= patience:
            break

def eval_main_loop(ddpm, vqvae, checkpoint, testset, device, guide_w, rate=0.5):
    ddpm.nn_model.load_state_dict(torch.load(checkpoint))
    ddpm.eval()
    x_gens = []
    count = 0
    with torch.no_grad():
        for idx, src, tgt in tqdm(testset, total=len(testset)):
            if count > 3:
                break
            x_gen = ddpm.sample(src, guide_w)
            x_gen = vqvae.decode(vqvae.vq(x_gen))
            x_gen = (x_gen >= rate).long()
            x_gens.append((idx, x_gen))
            count += 1

    torch.save(x_gens, "song_test_music103.pt")

# e. Training


In [12]:
# hardcoding these here
n_epoch = 1000
n_T = 1000
n_feat = 128
lr = 1e-4
ws_test = [0.0, 0.5, 2.0]
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

src_vocab_size = 12
tgt_vocab_size = 12
d_model = 512
num_heads = 8
num_layers = 4
d_ff = 4096//8
max_seq_length = 2400
dropout = 0.1
batchsize = 16
mode = "train"


if exists("trainset_w.pkl") and exists("validset_w.pkl") and exists("testset_w.pkl"):
    print("splitted dataset found!")
    with open("trainset_w.pkl", "rb") as f:
        trainset = pickle.load(f)
    with open("validset_w.pkl", "rb") as f:
        validset = pickle.load(f)
    with open("testset_w.pkl", "rb") as f:
        testset = pickle.load(f)
else:
    print("?")

def collate_fn(batch):
    # Unpack batch into individual components
    idx, src_data, tgt_data, w = zip(*batch)
    #print(len(rates[0]), len(tgt_data[0]), len(src_data[0]))
    
    # Convert `src_data`, `tgt_data`, and `rates` to tensors if they are not already
    src_data = [torch.tensor(s, dtype=torch.float32) if not isinstance(s, torch.Tensor) else s for s in src_data]
    tgt_data = [torch.tensor(t, dtype=torch.float32) if not isinstance(t, torch.Tensor) else t for t in tgt_data]

    src_data = [torch.cat([s], dim=-1) for s in src_data]
    tgt_data = [torch.cat([t], dim=-1) for t in tgt_data]

    # Pad src_data
    src_data = nn.utils.rnn.pad_sequence(src_data, batch_first=True, padding_value=0.).to(DEVICE)

    # Pad tgt_data
    tgt_data = nn.utils.rnn.pad_sequence(tgt_data, batch_first=True, padding_value=0).to(DEVICE)

    # Extract the last dimension and one-hot encode it
    return idx, src_data, tgt_data


trainset = data.DataLoader(trainset, batch_size=batchsize, collate_fn=collate_fn)
validset = data.DataLoader(validset, batch_size=1, collate_fn=collate_fn)
testset = data.DataLoader(testset, batch_size=1, collate_fn=collate_fn)


splitted dataset found!


In [13]:
transformer = Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout).to(DEVICE)
ddpm = DDPM(nn_model=transformer, betas=(1e-4, 0.02), n_T=n_T, \
    device=DEVICE, n_inference=1000, drop_prob=0.1)
ddpm.to(DEVICE)
optim = torch.optim.Adam(ddpm.parameters(), lr=lr)
vqvae = VQVAE(tgt_vocab_size, 256, num_heads, 1, d_ff, dropout, 512, 12).to(DEVICE)
vqvae.load_state_dict(torch.load("model_best_vqvae.pt"))
train_main_loop(ddpm, vqvae, optim, trainset, validset, lr, n_epoch, DEVICE, 0, 50)


  0%|          | 1/1000 [00:11<3:14:20, 11.67s/it]

epoch 0, train_loss: 1.0509, valid loss: 0.9826


  0%|          | 2/1000 [00:23<3:12:42, 11.59s/it]

epoch 1, train_loss: 0.5621, valid loss: 0.2487


  0%|          | 3/1000 [00:34<3:12:29, 11.58s/it]

epoch 2, train_loss: 0.2118, valid loss: 0.1876


  0%|          | 4/1000 [00:46<3:12:44, 11.61s/it]

epoch 3, train_loss: 0.1770, valid loss: 0.1464


  0%|          | 5/1000 [00:58<3:12:49, 11.63s/it]

epoch 4, train_loss: 0.1304, valid loss: 0.1249


  1%|          | 6/1000 [01:09<3:12:56, 11.65s/it]

epoch 5, train_loss: 0.1068, valid loss: 0.1355, min_valid_loss: 0.1249, wait: 0 / 50


  1%|          | 7/1000 [01:21<3:12:21, 11.62s/it]

epoch 6, train_loss: 0.1233, valid loss: 0.0970


  1%|          | 8/1000 [01:32<3:12:04, 11.62s/it]

epoch 7, train_loss: 0.0961, valid loss: 0.0838


  1%|          | 9/1000 [01:44<3:11:59, 11.62s/it]

epoch 8, train_loss: 0.1038, valid loss: 0.0813


  1%|          | 10/1000 [01:56<3:11:17, 11.59s/it]

epoch 9, train_loss: 0.0946, valid loss: 0.1075, min_valid_loss: 0.0813, wait: 0 / 50


  1%|          | 11/1000 [02:07<3:12:19, 11.67s/it]

epoch 10, train_loss: 0.0791, valid loss: 0.1193, min_valid_loss: 0.0813, wait: 1 / 50


  1%|          | 12/1000 [02:19<3:11:41, 11.64s/it]

epoch 11, train_loss: 0.0884, valid loss: 0.0855, min_valid_loss: 0.0813, wait: 2 / 50


  1%|▏         | 13/1000 [02:31<3:11:40, 11.65s/it]

epoch 12, train_loss: 0.0826, valid loss: 0.0648


  1%|▏         | 14/1000 [02:42<3:10:58, 11.62s/it]

epoch 13, train_loss: 0.0800, valid loss: 0.0804, min_valid_loss: 0.0648, wait: 0 / 50


  2%|▏         | 15/1000 [02:54<3:11:29, 11.66s/it]

epoch 14, train_loss: 0.0735, valid loss: 0.0663, min_valid_loss: 0.0648, wait: 1 / 50


  2%|▏         | 16/1000 [03:06<3:10:45, 11.63s/it]

epoch 15, train_loss: 0.0717, valid loss: 0.0890, min_valid_loss: 0.0648, wait: 2 / 50


  2%|▏         | 17/1000 [03:17<3:10:22, 11.62s/it]

epoch 16, train_loss: 0.0739, valid loss: 0.0710, min_valid_loss: 0.0648, wait: 3 / 50


  2%|▏         | 18/1000 [03:29<3:09:58, 11.61s/it]

epoch 17, train_loss: 0.0628, valid loss: 0.0671, min_valid_loss: 0.0648, wait: 4 / 50


  2%|▏         | 19/1000 [03:40<3:09:40, 11.60s/it]

epoch 18, train_loss: 0.0679, valid loss: 0.0894, min_valid_loss: 0.0648, wait: 5 / 50


  2%|▏         | 20/1000 [03:52<3:09:26, 11.60s/it]

epoch 19, train_loss: 0.0618, valid loss: 0.0666, min_valid_loss: 0.0648, wait: 6 / 50


  2%|▏         | 21/1000 [04:04<3:09:17, 11.60s/it]

epoch 20, train_loss: 0.0814, valid loss: 0.0881, min_valid_loss: 0.0648, wait: 7 / 50


  2%|▏         | 22/1000 [04:15<3:09:02, 11.60s/it]

epoch 21, train_loss: 0.0659, valid loss: 0.0735, min_valid_loss: 0.0648, wait: 8 / 50


  2%|▏         | 23/1000 [04:27<3:08:50, 11.60s/it]

epoch 22, train_loss: 0.0568, valid loss: 0.0773, min_valid_loss: 0.0648, wait: 9 / 50


  2%|▏         | 24/1000 [04:38<3:09:18, 11.64s/it]

epoch 23, train_loss: 0.0628, valid loss: 0.0580


  2%|▎         | 25/1000 [04:50<3:09:31, 11.66s/it]

epoch 24, train_loss: 0.0627, valid loss: 0.0527


  3%|▎         | 26/1000 [05:02<3:09:00, 11.64s/it]

epoch 25, train_loss: 0.0665, valid loss: 0.0644, min_valid_loss: 0.0527, wait: 0 / 50


  3%|▎         | 27/1000 [05:13<3:08:44, 11.64s/it]

epoch 26, train_loss: 0.0587, valid loss: 0.0784, min_valid_loss: 0.0527, wait: 1 / 50


  3%|▎         | 28/1000 [05:25<3:08:23, 11.63s/it]

epoch 27, train_loss: 0.0670, valid loss: 0.0730, min_valid_loss: 0.0527, wait: 2 / 50


  3%|▎         | 29/1000 [05:37<3:08:18, 11.64s/it]

epoch 28, train_loss: 0.0587, valid loss: 0.0634, min_valid_loss: 0.0527, wait: 3 / 50


  3%|▎         | 30/1000 [05:48<3:08:03, 11.63s/it]

epoch 29, train_loss: 0.0574, valid loss: 0.0627, min_valid_loss: 0.0527, wait: 4 / 50


  3%|▎         | 31/1000 [06:00<3:07:52, 11.63s/it]

epoch 30, train_loss: 0.0627, valid loss: 0.0604, min_valid_loss: 0.0527, wait: 5 / 50


  3%|▎         | 32/1000 [06:12<3:07:34, 11.63s/it]

epoch 31, train_loss: 0.0696, valid loss: 0.0653, min_valid_loss: 0.0527, wait: 6 / 50


  3%|▎         | 33/1000 [06:23<3:07:19, 11.62s/it]

epoch 32, train_loss: 0.0633, valid loss: 0.0692, min_valid_loss: 0.0527, wait: 7 / 50


  3%|▎         | 34/1000 [06:35<3:07:00, 11.61s/it]

epoch 33, train_loss: 0.0533, valid loss: 0.0776, min_valid_loss: 0.0527, wait: 8 / 50


  4%|▎         | 35/1000 [06:46<3:06:41, 11.61s/it]

epoch 34, train_loss: 0.0642, valid loss: 0.0770, min_valid_loss: 0.0527, wait: 9 / 50


  4%|▎         | 36/1000 [06:58<3:06:28, 11.61s/it]

epoch 35, train_loss: 0.0589, valid loss: 0.0618, min_valid_loss: 0.0527, wait: 10 / 50


  4%|▎         | 37/1000 [07:10<3:06:54, 11.65s/it]

epoch 36, train_loss: 0.0642, valid loss: 0.0526


  4%|▍         | 38/1000 [07:21<3:06:30, 11.63s/it]

epoch 37, train_loss: 0.0611, valid loss: 0.0694, min_valid_loss: 0.0526, wait: 0 / 50


  4%|▍         | 39/1000 [07:33<3:06:07, 11.62s/it]

epoch 38, train_loss: 0.0564, valid loss: 0.0601, min_valid_loss: 0.0526, wait: 1 / 50


  4%|▍         | 40/1000 [07:44<3:05:50, 11.62s/it]

epoch 39, train_loss: 0.0601, valid loss: 0.0544, min_valid_loss: 0.0526, wait: 2 / 50


  4%|▍         | 41/1000 [07:56<3:06:10, 11.65s/it]

epoch 40, train_loss: 0.0569, valid loss: 0.0503


  4%|▍         | 42/1000 [08:08<3:05:44, 11.63s/it]

epoch 41, train_loss: 0.0664, valid loss: 0.0589, min_valid_loss: 0.0503, wait: 0 / 50


  4%|▍         | 43/1000 [08:19<3:05:22, 11.62s/it]

epoch 42, train_loss: 0.0531, valid loss: 0.0753, min_valid_loss: 0.0503, wait: 1 / 50


  4%|▍         | 44/1000 [08:31<3:05:03, 11.61s/it]

epoch 43, train_loss: 0.0563, valid loss: 0.0587, min_valid_loss: 0.0503, wait: 2 / 50


  4%|▍         | 45/1000 [08:43<3:04:55, 11.62s/it]

epoch 44, train_loss: 0.0532, valid loss: 0.0591, min_valid_loss: 0.0503, wait: 3 / 50


  5%|▍         | 46/1000 [08:54<3:04:47, 11.62s/it]

epoch 45, train_loss: 0.0531, valid loss: 0.0708, min_valid_loss: 0.0503, wait: 4 / 50


  5%|▍         | 47/1000 [09:06<3:04:29, 11.62s/it]

epoch 46, train_loss: 0.0684, valid loss: 0.0640, min_valid_loss: 0.0503, wait: 5 / 50


  5%|▍         | 48/1000 [09:17<3:04:15, 11.61s/it]

epoch 47, train_loss: 0.0490, valid loss: 0.0659, min_valid_loss: 0.0503, wait: 6 / 50


  5%|▍         | 49/1000 [09:29<3:04:01, 11.61s/it]

epoch 48, train_loss: 0.0526, valid loss: 0.0583, min_valid_loss: 0.0503, wait: 7 / 50


  5%|▌         | 50/1000 [09:41<3:03:51, 11.61s/it]

epoch 49, train_loss: 0.0494, valid loss: 0.0562, min_valid_loss: 0.0503, wait: 8 / 50


  5%|▌         | 51/1000 [09:52<3:03:36, 11.61s/it]

epoch 50, train_loss: 0.0463, valid loss: 0.0618, min_valid_loss: 0.0503, wait: 9 / 50


  5%|▌         | 52/1000 [10:04<3:03:28, 11.61s/it]

epoch 51, train_loss: 0.0488, valid loss: 0.0590, min_valid_loss: 0.0503, wait: 10 / 50


  5%|▌         | 53/1000 [10:16<3:03:15, 11.61s/it]

epoch 52, train_loss: 0.0485, valid loss: 0.0896, min_valid_loss: 0.0503, wait: 11 / 50


  5%|▌         | 54/1000 [10:27<3:03:03, 11.61s/it]

epoch 53, train_loss: 0.0515, valid loss: 0.0618, min_valid_loss: 0.0503, wait: 12 / 50


  6%|▌         | 55/1000 [10:39<3:02:48, 11.61s/it]

epoch 54, train_loss: 0.0541, valid loss: 0.0687, min_valid_loss: 0.0503, wait: 13 / 50


  6%|▌         | 56/1000 [10:50<3:02:36, 11.61s/it]

epoch 55, train_loss: 0.0517, valid loss: 0.0562, min_valid_loss: 0.0503, wait: 14 / 50


  6%|▌         | 57/1000 [11:02<3:02:31, 11.61s/it]

epoch 56, train_loss: 0.0497, valid loss: 0.0620, min_valid_loss: 0.0503, wait: 15 / 50


  6%|▌         | 58/1000 [11:14<3:02:14, 11.61s/it]

epoch 57, train_loss: 0.0557, valid loss: 0.0599, min_valid_loss: 0.0503, wait: 16 / 50


  6%|▌         | 59/1000 [11:25<3:01:59, 11.60s/it]

epoch 58, train_loss: 0.0538, valid loss: 0.0591, min_valid_loss: 0.0503, wait: 17 / 50


  6%|▌         | 60/1000 [11:37<3:02:22, 11.64s/it]

epoch 59, train_loss: 0.0480, valid loss: 0.0469


  6%|▌         | 61/1000 [11:48<3:02:02, 11.63s/it]

epoch 60, train_loss: 0.0550, valid loss: 0.0784, min_valid_loss: 0.0469, wait: 0 / 50


  6%|▌         | 62/1000 [12:00<3:01:49, 11.63s/it]

epoch 61, train_loss: 0.0532, valid loss: 0.0631, min_valid_loss: 0.0469, wait: 1 / 50


  6%|▋         | 63/1000 [12:12<3:02:41, 11.70s/it]

epoch 62, train_loss: 0.0514, valid loss: 0.0628, min_valid_loss: 0.0469, wait: 2 / 50


  6%|▋         | 64/1000 [12:24<3:02:03, 11.67s/it]

epoch 63, train_loss: 0.0473, valid loss: 0.0578, min_valid_loss: 0.0469, wait: 3 / 50


  6%|▋         | 65/1000 [12:35<3:02:53, 11.74s/it]

epoch 64, train_loss: 0.0499, valid loss: 0.0722, min_valid_loss: 0.0469, wait: 4 / 50


  7%|▋         | 66/1000 [12:47<3:02:43, 11.74s/it]

epoch 65, train_loss: 0.0524, valid loss: 0.0455


  7%|▋         | 67/1000 [12:59<3:01:54, 11.70s/it]

epoch 66, train_loss: 0.0585, valid loss: 0.0679, min_valid_loss: 0.0455, wait: 0 / 50


  7%|▋         | 68/1000 [13:10<3:01:20, 11.67s/it]

epoch 67, train_loss: 0.0511, valid loss: 0.0584, min_valid_loss: 0.0455, wait: 1 / 50


  7%|▋         | 69/1000 [13:22<3:00:47, 11.65s/it]

epoch 68, train_loss: 0.0488, valid loss: 0.0618, min_valid_loss: 0.0455, wait: 2 / 50


  7%|▋         | 70/1000 [13:34<3:00:21, 11.64s/it]

epoch 69, train_loss: 0.0466, valid loss: 0.0463, min_valid_loss: 0.0455, wait: 3 / 50


  7%|▋         | 71/1000 [13:45<3:00:03, 11.63s/it]

epoch 70, train_loss: 0.0476, valid loss: 0.0548, min_valid_loss: 0.0455, wait: 4 / 50


  7%|▋         | 72/1000 [13:57<2:59:45, 11.62s/it]

epoch 71, train_loss: 0.0469, valid loss: 0.0599, min_valid_loss: 0.0455, wait: 5 / 50


  7%|▋         | 73/1000 [14:09<3:00:06, 11.66s/it]

epoch 72, train_loss: 0.0564, valid loss: 0.0447


  7%|▋         | 74/1000 [14:20<2:59:36, 11.64s/it]

epoch 73, train_loss: 0.0465, valid loss: 0.0608, min_valid_loss: 0.0447, wait: 0 / 50


  8%|▊         | 75/1000 [14:32<2:59:11, 11.62s/it]

epoch 74, train_loss: 0.0465, valid loss: 0.0539, min_valid_loss: 0.0447, wait: 1 / 50


  8%|▊         | 76/1000 [14:43<2:58:57, 11.62s/it]

epoch 75, train_loss: 0.0478, valid loss: 0.0541, min_valid_loss: 0.0447, wait: 2 / 50


  8%|▊         | 77/1000 [14:55<2:58:35, 11.61s/it]

epoch 76, train_loss: 0.0518, valid loss: 0.0457, min_valid_loss: 0.0447, wait: 3 / 50


  8%|▊         | 78/1000 [15:07<2:58:20, 11.61s/it]

epoch 77, train_loss: 0.0538, valid loss: 0.0614, min_valid_loss: 0.0447, wait: 4 / 50


  8%|▊         | 79/1000 [15:18<2:58:05, 11.60s/it]

epoch 78, train_loss: 0.0455, valid loss: 0.0498, min_valid_loss: 0.0447, wait: 5 / 50


  8%|▊         | 80/1000 [15:30<2:57:54, 11.60s/it]

epoch 79, train_loss: 0.0491, valid loss: 0.0527, min_valid_loss: 0.0447, wait: 6 / 50


  8%|▊         | 81/1000 [15:41<2:57:42, 11.60s/it]

epoch 80, train_loss: 0.0569, valid loss: 0.0576, min_valid_loss: 0.0447, wait: 7 / 50


  8%|▊         | 82/1000 [15:53<2:57:29, 11.60s/it]

epoch 81, train_loss: 0.0461, valid loss: 0.0561, min_valid_loss: 0.0447, wait: 8 / 50


  8%|▊         | 83/1000 [16:05<2:57:13, 11.60s/it]

epoch 82, train_loss: 0.0434, valid loss: 0.0550, min_valid_loss: 0.0447, wait: 9 / 50


  8%|▊         | 84/1000 [16:16<2:57:03, 11.60s/it]

epoch 83, train_loss: 0.0456, valid loss: 0.0449, min_valid_loss: 0.0447, wait: 10 / 50


  8%|▊         | 85/1000 [16:28<2:57:52, 11.66s/it]

epoch 84, train_loss: 0.0440, valid loss: 0.0515, min_valid_loss: 0.0447, wait: 11 / 50


  9%|▊         | 86/1000 [16:40<2:57:24, 11.65s/it]

epoch 85, train_loss: 0.0517, valid loss: 0.0651, min_valid_loss: 0.0447, wait: 12 / 50


  9%|▊         | 87/1000 [16:51<2:57:58, 11.70s/it]

epoch 86, train_loss: 0.0443, valid loss: 0.0654, min_valid_loss: 0.0447, wait: 13 / 50


  9%|▉         | 88/1000 [17:03<2:58:32, 11.75s/it]

epoch 87, train_loss: 0.0491, valid loss: 0.0450, min_valid_loss: 0.0447, wait: 14 / 50


  9%|▉         | 89/1000 [17:15<2:58:57, 11.79s/it]

epoch 88, train_loss: 0.0458, valid loss: 0.0571, min_valid_loss: 0.0447, wait: 15 / 50


  9%|▉         | 90/1000 [17:27<2:57:59, 11.74s/it]

epoch 89, train_loss: 0.0464, valid loss: 0.0656, min_valid_loss: 0.0447, wait: 16 / 50


  9%|▉         | 91/1000 [17:38<2:57:12, 11.70s/it]

epoch 90, train_loss: 0.0524, valid loss: 0.0655, min_valid_loss: 0.0447, wait: 17 / 50


  9%|▉         | 92/1000 [17:50<2:57:14, 11.71s/it]

epoch 91, train_loss: 0.0467, valid loss: 0.0565, min_valid_loss: 0.0447, wait: 18 / 50


  9%|▉         | 93/1000 [18:02<2:56:31, 11.68s/it]

epoch 92, train_loss: 0.0545, valid loss: 0.0543, min_valid_loss: 0.0447, wait: 19 / 50


  9%|▉         | 94/1000 [18:13<2:55:55, 11.65s/it]

epoch 93, train_loss: 0.0476, valid loss: 0.0757, min_valid_loss: 0.0447, wait: 20 / 50


 10%|▉         | 95/1000 [18:25<2:55:26, 11.63s/it]

epoch 94, train_loss: 0.0466, valid loss: 0.0626, min_valid_loss: 0.0447, wait: 21 / 50


 10%|▉         | 96/1000 [18:36<2:55:06, 11.62s/it]

epoch 95, train_loss: 0.0447, valid loss: 0.0538, min_valid_loss: 0.0447, wait: 22 / 50


 10%|▉         | 97/1000 [18:48<2:54:49, 11.62s/it]

epoch 96, train_loss: 0.0543, valid loss: 0.0558, min_valid_loss: 0.0447, wait: 23 / 50


 10%|▉         | 98/1000 [19:00<2:54:31, 11.61s/it]

epoch 97, train_loss: 0.0468, valid loss: 0.0607, min_valid_loss: 0.0447, wait: 24 / 50


 10%|▉         | 99/1000 [19:11<2:54:17, 11.61s/it]

epoch 98, train_loss: 0.0432, valid loss: 0.0629, min_valid_loss: 0.0447, wait: 25 / 50


 10%|█         | 100/1000 [19:23<2:54:02, 11.60s/it]

epoch 99, train_loss: 0.0446, valid loss: 0.0642, min_valid_loss: 0.0447, wait: 26 / 50


 10%|█         | 101/1000 [19:34<2:53:47, 11.60s/it]

epoch 100, train_loss: 0.0493, valid loss: 0.0599, min_valid_loss: 0.0447, wait: 27 / 50


 10%|█         | 102/1000 [19:46<2:53:37, 11.60s/it]

epoch 101, train_loss: 0.0469, valid loss: 0.0629, min_valid_loss: 0.0447, wait: 28 / 50


 10%|█         | 103/1000 [19:58<2:53:25, 11.60s/it]

epoch 102, train_loss: 0.0455, valid loss: 0.0669, min_valid_loss: 0.0447, wait: 29 / 50


 10%|█         | 104/1000 [20:09<2:53:14, 11.60s/it]

epoch 103, train_loss: 0.0441, valid loss: 0.0659, min_valid_loss: 0.0447, wait: 30 / 50


 10%|█         | 105/1000 [20:21<2:53:03, 11.60s/it]

epoch 104, train_loss: 0.0448, valid loss: 0.0661, min_valid_loss: 0.0447, wait: 31 / 50


 11%|█         | 106/1000 [20:33<2:53:25, 11.64s/it]

epoch 105, train_loss: 0.0386, valid loss: 0.0389


 11%|█         | 107/1000 [20:44<2:53:05, 11.63s/it]

epoch 106, train_loss: 0.0479, valid loss: 0.0505, min_valid_loss: 0.0389, wait: 0 / 50


 11%|█         | 108/1000 [20:56<2:52:44, 11.62s/it]

epoch 107, train_loss: 0.0432, valid loss: 0.0622, min_valid_loss: 0.0389, wait: 1 / 50


 11%|█         | 109/1000 [21:07<2:52:27, 11.61s/it]

epoch 108, train_loss: 0.0521, valid loss: 0.0538, min_valid_loss: 0.0389, wait: 2 / 50


 11%|█         | 110/1000 [21:19<2:52:16, 11.61s/it]

epoch 109, train_loss: 0.0455, valid loss: 0.0599, min_valid_loss: 0.0389, wait: 3 / 50


 11%|█         | 111/1000 [21:31<2:52:03, 11.61s/it]

epoch 110, train_loss: 0.0535, valid loss: 0.0597, min_valid_loss: 0.0389, wait: 4 / 50


 11%|█         | 112/1000 [21:42<2:51:59, 11.62s/it]

epoch 111, train_loss: 0.0463, valid loss: 0.0736, min_valid_loss: 0.0389, wait: 5 / 50


 11%|█▏        | 113/1000 [21:54<2:51:44, 11.62s/it]

epoch 112, train_loss: 0.0445, valid loss: 0.0512, min_valid_loss: 0.0389, wait: 6 / 50


 11%|█▏        | 114/1000 [22:05<2:51:30, 11.61s/it]

epoch 113, train_loss: 0.0405, valid loss: 0.0605, min_valid_loss: 0.0389, wait: 7 / 50


 12%|█▏        | 115/1000 [22:17<2:51:17, 11.61s/it]

epoch 114, train_loss: 0.0443, valid loss: 0.0506, min_valid_loss: 0.0389, wait: 8 / 50


 12%|█▏        | 116/1000 [22:29<2:51:08, 11.62s/it]

epoch 115, train_loss: 0.0387, valid loss: 0.0537, min_valid_loss: 0.0389, wait: 9 / 50


 12%|█▏        | 117/1000 [22:40<2:50:55, 11.61s/it]

epoch 116, train_loss: 0.0447, valid loss: 0.0514, min_valid_loss: 0.0389, wait: 10 / 50


 12%|█▏        | 118/1000 [22:52<2:50:43, 11.61s/it]

epoch 117, train_loss: 0.0415, valid loss: 0.0605, min_valid_loss: 0.0389, wait: 11 / 50


 12%|█▏        | 119/1000 [23:04<2:50:29, 11.61s/it]

epoch 118, train_loss: 0.0426, valid loss: 0.0454, min_valid_loss: 0.0389, wait: 12 / 50


 12%|█▏        | 120/1000 [23:15<2:50:13, 11.61s/it]

epoch 119, train_loss: 0.0400, valid loss: 0.0641, min_valid_loss: 0.0389, wait: 13 / 50


 12%|█▏        | 121/1000 [23:27<2:50:01, 11.61s/it]

epoch 120, train_loss: 0.0445, valid loss: 0.0673, min_valid_loss: 0.0389, wait: 14 / 50


 12%|█▏        | 122/1000 [23:38<2:49:52, 11.61s/it]

epoch 121, train_loss: 0.0441, valid loss: 0.0539, min_valid_loss: 0.0389, wait: 15 / 50


 12%|█▏        | 123/1000 [23:50<2:49:42, 11.61s/it]

epoch 122, train_loss: 0.0419, valid loss: 0.0467, min_valid_loss: 0.0389, wait: 16 / 50


 12%|█▏        | 124/1000 [24:02<2:49:32, 11.61s/it]

epoch 123, train_loss: 0.0468, valid loss: 0.0606, min_valid_loss: 0.0389, wait: 17 / 50


 12%|█▎        | 125/1000 [24:13<2:49:23, 11.62s/it]

epoch 124, train_loss: 0.0449, valid loss: 0.0525, min_valid_loss: 0.0389, wait: 18 / 50


 13%|█▎        | 126/1000 [24:25<2:49:11, 11.62s/it]

epoch 125, train_loss: 0.0428, valid loss: 0.0583, min_valid_loss: 0.0389, wait: 19 / 50


 13%|█▎        | 127/1000 [24:36<2:49:02, 11.62s/it]

epoch 126, train_loss: 0.0527, valid loss: 0.0528, min_valid_loss: 0.0389, wait: 20 / 50


 13%|█▎        | 128/1000 [24:48<2:48:48, 11.62s/it]

epoch 127, train_loss: 0.0404, valid loss: 0.0627, min_valid_loss: 0.0389, wait: 21 / 50


 13%|█▎        | 129/1000 [25:00<2:48:34, 11.61s/it]

epoch 128, train_loss: 0.0433, valid loss: 0.0517, min_valid_loss: 0.0389, wait: 22 / 50


 13%|█▎        | 130/1000 [25:11<2:48:22, 11.61s/it]

epoch 129, train_loss: 0.0387, valid loss: 0.0566, min_valid_loss: 0.0389, wait: 23 / 50


 13%|█▎        | 131/1000 [25:23<2:48:13, 11.61s/it]

epoch 130, train_loss: 0.0395, valid loss: 0.0562, min_valid_loss: 0.0389, wait: 24 / 50


 13%|█▎        | 132/1000 [25:34<2:47:58, 11.61s/it]

epoch 131, train_loss: 0.0409, valid loss: 0.0664, min_valid_loss: 0.0389, wait: 25 / 50


 13%|█▎        | 133/1000 [25:46<2:47:49, 11.61s/it]

epoch 132, train_loss: 0.0551, valid loss: 0.0616, min_valid_loss: 0.0389, wait: 26 / 50


 13%|█▎        | 134/1000 [25:58<2:47:39, 11.62s/it]

epoch 133, train_loss: 0.0430, valid loss: 0.0530, min_valid_loss: 0.0389, wait: 27 / 50


 14%|█▎        | 135/1000 [26:09<2:47:26, 11.61s/it]

epoch 134, train_loss: 0.0419, valid loss: 0.0579, min_valid_loss: 0.0389, wait: 28 / 50


 14%|█▎        | 136/1000 [26:21<2:47:15, 11.61s/it]

epoch 135, train_loss: 0.0378, valid loss: 0.0574, min_valid_loss: 0.0389, wait: 29 / 50


 14%|█▎        | 137/1000 [26:33<2:47:03, 11.61s/it]

epoch 136, train_loss: 0.0400, valid loss: 0.0523, min_valid_loss: 0.0389, wait: 30 / 50


 14%|█▍        | 138/1000 [26:44<2:46:46, 11.61s/it]

epoch 137, train_loss: 0.0383, valid loss: 0.0707, min_valid_loss: 0.0389, wait: 31 / 50


 14%|█▍        | 139/1000 [26:56<2:46:32, 11.61s/it]

epoch 138, train_loss: 0.0448, valid loss: 0.0603, min_valid_loss: 0.0389, wait: 32 / 50


 14%|█▍        | 140/1000 [27:07<2:46:18, 11.60s/it]

epoch 139, train_loss: 0.0446, valid loss: 0.0511, min_valid_loss: 0.0389, wait: 33 / 50


 14%|█▍        | 141/1000 [27:19<2:46:06, 11.60s/it]

epoch 140, train_loss: 0.0447, valid loss: 0.0527, min_valid_loss: 0.0389, wait: 34 / 50


 14%|█▍        | 142/1000 [27:31<2:45:51, 11.60s/it]

epoch 141, train_loss: 0.0451, valid loss: 0.0433, min_valid_loss: 0.0389, wait: 35 / 50


 14%|█▍        | 143/1000 [27:42<2:45:42, 11.60s/it]

epoch 142, train_loss: 0.0415, valid loss: 0.0533, min_valid_loss: 0.0389, wait: 36 / 50


 14%|█▍        | 144/1000 [27:54<2:45:31, 11.60s/it]

epoch 143, train_loss: 0.0403, valid loss: 0.0556, min_valid_loss: 0.0389, wait: 37 / 50


 14%|█▍        | 145/1000 [28:05<2:45:22, 11.61s/it]

epoch 144, train_loss: 0.0415, valid loss: 0.0658, min_valid_loss: 0.0389, wait: 38 / 50


 15%|█▍        | 146/1000 [28:17<2:45:12, 11.61s/it]

epoch 145, train_loss: 0.0425, valid loss: 0.0435, min_valid_loss: 0.0389, wait: 39 / 50


 15%|█▍        | 147/1000 [28:29<2:45:01, 11.61s/it]

epoch 146, train_loss: 0.0437, valid loss: 0.0543, min_valid_loss: 0.0389, wait: 40 / 50


 15%|█▍        | 148/1000 [28:40<2:44:50, 11.61s/it]

epoch 147, train_loss: 0.0428, valid loss: 0.0528, min_valid_loss: 0.0389, wait: 41 / 50


 15%|█▍        | 149/1000 [28:52<2:44:40, 11.61s/it]

epoch 148, train_loss: 0.0389, valid loss: 0.0600, min_valid_loss: 0.0389, wait: 42 / 50


 15%|█▌        | 150/1000 [29:03<2:44:25, 11.61s/it]

epoch 149, train_loss: 0.0475, valid loss: 0.0568, min_valid_loss: 0.0389, wait: 43 / 50


 15%|█▌        | 151/1000 [29:15<2:44:12, 11.60s/it]

epoch 150, train_loss: 0.0437, valid loss: 0.0552, min_valid_loss: 0.0389, wait: 44 / 50


 15%|█▌        | 152/1000 [29:27<2:44:02, 11.61s/it]

epoch 151, train_loss: 0.0481, valid loss: 0.0594, min_valid_loss: 0.0389, wait: 45 / 50


 15%|█▌        | 153/1000 [29:38<2:43:48, 11.60s/it]

epoch 152, train_loss: 0.0358, valid loss: 0.0481, min_valid_loss: 0.0389, wait: 46 / 50


 15%|█▌        | 154/1000 [29:50<2:43:39, 11.61s/it]

epoch 153, train_loss: 0.0381, valid loss: 0.0477, min_valid_loss: 0.0389, wait: 47 / 50


 16%|█▌        | 155/1000 [30:01<2:43:31, 11.61s/it]

epoch 154, train_loss: 0.0370, valid loss: 0.0552, min_valid_loss: 0.0389, wait: 48 / 50


 16%|█▌        | 155/1000 [30:13<2:44:48, 11.70s/it]

epoch 155, train_loss: 0.0385, valid loss: 0.0480, min_valid_loss: 0.0389, wait: 49 / 50





In [14]:
eval_main_loop(ddpm, vqvae,"model_best_diffusion.pt", testset, DEVICE, 2, 0.5)

  4%|▍         | 4/100 [00:46<18:34, 11.61s/it]


: 