In [None]:
import json
import os
import requests
import random
import string
import secrets
import time
import re
import collections
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import random_split
from torch.optim.lr_scheduler import ReduceLROnPlateau
import copy
import numpy as np
from collections import defaultdict


In [None]:

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
import random, string, copy
from collections import Counter

class HangmanDataset(Dataset):
    """Synthetic partially-revealed Hangman states for supervised training.

    Every access draws a fresh random subset of positions of one word to
    reveal and returns the encoded board together with the hidden letters
    that sit next to at least one revealed letter (the training targets).

    Encoding: ``0`` is a blank (hidden letter), ``1..26`` are ``a..z`` and
    ``27`` (``'PAD'``) fills the positions past the end of the word.

    Args:
        words: iterable of candidate words. Words are lower-cased; empty
            words, words longer than ``max_word_length`` and words containing
            characters outside ``a-z`` are dropped because they cannot be
            encoded.
        max_word_length: fixed length every encoded sequence is padded to.
        reveal_ratio: fraction of each word's positions that is revealed.
    """

    def __init__(self, words, max_word_length=45, reveal_ratio=0.5):
        alphabet = set(string.ascii_lowercase)
        lowered = (word.lower() for word in words)
        self.words = [
            w for w in lowered
            if 0 < len(w) <= max_word_length and alphabet.issuperset(w)
        ]
        self.max_length = max_word_length
        self.reveal_ratio = reveal_ratio
        self.char_to_idx = {char: i+1 for i, char in enumerate(string.ascii_lowercase)}
        self.char_to_idx['_'] = 0  # blank
        self.char_to_idx['PAD'] = 27

    def __len__(self):
        # Each word is visited 80 times per epoch, each time with a new
        # random reveal pattern (see __getitem__).
        return len(self.words) * 80

    def __getitem__(self, idx):
        """Return one randomly masked training example as a dict of tensors.

        Keys: ``word_state`` and ``position_context`` (length ``max_length``),
        ``target_positions`` / ``target_chars`` (length 10, padded with
        ``-1`` / ``0``), ``word_length``, ``blank_count`` and
        ``next_to_vowel`` (length ``max_length``, float).
        """
        word = self.words[idx % len(self.words)]
        n = len(word)
        # Clamp so that a reveal_ratio outside [0, 1] cannot break sampling.
        reveal_count = min(n, max(0, int(n * self.reveal_ratio)))
        # A set gives O(1) membership tests in the loops below.
        revealed = set(random.sample(range(n), reveal_count)) if reveal_count > 0 else set()

        # Blanks inside the word, PAD after it. Using PAD (not 0) keeps the
        # padding distinguishable from hidden letters.
        word_state = [0] * n + [self.char_to_idx['PAD']] * (self.max_length - n)
        for pos in revealed:
            word_state[pos] = self.char_to_idx[word[pos]]

        vowels = set('aeiou')
        target_pos, target_chars = [], []
        position_context = [0] * self.max_length
        blank_vowel_next = [0] * self.max_length
        for i in range(n):
            if i in revealed:
                continue
            left_shown = (i - 1) in revealed
            right_shown = (i + 1) in revealed  # i + 1 == n is never in the set

            # Context code: 1 = revealed letter on the left, 2 = on the right,
            # 3 = on both sides. Only blanks with some context become targets.
            ctx = (1 if left_shown else 0) + (2 if right_shown else 0)
            if ctx:
                target_pos.append(i)
                target_chars.append(self.char_to_idx[word[i]])
                position_context[i] = ctx

            # Only *visible* neighbours may contribute: hidden letters of the
            # true word are unknown when the feature is computed from a board.
            if (left_shown and word[i - 1] in vowels) or (right_shown and word[i + 1] in vowels):
                blank_vowel_next[i] = 1

        count_blanks = n - len(revealed)

        # Fixed-size target arrays: keep the first `max_targets` targets and
        # pad the rest with (-1, 0), which the training loop masks out.
        max_targets = 10
        missing = max_targets - len(target_pos)
        if missing > 0:
            target_pos.extend([-1] * missing)
            target_chars.extend([0] * missing)

        return {
            'word_state': torch.tensor(word_state, dtype=torch.long),
            'position_context': torch.tensor(position_context, dtype=torch.long),
            'target_positions': torch.tensor(target_pos[:max_targets], dtype=torch.long),
            'target_chars': torch.tensor(target_chars[:max_targets], dtype=torch.long),
            'word_length': torch.tensor(n, dtype=torch.long),
            'blank_count': torch.tensor(count_blanks, dtype=torch.long),
            'next_to_vowel': torch.tensor(blank_vowel_next, dtype=torch.float)
        }


class EnhancedHangmanModel(nn.Module):
    """Per-position letter classifier for partially revealed Hangman words.

    Pipeline (see ``forward``): character embeddings feed a small CNN and,
    concatenated with a context-type embedding, a bidirectional LSTM. A
    per-position "prior" MLP maps CNN features plus two scalar features to 26
    scores; the LSTM state and the prior are then decoded by one of three
    heads selected by the position's context code (1 = left neighbour
    revealed, 2 = right, 3 = both). Positions with context 0 output zeros.

    Args:
        vocab_size: size of the character vocabulary (indices 0..vocab_size-1).
        max_len: nominal maximum sequence length (not used by the layers).
        emb_dim: character embedding width.
        hidden_dim: LSTM hidden size per direction.
        ablate: optional dict of ablation switches; stored on the instance
            as ``self.ablate`` (not consulted by ``forward``).
    """

    def __init__(self, vocab_size=28, max_len=45, emb_dim=128, hidden_dim=1024, ablate=None):
        super().__init__()
        # A fresh dict per instance: a `{}` default would be one object
        # shared (and mutable) across every model ever constructed.
        self.ablate = {} if ablate is None else ablate
        self.char_emb = nn.Embedding(vocab_size, emb_dim)
        self.ctx_emb = nn.Embedding(4, 32)

        self.pattern_cnn = nn.Sequential(
            nn.Conv1d(emb_dim, 64, 3, padding=1), nn.ReLU(), nn.Dropout(0.2),
            nn.Conv1d(64, 64, 3, padding=1), nn.ReLU(), nn.Dropout(0.2)
        )

        self.encoder = nn.LSTM(emb_dim + 32, hidden_dim, bidirectional=True, batch_first=True)

        # Input: is_blank flag + blank fraction + 64 CNN channels.
        self.pos_prior_mlp = nn.Sequential(
            nn.Linear(1 + 1 + 64, 32), nn.ReLU(), nn.Dropout(0.2), nn.Linear(32, 26)
        )

        def decoder():
            # Layer order is part of the checkpoint format (state-dict keys
            # are the Sequential indices), so it must not be rearranged.
            return nn.Sequential(
                nn.Linear(hidden_dim*2 + 26, hidden_dim),
                nn.ReLU(),
                nn.Dropout(0.3),

                nn.Linear(hidden_dim, hidden_dim // 2),
                nn.ReLU(),
                nn.Dropout(0.3),

                nn.Linear(hidden_dim // 2, 26)
            )

        self.left_decoder = decoder()
        self.right_decoder = decoder()
        self.both_decoder = decoder()

    def forward(self, word_state, position_context, word_length, blank_count, next_to_vowel):
        """Score the 26 letters at every position.

        Args:
            word_state: long tensor ``[B, L]`` of character indices.
            position_context: long tensor ``[B, L]`` with values in 0..3.
            word_length: accepted for interface compatibility; unused.
            blank_count: tensor with one blank count per batch row.
            next_to_vowel: accepted for interface compatibility; unused.

        Returns:
            Float tensor ``[B, L, 26]`` of logits; rows whose context code is
            0 are left at zero.
        """
        B, L = word_state.size()
        emb = self.char_emb(word_state)
        cnn_feat = self.pattern_cnn(emb.transpose(1, 2)).transpose(1, 2)
        ctx = self.ctx_emb(position_context)
        encoded, _ = self.encoder(torch.cat([emb, ctx], dim=-1))

        # The prior MLP only acts on the last dimension, so all positions
        # are evaluated in one call instead of one call per position.
        is_blank = (word_state == 0).float().unsqueeze(-1)                     # [B, L, 1]
        blank_frac = (blank_count.float() / L).view(B, 1, 1).expand(B, L, 1)   # [B, L, 1]
        priors = self.pos_prior_mlp(torch.cat([is_blank, blank_frac, cnn_feat], dim=-1))  # [B, L, 26]

        decoder_in = torch.cat([encoded, priors], dim=-1)
        out = decoder_in.new_zeros(B, L, 26)
        heads = ((1, self.left_decoder), (2, self.right_decoder), (3, self.both_decoder))
        for ctx_code, head in heads:
            # Boolean mask over [B, L]: route every matching position of the
            # whole batch through its head at once.
            mask = position_context == ctx_code
            out[mask] = head(decoder_in[mask])

        return out

class HangmanSolver:
    """Chooses the next Hangman guess using a trained ``EnhancedHangmanModel``.

    Args:
        word_list: dictionary of known words, used by
            ``_fallback_prediction``.
        model_path: path of the state-dict checkpoint to load.
    """

    _COMMON_ORDER = "etaoinshrdlucmfwypvbgkjqxz"  # letter-frequency guess order
    _VOWELS = frozenset('aeiou')
    _MAX_LENGTH = 45  # sequence length inputs are padded to
    _PAD_IDX = 27

    def __init__(self, word_list, model_path="best_model.pth"):
        self.model = EnhancedHangmanModel()
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # Load state dict without DataParallel.
        # NOTE(security): torch.load unpickles the file; only load
        # checkpoints from a trusted source.
        self.model.load_state_dict(torch.load(model_path, map_location=self.device))
        # Inputs are created on self.device in predict_letter, so the
        # weights must live there too.
        self.model.to(self.device)

        # Now optionally wrap in DataParallel AFTER loading
        if torch.cuda.device_count() > 1:
            self.model = nn.DataParallel(self.model)

        self.model.eval()

        self.dictionary = word_list
        self.char_to_idx = {c: i+1 for i, c in enumerate(string.ascii_lowercase)}
        self.char_to_idx['_'] = 0
        self.idx_to_char = {v: k for k, v in self.char_to_idx.items() if v != 0}

    def _most_common_unguessed(self, guessed):
        """Return the first letter of the frequency order not yet guessed, or None."""
        return next((c for c in self._COMMON_ORDER if c not in guessed), None)

    def _fallback_prediction(self, pattern, guessed):
        """Pick a letter from dictionary words compatible with ``pattern``.

        A word is compatible when it has the pattern's length, agrees with
        every revealed letter, and has no already-guessed letter under a
        blank. The letter filling the most blanks across compatible words is
        returned; with no compatible word, the first unguessed letter of the
        alphabet is returned.

        Raises:
            IndexError: if no compatible word exists and every letter has
                already been guessed.
        """
        counter = Counter()
        for word in self.dictionary:
            if len(word) != len(pattern):
                continue
            compatible = all(
                (wc not in guessed) if pc == '_' else (pc == wc)
                for wc, pc in zip(word, pattern)
            )
            if not compatible:
                continue
            # In a compatible word every letter under a blank is unguessed.
            counter.update(wc for wc, pc in zip(word, pattern) if pc == '_')

        if counter:
            return counter.most_common(1)[0][0]

        for c in string.ascii_lowercase:
            if c not in guessed:
                return c
        raise IndexError("no unguessed letters remain")

    def predict_letter(self, word_state, guessed_letters=None):
        """Return the next letter to guess for the given board.

        Args:
            word_state: board string with ``'_'`` for hidden letters; spaces
                are ignored and letters are lower-cased.
            guessed_letters: collection of letters already tried; they are
                never returned unless all 26 letters have been guessed.

        Returns:
            A single lowercase letter.
        """
        if guessed_letters is None:
            guessed_letters = set()
        word_state = word_state.replace(' ', '').lower()

        if all(c == '_' for c in word_state):
            # Nothing revealed yet: open with the most frequent unguessed letter.
            opening = self._most_common_unguessed(guessed_letters)
            if opening is not None:
                return opening

        state_indices, position_context, blank_vowel_next = [], [], []
        last = len(word_state) - 1
        for i, char in enumerate(word_state):
            left = word_state[i - 1] if i > 0 else None
            right = word_state[i + 1] if i < last else None
            if char == '_':
                state_indices.append(0)
                # Context code: +1 revealed letter on the left, +2 on the right.
                context = 0
                if left is not None and left != '_':
                    context += 1
                if right is not None and right != '_':
                    context += 2
                position_context.append(context)
                blank_vowel_next.append(1 if (left in self._VOWELS or right in self._VOWELS) else 0)
            else:
                state_indices.append(self.char_to_idx.get(char, self._PAD_IDX))
                position_context.append(0)
                blank_vowel_next.append(0)

        padding = max(0, self._MAX_LENGTH - len(word_state))
        state_indices.extend([self._PAD_IDX] * padding)
        position_context.extend([0] * padding)
        blank_vowel_next.extend([0] * padding)

        word_tensor = torch.tensor([state_indices], dtype=torch.long, device=self.device)
        context_tensor = torch.tensor([position_context], dtype=torch.long, device=self.device)
        length_tensor = torch.tensor([len(word_state)], dtype=torch.long, device=self.device)
        blank_count_tensor = torch.tensor([word_state.count('_')], dtype=torch.long, device=self.device)
        blank_vowel_tensor = torch.tensor([blank_vowel_next], dtype=torch.float, device=self.device)

        with torch.no_grad():
            predictions = self.model(word_tensor, context_tensor, length_tensor,
                                     blank_count_tensor, blank_vowel_tensor)

        allowed = [j for j, ch in enumerate(string.ascii_lowercase) if ch not in guessed_letters]
        blanks = [i for i, ch in enumerate(word_state) if ch == '_']
        # Prefer blanks that touch a revealed letter; use every blank only
        # when there are none.
        anchored = [i for i in blanks if position_context[i] > 0]
        candidates = anchored or blanks

        if allowed and candidates:
            probs = torch.softmax(predictions[0, candidates, :], dim=-1)[:, allowed].cpu()
            # argmax over the flattened (position, letter) grid returns the
            # first maximum, i.e. earliest position, then earliest letter.
            flat_best = int(torch.argmax(probs))
            return string.ascii_lowercase[allowed[flat_best % len(allowed)]]

        # Fallback to frequency-based guess if the model offered no candidate
        fallback = self._most_common_unguessed(guessed_letters)
        if fallback is not None:
            return fallback

        return 'e'  # very rare fallback if all else fails

from tqdm import tqdm

def _masked_position_loss(out, target_positions, target_chars, loss_fn):
    """Mean loss over the valid (position, letter) targets of a batch.

    Entries with position ``< 0`` or letter index ``<= 0`` are padding and
    are ignored. Returns ``(loss, count)``; ``loss`` is ``None`` when the
    batch has no valid target.
    """
    valid = (target_positions >= 0) & (target_chars > 0)
    count = int(valid.sum())
    if count == 0:
        return None, 0
    rows = torch.arange(out.size(0), device=out.device).unsqueeze(1).expand_as(target_positions)
    logits = out[rows[valid], target_positions[valid]]  # [count, 26]
    # Letters are stored as 1..26; the classifier's classes are 0..25.
    return loss_fn(logits, target_chars[valid] - 1), count


def _run_model(model, batch, device):
    """Move one batch's model inputs to ``device`` and run the forward pass."""
    return model(batch['word_state'].to(device), batch['position_context'].to(device),
                 batch['word_length'].to(device), batch['blank_count'].to(device),
                 batch['next_to_vowel'].to(device))


def train_model(words, epochs=10, early_stopping_patience=5):
    """Train an ``EnhancedHangmanModel`` on randomly masked words.

    Args:
        words: list of training words, passed to ``HangmanDataset``.
        epochs: maximum number of epochs.
        early_stopping_patience: stop after this many consecutive epochs
            without a new best validation loss.

    Returns:
        The model in its final-epoch state (wrapped in ``nn.DataParallel``
        when more than one CUDA device is present). The weights with the
        lowest validation loss are written to ``"best_model.pth"``.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = EnhancedHangmanModel()
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)
    model.to(device)

    opt = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)
    scheduler = ReduceLROnPlateau(opt, patience=2)
    loss_fn = nn.CrossEntropyLoss()

    best = float('inf')
    patience_counter = 0

    for ep in range(epochs):
        # Curriculum: the reveal ratio starts at 0.55 and drops by 0.1 per
        # epoch down to a floor of 0.1, so fewer letters are shown over time.
        reveal_ratio = max(0.1, 0.55 - ep * 0.1)
        ds = HangmanDataset(words, reveal_ratio=reveal_ratio)
        train_len = int(0.9 * len(ds))
        # NOTE(review): the split is re-drawn every epoch and its reveal
        # ratio follows the curriculum, so val_loss values of different
        # epochs are measured under different conditions — confirm that
        # early stopping / "best" selection on it is intended.
        tr, val = random_split(ds, [train_len, len(ds)-train_len])
        dl = DataLoader(tr, shuffle=True, pin_memory=True, batch_size=256, num_workers=2)
        vl = DataLoader(val, pin_memory=True, batch_size=256, num_workers=2)

        print(f"\n--- Epoch {ep+1} | Reveal Ratio: {reveal_ratio:.2f} ---", flush=True)
        model.train()
        total_loss = 0
        batch_count = 0

        for i, batch in enumerate(tqdm(dl, desc="Training", ncols=100)):
            opt.zero_grad()
            out = _run_model(model, batch, device)
            loss, count = _masked_position_loss(
                out,
                batch['target_positions'].to(device),
                batch['target_chars'].to(device),
                loss_fn,
            )
            if count > 0:
                loss.backward()
                opt.step()
                total_loss += loss.item()
                batch_count += 1
            if i % 20 == 0:
                if loss is not None:
                    print(f"  Batch {i}/{len(dl)} | Loss: {loss.item():.4f}", flush=True)
                else:
                    print(f"  Batch {i}/{len(dl)} | Loss: N/A (no valid targets)", flush=True)

        train_loss = total_loss / batch_count if batch_count > 0 else 0

        # Validation
        model.eval()
        val_loss = 0
        val_batches = 0
        with torch.no_grad():
            for batch in tqdm(vl, desc="Validation", ncols=100):
                out = _run_model(model, batch, device)
                loss, count = _masked_position_loss(
                    out,
                    batch['target_positions'].to(device),
                    batch['target_chars'].to(device),
                    loss_fn,
                )
                if count > 0:
                    val_loss += loss.item()
                    val_batches += 1

        val_loss = val_loss / val_batches if val_batches > 0 else 0
        scheduler.step(val_loss)
        print(f"  Train loss: {train_loss:.4f} | Val loss: {val_loss:.4f}", flush=True)

        if val_loss < best:
            best = val_loss
            patience_counter = 0
            # Save the unwrapped module's weights so the checkpoint loads
            # into a plain EnhancedHangmanModel.
            torch.save(model.module.state_dict() if isinstance(model, nn.DataParallel) else model.state_dict(), "best_model.pth")
            print("✅ Model improved and saved.", flush=True)
        else:
            patience_counter += 1
            print(f"⚠️ No improvement. Patience: {patience_counter}/{early_stopping_patience}", flush=True)
            if patience_counter >= early_stopping_patience:
                print(f"🛑 Early stopping at epoch {ep+1}", flush=True)
                break

    return model

In [None]:
# Load the word list; the context manager closes the file handle once read.
with open("/content/words_250000_train.txt") as dictionary_file:
    dictionary = dictionary_file.read().splitlines()
# Test code to run in a separate cell after defining the model classes
random.shuffle(dictionary)
# Train on a random 12,000-word subset of the shuffled dictionary.
word_list = dictionary[:12000]

print("Starting training...")
print(f"Training on {len(word_list)} words")
model = train_model(word_list, epochs=10)

print("\nTraining completed!")
# NOTE(review): train_model already writes its best-validation weights to
# this same path; this save replaces them with the final-epoch weights.
torch.save(model.module.state_dict() if isinstance(model, nn.DataParallel) else model.state_dict(), "best_model.pth")
print("Model saved as 'best_model.pth'")

# Build the solver from the checkpoint, then point it at the in-memory model
# returned by training and switch that model to inference mode.
solver = HangmanSolver(word_list)
solver.model = model
solver.model.eval()

# Game simulation
def simulate_hangman_game(word, solver, max_wrong=6, verbose=True):
    """Play one Hangman game with ``solver`` guessing the letters of ``word``.

    Args:
        word: the secret word (lower-cased before play).
        solver: object with ``predict_letter(current_pattern, guessed_set)``
            returning the next letter to try.
        max_wrong: number of wrong guesses at which the game is lost.
        verbose: print the board and the outcome of every guess.

    Returns:
        Tuple ``(won, moves, wrong_count)``: whether every letter was
        revealed, the guessed letters in order, and the number of wrong
        guesses. A guess that repeats an earlier letter counts as wrong.
    """
    word = word.lower()
    current = '_' * len(word)
    guessed = set()
    wrong_count = 0
    moves = []

    if verbose:
        print(f"\n{'='*30}\nGAME: {word}\n{'='*30}")

    while '_' in current and wrong_count < max_wrong:
        if verbose:
            print(f"Current: {current}\nWrong guesses: {wrong_count}/{max_wrong}")

        letter = solver.predict_letter(current, guessed)
        repeated = letter in guessed
        guessed.add(letter)
        moves.append(letter)

        if repeated:
            # A repeated letter can reveal nothing new. Penalising it
            # guarantees the loop ends even if the solver keeps returning
            # the same letter.
            wrong_count += 1
            if verbose:
                print(f"✗ Repeated guess: {letter}")
        elif letter in word:
            # Reveal every occurrence of the letter, keep earlier reveals.
            current = ''.join(
                c if c == letter or shown != '_' else '_'
                for c, shown in zip(word, current)
            )
            if verbose:
                print(f"✓ Correct! Guessed: {letter}")
        else:
            wrong_count += 1
            if verbose:
                print(f"✗ Wrong! Guessed: {letter}")
        if verbose:
            print()

    won = '_' not in current
    if verbose:
        if won:
            print(f"🎉 WON! Final word: {current}")
        else:
            print(f"💀 LOST! Word was: {word}")
        print(f"Moves: {' -> '.join(moves)}")

    return won, moves, wrong_count

# Run game test: play one verbose game on the word "learning" with the solver
# built above. As the last expression of the cell, its return value
# (won, moves, wrong_count) is also what the notebook displays.
simulate_hangman_game("learning", solver, verbose=True)

Starting training...
Training on 12000 words

--- Epoch 1 | Reveal Ratio: 0.55 ---


Training:   0%|                                                            | 0/3375 [00:00<?, ?it/s]

  Batch 0/3375 | Loss: 3.2584


Training:   1%|▎                                                  | 20/3375 [00:16<49:47,  1.12it/s]

  Batch 20/3375 | Loss: 2.8049


Training:   1%|▌                                                  | 40/3375 [00:33<45:26,  1.22it/s]

  Batch 40/3375 | Loss: 2.6347


Training:   2%|▉                                                  | 60/3375 [00:49<44:10,  1.25it/s]

  Batch 60/3375 | Loss: 2.5202


Training:   2%|█▏                                                 | 80/3375 [01:05<47:29,  1.16it/s]

  Batch 80/3375 | Loss: 2.4116


Training:   3%|█▍                                                | 100/3375 [01:22<43:37,  1.25it/s]

  Batch 100/3375 | Loss: 2.4726


Training:   4%|█▊                                                | 120/3375 [01:38<43:17,  1.25it/s]

  Batch 120/3375 | Loss: 2.3899


Training:   4%|██                                                | 140/3375 [01:55<45:33,  1.18it/s]

  Batch 140/3375 | Loss: 2.3526


Training:   5%|██▎                                               | 160/3375 [02:12<43:29,  1.23it/s]

  Batch 160/3375 | Loss: 2.3076


Training:   5%|██▋                                               | 180/3375 [02:28<46:47,  1.14it/s]

  Batch 180/3375 | Loss: 2.3446


Training:   6%|██▉                                               | 200/3375 [02:45<43:40,  1.21it/s]

  Batch 200/3375 | Loss: 2.3046


Training:   7%|███▎                                              | 220/3375 [03:01<41:51,  1.26it/s]

  Batch 220/3375 | Loss: 2.1985


Training:   7%|███▌                                              | 240/3375 [03:18<45:36,  1.15it/s]

  Batch 240/3375 | Loss: 2.2819


Training:   8%|███▊                                              | 260/3375 [03:35<42:29,  1.22it/s]

  Batch 260/3375 | Loss: 2.2514


Training:   8%|████▏                                             | 280/3375 [03:51<41:36,  1.24it/s]

  Batch 280/3375 | Loss: 2.2436


Training:   9%|████▍                                             | 300/3375 [04:08<43:20,  1.18it/s]

  Batch 300/3375 | Loss: 2.1742


Training:   9%|████▋                                             | 320/3375 [04:24<41:34,  1.22it/s]

  Batch 320/3375 | Loss: 2.1879


Training:  10%|█████                                             | 340/3375 [04:41<45:36,  1.11it/s]

  Batch 340/3375 | Loss: 2.1831


Training:  11%|█████▎                                            | 360/3375 [04:58<41:28,  1.21it/s]

  Batch 360/3375 | Loss: 2.2285


Training:  11%|█████▋                                            | 380/3375 [05:14<40:37,  1.23it/s]

  Batch 380/3375 | Loss: 2.1109


Training:  12%|█████▉                                            | 400/3375 [05:31<42:23,  1.17it/s]

  Batch 400/3375 | Loss: 2.1949


Training:  12%|██████▏                                           | 420/3375 [05:48<40:33,  1.21it/s]

  Batch 420/3375 | Loss: 2.1416


Training:  13%|██████▌                                           | 440/3375 [06:05<41:37,  1.18it/s]

  Batch 440/3375 | Loss: 2.2315


Training:  14%|██████▊                                           | 460/3375 [06:22<41:12,  1.18it/s]

  Batch 460/3375 | Loss: 2.1562


Training:  14%|███████                                           | 480/3375 [06:39<39:42,  1.22it/s]

  Batch 480/3375 | Loss: 2.1541


Training:  15%|███████▍                                          | 500/3375 [06:56<42:32,  1.13it/s]

  Batch 500/3375 | Loss: 2.0870


Training:  15%|███████▋                                          | 520/3375 [07:13<39:19,  1.21it/s]

  Batch 520/3375 | Loss: 2.0838


Training:  16%|████████                                          | 540/3375 [07:29<39:54,  1.18it/s]

  Batch 540/3375 | Loss: 2.0883


Training:  17%|████████▎                                         | 560/3375 [07:46<39:08,  1.20it/s]

  Batch 560/3375 | Loss: 2.0685


Training:  17%|████████▌                                         | 580/3375 [08:03<38:06,  1.22it/s]

  Batch 580/3375 | Loss: 2.0684


Training:  18%|████████▉                                         | 600/3375 [08:20<41:21,  1.12it/s]

  Batch 600/3375 | Loss: 2.0143


Training:  18%|█████████▏                                        | 620/3375 [08:37<37:23,  1.23it/s]

  Batch 620/3375 | Loss: 2.0998


Training:  19%|█████████▍                                        | 640/3375 [08:53<36:19,  1.25it/s]

  Batch 640/3375 | Loss: 2.0903


Training:  20%|█████████▊                                        | 660/3375 [09:10<38:31,  1.17it/s]

  Batch 660/3375 | Loss: 2.0727


Training:  20%|██████████                                        | 680/3375 [09:26<35:57,  1.25it/s]

  Batch 680/3375 | Loss: 2.0833


Training:  21%|██████████▎                                       | 700/3375 [09:43<38:05,  1.17it/s]

  Batch 700/3375 | Loss: 2.0223


Training:  21%|██████████▋                                       | 720/3375 [10:00<36:32,  1.21it/s]

  Batch 720/3375 | Loss: 2.0244


Training:  22%|██████████▉                                       | 740/3375 [10:16<35:02,  1.25it/s]

  Batch 740/3375 | Loss: 1.9694


Training:  23%|███████████▎                                      | 760/3375 [10:33<38:45,  1.12it/s]

  Batch 760/3375 | Loss: 2.0047


Training:  23%|███████████▌                                      | 780/3375 [10:49<35:31,  1.22it/s]

  Batch 780/3375 | Loss: 1.9997


Training:  24%|███████████▊                                      | 800/3375 [11:06<34:00,  1.26it/s]

  Batch 800/3375 | Loss: 2.0283


Training:  24%|████████████▏                                     | 820/3375 [11:22<36:13,  1.18it/s]

  Batch 820/3375 | Loss: 2.0015


Training:  25%|████████████▍                                     | 840/3375 [11:39<34:42,  1.22it/s]

  Batch 840/3375 | Loss: 2.1751


Training:  25%|████████████▋                                     | 860/3375 [11:55<35:02,  1.20it/s]

  Batch 860/3375 | Loss: 2.0252


Training:  26%|█████████████                                     | 880/3375 [12:12<34:05,  1.22it/s]

  Batch 880/3375 | Loss: 2.0439


Training:  27%|█████████████▎                                    | 900/3375 [12:28<33:14,  1.24it/s]

  Batch 900/3375 | Loss: 2.0769


Training:  27%|█████████████▋                                    | 920/3375 [12:45<37:26,  1.09it/s]

  Batch 920/3375 | Loss: 2.0088


Training:  28%|█████████████▉                                    | 940/3375 [13:02<33:18,  1.22it/s]

  Batch 940/3375 | Loss: 1.9620


Training:  28%|██████████████▏                                   | 960/3375 [13:18<32:51,  1.22it/s]

  Batch 960/3375 | Loss: 2.0203


Training:  29%|██████████████▌                                   | 980/3375 [13:36<34:31,  1.16it/s]

  Batch 980/3375 | Loss: 1.9548


Training:  30%|██████████████▌                                  | 1000/3375 [13:52<32:04,  1.23it/s]

  Batch 1000/3375 | Loss: 2.0138


Training:  30%|██████████████▊                                  | 1020/3375 [14:09<32:46,  1.20it/s]

  Batch 1020/3375 | Loss: 1.9961


Training:  31%|███████████████                                  | 1040/3375 [14:25<32:05,  1.21it/s]

  Batch 1040/3375 | Loss: 1.9647


Training:  31%|███████████████▍                                 | 1060/3375 [14:42<31:28,  1.23it/s]

  Batch 1060/3375 | Loss: 2.0343


Training:  32%|███████████████▋                                 | 1080/3375 [14:59<35:05,  1.09it/s]

  Batch 1080/3375 | Loss: 2.0154


Training:  33%|███████████████▉                                 | 1100/3375 [15:16<31:10,  1.22it/s]

  Batch 1100/3375 | Loss: 1.9474


Training:  33%|████████████████▎                                | 1120/3375 [15:32<30:00,  1.25it/s]

  Batch 1120/3375 | Loss: 2.0216


Training:  34%|████████████████▌                                | 1140/3375 [15:49<32:07,  1.16it/s]

  Batch 1140/3375 | Loss: 1.9205


Training:  34%|████████████████▊                                | 1160/3375 [16:05<30:07,  1.23it/s]

  Batch 1160/3375 | Loss: 2.0039


Training:  35%|█████████████████▏                               | 1180/3375 [16:22<31:41,  1.15it/s]

  Batch 1180/3375 | Loss: 2.0097


Training:  36%|█████████████████▍                               | 1200/3375 [16:39<29:46,  1.22it/s]

  Batch 1200/3375 | Loss: 1.9186


Training:  36%|█████████████████▋                               | 1220/3375 [16:56<28:51,  1.24it/s]

  Batch 1220/3375 | Loss: 1.9522


Training:  37%|██████████████████                               | 1240/3375 [17:12<31:18,  1.14it/s]

  Batch 1240/3375 | Loss: 1.9766


Training:  37%|██████████████████▎                              | 1260/3375 [17:29<28:24,  1.24it/s]

  Batch 1260/3375 | Loss: 1.9503


Training:  38%|██████████████████▌                              | 1280/3375 [17:45<28:03,  1.24it/s]

  Batch 1280/3375 | Loss: 1.9742


Training:  39%|██████████████████▊                              | 1300/3375 [18:02<28:51,  1.20it/s]

  Batch 1300/3375 | Loss: 1.9897


Training:  39%|███████████████████▏                             | 1320/3375 [18:19<27:43,  1.23it/s]

  Batch 1320/3375 | Loss: 1.9143


Training:  40%|███████████████████▍                             | 1340/3375 [18:36<30:53,  1.10it/s]

  Batch 1340/3375 | Loss: 2.0648


Training:  40%|███████████████████▋                             | 1360/3375 [18:52<27:32,  1.22it/s]

  Batch 1360/3375 | Loss: 1.9627


Training:  41%|████████████████████                             | 1380/3375 [19:09<26:50,  1.24it/s]

  Batch 1380/3375 | Loss: 1.9056


Training:  41%|████████████████████▎                            | 1400/3375 [19:26<27:58,  1.18it/s]

  Batch 1400/3375 | Loss: 1.9427


Training:  42%|████████████████████▌                            | 1420/3375 [19:42<26:12,  1.24it/s]

  Batch 1420/3375 | Loss: 1.8804


Training:  43%|████████████████████▉                            | 1440/3375 [19:59<27:10,  1.19it/s]

  Batch 1440/3375 | Loss: 1.9594


Training:  43%|█████████████████████▏                           | 1460/3375 [20:15<26:29,  1.20it/s]

  Batch 1460/3375 | Loss: 1.9331


Training:  44%|█████████████████████▍                           | 1480/3375 [20:32<25:19,  1.25it/s]

  Batch 1480/3375 | Loss: 1.9416


Training:  44%|█████████████████████▊                           | 1500/3375 [20:49<28:07,  1.11it/s]

  Batch 1500/3375 | Loss: 1.9008


Training:  45%|██████████████████████                           | 1520/3375 [21:05<25:03,  1.23it/s]

  Batch 1520/3375 | Loss: 1.9403


Training:  46%|██████████████████████▎                          | 1540/3375 [21:22<24:38,  1.24it/s]

  Batch 1540/3375 | Loss: 1.9228


Training:  46%|██████████████████████▋                          | 1560/3375 [21:38<25:39,  1.18it/s]

  Batch 1560/3375 | Loss: 1.9247


Training:  47%|██████████████████████▉                          | 1580/3375 [21:55<24:21,  1.23it/s]

  Batch 1580/3375 | Loss: 1.8606


Training:  47%|███████████████████████▏                         | 1600/3375 [22:12<25:13,  1.17it/s]

  Batch 1600/3375 | Loss: 1.9587


Training:  48%|███████████████████████▌                         | 1620/3375 [22:28<23:52,  1.22it/s]

  Batch 1620/3375 | Loss: 1.9727


Training:  49%|███████████████████████▊                         | 1640/3375 [22:45<23:15,  1.24it/s]

  Batch 1640/3375 | Loss: 1.9601


Training:  49%|████████████████████████                         | 1660/3375 [23:02<24:49,  1.15it/s]

  Batch 1660/3375 | Loss: 1.8990


Training:  50%|████████████████████████▍                        | 1680/3375 [23:18<23:36,  1.20it/s]

  Batch 1680/3375 | Loss: 1.8740


Training:  50%|████████████████████████▋                        | 1700/3375 [23:35<22:20,  1.25it/s]

  Batch 1700/3375 | Loss: 1.8432


Training:  51%|████████████████████████▉                        | 1720/3375 [23:52<23:09,  1.19it/s]

  Batch 1720/3375 | Loss: 1.9346


Training:  52%|█████████████████████████▎                       | 1740/3375 [24:08<22:06,  1.23it/s]

  Batch 1740/3375 | Loss: 1.8626


Training:  52%|█████████████████████████▌                       | 1760/3375 [24:25<24:11,  1.11it/s]

  Batch 1760/3375 | Loss: 1.9908


Training:  53%|█████████████████████████▊                       | 1780/3375 [24:41<21:49,  1.22it/s]

  Batch 1780/3375 | Loss: 1.8468


Training:  53%|██████████████████████████▏                      | 1800/3375 [24:58<20:57,  1.25it/s]

  Batch 1800/3375 | Loss: 1.8835


Training:  54%|██████████████████████████▍                      | 1820/3375 [25:15<22:22,  1.16it/s]

  Batch 1820/3375 | Loss: 1.9552


Training:  55%|██████████████████████████▋                      | 1840/3375 [25:31<20:38,  1.24it/s]

  Batch 1840/3375 | Loss: 1.9549


Training:  55%|███████████████████████████                      | 1860/3375 [25:48<21:10,  1.19it/s]

  Batch 1860/3375 | Loss: 1.8773


Training:  56%|███████████████████████████▎                     | 1880/3375 [26:05<20:38,  1.21it/s]

  Batch 1880/3375 | Loss: 1.9240


Training:  56%|███████████████████████████▌                     | 1900/3375 [26:21<19:55,  1.23it/s]

  Batch 1900/3375 | Loss: 1.9319


Training:  57%|███████████████████████████▉                     | 1920/3375 [26:38<21:50,  1.11it/s]

  Batch 1920/3375 | Loss: 1.9283


Training:  57%|████████████████████████████▏                    | 1940/3375 [26:55<19:35,  1.22it/s]

  Batch 1940/3375 | Loss: 1.9181


Training:  58%|████████████████████████████▍                    | 1960/3375 [27:11<18:56,  1.24it/s]

  Batch 1960/3375 | Loss: 1.9866


Training:  59%|████████████████████████████▋                    | 1980/3375 [27:28<19:35,  1.19it/s]

  Batch 1980/3375 | Loss: 1.9466


Training:  59%|█████████████████████████████                    | 2000/3375 [27:45<18:21,  1.25it/s]

  Batch 2000/3375 | Loss: 1.8734


Training:  60%|█████████████████████████████▎                   | 2020/3375 [28:02<19:59,  1.13it/s]

  Batch 2020/3375 | Loss: 1.8350


Training:  60%|█████████████████████████████▌                   | 2040/3375 [28:18<18:17,  1.22it/s]

  Batch 2040/3375 | Loss: 1.8987


Training:  61%|█████████████████████████████▉                   | 2060/3375 [28:35<17:31,  1.25it/s]

  Batch 2060/3375 | Loss: 1.9153


Training:  62%|██████████████████████████████▏                  | 2080/3375 [28:52<18:57,  1.14it/s]

  Batch 2080/3375 | Loss: 1.8539


Training:  62%|██████████████████████████████▍                  | 2100/3375 [29:08<17:09,  1.24it/s]

  Batch 2100/3375 | Loss: 1.8534


Training:  63%|██████████████████████████████▊                  | 2120/3375 [29:25<16:42,  1.25it/s]

  Batch 2120/3375 | Loss: 1.8402


Training:  63%|███████████████████████████████                  | 2140/3375 [29:41<17:05,  1.20it/s]

  Batch 2140/3375 | Loss: 1.9688


Training:  64%|███████████████████████████████▎                 | 2160/3375 [29:58<16:19,  1.24it/s]

  Batch 2160/3375 | Loss: 1.9226


Training:  65%|███████████████████████████████▋                 | 2180/3375 [30:15<17:48,  1.12it/s]

  Batch 2180/3375 | Loss: 1.9102


Training:  65%|███████████████████████████████▉                 | 2200/3375 [30:31<16:09,  1.21it/s]

  Batch 2200/3375 | Loss: 1.7389


Training:  66%|████████████████████████████████▏                | 2220/3375 [30:48<15:28,  1.24it/s]

  Batch 2220/3375 | Loss: 1.9123


Training:  66%|████████████████████████████████▌                | 2240/3375 [31:05<16:11,  1.17it/s]

  Batch 2240/3375 | Loss: 1.7906


Training:  67%|████████████████████████████████▊                | 2260/3375 [31:21<15:19,  1.21it/s]

  Batch 2260/3375 | Loss: 1.9205


Training:  68%|█████████████████████████████████                | 2280/3375 [31:38<14:50,  1.23it/s]

  Batch 2280/3375 | Loss: 1.9150


Training:  68%|█████████████████████████████████▍               | 2300/3375 [31:55<14:55,  1.20it/s]

  Batch 2300/3375 | Loss: 1.8411


Training:  69%|█████████████████████████████████▋               | 2320/3375 [32:11<14:11,  1.24it/s]

  Batch 2320/3375 | Loss: 1.8698


Training:  69%|█████████████████████████████████▉               | 2340/3375 [32:28<15:35,  1.11it/s]

  Batch 2340/3375 | Loss: 1.7649


Training:  70%|██████████████████████████████████▎              | 2360/3375 [32:44<13:47,  1.23it/s]

  Batch 2360/3375 | Loss: 1.8799


Training:  71%|██████████████████████████████████▌              | 2380/3375 [33:01<13:26,  1.23it/s]

  Batch 2380/3375 | Loss: 1.7401


Training:  71%|██████████████████████████████████▊              | 2400/3375 [33:18<13:44,  1.18it/s]

  Batch 2400/3375 | Loss: 1.8501


Training:  72%|███████████████████████████████████▏             | 2420/3375 [33:34<12:55,  1.23it/s]

  Batch 2420/3375 | Loss: 1.7730


Training:  72%|███████████████████████████████████▍             | 2440/3375 [33:51<13:30,  1.15it/s]

  Batch 2440/3375 | Loss: 1.8353


Training:  73%|███████████████████████████████████▋             | 2460/3375 [34:08<12:31,  1.22it/s]

  Batch 2460/3375 | Loss: 1.8154


Training:  73%|████████████████████████████████████             | 2480/3375 [34:24<12:04,  1.23it/s]

  Batch 2480/3375 | Loss: 1.8236


Training:  74%|████████████████████████████████████▎            | 2500/3375 [34:41<12:44,  1.14it/s]

  Batch 2500/3375 | Loss: 1.8500


Training:  75%|████████████████████████████████████▌            | 2520/3375 [34:58<11:40,  1.22it/s]

  Batch 2520/3375 | Loss: 1.9373


Training:  75%|████████████████████████████████████▉            | 2540/3375 [35:14<11:14,  1.24it/s]

  Batch 2540/3375 | Loss: 1.8475


Training:  76%|█████████████████████████████████████▏           | 2560/3375 [35:31<11:22,  1.19it/s]

  Batch 2560/3375 | Loss: 1.7062


Training:  76%|█████████████████████████████████████▍           | 2580/3375 [35:48<10:58,  1.21it/s]

  Batch 2580/3375 | Loss: 1.8863


Training:  77%|█████████████████████████████████████▋           | 2600/3375 [36:04<11:41,  1.10it/s]

  Batch 2600/3375 | Loss: 1.9032


Training:  78%|██████████████████████████████████████           | 2620/3375 [36:21<10:19,  1.22it/s]

  Batch 2620/3375 | Loss: 1.8494


Training:  78%|██████████████████████████████████████▎          | 2640/3375 [36:37<09:47,  1.25it/s]

  Batch 2640/3375 | Loss: 1.8689


Training:  79%|██████████████████████████████████████▌          | 2660/3375 [36:54<10:15,  1.16it/s]

  Batch 2660/3375 | Loss: 1.8809


Training:  79%|██████████████████████████████████████▉          | 2680/3375 [37:11<09:28,  1.22it/s]

  Batch 2680/3375 | Loss: 1.8428


Training:  80%|███████████████████████████████████████▏         | 2700/3375 [37:28<09:36,  1.17it/s]

  Batch 2700/3375 | Loss: 1.8394


Training:  81%|███████████████████████████████████████▍         | 2720/3375 [37:44<08:59,  1.21it/s]

  Batch 2720/3375 | Loss: 1.8217


Training:  81%|███████████████████████████████████████▊         | 2740/3375 [38:01<08:34,  1.23it/s]

  Batch 2740/3375 | Loss: 1.8452


Training:  82%|████████████████████████████████████████         | 2760/3375 [38:18<09:06,  1.13it/s]

  Batch 2760/3375 | Loss: 1.8720


Training:  82%|████████████████████████████████████████▎        | 2780/3375 [38:35<08:05,  1.23it/s]

  Batch 2780/3375 | Loss: 1.8542


Training:  83%|████████████████████████████████████████▋        | 2800/3375 [38:51<07:45,  1.24it/s]

  Batch 2800/3375 | Loss: 1.8643


Training:  84%|████████████████████████████████████████▉        | 2820/3375 [39:08<07:43,  1.20it/s]

  Batch 2820/3375 | Loss: 1.7826


Training:  84%|█████████████████████████████████████████▏       | 2840/3375 [39:25<07:12,  1.24it/s]

  Batch 2840/3375 | Loss: 1.8642


Training:  85%|█████████████████████████████████████████▌       | 2860/3375 [39:42<07:45,  1.11it/s]

  Batch 2860/3375 | Loss: 1.7860


Training:  85%|█████████████████████████████████████████▊       | 2880/3375 [39:58<06:47,  1.21it/s]

  Batch 2880/3375 | Loss: 1.8939


Training:  86%|██████████████████████████████████████████       | 2900/3375 [40:15<06:24,  1.23it/s]

  Batch 2900/3375 | Loss: 1.8086


Training:  87%|██████████████████████████████████████████▍      | 2920/3375 [40:32<06:23,  1.19it/s]

  Batch 2920/3375 | Loss: 1.8291


Training:  87%|██████████████████████████████████████████▋      | 2940/3375 [40:49<05:58,  1.21it/s]

  Batch 2940/3375 | Loss: 1.7981


Training:  88%|██████████████████████████████████████████▉      | 2960/3375 [41:05<06:03,  1.14it/s]

  Batch 2960/3375 | Loss: 1.7992


Training:  88%|███████████████████████████████████████████▎     | 2980/3375 [41:22<05:25,  1.21it/s]

  Batch 2980/3375 | Loss: 1.7630


Training:  89%|███████████████████████████████████████████▌     | 3000/3375 [41:38<04:59,  1.25it/s]

  Batch 3000/3375 | Loss: 1.8603


Training:  89%|███████████████████████████████████████████▊     | 3020/3375 [41:55<05:03,  1.17it/s]

  Batch 3020/3375 | Loss: 1.7293


Training:  90%|████████████████████████████████████████████▏    | 3040/3375 [42:12<04:32,  1.23it/s]

  Batch 3040/3375 | Loss: 1.8323


Training:  91%|████████████████████████████████████████████▍    | 3060/3375 [42:28<04:15,  1.24it/s]

  Batch 3060/3375 | Loss: 1.7655


Training:  91%|████████████████████████████████████████████▋    | 3080/3375 [42:45<04:05,  1.20it/s]

  Batch 3080/3375 | Loss: 1.7528


Training:  92%|█████████████████████████████████████████████    | 3100/3375 [43:02<03:42,  1.24it/s]

  Batch 3100/3375 | Loss: 1.8156


Training:  92%|█████████████████████████████████████████████▎   | 3120/3375 [43:19<03:54,  1.09it/s]

  Batch 3120/3375 | Loss: 1.8231


Training:  93%|█████████████████████████████████████████████▌   | 3140/3375 [43:35<03:12,  1.22it/s]

  Batch 3140/3375 | Loss: 1.7880


Training:  94%|█████████████████████████████████████████████▉   | 3160/3375 [43:52<02:54,  1.24it/s]

  Batch 3160/3375 | Loss: 1.7613


Training:  94%|██████████████████████████████████████████████▏  | 3180/3375 [44:09<02:46,  1.17it/s]

  Batch 3180/3375 | Loss: 1.7712


Training:  95%|██████████████████████████████████████████████▍  | 3200/3375 [44:25<02:21,  1.23it/s]

  Batch 3200/3375 | Loss: 1.7964


Training:  95%|██████████████████████████████████████████████▋  | 3220/3375 [44:42<02:13,  1.16it/s]

  Batch 3220/3375 | Loss: 1.7882


Training:  96%|███████████████████████████████████████████████  | 3240/3375 [44:59<01:51,  1.21it/s]

  Batch 3240/3375 | Loss: 1.8197


Training:  97%|███████████████████████████████████████████████▎ | 3260/3375 [45:15<01:33,  1.24it/s]

  Batch 3260/3375 | Loss: 1.8146


Training:  97%|███████████████████████████████████████████████▌ | 3280/3375 [45:32<01:23,  1.13it/s]

  Batch 3280/3375 | Loss: 1.8806


Training:  98%|███████████████████████████████████████████████▉ | 3300/3375 [45:49<01:00,  1.23it/s]

  Batch 3300/3375 | Loss: 1.8172


Training:  98%|████████████████████████████████████████████████▏| 3320/3375 [46:05<00:44,  1.24it/s]

  Batch 3320/3375 | Loss: 1.8264


Training:  99%|████████████████████████████████████████████████▍| 3340/3375 [46:22<00:29,  1.19it/s]

  Batch 3340/3375 | Loss: 1.7985


Training: 100%|████████████████████████████████████████████████▊| 3360/3375 [46:39<00:12,  1.24it/s]

  Batch 3360/3375 | Loss: 1.8493


Training: 100%|█████████████████████████████████████████████████| 3375/3375 [46:51<00:00,  1.20it/s]
Validation: 100%|█████████████████████████████████████████████████| 375/375 [01:56<00:00,  3.21it/s]

✅ Model improved and saved.






--- Epoch 2 | Reveal Ratio: 0.45 ---


Training:   0%|                                                            | 0/3375 [00:00<?, ?it/s]

  Batch 0/3375 | Loss: 2.0125


Training:   1%|▎                                                  | 20/3375 [00:17<47:35,  1.17it/s]

  Batch 20/3375 | Loss: 1.9515


Training:   1%|▌                                                  | 40/3375 [00:34<48:39,  1.14it/s]

  Batch 40/3375 | Loss: 1.8668


Training:   2%|▉                                                  | 60/3375 [00:51<45:36,  1.21it/s]

  Batch 60/3375 | Loss: 1.9396


Training:   2%|█▏                                                 | 80/3375 [01:08<47:12,  1.16it/s]

  Batch 80/3375 | Loss: 1.8260


Training:   3%|█▍                                                | 100/3375 [01:25<46:01,  1.19it/s]

  Batch 100/3375 | Loss: 1.8651


Training:   4%|█▊                                                | 120/3375 [01:41<43:57,  1.23it/s]

  Batch 120/3375 | Loss: 1.9489


Training:   4%|██                                                | 140/3375 [01:58<47:03,  1.15it/s]

  Batch 140/3375 | Loss: 1.9866


Training:   5%|██▎                                               | 160/3375 [02:15<43:26,  1.23it/s]

  Batch 160/3375 | Loss: 1.9306


Training:   5%|██▋                                               | 180/3375 [02:32<46:31,  1.14it/s]

  Batch 180/3375 | Loss: 1.9510


Training:   6%|██▉                                               | 200/3375 [02:49<44:07,  1.20it/s]

  Batch 200/3375 | Loss: 1.9355


Training:   7%|███▎                                              | 220/3375 [03:05<42:56,  1.22it/s]

  Batch 220/3375 | Loss: 2.0284


Training:   7%|███▌                                              | 240/3375 [03:22<45:30,  1.15it/s]

  Batch 240/3375 | Loss: 1.8778


Training:   8%|███▊                                              | 260/3375 [03:39<42:38,  1.22it/s]

  Batch 260/3375 | Loss: 1.9151


Training:   8%|████▏                                             | 280/3375 [03:56<42:42,  1.21it/s]

  Batch 280/3375 | Loss: 1.9635


Training:   9%|████▍                                             | 300/3375 [04:13<42:52,  1.20it/s]

  Batch 300/3375 | Loss: 1.9653


Training:   9%|████▋                                             | 320/3375 [04:29<41:05,  1.24it/s]

  Batch 320/3375 | Loss: 1.9756


Training:  10%|█████                                             | 340/3375 [04:46<45:15,  1.12it/s]

  Batch 340/3375 | Loss: 1.8223


Training:  11%|█████▎                                            | 360/3375 [05:03<41:43,  1.20it/s]

  Batch 360/3375 | Loss: 1.9606


Training:  11%|█████▋                                            | 380/3375 [05:20<41:17,  1.21it/s]

  Batch 380/3375 | Loss: 1.9304


Training:  12%|█████▉                                            | 400/3375 [05:37<41:04,  1.21it/s]

  Batch 400/3375 | Loss: 2.0330


Training:  12%|██████▏                                           | 420/3375 [05:53<40:01,  1.23it/s]

  Batch 420/3375 | Loss: 1.8880


Training:  13%|██████▌                                           | 440/3375 [06:10<44:00,  1.11it/s]

  Batch 440/3375 | Loss: 1.8827


Training:  14%|██████▊                                           | 460/3375 [06:27<39:48,  1.22it/s]

  Batch 460/3375 | Loss: 1.9525


Training:  14%|███████                                           | 480/3375 [06:43<38:42,  1.25it/s]

  Batch 480/3375 | Loss: 1.9122


Training:  15%|███████▍                                          | 500/3375 [07:00<40:52,  1.17it/s]

  Batch 500/3375 | Loss: 1.8821


Training:  15%|███████▋                                          | 520/3375 [07:17<38:30,  1.24it/s]

  Batch 520/3375 | Loss: 1.9356


Training:  16%|████████                                          | 540/3375 [07:34<43:38,  1.08it/s]

  Batch 540/3375 | Loss: 1.9253


Training:  17%|████████▎                                         | 560/3375 [07:51<38:28,  1.22it/s]

  Batch 560/3375 | Loss: 1.9550


Training:  17%|████████▌                                         | 580/3375 [08:07<37:47,  1.23it/s]

  Batch 580/3375 | Loss: 2.0144


Training:  18%|████████▉                                         | 600/3375 [08:24<39:45,  1.16it/s]

  Batch 600/3375 | Loss: 2.0023


Training:  18%|█████████▏                                        | 620/3375 [08:41<37:21,  1.23it/s]

  Batch 620/3375 | Loss: 1.9431


Training:  19%|█████████▍                                        | 640/3375 [08:58<41:25,  1.10it/s]

  Batch 640/3375 | Loss: 1.9748


Training:  20%|█████████▊                                        | 660/3375 [09:14<37:05,  1.22it/s]

  Batch 660/3375 | Loss: 1.9008


Training:  20%|██████████                                        | 680/3375 [09:31<36:41,  1.22it/s]

  Batch 680/3375 | Loss: 1.9252


Training:  21%|██████████▎                                       | 700/3375 [09:48<38:01,  1.17it/s]

  Batch 700/3375 | Loss: 1.8770


Training:  21%|██████████▋                                       | 720/3375 [10:05<36:21,  1.22it/s]

  Batch 720/3375 | Loss: 1.9260


Training:  22%|██████████▉                                       | 740/3375 [10:22<38:12,  1.15it/s]

  Batch 740/3375 | Loss: 1.9329


Training:  23%|███████████▎                                      | 760/3375 [10:39<35:51,  1.22it/s]

  Batch 760/3375 | Loss: 2.0157


Training:  23%|███████████▌                                      | 780/3375 [10:55<35:24,  1.22it/s]

  Batch 780/3375 | Loss: 1.8754


Training:  24%|███████████▊                                      | 800/3375 [11:12<37:00,  1.16it/s]

  Batch 800/3375 | Loss: 1.9689


Training:  24%|████████████▏                                     | 820/3375 [11:29<34:36,  1.23it/s]

  Batch 820/3375 | Loss: 1.9256


Training:  25%|████████████▍                                     | 840/3375 [11:46<37:06,  1.14it/s]

  Batch 840/3375 | Loss: 1.9423


Training:  25%|████████████▋                                     | 860/3375 [12:03<34:44,  1.21it/s]

  Batch 860/3375 | Loss: 1.9652


Training:  26%|█████████████                                     | 880/3375 [12:19<33:48,  1.23it/s]

  Batch 880/3375 | Loss: 1.8994


Training:  27%|█████████████▎                                    | 900/3375 [12:36<36:33,  1.13it/s]

  Batch 900/3375 | Loss: 1.9037


Training:  27%|█████████████▋                                    | 920/3375 [12:53<33:30,  1.22it/s]

  Batch 920/3375 | Loss: 1.9439


Training:  28%|█████████████▉                                    | 940/3375 [13:09<33:27,  1.21it/s]

  Batch 940/3375 | Loss: 1.9191


Training:  28%|██████████████▏                                   | 960/3375 [13:26<33:36,  1.20it/s]

  Batch 960/3375 | Loss: 1.8737


Training:  29%|██████████████▌                                   | 980/3375 [13:43<32:31,  1.23it/s]

  Batch 980/3375 | Loss: 1.8133


Training:  30%|██████████████▌                                  | 1000/3375 [14:00<35:25,  1.12it/s]

  Batch 1000/3375 | Loss: 1.9048


Training:  30%|██████████████▊                                  | 1020/3375 [14:17<32:17,  1.22it/s]

  Batch 1020/3375 | Loss: 1.8675


Training:  31%|███████████████                                  | 1040/3375 [14:34<31:57,  1.22it/s]

  Batch 1040/3375 | Loss: 1.9356


Training:  31%|███████████████▍                                 | 1060/3375 [14:51<32:00,  1.21it/s]

  Batch 1060/3375 | Loss: 1.9161


Training:  32%|███████████████▋                                 | 1080/3375 [15:07<31:10,  1.23it/s]

  Batch 1080/3375 | Loss: 1.9094


Training:  33%|███████████████▉                                 | 1100/3375 [15:25<34:01,  1.11it/s]

  Batch 1100/3375 | Loss: 1.9602


Training:  33%|████████████████▎                                | 1120/3375 [15:41<30:56,  1.21it/s]

  Batch 1120/3375 | Loss: 1.9448


Training:  34%|████████████████▌                                | 1140/3375 [15:58<30:50,  1.21it/s]

  Batch 1140/3375 | Loss: 1.8287


Training:  34%|████████████████▊                                | 1160/3375 [16:15<30:54,  1.19it/s]

  Batch 1160/3375 | Loss: 1.9163


Training:  35%|█████████████████▏                               | 1180/3375 [16:32<29:50,  1.23it/s]

  Batch 1180/3375 | Loss: 1.8840


Training:  36%|█████████████████▍                               | 1200/3375 [16:49<31:51,  1.14it/s]

  Batch 1200/3375 | Loss: 1.8463


Training:  36%|█████████████████▋                               | 1220/3375 [17:05<29:26,  1.22it/s]

  Batch 1220/3375 | Loss: 1.9246


Training:  37%|██████████████████                               | 1240/3375 [17:22<28:42,  1.24it/s]

  Batch 1240/3375 | Loss: 1.9074


Training:  37%|██████████████████▎                              | 1260/3375 [17:39<30:04,  1.17it/s]

  Batch 1260/3375 | Loss: 1.8093


Training:  38%|██████████████████▌                              | 1280/3375 [17:56<28:30,  1.22it/s]

  Batch 1280/3375 | Loss: 1.9488


Training:  39%|██████████████████▊                              | 1300/3375 [18:13<31:04,  1.11it/s]

  Batch 1300/3375 | Loss: 1.8689


Training:  39%|███████████████████▏                             | 1320/3375 [18:29<28:14,  1.21it/s]

  Batch 1320/3375 | Loss: 1.8998


Training:  40%|███████████████████▍                             | 1340/3375 [18:46<27:24,  1.24it/s]

  Batch 1340/3375 | Loss: 1.8939


Training:  40%|███████████████████▋                             | 1360/3375 [19:03<28:35,  1.17it/s]

  Batch 1360/3375 | Loss: 1.8412


Training:  41%|████████████████████                             | 1380/3375 [19:20<27:10,  1.22it/s]

  Batch 1380/3375 | Loss: 1.8785


Training:  41%|████████████████████▎                            | 1400/3375 [19:37<29:39,  1.11it/s]

  Batch 1400/3375 | Loss: 1.8791


Training:  42%|████████████████████▌                            | 1420/3375 [19:54<26:48,  1.22it/s]

  Batch 1420/3375 | Loss: 1.9683


Training:  43%|████████████████████▉                            | 1440/3375 [20:10<25:45,  1.25it/s]

  Batch 1440/3375 | Loss: 1.9853


Training:  43%|█████████████████████▏                           | 1460/3375 [20:27<27:25,  1.16it/s]

  Batch 1460/3375 | Loss: 1.8863


Training:  44%|█████████████████████▍                           | 1480/3375 [20:44<25:34,  1.24it/s]

  Batch 1480/3375 | Loss: 1.8453


Training:  44%|█████████████████████▊                           | 1500/3375 [21:01<28:22,  1.10it/s]

  Batch 1500/3375 | Loss: 1.8884


Training:  45%|██████████████████████                           | 1520/3375 [21:17<25:30,  1.21it/s]

  Batch 1520/3375 | Loss: 1.9072


Training:  46%|██████████████████████▎                          | 1540/3375 [21:34<25:40,  1.19it/s]

  Batch 1540/3375 | Loss: 1.8607


Training:  46%|██████████████████████▋                          | 1560/3375 [21:51<26:01,  1.16it/s]

  Batch 1560/3375 | Loss: 1.8545


Training:  47%|██████████████████████▉                          | 1580/3375 [22:08<24:06,  1.24it/s]

  Batch 1580/3375 | Loss: 1.8395


Training:  47%|███████████████████████▏                         | 1600/3375 [22:24<25:35,  1.16it/s]

  Batch 1600/3375 | Loss: 2.0064


Training:  48%|███████████████████████▌                         | 1620/3375 [22:41<24:14,  1.21it/s]

  Batch 1620/3375 | Loss: 1.8408


Training:  49%|███████████████████████▊                         | 1640/3375 [22:58<23:39,  1.22it/s]

  Batch 1640/3375 | Loss: 1.9033


Training:  49%|████████████████████████                         | 1660/3375 [23:15<25:05,  1.14it/s]

  Batch 1660/3375 | Loss: 1.9100


Training:  50%|████████████████████████▍                        | 1680/3375 [23:31<22:56,  1.23it/s]

  Batch 1680/3375 | Loss: 1.8226


Training:  50%|████████████████████████▋                        | 1700/3375 [23:48<22:49,  1.22it/s]

  Batch 1700/3375 | Loss: 1.9818


Training:  51%|████████████████████████▉                        | 1720/3375 [24:05<23:20,  1.18it/s]

  Batch 1720/3375 | Loss: 1.8662


Training:  52%|█████████████████████████▎                       | 1740/3375 [24:21<22:06,  1.23it/s]

  Batch 1740/3375 | Loss: 1.9504


Training:  52%|█████████████████████████▌                       | 1760/3375 [24:39<24:36,  1.09it/s]

  Batch 1760/3375 | Loss: 1.8593


Training:  53%|█████████████████████████▊                       | 1780/3375 [24:55<21:48,  1.22it/s]

  Batch 1780/3375 | Loss: 1.9115


Training:  53%|██████████████████████████▏                      | 1800/3375 [25:12<21:20,  1.23it/s]

  Batch 1800/3375 | Loss: 2.0089


Training:  54%|██████████████████████████▍                      | 1820/3375 [25:29<21:50,  1.19it/s]

  Batch 1820/3375 | Loss: 1.8526


Training:  55%|██████████████████████████▋                      | 1840/3375 [25:45<20:53,  1.22it/s]

  Batch 1840/3375 | Loss: 1.8876


Training:  55%|███████████████████████████                      | 1860/3375 [26:03<23:12,  1.09it/s]

  Batch 1860/3375 | Loss: 1.9290


Training:  56%|███████████████████████████▎                     | 1880/3375 [26:19<20:27,  1.22it/s]

  Batch 1880/3375 | Loss: 1.9308


Training:  56%|███████████████████████████▌                     | 1900/3375 [26:36<19:55,  1.23it/s]

  Batch 1900/3375 | Loss: 1.8698


Training:  57%|███████████████████████████▉                     | 1920/3375 [26:53<20:26,  1.19it/s]

  Batch 1920/3375 | Loss: 1.9556


Training:  57%|████████████████████████████▏                    | 1940/3375 [27:09<19:25,  1.23it/s]

  Batch 1940/3375 | Loss: 1.7175


Training:  58%|████████████████████████████▍                    | 1960/3375 [27:26<20:51,  1.13it/s]

  Batch 1960/3375 | Loss: 1.9520


Training:  59%|████████████████████████████▋                    | 1980/3375 [27:43<19:14,  1.21it/s]

  Batch 1980/3375 | Loss: 1.8260


Training:  59%|█████████████████████████████                    | 2000/3375 [28:00<18:37,  1.23it/s]

  Batch 2000/3375 | Loss: 1.8978


Training:  60%|█████████████████████████████▎                   | 2020/3375 [28:17<19:15,  1.17it/s]

  Batch 2020/3375 | Loss: 1.8973


Training:  60%|█████████████████████████████▌                   | 2040/3375 [28:33<18:08,  1.23it/s]

  Batch 2040/3375 | Loss: 1.8884


Training:  61%|█████████████████████████████▉                   | 2060/3375 [28:50<18:53,  1.16it/s]

  Batch 2060/3375 | Loss: 1.9091


Training:  62%|██████████████████████████████▏                  | 2080/3375 [29:07<17:48,  1.21it/s]

  Batch 2080/3375 | Loss: 1.8788


Training:  62%|██████████████████████████████▍                  | 2100/3375 [29:23<17:09,  1.24it/s]

  Batch 2100/3375 | Loss: 1.8816


Training:  63%|██████████████████████████████▊                  | 2120/3375 [29:40<18:46,  1.11it/s]

  Batch 2120/3375 | Loss: 1.8476


Training:  63%|███████████████████████████████                  | 2140/3375 [29:57<16:50,  1.22it/s]

  Batch 2140/3375 | Loss: 1.8872


Training:  64%|███████████████████████████████▎                 | 2160/3375 [30:14<17:15,  1.17it/s]

  Batch 2160/3375 | Loss: 1.8604


Training:  65%|███████████████████████████████▋                 | 2180/3375 [30:31<16:38,  1.20it/s]

  Batch 2180/3375 | Loss: 1.8364


Training:  65%|███████████████████████████████▉                 | 2200/3375 [30:47<15:59,  1.22it/s]

  Batch 2200/3375 | Loss: 1.8292


Training:  66%|████████████████████████████████▏                | 2220/3375 [31:04<17:12,  1.12it/s]

  Batch 2220/3375 | Loss: 1.8701


Training:  66%|████████████████████████████████▌                | 2240/3375 [31:21<15:27,  1.22it/s]

  Batch 2240/3375 | Loss: 1.8920


Training:  67%|████████████████████████████████▊                | 2260/3375 [31:38<15:15,  1.22it/s]

  Batch 2260/3375 | Loss: 1.9208


Training:  68%|█████████████████████████████████                | 2280/3375 [31:55<15:17,  1.19it/s]

  Batch 2280/3375 | Loss: 1.8658


Training:  68%|█████████████████████████████████▍               | 2300/3375 [32:11<14:34,  1.23it/s]

  Batch 2300/3375 | Loss: 1.8601


Training:  69%|█████████████████████████████████▋               | 2320/3375 [32:28<15:31,  1.13it/s]

  Batch 2320/3375 | Loss: 1.8695


Training:  69%|█████████████████████████████████▉               | 2340/3375 [32:45<14:10,  1.22it/s]

  Batch 2340/3375 | Loss: 1.8789


Training:  70%|██████████████████████████████████▎              | 2360/3375 [33:02<13:44,  1.23it/s]

  Batch 2360/3375 | Loss: 1.9221


Training:  71%|██████████████████████████████████▌              | 2380/3375 [33:19<13:47,  1.20it/s]

  Batch 2380/3375 | Loss: 1.9142


Training:  71%|██████████████████████████████████▊              | 2400/3375 [33:35<13:02,  1.25it/s]

  Batch 2400/3375 | Loss: 1.8813


Training:  72%|███████████████████████████████████▏             | 2420/3375 [33:52<14:15,  1.12it/s]

  Batch 2420/3375 | Loss: 1.8484


Training:  72%|███████████████████████████████████▍             | 2440/3375 [34:09<13:41,  1.14it/s]

  Batch 2440/3375 | Loss: 1.8830


Training:  73%|███████████████████████████████████▋             | 2460/3375 [34:26<12:21,  1.23it/s]

  Batch 2460/3375 | Loss: 1.8374


Training:  73%|████████████████████████████████████             | 2480/3375 [34:43<12:26,  1.20it/s]

  Batch 2480/3375 | Loss: 1.8872


Training:  74%|████████████████████████████████████▎            | 2500/3375 [34:59<11:49,  1.23it/s]

  Batch 2500/3375 | Loss: 1.8437


Training:  75%|████████████████████████████████████▌            | 2520/3375 [35:16<13:16,  1.07it/s]

  Batch 2520/3375 | Loss: 1.8168


Training:  75%|████████████████████████████████████▉            | 2540/3375 [35:33<11:33,  1.20it/s]

  Batch 2540/3375 | Loss: 1.8744


Training:  76%|█████████████████████████████████████▏           | 2560/3375 [35:50<10:56,  1.24it/s]

  Batch 2560/3375 | Loss: 1.9284


Training:  76%|█████████████████████████████████████▍           | 2580/3375 [36:07<11:12,  1.18it/s]

  Batch 2580/3375 | Loss: 1.8508


Training:  77%|█████████████████████████████████████▋           | 2600/3375 [36:23<10:27,  1.23it/s]

  Batch 2600/3375 | Loss: 1.8252


Training:  78%|██████████████████████████████████████           | 2620/3375 [36:40<11:28,  1.10it/s]

  Batch 2620/3375 | Loss: 1.9505


Training:  78%|██████████████████████████████████████▎          | 2640/3375 [36:57<10:06,  1.21it/s]

  Batch 2640/3375 | Loss: 1.9166


Training:  79%|██████████████████████████████████████▌          | 2660/3375 [37:14<09:43,  1.22it/s]

  Batch 2660/3375 | Loss: 1.9721


Training:  79%|██████████████████████████████████████▉          | 2680/3375 [37:31<10:10,  1.14it/s]

  Batch 2680/3375 | Loss: 1.8868


Training:  80%|███████████████████████████████████████▏         | 2700/3375 [37:48<09:40,  1.16it/s]

  Batch 2700/3375 | Loss: 1.8813


Training:  81%|███████████████████████████████████████▍         | 2720/3375 [38:06<10:07,  1.08it/s]

  Batch 2720/3375 | Loss: 1.8956


Training:  81%|███████████████████████████████████████▊         | 2740/3375 [38:24<09:18,  1.14it/s]

  Batch 2740/3375 | Loss: 1.8092


Training:  82%|████████████████████████████████████████         | 2760/3375 [38:42<09:23,  1.09it/s]

  Batch 2760/3375 | Loss: 1.9178


Training:  82%|████████████████████████████████████████▎        | 2780/3375 [38:59<08:16,  1.20it/s]

  Batch 2780/3375 | Loss: 1.8312


Training:  83%|████████████████████████████████████████▋        | 2800/3375 [39:16<07:53,  1.21it/s]

  Batch 2800/3375 | Loss: 1.8910


Training:  84%|████████████████████████████████████████▉        | 2820/3375 [39:33<08:05,  1.14it/s]

  Batch 2820/3375 | Loss: 1.7985


Training:  84%|█████████████████████████████████████████▏       | 2840/3375 [39:50<07:23,  1.21it/s]

  Batch 2840/3375 | Loss: 1.8142


Training:  85%|█████████████████████████████████████████▌       | 2860/3375 [40:07<08:01,  1.07it/s]

  Batch 2860/3375 | Loss: 1.8334


Training:  85%|█████████████████████████████████████████▊       | 2880/3375 [40:24<06:51,  1.20it/s]

  Batch 2880/3375 | Loss: 1.8147


Training:  86%|██████████████████████████████████████████       | 2900/3375 [40:41<06:24,  1.24it/s]

  Batch 2900/3375 | Loss: 1.8525


Training:  87%|██████████████████████████████████████████▍      | 2920/3375 [40:58<06:28,  1.17it/s]

  Batch 2920/3375 | Loss: 1.8701


Training:  87%|██████████████████████████████████████████▋      | 2940/3375 [41:15<05:57,  1.22it/s]

  Batch 2940/3375 | Loss: 1.9273


Training:  88%|██████████████████████████████████████████▉      | 2960/3375 [41:32<06:10,  1.12it/s]

  Batch 2960/3375 | Loss: 1.8577


Training:  88%|███████████████████████████████████████████▎     | 2980/3375 [41:48<05:26,  1.21it/s]

  Batch 2980/3375 | Loss: 1.8822


Training:  89%|███████████████████████████████████████████▌     | 3000/3375 [42:05<04:58,  1.26it/s]

  Batch 3000/3375 | Loss: 1.8393


Training:  89%|███████████████████████████████████████████▊     | 3020/3375 [42:22<05:02,  1.17it/s]

  Batch 3020/3375 | Loss: 1.8313


Training:  90%|████████████████████████████████████████████▏    | 3040/3375 [42:39<04:41,  1.19it/s]

  Batch 3040/3375 | Loss: 1.8027


Training:  91%|████████████████████████████████████████████▍    | 3060/3375 [42:57<04:54,  1.07it/s]

  Batch 3060/3375 | Loss: 1.8698


Training:  91%|████████████████████████████████████████████▋    | 3080/3375 [43:13<04:06,  1.20it/s]

  Batch 3080/3375 | Loss: 1.8900


Training:  92%|█████████████████████████████████████████████    | 3100/3375 [43:30<03:45,  1.22it/s]

  Batch 3100/3375 | Loss: 1.8741


Training:  92%|█████████████████████████████████████████████▎   | 3120/3375 [43:48<03:36,  1.18it/s]

  Batch 3120/3375 | Loss: 1.7749


Training:  93%|█████████████████████████████████████████████▌   | 3140/3375 [44:04<03:10,  1.23it/s]

  Batch 3140/3375 | Loss: 1.8689


Training:  94%|█████████████████████████████████████████████▉   | 3160/3375 [44:21<03:16,  1.09it/s]

  Batch 3160/3375 | Loss: 1.8696


Training:  94%|██████████████████████████████████████████████▏  | 3180/3375 [44:38<02:41,  1.20it/s]

  Batch 3180/3375 | Loss: 1.8592


Training:  95%|██████████████████████████████████████████████▍  | 3200/3375 [44:55<02:22,  1.23it/s]

  Batch 3200/3375 | Loss: 1.8966


Training:  95%|██████████████████████████████████████████████▋  | 3220/3375 [45:12<02:12,  1.17it/s]

  Batch 3220/3375 | Loss: 1.8908


Training:  96%|███████████████████████████████████████████████  | 3240/3375 [45:29<01:50,  1.22it/s]

  Batch 3240/3375 | Loss: 1.8217


Training:  97%|███████████████████████████████████████████████▎ | 3260/3375 [45:46<01:45,  1.09it/s]

  Batch 3260/3375 | Loss: 1.8899


Training:  97%|███████████████████████████████████████████████▌ | 3280/3375 [46:02<01:18,  1.20it/s]

  Batch 3280/3375 | Loss: 1.8210


Training:  98%|███████████████████████████████████████████████▉ | 3300/3375 [46:19<01:01,  1.22it/s]

  Batch 3300/3375 | Loss: 1.8616


Training:  98%|████████████████████████████████████████████████▏| 3320/3375 [46:36<00:47,  1.16it/s]

  Batch 3320/3375 | Loss: 1.8999


Training:  99%|████████████████████████████████████████████████▍| 3340/3375 [46:53<00:28,  1.22it/s]

  Batch 3340/3375 | Loss: 1.8438


Training: 100%|████████████████████████████████████████████████▊| 3360/3375 [47:10<00:13,  1.09it/s]

  Batch 3360/3375 | Loss: 1.8460


Training: 100%|█████████████████████████████████████████████████| 3375/3375 [47:23<00:00,  1.19it/s]
Validation: 100%|█████████████████████████████████████████████████| 375/375 [01:57<00:00,  3.19it/s]

⚠️ No improvement. Patience: 1/5

--- Epoch 3 | Reveal Ratio: 0.35 ---



Training:   0%|                                                            | 0/3375 [00:00<?, ?it/s]

  Batch 0/3375 | Loss: 2.1068


Training:   1%|▎                                                  | 20/3375 [00:17<48:06,  1.16it/s]

  Batch 20/3375 | Loss: 2.0458


Training:   1%|▌                                                  | 40/3375 [00:33<45:11,  1.23it/s]

  Batch 40/3375 | Loss: 2.0649


Training:   2%|▉                                                  | 60/3375 [00:49<44:11,  1.25it/s]

  Batch 60/3375 | Loss: 2.1562


Training:   2%|█▏                                                 | 80/3375 [01:06<45:30,  1.21it/s]

  Batch 80/3375 | Loss: 2.1409


Training:   3%|█▍                                                | 100/3375 [01:23<44:54,  1.22it/s]

  Batch 100/3375 | Loss: 1.9878


Training:   4%|█▊                                                | 120/3375 [01:40<49:19,  1.10it/s]

  Batch 120/3375 | Loss: 2.0466


Training:   4%|██                                                | 140/3375 [01:57<44:55,  1.20it/s]

  Batch 140/3375 | Loss: 2.1283


Training:   5%|██▎                                               | 160/3375 [02:13<43:44,  1.22it/s]

  Batch 160/3375 | Loss: 1.9856


Training:   5%|██▋                                               | 180/3375 [02:30<44:57,  1.18it/s]

  Batch 180/3375 | Loss: 2.0207


Training:   6%|██▉                                               | 200/3375 [02:47<42:36,  1.24it/s]

  Batch 200/3375 | Loss: 2.1106


Training:   7%|███▎                                              | 220/3375 [03:03<44:32,  1.18it/s]

  Batch 220/3375 | Loss: 2.0346


Training:   7%|███▌                                              | 240/3375 [03:20<43:02,  1.21it/s]

  Batch 240/3375 | Loss: 2.0279


Training:   8%|███▊                                              | 260/3375 [03:36<42:02,  1.24it/s]

  Batch 260/3375 | Loss: 2.0670


Training:   8%|████▏                                             | 280/3375 [03:53<44:37,  1.16it/s]

  Batch 280/3375 | Loss: 2.0393


Training:   9%|████▍                                             | 300/3375 [04:10<41:43,  1.23it/s]

  Batch 300/3375 | Loss: 2.0238


Training:   9%|████▋                                             | 320/3375 [04:26<40:20,  1.26it/s]

  Batch 320/3375 | Loss: 2.0874


Training:  10%|█████                                             | 340/3375 [04:43<42:02,  1.20it/s]

  Batch 340/3375 | Loss: 1.9671


Training:  11%|█████▎                                            | 360/3375 [04:59<40:20,  1.25it/s]

  Batch 360/3375 | Loss: 2.0518


Training:  11%|█████▋                                            | 380/3375 [05:15<40:35,  1.23it/s]

  Batch 380/3375 | Loss: 1.9776


Training:  12%|█████▉                                            | 400/3375 [05:32<41:05,  1.21it/s]

  Batch 400/3375 | Loss: 2.0616


Training:  12%|██████▏                                           | 420/3375 [05:48<38:45,  1.27it/s]

  Batch 420/3375 | Loss: 1.9890


Training:  13%|██████▌                                           | 440/3375 [06:05<42:09,  1.16it/s]

  Batch 440/3375 | Loss: 2.0827


Training:  14%|██████▊                                           | 460/3375 [06:21<39:23,  1.23it/s]

  Batch 460/3375 | Loss: 2.0020


Training:  14%|███████                                           | 480/3375 [06:37<38:03,  1.27it/s]

  Batch 480/3375 | Loss: 2.0780


Training:  15%|███████▍                                          | 500/3375 [06:54<42:33,  1.13it/s]

  Batch 500/3375 | Loss: 2.0289


Training:  15%|███████▋                                          | 520/3375 [07:10<38:33,  1.23it/s]

  Batch 520/3375 | Loss: 2.0969


Training:  16%|████████                                          | 540/3375 [07:26<37:25,  1.26it/s]

  Batch 540/3375 | Loss: 2.0855


Training:  17%|████████▎                                         | 560/3375 [07:43<40:48,  1.15it/s]

  Batch 560/3375 | Loss: 2.1474


Training:  17%|████████▌                                         | 580/3375 [07:59<37:08,  1.25it/s]

  Batch 580/3375 | Loss: 2.0516


Training:  18%|████████▉                                         | 600/3375 [08:15<36:14,  1.28it/s]

  Batch 600/3375 | Loss: 1.9902


Training:  18%|█████████▏                                        | 620/3375 [08:32<38:43,  1.19it/s]

  Batch 620/3375 | Loss: 2.0738


Training:  19%|█████████▍                                        | 640/3375 [08:49<36:25,  1.25it/s]

  Batch 640/3375 | Loss: 2.0294


Training:  20%|█████████▊                                        | 660/3375 [09:05<36:13,  1.25it/s]

  Batch 660/3375 | Loss: 2.0246


Training:  20%|██████████                                        | 680/3375 [09:21<37:09,  1.21it/s]

  Batch 680/3375 | Loss: 2.0476


Training:  21%|██████████▎                                       | 700/3375 [09:38<35:37,  1.25it/s]

  Batch 700/3375 | Loss: 1.9975


Training:  21%|██████████▋                                       | 720/3375 [09:54<36:55,  1.20it/s]

  Batch 720/3375 | Loss: 1.9763


Training:  22%|██████████▉                                       | 740/3375 [10:11<36:05,  1.22it/s]

  Batch 740/3375 | Loss: 1.9831


Training:  23%|███████████▎                                      | 760/3375 [10:27<34:43,  1.25it/s]

  Batch 760/3375 | Loss: 2.0502


Training:  23%|███████████▌                                      | 780/3375 [10:43<38:45,  1.12it/s]

  Batch 780/3375 | Loss: 2.0633


Training:  24%|███████████▊                                      | 800/3375 [11:00<34:26,  1.25it/s]

  Batch 800/3375 | Loss: 2.0451


Training:  24%|████████████▏                                     | 820/3375 [11:16<33:40,  1.26it/s]

  Batch 820/3375 | Loss: 2.0269


Training:  25%|████████████▍                                     | 840/3375 [11:33<36:48,  1.15it/s]

  Batch 840/3375 | Loss: 2.0318


Training:  25%|████████████▋                                     | 860/3375 [11:49<33:37,  1.25it/s]

  Batch 860/3375 | Loss: 1.9909


Training:  26%|█████████████                                     | 880/3375 [12:05<33:16,  1.25it/s]

  Batch 880/3375 | Loss: 2.0375


Training:  27%|█████████████▎                                    | 900/3375 [12:22<34:25,  1.20it/s]

  Batch 900/3375 | Loss: 2.0651


Training:  27%|█████████████▋                                    | 920/3375 [12:38<33:04,  1.24it/s]

  Batch 920/3375 | Loss: 2.0776


Training:  28%|█████████████▉                                    | 940/3375 [12:54<32:08,  1.26it/s]

  Batch 940/3375 | Loss: 2.0325


Training:  28%|██████████████▏                                   | 960/3375 [13:11<33:50,  1.19it/s]

  Batch 960/3375 | Loss: 1.9160


Training:  29%|██████████████▌                                   | 980/3375 [13:27<32:07,  1.24it/s]

  Batch 980/3375 | Loss: 2.0813


Training:  30%|██████████████▌                                  | 1000/3375 [13:44<33:42,  1.17it/s]

  Batch 1000/3375 | Loss: 2.0743


Training:  30%|██████████████▊                                  | 1020/3375 [14:00<32:23,  1.21it/s]

  Batch 1020/3375 | Loss: 2.0270


Training:  31%|███████████████                                  | 1040/3375 [14:16<31:00,  1.26it/s]

  Batch 1040/3375 | Loss: 2.0216


Training:  31%|███████████████▍                                 | 1060/3375 [14:33<34:41,  1.11it/s]

  Batch 1060/3375 | Loss: 2.0023


Training:  32%|███████████████▋                                 | 1080/3375 [14:49<30:43,  1.24it/s]

  Batch 1080/3375 | Loss: 1.9877


Training:  33%|███████████████▉                                 | 1100/3375 [15:05<30:15,  1.25it/s]

  Batch 1100/3375 | Loss: 2.0791


Training:  33%|████████████████▎                                | 1120/3375 [15:22<33:07,  1.13it/s]

  Batch 1120/3375 | Loss: 1.9917


Training:  34%|████████████████▌                                | 1140/3375 [15:38<29:36,  1.26it/s]

  Batch 1140/3375 | Loss: 1.9547


Training:  34%|████████████████▊                                | 1160/3375 [15:54<29:11,  1.26it/s]

  Batch 1160/3375 | Loss: 2.0527


Training:  35%|█████████████████▏                               | 1180/3375 [16:11<31:05,  1.18it/s]

  Batch 1180/3375 | Loss: 1.9785


Training:  36%|█████████████████▍                               | 1200/3375 [16:27<29:50,  1.22it/s]

  Batch 1200/3375 | Loss: 2.0076


Training:  36%|█████████████████▋                               | 1220/3375 [16:43<28:27,  1.26it/s]

  Batch 1220/3375 | Loss: 1.9914


Training:  37%|██████████████████                               | 1240/3375 [17:00<29:21,  1.21it/s]

  Batch 1240/3375 | Loss: 2.0188


Training:  37%|██████████████████▎                              | 1260/3375 [17:16<28:01,  1.26it/s]

  Batch 1260/3375 | Loss: 2.0859


Training:  38%|██████████████████▌                              | 1280/3375 [17:33<28:58,  1.20it/s]

  Batch 1280/3375 | Loss: 2.0445


Training:  39%|██████████████████▊                              | 1300/3375 [17:49<28:20,  1.22it/s]

  Batch 1300/3375 | Loss: 2.0156


Training:  39%|███████████████████▏                             | 1320/3375 [18:05<27:17,  1.25it/s]

  Batch 1320/3375 | Loss: 2.0129


Training:  40%|███████████████████▍                             | 1340/3375 [18:22<28:59,  1.17it/s]

  Batch 1340/3375 | Loss: 1.9823


Training:  40%|███████████████████▋                             | 1360/3375 [18:38<26:56,  1.25it/s]

  Batch 1360/3375 | Loss: 2.0290


Training:  41%|████████████████████                             | 1380/3375 [18:55<26:30,  1.25it/s]

  Batch 1380/3375 | Loss: 2.0536


Training:  41%|████████████████████▎                            | 1400/3375 [19:11<29:15,  1.13it/s]

  Batch 1400/3375 | Loss: 2.0724


Training:  42%|████████████████████▌                            | 1420/3375 [19:27<26:16,  1.24it/s]

  Batch 1420/3375 | Loss: 2.0667


Training:  43%|████████████████████▉                            | 1440/3375 [19:44<25:51,  1.25it/s]

  Batch 1440/3375 | Loss: 2.0040


Training:  43%|█████████████████████▏                           | 1460/3375 [20:00<27:39,  1.15it/s]

  Batch 1460/3375 | Loss: 2.0154


Training:  44%|█████████████████████▍                           | 1480/3375 [20:17<25:29,  1.24it/s]

  Batch 1480/3375 | Loss: 1.9805


Training:  44%|█████████████████████▊                           | 1500/3375 [20:33<24:43,  1.26it/s]

  Batch 1500/3375 | Loss: 1.9799


Training:  45%|██████████████████████                           | 1520/3375 [20:50<26:14,  1.18it/s]

  Batch 1520/3375 | Loss: 1.9658


Training:  46%|██████████████████████▎                          | 1540/3375 [21:06<24:21,  1.26it/s]

  Batch 1540/3375 | Loss: 2.0667


Training:  46%|██████████████████████▋                          | 1560/3375 [21:22<24:21,  1.24it/s]

  Batch 1560/3375 | Loss: 2.1063


Training:  47%|██████████████████████▉                          | 1580/3375 [21:39<24:44,  1.21it/s]

  Batch 1580/3375 | Loss: 2.1172


Training:  47%|███████████████████████▏                         | 1600/3375 [21:55<23:23,  1.26it/s]

  Batch 1600/3375 | Loss: 2.0545


Training:  48%|███████████████████████▌                         | 1620/3375 [22:11<24:52,  1.18it/s]

  Batch 1620/3375 | Loss: 2.0633


Training:  49%|███████████████████████▊                         | 1640/3375 [22:28<23:48,  1.21it/s]

  Batch 1640/3375 | Loss: 2.1761


Training:  49%|████████████████████████                         | 1660/3375 [22:44<22:48,  1.25it/s]

  Batch 1660/3375 | Loss: 2.1653


Training:  50%|████████████████████████▍                        | 1680/3375 [23:01<26:12,  1.08it/s]

  Batch 1680/3375 | Loss: 2.0156


Training:  50%|████████████████████████▋                        | 1700/3375 [23:17<22:32,  1.24it/s]

  Batch 1700/3375 | Loss: 2.0653


Training:  51%|████████████████████████▉                        | 1720/3375 [23:33<21:56,  1.26it/s]

  Batch 1720/3375 | Loss: 2.0535


Training:  52%|█████████████████████████▎                       | 1740/3375 [23:50<23:44,  1.15it/s]

  Batch 1740/3375 | Loss: 2.0119


Training:  52%|█████████████████████████▌                       | 1760/3375 [24:06<21:22,  1.26it/s]

  Batch 1760/3375 | Loss: 2.0041


Training:  53%|█████████████████████████▊                       | 1780/3375 [24:23<21:19,  1.25it/s]

  Batch 1780/3375 | Loss: 2.0407


Training:  53%|██████████████████████████▏                      | 1800/3375 [24:39<21:56,  1.20it/s]

  Batch 1800/3375 | Loss: 1.9949


Training:  54%|██████████████████████████▍                      | 1820/3375 [24:55<20:57,  1.24it/s]

  Batch 1820/3375 | Loss: 1.9059


Training:  55%|██████████████████████████▋                      | 1840/3375 [25:12<20:18,  1.26it/s]

  Batch 1840/3375 | Loss: 2.0048


Training:  55%|███████████████████████████                      | 1860/3375 [25:28<20:58,  1.20it/s]

  Batch 1860/3375 | Loss: 2.0558


Training:  56%|███████████████████████████▎                     | 1880/3375 [25:44<19:44,  1.26it/s]

  Batch 1880/3375 | Loss: 1.9903


Training:  56%|███████████████████████████▌                     | 1900/3375 [26:01<20:53,  1.18it/s]

  Batch 1900/3375 | Loss: 2.0158


Training:  57%|███████████████████████████▉                     | 1920/3375 [26:17<19:48,  1.22it/s]

  Batch 1920/3375 | Loss: 2.0373


Training:  57%|████████████████████████████▏                    | 1940/3375 [26:34<18:52,  1.27it/s]

  Batch 1940/3375 | Loss: 1.9692


Training:  58%|████████████████████████████▍                    | 1960/3375 [26:50<20:46,  1.13it/s]

  Batch 1960/3375 | Loss: 2.0186


Training:  59%|████████████████████████████▋                    | 1980/3375 [27:06<18:41,  1.24it/s]

  Batch 1980/3375 | Loss: 1.9514


Training:  59%|█████████████████████████████                    | 2000/3375 [27:23<18:04,  1.27it/s]

  Batch 2000/3375 | Loss: 1.9995


Training:  60%|█████████████████████████████▎                   | 2020/3375 [27:39<19:44,  1.14it/s]

  Batch 2020/3375 | Loss: 2.0060


Training:  60%|█████████████████████████████▌                   | 2040/3375 [27:55<17:45,  1.25it/s]

  Batch 2040/3375 | Loss: 2.0049


Training:  61%|█████████████████████████████▉                   | 2060/3375 [28:12<17:22,  1.26it/s]

  Batch 2060/3375 | Loss: 2.1534


Training:  62%|██████████████████████████████▏                  | 2080/3375 [28:28<18:10,  1.19it/s]

  Batch 2080/3375 | Loss: 2.0327


Training:  62%|██████████████████████████████▍                  | 2100/3375 [28:45<17:43,  1.20it/s]

  Batch 2100/3375 | Loss: 2.0493


Training:  63%|██████████████████████████████▊                  | 2120/3375 [29:01<16:42,  1.25it/s]

  Batch 2120/3375 | Loss: 2.0408


Training:  63%|███████████████████████████████                  | 2140/3375 [29:18<16:56,  1.22it/s]

  Batch 2140/3375 | Loss: 1.9569


Training:  64%|███████████████████████████████▎                 | 2160/3375 [29:34<15:57,  1.27it/s]

  Batch 2160/3375 | Loss: 2.0364


Training:  65%|███████████████████████████████▋                 | 2180/3375 [29:50<16:54,  1.18it/s]

  Batch 2180/3375 | Loss: 2.0022


Training:  65%|███████████████████████████████▉                 | 2200/3375 [30:07<15:55,  1.23it/s]

  Batch 2200/3375 | Loss: 2.0927


Training:  66%|████████████████████████████████▏                | 2220/3375 [30:23<15:24,  1.25it/s]

  Batch 2220/3375 | Loss: 2.0587


Training:  66%|████████████████████████████████▌                | 2240/3375 [30:40<17:01,  1.11it/s]

  Batch 2240/3375 | Loss: 2.0781


Training:  67%|████████████████████████████████▊                | 2260/3375 [30:56<15:12,  1.22it/s]

  Batch 2260/3375 | Loss: 1.9713


Training:  68%|█████████████████████████████████                | 2280/3375 [31:12<14:34,  1.25it/s]

  Batch 2280/3375 | Loss: 1.8735


Training:  68%|█████████████████████████████████▍               | 2300/3375 [31:29<15:32,  1.15it/s]

  Batch 2300/3375 | Loss: 2.0143


Training:  69%|█████████████████████████████████▋               | 2320/3375 [31:45<13:54,  1.26it/s]

  Batch 2320/3375 | Loss: 2.0957


Training:  69%|█████████████████████████████████▉               | 2340/3375 [32:01<13:41,  1.26it/s]

  Batch 2340/3375 | Loss: 2.1225


Training:  70%|██████████████████████████████████▎              | 2360/3375 [32:18<14:12,  1.19it/s]

  Batch 2360/3375 | Loss: 2.1038


Training:  71%|██████████████████████████████████▌              | 2380/3375 [32:34<13:07,  1.26it/s]

  Batch 2380/3375 | Loss: 2.0499


Training:  71%|██████████████████████████████████▊              | 2400/3375 [32:50<12:45,  1.27it/s]

  Batch 2400/3375 | Loss: 2.1007


Training:  72%|███████████████████████████████████▏             | 2420/3375 [33:07<13:06,  1.21it/s]

  Batch 2420/3375 | Loss: 2.0406


Training:  72%|███████████████████████████████████▍             | 2440/3375 [33:23<12:28,  1.25it/s]

  Batch 2440/3375 | Loss: 2.0833


Training:  73%|███████████████████████████████████▋             | 2460/3375 [33:40<12:55,  1.18it/s]

  Batch 2460/3375 | Loss: 2.0938


Training:  73%|████████████████████████████████████             | 2480/3375 [33:56<12:20,  1.21it/s]

  Batch 2480/3375 | Loss: 2.0762


Training:  74%|████████████████████████████████████▎            | 2500/3375 [34:13<11:40,  1.25it/s]

  Batch 2500/3375 | Loss: 2.0591


Training:  75%|████████████████████████████████████▌            | 2520/3375 [34:29<12:44,  1.12it/s]

  Batch 2520/3375 | Loss: 2.0783


Training:  75%|████████████████████████████████████▉            | 2540/3375 [34:45<11:17,  1.23it/s]

  Batch 2540/3375 | Loss: 1.9944


Training:  76%|█████████████████████████████████████▏           | 2560/3375 [35:02<10:47,  1.26it/s]

  Batch 2560/3375 | Loss: 2.0013


Training:  76%|█████████████████████████████████████▍           | 2580/3375 [35:18<11:28,  1.16it/s]

  Batch 2580/3375 | Loss: 2.0096


Training:  77%|█████████████████████████████████████▋           | 2600/3375 [35:34<10:15,  1.26it/s]

  Batch 2600/3375 | Loss: 2.0435


Training:  78%|██████████████████████████████████████           | 2620/3375 [35:50<09:49,  1.28it/s]

  Batch 2620/3375 | Loss: 1.9771


Training:  78%|██████████████████████████████████████▎          | 2640/3375 [36:07<10:23,  1.18it/s]

  Batch 2640/3375 | Loss: 2.0604


Training:  79%|██████████████████████████████████████▌          | 2660/3375 [36:23<09:28,  1.26it/s]

  Batch 2660/3375 | Loss: 2.0247


Training:  79%|██████████████████████████████████████▉          | 2680/3375 [36:39<09:06,  1.27it/s]

  Batch 2680/3375 | Loss: 2.0063


Training:  80%|███████████████████████████████████████▏         | 2700/3375 [36:56<09:24,  1.20it/s]

  Batch 2700/3375 | Loss: 1.9691


Training:  81%|███████████████████████████████████████▍         | 2720/3375 [37:12<08:41,  1.26it/s]

  Batch 2720/3375 | Loss: 1.9643


Training:  81%|███████████████████████████████████████▊         | 2740/3375 [37:28<08:16,  1.28it/s]

  Batch 2740/3375 | Loss: 2.0247


Training:  82%|████████████████████████████████████████         | 2760/3375 [37:44<08:31,  1.20it/s]

  Batch 2760/3375 | Loss: 2.0549


Training:  82%|████████████████████████████████████████▎        | 2780/3375 [38:01<07:50,  1.27it/s]

  Batch 2780/3375 | Loss: 1.9509


Training:  83%|████████████████████████████████████████▋        | 2800/3375 [38:17<07:40,  1.25it/s]

  Batch 2800/3375 | Loss: 2.0774


Training:  84%|████████████████████████████████████████▉        | 2820/3375 [38:33<07:35,  1.22it/s]

  Batch 2820/3375 | Loss: 1.9782


Training:  84%|█████████████████████████████████████████▏       | 2840/3375 [38:49<07:03,  1.26it/s]

  Batch 2840/3375 | Loss: 2.0462


Training:  85%|█████████████████████████████████████████▌       | 2860/3375 [39:06<07:18,  1.17it/s]

  Batch 2860/3375 | Loss: 2.0480


Training:  85%|█████████████████████████████████████████▊       | 2880/3375 [39:22<06:37,  1.24it/s]

  Batch 2880/3375 | Loss: 2.0001


Training:  86%|██████████████████████████████████████████       | 2900/3375 [39:38<06:15,  1.26it/s]

  Batch 2900/3375 | Loss: 2.0161


Training:  87%|██████████████████████████████████████████▍      | 2920/3375 [39:55<06:42,  1.13it/s]

  Batch 2920/3375 | Loss: 2.0130


Training:  87%|██████████████████████████████████████████▋      | 2940/3375 [40:11<05:50,  1.24it/s]

  Batch 2940/3375 | Loss: 2.0002


Training:  88%|██████████████████████████████████████████▉      | 2960/3375 [40:27<05:26,  1.27it/s]

  Batch 2960/3375 | Loss: 2.1256


Training:  88%|███████████████████████████████████████████▎     | 2980/3375 [40:44<05:50,  1.13it/s]

  Batch 2980/3375 | Loss: 1.9198


Training:  89%|███████████████████████████████████████████▌     | 3000/3375 [41:00<05:00,  1.25it/s]

  Batch 3000/3375 | Loss: 2.0249


Training:  89%|███████████████████████████████████████████▊     | 3020/3375 [41:16<04:37,  1.28it/s]

  Batch 3020/3375 | Loss: 2.0720


Training:  90%|████████████████████████████████████████████▏    | 3040/3375 [41:33<04:47,  1.17it/s]

  Batch 3040/3375 | Loss: 1.9702


Training:  91%|████████████████████████████████████████████▍    | 3060/3375 [41:49<04:12,  1.25it/s]

  Batch 3060/3375 | Loss: 2.0879


Training:  91%|████████████████████████████████████████████▋    | 3080/3375 [42:05<03:51,  1.27it/s]

  Batch 3080/3375 | Loss: 2.1010


Training:  92%|█████████████████████████████████████████████    | 3100/3375 [42:21<03:47,  1.21it/s]

  Batch 3100/3375 | Loss: 1.9817


Training:  92%|█████████████████████████████████████████████▎   | 3120/3375 [42:37<03:22,  1.26it/s]

  Batch 3120/3375 | Loss: 2.0361


Training:  93%|█████████████████████████████████████████████▌   | 3140/3375 [42:53<03:04,  1.27it/s]

  Batch 3140/3375 | Loss: 2.0445


Training:  94%|█████████████████████████████████████████████▉   | 3160/3375 [43:10<02:58,  1.20it/s]

  Batch 3160/3375 | Loss: 2.0099


Training:  94%|██████████████████████████████████████████████▏  | 3180/3375 [43:26<02:35,  1.25it/s]

  Batch 3180/3375 | Loss: 2.0041


Training:  95%|██████████████████████████████████████████████▍  | 3200/3375 [43:42<02:20,  1.24it/s]

  Batch 3200/3375 | Loss: 2.0282


Training:  95%|██████████████████████████████████████████████▋  | 3220/3375 [43:59<02:07,  1.21it/s]

  Batch 3220/3375 | Loss: 1.9851


Training:  96%|███████████████████████████████████████████████  | 3240/3375 [44:15<01:48,  1.24it/s]

  Batch 3240/3375 | Loss: 2.0393


Training:  97%|███████████████████████████████████████████████▎ | 3260/3375 [44:31<01:35,  1.21it/s]

  Batch 3260/3375 | Loss: 1.9748


Training:  97%|███████████████████████████████████████████████▌ | 3280/3375 [44:48<01:17,  1.22it/s]

  Batch 3280/3375 | Loss: 1.9724


Training:  98%|███████████████████████████████████████████████▉ | 3300/3375 [45:04<00:59,  1.26it/s]

  Batch 3300/3375 | Loss: 2.0167


Training:  98%|████████████████████████████████████████████████▏| 3320/3375 [45:20<00:48,  1.14it/s]

  Batch 3320/3375 | Loss: 1.9765


Training:  99%|████████████████████████████████████████████████▍| 3340/3375 [45:36<00:28,  1.25it/s]

  Batch 3340/3375 | Loss: 1.9421


Training: 100%|████████████████████████████████████████████████▊| 3360/3375 [45:53<00:11,  1.27it/s]

  Batch 3360/3375 | Loss: 1.9912


Training: 100%|█████████████████████████████████████████████████| 3375/3375 [46:05<00:00,  1.22it/s]
Validation: 100%|█████████████████████████████████████████████████| 375/375 [01:54<00:00,  3.28it/s]

⚠️ No improvement. Patience: 2/5

--- Epoch 4 | Reveal Ratio: 0.25 ---



Training:   0%|                                                            | 0/3375 [00:00<?, ?it/s]

  Batch 0/3375 | Loss: 2.1588


Training:   1%|▎                                                  | 20/3375 [00:15<42:19,  1.32it/s]

  Batch 20/3375 | Loss: 2.2450


Training:   1%|▌                                                  | 40/3375 [00:30<41:21,  1.34it/s]

  Batch 40/3375 | Loss: 2.1714


Training:   2%|▉                                                  | 60/3375 [00:46<44:20,  1.25it/s]

  Batch 60/3375 | Loss: 2.2082


Training:   2%|█▏                                                 | 80/3375 [01:01<43:04,  1.28it/s]

  Batch 80/3375 | Loss: 2.1753


Training:   3%|█▍                                                | 100/3375 [01:17<40:25,  1.35it/s]

  Batch 100/3375 | Loss: 2.1540


Training:   4%|█▊                                                | 120/3375 [01:32<40:23,  1.34it/s]

  Batch 120/3375 | Loss: 2.1599


Training:   4%|██                                                | 140/3375 [01:48<43:52,  1.23it/s]

  Batch 140/3375 | Loss: 2.1976


Training:   5%|██▎                                               | 160/3375 [02:03<40:43,  1.32it/s]

  Batch 160/3375 | Loss: 2.2112


Training:   5%|██▋                                               | 180/3375 [02:18<39:47,  1.34it/s]

  Batch 180/3375 | Loss: 2.2072


Training:   6%|██▉                                               | 200/3375 [02:34<41:54,  1.26it/s]

  Batch 200/3375 | Loss: 2.1416


Training:   7%|███▎                                              | 220/3375 [02:49<40:40,  1.29it/s]

  Batch 220/3375 | Loss: 2.1315


Training:   7%|███▌                                              | 240/3375 [03:05<39:36,  1.32it/s]

  Batch 240/3375 | Loss: 2.2175


Training:   8%|███▊                                              | 260/3375 [03:20<38:59,  1.33it/s]

  Batch 260/3375 | Loss: 2.0882


Training:   8%|████▏                                             | 280/3375 [03:36<43:32,  1.18it/s]

  Batch 280/3375 | Loss: 2.1722


Training:   9%|████▍                                             | 300/3375 [03:51<38:45,  1.32it/s]

  Batch 300/3375 | Loss: 2.2226


Training:   9%|████▋                                             | 320/3375 [04:07<37:43,  1.35it/s]

  Batch 320/3375 | Loss: 2.1693


Training:  10%|█████                                             | 340/3375 [04:22<38:11,  1.32it/s]

  Batch 340/3375 | Loss: 2.1893


Training:  11%|█████▎                                            | 360/3375 [04:38<39:28,  1.27it/s]

  Batch 360/3375 | Loss: 2.1865


Training:  11%|█████▋                                            | 380/3375 [04:53<37:51,  1.32it/s]

  Batch 380/3375 | Loss: 2.0582


Training:  12%|█████▉                                            | 400/3375 [05:09<37:04,  1.34it/s]

  Batch 400/3375 | Loss: 2.1485


Training:  12%|██████▏                                           | 420/3375 [05:24<42:21,  1.16it/s]

  Batch 420/3375 | Loss: 2.1411


Training:  13%|██████▌                                           | 440/3375 [05:40<37:43,  1.30it/s]

  Batch 440/3375 | Loss: 2.0810


Training:  14%|██████▊                                           | 460/3375 [05:55<37:00,  1.31it/s]

  Batch 460/3375 | Loss: 2.1091


Training:  14%|███████                                           | 480/3375 [06:10<36:34,  1.32it/s]

  Batch 480/3375 | Loss: 2.1306


Training:  15%|███████▍                                          | 500/3375 [06:26<38:10,  1.26it/s]

  Batch 500/3375 | Loss: 2.2338


Training:  15%|███████▋                                          | 520/3375 [06:42<36:12,  1.31it/s]

  Batch 520/3375 | Loss: 2.1910


Training:  16%|████████                                          | 540/3375 [06:57<35:12,  1.34it/s]

  Batch 540/3375 | Loss: 2.1419


Training:  17%|████████▎                                         | 560/3375 [07:13<39:52,  1.18it/s]

  Batch 560/3375 | Loss: 2.1561


Training:  17%|████████▌                                         | 580/3375 [07:28<35:36,  1.31it/s]

  Batch 580/3375 | Loss: 2.1557


Training:  18%|████████▉                                         | 600/3375 [07:44<34:53,  1.33it/s]

  Batch 600/3375 | Loss: 2.1579


Training:  18%|█████████▏                                        | 620/3375 [07:59<34:44,  1.32it/s]

  Batch 620/3375 | Loss: 2.1193


Training:  19%|█████████▍                                        | 640/3375 [08:15<36:19,  1.26it/s]

  Batch 640/3375 | Loss: 2.1292


Training:  20%|█████████▊                                        | 660/3375 [08:30<34:10,  1.32it/s]

  Batch 660/3375 | Loss: 2.1806


Training:  20%|██████████                                        | 680/3375 [08:46<33:32,  1.34it/s]

  Batch 680/3375 | Loss: 2.1640


Training:  21%|██████████▎                                       | 700/3375 [09:01<37:40,  1.18it/s]

  Batch 700/3375 | Loss: 2.2211


Training:  21%|██████████▋                                       | 720/3375 [09:17<34:21,  1.29it/s]

  Batch 720/3375 | Loss: 2.0909


Training:  22%|██████████▉                                       | 740/3375 [09:32<33:00,  1.33it/s]

  Batch 740/3375 | Loss: 2.1674


Training:  23%|███████████▎                                      | 760/3375 [09:48<32:41,  1.33it/s]

  Batch 760/3375 | Loss: 2.2030


Training:  23%|███████████▌                                      | 780/3375 [10:03<34:17,  1.26it/s]

  Batch 780/3375 | Loss: 2.0962


Training:  24%|███████████▊                                      | 800/3375 [10:19<32:30,  1.32it/s]

  Batch 800/3375 | Loss: 2.1720


Training:  24%|████████████▏                                     | 820/3375 [10:34<31:49,  1.34it/s]

  Batch 820/3375 | Loss: 2.1024


Training:  25%|████████████▍                                     | 840/3375 [10:50<35:11,  1.20it/s]

  Batch 840/3375 | Loss: 2.1395


Training:  25%|████████████▋                                     | 860/3375 [11:05<32:05,  1.31it/s]

  Batch 860/3375 | Loss: 2.0880


Training:  26%|█████████████                                     | 880/3375 [11:21<31:16,  1.33it/s]

  Batch 880/3375 | Loss: 2.1418


Training:  27%|█████████████▎                                    | 900/3375 [11:36<30:54,  1.33it/s]

  Batch 900/3375 | Loss: 2.0878


Training:  27%|█████████████▋                                    | 920/3375 [11:52<33:17,  1.23it/s]

  Batch 920/3375 | Loss: 2.1125


Training:  28%|█████████████▉                                    | 940/3375 [12:07<30:43,  1.32it/s]

  Batch 940/3375 | Loss: 2.1465


Training:  28%|██████████████▏                                   | 960/3375 [12:23<30:25,  1.32it/s]

  Batch 960/3375 | Loss: 2.1574


Training:  29%|██████████████▌                                   | 980/3375 [12:38<32:47,  1.22it/s]

  Batch 980/3375 | Loss: 2.1704


Training:  30%|██████████████▌                                  | 1000/3375 [12:54<30:26,  1.30it/s]

  Batch 1000/3375 | Loss: 2.1708


Training:  30%|██████████████▊                                  | 1020/3375 [13:09<29:59,  1.31it/s]

  Batch 1020/3375 | Loss: 2.1756


Training:  31%|███████████████                                  | 1040/3375 [13:25<29:31,  1.32it/s]

  Batch 1040/3375 | Loss: 2.1533


Training:  31%|███████████████▍                                 | 1060/3375 [13:41<31:06,  1.24it/s]

  Batch 1060/3375 | Loss: 2.1658


Training:  32%|███████████████▋                                 | 1080/3375 [13:56<29:06,  1.31it/s]

  Batch 1080/3375 | Loss: 2.1652


Training:  33%|███████████████▉                                 | 1100/3375 [14:12<28:44,  1.32it/s]

  Batch 1100/3375 | Loss: 2.1642


Training:  33%|████████████████▎                                | 1120/3375 [14:27<31:10,  1.21it/s]

  Batch 1120/3375 | Loss: 2.2557


Training:  34%|████████████████▌                                | 1140/3375 [14:43<28:59,  1.29it/s]

  Batch 1140/3375 | Loss: 2.1496


Training:  34%|████████████████▊                                | 1160/3375 [14:58<27:54,  1.32it/s]

  Batch 1160/3375 | Loss: 2.2333


Training:  35%|█████████████████▏                               | 1180/3375 [15:14<27:19,  1.34it/s]

  Batch 1180/3375 | Loss: 2.1893


Training:  36%|█████████████████▍                               | 1200/3375 [15:30<29:10,  1.24it/s]

  Batch 1200/3375 | Loss: 2.1340


Training:  36%|█████████████████▋                               | 1220/3375 [15:45<27:43,  1.30it/s]

  Batch 1220/3375 | Loss: 2.0547


Training:  37%|██████████████████                               | 1240/3375 [16:01<26:47,  1.33it/s]

  Batch 1240/3375 | Loss: 2.2546


Training:  37%|██████████████████▎                              | 1260/3375 [16:17<29:59,  1.18it/s]

  Batch 1260/3375 | Loss: 2.1693


Training:  38%|██████████████████▌                              | 1280/3375 [16:32<26:57,  1.30it/s]

  Batch 1280/3375 | Loss: 2.0837


Training:  39%|██████████████████▊                              | 1300/3375 [16:47<26:00,  1.33it/s]

  Batch 1300/3375 | Loss: 2.1616


Training:  39%|███████████████████▏                             | 1320/3375 [17:03<25:45,  1.33it/s]

  Batch 1320/3375 | Loss: 2.2323


Training:  40%|███████████████████▍                             | 1340/3375 [17:19<27:11,  1.25it/s]

  Batch 1340/3375 | Loss: 2.2726


Training:  40%|███████████████████▋                             | 1360/3375 [17:34<25:37,  1.31it/s]

  Batch 1360/3375 | Loss: 2.1402


Training:  41%|████████████████████                             | 1380/3375 [17:50<24:58,  1.33it/s]

  Batch 1380/3375 | Loss: 2.2327


Training:  41%|████████████████████▎                            | 1400/3375 [18:05<27:26,  1.20it/s]

  Batch 1400/3375 | Loss: 2.1384


Training:  42%|████████████████████▌                            | 1420/3375 [18:21<25:08,  1.30it/s]

  Batch 1420/3375 | Loss: 2.1944


Training:  43%|████████████████████▉                            | 1440/3375 [18:36<24:29,  1.32it/s]

  Batch 1440/3375 | Loss: 2.1285


Training:  43%|█████████████████████▏                           | 1460/3375 [18:52<23:52,  1.34it/s]

  Batch 1460/3375 | Loss: 2.0637


Training:  44%|█████████████████████▍                           | 1480/3375 [19:08<25:30,  1.24it/s]

  Batch 1480/3375 | Loss: 2.1835


Training:  44%|█████████████████████▊                           | 1500/3375 [19:23<23:51,  1.31it/s]

  Batch 1500/3375 | Loss: 2.1469


Training:  45%|██████████████████████                           | 1520/3375 [19:39<23:19,  1.33it/s]

  Batch 1520/3375 | Loss: 2.2072


Training:  46%|██████████████████████▎                          | 1540/3375 [19:55<25:45,  1.19it/s]

  Batch 1540/3375 | Loss: 2.1124


Training:  46%|██████████████████████▋                          | 1560/3375 [20:10<23:20,  1.30it/s]

  Batch 1560/3375 | Loss: 2.2496


Training:  47%|██████████████████████▉                          | 1580/3375 [20:25<22:19,  1.34it/s]

  Batch 1580/3375 | Loss: 2.1017


Training:  47%|███████████████████████▏                         | 1600/3375 [20:41<22:10,  1.33it/s]

  Batch 1600/3375 | Loss: 2.1486


Training:  48%|███████████████████████▌                         | 1620/3375 [20:57<23:10,  1.26it/s]

  Batch 1620/3375 | Loss: 2.1845


Training:  49%|███████████████████████▊                         | 1640/3375 [21:12<21:55,  1.32it/s]

  Batch 1640/3375 | Loss: 2.1014


Training:  49%|████████████████████████                         | 1660/3375 [21:27<21:32,  1.33it/s]

  Batch 1660/3375 | Loss: 2.1286


Training:  50%|████████████████████████▍                        | 1680/3375 [21:43<23:04,  1.22it/s]

  Batch 1680/3375 | Loss: 2.1693


Training:  50%|████████████████████████▋                        | 1700/3375 [21:59<21:55,  1.27it/s]

  Batch 1700/3375 | Loss: 2.1875


Training:  51%|████████████████████████▉                        | 1720/3375 [22:14<20:36,  1.34it/s]

  Batch 1720/3375 | Loss: 2.1561


Training:  52%|█████████████████████████▎                       | 1740/3375 [22:29<20:23,  1.34it/s]

  Batch 1740/3375 | Loss: 2.1838


Training:  52%|█████████████████████████▌                       | 1760/3375 [22:45<22:08,  1.22it/s]

  Batch 1760/3375 | Loss: 2.1672


Training:  53%|█████████████████████████▊                       | 1780/3375 [23:01<20:10,  1.32it/s]

  Batch 1780/3375 | Loss: 2.1947


Training:  53%|██████████████████████████▏                      | 1800/3375 [23:16<19:32,  1.34it/s]

  Batch 1800/3375 | Loss: 2.1716


Training:  54%|██████████████████████████▍                      | 1820/3375 [23:32<20:55,  1.24it/s]

  Batch 1820/3375 | Loss: 2.1719


Training:  55%|██████████████████████████▋                      | 1840/3375 [23:47<19:59,  1.28it/s]

  Batch 1840/3375 | Loss: 2.1626


Training:  55%|███████████████████████████                      | 1860/3375 [24:03<19:21,  1.30it/s]

  Batch 1860/3375 | Loss: 2.1985


Training:  56%|███████████████████████████▎                     | 1880/3375 [24:18<18:37,  1.34it/s]

  Batch 1880/3375 | Loss: 2.2285


Training:  56%|███████████████████████████▌                     | 1900/3375 [24:34<20:30,  1.20it/s]

  Batch 1900/3375 | Loss: 2.1652


Training:  57%|███████████████████████████▉                     | 1920/3375 [24:50<18:35,  1.30it/s]

  Batch 1920/3375 | Loss: 2.1582


Training:  57%|████████████████████████████▏                    | 1940/3375 [25:05<17:55,  1.33it/s]

  Batch 1940/3375 | Loss: 2.1236


Training:  58%|████████████████████████████▍                    | 1960/3375 [25:21<18:58,  1.24it/s]

  Batch 1960/3375 | Loss: 2.0906


Training:  59%|████████████████████████████▋                    | 1980/3375 [25:36<18:18,  1.27it/s]

  Batch 1980/3375 | Loss: 2.1998


Training:  59%|█████████████████████████████                    | 2000/3375 [25:52<17:21,  1.32it/s]

  Batch 2000/3375 | Loss: 2.1468


Training:  60%|█████████████████████████████▎                   | 2020/3375 [26:07<17:02,  1.33it/s]

  Batch 2020/3375 | Loss: 2.2096


Training:  60%|█████████████████████████████▌                   | 2040/3375 [26:23<18:02,  1.23it/s]

  Batch 2040/3375 | Loss: 2.0973


Training:  61%|█████████████████████████████▉                   | 2060/3375 [26:39<17:02,  1.29it/s]

  Batch 2060/3375 | Loss: 2.0846


Training:  62%|██████████████████████████████▏                  | 2080/3375 [26:54<16:18,  1.32it/s]

  Batch 2080/3375 | Loss: 2.1201


Training:  62%|██████████████████████████████▍                  | 2100/3375 [27:10<17:05,  1.24it/s]

  Batch 2100/3375 | Loss: 2.1344


Training:  63%|██████████████████████████████▊                  | 2120/3375 [27:26<16:12,  1.29it/s]

  Batch 2120/3375 | Loss: 2.2040


Training:  63%|███████████████████████████████                  | 2140/3375 [27:41<15:39,  1.31it/s]

  Batch 2140/3375 | Loss: 2.1599


Training:  64%|███████████████████████████████▎                 | 2160/3375 [27:56<15:12,  1.33it/s]

  Batch 2160/3375 | Loss: 2.1580


Training:  65%|███████████████████████████████▋                 | 2180/3375 [28:12<16:55,  1.18it/s]

  Batch 2180/3375 | Loss: 2.0490


Training:  65%|███████████████████████████████▉                 | 2200/3375 [28:27<14:55,  1.31it/s]

  Batch 2200/3375 | Loss: 2.1805


Training:  66%|████████████████████████████████▏                | 2220/3375 [28:43<14:32,  1.32it/s]

  Batch 2220/3375 | Loss: 2.1898


Training:  66%|████████████████████████████████▌                | 2240/3375 [28:58<14:37,  1.29it/s]

  Batch 2240/3375 | Loss: 2.1560


Training:  67%|████████████████████████████████▊                | 2260/3375 [29:14<14:49,  1.25it/s]

  Batch 2260/3375 | Loss: 2.1091


Training:  68%|█████████████████████████████████                | 2280/3375 [29:30<13:51,  1.32it/s]

  Batch 2280/3375 | Loss: 2.2131


Training:  68%|█████████████████████████████████▍               | 2300/3375 [29:45<13:29,  1.33it/s]

  Batch 2300/3375 | Loss: 2.1124


Training:  69%|█████████████████████████████████▋               | 2320/3375 [30:01<15:03,  1.17it/s]

  Batch 2320/3375 | Loss: 2.1051


Training:  69%|█████████████████████████████████▉               | 2340/3375 [30:17<13:10,  1.31it/s]

  Batch 2340/3375 | Loss: 2.1635


Training:  70%|██████████████████████████████████▎              | 2360/3375 [30:32<12:49,  1.32it/s]

  Batch 2360/3375 | Loss: 2.1530


Training:  71%|██████████████████████████████████▌              | 2380/3375 [30:48<12:39,  1.31it/s]

  Batch 2380/3375 | Loss: 2.1893


Training:  71%|██████████████████████████████████▊              | 2400/3375 [31:03<12:56,  1.25it/s]

  Batch 2400/3375 | Loss: 2.1640


Training:  72%|███████████████████████████████████▏             | 2420/3375 [31:19<12:02,  1.32it/s]

  Batch 2420/3375 | Loss: 2.0511


Training:  72%|███████████████████████████████████▍             | 2440/3375 [31:34<11:42,  1.33it/s]

  Batch 2440/3375 | Loss: 2.1773


Training:  73%|███████████████████████████████████▋             | 2460/3375 [31:50<13:01,  1.17it/s]

  Batch 2460/3375 | Loss: 2.2301


Training:  73%|████████████████████████████████████             | 2480/3375 [32:06<11:33,  1.29it/s]

  Batch 2480/3375 | Loss: 2.1664


Training:  74%|████████████████████████████████████▎            | 2500/3375 [32:21<10:59,  1.33it/s]

  Batch 2500/3375 | Loss: 2.1420


Training:  75%|████████████████████████████████████▌            | 2520/3375 [32:37<10:51,  1.31it/s]

  Batch 2520/3375 | Loss: 2.2660


Training:  75%|████████████████████████████████████▉            | 2540/3375 [32:52<11:03,  1.26it/s]

  Batch 2540/3375 | Loss: 2.1909


Training:  76%|█████████████████████████████████████▏           | 2560/3375 [33:08<10:18,  1.32it/s]

  Batch 2560/3375 | Loss: 2.1480


Training:  76%|█████████████████████████████████████▍           | 2580/3375 [33:23<10:04,  1.31it/s]

  Batch 2580/3375 | Loss: 2.1593


Training:  77%|█████████████████████████████████████▋           | 2600/3375 [33:39<10:56,  1.18it/s]

  Batch 2600/3375 | Loss: 2.1086


Training:  78%|██████████████████████████████████████           | 2620/3375 [33:55<09:37,  1.31it/s]

  Batch 2620/3375 | Loss: 2.1900


Training:  78%|██████████████████████████████████████▎          | 2640/3375 [34:10<09:14,  1.33it/s]

  Batch 2640/3375 | Loss: 2.1442


Training:  79%|██████████████████████████████████████▌          | 2660/3375 [34:25<08:54,  1.34it/s]

  Batch 2660/3375 | Loss: 2.1157


Training:  79%|██████████████████████████████████████▉          | 2680/3375 [34:41<09:22,  1.23it/s]

  Batch 2680/3375 | Loss: 2.1213


Training:  80%|███████████████████████████████████████▏         | 2700/3375 [34:57<08:35,  1.31it/s]

  Batch 2700/3375 | Loss: 2.0661


Training:  81%|███████████████████████████████████████▍         | 2720/3375 [35:12<08:14,  1.32it/s]

  Batch 2720/3375 | Loss: 2.1345


Training:  81%|███████████████████████████████████████▊         | 2740/3375 [35:28<09:04,  1.17it/s]

  Batch 2740/3375 | Loss: 2.2093


Training:  82%|████████████████████████████████████████         | 2760/3375 [35:44<07:55,  1.29it/s]

  Batch 2760/3375 | Loss: 2.1199


Training:  82%|████████████████████████████████████████▎        | 2780/3375 [35:59<07:34,  1.31it/s]

  Batch 2780/3375 | Loss: 2.0852


Training:  83%|████████████████████████████████████████▋        | 2800/3375 [36:15<07:24,  1.29it/s]

  Batch 2800/3375 | Loss: 2.1922


Training:  84%|████████████████████████████████████████▉        | 2820/3375 [36:30<07:17,  1.27it/s]

  Batch 2820/3375 | Loss: 2.1713


Training:  84%|█████████████████████████████████████████▏       | 2840/3375 [36:46<06:48,  1.31it/s]

  Batch 2840/3375 | Loss: 2.1411


Training:  85%|█████████████████████████████████████████▌       | 2860/3375 [37:01<06:28,  1.33it/s]

  Batch 2860/3375 | Loss: 2.1556


Training:  85%|█████████████████████████████████████████▊       | 2880/3375 [37:17<06:55,  1.19it/s]

  Batch 2880/3375 | Loss: 2.1615


Training:  86%|██████████████████████████████████████████       | 2900/3375 [37:33<06:02,  1.31it/s]

  Batch 2900/3375 | Loss: 2.0772


Training:  87%|██████████████████████████████████████████▍      | 2920/3375 [37:48<05:43,  1.32it/s]

  Batch 2920/3375 | Loss: 2.2456


Training:  87%|██████████████████████████████████████████▋      | 2940/3375 [38:03<05:27,  1.33it/s]

  Batch 2940/3375 | Loss: 2.1035


Training:  88%|██████████████████████████████████████████▉      | 2960/3375 [38:19<05:31,  1.25it/s]

  Batch 2960/3375 | Loss: 2.2692


Training:  88%|███████████████████████████████████████████▎     | 2980/3375 [38:34<05:00,  1.32it/s]

  Batch 2980/3375 | Loss: 2.2163


Training:  89%|███████████████████████████████████████████▌     | 3000/3375 [38:50<04:43,  1.32it/s]

  Batch 3000/3375 | Loss: 2.0933


Training:  89%|███████████████████████████████████████████▊     | 3020/3375 [39:06<05:07,  1.16it/s]

  Batch 3020/3375 | Loss: 2.2303


Training:  90%|████████████████████████████████████████████▏    | 3040/3375 [39:21<04:18,  1.30it/s]

  Batch 3040/3375 | Loss: 2.1151


Training:  91%|████████████████████████████████████████████▍    | 3060/3375 [39:37<03:59,  1.32it/s]

  Batch 3060/3375 | Loss: 2.1331


Training:  91%|████████████████████████████████████████████▋    | 3080/3375 [39:52<03:40,  1.34it/s]

  Batch 3080/3375 | Loss: 2.2074


Training:  92%|█████████████████████████████████████████████    | 3100/3375 [40:08<03:41,  1.24it/s]

  Batch 3100/3375 | Loss: 2.1498


Training:  92%|█████████████████████████████████████████████▎   | 3120/3375 [40:23<03:13,  1.32it/s]

  Batch 3120/3375 | Loss: 2.1691


Training:  93%|█████████████████████████████████████████████▌   | 3140/3375 [40:39<02:57,  1.33it/s]

  Batch 3140/3375 | Loss: 2.1264


Training:  94%|█████████████████████████████████████████████▉   | 3160/3375 [40:55<03:02,  1.18it/s]

  Batch 3160/3375 | Loss: 2.1895


Training:  94%|██████████████████████████████████████████████▏  | 3180/3375 [41:10<02:31,  1.29it/s]

  Batch 3180/3375 | Loss: 2.2066


Training:  95%|██████████████████████████████████████████████▍  | 3200/3375 [41:25<02:12,  1.33it/s]

  Batch 3200/3375 | Loss: 2.1299


Training:  95%|██████████████████████████████████████████████▋  | 3220/3375 [41:41<01:57,  1.31it/s]

  Batch 3220/3375 | Loss: 2.1475


Training:  96%|███████████████████████████████████████████████  | 3240/3375 [41:57<01:48,  1.24it/s]

  Batch 3240/3375 | Loss: 2.2493


Training:  97%|███████████████████████████████████████████████▎ | 3260/3375 [42:12<01:28,  1.31it/s]

  Batch 3260/3375 | Loss: 2.1895


Training:  97%|███████████████████████████████████████████████▌ | 3280/3375 [42:28<01:11,  1.34it/s]

  Batch 3280/3375 | Loss: 2.1554


Training:  98%|███████████████████████████████████████████████▉ | 3300/3375 [42:43<01:00,  1.23it/s]

  Batch 3300/3375 | Loss: 2.1514


Training:  98%|████████████████████████████████████████████████▏| 3320/3375 [42:59<00:42,  1.30it/s]

  Batch 3320/3375 | Loss: 2.1932


Training:  99%|████████████████████████████████████████████████▍| 3340/3375 [43:14<00:26,  1.33it/s]

  Batch 3340/3375 | Loss: 2.1346


Training: 100%|████████████████████████████████████████████████▊| 3360/3375 [43:30<00:11,  1.33it/s]

  Batch 3360/3375 | Loss: 2.1350


Training: 100%|█████████████████████████████████████████████████| 3375/3375 [43:41<00:00,  1.29it/s]
Validation: 100%|█████████████████████████████████████████████████| 375/375 [01:49<00:00,  3.44it/s]

⚠️ No improvement. Patience: 3/5

--- Epoch 5 | Reveal Ratio: 0.15 ---



Training:   0%|                                                            | 0/3375 [00:00<?, ?it/s]

  Batch 0/3375 | Loss: 2.3772


Training:   1%|▎                                                  | 20/3375 [00:13<36:16,  1.54it/s]

  Batch 20/3375 | Loss: 2.2909


Training:   1%|▌                                                  | 40/3375 [00:27<40:25,  1.37it/s]

  Batch 40/3375 | Loss: 2.2629


Training:   2%|▉                                                  | 60/3375 [00:41<39:13,  1.41it/s]

  Batch 60/3375 | Loss: 2.3891


Training:   2%|█▏                                                 | 80/3375 [00:54<37:33,  1.46it/s]

  Batch 80/3375 | Loss: 2.2662


Training:   3%|█▍                                                | 100/3375 [01:08<36:24,  1.50it/s]

  Batch 100/3375 | Loss: 2.3735


Training:   4%|█▊                                                | 120/3375 [01:21<36:08,  1.50it/s]

  Batch 120/3375 | Loss: 2.3042


Training:   4%|██                                                | 140/3375 [01:35<35:40,  1.51it/s]

  Batch 140/3375 | Loss: 2.3116


Training:   5%|██▎                                               | 160/3375 [01:48<35:30,  1.51it/s]

  Batch 160/3375 | Loss: 2.3869


Training:   5%|██▋                                               | 180/3375 [02:02<35:14,  1.51it/s]

  Batch 180/3375 | Loss: 2.3018


Training:   6%|██▉                                               | 200/3375 [02:16<37:50,  1.40it/s]

  Batch 200/3375 | Loss: 2.3158


Training:   7%|███▎                                              | 220/3375 [02:30<38:16,  1.37it/s]

  Batch 220/3375 | Loss: 2.3682


Training:   7%|███▌                                              | 240/3375 [02:43<35:43,  1.46it/s]

  Batch 240/3375 | Loss: 2.2656


Training:   8%|███▊                                              | 260/3375 [02:57<34:48,  1.49it/s]

  Batch 260/3375 | Loss: 2.3304


Training:   8%|████▏                                             | 280/3375 [03:11<34:26,  1.50it/s]

  Batch 280/3375 | Loss: 2.3113


Training:   9%|████▍                                             | 300/3375 [03:24<34:16,  1.49it/s]

  Batch 300/3375 | Loss: 2.2343


Training:   9%|████▋                                             | 320/3375 [03:38<33:37,  1.51it/s]

  Batch 320/3375 | Loss: 2.1779


Training:  10%|█████                                             | 340/3375 [03:52<33:30,  1.51it/s]

  Batch 340/3375 | Loss: 2.3554


Training:  11%|█████▎                                            | 360/3375 [04:06<37:07,  1.35it/s]

  Batch 360/3375 | Loss: 2.2874


Training:  11%|█████▋                                            | 380/3375 [04:19<34:58,  1.43it/s]

  Batch 380/3375 | Loss: 2.3141


Training:  12%|█████▉                                            | 400/3375 [04:33<33:44,  1.47it/s]

  Batch 400/3375 | Loss: 2.2478


Training:  12%|██████▏                                           | 420/3375 [04:47<33:00,  1.49it/s]

  Batch 420/3375 | Loss: 2.3314


Training:  13%|██████▌                                           | 440/3375 [05:00<32:25,  1.51it/s]

  Batch 440/3375 | Loss: 2.3403


Training:  14%|██████▊                                           | 460/3375 [05:14<32:21,  1.50it/s]

  Batch 460/3375 | Loss: 2.2900


Training:  14%|███████                                           | 480/3375 [05:27<32:09,  1.50it/s]

  Batch 480/3375 | Loss: 2.3084


Training:  15%|███████▍                                          | 500/3375 [05:41<32:07,  1.49it/s]

  Batch 500/3375 | Loss: 2.3894


Training:  15%|███████▋                                          | 520/3375 [05:55<35:39,  1.33it/s]

  Batch 520/3375 | Loss: 2.1577


Training:  16%|████████                                          | 540/3375 [06:09<35:45,  1.32it/s]

  Batch 540/3375 | Loss: 2.3491


Training:  17%|████████▎                                         | 560/3375 [06:23<31:47,  1.48it/s]

  Batch 560/3375 | Loss: 2.2721


Training:  17%|████████▌                                         | 580/3375 [06:36<30:55,  1.51it/s]

  Batch 580/3375 | Loss: 2.3007


Training:  18%|████████▉                                         | 600/3375 [06:50<30:42,  1.51it/s]

  Batch 600/3375 | Loss: 2.2251


Training:  18%|█████████▏                                        | 620/3375 [07:03<30:23,  1.51it/s]

  Batch 620/3375 | Loss: 2.3290


Training:  19%|█████████▍                                        | 640/3375 [07:17<30:10,  1.51it/s]

  Batch 640/3375 | Loss: 2.1675


Training:  20%|█████████▊                                        | 660/3375 [07:31<32:35,  1.39it/s]

  Batch 660/3375 | Loss: 2.3182


Training:  20%|██████████                                        | 680/3375 [07:45<32:26,  1.38it/s]

  Batch 680/3375 | Loss: 2.3181


Training:  21%|██████████▎                                       | 700/3375 [07:58<30:57,  1.44it/s]

  Batch 700/3375 | Loss: 2.1718


Training:  21%|██████████▋                                       | 720/3375 [08:12<29:45,  1.49it/s]

  Batch 720/3375 | Loss: 2.2885


Training:  22%|██████████▉                                       | 740/3375 [08:26<29:23,  1.49it/s]

  Batch 740/3375 | Loss: 2.2282


Training:  23%|███████████▎                                      | 760/3375 [08:40<29:21,  1.48it/s]

  Batch 760/3375 | Loss: 2.2751


Training:  23%|███████████▌                                      | 780/3375 [08:53<28:46,  1.50it/s]

  Batch 780/3375 | Loss: 2.1344


Training:  24%|███████████▊                                      | 800/3375 [09:07<28:23,  1.51it/s]

  Batch 800/3375 | Loss: 2.2514


Training:  24%|████████████▏                                     | 820/3375 [09:21<30:40,  1.39it/s]

  Batch 820/3375 | Loss: 2.2789


Training:  25%|████████████▍                                     | 840/3375 [09:34<30:10,  1.40it/s]

  Batch 840/3375 | Loss: 2.2758


Training:  25%|████████████▋                                     | 860/3375 [09:48<28:46,  1.46it/s]

  Batch 860/3375 | Loss: 2.2958


Training:  26%|█████████████                                     | 880/3375 [10:02<28:07,  1.48it/s]

  Batch 880/3375 | Loss: 2.3540


Training:  27%|█████████████▎                                    | 900/3375 [10:15<27:32,  1.50it/s]

  Batch 900/3375 | Loss: 2.2587


Training:  27%|█████████████▋                                    | 920/3375 [10:29<27:06,  1.51it/s]

  Batch 920/3375 | Loss: 2.2051


Training:  28%|█████████████▉                                    | 940/3375 [10:43<26:40,  1.52it/s]

  Batch 940/3375 | Loss: 2.2785


Training:  28%|██████████████▏                                   | 960/3375 [10:56<26:58,  1.49it/s]

  Batch 960/3375 | Loss: 2.2405


Training:  29%|██████████████▌                                   | 980/3375 [11:10<29:50,  1.34it/s]

  Batch 980/3375 | Loss: 2.1573


Training:  30%|██████████████▌                                  | 1000/3375 [11:24<27:58,  1.41it/s]

  Batch 1000/3375 | Loss: 2.2771


Training:  30%|██████████████▊                                  | 1020/3375 [11:37<26:40,  1.47it/s]

  Batch 1020/3375 | Loss: 2.2210


Training:  31%|███████████████                                  | 1040/3375 [11:51<26:10,  1.49it/s]

  Batch 1040/3375 | Loss: 2.2732


Training:  31%|███████████████▍                                 | 1060/3375 [12:05<26:02,  1.48it/s]

  Batch 1060/3375 | Loss: 2.1999


Training:  32%|███████████████▋                                 | 1080/3375 [12:18<25:27,  1.50it/s]

  Batch 1080/3375 | Loss: 2.1829


Training:  33%|███████████████▉                                 | 1100/3375 [12:32<25:20,  1.50it/s]

  Batch 1100/3375 | Loss: 2.2409


Training:  33%|████████████████▎                                | 1120/3375 [12:46<26:10,  1.44it/s]

  Batch 1120/3375 | Loss: 2.2481


Training:  34%|████████████████▌                                | 1140/3375 [13:00<27:38,  1.35it/s]

  Batch 1140/3375 | Loss: 2.1460


Training:  34%|████████████████▊                                | 1160/3375 [13:13<25:30,  1.45it/s]

  Batch 1160/3375 | Loss: 2.3152


Training:  35%|█████████████████▏                               | 1180/3375 [13:27<24:58,  1.46it/s]

  Batch 1180/3375 | Loss: 2.2306


Training:  36%|█████████████████▍                               | 1200/3375 [13:41<24:16,  1.49it/s]

  Batch 1200/3375 | Loss: 2.2038


Training:  36%|█████████████████▋                               | 1220/3375 [13:54<23:53,  1.50it/s]

  Batch 1220/3375 | Loss: 2.2169


Training:  37%|██████████████████                               | 1240/3375 [14:08<23:42,  1.50it/s]

  Batch 1240/3375 | Loss: 2.2199


Training:  37%|██████████████████▎                              | 1260/3375 [14:22<23:09,  1.52it/s]

  Batch 1260/3375 | Loss: 2.2607


Training:  38%|██████████████████▌                              | 1280/3375 [14:35<25:10,  1.39it/s]

  Batch 1280/3375 | Loss: 2.2075


Training:  39%|██████████████████▊                              | 1300/3375 [14:49<25:02,  1.38it/s]

  Batch 1300/3375 | Loss: 2.2299


Training:  39%|███████████████████▏                             | 1320/3375 [15:03<23:47,  1.44it/s]

  Batch 1320/3375 | Loss: 2.3109


Training:  40%|███████████████████▍                             | 1340/3375 [15:17<22:56,  1.48it/s]

  Batch 1340/3375 | Loss: 2.3879


Training:  40%|███████████████████▋                             | 1360/3375 [15:30<22:25,  1.50it/s]

  Batch 1360/3375 | Loss: 2.3123


Training:  41%|████████████████████                             | 1380/3375 [15:44<22:27,  1.48it/s]

  Batch 1380/3375 | Loss: 2.2328


Training:  41%|████████████████████▎                            | 1400/3375 [15:58<21:52,  1.50it/s]

  Batch 1400/3375 | Loss: 2.2861


Training:  42%|████████████████████▌                            | 1420/3375 [16:12<22:10,  1.47it/s]

  Batch 1420/3375 | Loss: 2.2964


Training:  43%|████████████████████▉                            | 1440/3375 [16:26<24:22,  1.32it/s]

  Batch 1440/3375 | Loss: 2.2830


Training:  43%|█████████████████████▏                           | 1460/3375 [16:39<22:32,  1.42it/s]

  Batch 1460/3375 | Loss: 2.1533


Training:  44%|█████████████████████▍                           | 1480/3375 [16:53<21:14,  1.49it/s]

  Batch 1480/3375 | Loss: 2.2549


Training:  44%|█████████████████████▊                           | 1500/3375 [17:07<21:16,  1.47it/s]

  Batch 1500/3375 | Loss: 2.2657


Training:  45%|██████████████████████                           | 1520/3375 [17:20<20:21,  1.52it/s]

  Batch 1520/3375 | Loss: 2.1887


Training:  46%|██████████████████████▎                          | 1540/3375 [17:34<20:22,  1.50it/s]

  Batch 1540/3375 | Loss: 2.2603


Training:  46%|██████████████████████▋                          | 1560/3375 [17:48<20:01,  1.51it/s]

  Batch 1560/3375 | Loss: 2.2427


Training:  47%|██████████████████████▉                          | 1580/3375 [18:01<21:03,  1.42it/s]

  Batch 1580/3375 | Loss: 2.2947


Training:  47%|███████████████████████▏                         | 1600/3375 [18:15<21:42,  1.36it/s]

  Batch 1600/3375 | Loss: 2.1713


Training:  48%|███████████████████████▌                         | 1620/3375 [18:29<20:04,  1.46it/s]

  Batch 1620/3375 | Loss: 2.2923


Training:  49%|███████████████████████▊                         | 1640/3375 [18:43<19:26,  1.49it/s]

  Batch 1640/3375 | Loss: 2.3479


Training:  49%|████████████████████████                         | 1660/3375 [18:56<19:07,  1.49it/s]

  Batch 1660/3375 | Loss: 2.4287


Training:  50%|████████████████████████▍                        | 1680/3375 [19:10<19:04,  1.48it/s]

  Batch 1680/3375 | Loss: 2.2783


Training:  50%|████████████████████████▋                        | 1700/3375 [19:24<18:53,  1.48it/s]

  Batch 1700/3375 | Loss: 2.2646


Training:  51%|████████████████████████▉                        | 1720/3375 [19:38<19:21,  1.42it/s]

  Batch 1720/3375 | Loss: 2.2886


Training:  52%|█████████████████████████▎                       | 1740/3375 [19:52<20:12,  1.35it/s]

  Batch 1740/3375 | Loss: 2.2830


Training:  52%|█████████████████████████▌                       | 1760/3375 [20:06<18:57,  1.42it/s]

  Batch 1760/3375 | Loss: 2.2477


Training:  53%|█████████████████████████▊                       | 1780/3375 [20:20<18:23,  1.45it/s]

  Batch 1780/3375 | Loss: 2.3193


Training:  53%|██████████████████████████▏                      | 1800/3375 [20:34<17:43,  1.48it/s]

  Batch 1800/3375 | Loss: 2.2452


Training:  54%|██████████████████████████▍                      | 1820/3375 [20:47<17:13,  1.50it/s]

  Batch 1820/3375 | Loss: 2.2773


Training:  55%|██████████████████████████▋                      | 1840/3375 [21:01<16:52,  1.52it/s]

  Batch 1840/3375 | Loss: 2.3431


Training:  55%|███████████████████████████                      | 1860/3375 [21:15<16:36,  1.52it/s]

  Batch 1860/3375 | Loss: 2.1955


Training:  56%|███████████████████████████▎                     | 1880/3375 [21:29<18:07,  1.37it/s]

  Batch 1880/3375 | Loss: 2.3102


Training:  56%|███████████████████████████▌                     | 1900/3375 [21:42<17:38,  1.39it/s]

  Batch 1900/3375 | Loss: 2.1189


Training:  57%|███████████████████████████▉                     | 1920/3375 [21:56<16:33,  1.47it/s]

  Batch 1920/3375 | Loss: 2.1998


Training:  57%|████████████████████████████▏                    | 1940/3375 [22:10<16:16,  1.47it/s]

  Batch 1940/3375 | Loss: 2.2589


Training:  58%|████████████████████████████▍                    | 1960/3375 [22:24<15:41,  1.50it/s]

  Batch 1960/3375 | Loss: 2.2614


Training:  59%|████████████████████████████▋                    | 1980/3375 [22:37<15:30,  1.50it/s]

  Batch 1980/3375 | Loss: 2.2956


Training:  59%|█████████████████████████████                    | 2000/3375 [22:51<15:31,  1.48it/s]

  Batch 2000/3375 | Loss: 2.1835


Training:  60%|█████████████████████████████▎                   | 2020/3375 [23:05<15:45,  1.43it/s]

  Batch 2020/3375 | Loss: 2.3052


Training:  60%|█████████████████████████████▌                   | 2040/3375 [23:19<16:47,  1.33it/s]

  Batch 2040/3375 | Loss: 2.2654


Training:  61%|█████████████████████████████▉                   | 2060/3375 [23:32<15:05,  1.45it/s]

  Batch 2060/3375 | Loss: 2.2120


Training:  62%|██████████████████████████████▏                  | 2080/3375 [23:46<14:40,  1.47it/s]

  Batch 2080/3375 | Loss: 2.2885


Training:  62%|██████████████████████████████▍                  | 2100/3375 [24:00<14:12,  1.50it/s]

  Batch 2100/3375 | Loss: 2.2776


Training:  63%|██████████████████████████████▊                  | 2120/3375 [24:13<14:01,  1.49it/s]

  Batch 2120/3375 | Loss: 2.3023


Training:  63%|███████████████████████████████                  | 2140/3375 [24:27<13:46,  1.49it/s]

  Batch 2140/3375 | Loss: 2.1925


Training:  64%|███████████████████████████████▎                 | 2160/3375 [24:41<13:26,  1.51it/s]

  Batch 2160/3375 | Loss: 2.3024


Training:  65%|███████████████████████████████▋                 | 2180/3375 [24:55<14:28,  1.38it/s]

  Batch 2180/3375 | Loss: 2.3026


Training:  65%|███████████████████████████████▉                 | 2200/3375 [25:09<14:12,  1.38it/s]

  Batch 2200/3375 | Loss: 2.2498


Training:  66%|████████████████████████████████▏                | 2220/3375 [25:22<13:17,  1.45it/s]

  Batch 2220/3375 | Loss: 2.2502


Training:  66%|████████████████████████████████▌                | 2240/3375 [25:36<12:51,  1.47it/s]

  Batch 2240/3375 | Loss: 2.3148


Training:  67%|████████████████████████████████▊                | 2260/3375 [25:50<12:29,  1.49it/s]

  Batch 2260/3375 | Loss: 2.2368


Training:  68%|█████████████████████████████████                | 2280/3375 [26:03<12:09,  1.50it/s]

  Batch 2280/3375 | Loss: 2.2953


Training:  68%|█████████████████████████████████▍               | 2300/3375 [26:17<12:00,  1.49it/s]

  Batch 2300/3375 | Loss: 2.2775


Training:  69%|█████████████████████████████████▋               | 2320/3375 [26:31<11:47,  1.49it/s]

  Batch 2320/3375 | Loss: 2.1853


Training:  69%|█████████████████████████████████▉               | 2340/3375 [26:45<13:01,  1.32it/s]

  Batch 2340/3375 | Loss: 2.3362


Training:  70%|██████████████████████████████████▎              | 2360/3375 [26:59<11:55,  1.42it/s]

  Batch 2360/3375 | Loss: 2.2251


Training:  71%|██████████████████████████████████▌              | 2380/3375 [27:12<11:12,  1.48it/s]

  Batch 2380/3375 | Loss: 2.2487


Training:  71%|██████████████████████████████████▊              | 2400/3375 [27:26<10:53,  1.49it/s]

  Batch 2400/3375 | Loss: 2.1749


Training:  72%|███████████████████████████████████▏             | 2420/3375 [27:40<10:41,  1.49it/s]

  Batch 2420/3375 | Loss: 2.3290


Training:  72%|███████████████████████████████████▍             | 2440/3375 [27:54<10:24,  1.50it/s]

  Batch 2440/3375 | Loss: 2.1947


Training:  73%|███████████████████████████████████▋             | 2460/3375 [28:07<10:04,  1.51it/s]

  Batch 2460/3375 | Loss: 2.2661


Training:  73%|████████████████████████████████████             | 2480/3375 [28:21<10:30,  1.42it/s]

  Batch 2480/3375 | Loss: 2.3009


Training:  74%|████████████████████████████████████▎            | 2500/3375 [28:35<10:43,  1.36it/s]

  Batch 2500/3375 | Loss: 2.2880


Training:  75%|████████████████████████████████████▌            | 2520/3375 [28:49<09:50,  1.45it/s]

  Batch 2520/3375 | Loss: 2.3204


Training:  75%|████████████████████████████████████▉            | 2540/3375 [29:02<09:19,  1.49it/s]

  Batch 2540/3375 | Loss: 2.3100


Training:  76%|█████████████████████████████████████▏           | 2560/3375 [29:16<09:04,  1.50it/s]

  Batch 2560/3375 | Loss: 2.3057


Training:  76%|█████████████████████████████████████▍           | 2580/3375 [29:30<08:53,  1.49it/s]

  Batch 2580/3375 | Loss: 2.1736


Training:  77%|█████████████████████████████████████▋           | 2600/3375 [29:43<08:35,  1.50it/s]

  Batch 2600/3375 | Loss: 2.2589


Training:  78%|██████████████████████████████████████           | 2620/3375 [29:57<08:20,  1.51it/s]

  Batch 2620/3375 | Loss: 2.2152


Training:  78%|██████████████████████████████████████▎          | 2640/3375 [30:11<08:47,  1.39it/s]

  Batch 2640/3375 | Loss: 2.2293


Training:  79%|██████████████████████████████████████▌          | 2660/3375 [30:25<08:32,  1.39it/s]

  Batch 2660/3375 | Loss: 2.2730


Training:  79%|██████████████████████████████████████▉          | 2680/3375 [30:39<08:04,  1.43it/s]

  Batch 2680/3375 | Loss: 2.2992


Training:  80%|███████████████████████████████████████▏         | 2700/3375 [30:52<07:34,  1.48it/s]

  Batch 2700/3375 | Loss: 2.1914


Training:  81%|███████████████████████████████████████▍         | 2720/3375 [31:06<07:16,  1.50it/s]

  Batch 2720/3375 | Loss: 2.2731


Training:  81%|███████████████████████████████████████▊         | 2740/3375 [31:20<06:58,  1.52it/s]

  Batch 2740/3375 | Loss: 2.3295


Training:  82%|████████████████████████████████████████         | 2760/3375 [31:33<06:45,  1.52it/s]

  Batch 2760/3375 | Loss: 2.2708


Training:  82%|████████████████████████████████████████▎        | 2780/3375 [31:47<06:40,  1.49it/s]

  Batch 2780/3375 | Loss: 2.2778


Training:  83%|████████████████████████████████████████▋        | 2800/3375 [32:01<07:13,  1.33it/s]

  Batch 2800/3375 | Loss: 2.2642


Training:  84%|████████████████████████████████████████▉        | 2820/3375 [32:15<06:30,  1.42it/s]

  Batch 2820/3375 | Loss: 2.3224


Training:  84%|█████████████████████████████████████████▏       | 2840/3375 [32:28<06:02,  1.48it/s]

  Batch 2840/3375 | Loss: 2.1903


Training:  85%|█████████████████████████████████████████▌       | 2860/3375 [32:42<05:46,  1.49it/s]

  Batch 2860/3375 | Loss: 2.2920


Training:  85%|█████████████████████████████████████████▊       | 2880/3375 [32:55<05:30,  1.50it/s]

  Batch 2880/3375 | Loss: 2.2259


Training:  86%|██████████████████████████████████████████       | 2900/3375 [33:09<05:13,  1.52it/s]

  Batch 2900/3375 | Loss: 2.2633


Training:  87%|██████████████████████████████████████████▍      | 2920/3375 [33:23<05:02,  1.50it/s]

  Batch 2920/3375 | Loss: 2.2469


Training:  87%|██████████████████████████████████████████▋      | 2940/3375 [33:37<05:04,  1.43it/s]

  Batch 2940/3375 | Loss: 2.2602


Training:  88%|██████████████████████████████████████████▉      | 2960/3375 [33:51<05:13,  1.32it/s]

  Batch 2960/3375 | Loss: 2.2132


Training:  88%|███████████████████████████████████████████▎     | 2980/3375 [34:04<04:36,  1.43it/s]

  Batch 2980/3375 | Loss: 2.2614


Training:  89%|███████████████████████████████████████████▌     | 3000/3375 [34:18<04:12,  1.48it/s]

  Batch 3000/3375 | Loss: 2.1696


Training:  89%|███████████████████████████████████████████▊     | 3020/3375 [34:31<03:55,  1.51it/s]

  Batch 3020/3375 | Loss: 2.2629


Training:  90%|████████████████████████████████████████████▏    | 3040/3375 [34:45<03:43,  1.50it/s]

  Batch 3040/3375 | Loss: 2.1932


Training:  91%|████████████████████████████████████████████▍    | 3060/3375 [34:59<03:32,  1.48it/s]

  Batch 3060/3375 | Loss: 2.3201


Training:  91%|████████████████████████████████████████████▋    | 3080/3375 [35:13<03:18,  1.48it/s]

  Batch 3080/3375 | Loss: 2.2722


Training:  92%|█████████████████████████████████████████████    | 3100/3375 [35:27<03:25,  1.34it/s]

  Batch 3100/3375 | Loss: 2.2698


Training:  92%|█████████████████████████████████████████████▎   | 3120/3375 [35:41<03:04,  1.38it/s]

  Batch 3120/3375 | Loss: 2.2972


Training:  93%|█████████████████████████████████████████████▌   | 3140/3375 [35:55<02:46,  1.42it/s]

  Batch 3140/3375 | Loss: 2.2733


Training:  94%|█████████████████████████████████████████████▉   | 3160/3375 [36:09<02:28,  1.45it/s]

  Batch 3160/3375 | Loss: 2.3228


Training:  94%|██████████████████████████████████████████████▏  | 3180/3375 [36:23<02:12,  1.48it/s]

  Batch 3180/3375 | Loss: 2.1243


Training:  95%|██████████████████████████████████████████████▍  | 3200/3375 [36:37<01:57,  1.49it/s]

  Batch 3200/3375 | Loss: 2.2026


Training:  95%|██████████████████████████████████████████████▋  | 3220/3375 [36:51<01:43,  1.50it/s]

  Batch 3220/3375 | Loss: 2.2657


Training:  96%|███████████████████████████████████████████████  | 3240/3375 [37:05<01:39,  1.35it/s]

  Batch 3240/3375 | Loss: 2.2819


Training:  97%|███████████████████████████████████████████████▎ | 3260/3375 [37:19<01:22,  1.39it/s]

  Batch 3260/3375 | Loss: 2.2881


Training:  97%|███████████████████████████████████████████████▌ | 3280/3375 [37:33<01:05,  1.45it/s]

  Batch 3280/3375 | Loss: 2.1938


Training:  98%|███████████████████████████████████████████████▉ | 3300/3375 [37:46<00:50,  1.48it/s]

  Batch 3300/3375 | Loss: 2.2487


Training:  98%|████████████████████████████████████████████████▏| 3320/3375 [38:00<00:36,  1.49it/s]

  Batch 3320/3375 | Loss: 2.2500


Training:  99%|████████████████████████████████████████████████▍| 3340/3375 [38:14<00:23,  1.49it/s]

  Batch 3340/3375 | Loss: 2.2432


Training: 100%|████████████████████████████████████████████████▊| 3360/3375 [38:28<00:10,  1.48it/s]

  Batch 3360/3375 | Loss: 2.3674


Training: 100%|█████████████████████████████████████████████████| 3375/3375 [38:38<00:00,  1.46it/s]
Validation: 100%|█████████████████████████████████████████████████| 375/375 [01:36<00:00,  3.87it/s]

⚠️ No improvement. Patience: 4/5

--- Epoch 6 | Reveal Ratio: 0.10 ---



Training:   0%|                                                            | 0/3375 [00:00<?, ?it/s]

  Batch 0/3375 | Loss: 2.3237


Training:   1%|▎                                                  | 20/3375 [00:12<34:08,  1.64it/s]

  Batch 20/3375 | Loss: 2.2990


Training:   1%|▌                                                  | 40/3375 [00:25<33:18,  1.67it/s]

  Batch 40/3375 | Loss: 2.2520


Training:   2%|▉                                                  | 60/3375 [00:37<33:08,  1.67it/s]

  Batch 60/3375 | Loss: 2.1448


Training:   2%|█▏                                                 | 80/3375 [00:49<32:51,  1.67it/s]

  Batch 80/3375 | Loss: 2.2563


Training:   3%|█▍                                                | 100/3375 [01:02<33:08,  1.65it/s]

  Batch 100/3375 | Loss: 2.2508


Training:   4%|█▊                                                | 120/3375 [01:14<32:44,  1.66it/s]

  Batch 120/3375 | Loss: 2.1705


Training:   4%|██                                                | 140/3375 [01:26<32:47,  1.64it/s]

  Batch 140/3375 | Loss: 2.2873


Training:   5%|██▎                                               | 160/3375 [01:39<32:20,  1.66it/s]

  Batch 160/3375 | Loss: 2.2641


Training:   5%|██▋                                               | 180/3375 [01:51<31:55,  1.67it/s]

  Batch 180/3375 | Loss: 2.3250


Training:   6%|██▉                                               | 200/3375 [02:04<31:43,  1.67it/s]

  Batch 200/3375 | Loss: 2.3051


Training:   7%|███▎                                              | 220/3375 [02:16<31:43,  1.66it/s]

  Batch 220/3375 | Loss: 2.1474


Training:   7%|███▌                                              | 240/3375 [02:29<31:26,  1.66it/s]

  Batch 240/3375 | Loss: 2.2344


Training:   8%|███▊                                              | 260/3375 [02:41<31:06,  1.67it/s]

  Batch 260/3375 | Loss: 2.0800


Training:   8%|████▏                                             | 280/3375 [02:54<30:58,  1.67it/s]

  Batch 280/3375 | Loss: 2.1054


Training:   9%|████▍                                             | 300/3375 [03:06<31:15,  1.64it/s]

  Batch 300/3375 | Loss: 2.2534


Training:   9%|████▋                                             | 320/3375 [03:19<30:36,  1.66it/s]

  Batch 320/3375 | Loss: 2.2036


Training:  10%|█████                                             | 340/3375 [03:31<31:31,  1.60it/s]

  Batch 340/3375 | Loss: 2.2326


Training:  11%|█████▎                                            | 360/3375 [03:44<32:19,  1.55it/s]

  Batch 360/3375 | Loss: 2.2543


Training:  11%|█████▋                                            | 380/3375 [03:57<35:07,  1.42it/s]

  Batch 380/3375 | Loss: 2.3452


Training:  12%|█████▉                                            | 400/3375 [04:10<34:40,  1.43it/s]

  Batch 400/3375 | Loss: 2.3606


Training:  12%|██████▏                                           | 420/3375 [04:22<33:51,  1.45it/s]

  Batch 420/3375 | Loss: 2.2802


Training:  13%|██████▌                                           | 440/3375 [04:35<32:26,  1.51it/s]

  Batch 440/3375 | Loss: 2.3029


Training:  14%|██████▊                                           | 460/3375 [04:47<31:46,  1.53it/s]

  Batch 460/3375 | Loss: 2.2872


Training:  14%|███████                                           | 480/3375 [04:59<30:42,  1.57it/s]

  Batch 480/3375 | Loss: 2.0664


Training:  15%|███████▍                                          | 500/3375 [05:12<30:36,  1.57it/s]

  Batch 500/3375 | Loss: 2.2433


Training:  15%|███████▋                                          | 520/3375 [05:24<29:56,  1.59it/s]

  Batch 520/3375 | Loss: 2.1553


Training:  16%|████████                                          | 540/3375 [05:37<29:39,  1.59it/s]

  Batch 540/3375 | Loss: 2.1690


Training:  17%|████████▎                                         | 560/3375 [05:49<29:43,  1.58it/s]

  Batch 560/3375 | Loss: 2.1963


Training:  17%|████████▌                                         | 580/3375 [06:02<29:02,  1.60it/s]

  Batch 580/3375 | Loss: 2.1515


Training:  18%|████████▉                                         | 600/3375 [06:15<28:22,  1.63it/s]

  Batch 600/3375 | Loss: 2.3051


Training:  18%|█████████▏                                        | 620/3375 [06:27<28:00,  1.64it/s]

  Batch 620/3375 | Loss: 2.2420


Training:  19%|█████████▍                                        | 640/3375 [06:39<28:07,  1.62it/s]

  Batch 640/3375 | Loss: 2.3358


Training:  20%|█████████▊                                        | 660/3375 [06:52<28:08,  1.61it/s]

  Batch 660/3375 | Loss: 2.1533


Training:  20%|██████████                                        | 680/3375 [07:05<27:48,  1.62it/s]

  Batch 680/3375 | Loss: 2.2452


Training:  21%|██████████▎                                       | 700/3375 [07:18<27:46,  1.61it/s]

  Batch 700/3375 | Loss: 2.3566


Training:  21%|██████████▋                                       | 720/3375 [07:30<27:12,  1.63it/s]

  Batch 720/3375 | Loss: 2.1371


Training:  22%|██████████▉                                       | 740/3375 [07:43<26:57,  1.63it/s]

  Batch 740/3375 | Loss: 2.2110


Training:  23%|███████████▎                                      | 760/3375 [07:55<26:59,  1.61it/s]

  Batch 760/3375 | Loss: 2.2262


Training:  23%|███████████▌                                      | 780/3375 [08:08<26:39,  1.62it/s]

  Batch 780/3375 | Loss: 2.2503


Training:  24%|███████████▊                                      | 800/3375 [08:21<26:41,  1.61it/s]

  Batch 800/3375 | Loss: 2.1098


Training:  24%|████████████▏                                     | 820/3375 [08:33<25:43,  1.66it/s]

  Batch 820/3375 | Loss: 2.2876


Training:  25%|████████████▍                                     | 840/3375 [08:46<25:34,  1.65it/s]

  Batch 840/3375 | Loss: 2.2198


Training:  25%|████████████▋                                     | 860/3375 [08:58<25:10,  1.66it/s]

  Batch 860/3375 | Loss: 2.1888


Training:  26%|█████████████                                     | 880/3375 [09:11<24:59,  1.66it/s]

  Batch 880/3375 | Loss: 2.1832


Training:  27%|█████████████▎                                    | 900/3375 [09:23<25:00,  1.65it/s]

  Batch 900/3375 | Loss: 2.2477


Training:  27%|█████████████▋                                    | 920/3375 [09:36<24:33,  1.67it/s]

  Batch 920/3375 | Loss: 2.2740


Training:  28%|█████████████▉                                    | 940/3375 [09:48<24:35,  1.65it/s]

  Batch 940/3375 | Loss: 2.1718


Training:  28%|██████████████▏                                   | 960/3375 [10:01<24:23,  1.65it/s]

  Batch 960/3375 | Loss: 2.3152


Training:  29%|██████████████▌                                   | 980/3375 [10:13<25:12,  1.58it/s]

  Batch 980/3375 | Loss: 2.0840


Training:  30%|██████████████▌                                  | 1000/3375 [10:26<25:49,  1.53it/s]

  Batch 1000/3375 | Loss: 2.0880


Training:  30%|██████████████▊                                  | 1020/3375 [10:39<26:09,  1.50it/s]

  Batch 1020/3375 | Loss: 2.1456


Training:  31%|███████████████                                  | 1040/3375 [10:51<26:08,  1.49it/s]

  Batch 1040/3375 | Loss: 2.2355


Training:  31%|███████████████▍                                 | 1060/3375 [11:04<26:35,  1.45it/s]

  Batch 1060/3375 | Loss: 2.2438


Training:  32%|███████████████▋                                 | 1080/3375 [11:17<25:53,  1.48it/s]

  Batch 1080/3375 | Loss: 2.2257


Training:  33%|███████████████▉                                 | 1100/3375 [11:29<25:33,  1.48it/s]

  Batch 1100/3375 | Loss: 2.1982


Training:  33%|████████████████▎                                | 1120/3375 [11:42<24:28,  1.54it/s]

  Batch 1120/3375 | Loss: 2.0845


Training:  34%|████████████████▌                                | 1140/3375 [11:54<24:29,  1.52it/s]

  Batch 1140/3375 | Loss: 2.3060


Training:  34%|████████████████▊                                | 1160/3375 [12:07<24:02,  1.54it/s]

  Batch 1160/3375 | Loss: 2.1191


Training:  35%|█████████████████▏                               | 1180/3375 [12:20<23:58,  1.53it/s]

  Batch 1180/3375 | Loss: 2.2715


Training:  36%|█████████████████▍                               | 1200/3375 [12:33<23:24,  1.55it/s]

  Batch 1200/3375 | Loss: 2.1867


Training:  36%|█████████████████▋                               | 1220/3375 [12:46<23:03,  1.56it/s]

  Batch 1220/3375 | Loss: 2.0860


Training:  37%|██████████████████                               | 1240/3375 [13:00<22:57,  1.55it/s]

  Batch 1240/3375 | Loss: 2.2820


Training:  37%|██████████████████▎                              | 1260/3375 [13:13<22:25,  1.57it/s]

  Batch 1260/3375 | Loss: 2.2310


Training:  38%|██████████████████▌                              | 1280/3375 [13:26<22:28,  1.55it/s]

  Batch 1280/3375 | Loss: 2.1767


Training:  39%|██████████████████▊                              | 1300/3375 [13:39<21:58,  1.57it/s]

  Batch 1300/3375 | Loss: 2.1314


Training:  39%|███████████████████▏                             | 1320/3375 [13:52<21:31,  1.59it/s]

  Batch 1320/3375 | Loss: 2.2486


Training:  40%|███████████████████▍                             | 1340/3375 [14:05<21:31,  1.58it/s]

  Batch 1340/3375 | Loss: 2.2504


Training:  40%|███████████████████▋                             | 1360/3375 [14:18<20:51,  1.61it/s]

  Batch 1360/3375 | Loss: 2.3811


Training:  41%|████████████████████                             | 1380/3375 [14:32<21:04,  1.58it/s]

  Batch 1380/3375 | Loss: 2.2814


Training:  41%|████████████████████▎                            | 1400/3375 [14:45<20:36,  1.60it/s]

  Batch 1400/3375 | Loss: 2.2784


Training:  42%|████████████████████▌                            | 1420/3375 [14:58<21:40,  1.50it/s]

  Batch 1420/3375 | Loss: 2.0820


Training:  43%|████████████████████▉                            | 1440/3375 [15:11<21:50,  1.48it/s]

  Batch 1440/3375 | Loss: 2.1232


Training:  43%|█████████████████████▏                           | 1460/3375 [15:25<23:11,  1.38it/s]

  Batch 1460/3375 | Loss: 2.2138


Training:  44%|█████████████████████▍                           | 1480/3375 [15:38<21:59,  1.44it/s]

  Batch 1480/3375 | Loss: 2.3305


Training:  44%|█████████████████████▊                           | 1500/3375 [15:51<21:33,  1.45it/s]

  Batch 1500/3375 | Loss: 2.2552


Training:  45%|██████████████████████                           | 1520/3375 [16:04<20:46,  1.49it/s]

  Batch 1520/3375 | Loss: 2.3427


Training:  46%|██████████████████████▎                          | 1540/3375 [16:16<20:03,  1.52it/s]

  Batch 1540/3375 | Loss: 2.1753


Training:  46%|██████████████████████▋                          | 1560/3375 [16:29<19:45,  1.53it/s]

  Batch 1560/3375 | Loss: 2.2498


Training:  47%|██████████████████████▉                          | 1580/3375 [16:42<19:01,  1.57it/s]

  Batch 1580/3375 | Loss: 2.1592


Training:  47%|███████████████████████▏                         | 1600/3375 [16:55<18:52,  1.57it/s]

  Batch 1600/3375 | Loss: 2.0472


Training:  48%|███████████████████████▌                         | 1620/3375 [17:08<18:28,  1.58it/s]

  Batch 1620/3375 | Loss: 2.3189


Training:  49%|███████████████████████▊                         | 1640/3375 [17:21<17:55,  1.61it/s]

  Batch 1640/3375 | Loss: 2.1893


Training:  49%|████████████████████████                         | 1660/3375 [17:34<17:50,  1.60it/s]

  Batch 1660/3375 | Loss: 2.1873


Training:  50%|████████████████████████▍                        | 1680/3375 [17:46<17:42,  1.59it/s]

  Batch 1680/3375 | Loss: 2.3213


Training:  50%|████████████████████████▋                        | 1700/3375 [18:00<17:46,  1.57it/s]

  Batch 1700/3375 | Loss: 2.2909


Training:  51%|████████████████████████▉                        | 1720/3375 [18:13<17:39,  1.56it/s]

  Batch 1720/3375 | Loss: 2.2917


Training:  52%|█████████████████████████▎                       | 1740/3375 [18:26<17:24,  1.56it/s]

  Batch 1740/3375 | Loss: 2.3035


Training:  52%|█████████████████████████▌                       | 1760/3375 [18:39<17:12,  1.56it/s]

  Batch 1760/3375 | Loss: 2.3384


Training:  53%|█████████████████████████▊                       | 1780/3375 [18:52<16:31,  1.61it/s]

  Batch 1780/3375 | Loss: 2.3283


Training:  53%|██████████████████████████▏                      | 1800/3375 [19:04<16:11,  1.62it/s]

  Batch 1800/3375 | Loss: 2.1908


Training:  54%|██████████████████████████▍                      | 1820/3375 [19:17<15:45,  1.65it/s]

  Batch 1820/3375 | Loss: 2.1395


Training:  55%|██████████████████████████▋                      | 1840/3375 [19:30<15:43,  1.63it/s]

  Batch 1840/3375 | Loss: 2.0939


Training:  55%|███████████████████████████                      | 1860/3375 [19:42<16:11,  1.56it/s]

  Batch 1860/3375 | Loss: 2.3749


Training:  56%|███████████████████████████▎                     | 1880/3375 [19:55<16:32,  1.51it/s]

  Batch 1880/3375 | Loss: 2.2552


Training:  56%|███████████████████████████▌                     | 1900/3375 [20:08<16:24,  1.50it/s]

  Batch 1900/3375 | Loss: 2.4562


Training:  57%|███████████████████████████▉                     | 1920/3375 [20:21<16:55,  1.43it/s]

  Batch 1920/3375 | Loss: 2.2181


Training:  57%|████████████████████████████▏                    | 1940/3375 [20:33<16:32,  1.45it/s]

  Batch 1940/3375 | Loss: 2.1678


Training:  58%|████████████████████████████▍                    | 1960/3375 [20:46<15:40,  1.51it/s]

  Batch 1960/3375 | Loss: 2.1724


Training:  59%|████████████████████████████▋                    | 1980/3375 [20:58<15:27,  1.50it/s]

  Batch 1980/3375 | Loss: 2.1600


Training:  59%|█████████████████████████████                    | 2000/3375 [21:11<14:39,  1.56it/s]

  Batch 2000/3375 | Loss: 2.3219


Training:  59%|█████████████████████████████                    | 2002/3375 [21:12<14:15,  1.60it/s]