In [None]:
import json
import os
import requests
import random
import string
import secrets
import time
import re
import collections
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import random_split
from torch.optim.lr_scheduler import ReduceLROnPlateau
import copy
import numpy as np
from collections import defaultdict

In [None]:

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
import random, string, copy
from collections import Counter

class HangmanDataset(Dataset):
    def __init__(self, words, max_word_length=45, reveal_ratio=0.5):
        self.words = [word.lower() for word in words if len(word) <= max_word_length]
        self.max_length = max_word_length
        self.reveal_ratio = reveal_ratio
        self.char_to_idx = {char: i+1 for i, char in enumerate(string.ascii_lowercase)}
        self.char_to_idx['_'] = 0  # blank
        self.char_to_idx['PAD'] = 27

    def __len__(self): return len(self.words) * 80

    def __getitem__(self, idx):
        word = self.words[idx % len(self.words)]
        reveal_count = int(len(word) * self.reveal_ratio)
        revealed = random.sample(range(len(word)), reveal_count) if reveal_count > 0 else []

        word_state = [0] * self.max_length
        for pos in revealed: word_state[pos] = self.char_to_idx[word[pos]]

        target_pos, target_chars, position_context, vowels = [], [], [0]*self.max_length, set('aeiou')
        for i in range(len(word)):
            if i not in revealed:
                ctx = 0
                if i > 0 and word_state[i-1] != 0: ctx += 1
                if i < len(word)-1 and word_state[i+1] != 0: ctx += 2
                if ctx:
                    target_pos.append(i)
                    target_chars.append(self.char_to_idx[word[i]])
                    position_context[i] = ctx

        count_blanks = word_state[:len(word)].count(0)
        blank_vowel_next = [0]*self.max_length
        for i in range(len(word)):
            if word_state[i] == 0:
                l = word[i-1] if i > 0 else 'x'
                r = word[i+1] if i < len(word)-1 else 'x'
                if l in vowels or r in vowels:
                    blank_vowel_next[i] = 1

        max_targets = 10
        while len(target_pos) < max_targets:
            target_pos.append(-1)
            target_chars.append(0)

        return {
            'word_state': torch.tensor(word_state, dtype=torch.long),
            'position_context': torch.tensor(position_context, dtype=torch.long),
            'target_positions': torch.tensor(target_pos[:max_targets], dtype=torch.long),
            'target_chars': torch.tensor(target_chars[:max_targets], dtype=torch.long),
            'word_length': torch.tensor(len(word), dtype=torch.long),
            'blank_count': torch.tensor(count_blanks, dtype=torch.long),
            'next_to_vowel': torch.tensor(blank_vowel_next, dtype=torch.float)
        }

class EnhancedHangmanModel(nn.Module):
    def __init__(self, vocab_size=28, max_len=45, emb_dim=128, hidden_dim=1024, ablate={}):
        super().__init__()
        self.ablate = ablate
        self.char_emb = nn.Embedding(vocab_size, emb_dim)
        self.ctx_emb = nn.Embedding(4, 32)

        self.pattern_cnn = nn.Sequential(
            nn.Conv1d(emb_dim, 64, 3, padding=1), nn.ReLU(), nn.Dropout(0.2),
            nn.Conv1d(64, 64, 3, padding=1), nn.ReLU(), nn.Dropout(0.2)
        )

        self.encoder = nn.LSTM(emb_dim + 32, hidden_dim, bidirectional=True, batch_first=True)

        self.pos_prior_mlp = nn.Sequential(
            nn.Linear(1 + 1 + 64, 32), nn.ReLU(), nn.Dropout(0.2), nn.Linear(32, 26)
        )

        def decoder():
            return nn.Sequential(
                nn.Linear(hidden_dim*2 + 26, hidden_dim),
                nn.ReLU(),
                nn.Dropout(0.3),

                nn.Linear(hidden_dim, hidden_dim // 2),
                nn.ReLU(),
                nn.Dropout(0.3),

                nn.Linear(hidden_dim // 2, 26)
            )

        self.left_decoder = decoder()
        self.right_decoder = decoder()
        self.both_decoder = decoder()

    def forward(self, word_state, position_context, word_length, blank_count, next_to_vowel):
        B, L = word_state.size()
        emb = self.char_emb(word_state)
        cnn_feat = self.pattern_cnn(emb.transpose(1, 2)).transpose(1, 2)
        ctx = self.ctx_emb(position_context)
        encoded, _ = self.encoder(torch.cat([emb, ctx], -1))

        pos_scores = []
        for i in range(L):
            is_blank = (word_state[:, i] == 0).float().unsqueeze(1)
            bc = blank_count.unsqueeze(1).float() / L
            pos_input = torch.cat([is_blank, bc, cnn_feat[:, i, :]], -1)
            pos_scores.append(self.pos_prior_mlp(pos_input).unsqueeze(1))
        priors = torch.cat(pos_scores, 1)  # [B, L, 26]

        out = torch.zeros(B, L, 26, device=word_state.device)
        for i in range(L):
            h = encoded[:, i, :]
            ptype = position_context[:, i]
            inp = torch.cat([h, priors[:, i, :]], -1)
            out[ptype==1, i, :] = self.left_decoder(inp[ptype==1])
            out[ptype==2, i, :] = self.right_decoder(inp[ptype==2])
            out[ptype==3, i, :] = self.both_decoder(inp[ptype==3])

        return out

class HangmanSolver:
    def __init__(self, word_list, model_path="best_model.pth"):
        self.model = EnhancedHangmanModel()
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # Load state dict without DataParallel
        self.model.load_state_dict(torch.load(model_path, map_location=self.device))

        # Now optionally wrap in DataParallel AFTER loading
        if torch.cuda.device_count() > 1:
            self.model = nn.DataParallel(self.model)

        self.model.eval()

        self.dictionary = word_list
        self.char_to_idx = {c: i+1 for i, c in enumerate(string.ascii_lowercase)}
        self.char_to_idx['_'] = 0
        self.idx_to_char = {v: k for k, v in self.char_to_idx.items() if v != 0}

    def _fallback_prediction(self, pattern, guessed):
        counter = Counter()
        for word in self.dictionary:
            if len(word) != len(pattern):
                continue
            match = True
            for wc, pc in zip(word, pattern):
                if pc != '_' and pc != wc:
                    match = False
                    break
                if pc == '_' and wc in guessed:
                    match = False
                    break
            if match:
                for i, c in enumerate(word):
                    if pattern[i] == '_' and c not in guessed:
                        counter[c] += 1

        if not counter:
            for c in string.ascii_lowercase:
                if c not in guessed:
                    return c
            return random.choice([c for c in string.ascii_lowercase if c not in guessed])  # final fallback

        for letter, _ in counter.most_common():
            if letter not in guessed:
                return letter

        return random.choice([c for c in string.ascii_lowercase if c not in guessed])

    def predict_letter(self, word_state, guessed_letters=None):
        if guessed_letters is None:
            guessed_letters = set()
        if ' ' in word_state:
            word_state = word_state.replace(' ', '')
        if all(c == '_' for c in word_state):
            # Return most frequent unguessed letter as the first guess
            common_order = "etaoinshrdlucmfwypvbgkjqxz"
            for letter in common_order:
                if letter not in guessed_letters:
                    return letter
        print("ok")
        word_state = word_state.lower()
        max_length = 45
        state_indices = []
        position_context = []

        for i, char in enumerate(word_state):
            if char == '_':
                state_indices.append(0)
                context = 0
                if i > 0 and word_state[i-1] != '_':
                    context += 1
                if i < len(word_state)-1 and word_state[i+1] != '_':
                    context += 2
                position_context.append(context)
            else:
                state_indices.append(self.char_to_idx.get(char, 27))
                position_context.append(0)

        while len(state_indices) < max_length:
            state_indices.append(27)
            position_context.append(0)

        word_tensor = torch.tensor([state_indices], dtype=torch.long).to(self.device)
        context_tensor = torch.tensor([position_context], dtype=torch.long).to(self.device)
        length_tensor = torch.tensor([len(word_state)], dtype=torch.long).to(self.device)
        blank_count_tensor = torch.tensor([word_state.count('_')], dtype=torch.long).to(self.device)
        blank_vowel_next = [0]*max_length

        for i in range(len(word_state)):
            if word_state[i] == '_':
                l = word_state[i-1] if i > 0 else 'x'
                r = word_state[i+1] if i < len(word_state)-1 else 'x'
                if l in 'aeiou' or r in 'aeiou':
                    blank_vowel_next[i] = 1

        blank_vowel_tensor = torch.tensor([blank_vowel_next], dtype=torch.float).to(self.device)

        with torch.no_grad():
            predictions = self.model(word_tensor, context_tensor, length_tensor, blank_count_tensor, blank_vowel_tensor)

        best_predictions = []
        for i in range(len(word_state)):
            if word_state[i] == '_' and position_context[i] > 0:
                probs = torch.softmax(predictions[0, i, :], dim=0)
                for j, prob in enumerate(probs):
                    letter = chr(ord('a') + j)
                    if letter not in guessed_letters:
                        best_predictions.append((letter, prob.item(), i))

        if not best_predictions:
            for i in range(len(word_state)):
                if word_state[i] == '_':
                    probs = torch.softmax(predictions[0, i, :], dim=0)
                    for j, prob in enumerate(probs):
                        letter = chr(ord('a') + j)
                        if letter not in guessed_letters:
                            best_predictions.append((letter, prob.item(), i))

        if best_predictions:
            best_predictions.sort(key=lambda x: x[1], reverse=True)
            return best_predictions[0][0]
        
        # Fallback to frequency-based guess if model fails
        common_order = "etaoinshrdlucmfwypvbgkjqxz"
        for letter in common_order:
            if letter not in guessed_letters:
                return letter

        return 'e'  # very rare fallback if all else fails

from tqdm import tqdm

def train_model(words, epochs=10, early_stopping_patience=5):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = EnhancedHangmanModel()
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)
    model.to(device)

    opt = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)
    scheduler = ReduceLROnPlateau(opt, patience=2)
    loss_fn = nn.CrossEntropyLoss()

    best = float('inf')
    patience_counter = 0

    for ep in range(epochs):
        # Curriculum: reveal_ratio increases with epoch (starts hard, becomes easier)
        reveal_ratio = max(0.1, 1.0 - (ep+1) * 0.02)  # Start from 1.0, decrease to 0.1
        ds = HangmanDataset(words, reveal_ratio=reveal_ratio)
        train_len = int(0.9 * len(ds))
        tr, val = random_split(ds, [train_len, len(ds)-train_len])
        dl = DataLoader(tr, shuffle=True, pin_memory=True, batch_size=256, num_workers=4)
        vl = DataLoader(val, pin_memory=True, batch_size=256, num_workers=4)

        print(f"\n--- Epoch {ep+1} | Reveal Ratio: {reveal_ratio:.2f} ---", flush=True)
        model.train()
        total_loss = 0
        batch_count = 0

        for i, batch in enumerate(tqdm(dl, desc="Training", ncols=100)):
            opt.zero_grad()
            out = model(batch['word_state'].to(device), batch['position_context'].to(device),
                        batch['word_length'].to(device), batch['blank_count'].to(device),
                        batch['next_to_vowel'].to(device))
            loss, count = 0, 0
            for b in range(out.size(0)):
                target_pos = batch['target_positions'][b].to(device)
                target_char = batch['target_chars'][b].to(device)
                for p, c in zip(target_pos, target_char):
                    if p >= 0 and c > 0:
                        loss += loss_fn(out[b, p], c-1)
                        count += 1
            if count > 0:
                loss = loss / count
                loss.backward()
                opt.step()
                total_loss += loss.item()
                batch_count += 1
            if i % 20 == 0:
                if isinstance(loss, torch.Tensor):
                    print(f"  Batch {i}/{len(dl)} | Loss: {loss.item():.4f}", flush=True)
                else:
                    print(f"  Batch {i}/{len(dl)} | Loss: N/A (no valid targets)", flush=True)


        train_loss = total_loss / batch_count if batch_count > 0 else 0

        # Validation
        model.eval()
        val_loss = 0
        val_batches = 0
        with torch.no_grad():
            for batch in tqdm(vl, desc="Validation", ncols=100):
                out = model(batch['word_state'].to(device), batch['position_context'].to(device),
                            batch['word_length'].to(device), batch['blank_count'].to(device),
                            batch['next_to_vowel'].to(device))
                loss, count = 0, 0
                for b in range(out.size(0)):
                    target_pos = batch['target_positions'][b].to(device)
                    target_char = batch['target_chars'][b].to(device)
                    for p, c in zip(target_pos, target_char):
                        if p >= 0 and c > 0:
                            loss += loss_fn(out[b, p], c-1)
                            count += 1
                if count > 0:
                    val_loss += loss.item() / count
                    val_batches += 1

        val_loss = val_loss / val_batches if val_batches > 0 else 0
        scheduler.step(val_loss)

        if count > 0 and i % 20 == 0:
            print(f"  Batch {i}/{len(dl)} | Loss: {loss.item():.4f}", flush=True)

        if val_loss < best:
            best = val_loss
            patience_counter = 0
            torch.save(model.module.state_dict() if isinstance(model, nn.DataParallel) else model.state_dict(), "best_model.pth")
            print("✅ Model improved and saved.", flush=True)
        else:
            patience_counter += 1
            print(f"⚠️ No improvement. Patience: {patience_counter}/{early_stopping_patience}", flush=True)
            if patience_counter >= early_stopping_patience:
                print(f"🛑 Early stopping at epoch {ep+1}", flush=True)
                break

    return model

In [10]:
dictionary = open("/kaggle/input/yeshui/words_250000_train.txt").read().splitlines()
# Test code to run in a separate cell after defining the model classes
random.shuffle(dictionary)
word_list = dictionary[:2500]

print("Starting training...")
print(f"Training on {len(word_list)} words")
model = train_model(word_list, epochs=50)

print("\nTraining completed!")
torch.save(model.module.state_dict() if isinstance(model, nn.DataParallel) else model.state_dict(), "best_model.pth")
print("Model saved as 'best_model.pth'")

solver = HangmanSolver(word_list)
solver.model = model
solver.model.eval()

# Game simulation
def simulate_hangman_game(word, solver, max_wrong=6, verbose=True):
    word = word.lower()
    current = '_' * len(word)
    guessed = set()
    wrong_count = 0
    moves = []

    if verbose:
        print(f"\n{'='*30}\nGAME: {word}\n{'='*30}")

    while '_' in current and wrong_count < max_wrong:
        if verbose:
            print(f"Current: {current}\nWrong guesses: {wrong_count}/{max_wrong}")

        letter = solver.predict_letter(current, guessed)
        guessed.add(letter)
        moves.append(letter)

        if letter in word:
            current = ''.join([c if c == letter or current[i] != '_' else '_' for i, c in enumerate(word)])
            if verbose:
                print(f"✓ Correct! Guessed: {letter}")
        else:
            wrong_count += 1
            if verbose:
                print(f"✗ Wrong! Guessed: {letter}")
        if verbose:
            print()

    if verbose:
        if '_' not in current:
            print(f"🎉 WON! Final word: {current}")
        else:
            print(f"💀 LOST! Word was: {word}")
        print(f"Moves: {' -> '.join(moves)}")

    return '_' not in current, moves, wrong_count

# Run game test
simulate_hangman_game("learning", solver, verbose=True)

Starting training...
Training on 2500 words

--- Epoch 1 | Reveal Ratio: 0.98 ---


Training:   0%|                                                             | 0/704 [00:00<?, ?it/s]

  Batch 0/704 | Loss: 3.2514


Training:   3%|█▍                                                  | 20/704 [00:10<05:44,  1.98it/s]

  Batch 20/704 | Loss: 2.9088


Training:   6%|██▉                                                 | 40/704 [00:20<05:32,  2.00it/s]

  Batch 40/704 | Loss: 2.6062


Training:   9%|████▍                                               | 60/704 [00:31<05:32,  1.93it/s]

  Batch 60/704 | Loss: 2.4197


Training:  11%|█████▉                                              | 80/704 [00:41<05:14,  1.98it/s]

  Batch 80/704 | Loss: 2.2797


Training:  14%|███████▏                                           | 100/704 [00:51<05:01,  2.00it/s]

  Batch 100/704 | Loss: 2.1516


Training:  17%|████████▋                                          | 120/704 [01:01<05:13,  1.86it/s]

  Batch 120/704 | Loss: 2.1777


Training:  20%|██████████▏                                        | 140/704 [01:11<04:41,  2.00it/s]

  Batch 140/704 | Loss: 2.1410


Training:  23%|███████████▌                                       | 160/704 [01:21<04:31,  2.01it/s]

  Batch 160/704 | Loss: 2.1675


Training:  26%|█████████████                                      | 180/704 [01:31<04:21,  2.01it/s]

  Batch 180/704 | Loss: 1.9065


Training:  28%|██████████████▍                                    | 200/704 [01:42<04:14,  1.98it/s]

  Batch 200/704 | Loss: 2.0256


Training:  31%|███████████████▉                                   | 220/704 [01:52<04:08,  1.95it/s]

  Batch 220/704 | Loss: 1.9956


Training:  34%|█████████████████▍                                 | 240/704 [02:02<03:52,  1.99it/s]

  Batch 240/704 | Loss: 1.9110


Training:  37%|██████████████████▊                                | 260/704 [02:12<03:46,  1.96it/s]

  Batch 260/704 | Loss: 1.8032


Training:  40%|████████████████████▎                              | 280/704 [02:23<03:39,  1.93it/s]

  Batch 280/704 | Loss: 1.7403


Training:  43%|█████████████████████▋                             | 300/704 [02:33<03:28,  1.94it/s]

  Batch 300/704 | Loss: 1.7711


Training:  45%|███████████████████████▏                           | 320/704 [02:43<03:13,  1.98it/s]

  Batch 320/704 | Loss: 1.6817


Training:  48%|████████████████████████▋                          | 340/704 [02:53<03:06,  1.95it/s]

  Batch 340/704 | Loss: 1.5825


Training:  51%|██████████████████████████                         | 360/704 [03:03<02:56,  1.95it/s]

  Batch 360/704 | Loss: 1.5419


Training:  54%|███████████████████████████▌                       | 380/704 [03:13<02:45,  1.96it/s]

  Batch 380/704 | Loss: 1.6906


Training:  57%|████████████████████████████▉                      | 400/704 [03:23<02:33,  1.98it/s]

  Batch 400/704 | Loss: 1.4796


Training:  60%|██████████████████████████████▍                    | 420/704 [03:34<02:22,  1.99it/s]

  Batch 420/704 | Loss: 1.5460


Training:  62%|███████████████████████████████▉                   | 440/704 [03:44<02:15,  1.94it/s]

  Batch 440/704 | Loss: 1.3524


Training:  65%|█████████████████████████████████▎                 | 460/704 [03:54<02:03,  1.97it/s]

  Batch 460/704 | Loss: 1.3433


Training:  68%|██████████████████████████████████▊                | 480/704 [04:04<01:52,  1.99it/s]

  Batch 480/704 | Loss: 1.1597


Training:  71%|████████████████████████████████████▏              | 500/704 [04:14<01:45,  1.93it/s]

  Batch 500/704 | Loss: 1.1037


Training:  74%|█████████████████████████████████████▋             | 520/704 [04:24<01:34,  1.96it/s]

  Batch 520/704 | Loss: 1.0683


Training:  77%|███████████████████████████████████████            | 540/704 [04:35<01:22,  1.98it/s]

  Batch 540/704 | Loss: 1.0249


Training:  80%|████████████████████████████████████████▌          | 560/704 [04:45<01:19,  1.82it/s]

  Batch 560/704 | Loss: 0.9228


Training:  82%|██████████████████████████████████████████         | 580/704 [04:55<01:02,  1.97it/s]

  Batch 580/704 | Loss: 0.7576


Training:  85%|███████████████████████████████████████████▍       | 600/704 [05:05<00:52,  1.99it/s]

  Batch 600/704 | Loss: 0.7784


Training:  88%|████████████████████████████████████████████▉      | 620/704 [05:15<00:44,  1.88it/s]

  Batch 620/704 | Loss: 0.7590


Training:  91%|██████████████████████████████████████████████▎    | 640/704 [05:26<00:32,  1.99it/s]

  Batch 640/704 | Loss: 0.7294


Training:  94%|███████████████████████████████████████████████▊   | 660/704 [05:36<00:22,  1.94it/s]

  Batch 660/704 | Loss: 0.6735


Training:  97%|█████████████████████████████████████████████████▎ | 680/704 [05:46<00:12,  1.95it/s]

  Batch 680/704 | Loss: 0.6116


Training:  99%|██████████████████████████████████████████████████▋| 700/704 [05:56<00:02,  1.99it/s]

  Batch 700/704 | Loss: 0.6158


Training: 100%|███████████████████████████████████████████████████| 704/704 [05:58<00:00,  1.96it/s]
Validation: 100%|███████████████████████████████████████████████████| 79/79 [00:18<00:00,  4.26it/s]


✅ Model improved and saved.

--- Epoch 2 | Reveal Ratio: 0.96 ---


Training:   0%|                                                             | 0/704 [00:00<?, ?it/s]

  Batch 0/704 | Loss: 0.5849


Training:   3%|█▍                                                  | 20/704 [00:10<05:40,  2.01it/s]

  Batch 20/704 | Loss: 0.6076


Training:   6%|██▉                                                 | 40/704 [00:20<05:34,  1.98it/s]

  Batch 40/704 | Loss: 0.5683


Training:   9%|████▍                                               | 60/704 [00:30<05:28,  1.96it/s]

  Batch 60/704 | Loss: 0.5488


Training:  11%|█████▉                                              | 80/704 [00:41<05:14,  1.98it/s]

  Batch 80/704 | Loss: 0.4544


Training:  14%|███████▏                                           | 100/704 [00:51<05:01,  2.00it/s]

  Batch 100/704 | Loss: 0.4233


Training:  17%|████████▋                                          | 120/704 [01:01<04:54,  1.98it/s]

  Batch 120/704 | Loss: 0.4121


Training:  20%|██████████▏                                        | 140/704 [01:11<04:44,  1.98it/s]

  Batch 140/704 | Loss: 0.4228


Training:  23%|███████████▌                                       | 160/704 [01:21<04:30,  2.01it/s]

  Batch 160/704 | Loss: 0.3599


Training:  26%|█████████████                                      | 180/704 [01:31<04:25,  1.97it/s]

  Batch 180/704 | Loss: 0.2982


Training:  28%|██████████████▍                                    | 200/704 [01:42<04:17,  1.96it/s]

  Batch 200/704 | Loss: 0.2981


Training:  31%|███████████████▉                                   | 220/704 [01:52<04:01,  2.01it/s]

  Batch 220/704 | Loss: 0.2834


Training:  34%|█████████████████▍                                 | 240/704 [02:02<03:54,  1.98it/s]

  Batch 240/704 | Loss: 0.3250


Training:  37%|██████████████████▊                                | 260/704 [02:12<03:52,  1.91it/s]

  Batch 260/704 | Loss: 0.1824


Training:  40%|████████████████████▎                              | 280/704 [02:22<03:31,  2.00it/s]

  Batch 280/704 | Loss: 0.2345


Training:  43%|█████████████████████▋                             | 300/704 [02:32<03:21,  2.00it/s]

  Batch 300/704 | Loss: 0.3053


Training:  45%|███████████████████████▏                           | 320/704 [02:43<03:29,  1.83it/s]

  Batch 320/704 | Loss: 0.1747


Training:  48%|████████████████████████▋                          | 340/704 [02:53<03:04,  1.97it/s]

  Batch 340/704 | Loss: 0.1926


Training:  51%|██████████████████████████                         | 360/704 [03:03<02:54,  1.97it/s]

  Batch 360/704 | Loss: 0.1849


Training:  54%|███████████████████████████▌                       | 380/704 [03:13<02:42,  1.99it/s]

  Batch 380/704 | Loss: 0.1623


Training:  57%|████████████████████████████▉                      | 400/704 [03:23<02:33,  1.99it/s]

  Batch 400/704 | Loss: 0.1782


Training:  60%|██████████████████████████████▍                    | 420/704 [03:33<02:22,  2.00it/s]

  Batch 420/704 | Loss: 0.1160


Training:  62%|███████████████████████████████▉                   | 440/704 [03:44<02:13,  1.98it/s]

  Batch 440/704 | Loss: 0.1763


Training:  65%|█████████████████████████████████▎                 | 460/704 [03:54<02:03,  1.98it/s]

  Batch 460/704 | Loss: 0.2224


Training:  68%|██████████████████████████████████▊                | 480/704 [04:04<01:51,  2.00it/s]

  Batch 480/704 | Loss: 0.1231


Training:  71%|████████████████████████████████████▏              | 500/704 [04:14<01:41,  2.00it/s]

  Batch 500/704 | Loss: 0.0732


Training:  74%|█████████████████████████████████████▋             | 520/704 [04:24<01:32,  1.98it/s]

  Batch 520/704 | Loss: 0.0805


Training:  77%|███████████████████████████████████████            | 540/704 [04:34<01:22,  2.00it/s]

  Batch 540/704 | Loss: 0.1154


Training:  80%|████████████████████████████████████████▌          | 560/704 [04:44<01:13,  1.97it/s]

  Batch 560/704 | Loss: 0.1830


Training:  82%|██████████████████████████████████████████         | 580/704 [04:55<01:04,  1.91it/s]

  Batch 580/704 | Loss: 0.1401


Training:  85%|███████████████████████████████████████████▍       | 600/704 [05:05<00:52,  1.96it/s]

  Batch 600/704 | Loss: 0.0919


Training:  88%|████████████████████████████████████████████▉      | 620/704 [05:15<00:42,  1.96it/s]

  Batch 620/704 | Loss: 0.0835


Training:  91%|██████████████████████████████████████████████▎    | 640/704 [05:25<00:33,  1.91it/s]

  Batch 640/704 | Loss: 0.1366


Training:  94%|███████████████████████████████████████████████▊   | 660/704 [05:36<00:22,  1.94it/s]

  Batch 660/704 | Loss: 0.1019


Training:  97%|█████████████████████████████████████████████████▎ | 680/704 [05:46<00:12,  1.97it/s]

  Batch 680/704 | Loss: 0.1733


Training:  99%|██████████████████████████████████████████████████▋| 700/704 [05:56<00:02,  1.90it/s]

  Batch 700/704 | Loss: 0.1443


Training: 100%|███████████████████████████████████████████████████| 704/704 [05:58<00:00,  1.97it/s]
Validation: 100%|███████████████████████████████████████████████████| 79/79 [00:18<00:00,  4.29it/s]


✅ Model improved and saved.

--- Epoch 3 | Reveal Ratio: 0.94 ---


Training:   0%|                                                             | 0/704 [00:00<?, ?it/s]

  Batch 0/704 | Loss: 0.0771


Training:   3%|█▍                                                  | 20/704 [00:10<06:19,  1.80it/s]

  Batch 20/704 | Loss: 0.0831


Training:   6%|██▉                                                 | 40/704 [00:20<05:35,  1.98it/s]

  Batch 40/704 | Loss: 0.1278


Training:   9%|████▍                                               | 60/704 [00:31<05:27,  1.97it/s]

  Batch 60/704 | Loss: 0.1268


Training:  11%|█████▉                                              | 80/704 [00:41<05:32,  1.88it/s]

  Batch 80/704 | Loss: 0.1069


Training:  14%|███████▏                                           | 100/704 [00:51<05:01,  2.00it/s]

  Batch 100/704 | Loss: 0.1737


Training:  17%|████████▋                                          | 120/704 [01:01<04:57,  1.97it/s]

  Batch 120/704 | Loss: 0.0821


Training:  20%|██████████▏                                        | 140/704 [01:11<04:49,  1.95it/s]

  Batch 140/704 | Loss: 0.1034


Training:  23%|███████████▌                                       | 160/704 [01:22<04:37,  1.96it/s]

  Batch 160/704 | Loss: 0.0856


Training:  26%|█████████████                                      | 180/704 [01:32<04:25,  1.97it/s]

  Batch 180/704 | Loss: 0.0717


Training:  28%|██████████████▍                                    | 200/704 [01:42<04:11,  2.00it/s]

  Batch 200/704 | Loss: 0.0975


Training:  31%|███████████████▉                                   | 220/704 [01:52<04:05,  1.97it/s]

  Batch 220/704 | Loss: 0.1071


Training:  34%|█████████████████▍                                 | 240/704 [02:02<03:53,  1.99it/s]

  Batch 240/704 | Loss: 0.0612


Training:  37%|██████████████████▊                                | 260/704 [02:12<03:45,  1.97it/s]

  Batch 260/704 | Loss: 0.1068


Training:  40%|████████████████████▎                              | 280/704 [02:23<03:44,  1.89it/s]

  Batch 280/704 | Loss: 0.0655


Training:  43%|█████████████████████▋                             | 300/704 [02:33<03:23,  1.98it/s]

  Batch 300/704 | Loss: 0.1034


Training:  45%|███████████████████████▏                           | 320/704 [02:43<03:14,  1.97it/s]

  Batch 320/704 | Loss: 0.1533


Training:  48%|████████████████████████▋                          | 340/704 [02:54<03:02,  1.99it/s]

  Batch 340/704 | Loss: 0.0766


Training:  51%|██████████████████████████                         | 360/704 [03:04<02:53,  1.98it/s]

  Batch 360/704 | Loss: 0.0592


Training:  54%|███████████████████████████▌                       | 380/704 [03:14<02:45,  1.96it/s]

  Batch 380/704 | Loss: 0.0582


Training:  57%|████████████████████████████▉                      | 400/704 [03:24<02:34,  1.96it/s]

  Batch 400/704 | Loss: 0.0856


Training:  60%|██████████████████████████████▍                    | 420/704 [03:34<02:24,  1.97it/s]

  Batch 420/704 | Loss: 0.0828


Training:  62%|███████████████████████████████▉                   | 440/704 [03:44<02:13,  1.98it/s]

  Batch 440/704 | Loss: 0.0484


Training:  65%|█████████████████████████████████▎                 | 460/704 [03:55<02:10,  1.87it/s]

  Batch 460/704 | Loss: 0.0902


Training:  68%|██████████████████████████████████▊                | 480/704 [04:05<01:53,  1.97it/s]

  Batch 480/704 | Loss: 0.1363


Training:  71%|████████████████████████████████████▏              | 500/704 [04:15<01:41,  2.00it/s]

  Batch 500/704 | Loss: 0.0699


Training:  74%|█████████████████████████████████████▋             | 520/704 [04:25<01:42,  1.79it/s]

  Batch 520/704 | Loss: 0.0763


Training:  77%|███████████████████████████████████████            | 540/704 [04:36<01:23,  1.97it/s]

  Batch 540/704 | Loss: 0.1652


Training:  80%|████████████████████████████████████████▌          | 560/704 [04:46<01:13,  1.96it/s]

  Batch 560/704 | Loss: 0.0783


Training:  82%|██████████████████████████████████████████         | 580/704 [04:56<01:03,  1.94it/s]

  Batch 580/704 | Loss: 0.1058


Training:  85%|███████████████████████████████████████████▍       | 600/704 [05:06<00:52,  1.99it/s]

  Batch 600/704 | Loss: 0.0780


Training:  88%|████████████████████████████████████████████▉      | 620/704 [05:17<00:42,  1.98it/s]

  Batch 620/704 | Loss: 0.0908


Training:  91%|██████████████████████████████████████████████▎    | 640/704 [05:27<00:32,  1.97it/s]

  Batch 640/704 | Loss: 0.0976


Training:  94%|███████████████████████████████████████████████▊   | 660/704 [05:37<00:22,  1.97it/s]

  Batch 660/704 | Loss: 0.0767


Training:  97%|█████████████████████████████████████████████████▎ | 680/704 [05:47<00:12,  1.98it/s]

  Batch 680/704 | Loss: 0.0908


Training:  99%|██████████████████████████████████████████████████▋| 700/704 [05:57<00:01,  2.01it/s]

  Batch 700/704 | Loss: 0.0642


Training: 100%|███████████████████████████████████████████████████| 704/704 [05:59<00:00,  1.96it/s]
Validation: 100%|███████████████████████████████████████████████████| 79/79 [00:18<00:00,  4.24it/s]


✅ Model improved and saved.

--- Epoch 4 | Reveal Ratio: 0.92 ---


Training:   0%|                                                             | 0/704 [00:00<?, ?it/s]

  Batch 0/704 | Loss: 0.3173


Training:   3%|█▍                                                  | 20/704 [00:10<06:00,  1.90it/s]

  Batch 20/704 | Loss: 0.3826


Training:   6%|██▉                                                 | 40/704 [00:21<05:44,  1.93it/s]

  Batch 40/704 | Loss: 0.3105


Training:   9%|████▍                                               | 60/704 [00:31<05:37,  1.91it/s]

  Batch 60/704 | Loss: 0.2807


Training:  11%|█████▉                                              | 80/704 [00:41<05:24,  1.92it/s]

  Batch 80/704 | Loss: 0.1805


Training:  14%|███████▏                                           | 100/704 [00:52<05:12,  1.93it/s]

  Batch 100/704 | Loss: 0.3670


Training:  17%|████████▋                                          | 120/704 [01:02<04:55,  1.97it/s]

  Batch 120/704 | Loss: 0.0993


Training:  20%|██████████▏                                        | 140/704 [01:12<04:48,  1.96it/s]

  Batch 140/704 | Loss: 0.1570


Training:  23%|███████████▌                                       | 160/704 [01:23<04:45,  1.91it/s]

  Batch 160/704 | Loss: 0.2589


Training:  26%|█████████████                                      | 180/704 [01:33<04:28,  1.95it/s]

  Batch 180/704 | Loss: 0.2289


Training:  28%|██████████████▍                                    | 200/704 [01:43<04:17,  1.96it/s]

  Batch 200/704 | Loss: 0.1530


Training:  31%|███████████████▉                                   | 220/704 [01:54<04:12,  1.91it/s]

  Batch 220/704 | Loss: 0.1832


Training:  34%|█████████████████▍                                 | 240/704 [02:04<04:00,  1.93it/s]

  Batch 240/704 | Loss: 0.1765


Training:  37%|██████████████████▊                                | 260/704 [02:15<03:49,  1.94it/s]

  Batch 260/704 | Loss: 0.1348


Training:  40%|████████████████████▎                              | 280/704 [02:25<03:39,  1.93it/s]

  Batch 280/704 | Loss: 0.1909


Training:  43%|█████████████████████▋                             | 300/704 [02:35<03:28,  1.94it/s]

  Batch 300/704 | Loss: 0.1742


Training:  45%|███████████████████████▏                           | 320/704 [02:46<03:16,  1.96it/s]

  Batch 320/704 | Loss: 0.1308


Training:  48%|████████████████████████▋                          | 340/704 [02:56<03:12,  1.89it/s]

  Batch 340/704 | Loss: 0.1075


Training:  51%|██████████████████████████                         | 360/704 [03:06<02:55,  1.96it/s]

  Batch 360/704 | Loss: 0.1363


Training:  54%|███████████████████████████▌                       | 380/704 [03:17<02:56,  1.84it/s]

  Batch 380/704 | Loss: 0.0865


Training:  57%|████████████████████████████▉                      | 400/704 [03:28<02:46,  1.83it/s]

  Batch 400/704 | Loss: 0.1125


Training:  60%|██████████████████████████████▍                    | 420/704 [03:38<02:24,  1.96it/s]

  Batch 420/704 | Loss: 0.0858


Training:  62%|███████████████████████████████▉                   | 440/704 [03:48<02:18,  1.91it/s]

  Batch 440/704 | Loss: 0.1350


Training:  65%|█████████████████████████████████▎                 | 460/704 [03:59<02:14,  1.82it/s]

  Batch 460/704 | Loss: 0.1451


Training:  68%|██████████████████████████████████▊                | 480/704 [04:09<01:58,  1.89it/s]

  Batch 480/704 | Loss: 0.1574


Training:  71%|████████████████████████████████████▏              | 500/704 [04:20<01:48,  1.89it/s]

  Batch 500/704 | Loss: 0.0976


Training:  74%|█████████████████████████████████████▋             | 520/704 [04:30<01:42,  1.80it/s]

  Batch 520/704 | Loss: 0.1243


Training:  77%|███████████████████████████████████████            | 540/704 [04:40<01:24,  1.94it/s]

  Batch 540/704 | Loss: 0.1652


Training:  80%|████████████████████████████████████████▌          | 560/704 [04:51<01:13,  1.95it/s]

  Batch 560/704 | Loss: 0.0781


Training:  82%|██████████████████████████████████████████         | 580/704 [05:01<01:05,  1.89it/s]

  Batch 580/704 | Loss: 0.2511


Training:  85%|███████████████████████████████████████████▍       | 600/704 [05:12<00:53,  1.95it/s]

  Batch 600/704 | Loss: 0.1360


Training:  88%|████████████████████████████████████████████▉      | 620/704 [05:22<00:43,  1.94it/s]

  Batch 620/704 | Loss: 0.1426


Training:  91%|██████████████████████████████████████████████▎    | 640/704 [05:33<00:33,  1.92it/s]

  Batch 640/704 | Loss: 0.1348


Training:  94%|███████████████████████████████████████████████▊   | 660/704 [05:43<00:22,  1.94it/s]

  Batch 660/704 | Loss: 0.1451


Training:  97%|█████████████████████████████████████████████████▎ | 680/704 [05:53<00:12,  1.93it/s]

  Batch 680/704 | Loss: 0.0976


Training:  99%|██████████████████████████████████████████████████▋| 700/704 [06:04<00:02,  1.94it/s]

  Batch 700/704 | Loss: 0.1388


Training: 100%|███████████████████████████████████████████████████| 704/704 [06:06<00:00,  1.92it/s]
Validation: 100%|███████████████████████████████████████████████████| 79/79 [00:18<00:00,  4.17it/s]

⚠️ No improvement. Patience: 1/5






--- Epoch 5 | Reveal Ratio: 0.90 ---


Training:   0%|                                                             | 0/704 [00:00<?, ?it/s]

  Batch 0/704 | Loss: 0.3024


Training:   3%|█▍                                                  | 20/704 [00:10<06:08,  1.86it/s]

  Batch 20/704 | Loss: 0.3434


Training:   6%|██▉                                                 | 40/704 [00:21<05:51,  1.89it/s]

  Batch 40/704 | Loss: 0.3461


Training:   9%|████▍                                               | 60/704 [00:32<05:43,  1.87it/s]

  Batch 60/704 | Loss: 0.2532


Training:  11%|█████▉                                              | 80/704 [00:43<05:34,  1.87it/s]

  Batch 80/704 | Loss: 0.2585


Training:  14%|███████▏                                           | 100/704 [00:53<05:17,  1.90it/s]

  Batch 100/704 | Loss: 0.2077


Training:  17%|████████▋                                          | 120/704 [01:04<05:11,  1.88it/s]

  Batch 120/704 | Loss: 0.2365


Training:  20%|██████████▏                                        | 140/704 [01:15<04:57,  1.89it/s]

  Batch 140/704 | Loss: 0.3026


Training:  23%|███████████▌                                       | 160/704 [01:25<04:48,  1.89it/s]

  Batch 160/704 | Loss: 0.1876


Training:  26%|█████████████                                      | 180/704 [01:36<04:38,  1.88it/s]

  Batch 180/704 | Loss: 0.2056


Training:  28%|██████████████▍                                    | 200/704 [01:47<04:25,  1.90it/s]

  Batch 200/704 | Loss: 0.1692


Training:  31%|███████████████▉                                   | 220/704 [01:57<04:18,  1.87it/s]

  Batch 220/704 | Loss: 0.2706


Training:  34%|█████████████████▍                                 | 240/704 [02:08<04:11,  1.85it/s]

  Batch 240/704 | Loss: 0.1833


Training:  37%|██████████████████▊                                | 260/704 [02:19<03:54,  1.89it/s]

  Batch 260/704 | Loss: 0.1658


Training:  40%|████████████████████▎                              | 280/704 [02:30<03:43,  1.89it/s]

  Batch 280/704 | Loss: 0.1924


Training:  43%|█████████████████████▋                             | 300/704 [02:40<03:33,  1.89it/s]

  Batch 300/704 | Loss: 0.2147


Training:  45%|███████████████████████▏                           | 320/704 [02:51<03:23,  1.89it/s]

  Batch 320/704 | Loss: 0.1820


Training:  48%|████████████████████████▋                          | 340/704 [03:01<03:11,  1.91it/s]

  Batch 340/704 | Loss: 0.1195


Training:  51%|██████████████████████████                         | 360/704 [03:12<03:04,  1.87it/s]

  Batch 360/704 | Loss: 0.1466


Training:  54%|███████████████████████████▌                       | 380/704 [03:23<02:48,  1.92it/s]

  Batch 380/704 | Loss: 0.1946


Training:  57%|████████████████████████████▉                      | 400/704 [03:33<02:42,  1.87it/s]

  Batch 400/704 | Loss: 0.2319


Training:  60%|██████████████████████████████▍                    | 420/704 [03:44<02:31,  1.87it/s]

  Batch 420/704 | Loss: 0.1523


Training:  62%|███████████████████████████████▉                   | 440/704 [03:55<02:19,  1.89it/s]

  Batch 440/704 | Loss: 0.1622


Training:  65%|█████████████████████████████████▎                 | 460/704 [04:05<02:08,  1.90it/s]

  Batch 460/704 | Loss: 0.1149


Training:  68%|██████████████████████████████████▊                | 480/704 [04:16<01:57,  1.91it/s]

  Batch 480/704 | Loss: 0.1939


Training:  71%|████████████████████████████████████▏              | 500/704 [04:27<01:50,  1.84it/s]

  Batch 500/704 | Loss: 0.1232


Training:  74%|█████████████████████████████████████▋             | 520/704 [04:37<01:38,  1.87it/s]

  Batch 520/704 | Loss: 0.1834


Training:  77%|███████████████████████████████████████            | 540/704 [04:48<01:26,  1.90it/s]

  Batch 540/704 | Loss: 0.1343


Training:  80%|████████████████████████████████████████▌          | 560/704 [04:58<01:16,  1.88it/s]

  Batch 560/704 | Loss: 0.1480


Training:  82%|██████████████████████████████████████████         | 580/704 [05:09<01:05,  1.90it/s]

  Batch 580/704 | Loss: 0.2585


Training:  85%|███████████████████████████████████████████▍       | 600/704 [05:20<00:54,  1.90it/s]

  Batch 600/704 | Loss: 0.1258


Training:  88%|████████████████████████████████████████████▉      | 620/704 [05:30<00:43,  1.92it/s]

  Batch 620/704 | Loss: 0.1243


Training:  91%|██████████████████████████████████████████████▎    | 640/704 [05:41<00:33,  1.91it/s]

  Batch 640/704 | Loss: 0.0878


Training:  94%|███████████████████████████████████████████████▊   | 660/704 [05:52<00:22,  1.91it/s]

  Batch 660/704 | Loss: 0.1726


Training:  97%|█████████████████████████████████████████████████▎ | 680/704 [06:02<00:12,  1.88it/s]

  Batch 680/704 | Loss: 0.0958


Training:  99%|██████████████████████████████████████████████████▋| 700/704 [06:13<00:02,  1.89it/s]

  Batch 700/704 | Loss: 0.0741


Training: 100%|███████████████████████████████████████████████████| 704/704 [06:15<00:00,  1.87it/s]
Validation: 100%|███████████████████████████████████████████████████| 79/79 [00:19<00:00,  4.10it/s]

⚠️ No improvement. Patience: 2/5

--- Epoch 6 | Reveal Ratio: 0.88 ---



Training:   0%|                                                             | 0/704 [00:00<?, ?it/s]

  Batch 0/704 | Loss: 0.4404


Training:   3%|█▍                                                  | 20/704 [00:11<06:19,  1.80it/s]

  Batch 20/704 | Loss: 0.4205


Training:   6%|██▉                                                 | 40/704 [00:22<06:11,  1.79it/s]

  Batch 40/704 | Loss: 0.3797


Training:   9%|████▍                                               | 60/704 [00:33<06:17,  1.71it/s]

  Batch 60/704 | Loss: 0.4186


Training:  11%|█████▉                                              | 80/704 [00:44<05:37,  1.85it/s]

  Batch 80/704 | Loss: 0.3098


Training:  14%|███████▏                                           | 100/704 [00:55<05:26,  1.85it/s]

  Batch 100/704 | Loss: 0.3129


Training:  17%|████████▋                                          | 120/704 [01:06<05:28,  1.78it/s]

  Batch 120/704 | Loss: 0.4651


Training:  20%|██████████▏                                        | 140/704 [01:17<05:06,  1.84it/s]

  Batch 140/704 | Loss: 0.2546


Training:  23%|███████████▌                                       | 160/704 [01:28<04:56,  1.84it/s]

  Batch 160/704 | Loss: 0.2697


Training:  26%|█████████████                                      | 180/704 [01:39<04:52,  1.79it/s]

  Batch 180/704 | Loss: 0.3061


Training:  28%|██████████████▍                                    | 200/704 [01:50<04:36,  1.82it/s]

  Batch 200/704 | Loss: 0.1800


Training:  31%|███████████████▉                                   | 220/704 [02:01<04:28,  1.80it/s]

  Batch 220/704 | Loss: 0.2325


Training:  34%|█████████████████▍                                 | 240/704 [02:12<04:15,  1.82it/s]

  Batch 240/704 | Loss: 0.3473


Training:  37%|██████████████████▊                                | 260/704 [02:23<04:03,  1.82it/s]

  Batch 260/704 | Loss: 0.2832


Training:  40%|████████████████████▎                              | 280/704 [02:34<03:50,  1.84it/s]

  Batch 280/704 | Loss: 0.3077


Training:  43%|█████████████████████▋                             | 300/704 [02:45<03:39,  1.84it/s]

  Batch 300/704 | Loss: 0.1935


Training:  45%|███████████████████████▏                           | 320/704 [02:56<03:29,  1.83it/s]

  Batch 320/704 | Loss: 0.1555


Training:  48%|████████████████████████▋                          | 340/704 [03:07<03:17,  1.84it/s]

  Batch 340/704 | Loss: 0.1769


Training:  51%|██████████████████████████                         | 360/704 [03:18<03:07,  1.84it/s]

  Batch 360/704 | Loss: 0.1936


Training:  54%|███████████████████████████▌                       | 380/704 [03:29<02:58,  1.81it/s]

  Batch 380/704 | Loss: 0.2145


Training:  57%|████████████████████████████▉                      | 400/704 [03:40<02:48,  1.81it/s]

  Batch 400/704 | Loss: 0.2482


Training:  60%|██████████████████████████████▍                    | 420/704 [03:51<02:34,  1.84it/s]

  Batch 420/704 | Loss: 0.1861


Training:  62%|███████████████████████████████▉                   | 440/704 [04:02<02:23,  1.84it/s]

  Batch 440/704 | Loss: 0.2612


Training:  65%|█████████████████████████████████▎                 | 460/704 [04:13<02:12,  1.84it/s]

  Batch 460/704 | Loss: 0.2640


Training:  68%|██████████████████████████████████▊                | 480/704 [04:24<02:03,  1.82it/s]

  Batch 480/704 | Loss: 0.1534


Training:  71%|████████████████████████████████████▏              | 500/704 [04:35<01:51,  1.84it/s]

  Batch 500/704 | Loss: 0.2366


Training:  74%|█████████████████████████████████████▋             | 520/704 [04:46<01:41,  1.82it/s]

  Batch 520/704 | Loss: 0.1690


Training:  77%|███████████████████████████████████████            | 540/704 [04:57<01:29,  1.84it/s]

  Batch 540/704 | Loss: 0.1972


Training:  80%|████████████████████████████████████████▌          | 560/704 [05:08<01:19,  1.81it/s]

  Batch 560/704 | Loss: 0.1442


Training:  82%|██████████████████████████████████████████         | 580/704 [05:19<01:10,  1.77it/s]

  Batch 580/704 | Loss: 0.2492


Training:  85%|███████████████████████████████████████████▍       | 600/704 [05:30<00:57,  1.80it/s]

  Batch 600/704 | Loss: 0.1137


Training:  88%|████████████████████████████████████████████▉      | 620/704 [05:41<00:45,  1.83it/s]

  Batch 620/704 | Loss: 0.0879


Training:  91%|██████████████████████████████████████████████▎    | 640/704 [05:52<00:38,  1.65it/s]

  Batch 640/704 | Loss: 0.1855


Training:  94%|███████████████████████████████████████████████▊   | 660/704 [06:03<00:24,  1.83it/s]

  Batch 660/704 | Loss: 0.1242


Training:  97%|█████████████████████████████████████████████████▎ | 680/704 [06:14<00:13,  1.82it/s]

  Batch 680/704 | Loss: 0.1381


Training:  99%|██████████████████████████████████████████████████▋| 700/704 [06:25<00:02,  1.80it/s]

  Batch 700/704 | Loss: 0.1659


Training: 100%|███████████████████████████████████████████████████| 704/704 [06:27<00:00,  1.82it/s]
Validation: 100%|███████████████████████████████████████████████████| 79/79 [00:19<00:00,  3.96it/s]

⚠️ No improvement. Patience: 3/5

--- Epoch 7 | Reveal Ratio: 0.86 ---



Training:   0%|                                                             | 0/704 [00:00<?, ?it/s]

  Batch 0/704 | Loss: 0.3496


Training:   3%|█▍                                                  | 20/704 [00:11<06:25,  1.77it/s]

  Batch 20/704 | Loss: 0.3585


Training:   6%|██▉                                                 | 40/704 [00:22<06:11,  1.79it/s]

  Batch 40/704 | Loss: 0.2860


Training:   9%|████▍                                               | 60/704 [00:34<06:01,  1.78it/s]

  Batch 60/704 | Loss: 0.2308


Training:  11%|█████▉                                              | 80/704 [00:45<05:48,  1.79it/s]

  Batch 80/704 | Loss: 0.2904


Training:  14%|███████▏                                           | 100/704 [00:56<05:35,  1.80it/s]

  Batch 100/704 | Loss: 0.2522


Training:  17%|████████▋                                          | 120/704 [01:07<05:20,  1.82it/s]

  Batch 120/704 | Loss: 0.2169


Training:  20%|██████████▏                                        | 140/704 [01:18<05:09,  1.82it/s]

  Batch 140/704 | Loss: 0.2763


Training:  23%|███████████▌                                       | 160/704 [01:29<05:00,  1.81it/s]

  Batch 160/704 | Loss: 0.1966


Training:  26%|█████████████                                      | 180/704 [01:41<04:47,  1.82it/s]

  Batch 180/704 | Loss: 0.2337


Training:  28%|██████████████▍                                    | 200/704 [01:52<04:44,  1.77it/s]

  Batch 200/704 | Loss: 0.2018


Training:  31%|███████████████▉                                   | 220/704 [02:03<04:33,  1.77it/s]

  Batch 220/704 | Loss: 0.2429


Training:  34%|█████████████████▍                                 | 240/704 [02:14<04:20,  1.78it/s]

  Batch 240/704 | Loss: 0.2171


Training:  37%|██████████████████▊                                | 260/704 [02:26<04:07,  1.79it/s]

  Batch 260/704 | Loss: 0.1025


Training:  40%|████████████████████▎                              | 280/704 [02:37<03:53,  1.82it/s]

  Batch 280/704 | Loss: 0.1957


Training:  43%|█████████████████████▋                             | 300/704 [02:48<04:01,  1.67it/s]

  Batch 300/704 | Loss: 0.1966


Training:  45%|███████████████████████▏                           | 320/704 [02:59<03:31,  1.81it/s]

  Batch 320/704 | Loss: 0.1870


Training:  48%|████████████████████████▋                          | 340/704 [03:10<03:19,  1.83it/s]

  Batch 340/704 | Loss: 0.1285


Training:  51%|██████████████████████████                         | 360/704 [03:22<03:19,  1.72it/s]

  Batch 360/704 | Loss: 0.1714


Training:  54%|███████████████████████████▌                       | 380/704 [03:33<03:00,  1.80it/s]

  Batch 380/704 | Loss: 0.2032


Training:  57%|████████████████████████████▉                      | 400/704 [03:44<03:01,  1.68it/s]

  Batch 400/704 | Loss: 0.2072


Training:  60%|██████████████████████████████▍                    | 420/704 [03:55<02:39,  1.78it/s]

  Batch 420/704 | Loss: 0.1910


Training:  62%|███████████████████████████████▉                   | 440/704 [04:06<02:25,  1.81it/s]

  Batch 440/704 | Loss: 0.1916


Training:  65%|█████████████████████████████████▎                 | 460/704 [04:18<02:16,  1.79it/s]

  Batch 460/704 | Loss: 0.2729


Training:  68%|██████████████████████████████████▊                | 480/704 [04:29<02:04,  1.80it/s]

  Batch 480/704 | Loss: 0.1527


Training:  71%|████████████████████████████████████▏              | 500/704 [04:40<01:52,  1.81it/s]

  Batch 500/704 | Loss: 0.1835


Training:  74%|█████████████████████████████████████▋             | 520/704 [04:51<01:42,  1.80it/s]

  Batch 520/704 | Loss: 0.1686


Training:  77%|███████████████████████████████████████            | 540/704 [05:03<01:31,  1.79it/s]

  Batch 540/704 | Loss: 0.1733


Training:  80%|████████████████████████████████████████▌          | 560/704 [05:14<01:20,  1.80it/s]

  Batch 560/704 | Loss: 0.1686


Training:  82%|██████████████████████████████████████████         | 580/704 [05:25<01:08,  1.82it/s]

  Batch 580/704 | Loss: 0.1587


Training:  85%|███████████████████████████████████████████▍       | 600/704 [05:36<00:58,  1.78it/s]

  Batch 600/704 | Loss: 0.1545


Training:  88%|████████████████████████████████████████████▉      | 620/704 [05:47<00:46,  1.82it/s]

  Batch 620/704 | Loss: 0.1230


Training:  91%|██████████████████████████████████████████████▎    | 640/704 [05:58<00:37,  1.73it/s]

  Batch 640/704 | Loss: 0.1587


Training:  94%|███████████████████████████████████████████████▊   | 660/704 [06:10<00:24,  1.78it/s]

  Batch 660/704 | Loss: 0.0824


Training:  97%|█████████████████████████████████████████████████▎ | 680/704 [06:21<00:13,  1.81it/s]

  Batch 680/704 | Loss: 0.1327


Training:  99%|██████████████████████████████████████████████████▋| 700/704 [06:32<00:02,  1.71it/s]

  Batch 700/704 | Loss: 0.1178


Training: 100%|███████████████████████████████████████████████████| 704/704 [06:34<00:00,  1.78it/s]
Validation: 100%|███████████████████████████████████████████████████| 79/79 [00:20<00:00,  3.88it/s]

⚠️ No improvement. Patience: 4/5

--- Epoch 8 | Reveal Ratio: 0.84 ---



Training:   0%|                                                             | 0/704 [00:00<?, ?it/s]

  Batch 0/704 | Loss: 0.2797


Training:   3%|█▍                                                  | 20/704 [00:11<06:32,  1.74it/s]

  Batch 20/704 | Loss: 0.3208


Training:   6%|██▉                                                 | 40/704 [00:23<06:21,  1.74it/s]

  Batch 40/704 | Loss: 0.2755


Training:   9%|████▍                                               | 60/704 [00:34<06:09,  1.74it/s]

  Batch 60/704 | Loss: 0.2310


Training:  11%|█████▉                                              | 80/704 [00:46<06:04,  1.71it/s]

  Batch 80/704 | Loss: 0.1971


Training:  14%|███████▏                                           | 100/704 [00:58<05:42,  1.77it/s]

  Batch 100/704 | Loss: 0.2417


Training:  17%|████████▋                                          | 120/704 [01:09<05:31,  1.76it/s]

  Batch 120/704 | Loss: 0.1759


Training:  20%|██████████▏                                        | 140/704 [01:21<05:20,  1.76it/s]

  Batch 140/704 | Loss: 0.2413


Training:  23%|███████████▌                                       | 160/704 [01:32<05:08,  1.76it/s]

  Batch 160/704 | Loss: 0.2561


Training:  26%|█████████████                                      | 180/704 [01:43<05:13,  1.67it/s]

  Batch 180/704 | Loss: 0.2256


Training:  28%|██████████████▍                                    | 200/704 [01:55<04:47,  1.76it/s]

  Batch 200/704 | Loss: 0.2877


Training:  31%|███████████████▉                                   | 220/704 [02:06<04:43,  1.71it/s]

  Batch 220/704 | Loss: 0.2281


Training:  34%|█████████████████▍                                 | 240/704 [02:18<04:30,  1.71it/s]

  Batch 240/704 | Loss: 0.1313


Training:  37%|██████████████████▊                                | 260/704 [02:29<04:10,  1.77it/s]

  Batch 260/704 | Loss: 0.2334


Training:  40%|████████████████████▎                              | 280/704 [02:41<04:02,  1.75it/s]

  Batch 280/704 | Loss: 0.1864


Training:  43%|█████████████████████▋                             | 300/704 [02:53<03:49,  1.76it/s]

  Batch 300/704 | Loss: 0.2447


Training:  45%|███████████████████████▏                           | 320/704 [03:04<03:46,  1.70it/s]

  Batch 320/704 | Loss: 0.2529


Training:  48%|████████████████████████▋                          | 340/704 [03:16<03:28,  1.74it/s]

  Batch 340/704 | Loss: 0.1722


Training:  51%|██████████████████████████                         | 360/704 [03:27<03:14,  1.77it/s]

  Batch 360/704 | Loss: 0.1703


Training:  54%|███████████████████████████▌                       | 380/704 [03:39<03:02,  1.77it/s]

  Batch 380/704 | Loss: 0.2144


Training:  57%|████████████████████████████▉                      | 400/704 [03:50<02:58,  1.70it/s]

  Batch 400/704 | Loss: 0.1393


Training:  60%|██████████████████████████████▍                    | 420/704 [04:02<02:40,  1.76it/s]

  Batch 420/704 | Loss: 0.2194


Training:  62%|███████████████████████████████▉                   | 440/704 [04:13<02:29,  1.76it/s]

  Batch 440/704 | Loss: 0.1669


Training:  65%|█████████████████████████████████▎                 | 460/704 [04:25<02:24,  1.69it/s]

  Batch 460/704 | Loss: 0.1927


Training:  68%|██████████████████████████████████▊                | 480/704 [04:36<02:09,  1.73it/s]

  Batch 480/704 | Loss: 0.1327


Training:  71%|████████████████████████████████████▏              | 500/704 [04:48<01:56,  1.75it/s]

  Batch 500/704 | Loss: 0.1282


Training:  74%|█████████████████████████████████████▋             | 520/704 [04:59<01:45,  1.75it/s]

  Batch 520/704 | Loss: 0.1753


Training:  77%|███████████████████████████████████████            | 540/704 [05:11<01:33,  1.76it/s]

  Batch 540/704 | Loss: 0.1417


Training:  80%|████████████████████████████████████████▌          | 560/704 [05:22<01:22,  1.76it/s]

  Batch 560/704 | Loss: 0.1350


Training:  82%|██████████████████████████████████████████         | 580/704 [05:34<01:09,  1.78it/s]

  Batch 580/704 | Loss: 0.1852


Training:  85%|███████████████████████████████████████████▍       | 600/704 [05:45<00:59,  1.74it/s]

  Batch 600/704 | Loss: 0.1797


Training:  88%|████████████████████████████████████████████▉      | 620/704 [05:57<00:48,  1.74it/s]

  Batch 620/704 | Loss: 0.1740


Training:  91%|██████████████████████████████████████████████▎    | 640/704 [06:08<00:36,  1.76it/s]

  Batch 640/704 | Loss: 0.2052


Training:  94%|███████████████████████████████████████████████▊   | 660/704 [06:20<00:25,  1.72it/s]

  Batch 660/704 | Loss: 0.1138


Training:  97%|█████████████████████████████████████████████████▎ | 680/704 [06:32<00:15,  1.60it/s]

  Batch 680/704 | Loss: 0.1809


Training:  99%|██████████████████████████████████████████████████▋| 700/704 [06:43<00:02,  1.75it/s]

  Batch 700/704 | Loss: 0.1111


Training: 100%|███████████████████████████████████████████████████| 704/704 [06:45<00:00,  1.74it/s]
Validation: 100%|███████████████████████████████████████████████████| 79/79 [00:20<00:00,  3.77it/s]

⚠️ No improvement. Patience: 5/5
🛑 Early stopping at epoch 8






Training completed!
Model saved as 'best_model.pth'

GAME: learning
Current: ________
Wrong guesses: 0/6
✓ Correct! Guessed: e

Current: _e______
Wrong guesses: 0/6
ok
✓ Correct! Guessed: n

Current: _e__n_n_
Wrong guesses: 0/6
ok
✓ Correct! Guessed: a

Current: _ea_n_n_
Wrong guesses: 0/6
ok
✗ Wrong! Guessed: d

Current: _ea_n_n_
Wrong guesses: 1/6
ok
✓ Correct! Guessed: i

Current: _ea_nin_
Wrong guesses: 1/6
ok
✓ Correct! Guessed: g

Current: _ea_ning
Wrong guesses: 1/6
ok
✓ Correct! Guessed: r

Current: _earning
Wrong guesses: 1/6
ok
✗ Wrong! Guessed: m

Current: _earning
Wrong guesses: 2/6
ok
✗ Wrong! Guessed: b

Current: _earning
Wrong guesses: 3/6
ok
✗ Wrong! Guessed: c

Current: _earning
Wrong guesses: 4/6
ok
✗ Wrong! Guessed: p

Current: _earning
Wrong guesses: 5/6
ok
✗ Wrong! Guessed: s

💀 LOST! Word was: learning
Moves: e -> n -> a -> d -> i -> g -> r -> m -> b -> c -> p -> s


(False, ['e', 'n', 'a', 'd', 'i', 'g', 'r', 'm', 'b', 'c', 'p', 's'], 6)