In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import random
from collections import Counter
from tqdm import tqdm
import os
import math

# ---------------------------------------------------------------------------
# Configuration: data locations, RNG seeding, model/training hyper-parameters
# and the character <-> index vocabulary shared by the dataset, model and solver.
# ---------------------------------------------------------------------------
EXPERT_TRAIN_DICT_PATH = "/kaggle/input/hman-ds/expert_train.txt"  # word list used to train the model
FINAL_TEST_DICT_PATH = "/kaggle/input/hman-ds/final_test.txt"      # word list played in the final evaluation
RL_VAL_DICT_PATH = "/kaggle/input/hman-ds/rl_train.txt"            # word list used as the validation split
MODEL_FILE = "bilstm_attn_hangman_GATED_MLPGATE_improv2.pth"       # checkpoint path for the best state_dict

# Seed every RNG this file draws from (python `random`, numpy, torch CPU/CUDA).
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

D_MODEL = 128       # passed to the model as the character embedding size
HIDDEN_DIM = 256    # LSTM hidden size per direction (the BiLSTM output is twice this)
NUM_LAYERS = 4      # number of stacked LSTM layers
ATTN_HEADS = 8      # number of self-attention heads
MAX_SEQ_LEN = 32    # words with len(word) >= MAX_SEQ_LEN are dropped before training/evaluation
ALPHABET_LEN = 26   # number of guessable letters (a-z)
STATE_DIM = ALPHABET_LEN * 2  # state vector: 26 "visible letter" flags followed by 26 "wrong guess" flags
MAX_SIMULATED_WRONG_GUESSES = 6  # upper bound on wrong guesses simulated per training sample

EPOCHS = 120            # number of training epochs; also used as T_max of the cosine LR schedule
LEARNING_RATE = 0.001   # initial AdamW learning rate
BATCH_SIZE = 128        # batch size of the train and validation loaders
WEIGHT_DECAY = 1e-6     # AdamW weight decay

# Vocabulary: index 0 is reserved for padding, 1 is the blank '_', 2..27 are 'a'..'z'.
CHAR_TO_IX = {char: i + 2 for i, char in enumerate("abcdefghijklmnopqrstuvwxyz")}
CHAR_TO_IX['_'] = 1
VOCAB_SIZE = len(CHAR_TO_IX) + 1  # +1 accounts for the padding index 0
IX_TO_CHAR = {i: char for char, i in CHAR_TO_IX.items()}
IX_TO_CHAR[0] = '<pad>'
ALL_LETTERS = set("abcdefghijklmnopqrstuvwxyz")

# arch
class Gated_BiLSTM_Attention_Hangman(nn.Module):
    """
    BiLSTM with Self-Attention and Gated Fusion using a MLP Gate.

    Pipeline: token embedding -> stacked BiLSTM -> multi-head self-attention
    (residual + LayerNorm) -> element-wise gate that mixes the sequence
    features with a projected game-state vector -> per-position linear
    classifier over the vocabulary.

    Args:
        vocab_size: Number of token ids (index 0 is the padding id).
        embedding_dim: Size of the token embeddings.
        hidden_dim: LSTM hidden size per direction.
        num_layers: Number of stacked LSTM layers.
        state_dim: Length of the game-state vector fed to ``forward``.
        num_heads: Number of attention heads; must divide ``2 * hidden_dim``.
    """
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, state_dim, num_heads):
        super().__init__()

        # NOTE: sub-modules are registered in the same order and under the same
        # names as before so previously saved state_dicts keep loading.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm_output_dim = hidden_dim * 2  # forward + backward directions

        # BiLSTM over the embedded pattern
        self.lstm = nn.LSTM(
            embedding_dim, hidden_dim, num_layers=num_layers,
            bidirectional=True, batch_first=True
        )

        # self-attention over the BiLSTM outputs
        self.attention = nn.MultiheadAttention(
            embed_dim=self.lstm_output_dim, num_heads=num_heads, batch_first=True
        )
        self.norm = nn.LayerNorm(self.lstm_output_dim)

        # deep projection of the game state (C_State) to the BiLSTM output width
        self.state_projection = nn.Sequential(
            nn.Linear(state_dim, self.lstm_output_dim),
            nn.ReLU(),
            nn.Linear(self.lstm_output_dim, self.lstm_output_dim)
        )

        # MLP gate: input is [O_Attn; C_State] (2 * lstm_output_dim wide),
        # output is a per-feature gate in (0, 1) of width lstm_output_dim.
        self.gate_fc = nn.Sequential(
            nn.Linear(self.lstm_output_dim * 2, self.lstm_output_dim),
            nn.ReLU(),
            nn.Linear(self.lstm_output_dim, self.lstm_output_dim),
            nn.Sigmoid()
        )

        # final classifier
        self.fc = nn.Linear(self.lstm_output_dim, vocab_size)

    def forward(self, x, state):
        """
        Compute per-position vocabulary logits.

        Args:
            x: Long tensor of token ids, shape (batch, seq_len); 0 marks padding.
            state: Float tensor of shape (batch, state_dim).

        Returns:
            Float tensor of logits with shape (batch, seq_len, vocab_size).
        """
        pad_mask = (x == 0)
        seq_len = x.size(1)
        embedded = self.embedding(x)

        # True length of each row = 1-based position of its last non-pad token.
        # Clamp to 1 because pack_padded_sequence rejects zero-length rows.
        positions = torch.arange(1, seq_len + 1, device=x.device)
        lengths = (positions * (~pad_mask)).max(dim=1).values.clamp(min=1)

        # Pack so the backward LSTM starts at the last real character instead
        # of first consuming the trailing pad steps. Without packing the hidden
        # states of a word depend on how much padding its batch happened to
        # need, which also makes batched training differ from unpadded inference.
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        packed_out, _ = self.lstm(packed)
        lstm_out, _ = nn.utils.rnn.pad_packed_sequence(
            packed_out, batch_first=True, total_length=seq_len
        )

        attn_output, _ = self.attention(
            lstm_out, lstm_out, lstm_out, key_padding_mask=pad_mask
        )
        O_Attn = self.norm(lstm_out + attn_output)

        # Broadcast the projected state over the sequence axis (view, no copy).
        C_State = self.state_projection(state).unsqueeze(1).expand(-1, seq_len, -1)

        # Gamma decides, per feature and position, how much sequence evidence
        # versus game-state evidence flows into the classifier.
        Gamma = self.gate_fc(torch.cat([O_Attn, C_State], dim=-1))
        O_Gated = Gamma * O_Attn + (1 - Gamma) * C_State

        return self.fc(O_Gated)


class HangmanDataset(Dataset):
    """
    Random hangman training samples with the 52D (deep) state vector.

    Each ``__getitem__`` call draws a fresh random masking of the word and a
    fresh set of simulated wrong guesses, so the same index yields different
    samples across epochs.

    Args:
        word_list: Candidate words; expected to be lower-case already.
        max_seq_len: Words with ``len(word) >= max_seq_len`` are dropped.
        max_simulated_wrong_guesses: Upper bound on wrong guesses per sample.
        name: Label used only in the filter log message.
    """
    def __init__(self, word_list, max_seq_len, max_simulated_wrong_guesses, name=""):
        self.max_simulated_wrong_guesses = max_simulated_wrong_guesses
        self.max_seq_len = max_seq_len

        original_count = len(word_list)
        self.word_list = [word for word in word_list if self._is_usable(word)]
        filtered_count = original_count - len(self.word_list)
        if filtered_count > 0:
            print(f"  -> [{name} dataset] Filtered {filtered_count} words "
                  f"(empty, non a-z, or >= {self.max_seq_len} chars).")

    def _is_usable(self, word):
        """Return True for non-empty, short-enough words made only of a-z."""
        # Characters outside a-z have no vocabulary index and would raise
        # KeyError in __getitem__, so reject such words up front.
        return (
            bool(word)
            and len(word) < self.max_seq_len
            and all('a' <= c <= 'z' for c in word)
        )

    def __len__(self):
        return len(self.word_list)

    def __getitem__(self, idx):
        """
        Build one training sample.

        Returns:
            Tuple ``(input_ids, target_ids, state_vector)``: the masked word
            and the full word as long tensors of vocabulary indices, plus a
            float tensor of length ``2 * ALPHABET_LEN`` (visible-letter flags
            followed by wrong-guess flags).
        """
        word = self.word_list[idx]

        # Hide a random non-empty subset of positions.
        # NOTE(review): masking is per position, so a letter can be visible in
        # one slot and hidden in another, which never happens in a real game
        # where all occurrences are revealed together. Confirm this is intended.
        mask_count = random.randint(1, len(word))
        hidden_positions = set(random.sample(range(len(word)), k=mask_count))
        masked_word = "".join(
            '_' if pos in hidden_positions else ch for pos, ch in enumerate(word)
        )
        input_seq = [CHAR_TO_IX[c] for c in masked_word]
        target_seq = [CHAR_TO_IX[c] for c in word]

        visible_letters = set(masked_word) - {'_'}

        # Sorted so the sampled wrong guesses depend only on the RNG state,
        # not on hash-randomised set iteration order.
        incorrect_letters_pool = sorted(ALL_LETTERS - set(word))
        num_incorrect_to_simulate = min(
            random.randint(0, self.max_simulated_wrong_guesses),
            len(incorrect_letters_pool),
        )
        simulated_incorrect_guesses = random.sample(
            incorrect_letters_pool, k=num_incorrect_to_simulate
        )

        # Layout: [0, 26) visible-letter flags, [26, 52) wrong-guess flags.
        state_vector = torch.zeros(ALPHABET_LEN * 2)
        for char in visible_letters:
            state_vector[CHAR_TO_IX[char] - 2] = 1.0
        for char in simulated_incorrect_guesses:
            state_vector[CHAR_TO_IX[char] - 2 + ALPHABET_LEN] = 1.0

        return (
            torch.tensor(input_seq, dtype=torch.long),
            torch.tensor(target_seq, dtype=torch.long),
            state_vector
        )

# solver trainer
class IntelligentHangmanSolver:
    """
    Hangman agent backed by the gated BiLSTM-attention model.

    On construction the model is either trained (``train_mode=True`` with
    non-empty ``train_words`` and ``val_words``) or loaded from ``MODEL_FILE``.
    ``choose_letter`` then picks the next guess for a given pattern.

    Args:
        full_train_dict: Accepted for interface compatibility; not used.
        train_mode: Train a new model instead of only loading a checkpoint.
        train_words: Training word list (required when training).
        val_words: Validation word list (required when training).

    Raises:
        FileNotFoundError: If no training is requested/possible and
            ``MODEL_FILE`` does not exist.
    """
    def __init__(self, full_train_dict, train_mode=False, train_words=None, val_words=None):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {self.device}")
        self.all_letters = ALL_LETTERS
        self.all_letters_list = "abcdefghijklmnopqrstuvwxyz"

        self._load_or_train_model(train_mode, train_words, val_words)

    def _load_weights(self):
        """Load the state_dict stored at MODEL_FILE into the current model."""
        # NOTE: torch.load unpickles the file; only load checkpoints you trust.
        self.model.load_state_dict(torch.load(MODEL_FILE, map_location=self.device))

    def _load_or_train_model(self, train_mode, train_words, val_words):
        """Instantiate the model, then train it or load it from MODEL_FILE."""
        self.model = Gated_BiLSTM_Attention_Hangman(
            VOCAB_SIZE, D_MODEL, HIDDEN_DIM, NUM_LAYERS, state_dim=STATE_DIM, num_heads=ATTN_HEADS
        ).to(self.device)

        if train_mode and train_words and val_words:
            print("Starting training process with enhanced gate.")
            self._train_model(train_words, val_words)
            print(f"Training complete. Loading best model from {MODEL_FILE} for evaluation.")

            # Evaluate with the best checkpoint rather than the last-epoch weights.
            if os.path.exists(MODEL_FILE):
                self._load_weights()
            else:
                print("Warning: Best model was not saved.")

        elif os.path.exists(MODEL_FILE):
            print(f"Loading pre-trained model from {MODEL_FILE}.")
            self._load_weights()
        else:
            raise FileNotFoundError(f"Model file '{MODEL_FILE}' not found and not in train mode. Please enable training.")

        self.model.eval()

    @staticmethod
    def _collate_batch(batch):
        """Right-pad inputs/targets with 0 and stack the state vectors."""
        # A static method (unlike a function nested in _train_model) can be
        # pickled, so DataLoader workers also work with the spawn start method.
        inputs, targets, states = zip(*batch)
        inputs_padded = nn.utils.rnn.pad_sequence(inputs, batch_first=True, padding_value=0)
        targets_padded = nn.utils.rnn.pad_sequence(targets, batch_first=True, padding_value=0)
        states_batched = torch.stack(states, dim=0)
        return inputs_padded, targets_padded, states_batched

    def _train_model(self, train_words, val_words):
        """
        Train ``self.model`` for EPOCHS epochs with AdamW + cosine annealing.

        The state_dict with the lowest mean validation loss is written to
        MODEL_FILE. Training stops early (plain return) if a NaN loss is seen.
        """
        print("Initializing datasets with 52-dim state simulation.")

        train_dataset = HangmanDataset(train_words, MAX_SEQ_LEN, MAX_SIMULATED_WRONG_GUESSES, name="Train")
        val_dataset = HangmanDataset(val_words, MAX_SEQ_LEN, MAX_SIMULATED_WRONG_GUESSES, name="Validation")

        print("Datasets initialized.")

        train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True,
                                  collate_fn=self._collate_batch, num_workers=2)
        val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False,
                                collate_fn=self._collate_batch, num_workers=2)

        # ignore_index=0 keeps padded target positions out of the loss.
        criterion = nn.CrossEntropyLoss(ignore_index=0)
        optimizer = optim.AdamW(self.model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)

        best_val_loss = float('inf')

        for epoch in range(EPOCHS):
            self.model.train()
            total_train_loss = 0
            for inputs, targets, states in tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS} [Train]"):
                inputs, targets, states = inputs.to(self.device), targets.to(self.device), states.to(self.device)
                optimizer.zero_grad()
                outputs = self.model(inputs, states)
                loss = criterion(outputs.view(-1, VOCAB_SIZE), targets.view(-1))
                # Read the scalar once: every .item() forces a device sync.
                loss_value = loss.item()
                if math.isnan(loss_value):
                    print("\n!!! Train NaN !!!")
                    return
                loss.backward()
                optimizer.step()
                total_train_loss += loss_value
            avg_train_loss = total_train_loss / len(train_loader)

            self.model.eval()
            total_val_loss = 0
            with torch.no_grad():
                for inputs, targets, states in tqdm(val_loader, desc=f"Epoch {epoch+1}/{EPOCHS} [Val]"):
                    inputs, targets, states = inputs.to(self.device), targets.to(self.device), states.to(self.device)
                    outputs = self.model(inputs, states)
                    loss_value = criterion(outputs.view(-1, VOCAB_SIZE), targets.view(-1)).item()
                    if math.isnan(loss_value):
                        print("\n!!! Val NaN !!!")
                        return
                    total_val_loss += loss_value
            avg_val_loss = total_val_loss / len(val_loader)

            scheduler.step()
            # Read after scheduler.step(), i.e. the rate the next epoch will use.
            current_lr = optimizer.param_groups[0]['lr']
            print(f"Epoch {epoch+1}/{EPOCHS} -> Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, LR: {current_lr:.6f}")

            if avg_val_loss < best_val_loss:
                best_val_loss = avg_val_loss
                torch.save(self.model.state_dict(), MODEL_FILE)
                print(f"  -> Validation loss improved to {best_val_loss:.4f}. Saving model to {MODEL_FILE}")

    @staticmethod
    def _build_state_vector(visible_letters, incorrect_guesses):
        """Encode visible letters in slots [0, 26) and wrong guesses in [26, 52)."""
        state_vector = torch.zeros(ALPHABET_LEN * 2)
        for offset, letters in ((0, visible_letters), (ALPHABET_LEN, incorrect_guesses)):
            for char in letters:
                # Unknown characters map to 0 and '_' to 1; both fall outside
                # the 0..25 letter range after the shift and are skipped.
                char_idx = CHAR_TO_IX.get(char, 0) - 2
                if 0 <= char_idx < ALPHABET_LEN:
                    state_vector[char_idx + offset] = 1.0
        return state_vector

    # inference fn, creates 52-D state
    def _get_lstm_scores(self, pattern, guessed_letters_set):
        """
        Score every letter for the current pattern.

        Args:
            pattern: Current board, '_' for hidden positions.
            guessed_letters_set: Letters guessed so far (right or wrong).

        Returns:
            Dict mapping each of 'a'..'z' to its highest softmax probability
            over the blank positions; empty if the pattern has no blanks.
        """
        blank_positions = [i for i, char in enumerate(pattern) if char == '_']
        if not blank_positions:
            return {}

        self.model.eval()
        input_seq = torch.tensor(
            [[CHAR_TO_IX.get(c, 0) for c in pattern]], dtype=torch.long, device=self.device
        )

        visible_letters = {c for c in pattern if c.isalpha()}
        incorrect_guesses = set(guessed_letters_set) - visible_letters
        state_tensor = self._build_state_vector(
            visible_letters, incorrect_guesses
        ).unsqueeze(0).to(self.device)

        with torch.no_grad():
            logits = self.model(input_seq, state_tensor)
            probabilities = torch.softmax(logits, dim=2).squeeze(0)
            # Reduce over the blank positions on-device and transfer once,
            # instead of one .item() sync per (position, letter) pair.
            blank_index = torch.tensor(blank_positions, dtype=torch.long, device=probabilities.device)
            best_per_token = probabilities.index_select(0, blank_index).max(dim=0).values.cpu().tolist()

        return {
            letter: best_per_token[letter_idx]
            for letter_idx, letter in IX_TO_CHAR.items()
            if letter_idx >= 2
        }

    def choose_letter(self, pattern, guessed_letters_set):
        """
        Return the next letter to guess.

        Picks the not-yet-guessed letter with the highest model score; ties
        are broken alphabetically so the choice is deterministic. If every
        letter has been guessed, falls back to a fixed frequency order and
        finally to 'a'.
        """
        candidates = self.all_letters - set(guessed_letters_set)
        if candidates:
            lstm_scores = self._get_lstm_scores(pattern, guessed_letters_set)
            return max(sorted(candidates), key=lambda letter: lstm_scores.get(letter, 0.0))

        fallback_order = "esiarntolcdugpmhbyfvwkxqz"
        for letter in fallback_order:
            if letter not in guessed_letters_set:
                return letter
        return 'a'

class LocalGameSimulator:
    """
    In-memory hangman game used for offline evaluation.

    The secret word is lower-cased on construction. ``pattern`` holds one
    entry per character ('_' until revealed), ``guessed_letters`` records
    every distinct guess and ``lives_remaining`` drops by one per miss.
    """

    def __init__(self, secret_word, max_guesses=6):
        self.secret_word = secret_word.lower()
        self.max_guesses = max_guesses
        self.lives_remaining = max_guesses
        self.guessed_letters = set()
        self.pattern = ['_' for _ in self.secret_word]

    def get_pattern(self):
        """Return the board as a string."""
        return "".join(self.pattern)

    def is_won(self):
        """True once no blank is left on the board."""
        return all(slot != '_' for slot in self.pattern)

    def is_lost(self):
        """True once no lives remain."""
        return self.lives_remaining <= 0

    def is_game_over(self):
        """True when the game has been either won or lost."""
        return True if self.is_won() else self.is_lost()

    def guess(self, letter_char):
        """Apply a guess; repeated guesses are ignored and cost nothing."""
        if letter_char in self.guessed_letters:
            return
        self.guessed_letters.add(letter_char)

        # A miss only costs a life; the board stays as it is.
        if letter_char not in self.secret_word:
            self.lives_remaining -= 1
            return

        # A hit reveals every occurrence of the letter.
        for position, secret_char in enumerate(self.secret_word):
            if secret_char == letter_char:
                self.pattern[position] = letter_char

def run_local_test(solver, test_words):
    """
    Play one simulated hangman game per test word and print the win rate.

    Words that are empty or have ``len(word) >= MAX_SEQ_LEN`` are skipped.
    ``solver`` must provide ``choose_letter(pattern, guessed_letters)``.
    Nothing is returned; results are only printed.
    """
    print("\n--- STARTING FINAL EVALUATION (MAX CAPACITY GATED-ATTN) ---")

    playable_words = [word for word in test_words if word and len(word) < MAX_SEQ_LEN]
    skipped = len(test_words) - len(playable_words)
    if skipped > 0:
        print(f"  -> Filtered {skipped} test words.")
    if not playable_words:
        print("Error: No valid test words.")
        return

    total_games = len(playable_words)
    wins = 0
    for secret_word in tqdm(playable_words, desc="Evaluating Final Agent"):
        game = LocalGameSimulator(secret_word)
        # Keep asking the solver for a letter until the game is decided.
        while True:
            if game.is_game_over():
                break
            next_letter = solver.choose_letter(game.get_pattern(), game.guessed_letters)
            game.guess(next_letter)
        if game.is_won():
            wins += 1

    accuracy = (wins / total_games) * 100
    print("\n--- EVALUATION COMPLETE ---")
    print(f"Final Agent won {wins} out of {total_games} games.")
    print(f"Final Accuracy: {accuracy:.2f}%")

if __name__ == '__main__':
    # Script entry point: load the three word lists, train or load the
    # solver, then run the offline evaluation on the test words.
    FORCE_TRAINING = True  # retrain even if a checkpoint already exists

    required_paths = (EXPERT_TRAIN_DICT_PATH, FINAL_TEST_DICT_PATH, RL_VAL_DICT_PATH)
    missing_paths = [path for path in required_paths if not os.path.exists(path)]

    if missing_paths:
        print("ERROR: Ensure data files 'expert_train.txt', 'final_test.txt', and 'rl_train.txt' are present.")
    else:
        print("Loading data")
        # One word per line; surrounding whitespace is stripped, blank lines
        # are dropped and everything is lower-cased.
        with open(EXPERT_TRAIN_DICT_PATH, "r") as f:
            train_words = [w.lower() for w in map(str.strip, f) if w]
        with open(RL_VAL_DICT_PATH, "r") as f:
            val_words = [w.lower() for w in map(str.strip, f) if w]
        with open(FINAL_TEST_DICT_PATH, "r") as f:
            test_words = [w.lower() for w in map(str.strip, f) if w]
        print(f"Data loaded: {len(train_words)} train, {len(val_words)} val, {len(test_words)} test.")

        should_train = FORCE_TRAINING or not os.path.exists(MODEL_FILE)

        if not should_train:
            print(f"Found existing model at {MODEL_FILE}. Skipping training.")
            solver = IntelligentHangmanSolver(train_words, train_mode=False)
        else:
            print("Training mode enabled.")
            solver = IntelligentHangmanSolver(
                train_words, train_mode=True, train_words=train_words, val_words=val_words
            )

        run_local_test(solver, test_words)


Loading data.
Data loaded: 205301 train, 19999 val, 2000 test.
Training mode enabled.
Using device: cuda
Starting training process with enhanced gate.
Initializing datasets with 52-dim state simulation.
Datasets initialized.


Epoch 1/120 [Train]: 100%|██████████| 1604/1604 [01:08<00:00, 23.28it/s]
Epoch 1/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.55it/s]


Epoch 1/120 -> Train Loss: 1.3511, Val Loss: 1.2760, LR: 0.001000
  -> Validation loss improved to 1.2760. Saving model to bilstm_attn_hangman_GATED_MLPGATE_improv2.pth


Epoch 2/120 [Train]: 100%|██████████| 1604/1604 [01:15<00:00, 21.37it/s]
Epoch 2/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 56.28it/s]


Epoch 2/120 -> Train Loss: 1.2624, Val Loss: 1.2443, LR: 0.000999
  -> Validation loss improved to 1.2443. Saving model to bilstm_attn_hangman_GATED_MLPGATE_improv2.pth


Epoch 3/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.52it/s]
Epoch 3/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 56.16it/s]


Epoch 3/120 -> Train Loss: 1.2320, Val Loss: 1.2279, LR: 0.000998
  -> Validation loss improved to 1.2279. Saving model to bilstm_attn_hangman_GATED_MLPGATE_improv2.pth


Epoch 4/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.62it/s]
Epoch 4/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.81it/s]


Epoch 4/120 -> Train Loss: 1.2201, Val Loss: 1.2210, LR: 0.000997
  -> Validation loss improved to 1.2210. Saving model to bilstm_attn_hangman_GATED_MLPGATE_improv2.pth


Epoch 5/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.47it/s]
Epoch 5/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 56.18it/s]


Epoch 5/120 -> Train Loss: 1.2151, Val Loss: 1.2211, LR: 0.000996


Epoch 6/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.59it/s]
Epoch 6/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 56.24it/s]


Epoch 6/120 -> Train Loss: 1.2098, Val Loss: 1.2013, LR: 0.000994
  -> Validation loss improved to 1.2013. Saving model to bilstm_attn_hangman_GATED_MLPGATE_improv2.pth


Epoch 7/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.58it/s]
Epoch 7/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 56.22it/s]


Epoch 7/120 -> Train Loss: 1.2008, Val Loss: 1.2023, LR: 0.000992


Epoch 8/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.46it/s]
Epoch 8/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.86it/s]


Epoch 8/120 -> Train Loss: 1.1941, Val Loss: 1.1943, LR: 0.000989
  -> Validation loss improved to 1.1943. Saving model to bilstm_attn_hangman_GATED_MLPGATE_improv2.pth


Epoch 9/120 [Train]: 100%|██████████| 1604/1604 [01:15<00:00, 21.35it/s]
Epoch 9/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.94it/s]


Epoch 9/120 -> Train Loss: 1.1908, Val Loss: 1.1890, LR: 0.000986
  -> Validation loss improved to 1.1890. Saving model to bilstm_attn_hangman_GATED_MLPGATE_improv2.pth


Epoch 10/120 [Train]: 100%|██████████| 1604/1604 [01:15<00:00, 21.29it/s]
Epoch 10/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.75it/s]


Epoch 10/120 -> Train Loss: 1.1880, Val Loss: 1.1920, LR: 0.000983


Epoch 11/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.47it/s]
Epoch 11/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.94it/s]


Epoch 11/120 -> Train Loss: 1.1855, Val Loss: 1.1824, LR: 0.000979
  -> Validation loss improved to 1.1824. Saving model to bilstm_attn_hangman_GATED_MLPGATE_improv2.pth


Epoch 12/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.51it/s]
Epoch 12/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.26it/s]


Epoch 12/120 -> Train Loss: 1.1806, Val Loss: 1.1800, LR: 0.000976
  -> Validation loss improved to 1.1800. Saving model to bilstm_attn_hangman_GATED_MLPGATE_improv2.pth


Epoch 13/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.40it/s]
Epoch 13/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.83it/s]


Epoch 13/120 -> Train Loss: 1.1798, Val Loss: 1.1760, LR: 0.000971
  -> Validation loss improved to 1.1760. Saving model to bilstm_attn_hangman_GATED_MLPGATE_improv2.pth


Epoch 14/120 [Train]: 100%|██████████| 1604/1604 [01:15<00:00, 21.37it/s]
Epoch 14/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 56.06it/s]


Epoch 14/120 -> Train Loss: 1.1777, Val Loss: 1.1700, LR: 0.000967
  -> Validation loss improved to 1.1700. Saving model to bilstm_attn_hangman_GATED_MLPGATE_improv2.pth


Epoch 15/120 [Train]: 100%|██████████| 1604/1604 [01:15<00:00, 21.38it/s]
Epoch 15/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.83it/s]


Epoch 15/120 -> Train Loss: 1.1776, Val Loss: 1.1813, LR: 0.000962


Epoch 16/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.45it/s]
Epoch 16/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.67it/s]


Epoch 16/120 -> Train Loss: 1.1727, Val Loss: 1.1705, LR: 0.000957


Epoch 17/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.44it/s]
Epoch 17/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.90it/s]


Epoch 17/120 -> Train Loss: 1.1702, Val Loss: 1.1694, LR: 0.000951
  -> Validation loss improved to 1.1694. Saving model to bilstm_attn_hangman_GATED_MLPGATE_improv2.pth


Epoch 18/120 [Train]: 100%|██████████| 1604/1604 [01:15<00:00, 21.36it/s]
Epoch 18/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 56.05it/s]


Epoch 18/120 -> Train Loss: 1.1729, Val Loss: 1.1690, LR: 0.000946
  -> Validation loss improved to 1.1690. Saving model to bilstm_attn_hangman_GATED_MLPGATE_improv2.pth


Epoch 19/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.45it/s]
Epoch 19/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.78it/s]


Epoch 19/120 -> Train Loss: 1.1684, Val Loss: 1.1573, LR: 0.000939
  -> Validation loss improved to 1.1573. Saving model to bilstm_attn_hangman_GATED_MLPGATE_improv2.pth


Epoch 20/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.42it/s]
Epoch 20/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.93it/s]


Epoch 20/120 -> Train Loss: 1.1662, Val Loss: 1.1721, LR: 0.000933


Epoch 21/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.43it/s]
Epoch 21/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.71it/s]


Epoch 21/120 -> Train Loss: 1.1633, Val Loss: 1.1567, LR: 0.000926
  -> Validation loss improved to 1.1567. Saving model to bilstm_attn_hangman_GATED_MLPGATE_improv2.pth


Epoch 22/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.42it/s]
Epoch 22/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.87it/s]


Epoch 22/120 -> Train Loss: 1.1646, Val Loss: 1.1639, LR: 0.000919


Epoch 23/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.43it/s]
Epoch 23/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 56.01it/s]


Epoch 23/120 -> Train Loss: 1.1636, Val Loss: 1.1666, LR: 0.000912


Epoch 24/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.42it/s]
Epoch 24/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.69it/s]


Epoch 24/120 -> Train Loss: 1.1619, Val Loss: 1.1628, LR: 0.000905


Epoch 25/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.40it/s]
Epoch 25/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.60it/s]


Epoch 25/120 -> Train Loss: 1.1615, Val Loss: 1.1559, LR: 0.000897
  -> Validation loss improved to 1.1559. Saving model to bilstm_attn_hangman_GATED_MLPGATE_improv2.pth


Epoch 26/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.42it/s]
Epoch 26/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.96it/s]


Epoch 26/120 -> Train Loss: 1.1564, Val Loss: 1.1668, LR: 0.000889


Epoch 27/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.45it/s]
Epoch 27/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.83it/s]


Epoch 27/120 -> Train Loss: 1.1557, Val Loss: 1.1591, LR: 0.000880


Epoch 28/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.47it/s]
Epoch 28/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 56.07it/s]


Epoch 28/120 -> Train Loss: 1.1561, Val Loss: 1.1549, LR: 0.000872
  -> Validation loss improved to 1.1549. Saving model to bilstm_attn_hangman_GATED_MLPGATE_improv2.pth


Epoch 29/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.49it/s]
Epoch 29/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 56.00it/s]


Epoch 29/120 -> Train Loss: 1.1552, Val Loss: 1.1515, LR: 0.000863
  -> Validation loss improved to 1.1515. Saving model to bilstm_attn_hangman_GATED_MLPGATE_improv2.pth


Epoch 30/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.49it/s]
Epoch 30/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 56.29it/s]


Epoch 30/120 -> Train Loss: 1.1521, Val Loss: 1.1555, LR: 0.000854


Epoch 31/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.47it/s]
Epoch 31/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.90it/s]


Epoch 31/120 -> Train Loss: 1.1488, Val Loss: 1.1437, LR: 0.000844
  -> Validation loss improved to 1.1437. Saving model to bilstm_attn_hangman_GATED_MLPGATE_improv2.pth


Epoch 32/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.51it/s]
Epoch 32/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 56.18it/s]


Epoch 32/120 -> Train Loss: 1.1501, Val Loss: 1.1557, LR: 0.000835


Epoch 33/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.42it/s]
Epoch 33/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 56.02it/s]


Epoch 33/120 -> Train Loss: 1.1487, Val Loss: 1.1591, LR: 0.000825


Epoch 34/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.46it/s]
Epoch 34/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.70it/s]


Epoch 34/120 -> Train Loss: 1.1508, Val Loss: 1.1556, LR: 0.000815


Epoch 35/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.42it/s]
Epoch 35/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.87it/s]


Epoch 35/120 -> Train Loss: 1.1464, Val Loss: 1.1543, LR: 0.000804


Epoch 36/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.43it/s]
Epoch 36/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.93it/s]


Epoch 36/120 -> Train Loss: 1.1438, Val Loss: 1.1581, LR: 0.000794


Epoch 37/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.47it/s]
Epoch 37/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.90it/s]


Epoch 37/120 -> Train Loss: 1.1451, Val Loss: 1.1523, LR: 0.000783


Epoch 38/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.42it/s]
Epoch 38/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.75it/s]


Epoch 38/120 -> Train Loss: 1.1475, Val Loss: 1.1473, LR: 0.000772


Epoch 39/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.43it/s]
Epoch 39/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.87it/s]


Epoch 39/120 -> Train Loss: 1.1417, Val Loss: 1.1358, LR: 0.000761
  -> Validation loss improved to 1.1358. Saving model to bilstm_attn_hangman_GATED_MLPGATE_improv2.pth


Epoch 40/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.44it/s]
Epoch 40/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 56.23it/s]


Epoch 40/120 -> Train Loss: 1.1456, Val Loss: 1.1455, LR: 0.000750


Epoch 41/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.42it/s]
Epoch 41/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.90it/s]


Epoch 41/120 -> Train Loss: 1.1402, Val Loss: 1.1410, LR: 0.000739


Epoch 42/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.51it/s]
Epoch 42/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.63it/s]


Epoch 42/120 -> Train Loss: 1.1421, Val Loss: 1.1489, LR: 0.000727


Epoch 43/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.42it/s]
Epoch 43/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 56.00it/s]


Epoch 43/120 -> Train Loss: 1.1406, Val Loss: 1.1537, LR: 0.000715


Epoch 44/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.47it/s]
Epoch 44/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.86it/s]


Epoch 44/120 -> Train Loss: 1.1359, Val Loss: 1.1519, LR: 0.000703


Epoch 45/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.45it/s]
Epoch 45/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 56.11it/s]


Epoch 45/120 -> Train Loss: 1.1352, Val Loss: 1.1405, LR: 0.000691


Epoch 46/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.49it/s]
Epoch 46/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.94it/s]


Epoch 46/120 -> Train Loss: 1.1426, Val Loss: 1.1326, LR: 0.000679
  -> Validation loss improved to 1.1326. Saving model to bilstm_attn_hangman_GATED_MLPGATE_improv2.pth


Epoch 47/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.49it/s]
Epoch 47/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 56.10it/s]


Epoch 47/120 -> Train Loss: 1.1387, Val Loss: 1.1357, LR: 0.000667


Epoch 48/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.50it/s]
Epoch 48/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.83it/s]


Epoch 48/120 -> Train Loss: 1.1339, Val Loss: 1.1466, LR: 0.000655


Epoch 49/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.43it/s]
Epoch 49/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.91it/s]


Epoch 49/120 -> Train Loss: 1.1367, Val Loss: 1.1431, LR: 0.000642


Epoch 50/120 [Train]: 100%|██████████| 1604/1604 [01:15<00:00, 21.38it/s]
Epoch 50/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.99it/s]


Epoch 50/120 -> Train Loss: 1.1306, Val Loss: 1.1384, LR: 0.000629


Epoch 51/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.40it/s]
Epoch 51/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.89it/s]


Epoch 51/120 -> Train Loss: 1.1326, Val Loss: 1.1345, LR: 0.000617


Epoch 52/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.52it/s]
Epoch 52/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 56.09it/s]


Epoch 52/120 -> Train Loss: 1.1326, Val Loss: 1.1446, LR: 0.000604


Epoch 53/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.60it/s]
Epoch 53/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 56.20it/s]


Epoch 53/120 -> Train Loss: 1.1330, Val Loss: 1.1438, LR: 0.000591


Epoch 54/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.56it/s]
Epoch 54/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.77it/s]


Epoch 54/120 -> Train Loss: 1.1293, Val Loss: 1.1229, LR: 0.000578
  -> Validation loss improved to 1.1229. Saving model to bilstm_attn_hangman_GATED_MLPGATE_improv2.pth


Epoch 55/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.44it/s]
Epoch 55/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.64it/s]


Epoch 55/120 -> Train Loss: 1.1281, Val Loss: 1.1472, LR: 0.000565


Epoch 56/120 [Train]: 100%|██████████| 1604/1604 [01:15<00:00, 21.37it/s]
Epoch 56/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 56.26it/s]


Epoch 56/120 -> Train Loss: 1.1278, Val Loss: 1.1373, LR: 0.000552


Epoch 57/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.56it/s]
Epoch 57/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 56.33it/s]


Epoch 57/120 -> Train Loss: 1.1290, Val Loss: 1.1305, LR: 0.000539


Epoch 58/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.55it/s]
Epoch 58/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 56.03it/s]


Epoch 58/120 -> Train Loss: 1.1274, Val Loss: 1.1319, LR: 0.000526


Epoch 59/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.43it/s]
Epoch 59/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.78it/s]


Epoch 59/120 -> Train Loss: 1.1236, Val Loss: 1.1371, LR: 0.000513


Epoch 60/120 [Train]: 100%|██████████| 1604/1604 [01:15<00:00, 21.35it/s]
Epoch 60/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.69it/s]


Epoch 60/120 -> Train Loss: 1.1281, Val Loss: 1.1437, LR: 0.000500


Epoch 61/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.50it/s]
Epoch 61/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 56.16it/s]


Epoch 61/120 -> Train Loss: 1.1172, Val Loss: 1.1321, LR: 0.000487


Epoch 62/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.51it/s]
Epoch 62/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.84it/s]


Epoch 62/120 -> Train Loss: 1.1250, Val Loss: 1.1327, LR: 0.000474


Epoch 63/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.64it/s]
Epoch 63/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 56.07it/s]


Epoch 63/120 -> Train Loss: 1.1236, Val Loss: 1.1356, LR: 0.000461


Epoch 64/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.60it/s]
Epoch 64/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 56.17it/s]


Epoch 64/120 -> Train Loss: 1.1201, Val Loss: 1.1309, LR: 0.000448


Epoch 65/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.58it/s]
Epoch 65/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 56.03it/s]


Epoch 65/120 -> Train Loss: 1.1244, Val Loss: 1.1386, LR: 0.000435


Epoch 66/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.57it/s]
Epoch 66/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 56.44it/s]


Epoch 66/120 -> Train Loss: 1.1199, Val Loss: 1.1243, LR: 0.000422


Epoch 67/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.46it/s]
Epoch 67/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.80it/s]


Epoch 67/120 -> Train Loss: 1.1196, Val Loss: 1.1344, LR: 0.000409


Epoch 68/120 [Train]: 100%|██████████| 1604/1604 [01:15<00:00, 21.32it/s]
Epoch 68/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.71it/s]


Epoch 68/120 -> Train Loss: 1.1177, Val Loss: 1.1340, LR: 0.000396


Epoch 69/120 [Train]: 100%|██████████| 1604/1604 [01:15<00:00, 21.38it/s]
Epoch 69/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 56.01it/s]


Epoch 69/120 -> Train Loss: 1.1153, Val Loss: 1.1286, LR: 0.000383


Epoch 70/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.51it/s]
Epoch 70/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.93it/s]


Epoch 70/120 -> Train Loss: 1.1137, Val Loss: 1.1263, LR: 0.000371


Epoch 71/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.60it/s]
Epoch 71/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 56.20it/s]


Epoch 71/120 -> Train Loss: 1.1184, Val Loss: 1.1167, LR: 0.000358
  -> Validation loss improved to 1.1167. Saving model to bilstm_attn_hangman_GATED_MLPGATE_improv2.pth


Epoch 72/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.57it/s]
Epoch 72/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 56.22it/s]


Epoch 72/120 -> Train Loss: 1.1147, Val Loss: 1.1339, LR: 0.000345


Epoch 73/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.53it/s]
Epoch 73/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.94it/s]


Epoch 73/120 -> Train Loss: 1.1181, Val Loss: 1.1349, LR: 0.000333


Epoch 74/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.43it/s]
Epoch 74/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.75it/s]


Epoch 74/120 -> Train Loss: 1.1123, Val Loss: 1.1363, LR: 0.000321


Epoch 75/120 [Train]: 100%|██████████| 1604/1604 [01:15<00:00, 21.36it/s]
Epoch 75/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 56.00it/s]


Epoch 75/120 -> Train Loss: 1.1143, Val Loss: 1.1161, LR: 0.000309
  -> Validation loss improved to 1.1161. Saving model to bilstm_attn_hangman_GATED_MLPGATE_improv2.pth


Epoch 76/120 [Train]: 100%|██████████| 1604/1604 [01:15<00:00, 21.36it/s]
Epoch 76/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.80it/s]


Epoch 76/120 -> Train Loss: 1.1140, Val Loss: 1.1203, LR: 0.000297


Epoch 77/120 [Train]: 100%|██████████| 1604/1604 [01:15<00:00, 21.38it/s]
Epoch 77/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.98it/s]


Epoch 77/120 -> Train Loss: 1.1117, Val Loss: 1.1232, LR: 0.000285


Epoch 78/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.52it/s]
Epoch 78/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 56.08it/s]


Epoch 78/120 -> Train Loss: 1.1110, Val Loss: 1.1279, LR: 0.000273


Epoch 79/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.61it/s]
Epoch 79/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 56.17it/s]


Epoch 79/120 -> Train Loss: 1.1077, Val Loss: 1.1228, LR: 0.000261


Epoch 80/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.57it/s]
Epoch 80/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 56.43it/s]


Epoch 80/120 -> Train Loss: 1.1103, Val Loss: 1.1252, LR: 0.000250


Epoch 81/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.62it/s]
Epoch 81/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 56.06it/s]


Epoch 81/120 -> Train Loss: 1.1089, Val Loss: 1.1165, LR: 0.000239


Epoch 82/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.61it/s]
Epoch 82/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.99it/s]


Epoch 82/120 -> Train Loss: 1.1087, Val Loss: 1.1223, LR: 0.000228


Epoch 83/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.42it/s]
Epoch 83/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.88it/s]


Epoch 83/120 -> Train Loss: 1.1064, Val Loss: 1.1069, LR: 0.000217
  -> Validation loss improved to 1.1069. Saving model to bilstm_attn_hangman_GATED_MLPGATE_improv2.pth


Epoch 84/120 [Train]: 100%|██████████| 1604/1604 [01:15<00:00, 21.37it/s]
Epoch 84/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.81it/s]


Epoch 84/120 -> Train Loss: 1.1074, Val Loss: 1.1251, LR: 0.000206


Epoch 85/120 [Train]: 100%|██████████| 1604/1604 [01:15<00:00, 21.36it/s]
Epoch 85/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.75it/s]


Epoch 85/120 -> Train Loss: 1.1020, Val Loss: 1.1343, LR: 0.000196


Epoch 86/120 [Train]: 100%|██████████| 1604/1604 [01:15<00:00, 21.38it/s]
Epoch 86/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 56.06it/s]


Epoch 86/120 -> Train Loss: 1.1041, Val Loss: 1.1235, LR: 0.000185


Epoch 87/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.49it/s]
Epoch 87/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 56.24it/s]


Epoch 87/120 -> Train Loss: 1.1050, Val Loss: 1.1095, LR: 0.000175


Epoch 88/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.57it/s]
Epoch 88/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 56.30it/s]


Epoch 88/120 -> Train Loss: 1.1039, Val Loss: 1.1217, LR: 0.000165


Epoch 89/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.57it/s]
Epoch 89/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 56.09it/s]


Epoch 89/120 -> Train Loss: 1.1057, Val Loss: 1.1193, LR: 0.000156


Epoch 90/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.48it/s]
Epoch 90/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 56.02it/s]


Epoch 90/120 -> Train Loss: 1.1024, Val Loss: 1.1222, LR: 0.000146


Epoch 91/120 [Train]: 100%|██████████| 1604/1604 [01:15<00:00, 21.38it/s]
Epoch 91/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.73it/s]


Epoch 91/120 -> Train Loss: 1.0985, Val Loss: 1.1167, LR: 0.000137


Epoch 92/120 [Train]: 100%|██████████| 1604/1604 [01:15<00:00, 21.39it/s]
Epoch 92/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.81it/s]


Epoch 92/120 -> Train Loss: 1.1021, Val Loss: 1.1340, LR: 0.000128


Epoch 93/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.50it/s]
Epoch 93/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.85it/s]


Epoch 93/120 -> Train Loss: 1.1027, Val Loss: 1.1223, LR: 0.000120


Epoch 94/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.51it/s]
Epoch 94/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.74it/s]


Epoch 94/120 -> Train Loss: 1.1028, Val Loss: 1.1084, LR: 0.000111


Epoch 95/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.57it/s]
Epoch 95/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 56.24it/s]


Epoch 95/120 -> Train Loss: 1.1013, Val Loss: 1.1145, LR: 0.000103


Epoch 96/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.57it/s]
Epoch 96/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 56.15it/s]


Epoch 96/120 -> Train Loss: 1.0993, Val Loss: 1.1194, LR: 0.000095


Epoch 97/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.42it/s]
Epoch 97/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 53.74it/s]


Epoch 97/120 -> Train Loss: 1.1003, Val Loss: 1.1275, LR: 0.000088


Epoch 98/120 [Train]: 100%|██████████| 1604/1604 [01:15<00:00, 21.14it/s]
Epoch 98/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 54.35it/s]


Epoch 98/120 -> Train Loss: 1.0977, Val Loss: 1.1152, LR: 0.000081


Epoch 99/120 [Train]: 100%|██████████| 1604/1604 [01:15<00:00, 21.17it/s]
Epoch 99/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 53.68it/s]


Epoch 99/120 -> Train Loss: 1.0988, Val Loss: 1.1162, LR: 0.000074


Epoch 100/120 [Train]: 100%|██████████| 1604/1604 [01:15<00:00, 21.18it/s]
Epoch 100/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 53.72it/s]


Epoch 100/120 -> Train Loss: 1.0959, Val Loss: 1.1293, LR: 0.000067


Epoch 101/120 [Train]: 100%|██████████| 1604/1604 [01:16<00:00, 21.08it/s]
Epoch 101/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 53.99it/s]


Epoch 101/120 -> Train Loss: 1.0972, Val Loss: 1.1216, LR: 0.000061


Epoch 102/120 [Train]: 100%|██████████| 1604/1604 [01:15<00:00, 21.15it/s]
Epoch 102/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 54.45it/s]


Epoch 102/120 -> Train Loss: 1.0990, Val Loss: 1.1079, LR: 0.000054


Epoch 103/120 [Train]: 100%|██████████| 1604/1604 [01:15<00:00, 21.17it/s]
Epoch 103/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.52it/s]


Epoch 103/120 -> Train Loss: 1.1003, Val Loss: 1.0969, LR: 0.000049
  -> Validation loss improved to 1.0969. Saving model to bilstm_attn_hangman_GATED_MLPGATE_improv2.pth


Epoch 104/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.42it/s]
Epoch 104/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 54.91it/s]


Epoch 104/120 -> Train Loss: 1.0983, Val Loss: 1.1203, LR: 0.000043


Epoch 105/120 [Train]: 100%|██████████| 1604/1604 [01:15<00:00, 21.37it/s]
Epoch 105/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.90it/s]


Epoch 105/120 -> Train Loss: 1.0947, Val Loss: 1.1119, LR: 0.000038


Epoch 106/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.45it/s]
Epoch 106/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 56.00it/s]


Epoch 106/120 -> Train Loss: 1.0998, Val Loss: 1.1093, LR: 0.000033


Epoch 107/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.65it/s]
Epoch 107/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 56.29it/s]


Epoch 107/120 -> Train Loss: 1.0965, Val Loss: 1.1256, LR: 0.000029


Epoch 108/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.56it/s]
Epoch 108/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.82it/s]


Epoch 108/120 -> Train Loss: 1.0936, Val Loss: 1.1070, LR: 0.000024


Epoch 109/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.42it/s]
Epoch 109/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.93it/s]


Epoch 109/120 -> Train Loss: 1.0962, Val Loss: 1.1098, LR: 0.000021


Epoch 110/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.46it/s]
Epoch 110/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 56.01it/s]


Epoch 110/120 -> Train Loss: 1.0931, Val Loss: 1.0992, LR: 0.000017


Epoch 111/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.55it/s]
Epoch 111/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 56.19it/s]


Epoch 111/120 -> Train Loss: 1.0934, Val Loss: 1.1246, LR: 0.000014


Epoch 112/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.58it/s]
Epoch 112/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.98it/s]


Epoch 112/120 -> Train Loss: 1.0967, Val Loss: 1.1162, LR: 0.000011


Epoch 113/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.59it/s]
Epoch 113/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.92it/s]


Epoch 113/120 -> Train Loss: 1.0945, Val Loss: 1.1124, LR: 0.000008


Epoch 114/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.56it/s]
Epoch 114/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 56.11it/s]


Epoch 114/120 -> Train Loss: 1.0952, Val Loss: 1.1140, LR: 0.000006


Epoch 115/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.57it/s]
Epoch 115/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 56.04it/s]


Epoch 115/120 -> Train Loss: 1.0952, Val Loss: 1.1191, LR: 0.000004


Epoch 116/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.57it/s]
Epoch 116/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 56.16it/s]


Epoch 116/120 -> Train Loss: 1.0948, Val Loss: 1.1074, LR: 0.000003


Epoch 117/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.57it/s]
Epoch 117/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.68it/s]


Epoch 117/120 -> Train Loss: 1.0906, Val Loss: 1.1213, LR: 0.000002


Epoch 118/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.58it/s]
Epoch 118/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 56.05it/s]


Epoch 118/120 -> Train Loss: 1.0952, Val Loss: 1.1152, LR: 0.000001


Epoch 119/120 [Train]: 100%|██████████| 1604/1604 [01:14<00:00, 21.44it/s]
Epoch 119/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.87it/s]


Epoch 119/120 -> Train Loss: 1.0943, Val Loss: 1.1125, LR: 0.000000


Epoch 120/120 [Train]: 100%|██████████| 1604/1604 [01:15<00:00, 21.36it/s]
Epoch 120/120 [Val]: 100%|██████████| 157/157 [00:02<00:00, 55.84it/s]


Epoch 120/120 -> Train Loss: 1.0943, Val Loss: 1.1075, LR: 0.000000
Training complete. Loading best model from bilstm_attn_hangman_GATED_MLPGATE_improv2.pth for evaluation.

--- STARTING FINAL EVALUATION (MAX CAPACITY GATED-ATTN) ---


Evaluating Final Agent: 100%|██████████| 2000/2000 [01:32<00:00, 21.71it/s]


--- EVALUATION COMPLETE ---
Final Agent won 1240 out of 2000 games.
Final Accuracy: 62.00%





In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader 
import numpy as np
import random
from collections import Counter
from tqdm import tqdm
import os
import math
import re

# configs
# Dataset locations (Kaggle input mount) and the checkpoint this cell loads.
EXPERT_TRAIN_DICT_PATH = "/kaggle/input/hman-ds/expert_train.txt"
FINAL_TEST_DICT_PATH = "/kaggle/input/hman-ds/final_test.txt"
RL_VAL_DICT_PATH = "/kaggle/input/hman-ds/rl_train.txt"
MODEL_FILE = "bilstm_attn_hangman_GATED_MLPGATE_improv2.pth"

# When True, choose_letter adds dictionary letter statistics to the model scores.
USE_DAWG_FUSION = True

# Seed every random number generator this cell touches.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Network hyperparameters; they have to match the checkpoint in MODEL_FILE
# because the weights are loaded with load_state_dict.
D_MODEL = 128            # passed to the model as embedding_dim
HIDDEN_DIM = 256         # LSTM hidden size per direction
NUM_LAYERS = 4
ATTN_HEADS = 8
MAX_SEQ_LEN = 32         # test words must be strictly shorter than this
ALPHABET_LEN = 26
STATE_DIM = 2 * ALPHABET_LEN  # revealed-letter flags followed by wrong-guess flags

# Symbol table: 0 = <pad>, 1 = '_' (hidden position), 2..27 = 'a'..'z'.
CHAR_TO_IX = dict(zip("abcdefghijklmnopqrstuvwxyz", range(2, 28)))
CHAR_TO_IX['_'] = 1
VOCAB_SIZE = 1 + len(CHAR_TO_IX)  # the extra slot is the padding index
IX_TO_CHAR = {index: symbol for symbol, index in CHAR_TO_IX.items()}
IX_TO_CHAR[0] = '<pad>'
ALL_LETTERS = set(CHAR_TO_IX) - {'_'}

# arch
class Gated_BiLSTM_Attention_Hangman(nn.Module):
    """BiLSTM encoder with self-attention, fused with a game-state vector by a learned gate.

    Pipeline: embedding -> bidirectional LSTM -> multi-head self-attention with a
    residual connection and LayerNorm -> element-wise gate that mixes the sequence
    features with a projection of the state vector -> linear layer producing one
    row of vocabulary logits per sequence position.

    Sub-module attribute names and their construction order are part of the
    checkpoint contract (state_dict keys) and must not be changed.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, state_dim, num_heads):
        super().__init__()

        # Index 0 is padding; its embedding row receives no gradient.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        # Forward and backward LSTM outputs are concatenated.
        feature_dim = 2 * hidden_dim
        self.lstm_output_dim = feature_dim

        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=num_layers,
            bidirectional=True,
            batch_first=True,
        )
        self.attention = nn.MultiheadAttention(
            embed_dim=feature_dim,
            num_heads=num_heads,
            batch_first=True,
        )
        self.norm = nn.LayerNorm(feature_dim)

        # Two-layer MLP lifting the state vector into the sequence feature space.
        self.state_projection = nn.Sequential(
            nn.Linear(state_dim, feature_dim),
            nn.ReLU(),
            nn.Linear(feature_dim, feature_dim),
        )
        # Gate MLP: reads [sequence features ; state features], emits values in (0, 1).
        self.gate_fc = nn.Sequential(
            nn.Linear(2 * feature_dim, feature_dim),
            nn.ReLU(),
            nn.Linear(feature_dim, feature_dim),
            nn.Sigmoid(),
        )
        self.fc = nn.Linear(feature_dim, vocab_size)

    def forward(self, x, state):
        """Return logits of shape (batch, seq_len, vocab_size).

        Args:
            x: integer token ids, (batch, seq_len); id 0 marks padding.
            state: float game-state vector, (batch, state_dim).
        """
        padding_mask = x == 0
        token_vectors = self.embedding(x)
        steps = x.size(1)

        encoded, _ = self.lstm(token_vectors)
        attended, _ = self.attention(
            encoded, encoded, encoded, key_padding_mask=padding_mask
        )
        sequence_features = self.norm(encoded + attended)

        # Broadcast the projected state to every sequence position.
        state_features = self.state_projection(state).unsqueeze(1).expand(-1, steps, -1)

        gate = self.gate_fc(torch.cat([sequence_features, state_features], dim=-1))
        fused = gate * sequence_features + (1 - gate) * state_features
        return self.fc(fused)

class IntelligentHangmanSolver:
    """Hangman guesser combining the neural model with dictionary letter statistics.

    For every turn the model produces a score per letter; when USE_DAWG_FUSION is
    set, a second score derived from the dictionary words still compatible with
    the board is added, and the highest-scoring unguessed letter is returned.
    Despite the "DAWG" naming, candidate words are found by a linear scan of
    ``full_dictionary``.
    """

    def __init__(self, full_word_dictionary):
        """Select the device, keep the dictionary and load the checkpoint.

        Args:
            full_word_dictionary: iterable of lowercase candidate words.

        Raises:
            FileNotFoundError: if MODEL_FILE does not exist.
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {self.device}")
        self.all_letters = ALL_LETTERS
        self.full_dictionary = full_word_dictionary
        self._load_model()

    def _load_model(self):
        """Build the network, load weights from MODEL_FILE and switch to eval mode."""
        self.model = Gated_BiLSTM_Attention_Hangman(
            VOCAB_SIZE, D_MODEL, HIDDEN_DIM, NUM_LAYERS,
            state_dim=STATE_DIM, num_heads=ATTN_HEADS,
        ).to(self.device)
        if not os.path.exists(MODEL_FILE):
            raise FileNotFoundError(f"Model file '{MODEL_FILE}' not found.")
        print(f"Loading pre-trained 62% model from {MODEL_FILE}.")
        self.model.load_state_dict(torch.load(MODEL_FILE, map_location=self.device))
        self.model.eval()

    def _get_lstm_probs(self, pattern, guessed_letters_set):
        """Return ``{letter: score}`` from the model for the current board.

        The score of a letter is the largest softmax probability the model
        assigns to it over all hidden ('_') positions. The softmax runs over the
        whole vocabulary, so the 26 letter scores need not sum to one. An empty
        dict is returned when the pattern has no hidden position.
        """
        self.model.eval()
        input_seq = torch.tensor(
            [[CHAR_TO_IX.get(c, 0) for c in pattern]], dtype=torch.long
        ).to(self.device)

        visible_letters = {c for c in pattern if c.isalpha()}
        incorrect_guesses = guessed_letters_set - visible_letters

        # First ALPHABET_LEN slots flag revealed letters, the next ALPHABET_LEN
        # slots flag wrong guesses.
        state_vector = torch.zeros(ALPHABET_LEN * 2)
        for offset, letters in ((0, visible_letters), (ALPHABET_LEN, incorrect_guesses)):
            for char in letters:
                slot = CHAR_TO_IX.get(char, 0) - 2
                # Letters start at index 2; skipping negative slots keeps '_' or an
                # unknown symbol from wrapping around to the end of the vector.
                if slot >= 0:
                    state_vector[slot + offset] = 1.0
        state_tensor = state_vector.unsqueeze(0).to(self.device)

        with torch.no_grad():
            logits = self.model(input_seq, state_tensor)
            probabilities = torch.softmax(logits, dim=2).squeeze(0)

        blank_positions = [i for i, char in enumerate(pattern) if char == '_']
        if not blank_positions:
            return {}

        # One reduction and one host transfer instead of an .item() call per
        # letter per blank.
        best_per_symbol = probabilities[blank_positions].max(dim=0).values.tolist()
        return {
            letter: best_per_symbol[letter_idx]
            for letter_idx, letter in IX_TO_CHAR.items()
            if letter_idx >= 2
        }

    def _filter_words_py(self, pattern: str, incorrect_guesses: set) -> list:
        """Return the dictionary words that are still possible for this board.

        A word qualifies when it has the pattern's length, agrees with every
        revealed character, and its hidden ('_') positions hold neither a wrong
        guess nor an already-revealed letter. The last condition relies on the
        game revealing every occurrence of a correctly guessed letter at once
        (as LocalGameSimulator.guess does), so a revealed letter can never sit
        behind a blank.

        Args:
            pattern: board string; '_' marks hidden positions, anything else is
                matched literally.
            incorrect_guesses: single-character guesses known to be absent.

        Returns:
            Matching words, in dictionary order.
        """
        revealed = {c for c in pattern if c != '_'}
        wrong = set(incorrect_guesses)
        if revealed & wrong:
            # A letter cannot be both on the board and absent: nothing matches.
            return []

        excluded = revealed | wrong
        if excluded:
            blank = "[^" + "".join(re.escape(c) for c in sorted(excluded)) + "]"
        else:
            blank = "."
        compiled_regex = re.compile(
            "".join(blank if c == '_' else re.escape(c) for c in pattern)
        )

        pattern_len = len(pattern)
        fullmatch = compiled_regex.fullmatch
        return [
            word for word in self.full_dictionary
            if len(word) == pattern_len and fullmatch(word)
        ]

    def _get_dawg_probs(self, valid_words: list, guessed_letters: set) -> dict:
        """Return ``{letter: score}`` from letter occurrences in ``valid_words``.

        Each letter's raw score is its total occurrence count across the
        candidate words (0 for letters already guessed); the scores are turned
        into a distribution with softmax. An empty dict is returned when every
        raw score is zero.

        NOTE(review): softmax over raw counts is nearly one-hot whenever the top
        count leads by more than a few occurrences, so after fusion the most
        frequent letter usually outweighs any model score. Confirm this is
        intended before replacing it with, e.g., count / total.
        """
        alphabet = "abcdefghijklmnopqrstuvwxyz"
        freq_counter = Counter("".join(valid_words))
        scores_tensor = torch.tensor([
            0.0 if letter in guessed_letters else float(freq_counter.get(letter, 0))
            for letter in alphabet
        ])

        dawg_probs = {}
        if torch.sum(scores_tensor) > 0:
            dawg_probs = dict(zip(alphabet, torch.softmax(scores_tensor, dim=0).tolist()))
        return dawg_probs

    def choose_letter(self, pattern, guessed_letters_set):
        """Return the next letter to guess.

        Args:
            pattern: current board, '_' for hidden positions.
            guessed_letters_set: every letter guessed so far (right or wrong).

        Returns:
            The unguessed letter with the highest combined score. Ties go to the
            alphabetically first letter so repeated runs give the same games.
            If no letter is left, 'a' is returned.
        """
        final_scores = self._get_lstm_probs(pattern, guessed_letters_set)

        if USE_DAWG_FUSION:
            incorrect_guesses = guessed_letters_set - {c for c in pattern if c.isalpha()}
            valid_words = self._filter_words_py(pattern, incorrect_guesses)
            dawg_probs = self._get_dawg_probs(valid_words, guessed_letters_set)

            # Late fusion: plain sum of the two per-letter scores.
            for letter, prob in dawg_probs.items():
                final_scores[letter] = final_scores.get(letter, 0.0) + prob

        # Sorted iteration makes tie-breaking independent of set/hash ordering.
        best_letter, max_score = None, -1.0
        for letter in sorted(self.all_letters - guessed_letters_set):
            score = final_scores.get(letter, 0.0)
            if score > max_score:
                max_score, best_letter = score, letter
        if best_letter is not None:
            return best_letter

        # Fallback: fixed letter ordering, used only when no candidate scored.
        for letter in "esiarntolcdugpmhbyfvwkxqz":
            if letter not in guessed_letters_set:
                return letter
        return 'a'

class LocalGameSimulator:
    """Minimal in-memory hangman game used for offline evaluation.

    Attributes:
        secret_word: the lower-cased target word.
        lives_remaining: wrong guesses still allowed.
        guessed_letters: every letter guessed so far.
        pattern: list of characters, '_' where the letter is still hidden.
    """

    def __init__(self, secret_word, max_guesses=6):
        self.secret_word = secret_word.lower()
        self.lives_remaining = max_guesses
        self.guessed_letters = set()
        self.pattern = ['_'] * len(self.secret_word)

    def get_pattern(self):
        """Return the board as a string."""
        return "".join(self.pattern)

    def is_won(self):
        """True once no hidden position remains."""
        return '_' not in self.pattern

    def is_lost(self):
        """True once the lives are used up."""
        return self.lives_remaining <= 0

    def is_game_over(self):
        """True when the game is either won or lost."""
        return self.is_won() or self.is_lost()

    def guess(self, letter_char):
        """Apply one guess; None and repeated guesses are ignored.

        A hit reveals every occurrence of the letter, a miss costs one life.
        """
        if letter_char is None:
            return
        if letter_char in self.guessed_letters:
            return

        self.guessed_letters.add(letter_char)
        if letter_char not in self.secret_word:
            self.lives_remaining -= 1
            return

        for position, actual in enumerate(self.secret_word):
            if actual == letter_char:
                self.pattern[position] = actual

def run_local_test(solver, test_words):
    """Play one simulated game per test word and print the win rate.

    Args:
        solver: object exposing ``choose_letter(pattern, guessed_letters)``.
        test_words: iterable of secret words; empty entries and words whose
            length is not below MAX_SEQ_LEN are skipped.

    Prints an error and returns early when no playable word remains.
    """
    if USE_DAWG_FUSION:
        fusion_str = "DAWG Late Fusion"
    else:
        fusion_str = "LSTM Only"
    print(f"\n--- STARTING FINAL EVALUATION ({fusion_str}) ---")

    words_to_play = []
    for candidate in test_words:
        if candidate and len(candidate) < MAX_SEQ_LEN:
            words_to_play.append(candidate)
    if not words_to_play:
        print("Error: No valid test words.")
        return

    wins = 0
    for secret_word in tqdm(words_to_play, desc="Evaluating Final Agent"):
        game = LocalGameSimulator(secret_word)
        # Keep asking the solver for letters until the game ends either way.
        while not game.is_game_over():
            next_letter = solver.choose_letter(game.get_pattern(), game.guessed_letters)
            game.guess(next_letter)
        if game.is_won():
            wins += 1

    accuracy = (wins / len(words_to_play)) * 100
    print("\n--- EVALUATION COMPLETE ---")
    print(f"Final Agent won {wins} out of {len(words_to_play)} games.")
    print(f"Final Accuracy: {accuracy:.2f}%")

if __name__ == '__main__':
    # This cell contains no training code: the flag only decides whether the
    # saved checkpoint is evaluated.
    FORCE_TRAINING = False

    if any(not os.path.exists(p) for p in (EXPERT_TRAIN_DICT_PATH, FINAL_TEST_DICT_PATH, RL_VAL_DICT_PATH)):
        print("ERROR: Ensure data files are present.")
    else:
        print("Loading dictionary data.")
        # Each file holds one word per line; blank lines are dropped and the
        # remaining entries are trimmed and lower-cased.
        with open(EXPERT_TRAIN_DICT_PATH, "r") as f:
            train_words = [entry.lower() for entry in map(str.strip, f) if entry]
        with open(RL_VAL_DICT_PATH, "r") as f:
            val_words = [entry.lower() for entry in map(str.strip, f) if entry]
        # The candidate dictionary is the de-duplicated union of both training lists.
        full_dictionary = list(set(train_words + val_words))
        with open(FINAL_TEST_DICT_PATH, "r") as f:
            test_words = [entry.lower() for entry in map(str.strip, f) if entry]
        print(f"Data loaded. Using a dictionary of {len(full_dictionary)} words for DAWG calculation.")

        if FORCE_TRAINING or not os.path.exists(MODEL_FILE):
            print(f"Training is disabled or model file '{MODEL_FILE}' not found. Cannot run evaluation.")
        else:
            solver = IntelligentHangmanSolver(full_dictionary)
            run_local_test(solver, test_words)


Loading dictionary data.
Data loaded. Using a dictionary of 225300 words for DAWG calculation.
Using device: cuda
Loading pre-trained 62% model from bilstm_attn_hangman_GATED_MLPGATE_improv2.pth.

--- STARTING FINAL EVALUATION (DAWG Late Fusion) ---


Evaluating Final Agent: 100%|██████████| 2000/2000 [08:01<00:00,  4.15it/s]


--- EVALUATION COMPLETE ---
Final Agent won 1263 out of 2000 games.
Final Accuracy: 63.15%



