# DQN Agent for Hangman

**Deep Q-Network (DQN) Implementation**

This notebook implements a Deep Q-Learning agent for the Hangman game using PyTorch.

## Key Features:
- **State Representation**: One-hot encoded masked word + guessed letters bitmap + lives remaining
- **Action Space**: 26 letters (a-z)
- **Architecture**: Multi-layer perceptron (three hidden layers with dropout) that outputs one Q-value per letter
- **Training**: Epsilon-greedy exploration with experience replay and periodic target network updates

**Course:** UE23CS352A - Machine Learning Lab  
**Date:** November 2025

## 1. Import Libraries

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import deque, Counter
from tqdm import tqdm
import random
import pickle
import string

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Report the runtime configuration once, so later cells can be interpreted.
print(f"Using device: {device}")
print(f"PyTorch version: {torch.__version__}")

ModuleNotFoundError: No module named 'torch'

## 2. Load Data

In [None]:
# Load corpus and test data
def _read_words(path):
    """Return the non-blank lines of ``path``, stripped and lower-cased.

    The file is decoded as UTF-8 explicitly: without ``encoding`` Python
    uses the platform's locale encoding, so the same word list could load
    differently (or fail) from one machine to another.
    """
    with open(path, 'r', encoding='utf-8') as f:
        stripped_lines = (line.strip() for line in f)
        return [line.lower() for line in stripped_lines if line]


corpus = _read_words('Data/Data/corpus.txt')
test_words = _read_words('Data/Data/test.txt')

# Build each vocabulary once; it is needed for both the unique counts and
# the train/test overlap.
corpus_vocab = set(corpus)
test_vocab = set(test_words)

print(f"Corpus: {len(corpus)} words ({len(corpus_vocab)} unique)")
print(f"Test: {len(test_words)} words ({len(test_vocab)} unique)")
print(f"Overlap: {len(corpus_vocab & test_vocab)} words")

## 3. Hangman Environment

In [None]:
class HangmanEnv:
    """Single-word Hangman game exposed as an environment for the DQN agent.

    The word is stored lower-cased. A game ends when every character of the
    word has been revealed or when the lives counter reaches zero.
    """

    def __init__(self, word, max_lives=6):
        self.word = word.lower()
        self.max_lives = max_lives
        self.reset()

    def reset(self):
        """Start a fresh game on the same word and return the initial state."""
        self.done = False
        self.lives = self.max_lives
        self.guessed = set()
        self.masked = '_' * len(self.word)
        return self.get_state()

    def get_state(self):
        """Return a snapshot of the game as a dict.

        Keys: 'masked' (word with unrevealed characters shown as '_'),
        'guessed' (copy of the set of letters tried so far), 'lives'
        (remaining lives) and 'available' (lowercase letters not yet tried).
        """
        untried = set(string.ascii_lowercase).difference(self.guessed)
        snapshot = {}
        snapshot['masked'] = self.masked
        snapshot['guessed'] = set(self.guessed)  # copy: callers must not alias our set
        snapshot['lives'] = self.lives
        snapshot['available'] = untried
        return snapshot

    def step(self, letter):
        """Guess ``letter`` and return ``(next_state, reward, done)``.

        Rewards: -10 for a repeated guess or a move after the game ended
        (the game state is left untouched), +2 for a correct guess, +50 for
        the guess that completes the word, -1 for a wrong guess and -30 for
        the wrong guess that uses up the last life.
        """
        already_tried = letter in self.guessed
        if already_tried or self.done:
            # Invalid move: penalise without consuming a life
            return self.get_state(), -10, self.done

        self.guessed.add(letter)

        if letter not in self.word:
            # Miss: costs one life, and the game is lost at zero lives
            self.lives -= 1
            if self.lives == 0:
                self.done = True
                reward = -30
            else:
                reward = -1
        else:
            # Hit: reveal every character that has been guessed so far
            revealed = [ch if ch in self.guessed else '_' for ch in self.word]
            self.masked = ''.join(revealed)
            if '_' in self.masked:
                reward = 2
            else:
                # Nothing left to reveal: the game is won
                self.done = True
                reward = 50

        return self.get_state(), reward, self.done

print("✓ HangmanEnv defined")

## 4. DQN Network Architecture

In [None]:
class DQN(nn.Module):
    """Fully connected Q-network: encoded Hangman state in, one Q-value per action out."""

    def __init__(self, state_dim, action_dim=26, hidden_dim=256):
        super().__init__()
        # Layers are listed input-to-output. Their positions inside the
        # Sequential container determine the parameter names
        # ("network.0.weight", "network.3.weight", ...), so the order is fixed.
        layers = [
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim, 128),
            nn.ReLU(),
            nn.Linear(128, action_dim),
        ]
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Return the Q-values the network predicts for input ``x``."""
        q_values = self.network(x)
        return q_values


class ReplayBuffer:
    """Fixed-capacity FIFO store of transitions for experience replay.

    Once ``capacity`` transitions are stored, pushing a new one discards the
    oldest (behaviour of ``deque(maxlen=capacity)``).
    """

    def __init__(self, capacity=50000):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        """Append one ``(state, action, reward, next_state, done)`` transition."""
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        """Draw a random batch without replacement and return it as tensors.

        Args:
            batch_size: Requested number of transitions; capped at the number
                currently stored.

        Returns:
            Tuple ``(states, actions, rewards, next_states, dones)`` of
            tensors on the module-level ``device``. ``actions`` is int64,
            every other tensor is float32.

        Raises:
            ValueError: If the buffer is empty.
        """
        if not self.buffer:
            raise ValueError("Cannot sample from an empty replay buffer")

        batch = random.sample(self.buffer, min(batch_size, len(self.buffer)))
        states, actions, rewards, next_states, dones = zip(*batch)

        # Stack each field into a single ndarray before handing it to torch:
        # building a tensor directly from a sequence of ndarrays converts
        # element by element, which is very slow and triggers a PyTorch
        # UserWarning.
        def to_tensor(values, dtype):
            return torch.as_tensor(np.asarray(values, dtype=dtype)).to(device)

        return (
            to_tensor(states, np.float32),
            to_tensor(actions, np.int64),
            to_tensor(rewards, np.float32),
            to_tensor(next_states, np.float32),
            to_tensor(dones, np.float32),
        )

    def __len__(self):
        return len(self.buffer)

print("✓ DQN and ReplayBuffer defined")

## 5. State Encoding & DQN Agent

In [None]:
# Column of each lowercase letter inside a per-position one-hot slot (a=0 ... z=25).
_LETTER_INDEX = {chr(ord('a') + i): i for i in range(26)}


def encode_state(state, max_word_len=20, max_lives=6):
    """Encode a game state as a fixed-size float32 feature vector.

    Layout: ``max_word_len`` slots of 27 values (one-hot over a-z plus a
    final column for '_'), then a 26-value bitmap of guessed letters, then
    the remaining lives divided by ``max_lives``.

    Args:
        state: Dict with keys 'masked' (str), 'guessed' (container of
            letters) and 'lives' (number).
        max_word_len: Number of word positions encoded. Longer masks are
            truncated; shorter ones are padded with all-zero slots.
        max_lives: Normaliser for the lives feature. The default of 6
            matches the value that was previously hard-coded.

    Returns:
        1-D ``np.float32`` array of length ``max_word_len * 27 + 26 + 1``.
    """
    masked = state['masked']
    guessed = state['guessed']
    lives = state['lives']

    symbols_per_slot = 27
    blank_column = 26

    # One row per word position; rows past the end of the mask stay zero (padding).
    word_encoding = np.zeros((max_word_len, symbols_per_slot), dtype=np.float32)
    for pos, char in enumerate(masked[:max_word_len]):
        if char == '_':
            word_encoding[pos, blank_column] = 1.0
            continue
        # Look the letter up instead of computing ord(char) - ord('a'):
        # for characters outside a-z that offset is negative or too large,
        # which either raised IndexError or silently set the wrong column
        # through negative indexing. Upper-case letters are folded to
        # lower case; anything else leaves the slot all-zero.
        column = _LETTER_INDEX.get(char.lower())
        if column is not None:
            word_encoding[pos, column] = 1.0

    # Guessed letters bitmap (26 values, a-z order)
    guessed_encoding = np.array(
        [1.0 if letter in guessed else 0.0 for letter in _LETTER_INDEX],
        dtype=np.float32,
    )

    # Lives remaining (normalized); a zero normaliser yields 0 rather than crashing
    lives_fraction = lives / max_lives if max_lives else 0.0
    lives_encoding = np.array([lives_fraction], dtype=np.float32)

    # Combine all features
    return np.concatenate([word_encoding.ravel(), guessed_encoding, lives_encoding])


class DQNAgent:
    """DQN agent for Hangman.

    Holds a policy network, a target network that is synchronised on demand,
    an Adam optimiser and a replay memory, and offers epsilon-greedy action
    selection over the letters a-z.
    """

    def __init__(self, state_dim, action_dim=26, lr=0.001, gamma=0.95,
                 epsilon_start=1.0, epsilon_end=0.05, epsilon_decay=0.995):
        """Build the networks, optimiser and replay memory.

        Args:
            state_dim: Length of the encoded state vector.
            action_dim: Number of actions (letters).
            lr: Adam learning rate.
            gamma: Discount factor used in the Q-learning target.
            epsilon_start: Initial exploration probability.
            epsilon_end: Lower bound for the exploration probability.
            epsilon_decay: Multiplicative factor applied by ``decay_epsilon``.
        """
        self.action_dim = action_dim
        self.gamma = gamma
        self.epsilon = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay = epsilon_decay

        # Q-networks: the target network starts as a copy of the policy
        # network and is only ever used for inference.
        self.policy_net = DQN(state_dim, action_dim).to(device)
        self.target_net = DQN(state_dim, action_dim).to(device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=lr)
        self.memory = ReplayBuffer(capacity=50000)

    def select_action(self, state_dict, available_actions=None):
        """Pick a letter with an epsilon-greedy policy.

        Args:
            state_dict: Game state dict (as produced by the environment).
            available_actions: Iterable of letters that may be chosen.
                Defaults to ``state_dict['available']``.

        Returns:
            The chosen letter, or ``None`` when no letter is available.
        """
        if available_actions is None:
            available_actions = state_dict['available']

        # Sort into a list: random.choice needs a sequence, and the
        # iteration order of a set of strings changes between interpreter
        # runs, which would make seeded runs non-reproducible.
        available_actions = sorted(available_actions)
        if not available_actions:
            return None

        # Explore: uniformly random available letter
        if random.random() < self.epsilon:
            return random.choice(available_actions)

        # Exploit: highest Q-value among the available letters. The state is
        # only encoded here because the exploratory branch never uses it.
        state = encode_state(state_dict)
        available = set(available_actions)
        allowed = torch.tensor(
            [chr(ord('a') + i) in available for i in range(self.action_dim)],
            dtype=torch.bool,
            device=device,
        )

        # The network contains Dropout layers, which stay active in train
        # mode. Score actions in eval mode so the greedy choice is
        # deterministic, then restore whichever mode was active.
        was_training = self.policy_net.training
        self.policy_net.eval()
        try:
            with torch.no_grad():
                state_tensor = torch.as_tensor(
                    state, dtype=torch.float32, device=device
                ).unsqueeze(0)
                q_values = self.policy_net(state_tensor)[0]
                # Mask out unavailable actions
                q_values[~allowed] = float('-inf')
                action_idx = q_values.argmax().item()
        finally:
            self.policy_net.train(was_training)

        return chr(ord('a') + action_idx)

    def train_step(self, batch_size=128):
        """Run one optimisation step on a batch from the replay memory.

        Returns:
            The batch loss as a float, or ``0.0`` when the memory holds
            fewer than ``batch_size`` transitions and no update was made.
        """
        if len(self.memory) < batch_size:
            return 0.0

        states, actions, rewards, next_states, dones = self.memory.sample(batch_size)

        # Q(s, a) of the actions that were actually taken
        q_values = self.policy_net(states)
        q_value = q_values.gather(1, actions.unsqueeze(1)).squeeze(1)

        # Bootstrapped target from the target network; (1 - dones) removes
        # the future term for terminal transitions.
        # NOTE(review): the max runs over all actions, including letters
        # already guessed in next_state — consider masking them.
        with torch.no_grad():
            next_q_values = self.target_net(next_states)
            next_q_value = next_q_values.max(1)[0]
            target_q_value = rewards + (1 - dones) * self.gamma * next_q_value

        # Compute loss
        loss = nn.MSELoss()(q_value, target_q_value)

        # Optimize, clipping the gradient norm to 1.0
        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.policy_net.parameters(), 1.0)
        self.optimizer.step()

        return loss.item()

    def update_target_network(self):
        """Copy weights from policy to target network"""
        self.target_net.load_state_dict(self.policy_net.state_dict())

    def decay_epsilon(self):
        """Multiply epsilon by ``epsilon_decay``, never going below ``epsilon_end``."""
        self.epsilon = max(self.epsilon_end, self.epsilon * self.epsilon_decay)

print("✓ DQNAgent defined")

## 6. Training Loop

In [None]:
# Initialize DQN agent
STATE_DIM = 20 * 27 + 26 + 1  # (max_word_len * 27) + guessed_bitmap + lives
agent = DQNAgent(state_dim=STATE_DIM, lr=0.0005, gamma=0.95)

# Training parameters
NUM_EPISODES = 2000
BATCH_SIZE = 128
TARGET_UPDATE_FREQ = 50  # episodes between target-network syncs
TRAIN_FREQ = 4           # environment steps (within an episode) between updates

# Training metrics
episode_rewards = []
episode_lengths = []
episode_wins = []  # True/False outcome of every episode
losses = []
win_rates = []

print("Starting DQN training...")
print(f"Episodes: {NUM_EPISODES}, Batch size: {BATCH_SIZE}")
print(f"Device: {device}\n")

for episode in tqdm(range(NUM_EPISODES), desc="Training"):
    # Sample random word from corpus
    word = random.choice(corpus)
    env = HangmanEnv(word)

    state = env.reset()
    state_encoded = encode_state(state)
    episode_reward = 0
    episode_len = 0

    while not env.done:
        # Select and perform action
        action_letter = agent.select_action(state)
        if action_letter is None:
            break

        next_state, reward, done = env.step(action_letter)

        # Store transition. The encoded next state is carried over to the
        # next iteration so every state is encoded only once.
        next_state_encoded = encode_state(next_state)
        action_idx = ord(action_letter) - ord('a')

        agent.memory.push(
            state_encoded,
            action_idx,
            reward,
            next_state_encoded,
            float(done)
        )

        # Train every TRAIN_FREQ steps; train_step returns 0.0 while the
        # replay buffer is still too small, and those are not recorded.
        if episode_len % TRAIN_FREQ == 0:
            loss = agent.train_step(BATCH_SIZE)
            if loss > 0:
                losses.append(loss)

        state, state_encoded = next_state, next_state_encoded
        episode_reward += reward
        episode_len += 1

    # Update target network
    if episode % TARGET_UPDATE_FREQ == 0:
        agent.update_target_network()

    # Decay epsilon
    agent.decay_epsilon()

    # Record metrics
    episode_rewards.append(episode_reward)
    episode_lengths.append(episode_len)
    # Record the real outcome instead of inferring it from the reward sign:
    # a lost game on a word with many distinct letters can collect enough
    # +2 rewards to end with a positive total.
    episode_wins.append(env.done and env.lives > 0)

    # Calculate win rate over last 100 episodes
    if episode >= 99:
        win_rates.append(sum(episode_wins[-100:]) / 100.0)

    # Print progress
    if (episode + 1) % 200 == 0:
        avg_reward = np.mean(episode_rewards[-200:])
        avg_length = np.mean(episode_lengths[-200:])
        win_rate = win_rates[-1] if win_rates else 0
        print(f"\nEpisode {episode+1}/{NUM_EPISODES}")
        print(f"  Avg Reward: {avg_reward:.2f}")
        print(f"  Avg Length: {avg_length:.2f}")
        print(f"  Win Rate: {win_rate:.2%}")
        print(f"  Epsilon: {agent.epsilon:.3f}")
        print(f"  Replay Buffer: {len(agent.memory)}")

print("\n✓ Training completed!")

## 7. Visualize Training Progress

In [None]:
def _draw_series(ax, values, window, trend_label):
    """Plot ``values`` faintly and overlay the rolling mean once there are more than ``window`` points."""
    ax.plot(values, alpha=0.3, label='Raw')
    if len(values) > window:
        trend = pd.Series(values).rolling(window).mean()
        ax.plot(trend, label=trend_label, linewidth=2)


def _label_axes(ax, xlabel, ylabel, title):
    """Set the x label, y label and title of ``ax``."""
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_title(title)


fig, axes = plt.subplots(2, 2, figsize=(14, 10))
reward_ax, length_ax = axes[0]
loss_ax, win_ax = axes[1]

# Top-left panel: total reward per episode
_draw_series(reward_ax, episode_rewards, 50, 'Smoothed (50-ep avg)')
_label_axes(reward_ax, 'Episode', 'Total Reward', 'Episode Rewards')
reward_ax.legend()
reward_ax.grid(alpha=0.3)

# Top-right panel: number of guesses per episode
_draw_series(length_ax, episode_lengths, 50, 'Smoothed (50-ep avg)')
_label_axes(length_ax, 'Episode', 'Episode Length', 'Episode Lengths (Guesses per Game)')
length_ax.legend()
length_ax.grid(alpha=0.3)

# Bottom-left panel: loss per training step (left empty if nothing was trained)
if losses:
    _draw_series(loss_ax, losses, 100, 'Smoothed (100-step avg)')
    _label_axes(loss_ax, 'Training Step', 'Loss', 'Training Loss')
    loss_ax.legend()
    loss_ax.grid(alpha=0.3)

# Bottom-right panel: rolling win rate, which only exists from episode index 99 on
if win_rates:
    win_episodes = range(99, len(episode_rewards))
    win_ax.plot(win_episodes, win_rates, linewidth=2)
    _label_axes(win_ax, 'Episode', 'Win Rate', 'Win Rate (100-episode window)')
    win_ax.set_ylim([0, 1])
    win_ax.grid(alpha=0.3)

plt.tight_layout()
plt.savefig('dqn_training_progress.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Training progress visualization saved")

## 8. Evaluation on Test Set

In [None]:
# Set agent to evaluation mode (epsilon = 0 for pure exploitation)
agent.epsilon = 0.0

print("Evaluating DQN agent on test set...")
wins = 0
total_wrong_guesses = 0

for word in tqdm(test_words, desc="Evaluating"):
    env = HangmanEnv(word)
    state = env.reset()

    while not env.done:
        action_letter = agent.select_action(state)
        if action_letter is None:
            # No letters left to try; the game cannot progress
            break

        state, _, _ = env.step(action_letter)

    # A game is won when it finished with lives to spare
    if env.done and env.lives > 0:
        wins += 1

    # Every wrong guess costs exactly one life, so the lives spent are the
    # wrong guesses. This does not depend on the sign of the shaped reward.
    total_wrong_guesses += env.max_lives - env.lives

# Calculate metrics (guarded so an empty test set reports zeros instead of crashing)
num_games = len(test_words)
success_rate = wins / num_games if num_games else 0.0
avg_wrong = total_wrong_guesses / num_games if num_games else 0.0

# Score on the same scale as the reference agents in the comparison cell:
# all four reference (success rate, score) pairs there satisfy
#     score = success_rate * 2000 - 5 * total_wrong_guesses
# with an integer wrong-guess count (e.g. 19.80% and -55324 -> 11144 wrong).
# The previous wins * 100 - wrong * 100 was on a different scale and could
# not be compared with them.
# NOTE(review): confirm against the official scoring rubric, including any
# repeated-guess penalty. This agent never repeats a letter, because
# select_action only chooses from the letters still available.
final_score = success_rate * 2000 - total_wrong_guesses * 5

print("\n" + "="*60)
print("DQN AGENT EVALUATION RESULTS")
print("="*60)
print(f"Total Games: {len(test_words)}")
print(f"Wins: {wins} ({success_rate:.2%})")
print(f"Total Wrong Guesses: {total_wrong_guesses}")
print(f"Avg Wrong Guesses: {avg_wrong:.3f}")
print(f"\nFINAL SCORE: {final_score:.2f}")
print("="*60)

## 9. Comparison with Other Agents

In [None]:
# Comparison data (from previous notebooks)
comparison_data = {
    'Agent': ['Original HMM', 'RL + HMM', 'Improved HMM', 'Enhanced N-gram', 'DQN'],
    'Success Rate': [19.80, 19.90, 24.60, 35.70, success_rate * 100],
    'Final Score': [-55324, -55302, -53878, -50471, final_score]
}

# Best success rate first; both the table and the bar charts use this order
df_comparison = pd.DataFrame(comparison_data)
df_comparison = df_comparison.sort_values('Success Rate', ascending=False)

print("\n" + "="*70)
print("AGENT PERFORMANCE COMPARISON")
print("="*70)
print(df_comparison.to_string(index=False))
print("="*70)

# Visualize comparison
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Success rates. Colours are assigned by bar position (rank), not by agent.
colors = ['#9467bd', '#ff7f0e', '#2ca02c', '#d62728', '#1f77b4']
bars1 = ax1.bar(df_comparison['Agent'], df_comparison['Success Rate'],
                color=colors, alpha=0.7, edgecolor='black')
ax1.set_ylabel('Success Rate (%)', fontsize=12, fontweight='bold')
ax1.set_title('Success Rate Comparison', fontsize=14, fontweight='bold')
ax1.grid(axis='y', alpha=0.3, linestyle='--')
# 20% headroom so the value labels above the bars stay inside the axes
ax1.set_ylim(0, max(df_comparison['Success Rate']) * 1.2)
ax1.tick_params(axis='x', rotation=15)

# Add value labels
for bar, rate in zip(bars1, df_comparison['Success Rate']):
    height = bar.get_height()
    ax1.text(bar.get_x() + bar.get_width()/2., height + 0.5,
             f'{rate:.2f}%', ha='center', va='bottom', fontweight='bold', fontsize=9)

# Scores
bars2 = ax2.bar(df_comparison['Agent'], df_comparison['Final Score'],
                color=colors, alpha=0.7, edgecolor='black')
ax2.set_ylabel('Final Score', fontsize=12, fontweight='bold')
ax2.set_title('Final Score Comparison', fontsize=14, fontweight='bold')
ax2.grid(axis='y', alpha=0.3, linestyle='--')
ax2.axhline(y=0, color='black', linestyle='-', linewidth=0.8)
ax2.tick_params(axis='x', rotation=15)

# Add value labels
for bar, score in zip(bars2, df_comparison['Final Score']):
    height = bar.get_height()
    is_negative = height < 0
    # The label sits just beyond the end of the bar: below it for negative
    # scores, above it for positive ones.
    y_pos = height - 1500 if is_negative else height + 500
    # The label is drawn outside the bar, on the axes background, so it must
    # be black. The previous white text for negative bars was invisible.
    ax2.text(bar.get_x() + bar.get_width()/2., y_pos,
             f'{score:.0f}', ha='center', va='top' if is_negative else 'bottom',
             fontweight='bold', fontsize=9, color='black')

plt.tight_layout()
plt.savefig('dqn_agent_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

print("\n✓ Comparison visualization saved")

## 10. Save DQN Model

In [None]:
# Save the trained DQN model
# NOTE: 'epsilon' stores the agent's current value. If the evaluation cell
# ran before this one, that value is 0.0 rather than the exploration rate
# reached at the end of training.
torch.save({
    'policy_net_state_dict': agent.policy_net.state_dict(),
    'target_net_state_dict': agent.target_net.state_dict(),
    'optimizer_state_dict': agent.optimizer.state_dict(),
    'epsilon': agent.epsilon,
    'state_dim': STATE_DIM,
    'training_stats': {
        'episodes': NUM_EPISODES,
        'final_success_rate': success_rate,
        'final_score': final_score,
        'episode_rewards': episode_rewards,
        'win_rates': win_rates
    }
}, 'dqn_agent.pth')

print("✓ DQN model saved to 'dqn_agent.pth'")

# Read the hyperparameters back from the agent instead of repeating literals,
# so the summary cannot drift from the configuration that was actually used.
learning_rate = agent.optimizer.param_groups[0]['lr']

# Save results summary
results_summary = f"""
DQN Agent Results Summary
=========================

Training Configuration:
- Episodes: {NUM_EPISODES}
- Batch Size: {BATCH_SIZE}
- Learning Rate: {learning_rate}
- Gamma: {agent.gamma}
- Epsilon Decay: {agent.epsilon_decay}
- State Dimension: {STATE_DIM}
- Device: {device}

Evaluation Results:
- Test Set Size: {len(test_words)} words
- Success Rate: {success_rate:.2%}
- Wins: {wins}/{len(test_words)}
- Total Wrong Guesses: {total_wrong_guesses}
- Avg Wrong Guesses: {avg_wrong:.3f}
- Final Score: {final_score:.2f}

Comparison with Other Agents:
{df_comparison.to_string(index=False)}

Model saved to: dqn_agent.pth
"""

# Explicit encoding so the file is written identically on every platform
with open('dqn_results.txt', 'w', encoding='utf-8') as f:
    f.write(results_summary)

print("✓ Results summary saved to 'dqn_results.txt'")
print("\n" + results_summary)