In [1]:
import gymnasium as gym
import numpy as np
import minigrid
from gymnasium.envs.registration import register
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical

  from pkg_resources import resource_stream, resource_exists


In [2]:


# ==========================================================
# 0) Register your custom env (assumes enemy_doorkey_env.py exists)
# ==========================================================
register(
    id="MiniGrid-DoorKey-6x6-Enemy-v0",
    entry_point="enemy_doorkey_env:DoorKeyWithEnemyEnv",
    kwargs={"size":6}
)

# ==========================================================
# 1) Useful actions only (DoorKey)
# ==========================================================
USEFUL_ACTIONS = [0, 1, 2, 3, 5]  # left, right, forward, pickup, toggle

def sample_useful_action():
    return int(np.random.choice(USEFUL_ACTIONS))

# ==========================================================
# 2) State encoder (no enemy_pos -> smaller table)
# ==========================================================
def get_door_open(u):
    for j in range(u.height):
        for i in range(u.width):
            obj = u.grid.get(i, j)
            if obj is not None and obj.type == "door":
                return 1 if obj.is_open else 0
    return 0

def get_state(env):
    u = env.unwrapped
    ax, ay = u.agent_pos
    ad = int(u.agent_dir)
    has_key = 1 if (u.carrying is not None and getattr(u.carrying, "type", None) == "key") else 0
    door_open = get_door_open(u)
    # Return as a tuple, which we will convert to tensor later
    return np.array([ax, ay, ad, has_key, door_open], dtype=np.float32)

# ==========================================================
# 3) Distance-to-goal shaping helpers (aligned with success)
# ==========================================================
def find_goal_pos(u):
    for j in range(u.height):
        for i in range(u.width):
            obj = u.grid.get(i, j)
            if obj is not None and obj.type == "goal":
                return (i, j)
    return None

def manhattan(a, b):
    return abs(a[0] - b[0]) + abs(a[1] - b[1])

# ==========================================================
# 4) PPO Actor-Critic Model and Training Loop
# ==========================================================

def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    torch.nn.init.orthogonal_(layer.weight, std)
    torch.nn.init.constant_(layer.bias, bias_const)
    return layer

class ActorCritic(nn.Module):
    def __init__(self, state_dim, n_actions, hidden=64):
        super().__init__()
        # Shared feature extractor
        self.shared = nn.Sequential(
            layer_init(nn.Linear(state_dim, hidden)),
            nn.Tanh(),
            layer_init(nn.Linear(hidden, hidden)),
            nn.Tanh()
        )
        
        # Actor head (Policy)
        self.actor = layer_init(nn.Linear(hidden, n_actions), std=0.01)
        
        # Critic head (Value)
        self.critic = layer_init(nn.Linear(hidden, 1), std=1.0)
    
    def forward(self, x):
        h = self.shared(x)
        logits = self.actor(h)
        value = self.critic(h)
        return logits, value

def compute_gae(rewards, values, dones, gamma=0.99, lam=0.95):
    """
    Compute Generalized Advantage Estimation (GAE).
    """
    returns = []
    advs = []
    gae = 0
    next_value = 0
    
    # Iterate backwards
    for r, v, d in zip(reversed(rewards), reversed(values), reversed(dones)):
        delta = r + gamma * next_value * (1 - d) - v
        gae = delta + gamma * lam * (1 - d) * gae
        advs.insert(0, gae)
        next_value = v
        returns.insert(0, gae + v)
        
    return torch.tensor(returns, dtype=torch.float32), torch.tensor(advs, dtype=torch.float32)

def train_ppo(
    env,
    model,
    optimizer,
    episodes=2000,
    max_steps=500,
    gamma=0.99,
    lam=0.95,
    eps_clip=0.2,
    update_epochs=4,
    batch_size=64,
    dist_coef=0.02,
    living_penalty=-0.002,
    entropy_coef=0.01,
):
    rewards_history, success_history = [], []
    
    # Logging accumulators for the print block
    actor_loss_sum = 0.0
    critic_loss_sum = 0.0
    val_abs_sum = 0.0
    val_max = 0.0
    updates_count = 0

    for ep in range(episodes):
        obs, info = env.reset()
        s = get_state(env)
        goal = find_goal_pos(env.unwrapped)
        if goal is None: raise RuntimeError("Goal not found.")
        
        log_probs_buf, values_buf, states_buf, actions_buf, rewards_buf, dones_buf = [], [], [], [], [], []
        total_shaped_reward = 0.0
        last_env_r = 0.0
        
        for t in range(max_steps):
            s_tensor = torch.tensor(s).unsqueeze(0)
            logits, value = model(s_tensor)
            m = Categorical(logits=logits)
            action_idx = m.sample()
            action = USEFUL_ACTIONS[action_idx.item()]
            
            prev_pos = env.unwrapped.agent_pos
            obs2, r, terminated, truncated, info = env.step(action)
            new_pos = env.unwrapped.agent_pos
            done = terminated or truncated
            
            shaped_r = max(float(r), -0.2) + dist_coef * (manhattan(prev_pos, goal) - manhattan(new_pos, goal)) + living_penalty
            
            log_probs_buf.append(m.log_prob(action_idx))
            values_buf.append(value)
            states_buf.append(s_tensor)
            actions_buf.append(action_idx)
            rewards_buf.append(shaped_r)
            dones_buf.append(done)
            
            total_shaped_reward += shaped_r
            last_env_r = float(r)
            s = get_state(env)
            
            # Log value stats for this step
            v_scalar = value.item()
            val_abs_sum += abs(v_scalar)
            val_max = max(val_max, abs(v_scalar))
            updates_count += 1
            
            if done: break

        rewards_history.append(total_shaped_reward)
        success_history.append(1 if last_env_r > 0 else 0)
        
        values_tensor = torch.stack(values_buf).reshape(-1)
        log_probs_tensor = torch.stack(log_probs_buf).reshape(-1)
        states_tensor = torch.cat(states_buf)
        actions_tensor = torch.stack(actions_buf).reshape(-1)
        
        returns, advantages = compute_gae(rewards_buf, values_tensor.detach().numpy(), dones_buf, gamma, lam)
        if len(advantages) > 1:
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
        
        dataset_size = len(states_tensor)
        indices = np.arange(dataset_size)
        
        for _ in range(update_epochs):
            np.random.shuffle(indices)
            for start in range(0, dataset_size, batch_size):
                end = start + batch_size
                idx = indices[start:end]
                
                mb_states = states_tensor[idx]
                mb_actions = actions_tensor[idx]
                mb_old_log_probs = log_probs_tensor[idx].detach()
                mb_returns = returns[idx]
                mb_advantages = advantages[idx]
                
                new_logits, new_values = model(mb_states)
                new_dist = Categorical(logits=new_logits)
                new_log_probs = new_dist.log_prob(mb_actions)
                entropy = new_dist.entropy().mean()
                
                ratio = (new_log_probs - mb_old_log_probs).exp()
                surr1 = ratio * mb_advantages
                surr2 = torch.clamp(ratio, 1 - eps_clip, 1 + eps_clip) * mb_advantages
                
                actor_loss = -torch.min(surr1, surr2).mean()
                critic_loss = 0.5 * ((mb_returns - new_values.reshape(-1)) ** 2).mean()
                loss = actor_loss + critic_loss - entropy_coef * entropy
                
                # Accumulate loss stats
                actor_loss_sum += actor_loss.item()
                critic_loss_sum += critic_loss.item()
                
                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
                optimizer.step()

        if (ep + 1) % 500 == 0:
            avg_rew = np.mean(rewards_history[-500:])
            succ_rate = np.mean(success_history[-500:])
            
            # Normalize stats by number of updates/steps
            # (Note: loss sums are accumulated over many mini-batches, so we divide to get avg)
            norm_factor = max(1, updates_count * update_epochs / batch_size) 
            avg_actor_loss = actor_loss_sum / norm_factor
            avg_critic_loss = critic_loss_sum / norm_factor
            
            avg_val_abs = val_abs_sum / max(1, updates_count)
            
            print(
                f"Episode {ep+1}/{episodes} | "
                f"avg_reward(last500)={avg_rew:.3f} | "
                f"success_rate(last500)={succ_rate:.2%} | "
                f"ActorLoss={avg_actor_loss:.4f} | "
                f"CriticLoss={avg_critic_loss:.4f} | "
                f"Val(avg/max)={avg_val_abs:.3f}/{val_max:.3f}"
            )
            
            # Reset trackers for next 500 episodes
            actor_loss_sum = 0.0
            critic_loss_sum = 0.0
            val_abs_sum = 0.0
            val_max = 0.0
            updates_count = 0

    return rewards_history, success_history

In [3]:
env_easy = gym.make("MiniGrid-DoorKey-6x6-v0")
env_hard = gym.make("MiniGrid-DoorKey-6x6-Enemy-v0")
state_dim = 5
n_actions = len(USEFUL_ACTIONS)

model = ActorCritic(state_dim, n_actions)
optimizer = optim.Adam(model.parameters(), lr=3e-4, eps=1e-5)

print("\n=== Phase 1: train on MiniGrid-DoorKey-6x6-v0 ===")
rewards1, success1 = train_ppo(
    env_easy, 
    model, 
    optimizer, 
    episodes=3000, 
    dist_coef=0.01,       # LOWER: Less hand-holding
    entropy_coef=0.005,   # LOWER: Exploit faster
    eps_clip=0.3,         # HIGHER: Allow bigger updates
    update_epochs=5       # HIGHER: Train more on each batch
)

print("\n=== Phase 2: continue on MiniGrid-DoorKey-6x6-Enemy-v0 ===")
optimizer = optim.Adam(model.parameters(), lr=1.5e-4, eps=1e-5)

rewards2, success2 = train_ppo(
    env_hard, 
    model, 
    optimizer, 
    episodes=8000, 
    dist_coef=0.01,       # Keep sparse shaping
    entropy_coef=0.02,    # Standard entropy for hard phase
    update_epochs=4,      # Standard epochs
    eps_clip=0.2          # Standard clip
)
print("\nDone.")


=== Phase 1: train on MiniGrid-DoorKey-6x6-v0 ===
Episode 500/3000 | avg_reward(last500)=-0.032 | success_rate(last500)=68.40% | ActorLoss=-0.0061 | CriticLoss=0.0034 | Val(avg/max)=0.174/0.940
Episode 1000/3000 | avg_reward(last500)=0.188 | success_rate(last500)=78.60% | ActorLoss=-0.0060 | CriticLoss=0.0053 | Val(avg/max)=0.225/0.966
Episode 1500/3000 | avg_reward(last500)=0.314 | success_rate(last500)=86.60% | ActorLoss=-0.0063 | CriticLoss=0.0065 | Val(avg/max)=0.274/0.947
Episode 2000/3000 | avg_reward(last500)=0.351 | success_rate(last500)=85.80% | ActorLoss=-0.0067 | CriticLoss=0.0079 | Val(avg/max)=0.298/0.981
Episode 2500/3000 | avg_reward(last500)=0.543 | success_rate(last500)=96.60% | ActorLoss=-0.0117 | CriticLoss=0.0097 | Val(avg/max)=0.376/1.121
Episode 3000/3000 | avg_reward(last500)=0.500 | success_rate(last500)=93.40% | ActorLoss=-0.0042 | CriticLoss=0.0093 | Val(avg/max)=0.372/1.080

=== Phase 2: continue on MiniGrid-DoorKey-6x6-Enemy-v0 ===
Episode 500/8000 | avg_re