In [1]:
import gymnasium as gym
import numpy as np
from collections import defaultdict
from gymnasium.envs.registration import register
import minigrid


  from pkg_resources import resource_stream, resource_exists


In [2]:


# ----------------------------------------------------------
# 0) Custom environment registration
#    (the module enemy_doorkey_env.py must be importable)
# ----------------------------------------------------------
# The entry point is given as a "module:ClassName" string; `kwargs` are
# passed through to the environment when it is created.
register(
    id="MiniGrid-DoorKey-6x6-Enemy-v0",
    kwargs=dict(size=6),
    entry_point="enemy_doorkey_env:DoorKeyWithEnemyEnv",
)

# ==========================================================
# 1) Useful actions only (DoorKey)
# ==========================================================
# Action ids the agent is allowed to pick from:
# 0 = left, 1 = right, 2 = forward, 3 = pickup, 5 = toggle.
USEFUL_ACTIONS = [0, 1, 2, 3, 5]

def sample_useful_action():
    """Draw one action id uniformly at random from USEFUL_ACTIONS.

    Uses NumPy's global random state and returns a plain Python int.
    """
    picked = np.random.choice(USEFUL_ACTIONS)
    return int(picked)

# ==========================================================
# 2) State encoder (no enemy_pos -> smaller table)
# ==========================================================
def get_door_open(u):
    """Return 1 if the first door found in the grid is open, otherwise 0.

    The grid is scanned row by row (y outer, x inner) and only the first
    cell whose object has type "door" is considered. Returns 0 when the
    grid contains no door at all.
    """
    # Lazy scan: cells are fetched one at a time and the search stops at the
    # first door, exactly like an early-returning nested loop.
    cells = (u.grid.get(col, row) for row in range(u.height) for col in range(u.width))
    door = next((cell for cell in cells if cell is not None and cell.type == "door"), None)
    if door is None:
        return 0
    return 1 if door.is_open else 0

def get_state(env):
    """Encode the current environment state as a hashable 5-tuple.

    Reads the unwrapped environment and returns
    (agent_x, agent_y, agent_dir, has_key, door_open), where has_key and
    door_open are 0/1 flags. No enemy position is included.
    """
    base = env.unwrapped
    x, y = base.agent_pos
    heading = int(base.agent_dir)
    # getattr with a default also covers the "carrying nothing" (None) case.
    holding_key = getattr(base.carrying, "type", None) == "key"
    return (x, y, heading, 1 if holding_key else 0, get_door_open(base))

# ==========================================================
# 3) Distance-to-goal shaping helpers (aligned with success)
# ==========================================================
def find_goal_pos(u):
    """Locate the goal cell and return its (x, y) position, or None.

    The grid is scanned row by row (y outer, x inner); the coordinates of the
    first cell whose object has type "goal" are returned. None means the grid
    holds no goal object.
    """
    coords = ((col, row) for row in range(u.height) for col in range(u.width))
    for col, row in coords:
        cell = u.grid.get(col, row)
        if cell is None:
            continue
        if cell.type == "goal":
            return (col, row)
    return None

def manhattan(a, b):
    """Return the Manhattan (L1) distance between two 2-D points.

    Only the first two components of `a` and `b` are used.
    """
    dx = a[0] - b[0]
    dy = a[1] - b[1]
    return abs(dx) + abs(dy)

# ==========================================================
# 4) Q-learning
# ==========================================================
def make_Q(n_actions):
    """Create an empty Q-table.

    The table is a defaultdict keyed by state; the first access to an unseen
    state creates a zero-filled float32 vector with one entry per action.
    """
    def _fresh_row():
        return np.zeros(n_actions, dtype=np.float32)

    return defaultdict(_fresh_row)

def epsilon_greedy(Q, s, eps):
    """Pick an action for state `s` with an epsilon-greedy policy.

    With probability `eps` a random useful action is returned; otherwise the
    action in USEFUL_ACTIONS with the highest Q-value for `s` is chosen
    (ties go to the action listed first). The Q-table is only read on the
    greedy branch.
    """
    explore = np.random.rand() < eps
    if explore:
        return sample_useful_action()
    action_values = Q[s]
    greedy_action = max(USEFUL_ACTIONS, key=action_values.__getitem__)
    return int(greedy_action)

def train_q(
    env,
    Q,
    episodes=6000,
    max_steps=500,
    alpha=0.15,
    gamma=0.99,
    eps_start=1.0,
    eps_end=0.15,
    eps_decay=0.99985,
    living_penalty=-0.001,
    dist_coef=0.02,
):
    """Train the tabular Q-function `Q` in place with epsilon-greedy Q-learning.

    Each step's environment reward is clipped from below at -0.2 and then
    shaped with a distance-to-goal term and a constant per-step penalty.
    Only actions in USEFUL_ACTIONS are sampled or used for the greedy max.

    Args:
        env: Environment following the Gymnasium reset/step API whose
            unwrapped object is readable by get_state and find_goal_pos.
        Q: Mapping from state to a per-action value array (see make_Q);
            updated in place, so it can be reused across training phases.
        episodes: Number of episodes to run.
        max_steps: Upper bound on steps per episode enforced by this loop.
        alpha: Learning rate of the Q-learning update.
        gamma: Discount factor.
        eps_start: Exploration rate used for the first episode.
        eps_end: Lower bound for the exploration rate.
        eps_decay: Multiplicative decay applied to epsilon after each episode.
        living_penalty: Constant added to the shaped reward on every step.
        dist_coef: Weight of the (previous - new) Manhattan distance to goal.

    Returns:
        (rewards, success, eps): per-episode sums of shaped reward,
        per-episode 0/1 success flags (1 when the last clipped environment
        reward of the episode was positive), and the final epsilon.

    Raises:
        RuntimeError: If no goal cell is found after a reset.
    """
    rewards, success = [], []
    eps = eps_start

    for ep in range(episodes):
        env.reset()
        s = get_state(env)

        goal = find_goal_pos(env.unwrapped)
        if goal is None:
            raise RuntimeError("Goal not found in grid. Check environment generation.")

        total_shaped = 0.0
        last_env_r = 0.0

        # Stability diagnostics. They are reset every episode, so the values
        # printed below describe only the episode on which logging happens.
        td_abs_sum = 0.0
        td_max = 0.0
        q_abs_sum = 0.0
        q_max = 0.0
        updates = 0

        for _ in range(max_steps):
            prev_dist = manhattan(env.unwrapped.agent_pos, goal)

            a = epsilon_greedy(Q, s, eps)
            _, r, terminated, truncated, _ = env.step(a)

            r = max(float(r), -0.2)   # clip negative rewards
            last_env_r = r

            s2 = get_state(env)
            done = terminated or truncated

            new_dist = manhattan(env.unwrapped.agent_pos, goal)

            # Reward shaping: env reward + progress towards the goal + step cost.
            shaped = r
            shaped += dist_coef * (prev_dist - new_dist)
            shaped += living_penalty

            # Q-learning update (max over useful actions).
            # Only a genuinely terminal state has zero future value. A
            # truncated episode (time limit) was merely cut short, and the
            # state tuple carries no time information, so we keep
            # bootstrapping from Q[s2] in that case -- the same way the
            # max_steps cutoff of this loop already does.
            if terminated:
                next_best = 0.0
            else:
                next_best = float(max(Q[s2][aa] for aa in USEFUL_ACTIONS))
            td_target = shaped + gamma * next_best
            td_err = td_target - Q[s][a]

            Q[s][a] += alpha * td_err

            # Collect diagnostics on the state that was just updated.
            td_abs = abs(td_err)
            td_abs_sum += td_abs
            td_max = max(td_max, td_abs)

            q_vals = Q[s]
            q_abs_sum += np.mean(np.abs(q_vals))
            q_max = max(q_max, np.max(q_vals))
            updates += 1

            total_shaped += shaped
            s = s2

            # Termination and truncation both end the episode loop.
            if done:
                break

        eps = max(eps_end, eps * eps_decay)
        rewards.append(total_shaped)
        success.append(1 if last_env_r > 0 else 0)

        if (ep + 1) % 500 == 0:
            print(
                f"Episode {ep+1}/{episodes} | eps={eps:.3f} | "
                f"avg_reward(last500)={np.mean(rewards[-500:]):.3f} | "
                f"success_rate(last500)={np.mean(success[-500:]):.2%} | "
                f"TDabs(avg/max)={td_abs_sum/max(1,updates):.4f}/{td_max:.4f} | "
                f"Q(abs/max)={q_abs_sum/max(1,updates):.3f}/{q_max:.3f}"
            )

    return rewards, success, eps


In [3]:
# ----------------------------------------------------------
# 5) Curriculum: Phase 1 (DoorKey) -> Phase 2 (DoorKey+Enemy)
#    This cell runs phase 1 and creates the shared Q-table.
# ----------------------------------------------------------
env_easy = gym.make("MiniGrid-DoorKey-6x6-v0")  # train without rendering
Q = make_Q(env_easy.action_space.n)

print("\n=== Phase 1: train on MiniGrid-DoorKey-6x6-v0 ===")
rewards1, success1, eps_after = train_q(
    env=env_easy,
    Q=Q,
    episodes=6000,
    max_steps=500,
    alpha=0.12,
    gamma=0.99,
    eps_start=1.0,
    eps_end=0.15,
    eps_decay=0.999913,
    living_penalty=-0.001,
    dist_coef=0.040,
)



=== Phase 1: train on MiniGrid-DoorKey-6x6-v0 ===
Episode 500/6000 | eps=0.957 | avg_reward(last500)=-0.129 | success_rate(last500)=29.40% | TDabs(avg/max)=0.0205/0.3387 | Q(abs/max)=0.382/0.642
Episode 1000/6000 | eps=0.917 | avg_reward(last500)=0.006 | success_rate(last500)=45.20% | TDabs(avg/max)=0.0077/0.0849 | Q(abs/max)=0.428/0.678
Episode 1500/6000 | eps=0.878 | avg_reward(last500)=0.026 | success_rate(last500)=48.00% | TDabs(avg/max)=0.0083/0.2497 | Q(abs/max)=0.423/0.642
Episode 2000/6000 | eps=0.840 | avg_reward(last500)=0.202 | success_rate(last500)=63.60% | TDabs(avg/max)=0.0061/0.6832 | Q(abs/max)=0.443/0.730
Episode 2500/6000 | eps=0.805 | avg_reward(last500)=0.208 | success_rate(last500)=63.40% | TDabs(avg/max)=0.0099/0.0706 | Q(abs/max)=0.451/0.725
Episode 3000/6000 | eps=0.770 | avg_reward(last500)=0.233 | success_rate(last500)=68.80% | TDabs(avg/max)=0.0061/0.1220 | Q(abs/max)=0.420/0.653
Episode 3500/6000 | eps=0.737 | avg_reward(last500)=0.386 | success_rate(last50

In [4]:
# Phase 2 keeps training the same Q-table on the enemy variant.
env_hard = gym.make("MiniGrid-DoorKey-6x6-Enemy-v0")  # train without rendering

print("\n=== Phase 2: continue on MiniGrid-DoorKey-6x6-Enemy-v0 ===")
# Epsilon restarts at 0.9 here (phase 1's final epsilon is not reused), and
# the epsilon returned by this run is discarded.
rewards2, success2, _ = train_q(
    env=env_hard,
    Q=Q,
    episodes=8000,
    max_steps=500,
    alpha=0.06,
    gamma=0.99,
    eps_start=0.9,
    eps_end=0.40,
    eps_decay=0.999913,
    living_penalty=-0.001,
    dist_coef=0.045,
)

print("\nDone.")



=== Phase 2: continue on MiniGrid-DoorKey-6x6-Enemy-v0 ===
Episode 500/8000 | eps=0.862 | avg_reward(last500)=-0.255 | success_rate(last500)=2.60% | TDabs(avg/max)=0.0210/0.9138 | Q(abs/max)=0.428/0.761
Episode 1000/8000 | eps=0.825 | avg_reward(last500)=-0.226 | success_rate(last500)=5.00% | TDabs(avg/max)=0.0203/0.8752 | Q(abs/max)=0.423/0.727
Episode 1500/8000 | eps=0.790 | avg_reward(last500)=-0.240 | success_rate(last500)=4.00% | TDabs(avg/max)=0.3915/0.7759 | Q(abs/max)=0.424/0.620
Episode 2000/8000 | eps=0.756 | avg_reward(last500)=-0.218 | success_rate(last500)=5.20% | TDabs(avg/max)=0.1201/0.6457 | Q(abs/max)=0.336/0.489
Episode 2500/8000 | eps=0.724 | avg_reward(last500)=-0.219 | success_rate(last500)=5.80% | TDabs(avg/max)=0.6902/0.6902 | Q(abs/max)=0.336/0.489
Episode 3000/8000 | eps=0.693 | avg_reward(last500)=-0.188 | success_rate(last500)=8.20% | TDabs(avg/max)=0.0103/0.7119 | Q(abs/max)=0.332/0.574
Episode 3500/8000 | eps=0.664 | avg_reward(last500)=-0.161 | success_ra