In [3]:
import gymnasium as gym
import numpy as np
from collections import defaultdict
from gymnasium.envs.registration import register
import minigrid


In [4]:

# =========================
# 1) Useful actions only
# =========================
USEFUL_ACTIONS = [0, 1, 2, 3, 5]  # left, right, forward, pickup, toggle

def sample_useful_action():
    return int(np.random.choice(USEFUL_ACTIONS))

# =========================
# 2) State encoder
# =========================
def get_door_open(u):
    for j in range(u.height):
        for i in range(u.width):
            obj = u.grid.get(i, j)
            if obj is not None and obj.type == "door":
                return 1 if obj.is_open else 0
    return 0

def get_state(env):
    u = env.unwrapped
    ax, ay = u.agent_pos
    ad = int(u.agent_dir)
    has_key = 1 if (u.carrying is not None and getattr(u.carrying, "type", None) == "key") else 0
    door_open = get_door_open(u)
    return (ax, ay, ad, has_key, door_open)

# =========================
# 3) Goal distance shaping
# =========================
def find_goal_pos(u):
    for j in range(u.height):
        for i in range(u.width):
            obj = u.grid.get(i, j)
            if obj is not None and obj.type == "goal":
                return (i, j)
    return None

def manhattan(a, b):
    return abs(a[0] - b[0]) + abs(a[1] - b[1])

# =========================
# 4) Dyna-Q
# =========================
def make_Q(n_actions):
    return defaultdict(lambda: np.zeros(n_actions, dtype=np.float32))

def epsilon_greedy(Q, s, eps):
    if np.random.rand() < eps:
        return sample_useful_action()
    q = Q[s]
    return int(max(USEFUL_ACTIONS, key=lambda a: q[a]))

def train_dyna_q(
    env,
    Q,
    episodes=6000,
    max_steps=500,
    alpha=0.15,
    gamma=0.99,
    eps_start=1.0,
    eps_end=0.15,
    eps_decay=0.99985,
    living_penalty=-0.001,
    dist_coef=0.02,
    k=10,                    # planning updates per real step (after plan_start_episode)
    plan_start_episode=1000, # DELAYED PLANNING FIX ✅
    reward_clip_min=None     # e.g. -0.2 for enemy env; None for no clip
):
    """
    Dyna-Q:
      - Direct RL update from real step
      - Learn a tabular model Model[(s,a)] = (s2, r, done)
      - After plan_start_episode, do k planning updates per real step
    """
    Model = {}       # (s,a) -> (s2, r, done)
    seen_sa = []     # list of (s,a) keys for sampling

    rewards, success = [], []
    eps = eps_start

    for ep in range(episodes):
        obs, info = env.reset()
        s = get_state(env)

        goal = find_goal_pos(env.unwrapped)
        if goal is None:
            raise RuntimeError("Goal not found in grid.")

        total_shaped = 0.0
        last_env_r = 0.0

        # --- NEW: minimal stability trackers (same as Q-learning) ---
        td_abs_sum = 0.0
        td_max = 0.0
        q_abs_sum = 0.0
        q_max = 0.0
        updates = 0
        # -----------------------------------------------------------

        for t in range(max_steps):
            prev_dist = manhattan(env.unwrapped.agent_pos, goal)

            a = epsilon_greedy(Q, s, eps)
            obs2, r_env, terminated, truncated, info = env.step(a)
            s2 = get_state(env)
            done = terminated or truncated

            new_dist = manhattan(env.unwrapped.agent_pos, goal)

            # ----- shaping aligned with reaching goal -----
            r = float(r_env) + dist_coef * (prev_dist - new_dist) + living_penalty

            # optional: stabilize frequent enemy deaths
            if reward_clip_min is not None:
                r = max(r, float(reward_clip_min))

            # ----- (1) Direct Q update -----
            next_best = 0.0 if done else float(max(Q[s2][aa] for aa in USEFUL_ACTIONS))
            td_target = r + gamma * next_best

            td_err = td_target - Q[s][a]
            Q[s][a] += alpha * td_err

            # --- NEW: collect diagnostics for real update ---
            td_abs = abs(td_err)
            td_abs_sum += td_abs
            td_max = max(td_max, td_abs)

            q_vals = Q[s]
            q_abs_sum += float(np.mean(np.abs(q_vals)))
            q_max = max(q_max, float(np.max(q_vals)))
            updates += 1
            # ---------------------------------------------

            # ----- (2) Model learning -----
            key = (s, a)
            if key not in Model:
                seen_sa.append(key)
            Model[key] = (s2, r, done)

            # ----- (3) Planning (DELAYED) ✅ -----
            effective_k = 0 if ep < plan_start_episode else k
            if effective_k > 0 and seen_sa:
                for _ in range(effective_k):
                    s_p, a_p = seen_sa[np.random.randint(len(seen_sa))]
                    s2_p, r_p, done_p = Model[(s_p, a_p)]
                    next_best_p = 0.0 if done_p else float(max(Q[s2_p][aa] for aa in USEFUL_ACTIONS))
                    td_target_p = r_p + gamma * next_best_p

                    td_err_p = td_target_p - Q[s_p][a_p]
                    Q[s_p][a_p] += alpha * td_err_p

                    # --- NEW: diagnostics for planning updates too ---
                    td_abs_p = abs(td_err_p)
                    td_abs_sum += td_abs_p
                    td_max = max(td_max, td_abs_p)

                    q_vals_p = Q[s_p]
                    q_abs_sum += float(np.mean(np.abs(q_vals_p)))
                    q_max = max(q_max, float(np.max(q_vals_p)))
                    updates += 1
                    # -----------------------------------------------

            total_shaped += r
            last_env_r = float(r_env)
            s = s2

            if done:
                break

        eps = max(eps_end, eps * eps_decay)
        rewards.append(total_shaped)
        success.append(1 if last_env_r > 0 else 0)

        if (ep + 1) % 500 == 0:
            print(
                f"Episode {ep+1}/{episodes} | eps={eps:.3f} | "
                f"avg_reward(last500)={np.mean(rewards[-500:]):.3f} | "
                f"success_rate(last500)={np.mean(success[-500:]):.2%} | "
                f"TDabs(avg/max)={td_abs_sum/max(1,updates):.4f}/{td_max:.4f} | "
                f"Q(abs/max)={q_abs_sum/max(1,updates):.3f}/{q_max:.3f}"
            )

    return rewards, success, eps





In [5]:
env_easy = gym.make("MiniGrid-DoorKey-6x6-v0")
Q = make_Q(env_easy.action_space.n)

print("\n=== Phase 1 (Dyna-Q, FAST): MiniGrid-DoorKey-6x6-v0 ===")
rewards1, success1, eps_after = train_dyna_q(
    env_easy, Q,
    episodes=6000,
    max_steps=500,
    alpha=0.15,
    gamma=0.995,
    eps_start=1.0,
    eps_end=0.12,
    eps_decay=0.99980,
    living_penalty=-0.002,
    dist_coef=0.02,
    k=20,
    plan_start_episode=800,
    reward_clip_min=None
)



=== Phase 1 (Dyna-Q, FAST): MiniGrid-DoorKey-6x6-v0 ===
Episode 500/6000 | eps=0.905 | avg_reward(last500)=-0.417 | success_rate(last500)=35.20% | TDabs(avg/max)=0.0475/0.2724 | Q(abs/max)=0.253/0.575
Episode 1000/6000 | eps=0.819 | avg_reward(last500)=-0.242 | success_rate(last500)=51.00% | TDabs(avg/max)=0.0040/0.2136 | Q(abs/max)=0.585/0.885
Episode 1500/6000 | eps=0.741 | avg_reward(last500)=-0.302 | success_rate(last500)=46.00% | TDabs(avg/max)=0.0199/0.7578 | Q(abs/max)=0.496/0.946
Episode 2000/6000 | eps=0.670 | avg_reward(last500)=-0.235 | success_rate(last500)=49.60% | TDabs(avg/max)=0.0043/0.4719 | Q(abs/max)=0.500/0.782
Episode 2500/6000 | eps=0.607 | avg_reward(last500)=-0.314 | success_rate(last500)=41.20% | TDabs(avg/max)=0.0049/0.7546 | Q(abs/max)=0.555/0.844
Episode 3000/6000 | eps=0.549 | avg_reward(last500)=-0.348 | success_rate(last500)=38.20% | TDabs(avg/max)=0.0046/0.7577 | Q(abs/max)=0.518/0.786
Episode 3500/6000 | eps=0.497 | avg_reward(last500)=-0.352 | success

In [6]:
register(
    id="MiniGrid-DoorKey-6x6-Enemy-v0",
    entry_point="enemy_doorkey_env:DoorKeyWithEnemyEnv",
    kwargs={"size": 6}
)

env_hard = gym.make("MiniGrid-DoorKey-6x6-Enemy-v0")

print("\n=== Phase 2 (Dyna-Q, FAST): MiniGrid-DoorKey-6x6-Enemy-v0 ===")
rewards2, success2, _ = train_dyna_q(
    env_hard, Q,
    episodes=8000,
    max_steps=500,
    alpha=0.10,
    gamma=0.995,
    eps_start=1.0,
    eps_end=0.25,
    eps_decay=0.99988,
    living_penalty=-0.001,
    dist_coef=0.02,
    k=30,
    plan_start_episode=400,
    reward_clip_min=-0.2
)

print("\nDone.")



=== Phase 2 (Dyna-Q, FAST): MiniGrid-DoorKey-6x6-Enemy-v0 ===
Episode 500/8000 | eps=0.942 | avg_reward(last500)=-0.282 | success_rate(last500)=3.20% | TDabs(avg/max)=0.0036/0.9270 | Q(abs/max)=0.617/0.944
Episode 1000/8000 | eps=0.887 | avg_reward(last500)=-0.275 | success_rate(last500)=3.20% | TDabs(avg/max)=0.0149/0.7741 | Q(abs/max)=0.430/0.736
Episode 1500/8000 | eps=0.835 | avg_reward(last500)=-0.253 | success_rate(last500)=4.40% | TDabs(avg/max)=0.0149/1.0438 | Q(abs/max)=0.588/0.943
Episode 2000/8000 | eps=0.787 | avg_reward(last500)=-0.258 | success_rate(last500)=3.20% | TDabs(avg/max)=0.0183/0.9599 | Q(abs/max)=0.551/0.940
Episode 2500/8000 | eps=0.741 | avg_reward(last500)=-0.273 | success_rate(last500)=2.60% | TDabs(avg/max)=0.0231/1.0282 | Q(abs/max)=0.566/0.903
Episode 3000/8000 | eps=0.698 | avg_reward(last500)=-0.245 | success_rate(last500)=4.80% | TDabs(avg/max)=0.0078/0.9415 | Q(abs/max)=0.537/0.838
Episode 3500/8000 | eps=0.657 | avg_reward(last500)=-0.263 | success