In [8]:
import gymnasium as gym
import numpy as np
from collections import defaultdict
from gymnasium.envs.registration import register
import minigrid


In [9]:

# =========================
# 1) Useful actions only
# =========================
USEFUL_ACTIONS = [0, 1, 2, 3, 5]  # left, right, forward, pickup, toggle

def sample_useful_action():
    return int(np.random.choice(USEFUL_ACTIONS))

# =========================
# 2) State encoder
# =========================
def get_door_open(u):
    for j in range(u.height):
        for i in range(u.width):
            obj = u.grid.get(i, j)
            if obj is not None and obj.type == "door":
                return 1 if obj.is_open else 0
    return 0

def get_state(env):
    u = env.unwrapped
    ax, ay = u.agent_pos
    ad = int(u.agent_dir)
    has_key = 1 if (u.carrying is not None and getattr(u.carrying, "type", None) == "key") else 0
    door_open = get_door_open(u)
    return (ax, ay, ad, has_key, door_open)

# =========================
# 3) Goal distance shaping
# =========================
def find_goal_pos(u):
    for j in range(u.height):
        for i in range(u.width):
            obj = u.grid.get(i, j)
            if obj is not None and obj.type == "goal":
                return (i, j)
    return None

def manhattan(a, b):
    return abs(a[0] - b[0]) + abs(a[1] - b[1])

# =========================
# 4) Dyna-Q
# =========================
def make_Q(n_actions):
    return defaultdict(lambda: np.zeros(n_actions, dtype=np.float32))

def epsilon_greedy(Q, s, eps):
    if np.random.rand() < eps:
        return sample_useful_action()
    q = Q[s]
    return int(max(USEFUL_ACTIONS, key=lambda a: q[a]))

def train_dyna_q(
    env,
    Q,
    episodes=6000,
    max_steps=500,
    alpha=0.15,
    gamma=0.99,
    eps_start=1.0,
    eps_end=0.15,
    eps_decay=0.99985,
    living_penalty=-0.001,
    dist_coef=0.02,
    k=10,                    # planning updates per real step (after plan_start_episode)
    plan_start_episode=1000, # DELAYED PLANNING FIX ✅
    reward_clip_min=None     # e.g. -0.2 for enemy env; None for no clip
):
    """
    Dyna-Q:
      - Direct RL update from real step
      - Learn a tabular model Model[(s,a)] = (s2, r, done)
      - After plan_start_episode, do k planning updates per real step
    """
    Model = {}       # (s,a) -> (s2, r, done)
    seen_sa = []     # list of (s,a) keys for sampling

    rewards, success = [], []
    eps = eps_start

    for ep in range(episodes):
        obs, info = env.reset()
        s = get_state(env)

        goal = find_goal_pos(env.unwrapped)
        if goal is None:
            raise RuntimeError("Goal not found in grid.")

        total_shaped = 0.0
        last_env_r = 0.0

        for t in range(max_steps):
            prev_dist = manhattan(env.unwrapped.agent_pos, goal)

            a = epsilon_greedy(Q, s, eps)
            obs2, r_env, terminated, truncated, info = env.step(a)
            s2 = get_state(env)
            done = terminated or truncated

            new_dist = manhattan(env.unwrapped.agent_pos, goal)

            # ----- shaping aligned with reaching goal -----
            r = float(r_env) + dist_coef * (prev_dist - new_dist) + living_penalty

            # optional: stabilize frequent enemy deaths
            if reward_clip_min is not None:
                r = max(r, float(reward_clip_min))

            # ----- (1) Direct Q update -----
            next_best = 0.0 if done else float(max(Q[s2][aa] for aa in USEFUL_ACTIONS))
            td_target = r + gamma * next_best
            Q[s][a] += alpha * (td_target - Q[s][a])

            # ----- (2) Model learning -----
            key = (s, a)
            if key not in Model:
                seen_sa.append(key)
            Model[key] = (s2, r, done)

            # ----- (3) Planning (DELAYED) ✅ -----
            effective_k = 0 if ep < plan_start_episode else k
            if effective_k > 0 and seen_sa:
                for _ in range(effective_k):
                    s_p, a_p = seen_sa[np.random.randint(len(seen_sa))]
                    s2_p, r_p, done_p = Model[(s_p, a_p)]
                    next_best_p = 0.0 if done_p else float(max(Q[s2_p][aa] for aa in USEFUL_ACTIONS))
                    td_target_p = r_p + gamma * next_best_p
                    Q[s_p][a_p] += alpha * (td_target_p - Q[s_p][a_p])

            total_shaped += r
            last_env_r = float(r_env)
            s = s2

            if done:
                break

        eps = max(eps_end, eps * eps_decay)
        rewards.append(total_shaped)
        success.append(1 if last_env_r > 0 else 0)

        if (ep + 1) % 500 == 0:
            print(
                f"Episode {ep+1}/{episodes} | eps={eps:.3f} | "
                f"avg_reward(last500)={np.mean(rewards[-500:]):.3f} | "
                f"success_rate(last500)={np.mean(success[-500:]):.2%}"
            )

    return rewards, success, eps

# ==========================================================
# 5) Curriculum: Phase 1 -> Phase 2 (with fixed settings)
# ==========================================================

# Phase 1: DoorKey (no enemy)
env_easy = gym.make("MiniGrid-DoorKey-6x6-v0")
Q = make_Q(env_easy.action_space.n)

print("\n=== Phase 1 (Dyna-Q, delayed planning): MiniGrid-DoorKey-6x6-v0 ===")
rewards1, success1, eps_after = train_dyna_q(
    env_easy, Q,
    episodes=6000,
    max_steps=500,
    alpha=0.15,
    gamma=0.99,
    eps_start=1.0,
    eps_end=0.15,
    eps_decay=0.99985,
    living_penalty=-0.001,
    dist_coef=0.02,
    k=5,                    
    plan_start_episode=1000, 
    reward_clip_min=None
)






=== Phase 1 (Dyna-Q, delayed planning): MiniGrid-DoorKey-6x6-v0 ===
Episode 500/6000 | eps=0.928 | avg_reward(last500)=-0.144 | success_rate(last500)=31.80%
Episode 1000/6000 | eps=0.861 | avg_reward(last500)=0.067 | success_rate(last500)=58.00%
Episode 1500/6000 | eps=0.799 | avg_reward(last500)=0.040 | success_rate(last500)=53.60%
Episode 2000/6000 | eps=0.741 | avg_reward(last500)=0.094 | success_rate(last500)=57.60%
Episode 2500/6000 | eps=0.687 | avg_reward(last500)=0.179 | success_rate(last500)=65.20%
Episode 3000/6000 | eps=0.638 | avg_reward(last500)=0.100 | success_rate(last500)=55.60%
Episode 3500/6000 | eps=0.592 | avg_reward(last500)=0.055 | success_rate(last500)=47.40%
Episode 4000/6000 | eps=0.549 | avg_reward(last500)=0.113 | success_rate(last500)=56.60%
Episode 4500/6000 | eps=0.509 | avg_reward(last500)=0.029 | success_rate(last500)=47.20%
Episode 5000/6000 | eps=0.472 | avg_reward(last500)=-0.047 | success_rate(last500)=39.60%
Episode 5500/6000 | eps=0.438 | avg_rewa

In [10]:
# Phase 2: DoorKey + Enemy

register(
    id="MiniGrid-DoorKey-6x6-Enemy-v0",
    entry_point="enemy_doorkey_env:DoorKeyWithEnemyEnv",
    kwargs={"size": 6}
)
env_hard = gym.make("MiniGrid-DoorKey-6x6-Enemy-v0")
print("\n=== Phase 2 (Dyna-Q): MiniGrid-DoorKey-6x6-Enemy-v0 ===")
rewards2, success2, _ = train_dyna_q(
    env_hard, Q,
    episodes=8000,
    max_steps=500,
    alpha=0.10,
    gamma=0.99,

    # restart exploration because dynamics changed
    eps_start=1.0,
    eps_end=0.20,
    eps_decay=0.9999,

    living_penalty=-0.001,
    dist_coef=0.02,

    k=20,                    # stronger planning in Phase 2
    plan_start_episode=500,   # start planning sooner in Phase 2 (model already decent)
    reward_clip_min=-0.2      # stabilize enemy -1 deaths
)

print("\nDone.")


=== Phase 2 (Dyna-Q): MiniGrid-DoorKey-6x6-Enemy-v0 ===
Episode 500/8000 | eps=0.951 | avg_reward(last500)=-0.288 | success_rate(last500)=2.40%
Episode 1000/8000 | eps=0.905 | avg_reward(last500)=-0.264 | success_rate(last500)=4.20%
Episode 1500/8000 | eps=0.861 | avg_reward(last500)=-0.247 | success_rate(last500)=5.60%
Episode 2000/8000 | eps=0.819 | avg_reward(last500)=-0.249 | success_rate(last500)=5.00%
Episode 2500/8000 | eps=0.779 | avg_reward(last500)=-0.259 | success_rate(last500)=2.80%
Episode 3000/8000 | eps=0.741 | avg_reward(last500)=-0.243 | success_rate(last500)=5.60%
Episode 3500/8000 | eps=0.705 | avg_reward(last500)=-0.237 | success_rate(last500)=4.80%
Episode 4000/8000 | eps=0.670 | avg_reward(last500)=-0.231 | success_rate(last500)=6.20%
Episode 4500/8000 | eps=0.638 | avg_reward(last500)=-0.247 | success_rate(last500)=4.20%
Episode 5000/8000 | eps=0.607 | avg_reward(last500)=-0.239 | success_rate(last500)=5.60%
Episode 5500/8000 | eps=0.577 | avg_reward(last500)=-0