In [1]:
import gymnasium as gym
import numpy as np
from collections import defaultdict
from gymnasium.envs.registration import register
import minigrid


  from pkg_resources import resource_stream, resource_exists


In [None]:


# ==========================================================
# 0) Register your custom env (assumes enemy_doorkey_env.py exists)
# ==========================================================
# Expose the enemy variant under its own id so gym.make() can build it; the
# entry point string names the class inside the enemy_doorkey_env module.
register(
    entry_point="enemy_doorkey_env:DoorKeyWithEnemyEnv",
    id="MiniGrid-DoorKey-6x6-Enemy-v0",
    kwargs=dict(size=6),
)

# ==========================================================
# 1) Useful actions only (DoorKey)
# ==========================================================
# Action ids the agent is allowed to pick; every other id is never sampled
# and never considered when taking a max over Q-values.
USEFUL_ACTIONS = [
    0,  # left
    1,  # right
    2,  # forward
    3,  # pickup
    5,  # toggle
]


def sample_useful_action():
    """Draw one action id at random from USEFUL_ACTIONS.

    Returns:
        The drawn action as a plain Python ``int`` (not a NumPy scalar).
    """
    picked = np.random.choice(USEFUL_ACTIONS)
    return int(picked)

# ==========================================================
# 2) State encoder (no enemy_pos -> smaller table)
# ==========================================================
def get_door_open(u):
    """Report whether the first door found in the grid is open.

    The grid is scanned row by row (y outer, x inner) through
    ``u.grid.get(x, y)``; the scan stops at the first cell whose ``type`` is
    ``"door"``.

    Args:
        u: Object exposing ``width``, ``height`` and ``grid.get(x, y)``.

    Returns:
        1 if that door's ``is_open`` is truthy, otherwise 0. Also 0 when the
        grid contains no door at all.
    """
    scan = (
        u.grid.get(col, row)
        for row in range(u.height)
        for col in range(u.width)
    )
    door = next(
        (cell for cell in scan if cell is not None and cell.type == "door"),
        None,
    )
    if door is None:
        return 0
    return 1 if door.is_open else 0

def get_state(env):
    """Encode the environment's current situation as a hashable 5-tuple.

    Everything is read from ``env.unwrapped``. No enemy information is
    included in the encoding.

    Returns:
        ``(agent_x, agent_y, agent_dir, has_key, door_open)`` where
        ``has_key`` and ``door_open`` are 0/1 flags.
    """
    base = env.unwrapped
    pos_x, pos_y = base.agent_pos
    heading = int(base.agent_dir)

    # The key flag is set only when the carried object reports type "key".
    held = base.carrying
    if held is not None and getattr(held, "type", None) == "key":
        key_flag = 1
    else:
        key_flag = 0

    return (pos_x, pos_y, heading, key_flag, get_door_open(base))

# ==========================================================
# 3) Distance-to-goal shaping helpers (aligned with success)
# ==========================================================
def find_goal_pos(u):
    """Locate the first goal cell in the grid.

    Cells are visited row by row (y outer, x inner) via ``u.grid.get(x, y)``.

    Args:
        u: Object exposing ``width``, ``height`` and ``grid.get(x, y)``.

    Returns:
        ``(x, y)`` of the first cell whose ``type`` is ``"goal"``, or ``None``
        when no such cell exists.
    """
    coords = (
        (col, row)
        for row in range(u.height)
        for col in range(u.width)
    )
    for col, row in coords:
        cell = u.grid.get(col, row)
        if cell is None:
            continue
        if cell.type == "goal":
            return (col, row)
    return None

def manhattan(a, b):
    """Return the L1 distance between the first two coordinates of a and b."""
    dx = a[0] - b[0]
    dy = a[1] - b[1]
    return abs(dx) + abs(dy)

# ==========================================================
# 4) Q-learning
# ==========================================================
def make_Q(n_actions):
    """Create an empty Q-table that fills itself in on first access.

    Args:
        n_actions: Length of the value vector stored for each state.

    Returns:
        A ``defaultdict`` whose missing keys are initialised to a fresh
        float32 zero vector of length ``n_actions``.
    """
    def _fresh_row():
        return np.zeros(n_actions, dtype=np.float32)

    return defaultdict(_fresh_row)

def epsilon_greedy(Q, s, eps):
    """Choose an action for state ``s`` with epsilon-greedy exploration.

    With probability ``eps`` a random useful action is sampled. Otherwise the
    useful action with the largest Q-value is returned; ties go to the action
    that appears first in ``USEFUL_ACTIONS``.

    Args:
        Q: Table indexed as ``Q[state][action]``.
        s: Current state key.
        eps: Exploration probability.

    Returns:
        The chosen action id as a plain ``int``.
    """
    explore = np.random.rand() < eps
    if explore:
        return sample_useful_action()

    # Q[s] is only looked up on the greedy branch.
    action_values = Q[s]
    best = USEFUL_ACTIONS[0]
    for candidate in USEFUL_ACTIONS[1:]:
        # Strict comparison keeps the earliest action on ties.
        if action_values[candidate] > action_values[best]:
            best = candidate
    return int(best)

def train_q(
    env,
    Q,
    episodes=6000,
    max_steps=500,
    alpha=0.15,
    gamma=0.99,
    eps_start=1.0,
    eps_end=0.15,
    eps_decay=0.99985,
    living_penalty=-0.001,
    dist_coef=0.02,
):
    """Run tabular Q-learning on ``env``, updating ``Q`` in place.

    The learning signal for each step is the environment reward (floored at
    -0.2), plus ``dist_coef`` times the reduction in Manhattan distance to the
    goal, plus ``living_penalty``. Actions come from ``epsilon_greedy`` and the
    bootstrap maximum is taken over ``USEFUL_ACTIONS`` only.

    Args:
        env: Environment following the Gymnasium ``reset``/``step`` API whose
            unwrapped form is readable by ``get_state`` and ``find_goal_pos``.
        Q: Table indexed as ``Q[state][action]``; mutated in place.
        episodes: Number of episodes to run.
        max_steps: Per-episode step cap enforced by this loop.
        alpha: Learning rate of the TD update.
        gamma: Discount factor.
        eps_start: Exploration rate used for the first episode.
        eps_end: Lower bound for the exploration rate.
        eps_decay: Multiplicative decay applied to the rate after each episode.
        living_penalty: Constant added to every step's shaped reward.
        dist_coef: Weight of the distance-reduction term.

    Returns:
        ``(rewards, success, eps)``: the per-episode sum of shaped rewards, a
        per-episode 0/1 flag set when the final step's environment reward was
        positive, and the exploration rate after the last decay.

    Raises:
        RuntimeError: If no goal cell is found after a reset.
    """
    rewards, success = [], []
    eps = eps_start

    for ep in range(episodes):
        env.reset()
        s = get_state(env)

        goal = find_goal_pos(env.unwrapped)
        if goal is None:
            raise RuntimeError("Goal not found in grid. Check environment generation.")

        total_shaped = 0.0
        last_env_r = 0.0

        for _ in range(max_steps):
            prev_dist = manhattan(env.unwrapped.agent_pos, goal)

            a = epsilon_greedy(Q, s, eps)
            _, r, terminated, truncated, _ = env.step(a)

            # Reward clipping (only for training stability): floor at -0.2.
            r = max(float(r), -0.2)

            s2 = get_state(env)
            new_dist = manhattan(env.unwrapped.agent_pos, goal)

            # Distance shaping: reward steps that bring the agent closer to
            # the goal. This is the plain difference prev - new (no gamma
            # factor on the new distance).
            shaped = float(r)
            shaped += dist_coef * (prev_dist - new_dist)
            shaped += living_penalty

            # Q-learning update (max over useful actions only).
            # Only a true termination zeroes the bootstrap term. A truncation
            # (time limit) cuts the episode short without s2 being terminal,
            # so its value must still be included in the target.
            if terminated:
                next_best = 0.0
            else:
                next_best = float(max(Q[s2][aa] for aa in USEFUL_ACTIONS))
            td_target = shaped + gamma * next_best
            Q[s][a] += alpha * (td_target - Q[s][a])

            total_shaped += shaped
            last_env_r = float(r)
            s = s2

            # Either flag ends the episode loop.
            if terminated or truncated:
                break

        eps = max(eps_end, eps * eps_decay)
        rewards.append(total_shaped)
        # Success is judged from the last step's (clipped) environment reward.
        success.append(1 if last_env_r > 0 else 0)

        if (ep + 1) % 500 == 0:
            print(
                f"Episode {ep+1}/{episodes} | eps={eps:.3f} | "
                f"avg_reward(last500)={np.mean(rewards[-500:]):.3f} | "
                f"success_rate(last500)={np.mean(success[-500:]):.2%}"
            )

    return rewards, success, eps

# ==========================================================
# 5) Curriculum: Phase 1 (DoorKey) -> Phase 2 (DoorKey+Enemy)
# ==========================================================
# Hyperparameters that are identical in both curriculum phases.
CURRICULUM_SHARED_KWARGS = dict(
    max_steps=500,
    gamma=0.99,
    living_penalty=-0.001,
    dist_coef=0.02,
)

# ---- Phase 1: plain DoorKey (no render mode requested) ----
env_easy = gym.make("MiniGrid-DoorKey-6x6-v0")
# A single table is created here and handed to both phases.
Q = make_Q(env_easy.action_space.n)

print("\n=== Phase 1: train on MiniGrid-DoorKey-6x6-v0 ===")
# NOTE(review): eps_after is not read anywhere in the visible code; phase 2
# below starts again from eps_start=1.0 - confirm that restart is intended.
rewards1, success1, eps_after = train_q(
    env_easy,
    Q,
    episodes=6000,
    alpha=0.15,
    eps_start=1.0,
    eps_end=0.15,
    eps_decay=0.99985,
    **CURRICULUM_SHARED_KWARGS,
)

# ---- Phase 2: DoorKey with enemy, same Q-table (no render mode requested) ----
env_hard = gym.make("MiniGrid-DoorKey-6x6-Enemy-v0")

print("\n=== Phase 2: continue on MiniGrid-DoorKey-6x6-Enemy-v0 ===")
rewards2, success2, _ = train_q(
    env_hard,
    Q,
    episodes=8000,
    alpha=0.10,
    eps_start=1.0,
    eps_end=0.20,
    eps_decay=0.9999,
    **CURRICULUM_SHARED_KWARGS,
)

print("\nDone.")


  logger.warn(f"Overriding environment {new_spec.id} already in registry.")



=== Phase 1: train on MiniGrid-DoorKey-8x8-v0 ===
Episode 500/6000 | eps=0.928 | avg_reward(last500)=-0.135 | success_rate(last500)=33.00%
Episode 1000/6000 | eps=0.861 | avg_reward(last500)=0.004 | success_rate(last500)=51.00%
Episode 1500/6000 | eps=0.799 | avg_reward(last500)=0.216 | success_rate(last500)=71.40%
Episode 2000/6000 | eps=0.741 | avg_reward(last500)=0.185 | success_rate(last500)=66.00%
Episode 2500/6000 | eps=0.687 | avg_reward(last500)=0.231 | success_rate(last500)=68.60%
Episode 3000/6000 | eps=0.638 | avg_reward(last500)=0.164 | success_rate(last500)=64.60%
Episode 3500/6000 | eps=0.592 | avg_reward(last500)=0.231 | success_rate(last500)=68.80%
Episode 4000/6000 | eps=0.549 | avg_reward(last500)=0.262 | success_rate(last500)=71.00%
Episode 4500/6000 | eps=0.509 | avg_reward(last500)=0.371 | success_rate(last500)=78.40%
Episode 5000/6000 | eps=0.472 | avg_reward(last500)=0.207 | success_rate(last500)=70.20%
Episode 5500/6000 | eps=0.438 | avg_reward(last500)=0.249 |