# RL 8-Puzzle Demo

This notebook demonstrates the trained Q-learning agent for the 3×3 8-puzzle.

- If a saved Q-table exists at `rl_8puzzle/q_table.pkl`, it will be loaded.
- Otherwise, a shorter training run is executed and the resulting Q-table is saved to that path.
- Then we visualize a greedy solution from a randomly scrambled start state.

In [None]:
from pathlib import Path
import pickle

from rl_8puzzle.env import EightPuzzleEnv, GOAL_STATE, ACTIONS

In [None]:
def print_board(state):
    """Print ``state`` as three space-separated rows followed by a blank line.

    ``state`` is sliced in consecutive groups of three, so it is read as a
    flat, row-major sequence. Any cell equal to 0 is shown as ``_``; every
    other cell is shown via ``str``.
    """
    for start in range(0, 9, 3):
        rendered = ["_" if tile == 0 else str(tile) for tile in state[start:start + 3]]
        print(" ".join(rendered))
    # Trailing empty line keeps consecutive boards visually separated.
    print()


def load_q(
    path="rl_8puzzle/q_table.pkl",
    *,
    num_episodes=20000,
    max_steps=80,
    scramble_moves=20,
):
    """Load the Q-table stored at ``path``, training and saving one if absent.

    Args:
        path: Location of the pickled Q-table (string or path-like).
        num_episodes: Forwarded to ``train`` when no saved table exists. The
            default is the reduced run this demo uses (smaller than full
            training but good enough).
        max_steps: Forwarded to ``train`` when no saved table exists.
        scramble_moves: Forwarded to ``train`` when no saved table exists.

    Returns:
        The unpickled object when ``path`` exists; otherwise the value
        returned by ``train`` (after it has been handed to ``save_q``).

    The three training arguments are ignored when ``path`` already exists.
    """
    p = Path(path)
    if p.exists():
        print(f"Loading existing Q-table from {p}…")
        # NOTE(security): unpickling can execute arbitrary code. Only point
        # this at a Q-table file you produced yourself.
        with p.open("rb") as f:
            return pickle.load(f)

    print("Q-table not found. Training a small one (this may take a minute)…")
    # Local import: the training module is only loaded on this branch.
    from rl_8puzzle.train_q_learning import train, save_q

    Q = train(
        num_episodes=num_episodes,
        max_steps=max_steps,
        scramble_moves=scramble_moves,
    )
    # How save_q treats a missing directory is not visible from here, so make
    # sure the destination folder exists before handing the path over.
    p.parent.mkdir(parents=True, exist_ok=True)
    save_q(Q, p)
    print(f"Saved new Q-table to {p}.")
    return Q

In [None]:
# Obtain the Q-table (loaded from disk or freshly trained) before building
# the environment.
Q = load_q()

env = EightPuzzleEnv(scramble_moves=20)
start_state = env.reset()

print("Start state:")
print_board(start_state)

# `states` keeps every board seen, beginning with the start; `rewards` keeps
# one entry per move taken.
states, rewards = [start_state], []

for step in range(80):
    state = env.state
    # Greedy choice: score each action by its stored Q-value (0.0 when the
    # pair is missing); the earliest action in ACTIONS wins ties.
    qs = [Q.get((state, candidate), 0.0) for candidate in ACTIONS]
    best_idx = max(range(len(ACTIONS)), key=qs.__getitem__)
    action = ACTIONS[best_idx]

    next_state, reward, done, _ = env.step(action)
    states.append(next_state)
    rewards.append(reward)

    if not done:
        continue
    print(f"\nSolved in {step + 1} moves. Final reward: {reward}.")
    break
else:
    # Reached only when the loop ran all 80 iterations without a break.
    print("\nDid not reach goal within the step limit.")

In [None]:
# Replay the recorded rollout: entry 0 is the scrambled start, and each later
# entry is the board returned by the corresponding env.step call.
print("\nTrajectory:")
for move_count, board in enumerate(states):
    print(f"Step {move_count}")
    print_board(board)