#yourNAME, yourID

# Imports

# In [4]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np

# LuffyDodge (3)

# In [None]:
class LuffyDodgeEnv(gym.Env):
    """
    Custom environment where Luffy must dodge cannonballs.

    Luffy stands on the bottom row (y = 0) of a 10x10 grid and can move
    left, move right, or stay still. A cannonball starts on the top row
    (y = 9) and falls one row per step.

    Observation: float32 array ``[luffy_x, cannon_x, cannon_y]``, each in [0, 9].
    Actions: 0 = left, 1 = stay, 2 = right.
    Reward: +1 for every step survived, -10 when hit (the episode then ends).
    """

    metadata = {"render_modes": ["human"]}

    # Side length of the square grid; valid coordinates are 0 .. _GRID_SIZE - 1.
    _GRID_SIZE = 10

    def __init__(self, render_mode=None):
        """
        Initialize environment parameters.

        Parameters:
            render_mode (str or None): Stored on the instance; `render`
                always prints text regardless of this value.
        """
        super().__init__()

        top = self._GRID_SIZE - 1

        # Actions: 0 = left, 1 = stay, 2 = right
        self.action_space = spaces.Discrete(3)

        # Observation: [Luffy_x, Cannonball_x, Cannonball_y]
        self.observation_space = spaces.Box(
            low=np.zeros(3, dtype=np.float32),
            high=np.full(3, top, dtype=np.float32),
            dtype=np.float32,
        )

        self.render_mode = render_mode
        self.reset()

    def _get_obs(self):
        """Return the current state as a float32 observation array."""
        return np.array(
            [self.luffy_x, self.cannon_x, self.cannon_y], dtype=np.float32
        )

    def _spawn_cannonball(self):
        """Place the cannonball on the top row at a random column."""
        # Draw from the environment's own generator so that
        # `reset(seed=...)` makes episodes reproducible.
        self.cannon_x = float(self.np_random.integers(0, self._GRID_SIZE))
        self.cannon_y = float(self._GRID_SIZE - 1)

    def reset(self, seed=None, options=None):
        """
        Reset the environment to its initial state.

        Parameters:
            seed (int or None): Seed forwarded to `gym.Env.reset`, which
                re-seeds `self.np_random` when it is not None.
            options (dict or None): Unused.

        Returns:
            observation (np.array): Initial state [luffy_x, cannon_x, cannon_y]
            info (dict): Empty dict.
        """
        super().reset(seed=seed)

        # Luffy starts in the middle bottom
        self.luffy_x = 5.0

        # Cannonball starts at a random x and top y = 9
        self._spawn_cannonball()

        return self._get_obs(), {}

    def step(self, action):
        """
        Performs one time-step transition in the environment.

        Parameters:
            action (int): The action taken by Luffy.
                          0 = move left, 1 = stay still, 2 = move right

        Returns:
            observation (np.array): The next state [luffy_x, cannon_x, cannon_y]
            reward (float): +1 if survived this step, -10 if hit by cannonball
            terminated (bool): True if Luffy is hit, False otherwise
            truncated (bool): Always False; this environment has no step limit
            info (dict): Additional info (empty for now)

        Raises:
            ValueError: If `action` is not a member of `action_space`.
        """
        if not self.action_space.contains(action):
            raise ValueError(f"Invalid action: {action!r}")

        # Update Luffy's position, clamped to the grid.
        if action == 0:
            self.luffy_x = max(0.0, self.luffy_x - 1.0)
        elif action == 2:
            self.luffy_x = min(float(self._GRID_SIZE - 1), self.luffy_x + 1.0)

        # Move the cannonball down by one row.
        self.cannon_y -= 1.0

        reward = 1.0
        terminated = False
        if self.cannon_y <= 0.0:
            if self.cannon_x == self.luffy_x:
                # Collision: same column on the bottom row.
                reward = -10.0
                terminated = True
            else:
                # Miss: send a new cannonball from the top at a random column.
                self._spawn_cannonball()

        return self._get_obs(), reward, terminated, False, {}

    def render(self):
        """
        Render the environment as simple text output.

        Row 0 (Luffy's row) is printed last so the grid appears with the
        top row at the top; "O" is the cannonball and "L" is Luffy.
        """
        grid = np.full((self._GRID_SIZE, self._GRID_SIZE), " ", dtype=str)
        grid[int(self.cannon_y), int(self.cannon_x)] = "O"
        # Luffy is drawn second, so "L" wins if both share a cell.
        grid[0, int(self.luffy_x)] = "L"
        print("\n".join(["".join(row) for row in grid[::-1]]))
        print("-" * self._GRID_SIZE)

    def close(self):
        """Close the environment."""
        pass

# In [None]:
# Roll out a single episode with uniformly random actions, printing the
# grid and the reward after every step.
env = LuffyDodgeEnv()
obs, _ = env.reset()

done = False
truncated = False
# An episode ends when it terminates (`done`) or is truncated, so check both.
while not (done or truncated):
    action = env.action_space.sample()
    obs, reward, done, truncated, info = env.step(action)
    env.render()
    print(f"Action: {action}, Reward: {reward}")

# Release any resources held by the environment.
env.close()

# Policy Evaluation (2)

# In [None]:
# Side length of the (small) grid used for the tabular methods below.
GRID_SIZE = 5

# Action encoding: 0 = left, 1 = stay, 2 = right.
ACTIONS = [0, 1, 2]
NUM_ACTIONS = len(ACTIONS)

# Enumerate every (Luffy_x, Cannon_x, Cannon_y) triple. The flat index is
# decoded base-GRID_SIZE, so Cannon_y varies fastest and Luffy_x slowest.
states = [
    (i // (GRID_SIZE * GRID_SIZE), (i // GRID_SIZE) % GRID_SIZE, i % GRID_SIZE)
    for i in range(GRID_SIZE ** 3)
]
NUM_STATES = len(states)

# Start from an arbitrary policy: one random action per state.
policy = np.random.choice(ACTIONS, size=NUM_STATES)

# Start with a value estimate of zero for every state.
V = np.zeros(NUM_STATES)

# Discount applied to future rewards.
gamma = 0.9


def state_to_index(state):
    """Return the position of a (luffy_x, cannon_x, cannon_y) tuple in `states`.

    The three coordinates are read as digits of a base-GRID_SIZE number,
    with luffy_x the most significant and cannon_y the least significant.
    """
    luffy, ball_x, ball_y = state
    return (luffy * GRID_SIZE + ball_x) * GRID_SIZE + ball_y


def transition(state, action):
    """
    Compute the deterministic outcome of taking `action` in `state`.

    Luffy moves first (clamped to the grid), then the cannonball drops
    one row. Once the cannonball drops below row 0 the step is resolved
    as either a hit or a miss.

    Parameters
    ----------
    state : tuple (luffy_x, cannon_x, cannon_y)
    action : int (0=left, 1=stay, 2=right)

    Returns
    -------
    next_state : tuple or None
        Successor state, or None when the episode ends.
    reward : float
        +1.0 for surviving the step, -10.0 for being hit.
    done : bool
        True only when Luffy is hit by the cannonball.
    """
    luffy, ball_x, ball_y = state

    # Apply Luffy's move; any action other than 0 or 2 leaves him in place.
    if action == 0:
        luffy = max(0, luffy - 1)
    elif action == 2:
        luffy = min(GRID_SIZE - 1, luffy + 1)

    # The cannonball falls one row.
    ball_y -= 1

    # Still in the air: nothing to resolve yet.
    if ball_y >= 0:
        return (luffy, ball_x, ball_y), +1.0, False

    # The cannonball has passed the bottom row: hit if the columns match.
    if luffy == ball_x:
        return None, -10.0, True

    # Miss: the cannonball restarts from the top row in the same column.
    return (luffy, ball_x, GRID_SIZE - 1), +1.0, False


def policy_evaluation(policy, V, theta=1e-4):
    """
    Iteratively compute the state-value function V(s) for a fixed policy.

    Every state is swept repeatedly, replacing V[s] with the one-step
    backup under the policy's action:

        V[s] = reward                          if the transition terminates
        V[s] = reward + gamma * V[next_state]  otherwise

    Updates are made in place, so later states in a sweep already see the
    new values of earlier ones. Sweeping stops once the largest change
    made to any V[s] during a sweep is below `theta`.

    Parameters
    ----------
    policy : np.array
        Current policy mapping each state index to an action (0,1,2).
    V : np.array
        Current value estimates for each state; modified in place.
    theta : float
        Convergence threshold for stopping condition.

    Returns
    -------
    V : np.array
        Updated state-value function for the given policy (the same
        array object that was passed in).
    """
    while True:
        # Largest absolute change to any state value in this sweep.
        delta = 0.0
        for s, state in enumerate(states):
            next_state, reward, done = transition(state, policy[s])
            if done:
                # Terminal transition: there is no successor to bootstrap from.
                new_value = reward
            else:
                new_value = reward + gamma * V[state_to_index(next_state)]

            delta = max(delta, abs(new_value - V[s]))
            V[s] = new_value

        if delta < theta:
            break
    return V


def policy_improvement(V, policy):
    """Make `policy` greedy with respect to `V` using a one-step lookahead.

    For every state, each action in ACTIONS is scored as the immediate
    reward plus, for non-terminal transitions, the discounted value of the
    successor state. The policy entry is overwritten in place with the
    argmax of those scores.

    Returns the (mutated) policy and a flag that is True only when no
    state's action changed.
    """
    any_change = False
    for idx, current in enumerate(states):
        previous = policy[idx]

        # Score every action from this state.
        lookahead = []
        for candidate in ACTIONS:
            successor, r, terminal = transition(current, candidate)
            score = r if terminal else r + gamma * V[state_to_index(successor)]
            lookahead.append(score)

        greedy = np.argmax(lookahead)
        policy[idx] = greedy

        # Remember whether any state switched to a different action.
        if greedy != previous:
            any_change = True
    return policy, not any_change


def policy_iteration():
    """Alternate evaluation and improvement until the policy stops changing.

    Operates on (and rebinds) the module-level `V` and `policy`, printing
    a progress line after every round. Returns the final policy and its
    value function.
    """
    global V, policy
    sweep = 0
    converged = False
    while not converged:
        sweep += 1
        V = policy_evaluation(policy, V)
        policy, converged = policy_improvement(V, policy)
        print(f"Iteration {sweep} completed.")
    # The loop only exits once an improvement pass changed nothing.
    print("✅ Policy converged!")
    return policy, V


if __name__ == "__main__":
    # Solve the tabular problem, then show the chosen action for the
    # first ten states in enumeration order.
    optimal_policy, optimal_V = policy_iteration()

    print("\nOptimal policy (sample of 10 states):")
    for idx in range(10):
        sample_state, sample_action = states[idx], optimal_policy[idx]
        print(f"State {sample_state} → Action {sample_action}")
