In [4]:
"""
Mars Rover - MDP Exam Template
==============================
You are designing a control system for an autonomous Mars Rover exploring a 6x6 grid region.

Each grid cell has a specific terrain type that affects movement and rewards.

Your goal:
Implement the missing Reinforcement Learning functions to help the rover find the best exploration strategy.

Rules:
- Do not use external RL libraries.
- You may use numpy only.
- Complete ALL TODO functions as described below.
"""

import numpy as np

# -----------------------------
# Environment Setup
# -----------------------------

def make_board():
    """
    Create the 6x6 Mars terrain grid.

    Returns:
        np.ndarray: 6x6 array representing terrain types.
                    Legend:
                        S - Start (landing site)
                        B - Base (target location)
                        D - Dust storm (slippery)
                        C - Crater (absorbing)
                        T - Tunnel (double-step)
                        R - Rock wall (impassable)
                        . - Flat ground
    """
    return np.array([
        list("S....B"),
        list(".R..D."),
        list(".C...."),
        list("..T.R."),
        list("..D..."),
        list("......")
    ])


# Define constants
ACTIONS = ['U', 'R', 'D', 'L']
ACTION_DELTA = {
    'U': (-1, 0),
    'R': (0, 1),
    'D': (1, 0),
    'L': (0, -1)
}


# -----------------------------
# Helper Functions
# -----------------------------

def in_bounds(state, shape):
    """Check whether a state lies inside the grid boundaries."""
    r, c = state
    nrows, ncols = shape
    return 0 <= r < nrows and 0 <= c < ncols


def move(state, action, board):
    """Compute next position from a given state and action."""
    dr, dc = ACTION_DELTA[action]
    r, c = state
    nr, nc = r + dr, c + dc
    if not in_bounds((nr, nc), board.shape) or board[nr, nc] == 'R':
        return (r, c)  # blocked by rocks
    return (nr, nc)


# -----------------------------
# Transition Function
# -----------------------------

def get_transition_probs(state, action, board):
    """
    Get transition probabilities for a given state-action pair.
    """
    tile = board[state]
    probs = []

    if tile == 'C':
        # Crater: absorbing, but 2% chance of random ejection
        probs.append((0.98, state))
        for a in ACTIONS:
            ns = move(state, a, board)
            probs.append((0.02 / len(ACTIONS), ns))
        return probs

    if tile == 'D':
        # Dust storm: slips sideways 15% each, intended direction 70%
        slip_actions = {
            'U': ['L', 'R'],
            'D': ['R', 'L'],
            'L': ['D', 'U'],
            'R': ['U', 'D']
        }
        probs.append((0.7, move(state, action, board)))
        probs.append((0.15, move(state, slip_actions[action][0], board)))
        probs.append((0.15, move(state, slip_actions[action][1], board)))
        return probs

    if tile == 'T':
        # Tunnel: 50% chance to move one step, 50% to double-step
        ns1 = move(state, action, board)
        ns2 = move(ns1, action, board)
        probs.append((0.5, ns1))
        probs.append((0.5, ns2))
        return probs

    # Default terrain
    probs.append((1.0, move(state, action, board)))
    return probs


# -----------------------------
# Reward Function
# -----------------------------

def get_reward(state, next_state, board):
    """Compute reward for a transition."""
    if board[next_state] == 'B':
        return 1.0
    if board[next_state] == 'C':
        return -0.2
    return -0.04  # energy cost


# -----------------------------
# Policy Evaluation (TODO)
# -----------------------------

def policy_evaluation(policy, V, board, gamma=0.95, theta=1e-4):
    """
    Evaluate a given policy using iterative policy evaluation.

    Args:
        policy (dict): Mapping from state -> action.
        V (dict): Initial value estimates.
        board (np.ndarray): Grid environment.
        gamma (float): Discount factor.
        theta (float): Convergence threshold.

    Returns:
        dict: Updated value function after convergence.

    TODO:
        Implement this function by:
        1. Iteratively updating V(s) using:
               V(s) ← Σ [ P(s'|s,π(s)) * (R(s,π(s),s') + γ * V(s')) ]
        2. Repeat until max change in V(s) < θ.
    """
    while True:
      delta = 0
      for r in range(board.shape[0]):
          for c in range(board.shape[1]):
              s = (r, c)
              if board[s] == 'B':
                  continue

              old_action = policy.get(s, None)
              v = V[s]
              for p, ns in get_transition_probs(s,old_action, board):
                    V[s] += p * (get_reward(s, ns, board) + gamma * V[ns])
              # policy[s] = v
              delta = max(delta,abs(v-V[s]))

      if delta < theta:
        break
    return policy, V


# -----------------------------
# Policy Improvement
# -----------------------------

def policy_improvement(V, board, gamma=0.95):
    """Improve policy greedily based on current value estimates."""
    policy = {}
    stable = True
    for r in range(board.shape[0]):
        for c in range(board.shape[1]):
            s = (r, c)
            if board[s] == 'B':
                continue
            old_action = policy.get(s, None)

            q_values = []
            for a in ACTIONS:
                q = 0
                for p, ns in get_transition_probs(s, a, board):

                    q += p * (get_reward(s, ns, board) + gamma * V[ns])
                q_values.append(q)
            best_a = ACTIONS[np.argmax(q_values)]
            policy[s] = best_a
            if old_action != best_a:
                stable = False
    return policy, stable


# -----------------------------
# Policy Iteration
# -----------------------------

def policy_iteration(board, gamma=0.95):
    """Perform policy iteration to find optimal policy and values."""
    policy = {(r, c): np.random.choice(ACTIONS) for r in range(board.shape[0]) for c in range(board.shape[1])}
    V = {(r, c): 0 for r in range(board.shape[0]) for c in range(board.shape[1])}

    while True:
        V = policy_evaluation(policy, V, board, gamma)
        policy, stable = policy_improvement(V, board, gamma)
        if stable:
            break
    return policy, V


# -----------------------------
# Value Iteration (TODO)
# -----------------------------

def value_iteration(board, gamma=0.95, theta=1e-4):
    """
    Perform value iteration to compute the optimal value function and policy.

    Args:
        board (np.ndarray): Grid environment.
        gamma (float): Discount factor.
        theta (float): Convergence threshold.

    Returns:
        tuple: (optimal_value_function, optimal_policy)

    TODO:
        Implement this function by:
        1. Initialize all V(s) = 0.
        2. Iteratively update each V(s) using:
               V(s) ← max_a Σ [ P(s'|s,a) * (R(s,a,s') + γ * V(s')) ]
        3. Stop when the max change in V(s) < θ.
        4. From the final V(s), derive the optimal policy π*(s).
    """
    V = {(r, c): 0 for r in range(board.shape[0]) for c in range(board.shape[1])}
    while True:
      delta = 0
      for r in range(board.shape[0]):
          for c in range(board.shape[1]):
              s = (r, c)
              if board[s] == 'B':
                  continue

              # old_action = policy.get(s, None)
              q_values = []
              v = V[s]
              for a in ACTIONS:
                q=0
                for p, ns in get_transition_probs(s, a, board):
                      q += p * (get_reward(s, ns, board) + gamma * V[ns])
                q_values.append(q)
              best_a = max(q_values)
              delta = max(delta,abs(v-V[s]))
              best_a = max(q_values)
              V[s]= best_a
          if delta < theta:
               break

      # Optimal Value
      policy = {}
      for r in range(board.shape[0]):
            for c in range(board.shape[1]):
                s = (r, c)
                if board[s] == 'B':
                  continue
                old_action = policy.get(s, None)
                q_values = []
                for a in ACTIONS:
                  q = 0
                  for p, ns in get_transition_probs(s, a, board):
                         q += p * (get_reward(s, ns, board) + gamma * V[ns])
                  q_values.append(q)
                  best_a = ACTIONS[np.argmax(q_values)]
                  policy[s] = best_a

    return policy, V

# -----------------------------
# Experiment Runner
# -----------------------------

def run_experiment():
    """Utility to run and test algorithms after implementation."""
    board = make_board()
    print("Mars Terrain:")
    print(board)

    # Once implemented, you can test either:
    policy, V = policy_iteration(board)
    # or:
    # V, policy = value_iteration(board)
    # and visualize or print results
    print("\nImplement the TODO functions and test them here.")


if __name__ == "__main__":
    run_experiment()


Mars Terrain:
[['S' '.' '.' '.' '.' 'B']
 ['.' 'R' '.' '.' 'D' '.']
 ['.' 'C' '.' '.' '.' '.']
 ['.' '.' 'T' '.' 'R' '.']
 ['.' '.' 'D' '.' '.' '.']
 ['.' '.' '.' '.' '.' '.']]


TypeError: tuple indices must be integers or slices, not tuple