# Problem 1

In [3]:
import numpy as np
from gridworld_lec4 import GridWorld
from gw_lec4 import GridWorld5x5

# Build the 5x5 GridWorld environment.
# NOTE(review): `p` is passed straight to GridWorld5x5; presumably the
# probability of a stochastic transition — confirm against gw_lec4.
env = GridWorld5x5(p=0.5)

# Optimal policy from the previous problem, keyed by (row, column).
# Cells without an entry have no prescribed action.
optimal_policy = {
    # row 0
    (0, 0): 'D',
    (0, 1): 'D',
    (0, 2): 'D',
    (0, 3): 'L',
    (0, 4): 'L',
    # row 1
    (1, 0): 'R',
    (1, 1): 'D',
    (1, 2): 'L',
    (1, 3): 'L',
    (1, 4): 'L',
    # row 2
    (2, 1): 'D',
    # row 3
    (3, 0): 'D',
    (3, 1): 'D',
    (3, 3): 'D',
    (3, 4): 'D',
    # row 4
    (4, 0): 'R',
    (4, 1): 'R',
    (4, 2): 'R',
    (4, 3): 'R',
}

# Monte Carlo settings
episodes = 10000  # number of sampled episodes
gamma = 0.9  # discount factor

# Value estimate per state, all starting at 0.
value_function = dict.fromkeys(env.all_states(), 0)
# Observed returns per state; each state needs its own list.
returns = {s: [] for s in env.all_states()}

# First-visit Monte Carlo prediction under the fixed optimal policy
for episode in range(episodes):
    # Start from a uniformly random grid coordinate.
    state = (np.random.randint(env.rows), np.random.randint(env.columns))
    env.set_state(state)

    # Roll out the policy, recording (state, reward) for every step taken.
    # The rollout stops when the environment reports game over or when the
    # current state has no entry in the policy.
    episode_sequence = []
    while not env.game_over():
        if state not in optimal_policy:
            break
        action = optimal_policy[state]

        next_state, reward = env.move(action)
        episode_sequence.append((state, reward))
        state = next_state

    # Earliest time step at which each state occurs in this episode.
    # A backward scan with a "seen" set would pick the LAST occurrence
    # instead, which is last-visit MC, so the first index is computed
    # up front.
    first_visit_step = {}
    for step, (visited, _) in enumerate(episode_sequence):
        first_visit_step.setdefault(visited, step)

    # Accumulate the discounted return backwards and record it only at the
    # first visit of each state.
    G = 0
    for step in range(len(episode_sequence) - 1, -1, -1):
        state, reward = episode_sequence[step]
        G = reward + gamma * G
        if first_visit_step[state] == step:
            returns[state].append(G)
            value_function[state] = np.mean(returns[state])

# Report the estimated values and how many states received at least one return
visited_states_count = sum(1 for s in returns if returns[s])

print("Monte Carlo Value Function Using Optimal Policy:")
for state, value in value_function.items():
    line = f"State {state}: {value:.2f}"
    print(line)

print("\nNumber of visited states:", visited_states_count)


Monte Carlo Value Function Using Optimal Policy:
State (0, 1): -13.93
State (4, 0): 37.77
State (1, 2): -13.43
State (3, 4): 85.24
State (0, 4): -12.44
State (4, 3): 100.00
State (3, 1): 37.85
State (2, 1): 28.18
State (0, 2): -13.08
State (1, 0): -60.34
State (1, 3): -12.30
State (4, 1): 73.94
State (4, 4): 0.00
State (0, 0): -53.90
State (1, 1): -14.64
State (0, 3): -12.50
State (2, 0): 0.00
State (4, 2): 89.00
State (3, 0): 35.83
State (1, 4): -11.92
State (3, 3): 71.55

Number of visited states: 19


# Problem 2

In [4]:
import numpy as np
from gridworld_lec4 import GridWorld
from gw_lec4 import GridWorld5x5

# Build the 5x5 GridWorld environment
env = GridWorld5x5(p=0.5)

# Monte Carlo settings
episodes = 10000  # number of sampled episodes
gamma = 0.9  # discount factor
epsilon = 0.1  # exploration rate used when picking follow-up actions

# Action-value table (all zeros) and per-(state, action) lists of returns
Q = {s: dict.fromkeys(env.actions(s), 0) for s in env.all_states()}
returns = {s: {a: [] for a in env.actions(s)} for s in env.all_states()}

# Arbitrary initial policy: one random legal action for every state that
# has at least one action available.
policy = {}
for s in env.all_states():
    if env.actions(s):
        policy[s] = np.random.choice(env.actions(s))

# States encountered during any episode
visited_states = set()

# Monte Carlo control with exploring starts (first-visit updates).
# Follow-up actions are chosen epsilon-greedily with respect to Q.
for episode in range(episodes):
    # Exploring start: random coordinate and random first action.
    state = (np.random.randint(env.rows), np.random.randint(env.columns))
    env.set_state(state)
    if not env.actions(state):
        # Nothing can be done from this coordinate; draw another episode.
        continue
    action = np.random.choice(env.actions(state))

    visited_states.add(state)

    # Generate the episode as a list of (state, action, reward) triples.
    episode_sequence = []
    while not env.game_over():
        next_state, reward = env.move(action)
        episode_sequence.append((state, action, reward))
        state = next_state
        if env.actions(state):
            visited_states.add(state)

            # Epsilon-greedy choice of the next action.
            if np.random.rand() < epsilon:
                action = np.random.choice(env.actions(state))
            else:
                action = max(Q[state], key=Q[state].get)

    # Earliest time step of every (state, action) pair in this episode.
    # A backward scan with a "seen" set would select the LAST occurrence,
    # which is last-visit MC, so first indices are computed up front.
    first_visit_step = {}
    for step, (s, a, _) in enumerate(episode_sequence):
        first_visit_step.setdefault((s, a), step)

    # Backward pass: accumulate the discounted return and update Q only at
    # the first visit of each pair.
    G = 0
    for step in range(len(episode_sequence) - 1, -1, -1):
        state, action, reward = episode_sequence[step]
        G = reward + gamma * G
        if first_visit_step[(state, action)] == step:
            returns[state][action].append(G)
            Q[state][action] = np.mean(returns[state][action])

            # Keep the stored policy greedy with respect to the updated Q.
            policy[state] = max(Q[state], key=Q[state].get)

# Print the learned policy one grid row per line, skipping cells that have
# no policy entry.
print("Learned Optimal Policy:")
for r in range(env.rows):
    row_cells = [(r, c) for c in range(env.columns)]
    row_text = "".join(
        f"{cell}: {policy[cell]}, " for cell in row_cells if cell in policy
    )
    print(row_text)

# Number of distinct states seen while generating episodes
print("\nNumber of Visited States:", len(visited_states))


Learned Optimal Policy:
(0, 0): R, (0, 1): D, (0, 2): R, (0, 3): R, (0, 4): D, 
(1, 0): R, (1, 1): R, (1, 2): U, (1, 3): L, (1, 4): L, 
(2, 0): U, (2, 1): U, 
(3, 0): D, (3, 1): L, (3, 3): R, (3, 4): D, 
(4, 0): R, (4, 1): L, (4, 2): R, (4, 3): R, (4, 4): L, 

Number of Visited States: 21


# Problem 3

In [5]:
import numpy as np
from gridworld_lec4 import GridWorld
from gw_lec4 import GridWorld5x5

# Build the 5x5 GridWorld environment
env = GridWorld5x5(p=0.5)

# Monte Carlo settings
episodes = 10000  # number of sampled episodes
gamma = 0.9  # discount factor
epsilon = 0.1  # exploration mass of the epsilon-soft policy

# Action-value table (all zeros) and per-(state, action) lists of returns
Q = {s: dict.fromkeys(env.actions(s), 0) for s in env.all_states()}
returns = {s: {a: [] for a in env.actions(s)} for s in env.all_states()}

# Initialize an epsilon-soft policy
def epsilon_soft_policy(state):
    """Sample an action for ``state`` from the epsilon-soft policy derived from Q.

    Every legal action gets probability ``epsilon / n``; the action with
    the highest current Q-value additionally gets ``1 - epsilon``.

    Returns:
        The sampled action, or ``None`` when ``env.actions(state)`` is empty.
    """
    legal_actions = env.actions(state)
    if not legal_actions:
        return None

    greedy_action = max(Q[state], key=Q[state].get)
    exploration_share = epsilon / len(legal_actions)

    # Start every action at the exploration share, then top up the greedy one.
    action_probs = dict.fromkeys(legal_actions, exploration_share)
    action_probs[greedy_action] = action_probs[greedy_action] + (1 - epsilon)

    candidates = list(action_probs)
    weights = [action_probs[a] for a in candidates]
    return np.random.choice(candidates, p=weights)

# States from which at least one action was taken
visited_states = set()

# On-policy first-visit MC control with an epsilon-soft policy
for episode in range(episodes):
    # Start from a uniformly random grid coordinate.
    state = (np.random.randint(env.rows), np.random.randint(env.columns))
    env.set_state(state)
    episode_sequence = []

    # Generate the episode as a list of (state, action, reward) triples.
    while not env.game_over():
        action = epsilon_soft_policy(state)
        if action is None:
            # No action available from this state; end the episode.
            break
        next_state, reward = env.move(action)
        episode_sequence.append((state, action, reward))
        visited_states.add(state)
        state = next_state

    # Earliest time step of every (state, action) pair in this episode.
    # A backward scan with a "seen" set would select the LAST occurrence,
    # which is last-visit MC, so first indices are computed up front.
    first_visit_step = {}
    for step, (s, a, _) in enumerate(episode_sequence):
        first_visit_step.setdefault((s, a), step)

    # Backward pass: accumulate the discounted return and update Q only at
    # the first visit of each pair.
    G = 0
    for step in range(len(episode_sequence) - 1, -1, -1):
        state, action, reward = episode_sequence[step]
        G = reward + gamma * G
        if first_visit_step[(state, action)] == step:
            returns[state][action].append(G)
            Q[state][action] = np.mean(returns[state][action])
            # The policy is not stored: epsilon_soft_policy() derives the
            # action probabilities from Q on every call, so updating Q is
            # the policy improvement step.

# Print every learned action value, grouped by state
print("Learned Q-values:")
for state, action_values in Q.items():
    for action, q_value in action_values.items():
        print(f"Q({state}, {action}): {q_value:.2f}")

# Number of distinct states from which an action was taken
print("\nNumber of Unique States Visited:", len(visited_states))


Learned Q-values:
Q((0, 1), R): 21.47
Q((0, 1), L): -6.01
Q((0, 1), D): -4.28
Q((4, 0), R): 54.26
Q((4, 0), U): 49.59
Q((1, 2), R): -8.18
Q((1, 2), L): 12.29
Q((1, 2), U): -8.07
Q((3, 4), L): 78.47
Q((3, 4), D): 100.00
Q((0, 4), L): -8.65
Q((0, 4), D): -9.97
Q((4, 3), R): 100.00
Q((4, 3), L): 70.66
Q((4, 3), U): 90.20
Q((3, 1), L): 37.83
Q((3, 1), D): 55.75
Q((3, 1), U): 24.14
Q((2, 1), L): -100.00
Q((2, 1), D): 43.72
Q((2, 1), U): -8.56
Q((0, 2), R): -7.20
Q((0, 2), L): 14.46
Q((0, 2), D): -7.05
Q((1, 0), R): -7.41
Q((1, 0), D): -100.00
Q((1, 0), U): -9.00
Q((1, 3), R): -9.83
Q((1, 3), L): -10.95
Q((1, 3), U): -8.27
Q((4, 1), R): 78.28
Q((4, 1), L): 53.72
Q((4, 1), U): 48.03
Q((4, 4), L): 0.00
Q((4, 4), U): 0.00
Q((0, 0), R): -5.41
Q((0, 0), D): -11.30
Q((1, 1), R): -6.49
Q((1, 1), L): 13.31
Q((1, 1), D): 15.93
Q((1, 1), U): 16.35
Q((0, 3), R): -9.67
Q((0, 3), L): -7.98
Q((0, 3), D): -9.50
Q((2, 0), R): 0.00
Q((2, 0), D): 0.00
Q((2, 0), U): 0.00
Q((4, 2), R): 88.37
Q((4, 2), L): 55.99

# Problem 4

In [6]:
import numpy as np
from gridworld_lec4 import GridWorld
from gw_lec4 import GridWorld5x5

# Build the 5x5 GridWorld environment
env = GridWorld5x5(p=0.5)

# Monte Carlo settings
episodes = 10000  # number of sampled episodes
gamma = 0.9  # discount factor

# Action-value table Q and cumulative importance weights C, both all zeros
Q = {s: dict.fromkeys(env.actions(s), 0) for s in env.all_states()}
C = {s: dict.fromkeys(env.actions(s), 0) for s in env.all_states()}

# Arbitrary initial target policy: one random legal action for every state
# that has at least one action available.
target_policy = {}
for s in env.all_states():
    if env.actions(s):
        target_policy[s] = np.random.choice(env.actions(s))

# Define a behavior policy
def behavior_policy(state):
    """Pick a uniformly random legal action for ``state``.

    Returns:
        The sampled action, or ``None`` when ``env.actions(state)`` is empty.
    """
    legal_actions = env.actions(state)
    return np.random.choice(legal_actions) if legal_actions else None

# Off-policy MC control with weighted importance sampling.
# Every step of the episode tail is updated (no first-visit filtering).
for episode in range(episodes):
    # Start from a uniformly random grid coordinate.
    state = (np.random.randint(env.rows), np.random.randint(env.columns))
    env.set_state(state)
    episode_sequence = []

    # Generate the episode with the uniform-random behavior policy.
    while not env.game_over():
        action = behavior_policy(state)
        if action is None:
            break
        next_state, reward = env.move(action)
        episode_sequence.append((state, action, reward))
        state = next_state

    G = 0  # discounted return accumulated from the end of the episode
    W = 1  # importance-sampling weight of the tail processed so far
    for state, action, reward in episode_sequence[::-1]:
        G = reward + gamma * G

        # Weighted incremental average of the returns for this pair.
        C[state][action] += W
        step_size = W / C[state][action]
        Q[state][action] += step_size * (G - Q[state][action])

        # Make the target policy greedy with respect to the updated Q.
        best_action = max(Q[state], key=Q[state].get)
        target_policy[state] = best_action

        # Once the behavior action disagrees with the greedy target action,
        # the weight of all earlier steps is zero, so stop here.
        if action != best_action:
            break

        # The target policy is deterministic (probability 1) and the
        # behavior policy is uniform, so the ratio is 1 / b(a|s).
        behavior_prob = 1 / len(env.actions(state))
        W *= 1 / behavior_prob

# Print every learned action value, grouped by state
print("Learned Q-values:")
for state, action_values in Q.items():
    for action, q_value in action_values.items():
        print(f"Q({state}, {action}): {q_value:.2f}")

# Print the greedy target policy
print("\nLearned Target Policy:")
for state, chosen_action in target_policy.items():
    print(f"State {state}: {chosen_action}")


Learned Q-values:
Q((0, 1), R): 0.00
Q((0, 1), L): 0.00
Q((0, 1), D): 0.00
Q((4, 0), R): 24.57
Q((4, 0), U): 32.95
Q((1, 2), R): 0.00
Q((1, 2), L): 0.00
Q((1, 2), U): 0.00
Q((3, 4), L): 93.64
Q((3, 4), D): 93.79
Q((0, 4), L): 0.00
Q((0, 4), D): 0.00
Q((4, 3), R): 89.88
Q((4, 3), L): 94.10
Q((4, 3), U): 76.50
Q((3, 1), L): 0.00
Q((3, 1), D): 46.77
Q((3, 1), U): 42.61
Q((2, 1), L): -100.00
Q((2, 1), D): 49.18
Q((2, 1), U): -100.00
Q((0, 2), R): 0.00
Q((0, 2), L): 0.00
Q((0, 2), D): 0.00
Q((1, 0), R): 0.00
Q((1, 0), D): -100.00
Q((1, 0), U): -100.00
Q((1, 3), R): 0.00
Q((1, 3), L): 0.00
Q((1, 3), U): 0.00
Q((4, 1), R): 40.08
Q((4, 1), L): 51.20
Q((4, 1), U): 71.29
Q((4, 4), L): 0.00
Q((4, 4), U): 0.00
Q((0, 0), R): 0.00
Q((0, 0), D): 0.00
Q((1, 1), R): 0.00
Q((1, 1), L): 0.00
Q((1, 1), D): 0.00
Q((1, 1), U): 0.00
Q((0, 3), R): 0.00
Q((0, 3), L): 0.00
Q((0, 3), D): 0.00
Q((2, 0), R): 0.00
Q((2, 0), D): 0.00
Q((2, 0), U): 0.00
Q((4, 2), R): 84.03
Q((4, 2), L): 77.68
Q((3, 0), R): 28.35
Q((3

# Problem 6

In [7]:
import gymnasium as gym
import numpy as np

# Create the CartPole environment
env = gym.make("CartPole-v1")

# Discretisation grid: `num_bins` evenly spaced edges per observation
# feature. np.digitize against N edges yields indices 0..N, so each feature
# can fall into up to num_bins + 1 buckets.
num_bins = 10
feature_ranges = [
    (-4.8, 4.8),      # Cart position
    (-4, 4),          # Cart velocity
    (-0.418, 0.418),  # Pole angle
    (-4, 4),          # Pole angular velocity
]
state_bins = [np.linspace(low, high, num_bins) for low, high in feature_ranges]

# Action values and observed returns, filled lazily as states are visited
Q = {}
returns = {}

# Greedy policy, filled as Q is updated
policy = {}

# Function to discretize continuous states
def discretize_state(state, bins=None):
    """Map a continuous observation to a hashable tuple of bucket indices.

    Args:
        state: Sequence of numeric features, one per observation dimension.
        bins: Optional sequence of bin-edge sequences, one per feature.
            Defaults to the notebook-level ``state_bins``.

    Returns:
        Tuple of plain ``int`` indices, one per feature, as produced by
        ``np.digitize`` (0 means below the first edge, ``len(edges)`` means
        at or above the last edge).

    Raises:
        IndexError: If ``state`` has more features than there are edge
            sequences.
    """
    edges_per_feature = state_bins if bins is None else bins
    # int() turns NumPy integer scalars into built-in ints so the tuple is a
    # clean dictionary key.
    return tuple(
        int(np.digitize(value, edges_per_feature[i]))
        for i, value in enumerate(state)
    )



In [8]:
# Hyper-parameters for the Monte Carlo control run below
gamma = 0.9  # Discount factor applied to future rewards when computing returns
epsilon = 0.1  # Probability of taking a random action instead of the greedy one
episodes = 100  # Number of training episodes to sample

# Function to choose an action using epsilon-greedy policy
def epsilon_greedy_action(state):
    """Choose an action for a discretised ``state`` epsilon-greedily.

    With probability ``epsilon`` a random action is sampled from the
    environment's action space; otherwise the index of the largest Q-value
    is returned. States not yet in ``Q`` are treated as ``[0, 0]``.
    """
    explore = np.random.rand() < epsilon
    if explore:
        return env.action_space.sample()

    action_values = Q.get(state, [0, 0])
    return np.argmax(action_values)

# First-visit Monte Carlo control loop
for episode in range(episodes):
    # env.reset() returns (observation, info); only the observation is used.
    state = env.reset()[0]
    state = discretize_state(state)
    episode_sequence = []

    # Generate an episode as a list of (state, action, reward) triples.
    done = False
    while not done:
        action = epsilon_greedy_action(state)
        next_state, reward, terminated, truncated, _ = env.step(action)
        # An episode ends on termination OR truncation (time limit); both
        # flags must be honoured.
        done = terminated or truncated
        next_state = discretize_state(next_state)
        episode_sequence.append((state, action, reward))
        state = next_state

    # Earliest time step of every (state, action) pair in this episode.
    # A backward scan with a "seen" set would select the LAST occurrence,
    # which is last-visit MC, so first indices are computed up front.
    first_visit_step = {}
    for step, (s, a, _) in enumerate(episode_sequence):
        first_visit_step.setdefault((s, a), step)

    # Backward pass: accumulate the discounted return and update Q only at
    # the first visit of each pair.
    G = 0
    for step in range(len(episode_sequence) - 1, -1, -1):
        state, action, reward = episode_sequence[step]
        G = reward + gamma * G
        if first_visit_step[(state, action)] == step:
            returns.setdefault(state, {}).setdefault(action, []).append(G)

            # Create the Q row on first sight of the state (two actions).
            Q.setdefault(state, [0, 0])
            Q[state][action] = np.mean(returns[state][action])

            # Keep the stored policy greedy with respect to the updated Q.
            policy[state] = np.argmax(Q[state])


In [9]:
# Run the learned policy for one episode and report the total reward
state = env.reset()[0]
state = discretize_state(state)
done = False
total_reward = 0

while not done:
    # No env.render() call: this notebook creates the environment without a
    # render_mode, so render() would only emit a warning. Pass
    # render_mode="human" to gym.make to see the episode.
    if state in policy:
        action = policy[state]
    else:
        # Unseen state: fall back to a random action, sampled only when needed.
        action = env.action_space.sample()

    next_state, reward, terminated, truncated, _ = env.step(action)
    # Stop on termination OR truncation (time limit).
    done = terminated or truncated
    state = discretize_state(next_state)
    total_reward += reward

print("Total Reward:", total_reward)
env.close()


Total Reward: 108.0


  gym.logger.warn(


# Problem 7

In [10]:
import gymnasium as gym
import pygame
import numpy as np

# Initialize the CarRacing-v3 environment with on-screen rendering
env = gym.make("CarRacing-v3", render_mode="human")

# Initialize pygame and open a 400x300 window titled for keyboard control.
# NOTE(review): the environment is also created with render_mode="human";
# confirm how this window interacts with the environment's own display.
pygame.init()
window = pygame.display.set_mode((400, 300))
pygame.display.set_caption("CarRacing Keyboard Control")

# Current control vector [steering, gas, brake]; starts with all controls
# released and is rewritten by handle_keyboard_events() on every frame.
action = np.array([0.0, 0.0, 0.0])

# Function to handle keyboard events
def handle_keyboard_events():
    """Rebuild the global ``action`` vector from the currently held arrow keys.

    LEFT/RIGHT set steering to -1.0/1.0 (RIGHT wins when both are held),
    UP sets gas to 1.0 and DOWN sets brake to 1.0; released controls are 0.0.
    """
    global action
    pressed = pygame.key.get_pressed()

    steering = 0.0
    if pressed[pygame.K_LEFT]:
        steering = -1.0  # Full left steering
    if pressed[pygame.K_RIGHT]:
        steering = 1.0  # Full right steering

    gas = 1.0 if pressed[pygame.K_UP] else 0.0  # Full gas while UP is held
    brake = 1.0 if pressed[pygame.K_DOWN] else 0.0  # Full brake while DOWN is held

    action = np.array([steering, gas, brake])

# Main loop: drive the car from the keyboard until the episode ends or the
# window is closed.
done = False
# env.reset() returns (observation, info).
state, info = env.reset()

try:
    while not done:
        # Drain the event queue. A QUIT request must leave the loop right
        # away; merely setting `done` would be overwritten by the step
        # result below.
        if any(event.type == pygame.QUIT for event in pygame.event.get()):
            break

        handle_keyboard_events()  # Update action based on keyboard input

        # Take a step in the environment
        _, _, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

        # Refresh the pygame display
        pygame.display.flip()
finally:
    # Release the environment and pygame even if a step raises.
    env.close()
    pygame.quit()
