In [3]:
import gym
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, clear_output
import ipywidgets as widgets

In [4]:
# Initialize the environment.
# This single CartPole-v1 instance is shared by the training loop and by the
# evaluation rollout in the cells below.
env = gym.make("CartPole-v1")


In [5]:
# Hyperparameters for the tabular Q-learning run below
alpha = 0.1     # Learning rate: step size applied to the TD error
gamma = 0.99    # Discount factor applied to the best next-state Q-value
epsilon = 1.0   # Exploration rate: probability of taking a random action (initial value)
epsilon_decay = 0.995  # Multiplier applied to epsilon after each training episode
epsilon_min = 0.01  # Epsilon is only decayed while it is above this value
num_episodes = 500  # Number of training episodes (kept small for faster execution)
num_bins = 24         # Bin edges per state dimension; also the Q-table size along each state axis


In [6]:
# Discretize the state space into bins
def discretize_state(state, n_bins=None):
    """Map a continuous 4-component observation to a tuple of bin indices.

    Each component is bucketed against ``n_bins`` evenly spaced edges over a
    fixed range, and the resulting index is clamped to ``[0, n_bins - 1]`` so
    it is always a valid, non-negative Q-table index.

    Args:
        state: Sequence of four numeric values. The ranges used, in order,
            are [-4.8, 4.8], [-5, 5], [-0.418, 0.418] and [-5, 5].
        n_bins: Number of bin edges per dimension. Defaults to the
            module-level ``num_bins`` when omitted.

    Returns:
        Tuple of four Python ints, one bin index per state component.
    """
    if n_bins is None:
        n_bins = num_bins

    bounds = (
        (-4.8, 4.8),
        (-5, 5),
        (-0.418, 0.418),
        (-5, 5),
    )

    indices = []
    for value, (low, high) in zip(state, bounds):
        edges = np.linspace(low, high, n_bins)
        raw_index = np.digitize(value, edges) - 1
        # np.digitize returns 0 for values below the first edge, so the
        # subtraction yields -1; without clamping, that negative index would
        # wrap around to the LAST bin and alias with above-range values.
        indices.append(int(np.clip(raw_index, 0, n_bins - 1)))
    return tuple(indices)

In [7]:
# Initialize the Q-table with zeros.
# Shape: one axis of length num_bins per discretized state component (four of
# them), followed by a final axis with one entry per action in env.action_space.
q_table = np.zeros((num_bins, num_bins, num_bins, num_bins, env.action_space.n))

In [8]:
# Q-Learning algorithm
# Trains q_table in place for num_episodes episodes using an epsilon-greedy
# behaviour policy. Mutates the globals q_table and epsilon.
for episode in range(num_episodes):
    state = discretize_state(env.reset())
    done = False
    while not done:
        # Epsilon-greedy action selection
        if np.random.random() < epsilon:
            action = env.action_space.sample()  # Explore
        else:
            action = np.argmax(q_table[state])  # Exploit

        # Take action, observe the reward and next state
        next_state, reward, done, _ = env.step(action)
        next_state = discretize_state(next_state)

        # Update Q-value using the Q-learning update rule.
        # (1 - done) removes the bootstrap term on the final transition.
        # NOTE(review): if the environment also ends episodes on a step limit,
        # that case is treated as terminal here too - confirm this is intended.
        best_next_action = np.argmax(q_table[next_state])
        td_target = reward + gamma * q_table[next_state][best_next_action] * (1 - done)
        td_error = td_target - q_table[state][action]
        q_table[state][action] += alpha * td_error

        state = next_state

    # Decay epsilon to reduce exploration over time, never dropping below
    # epsilon_min (a plain "if epsilon > epsilon_min" guard can overshoot the
    # floor on the last multiplication).
    epsilon = max(epsilon_min, epsilon * epsilon_decay)

# The environment is intentionally left open here: the evaluation code that
# follows still calls env.reset(), env.render() and env.step(), and closes the
# environment itself once the rollout is finished.

In [9]:
# Test the agent: start a fresh episode for the evaluation rollout.
# `state` and `done` set here are consumed by the frame-capture loop in the
# next cell; the captured frames are then browsed with a slider widget.
state = discretize_state(env.reset())
done = False

print("Testing the trained agent...")

Testing the trained agent...


In [10]:
# Capture frames for display.
# Runs one greedy episode, recording an RGB image before every step.
# Relies on `state` and `done` as initialised by the previous cell.
frames = []  # List to store frames
while not done:
    frame = env.render(mode='rgb_array')  # Render the frame as an RGB array
    frames.append(frame)  # Add the frame to the list
    action = np.argmax(q_table[state])  # Always exploit the best action
    next_state, reward, done, _ = env.step(action)
    state = discretize_state(next_state)  # Re-discretize so the next lookup indexes q_table

# The rollout is finished, so the environment is closed here.
env.close()

In [11]:
# Create a slider widget for frame navigation.
# The slider ranges over the valid indices of `frames` (0 .. len(frames)-1);
# `output` is the widget area that update_frame draws into.
slider = widgets.IntSlider(value=0, min=0, max=len(frames)-1, step=1, description='Frame:')
output = widgets.Output()

# Function to update the displayed frame based on slider value
def update_frame(change):
    """Redraw the output widget with the frame selected by the slider.

    Args:
        change: Mapping whose ``'new'`` entry is the index into the global
            ``frames`` list (the shape of a slider value-change event).
    """
    with output:
        # Replace whatever was drawn previously instead of stacking figures
        clear_output(wait=True)
        selected_image = frames[change['new']]
        axes = plt.gca()
        axes.imshow(selected_image)
        axes.axis('off')  # Frames are pictures, not plots: hide ticks and spines
        plt.show()

# Call update_frame every time the slider's `value` trait changes
slider.observe(update_frame, names='value')

In [12]:
# Display slider and output area for frames.
# The output area stays empty until update_frame has been called at least once.
display(slider, output)

IntSlider(value=0, description='Frame:', max=15)

Output()

In [None]:
# Display the first frame initially.
# update_frame only reads the 'new' key, so a minimal dict stands in for a
# real slider change event.
update_frame({'new': 0})

In [None]:
# Display slider and output area for frames.
# NOTE(review): this repeats the earlier display(slider, output) cell and shows
# the same widget objects a second time - confirm whether both are needed.
display(slider, output)

IntSlider(value=16, description='Frame:', max=21)

Output()

In [2]:
import gym
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, clear_output
import ipywidgets as widgets

# Initialize the environment.
# This single CartPole-v1 instance is shared by the training loop and by the
# evaluation rollout further down in this cell.
env = gym.make("CartPole-v1")

# Hyperparameters for the tabular Q-learning run below
alpha = 0.1     # Learning rate: step size applied to the TD error
gamma = 0.99    # Discount factor applied to the best next-state Q-value
epsilon = 1.0   # Exploration rate: probability of taking a random action (initial value)
epsilon_decay = 0.995  # Multiplier applied to epsilon after each training episode
epsilon_min = 0.01  # Epsilon is only decayed while it is above this value
num_episodes = 500  # Number of training episodes (kept small for faster execution)
num_bins = 24         # Bin edges per state dimension; also the Q-table size along each state axis

# Discretize the state space into bins
def discretize_state(state, n_bins=None):
    """Map a continuous 4-component observation to a tuple of bin indices.

    Each component is bucketed against ``n_bins`` evenly spaced edges over a
    fixed range, and the resulting index is clamped to ``[0, n_bins - 1]`` so
    it is always a valid, non-negative Q-table index.

    Args:
        state: Sequence of four numeric values. The ranges used, in order,
            are [-4.8, 4.8], [-5, 5], [-0.418, 0.418] and [-5, 5].
        n_bins: Number of bin edges per dimension. Defaults to the
            module-level ``num_bins`` when omitted.

    Returns:
        Tuple of four Python ints, one bin index per state component.
    """
    if n_bins is None:
        n_bins = num_bins

    bounds = (
        (-4.8, 4.8),
        (-5, 5),
        (-0.418, 0.418),
        (-5, 5),
    )

    indices = []
    for value, (low, high) in zip(state, bounds):
        edges = np.linspace(low, high, n_bins)
        raw_index = np.digitize(value, edges) - 1
        # np.digitize returns 0 for values below the first edge, so the
        # subtraction yields -1; without clamping, that negative index would
        # wrap around to the LAST bin and alias with above-range values.
        indices.append(int(np.clip(raw_index, 0, n_bins - 1)))
    return tuple(indices)

# Initialize the Q-table with zeros.
# Shape: one axis of length num_bins per discretized state component (four of
# them), followed by a final axis with one entry per action in env.action_space.
q_table = np.zeros((num_bins, num_bins, num_bins, num_bins, env.action_space.n))

# Q-Learning algorithm
# Trains q_table in place for num_episodes episodes using an epsilon-greedy
# behaviour policy. Mutates the globals q_table and epsilon.
for episode in range(num_episodes):
    state = discretize_state(env.reset())
    done = False
    while not done:
        # Epsilon-greedy action selection
        if np.random.random() < epsilon:
            action = env.action_space.sample()  # Explore
        else:
            action = np.argmax(q_table[state])  # Exploit

        # Take action, observe the reward and next state
        next_state, reward, done, _ = env.step(action)
        next_state = discretize_state(next_state)

        # Update Q-value using the Q-learning update rule.
        # (1 - done) removes the bootstrap term on the final transition.
        # NOTE(review): if the environment also ends episodes on a step limit,
        # that case is treated as terminal here too - confirm this is intended.
        best_next_action = np.argmax(q_table[next_state])
        td_target = reward + gamma * q_table[next_state][best_next_action] * (1 - done)
        td_error = td_target - q_table[state][action]
        q_table[state][action] += alpha * td_error

        state = next_state

    # Decay epsilon to reduce exploration over time, never dropping below
    # epsilon_min (a plain "if epsilon > epsilon_min" guard can overshoot the
    # floor on the last multiplication).
    epsilon = max(epsilon_min, epsilon * epsilon_decay)

# The environment is intentionally left open here: the evaluation code that
# follows still calls env.reset(), env.render() and env.step(), and closes the
# environment itself once the rollout is finished.

# Test the agent and capture frames for display.
# Start a fresh episode; `state` and `done` drive the capture loop below.
state = discretize_state(env.reset())
done = False

print("Testing the trained agent...")

# Capture frames for display.
# Runs one greedy episode, recording an RGB image before every step.
frames = []  # List to store frames
while not done:
    frame = env.render(mode='rgb_array')  # Render the frame as an RGB array
    frames.append(frame)  # Add the frame to the list
    action = np.argmax(q_table[state])  # Always exploit the best action
    next_state, reward, done, _ = env.step(action)
    state = discretize_state(next_state)  # Re-discretize so the next lookup indexes q_table

# The rollout is finished, so the environment is closed here.
env.close()

# Create a slider widget for frame navigation.
# The slider ranges over the valid indices of `frames` (0 .. len(frames)-1);
# `output` is the widget area that update_frame draws into.
slider = widgets.IntSlider(value=0, min=0, max=len(frames)-1, step=1, description='Frame:')
output = widgets.Output()

# Function to update the displayed frame based on slider value and show its position in the rollout
def update_frame(change):
    """Redraw the output widget with the selected frame and a position title.

    ``frames`` is filled by a single evaluation rollout, so there is no
    per-episode boundary to recover from a frame index. The title therefore
    reports the step within that rollout instead of a derived "episode
    number" (the previous ``len(frames) // num_episodes`` estimate was either
    zero, hiding the title, or mislabelled individual steps as episodes).

    Args:
        change: Mapping whose ``'new'`` entry is the index into the global
            ``frames`` list (the shape of a slider value-change event).
    """
    with output:
        clear_output(wait=True)  # Clear previous output to avoid cluttering

        idx_current_frame = change['new']

        plt.imshow(frames[idx_current_frame])  # Display current frame based on slider value
        plt.axis('off')  # Turn off axis

        # Steps are shown 1-based so the last frame reads "N/N"
        plt.title(f'Evaluation step: {idx_current_frame + 1}/{len(frames)}')

        plt.show()

# Call update_frame every time the slider's `value` trait changes
slider.observe(update_frame, names='value')

# Display slider and output area for frames
display(slider, output)

# Display the first frame initially.
# update_frame only reads the 'new' key, so a minimal dict stands in for a
# real slider change event.
update_frame({'new': 0})

Testing the trained agent...


IntSlider(value=0, description='Frame:', max=71)

Output()