In [None]:
! pip install gymnasium

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gymnasium
  Downloading gymnasium-0.28.1-py3-none-any.whl (925 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m925.5/925.5 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
Collecting farama-notifications>=0.0.1
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Collecting jax-jumpy>=1.0.0
  Downloading jax_jumpy-1.0.0-py3-none-any.whl (20 kB)
Installing collected packages: farama-notifications, jax-jumpy, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-0.28.1 jax-jumpy-1.0.0


In [None]:
import gymnasium as gym
import numpy as np
import time # to get the time

# Create the MountainCar environment. The "rgb_array_list" render mode is what
# the evaluation cell relies on when it passes env.render() to save_video
# (presumably a list of RGB frames collected since the last reset — confirm
# against the Gymnasium docs for the installed version).
env = gym.make('MountainCar-v0',render_mode="rgb_array_list")

In [None]:
# Set hyperparameters
num_episodes = 500  # number of training episodes run by the training loop
alpha = 0.1  # step size; NOTE(review): not used by the return-averaging update in the training cell
gamma = 0.95  # discount factor applied when accumulating the return G
epsilon = 0.1  # probability of taking a random (exploratory) action

# Set Q-table
nA = env.action_space.n  # number of discrete actions
nS = env.observation_space.shape[0]  # number of observation components; not referenced by the visible cells
np.random.seed(3)  # makes the random Q initialisation (and later np.random draws) repeatable
Observation = [20,20]  # bins per observation dimension; must agree with get_discrete_state's n_bins
# One uniformly random value in [0, 1) per (position bin, velocity bin, action).
Q = np.random.uniform(low=0, high=1, size=(Observation + [nA]))
Q.shape

(20, 20, 3)

In [None]:
def get_discrete_state(state, n_bins=(20, 20), bounds=((-1.2, 0.5), (-0.07, 0.07))):
    """
    Convert a continuous state into a tuple of bin indices.

    Each state variable is mapped onto ``n_bins[i]`` equal-width bins that span
    ``bounds[i]``. Values outside the bounds (and a value exactly on the upper
    bound) are clamped to the first/last bin, so the result is always a valid
    index into a table with shape ``n_bins``.

    Parameters:
        state (sequence of float): The current state of the environment. Only
            the first ``len(bounds)`` entries are read.
        n_bins (tuple): The number of bins to use for each state variable.
        bounds (sequence of (low, high)): Value range covered by the bins for
            each state variable. Defaults to the MountainCar ranges:
            car position in [-1.2, 0.5] and car velocity in [-0.07, 0.07].

    Returns:
        tuple: The discrete state representation (one int per state variable).

    Raises:
        ValueError: If ``n_bins`` and ``bounds`` have different lengths.
    """
    if len(n_bins) != len(bounds):
        raise ValueError("n_bins and bounds must have the same length")

    indices = []
    for i, ((low, high), bins) in enumerate(zip(bounds, n_bins)):
        bin_width = (high - low) / bins
        raw_index = int((state[i] - low) / bin_width)
        # Clamp so out-of-range observations still index inside the Q-table.
        indices.append(min(max(raw_index, 0), bins - 1))

    return tuple(indices)

In [None]:
# Epsilon-greedy action selection
def epsilon_greedy(Q, state, nA, epsilon):
    """
    Pick an action for ``state`` using an epsilon-greedy rule.

    A uniform random number is drawn first; if it exceeds ``epsilon`` the
    action with the largest value in ``Q[state]`` is returned, otherwise an
    action index is sampled from ``range(nA)`` via ``np.random.choice``.

    Parameters:
        Q (np.ndarray): Action-value table indexed as ``Q[state]``.
        state: Index (int or tuple of ints) selecting a row of action values.
        nA (int): Number of available actions.
        epsilon (float): Exploration threshold compared against the draw.

    Returns:
        The chosen action index.
    """
    exploit = np.random.random() > epsilon
    return np.argmax(Q[state]) if exploit else np.random.choice(nA)

In [None]:
# Every-visit Monte Carlo control with an epsilon-greedy behaviour policy.
total_reward = 0
total_time = 0
episode_rewards = []
episode_states = []
episode_actions = []
# Per-(state, action) visit counts, so each Q entry becomes the running mean of
# the returns observed for that pair. A step size shared by all pairs (such as
# 1 / episode number) does not give a mean: it overwrites entries in episode 1
# and barely moves pairs that are first visited late in training.
visit_counts = np.zeros(Q.shape, dtype=int)

for i_episode in range(1, num_episodes + 1):
    episode_reward = []
    episode_state = []
    episode_action = []

    t0 = time.perf_counter()  # start of the episode
    # Seed only the first reset: passing the same seed every time re-initialises
    # the environment RNG and makes every episode start from the same state.
    observation, info = env.reset(seed=32 if i_episode == 1 else None)
    d_state = get_discrete_state(observation)
    action = epsilon_greedy(Q, d_state, nA, epsilon)
    terminated, truncated = False, False

    while not (terminated or truncated):
        observation, reward, terminated, truncated, info = env.step(action)
        # Reward shaping: replace the step reward with a bonus once the car
        # position reaches 0.5.
        if observation[0] >= 0.5:
            reward = 50
        episode_state.append(d_state)
        episode_action.append(action)
        episode_reward.append(reward)
        d_state = get_discrete_state(observation)
        action = epsilon_greedy(Q, d_state, nA, epsilon)

    # Keep the full history of every episode available to later cells.
    episode_rewards.append(episode_reward)
    episode_states.append(episode_state)
    episode_actions.append(episode_action)

    # Walk the episode backwards, accumulating the discounted return G and
    # folding it into the running mean of each visited (state, action) pair.
    G = 0
    for t in reversed(range(len(episode_reward))):
        G = gamma * G + episode_reward[t]
        state_action = tuple(episode_state[t]) + (episode_action[t],)
        visit_counts[state_action] += 1
        Q[state_action] += (G - Q[state_action]) / visit_counts[state_action]

    t1 = time.perf_counter()  # episode has finished
    episode_time = t1 - t0  # wall-clock duration of the episode, including the Q update
    total_time += episode_time
    total_reward += sum(episode_reward)  # (shaped) reward collected this episode
    print(f"\rEpisode {i_episode}/{num_episodes}", end="")


Episode 500/500

In [None]:
# Per-episode averages of the totals accumulated by the training loop.
mean_time = total_time / num_episodes
mean_reward = total_reward / num_episodes

print(f"Mean Reward: {mean_reward!s}")
print(f"Time Average: {mean_time!s}")

Mean Reward: -200.0
Time Average: 0.8528874793052673


In [None]:
from gymnasium.utils.save_video import save_video

# Evaluate the learned policy with a single greedy rollout (no exploration):
# at every step take the action with the highest Q-value for the current
# discretised state until the episode terminates or is truncated.
observation, info = env.reset(seed=32)
state = get_discrete_state(observation)
terminated, truncated = False, False
while not (terminated or truncated):
    action = np.argmax(Q[state])
    observation, reward, terminated, truncated, info = env.step(action)
    state = get_discrete_state(observation)

# Write the rendered rollout to the "videos" folder as episode 0.
save_video(
    env.render(),
    "videos",
    fps=30,
    episode_index=0,
)

# NOTE: call env.close() once no later cell needs the environment.


Moviepy - Building video /content/videos/rl-video-episode-0.mp4.
Moviepy - Writing video /content/videos/rl-video-episode-0.mp4





Moviepy - Done !
Moviepy - video ready /content/videos/rl-video-episode-0.mp4
