# Learn Reinforcement Learning in Python: Step-by-step Tutorial

### Setup

```
$ pip install "gymnasium[atari]"
$ pip install autorom[accept-rom-license]
$ AutoROM --accept-license
```

### Exploring the Gymnasium environments

In [77]:
import gymnasium as gym

env = gym.make("ALE/Breakout-v5")

In [78]:
gym.__version__

'0.29.1'

In [79]:
env.action_space

Discrete(4)

In [80]:
env.action_space.sample()

2

In [81]:
env.observation_space

Box(0, 255, (210, 160, 3), uint8)

In [82]:
state = env.observation_space.sample()

state.shape

(210, 160, 3)

In [83]:
state, info = env.reset()

state[:2, :2, :2]

array([[[0, 0],
        [0, 0]],

       [[0, 0],
        [0, 0]]], dtype=uint8)

In [84]:
info

{'lives': 5, 'episode_frame_number': 0, 'frame_number': 0}

### State, action, reward workflow

In [85]:
observation, reward, terminated, truncated, info = env.step(1)

In [86]:
info

{'lives': 5, 'episode_frame_number': 4, 'frame_number': 4}

In [87]:
env = gym.make("ALE/Breakout-v5", render_mode="human")
observation, info = env.reset()

for _ in range(100):
    action = (
        env.action_space.sample()
    )  # agent policy that uses the observation and info
    observation, reward, terminated, truncated, info = env.step(action)

    if terminated or truncated:
        observation, info = env.reset()

env.close()

### Animating the agent's learning process

In [None]:
env = gym.make("ALE/Breakout-v5", render_mode="rgb_array")
observation, info = env.reset()

while not done:
    action = env.action_space.sample()
    observation, reward, terminated, truncated, info = env.step(action)

    # Put each rendered frame into dict for animation
    frames.append(
        {
            "frame": env.render(),
            "state": observation,
            "action": action,
            "reward": reward,
        }
    )

    epochs += 1
    if epochs == 1000:
        break

print("Timesteps taken: {}".format(epochs))
print("Penalties incurred: {}".format(penalties))

In [88]:
epochs = 0

frames = []  # for animation

env = gym.make("ALE/Breakout-v5", render_mode="rgb_array")
observation, info = env.reset()

while not done:
    action = env.action_space.sample()
    observation, reward, terminated, truncated, info = env.step(action)

    # Put each rendered frame into dict for animation
    frames.append(
        {
            "frame": env.render(),
            "state": observation,
            "action": action,
            "reward": reward,
        }
    )

    epochs += 1
    if epochs == 1000:
        break

print("Timesteps taken: {}".format(epochs))
print("Penalties incurred: {}".format(penalties))

  logger.warn(



Timesteps taken: 1000
Penalties incurred: 0


In [None]:
pip install moviepy

In [14]:
from moviepy.editor import ImageSequenceClip


def create_gif(frames: dict, filename, fps=100):
    """
    Creates a GIF animation from a list of RGBA NumPy arrays.

    Args:
        frames: A list of RGBA NumPy arrays representing the animation frames.
        filename: The output filename for the GIF animation.
        fps: The frames per second of the animation (default: 10).
    """
    rgba_frames = [frame["frame"] for frame in frames]

    clip = ImageSequenceClip(rgba_frames, fps=fps)
    clip.write_gif(filename, fps=fps)


# Example usage
create_gif(frames, "animation.gif")

MoviePy - Building file animation.gif with imageio.


                                                                        

### Creating a Q-table

In [16]:
env.observation_space.shape

(210, 160, 3)

In [22]:
env.action_space.n

4

In [23]:
import numpy as np

n_states = env.observation_space.shape[0]
n_actions = env.action_space.n

q_table = np.zeros([n_states, n_actions])

### Running a single episode of Q-learning

In [27]:
def run_qlearning_episode(env, qtable, epsilon=0.1, alpha=0.1, gamma=0.1):
    """
    Runs a single episode of Q-learning on the provided environment.

    Parameters
    ----------
    env :
        The environment to interact with.
    qtable : np.ndarray
        The Q-table to learn and update, with shape (n_states, n_actions).
    epsilon : float, optional
        The exploration rate (default: 0.1).
    alpha : float, optional
        The learning rate (default: 0.1).
    gamma : float, optional
        The discount factor (default: 0.1).

    Returns
    -------
    """
    # Define general variables
    epochs, reward = 0, 0
    done = False

    # Reset the environment
    observation, info = env.reset()

    # Start training
    while not done:
        # Should the agent explore or exploit?
        if np.random.uniform() <= epsilon:
            # Explore the action space if randomness below epsilon
            action = env.action_space.sample()
        else:
            # Perform the action with the highest q-value
            action = np.argmax(q_table[observation])

        # Take action
        next_observation, reward, terminated, truncated, info = env.step(action)

In [76]:
env.reset()

(array([[[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        ...,
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]]], dtype=uint8),
 {'lives': 5, 'episode_frame_number': 0, 'frame_number': 962})

In [110]:
env = gym.make("FrozenLake", is_slippery=True)
num_episodes = 1000
alpha = 0.1
gamma = 1
num_states, num_actions = env.observation_space.n, env.action_space.n
Q = np.zeros((num_states, num_actions))
reward_per_random_episode = []

In [111]:
def update_q_table(state, action, reward, new_state):
    old_value = Q[state, action]
    next_max = max(Q[new_state])
    Q[state, action] = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)

In [113]:
for episode in range(num_episodes):
    state, info = env.reset()
    terminated = False
    episode_reward = 0
    while not terminated:
        # Random action selection
        action = env.action_space.sample()
        # Take action and observe new state and reward
        new_state, reward, terminated, truncated, info = env.step(action)
        # Update Q-table
        update_q_table(state, action, reward, new_state)
        episode_reward += reward
        state = new_state
        reward_per_random_episode.append(episode_reward)

### Article plan

1. Define what reward, state and actions are for the current problem
2. Show how to install gymnasium with cmake and scipy
3. Show how to render the env in both human and rgb_array mode
4. Explain env.reset, step and render methods
5. Pseudo-code for solving the environment without RL:
   - Initialize epochs, penalties, reward and an empty list to store frames
   - Define the `done` variable
   - While not done, get a random action and execute with step
   - Increase or decrease the penalty based on the reward
   - Append the current frame to frames using rgb_array mode of render
   - Increase the number of epochs
6. Pseudo-code to display the frames as a GIF
   - Using imageio, collect all rgb-arrays in frames and put them together as a gif
7. Pseudo-code to solve the environment with Q-learning
   - Define the hyperparameters - alpha, gamma, epsilon
   - Define the q_table with the same dims as the number of states and the number of actions
   - For a large number of epochs:
     - Reset the environment
     - Initialize epochs, penalties and reward with 0 values
     - While not done:
       - Generate a random value to compare with epsilon - exp vs. exploit trade-off
       - if random value smaller than epsilon, choose a new random action, else, find the argmax of q_table for the state - i.e. choose the action that gives the biggest reward
       - Take the action
       - Find the old value for the current state and action
       - Find the max value for the next state
       - Create a new value using the Q-learning formula
       - Update the q_table for the current state and action with new_value

