# MDP and gym

## Evaluation considerations
- We take into account the correctness of the solutions but also their generality and quality of the code
- Comment and discuss on the results of all your exercises (in a cell immediately after the results). You may also state the difficulties encountered, lessons learned and your understanding of the problem and solution
- Clean-up your code before submission, do not leave unnecessary code attempts, or if you deem it important, leave them in a way that it is easily understood and with comments/discussion
- We also value the originality of the solutions, don't hesitate in performing unrequested additional tasks in relation to the exercises


**NOTE**

Do not try to reproduce exactly the results in this notebook. RL training is very noisy and performances of learned policies can vary a lot, try running your trains several times.



In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install gymnasium
!pip install gymnasium[accept-rom-license,toy_text]


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gymnasium
  Downloading gymnasium-0.27.1-py3-none-any.whl (883 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 KB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
Collecting jax-jumpy>=0.2.0
  Downloading jax_jumpy-0.2.0-py3-none-any.whl (11 kB)
Collecting gymnasium-notices>=0.0.1
  Downloading gymnasium_notices-0.0.1-py3-none-any.whl (2.8 kB)
Installing collected packages: gymnasium-notices, jax-jumpy, gymnasium
Successfully installed gymnasium-0.27.1 gymnasium-notices-0.0.1 jax-jumpy-0.2.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pygame==2.1.3.dev8
  Downloading pygame-2.1.3.dev8-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.7/13.7 MB[0m [31m68.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting au

In [3]:
import gymnasium as gym
from IPython.display import clear_output, HTML, display
import matplotlib.pyplot as plt
%matplotlib notebook

In [4]:
#@title Wrapper for recording an environment into a video

from __future__ import annotations

from copy import deepcopy
from typing import Any, SupportsFloat

from gymnasium.core import ActType, ObsType, RenderFrame, WrapperActType, WrapperObsType
from gymnasium.error import DependencyNotInstalled

class RecordVideo(gym.Wrapper):
    """Adapted from https://github.com/Farama-Foundation/Gymnasium/blob/main/gymnasium/experimental/wrappers/rendering.py#L87
    """

    def __init__(self, env):
        """Initialize a :class:`HumanRendering` instance.
        Args:
            env: The environment that is being wrapped
        """
        super().__init__(env)
        assert env.render_mode in [
            "rgb_array",
            "rgb_array_list",
        ], f"Expected env.render_mode to be one of 'rgb_array' or 'rgb_array_list' but got '{env.render_mode}'"

        if "render_fps" not in env.metadata:
            env.metadata["render_fps"] = 24

        assert (
            "render_fps" in env.metadata
        ), "The base environment must specify 'render_fps' to be used with the HumanRendering wrapper"

        if "human" not in self.metadata["render_modes"]:
            self.metadata = deepcopy(self.env.metadata)
            self.metadata["render_modes"].append("human")

        self.artists = []
        self.figure = None
        self.env = env
    @property
    def render_mode(self):
        """Always returns ``'human'``."""
        return "human"

    def step(
        self, action: WrapperActType
    ) -> tuple[WrapperObsType, SupportsFloat, bool, bool, dict]:
        """Perform a step in the base environment and render a frame to the screen."""
        result = super().step(action)
        self._render_frame()
        return result

    def reset(
        self, *, seed: int | None = None, options: dict[str, Any] | None = None
    ) -> tuple[WrapperObsType, dict[str, Any]]:
        """Reset the base environment and render a frame to the screen."""
        result = super().reset(seed=seed, options=options)
        self._render_frame()
        return result

    def video(self):
        """This method renders all frames collected up to now."""
        if self.figure is not None:
            from IPython.display import HTML
            import matplotlib.animation

            animation = matplotlib.animation.ArtistAnimation(self.figure, self.artists, 
                                                             interval=1000//self.metadata["render_fps"],
                                                             blit=True,
                                                             repeat=True,
                                                             repeat_delay=2000)
            return HTML(animation.to_html5_video())

        return None

    def _render_frame(self):
        """Fetch the last frame from the base environment and render it to the screen."""
        try:
            import matplotlib.animation
            import numpy as np
        except ImportError:
            raise DependencyNotInstalled(
                "matplotlib is not installed, run `pip install matplotlib`"
            )
        if self.env.render_mode == "rgb_array_list":
            rgb_arrays = self.env.render()
        elif self.env.render_mode == "rgb_array":
            rgb_arrays = [self.env.render()]
        else:
            raise Exception(
                f"Wrapped environment must have mode 'rgb_array' or 'rgb_array_list', actual render mode: {self.env.render_mode}"
            )

        assert isinstance(rgb_arrays, list)

        for rgb_array in rgb_arrays:
            assert isinstance(rgb_array, np.ndarray)

        if self.figure is None:
            self.figure = plt.figure()
            plt.axis('off')
        
        self.artists.append([plt.imshow(rgb_array) for rgb_array in rgb_arrays])

    def close(self):
        """Close the rendering window."""
        result = self.video()
        super().close()

        return result

In [5]:
import numpy as np
np.set_printoptions(linewidth=100)

Let's render soe steps to show how to use the `RecordVideo` class.

In [6]:
env = RecordVideo(gym.make("FrozenLake-v1", render_mode="rgb_array"))
env.reset()                    

for _ in range(100):
    observation, reward, terminated, truncated, info = env.step(env.action_space.sample())

    done = terminated or truncated

    if done:
        env.reset()

display(env.video())

<IPython.core.display.Javascript object>

This is only needed when we want to render. In many situations of this lab we will perform many steps without the need to render, we will avoid recording a video, since doing so requires a lot of RAM (to save all the RGB frames) and we may run out of memory.

To avoid this do not wrap the environment on a `RecordVideo` class and set keyword argument `render_mode=None` when making the environment.

In [7]:
env = gym.make("FrozenLake-v1", render_mode=None)
env.reset()                    

for _ in range(100):
    observation, reward, terminated, truncated, info = env.step(env.action_space.sample())

    done = terminated or truncated

    if done:
        env.reset()

In this lab we will be estimating value functions. Value functions of an MDP are conditioned on the policy used.

A policy is the probability distribution over actions. We will restrict ourselves to discrete action spaces.

We will start by using a uniform policy that chooses with equal chances between all actions.

## Exercise : A uniform policy

Create a `policy_uniform` function that returns the probability of each possible action given an environment and state.

We will also create a `sample_multinomial` function that facilitates sampling an action from the policy given the probability of each possible action.

**Hint** you may access the number of discrete actions with `env.action_space.n`.

In [9]:
class UniformPolicy(object):
    '''
    A policy which choose an action given an action space with equal probability to every action
    '''
    def __init__(self, action_space):
        ##for the bipedal walker turned off assertion
        # assert isinstance(action_space, gym.spaces.discrete.Discrete), "Can only create uniform policies for Discrete action spaces"
        
        try:
            self.n_actions = action_space.n
        except:
            self.n_actions = None #for the some environments used this exception

        self.training = True

    def train(self):
        self.training = True

    def eval(self):
        self.training = False

    def probability(self, state, action):
        ### BEGIN SOLUTION
        action_prob = 1/self.n_actions #uniform probability for all
        ### END SOLUTION

        return action_prob

    def sample(self, state):
        ### BEGIN SOLUTION
        choices = [i for i in range(self.n_actions)] #have same number of choise as the action space
        action = np.random.choice(choices) #randomly choose an action        
        ### END SOLUTION        
        return action

# Let's instantiate a uniform policy
env = RecordVideo(gym.make("FrozenLake-v1", render_mode="rgb_array"))
uniform_policy = UniformPolicy(env.action_space)
del env
# And sample 20 actions from state==0
actions = [uniform_policy.sample(0) for i in range(20)]
print(actions)

[2, 2, 1, 1, 0, 3, 2, 1, 2, 3, 1, 3, 0, 3, 3, 3, 0, 2, 0, 2]


Gym follows a formalism of MDPs that differs slightly (is more general) than the one we have seen in class. The differences are:
- In gym, transitions from a particular state and action can lead to the same destination state with different rewards (in the course all transitions reaching the same state obtain the same reward, except for transitions departing from a terminal state, whose rewards can be ignored)
- In gym, transitions to the same state can be both terminal and non-terminal (in the course it's the states that are terminal)

However we have chosen the environment `FrozenLake` which behaves as the MDPs described in the course, thus:
- All transitions to a given state have the same reward (unless the departure and destination state is the same terminal state)
- All transitions to a terminal state are terminal

In bym `env.env.P` contains the information about the process dynamics, rewards and terminal states. The information is encoded in the following way:

`env.env.P[state][action]` returns a list of tuples containing all the possible outcomes and their probability in the form `(prob, next_state, reward, terminal)` where `state` is the current state, `action` is the action taken, `probability` is the probability of the transition, `next_state` is the destination state of the transition, `reward` is the reward obtained for the transition and `terminal` is a boolean indicating if the transition ends the episode.

## Exercise : MDP from gym environment

Using this information compute the matrices $\mathcal{P}_{ss'}$, $\mathcal{R}_s$ and `terminal_states` of your MDP. Terminal is a boolean vector containing `True` for terminal states and `False` otherwise.

We will use the uniform policy that we created before, that assigns equal probability to all actions.

**Hint** all the transitions from a terminal state must be ignored.

**Optionally** while computing the matrices verify with `assert` that the conditions that we described above are fulfilled.

In [10]:
def compute_P_and_R(env, policy):
    '''
    This function iterate over the system dynamics to compute the P and R matrix.
    returns P, R and the terminal states
    '''
    ### BEGIN SOLUTION
    # Initialize the matrices P and R

    N = env.observation_space.n #number of states
    P = np.zeros((N,N)) #init P matrix
    num_of_action = env.action_space.n #number of possible action
    terminal_states = np.empty(N) 
    terminal_states[:] = np.nan

    # Iterate over env.env.P.items()
    # env.env.P is a dictionary that maps from 
    # departure state to a dictionary of possible actions and subsequent transitions

    for i in range(N): #iterate through every state
        state = env.env.P[i] #take ith state
        prob_to_take_a_action = 1/num_of_action # Obtain the probability of taking that action

        # Iterate over all possible actions with equal probability
        for action in state:
            state_action_dynamics = state[action] #take every action in the state
            
            # Iterate over all transitions for this action
            for result in state_action_dynamics:
                
                next_state = result[1]
                prob = result[0]
                # If the next state is terminal flag it as such in terminal_states
                is_terminal = result[3]
                terminal_states[next_state] = is_terminal
                    
                # Add to P the probability of the transition as:
                # (probability of the action) * (probability of the transition)
                
                P[i,next_state] += prob*prob_to_take_a_action
        
    # Convert the terminal states vector to boolean to use in indexing
    terminal_states = terminal_states.astype(bool)    
    # Set the reward in R
    R = np.zeros((N,1))
    R[N-1,0] = 1 #only the last reward is 1

    return P, R, terminal_states

env = RecordVideo(gym.make("FrozenLake-v1", render_mode="rgb_array"))
P, R, terminal_states = compute_P_and_R(env, uniform_policy)

print(P)
print(R)
print(terminal_states)

[[0.5  0.25 0.   0.   0.25 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.  ]
 [0.25 0.25 0.25 0.   0.   0.25 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.  ]
 [0.   0.25 0.25 0.25 0.   0.   0.25 0.   0.   0.   0.   0.   0.   0.   0.   0.  ]
 [0.   0.   0.25 0.5  0.   0.   0.   0.25 0.   0.   0.   0.   0.   0.   0.   0.  ]
 [0.25 0.   0.   0.   0.25 0.25 0.   0.   0.25 0.   0.   0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.   1.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.  ]
 [0.   0.   0.25 0.   0.   0.25 0.   0.25 0.   0.   0.25 0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.   0.   0.   1.   0.   0.   0.   0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.25 0.   0.   0.   0.25 0.25 0.   0.   0.25 0.   0.   0.  ]
 [0.   0.   0.   0.   0.   0.25 0.   0.   0.25 0.   0.25 0.   0.   0.25 0.   0.  ]
 [0.   0.   0.   0.   0.   0.   0.25 0.   0.   0.25 0.   0.25 0.   0.   0.25 0.  ]
 [0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   1.   0.   0.   0.   0.  ]
 [0.

## Exercise : Sample an episode

Sample an episode with the uniform policy and return the lists of observations (states), actions, rewards and the done flags.

Notice that the list of states should have one more element (the initial state).

Test the function by rendering an episode. We will use `RecordVideo` to do that.

In [11]:
def sample_episode(env, policy, reset=True):
    '''
    This functions takes an environment and policy and sample an episode of the environment according to the given policy
    returns list of states, actions, rewards, dones in the sampled episode
    '''
    states = []
    actions = []
    rewards = []
    dones = []

    ### BEGIN SOLUTION
    # If reset, we reset the environment and get an initial state
    # else we set the initial state to it's current state env.env.s
    if reset:
        initial_state,_ = env.reset()
    else:
        initial_state = env.env.s
    done = False
    # Collect the initial state
    states.append(initial_state)   
    # While the episode has not finished
    while not done:        
        # Select an action
        action = policy.sample(states[-1])
        actions.append(action)
        # Step the environment
        obs, reward, terminated, truncated, info = env.step(action)
        # The episode is done if it has been terminated or truncated
        done = terminated or truncated 
        # Collect the state, reward and action taken
        states.append(obs)
        rewards.append(reward)
        dones.append(done)
    ### END SOLUTION

    return states, actions, rewards, dones

env = RecordVideo(gym.make("FrozenLake-v1", render_mode="rgb_array"))
uniform_policy = UniformPolicy(env.action_space)
states, actions, rewards, dones = sample_episode(env, uniform_policy)
print(rewards)
print(len(states), len(actions), len(rewards), len(dones))
display(env.video())

<IPython.core.display.Javascript object>

[0.0, 0.0, 0.0, 0.0, 0.0]
6 5 5 5


## Exercise : Returns of episode

For a sampled episode compute the return $G_t$ for all steps $t$.

**Hint** the return is easily computed backwards from the last step in the episode to the first.

In [13]:
def compute_returns(rewards, gamma):
    '''
    computes return of an episode with discount factor gamma.
    returns list of returns.
    '''
    returns = np.zeros(len(rewards))

    ### BEGIN SOLUTION    
    
    episode_len = len(rewards)
    returns[episode_len-1] = rewards[episode_len-1] #last return is always same as no future step
    
    # Iterate over the rewards backward computing each return    
    for i,reward in enumerate(reversed(rewards)):
        if i==0:
            continue #as the last return is already computed
        returns[episode_len-1-i] = rewards[episode_len-1-i] + gamma*returns[episode_len-i]# using the previous return computed        

    ### END SOLUTION
    return returns

env = gym.make("FrozenLake-v1", render_mode="rgb_array")
uniform_policy = UniformPolicy(env.action_space)

# Now we will run until one of them has a positive reward
while True:
    states, actions, rewards, dones = sample_episode(env, uniform_policy)
    if np.sum(rewards) > 0:
        # Print the rewards
        print(rewards)
        # Compute and print the returns
        gamma = 0.9
        returns = compute_returns(rewards, gamma)
        print(returns)
        # Exit the loop
        break

[0.0, 0.0, 0.0, 0.0, 0.0, 1.0]
[0.59049 0.6561  0.729   0.81    0.9     1.     ]


# Policy evaluation

In this section we will implement various value function estimation methods:
- Direct method using the previous P and R
- Naive (cheating) method that assumes possibility of setting an arbitrary initial state
- Monte Carlo methods
- Time-difference method

## Exercise : Direct method

Solve the Bellman equation, using the direct solution (matrix inversion).

**Note** that this method leads to miss-leading values for terminal states, since there is no way to indicate terminality using the Bellman equation without sampling.

In [50]:
def value_direct(env, gamma, policy):
    '''
    computes the value function using the bellman equation
    returns the value
    '''
    P, R, terminal_states = compute_P_and_R(env, policy)
    ### BEGIN SOLUTION
    #compute the value using bellman equation
    I = np.eye(env.observation_space.n)
    v = np.matmul(np.linalg.inv(I-gamma*P),R)   
    ### END SOLUTION
    # Unsqueeze the extra dimension
    v = v[:,0]
    return v
    
env = gym.make("FrozenLake-v1", render_mode="rgb_array")
uniform_policy = UniformPolicy(env.action_space)
gamma = 0.9
v_direct = value_direct(env, gamma, uniform_policy)
print(v_direct)

[ 4.02953462e-02  3.80021094e-02  9.06008086e-02  3.70639671e-02  6.04976257e-02 -1.08110653e-15
  2.37003375e-01  0.00000000e+00  1.68085365e-01  5.18463074e-01  9.62747525e-01  0.00000000e+00
  4.55649547e-16  1.17344744e+00  3.52341144e+00  1.00000000e+01]


## (Optional) Exercise : Verify the solution

Check if the computed value function $V$ fulfills the Bellman equation. You may use `np.allclose(a, b)` to check if all elements in `a` and `b` are close up to a numerical error.

In [51]:
### BEGIN SOLUTION
P, R, terminal_states = compute_P_and_R(env, uniform_policy)
gamma = 0.9
v_direct = value_direct(env, gamma, uniform_policy)
v_original = R.squeeze() + gamma*np.matmul(P,v_direct)
np.allclose(v_direct,v_original)

### END SOLUTION

True

# **We can see that the V fulfulls the Bellman equation...**

For the following we must set the value function of terminal states to 0 in order to keep our value function comparable to those computed by methods using episode sampling.

In [52]:
# To compare to the following methods we must set the value of terminal states to 0
v_direct[terminal_states] = 0
print(v_direct)

[0.04029535 0.03800211 0.09060081 0.03706397 0.06049763 0.         0.23700338 0.         0.16808536
 0.51846307 0.96274753 0.         0.         1.17344744 3.52341144 0.        ]


## Exercise : Naive (cheating) method

Compute the value function for each state by cheating (changing the starting state and computing the average return from the start).

In [54]:
def value_naive(env, gamma, policy, n_episodes):
    '''
    This function returns the value function using naive method. Setting the initial state manually.
    '''
    ### BEGIN SOLUTION

    # Initialize values and counts tables (one cell per state)
    values = np.zeros((env.observation_space.n,1))
    counts = np.zeros((env.observation_space.n,1))


    # Compute the number of episodes per state
    episodes_per_state = int(n_episodes/env.observation_space.n)


    # For each initial state
    for i in range(env.observation_space.n):

        # For each episode
        for episode in range(episodes_per_state):
            # Reset the environment
            env.reset()

            # Set the initial state
            env.env.s = i

            # Sample an episode 
            # (without reseting the environment to avoid changing the initial state)
            states, actions, rewards, dones = sample_episode(env, policy, reset = False)

            # Compute the returns
            returns = compute_returns(rewards, gamma)
            
            # Accumulate the return of the initial state
            values[i] += returns[0]

        # Divide the accumulated value by the number of episodes
        values[i] = values[i]/episodes_per_state

    ### END SOLUTION
    
    return values

def compute_value_error(v_est, v_ref):
    '''
    computes and returns the mean and standard deviation of the difference of two value function
    '''
    diff = (v_est - v_ref)
    return np.mean(diff), np.std(diff)


env = gym.make("FrozenLake-v1", render_mode=None)
uniform_policy = UniformPolicy(env.action_space)
gamma = 0.9
n_episodes = 10000
v_naive = value_naive(env, gamma, uniform_policy, n_episodes)
#print the naive value
print(v_naive)
#compute the mean and std of the error between the naive and direct method
print(compute_value_error(v_naive, v_direct))

[[0.00146668]
 [0.00414908]
 [0.00540407]
 [0.00300276]
 [0.00305437]
 [0.00608835]
 [0.00733657]
 [0.00508038]
 [0.00233693]
 [0.00585959]
 [0.00314992]
 [0.00452715]
 [0.00773692]
 [0.00262373]
 [0.00399102]
 [0.00234866]]
(-0.4238413681433789, 0.8716202464251206)


***Here we can see that the mean error and std error is not very small compared to the values itself. Maybe the results will get closer if we sample for more episodes. As the process ar random the more number of episodes we will take the more accurate the results will become***

## Exercise : Monte Carlo first visit method
Compute the value function for each state using the Monte carlo method "first visit"

In [55]:
def value_montecarlo_first(env, gamma, policy, n_episodes):
    '''
    returns the value function calculated using monte carlo first visit method
    '''
    ### BEGIN SOLUTION

    # Initialize values and counts tables (one cell per state)
    values = np.zeros((env.observation_space.n,1))
    counts = np.zeros((env.observation_space.n,1))

    # For each episode
    for episode in range(n_episodes):
        # Sample an episode and compute returns
        states, actions, rewards, dones = sample_episode(env, policy, reset = True)
        # print(states)
        # Keep track of visited states
        visited_states = set(states) #stores the states which has not been visited yet
        returns = compute_returns(rewards, gamma)

        # For each state and associated return
        for state, ret in zip(states, returns):
            # If first visit
            if state in visited_states:
                
                # Increment counts
                counts[state]+=1

                # Accumulate returns
                values[state]+=ret 

                # Update the set of visited states
                visited_states.remove(state) #as state is visited remove it from the list

    # Average the accumulated returns
    for i in range(env.observation_space.n):
        if counts[i]!=0: #avoiding devide by 0
            values[i] = values[i]/counts[i]

    ### END SOLUTION
    
    return values

env = gym.make("FrozenLake-v1", render_mode=None)
uniform_policy = UniformPolicy(env.action_space)
gamma = 0.9
n_episodes = 10000
v_mc_firstvisit = value_montecarlo_first(env, gamma, uniform_policy, n_episodes)
print(v_mc_firstvisit)
print(compute_value_error(v_mc_firstvisit, v_direct))

[[0.00422553]
 [0.00447437]
 [0.01087732]
 [0.00373806]
 [0.0059759 ]
 [0.        ]
 [0.02640502]
 [0.        ]
 [0.01670493]
 [0.05137023]
 [0.09514683]
 [0.        ]
 [0.        ]
 [0.13853304]
 [0.35475127]
 [0.        ]]
(-0.38358847389779127, 0.8761317605280556)


***Here we can see that the mean error and std error is not very small compared to the values itself. Maybe the results will get closer if we sample for more episodes. As the process ar random the more number of episodes we will take the more accurate the results will become
Also we notice that the terminal states didnot accumulate any value***

## Exercise : Monte Carlo every visit method
Compute the value function for each state using the Monte carlo method "every visit"

In [56]:
def value_montecarlo_every(env, gamma, policy, n_episodes):
    ### BEGIN SOLUTION
    '''
    returns the value function calculated using monte carlo every visit method
    '''
    # Initialize values and counts tables (one cell per state)
    values = np.zeros((env.observation_space.n,1))
    counts = np.zeros((env.observation_space.n,1))

    # For each episode
    for episode in range(n_episodes):
        # Sample an episode and compute returns
        states, actions, rewards, dones = sample_episode(env, policy, reset = True)
        returns = compute_returns(rewards, gamma)

        # For each state and associated return
        for state, ret in zip(states, returns):
            # Increment counts
            counts[state]+=1

            # Accumulate returns
            values[state]+=ret 

    # Average the accumulated returns
    for i in range(env.observation_space.n):
        if counts[i]!=0:
            values[i] = values[i]/counts[i]


    ### END SOLUTION
    
    return values

env = gym.make("FrozenLake-v1", render_mode=None)
uniform_policy = UniformPolicy(env.action_space)
gamma = 0.9
n_episodes = 10000
v_mc_everyvisit = value_montecarlo_every(env, gamma, uniform_policy, n_episodes)
print(v_mc_everyvisit)
print(compute_value_error(v_mc_everyvisit, v_direct))

[[0.00462093]
 [0.00418719]
 [0.00961317]
 [0.00605081]
 [0.00660819]
 [0.        ]
 [0.02521458]
 [0.        ]
 [0.01893774]
 [0.05477256]
 [0.09642368]
 [0.        ]
 [0.        ]
 [0.11963533]
 [0.37657292]
 [0.        ]]
(-0.3829363108430256, 0.876515955615458)


***Here we can see that the mean error and std error is not very small compared to the values itself. Maybe the results will get closer if we sample for more episodes. As the process ar random the more number of episodes we will take the more accurate the results will become
Also we notice that the terminal states didnot accumulate any value***

## Exercise : Monte Carlo incremental

Implement the incremental Monte Carlo method.

In [63]:
def value_montecarlo_incremental(env, gamma, policy, n_episodes):
    '''
    computes the value function using monte carlo incremental method
    '''
    ### BEGIN SOLUTION
    # Initialize values and counts tables (one cell per state)

    values = np.zeros((env.observation_space.n,1))
    counts = np.zeros((env.observation_space.n,1))



    # For each episode
    for episode in range(n_episodes):
    
        # Sample an episode and compute returns
        states, actions, rewards, dones = sample_episode(env, policy, reset = True)
        returns = compute_returns(rewards, gamma)

        # For each state and associated return
        for state, ret in zip(states, returns):
        
            # Increment counts
            counts[state]+=1

            # Update value with return
            values[state] = values[state]+ (ret-values[state])/counts[state]

    ### END SOLUTION
    return values

env = gym.make("FrozenLake-v1", render_mode=None)
uniform_policy = UniformPolicy(env.action_space)
gamma = 0.9
n_episodes = 10000
v_mc_incremental = value_montecarlo_incremental(env, gamma, uniform_policy, n_episodes)
print(v_mc_incremental)
print(compute_value_error(v_mc_incremental.squeeze(), v_direct))

[[0.00394977]
 [0.00360034]
 [0.00794055]
 [0.00222208]
 [0.00518941]
 [0.        ]
 [0.02335416]
 [0.        ]
 [0.01429527]
 [0.05059055]
 [0.09441255]
 [0.        ]
 [0.        ]
 [0.13315949]
 [0.35187081]
 [0.        ]]
(-0.3849395688406434, 0.7836545522498753)


## Exercise : Time-differences (TD) method

In [68]:
import gymnasium as gym
def value_td(env, gamma, policy, n_episodes, alpha=0.4):
    '''
    computes the value function using td method
    '''
    ### BEGIN SOLUTION
    # Initialize value table (one cell per state)
    values = np.zeros((env.observation_space.n,1))


    # For each episode
    for episode in range(n_episodes):
        # Sample an episode
        states, actions, rewards, dones = sample_episode(env, policy, reset = True)
        returns = compute_returns(rewards, gamma)
        # For each step in the episode
        for i,state in enumerate(states):
            # Update the value of the depart state with the current value, the value of the next state and the reward
            if i<len(rewards): #taking only departing state. not taking the last state
                values[state] = values[state] + alpha*(rewards[i] + gamma*values[states[i+1]] - values[state])
        
        ### END SOLUTION
    return values


env = gym.make("FrozenLake-v1", render_mode=None)
uniform_policy = UniformPolicy(env.action_space)
gamma = 0.9
n_episodes = 10000
alpha = 0.4
v_td = value_td(env, gamma, uniform_policy, n_episodes, alpha=alpha)
print(v_td)
print(compute_value_error(v_td, v_direct))

[[0.01218403]
 [0.00996344]
 [0.02871153]
 [0.01418424]
 [0.0087173 ]
 [0.        ]
 [0.04449451]
 [0.        ]
 [0.02854693]
 [0.01347043]
 [0.23962544]
 [0.        ]
 [0.        ]
 [0.28957349]
 [0.8622286 ]
 [0.        ]]
(-0.3311198848788154, 0.897707110047378)


## (Optional) Exercise : Comparison TD to Monte Carlo incremental

How do these to methods compare? Explain the relative advantages and disadvantages.

***TD method had less error compared to MC incremental because the value of the 15th state is large in TD as it should be. The it is the last state before the goal.So it will have more value than other states***

## Exercise : Action-value
Compute the action value function using one or more of the following methods:
- Naive (cheating)
- MC "first visit"
- MC "every visit"
- MC incremental
- TD

In [69]:
def actionvalue_montecarlo_every(env, gamma, policy, n_episodes):
    '''
    computes action value table using monte carlo every method
    '''
    ### BEGIN SOLUTION
    q = np.zeros((env.observation_space.n,  env.action_space.n))
    counts = np.zeros((env.observation_space.n,  env.action_space.n))

    for episode in range(n_episodes):
        states, actions, rewards, dones = sample_episode(env, policy, reset = True)
        returns = compute_returns(rewards, gamma)

        for state, action, ret in zip(states, actions, returns):
            # Increment counts
            counts[state,action]+=1

            # Accumulate returns
            q[state, action]+=ret 

    # Average the accumulated returns
    for i in range(env.observation_space.n): #for every state
        for j in range(env.action_space.n): #for every action in state
            if counts[i,j]!=0: #avoiding devide by 0
                q[i,j] = q[i,j]/counts[i,j]
                

    ### END SOLUTION
    return q

env = gym.make("FrozenLake-v1", render_mode="rgb_array")
uniform_policy = UniformPolicy(env.action_space)
gamma = 0.9
n_episodes = 10000
q_mc_everyvisit = actionvalue_montecarlo_every(env, gamma, uniform_policy, n_episodes)
print(q_mc_everyvisit)

[[0.00418346 0.00312511 0.004824   0.00295932]
 [0.00240703 0.0030469  0.00294127 0.00452413]
 [0.00667725 0.0056831  0.01190309 0.00468777]
 [0.00449759 0.00766531 0.00371186 0.00675204]
 [0.00816518 0.00530914 0.00640574 0.00307836]
 [0.         0.         0.         0.        ]
 [0.01767613 0.03161242 0.01672411 0.00180974]
 [0.         0.         0.         0.        ]
 [0.0036916  0.01948197 0.01774678 0.02202558]
 [0.02769733 0.08812255 0.05884241 0.02600236]
 [0.11660009 0.12495692 0.07006672 0.03983735]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.04404301 0.18041695 0.14050575 0.09523646]
 [0.19523982 0.41856355 0.44833555 0.29745037]
 [0.         0.         0.         0.        ]]


# Policy improvement

In this section we are going to improve upon the uniform policy, which selects actions at random, independently on the state.

To assess whether our learned policies work, we will start by implmenting a scoring function to evaluate the policies.

## Exercise : Evaluate performance of a policy

In this environment we define a goal as obtaining a final reward greater than 0, thus reaching the target state, since it is the only one with a non-zero reward.

For a given policy, compute:
- the average number of episodes where the goal was reached
- the average number of steps to reach the goal


In [73]:
def score_policy(env, gamma, policy, n_episodes):
    '''
    returns percentage of times the agent reached the goal and avg number of steps took to reach goal
    '''
    episodes_to_goal = 0
    steps_to_goal = []

    ### BEGIN SOLUTION
    for episode in range(n_episodes):
        states, actions, rewards, dones = sample_episode(env, policy, reset = True)
        #if the goal reached increase the number of times reached and store the number of steps required
        if rewards[-1]!=0: 
            episodes_to_goal+=1
            steps_to_goal.append(len(rewards))
    ### END SOLUTION

    return episodes_to_goal/n_episodes, np.mean(steps_to_goal)

score_policy(env, 0.9, uniform_policy, 10000)

(0.0136, 13.632352941176471)

## Exercise : Greedy policy (policy improvement)

Create a greedy policy from an action value function and evaluate it's performance.

The greedy policy selects the action leading to the largest value in the current state.

**Note** that when several actions have the same maximal value, it is best to randomly pick one of them.

Pick the first highest value action or (optionally) pick randomly amongst actions with the highest value.

In [74]:
class GreedyPolicy(UniformPolicy):
    '''
    A class to take greedy(max value) action everytime
    '''
    def __init__(self, action_space, q):
        super().__init__(action_space) #initialize the paarent class it is inherting from
        # q is the action-value table
        self.q = q

    def _max_value_action(self, state):
        ### BEGIN SOLUTION
        action = np.argmax(self.q[state]) #index of max value in the state
        ### END SOLUTION
        return action

    def probability(self, state, action):
        # Select the highest value action
        action_max = self._max_value_action(state)
        # Return a probability of 1 for the selected action 0 otherwise
        action_prob = float(action == action_max)

        return action_prob

    def sample(self, state):
        # Select the highest value action
        action_max = self._max_value_action(state)

        return action_max

env = gym.make("FrozenLake-v1", render_mode=None)
greedy_policy = GreedyPolicy(env.action_space, q_mc_everyvisit) #intializing with the q value computed using mc every visit

## Exercise : Compare the performance of policies

Report the performance of the uniform policy and the greedy policy

In [75]:
# env = RecordVideo(gym.make("FrozenLake-v1", render_mode="rgb_array", is_slippery = True))
env = gym.make("FrozenLake-v1", render_mode=None)

### BEGIN SOLUTION

un_per,un_mean =  score_policy(env, 0.9, uniform_policy, 10000)
print(f"uniform policy: {un_per:.2%} / {un_mean}")

gr_per,gr_mean = score_policy(env, 0.9, greedy_policy, 10000)
print(f"greedy policy: {gr_per:.2%} / {gr_mean}")


### END SOLUTION

uniform policy: 1.36% / 12.375
greedy policy: 16.24% / 18.523399014778324


***The result is as expected as the uniform policy is just taking random action whatever the state is. But the greedy policy is taking the action that will give maximum return in every state from knowledge of the q table***

In [76]:
# env.render_mode = "rgb_array"
env = RecordVideo(gym.make("FrozenLake-v1", render_mode="rgb_array", is_slippery = True))
for i in range(10):
    states, actions, rewards, dones = sample_episode(env, greedy_policy)
display(env.video())

<IPython.core.display.Javascript object>

# Policy learning

In this section we will start learning policies.

We will then implement the following policy learning methods:
- policy iteration
- SARSA
- Q-Learning

## Exercise : Policy iteration
By alternating between policy evaluation and policy improvement, find an optimal policy.

Print the scores of each intermediate policy and comment on how the metrics evolve.

In [80]:
def policy_learn_iteration(env, initial_policy, policy_evaluation_function, 
                           n_episodes_value, n_episodes_score, n_iterations):
    policy = initial_policy #initializing policy with the given one

    ### BEGIN SOLUTION
    # Score and print the initial policy
    per,mean = score_policy(env, 0.9, policy, n_episodes_score)
    print(f"Initial Policy: {per:.2%} / {mean}")
    action_value_prev = None #at the start nothing is learned
    # Policy evaluation
    for it in range(n_iterations):
        if it==0:
            per,mean = score_policy(env, 0.9, policy, n_episodes_score)
            print(f"Policy (iter: {it}): {per:.2%} / {mean:.4}, Value change: nan")
            

        # Keep track of the action-value function change 
        # (sum of absolute difference between the previous 
        #  and next action-value funcitons)
        # Note that at the first step we will report a change of nan
        # since we still don't have a previous action-value function
        action_value = policy_evaluation_function(env, gamma, policy, n_episodes_value) #using initial policy getting the q table
        if it!=0:
            diff = np.abs(np.diff(action_value_prev - action_value)) #CHANGE OCCURED IN THE Q TABLE
            
        action_value_prev = action_value #storing the q table
        
        # Policy improvement
        policy = GreedyPolicy(env.action_space, action_value) #initializing greedy policy to learn

        # Policy scoring
        if it!=0: #score the greedy policy to see how much it is learning
            
            per,mean = score_policy(env, 0.9, policy, n_episodes_score)
            print(f"Policy (iter: {it}): {per:.2%} / {mean:.4}, Value change: {np.sum(diff):.3}")
            
        



    ### END SOLUTION

env = gym.make("FrozenLake-v1", render_mode=None)
uniform_policy = UniformPolicy(env.action_space)

policy_learn_iteration(env, uniform_policy, actionvalue_montecarlo_every, 
                       n_episodes_value=1000, n_episodes_score=10000,
                       n_iterations=30)

Initial Policy: 1.47% / 13.387755102040817
Policy (iter: 0): 1.30% / 12.79, Value change: nan
Policy (iter: 1): 10.92% / 15.16, Value change: 2.13
Policy (iter: 2): 10.80% / 15.21, Value change: 0.335
Policy (iter: 3): 10.86% / 15.53, Value change: 0.191
Policy (iter: 4): 10.50% / 15.54, Value change: 0.128
Policy (iter: 5): 10.67% / 15.24, Value change: 0.25
Policy (iter: 6): 11.13% / 15.07, Value change: 0.465
Policy (iter: 7): 10.27% / 15.26, Value change: 0.227
Policy (iter: 8): 10.56% / 15.5, Value change: 0.259
Policy (iter: 9): 10.90% / 15.27, Value change: 0.399
Policy (iter: 10): 10.78% / 15.17, Value change: 0.273
Policy (iter: 11): 10.73% / 15.52, Value change: 0.281
Policy (iter: 12): 10.19% / 15.3, Value change: 0.185
Policy (iter: 13): 10.71% / 15.35, Value change: 0.289
Policy (iter: 14): 10.55% / 15.22, Value change: 0.196
Policy (iter: 15): 10.39% / 15.32, Value change: 0.261
Policy (iter: 16): 10.56% / 15.75, Value change: 0.231
Policy (iter: 17): 10.53% / 15.52, Valu

## Exercise : Epsilon-greedy policy

One of the issues with the greedy policy is that some possible trajectories may never be visited depending on the initial estimation of the value function.

To avoid this create an epsilon-greedy policy. These policies act differently when running in training mode and evaluation mode. In evaluation mode they act as a greedy policy. In training mode:
- with probability epsilon, uniformly selects an action
- else selects the action with maximum value (greedy policy)

Implement the `EpsilonGreedyPolicy` class.


In [83]:
class EpsilonGreedyPolicy(GreedyPolicy):
    def __init__(self, action_space, q, epsilon, 
                 epsilon_decay=1, epsilon_min=0):
        super().__init__(action_space, q)
        
        self.epsilon_start = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        
        self.epsilon = self.epsilon_start

    def sample(self, state):
        ### BEGIN SOLUTION
        #if in training mode explore epsilon percentage of time else exploit the knowlegde it has
        if np.random.random()<= self.epsilon and self.training:
            choices = [i for i in range(self.n_actions)]
            action = np.random.choice(choices)
        else:
            action = np.argmax(self.q[state])
        ### END SOLUTION
        return action

    def begin_episode(self, episode_index):
        # Start of an episode
        self.epsilon = self.epsilon_start * (self.epsilon_decay ** episode_index)
        self.epsilon = max(self.epsilon, self.epsilon_min)

# Instantiate a policy
dummy = EpsilonGreedyPolicy(env.action_space, q_mc_everyvisit, epsilon=0.5)

# Sample 20 actions from state 0 in train mode
dummy.train()
actions = [dummy.sample(2) for i in range(20)]
print(actions)

# Sample 20 actions from state 0 in eval mode
dummy.eval()
actions = [dummy.sample(2) for i in range(20)]
print(actions)

[2, 2, 1, 2, 2, 0, 1, 3, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2]
[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]


## Exercise : SARSA

State–action–reward–state–action (SARSA) is a method to learn a policy leveraging the TD estimation of action-value functions and an epsilon-greedy policy.

Since it is based on TD, it does not require full episodes to train, the policy can be improved at each step. We will therefore **not use the `sample_episode`** and reimplement here a similar loop.

Implement the SARSA algorithm, in this case it will be passed a policy of type `EpsilonGreedyPolicy` which stores the action-value (or Q-table). We may access and modify if using `policy.q`.

Score the trained policy.

In [85]:
def policy_learn_sarsa(env, policy, gamma, n_episodes, alpha, max_n_steps=1000, print_every=500):
    # No need to create the tables of action-values since they are stored directly in the policy
    policy.train()
    # While we haven't reached the desired number of steps
    for episode in range(n_episodes):
        # Call the policy begin_episode so it can handle epsilon decay
        policy.begin_episode(episode)
        
        # Print every couple episodes
        if not episode % print_every:
            print(f'ep: {episode}, epsilon: {policy.epsilon:.3f}')
            policy.eval() #go into evaluate mode
            avg_episodes_to_goal,avg_steps_to_goal = score_policy(env, gamma, policy, 10000)
            print(f'Policy SARSA: {avg_episodes_to_goal:.2%} / {avg_steps_to_goal:.2f}')
            policy.train() #go back to training
            print()
        # Get initial state
        state, info = env.reset()

        # Take the first action acording to the policy
        action = policy.sample(state)

        # While we haven't reached the maximum number of steps for the episode
        for step in range(max_n_steps):     
            ### BEGIN SOLUTION
            # Perform a step of the environment
            state_next,reward,terminated,truncated,info =  env.step(action)

            # It is done if terminated or truncated
            done = terminated or truncated

            # If episode has finished
            if done:
                # Update the action-value table and leave the loop
                action_next = policy.sample(state_next)
                policy.q[state, action] += alpha * (reward + gamma * policy.q[state_next, action_next] - policy.q[state, action])                  
                break

            # Sample the next action
            action_next = policy.sample(state_next)

            # Update the action-value table
            policy.q[state, action] += alpha * (reward + gamma * policy.q[state_next, action_next] - policy.q[state, action])
            ### END SOLUTION

            # Set the current state and action
            state = state_next
            action = action_next
            
    return

env = gym.make("FrozenLake-v1", render_mode=None)

q_initial = np.zeros((env.observation_space.n, env.action_space.n))
sarsa_policy = EpsilonGreedyPolicy(env.action_space, q_initial,
                                   epsilon=1, epsilon_decay=0.999, epsilon_min=0.001)

gamma = 0.9
n_episodes = 5000
alpha = 0.2
n_episodes_score = 1000

### BEGIN SOLUTION
# Train the policy
sarsa_policy.train()
policy_learn_sarsa(env, sarsa_policy, gamma, n_episodes, alpha, max_n_steps=1000, print_every=500)

### END SOLUTION

### BEGIN SOLUTION
# Evaluate the policy
sarsa_policy.eval()
avg_episodes_to_goal,avg_steps_to_goal = score_policy(env, gamma, sarsa_policy, n_episodes_score)

### END SOLUTION

print(f'Policy SARSA: {avg_episodes_to_goal:.2%} / {avg_steps_to_goal:.2f}')

ep: 0, epsilon: 1.000
Policy SARSA: 0.00% / nan

ep: 500, epsilon: 0.606
Policy SARSA: 7.72% / 12.39

ep: 1000, epsilon: 0.368
Policy SARSA: 0.00% / nan

ep: 1500, epsilon: 0.223
Policy SARSA: 9.36% / 15.46

ep: 2000, epsilon: 0.135
Policy SARSA: 37.84% / 30.46

ep: 2500, epsilon: 0.082
Policy SARSA: 14.45% / 18.33

ep: 3000, epsilon: 0.050
Policy SARSA: 58.09% / 39.04

ep: 3500, epsilon: 0.030
Policy SARSA: 50.40% / 31.45

ep: 4000, epsilon: 0.018
Policy SARSA: 29.83% / 23.78

ep: 4500, epsilon: 0.011
Policy SARSA: 30.16% / 27.62

Policy SARSA: 74.00% / 38.54


***We can see Sarsa gives much better result in terms of percentage of time the agent reaches the goal compared to the normal policy learning. Also because here we are using the eps greedy policy. The agent will try to reach every state. But for greedy policy the agent might never reach some state as it doesnot explore***

## Exercise : Q-learning

Q-learning is very similar to SARSA. **SARSA is an on-policy learning method** in which the action-values are updated following the same policy. In SARSA the action-values are updated using the value of the next state and next action taken.

Q-learning is off-policy, it does not assume the same policy when updating the action-values, instead it assumes an optimal policy by using the maximum value of the next state.

Implement a modified version of `policy_learn_sarsa` that performs Q-learning.

In [93]:
def policy_learn_qlearn(env, policy, gamma, n_episodes, alpha, max_n_steps=1000, print_every=500):
    # No need to create the tables of action-values since they are stored directly in the policy
    policy.train()
    # While we haven't reached the desired number of steps
    for episode in range(n_episodes):
        # Call the policy begin_episode so it can handle epsilon decay
        policy.begin_episode(episode)
        
        # Print every couple episodes
        if not episode % print_every:
            print(f'ep: {episode}, epsilon: {policy.epsilon:.3f}')
            policy.eval()
            avg_episodes_to_goal,mean = score_policy(env, gamma, policy, 100)
            print(f'Policy Q-learn: {avg_episodes_to_goal}, mean:{mean}')
            print()
            policy.train()
        # Get initial state
        state, info = env.reset()

        # Take the first action acording to the policy
        action = policy.sample(state)

        # While we haven't reached the maximum number of steps for the episode
        for step in range(max_n_steps):     
            ### BEGIN SOLUTION
            # Perform a step of the environment
            state_next,reward,terminated,truncated,info =  env.step(action)

            # It is done if terminated or truncated
            done = terminated or truncated

            # If episode has finished
            if done:
                # Update the action-value table and leave the loop
                policy.q[state, action] += alpha*(reward + gamma*np.max(policy.q[state_next]) - policy.q[state,action]) 
                break

            # Sample the next action
            action_next = policy.sample(state_next)

            # Update the action-value table
            policy.q[state, action] += alpha*(reward + gamma*np.max(policy.q[state_next]) - policy.q[state,action])
            ### END SOLUTION

            # Set the current state and action
            state = state_next
            action = action_next
            
    return

env = gym.make("FrozenLake-v1", render_mode=None)

q_initial = np.zeros((env.observation_space.n, env.action_space.n))
qlearn_policy = EpsilonGreedyPolicy(env.action_space, q_initial,
                                    epsilon=1, epsilon_decay=0.999, epsilon_min=0.01)

gamma = 0.9
n_episodes = 5000
alpha = 0.2
n_episodes_score = 1000

### BEGIN SOLUTION
# Train the policy
qlearn_policy.train()
policy_learn_qlearn(env, qlearn_policy, gamma, n_episodes, alpha, max_n_steps=1000, print_every=500)

# Evaluate the policy
qlearn_policy.eval()
avg_episodes_to_goal,avg_steps_to_goal = score_policy(env, gamma, qlearn_policy, n_episodes_score)


### END SOLUTION

print(f'Policy Q-learn: {avg_episodes_to_goal:.2%} / {avg_steps_to_goal:.2f}')


ep: 0, epsilon: 1.000
Policy Q-learn: 0.0, mean"nan



  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


ep: 500, epsilon: 0.606
Policy Q-learn: 0.12, mean"15.333333333333334

ep: 1000, epsilon: 0.368
Policy Q-learn: 0.5, mean"33.3

ep: 1500, epsilon: 0.223
Policy Q-learn: 0.46, mean"26.195652173913043

ep: 2000, epsilon: 0.135
Policy Q-learn: 0.56, mean"35.410714285714285

ep: 2500, epsilon: 0.082
Policy Q-learn: 0.41, mean"29.853658536585368

ep: 3000, epsilon: 0.050
Policy Q-learn: 0.23, mean"26.17391304347826

ep: 3500, epsilon: 0.030
Policy Q-learn: 0.46, mean"34.08695652173913

ep: 4000, epsilon: 0.018
Policy Q-learn: 0.34, mean"21.705882352941178

ep: 4500, epsilon: 0.011
Policy Q-learn: 0.58, mean"37.241379310344826

Policy Q-learn: 73.30% / 37.76


## (Optional) Exercise : Train for Taxi

Train an epsilon-greedy policy using Q-learning on the `Taxi-v3` environment.

Score the performance of this policy and compare it to a uniform policy.

In [95]:
### BEGIN SOLUTION
env = gym.make("Taxi-v3", render_mode=None)

q_initial = np.zeros((env.observation_space.n, env.action_space.n))
qlearn_policy = EpsilonGreedyPolicy(env.action_space, q_initial,
                                    epsilon=1, epsilon_decay=0.999, epsilon_min=0.01)

gamma = 0.9
n_episodes = 5000
alpha = 0.2
n_episodes_score = 1000

# Train the policy
qlearn_policy.train()
policy_learn_qlearn(env, qlearn_policy, gamma, n_episodes, alpha, max_n_steps=1000, print_every=500)


# Evaluate the policy
qlearn_policy.eval()
avg_episodes_to_goal,avg_steps_to_goal = score_policy(env, gamma, qlearn_policy, n_episodes_score)

print(f'Policy Q-learn: {avg_episodes_to_goal:.2%} / {avg_steps_to_goal:.2f}')
### END SOLUTION

ep: 0, epsilon: 1.000
Policy Q-learn: 1.0, mean"200.0

ep: 500, epsilon: 0.606
Policy Q-learn: 1.0, mean"139.29

ep: 1000, epsilon: 0.368
Policy Q-learn: 1.0, mean"48.3

ep: 1500, epsilon: 0.223
Policy Q-learn: 1.0, mean"14.73

ep: 2000, epsilon: 0.135
Policy Q-learn: 1.0, mean"20.59

ep: 2500, epsilon: 0.082
Policy Q-learn: 1.0, mean"15.31

ep: 3000, epsilon: 0.050
Policy Q-learn: 1.0, mean"13.47

ep: 3500, epsilon: 0.030
Policy Q-learn: 1.0, mean"13.36

ep: 4000, epsilon: 0.018
Policy Q-learn: 1.0, mean"13.03

ep: 4500, epsilon: 0.011
Policy Q-learn: 1.0, mean"12.79

Policy Q-learn: 100.00% / 12.97


## (Optional) Exercise : Render Taxi

Run 3 episodes of a Q-learn trained policy on `Taxi-v3` this time rendering the result.

In [97]:
### BEGIN SOLUTION
env = gym.make("Taxi-v3", render_mode=None)
env = RecordVideo(gym.make("Taxi-v3", render_mode="rgb_array"))
qlearn_policy.eval()
env.reset()
for i in range(3):
    env.render()
    states, actions, rewards, dones = sample_episode(env, qlearn_policy)

env.close()

### END SOLUTION

<IPython.core.display.Javascript object>

## (Bonus) Exercise : A observation space environment

Try to perform Q-learning on an environment (e.g. `CartPole-v1`) with continuous action and observation spaces.

You will need to discretize the observation space.

In [98]:
# Example of rendering and showing a CartPole-v1 environment
env = RecordVideo(gym.make("CartPole-v1", render_mode="rgb_array"))
observation, info = env.reset()
i=0
while True:
    env.render()
    
    action = env.action_space.sample() 
    state_next, reward, terminated, truncated, info = env.step(action)
    done = terminated or truncated
    
    if done: 
      break
            
env.close()

<IPython.core.display.Javascript object>

In [100]:
import numpy as np

class ObservationWrapper(gym.ObservationWrapper):
    '''
    n_states(int): number of state of the env.
                It has to be the some power of N = env.observation_space.shape[0]
    bounds(list of tuples): bounds of the states of the env.(taken from the gymnasium documentation).
                each tuple gives the bounds of the states
    '''
    def __init__(self, env, n_states=4096, bounds=[(-4.8,4.8),(-50,50),( -0.418, 0.418),(-20,20)]):
        super().__init__(env)

        self.n_states = n_states
        self.bounds = bounds
        assert isinstance(env.observation_space, gym.spaces.box.Box)

        # Get the dimensions of the observation space
        self.dimension = env.observation_space.shape[0] 
        self.states_per_dimention = int(n_states**(1/self.dimension)) #we give equal number of state per dimension
        
        # calculating steps for each state and storing in a list
        self.step_list = []
        for state_bound in bounds:
            step = (state_bound[1] - state_bound[0])/self.states_per_dimention
            self.step_list.append(step)

        
    def get_quant_observation(self, new_obs):
        ##calculate the new quantized state index from a list of states.  (like converting number systems)
        #observations are the digits and states per dimention is the base of the number system.
        quantized_obs = 0
        dimension = self.dimension 
        for i, obs in enumerate(new_obs):
            quantized_obs+= obs*self.states_per_dimention**(dimension-i-1)
        
        # print(quantized_obs)
        return int(quantized_obs)

    def observation(self, obs):
        ### BEGIN SOLUTION
        # Quantize the observations (states)
        new_obs = []
        # get the quanta of each state
        for i,observation_state in enumerate(obs):
            new_obs.append(int((observation_state-self.bounds[i][0])/self.step_list[i]))
            
        # Once quantized each dimension compute a single observation index
        return self.get_quant_observation(new_obs)

        


In [102]:
def score_policy(env, gamma, policy, n_episodes):
    #score policy for cart-pole environment as the scoring is different
    #Here as the score we are taking how many steps the agent survived.
    #the longer it survived the better the policy
    episodes_to_goal = 0
    steps_to_goal = []

    for episode in range(n_episodes):
        states, actions, rewards, dones = sample_episode(env, policy, reset = True)
        episodes_to_goal+= len(states)

    return episodes_to_goal/n_episodes,steps_to_goal #returning 2 variables just to make it compatibale with the existing code 

In [101]:
# Initialize environment, wrap to render
env = gym.make("CartPole-v1", render_mode="rgb_array")

# Wrap to discretize the observation space
env = ObservationWrapper(env)

### BEGIN SOLUTION
# Initialize policy

observation, info = env.reset()
#action is 2 and states are 4096 after quantization
q_initial = np.zeros((4096, 2))
qlearn_policy = EpsilonGreedyPolicy(env.action_space, q_initial,
                                    epsilon=1, epsilon_decay=0.999, epsilon_min=0.01)

gamma = 0.9
n_episodes = 5000
alpha = 0.1
n_episodes_score = 100

### BEGIN SOLUTION
# Train the policy
qlearn_policy.train()
policy_learn_qlearn(env, qlearn_policy, gamma, n_episodes, alpha, max_n_steps=1000, print_every=500)

### END SOLUTION

### BEGIN SOLUTION
# Evaluate the policy
qlearn_policy.eval()
avg_episodes_to_goal = score_policy(env, gamma, qlearn_policy, n_episodes_score)


### END SOLUTION

print(f'Policy Q-learn: {avg_episodes_to_goal}')



ep: 0, epsilon: 1.000
Policy Q-learn: 10.25, mean"[]

ep: 500, epsilon: 0.606
Policy Q-learn: 14.61, mean"[]

ep: 1000, epsilon: 0.368
Policy Q-learn: 119.95, mean"[]

ep: 1500, epsilon: 0.223
Policy Q-learn: 65.59, mean"[]

ep: 2000, epsilon: 0.135
Policy Q-learn: 315.48, mean"[]

ep: 2500, epsilon: 0.082
Policy Q-learn: 96.07, mean"[]

ep: 3000, epsilon: 0.050
Policy Q-learn: 66.22, mean"[]

ep: 3500, epsilon: 0.030
Policy Q-learn: 95.23, mean"[]

ep: 4000, epsilon: 0.018
Policy Q-learn: 106.94, mean"[]

ep: 4500, epsilon: 0.011
Policy Q-learn: 188.88, mean"[]

Policy Q-learn: (466.24, [])


In [103]:
qlearn_policy.eval()
avg_episodes_to_goal = score_policy(env, gamma, qlearn_policy, n_episodes_score)
print(f'Policy Q-learn: {avg_episodes_to_goal}')


Policy Q-learn: (466.22, [])


Here we see the model performs really well. It can continue for average 466 steps.

In [104]:
env = gym.make("CartPole-v1", render_mode="rgb_array")
# Wrap to discretize the observation space and then wrap with the record video to render vdo
env = RecordVideo(ObservationWrapper(env))
env.reset()
qlearn_policy.eval()
for i in range(1):
    env.render()
    state_next, reward, terminated, truncated = sample_episode(env, qlearn_policy)
    
env.close()

<IPython.core.display.Javascript object>

In [106]:

def get_bounds(low_bounds, high_bounds):
    '''
    this function takes lower and higher bounds for observation space
    and return them as required by the observation_space wrapper
    '''
    bounds =[]
    for i,_ in enumerate(low_bounds):
        bounds.append((low_bounds[i], high_bounds[i]))

    return bounds


In [133]:
#mountain car
def score_policy(env, gamma, policy, n_episodes):
    '''
    returns percentage of times the agent reached the goal and avg number of steps took to reach goal
    '''
    episodes_to_goal = 0
    steps_to_goal = []

    ### BEGIN SOLUTION
    for episode in range(n_episodes):
        states, actions, rewards, dones = sample_episode(env, policy, reset = True)
        if len(actions)<200: #the env allows maximum 200 steps. So if the len of action is less than 200 then the agent reached it's goal
            episodes_to_goal+=1
            steps_to_goal.append(len(rewards))
    ### END SOLUTION

    return episodes_to_goal/n_episodes, np.mean(steps_to_goal)


env = gym.make("MountainCar-v0")
#get high and low bounds for env
low_bounds = env.observation_space.low
high_bounds = env.observation_space.high
#get bounds for observation_wrapper
bounds = get_bounds(low_bounds, high_bounds)
n_states = env.observation_space.shape[0]**6
env = ObservationWrapper(env, n_states=n_states, bounds=bounds)
n_action = env.action_space.n

observation, info = env.reset()
#action is 2 and states are 4096 after quantization
q_initial = np.zeros((n_states, n_action))
qlearn_policy = EpsilonGreedyPolicy(env.action_space, q_initial,
                                    epsilon=1, epsilon_decay=0.999, epsilon_min=0.01)

gamma = 0.9
n_episodes = 25000
alpha = 0.4
n_episodes_score = 100

### BEGIN SOLUTION
# Train the policy
qlearn_policy.train()
policy_learn_qlearn(env, qlearn_policy, gamma, n_episodes, alpha, max_n_steps=1000, print_every=1500)

### END SOLUTION

### BEGIN SOLUTION
# Evaluate the policy
qlearn_policy.eval()
avg_episodes_to_goal = score_policy(env, gamma, qlearn_policy, n_episodes_score)

### END SOLUTION

print(f'Policy Q-learn: {avg_episodes_to_goal}')



ep: 0, epsilon: 1.000
Policy Q-learn: 0.0, mean"nan

ep: 500, epsilon: 0.995
Policy Q-learn: 0.0, mean"nan

ep: 1000, epsilon: 0.990
Policy Q-learn: 0.0, mean"nan

ep: 1500, epsilon: 0.985
Policy Q-learn: 0.0, mean"nan

ep: 2000, epsilon: 0.980
Policy Q-learn: 0.0, mean"nan

ep: 2500, epsilon: 0.975
Policy Q-learn: 0.0, mean"nan

ep: 3000, epsilon: 0.970
Policy Q-learn: 0.0, mean"nan

ep: 3500, epsilon: 0.966
Policy Q-learn: 0.0, mean"nan

ep: 4000, epsilon: 0.961
Policy Q-learn: 0.0, mean"nan

ep: 4500, epsilon: 0.956
Policy Q-learn: 0.0, mean"nan

ep: 5000, epsilon: 0.951
Policy Q-learn: 0.0, mean"nan

ep: 5500, epsilon: 0.946
Policy Q-learn: 0.0, mean"nan

ep: 6000, epsilon: 0.942
Policy Q-learn: 0.0, mean"nan

ep: 6500, epsilon: 0.937
Policy Q-learn: 0.0, mean"nan

ep: 7000, epsilon: 0.932
Policy Q-learn: 0.0, mean"nan

ep: 7500, epsilon: 0.928
Policy Q-learn: 0.0, mean"nan

ep: 8000, epsilon: 0.923
Policy Q-learn: 0.0, mean"nan

ep: 8500, epsilon: 0.919
Policy Q-learn: 0.0, mean"n

KeyboardInterrupt: ignored

In [134]:
env = gym.make("MountainCar-v0", render_mode="rgb_array")
# Wrap to discretize the observation space and then wrap with the record video to render vdo
env = RecordVideo(ObservationWrapper(env, n_states=n_states, bounds=bounds))
env.reset()
qlearn_policy.eval()
for i in range(3):
    env.render()
    state_next, reward, terminated, truncated = sample_episode(env, qlearn_policy)
    
env.close()

<IPython.core.display.Javascript object>

Here the lower the value of the score the better the agent is

In [135]:
env = RecordVideo(gym.make("Acrobot-v1", render_mode="rgb_array"))
observation, info = env.reset()
i=0
while True:
    env.render()
    
    action = env.action_space.sample() 
    state_next, reward, terminated, truncated, info = env.step(action)
    done = terminated or truncated
    
    if done: 
      break
            
env.close()

<IPython.core.display.Javascript object>

In [136]:
def score_policy(env, gamma, policy, n_episodes): #rewriting the score as the frozen lake
    episodes_to_goal = 0
    steps_to_goal = []

    for episode in range(n_episodes):
        states, actions, rewards, dones = sample_episode(env, policy, reset = True)
        if rewards[-1]==0:
            episodes_to_goal+=1
            steps_to_goal.append(len(rewards))

    return episodes_to_goal/n_episodes, np.mean(steps_to_goal)




In [137]:

env = gym.make("Acrobot-v1", render_mode="rgb_array")

# Wrap to discretize the observation space
n_states = env.observation_space.shape[0]**5
action_space = env.action_space.n
bounds = [(-1,1),(-1,1),(-1,1),(-1,1),(-12.567,12.567),(-28.274, 28.274 )]
env = ObservationWrapper(env, n_states = n_states, bounds = bounds)

### BEGIN SOLUTION
# Initialize policy

observation, info = env.reset()

q_initial = np.zeros((n_states, action_space))
qlearn_policy = EpsilonGreedyPolicy(env.action_space, q_initial,
                                    epsilon=1, epsilon_decay=0.999, epsilon_min=0.001)

gamma = 0.9
n_episodes = 5000
alpha = 0.2
n_episodes_score = 100

### BEGIN SOLUTION
# Train the policy
qlearn_policy.train()
policy_learn_qlearn(env, qlearn_policy, gamma, n_episodes, alpha, max_n_steps=1000, print_every=500)

### END SOLUTION

### BEGIN SOLUTION
# Evaluate the policy
qlearn_policy.eval()
avg_episodes_to_goal, mean_steps = score_policy(env, gamma, qlearn_policy, n_episodes_score)


### END SOLUTION

print(f'Policy Q-learn: {avg_episodes_to_goal:.3%}, avg steps: {mean_steps}')



ep: 0, epsilon: 1.000


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Policy Q-learn: 0.0, mean"nan

ep: 500, epsilon: 0.606
Policy Q-learn: 0.89, mean"317.6179775280899

ep: 1000, epsilon: 0.368
Policy Q-learn: 0.94, mean"291.06382978723406

ep: 1500, epsilon: 0.223
Policy Q-learn: 0.83, mean"360.48192771084337

ep: 2000, epsilon: 0.135
Policy Q-learn: 0.98, mean"270.3775510204082

ep: 2500, epsilon: 0.082
Policy Q-learn: 0.9, mean"318.96666666666664

ep: 3000, epsilon: 0.050
Policy Q-learn: 0.98, mean"253.8673469387755

ep: 3500, epsilon: 0.030
Policy Q-learn: 1.0, mean"250.53

ep: 4000, epsilon: 0.018
Policy Q-learn: 0.91, mean"250.94505494505495

ep: 4500, epsilon: 0.011
Policy Q-learn: 1.0, mean"240.04

Policy Q-learn: 100.000%, avg steps: 229.04


***in this environment the agent tries to increase the number of times it reaches goal also decreases the number of steps it takes as more steps cause more penalty. -1 reward for each steps***

In [138]:
#render with the learned policy
env = gym.make("Acrobot-v1", render_mode="rgb_array")

# Wrap to discretize the observation space
n_states = env.observation_space.shape[0]**5
action_space = env.action_space.n
bounds = [(-1,1),(-1,1),(-1,1),(-1,1),(-12.567,12.567),(-28.274, 28.274 )]
env = ObservationWrapper(env, n_states = n_states, bounds = bounds)
env = RecordVideo(env)

qlearn_policy.eval()
for i in range(2):
    state_next, reward, terminated, truncated = sample_episode(env, qlearn_policy)
    
env.close()

<IPython.core.display.Javascript object>

In [None]:
pip install gymnasium[box2d]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting box2d-py==2.3.5
  Downloading box2d-py-2.3.5.tar.gz (374 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.4/374.4 KB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting swig==4.*
  Downloading swig-4.1.1-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m45.6 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: box2d-py
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py bdist_wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Building wheel for box2d-py (setup.py) ... [?25lerror
[

In [None]:
# env = RecordVideo(gym.make(
#     "LunarLander-v2",
#     continuous = False,
#     gravity = -10.0,
#     enable_wind = False,
#     #wind_power = 15.0,
#     #turbulence_power = 1.5,
#     render_mode="rgb_array"
# ))
# observation, info = env.reset()
# i=0
# while True:
#     env.render()
    
#     action = env.action_space.sample() 
#     state_next, reward, terminated, truncated, info = env.step(action)
#     done = terminated or truncated
#     # print(reward)
#     if done: 
#       break
            
# env.close()

In [None]:
# def score_policy(env, gamma, policy, n_episodes):
#     episodes_to_goal = []
#     steps_to_goal = 0

#     ### BEGIN SOLUTION
#     for episode in range(n_episodes):
#         states, actions, rewards, dones = sample_episode(env, policy, reset = True)
#         if rewards[-1]==100:
#             steps_to_goal+=1
            
#         episodes_to_goal.append(rewards[-1])
        

#     ### END SOLUTION

#     return np.mean(episodes_to_goal), steps_to_goal/n_episodes

# env = gym.make(
#     "LunarLander-v2",
#     continuous = False,
#     gravity = -10.0,
#     enable_wind = False,
#     #wind_power = 15.0,
#     #turbulence_power = 1.5,
#     render_mode="rgb_array"
# )

# # Wrap to discretize the observation space
# n_states = env.observation_space.shape[0]**6
# action_space = env.action_space.n

# bounds = [(-1.5,1.5),(-1.5,1.5), (-5,5),(-5,5),(-3.14,3.14),(-5, 5 ),(-0,1),(-0,1)]
# env = ObservationWrapper(env, n_states = n_states, bounds = bounds)

# ### BEGIN SOLUTION
# # Initialize policy



# observation, info = env.reset()

# q_initial = np.zeros((n_states, action_space))
# qlearn_policy = EpsilonGreedyPolicy(env.action_space, q_initial,
#                                     epsilon=1, epsilon_decay=0.9999999, epsilon_min=0.01)


# ##monte CARLO
# qlearn_policy.train()
# gamma = 0.9
# n_episodes = 262144*4
# q_mc_everyvisit = actionvalue_montecarlo_every(env, gamma, qlearn_policy, n_episodes)
# ###



# # Evaluate the policy
# n_episodes_score = 1000
# qlearn_policy.eval()
# avg_episodes_to_goal, mean_steps = score_policy(env, gamma, qlearn_policy, n_episodes_score)


# ### END SOLUTION

# print(f'Policy Q-learn: {avg_episodes_to_goal}, avg steps: {mean_steps}')



# env = gym.make(
#     "LunarLander-v2",
#     continuous = False,
#     gravity = -10.0,
#     enable_wind = False,
#     #wind_power = 15.0,
#     #turbulence_power = 1.5,
#     render_mode="rgb_array"
# )

# # Wrap to discretize the observation space
# n_states = env.observation_space.shape[0]**6
# action_space = env.action_space.n

# bounds = [(-1.5,1.5),(-1.5,1.5), (-5,5),(-5,5),(-3.14,3.14),(-5, 5 ),(-0,1),(-0,1)]
# env = ObservationWrapper(env, n_states = n_states, bounds = bounds)
# env = RecordVideo(env)

# qlearn_policy.eval()
# for i in range(2):
#     state_next, reward, terminated, truncated = sample_episode(env, qlearn_policy)
    
# env.close()

In [None]:
# gamma = 0.9
# n_episodes = 100000
# alpha = 0.2
# n_episodes_score = 100

# ### BEGIN SOLUTION
# # Train the policy
# qlearn_policy.train()
# policy_learn_qlearn(env, qlearn_policy, gamma, n_episodes, alpha, max_n_steps=1000, print_every=500)

# ### 


Bipedal Walker

In [None]:

env = gym.make("BipedalWalker-v3", hardcore=False, render_mode="rgb_array")
env = RecordVideo(env)
observation, info = env.reset()
i=0
while True:
    env.render()
    
    action = env.action_space.sample() 
    
    state_next, reward, terminated, truncated, info = env.step(action)
    done = terminated or truncated
    print(reward)
    if done: 
      break
            
env.close()

<IPython.core.display.Javascript object>

-0.13131106100169201
-0.10504514113565168
-0.053355680505435855
-0.12401905450224876
-0.15047660104433813
-0.1507453515926973
-0.0967524282808117
-0.07762173374493798
-0.08851536403099815
-0.04148763575156411
-0.005419124841690066
-0.018788550277553384
-0.13217756775021552
-0.05546471520264825
-0.05351612587769708
-0.01847831314305583
-0.07457788129647455
-0.1267793570558242
-0.2258298990925141
-0.15440637425581732
-0.179263650874298
-0.12283068424463271
-0.1774203105767556
-0.0739073179960251
-0.16071921757857124
-0.15155246150493623
-0.07296020946651696
-0.21035502793391428
-0.27647853773832326
-0.26201781948407377
-0.16663666323820867
-0.21175999410947044
-0.21744010615348816
-0.17486588488022606
-0.18982471652825553
-0.20882499287525932
-0.19390969805916033
-0.23806343770027164
-0.24129290365179618
-0.24177127659320832
-0.2556744307279587
-0.19819611120224
-0.14816723992427072
-0.2716240739027671
-0.3259423822561876
-0.27624910162886224
-0.2992626442511865
-0.2786542269537858
-0.24

In [None]:
2/5

0.4

In [None]:
#action space wrapper

class ActionWrapper(gym.ActionWrapper):
    '''
    n_state_per_action(int): number of state per action of the env.
                
    bounds(list of tuples): bounds of the actions of the env.(taken from the gymnasium documentation).
                each tuple gives the bounds of the actions
    '''
    def __init__(self, env, n_state_per_action=25, bounds=[(-1,1),(-1,1),(-1,1),(-1,1)]):
        super().__init__(env)

        self.n_state_per_action = n_state_per_action
        self.bounds = bounds
        assert isinstance(env.observation_space, gym.spaces.box.Box)

        self.dimension = env.action_space.shape[0] 
        self.step_list = []
        for action_bound in bounds:
            step = (action_bound[1] - action_bound[0])/self.n_state_per_action
            self.step_list.append(step)

        
    def get_quant_action(self, new_act):
        
        quantized_act = 0
        dimension = self.dimension
        for i, obs in enumerate(reversed(new_act)):
            quantized_act+= obs*self.n_state_per_action**(dimension-i-1)
        
        return int(quantized_act)

    def action(self, act):
        
        new_act = []
        
        for i,action_state in enumerate(act):
            new_act.append(int((action_state-self.bounds[i][0])/self.step_list[i]))
            
        
        # return new_act
        # print(new_act)
        return self.get_quant_action(new_act)
    
    def decimal_to_action(self, act):
        env_action = []
        while (act > 0):
            env_action.append(act % self.n_state_per_action)
            act = int(act / self.n_state_per_action)
        
        while len(env_action)<len(self.bounds):
            env_action.append(0)

        return env_action

    def get_env_action(self, act):
        env_action = self.decimal_to_action(act)
        # env_action = act
        converted_action_space = []
        for i, actn in enumerate(env_action):
            converted_action_space.append(actn*self.step_list[i] + self.bounds[i][0])

        return converted_action_space


In [None]:
action_wrapper = ActionWrapper(env)
a = action_wrapper.action([-1, 0.25126212, -0.124, 0.9912917 ])
print(a)
action_wrapper.get_env_action(a)

381625


[-1.0, 0.19999999999999996, -0.19999999999999996, 0.9199999999999999]

In [None]:
def get_bounds(low_bounds, high_bounds):
    bounds =[]
    for i,_ in enumerate(low_bounds):
        bounds.append((low_bounds[i], high_bounds[i]))

    return bounds

In [None]:
def sample_episode_bipedal(env,action_wrapper, policy, reset=True):
    states = []
    actions = []
    rewards = []
    dones = []

    
    # If reset, we reset the environment and get an initial state
    # else we set the initial state to it's current state env.env.s
    if reset:
        initial_state,_ = env.reset()
        
    
    done = False
    # Collect the initial state
    states.append(initial_state)
    
    # While the episode has not finished
    while not done:
        
        # Select an action
        act = policy.sample(states[-1])
        actions.append(act)
        action = action_wrapper.get_env_action(act)
        
        # Step the environment
        obs, reward, terminated, truncated, info = env.step(action)
        
        # The episode is done if it has been terminated or truncated
        done = terminated or truncated 

        
        # Collect the state, reward and action taken
        states.append(obs)
        rewards.append(reward)
        dones.append(done)



    ### END SOLUTION

    return states, actions, rewards, dones


In [None]:
def score_policy_bipedal_walker(env,action_wrapper, gamma, policy, n_episodes):
    returns = []
    
    for episode in range(n_episodes):
        states, actions, rewards, dones = sample_episode_bipedal(env,action_wrapper, policy, reset = True)
        returns.append(compute_returns(rewards, gamma)[0])

    return np.mean(returns), rewards[-1]



In [None]:
class EpsilonGreedyPolicy_Bipedal(GreedyPolicy):
    def __init__(self, action_space, q, action_wrapper, epsilon, 
                 epsilon_decay=1, epsilon_min=0):
        super().__init__(action_space, q)
        self.epsilon_start = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        
        self.epsilon = self.epsilon_start
        self.n_actions = q.shape[1]

    def sample(self, state):
        
        if np.random.random()<= self.epsilon and self.training:
            choices = [i for i in range(self.n_actions)]
            action = np.random.choice(choices)
        else:
            action = np.argmax(self.q[state])

        return action

    def begin_episode(self, episode_index):
        
        self.epsilon = self.epsilon_start * (self.epsilon_decay ** episode_index)
        self.epsilon = max(self.epsilon, self.epsilon_min)


In [None]:
import pickle
def policy_learn_qlearn_bipedal(env, policy, gamma, n_episodes, alpha, action_wrapper, max_n_steps=3000, print_every=500,episode_start = 0):
    # No need to create the tables of action-values since they are stored directly in the policy
    policy.train()
    # While we haven't reached the desired number of steps
    for episode in range(n_episodes):
        # Call the policy begin_episode so it can handle epsilon decay
        if episode<episode_start:
            policy.begin_episode(episode)
            continue
        policy.begin_episode(episode)
        
        # Print every couple episodes
        if (not episode % print_every) and episode!=0:
            print(f'ep: {episode}, epsilon: {policy.epsilon:.3f}')
            policy.eval()
            mean, last_reward = score_policy_bipedal_walker(env, action_wrapper, gamma, policy, 1)
            print(f'Policy Q-learn: {mean}, last_reward:{last_reward}')
            print()
            policy.train()

            with open(f'drive/MyDrive/qlearn_policy_bipedal_q_value.pkl', 'wb') as file:
                pickle.dump({episode:qlearn_policy.q}, file)

        # Get initial state
        state, info = env.reset()

        # Take the first action acording to the policy
        action = policy.sample(state)

        # While we haven't reached the maximum number of steps for the episode
        for step in range(max_n_steps):     
            ### BEGIN SOLUTION
            # Perform a step of the environment
            
            state_next,reward,terminated,truncated,info =  env.step(action_wrapper.get_env_action(action))

            # It is done if terminated or truncated
            done = terminated or truncated


            # If episode has finished
            if done:
                # Update the action-value table and leave the loop
                policy.q[state, action] += alpha*(reward + gamma*np.max(policy.q[state_next]) - policy.q[state,action])
                # print(f"episode: {episode}, steps:{step}")
                break

            # Sample the next action
            action_next = policy.sample(state_next)

            # Update the action-value table
            policy.q[state, action] += alpha*(reward + gamma*np.max(policy.q[state_next]) - policy.q[state,action])
            ### END SOLUTION

            # Set the current state and action
            state = state_next
            action = action_next
            
    return


In [None]:
with open(f'drive/MyDrive/qlearn_policy_bipedal_q_value.pkl', 'rb') as file:
    q_dict = pickle.load(file)

q_dict.keys()

dict_keys([8300])

In [None]:
list(q_dict.keys())[0]

8300

In [None]:
env = gym.make("BipedalWalker-v3", hardcore=False, render_mode="rgb_array")
action_wrapper = ActionWrapper(env)

action_space = env.action_space.shape[0] * action_wrapper.n_state_per_action

n_states = env.observation_space.shape[0]**4
high_bounds = env.observation_space.high
low_bounds = env.observation_space.low

bounds = get_bounds(low_bounds, high_bounds)

del(high_bounds)
del(low_bounds)

env = ObservationWrapper(env, n_states = n_states, bounds = bounds)

observation, info = env.reset()

# q_initial = np.zeros((n_states, action_space))
# q_initial = qlearn_policy.q
with open(f'drive/MyDrive/qlearn_policy_bipedal_q_value.pkl', 'rb') as file:
    q_dict = pickle.load(file)

episode_start = list(q_dict.keys())[0]
q_initial = q_dict[episode_start]

qlearn_policy = EpsilonGreedyPolicy_Bipedal(env.action_space, q_initial, action_wrapper,
                                    epsilon=1, epsilon_decay=0.999999, epsilon_min=0.01)
del(q_initial)
gamma = 0.9
n_episodes = 500000
alpha = 0.2
n_episodes_score = 100

qlearn_policy.train()
policy_learn_qlearn_bipedal(env, qlearn_policy, gamma, n_episodes, alpha, action_wrapper, max_n_steps=2000, print_every=100, episode_start = episode_start)


ep: 19900, epsilon: 0.980
Policy Q-learn: -1.8652467061996947, last_reward:-0.09183999999999999

ep: 20000, epsilon: 0.980
Policy Q-learn: -2.4489584925382677, last_reward:-100

ep: 20100, epsilon: 0.980
Policy Q-learn: -1.990435805072057, last_reward:-0.10751999999999999

ep: 20200, epsilon: 0.980
Policy Q-learn: -2.4072530720983183, last_reward:-100

ep: 20300, epsilon: 0.980
Policy Q-learn: -2.608881240077515, last_reward:-0.08512

ep: 20400, epsilon: 0.980
Policy Q-learn: -3.0035051913608473, last_reward:-100

ep: 20500, epsilon: 0.980
Policy Q-learn: -2.3456786931647837, last_reward:-100

ep: 20600, epsilon: 0.980
Policy Q-learn: -2.3404400761614395, last_reward:-0.08736

ep: 20700, epsilon: 0.980
Policy Q-learn: -2.0204270605864565, last_reward:-0.09856

ep: 20800, epsilon: 0.979
Policy Q-learn: -1.9552241587942167, last_reward:-0.10751999999999999



KeyboardInterrupt: ignored

In [None]:
env = gym.make("BipedalWalker-v3", hardcore=False, render_mode="rgb_array")
env = ObservationWrapper(env, n_states = n_states, bounds = bounds)
env = RecordVideo(env)
observation, info = env.reset()

qlearn_policy.eval()
for i in range(1):
    env.render()
    states, actions, rewards, dones = sample_episode_bipedal(env, action_wrapper, qlearn_policy)
# print(rewards)
env.close()


<IPython.core.display.Javascript object>