In [1]:
import gym
import numpy as np
import time
import random 


from bokeh.plotting import figure, show
from bokeh.io import output_notebook

output_notebook()

env_name = 'FrozenLake-v1'


In [2]:
def render_single(env, policy, max_steps=100):
    """
    Renders policy for an environment.

    Parameters
    ----------
    env:    gym.core.Environment, open gym environment object
    policy: np.array of shape [env.nS], the action to take at a given state
    """
    episode_reward = 0
    ob = env.reset()
    for t in range(max_steps):
        env.render()
        time.sleep(0.25)
        a = policy[ob]
        ob, reward, done, _ = env.step(a)
        episode_reward += reward
        if done:
            break
    env.render();
    if not done:
        print("The agent didn't reach a terminal state in {} steps.".format(max_steps))
    else:
        print("Episode reward: %f" % episode_reward)
        

def set_random_seeds(env):
    np.random.seed(1)
    random.seed(1)
    env.seed(1)

In [3]:
class QLearning:
    '''Implements Off-policy control with Q Learning.'''
    def __init__(self, env, num_states, num_actions, alpha, epsilon, gamma):
        '''Parameters
        ----------
        env:         gym.core.Environment, open gym environment object
        num_states:  integer, number of states in the environment
        num_actions: integer, number of possible actions
        alpha:       float, step size, (0, 1]
        epsilon:     float, the epsilon parameter used for exploration
        gamma:       float, discount factor, small > 0
        '''
        self.env = env
        self.num_states = num_states
        self.num_actions = num_actions
        self.alpha = alpha
        self.epsilon = epsilon
        self.gamma = gamma
        
        self.Q = np.zeros((self.num_states, self.num_actions))
        
        self.env.reset()
        
        
    def run_q_learning(self, num_episodes, verbose=True):
        '''Runs Q learning
        
        Parameters
        ----------
        num_episodes: integer, number of episodes to run to train RL agent
        
        Returns
        ----------
        self.policy:         list of integers of length self.num_states, final policy
        '''
        terminated = False
        
        for i in range(num_episodes):
            self.env.reset()
            state = np.random.choice(self.num_states, 1)[0]
            terminated = False
            while not terminated:
                action, next_state, reward, terminated = self.generate_next_step(state)
                # print(f'state {state} -> action {action} -> next state {next_state}, reward {reward}')
                self.evaluate_policy(state, action, reward, next_state)

                state = next_state

        
        # Once training is finished, calculate and return policy using argmax approach
        final_policy = np.argmax(self.Q, axis=1)
        
        return final_policy
    
    
    def generate_next_step(self, state):
        '''Generates episode given policy. Calculates r_t and s_t+1
        
        Parameters
        ----------
        policy: list of integers of length self.num_states, the action to take at a given state
        
        Returns
        ----------
        state_action_reward: list of tuple (state, action, reward)
        '''
        random_action = self.env.action_space.sample()
        action = self.get_epsilon_greedy_action(state, random_action)
        
        observation, reward, terminated, _ = self.env.step(action)

        return (action, observation, reward, terminated)
    
    
    def evaluate_policy(self, state, action, reward, next_state):
        '''Updates action value function self.Q.

        Parameters
        ----------
        state:      int
        action:     int
        reward:     float
        next_state: int
        '''

        
        est_reward = reward + self.gamma * np.max(self.Q[next_state])
        self.Q[state][action] = self.Q[state][action] + self.alpha * (est_reward - self.Q[state][action])
        
                

    def argmax(self, state: int) -> int:
        """
        Finds and returns greedy action.

        Parameters
        ----------
        state: int, state for which greedy action should be selected
        
        Returns
        ----------
        action: int, corresponds to the index of the greedy action

        """
        return int(np.argmax(self.Q[state]))

    
    def get_epsilon_greedy_action(self, state, random_action):
        '''Returns next action using epsilon greedy approach.
        
        Parameters
        ----------
        greedy_action: integer, greedy action (action with a maximum Q value)
        
        Returns
        ----------
        next_action: integer, either greedy or random action
        '''   
        prob = np.random.random()

        if prob < (1 - self.epsilon):
            return self.argmax(state)
            
        return random_action


# Tests

In [10]:
import ipytest
import pytest
ipytest.autoconfig()

import random

random.seed(10)
np.random.seed(0)

In [19]:
%%ipytest -qq

np.random.seed(1)
 
@pytest.fixture
def this_env():
    env = gym.make(env_name, is_slippery=False)
    env.seed(0)
    
    yield env

@pytest.fixture
def q_learning_instance(this_env):
    alpha = 1
    epsilon = 0.8
    gamma = 1
    num_states = this_env.observation_space.n
    num_actions = this_env.action_space.n
    yield QLearning(this_env, num_states, num_actions, alpha, epsilon, gamma)
    

def test_argmax(q_learning_instance):
    res = q_learning_instance.argmax(0)
    assert isinstance(res, int)
    assert res == 0
    
    
def test_get_epsilon_greedy_action(q_learning_instance):
    action = q_learning_instance.get_epsilon_greedy_action(0, 3)
    assert isinstance(action, int)
    action == 0
  
    
def test_argmax(this_env, q_learning_instance):
    action = q_learning_instance.argmax(1)
    assert action < this_env.observation_space.n
    assert action >= 0

    
def test_evaluate_policy():
    pass
    

[32m.[0m[32m.[0m[32m                                                                                           [100%][0m


# Frozen lake

In [28]:
np.random.seed(1)

alpha = 0.1
epsilon = 0.9
gamma = 0.9
n_episodes = 1000

env = gym.make('FrozenLake-v1', desc=None, map_name="4x4", is_slippery=False)
env.seed(11)
np.random.seed(1)
num_states = env.observation_space.n
num_actions = env.action_space.n
print(f"Starting environment with {num_states} states and {num_actions} actions")

set_random_seeds(env)
model = QLearning(env, num_states, num_actions, alpha, epsilon, gamma)

policy = model.run_q_learning(n_episodes)

Starting environment with 16 states and 4 actions


In [29]:
policy

array([2, 2, 1, 0, 1, 1, 1, 3, 2, 2, 1, 3, 2, 2, 2, 0])

In [30]:
model.Q

array([[0.27379148, 0.29974549, 0.3069626 , 0.27317711],
       [0.27326965, 0.14937472, 0.34615447, 0.30359087],
       [0.29977181, 0.40280248, 0.29211742, 0.33363007],
       [0.31636824, 0.13831437, 0.27908861, 0.28419455],
       [0.29078198, 0.33715916, 0.15411133, 0.27281344],
       [0.14429971, 0.16237924, 0.12885372, 0.15948293],
       [0.13468874, 0.48267985, 0.16574233, 0.31258232],
       [0.08201737, 0.1604162 , 0.10615144, 0.16551707],
       [0.31389383, 0.20147118, 0.384174  , 0.28455738],
       [0.30264767, 0.39628041, 0.43730757, 0.0971202 ],
       [0.2932352 , 0.57929146, 0.18547948, 0.32931879],
       [0.08759228, 0.10288266, 0.15527208, 0.1977573 ],
       [0.12651585, 0.06626141, 0.21892148, 0.06002865],
       [0.1526231 , 0.33106709, 0.57039357, 0.24717837],
       [0.35280627, 0.50586413, 0.81224827, 0.37521828],
       [0.14559758, 0.09573911, 0.0680775 , 0.13749406]])

In [31]:
render_single(env, policy, 200)


[41mS[0mFFF
FHFH
FFFH
HFFG
  (Right)
S[41mF[0mFF
FHFH
FFFH
HFFG
  (Right)
SF[41mF[0mF
FHFH
FFFH
HFFG
  (Down)
SFFF
FH[41mF[0mH
FFFH
HFFG
  (Down)
SFFF
FHFH
FF[41mF[0mH
HFFG
  (Down)
SFFF
FHFH
FFFH
HF[41mF[0mG
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m
Episode reward: 1.000000


# Cliff Walking

In [4]:


np.random.seed(1)

alpha = 0.1
epsilon = 0.9
gamma = 0.9
n_episodes = 1000

env = gym.make('CliffWalking-v0')
env.seed(11)
np.random.seed(1)
num_states = env.observation_space.n
num_actions = env.action_space.n
print(f"Starting environment with {num_states} states and {num_actions} actions")

set_random_seeds(env)
model = QLearning(env, num_states, num_actions, alpha, epsilon, gamma)

policy = model.run_q_learning(n_episodes)

Starting environment with 48 states and 4 actions


In [5]:
render_single(env, policy, 200)

o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
x  C  C  C  C  C  C  C  C  C  C  T

o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
x  o  o  o  o  o  o  o  o  o  o  o
o  C  C  C  C  C  C  C  C  C  C  T

o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  x  o  o  o  o  o  o  o  o  o  o
o  C  C  C  C  C  C  C  C  C  C  T

o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  o  x  o  o  o  o  o  o  o  o  o
o  C  C  C  C  C  C  C  C  C  C  T

o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  x  o  o  o  o  o  o  o  o
o  C  C  C  C  C  C  C  C  C  C  T

o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  x  o  o  o  o  o  o  o
o  C  C  C  C  C  C  C  C  C  C  T

o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  x  o  o  o  o  o  o
o  C  C  C  C  C  C  C  C  C  C  T

o  o  o  o  o

# Taxi Problem

In [6]:
np.random.seed(1)

alpha = 0.1
epsilon = 0.9
gamma = 0.9
n_episodes = 1000

env = gym.make('Taxi-v3')
env.seed(11)
np.random.seed(1)
num_states = env.observation_space.n
num_actions = env.action_space.n
print(f"Starting environment with {num_states} states and {num_actions} actions")

set_random_seeds(env)
model = QLearning(env, num_states, num_actions, alpha, epsilon, gamma)

policy = model.run_q_learning(n_episodes)

Starting environment with 500 states and 6 actions


In [7]:
render_single(env, policy, 200)

+---------+
|R: | : :[34;1mG[0m|
| : |[43m [0m: : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+

+---------+
|R: | : :[34;1mG[0m|
| : | :[43m [0m: |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (East)
+---------+
|R: | : :[34;1mG[0m|
| : | : :[43m [0m|
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (East)
+---------+
|R: | : :[34;1m[43mG[0m[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (North)
+---------+
|R: | : :[42mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (Pickup)
+---------+
|R: | :[42m_[0m:G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (West)
+---------+
|R: | : :G|
| : | :[42m_[0m: |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (South)
+---------+
|R: | : :G|
| : | : : |
| : : :[42m_[0m: |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (South)
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : |[42m_[0m: |
|Y