In [6]:
import gym
import numpy as np
import time
import random 


from bokeh.plotting import figure, show
from bokeh.io import output_notebook

output_notebook()

env_name = 'FrozenLake-v1'


In [7]:
def render_single(env, policy, max_steps=100):
    """
    Renders policy for an environment.

    Parameters
    ----------
    env:    gym.core.Environment, open gym environment object
    policy: np.array of shape [env.nS], the action to take at a given state
    """
    episode_reward = 0
    ob = env.reset()
    for t in range(max_steps):
        env.render()
        time.sleep(0.25)
        a = policy[ob]
        ob, reward, done, _ = env.step(a)
        episode_reward += reward
        if done:
            break
    env.render();
    if not done:
        print("The agent didn't reach a terminal state in {} steps.".format(max_steps))
    else:
        print("Episode reward: %f" % episode_reward)
        

def set_random_seeds(env):
    np.random.seed(1)
    random.seed(1)
    env.seed(1)

In [4]:
class QLearning:
    '''Implements Off-policy control with Q Learning.'''
    def __init__(self, env, num_states, num_actions, alpha, epsilon, gamma):
        '''Parameters
        ----------
        env:         gym.core.Environment, open gym environment object
        num_states:  integer, number of states in the environment
        num_actions: integer, number of possible actions
        alpha:       float, step size, (0, 1]
        epsilon:     float, the epsilon parameter used for exploration
        gamma:       float, discount factor, small > 0
        '''
        self.env = env
        self.num_states = num_states
        self.num_actions = num_actions
        self.alpha = alpha
        self.epsilon = epsilon
        self.gamma = gamma
        
        self.Q = np.zeros((self.num_states, self.num_actions))
        
        self.env.reset()
        
        
    def run_q_learning(self, num_episodes, verbose=True):
        '''Runs Q learning
        
        Parameters
        ----------
        num_episodes: integer, number of episodes to run to train RL agent
        
        Returns
        ----------
        self.policy:         list of integers of length self.num_states, final policy
        '''
        terminated = False
        
        for i in range(num_episodes):
            self.env.reset()
            state = np.random.choice(self.num_states, 1)[0]
            terminated = False
            while not terminated:
                action, next_state, reward, terminated = self.generate_next_step(state)
                # print(f'state {state} -> action {action} -> next state {next_state}, reward {reward}')
                self.evaluate_policy(state, action, reward, next_state)
                if reward > 0:
                    print(f'episode number: {i}')
                    
                    print(self.Q)
                    print(f'state {state} -> action {action} -> next state {next_state}, reward {reward}')
                    

                state = next_state

        
        # Once training is finished, calculate and return policy using argmax approach
        final_policy = np.argmax(self.Q, axis=1)
        
        return final_policy
    
    
    def generate_next_step(self, state):
        '''Generates episode given policy. Calculates r_t and s_t+1
        
        Parameters
        ----------
        policy: list of integers of length self.num_states, the action to take at a given state
        
        Returns
        ----------
        state_action_reward: list of tuple (state, action, reward)
        '''
        random_action = self.env.action_space.sample()
        action = self.get_epsilon_greedy_action(state, random_action)
        
        observation, reward, terminated, _ = self.env.step(action)

        return (action, observation, reward, terminated)
    
    
    def evaluate_policy(self, state, action, reward, next_state):
        '''Updates action value function self.Q.

        Parameters
        ----------
        state:      int
        action:     int
        reward:     float
        next_state: int
        '''

        
        est_reward = reward + self.gamma * np.max(self.Q[next_state])
        self.Q[state][action] = self.Q[state][action] + self.alpha * (est_reward - self.Q[state][action])
        
                

    def argmax(self, state: int) -> int:
        """
        Finds and returns greedy action.

        Parameters
        ----------
        state: int, state for which greedy action should be selected
        
        Returns
        ----------
        action: int, corresponds to the index of the greedy action

        """
        return int(np.argmax(self.Q[state]))

    
    def get_epsilon_greedy_action(self, state, random_action):
        '''Returns next action using epsilon greedy approach.
        
        Parameters
        ----------
        greedy_action: integer, greedy action (action with a maximum Q value)
        
        Returns
        ----------
        next_action: integer, either greedy or random action
        '''   
        prob = np.random.random()

        if prob < (1 - self.epsilon):
            return self.argmax(state)
            
        return random_action


# Tests

In [4]:
import ipytest
import pytest
ipytest.autoconfig()

import random

random.seed(10)
np.random.seed(0)

In [45]:
%%ipytest -qq

np.random.seed(1)
 
@pytest.fixture
def this_env():
    env = gym.make(env_name, is_slippery=False)
    env.seed(0)
    
    yield env

@pytest.fixture
def q_learning_instance(this_env):
    alpha = 1
    epsilon = 0.8
    gamma = 1
    num_states = this_env.observation_space.n
    num_actions = this_env.action_space.n
    yield QLearning(this_env, num_states, num_actions, alpha, epsilon, gamma)
    

def test_argmax(q_learning_instance):
    res = q_learning_instance.argmax(0)
    assert isinstance(res, int)
    assert res == 0
    
    
def test_get_epsilon_greedy_action(q_learning_instance):
    action = q_learning_instance.get_epsilon_greedy_action(3)
    assert isinstance(action, int)
    action == 0
    
# def test_get_epsilon_greedy_action(mock_env, q_learning_instance):
#     q_learning_instance.init_agent()
#     action = q_learning_instance.get_epsilon_greedy_action(3)
#     assert action < mock_env.num_actions
#     assert action >= 0

    
# def test_argmax(mock_env, q_learning_instance):
#     q_learning_instance.init_agent()
#     action = q_learning_instance.argmax(1)
#     assert action < mock_env.num_actions
#     assert action >= 0
    

[32m.[0m[32m.[0m[32m                                                                                           [100%][0m


In [21]:
np.random.seed(1)

alpha = 0.1
epsilon = 0.9
gamma = 0.9
n_episodes = 1000

env = gym.make('FrozenLake-v1', desc=None, map_name="4x4", is_slippery=False)
env.seed(11)
np.random.seed(1)
num_states = env.observation_space.n
num_actions = env.action_space.n
print(f"Starting environment with {num_states} states and {num_actions} actions")

set_random_seeds(env)
model = QLearning(env, num_states, num_actions, alpha, epsilon, gamma)

policy = model.run_q_learning(n_episodes)

Starting environment with 16 states and 4 actions
episode number: 95
[[0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.1 0. ]
 [0.  0.  0.  0. ]]
state 14 -> action 2 -> next state 15, reward 1.0
episode number: 123
[[2.63541592e-07 4.17659482e-06 2.09901191e-07 9.06319535e-07]
 [6.92850845e-07 0.00000000e+00 1.86535791e-07 1.24835491e-08]
 [4.29309253e-08 3.93797781e-06 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 1.71242100e-05 0.00000000e+00 6.13080966e-07]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 5.85766080e-05 0.00000000e+00 1.86535791e-07]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.71242100e-05 0.00000000e+00 1.90269000e-04 2.635

In [22]:
policy

array([2, 2, 1, 0, 1, 0, 1, 3, 2, 2, 1, 0, 3, 2, 2, 3])

In [23]:
model.Q

array([[0.17715467, 0.18718024, 0.22355253, 0.18663861],
       [0.18124409, 0.09587048, 0.2559526 , 0.20626927],
       [0.19567832, 0.30496429, 0.16930608, 0.2349059 ],
       [0.22396132, 0.07991399, 0.15984125, 0.16191753],
       [0.1839361 , 0.21359967, 0.08931205, 0.17617787],
       [0.11950351, 0.07319217, 0.08135434, 0.0835514 ],
       [0.09373659, 0.39598394, 0.09093848, 0.17630057],
       [0.0494229 , 0.06267889, 0.08080869, 0.09367579],
       [0.19911509, 0.06664114, 0.24449192, 0.17503692],
       [0.14436153, 0.25752158, 0.33154844, 0.07796355],
       [0.19521238, 0.52848793, 0.09329674, 0.18887904],
       [0.10163008, 0.10025029, 0.06659368, 0.0718607 ],
       [0.06749825, 0.03809295, 0.06201159, 0.07359099],
       [0.06535923, 0.15510909, 0.44345113, 0.14144531],
       [0.15426472, 0.45982963, 0.77983941, 0.24598367],
       [0.07566969, 0.05927948, 0.06569119, 0.09218157]])

In [24]:
render_single(env, policy, 200)


[41mS[0mFFF
FHFH
FFFH
HFFG
  (Right)
S[41mF[0mFF
FHFH
FFFH
HFFG
  (Right)
SF[41mF[0mF
FHFH
FFFH
HFFG
  (Down)
SFFF
FH[41mF[0mH
FFFH
HFFG
  (Down)
SFFF
FHFH
FF[41mF[0mH
HFFG
  (Down)
SFFF
FHFH
FFFH
HF[41mF[0mG
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m
Episode reward: 1.000000
