In [1]:
import gym
import numpy as np
import random
import itertools
import time

from IPython.display import clear_output

from lake_envs import *

In [5]:
def render_single(env, policy, max_steps=100):
    """
    Renders one episode of `policy` on `env`. Watch your agent play!

    The environment is rendered before every step and once more after the
    loop ends, with a 0.25 second pause between frames. A summary line is
    printed at the end.

    Parameters
    ----------
    env: gym.core.Environment
      Environment to play on. Must provide reset(), render() and step(a),
      where step returns a 4-tuple (observation, reward, done, info).
    policy: indexable by state (e.g. np.array of shape [env.nS])
      The action to take at a given state
    max_steps: int
      Upper bound on the number of steps played before giving up.
    """
    episode_reward = 0
    # Bind `done` before the loop so the final check is well defined even
    # when max_steps is 0 and the loop body never runs.
    done = False
    ob = env.reset()
    for _ in range(max_steps):
        env.render()
        time.sleep(0.25)
        a = policy[ob]
        ob, rew, done, _ = env.step(a)
        episode_reward += rew
        if done:
            break
    # Show the final frame (the state reached by the last step).
    env.render()
    if not done:
        print("The agent didn't reach a terminal state in {} steps.".format(max_steps))
    else:
        print("Episode reward: %f" % episode_reward)

In [27]:
#game = gym.make('FrozenLake-v0')


def epsilon_greedy(greedy_action, num_actions, epsilon):
    """
    Picks an action with an epsilon-greedy rule.

    Parameters
    ----------
    greedy_action: the action returned when the draw favours exploitation
    num_actions: int, exclusive upper bound for the random action index
    epsilon: float, a uniform draw below 1 - epsilon keeps the greedy action

    Returns
    ----------
    greedy_action, or an integer drawn uniformly from [0, num_actions).
    The random draw may coincide with the greedy action.
    """
    draw = np.random.random()
    exploit = draw < 1 - epsilon

    if not exploit:
        return np.random.randint(0, num_actions)

    return greedy_action


def argmax(q_values):
    """
    Finds and returns greedy policy.

    Parameters
    ----------
    q_values: nested dictionary {state: {action: q_val}}
      States are assumed to be the integers 0 .. len(q_values) - 1, since
      they are used as indices into the returned list.

    Returns
    ----------
    policy: The action to take at a given state, list of length num_state.
      Ties are broken in favour of the action that appears first in the
      inner dictionary. A state with no actions maps to None.

    """
    # Build the policy locally. Writing into an undeclared `policy` would
    # resolve to a module-level global, which either does not exist yet
    # (NameError on a fresh kernel) or is silently mutated.
    policy = [None] * len(q_values)

    for s, action_values in q_values.items():
        best_a = None
        best_G = float('-inf')
        for a, G in action_values.items():
            # Strict comparison keeps the first action among equal values.
            if G > best_G:
                best_G = G
                best_a = a
        policy[s] = best_a

    return policy

    
def play_game(env, policy, epsilon, gamma, num_actions):
    """
    Plays one episode with an epsilon-greedy version of `policy` and
    computes the discounted return that follows every (state, action) pair.

    Parameters
    ----------
    env: environment providing reset() and step(a); step must return a
      4-tuple (state, reward, terminated, info)
    policy: indexable by state, gives the greedy action for that state
    epsilon: float, exploration parameter forwarded to epsilon_greedy
    gamma: float, discount factor
    num_actions: int, number of actions forwarded to epsilon_greedy

    Returns
    ----------
    state_action_return: list of (state, action, G) tuples in the order the
      pairs were visited during the episode. G is the discounted sum of the
      rewards received after taking `action` in `state`. The terminal state
      is not included because no action is taken there.
    """
    s = env.reset()
    a = epsilon_greedy(policy[s], num_actions, epsilon)

    # Each entry stores the reward received on ARRIVING at the state, i.e.
    # the reward belongs to the previous state and action. The start state
    # therefore carries a reward of 0.
    state_action_reward = [(s, a, 0)]
    while True:
        s, r, terminated, _ = env.step(a)
        if terminated:
            state_action_reward.append((s, None, r))
            break
        a = epsilon_greedy(policy[s], num_actions, epsilon)
        state_action_reward.append((s, a, r))

    # Walk the episode backwards, accumulating the discounted return.
    G = 0
    first = True
    state_action_return = []
    for s, a, r in reversed(state_action_reward):
        if first:
            # The terminal entry has no action, hence no return to record.
            first = False
        else:
            state_action_return.append((s, a, G))

        G = gamma * G + r

    # Restore chronological order exactly once, after the loop. Reversing
    # inside the loop would flip the list on every iteration and scramble
    # the visit order that first-visit updates depend on.
    state_action_return.reverse()

    return state_action_return
    
    
def monte_carlo(env, epsilon, gamma, num_episodes, num_states, num_actions):
    """
    Learns a policy from sampled episodes by averaging observed returns.

    Starting from a random policy, each episode is generated with
    play_game. For every (state, action) pair, only its first occurrence in
    the episode's list updates the running mean stored in Q. After each
    episode the policy is replaced by argmax(Q).

    Parameters
    ----------
    env: environment forwarded to play_game
    epsilon: float, exploration parameter forwarded to play_game
    gamma: float, discount factor forwarded to play_game
    num_episodes: int, number of episodes to sample
    num_states: int, states are taken to be 0 .. num_states - 1
    num_actions: int, actions are taken to be 0 .. num_actions - 1

    Returns
    ----------
    policy: the result of the last argmax(Q) call, or the initial random
      array when num_episodes is 0
    """
    policy = np.random.choice(num_actions, num_states)

    # Action-value estimates and per-pair update counters, all zeroed.
    Q = {
        state: {action: 0 for action in range(num_actions)}
        for state in range(num_states)
    }
    visit_count = {
        state: {action: 0 for action in range(num_actions)}
        for state in range(num_states)
    }

    for _ in range(num_episodes):
        episode = play_game(env, policy, epsilon, gamma, num_actions)

        already_updated = set()
        for state, action, ret in episode:
            pair = (state, action)
            if pair in already_updated:
                continue

            # Incremental mean: move the estimate towards the new return.
            visit_count[state][action] += 1
            Q[state][action] += (ret - Q[state][action]) / visit_count[state][action]
            already_updated.add(pair)

        policy = argmax(Q)

    return policy

    
# Hyperparameters for the Monte Carlo run below.
epsilon = 0.4  # exploration parameter handed to epsilon_greedy via play_game
gamma = 0.9  # discount factor used when accumulating returns
n_episodes = 4000  # number of episodes sampled by monte_carlo
# NOTE(review): this environment id is presumably registered by lake_envs — confirm.
env = gym.make("Deterministic-4x4-FrozenLake-v0")


# Sizes of the state and action spaces, read from the environment.
num_states = env.nS
num_actions = env.nA

# Train; `policy` is reused by the rendering cell that follows.
policy = monte_carlo(env, epsilon, gamma, n_episodes, num_states, num_actions)

In [28]:
# Replay the learned policy once, for at most 100 steps, rendering every frame.
render_single(env, policy, 100)


[41mS[0mFFF
FHFH
FFFH
HFFG
  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Down)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Right)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG
  (Right)
SFFF
FHFH
FFFH
HF[41mF[0mG
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m
Episode reward: 1.000000
