# 3. Monte Carlo Methods

Monte Carlo methods have the benefit of not requiring a model of the environment. Instead, they estimate the value function from sample returns: start from a state several times, run the policy, and average the reward the agent collected. Depending on the size of the state space, these methods can need many episodes before the estimate for every state is reliable.

### Monte Carlo Policy Evaluation

In [1]:
import gym
import numpy as np
from collections import defaultdict

# Build the environment and read the sizes of its discrete state and action
# sets; these sizes fix the shapes of the value tables below.
env_name = 'FrozenLake-v0'
env = gym.make(env_name)
state_space = env.observation_space.n
action_space = env.action_space.n

# Random starting policy: one action index per state. The number of actions
# is taken from the environment rather than hard-coded.
policy = np.random.randint(action_space, size=state_space)
discount_factor = 1.0
num_episodes = 500
v = np.zeros(state_space)
q = np.zeros((state_space, action_space))
# mc_policy_evaluation explores whenever a uniform draw falls below epsilon,
# so 1.0 means every action of an episode is chosen at random.
epsilon = 1.0

# Running sums and visit counts of sampled returns, keyed by state (``_v``)
# or by (state, action) (``_q``). mc_policy_evaluation reads and updates these
# module-level dictionaries, so they persist across calls.
returns_sum_v = defaultdict(float)
returns_count_v = defaultdict(float)
returns_sum_q = defaultdict(float)
returns_count_q = defaultdict(float)

def mc_policy_evaluation(policy, env, num_episodes, v, q, discount_factor, epsilon, max_steps=100):
    """Estimate state and action values from sampled episodes (first-visit Monte Carlo).

    Every episode is generated with an epsilon-greedy version of ``policy``:
    when a uniform draw falls below ``epsilon`` a random action is taken,
    otherwise ``policy[state]``. For each state that occurs in the episode,
    the discounted return from its first occurrence onwards is added to the
    module-level accumulators ``returns_sum_v`` / ``returns_count_v`` and
    ``returns_sum_q`` / ``returns_count_q``, and the resulting running
    averages are written into ``v`` and ``q``.

    Args:
        policy: Indexable object mapping a state index to an action index.
        env: Environment whose ``reset()`` returns the start state and whose
            ``step(action)`` returns ``(next_state, reward, done, info)``.
        num_episodes: Number of episodes to sample.
        v: 1-D array of state values; updated in place.
        q: 2-D array indexed ``[state, action]``; updated in place. Its second
            dimension is the number of actions exploration draws from.
        discount_factor: Factor applied per time step when summing rewards.
        epsilon: Probability threshold for taking a random action.
        max_steps: Maximum number of steps per episode before it is cut off.

    Returns:
        The tuple ``(v, q)`` - the same array objects that were passed in.

    Note:
        The accumulators are module-level dictionaries, so averages carry
        over between calls. Within one episode ``q`` is only updated for the
        action taken on the first visit to each state.
    """
    # Derive the action count from the Q-table instead of assuming 4 actions.
    num_actions = q.shape[1]

    for _ in range(num_episodes):
        states, actions, rewards = [], [], []
        state = env.reset()

        # Roll out one episode, recording (state, action, reward) per step.
        for _step in range(max_steps):
            if np.random.uniform() < epsilon:
                action = np.random.choice(num_actions)  # exploration
            else:
                action = policy[state]  # exploitation
            next_state, reward, done, _ = env.step(action)

            states.append(int(state))
            actions.append(int(action))
            rewards.append(reward)
            if done:
                break
            state = next_state

        # Discounted return from every time step, computed in one backward
        # pass: G_t = r_t + discount_factor * G_{t+1}.
        returns = [0.0] * len(rewards)
        running_return = 0.0
        for t in range(len(rewards) - 1, -1, -1):
            running_return = rewards[t] + discount_factor * running_return
            returns[t] = running_return

        # First-visit update: only the earliest occurrence of each state in
        # the episode contributes a return sample.
        visited = set()
        for t, state in enumerate(states):
            if state in visited:
                continue
            visited.add(state)
            action = actions[t]
            G = returns[t]

            returns_sum_v[state] += G
            returns_count_v[state] += 1.0
            v[state] = returns_sum_v[state] / returns_count_v[state]

            returns_sum_q[state, action] += G
            returns_count_q[state, action] += 1.0
            q[state, action] = returns_sum_q[state, action] / returns_count_q[state, action]

    return v, q

# Run Monte Carlo evaluation with the settings defined above and keep the
# resulting state-value and action-value tables.
v, q = mc_policy_evaluation(
    policy=policy,
    env=env,
    num_episodes=num_episodes,
    v=v,
    q=q,
    discount_factor=discount_factor,
    epsilon=epsilon,
)

### Monte Carlo Control
Like policy iteration, the agent alternates between improving the estimate of the value function and improving the policy by acting greedily with respect to the improved value function. Using the state-value function has one problem: the greedy step requires a model of the environment, whereas Monte Carlo methods aim to be model-free: $\pi'(s) = \arg\max_a \left( R_s^a + \sum_{s'} P_{ss'}^a \, v(s') \right)$. <br>
The alternative is to use Q, which does not require an environment model: $\pi'(s) = \arg\max_a Q(s,a)$

In [2]:
# Hyper-parameters for the control run. Each name is assigned exactly once;
# 500 episodes per evaluation round is the value that was actually in effect.
discount_factor = 1.0
num_episodes = 500
epsilon = 1.0

# Fresh value tables and a fresh random starting policy (one action per state,
# drawn from the environment's own action count).
q = np.zeros((state_space, action_space))
v = np.zeros(state_space)
policy = np.random.randint(action_space, size=state_space)

# mc_policy_evaluation averages returns through these module-level
# accumulators. Empty them so the freshly zeroed tables are not immediately
# overwritten with averages that include returns from earlier runs.
for accumulator in (returns_sum_v, returns_count_v, returns_sum_q, returns_count_q):
    accumulator.clear()

# Alternate evaluation and greedy improvement, shrinking the exploration rate
# a little after every round.
for i in range(1000):
    v, q = mc_policy_evaluation(policy, env, num_episodes, v, q, discount_factor, epsilon)
    policy = np.argmax(q, axis=1)
    epsilon *= 0.995

### Visualization of the state value function

In [9]:
def render_value_fct_array(value_fct_array):
    """Print a flat value array as a square grid.

    The side length is the integer part of the square root of the array's
    first dimension, so the array is expected to hold side * side entries.
    """
    side = int(np.sqrt(value_fct_array.shape[0]))
    print(value_fct_array.reshape(side, side))

# Allow wide lines so each grid row is printed on a single line, then show the
# learned state values followed by the environment's current board.
np.set_printoptions(linewidth=150)
render_value_fct_array(v)
# NOTE(review): env.render() appears to draw the board itself and return None
# here (the recorded output ends with "None"); confirm whether the extra
# print is wanted.
board = env.render()
print(board)

[[0.4011143  0.21144849 0.23839718 0.09157032]
 [0.42370662 0.         0.26209566 0.        ]
 [0.48507557 0.5635991  0.54627573 0.        ]
 [0.         0.68083442 0.8302095  0.        ]]
  (Right)
SFFF
FHF[41mH[0m
FFFH
HFFG
None


### Testing the policy in 1000 games

In [10]:
def test_policy(env, policy):
    state = env.reset()
    reward_sum = 0
    for i in range(100):
        action = policy[state]
        state, reward, done, _ = env.step(action)
        reward_sum += reward
        if(done):
            break
    return reward_sum

# Play 1000 games with the current policy and print the mean reward per game.
n_games = 1000
reward_all = sum(test_policy(env, policy) for _ in range(n_games))

print(reward_all / float(n_games))

0.722
