The purpose of this notebook is to demonstrate the Monte Carlo method by collecting experience from complete episodes in a small maze environment.

The aim is for the student to understand the following features of Monte Carlo methods:

- high variance
- inefficient in terms of using experience
- lack of any bootstrapping
- lack of environment model 

In [1]:
import numpy as np

In [52]:
#  we make use of a class for the Maze Markov Decision Process
#  I have incorporated the logic from the dynamic programming notebook into this class

class Maze_Env(object):
    """
    Maze Markov Decision Process with five states and four actions.

    States are labelled 's1' .. 's5' and actions are 'left', 'right', 'up'
    and 'down'. Transition probabilities are read from one CSV file per state
    ('s1.csv' .. 's5.csv', resolved against the current working directory).
    Each file is indexed by action, and the selected row is used as a
    probability distribution over `state_space`.

    verbose : int
        when 1, `step` prints the transition it has just sampled
    """

    def __init__(self, verbose=0):

        #  creating the functions needed to define a Markov Decision Process
        self.state_space = np.array(['s{}'.format(state) for state in np.arange(1,6)])
        self.action_space = np.array(['left', 'right', 'up', 'down'])
        self.state_transitions = {state:np.genfromtxt('{}.csv'.format(state), delimiter=',') for state in self.state_space}
        #  reward_functions[state][i] is the reward for moving from state to state_space[i]
        self.reward_functions = {state:np.full((len(self.state_space)),-1) for state in self.state_space}

        #  our two changes to the reward function
        #  moving from s4 into s5 pays 10, every transition out of s5 pays 0
        self.reward_functions['s4'][4] = 10
        self.reward_functions['s5'] = np.full((len(self.state_space)),0)
        self.verbose = verbose

        self.state = self.reset()

    def reset(self):
        """
        Resets the environment to the initial state

        Always prints a notice (regardless of verbose) and returns 's1'
        """
        print('resetting environment')
        self.state = 's1'
        return self.state

    def step(self, action):
        """
        Environment response to a given action

        action -> reward + next_state

        action : a member of self.action_space
        returns : (reward, next_state) where reward is a scalar and
                  next_state is the sampled state label; self.state is
                  updated to next_state
        raises : ValueError if action is not in self.action_space
        """
        action_matches = np.flatnonzero(self.action_space == action)
        if action_matches.size == 0:
            raise ValueError('unknown action {!r}, expected one of: {}'.format(
                action, ', '.join(self.action_space)))
        action_idx = action_matches[0]

        #  transition probabilities over state_space for the chosen action
        state_transition = np.asarray(self.state_transitions[self.state][action_idx]).flatten()
        next_state = np.random.choice(self.state_space, p=state_transition)

        #  a scalar index, so that the reward looked up below is a scalar
        #  rather than a length-1 array
        next_state_idx = np.flatnonzero(self.state_space == next_state)[0]
        reward = self.reward_functions[self.state][next_state_idx]

        if self.verbose == 1:
            print('state is {}'.format(self.state))
            print('action is {}'.format(action))
            print('reward is {}'.format(reward))
            print('next_state is {}'.format(next_state))
        self.state = next_state
        return reward, next_state

In [55]:
#  we also use our old friend random_policy

def random_policy(state, action_space):
    """
    Uniform random policy

    state is accepted for interface compatibility and is not used.

    returns : (action, p_distribution) - an action drawn uniformly from
              action_space and the uniform probability of each action
    """
    chosen = np.random.choice(action_space)
    n_actions = len(action_space)
    uniform_probs = np.full(n_actions, 1/n_actions)
    return chosen, uniform_probs

In [92]:
#  roll out EPISODES episodes of HORIZION steps each under the random policy
#  and record the reward received at every step
SEED = 0
np.random.seed(SEED)  #  fixed seed so that re-running the cell gives the same episodes

env = Maze_Env(verbose=0)
HORIZION = 10  #  steps per episode (spelling kept in case other cells use this name)
EPISODES = 5

#  rewards[episode, step] is the reward received at that step of that episode
rewards = np.zeros(shape=(EPISODES, HORIZION))
for episode in range(0, EPISODES):
    state = env.reset()
    for step in range(0, HORIZION):
        print('episode {} step {}'.format(episode, step))
        action, _ = random_policy(state, env.action_space)
        reward, next_state = env.step(action)
        rewards[episode, step] = reward
        #  advance the state so the policy is queried with the current state
        #  on the next step rather than with the reset state
        state = next_state
        

resetting environment
resetting environment
episode 0 step 0
episode 0 step 1
episode 0 step 2
episode 0 step 3
episode 0 step 4
episode 0 step 5
episode 0 step 6
episode 0 step 7
episode 0 step 8
episode 0 step 9
resetting environment
episode 1 step 0
episode 1 step 1
episode 1 step 2
episode 1 step 3
episode 1 step 4
episode 1 step 5
episode 1 step 6
episode 1 step 7
episode 1 step 8
episode 1 step 9
resetting environment
episode 2 step 0
episode 2 step 1
episode 2 step 2
episode 2 step 3
episode 2 step 4
episode 2 step 5
episode 2 step 6
episode 2 step 7
episode 2 step 8
episode 2 step 9
resetting environment
episode 3 step 0
episode 3 step 1
episode 3 step 2
episode 3 step 3
episode 3 step 4
episode 3 step 5
episode 3 step 6
episode 3 step 7
episode 3 step 8
episode 3 step 9
resetting environment
episode 4 step 0
episode 4 step 1
episode 4 step 2
episode 4 step 3
episode 4 step 4
episode 4 step 5
episode 4 step 6
episode 4 step 7
episode 4 step 8
episode 4 step 9


In [91]:
#  show the recorded rewards - one row per episode, one column per step
rewards

array([[ -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  10.],
       [ -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.],
       [ -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.],
       [ -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  10.,   0.],
       [ -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.]])