In [1]:
import numpy as np

## environment class

In [2]:
class Grid:
    """Grid-world environment that tracks the agent's (row, col) position.

    The board dimensions and the start cell are given at construction time;
    the reward table and the per-state legal actions are attached afterwards
    with ``set``.
    """

    # (row delta, col delta) applied by each recognised action letter
    _DELTAS = {'U': (-1, 0), 'D': (1, 0), 'R': (0, 1), 'L': (0, -1)}

    def __init__(self, rows, cols, start):
        """Store the grid size and place the agent on ``start`` = (row, col)."""
        self.rows, self.cols = rows, cols
        self.i, self.j = start[0], start[1]

    def set(self, rewards, actions):
        """Attach the reward table and the legal-action table.

        rewards: dict mapping (row, col) -> reward received on arriving there
        actions: dict mapping (row, col) -> collection of actions allowed there
        """
        self.actions = actions
        self.rewards = rewards

    def set_state(self, s):
        """Move the agent directly to state ``s`` = (row, col)."""
        # assigned one coordinate at a time, row first
        self.i = s[0]
        self.j = s[1]

    def current_state(self):
        """Return the agent's position as a (row, col) tuple."""
        return (self.i, self.j)

    def is_terminal(self, s):
        """Return True when ``s`` has no entry in the action table."""
        return not (s in self.actions)

    def move(self, action):
        """Apply ``action`` if it is legal in the current state.

        An action that is not listed for the current state leaves the agent
        where it is. Returns the reward stored for the resulting position,
        or 0 when that position has no reward entry. Raises KeyError when the
        current position has no entry in the action table.
        """
        allowed_here = self.actions[(self.i, self.j)]
        if action in allowed_here and action in self._DELTAS:
            d_row, d_col = self._DELTAS[action]
            self.i += d_row
            self.j += d_col
        return self.rewards.get(self.current_state(), 0)

    def undo_move(self, action):
        """Step in the direction opposite to ``action``.

        Legality is not checked beforehand; instead the resulting position is
        asserted to be one of ``all_states()``.
        """
        d_row, d_col = self._DELTAS.get(action, (0, 0))
        # subtracting the forward delta reverses the move
        self.i -= d_row
        self.j -= d_col
        # landing outside the known states should never happen
        assert self.current_state() in self.all_states()

    def game_over(self):
        """Return True when no actions are available from the current position."""
        return self.is_terminal(self.current_state())

    def all_states(self):
        """Return the set of every position that has actions or a reward.

        Simple but possibly incomplete: a position counts as a state only if
        it appears as a key in the action table or in the reward table.
        """
        return set(self.actions).union(self.rewards)


def standard_grid():
    """Build the 3x4 grid world used throughout the notebook.

    Layout (x = wall, s = start, numbers = reward on arrival):

        .  .  .  1
        .  x  . -1
        s  .  .  .

    Only the two cells in the last column of the top two rows carry a reward;
    every other cell yields 0 because it has no entry in the reward table.
    """
    # reward received for arriving at a cell
    terminal_rewards = {(0, 3): 1, (1, 3): -1}

    # moves permitted from each non-terminal cell, listed row by row
    legal_moves = dict([
        ((0, 0), ('D', 'R')),
        ((0, 1), ('L', 'R')),
        ((0, 2), ('L', 'D', 'R')),
        ((1, 0), ('U', 'D')),
        ((1, 2), ('U', 'D', 'R')),
        ((2, 0), ('U', 'R')),
        ((2, 1), ('L', 'R')),
        ((2, 2), ('L', 'R', 'U')),
        ((2, 3), ('L', 'U')),
    ])

    # the agent starts in the bottom-left corner
    world = Grid(3, 4, (2, 0))
    world.set(terminal_rewards, legal_moves)
    return world


def negative_grid(step_cost=-0.1):
    """Return the standard grid with ``step_cost`` as the reward of every non-terminal cell.

    Penalising each move encourages reaching a terminal cell in as few steps
    as possible.
    """
    # the nine cells of the standard grid from which a move is possible
    penalised_cells = (
        (0, 0), (0, 1), (0, 2),
        (1, 0), (1, 2),
        (2, 0), (2, 1), (2, 2), (2, 3),
    )
    world = standard_grid()
    for cell in penalised_cells:
        world.rewards[cell] = step_cost
    return world

In [3]:
def print_value(V, g):
    """Print the values in ``V`` as a table with ``g.rows`` x ``g.cols`` cells.

    V: dict mapping (row, col) -> number; missing cells are shown as 0
    g: object exposing ``rows`` and ``cols``
    """
    divider = '----------------------------'
    for row in range(g.rows):
        print(divider)
        cells = []
        for col in range(g.cols):
            value = V.get((row, col), 0)
            # a negative sign takes up the space that otherwise pads the cell
            template = '| %.2f ' if value >= 0 else '|%.2f '
            cells.append(template % value)
        print(''.join(cells) + '|')
    print(divider)
    

def print_policy(P, g):
    """Print the policy ``P`` as a table with ``g.rows`` x ``g.cols`` cells.

    P: dict mapping (row, col) -> action; cells without an entry are left blank
    g: object exposing ``rows`` and ``cols``
    """
    divider = '-----------------'
    for row in range(g.rows):
        print(divider)
        row_text = ''.join('| %s ' % P.get((row, col), ' ') for col in range(g.cols))
        print(row_text + '|')
    print(divider)

## Monte Carlo (MC) method with Approximation
Here each state is treated as categorical data (one category per grid cell).<br>
For each state we generate a feature vector, as we would for the input of a neural network, in which every feature is a real number.<br>
The feature vector is the array [state[0], state[1], state[0]\*state[1], 1], where the trailing 1 is the bias term (the code below additionally shifts the first two features to state[0] - 1 and state[1] - 1.5).<br>
We replace the tabular V(s) with a linear model, V(s, theta) = theta · feature, as follows:<br>
tabular update: V(s) = V(s) + alpha\*(G(s) - V(s)), where G(s) = r + gamma\*G(s')<br>
linear-model update: theta = theta + alpha\*(G(s) - V(s, theta))\*feature

## windy environment or non deterministic action
Notice that we are not optimizing the policy here; we are just generating a new set of values, which will be a little more negative than those from the previous approach.

In [42]:
def random_action(a):
    """Return ``a`` with probability 0.5, otherwise a uniformly chosen different action.

    The alternatives are taken from the module-level ``all_possible_actions``.
    """
    if np.random.random() < 0.5:
        return a
    # the "wind": replace the intended action with one of the others
    alternatives = [other for other in all_possible_actions if other != a]
    return np.random.choice(alternatives)

def play_game(grid, policy):
    """Play one episode and return a list of (state, return) pairs in visit order.

    The episode starts from a randomly chosen state that has legal actions:
    with a fixed policy some states would otherwise never be visited, yet
    their values still need to be measured. Every action proposed by
    ``policy`` is passed through ``random_action`` before being applied.
    The terminal state is not included in the result, since its value is 0
    by definition. Returns are discounted with the module-level ``gamma``.
    """
    # pick the starting cell uniformly among the states that have actions
    candidates = list(grid.actions)
    grid.set_state(candidates[np.random.choice(len(candidates))])

    # roll the episode forward, recording (state, reward received on arrival)
    state = grid.current_state()
    trajectory = [(state, 0)]
    while not grid.game_over():
        reward = grid.move(random_action(policy[state]))
        state = grid.current_state()
        trajectory.append((state, reward))

    # work backwards from the end: the return of a state is the reward
    # received on the next step plus the discounted return of the next state
    G = 0
    backward = []
    visited_desc = reversed(trajectory[:-1])   # every state except the terminal one
    following_desc = reversed(trajectory[1:])  # the step taken right after each of them
    for (visited, _), (_, reward_after) in zip(visited_desc, following_desc):
        G = reward_after + gamma*G
        backward.append((visited, G))

    # restore the order in which the states were visited
    backward.reverse()
    return backward

In [49]:
gamma = 0.9 # discount factor

# base learning rate; the step size actually used is alpha = lr/i (see the training loop)
lr = 0.001

# every action that can be attempted; random_action draws its alternatives from this list
all_possible_actions = ['U', 'D', 'R', 'L']

# use the standard grid again (0 for every step) so that we can compare
# to iterative policy evaluation
grid = standard_grid()

# print rewards
print("rewards:")
print_value(grid.rewards, grid)

# the fixed policy being evaluated: state -> action
policy = {
    (2, 0): 'U',
    (1, 0): 'U',
    (0, 0): 'R',
    (0, 1): 'R',
    (0, 2): 'R',
    (1, 2): 'U',
    (2, 1): 'L',
    (2, 2): 'U',
    (2, 3): 'L'
}


# initialize theta
# our model is V_hat = theta.dot(x)
# where x = s2x(s) = [row - 1, col - 1.5, row*col, 1]; the trailing 1 is the bias term
# NOTE(review): no random seed is set in the visible cells, so theta and the
# sampled episodes (and therefore the printed values) differ from run to run
theta = np.random.randn(4)/2   # 4 weights drawn from a standard normal, then halved
def s2x(s):
    """Map a state (row, col) to the 4-element feature vector of the linear model."""
    return np.array([s[0] - 1, s[1] - 1.5, s[0]*s[1], 1])

# play game for n episodes
# i is the divisor of the learning rate: it grows by 0.01 every 100 episodes
# (including t = 0), so alpha = lr/i shrinks slowly as training proceeds
i = 1.0
for t in range(20000):
    if t % 100 == 0:
        i += 0.01
    alpha = lr/i

    # generate an episode using pi
    states_returns = play_game(grid, policy)

    seen_states = set()
    for s, G in states_returns:
        # check if we have already seen s
        # called "first-visit" MC policy evaluation:
        # if a state occurs more than once in an episode, only its first visit is used
        if s not in seen_states:
            x = s2x(s)
            V_hat = theta.dot(x)
            # gradient step on the linear model:
            # theta = theta + alpha*(G(s) - V(s, theta))*x
            theta += alpha*(G - V_hat)*x    # in-place update of the 4 weights
            seen_states.add(s)

# obtain predicted values
states = grid.all_states()
V = {}
for s in states:
    if s in grid.actions:
        # like the last layer of a neural network doing regression: the learned
        # weights (bias included) are multiplied with the feature vector of s
        V[s] = theta.dot(s2x(s))
    else:
        # terminal state or state we can't otherwise get to
        V[s] = 0

print("values:")
print_value(V, grid)

print('policy:')
print_policy(policy, grid)
    
# reference values and policy for comparison
# found by policy_iteration_random on standard_grid
# the MC method won't reproduce these exactly, but should be close
# values:
# ---------------------------
#  0.43|  0.56|  0.72|  0.00|
# ---------------------------
#  0.33|  0.00|  0.21|  0.00|
# ---------------------------
#  0.25|  0.18|  0.11| -0.17|
# policy:
# ---------------------------
#   R  |   R  |   R  |      |
# ---------------------------
#   U  |      |   U  |      |
# ---------------------------
#   U  |   L  |   U  |   L  |

rewards:
----------------------------
| 0.00 | 0.00 | 0.00 | 1.00 |
----------------------------
| 0.00 | 0.00 | 0.00 |-1.00 |
----------------------------
| 0.00 | 0.00 | 0.00 | 0.00 |
----------------------------
values:
----------------------------
| 0.40 | 0.54 | 0.68 | 0.00 |
----------------------------
| 0.34 | 0.00 | 0.32 | 0.00 |
----------------------------
| 0.27 | 0.12 |-0.04 |-0.19 |
----------------------------
policy:
-----------------
| R | R | R |   |
-----------------
| U |   | U |   |
-----------------
| U | L | U | L |
-----------------
