# Session 1: Discrete States and Discrete Actions

## Imports

In [140]:
import numpy as np
from collections import defaultdict

## Gridworld

### Environment

In [171]:
class Gridworld:
    """Rectangular grid environment with a goal cell, trap cells and wind.

    States are ``(row, col)`` tuples. Actions are given as indices into
    ``action_space`` (``'U'``, ``'L'``, ``'D'``, ``'R'``). After every legal
    move the wind may push the agent one row up (towards row 0) with
    probability ``wind_p``.

    Parameters
    ----------
    sz : (rows, cols) size of the grid.
    start : cell the agent occupies after ``reset()``.
    goal : cell that ends the episode when the agent stands on it.
    traps : iterable of trap cells; ``None`` means the single trap ``(0, 1)``.
    goal_reward, trap_reward : bonus added when the agent lands on that cell.
    move_reward : base reward of every step, legal or not.
    wind_p : probability that the wind blows after a legal move.
    """

    def __init__(self, sz=(3, 3), start=(0, 0), goal=(0, 2), traps=None,
                 goal_reward=5, trap_reward=-3, move_reward=-1, wind_p=0.3):
        self.sz = sz
        self.action_space = ['U', 'L', 'D', 'R']
        # every cell of the grid, in row-major order
        self.grid_keys = [(i, j) for i in range(sz[0]) for j in range(sz[1])]
        self.start = start
        self.goal = goal
        # Copy into a fresh list: a list literal used as the default argument
        # would be one object shared by every instance built with defaults.
        self.traps = [(0, 1)] if traps is None else list(traps)
        self.move_reward = move_reward
        self.trap_reward = trap_reward
        self.goal_reward = goal_reward
        self.wind_p = wind_p
        self.reset()

    def reset(self):
        """Put the agent back on ``start``, rebuild both grids, return the state."""
        self.traversed = [self.start]
        self.i = self.start[0]
        self.j = self.start[1]
        self.done = False
        # physical grid: [tile type, occupancy marker] per cell.
        # Each cell gets its OWN list (dict.fromkeys would share a single
        # list between all cells, and update_physical mutates cells in place).
        self.physical_grid = {key: ['F', 'x'] for key in self.grid_keys}
        self.physical_grid[self.start] = ['F', 'o']
        self.physical_grid[self.goal] = ['G', 'x']
        for t in self.traps:
            self.physical_grid[t] = ['T', 'x']
        # reward grid: bonus collected when landing on a cell
        self.reward_grid = dict.fromkeys(self.grid_keys, 0)
        self.reward_grid[self.goal] = self.goal_reward
        for t in self.traps:
            self.reward_grid[t] = self.trap_reward
        return (self.i, self.j)

    def print_reward(self, visible_only=False):
        """Print the reward grid; with ``visible_only`` unvisited cells show 'NA'."""
        for i in range(self.sz[0]):
            print('\n----------')
            for j in range(self.sz[1]):
                if visible_only and (i, j) not in self.traversed:
                    out = 'NA'
                else:
                    out = self.reward_grid[(i, j)]
                print(f'{out} |', end='')

    def print_physical(self, visible_only=False):
        """Print the physical grid; with ``visible_only`` unvisited cells show NA."""
        for i in range(self.sz[0]):
            print('\n------------------------------------')
            for j in range(self.sz[1]):
                if visible_only and (i, j) not in self.traversed:
                    out = ['NA', 'NA']
                else:
                    out = self.physical_grid[(i, j)]
                print(f'{out} |', end='')

    def update_physical(self):
        """Mark the agent's current cell with 'o' and every other cell with 'x'."""
        for key in self.grid_keys:
            self.physical_grid[key][1] = 'x'
        tile = self.physical_grid[(self.i, self.j)][0]
        self.physical_grid[(self.i, self.j)] = [tile, 'o']

    def wind(self):
        """With probability ``wind_p`` push the agent one row up, stopping at row 0."""
        if np.random.uniform() < self.wind_p:
            self.i = max(self.i - 1, 0)

    def step(self, action_idx):
        """Apply the action at ``action_space[action_idx]``.

        A move that would leave the grid is ignored: the position stays the
        same, no wind blows, and only ``move_reward`` is returned. After a
        legal move the wind may blow, and the reward of the cell the agent
        finally stands on is added to ``move_reward``.

        Returns ``((row, col), reward, done)``; ``done`` becomes True once the
        agent stands on the goal cell (traps do not end the episode).
        """
        reward = self.move_reward
        i, j = self.i, self.j
        action = self.action_space[action_idx]
        if action == 'U':
            i -= 1
        elif action == 'L':
            j -= 1
        elif action == 'D':
            i += 1
        elif action == 'R':
            j += 1
        # only moves that stay on the grid are carried out
        if (i, j) in self.grid_keys:
            self.i, self.j = i, j
            # wind blows after the move, so the reward is for the final cell
            self.wind()
            self.traversed.append((self.i, self.j))
            self.update_physical()
            reward += self.reward_grid[(self.i, self.j)]
        if (self.i, self.j) == self.goal:
            self.done = True
        # s', r, done
        return ((self.i, self.j), reward, self.done)

In [172]:
# Build an environment with default settings and show both of its layers
# in full (visible_only defaults to False for both printers).
g = Gridworld()
g.print_physical()
g.print_reward()


------------------------------------
['F', 'o'] |['T', 'x'] |['G', 'x'] |
------------------------------------
['F', 'x'] |['F', 'x'] |['F', 'x'] |
------------------------------------
['F', 'x'] |['F', 'x'] |['F', 'x'] |
----------
0 |-3 |5 |
----------
0 |0 |0 |
----------
0 |0 |0 |

In [173]:
# Take action index 3 ('R') twice, printing each (state, reward, done) result,
# then show only the cells the agent has stood on so far.
for _ in range(2):
    print(g.step(3))
g.print_physical(visible_only=True)

((0, 1), -4, False)
((0, 2), 4, True)

------------------------------------
['F', 'x'] |['T', 'x'] |['G', 'o'] |
------------------------------------
['NA', 'NA'] |['NA', 'NA'] |['NA', 'NA'] |
------------------------------------
['NA', 'NA'] |['NA', 'NA'] |['NA', 'NA'] |

### Agent

In [247]:
class Agent:
    """Agent that follows a fixed policy table with epsilon-random exploration.

    ``env`` must expose ``grid_keys``, ``action_space``, ``sz``, ``reset()``
    and ``step(action_idx)``. ``policy`` maps a state to an action index.
    The exploration rate starts at ``start_epsilon``, is multiplied by
    ``epsilon_decay`` once per episode index, and never drops below
    ``end_epsilon``.
    """

    def __init__(self, env, policy, gamma = 1,
                 start_epsilon = 0.9, end_epsilon = 0.1, epsilon_decay = 0.9):
        self.env = env
        self.policy = policy
        self.gamma = gamma
        # exploration schedule
        self.start_epsilon = start_epsilon
        self.end_epsilon = end_epsilon
        self.epsilon_decay = epsilon_decay
        # state values start at 0; action values are created lazily as zero vectors
        self.v = {cell: 0 for cell in self.env.grid_keys}
        self.q = defaultdict(lambda: np.zeros(len(self.env.action_space)))

    def get_epsilon(self, n_episode):
        """Return the exploration rate for episode number ``n_episode``."""
        decayed = self.start_epsilon * (self.epsilon_decay ** n_episode)
        return max(decayed, self.end_epsilon)

    def select_action(self, state, epsilon):
        """Pick a uniformly random action with probability ``epsilon``, else the policy's."""
        explore = np.random.uniform() < epsilon
        if not explore:
            return self.policy[state]
        n_actions = len(self.env.action_space)
        return np.random.choice(range(n_actions))

    def print_policy(self):
        """Print the policy as a grid of action letters, one row per grid row."""
        shape = self.env.sz
        for row in range(shape[0]):
            print('\n----------')
            for col in range(shape[1]):
                label = self.env.action_space[self.policy[(row, col)]]
                print(f'{label} |', end='')

    def run_episode(self, n_episode):
        """Play one episode from reset until the environment reports done.

        Every transition is printed as: previous state, new state, reward, done.
        """
        current = self.env.reset()
        finished = False
        while not finished:
            eps = self.get_epsilon(n_episode)
            chosen = self.select_action(current, eps)
            previous = current
            current, reward, finished = self.env.step(chosen)
            print(previous, current, reward, finished)

In [248]:
env = Gridworld()
# The policy drawn as the grid itself: one string per row, one action letter
# per cell. Each letter is translated to its index in env.action_space.
layout = ['DRU',
          'RRU',
          'RUU']
policy = {(i, j): env.action_space.index(letter)
          for i, row in enumerate(layout)
          for j, letter in enumerate(row)}
a = Agent(env, policy, gamma=1)
a.print_policy()


----------
D |R |U |
----------
R |R |U |
----------
R |U |U |

In [255]:
# Play one episode, using the exploration rate get_epsilon gives for episode
# index 20; each transition is printed as: old state, new state, reward, done.
a.run_episode(20)

(0, 0) (0, 0) -1 False
(0, 0) (1, 0) -1 False
(1, 0) (0, 1) -4 False
(0, 1) (0, 2) 4 True


## Prediction Problem
* Evaluate deterministic policies in a deterministic environment
* Evaluate deterministic policies in a stochastic environment
* Evaluate stochastic policies in a stochastic environment 

## Control Problem
* Monte Carlo
* First-visit
* Greedy in the Limit with Infinite Exploration
* GLIE with constant learning rate