# Session 1: Discrete States and Actions

## Imports

In [18]:
import numpy as np
from collections import defaultdict
#complete these scripts during session / as homework
# from answers.environments import Gridworld
# from answers.agents import MCAgent
#cross check when you finish
from solutions.environments import Gridworld
from solutions.agents import MCAgent

## Markov Decision Process

## Environment - Gridworld

### What The Gridworld Looks Like

In [10]:
env = Gridworld()
env.print_physical(visible_only=False)


------------------------------------
['F', 'o'] |['T', 'x'] |['G', 'x'] |
------------------------------------
['F', 'x'] |['F', 'x'] |['F', 'x'] |
------------------------------------
['F', 'x'] |['F', 'x'] |['F', 'x'] |

### States

In [11]:
env.state_space

[(0, 0), (0, 1), (0, 2), (1, 0), (1, 1), (1, 2), (2, 0), (2, 1), (2, 2)]

### Actions

In [12]:
env.action_space, env.action_text

(array([0, 1, 2, 3]), array(['U', 'L', 'D', 'R'], dtype='<U1'))

### Rewards

In [13]:
env.print_reward()


----------
0 |-3 |5 |
----------
0 |0 |0 |
----------
0 |0 |0 |

In [19]:
env.move_reward

-1

### What The Agent Usually Sees

In [14]:
env.print_physical(visible_only=True)


------------------------------------
['F', 'o'] |['NA', 'NA'] |['NA', 'NA'] |
------------------------------------
['NA', 'NA'] |['NA', 'NA'] |['NA', 'NA'] |
------------------------------------
['NA', 'NA'] |['NA', 'NA'] |['NA', 'NA'] |

In [15]:
#go down
action = np.argwhere(env.action_text=='D')
print(env.step(action))
#then right
action = np.argwhere(env.action_text=='R')
print(env.step(action))
env.print_physical(visible_only=True)

((1, 0), -1, False)
((1, 1), -1, False)

------------------------------------
['F', 'x'] |['NA', 'NA'] |['NA', 'NA'] |
------------------------------------
['F', 'x'] |['F', 'o'] |['NA', 'NA'] |
------------------------------------
['NA', 'NA'] |['NA', 'NA'] |['NA', 'NA'] |

### Deterministic Environment vs Stochastic Environment

In [17]:
env = Gridworld(wind_p=0.5)
#go down
action = np.argwhere(env.action_text=='D')
print(env.step(action))
#then right
action = np.argwhere(env.action_text=='R')
print(env.step(action))
env.print_physical(visible_only=True)

((1, 0), -1, False)
((0, 1), -4, False)

------------------------------------
['F', 'x'] |['T', 'o'] |['NA', 'NA'] |
------------------------------------
['F', 'x'] |['NA', 'NA'] |['NA', 'NA'] |
------------------------------------
['NA', 'NA'] |['NA', 'NA'] |['NA', 'NA'] |

## Agent

### Deterministic Policy

### Stochastic Policy - Epsilon Greedy

### State Value

### (State-)Action Value

### Optimal State and Action Values

## Solving Reinforcement Learning Problems - Monte Carlo Method

### Prediction Problem

### Control Problem

## Challenges

* What are some other ways of solving reinforcement learning problems? How are they better or worse than Monte Carlo methods e.g. performance, data requirements, etc.?
* Play around with Gridworld. Tweak these variables and see what happens:
    * Wing probability
    * Move rewards
    * Discount factor
    * Epsilon and how to decay it (or not)
* Solve at least one of the following OpenAI gym environments with discrete states and actions:
    * FrozenLake-v0
    * Taxi-v2
    * Blackjack-v0

In [8]:
env = Gridworld(wind_p=0.1)
policy = dict.fromkeys(env.grid_keys)
for key in env.grid_keys: policy[key] = np.random.choice(np.arange(len(env.action_space)))
a = Agent(env,policy)
a.print_policy()

AttributeError: 'Gridworld' object has no attribute 'grid_keys'

In [None]:
a.mc_control_glie(lr=0.01)
a.print_policy()

## Prediction Problem
* Evaluate deterministic policies and environments
* Evaluate stochastic policies and environments

### Evaluate deterministic policies and environments

In [None]:
#deterministic env
env = Gridworld(wind_p=0.)
#deterministic policy
policy_a = {(0, 0): 3,
          (0, 1): 3,
          (0, 2): 0,
          (1, 0): 3,
          (1, 1): 3,
          (1, 2): 0,
          (2, 0): 3,
          (2, 1): 0,
          (2, 2): 0}
policy_b = {(0, 0): 2,
          (0, 1): 3,
          (0, 2): 0,
          (1, 0): 2,
          (1, 1): 2,
          (1, 2): 0,
          (2, 0): 3,
          (2, 1): 3,
          (2, 2): 0}

#peek
env.print_reward()
env.print_physical()
print('\n')
a = Agent(env,policy_a,gamma=0.9)
print('Policy a')
a.print_policy()
print('\n')
print('Policy b')
a.policy = policy_b
a.print_policy()

In [None]:
for state in env.grid_keys:
    a.v[state] = a.get_v(state)
a.print_v()
a.print_policy()

In [None]:
for state in env.grid_keys:
    for action in range(len(env.action_space)):
        a.q[state][action] = a.get_q(state,action)
a.q

### Evaluate stochastic policies and environments

## Control Problem
* Monte Carlo
* First-visit

Bonus
* Greedy in the Limit with Infinite Exploration
* GLIE with constant learning rate

Homework