In [1]:
%load_ext autoreload
%autoreload 2
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import clear_output

#XVFB will be launched if you run on a server
import os
if type(os.environ.get("DISPLAY")) is not str or len(os.environ.get("DISPLAY"))==0:
    !bash ../xvfb start
    %env DISPLAY=:1

Starting virtual X frame buffer: Xvfb../xvfb: line 8: start-stop-daemon: command not found
.
env: DISPLAY=:1


## 1. Q-learning in the wild (3 pts)

Here we use the Q-learning agent on the Taxi environment from OpenAI Gym.
You will need to insert a few agent functions here.

In [None]:
import gym
# Build the Taxi environment by its registered gym id.
env = gym.make("Taxi-v2")

# Size of the environment's action space; later cells use it to enumerate actions.
n_actions = env.action_space.n

In [None]:
from cs188.qlearningAgents import QLearningAgent

# Create the agent; actionFn maps any state to range(n_actions).
# NOTE(review): the lambda looks up the global n_actions every time it is called,
# so rebinding n_actions in a later cell also changes what this agent sees.
agent = QLearningAgent(alpha=0.5,epsilon=0.25,gamma=0.99,
                       actionFn = lambda s: range(n_actions))

In [None]:
def play_and_train(env,agent,t_max=10**4):
    """Play one episode with `agent` on `env`, training the agent after every step.

    Parameters
    ----------
    env:
        Environment whose reset() returns the initial state and whose step(a)
        returns a (next_state, reward, done, info) tuple.
    agent:
        Object providing getAction(state) and
        update(state, action, next_state, reward).
    t_max: int
        Maximum number of steps played before the episode is cut off.

    Returns
    -------
    float
        Sum of the rewards collected during the episode.
    """
    total_reward = 0.0
    s = env.reset()

    for t in range(t_max):
        a = agent.getAction(s)

        next_s,r,done,_ = env.step(a)

        # NOTE(review): the argument order (state, action, next_state, reward) assumes
        # the cs188 QLearningAgent.update signature -- confirm against qlearningAgents.py.
        agent.update(s,a,next_s,r)

        s = next_s
        total_reward +=r
        if done:break

    return total_reward
    
        
    

In [None]:
# Play 1000 training episodes; on every 100th episode redraw the learning curve.
rewards = []
for i in range(1000):
    rewards.append(play_and_train(env,agent))
    if i %100 ==0:
        clear_output(True)
        # Call form with %-formatting prints the same text on Python 2 and Python 3.
        print("mean reward %s" % np.mean(rewards[-100:]))
        plt.plot(rewards)
        plt.show()

### 1.1 reducing epsilon

Try decreasing the agent's epsilon over time to make it reach a positive score.

The straightforward way to do so is to reduce epsilon every N games:
* either multiply agent.epsilon by a number less than 1 (e.g. 0.99)
* or subtract a small value until it reaches 0

You can, of course, devise other strategies.

__The goal is to reach positive reward!__

## 2. Experience replay (3 pts)
Here we implement experience replay - basically a buffer which stores some sessions to increase sample efficiency.

In [None]:
import random
class ReplayBuffer(object):
    """Fixed-capacity FIFO store of (obs_t, action, reward, obs_tp1, done) transitions."""

    def __init__(self, size):
        """Create Replay buffer.
        Parameters
        ----------
        size: int
            Max number of transitions to store in the buffer. When the buffer
            overflows the old memories are dropped.
        Raises
        ------
        ValueError
            If size is not positive.
        """
        if size <= 0:
            raise ValueError("ReplayBuffer size must be positive, got %r" % (size,))
        self._storage = []
        self._maxsize = size
        # Slot the next transition is written to. Once the buffer is full this is
        # the slot holding the oldest transition, which gives FIFO eviction.
        self._next_idx = 0

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self._storage)

    def add(self, obs_t, action, reward, obs_tp1, done):
        """Store one transition, overwriting the oldest one when the buffer is full.

        _storage never grows beyond _maxsize entries.
        """
        data = (obs_t, action, reward, obs_tp1, done)
        if len(self._storage) < self._maxsize:
            self._storage.append(data)
        else:
            self._storage[self._next_idx] = data
        self._next_idx = (self._next_idx + 1) % self._maxsize

    def _get_sessions(self, idxes):
        """Gather the transitions at `idxes` and return them as five column arrays."""
        obses_t, actions, rewards, obses_tp1, dones = [], [], [], [], []
        for idx in idxes:
            obs_t, action, reward, obs_tp1, done = self._storage[idx]
            obses_t.append(obs_t)
            actions.append(action)
            rewards.append(reward)
            obses_tp1.append(obs_tp1)
            dones.append(done)
        return np.array(obses_t), np.array(actions), np.array(rewards), np.array(obses_tp1), np.array(dones)

    def sample(self, batch_size):
        """Sample a batch of experiences.
        Parameters
        ----------
        batch_size: int
            How many transitions to sample.
        Returns
        -------
        obs_batch: np.array
            batch of observations
        act_batch: np.array
            batch of actions executed given obs_batch
        rew_batch: np.array
            rewards received as results of executing act_batch
        next_obs_batch: np.array
            next set of observations seen after executing act_batch
        done_mask: np.array
            done_mask[i] = 1 if executing act_batch[i] resulted in
            the end of an episode and 0 otherwise.
        Raises
        ------
        ValueError
            If the buffer is empty.
        """
        if not self._storage:
            raise ValueError("Cannot sample from an empty ReplayBuffer")
        # Indexes are drawn with replacement, so batch_size may exceed len(self).
        last = len(self._storage) - 1
        idxes = [random.randint(0, last) for _ in range(batch_size)]
        return self._get_sessions(idxes)


In [None]:
'Tests'
# Sanity checks for ReplayBuffer with capacity 2: size limit, eviction order, sampling.
replay = ReplayBuffer(2)
obj1 = tuple(range(5))
obj2 = tuple(range(5, 10))
replay.add(*obj1)
assert replay.sample(1)==obj1
replay.add(*obj2)
assert len(replay._storage)==2
# A third add must evict obj1 (the oldest entry) and keep the size at 2.
replay.add(*obj2)
assert len(replay._storage)==2
assert tuple(np.unique(a) for a in replay.sample(100))==obj2
# Now one obj2 and one obj1 are stored, so samples contain two distinct values per field.
replay.add(*obj1)
assert max(len(np.unique(a)) for a in replay.sample(100))==2
replay.add(*obj1)
assert tuple(np.unique(a) for a in replay.sample(100))==obj1
print("Success!")

In [None]:
def _as_table_key(x):
    """Turn a numpy row sampled from the buffer back into a hashable tuple; pass scalars through."""
    return tuple(x) if isinstance(x, np.ndarray) else x


def play_and_train(env,agent,t_max=10**4,replay=None,replay_batch_size=32):
    """Play one episode, training the agent online and, optionally, from a replay buffer.

    Parameters
    ----------
    env:
        Environment whose reset() returns the initial state and whose step(a)
        returns a (next_state, reward, done, info) tuple.
    agent:
        Object providing getAction(state) and
        update(state, action, next_state, reward).
    t_max: int
        Maximum number of steps played before the episode is cut off.
    replay:
        Optional buffer providing add(obs_t, action, reward, obs_tp1, done) and
        sample(batch_size). With the default None the function performs online
        updates only, so existing play_and_train(env, agent) calls keep working.
    replay_batch_size: int
        Number of stored transitions replayed after every environment step.

    Returns
    -------
    float
        Sum of the rewards collected during the episode.
    """
    total_reward = 0.0
    s = env.reset()

    for t in range(t_max):
        a = agent.getAction(s)

        next_s,r,done,_ = env.step(a)

        # NOTE(review): the argument order (state, action, next_state, reward) assumes
        # the cs188 agent update signature -- confirm against the agent module.
        agent.update(s,a,next_s,r)

        if replay is not None:
            # Remember the fresh transition, then train on a batch of stored ones.
            replay.add(s,a,r,next_s,done)
            obs_b, act_b, rew_b, next_obs_b, _done_b = replay.sample(replay_batch_size)
            for i in range(replay_batch_size):
                # Multi-component states come back as array rows; convert them so
                # they can be used as keys again.
                agent.update(_as_table_key(obs_b[i]), _as_table_key(act_b[i]),
                             _as_table_key(next_obs_b[i]), rew_b[i])

        s = next_s
        total_reward +=r
        if done:break

    return total_reward

## 3. Continuous state space (1 pt)

Use agent to train on CartPole-v0

This environment has a continuous number of states, so you will have to group them into bins somehow.

The simplest way is to use `round(x,n_digits)` (or numpy round) to round real number to a given amount of digits.

The tricky part is to get the n_digits right for each state to train effectively.

Note that you don't need to convert state to integers, but to __tuples__ of any kind of values.

In [None]:
# Switch to CartPole; n_actions is rebound to this environment's action-space size.
env = gym.make("CartPole-v0")
n_actions = env.action_space.n

# Print the observation returned by reset() and display a frame rendered as an RGB array.
print("first state:%s"%(env.reset()))
plt.imshow(env.render('rgb_array'))

### Play a few games

We need to estimate observation distributions. To do so, we'll play a few games and record all states.

In [None]:
# Record every observation seen over 1000 episodes played with actions drawn
# from env.action_space.sample().
all_states = []
for _ in range(1000):
    all_states.append(env.reset())
    while True:
        s, r, done, _ = env.step(env.action_space.sample())
        all_states.append(s)
        if done:
            break

all_states = np.array(all_states)

# Draw one 20-bin histogram per observation dimension.
for obs_i in range(env.observation_space.shape[0]):
    plt.hist(all_states[:, obs_i], bins=20)
    plt.show()

In [None]:
# NOTE(review): at this point `env` is the unwrapped CartPole environment and `agent`
# is whatever agent was created last; if observations are numpy arrays a table-based
# agent may not be able to use them as keys. Confirm this cell is meant to run before
# the Binarizer wrapper is applied.
rewards = []
for i in range(1000):
    rewards.append(play_and_train(env,agent))
    if i %100 ==0:
        clear_output(True)
        # Call form with %-formatting prints the same text on Python 2 and Python 3.
        print("mean reward %s" % np.mean(rewards[-100:]))
        plt.plot(rewards)
        plt.show()

## Binarize environment

In [None]:
from gym.core import ObservationWrapper
class Binarizer(ObservationWrapper):
    """Observation wrapper that rounds each observation component and returns a tuple.

    Rounding collapses nearby continuous observations into the same value, and the
    tuple form is hashable, so the result can be used as a lookup key.
    """

    # Decimal digits kept for each observation dimension, in order.
    # NOTE(review): starting values only -- tune them against the histograms of
    # observed states; subclasses or instances may override this attribute.
    n_digits = (0, 1, 2, 1)

    def observation(self, state):
        """Return `state` with component i rounded to n_digits[i] decimal digits, as a tuple."""
        # zip stops at the shorter sequence, so n_digits needs one entry per dimension.
        return tuple(round(float(x), digits) for x, digits in zip(state, self.n_digits))

    # NOTE(review): presumably older gym releases call `_observation` while newer ones
    # call `observation`; exposing both names keeps the wrapper working with either.
    _observation = observation

In [None]:
# Wrap a fresh CartPole environment in the Binarizer observation wrapper.
env = Binarizer(gym.make("CartPole-v0"))

In [None]:
# Replay the observation survey on the current `env`: 1000 episodes driven by
# env.action_space.sample(), storing the reset observation and every step observation.
all_states = []
for _ in range(1000):
    all_states.append(env.reset())
    while True:
        s, r, done, _ = env.step(env.action_space.sample())
        all_states.append(s)
        if done:
            break

all_states = np.array(all_states)

# Plot the distribution of each observation dimension with 20 bins.
for obs_i in range(env.observation_space.shape[0]):
    plt.hist(all_states[:, obs_i], bins=20)
    plt.show()

## Learn

In [None]:
# Start from a new agent so nothing carries over from the agent object used earlier.
# actionFn maps any state to range(n_actions), read from the global at call time.
agent = QLearningAgent(alpha=0.5,epsilon=0.25,gamma=0.99,
                       actionFn = lambda s: range(n_actions))

In [None]:
# Run 1000 training episodes on the current env; every 100th episode clear the
# output and redraw the reward curve.
rewards = []
for i in range(1000):
    rewards.append(play_and_train(env,agent))
    if i %100 ==0:
        clear_output(True)
        # Call form with %-formatting prints the same text on Python 2 and Python 3.
        print("mean reward %s" % np.mean(rewards[-100:]))
        plt.plot(rewards)
        plt.show()
        

## 4. Expected value SARSA

```<go to expected_value_sarsa.py and implement missing lines in getValue(state)>```

In [None]:
import gym
# Go back to the Taxi environment and rebind n_actions to its action-space size.
env = gym.make("Taxi-v2")

n_actions = env.action_space.n

In [None]:
from cs188.expected_value_sarsa import EVSarsaAgent
# Create the expected-value SARSA agent with the same constructor arguments as the
# Q-learning agent; actionFn maps any state to range(n_actions).
agent = EVSarsaAgent(alpha=0.5,epsilon=0.25,gamma=0.99,
                       actionFn = lambda s: range(n_actions))

### Train EV-SARSA

Note that it uses __the same update parameters as__ qlearning so you can use the ```play_and_train``` function from q-learning.

Please try both constant epsilon = 0.25 and decreasing epsilon.

In [None]:
<your code here>

## 4.2 EV-sarsa on CartPole

Now train the `EVSarsaAgent` on CartPole-v0 env with binarizer you used above for Q-learning.

In [None]:
env = <make env and wrap it with binarizer>

agent = <your code>

In [None]:
<train me>

# 5. Massive experiments (3 pts)

This is the final part of the homework. You can pick any of the 3 tasks listed below, or take more than one and get a score for each of them independently.

_If you feel too cool for this kind of school, see the bonus section below - it awards just as many points_

###  Algorithm comparison 


For this experiment, you will need to write code that measures algorithm performance and produces plots/tables with experimental results that can be used to compare the algorithms.

Take CartPole or Taxi and compare the learning performance of these algorithms:
* Q-learning
* Q-learning with Experience Replay
* EV-sarsa 
* EV-sarsa with Experience Replay

Under those conditions:

* Constant epsilon 0.25, 0.1 and 0.001
* Decreasing epsilon starting from 0.25 (decrease any way you want)
* It's probably a good idea to plot learning curves (reward / games played)
* At the end of your assignment, please describe under which conditions each algorithm works better (if at all).

* It's also useful to double-check if experiment results are robust to re-running and if they aren't - average over several runs.
* If you use CartPole-v0, use same binarization techniques.

It is __highly recommended__ that your code automatically builds the plot or prints the table.

A creative approach to visualization or trying out more ideas will be awarded with bonus points.


### Bonus I: Advanced algorithms (4+ points)

Implement any of the three algorithms:
* n-step expected value SARSA or Q-learning
* EV-SARSA or Q-learning using eligibility traces, aka TD(lambda)
* q-learning with experience replay

_(you will likely need to create a new file for that, just like qlearning.py)_

* Show that this algorithm works no worse than those we already implemented for simple problems. 
* Try to find a way to learn faster than with default q-learning.

You will also get +2 points for each algorithm implemented after the first one and any other awesomeness you're up to.


### Bonus II: Binarization techniques (4+ points total)

Measure how learning performance depends on binarization and try some advanced binarizations.

On CartPole-v0,
* Measure learning speed and final performance against changing the amount of bins (uniformly across all dimensions) __(1 point)__
* Try pre-processing observation with PCA, SparseCoding or any dimensionality reduction method you want, see what happens __(1 point)__

* Apply binarization to solve MountainCar-v0 or LunarLander-v2 __(+2 points each)__

_Warning: MountainCar-v0 and LunarLander-v2 may train for about an hour. The only sanity check is that the frequency of successes more or less increases._


### Bonus II+

Try applying categorical deep autoencoder as a binarization technique.

Use gumbel-softmax, 
* Explanation and [tutorial](http://blog.evjang.com/2016/11/tutorial-categorical-variational.html), 
* [Example in lasagne](https://gist.github.com/justheuristic/fd08c15dee26dbe95d3e3a17855f3f7a/)

* If you make it work on Cartpole, it's +5. 
* If on LunarLander or MountainCar, it's +5 more.
* If it somehow ends up good on Atari (see week1 homework) or BipedalWalker-v2 or anything serious, it's a full project ( more pts :) )
* If you have any questions or need any help, feel free to ask us!
