# Homework 2: Function approximation

**Your task**

- Implement experience replay for CartPole with linear function approximation.  (4p)

- **Bonus**: implement DQN with experience replay. Use a neural network instead of a linear approximator (2p):

- *Bonus points*: 1 bug found in my code (hopefully none, but it happens) = 1 extra point.
    - Bug 1: td_target calculation should use `np.max(estimator.predict(new_state))` instead of `np.amax()`
    - Bug 2: set `state = new_state` at the end of `while not done:` loop

You can borrow inspiration from tutorial 5 (`05 fa_cartpole_sklearn.py`).

In [3]:
import numpy as np
import gym
import matplotlib.pyplot as plt
from collections import deque
from sklearn.linear_model import SGDRegressor

In [61]:
class FunctionEstimator:
    def __init__(self,env, alpha=0.001):
        self.alpha = alpha
        self.env = env
        self.n_actions = env.action_space.n
        self.possible_actions = range(env.action_space.n)
        self.initial_state = self.featurize(env.reset())
        self.state_dim = len(self.initial_state)
        self.w = np.ones(shape=(self.state_dim, self.n_actions), dtype=float) #parameters of the function approximating Q table
    
    def featurize(self,state):
        """
        Turn state into feature vector so it can be inputted into the function approximating Q table.
        """
        return np.tanh(state)
    
    def gradient(self, state, action):
        """
        f(state,action) = state*w_action
        df(state, action)/dw_action = state
        """
        return state
    
    def update(self, state, action, td_target):
        """
        Update the parameters by gradient descend.
        Move the result of f(state, action) closer to target = minimize MSE.
        """
        state = self.featurize(state) 
        
        target = td_target
        prediction = self.predict(state)[action]
        error = target - prediction
        gradient = self.gradient(state, action)
        
        self.w[:,action] -= self.alpha * error * (-1 * gradient)
        
    def batch_update(self, history, batch_size=5):
        history = np.array(history)
        length = history.shape[0]
        
        if length <= batch_size:
            batch = history
        else:
            idxs = np.random.choice(length, batch_size, replace=False) #select indexes of points from history
            batch = history[idxs] # select points from history to learn from
        
        for i in range(batch.shape[0]):
            [state, action, reward, new_state, td_target] = batch[i]
            self.update(state, action, td_target)
        
    def predict(self,state):
        """
        Predict Q-values of every action in a given state.
        Policy will then select which action to take based on these Q-values.
        """
        state = self.featurize(state)
        return np.dot(state, self.w).flatten() # linear function

        
def make_policy(estimator, epsilon):
    """
    Epsilon greedy policy choosing which action to take.
    """
    def policy_fn(state):
        preds = estimator.predict(state)
        if np.random.rand()>epsilon:
            action = np.argmax(preds)
        else:
            action = np.random.choice(estimator.possible_actions)
        return action
    
    return policy_fn


def run_episodes(n_episodes = 1000, gamma = 1):
    env = gym.make('CartPole-v0')

    estimator = FunctionEstimator(env)
    history = []
    episode_rewards = []

    for ep in range(n_episodes):
        state = env.reset()
        done = False
        policy = make_policy(estimator, 0.99**ep) # reduce epsilon
        ep_reward = 0

        while not done:
            action = policy(state)
            new_state, reward, done, _ = env.step(action)
            ep_reward += reward

            # Update the Q-function
            if done:
                td_target = ep_reward
            else:
                td_target = reward + gamma*np.max(estimator.predict(new_state))
            
            # Keep track of the states    
            history.append([state, action, reward, new_state, td_target])
            
            #estimator.update(state,action,td_target) # without experience replay
            estimator.batch_update(history[-100:])         # with experience replay
            
            state = new_state

        episode_rewards.append(ep_reward)
        # Show stats
        if (ep) % 100 == 0:
            #print('*'*100)
            print("INFO: Reward at episode {} is {} | avg in last 100: {} | w={}"
                  .format(ep,ep_reward, np.average(episode_rewards[-100:]), estimator.w))
            
    env.close()

In [62]:
run_episodes(n_episodes=1000)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
INFO: Reward at episode 0 is 42.0 | avg in last 100: 42.0 | w=[[ 1.00097404  1.00005722]
 [ 0.99373492  0.98483588]
 [ 0.99720467  0.99820808]
 [ 0.99660423  1.01232684]]
INFO: Reward at episode 100 is 35.0 | avg in last 100: 57.0 | w=[[ 2.08690129  1.78788476]
 [ 3.74974809  3.35901264]
 [ 0.15283881  1.53076725]
 [-2.68210934  2.40561922]]
INFO: Reward at episode 200 is 157.0 | avg in last 100: 176.51 | w=[[ -2.25959391e+00   1.38102728e+00]
 [  1.11174700e+00   7.37940994e+00]
 [  4.64043679e-03   4.21131614e+00]
 [ -5.89146013e+00   1.09774520e+01]]
INFO: Reward at episode 300 is 200.0 | avg in last 100: 194.55 | w=[[ -4.06894849  -7.67611794]
 [ -2.21333537   3.28140213]
 [ -0.64360084   5.30080772]
 [-11.53663604  14.1085186 ]]
INFO: Reward at episode 400 is 200.0 | avg in last 100: 194.29 | w=[[ -7.44752497  -9.99959068]
 [ -4.12410456   3.49934027]
 [ -0.78679853   5.2076

KeyboardInterrupt: 

In [29]:
x = np.array(range(16)).reshape(4,4)
length = x.shape[0]
batch_size = 3
idxs = np.random.choice(length, batch_size, replace=False)
print("x:",x)
print("idxs:",idxs)
print("x[idxs]:",x[idxs])

x: [[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]
 [12 13 14 15]]
idxs: [2 3 1]
x[idxs]: [[ 8  9 10 11]
 [12 13 14 15]
 [ 4  5  6  7]]


In [32]:
[a,b,c,d] = x[1]
print(a,b,c,d)

4 5 6 7
