# Homework 2: Function approximation

**Your task**

- Implement experience replay for CartPole with linear function approximation.  (4p)

- **Bonus**: implement DQN with experience replay. Use a neural network instead of a linear approximator. (2p)

- *Bonus points*: 1 bug found in my code (hopefully none, but it happens) = 1 extra point.
    - Bug 1: td_target calculation should use `np.max(estimator.predict(new_state))` instead of `np.amax()`
    - Bug 2: set `state = new_state` at the end of `while not done:` loop

You can borrow inspiration from tutorial 5 (`05 fa_cartpole_sklearn.py`).

In [3]:
import numpy as np
import gym
import matplotlib.pyplot as plt
from collections import deque
from sklearn.linear_model import SGDRegressor

In [12]:
class FunctionEstimator:
    """
    Linear approximation of the Q table with one weight column per action.

    Q(state, action) is modelled as featurize(state) . w[:, action]; the weights
    are moved towards the TD target by a gradient step on the squared error.
    """

    def __init__(self,env, alpha=0.001):
        """
        Create the weight matrix for the given environment.

        env: environment exposing `action_space.n` and `reset()`; `reset()` is
             expected to return the observation itself.
        alpha: learning rate used by `update`.
        """
        self.alpha = alpha
        self.env = env
        self.n_actions = env.action_space.n
        self.possible_actions = range(env.action_space.n)
        # Reset only once, and size the weights from the *featurized* state so
        # that w always matches the vector it is multiplied with.
        self.initial_state = self.featurize(env.reset())
        self.state_dim = len(self.initial_state)
        self.w = np.ones(shape=(self.state_dim, self.n_actions), dtype=float) #parameters of the function approximating Q table
    
    def featurize(self,state):
        """
        Turn state into feature vector so it can be inputted into the function approximating Q table.
        Currently the identity: the raw state is used as the feature vector.
        """
        return state
    
    def gradient(self, state, action):
        """
        f(state,action) = state*w_action
        df(state, action)/dw_action = state

        `state` is the (already featurized) feature vector; `action` is unused
        because the gradient of a linear model does not depend on it.
        """
        return state
    
    def _q_values(self, features):
        """
        Q-values of every action for an already featurized state.
        """
        return np.dot(features, self.w).flatten() # linear function
    
    def update(self, state, action, td_target):
        """
        Update the parameters by gradient descent.
        Move the result of f(state, action) closer to target = minimize MSE.

        state: raw (not yet featurized) state.
        action: index of the action whose weight column is updated.
        td_target: value the prediction for (state, action) is moved towards.
        """
        # Featurize exactly once; going through predict() here would apply
        # featurize() a second time to the already featurized vector.
        features = self.featurize(state)
        
        prediction = self._q_values(features)[action]
        error = td_target - prediction
        gradient = self.gradient(features, action)
        
        # Descent step on the squared error: w <- w + alpha * error * df/dw.
        self.w[:,action] += self.alpha * error * gradient
        
    def predict(self,state):
        """
        Predict Q-values of every action in a given state.
        Policy will then select which action to take based on these Q-values.

        state: raw (not yet featurized) state.
        Returns a flat array with one Q-value per action.
        """
        return self._q_values(self.featurize(state))


        
def make_policy(estimator, epsilon):
    """
    Build an epsilon-greedy action selector on top of `estimator`.

    The returned function takes a state, asks the estimator for the Q-values
    and either exploits (action with the highest Q-value) or explores
    (random pick from `estimator.possible_actions`).
    """
    def policy_fn(state):
        q_values = estimator.predict(state)
        exploit = np.random.rand() > epsilon
        if not exploit:
            # Exploration branch: ignore the Q-values entirely.
            return np.random.choice(estimator.possible_actions)
        return np.argmax(q_values)

    return policy_fn


def run_episodes(n_episodes = 1000, gamma = 1):
    """
    Train a FunctionEstimator on CartPole-v0 with online Q-learning updates.

    n_episodes: number of episodes to play.
    gamma: discount factor applied to the bootstrapped value of the next state.

    Prints a progress line every 100 episodes and returns nothing.
    The environment is closed even if training raises.
    """
    env = gym.make('CartPole-v0')

    try:
        estimator = FunctionEstimator(env)
        # Transitions are recorded here; nothing in this function reads them yet.
        history = []
        last_episode_rewards = deque(maxlen=100)

        for ep in range(n_episodes):
            state = env.reset()
            done = False
            policy = make_policy(estimator, 0.99**ep) # reduce epsilon
            ep_reward = 0

            while not done:
                action = policy(state)
                new_state, reward, done, _ = env.step(action)
                ep_reward += reward

                # Keep track of the states
                history.append([state, action, reward, new_state])

                # Update the Q-function; when the episode has ended the
                # target is the reward alone (no bootstrapping).
                td_target = reward
                if not done:
                    td_target = reward + gamma*np.max(estimator.predict(new_state))

                estimator.update(state,action,td_target)

                state = new_state

            last_episode_rewards.append(ep_reward)
            # Show stats
            if ep % 100 == 0:
                print("INFO: Reward at episode {} is {}, best in last 100: {}".format(ep,ep_reward, np.max(last_episode_rewards)))
    finally:
        # Release the environment even when an episode fails part-way.
        env.close()

In [14]:
# Train for 2000 episodes, keeping the default discount factor (gamma = 1).
run_episodes(n_episodes=2000)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
INFO: Reward at episode 0 is 21.0, best in last 100: 21.0
INFO: Reward at episode 100 is 94.0, best in last 100: 167.0
INFO: Reward at episode 200 is 86.0, best in last 100: 166.0
INFO: Reward at episode 300 is 62.0, best in last 100: 200.0
INFO: Reward at episode 400 is 106.0, best in last 100: 184.0
INFO: Reward at episode 500 is 97.0, best in last 100: 182.0
INFO: Reward at episode 600 is 98.0, best in last 100: 200.0
INFO: Reward at episode 700 is 99.0, best in last 100: 200.0
INFO: Reward at episode 800 is 56.0, best in last 100: 174.0
INFO: Reward at episode 900 is 104.0, best in last 100: 169.0
INFO: Reward at episode 1000 is 64.0, best in last 100: 200.0
INFO: Reward at episode 1100 is 73.0, best in last 100: 162.0
INFO: Reward at episode 1200 is 54.0, best in last 100: 161.0
INFO: Reward at episode 1300 is 61.0, best in last 100: 200.0
INFO: Reward at episode 1400 is 61.