# Homework 2: Function approximation

**Your task**

- Implement experience replay for CartPole with linear function approximation.  (4p)

- **Bonus**: implement DQN with experience replay, i.e. use a neural network instead of a linear approximator. (2p)

- *Bonus points*: each bug you find in my code (hopefully there are none, but it happens) earns 1 extra point.

You can borrow inspiration from tutorial 5 (`05 fa_cartpole_sklearn.py`).

In [1]:
from collections import deque

import gym
import matplotlib.pyplot as plt
import numpy as np

In [None]:
class FunctionEstimator:
    """
    Linear approximator of the action-value function Q(s, a).

    One weight vector is kept per action, so that
    Q(s, a) = theta[a] . featurize(s).

    Parameters
    ----------
    env : environment exposing `action_space.n` and `reset()`
        `reset()` is called once to obtain a sample observation, from which
        the number of features is derived.
    alpha : float, optional
        Step size used by `update`.
    """

    def __init__(self, env, alpha=0.01):
        self.env = env
        self.n_actions = env.action_space.n
        self.possible_actions = range(env.action_space.n)
        self.initial_state = self.featurize(env.reset())
        self.alpha = alpha
        # One row of weights per action; the number of columns follows the
        # length of the feature vector so Q can actually depend on the state.
        n_features = self.initial_state.size
        self.theta = np.ones(shape=(self.n_actions, n_features), dtype=float)

    def featurize(self, state):
        """
        Turn state into a feature vector so it can be fed to the function
        approximating the Q table.

        The observation is used as-is, only converted to a flat float array.
        """
        return np.asarray(state, dtype=float).ravel()

    def update(self, state, action, td_target):
        """
        Update the parameters theta of `action` by one gradient descent step.

        For a linear model the gradient of Q(s, a) with respect to theta[a]
        is the feature vector itself, so the step is
        theta[a] += alpha * (td_target - Q(s, a)) * features.
        """
        features = self.featurize(state)
        td_error = td_target - self.theta[action] @ features
        self.theta[action] += self.alpha * td_error * features
        return None

    def predict(self, state):
        """
        Predict Q-values of every action in a given state.
        Policy will then select which action to take based on these Q-values.

        Returns a 1-D array of length `n_actions`.
        """
        features = self.featurize(state)
        return self.theta @ features


        
def make_policy(estimator, epsilon):
    """
    Build an epsilon-greedy action selector on top of `estimator`.

    The returned function takes a state, queries `estimator.predict(state)`
    and draws one uniform random number: if it exceeds `epsilon` the action
    with the highest predicted value is returned, otherwise an action is
    sampled from `estimator.possible_actions`.
    """
    def policy_fn(state):
        q_values = estimator.predict(state)
        exploit = np.random.rand() > epsilon
        if not exploit:
            # Exploration branch: pick any of the available actions.
            return np.random.choice(estimator.possible_actions)
        return np.argmax(q_values)

    return policy_fn


def run_episodes(n_episodes = 1000, gamma = 1):
    """
    Train a FunctionEstimator on 'CartPole-v0' with one-step Q-learning.

    Parameters
    ----------
    n_episodes : int
        Number of episodes to run.
    gamma : float
        Discount factor applied to the bootstrapped value of the next state.

    Each episode uses an epsilon-greedy policy with epsilon = 0.99**ep, so
    exploration decays as training progresses. Progress is printed every
    50 episodes. The environment is closed even if training raises.
    """
    env = gym.make('CartPole-v0')

    try:
        estimator = FunctionEstimator(env)
        # First observation component of every visited state (all-time) ...
        states = []
        # ... and of the 100 most recent steps only.
        last_states = deque(maxlen=100)

        for ep in range(n_episodes):
            state = env.reset()
            done = False
            policy = make_policy(estimator, 0.99**ep) # reduce epsilon
            ep_reward = 0

            while not done:
                action = policy(state)
                new_state, reward, done, _ = env.step(action)
                ep_reward += reward

                # Keep track of the states
                states.append(new_state[0])
                last_states.append(new_state[0])

                # Update the Q-function; no bootstrapping from a terminal state
                td_target = reward
                if not done:
                    td_target = reward + gamma*np.amax(estimator.predict(new_state))
                estimator.update(state,action,td_target)

                # Advance to the next state; without this the policy would
                # keep acting on (and updating) the reset observation.
                state = new_state

            # Show stats
            if ep % 50 == 0:
                print('*'*100)
                print("INFO: Reward at episode {0} is {1}".format(ep+1,ep_reward))
                print("Best state reached (overall):", np.max(states))
                print("Best state reached (last 100):", np.max(last_states))
    finally:
        env.close()