# Deep Reinforcement Learning
---
Learn more about the math in: __[Lecture 14 | Deep Reinforcement Learning (Stanford)](http://goo.gl/hTj627)__


In [90]:
# Embed the referenced YouTube video in the notebook output as a 560x315 inline frame.
from IPython.display import IFrame
IFrame(src="https://www.youtube.com/embed/OYhFoMySoVs", width=560, height=315)

### Import Dependencies

In [91]:
import os
import gym
import numpy as np
import random
from collections import deque  # list accessible from both ends
from keras.models import Sequential
from keras.layers import Dense  # only using dense layers
from keras. optimizers import Adam


### Set Parameters

In [92]:
# Create the gym environment the agent will interact with (the CartPole-v0 task).
env = gym.make('CartPole-v0')

In [93]:
# State size = length of the environment's observation vector
# (first entry of the observation space's shape); used as the network's input dimension.
state_size = env.observation_space.shape[0]

In [94]:
# Action size = number of actions reported by the environment's action space;
# used as the number of output neurons of the network (one Q-value per action).
action_size = env.action_space.n

In [95]:
# Number of remembered transitions sampled from the agent's memory on each replay (training) call.
batch_size = 32  # conventionally a power of 2; can be tuned


In [96]:
# Number of episodes, i.e. the number of games to play.
# Every episode adds transitions to the agent's memory, providing more data for training.
# 1001 (rather than 1000) makes the last episode index 1000, so the final episode
# also hits the "save every 50 episodes" checkpoint in the training loop.
n_episodes = 1001


In [97]:
# Directory where the model weights are written during training.
output_dir = 'model_output/cartpole'
# Create the directory (including missing parents). exist_ok=True makes re-running
# this cell a no-op and avoids the check-then-create race of a separate exists() test.
os.makedirs(output_dir, exist_ok=True)

### Formally Define the Agent

In [98]:
class DQNAgent:
    """Deep Q-Network agent with experience replay and an epsilon-greedy policy.

    The agent stores (state, action, reward, next_state, done) transitions in a
    bounded memory, picks actions either at random (exploration) or from the
    network's Q-value estimates (exploitation), and trains the network on random
    samples of the remembered transitions.

    Attributes:
        state_size: number of input features of the network.
        action_size: number of actions, i.e. number of network outputs.
        memory: deque of remembered transitions; oldest entries are discarded
            once ``memory_size`` is reached.
        gamma: discount factor applied to the estimated future reward.
        epsilon: current exploration rate (probability of a random action).
        epsilon_decay: multiplicative decay applied to epsilon after each replay.
        epsilon_min: floor for epsilon, so a little exploration always remains.
        learning_rate: learning rate handed to the Adam optimizer.
        model: the Keras model approximating Q*.
    """

    def __init__(self, state_size, action_size, memory_size=2000, gamma=0.95,
                 epsilon=1.0, epsilon_decay=0.995, epsilon_min=0.01,
                 learning_rate=0.001):
        """Set up hyperparameters, the replay memory and the Q-network.

        Parameters:
            state_size: dimension of the state vector fed to the network
            action_size: number of possible actions
            memory_size: maximum number of transitions kept in memory
            gamma: discount factor for future rewards
            epsilon: initial exploration rate (1.0 = always explore at the start,
                because the agent does not know anything yet)
            epsilon_decay: factor epsilon is multiplied by after each replay
            epsilon_min: lowest value epsilon is allowed to reach
            learning_rate: optimizer learning rate
        """
        # initial parameters
        self.state_size = state_size
        self.action_size = action_size

        # bounded memory: once the limit is reached the oldest transitions are
        # dropped, so training samples come from a recent, diverse window
        self.memory = deque(maxlen=memory_size)

        # discount applied to the estimated future reward
        self.gamma = gamma

        # exploration rate, its decay and its floor: the agent shifts from random
        # exploration towards exploiting what it has learned, but never below the floor
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min

        # learning rate for the optimizer
        self.learning_rate = learning_rate

        # build the network last: _build_model reads the attributes set above
        self.model = self._build_model()

    def _build_model(self):
        """ Define the dense neural network for approximating the Q* """
        model = Sequential()

        # shallow network: two hidden dense layers of 24 relu units each;
        # the input dimension is the size of the state vector
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))

        # one linear output per action: the outputs are Q-value estimates,
        # not probabilities, so no squashing activation is applied
        model.add(Dense(self.action_size, activation='linear'))

        # mean squared error between predicted and target Q-values
        # NOTE(review): newer Keras releases name this argument `learning_rate`
        # instead of `lr` -- confirm against the installed Keras version.
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))

        return model

    def remember(self, state, action, reward, next_state, done):
        """
            Appends one transition to the memory deque

            Parameters:
                state: state at the current time step
                action: action taken at the current time step
                reward: reward received at the current time step
                next_state: state reached after taking the action
                done: whether the episode ended with this step
        """
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """
            Decide what action to take within a state: explore/exploit

            Parameters:
                state: current state, in the shape the model's predict expects
                       (the training loop in this notebook passes a (1, state_size) array)

            Returns:
                index of the chosen action
        """
        # sample a random value between 0 and 1; if it is less than or equal to
        # epsilon, act randomly (exploration)
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)

        # otherwise exploit: predict the Q-value of every action for this state
        act_values = self.model.predict(state)

        # return the action with the highest predicted value
        return np.argmax(act_values[0])

    def replay(self, batch_size):
        """
            Train the network on a random sample of remembered transitions,
            then decay epsilon

            Parameters:
                batch_size: number of transitions to sample from memory

            Does nothing (no training, no epsilon decay) when fewer than
            batch_size transitions have been remembered so far.
        """
        # random.sample raises ValueError when asked for more items than exist,
        # so skip training until enough transitions have been collected
        if len(self.memory) < batch_size:
            return

        # mini batch: a random sample from the memory
        minibatch = random.sample(self.memory, batch_size)

        for (state, action, reward, next_state, done) in minibatch:
            # when the episode ended on this step there is no future reward:
            # the target is just the observed reward
            target = reward

            # otherwise the target is the current reward plus the discounted
            # best Q-value the network predicts for the next state
            if not done:
                target = (reward + self.gamma * np.amax(self.model.predict(next_state)[0]))

            # take the network's current predictions for this state and overwrite
            # only the entry of the action that was actually taken
            target_f = self.model.predict(state)
            target_f[0][action] = target

            # one epoch of fitting on this single (state, target) pair
            self.model.fit(state, target_f, epochs=1, verbose=0)

        # decay epsilon, clamped so it never drops below the configured floor
        if self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def load(self, name):
        """ Load model weights from the file `name` """
        self.model.load_weights(name)

    def save(self, name):
        """ Save model weights to the file `name` for later use """
        self.model.save_weights(name)
            
                

In [99]:
# Instantiate the agent for this environment's state and action dimensions.
agent = DQNAgent(state_size, action_size)

In [102]:
# Training loop: play n_episodes games, remembering every transition, and let the
# agent train on a sample of its memory after each episode.
done = False
for e in range(n_episodes):

    # start each episode from a freshly reset environment
    state = env.reset()

    # reshape the observation into a (1, state_size) row so it fits the network input
    state = np.reshape(state, [1, state_size])

    # iterate over the timesteps of the game; 5000 is an upper bound on how long
    # a single episode is allowed to run
    for time in range(5000):

        # draw the current frame (this slows training; remove for headless runs)
        env.render()

        # choose an action for the current state (explore or exploit)
        action = agent.act(state)

        # apply the action and observe the outcome
        next_state, reward, done, _ = env.step(action)

        # penalise the step that ends the episode with a reward of -10
        reward = reward if not done else -10

        # reshape the next state the same way as the current one
        next_state = np.reshape(next_state, [1, state_size])

        # store the transition so it can be sampled during replay
        agent.remember(state, action, reward, next_state, done)

        # the next iteration continues from the state just reached
        state = next_state

        if done:
            print("Episode: {}/{}, score: {}, e: {:.2}".format(e, n_episodes, time, agent.epsilon))
            break

    # train once per episode, as soon as the memory holds more than one batch;
    # replay fits the network on sampled transitions (minimising the MSE between
    # predicted and target Q-values)
    if len(agent.memory) > batch_size:
        agent.replay(batch_size)

    # every 50 episodes save the weights *inside* output_dir; os.path.join inserts
    # the path separator that plain string concatenation was missing
    if e % 50 == 0:
        agent.save(os.path.join(output_dir, "weights_" + '{:04d}'.format(e) + ".hdf5"))

Episode: 0/1001, score: 134, e: 0.043
Episode: 1/1001, score: 137, e: 0.042
Episode: 2/1001, score: 165, e: 0.042
Episode: 3/1001, score: 112, e: 0.042
Episode: 4/1001, score: 122, e: 0.042
Episode: 5/1001, score: 128, e: 0.041
Episode: 6/1001, score: 110, e: 0.041
Episode: 7/1001, score: 199, e: 0.041
Episode: 8/1001, score: 40, e: 0.041
Episode: 9/1001, score: 104, e: 0.041
Episode: 10/1001, score: 17, e: 0.04
Episode: 11/1001, score: 37, e: 0.04
Episode: 12/1001, score: 30, e: 0.04
Episode: 13/1001, score: 28, e: 0.04
Episode: 14/1001, score: 26, e: 0.04
Episode: 15/1001, score: 10, e: 0.039
Episode: 16/1001, score: 13, e: 0.039
Episode: 17/1001, score: 15, e: 0.039
Episode: 18/1001, score: 16, e: 0.039
Episode: 19/1001, score: 16, e: 0.039
Episode: 20/1001, score: 13, e: 0.038
Episode: 21/1001, score: 15, e: 0.038
Episode: 22/1001, score: 19, e: 0.038
Episode: 23/1001, score: 17, e: 0.038
Episode: 24/1001, score: 21, e: 0.038
Episode: 25/1001, score: 20, e: 0.038
Episode: 26/1001, 

KeyboardInterrupt: 