In [1]:
#import random. includes the module into the namespace under the name 'random'. from random import random. includes the function'random' from the namespace 'random' into the global namespace
import random
import gym # Gym is a toolkit for developing and comparing reinforcement learning algorithms. It makes no assumptions about the structure of your agent, and is compatible with any numerical computation library, such as TensorFlow or Theano
import numpy as np
from collections import deque #A double-ended queue, or deque, supports adding and removing elements from either end. The more commonly used stacks and queues are degenerate forms of deques, where the inputs and outputs are restricted to a single end.
from keras.models import Sequential  # Sequential model In Keras, you assemble layers to build models. A model is (usually) a graph of layers. The most common type of model is a stack of layers: the tf.keras.Sequential model.To build a simple, fully-connected network (i.e. multi-layer perceptron
from keras.layers import Dense
from keras.optimizers import Adam
from keras import backend as K

import tensorflow as tf # End-to-end open source machine learning platform Build and train models by using the high-level Keras API
# Keras is a high-level neural networks API, written in Python and capable of running on top of TensorFlow, CNTK, or Theano. It was developed with a focus on enabling fast experimentation. Being able to go from idea to result with the least possible delay is key to doing good research
EPISODES = 2000 

class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000) # Batch size: size of each batch self.memory = deque(maxlen = buffer_size) ... We do this so that the states that we feed to the model are not temporally correlated.
        self.gamma = 0.95    # A discounted rate in Gamma distribution.Inherits From: Distribution The Gamma distribution is defined over positive real numbers using parameters concentration (aka "alpha") and rate (aka "beta").
        self.epsilon = 1.0  # A new exploration rate Because this policy has an exploratory component, we must override the turnOffExploration() and turnOnExploration() functions, so that when evaluating the policy’s performance the exploratory component may be automatically disabled so as not to influence results:
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.99
        self.learning_rate = 0.001
        self.model = self._build_model()
        self.target_model = self._build_model()
        self.update_target_model()


    def _huber_loss(self, y_true, y_pred, clip_delta=1.0): #In statistics, the Huber loss is a loss function used in robust regression, that is less sensitive to outliers in data than the squared error loss. A variant for classification is also sometimes used. Contents. 1 Definition; 2 Motivation; 3 Pseudo-Huber loss function
        error = y_true - y_pred
        cond  = K.abs(error) <= clip_delta

        squared_loss = 0.5 * K.square(error) # Custom Metrics in Keras 
        quadratic_loss = 0.5 * K.square(clip_delta) + clip_delta * (K.abs(error) - clip_delta)

        return K.mean(tf.where(cond, squared_loss, quadratic_loss)) # more or less then that result is passed to K.square, which as expected, returns the square of its parameter, and then that result is given to K.mean, which computes the mean.

    def _build_model(self):
        # Defining the model in the Neural Net for Deep-Q learning Model
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss=self._huber_loss,
                      optimizer=Adam(lr=self.learning_rate))
        return model

    def update_target_model(self):
        # Direct copying of weights from the other model into target_model
        self.target_model.set_weights(self.model.get_weights())

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])  # Returns the indices of the maximum values along an axis. A return of the value action

    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = self.model.predict(state)
            if done:
                target[0][action] = reward
            else:
                # a = self.model.predict(next_state)[0]
                t = self.target_model.predict(next_state)[0]
                target[0][action] = reward + self.gamma * np.amax(t)
                # target[0][action] = reward + self.gamma * t[np.argmax(a)]
            self.model.fit(state, target, epochs=1, verbose=0) # For verbose > 0, fit method logs:loss: value of loss function for your training data acc: accuracy value for your training data. For example, using verbose while training the model helps to detect overfitting which occurs if your acc keeps improving while your val_acc gets worse.
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self, name):
        self.model.load_weights(name)

    def save(self, name):
        self.model.save_weights(name)


if __name__ == "__main__":
    env = gym.make('CartPole-v1')
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    agent = DQNAgent(state_size, action_size)
    # agent.load("./save/cartpole-ddqn.h5")
    done = False
    batch_size = 32

    for e in range(EPISODES):
        state = env.reset()
        state = np.reshape(state, [1, state_size])
        for time in range(500):
            # env.render()
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            reward = reward if not done else -10
            next_state = np.reshape(next_state, [1, state_size])
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done:
                agent.update_target_model()
                print("episode: {}/{}, score: {}, e: {:.2}"
                      .format(e, EPISODES, time, agent.epsilon))
                break
            if len(agent.memory) > batch_size:
                agent.replay(batch_size)
        # if e % 10 == 0:
        #     agent.save("./save/cartpole-ddqn.h5")



Using TensorFlow backend.






Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where








episode: 0/2000, score: 44, e: 0.89
episode: 1/2000, score: 39, e: 0.6
episode: 2/2000, score: 12, e: 0.53
episode: 3/2000, score: 10, e: 0.48
episode: 4/2000, score: 13, e: 0.42
episode: 5/2000, score: 12, e: 0.37
episode: 6/2000, score: 10, e: 0.34
episode: 7/2000, score: 8, e: 0.31
episode: 8/2000, score: 10, e: 0.28
episode: 9/2000, score: 9, e: 0.26
episode: 10/2000, score: 13, e: 0.23
episode: 11/2000, score: 8, e: 0.21
episode: 12/2000, score: 9, e: 0.19
episode: 13/2000, score: 11, e: 0.17
episode: 14/2000, score: 23, e: 0.14
episode: 15/2000, score: 31, e: 0.099
episode: 16/2000, score: 147, e: 0.023
episode: 17/2000, score: 425, e: 0.0099
episode: 18/2000, score: 122, e: 0.0099
episode: 19/2000, score: 129, e: 0.0099
episode: 20/2000, score: 170, e: 0.0099
episode: 21/2000, score: 297, e: 0.0099
episode: 22/2000, score: 132, e: 0.0099
episode: 23/2000, score: 117, e: 0.00