In [1]:
import tensorflow as tf
import tensorflow.keras as keras
import numpy as np
import gym
import random

In [2]:
keras.backend.set_floatx('float64')

In [3]:
class Agent(keras.Model):
    
    def __init__(self, h_s, n_action):
        super(Agent, self).__init__()
        self.fc1 = keras.layers.Dense(h_s, activation='relu', name='layer_1')
        self.fc2 = keras.layers.Dense(n_action, activation=None, name='dqn_output')
        
    def call(self, inputs):
        x = self.fc1(inputs)
        x = self.fc2(x)
        return x

In [4]:
# env = gym.make('CartPole-v0')
# env.reset()
# for _ in range(1000):
#     env.render()
#     _, rew, done, _ = env.step(env.action_space.sample()) # take a random action
#     if done:
#         env.reset()
# env.close()

In [5]:
class DQN:
    
    def __init__(self, n_features, n_action, memory_limit, epsilon, params_change_pointer, gamma, learning_rate):
        
        self.n_features = n_features
        self.n_action = n_action
        self.experience_limit = memory_limit
        self.experiene_counter = 0
        self.epsilon = epsilon
        self.params_change_pointer = params_change_pointer
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.batch_size = 100
        self.learning_counter = 0
        self.memory = np.zeros([self.experience_limit, self.n_features*2+2])
        
        self.build_models()
    
    def build_models(self):
        
        self.primary_network = Agent(10, self.n_action)       
        self.target_network = Agent(10, self.n_action)
        self.optimizer = keras.optimizers.Adam(learning_rate=self.learning_rate)
        
    def fit(self):
        
        if self.experiene_counter < self.experience_limit:
            indices = np.random.choice(self.experiene_counter, size=self.batch_size)
        else:
            indices = np.random.choice(self.experience_limit, size=self.batch_size)
        batch = self.memory[indices, :]
        q_p = self.primary_network(batch[:, :self.n_features])
        q_t = self.target_network(batch[:, -self.n_features:])
        
        q_eval = q_p.numpy().copy()
        batch_idx = np.arange(self.batch_size, dtype=np.int32)
        actions = self.memory[indices, self.n_features].astype(int)
        rewards = self.memory[indices, self.n_features+1]
        q_eval[batch_idx, actions] = rewards + self.gamma * np.max(q_t, axis=1)
        
        with tf.GradientTape() as tape:
            prediction = self.primary_network(batch[:, :self.n_features])
            loss = keras.losses.MSE(y_true=q_eval, y_pred=prediction)
        gradients = tape.gradient(loss, self.primary_network.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.primary_network.trainable_variables))
        
        #self.primary_network.fit(batch[:, :self.n_features], q_eval, epochs=1, verbose=0)
        
        if self.epsilon > 0.1:
            self.epsilon -= 0.0002
        if self.learning_counter % self.params_change_pointer == 0:
            self.target_params_change()
        self.learning_counter += 1
    
    def epsilon_greedy(self, obs):
        if np.random.uniform(low=0, high=1) > self.epsilon:
            return np.argmax(self.primary_network(obs[np.newaxis, :]))
        else:
            return np.random.choice(self.n_action)
    
    def store_experience(self, obs, a, r, obs_):
        index = self.experiene_counter % self.experience_limit
        self.memory[index] = np.hstack((obs, [a, r], obs_))
        self.experiene_counter += 1
    
    def target_params_change(self):
        for i, layer in enumerate(self.primary_network.layers):
            weights = layer.get_weights()
            self.target_network.layers[i].set_weights(weights)
        print('Target network parameters changed...')

In [6]:
env = gym.make('CartPole-v0')
dqn = DQN(env.observation_space.shape[0], env.action_space.n, 2000, 0.4, 100, 0.99, 0.01)

In [7]:
EPISODES = 150
def train():
    total_steps = 0
    for episode in range(EPISODES):
        obs = env.reset()
        steps = 0
        total_reward = 0
        while True:
            env.render()
            a = dqn.epsilon_greedy(obs)
            obs_, r, d, _ = env.step(a)
            x, vel, angle, angle_vel = obs_
            r1 = (env.x_threshold - abs(x))/env.x_threshold-0.8
            r2 = (env.theta_threshold_radians - abs(angle))/env.theta_threshold_radians - 0.5
            r = r1 + r2
            total_reward += r
            dqn.store_experience(obs, a, r, obs_)
            if total_steps > 1000:
                dqn.fit()
            if d:
                break
            obs = obs_
            steps += 1
            total_steps += 1
        print("Episode {} is completed at epsilon {} with reward {} in steps {}".format(episode+1, dqn.epsilon,
                                                                                        total_reward, steps))
    env.close()

In [8]:
train()

Episode 1 is completed at epsilon 0.4 with reward 2.703025730927006 in steps 12
Episode 2 is completed at epsilon 0.4 with reward 2.3109763157241137 in steps 8
Episode 3 is completed at epsilon 0.4 with reward 4.351363436519171 in steps 13
Episode 4 is completed at epsilon 0.4 with reward 3.058969795487201 in steps 10
Episode 5 is completed at epsilon 0.4 with reward 2.570171420438006 in steps 14
Episode 6 is completed at epsilon 0.4 with reward 1.5258085135832538 in steps 7
Episode 7 is completed at epsilon 0.4 with reward 2.117470652380274 in steps 10
Episode 8 is completed at epsilon 0.4 with reward 1.913066454681036 in steps 9
Episode 9 is completed at epsilon 0.4 with reward 2.4587688859524257 in steps 10
Episode 10 is completed at epsilon 0.4 with reward 1.5031734405181285 in steps 11
Episode 11 is completed at epsilon 0.4 with reward 1.5377572625075513 in steps 7
Episode 12 is completed at epsilon 0.4 with reward 2.9201138611425295 in steps 10
Episode 13 is completed at epsilon 

Episode 102 is completed at epsilon 0.38620000000000154 with reward 3.074088769257816 in steps 13
Episode 103 is completed at epsilon 0.38360000000000183 with reward 2.831164439590162 in steps 12
Episode 104 is completed at epsilon 0.38080000000000214 with reward 4.399405031715874 in steps 13
Target network parameters changed...
Episode 105 is completed at epsilon 0.37880000000000236 with reward 2.857107035432395 in steps 9
Episode 106 is completed at epsilon 0.37700000000000256 with reward 1.0086793836750083 in steps 8
Episode 107 is completed at epsilon 0.3746000000000028 with reward 1.3625694650415354 in steps 11
Episode 108 is completed at epsilon 0.37240000000000306 with reward 2.9329829219942574 in steps 10
Episode 109 is completed at epsilon 0.3702000000000033 with reward 2.3285375723242363 in steps 10
Episode 110 is completed at epsilon 0.3684000000000035 with reward 1.491958071534467 in steps 8
Episode 111 is completed at epsilon 0.3664000000000037 with reward 1.52244654850610