In [1]:
import warnings
warnings.simplefilter('ignore')
import tensorflow as tf
print("Is GPU available?", tf.test.is_gpu_available())
print("TF version:", tf.__version__)
print("Keras version:", tf.keras.__version__)

Is GPU available? True
TF version: 1.15.0
Keras version: 2.2.4-tf


In [2]:
import gym
import numpy as np
import random
import matplotlib.pyplot as plt
from collections import deque
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.optimizers import Adam

In [3]:
class Agent():
    def __init__(self, input_shape, n_actions, mem_size, eps, eps_min, eps_dec, gamma, q_eval_name, q_next_name, 
                 replace_freq, lr=0.0001):
        self.Q_eval = self._build_NN(input_shape, n_actions, lr)
        self.Q_next = self._build_NN(input_shape, n_actions, lr)
        #self.Q_eval = load_model('Q_eval.h5')
        #self.Q_next = load_model('Q_next.h5')
        self.memory = deque(maxlen=mem_size)
        self.eps = eps
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.gamma = gamma
        self.replace = replace_freq
        self.action_space = [i for i in range(n_actions)]
        self.steps = 0
        self.input_shape = input_shape
        self.q_eval_name = q_eval_name
        self.q_next_name = q_next_name
    
    def _build_NN(self, input_shape, n_actions, lr):
        
        model = Sequential()
        
        model.add(Dense(32, input_shape=(*input_shape,), activation='relu'))
        model.add(Dense(64, activation='relu'))
        model.add(Dense(n_actions, activation='linear'))
        
        model.compile(optimizer=Adam(lr=lr), loss='mse')
        
        model.summary()
        
        return model
    
    def store(self, state, action, reward, n_state, done):
        pack = [np.expand_dims(state, axis=0), action, reward, np.expand_dims(n_state, axis=0), done]
        self.memory.append(pack)
    
    def take_data(self, batch_size):
        pack = random.sample(self.memory, batch_size)
        states = []
        actions = []
        rewards = []
        n_states = []
        dones = []
        for i in range(batch_size):
            states.append(pack[i][0])
            actions.append(pack[i][1])
            rewards.append(pack[i][2])
            n_states.append(pack[i][3])
            dones.append(pack[i][4])
        return states, actions, rewards, n_states, dones
    
    def choose_action(self, state):
        if np.random.random() > self.eps:
            return np.argmax(self.Q_next.predict(state))
        else:
            return np.random.choice(self.action_space)
    
    def decay_eps(self): 
        self.eps = self.eps - self.eps_dec if self.eps > self.eps_min else self.eps_min
    
    def replace_weights(self):
        if not (self.steps % self.replace):
            self.Q_next.set_weights(self.Q_eval.get_weights())
            print('Atualizado')
    
    def upgrade(self, batch_size=64):
        if len(self.memory) >= 4*batch_size:
            states, actions, rewards, n_states, dones = self.take_data(batch_size)
            
            ### DDQN
            act = [np.argmax(self.Q_eval.predict(n_states[i])) for i in range(batch_size)]
            
            q_next = [self.Q_next.predict(n_states[i])[0][act[i]] for i in range(batch_size)]
            
            y = [self.Q_eval.predict(states[i])[0] for i in range(batch_size)]
            
            for i in range(batch_size):
                y[i][act[i]] = rewards[i] + self.gamma*q_next[i]*dones[i]
            ###
            
            states = np.reshape(states, (batch_size, *self.input_shape))
            
            _ = self.Q_eval.fit(np.array(states), np.array(y), epochs=1, verbose=0, batch_size=batch_size)
            
            self.decay_eps()
            self.replace_weights()

    def save(self):
        self.Q_eval.save(self.q_eval_name)
        self.Q_next.save(self.q_next_name)

In [None]:
env = gym.make('CartPole-v1')
n_games = 1000
agent = Agent(input_shape=env.observation_space.shape, n_actions=env.action_space.n, mem_size=10000, eps=1.0, 
              eps_min=0.001, eps_dec=0.001, gamma=1.0, q_eval_name='Q_eval.h5', q_next_name='Q_next.h5', 
              replace_freq=250)
best_score = -200
scores = []
means = []
eps = []
for i in range(n_games):
    done = False
    state = env.reset()
    score = 0
    while not done:
        action = agent.choose_action(np.expand_dims(state, axis=0))
        n_state, reward, done, _ = env.step(action)
        score += reward
        agent.steps += 1
        agent.store(state, action, reward, n_state, (int(done)))
        agent.upgrade()
        state = n_state
    scores.append(score)
    mean = np.mean(scores[-20:])
    means.append(mean)
    eps.append(agent.eps)
    if mean > best_score:
        agent.save()
        best_score = mean
    print('episode: ', i+1, '   score: ', score, '   eps:  %.3f' %agent.eps)
    if np.mean(scores[-100:]) > 195:
        print("Environment Solved!")
        agent.save()
        break
env.close()

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 32)                160       
_________________________________________________________________
dense_1 (Dense)              (None, 64)                2112      
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 130       
Total params: 2,402
Trainable params: 2,402
Non-trainable params: 0
_________________________________________________________________
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 32)                160       
_________________________________________________________________
dense_4 (Dense)  

episode:  144    score:  8.0    eps:  0.001
episode:  145    score:  9.0    eps:  0.001
episode:  146    score:  9.0    eps:  0.001
episode:  147    score:  9.0    eps:  0.001
episode:  148    score:  9.0    eps:  0.001
episode:  149    score:  10.0    eps:  0.001
episode:  150    score:  10.0    eps:  0.001
episode:  151    score:  10.0    eps:  0.001
episode:  152    score:  10.0    eps:  0.001
episode:  153    score:  10.0    eps:  0.001
episode:  154    score:  10.0    eps:  0.001
episode:  155    score:  10.0    eps:  0.001
episode:  156    score:  8.0    eps:  0.001
episode:  157    score:  9.0    eps:  0.001
episode:  158    score:  8.0    eps:  0.001
episode:  159    score:  9.0    eps:  0.001
episode:  160    score:  10.0    eps:  0.001
episode:  161    score:  10.0    eps:  0.001
episode:  162    score:  9.0    eps:  0.001
episode:  163    score:  11.0    eps:  0.001
episode:  164    score:  9.0    eps:  0.001
episode:  165    score:  10.0    eps:  0.001
episode:  166    scor

episode:  327    score:  10.0    eps:  0.001
episode:  328    score:  8.0    eps:  0.001
Atualizado
episode:  329    score:  10.0    eps:  0.001
episode:  330    score:  9.0    eps:  0.001
episode:  331    score:  9.0    eps:  0.001
episode:  332    score:  10.0    eps:  0.001
episode:  333    score:  8.0    eps:  0.001
episode:  334    score:  9.0    eps:  0.001
episode:  335    score:  10.0    eps:  0.001
episode:  336    score:  9.0    eps:  0.001
episode:  337    score:  10.0    eps:  0.001
episode:  338    score:  10.0    eps:  0.001
episode:  339    score:  9.0    eps:  0.001
episode:  340    score:  8.0    eps:  0.001
episode:  341    score:  10.0    eps:  0.001
episode:  342    score:  10.0    eps:  0.001
episode:  343    score:  10.0    eps:  0.001
episode:  344    score:  9.0    eps:  0.001
episode:  345    score:  8.0    eps:  0.001
episode:  346    score:  10.0    eps:  0.001
episode:  347    score:  8.0    eps:  0.001
episode:  348    score:  9.0    eps:  0.001
episode:  3

In [None]:
plt.plot(scores)
plt.grid()
plt.show()

In [None]:
plt.plot(means)
plt.grid()
plt.show()

In [None]:
plt.plot(eps)
plt.grid()
plt.show()

In [None]:
Q = load_model('Q_next.h5')
env = gym.make('CartPole-v1')
for i in range(20):
    done = False
    state = env.reset()
    while not done:
        env.render()
        action = np.argmax(Q.predict(np.expand_dims(state, axis=0))[0])
        n_state, reward, done, _ = env.step(action)
        state = n_state
env.close()