In [6]:
!pip install gym
!pip install gym[classic_control]



In [7]:
from keras.models import Model, load_model
from keras.layers import Dense, Input
from keras.optimizers import Adam, RMSprop
import numpy as np
import gym
from collections import deque
import random

In [8]:
def OurModel(input_shape, action_space):
    X_input= Input(input_shape)
    # 'Dense' is the basic form of a neural network layer
    # Input Layer of state size(4) and Hidden Layer with 512 nodes
    X= Dense(512, input_shape=(input_shape,), activation="relu", kernel_initializer='he_uniform')(X_input)
    print(X)
    # Hidden layer with 256 nodes
    X= Dense(256, activation="relu", kernel_initializer='he_uniform')(X)
    # Hidden layer with 64 nodes
    X= Dense(64, activation='relu', kernel_initializer='he_uniform')(X)
    # Output Layer with # of actions: 2 nodes (left, right)
    X= Dense(action_space, activation='linear', kernel_initializer='he_uniform')(X)

    model= Model(inputs= X_input, outputs=X, name='CartPoleDQNmodel')
    model.compile(loss='mse', optimizer=RMSprop(epsilon=0.01, rho=0.95, learning_rate=0.00025), metrics=["accuracy"])

    model.summary()

    return model

In [9]:
class DQNAgent:
    def __init__(self, r_mode=None):
        self.env= gym.make('CartPole-v1', render_mode= r_mode)
        self.state_size= self.env.observation_space.shape[0]
        self.action_size= self.env.action_space.n
        self.EPISODES= 1000 #número de partidas que queremos que o agente jogue
        self.memory= deque(maxlen=2000)

        self.gamma= 0.95 #taxa de decaimento ou desconto, para calcular a futura recompensa descontada
        self.epsilon= 1.0 #taxa de exploração é a taxa na qual um agente decide aleatoriamente sua ação em vez de uma previsão
        self.epsilon_min= 0.001 #queremos que o agente explore pelo menos esta quantidade
        self.epsilon_decay= 0.999 #queremos diminuir o número de explorações à medida que melhora os jogos
        self.batch_size= 64 # Determina quanta memória o DQN usará para treinar
        self.train_start= 1000 
        
        self.model= OurModel(input_shape= self.state_size, action_space=self.action_size)
    
    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
        if ( len(self.memory)> self.train_start ):
            if ( self.epsilon > self.epsilon_min ):
                self.epsilon *= self.epsilon_decay
    
    def replay(self):
        if ( len(self.memory) < self.train_start ):
            return
        # Randomly sample minibatch from the memory
        minibatch= random.sample(self.memory, min(len(self.memory), self.batch_size))

        state= np.zeros((self.batch_size, self.state_size))
        next_state= np.zeros((self.batch_size, self.state_size))
        action, reward, done= [], [], []

        # do this before prediction
        # for speedup, this could be done on the tensor level
        # but easier to understand using a loop

        for i in range(self.batch_size):
            state[i] = minibatch[i][0]
            action.append(minibatch[i][1])
            reward.append(minibatch[i][2])
            next_state[i] = minibatch[i][3]
            done.append(minibatch[i][4])
        
        # do batch prediction to save speed

        target= self.model.predict(state)
        target_next= self.model.predict(next_state)

        for i in range(self.batch_size):
            # correction on the Q value for the action used
            if done[i]:
                target[i][action[i]] = reward[i]
            else:
                # Standard - DQN
                # DQN chooses the max Q value among next actions
                # selection and evaluation of action is on the target Q Network
                # Q_max = max_a' Q_target(s', a')

                target[i][action[i]] = reward[i] + self.gamma * (np.max(target_next[i]))
        
        self.model.fit(state, target, batch_size= self.batch_size, verbose=0)
    
    def act(self, state):
        if np.random.random() <= self.epsilon:
            return random.randrange(self.action_size)
        else:
            return np.argmax(self.model.predict(state))
    
    def load(self, name):
        self.model = load_model(name)
    
    def save(self, name):
        self.model.save(name)
    
    def run(self):
        for e in range(self.EPISODES):
            state= self.env.reset()[0]
            print(state)
            state = np.reshape(state, [1, self.state_size])
            done= False
            i=0
            while not done:
                #self.env.render()
                action= self.act(state)
                next_state, reward, done, truncated, _ = self.env.step(action)
                next_state = np.reshape(next_state, [1, self.state_size])
                if not done or i== self.env._max_episode_steps-1:
                    reward = reward
                else:
                    reward = -100
                
                self.remember(state, action, reward, next_state, done)
                state = next_state
                i+=1
                
                if done:
                    print("episode: {}/{}, score: {}, e: {:.2}".format(e, self.EPISODES, i, self.epsilon))
                    if i==500:
                        print("Saving trained model as cartpole-dqn.h5")
                        self.save("cartpole-dqn.h5")
                        return
                
                self.replay()
    def test(self):
        self.load("cartpole-dqn.h5")
        for e in range(self.EPISODES):
            state= self.env.reset()
            state= np.reshape(state, [1, self.state_size])
            done= False
            i=0
            while not done:
                self.env.render()
                action= np.argmax(self.model.predict(state))
                next_state, reward, done, _ = self.env.step(action)
                next_state = np.reshape(next_state, [1, self.state_size])
                i+=1
                if done:
                    print("episode: {}/{}, score: {}".format(e, self.EPISODES, i))
                    break

In [10]:
if __name__ == '__main__':
    agent= DQNAgent('human')
    agent.run()
    #agent.test()

KerasTensor(type_spec=TensorSpec(shape=(None, 512), dtype=tf.float32, name=None), name='dense_4/Relu:0', description="created by layer 'dense_4'")
Model: "CartPoleDQNmodel"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 4)]               0         
                                                                 
 dense_4 (Dense)             (None, 512)               2560      
                                                                 
 dense_5 (Dense)             (None, 256)               131328    
                                                                 
 dense_6 (Dense)             (None, 64)                16448     
                                                                 
 dense_7 (Dense)             (None, 2)                 130       
                                                                 
Total params: 150,466
Trainable par

episode: 47/1000, score: 57, e: 0.95
[-0.01050392  0.02285334  0.04067612 -0.03200128]
episode: 48/1000, score: 15, e: 0.94
[-0.00187767  0.01204844  0.02593346  0.04260801]
episode: 49/1000, score: 39, e: 0.9


[-0.03812521  0.00923642 -0.00873916 -0.01568652]
episode: 50/1000, score: 37, e: 0.87
[-0.00487543  0.04132544  0.0413962   0.00867231]


episode: 51/1000, score: 46, e: 0.83
[ 0.03061185  0.03897313 -0.02119974  0.03185323]
episode: 52/1000, score: 34, e: 0.8
[-0.04957194  0.03686651  0.01931255  0.0444658 ]


episode: 53/1000, score: 40, e: 0.77
[ 0.00080486 -0.0201457  -0.00641    -0.04831463]


episode: 54/1000, score: 77, e: 0.71
[ 0.02996972 -0.00838752  0.02272328  0.03413446]


episode: 55/1000, score: 75, e: 0.66
[-0.00444284 -0.02484535 -0.03160096 -0.04339598]




episode: 56/1000, score: 138, e: 0.58
[ 0.0015985  -0.03046422  0.01993542 -0.00627556]


episode: 57/1000, score: 98, e: 0.52
[-0.01218593 -0.03426437  0.00322622  0.03400931]












episode: 58/1000, score: 329, e: 0.38
[-0.04662694  0.02909283  0.01002891  0.04285282]






episode: 59/1000, score: 191, e: 0.31
[ 0.03109652 -0.00580391  0.00120827  0.02522199]










episode: 60/1000, score: 274, e: 0.24
[-0.02193335  0.04037424 -0.04077152  0.01431774]














episode: 61/1000, score: 425, e: 0.15
[ 0.01465021  0.03069281  0.00480235 -0.00497016]








episode: 62/1000, score: 197, e: 0.13
[-0.04634293  0.00420111  0.0329336  -0.03724229]








episode: 63/1000, score: 256, e: 0.098
[-0.01446746 -0.03971748  0.02434145 -0.03725595]














episode: 64/1000, score: 348, e: 0.069
[-0.00250139  0.04305112  0.04869934 -0.01946292]
episode: 65/1000, score: 12, e: 0.068
[ 0.01561588 -0.03668439  0.01258934 -0.0399526 ]










episode: 66/1000, score: 252, e: 0.053
[ 0.03072977 -0.03141075 -0.01561759 -0.00562837]










episode: 67/1000, score: 279, e: 0.04
[ 0.02557238 -0.01029165 -0.02046508 -0.01445812]










episode: 68/1000, score: 276, e: 0.03
[-0.02033386  0.00598604  0.02413445  0.00353641]








episode: 69/1000, score: 223, e: 0.024
[-0.01764004  0.01399918  0.03907744  0.02633155]








episode: 70/1000, score: 209, e: 0.02
[0.04459529 0.02553432 0.02775605 0.00724607]








episode: 71/1000, score: 174, e: 0.017
[ 0.04253048 -0.00674032 -0.04552201 -0.01460777]






episode: 72/1000, score: 208, e: 0.013
[-0.01535547 -0.01254059 -0.01323056 -0.04675671]










episode: 73/1000, score: 226, e: 0.011
[-0.03557271 -0.00975943 -0.04388202 -0.04761278]








episode: 74/1000, score: 240, e: 0.0085
[-0.03898065  0.00671076  0.00250315  0.03740644]








episode: 75/1000, score: 204, e: 0.0069
[ 0.01336882 -0.0446527   0.03494911  0.04400455]






episode: 76/1000, score: 170, e: 0.0058
[-0.04507579 -0.01640452 -0.00199074  0.04266545]










episode: 77/1000, score: 227, e: 0.0046
[-0.02211085  0.03335911  0.03824744 -0.01494474]






episode: 78/1000, score: 198, e: 0.0038
[ 0.00501037 -0.03718429  0.04505588 -0.02634325]








episode: 79/1000, score: 204, e: 0.0031
[-0.00436366  0.01725498  0.04112575 -0.02730641]








episode: 80/1000, score: 212, e: 0.0025
[ 0.03402697  0.03701936  0.01448859 -0.01161189]








episode: 81/1000, score: 191, e: 0.0021
[-0.03614484  0.02052335  0.0441617   0.02513903]








episode: 82/1000, score: 210, e: 0.0017
[-0.04551498 -0.04306786  0.00214844  0.0211266 ]








episode: 83/1000, score: 249, e: 0.0013


[ 0.04952122  0.00971727  0.03764711 -0.02507203]






episode: 84/1000, score: 187, e: 0.0011
[-0.04545487  0.04357622 -0.01637964  0.01242099]










episode: 85/1000, score: 241, e: 0.001
[-0.04925543 -0.0447738  -0.01732186  0.01076947]










episode: 86/1000, score: 279, e: 0.001
[ 0.02214205 -0.0370262   0.03536401 -0.03007418]








episode: 87/1000, score: 221, e: 0.001
[ 0.04576099  0.00219458  0.02781304 -0.01883204]








episode: 88/1000, score: 203, e: 0.001
[0.02566009 0.04891906 0.02756285 0.03977514]








episode: 89/1000, score: 201, e: 0.001
[-0.03340961 -0.00311196  0.0397622   0.03943063]








episode: 90/1000, score: 230, e: 0.001
[-0.03612615 -0.04731482 -0.02085652 -0.0089409 ]














episode: 91/1000, score: 355, e: 0.001
[-0.03394734 -0.00732114  0.02102923  0.0171703 ]










episode: 92/1000, score: 258, e: 0.001
[ 0.01648515  0.00406859 -0.01726238 -0.00528554]










KeyboardInterrupt: 