In [4]:
import gym

import numpy as np
import random

from keras import models, layers, optimizers

from collections import deque

In [2]:
env = gym.make('CartPole-v0')

state_size = env.observation_space.shape[0]
action_size = env.action_space.n

print(state_size, action_size)

batch_size = 32

episodes = 1000

4 2


In [5]:
class dqAgent:
    
    def __init__(self, state_size, action_size):
        
        self.state_size = state_size
        self.action_size = action_size
        
        self.memory = deque(maxlen=1000)
        
        self.gamma = 0.95
        
        self.alpha = 0.001
        
        self.epsilon = 1.0
        self.epsilon_decay = 0.997
        self.epsilon_min = 0.01
        
        self.model = self._build_model()
        
    def _build_model(self):
        
        model = models.Sequential()
        model.add(layers.Dense(12, input_dim= self.state_size,
                               activation='relu'))
        model.add(layers.Dense(24, activation='relu'))
        
        model.add(layers.Dense(self.action_size, activation='linear'))
        
        model.compile(loss='mse', optimizer=
                     optimizers.Adam(lr=self.alpha))
        
        return model
    
    def remember(self, state, action, reward, next_state, done):
        
        self.memory.append((state, action, reward, next_state, done))
    
    def action(self, state):
        
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        
        prediction = self.model.predict(state)
        
        return np.argmax(prediction[0])
    
    def train(self, batch_size):
        
        batch = random.sample(self.memory, batch_size)
        
        for state, action, reward, next_state, done in batch:
            
            target = reward
            
            if not done:
                
                target = (reward + self.gamma * 
                          np.amax(self.model.predict(next_state)[0]))
                
            target_y = self.model.predict(state)
            
            target_y[0][action] = target
            
            self.model.fit(state, target_y, epochs=1, verbose=0)
            
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
            
    def load(self, name):
        self.model.load_weights(name)
        
    def save(self, name):
        self.model.save_weights(name)

In [6]:
agent = dqAgent(state_size, action_size)
agent.model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 12)                60        
_________________________________________________________________
dense_2 (Dense)              (None, 24)                312       
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 50        
Total params: 422
Trainable params: 422
Non-trainable params: 0
_________________________________________________________________


In [9]:
try:
    for e in range(episodes):
    
        state = env.reset()
        state = np.reshape(state, [1, state_size])

        total_reward = 0
        done = False

        while not done:

            action = agent.action(state)

            next_state, reward, done, _ = env.step(action)

            reward = reward if not done else -10

            total_reward += reward

            next_state = np.reshape(next_state, [1, state_size])
            agent.remember(state, action, reward, next_state, done)

            state = next_state

        if e % 100 == 0:
            print(f"Episode:{e}, score: {total_reward}, eps:{agent.epsilon:.2}")
            agent.save(f"{e:04d}.hdf5")

        if len(agent.memory) > batch_size:
            agent.train(batch_size)
        
finally:
    env.close()

Episode:0, score: 26.0, eps:1.0
Episode:100, score: 3.0, eps:0.74
Episode:200, score: 42.0, eps:0.55
Episode:300, score: 90.0, eps:0.41
Episode:400, score: 127.0, eps:0.3
Episode:500, score: 189.0, eps:0.22
Episode:600, score: 189.0, eps:0.16
Episode:700, score: 189.0, eps:0.12
Episode:800, score: 189.0, eps:0.09
Episode:900, score: 189.0, eps:0.067


In [16]:
try:
    for e in range(5):
    
        state = env.reset()
        state = np.reshape(state, [1, state_size])

        total_reward = 0
        done = False

        while not done:

            env.render()
            action = agent.action(state)

            next_state, reward, done, _ = env.step(action)

            total_reward += reward

            next_state = np.reshape(next_state, [1, state_size])

            state = next_state

        print(f"Episode:{e}, score: {total_reward}")
        
finally:
    env.close()

Episode:0, score: 200.0
Episode:1, score: 200.0
Episode:2, score: 200.0
Episode:3, score: 200.0
Episode:4, score: 200.0
