# Cartpole DQN

In [1]:
import random
import gym
import numpy as np
from collections import deque
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import os

In [2]:
# ! pip install jdc
import jdc

In [3]:
# Create the Gym 'CartPole-v0' environment that the cells below query for
# its dimensions and step through.
env = gym.make('CartPole-v0')

In [4]:
# Length of one observation vector; passed to the agent, which uses it as the
# input width of its network.
state_size = env.observation_space.shape[0]
# Bare expression so the notebook echoes the value.
state_size

4

In [5]:
# Number of discrete actions; passed to the agent, which uses it as the
# width of the network's output layer.
action_size = env.action_space.n
# Bare expression so the notebook echoes the value.
action_size

2

In [6]:
# Number of stored transitions sampled by each agent.replay() call.
batch_size = 32

In [7]:
# Number of episodes iterated by the loops below. With range(1001) the last
# episode index is 1000, which still hits the every-50-episodes checkpoint.
n_episodes = 1001

In [8]:
# Directory created in the next cell for model checkpoints.
# NOTE(review): the training cell builds file names as
# output_dir + "weights_" + ..., with no path separator, so checkpoints are
# actually written as 'model_output/cartpoleweights_XXXX.hdf5'.
output_dir = 'model_output/cartpole'

In [9]:
# Create the checkpoint directory (including missing parents). exist_ok=True
# makes the call idempotent and removes the check-then-create race of a
# separate os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)

## Define Agent

In [10]:
class DQNAgent:
    """DQN agent for the CartPole notebook.

    Declared empty on purpose: its methods are attached in the following
    cells through the ``%%add_to DQNAgent`` cell magic provided by jdc.
    """

In [11]:
%%add_to DQNAgent
def __init__(self, state_size, action_size, model = None, *,
             memory_size = 2000, gamma = 0.95, epsilon = 1.0,
             epsilon_decay = 0.995, epsilon_floor = 0.01,
             learning_rate = 0.001):
    """Set up the replay memory, hyperparameters and Q-network.

    The keyword-only hyperparameters default to the values that were
    previously hard-coded, so existing calls behave exactly as before.

    Args:
        state_size: Width of the network input (read by ``_build_model``).
        action_size: Number of actions (width of the network output and
            range of random actions).
        model: Optional pre-built model. When ``None`` a new one is created
            with ``self._build_model()``.
        memory_size: Maximum number of transitions kept in the replay
            memory; older ones are dropped by the bounded deque.
        gamma: Discount factor applied to the next state's best Q-value.
        epsilon: Initial probability of taking a random action.
        epsilon_decay: Factor applied to epsilon after each replay.
        epsilon_floor: Lower bound checked before epsilon is decayed.
        learning_rate: Adam learning rate used when the model is built here.
    """
    self.state_size = state_size
    self.action_size = action_size

    self.memory = deque(maxlen = memory_size)

    self.gamma = gamma
    self.epsilon = epsilon
    self.epsilon_decay = epsilon_decay
    self.epsilon_floor = epsilon_floor

    # Must be set before _build_model(), which reads it for the optimizer.
    self.learning_rate = learning_rate
    self.model = self._build_model() if model is None else model

In [12]:
%%add_to DQNAgent
def _build_model(self):
    """Build and compile the Q-value network.

    Two hidden ReLU layers of 24 units feed a linear output layer with one
    unit per action. The model is compiled with MSE loss and an Adam
    optimizer using ``self.learning_rate``.

    Returns:
        The compiled ``Sequential`` model.
    """
    layers = [
        Dense(24, input_dim = self.state_size, activation = 'relu'),
        Dense(24, activation = 'relu'),
        Dense(self.action_size, activation = 'linear'),
    ]
    network = Sequential(layers)

    optimizer = Adam(learning_rate=self.learning_rate)
    network.compile(loss = 'mse', optimizer = optimizer)
    return network

In [13]:
%%add_to DQNAgent
def remember(self, state, action, reward, next_state, done):
    """Append one (state, action, reward, next_state, done) transition to
    the replay memory."""
    transition = (state, action, reward, next_state, done)
    self.memory.append(transition)

In [20]:
%%add_to DQNAgent
def act(self, state, training = True):
    """Choose an action for ``state`` with an epsilon-greedy policy.

    Args:
        state: Observation in the form accepted by ``self.model.predict``.
        training: When truthy, a random action is taken with probability
            ``self.epsilon``; when falsy the action is always greedy.

    Returns:
        The chosen action index as a plain ``int``.
    """
    # Truthiness rather than `is True`: an identity test silently disabled
    # exploration for truthy flags such as numpy.bool_(True) or 1.
    if training and np.random.rand() <= self.epsilon:
        return random.randrange(self.action_size)
    # verbose=0 keeps Keras from printing progress output on every step.
    act_values = self.model.predict(state, verbose=0)
    # int() so both branches return the same type.
    return int(np.argmax(act_values[0]))

In [15]:
%%add_to DQNAgent
def replay(self, batch_size):
    """Train the network on a random minibatch of stored transitions.

    For every sampled transition the Q-value of the action taken is
    regressed toward ``reward`` (when ``done``) or
    ``reward + gamma * max Q(next_state)`` (otherwise); the remaining
    action values keep the network's current predictions. Afterwards
    epsilon is decayed once and never drops below ``epsilon_floor``.

    Args:
        batch_size: Number of transitions to sample from ``self.memory``.

    Raises:
        ValueError: Raised by ``random.sample`` when the memory holds fewer
            than ``batch_size`` transitions.
    """
    minibatch = random.sample(self.memory, batch_size)

    for state, action, reward, next_state, done in minibatch:
        target = reward
        if not done:
            # verbose=0 keeps Keras from printing progress for each sample.
            next_q = self.model.predict(next_state, verbose=0)[0]
            target = reward + self.gamma * np.amax(next_q)
        target_f = self.model.predict(state, verbose=0)
        # Only the taken action's value is replaced by the target.
        target_f[0][action] = target

        self.model.fit(state, target_f, epochs = 1, verbose = 0)

    # Clamp: a plain multiply could land below the floor on the last decay
    # step (e.g. 0.0100001 * 0.995 < 0.01).
    if self.epsilon > self.epsilon_floor:
        self.epsilon = max(self.epsilon_floor, self.epsilon * self.epsilon_decay)

In [16]:
%%add_to DQNAgent
def load(self, name):
    """Load weights from the file ``name`` into ``self.model``."""
    network = self.model
    network.load_weights(name)

def save(self, name):
    """Write ``self.model`` to the file ``name`` via the model's ``save``."""
    network = self.model
    network.save(name)

In [17]:
# Instantiate the agent with the environment's dimensions. No model is
# passed, so the agent builds its own network.
agent = DQNAgent(state_size=state_size, action_size=action_size)

## Interact with environment

In [18]:
# Training loop: play n_episodes episodes, store every transition, replay
# one minibatch after each episode and checkpoint every 50 episodes.
done = False
for e in range(n_episodes):
    # Reshape to a (1, state_size) row, the form handed to the agent's model.
    state = np.reshape(env.reset(), [1, state_size])

    for time in range(5000):
        env.render()
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)

        # The transition that ends the episode is stored with reward -10.
        if done:
            reward = -10

        next_state = np.reshape(next_state, [1, state_size])
        agent.remember(state, action, reward, next_state, done)
        state = next_state

        if not done:
            continue
        print("episode: {}/{}, score: {}, e:{:.2}".format(e, n_episodes, time, agent.epsilon))
        break

    # Replay only once the memory holds more than one batch of transitions.
    if len(agent.memory) > batch_size:
        agent.replay(batch_size=batch_size)

    # NOTE(review): no path separator between output_dir and "weights_", so
    # files are written as 'model_output/cartpoleweights_XXXX.hdf5' (next to,
    # not inside, output_dir). The later load cell relies on that exact name.
    if e % 50 == 0:
        checkpoint_path = output_dir + "weights_" + "{:04d}".format(e) + '.hdf5'
        agent.save(checkpoint_path)
        

episode: 0/1001, score: 13, e:1.0
episode: 1/1001, score: 21, e:1.0
episode: 2/1001, score: 15, e:0.99
episode: 3/1001, score: 9, e:0.99
episode: 4/1001, score: 19, e:0.99
episode: 5/1001, score: 15, e:0.98
episode: 6/1001, score: 56, e:0.98
episode: 7/1001, score: 10, e:0.97
episode: 8/1001, score: 13, e:0.97
episode: 9/1001, score: 14, e:0.96
episode: 10/1001, score: 25, e:0.96
episode: 11/1001, score: 12, e:0.95
episode: 12/1001, score: 29, e:0.95
episode: 13/1001, score: 17, e:0.94
episode: 14/1001, score: 14, e:0.94
episode: 15/1001, score: 29, e:0.93
episode: 16/1001, score: 28, e:0.93
episode: 17/1001, score: 17, e:0.92
episode: 18/1001, score: 10, e:0.92
episode: 19/1001, score: 15, e:0.91
episode: 20/1001, score: 8, e:0.91
episode: 21/1001, score: 22, e:0.9
episode: 22/1001, score: 25, e:0.9
episode: 23/1001, score: 11, e:0.9
episode: 24/1001, score: 11, e:0.89
episode: 25/1001, score: 15, e:0.89
episode: 26/1001, score: 15, e:0.88
episode: 27/1001, score: 11, e:0.88
episode: 

KeyboardInterrupt: 

In [18]:
# Restore the checkpoint saved at episode 400. The file name has no separator
# between 'cartpole' and 'weights_' because the training cell builds it as
# output_dir + "weights_" + ..., i.e. next to (not inside) output_dir.
agent.load(os.path.join('model_output', 'cartpoleweights_0400.hdf5'))

In [22]:
# Evaluation loop: same episode structure as training, but actions are
# requested with training=False and no replay or checkpointing happens.
# NOTE(review): transitions are still appended to the agent's memory here,
# and the printed epsilon is not used for action selection in this loop.
done = False
for e in range(n_episodes):
    # Reshape to a (1, state_size) row, the form handed to the agent's model.
    state = np.reshape(env.reset(), [1, state_size])

    for time in range(5000):
        env.render()
        action = agent.act(state, training=False)
        next_state, reward, done, _ = env.step(action)

        # The transition that ends the episode is stored with reward -10.
        if done:
            reward = -10

        next_state = np.reshape(next_state, [1, state_size])
        agent.remember(state, action, reward, next_state, done)
        state = next_state

        if not done:
            continue
        print("episode: {}/{}, score: {}, e:{:.2}".format(e, n_episodes, time, agent.epsilon))
        break

episode: 0/1001, score: 199, e:1.0
episode: 1/1001, score: 199, e:1.0
episode: 2/1001, score: 199, e:1.0
episode: 3/1001, score: 199, e:1.0
episode: 4/1001, score: 199, e:1.0
episode: 5/1001, score: 199, e:1.0
episode: 6/1001, score: 199, e:1.0
episode: 7/1001, score: 199, e:1.0
episode: 8/1001, score: 199, e:1.0
episode: 9/1001, score: 199, e:1.0


KeyboardInterrupt: 