In [None]:
# !pip install gym
# !apt install freeglut3-dev -y
# !pip3 install tensorflow
# !apt-get install -y ffmpeg
# !apt-get install -y python3-opengl

In [1]:
import warnings
warnings.filterwarnings('ignore')

import random
import gym
import numpy as np
from collections import deque

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
#from scores.score_logger import ScoreLogger

In [2]:
# Gym environment id handed to gym.make().
ENV_NAME = "CartPole-v1"

# Q-learning discount factor and the optimizer step size.
GAMMA = 0.95
LEARNING_RATE = 1e-3

# Replay buffer capacity and the number of transitions sampled per replay.
MEMORY_SIZE = 1_000_000
BATCH_SIZE = 5  # 20 is noted as an alternative value

# Epsilon-greedy schedule: start value, floor, and per-replay multiplier.
EXPLORATION_MAX = 1.0
EXPLORATION_MIN = 0.01
EXPLORATION_DECAY = 0.995


class DQNSolver:
    """Deep Q-Network agent with epsilon-greedy exploration and experience replay.

    The Q-function is a small fully connected network (two hidden layers of 24
    ReLU units, one linear output per action) trained with MSE loss and Adam.
    States passed to the methods are expected to be batched as
    ``(1, observation_space)`` arrays, which is how the callers in this
    notebook reshape them.
    """

    def __init__(self, observation_space, action_space):
        """Build the Q-network and an empty replay buffer.

        Args:
            observation_space: Number of features in one observation.
            action_space: Number of discrete actions (network output size).
        """
        self.exploration_rate = EXPLORATION_MAX

        self.action_space = action_space
        # Bounded buffer: once full, the oldest transitions are dropped.
        self.memory = deque(maxlen=MEMORY_SIZE)

        self.model = Sequential()
        self.model.add(Dense(24, input_shape=(observation_space,), activation="relu"))
        self.model.add(Dense(24, activation="relu"))
        self.model.add(Dense(self.action_space, activation="linear"))
        # `lr` is a deprecated alias that newer Keras releases reject;
        # `learning_rate` is the supported keyword.
        self.model.compile(loss="mse", optimizer=Adam(learning_rate=LEARNING_RATE))

    def remember(self, state, action, reward, next_state, done):
        """Append one transition ``(state, action, reward, next_state, done)`` to the buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action epsilon-greedily.

        With probability ``exploration_rate`` a uniformly random action is
        returned; otherwise the action with the highest predicted Q-value.

        Args:
            state: Observation batched as ``(1, observation_space)``.

        Returns:
            The chosen action index as a plain ``int``.
        """
        if np.random.rand() < self.exploration_rate:
            return random.randrange(self.action_space)
        # verbose=0 keeps Keras from printing a progress bar on every step.
        q_values = self.model.predict(state, verbose=0)
        # Cast so both branches return the same type (argmax yields np.int64).
        return int(np.argmax(q_values[0]))

    def experience_replay(self):
        """Train on a random mini-batch of stored transitions, then decay epsilon.

        Does nothing until at least ``BATCH_SIZE`` transitions are stored.
        Each sampled transition is fitted individually: the target for the
        taken action is ``reward`` for terminal transitions and
        ``reward + GAMMA * max_a Q(next_state, a)`` otherwise; the other
        actions keep the network's current predictions. After the batch the
        exploration rate is multiplied by ``EXPLORATION_DECAY`` and clamped at
        ``EXPLORATION_MIN``.
        """
        if len(self.memory) < BATCH_SIZE:
            return
        batch = random.sample(self.memory, BATCH_SIZE)
        for state, action, reward, state_next, terminal in batch:
            q_update = reward
            if not terminal:
                next_q = self.model.predict(state_next, verbose=0)[0]
                q_update = reward + GAMMA * np.amax(next_q)
            # Start from current predictions so only the taken action's
            # output contributes to the loss.
            q_values = self.model.predict(state, verbose=0)
            q_values[0][action] = q_update
            self.model.fit(state, q_values, verbose=0)
        self.exploration_rate *= EXPLORATION_DECAY
        self.exploration_rate = max(EXPLORATION_MIN, self.exploration_rate)


def _reset_env(env):
    """Reset ``env`` and return only the initial observation (old and new Gym APIs)."""
    result = env.reset()
    # Gym >= 0.26 returns (observation, info); older releases return the
    # observation alone.
    if isinstance(result, tuple) and len(result) == 2 and isinstance(result[1], dict):
        return result[0]
    return result


def _step_env(env, action):
    """Step ``env`` and return ``(observation, reward, done, info)`` for either Gym API."""
    result = env.step(action)
    if len(result) == 5:
        # Gym >= 0.26 splits the end-of-episode flag into terminated/truncated.
        state_next, reward, terminated, truncated, info = result
        return state_next, reward, bool(terminated or truncated), info
    return result


def cartpole(max_runs=None, render=True):
    """Train a DQNSolver on ``ENV_NAME`` and print every new best episode length.

    Each episode runs until the environment reports it is over. The reward of
    the final step is negated before being stored, and a replay/training step
    is performed after every non-final step.

    Args:
        max_runs: Number of episodes to play before returning. ``None`` (the
            default) trains until interrupted, as before.
        render: Whether to call ``env.render()`` on every step. Pass ``False``
            for headless or faster training.

    Returns:
        The longest episode length (in steps) seen, once ``max_runs`` episodes
        have been played.
    """
    env = gym.make(ENV_NAME)
    try:
        #score_logger = ScoreLogger(ENV_NAME)
        observation_space = env.observation_space.shape[0]
        action_space = env.action_space.n
        dqn_solver = DQNSolver(observation_space, action_space)
        run = 0
        max_score = 0
        while max_runs is None or run < max_runs:
            run += 1
            # The network expects a batch dimension: (1, observation_space).
            state = np.reshape(_reset_env(env), [1, observation_space])
            step = 0
            while True:
                step += 1
                if render:
                    env.render()
                action = dqn_solver.act(state)
                state_next, reward, terminal, info = _step_env(env, action)
                # Penalise the step that ends the episode.
                reward = reward if not terminal else -reward
                state_next = np.reshape(state_next, [1, observation_space])
                dqn_solver.remember(state, action, reward, state_next, terminal)
                state = state_next
                if terminal:
                    # Only report episodes that match or beat the best so far.
                    if step >= max_score:
                        print("Run: " + str(run) + ", exploration: " +
                              str(dqn_solver.exploration_rate) + ", score: " + str(step))
                        # score_logger.add_score(step, run)
                        max_score = step
                    break
                dqn_solver.experience_replay()
        return max_score
    finally:
        # Always release the environment (and its render window), including
        # when training is interrupted.
        env.close()

In [None]:
# Manual walkthrough, part 1: build a fresh environment and agent at notebook
# scope so a single step can be executed and inspected cell by cell.
env = gym.make(ENV_NAME)
#score_logger = ScoreLogger(ENV_NAME)
observation_space = env.observation_space.shape[0]
action_space = env.action_space.n
dqn_solver = DQNSolver(observation_space, action_space)
run = 0

# Begin the first episode.
run += 1
reset_result = env.reset()
# Gym >= 0.26 returns (observation, info); older releases return the
# observation alone.
if isinstance(reset_result, tuple) and len(reset_result) == 2 and isinstance(reset_result[1], dict):
    state = reset_result[0]
else:
    state = reset_result
# The network expects a batch dimension: (1, observation_space).
state = np.reshape(state, [1, observation_space])
step = 0

In [None]:
# Manual walkthrough, part 2: execute ONE environment step plus one replay
# update, printing the intermediate values. Re-run this cell to advance.
step += 1
env.render()
# Inlined equivalent of dqn_solver.act(state), with the Q-values printed.
if np.random.rand() < dqn_solver.exploration_rate:
    action = random.randrange(dqn_solver.action_space)
else:
    q_values = dqn_solver.model.predict(state, verbose=0)
    print('q_values =', q_values[0])
    action = np.argmax(q_values[0])
print('action =', action)

step_result = env.step(action)
if len(step_result) == 5:
    # Gym >= 0.26 splits the end-of-episode flag into terminated/truncated.
    state_next, reward, terminated, truncated, info = step_result
    terminal = bool(terminated or truncated)
else:
    state_next, reward, terminal, info = step_result
print('cart_position, cart_velocity, pole_angle, pole_velocity')
print('state_n = ', state_next)
print('reward  = ', reward)
print('terminal= ', terminal)
print('info    = ', info)

# Penalise the step that ends the episode.
reward = reward if not terminal else -reward
state_next = np.reshape(state_next, [1, observation_space])
dqn_solver.remember(state, action, reward, state_next, terminal)
state = state_next
print('state = ', state)
if terminal:
    print("Run: " + str(run) + ", exploration: " + 
          str(dqn_solver.exploration_rate) + ", score: " + str(step))
    # score_logger.add_score(step, run)
    run += 1
    reset_result = env.reset()
    # Gym >= 0.26 returns (observation, info); older releases return the
    # observation alone.
    if isinstance(reset_result, tuple) and len(reset_result) == 2 and isinstance(reset_result[1], dict):
        state = reset_result[0]
    else:
        state = reset_result
    state = np.reshape(state, [1, observation_space])
    step = 0

# Inlined equivalent of dqn_solver.experience_replay(), with targets printed.
# The loop variables are deliberately prefixed: this code runs at notebook
# scope, so reusing `state`/`action`/... here would overwrite the live
# environment state and the next execution of this cell would act from a
# sampled past state instead of the current observation.
if len(dqn_solver.memory) < BATCH_SIZE:
    print('collect more samples!')
else:
    batch = random.sample(dqn_solver.memory, BATCH_SIZE)
    for mem_state, mem_action, mem_reward, mem_state_next, mem_terminal in batch:
        q_update = mem_reward
        if not mem_terminal:
            next_q = dqn_solver.model.predict(mem_state_next, verbose=0)[0]
            q_update = (mem_reward + GAMMA * np.amax(next_q))
        target_q = dqn_solver.model.predict(mem_state, verbose=0)
        target_q[0][mem_action] = q_update
        print('q_values =', target_q[0])
        dqn_solver.model.fit(mem_state, target_q, verbose=0)
    dqn_solver.exploration_rate *= EXPLORATION_DECAY
    dqn_solver.exploration_rate = max(EXPLORATION_MIN, dqn_solver.exploration_rate)

In [None]:
# Entry point: start training. With no arguments cartpole() loops over
# episodes indefinitely, so this cell runs until it is interrupted.
if __name__ == "__main__":
    cartpole()

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Run: 1, exploration: 0.9322301194154049, score: 19
Run: 3, exploration: 0.7219385759785162, score: 41
Run: 29, exploration: 0.060687903789832374, score: 188
Run: 109, exploration: 0.01, score: 271
Run: 119, exploration: 0.01, score: 272
Run: 137, exploration: 0.01, score: 289
Run: 148, exploration: 0.01, score: 322
Run: 160, exploration: 0.01, score: 324
Run: 185, exploration: 0.01, score: 326
Run: 201, exploration: 0.01, score: 367
Run: 206, exploration: 0.01, score: 367
Run: 213, exploration: 0.01, score: 397
Run: 231, exploration: 0.01, score: 495
Run: 255, exploration: 0.01, score: 500
Run: 260, exploration: 0.01, score: 500
Run: 275, exploration: 0.01, score: 500
Run: 277, exploration: 0.01, score: 500
Run: 288, exploration: 0.01, score: 500
Run: 305, exploration: 0.01, score: 500
Run: 314, exploration: 0.01, score: 500
Run: 329, exploration: 0.01, score: 500
Run: 

In [None]:
# Close the notebook-level `env` (the one created for manual stepping); the
# environment made inside cartpole() is a local of that function.
env.close()