## DDDQN
Dueling double deep Q-network

In [15]:
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns
import matplotlib as mpl
import joblib
import os
import keras

import matplotlib.animation as animation
mpl.rc('animation', html='jshtml')
from tensorflow.keras import layers

In [2]:
import gym

In [None]:
# Create the CartPole-v1 environment and seed it with a fixed value.
env = gym.make("CartPole-v1")
env.seed(77)
# Hard-coded network input shape and number of actions.
# NOTE(review): presumably these mirror env.observation_space.shape and
# env.action_space.n for CartPole — confirm before reusing with another env.
input_shape = [4]
n_outputs = 2

In [7]:
# Epsilon-greedy policy.
# Epsilon defaults to 0, i.e. the greedy action is always taken.
def epsilon_greedy_policy(state, epsilon=0):
    """Choose an action for `state` with an epsilon-greedy rule.

    With probability `epsilon` a uniformly random action index in
    ``[0, n_outputs)`` is returned. Otherwise the action whose Q-value,
    as predicted by the global `model`, is largest.
    """
    explore = np.random.rand() < epsilon
    if explore:
        return np.random.randint(n_outputs)
    # Add a leading batch axis before calling the model, then take the
    # single row of Q-values that comes back.
    batched_state = state[np.newaxis]
    q_for_state = model.predict(batched_state)[0]
    return np.argmax(q_for_state)

In [8]:
# Experience sampling helper.
def sample_experiences(batch_size):
    """Draw `batch_size` random experiences (with replacement) from `replay_memory`.

    Each stored experience is indexed by position 0..4; the five positions
    are gathered into five NumPy arrays which are returned as
    ``(states, actions, rewards, next_states, dones)``.
    """
    picks = np.random.randint(len(replay_memory), size=batch_size)
    batch = [replay_memory[pick] for pick in picks]
    columns = []
    for field in range(5):
        # Collect the same field from every sampled experience.
        column = np.array([sample[field] for sample in batch])
        columns.append(column)
    states, actions, rewards, next_states, dones = columns
    return states, actions, rewards, next_states, dones

In [9]:
# Advance the environment by a single step.
def play_one_step(env, state, epsilon):
    """Act once in `env` and record the transition in `replay_memory`.

    The action comes from `epsilon_greedy_policy(state, epsilon)`. The
    tuple ``(state, action, reward, next_state, done)`` is appended to the
    global `replay_memory`, and ``(next_state, reward, done, info)`` is
    returned exactly as produced by ``env.step``.
    """
    chosen_action = epsilon_greedy_policy(state, epsilon)
    step_result = env.step(chosen_action)
    next_state, reward, done, info = step_result
    transition = (state, chosen_action, reward, next_state, done)
    replay_memory.append(transition)
    return next_state, reward, done, info

In [19]:
# Reset state: clear the Keras session and fix the TensorFlow random seed.
keras.backend.clear_session()
tf.random.set_seed(77)
# Short alias for the Keras backend module, used when building the model.
K=keras.backend

In [20]:
# Dueling DQN model: two shared hidden layers feed a state-value head and
# an advantage head, which are then combined into Q-values.
input_states = keras.layers.Input(shape=[4])
hidden1 = keras.layers.Dense(32, activation="elu")(input_states)
hidden2 = keras.layers.Dense(32, activation="elu")(hidden1)
# Single-unit head: one state value per sample.
state_values = keras.layers.Dense(1)(hidden2)
# One raw advantage per action.
raw_advantages = keras.layers.Dense(n_outputs)(hidden2)
# Subtract the per-sample maximum so the best action's advantage is 0.
advantages = raw_advantages - K.max(raw_advantages, axis=1, keepdims=True)
# Q(s, a) = V(s) + A(s, a); state_values broadcasts across the action axis.
Q_values = state_values + advantages
model = keras.models.Model(inputs=[input_states], outputs=[Q_values])
# Copy into the target model: clone the architecture, then copy the weights.
target = keras.models.clone_model(model)
target.set_weights(model.get_weights())

In [21]:
# Training-step configuration.
# Number of experiences sampled per gradient update.
batch_size = 32
# Discount factor applied to the bootstrapped next-state value.
discount_rate = 0.95
optimizer = keras.optimizers.Adam(learning_rate=7.5e-3)
loss_fn = keras.losses.Huber()

def training_step(batch_size):
    """Run one double-DQN gradient update on a sampled mini-batch.

    The online `model` selects the best next action, the `target` network
    supplies the value of that action, and `model` is then moved toward
    the resulting targets using `loss_fn` and `optimizer`.
    """
    states, actions, rewards, next_states, dones = sample_experiences(batch_size)

    # Action selection by the online network ...
    online_next_q = model.predict(next_states)
    greedy_next_actions = np.argmax(online_next_q, axis=1)
    greedy_mask = tf.one_hot(greedy_next_actions, n_outputs).numpy()
    # ... and evaluation of that action by the target network.
    target_next_q = target.predict(next_states)
    next_best_q = (target_next_q * greedy_mask).sum(axis=1)

    # Terminal transitions (done == True) contribute the reward only.
    not_done = 1 - dones
    td_targets = rewards + not_done * discount_rate * next_best_q
    td_targets = td_targets.reshape(-1, 1)

    # Keep only the Q-value of the action that was actually taken.
    action_mask = tf.one_hot(actions, n_outputs)
    with tf.GradientTape() as tape:
        predicted_q = model(states)
        chosen_q = tf.reduce_sum(predicted_q * action_mask, axis=1, keepdims=True)
        loss = tf.reduce_mean(loss_fn(td_targets, chosen_q))
    trainable = model.trainable_variables
    gradients = tape.gradient(loss, trainable)
    optimizer.apply_gradients(zip(gradients, trainable))

In [23]:
# Replay memory: a deque bounded to 2000 entries, so once it is full each
# append discards the oldest entry.
from collections import deque
replay_memory = deque(maxlen=2000)

In [None]:
# Start DDDQN training (this note originally said 400 episodes; the
# model_fit call below passes 600).
# The number of episodes ("epoch") and the steps per episode are given separately.
# Training, reward tracking and the reward plot are bundled into one function.
# NOTE(review): model_fit assigns its own local `rewards` and `best_weights`
# and uses `episode` as its loop variable, so these module-level values are
# not updated by it.
episode = 0
rewards = 0
best_weights = []
def _plot_rewards(rewards):
    """Plot the per-episode totals collected during training."""
    plt.figure(figsize=(12, 8))
    plt.plot(rewards)
    plt.xlabel("Episode", fontsize=14)
    plt.ylabel("Sum of rewards", fontsize=14)
    plt.show()


def model_fit(epoch = None, steps = None, graph_show = None):
    """Train the global `model` and restore its best-scoring weights.

    For each episode the environment is reset and played for at most
    `steps` steps with an epsilon-greedy policy whose epsilon decays
    linearly from 1 (episode 0) and is floored at 0.01. From episode 50
    onward one `training_step(batch_size)` is run per episode, and the
    `target` network is synchronised with `model` on every episode that
    is a multiple of 50.

    Args:
        epoch: Number of episodes to play. Required.
        steps: Maximum number of environment steps per episode. Required.
        graph_show: Pass the string "yes" to plot the per-episode totals
            after training; any other value skips the plot.

    Returns:
        Whatever ``model.set_weights`` returns (``None`` in Keras).

    Raises:
        TypeError: If `epoch` or `steps` is left as ``None``.
    """
    if epoch is None or steps is None:
        raise TypeError("model_fit() requires both `epoch` and `steps`")

    rewards = []
    best_score = 0
    # Start from the current weights so there is always something to
    # restore, even if no episode is played.
    best_weights = model.get_weights()
    for episode in range(epoch):
        obs = env.reset()
        # Epsilon depends only on the episode index, so compute it once.
        epsilon = max(1 - episode / 500, 0.01)
        steps_taken = 0
        for _ in range(steps):
            obs, _, done, _ = play_one_step(env, obs, epsilon)
            steps_taken += 1
            if done:
                break
        # Record the number of steps actually played, the same figure that
        # is printed below.
        rewards.append(steps_taken)
        if steps_taken >= best_score:
            best_weights = model.get_weights()
            best_score = steps_taken
        print("\rEpisode: {}, Steps: {}, eps: {:.3f}".format(episode, steps_taken, epsilon), end="")
        # Let the replay memory fill for 50 episodes before training.
        if episode >= 50:
            training_step(batch_size)
            if episode % 50 == 0:
                target.set_weights(model.get_weights())
    if graph_show == "yes":
        _plot_rewards(rewards)
    return model.set_weights(best_weights)
# Train for 600 episodes of at most 200 steps each and show the reward plot.
model_fit(600, 200, "yes")

스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.


In [44]:
# Empty frame buffer for the animation, and a freshly reset environment.
frames=[]
obs = env.reset()
def update_scene(num, frames, patch):
    """Show frame number `num` on `patch` and return it in a 1-tuple."""
    current_frame = frames[num]
    patch.set_data(current_frame)
    return (patch,)

def plot_animation(frames, repeat = False, interval=40):
    """Build a matplotlib animation that steps through `frames`.

    The first frame is drawn with ``plt.imshow`` on a new figure with the
    axes hidden; `update_scene` then swaps in each subsequent frame. The
    figure is closed before returning, and the ``FuncAnimation`` object
    is returned to the caller.

    `repeat` and `interval` are forwarded to ``FuncAnimation`` unchanged.
    """
    figure = plt.figure()
    image_artist = plt.imshow(frames[0])
    plt.axis('off')
    animation_options = dict(
        fargs=(frames, image_artist),
        frames=len(frames),
        repeat=repeat,
        interval=interval,
    )
    result = animation.FuncAnimation(figure, update_scene, **animation_options)
    plt.close()
    return result

ValueError: ignored

In [39]:
# Act for up to 300 steps, collecting rendered frames.
state = env.reset()

frames = []

for step in range(300):
    # epsilon is left at its default of 0, so the greedy action is used.
    action = epsilon_greedy_policy(state)
    state, reward, done, info = env.step(action)
    # Stop as soon as the episode ends; the final step is not rendered.
    if done:
        break
    # Capture the frame after the step as an RGB array.
    img = env.render(mode="rgb_array")
    frames.append(img)
# NOTE(review): if the episode ends on the very first step, `frames` stays
# empty and plot_animation(frames) will fail on frames[0].

NameError: ignored

<Figure size 1200x800 with 0 Axes>

#### 애니메이션 반환

In [None]:
# Build the DQN animation from the recorded frames.
plot_animation(frames)