# DQN Training
Training a DQN agent with keras-rl2 on a custom Gym environment.

# Imports

In [1]:
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.layers as layers

from gym import Env
from gym.spaces import Discrete, Box

from rl.agents import DQNAgent
from rl.policy import MaxBoltzmannQPolicy
from rl.memory import SequentialMemory

## Functions

In [2]:
def moving_average(x, w):
    """Return the simple moving average of ``x`` over a window of ``w`` samples.

    Only fully covered windows are produced, so the result has
    ``len(x) - w + 1`` entries.

    Parameters
    ----------
    x : array_like
        One-dimensional sequence of numbers.
    w : int
        Window size; must be at least 1.

    Returns
    -------
    numpy.ndarray
        The averaged values, or an empty array when ``x`` has fewer than
        ``w`` samples (no complete window exists).

    Raises
    ------
    ValueError
        If ``w`` is smaller than 1.
    """
    if w < 1:
        raise ValueError("window size w must be at least 1")
    values = np.asarray(x)
    # In "valid" mode np.convolve swaps its operands when the kernel is longer
    # than the data, which would yield w - len(x) + 1 meaningless values (and
    # it raises on empty input). Report "no complete window" explicitly.
    if values.size < w:
        return np.empty(0, dtype=float)
    return np.convolve(values, np.ones(w), "valid") / w

# Environment

## Loading the Tetris Environment

In [None]:
#

## Create env

In [None]:
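# TODO: `env` must be created in this cell; every later cell uses it, but the
# constructor call below is commented out, so a fresh kernel fails with NameError.
# env = PacManEnv()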

## Test

In [None]:
# Smoke test: play a few episodes with uniformly random actions and print the
# total reward collected in each one.
episodes = 10
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0
    while not done:
        # Sample from the environment's own action space instead of a
        # hard-coded range, so the test can never issue an action index the
        # environment does not define.
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score+=reward
    print("Episode: {}, Score: {}".format(episode, score))

Episode: 1, Score: 6
Episode: 2, Score: 12
Episode: 3, Score: 10
Episode: 4, Score: 6
Episode: 5, Score: 15
Episode: 6, Score: 3
Episode: 7, Score: 3
Episode: 8, Score: 29
Episode: 9, Score: 2
Episode: 10, Score: 11


# Neural Network and Agent

## Build model

In [None]:
# Show which GPU devices TensorFlow can see (an empty list means none).
tf.config.list_physical_devices("GPU")

[]

In [13]:
def build_model(input_shape=(20, 10, 3), nb_actions=5):
    """Build the convolutional Q-network used by the DQN agent.

    The network is two Conv2D/ReLU/MaxPool stages followed by two ReLU dense
    layers and a linear output layer with one unit per action.

    Parameters
    ----------
    input_shape : tuple of int, optional
        Shape of a single observation as (rows, cols, channels). The default
        (20, 10, 3) is a 20x10 board with three matrices: locked, falling and
        next figures.
    nb_actions : int, optional
        Number of output units (one Q-value per action). Defaults to 5.

    Returns
    -------
    tf.keras.Sequential
        The uncompiled model.
    """
    observation_shape = tuple(input_shape)
    return tf.keras.Sequential([
        # Observations arrive with a leading axis of size 1 ("1 state", matching
        # window_length=1 of the replay memory); drop it before the conv stack.
        layers.Input(shape=(1,) + observation_shape),
        layers.Reshape(target_shape=observation_shape),

        layers.Conv2D(filters=32, kernel_size=(3, 3), padding="same"),
        layers.Activation("relu"),
        layers.MaxPool2D(),

        layers.Conv2D(filters=64, kernel_size=(3, 3), padding="same"),
        layers.Activation("relu"),
        layers.MaxPool2D(),

        layers.Flatten(),

        layers.Dense(32),
        layers.Activation("relu"),

        layers.Dense(16),
        layers.Activation("relu"),

        # No activation on the output layer: it emits one raw value per action.
        layers.Dense(nb_actions)])

In [14]:
# Instantiate the Q-network and print its layer-by-layer summary.
model = build_model()
model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 reshape_5 (Reshape)         (None, 20, 10, 3)         0         
                                                                 
 conv2d_10 (Conv2D)          (None, 20, 10, 32)        896       
                                                                 
 activation_20 (Activation)  (None, 20, 10, 32)        0         
                                                                 
 max_pooling2d_10 (MaxPoolin  (None, 10, 5, 32)        0         
 g2D)                                                            
                                                                 
 conv2d_11 (Conv2D)          (None, 10, 5, 128)        36992     
                                                                 
 activation_21 (Activation)  (None, 10, 5, 128)        0         
                                                      

## Agent

In [None]:
# Inspect the observation shape and the number of discrete actions of `env`.
env.observation_space.shape, env.action_space.n

((16, 16, 4), 4)

In [None]:
def build_agent(model, nb_actions=None):
    """Create the (uncompiled) double, dueling DQN agent around ``model``.

    Parameters
    ----------
    model : tf.keras.Model
        Q-network whose last dimension holds one output per action.
    nb_actions : int, optional
        Number of actions. When omitted it is read from the model's output
        shape, so the agent always agrees with the network it wraps instead
        of relying on a separately hard-coded count.

    Returns
    -------
    DQNAgent
        Agent using a MaxBoltzmannQPolicy (eps=0.8) and a SequentialMemory of
        2048 transitions with window_length=1.
    """
    if nb_actions is None:
        nb_actions = int(model.output_shape[-1])

    policy = MaxBoltzmannQPolicy(eps=0.8)
    memory = SequentialMemory(limit=2048, window_length=1)
    dqn = DQNAgent(model=model,
        memory=memory,
        policy=policy,
        nb_actions=nb_actions,
        gamma=0.99,
        nb_steps_warmup=256,
        batch_size=64,
        target_model_update=0.1,
        enable_double_dqn=True,
        enable_dueling_network=True)
    return dqn

In [None]:
# Wrap the network in a DQN agent and compile it with Adam (learning rate
# 0.001, gradient norm clipped at 1.0), tracking mean squared error as a metric.
dqn = build_agent(model)
dqn.compile(tf.keras.optimizers.Adam(learning_rate=0.001, clipnorm=1.0), metrics=["mean_squared_error"])

In [None]:
#dqn.load_weights("")

#from keras import backend as K
#print(K.eval(dqn.model.optimizer.learning_rate))
#K.set_value(dqn.model.optimizer.learning_rate, 0.001)
#print(K.eval(dqn.model.optimizer.learning_rate))

0.01


## Training

In [None]:
# Train for 40000 environment steps without rendering, logging every 1000
# steps. `history` is read later for its "episode_reward" series.
history = dqn.fit(env, nb_steps=40000, visualize=False, verbose=1, log_interval=1000)

Training for 40000 steps ...
Interval 1 (0 steps performed)
43 episodes - episode_reward: 9.535 [2.000, 33.000] - loss: 2.405 - mean_squared_error: 98.972 - mean_q: 11.039

Interval 2 (1000 steps performed)
42 episodes - episode_reward: 10.048 [2.000, 43.000] - loss: 2.369 - mean_squared_error: 103.279 - mean_q: 11.151

Interval 3 (2000 steps performed)
43 episodes - episode_reward: 9.163 [1.000, 30.000] - loss: 2.282 - mean_squared_error: 114.543 - mean_q: 11.416

Interval 4 (3000 steps performed)
41 episodes - episode_reward: 10.585 [2.000, 30.000] - loss: 2.417 - mean_squared_error: 125.440 - mean_q: 11.907

Interval 5 (4000 steps performed)
44 episodes - episode_reward: 10.705 [2.000, 37.000] - loss: 2.677 - mean_squared_error: 124.935 - mean_q: 11.996

Interval 6 (5000 steps performed)
39 episodes - episode_reward: 10.641 [2.000, 37.000] - loss: 2.800 - mean_squared_error: 140.816 - mean_q: 13.180

Interval 7 (6000 steps performed)
50 episodes - episode_reward: 9.100 [1.000, 26.00

## Plot

In [None]:
# Append this run's per-episode rewards to the reward history stored on disk,
# then plot the smoothed learning curve.
# NOTE(review): machine-specific absolute path; consider a configurable,
# relative location. Re-running this cell appends the same rewards again.
HISTORY_PATH = "E:/Projects/Python/piis_rl_pacman/player_rl/saved/16conv/history.npy"

new_rewards = np.asarray(history.history["episode_reward"], dtype=float)
try:
    previous_rewards = np.load(HISTORY_PATH)
except FileNotFoundError:
    # First run: nothing has been saved yet, so start from an empty history.
    previous_rewards = np.empty(0, dtype=float)

full_hist = np.concatenate((previous_rewards, new_rewards))
np.save(HISTORY_PATH, full_hist)

plt.rcParams["figure.figsize"] = (16, 8)
# Never smooth over more episodes than are available.
window = min(100, full_hist.size)
if window:
    plt.plot(moving_average(full_hist, window))
    plt.xlabel("Episode")
    plt.ylabel("Episode reward (moving average over {} episodes)".format(window))
plt.show()