# Deep Q learning

In [1]:
# maybe useful:
# https://towardsdatascience.com/reinforcement-learning-with-tensorflow-agents-tutorial-4ac7fa858728

In [2]:
import random
from time import sleep 
from engine import TetrisEngine

In [3]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv2D, BatchNormalization, Activation, MaxPooling2D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.constraints import max_norm
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import load_model

In [4]:
## use pip install --upgrade --force-reinstall  git+https://github.com/Bosmansc/tetris_openai.git
## not pip install  pip install keras-rl2, this is not compatible with the custom tetris environment

from rl.agents import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.policy import GreedyQPolicy
from rl.memory import SequentialMemory

In [5]:
env = TetrisEngine()

In [6]:
def build_model_conv(actions):
    
    ## edit state based on code Seyedomid
    #resize(state)
    
    # Network defined by the Deepmind paper
    model = tf.keras.models.Sequential()
    
    model.add(Conv2D(32, (3, 3), padding='same', kernel_initializer='he_uniform', 
                     kernel_constraint=max_norm(4), input_shape=(1, 16, 6)))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    
    model.add(Conv2D(64, (3, 3), padding='same', kernel_initializer='he_uniform'))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    
    model.add(Conv2D(64, (3, 3), padding='same', kernel_initializer='he_uniform'))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    
    #model.add(MaxPooling2D(pool_size=(2,2)))
    
    
    # end of convolutional layers, start of 'hidden' dense layers 
    model.add(Flatten())
    model.add(Dense(128, kernel_initializer='he_uniform', kernel_constraint=max_norm(3)))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    
    # Final dense layer
    model.add(Dense(actions, activation='linear'))
    
    return model

In [7]:
actions = 6
model = build_model_conv(actions)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 1, 16, 32)         1760      
_________________________________________________________________
batch_normalization (BatchNo (None, 1, 16, 32)         128       
_________________________________________________________________
activation (Activation)      (None, 1, 16, 32)         0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 1, 16, 64)         18496     
_________________________________________________________________
batch_normalization_1 (Batch (None, 1, 16, 64)         256       
_________________________________________________________________
activation_1 (Activation)    (None, 1, 16, 64)         0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 1, 16, 64)         3

In [8]:
def build_agent(model, actions):
    # policy = GreedyQPolicy() ## hyperparm, GreedyQPolicy is used in paper: https://www.elen.ucl.ac.be/Proceedings/esann/esannpdf/es2008-118.pdf
    policy = BoltzmannQPolicy()
    memory = SequentialMemory(limit=50000, window_length=1)
    dqn = DQNAgent(model=model, memory=memory, policy=policy, 
                  nb_actions=actions, nb_steps_warmup=100, target_model_update=1e-2)
    return dqn

In [8]:
dqn = build_agent(model, actions)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])
dqn.fit(env, nb_steps=500, visualize=True)

Training for 500 steps ...
Interval 1 (0 steps performed)




  500/10000 [>.............................] - ETA: 3:59 - reward: 0.7712done, took 12.934 seconds


<tensorflow.python.keras.callbacks.History at 0x7f465457a1f0>

In [9]:
env.results()

KeyError: 'new_episode'

In [10]:
scores = dqn.test(env, nb_episodes=10, visualize=True
               #   ,nb_max_episode_steps=50
                 )
print(np.mean(scores.history['episode_reward']))

NameError: name 'dqn' is not defined

In [12]:
env.results()

Unnamed: 0,new_episode_cum,heigt_diff_sum,new_block_sum,nr_lines_sum,score_sum,score_avg
0,0.0,-14.0,12.0,0.0,32.4,0.348387
1,1.0,-14.0,8.0,0.0,3.8,0.11875
2,2.0,-14.0,7.0,0.0,-3.0,-0.2
3,3.0,-14.0,10.0,0.0,23.0,0.605263
4,4.0,-14.0,9.0,0.0,14.4,0.48
5,5.0,-14.0,9.0,0.0,14.0,0.4375
6,6.0,-14.0,7.0,0.0,-3.0,-0.2
7,7.0,-14.0,7.0,0.0,-3.0,-0.2
8,8.0,-14.0,7.0,0.0,-4.8,-0.2
9,9.0,-14.0,10.0,38.0,103.4,3.041176


# save and load agent

In [9]:
dqn.save_weights('models/dqn_model.model', overwrite=False)

NameError: name 'dqn' is not defined

In [10]:
dqn = build_agent(model, actions)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])
dqn.load_weights('models/dqn_model.model')

In [11]:
scores = dqn.test(env, nb_episodes=10, visualize=True
               #   ,nb_max_episode_steps=50
                 )
print(np.mean(scores.history['episode_reward']))

Testing for 10 episodes ...




KeyboardInterrupt: 