In [1]:
##############################################################
## 1. Libraries

import os
import cv2
import gym
from gym import spaces
import numpy as np
import random
from collections import deque
import time
import re
import sys

from keras.models import Model
from keras.layers import Dense, Embedding, Reshape, Flatten, Input, Conv2D, LeakyReLU
from keras.optimizers import Adam, RMSprop
from keras.utils import to_categorical

from matplotlib import pyplot as plt
from PIL import Image
import imageio

from keras import backend as K
import tensorflow as tf

##############################################################

Using TensorFlow backend.


In [2]:
##############################################################
## 2. Global Variables

# Storage switch: when True, ROOT is re-pointed at a Google Drive folder and
# Drive is mounted. The `google.colab` import only exists on Colab, so set
# DRIVE = False to run this notebook anywhere else.
DRIVE = True
ROOT = './'

if DRIVE:
    ROOT = 'drive/My Drive/PROYECTOS/RL/BreakOut_DQN/'
    from google.colab import drive
    drive.mount('/content/drive')

# --- Training hyper-parameters (passed to DQNAgent in the workflow cell) ---
EPISODES = 100_000        # Number of episodes iterated by the training loop
LEARNING_RATE = 0.00025  # Optimizer learning rate used when the model is compiled
ALPHA = 0.4             # Stored on the agent as `alpha`; not read anywhere else in the visible code
GAMMA = 0.99           # Discount rate, the higher gamma the higher the focus on long term rewards
EPSILON = 1.0           # Initial exploration rate (the agent decays it after each replay step)
KIND = 'small'          # Small/big DQN; forwarded to DQNAgent.buildModel, which does not branch on it
MEMORY_SIZE = 53_000    # Capacity of the replay memory, in transitions
WINDOW_FRAMES = 4       # maxlen of the agent's `window_frames` deque (same value as STATE_LENGTH)

# --- Network input geometry: (FRAME_WIDTH, FRAME_HEIGHT, STATE_LENGTH) ---
FRAME_WIDTH = 84
FRAME_HEIGHT = 84
STATE_LENGTH = 4        # Number of consecutive preprocessed frames stacked into one state

# --- Episode export settings ---
GIF_DURATION = 0.07     # Passed as `duration` to create_gif (presumably seconds per frame -- confirm against imageio)
FRAMES_WINDOW = 14      # Passed as `frames_window` to createVideo (that call is currently commented out)

# Input and output folders for exported episodes; both currently point at the same directory.
path_input =  f'{ROOT}Training/'
path_output =  f'{ROOT}Training/'

##############################################################

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
##############################################################
## 3. Functions

def sizeoFmt(num, suffix='B'):
    """Format a quantity with binary (1024-based) unit prefixes.

    Args:
        num: The quantity to format (e.g. a size in bytes). May be negative.
        suffix: Unit suffix appended after the prefix. Defaults to 'B'.

    Returns:
        A string such as '1.5 KiB'. Values too large for the 'Zi' prefix are
        expressed in 'Yi'.
    """
    prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    value = num
    position = 0
    while position < len(prefixes):
        # Stop at the first prefix for which the magnitude drops below 1024.
        if abs(value) < 1024.0:
            return "%3.1f %s%s" % (value, prefixes[position], suffix)
        value = value / 1024.0
        position += 1
    return "%.1f %s%s" % (value, 'Yi', suffix)

def rgbToGray(img):
    """Collapse the colour channels of an image into a single gray channel.

    Args:
        img: Array-like whose last axis holds the three colour channels.

    Returns:
        Array with the last axis removed, each pixel being the weighted sum
        0.2126 * c0 + 0.7152 * c1 + 0.0722 * c2 of its channels.
    """
    channel_weights = [0.2126, 0.7152, 0.0722]
    gray = np.dot(img, channel_weights)
    return gray

def resizeImage(image, resized_shape = (84, 84), method = 'crop', crop_offset = 8):
    """Resize a frame to `resized_shape` by either cropping or plain scaling.

    Args:
        image: Source frame; its first two axes are (height, width).
        resized_shape: Target (height, width). Defaults to (84, 84).
        method: 'crop' scales the frame to the target width keeping the aspect
            ratio, then cuts a window of the target height whose bottom edge
            sits `crop_offset` rows above the bottom of the scaled frame.
            'scale' stretches the frame directly to the target shape.
        crop_offset: Rows dropped from the bottom of the scaled frame in
            'crop' mode. Ignored by 'scale'.

    Returns:
        The resized frame as a uint8 array.

    Raises:
        ValueError: If `method` is neither 'crop' nor 'scale'.
    """
    src_height, src_width = image.shape[0:2]
    out_height, out_width = resized_shape

    if method == 'scale':
        stretched = cv2.resize(image, (out_width, out_height), interpolation = cv2.INTER_LINEAR)
        return np.asarray(stretched, dtype = np.uint8)

    if method == 'crop':
        # Height that keeps the aspect ratio once the width equals out_width.
        scaled_height = int(np.round(src_height * out_width / src_width, 0))
        scaled = cv2.resize(image,
                            (out_width, scaled_height),
                            interpolation = cv2.INTER_LINEAR)
        top_row = scaled_height - crop_offset - out_height
        window = scaled[top_row:top_row + out_height, :]
        return np.asarray(window, dtype = np.uint8)

    raise ValueError('Method not implemented.')
        
        
def preprocessImage(img, resized_shape = (84, 84), crop_offset = 8, method = 'crop'):
    """Turn a raw colour frame into a small gray-scale frame scaled to [0, 1].

    The frame is resized with `resizeImage`, converted to gray with
    `rgbToGray`, and divided by 255.

    Args:
        img: Raw colour frame (height, width, channels).
        resized_shape: Target (height, width) forwarded to `resizeImage`.
        crop_offset: Bottom offset forwarded to `resizeImage` ('crop' mode).
        method: Resize strategy forwarded to `resizeImage` ('crop' or 'scale').

    Returns:
        Float array of shape `resized_shape` with values in [0, 1].

    Raises:
        ValueError: If `method` is not supported by `resizeImage`.
    """
    # Forward the caller's arguments; they used to be silently replaced by
    # hard-coded defaults.
    resized = resizeImage(img,
                          resized_shape = resized_shape,
                          method = method,
                          crop_offset = crop_offset)
    return rgbToGray(resized) / 255

def clipReward(reward):
    """Reduce a reward to its sign: -1 for negative, 0 for zero, +1 for positive.

    Args:
        reward: Raw reward value.

    Returns:
        The result of `np.sign(reward)`.
    """
    clipped = np.sign(reward)
    return clipped

def saveEpisode(path_output, episode, list_images):
    """Write every frame of an episode to `path_output` as numbered PNG files.

    Files are named `epoch_<index>.png`, where the index is the position of
    the frame in `list_images`.

    Args:
        path_output: Destination folder. It is used as a plain string prefix
            for the file names, so it should end with a path separator.
        episode: Episode number. Currently unused; kept for the call signature.
        list_images: Iterable of image arrays accepted by `PIL.Image.fromarray`.
    """
    # makedirs creates missing parent folders and tolerates an existing
    # target, so no blanket `except` is needed to hide "already exists".
    if path_output:
        os.makedirs(path_output, exist_ok = True)

    for i, img in enumerate(list_images):
        Image.fromarray(img).save(f'{path_output}epoch_{i}.png')
        
def sorted_aphanumeric(data):
    """Sort strings in natural order, so that 'epoch_2' comes before 'epoch_10'.

    Each string is split into runs of digits and non-digits; digit runs are
    compared as integers and the remaining pieces case-insensitively.

    Args:
        data: Iterable of strings.

    Returns:
        A new list with the strings in natural order.
    """
    def natural_key(name):
        pieces = re.split('([0-9]+)', name)
        return [int(piece) if piece.isdigit() else piece.lower() for piece in pieces]

    return sorted(data, key=natural_key)

def create_gif(path_input, path_output, episode, gif_name, duration):
    """Assemble the PNG frames found in `path_input` into an animated GIF.

    Frames are taken in natural file-name order. The export is best-effort:
    any failure is reported on stdout and the function returns normally, so a
    broken export never interrupts training.

    Args:
        path_input: Folder containing the PNG frames; used as a plain string
            prefix, so it should end with a path separator.
        path_output: Folder (string prefix) the GIF is written to.
        episode: Episode number. Currently unused; kept for the call signature.
        gif_name: File name of the GIF to create.
        duration: Forwarded to `imageio.mimsave` as `duration`.
    """
    try:
        png_names = [name for name in sorted_aphanumeric(os.listdir(path_input)) if name.endswith(".png")]
        frames = [imageio.imread(path_input + name) for name in png_names]
        imageio.mimsave(path_output + gif_name, frames, duration = duration)
    except Exception as e:
        # The handler used to name an undefined class, which turned every
        # failure into a NameError. Keep the best-effort behaviour but make
        # the failure visible.
        print(f'Could not create GIF {gif_name}: {e!r}')
    
def createVideo(path_input, path_output, episode , video_name, frames_window = 14):
    """Assemble the PNG frames found in `path_input` into a video file.

    Frames are taken in natural file-name order; the video dimensions come
    from the first frame.

    Args:
        path_input: Folder containing the PNG frames.
        path_output: Folder (string prefix) the video is written to.
        episode: Episode number. Currently unused; kept for the call signature.
        video_name: File name of the video to create.
        frames_window: Passed to `cv2.VideoWriter` as its frame-rate argument.

    Raises:
        IndexError: If `path_input` contains no `.png` file.
    """
    images = [img for img in sorted_aphanumeric(os.listdir(path_input)) if img.endswith(".png")]
    if not images:
        raise IndexError(f'No .png frames found in {path_input}')

    first_frame = cv2.imread(os.path.join(path_input, images[0]))
    height, width = first_frame.shape[:2]

    # fourcc is passed as 0, as in the original code.
    # NOTE(review): confirm which codec this selects on the target platform.
    video = cv2.VideoWriter(path_output + video_name, 0, frames_window, (width, height))
    try:
        for image in images:
            video.write(cv2.imread(os.path.join(path_input, image)))
    finally:
        # Release the writer even when a frame fails, so the file is closed.
        video.release()
        cv2.destroyAllWindows()

def areSameImages(imgs1, imgs2):
    """Tell whether two raw frames are identical after preprocessing.

    Args:
        imgs1: First raw frame (array with a `.shape`).
        imgs2: Second raw frame (array with a `.shape`).

    Returns:
        True when both frames have the same shape and their preprocessed
        versions are equal element by element, False otherwise.
    """
    if imgs1.shape != imgs2.shape:
        return False

    # Uses preprocessImage: this notebook defines no `preprocessImage2`.
    # Element-wise equality replaces `max(a - b) == 0`, which also held when
    # every difference was negative or zero.
    return bool(np.array_equal(preprocessImage(imgs1), preprocessImage(imgs2)))
    
##############################################################

In [0]:
##################################################
# 4. Classes

class DQNAgent:
    """Deep Q-Network agent with experience replay and a separate target network.

    States are stacks of preprocessed frames laid out frames-first, i.e.
    (STATE_LENGTH, height, width); the agent moves the frame axis last before
    feeding the convolutional model.
    """

    def __init__(self, action_size, lr, alpha, gamma, epsilon, kind, mem_size):
        """Create the agent and its online/target networks.

        Args:
            action_size: Number of discrete actions (size of the Q-value output).
            lr: Learning rate of the optimizer.
            alpha: Stored as `self.alpha`; not used by the methods of this class.
            gamma: Discount factor applied to the bootstrapped next-state value.
            epsilon: Initial exploration rate for the epsilon-greedy policy.
            kind: Forwarded to `buildModel`.
            mem_size: Maximum number of transitions kept in the replay memory.
        """
        self.action_size = action_size
        # A bounded deque drops its oldest entry automatically once full.
        self.replay_memory = deque(maxlen = mem_size)
        self.window_frames = deque(maxlen = WINDOW_FRAMES)
        self.alpha = alpha
        self.learning_rate = lr
        self.gamma = gamma
        self.epsilon = epsilon
        self.output_dim = 24
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.99985

        self.model = self.buildModel(kind)
        # The target network must be a distinct model: aliasing the online
        # model would make `update_target_model` a no-op and the targets would
        # move with every gradient step.
        self.target_model = self.buildModel(kind)

        self.update_target_model()

    def huberLoss(self, y_true, y_pred, clip_delta=1.0):
        """Huber loss: quadratic for |error| <= clip_delta, linear beyond it.

        Not used by default; `buildModel` compiles the network with 'mse'.
        """
        error = y_true - y_pred
        cond  = K.abs(error) <= clip_delta

        squared_loss = 0.5 * K.square(error)
        linear_loss = 0.5 * clip_delta ** 2 + clip_delta * (K.abs(error) - clip_delta)

        return K.mean(tf.where(cond, squared_loss, linear_loss))

    def buildModel(self, kind = 'small'):
        """Build and compile the convolutional Q-network.

        Args:
            kind: Accepted for API compatibility; a single architecture is
                built regardless of its value.

        Returns:
            A compiled Keras model mapping a
            (FRAME_WIDTH, FRAME_HEIGHT, STATE_LENGTH) input to `action_size`
            linear Q-values.
        """
        input_frame = Input(shape = (FRAME_WIDTH, FRAME_HEIGHT, STATE_LENGTH))

        # Explicit kernel_size/strides keywords give square 8x8, 4x4 and 3x3
        # kernels with strides 4, 2 and 1. The previous positional form
        # combined with `subsample` is Keras 1 syntax.
        conv1 = Conv2D(32, (8, 8), strides = (4, 4), activation = 'relu')(input_frame)
        conv2 = Conv2D(64, (4, 4), strides = (2, 2), activation = 'relu')(conv1)
        conv3 = Conv2D(64, (3, 3), strides = (1, 1), activation = 'relu')(conv2)

        x = Flatten()(conv3)
        x = Dense(512)(x)
        x = LeakyReLU()(x)

        q_value_prediction = Dense(self.action_size, activation = 'linear')(x)

        model = Model(inputs = input_frame, outputs = q_value_prediction)
        model.compile(loss = 'mse',
                      # loss = self.huberLoss,
                      optimizer = RMSprop(lr=self.learning_rate))

        return model

    def update_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_model.set_weights(self.model.get_weights())

    @staticmethod
    def _framesLast(frames):
        """Move the frame axis (third from the end) to the last position.

        Maps (frames, h, w) to (h, w, frames) and (batch, frames, h, w) to
        (batch, h, w, frames). A plain `reshape` would keep the memory order
        and mix pixels of different frames instead.
        """
        return np.moveaxis(np.asarray(frames), -3, -1)

    def act(self, state, ret_predictions = False):
        """Choose an action with an epsilon-greedy policy.

        Args:
            state: Frame stack of shape (STATE_LENGTH, height, width).
            ret_predictions: When true, skip exploration and return the raw
                Q-value predictions of the online network instead of an action.

        Returns:
            The chosen action index, or the predicted Q-values (shape
            (1, action_size)) when `ret_predictions` is true.
        """
        explore = np.random.rand() <= self.epsilon
        if explore and not ret_predictions:
            return random.randrange(self.action_size)

        predictions = self.model.predict(np.expand_dims(self._framesLast(state), axis = 0))
        if ret_predictions:
            return predictions
        return np.argmax(predictions)

    def remember(self, state, action, reward, next_state, done):
        """Store one transition; the oldest one is evicted when memory is full."""
        self.replay_memory.append((state, action, reward, next_state, done))

    def replayMemory2(self, batch_size):
        """Run one training step on a random minibatch of stored transitions.

        The target for the action taken is `reward` on terminal transitions,
        otherwise `reward + gamma * max_a Q_target(next_state, a)`. The
        Q-values of the other actions are left at the online network's own
        predictions. Epsilon is decayed once per call until it reaches
        `epsilon_min`.

        Args:
            batch_size: Maximum number of transitions to sample; fewer are
                used when the memory holds fewer.

        Returns:
            The training loss reported by Keras for this step.

        Raises:
            ValueError: If the replay memory is empty.
        """
        if not self.replay_memory:
            raise ValueError('Replay memory is empty; nothing to train on.')

        minibatch = random.sample(self.replay_memory, min(len(self.replay_memory), batch_size))
        # Size every array by the number of samples actually drawn, which can
        # be smaller than batch_size.
        n_samples = len(minibatch)

        states = [transition[0] for transition in minibatch]
        actions = [transition[1] for transition in minibatch]
        rewards = [transition[2] for transition in minibatch]
        next_states = [transition[3] for transition in minibatch]
        dones = [transition[4] for transition in minibatch]

        x_states = self._framesLast(np.array(states))
        x_next_states = self._framesLast(np.array(next_states))

        # Start from the online predictions so that only the taken action's
        # entry contributes to the loss.
        q_targets = self.model.predict(x_states)
        q_target_next = self.target_model.predict(x_next_states)

        for i in range(n_samples):
            if dones[i]:
                q_targets[i][actions[i]] = rewards[i]
            else:
                q_targets[i][actions[i]] = rewards[i] + (self.gamma * np.max(q_target_next[i]))

        hist = self.model.fit(x_states, q_targets, batch_size = n_samples, verbose=0)
        loss = hist.history['loss'][0]

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

        return loss

###########################

class EpisodicLifeEnv(gym.Wrapper):
    """Gym wrapper that reports the loss of a life as the end of an episode.

    The wrapped game is only truly reset once it has reported its own `done`;
    after a mere life loss, `reset` advances the game with a no-op instead.
    """

    def __init__(self, env=None):
        """Make end-of-life == end-of-episode, but only reset on true game over.
        Done by DeepMind for the DQN and co. since it helps value estimation.
        """
        super().__init__(env)
        self.lives = 0                # lives seen after the latest step/reset
        self.was_real_done = True     # `done` flag last reported by the wrapped env
        self.was_real_reset = False   # whether the latest reset() reset the wrapped env

    def step(self, action):
        """Step the wrapped env and flag `done` when a life has just been lost."""
        observation, reward, done, info = self.env.step(action)
        self.was_real_done = done
        # Compare the current life count with the stored one, then store it so
        # that bonus lives are tracked as well.
        remaining = self.env.unwrapped.ale.lives()
        if 0 < remaining < self.lives:
            # Some games (e.g. Qbert) stay at lives == 0 for a few frames, so
            # only a drop to a still-positive count is treated as terminal;
            # the real game over is left to the env's own `done`.
            done = True
        self.lives = remaining
        return observation, reward, done, info

    def reset(self):
        """Reset only when lives are exhausted.
        This way all states are still reachable even though lives are episodic,
        and the learner need not know about any of this behind-the-scenes.
        """
        if not self.was_real_done:
            # Only a life was lost: take a no-op step to move past that state.
            observation, _, _, _ = self.env.step(0)
            self.was_real_reset = False
        else:
            observation = self.env.reset()
            self.was_real_reset = True
        self.lives = self.env.unwrapped.ale.lives()
        return observation
    
    
##################################################

In [0]:
##################################################
## 5. Workflow

# Done = All 5 lives are finished (with EpisodicLifeEnv, `done` is also reported when a single life is lost)
# Reward = 1 if a brick is broken, 0 otherwise
# Actions: {0: No-op, 1:Fire(start game), 2: Right, 3: Left}

# Training loop: plays EPISODES episodes, stores every transition in the
# agent's replay memory and trains on a minibatch after each step.
if __name__ == '__main__':
    
    # env = gym.make('Breakout-v0')   
    # env_wrp = EpisodicLifeEnv(env)
    env = EpisodicLifeEnv(gym.make('BreakoutDeterministic-v4'))
    action_size = 4
    
    agent = DQNAgent(action_size, LEARNING_RATE, ALPHA, GAMMA, EPSILON, KIND, MEMORY_SIZE)
    
    batch_size = 32
    # Placeholder reported in the log until the first replay step has run.
    loss = 1
    
    # Global step counter across all episodes; drives the target-network update.
    steps = 0
    # Upper bound on the number of steps within one episode.
    max_epochs = 5_000
    
    # Rolling windows used only for the per-episode log line.
    last_20_ep_reward = deque(maxlen = 20)
    last_20_ep_loss = deque(maxlen = 20)
    
    for episode in range(EPISODES):

        done = False
        # Sliding window holding the STATE_LENGTH most recent preprocessed frames.
        window_states = deque(maxlen = STATE_LENGTH)
        episode_suffix = f'Episode_{episode}/'
        tot_reward = 0
        frames = []
        # `first` is set to True on every iteration, so the block below always runs.
        first = True
        
        if first:
            state = env.reset()
            # Action 1 is Fire, which starts the game.
            state, reward, done, _ = env.step(1)
            # Fill the whole window with copies of the first frame.
            for i in range(STATE_LENGTH):
                window_states.append(preprocessImage(state))
            first = False
            # Not read anywhere else in this cell.
            empty_frame = state.copy()
        
        for epoch in range(max_epochs):
            steps += 1
            action = agent.act(np.asarray(window_states))
            next_state, reward, done, _ = env.step(action)
            reward = clipReward(reward)

            # Snapshot the window before and after pushing the new frame: these
            # are the state and next-state of the stored transition.
            actual_window_states = np.asarray(window_states)
            window_states.append(preprocessImage(next_state))
            next_window_states = np.asarray(window_states)
            
            state = next_state

            tot_reward += reward

            # The terminal penalty goes into the stored transition only; it is
            # applied after tot_reward was updated, so the logged score is unaffected.
            if done:
                reward = -1
            
            agent.remember(actual_window_states, action, reward, next_window_states, done)

            # Keep the raw frames of every 500th episode for the GIF export.
            if(episode % 500 == 0):
                frames.append(state)
            
            # Sync the target network every 10,000 environment steps.
            if steps % 10_000 == 0:
                print('Update target model')
                agent.update_target_model()
                
            # Train once the memory holds more than one batch of transitions.
            if len(agent.replay_memory) > batch_size:
                loss = agent.replayMemory2(batch_size)

            # Export only happens when the episode ends with `done`; an episode
            # cut off by max_epochs is not exported.
            if done and (episode % 500 == 0):
                print(f'Exporting episode: {episode} ...')
                saveEpisode(path_input + episode_suffix, episode, frames)
                create_gif(path_input + episode_suffix, path_output, episode, f'gif_episode_{episode}.gif', duration = GIF_DURATION)
                # createVideo(path_input + episode_suffix, path_output, episode, f'video_episode_{episode}.avi', frames_window = FRAMES_WINDOW)
                break
                
            if done:
                break

        last_20_ep_reward.append([tot_reward]) 
        last_20_ep_loss.append([loss])
        
        # Episode_Steps shows the zero-based index of the last step taken.
        print(f'Episode: {episode}, Episode_Steps: {epoch}, Cum_Reward: {tot_reward},  Epsilon: {np.round(agent.epsilon, 4)}, Memory: {len(agent.replay_memory)}, Loss_last_20_ep: {np.round(np.mean(np.asarray(last_20_ep_loss)), 4)}, Reward_last_20_ep: {np.round(np.mean(np.asarray(last_20_ep_reward)), 2)}')

##################################################















Exporting episode: 0 ...
Episode: 0, Episode_Steps: 23, Cum_Reward: 0.0,  Epsilon: 1.0, Memory: 24, Loss_last_20_ep: 1.0, Reward_last_20_ep: 0.0


Episode: 1, Episode_Steps: 23, Cum_Reward: 0.0,  Epsilon: 0.9976, Memory: 48, Loss_last_20_ep: 7.1348, Reward_last_20_ep: 0.0
Episode: 2, Episode_Steps: 49, Cum_Reward: 1.0,  Epsilon: 0.9901, Memory: 98, Loss_last_20_ep: 5.8322, Reward_last_20_ep: 0.33
Episode: 3, Episode_Steps: 23, Cum_Reward: 0.0,  Epsilon: 0.9866, Memory: 122, Loss_last_20_ep: 4.8328, Reward_last_20_ep: 0.25
Episode: 4, Episode_Steps: 23, Cum_Reward: 0.0,  Epsilon: 0.983, Memory: 146, Loss_last_20_ep: 4.7528, Reward_last_20_ep: 0.2
Episode: 5, Episode_Steps: 23, Cum_Reward: 0.0,  Epsilon: 0.9795, Memory: 170, Loss_last_20_ep: 4.5117, Reward_last_20_ep: 0.17
Episode: 6, Episode_Steps: 21, Cum_Reward: 0.0,  Epsilon: 0.9763, Memory: 192, Loss_last_20_ep: 3.9897, Reward_last_20_ep: 0.14
Episode: 7, Episode_Steps: 23, Cum_Reward: 0.0,  Epsilon: 0.9728, Memory: 216, Loss_las

In [0]:
# Scratch cell: refill the frame window with the current frame and show it.
# Depends on `state` and `window_states` left in the kernel by the training
# cell, so it fails on a fresh kernel unless that cell has run first.
first = True

if first:
    #state = env.reset()
    #state, reward, done, _ = env.step(1)
    # Push STATE_LENGTH copies of the current frame into the window.
    for i in range(STATE_LENGTH):
        window_states.append(preprocessImage(state))
    first = False
    # Display the raw (un-preprocessed) frame.
    plt.imshow(state)

In [0]:
# Scratch cell: plot the preprocessed frames currently held in `window_states`
# side by side (one row, four columns). Depends on `window_states` from an
# earlier cell.
fig=plt.figure(figsize=(16, 16))
columns = 4
rows = 1
for i, img in enumerate(window_states):
    # Subplot indices are 1-based.
    fig.add_subplot(rows, columns, i + 1)
    plt.imshow(img, cmap = 'gray')
plt.show()

In [0]:
# Scratch cell: ask the agent for its output on the current frame window.
# Depends on `agent` and `window_states` from earlier cells.
# Actions: {0: No-op, 1:Fire(start game), 2: Right, 3: Left}
agent.act(np.asarray(window_states), ret_predictions = True)
# agent.act(np.asarray(window_states))

In [0]:
# Scratch cell: step the environment by hand and inspect the result.
# Depends on `state`, `window_states`, `agent` and `env` from earlier cells,
# and it mutates `window_states`, `state` and the environment, so re-running
# it advances the game by one more step each time.
# Actions: {0: No-op, 1:Fire(start game), 2: Right, 3: Left}

# Push the current frame into the window.
window_states.append(preprocessImage(state))

# Plot the frames currently held in the window, one row of four.
fig=plt.figure(figsize=(16, 16))
columns = 4
rows = 1
for i, img in enumerate(window_states):
    fig.add_subplot(rows, columns, i + 1)
    plt.imshow(img, cmap = 'gray')
plt.show()

# NOTE(review): `reshape` keeps the memory order of the (STATE_LENGTH, 84, 84)
# stack instead of moving the frame axis last -- keep this in sync with however
# DQNAgent.act prepares its model input.
print(agent.model.predict(np.expand_dims(np.asarray(window_states).reshape(FRAME_WIDTH, FRAME_HEIGHT, STATE_LENGTH) , axis = 0)))
# Take action 1 (Fire) and show the reward, the done flag and the next frame.
next_state, reward, done, _ = env.step(1)
print(reward, done)
plt.imshow(preprocessImage(next_state), cmap = 'gray')
state = next_state

In [0]:
# saveEpisode(path_input + episode_suffix, episode, frames)
# # create_gif(path_input + episode_suffix, path_output, episode, f'gif_episode_{episode}.gif', duration = GIF_DURATION)
# createVideo(path_input + episode_suffix, path_output, episode, f'video_episode_{episode}.avi', frames_window = FRAMES_WINDOW)