In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell runs and edits to local .py files take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import gym
import numpy as np
import keras

In [None]:
# Gym environment used throughout the notebook.
ENV_ID = "Breakout-v4"
env = gym.make(ENV_ID)

# Shape constants. `frame_shape_channel` is the input shape handed to the
# network in create_model.
# NOTE(review): presumably (height, width) of one preprocessed frame and a
# stack of 4 such frames along the last axis -- confirm against the training loop.
frame_shape = (105, 80)
frame_shape_channel = frame_shape + (4,)

# Size of the environment's action space; used as the network's output width.
n_actions = env.action_space.n

In [None]:
# Network / optimizer hyperparameters.
learning_rate = 1e-4       # step size passed to the optimizer in create_model
gradient_momentum = 0.95   # NOTE(review): not referenced by any code visible in this notebook section
min_sq_gradient = 1e-2     # NOTE(review): not referenced by any code visible in this notebook section

In [None]:
def to_grayscale(img):
    """Average `img` over axis 2 and return the result as uint8.

    The mean is computed in floating point; the cast to uint8 drops the
    fractional part.
    """
    channel_mean = np.mean(img, axis=2)
    return channel_mean.astype(np.uint8)
def downsample(img):
    """Keep every second element along the first two axes of `img`."""
    every_other = slice(None, None, 2)
    return img[every_other, every_other]
def preprocess(img):
    """Downsample `img`, then convert the reduced image to grayscale."""
    reduced = downsample(img)
    return to_grayscale(reduced)

In [None]:
def custom_loss(episode_reward):
    """Build a reward-weighted binary cross-entropy loss function.

    `episode_reward` is captured by the returned closure and multiplied
    element-wise into the cross-entropy term.

    Returns:
        A `loss(y_true, y_pred)` callable. Per the original author's note,
        `y_true` is meant to carry the action actually taken.
    """
    def loss(y_true, y_pred):
        # Computes reward * (-y_true*log(p) - (1 - y_true)*log(1 - p)),
        # where p is y_pred clipped to [0.05, 0.95].
        backend = keras.backend

        def clip_probabilities(p):
            # Clipping keeps both log() arguments away from zero
            # (gradient clipping would be an alternative).
            return backend.clip(p, 0.05, 0.95)

        def cross_entropy(p):
            return -y_true * backend.log(p) - (1 - y_true) * (backend.log(1 - p))

        clipped_pred = keras.layers.Lambda(clip_probabilities)(y_pred)
        per_element_loss = keras.layers.Lambda(cross_entropy)(clipped_pred)

        # Put the reward into effect.
        return keras.layers.Multiply()([per_element_loss, episode_reward])

    return loss

In [None]:
def create_model():
    """Build and compile the convolutional network with a softmax output.

    Architecture:
        Input(frame_shape_channel)
        -> Conv2D(16 filters, 8x8 kernel, stride 4, relu)
        -> Conv2D(32 filters, 8x8 kernel, stride 4, relu)
        -> Flatten
        -> Dense(n_actions, softmax, no bias)

    Reads the module-level `frame_shape_channel`, `n_actions` and
    `learning_rate`. Prints the model summary as a side effect.

    Returns:
        The compiled `keras.models.Model` (RMSprop optimizer, 'mse' loss).
    """
    frames_in = keras.layers.Input(frame_shape_channel)

    conv_1 = keras.layers.Conv2D(16, (8, 8), activation="relu", strides=(4, 4))(frames_in)
    conv_2 = keras.layers.Conv2D(32, (8, 8), activation="relu", strides=(4, 4))(conv_1)

    # Use the public `keras.layers.Flatten` export rather than reaching into
    # the `keras.layers.core` submodule, whose layout is an implementation detail.
    flattened = keras.layers.Flatten()(conv_2)

    action_probs = keras.layers.Dense(n_actions, activation='softmax', use_bias=False)(flattened)
    model = keras.models.Model(inputs=frames_in, outputs=action_probs)

    model.summary()

    # NOTE(review): `gradient_momentum` and `min_sq_gradient` are defined at
    # module level but are not passed to the optimizer here -- confirm intent.
    # NOTE(review): `lr` is the legacy keyword; newer Keras releases name it
    # `learning_rate`. Kept as-is to match the Keras version this notebook targets.
    optimizer = keras.optimizers.RMSprop(lr=learning_rate)
    # NOTE(review): compiled with plain 'mse'; `custom_loss` is not wired in here.
    model.compile(optimizer, loss='mse')

    return model

In [None]:
# Build and compile the network once (create_model also prints its summary).
ddpg = create_model()

In [None]:
def process_rewards(r_list, reward_decay=0.99):
    """Convert per-step rewards into decayed, standardized returns.

    Walking backwards through `r_list`, a non-zero reward resets the running
    value to that reward, while a zero-reward step receives the running value
    multiplied by `reward_decay`. The result is then shifted to zero mean and,
    when the values are not all identical, scaled to unit standard deviation.

    Args:
        r_list: Sequence of per-step rewards.
        reward_decay: Multiplicative decay applied per zero-reward step.

    Returns:
        A float32 numpy array with the same shape as `r_list`. Empty input
        yields an empty array; input whose decayed returns are all equal
        yields all zeros instead of NaN.
    """
    returns = np.zeros_like(r_list, dtype=np.float32)
    if returns.size == 0:
        # Nothing to normalise; avoids mean/std of an empty array.
        return returns

    running = 0
    for i in reversed(range(len(r_list))):
        if r_list[i] == 0:
            running = running * reward_decay
        else:
            running = r_list[i]
        returns[i] = running

    returns -= np.mean(returns)  # subtract the average
    spread = np.std(returns)
    if spread > 0:
        # Skip the division when every value is identical (std == 0), which
        # would otherwise turn the whole array into NaN.
        returns /= spread
    return returns

In [None]:
def clip_reward(r):
    """Reduce a reward to its sign (-1, 0 or +1) using `np.sign`."""
    sign = np.sign(r)
    return sign