In [0]:
import gym
import random
import numpy as np
from collections import deque
from keras.layers import Dense, Conv2D,Activation,Flatten
from keras.optimizers import Adam, SGD,RMSprop
from keras.models import Sequential
from skimage.util import crop
import skimage
import tensorflow as tf
from keras import utils 
import keras.backend as K
import matplotlib.pyplot as plt
import pickle
from keras.models import load_model
from keras.initializers import VarianceScaling


Using TensorFlow backend.


In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


I used a customized loss function

In [0]:
def loss_mse(Y_true,Y_pred):
      global glob_act
      target= tf.keras.backend.max(Y_true,axis=1)
      new_Y_pred=K.sum((Y_pred*glob_act),axis=1)
      loss= tf.compat.v1.losses.huber_loss(labels=target,predictions=new_Y_pred,delta=1.0)
      loss_mean = tf.reduce_mean(loss)
      return (loss_mean)

In [0]:
actions=[0]
actions=actions*32
glob_act=utils.to_categorical(actions, num_classes=4)

In [0]:
class DDQNAgent:
        def __init__(self,env,path=None):
            #defining some parameters
            self.no_op_steps=30
            self.env=env
            self.path=path
            self.action_size=env.action_space.n
            self.agent_history_length=4
            self.input_img_shape=(84,84,self.agent_history_length)
            self.n_episodes=1000000
            self.max_n_steps=18000
            self.discount_factor=0.99
            self.learning_rate=0.00001
            self.epsilon=1  
            self.epsilon_decay=9.000000000000001e-07  
            self.epsilon_min=0.1
            self.batch_size=32
            self.start_train=50000
            self.n_seen_frame=1
            self.memory=deque(maxlen=1000000)
            self.reward_per_episode=[]
            self.episodic_reward=0
            self.actions=[]
            self.MAX_FRAMES = 30000000
            
            self.model=self.build_model()
            self.frozen_target_model=self.build_model()
            
            self.update_target_model()

            if (self.path!=None):
                self.model=load_model(self.path,custom_objects={'loss_mse': loss_mse})
                self.frozen_target_model=load_model(self.path, custom_objects={'loss_mse': loss_mse})

        
 
        def build_model(self):
            #building the NN model
            model=Sequential()
            model.add(Conv2D(16, (8, 8), strides=(4, 4), input_shape=self.input_img_shape,kernel_initializer=VarianceScaling(scale=2))) 
            model.add(Activation('relu')) 
            model.add(Conv2D(32, (4, 4), strides=(2, 2),kernel_initializer=VarianceScaling(scale=2)))
            model.add(Activation('relu'))  
            model.add(Flatten()) 
            model.add(Dense(256,activation="relu",kernel_initializer=VarianceScaling(scale=2))) 
            model.add(Dense(self.action_size,activation="linear",kernel_initializer=VarianceScaling(scale=2)))
            model.compile(loss=loss_mse,optimizer=RMSprop(lr=0.00025, rho=0.95, epsilon=0.01))
            model.summary()
            return model
        def update_target_model(self):
            #function to update the target model every few steps
            self.frozen_target_model.set_weights(self.model.get_weights())
        
        def reset(self):
          #function that prepares the initial state in every episode
          frame = self.env.reset() 
          for _ in range(random.randint(1, self.no_op_steps)):
              frame, _, _, _ = self.env.step(1) # Action 'Fire'
          return (frame)


        def eps_greedy_policy(self,state):
            # this is the epsilon-greedy policy, the algorithm tries to find the optimal policy 
            # by following this policy
            state_stacked=np.expand_dims(state,axis=0)
            if (self.n_seen_frame<50000):
              self.epsilon=1
            elif (self.epsilon<self.epsilon_min):
                self.epsilon=self.epsilon_min
            else:
                self.epsilon=self.epsilon-self.epsilon_decay
            if (np.random.rand()<=self.epsilon):
                return (random.randint(0,self.action_size-1))
            else :
                state_stacked= state_stacked/255.0      
                state_stacked=state_stacked.astype('float32')
                q_value=self.model.predict(state_stacked)
                return (np.argmax(q_value[0]))
        def greedy_policy(self,state):
            # this is the greedy-policy, the policy which the algorithm tries to optimize
            state_stacked=np.expand_dims(state,axis=0)
            state_stacked= state_stacked/255.0      
            state_stacked=state_stacked.astype('float32')
            q_value=self.model.predict(state_stacked)
            return (np.argmax(q_value[0]))
        
        def memorize_sample(self,state,action,reward,done):
                # function that will append a sample to the memory.
                
                reward=self.clip_reward(reward)
                self.memory.append((state,action,reward,done))
        def clip_reward(self,reward):
            #function to clip the reward
          if (reward>0):
            r=1
          elif(reward==0):
            r=0
          else:
            r=-1
          return(r)
        def preprocess_input(self,img):
            # Function to preprocess the input
            # convert the input from rgb to grey 
            image = skimage.color.rgb2gray(img)
            # resize image to 80x80 from 288x404 
            image = skimage.transform.resize(image,(110,84), mode='constant') 
            # return image after stretching/shrinking its intensity levels 
            #image = skimage.exposure.rescale_intensity(image,out_range=(0,255)) 
            image=image[17:101,0:84]
            image=(image*256).astype('uint8')
            # scale down pixels values to (0,1) 
            #image = image / 255.0 
            #image=image.astype('float32')
            return image
        def get_mini_batch(self):
                #function that sample a mini-batch from the memory to use it for training.
                first_state=np.empty((self.batch_size,self.input_img_shape[0],self.input_img_shape[1],self.agent_history_length))
                next_state=np.empty((self.batch_size,self.input_img_shape[0],self.input_img_shape[1],self.agent_history_length))
                actions,reward,done=[],[],[]
                for i in range(self.batch_size):
                  while True:
                    index = random.randint(self.agent_history_length-1, len(self.memory)-2)
                    if ((self.memory[index][3]!=True) & (self.memory[index-1][3]!=True) & (self.memory[index-2][3]!=True) & (self.memory[index-3][3]!=True)):
                      break
                  stacked_first_state=np.stack([self.memory[index-3][0],self.memory[index-2][0],self.memory[index-1][0],self.memory[index][0]],axis=2)
                  stacked_next_state=np.stack([self.memory[index-2][0],self.memory[index-1][0],self.memory[index][0],self.memory[index+1][0]],axis=2)
                  first_state[i]=stacked_first_state
                  next_state[i]=stacked_next_state
                  actions.append(self.memory[index][1])
                  reward.append(self.memory[index][2])
                  done.append(self.memory[index][3])
                return(first_state,actions,reward,next_state,done)



        
        def train_model(self):
            # This function uses all of the above to train the model.
            # It is also reponsible for saving the models and the needed paramters to resume the training.
            for episode in range(self.n_episodes):
              if(self.n_seen_frame>self.MAX_FRAMES):
                break
              resting_lives=5
              if (episode%300==0):
                model_save_name = '11_Huber_target_model_'+str(episode)
                path = F"/content/gdrive/My Drive/DDQN_Breakout_V4/{model_save_name}.h5" 
                agent.frozen_target_model.save(path)
                mylist=[]
                mylist.append(self.reward_per_episode)
                mylist.append(episode)
                mylist.append(self.epsilon)
                list_name="11_Huber_reward_episode_epsilon_"+str(episode)
                with open(F"/content/gdrive/My Drive/DDQN_Breakout_V4/{list_name}.pkl", "wb") as fp:
                  pickle.dump(mylist, fp)
                memory_name="11_Huber_memory_"+str(episode)
                with open(F"/content/gdrive/My Drive/DDQN_Breakout_V4/{memory_name}.pkl", "wb") as fp:
                  pickle.dump(self.memory, fp)
                n_seen_frames="11_Huber_n_seen_frames"+str(episode)
                with open(F"/content/gdrive/My Drive/DDQN_Breakout_V4/{n_seen_frames}.pkl", "wb") as fp:
                  pickle.dump(self.n_seen_frame, fp)
              global glob_act
              print("episode {}/{}".format(episode,self.n_episodes))
              print("number of seen frames = {}".format(self.n_seen_frame))
              print("epsilon = {}".format(self.epsilon))
              
              self.reward_per_episode.append(self.episodic_reward)
              print("average reward in episode {} = {}".format(episode,self.reward_per_episode[-1]))
              print("\n")
              self.episodic_reward=0

              state=self.reset()
              processed_state=self.preprocess_input(state)
              stacked_processed_state=np.stack([processed_state for i in range(self.agent_history_length)],axis=2)
              done=False
              for step in range(self.max_n_steps):
                if (done==True):
                  print("done at step {}".format(step))
                  break
                
                
                
                
                
                action=self.eps_greedy_policy(stacked_processed_state)
                next_state,reward,done,info=self.env.step(action)

                self.episodic_reward+=reward
                self.memorize_sample(processed_state,action,reward,done)
                state=next_state
                processed_state=self.preprocess_input(state)
                stacked_processed_state[:,:,0:self.agent_history_length-1]=stacked_processed_state[:,:,1:self.agent_history_length]
                stacked_processed_state[:,:,self.agent_history_length-1]=processed_state
                self.n_seen_frame+=1
                


                
                if len(self.memory)< self.start_train:
                  continue
                
                if (self.n_seen_frame%4==0):
                  first_states,self.actions,rewards,next_states,dones=self.get_mini_batch()
                  next_states=next_states/255.0
                  next_states=next_states.astype('float32')
                  q_target=self.frozen_target_model.predict(next_states)
                  for i in range(len(dones)):
                    if (dones[i]==True):
                      ((q_target == (q_target.max(axis=1)[:,None]))*rewards[i])[i,:]
                    else:
                      q_target[i,:]=self.discount_factor*q_target[i,:]+rewards[i]
                  #print(q_target)
                  #print(q_target.shape)
                  
                  glob_act=utils.to_categorical(self.actions, num_classes=self.action_size)
                  
                  first_states=next_states/255.0
                  first_states=next_states.astype('float32')
                  self.model.fit(x=first_states,y=q_target,batch_size=self.batch_size,epochs=1,verbose=0)
                  if (self.n_seen_frame%10000==0):
                    self.update_target_model()
                    


            

            
             

In [0]:
#defining the environment and the agent
env=gym.make("BreakoutDeterministic-v4")
agent=DDQNAgent(env=env)






Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 20, 20, 16)        4112      
_________________________________________________________________
activation_1 (Activation)    (None, 20, 20, 16)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 9, 9, 32)          8224      
_________________________________________________________________
activation_2 (Activation)    (None, 9, 9, 32)          0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 2592)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               663808    
______________________

In [0]:
# First time we started the training.
agent.train_model()

episode 0/1000000
number of seen frames = 1
epsilon = 1
average reward in episode 0 = 0
done at step 169
episode 1/1000000
number of seen frames = 170
epsilon = 1
average reward in episode 1 = 2.0
done at step 157
episode 2/1000000
number of seen frames = 327
epsilon = 1
average reward in episode 2 = 1.0
done at step 189
episode 3/1000000
number of seen frames = 516
epsilon = 1
average reward in episode 3 = 2.0
done at step 156
episode 4/1000000
number of seen frames = 672
epsilon = 1
average reward in episode 4 = 1.0
done at step 161
episode 5/1000000
number of seen frames = 833
epsilon = 1
average reward in episode 5 = 1.0
done at step 239
episode 6/1000000
number of seen frames = 1072
epsilon = 1
average reward in episode 6 = 3.0
done at step 115
episode 7/1000000
number of seen frames = 1187
epsilon = 1
average reward in episode 7 = 0.0
done at step 147
episode 8/1000000
number of seen frames = 1334
epsilon = 1
average reward in episode 8 = 1.0
done at step 128
episode 9/1000000
nu

In [0]:
#presume training
 
env=gym.make("BreakoutDeterministic-v4")
agent=DDQNAgent(env=env,path="/content/gdrive/My Drive/DDQN_Breakout_V4/10_Huber_target_model_5100.h5")
with open("/content/gdrive/My Drive/DDQN_Breakout_V4/10_Huber_reward_episode_epsilon_5100.pkl", "rb") as fp:
                  my_list=pickle.load(fp)

with open("/content/gdrive/My Drive/DDQN_Breakout_V4/10_Huber_memory_5100.pkl", "rb") as fp2:
                  my_memory=pickle.load(fp2)

with open(F"/content/gdrive/My Drive/DDQN_Breakout_V4/10_Huber_n_seen_frames5100.pkl", "rb") as fp:
                  agent.n_seen_frame=pickle.load(fp)

agent.reward_per_episode=my_list[0]
agent.epsilon=my_list[2]
agent.memory=deque(list(my_memory)[:850000],maxlen=850000)
del(my_memory)

#agent.n_seen_frame=len(agent.memory)
print(agent.epsilon)
print(len(agent.memory))
agent.train_model()






Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 20, 20, 16)        4112      
_________________________________________________________________
activation_1 (Activation)    (None, 20, 20, 16)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 9, 9, 32)          8224      
_________________________________________________________________
activation_2 (Activation)    (None, 9, 9, 32)          0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 2592)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               663808    
______________________