In [1]:
!pip install gym[atari]
!pip install stable-baselines3[extra]
!pip install keras-rl2

Collecting stable-baselines3[extra]
  Downloading stable_baselines3-1.5.0-py3-none-any.whl (177 kB)
[K     |████████████████████████████████| 177 kB 5.2 MB/s 
Collecting gym==0.21
  Downloading gym-0.21.0.tar.gz (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 52.4 MB/s 
Collecting autorom[accept-rom-license]~=0.4.2
  Downloading AutoROM-0.4.2-py3-none-any.whl (16 kB)
Collecting ale-py~=0.7.4
  Downloading ale_py-0.7.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 42.8 MB/s 
Collecting AutoROM.accept-rom-license
  Downloading AutoROM.accept-rom-license-0.4.2.tar.gz (9.8 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Building wheels for collected packages: gym, AutoROM.accept-rom-license
  Building wheel for gym (setup.py) ... [?25l[?25hdone
  Created wheel for gym: filename=gym-0.21.0-p

In [None]:
!pip install gupload

# IMPORTS

In [2]:
from pydrive.auth import GoogleAuth
from google.colab import auth
import tensorflow as tf
from PIL import Image
import numpy as np
import gym
import tensorflow as tf

from rl.agents.dqn import DQNAgent
from rl.policy import LinearAnnealedPolicy, BoltzmannQPolicy, EpsGreedyQPolicy
from rl.memory import SequentialMemory
from rl.core import Processor
from rl.callbacks import FileLogger, ModelIntervalCheckpoint


import matplotlib.pyplot as plt

# Authenticate this Colab session with the user's Google account.
# Only the login is performed here; no PyDrive client object is created.
# Presumably this is what lets the upload cell at the end write to Drive.
auth.authenticate_user()




# preprocess

In [3]:
# Shape of one preprocessed frame as a NumPy shape, i.e. (height, width).
inputShape = (84, 84)
# Window length handed to the replay memory and used as the leading dimension
# of the network input.
windowLength = 4


class AtariProcessor(Processor):
    """Preprocessing hooks applied to observations, state batches and rewards."""

    def process_observation(self, obs):
        """Convert one raw frame to a greyscale ``uint8`` array of ``inputShape``.

        Args:
            obs: 3-D array holding a single frame (asserted below).

        Returns:
            2-D ``uint8`` array whose shape equals ``inputShape``.
        """
        assert obs.ndim == 3
        target_height, target_width = inputShape
        # PIL's resize expects (width, height), the reverse of a NumPy shape.
        # Passing inputShape directly only works while the target is square.
        frame = Image.fromarray(obs).resize((target_width, target_height)).convert('L')
        obs = np.array(frame)
        assert obs.shape == inputShape
        return obs.astype('uint8')

    def process_state_batch(self, batch):
        """Cast a batch to ``float32`` and divide by 255.

        Frames are kept as ``uint8`` until this point, so the conversion to
        floats happens once per sampled batch rather than once per frame.
        """
        batch = batch.astype('float32') / 255.
        return batch

    def process_reward(self, reward):
        """Clip the reward to the interval [-1, 1]."""
        return np.clip(reward, -1., 1.)

# instantiate first game

In [4]:
env1 = gym.make('BreakoutDeterministic-v4')

# Seed every random source this notebook relies on. The TensorFlow seed fixes
# the initial weights of the network built in the next cell; without it only
# NumPy and the emulator were deterministic.
np.random.seed(123)
tf.random.set_seed(123)
env1.seed(123)

nb_actions = env1.action_space.n
height, width, channels = env1.observation_space.shape

print('number of actions', nb_actions)
print('types of actions:', env1.unwrapped.get_action_meanings())
print('height:{} width:{} channels:{}'.format(height, width, channels))

number of actions 4
types of actions: ['NOOP', 'FIRE', 'RIGHT', 'LEFT']
height:210 width:160 channels:3


# build cnn model

In [5]:
# Network input: `windowLength` stacked frames, frame axis first.
input_shape = (windowLength,) + inputShape
print(input_shape)

# Only the first layer declares the input shape. The convolution that follows
# receives the permuted (84, 84, 4) tensor, so it must not repeat the
# channels-first shape.
model1 = tf.keras.models.Sequential([
    # Move the frame axis last so the convolutions treat frames as channels.
    tf.keras.layers.Permute((2, 3, 1), input_shape=input_shape),
    tf.keras.layers.Conv2D(32, (8, 8), strides=(4, 4), activation='relu'),
    tf.keras.layers.Conv2D(64, (4, 4), strides=(2, 2), activation='relu'),
    tf.keras.layers.Conv2D(64, (3, 3), strides=(1, 1), activation='relu'),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(512, activation='relu'),
    # One linear output per action of the first game.
    tf.keras.layers.Dense(env1.action_space.n, activation='linear'),
])

model1.summary()

(4, 84, 84)
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 permute (Permute)           (None, 84, 84, 4)         0         
                                                                 
 conv2d (Conv2D)             (None, 20, 20, 32)        8224      
                                                                 
 conv2d_1 (Conv2D)           (None, 9, 9, 64)          32832     
                                                                 
 conv2d_2 (Conv2D)           (None, 7, 7, 64)          36928     
                                                                 
 flatten (Flatten)           (None, 3136)              0         
                                                                 
 dense (Dense)               (None, 512)               1606144   
                                                                 
 dense_1 (Dense)             (None, 4)      

# set parameters

In [6]:
# --- replay memory -----------------------------------------------------------
memorylimit = 10_000

# --- exploration: epsilon-greedy wrapped in a linear annealing schedule ------
innerpolicy = EpsGreedyQPolicy()
maxEps, minEps, testEps = 1.0, 0.1, 0.05
annealSteps = 200_000

# --- observation / reward preprocessing --------------------------------------
processor = AtariProcessor()

# --- learning ----------------------------------------------------------------
# NOTE(review): warmup (50_000 steps) is larger than memorylimit (10_000), so
# presumably most warm-up transitions are evicted before learning starts --
# confirm this is intended.
warmup = 50_000
discount = 0.99
target_model_update = 10_000
train_interval = 4
delta_clip = 1.0
lr = 2.5e-4

# --- training run ------------------------------------------------------------
trainingSteps = 1_000_000
trainingLogInterval = 10_000

# configure and compile the agent

In [7]:
# Replay buffer for the first agent.
memory = SequentialMemory(limit=memorylimit, window_length=windowLength)

# Exploration policy: the `eps` attribute of the inner policy is annealed
# using the max / min / test values and step count configured above.
policy = LinearAnnealedPolicy(
    innerpolicy,
    attr='eps',
    value_max=maxEps,
    value_min=minEps,
    value_test=testEps,
    nb_steps=annealSteps,
)

# Dueling DQN (averaging variant) with double-DQN switched off.
dqn = DQNAgent(
    model=model1,
    nb_actions=nb_actions,
    policy=policy,
    memory=memory,
    processor=processor,
    enable_double_dqn=False,
    enable_dueling_network=True,
    dueling_type='avg',
    nb_steps_warmup=warmup,
    gamma=discount,
    target_model_update=target_model_update,
    train_interval=train_interval,
    delta_clip=delta_clip,
)
dqn.compile(
    tf.keras.optimizers.Adam(learning_rate=lr),
    metrics=[tf.keras.metrics.RootMeanSquaredError()],
)

# train the first agent

In [8]:
# Prefix shared by every artefact written for the first agent.
env_name = 'TLBreakoutDeterministic-v4'
weights_filename = 'dqn_{}_weights.h5f'.format(env_name)
# `{step}` is left unformatted on purpose; presumably the checkpoint callback
# fills it in with the current step number.
checkpoint_weights_filename = 'dqn_' + env_name + '_weights_{step}.h5f'
log_filename = 'dqn_{}_log.json'.format(env_name)

callbacks = [
    ModelIntervalCheckpoint(checkpoint_weights_filename, interval=200000),
    FileLogger(log_filename, interval=100),
]
trainLog = dqn.fit(env1, callbacks=callbacks, nb_steps=trainingSteps, log_interval=trainingLogInterval)

# Persist the final weights. Without this call `weights_filename` was never
# used, and a run that ended before the first 200000-step checkpoint left no
# weights on disk at all.
dqn.save_weights(weights_filename, overwrite=True)

done, took 9.035 seconds


# instantiate the second env

In [9]:
env2 = gym.make('SpaceInvadersDeterministic-v4')

# Re-seed every random source before the second experiment. The TensorFlow
# seed fixes the initial weights of the new dense head built in the next
# cell; without it only NumPy and the emulator were deterministic.
np.random.seed(123)
tf.random.set_seed(123)
env2.seed(123)

nb_actions2 = env2.action_space.n
height2, width2, channels2 = env2.observation_space.shape

print('number of actions', nb_actions2)
print('types of actions:', env2.unwrapped.get_action_meanings())
print('height:{} width:{} channels:{}'.format(height2, width2, channels2))

number of actions 6
types of actions: ['NOOP', 'FIRE', 'RIGHT', 'LEFT', 'RIGHTFIRE', 'LEFTFIRE']
height:210 width:160 channels:3


# build Transfer Learning cnn model 

In [10]:


# Reuse the network of the first agent as a frozen feature extractor.
base_model = model1
base_model.trainable = False

# Cut the base network after layer index 3, the third and last convolution.
model2 = tf.keras.Model(
    inputs=base_model.input,
    outputs=base_model.layers[3].output,
)

# Fresh head for the second game: flatten -> 512 ReLU units -> one linear
# output per action of env2.
conv_features = tf.keras.layers.Flatten()(model2.output)
hidden_units = tf.keras.layers.Dense(512, activation='relu')(conv_features)
out = tf.keras.layers.Dense(env2.action_space.n, activation="linear")(hidden_units)

TLmodel = tf.keras.Model(inputs=model2.input, outputs=out)
TLmodel.summary()

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 permute_input (InputLayer)  [(None, 4, 84, 84)]       0         
                                                                 
 permute (Permute)           (None, 84, 84, 4)         0         
                                                                 
 conv2d (Conv2D)             (None, 20, 20, 32)        8224      
                                                                 
 conv2d_1 (Conv2D)           (None, 9, 9, 64)          32832     
                                                                 
 conv2d_2 (Conv2D)           (None, 7, 7, 64)          36928     
                                                                 
 flatten_1 (Flatten)         (None, 3136)              0         
                                                                 
 dense_3 (Dense)             (None, 512)               1606

# set second agent parameters

In [11]:
# --- replay memory -----------------------------------------------------------
memorylimit2 = 10_000

# --- exploration: epsilon-greedy wrapped in a linear annealing schedule ------
innerpolicy2 = EpsGreedyQPolicy()
maxEps2, minEps2, testEps2 = 1.0, 0.1, 0.05
annealSteps2 = 200_000

# --- observation / reward preprocessing --------------------------------------
processor2 = AtariProcessor()

# --- learning ----------------------------------------------------------------
# NOTE(review): warmup2 (50_000 steps) is larger than memorylimit2 (10_000),
# so presumably most warm-up transitions are evicted before learning starts --
# confirm this is intended.
warmup2 = 50_000
discount2 = 0.99
target_model_update2 = 10_000
train_interval2 = 4
delta_clip2 = 1.0
lr2 = 2.5e-4

# --- training run ------------------------------------------------------------
trainingSteps2 = 1_000_000
trainingLogInterval2 = 10_000

# config and compile second agent

In [12]:
# Replay buffer for the second agent.
memory2 = SequentialMemory(limit=memorylimit2, window_length=windowLength)

# Exploration policy: the `eps` attribute of the inner policy is annealed
# using the max / min / test values and step count configured above.
policy2 = LinearAnnealedPolicy(
    innerpolicy2,
    attr='eps',
    value_max=maxEps2,
    value_min=minEps2,
    value_test=testEps2,
    nb_steps=annealSteps2,
)

# Dueling DQN (averaging variant) with double-DQN switched off, built on the
# transfer-learning network.
dqn2 = DQNAgent(
    model=TLmodel,
    nb_actions=nb_actions2,
    policy=policy2,
    memory=memory2,
    processor=processor2,
    enable_double_dqn=False,
    enable_dueling_network=True,
    dueling_type='avg',
    nb_steps_warmup=warmup2,
    gamma=discount2,
    target_model_update=target_model_update2,
    train_interval=train_interval2,
    delta_clip=delta_clip2,
)
dqn2.compile(
    tf.keras.optimizers.Adam(learning_rate=lr2),
    metrics=[tf.keras.metrics.RootMeanSquaredError()],
)

# train the second agent

In [None]:
# Prefix shared by every artefact written for the second agent.
env_name2 = 'TLSpaceInvadersDeterministic-v4'
weights_filename2 = 'dqn_{}_weights.h5f'.format(env_name2)
# `{step}` is left unformatted on purpose; presumably the checkpoint callback
# fills it in with the current step number.
checkpoint_weights_filename2 = 'dqn_' + env_name2 + '_weights_{step}.h5f'
log_filename2 = 'dqn_{}_log.json'.format(env_name2)

callbacks2 = [
    ModelIntervalCheckpoint(checkpoint_weights_filename2, interval=200000),
    FileLogger(log_filename2, interval=100),
]
trainLog2 = dqn2.fit(env2, callbacks=callbacks2, nb_steps=trainingSteps2, log_interval=trainingLogInterval2)

# Persist the final weights. Without this call `weights_filename2` was never
# used, and a run that ended before the first 200000-step checkpoint left no
# weights on disk at all.
dqn2.save_weights(weights_filename2, overwrite=True)

# upload logs and models to drive

In [None]:
!gupload --to '1Eu7GmiWmvEw_vlUoo2jA-TALR0oO9BqB' *.h5f.*
!gupload --to '1Eu7GmiWmvEw_vlUoo2jA-TALR0oO9BqB' dqn_TLSpaceInvadersDeterministic-v4_log.json
!gupload --to '1Eu7GmiWmvEw_vlUoo2jA-TALR0oO9BqB' dqn_TLBreakoutDeterministic-v4_log.json