In [None]:
import gym
import matplotlib.pyplot as plt
import numpy as np
import keras
from keras import backend as K
from keras import utils as np_utils
from keras.layers import Dense, Activation, Input, BatchNormalization, Conv2D
from keras.models import Model
from keras import optimizers
from keras.preprocessing import image
from PIL import Image
from gym import envs
# Show every environment currently registered with gym.
registered_envs = envs.registry.all()
print(registered_envs)

In [None]:
# Create the environment and report its action and observation spaces.
env = gym.make("LunarLander-v2")
for space in (env.action_space, env.observation_space):
    print(space)

In [None]:
class Agent(object):
    """Policy-gradient (REINFORCE-style) agent backed by a small Keras network.

    Parameters
    ----------
    input_dim : int or tuple of int
        Shape of a single observation, without the batch axis. An int ``n``
        is treated as ``(n,)``. A 3-tuple ``(height, width, channels)`` gets a
        convolutional front end; any other shape is fed to dense layers
        (after flattening when it has more than one axis).
    output_dim : int
        Number of discrete actions; the network ends in a softmax of this size.
    hidden_dims : list of int, optional
        Width of each hidden dense layer. Defaults to ``[16, 16]``.
    """
    def __init__(self, input_dim, output_dim, hidden_dims = None):
        # A list literal as default would be shared between instances.
        if hidden_dims is None:
            hidden_dims = [16, 16]
        self.input_dim = input_dim
        self.output_dim = output_dim
        # Observation shape as a tuple, used to build the network and to
        # reshape whatever callers hand to get_action / fit.
        self.input_shape = (input_dim,) if np.isscalar(input_dim) else tuple(input_dim)
        self.__build_network(input_dim, output_dim, hidden_dims)
        self.__build_train_fn()

    def __build_network(self, input_dim, output_dim, hidden_dims):
        """Create ``self.model``: observation -> action probabilities."""
        self.X = Input(shape = self.input_shape)
        net = self.X
        if len(self.input_shape) == 3:
            # Image observations: Conv2D needs (batch, height, width, channels).
            net = Conv2D(kernel_size = (10, 10), filters = 8, strides = 3)(net)
            net = Conv2D(kernel_size = (8, 8), filters = 8, strides = 3)(net)
        if len(self.input_shape) > 1:
            net = keras.layers.Flatten()(net)
        # BatchNormalization is deliberately not used here: the hand-built
        # train function below only applies the optimizer updates, so the
        # layer's moving statistics would never be refreshed.
        for units in hidden_dims:
            net = Dense(units)(net)
            net = Activation("relu")(net)
        net = Dense(output_dim)(net)
        net = Activation("softmax")(net)
        self.model = Model(inputs = self.X, outputs = net)
        self.model.summary()

    def __build_train_fn(self):
        """Create ``self.train_fn``, which applies one policy-gradient step.

        ``train_fn([states, one_hot_actions, discounted_rewards])`` minimises
        ``mean(-log pi(a_t | s_t) * G_t)`` with Adam.
        """
        action_prob_placeholder     = self.model.output #probabilities for each action, shape (batch, output_dim)
        action_onehot_placeholder   = K.placeholder(shape = (None, self.output_dim), name = 'action_onehot')
        discount_reward_placeholder = K.placeholder(shape = (None, ), name = "discounted_reward")

        # Probability of the action actually taken, one value per time step.
        # Reducing over axis 1 only keeps the batch axis so that each step is
        # weighted by its own return below.
        action_prob = K.sum(action_prob_placeholder * action_onehot_placeholder, axis = 1)
        # Clip before the log so a vanishing probability cannot produce -inf/nan.
        log_action_prob = K.log(K.clip(action_prob, K.epsilon(), 1.0))
        loss = -log_action_prob * discount_reward_placeholder

        loss = K.mean(loss)
        adam = optimizers.Adam(lr = 0.001)

        updates = adam.get_updates(params = self.model.trainable_weights,
                                   loss = loss)

        self.train_fn = K.function(inputs = [self.model.input,
                                             action_onehot_placeholder,
                                             discount_reward_placeholder],
                                   outputs = [],
                                   updates = updates)

    def get_action(self, state):
        """Sample an action index for a single observation.

        ``state`` may be given with or without a leading batch axis of size 1;
        anything that cannot be reshaped to one observation raises ValueError.
        """
        state = np.asarray(state, dtype = np.float32).reshape((1,) + self.input_shape)
        # predict returns shape (1, output_dim); np.random.choice needs 1-D p.
        action_prob = np.squeeze(self.model.predict(state), axis = 0).astype(np.float64)
        # Renormalise so float32 rounding cannot trip the "p must sum to 1" check.
        action_prob /= action_prob.sum()
        return np.random.choice(np.arange(self.output_dim), p = action_prob)

    def fit(self, S, A, R):
        """Run one training step on a finished episode.

        S: visited states, A: integer actions taken, R: per-step rewards.
        """
        S = np.asarray(S, dtype = np.float32).reshape((-1,) + self.input_shape)
        action_onehot = np_utils.to_categorical(A, num_classes = self.output_dim)
        discount_reward = compute_discounted_R(R)
        self.train_fn([S, action_onehot, discount_reward])

In [None]:
def compute_discounted_R(R, discount_rate = .99):
    """Return the standardised discounted returns for one episode.

    For each step ``t`` the discounted return is
    ``G_t = R[t] + discount_rate * G_{t+1}``. The returns are then shifted to
    zero mean and scaled to unit standard deviation.

    Parameters
    ----------
    R : sequence of float
        Per-step rewards in chronological order.
    discount_rate : float
        Discount factor applied per step.

    Returns
    -------
    numpy.ndarray of float32 with the same length as ``R``. An empty input
    gives an empty array; a constant return sequence (e.g. a single step)
    gives zeros rather than nan/inf.
    """
    rewards = np.asarray(R, dtype=np.float64)
    discounted_r = np.zeros(rewards.shape, dtype=np.float32)
    if rewards.size == 0:
        return discounted_r
    running_add = 0.0
    for t in reversed(range(rewards.shape[0])):
        running_add = running_add * discount_rate + rewards[t]
        discounted_r[t] = running_add
    # Standardise: (x - mean) / std. The epsilon guards against std == 0.
    centered = discounted_r - discounted_r.mean()
    return centered / (discounted_r.std() + np.finfo(np.float32).eps)

def run_episode(env, agent, render = True):
    """Play one episode, train the agent on it, and return the total reward.

    Observations are passed to the agent as returned by the environment (with
    a leading batch axis of size 1 added for ``get_action``); no image
    conversion is applied, since ``array_to_img`` only accepts rank-3 arrays
    and would fail on vector observations.

    Parameters
    ----------
    env : environment with ``reset()``, ``step(action)`` returning
        ``(state, reward, done, info)``, and ``render()``.
    agent : object with ``get_action(state)`` and ``fit(S, A, R)``.
    render : bool
        Draw the environment on every step (default, as before). Set to
        False to train faster or on a machine without a display.

    Returns
    -------
    Sum of the rewards collected during the episode.
    """
    S = []
    A = []
    R = []
    total_reward = 0
    done = False
    s = np.asarray(env.reset(), dtype = np.float32)
    while not done:
        if render:
            env.render()
        # The policy network predicts on batches, so add a batch axis here...
        a = agent.get_action(s[np.newaxis, ...])
        s2, r, done, info = env.step(a)
        total_reward += r

        # ...but store the un-batched state so np.array(S) is (steps, *obs_shape).
        S.append(s)
        A.append(a)
        R.append(r)

        s = np.asarray(s2, dtype = np.float32)

    # The episode is over: one policy-gradient update on the whole trajectory.
    agent.fit(np.array(S), np.array(A), np.array(R))
    return total_reward

In [None]:
# Build the policy-gradient agent from the environment's spaces and train it.
input_shape = env.observation_space.shape
print(input_shape)
output_shape = env.action_space.n
print(output_shape)
agent = Agent(input_shape, output_shape, [16, 16])

N_EPISODES = 2000  # number of training episodes

try:
    for episode in range(N_EPISODES):
        reward = run_episode(env, agent)
        print(episode, reward)
finally:
    # Release the environment (and its render window) even if training is
    # interrupted or raises.
    env.close()

In [None]:
!pip install keras-rl --user

In [None]:
import numpy as np
import gym

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory


ENV_NAME = 'LunarLander-v2'
WEIGHTS_PATH = 'duel_dqn_{}_weights.h5f'.format(ENV_NAME)
SEED = 123


# Get the environment and extract the number of actions.
env = gym.make(ENV_NAME)
np.random.seed(SEED)
env.seed(SEED)
nb_actions = env.action_space.n

# Next, we build a very simple model regardless of the dueling architecture.
# If you enable the dueling network in DQN, DQN will build a dueling network based on your model automatically.
# Also, you can build a dueling network by yourself and turn off the dueling network in DQN.

model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
for _ in range(3):
    model.add(Dense(16))
    model.add(Activation('relu'))
model.add(Dense(nb_actions, activation='linear'))
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=50000, window_length=1)
policy = BoltzmannQPolicy()
# enable the dueling network
# you can specify the dueling_type to one of {'avg','max','naive'}
dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10,
               enable_dueling_network=True, dueling_type='avg', target_model_update=1e-2, policy=policy)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

# Resume from previously saved weights when available. The file is written by
# dqn.save_weights() below and therefore holds the weights of the dueling
# network that DQNAgent builds on top of `model`; it has to be loaded through
# the agent (after compile), not into the plain `model`, whose last layer has
# a different shape. Loading is best-effort: on a first run the file does not
# exist and the error is just reported.
try:
    print("its working")
    dqn.load_weights(WEIGHTS_PATH)
    print("it worked")
except Exception as e:
    print(e)

try:
    # Okay, now it's time to learn something! We visualize the training here for show, but this
    # slows down training quite a lot. You can always safely abort the training prematurely using
    # Ctrl + C.
    dqn.fit(env, nb_steps=5000000, visualize=True, verbose=1)

    # After training is done, we save the final weights.
    dqn.save_weights(WEIGHTS_PATH, overwrite=True)

    # Finally, evaluate our algorithm for 10 episodes.
    dqn.test(env, nb_episodes=10, visualize=True)
finally:
    # Close the environment (and its render window) even if a step above fails.
    env.close()

Using TensorFlow backend.
W0809 15:14:23.353299 140509737899840 deprecation_wrapper.py:119] From /home/idstudent/.local/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0809 15:14:23.371836 140509737899840 deprecation_wrapper.py:119] From /home/idstudent/.local/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0809 15:14:23.388050 140509737899840 deprecation_wrapper.py:119] From /home/idstudent/.local/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 8)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                144       
_________________________________________________________________
activation_1 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 16)                272       
_________________________________________________________________
activation_2 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 16)                272       
_________________________________________________________________
activation_3 (Activation)    (None, 16)                0         
__________

W0809 15:14:23.826774 140509737899840 deprecation_wrapper.py:119] From /home/idstudent/.local/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:174: The name tf.get_default_session is deprecated. Please use tf.compat.v1.get_default_session instead.

W0809 15:14:23.827909 140509737899840 deprecation_wrapper.py:119] From /home/idstudent/.local/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:181: The name tf.ConfigProto is deprecated. Please use tf.compat.v1.ConfigProto instead.

W0809 15:14:24.090398 140509737899840 deprecation_wrapper.py:119] From /home/idstudent/.local/lib/python3.7/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.



Training for 5000000 steps ...
Interval 1 (0 steps performed)
    1/10000 [..............................] - ETA: 1:51:04 - reward: -0.9378



   21/10000 [..............................] - ETA: 25:05 - reward: -0.9578



44 episodes - episode_reward: -146.948 [-1007.840, 52.927] - loss: 10.950 - mean_absolute_error: 22.159 - mean_q: 1.275

Interval 2 (10000 steps performed)
25 episodes - episode_reward: -75.722 [-195.495, 48.410] - loss: 7.995 - mean_absolute_error: 34.963 - mean_q: 29.005

Interval 3 (20000 steps performed)
14 episodes - episode_reward: -30.813 [-148.949, 113.459] - loss: 8.359 - mean_absolute_error: 37.478 - mean_q: 43.043

Interval 4 (30000 steps performed)
14 episodes - episode_reward: -20.314 [-310.028, 115.515] - loss: 8.525 - mean_absolute_error: 38.462 - mean_q: 45.922

Interval 5 (40000 steps performed)
15 episodes - episode_reward: -99.566 [-672.687, 131.334] - loss: 8.861 - mean_absolute_error: 36.080 - mean_q: 43.368

Interval 6 (50000 steps performed)
14 episodes - episode_reward: -14.331 [-331.254, 146.318] - loss: 7.967 - mean_absolute_error: 33.562 - mean_q: 42.400

Interval 7 (60000 steps performed)
11 episodes - episode_reward: -42.692 [-235.650, 131.371] - loss: 6.81