In [1]:
import gym
from gym.wrappers.monitoring.video_recorder import VideoRecorder

import numpy as np
from collections import deque
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Normalization
from tensorflow.keras.models import Sequential
from tensorflow.keras.losses import MSE
from tensorflow.keras.optimizers import Adam
import random
import copy

In [2]:
def query_environment(name):
    """Print the key properties of a registered Gym environment.

    A temporary environment is created only to read its action and
    observation spaces and its reward range; the remaining values come from
    the registry spec. The temporary environment is closed before returning.

    Args:
        name: Environment id accepted by ``gym.make`` and ``gym.spec``.
    """
    env = gym.make(name)
    try:
        spec = gym.spec(name)
        print(f'Action space: {env.action_space}')
        print(f'Observation space: {env.observation_space}')
        print(f'Max Episode steps: {spec.max_episode_steps}')
        print(f'Nondeterministic: {spec.nondeterministic}')
        print(f'Reward range: {env.reward_range}')
        print(f'Reward threshold: {spec.reward_threshold}')
    finally:
        # The environment was created only for inspection; release it even if
        # one of the lookups above fails.
        env.close()

In [3]:
# Print the spaces, episode limit and reward information of LunarLander-v2.
query_environment('LunarLander-v2')

Action space: Discrete(4)
Observation space: Box([-1.5       -1.5       -5.        -5.        -3.1415927 -5.
 -0.        -0.       ], [1.5       1.5       5.        5.        3.1415927 5.        1.
 1.       ], (8,), float32)
Max Episode steps: 1000
Nondeterministic: False
Reward range: (-inf, inf)
Reward threshold: 200


In [4]:
# Environment instance used by the rollout and training cells below.
# 'rgb_array' rendering is requested so frames can be captured by VideoRecorder.
env = gym.make('LunarLander-v2', render_mode='rgb_array')

In [5]:
# Record one episode of a purely random policy as a baseline video.
env.reset()
vid = VideoRecorder(env, 'videos/NotWise.mp4', enabled=True)
try:
    for frame_idx in range(1000):
        # capture_frame() renders the environment itself, so a separate
        # env.render() call whose result is discarded is not needed.
        vid.capture_frame()
        action = env.action_space.sample()
        observation, reward, terminated, truncated, info = env.step(action)
        # Stop on a real terminal state as well as on a time-limit truncation;
        # an episode must not be stepped further after either signal.
        if terminated or truncated:
            break
finally:
    # Always finalize the video file, even if stepping raises.
    vid.close()
# NOTE: `env` is deliberately not closed here because the training and
# evaluation cells below keep using the same instance.

Moviepy - Building video videos/NotWise.mp4.
Moviepy - Writing video videos/NotWise.mp4



                                                                                                                       

Moviepy - Done !
Moviepy - video ready videos/NotWise.mp4




## Hyperparameters

In [6]:
MEMORY_SIZE = 100_000  # maximum number of transitions kept in the replay buffer
GAMMA = 0.995  # discount factor used in the Bellman target
ALPHA = 1e-3  # learning rate of the Adam optimizer
MINI_BATCH_SIZE = 64  # number of transitions sampled for each learning step
NUM_OF_UPDATE_STEP = 4  # a learning step is run every this many environment steps

## Defining Networks

In [7]:
num_of_features = 8  # length of the observation vector (Box of shape (8,))
num_of_actions = 4   # number of discrete actions (Discrete(4))


def build_q_network(name):
    """Create a freshly initialized Q-network.

    The network maps a batch of observations to one Q-value per action
    through two hidden ReLU layers of 64 units and a linear output layer.

    Args:
        name: Keras model name.

    Returns:
        An uncompiled ``Sequential`` model.
    """
    return Sequential([
        # The shape must be a tuple; `(num_of_features)` is just an int.
        Input(shape=(num_of_features,)),
        Dense(units=64, activation='relu'),
        Dense(units=64, activation='relu'),
        Dense(units=num_of_actions, activation='linear')
    ], name=name)


# Online network: trained by gradient descent and used to choose actions.
q_network = build_q_network('Q_Network')

# Target network: same architecture, used only to compute the Bellman targets.
target_q_network = build_q_network('target_Q_Network')


optimizer = Adam(learning_rate=ALPHA)

## Defining loss function
The action-value function $Q(s, a)$ is the "return" obtained if we start in state $s$, apply action $a$ once, and behave optimally after that.
### Bellman equation
$Q(s, a) = R(s) + \gamma \max_{a'} Q(s', a')$
<br>
Our goal is to train the model to estimate the value of $Q(s, a)$. We can start from a randomly initialized $Q$ function and iteratively improve it until it becomes a good estimate. <br>
The targets are produced by "target_q_network". This network also starts with random weights when it predicts $Q(s', a')$ (used as the target), so it has to be trained as well. We use a separate "target_q_network" so that training does not oscillate because of a moving target.

So the target network is used to evaluate the best next action $a'$ through its Q-value. Then we have $target = R(s) + \gamma \max_{a'} Q(s', a')$, and so
```python
max_qsa = tf.reduce_max(target_q_network(next_states), axis=-1) # Finds maximum Q values
y_targets = rewards + (1 - dones) * gamma * max_qsa
```

calculates the target values for each of the experiences the agent has had.
<br><br>
However, q_network is the network that we train to match the targets produced by target_q_network (target_q_network itself changes slightly on each gradient update). So we predict $Q(s, a)$ with q_network and use the squared difference from the target as the loss:

In [8]:
def compute_loss(experiences, gamma, q_network, target_q_network):
    """Compute the mean squared Bellman error for a mini-batch.

    Args:
        experiences: Tuple ``(states, actions, rewards, next_states, dones)``
            of tensors with a shared leading batch dimension, as returned by
            ``get_expercience``.
        gamma: Discount factor applied to the bootstrapped next-state value.
        q_network: Network being trained; predicts Q(s, .) for ``states``.
        target_q_network: Network used to evaluate Q(s', .) for the targets.

    Returns:
        Scalar tensor: mean squared difference between the targets and the
        Q-values of the actions that were actually taken.
    """
    states, actions, rewards, next_states, dones = experiences

    # Best Q-value reachable from each next state according to the target net.
    max_qsa = tf.reduce_max(target_q_network(next_states), axis=-1)
    # For finished episodes (dones == 1) the target is the reward alone.
    y_targets = rewards + (1 - dones) * gamma * max_qsa

    q_values = q_network(states)
    # Use the dynamic batch size: the static `shape[0]` is None when the
    # function is traced with an unknown batch dimension.
    batch_indices = tf.range(tf.shape(q_values)[0])
    action_indices = tf.cast(actions, tf.int32)
    # Select, for every row, the Q-value of the action that was taken.
    q_values = tf.gather_nd(q_values, tf.stack([batch_indices, action_indices], axis=1))

    # Keras losses take (y_true, y_pred); MSE is symmetric, so the value is
    # unchanged, but the conventional order is used.
    loss = MSE(y_targets, q_values)
    return loss

## Soft update
Moves the target_q_network weights a small fraction (TAU) of the way towards the q_network weights.

In [9]:
def soft_update_target_network(q_network, target_q_network, TAU=1e-3):
    """Blend the online network's weights into the target network in place.

    Each target weight becomes ``TAU * online + (1 - TAU) * target``.

    Args:
        q_network: Network whose weights are blended in.
        target_q_network: Network whose weights are overwritten.
        TAU: Fraction of the online weights mixed in per call.
    """
    weight_pairs = zip(target_q_network.weights, q_network.weights)
    for slow_var, fast_var in weight_pairs:
        blended = TAU * fast_var + (1.0 - TAU) * slow_var
        slow_var.assign(blended)

In [10]:
@tf.function
def agent_learn(experiences, gamma):
    """Run one gradient step on the Q-network, then soft-update the target.

    Uses the module-level ``q_network``, ``target_q_network`` and
    ``optimizer``.

    Args:
        experiences: Mini-batch tuple passed through to ``compute_loss``.
        gamma: Discount factor passed through to ``compute_loss``.
    """
    trainable = q_network.trainable_variables

    # Record the forward pass so the loss can be differentiated.
    with tf.GradientTape() as tape:
        loss = compute_loss(experiences, gamma, q_network, target_q_network)

    grads = tape.gradient(loss, trainable)
    optimizer.apply_gradients(zip(grads, trainable))

    # Let the target network slowly track the freshly updated Q-network.
    soft_update_target_network(q_network, target_q_network)

In [11]:
def get_new_epsilon(epsilon, gamma):
    """Decay the exploration rate, never letting it fall below 0.01.

    Args:
        epsilon: Current exploration rate.
        gamma: Multiplicative decay factor applied to ``epsilon``.

    Returns:
        ``gamma * epsilon`` if that is greater than 0.01, otherwise 0.01.
    """
    decayed = gamma * epsilon
    if decayed > 0.01:
        return decayed
    return 0.01

In [12]:
def get_action(q_values, epsilon):
    """Choose an action with an epsilon-greedy policy.

    Args:
        q_values: Array-like of Q-values whose last dimension has one entry
            per action (for example the ``(1, n_actions)`` network output).
        epsilon: Probability of choosing a uniformly random action.

    Returns:
        Index of the chosen action.
    """
    if random.random() > epsilon:
        # Exploit: index of the largest Q-value.
        return np.argmax(q_values)
    # Explore: take the number of actions from the Q-values themselves, so
    # the function does not depend on a notebook-level global.
    n_actions = int(np.shape(q_values)[-1])
    return random.choice(range(n_actions))

In [13]:
"""
takes memory_buffer, and chooses a random sample (mini batch) and convert them to tensors
"""
def get_expercience(memory_buffer):
    exp = random.sample(memory_buffer, k=MINI_BATCH_SIZE)
    
    states = np.array(exp, dtype='object')[:, 0]
    states = tf.convert_to_tensor([s.flatten() for s in states], dtype=tf.float32)
    
    actions = np.array(exp, dtype='object')[:, 1]
    actions = tf.convert_to_tensor(np.array(actions).astype(np.int32), dtype=tf.float32)
    
    rewards = np.array(exp, dtype='object')[:, 2]
    rewards = tf.convert_to_tensor(np.array(rewards).astype(np.int32), dtype=tf.float32)
    
    observations = np.array(exp, dtype='object')[:, 3]
    observations = tf.convert_to_tensor([s.flatten() for s in observations], dtype=tf.float32)
    
    terminated = np.array(exp, dtype='object')[:, 4]
    terminated = tf.convert_to_tensor(np.array(terminated).astype(np.int32), dtype=tf.float32)
    
    return (states, actions, rewards, observations, terminated)

In [14]:
# Prep: start the target network from exactly the same weights as the
# Q-network, since the two were initialized independently.
target_q_network.set_weights(q_network.get_weights())

In [15]:
num_episodes = 2000
num_max_steps = 1000
epsilon = 1
# Multiplicative per-episode decay of the exploration rate. Kept separate from
# the discount factor GAMMA (which was previously reused for this purpose);
# the value is the same, so the schedule is unchanged.
EPSILON_DECAY = 0.995
# Mean score over the last `num_tp_av` episodes at which training stops early.
SOLVED_MEAN_POINTS = 300.0
memory_buffer = deque(maxlen=MEMORY_SIZE)

total_point_history = []
num_tp_av = 100 # Number of total points for averaging

for eps in range(num_episodes):
    state, _ = env.reset()
    state = np.array(state).reshape(1, -1)
    total_points = 0

    for step in range(num_max_steps):
        # Act epsilon-greedily with respect to the current Q-network.
        q_values = q_network(state)
        action = get_action(q_values, epsilon)
        observation, reward, terminated, truncated, info = env.step(action)
        observation = np.array(observation).reshape(1, -1)
        # Only `terminated` is stored as the done flag: a time-limit
        # truncation must not zero out the bootstrapped next-state value.
        memory_buffer.append((state, action, reward, observation, terminated))

        # Learn every NUM_OF_UPDATE_STEP steps once a full mini-batch exists.
        if (step + 1) % NUM_OF_UPDATE_STEP == 0 and len(memory_buffer) >= MINI_BATCH_SIZE:
            exp = get_expercience(memory_buffer)
            agent_learn(exp, GAMMA)

        state = observation.copy()
        total_points += reward

        # The episode is over on a terminal state or a time-limit truncation;
        # the environment must be reset before it is stepped again.
        if terminated or truncated:
            break

    epsilon = get_new_epsilon(epsilon, EPSILON_DECAY)
    total_point_history.append(total_points)
    points_mean = np.mean(total_point_history[-num_tp_av:])

    print(f'\rEpisode {eps + 1}: mean points are {points_mean}', end="")

    if points_mean >= SOLVED_MEAN_POINTS:
        # Finish the '\r' progress line so the message starts on its own line.
        print()
        print(f'Solved in {eps + 1} episodes')
        q_network.save('LunarLander.h5')
        break
else:
    # The episode budget ran out without reaching the threshold: still save
    # the trained weights instead of discarding them.
    q_network.save('LunarLander.h5')

Episode 2000: mean points are 258.78828004666567

In [17]:
# Record one episode of the trained agent acting greedily (epsilon = 0).
state, _ = env.reset()
vid = VideoRecorder(env, 'videos/Wise.mp4', enabled=True)
try:
    for frame_idx in range(1000):
        # capture_frame() renders the environment itself, so a separate
        # env.render() call whose result is discarded is not needed.
        vid.capture_frame()
        action = get_action(q_network(np.array(state).reshape(1, -1)), 0)
        observation, reward, terminated, truncated, info = env.step(action)
        # Stop on a terminal state as well as on a time-limit truncation.
        if terminated or truncated:
            break
        state = observation.copy()
finally:
    # Finalize the video and release the environment even if a step fails.
    vid.close()
    env.close()

Moviepy - Building video videos/Wise.mp4.
Moviepy - Writing video videos/Wise.mp4



                                                                                                                       

Moviepy - Done !
Moviepy - video ready videos/Wise.mp4


