In [1]:
import random

import time
from collections import deque, namedtuple

import gym
import numpy as np
import PIL.Image
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.losses import MSE

from pyvirtualdisplay import Display

from itertools import zip_longest
from statsmodels.iolib.table import SimpleTable

2022-10-12 21:18:08.063326: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /home/azzam/.local/lib/python3.8/site-packages/cv2/../../lib64:
2022-10-12 21:18:08.063344: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
tf.random.set_seed(123)
env = gym.make('LunarLander-v2')

In [3]:
class lunarLanderRL:
    
    def __init__(self, num_episodes, max_iters, q_network, target_q_network,
                 memory_buffer, num_iters_update, batch_size, gamma,
                 optimizer, alpha, num_episodes_monitor, satisfying_mv_avg,
                 initial_epsilon, min_epsilon, epsilon_decay,
                ):
        self.num_episodes = num_episodes
        self.max_iters = max_iters
        self.q_network = q_network
        self.target_q_network = target_q_network
        self.memory_buffer = memory_buffer
        self.num_iters_update = num_iters_update
        self.batch_size = batch_size
        self.gamma = gamma
        self.optimizer = optimizer
        self.alpha = alpha
        self.num_episodes_monitor = num_episodes_monitor
        self.satisfying_mv_avg = satisfying_mv_avg
        self.initial_epsilon = initial_epsilon
        self.min_epsilon = min_epsilon
        self.epsilon_decay = epsilon_decay
        
    def take_action(self, best_action, epsilon):
        if random.random() > epsilon:
            return best_action
        return random.randint(0, 3)
    
    def is_update(self, t):
        if (t+1) % self.num_iters_update == 0 and len(self.memory_buffer) >= self.batch_size:
            return True
        return False
    
    def select_experiences(self):
        experiences = random.sample(self.memory_buffer, k=self.batch_size)
        states = tf.constant([e.state for e in experiences if e is not None], dtype=tf.float32)
        actions = tf.constant([e.action for e in experiences if e is not None], dtype=tf.float32)
        rewards = tf.constant([e.reward for e in experiences if e is not None], dtype=tf.float32)
        next_states = tf.constant([e.next_state for e in experiences if e is not None], dtype=tf.float32)
        done_vals = tf.constant([e.done for e in experiences if e is not None], dtype=tf.float32)
        return (states, actions, rewards, next_states, done_vals)
    
    def compute_loss(self, experiences):
        states, actions, rewards, next_states, done_vals = experiences
        max_qsa = tf.reduce_max(self.target_q_network(next_states), axis=-1)
        y = ((1 - done_vals) * rewards + self.gamma * max_qsa) + (done_vals * rewards)
        preds = self.q_network(states)
        filter_ = tf.stack([tf.range(len(preds)), tf.cast(actions, dtype=tf.int32)], axis=1)
        preds = tf.gather_nd(preds, filter_)
        loss = MSE(y, preds)
        return loss
        
    def update_target_netwokr(self):
        for w1, w2 in zip(self.q_network.weights, self.target_q_network.weights):
            w2.assign((self.alpha * w1) + ((1 - self.alpha) * w2))
        
    def learn_agent(self, experiences):
        with tf.GradientTape() as tape:
            loss = self.compute_loss(experiences)
        grads = tape.gradient(loss, self.q_network.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.q_network.trainable_weights))
        self.update_target_netwokr()
        
    def update_eps(self, eps):
        return max(self.min_epsilon, eps * self.epsilon_decay)
        
    def __call__(self):
        t1 = time.time()
        experience = namedtuple('Experience', field_names=['state', 'action', 'reward', 'next_state', 'done'])
        total_rewards_list = []
        epsilon = self.initial_epsilon
        
        for i in range(self.num_episodes):
            state = env.reset()[0]
            total_rewards = 0
            
            for t in range(self.max_iters):
                state_reshaped = np.expand_dims(state, axis=0)
                best_action = np.argmax(self.q_network(state_reshaped))
                action = self.take_action(best_action, epsilon)
                next_state, reward, done, _, _ = env.step(action)
                self.memory_buffer.append(experience(state, action, reward, next_state, int(done)))
                
                update = self.is_update(t)
                
                if update:
                    experiences = self.select_experiences()
                    self.learn_agent(experiences)
                    
                state = next_state.copy()
                total_rewards += reward
                
                if done:
                    break
            
            epsilon = self.update_eps(epsilon)
            total_rewards_list.append(total_rewards)
            mv_avg = np.mean(total_rewards_list[-self.num_episodes_monitor:])
            print(f'\rEpisode {i+1} | Moving Average Rewards: {mv_avg:.2f} | Epsilon: {epsilon:.3f}', end='')
            
            if (i+1) % self.num_episodes_monitor == 0:
                print(f'\rEpisode {i+1} | Moving Average Rewards: {mv_avg:.2f} | Epsilon: {epsilon:.3f}')
                
            if mv_avg >= self.satisfying_mv_avg:
                print(f'\n\nEnvironment Solved by Agent with {i+1} Episodes | Moving Average Rewards: {mv_avg:.2f}')
                self.q_network.save_weights('q_network_weights.h5')
                break
                
        print(f'\n\nTime Required to Solve Environment: {(time.time()-t1)/60:.2f} min')

In [4]:
def rl_network():
    inputs = layers.Input(shape=(8,))
    x = layers.Dense(64, activation='relu')(inputs)
    x = layers.Dense(64, activation='relu')(x)
    outputs = layers.Dense(4)(x)
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    return model

In [5]:
num_episodes = 2000
max_iters = 1000
tf.keras.backend.clear_session()
q_network = rl_network()
tf.keras.backend.clear_session()
target_q_network = rl_network()
memory_buffer = deque(maxlen=100_000)
num_iters_update = 4
batch_size = 64 
gamma = 0.995
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3) 
alpha = 1e-3 
num_episodes_monitor = 100 
satisfying_mv_avg = 200
initial_epsilon = 1.0 
min_epsilon = 0.01 
epsilon_decay = 0.995

agent = lunarLanderRL(num_episodes, max_iters, q_network, target_q_network,
                      memory_buffer, num_iters_update, batch_size, gamma,
                      optimizer, alpha, num_episodes_monitor, satisfying_mv_avg,
                      initial_epsilon, min_epsilon, epsilon_decay)

agent()

2022-10-12 21:18:13.371072: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /home/azzam/.local/lib/python3.8/site-packages/cv2/../../lib64:
2022-10-12 21:18:13.371124: W tensorflow/stream_executor/cuda/cuda_driver.cc:326] failed call to cuInit: UNKNOWN ERROR (303)
2022-10-12 21:18:13.371157: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (Azzam): /proc/driver/nvidia/version does not exist
2022-10-12 21:18:13.371535: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Episode 100 | Moving Average Rewards: -166.86 | Epsilon: 0.606
Episode 200 | Moving Average Rewards: -111.26 | Epsilon: 0.367
Episode 300 | Moving Average Rewards: -112.93 | Epsilon: 0.222
Episode 400 | Moving Average Rewards: 19.30 | Epsilon: 0.13567
Episode 500 | Moving Average Rewards: 109.88 | Epsilon: 0.082
Episode 583 | Moving Average Rewards: 200.23 | Epsilon: 0.054

Environment Solved by Agent with 583 Episodes | Moving Average Rewards: 200.23


Time Required to Solve Environment: 21.97 min
