# Install and import libraries


In [None]:
%pip install gymnasium[classic-control]
%pip install tensorflow
%pip install tdqm

In [None]:
import numpy as np
import tensorflow as tf
import gymnasium as gym
import os
import datetime
import random
import matplotlib.pyplot as plt
from tqdm import tqdm
from gym import wrappers
from keras import regularizers
from keras.optimizers import Adam
from keras.models import Sequential
from keras.models import load_model
from keras.layers import Dense
from keras.layers import Dropout
from collections import deque

Use GPU acceleration if available


In [None]:
# Ask TensorFlow to allocate GPU memory on demand rather than reserving it all
# at start-up. When no GPU is present the list is empty and nothing happens.
gpus = tf.config.experimental.list_physical_devices('GPU')
# The TensorFlow GPU guide requires the memory-growth setting to be the same
# on all GPUs, so it is applied to each visible device rather than the first.
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

  and should_run_async(code)



# RLAgent class and network implementation

In [None]:
class RLAgent:
    """
    Deep Q-learning agent with an epsilon-greedy behaviour policy.

    A single Keras network (``train_net``) is used both to choose actions and
    to compute the bootstrapped targets; there is no separate target network.
    """

    def __init__(
        self,
        env: gym.Env,
        initial_epsilon: float,
        epsilon_decay: float,
        final_epsilon: float,
        gamma: float,
        lr: float,
        dropout_rate: float = 0.3
    ):
        """
        :param env: environment; only its observation shape and the number of
            discrete actions are read
        :param initial_epsilon: starting exploration probability
        :param epsilon_decay: multiplicative factor applied by ``decay_epsilon``
        :param final_epsilon: lower bound for epsilon
        :param gamma: discount factor used in the Q-learning target
        :param lr: initial learning rate of the optimizer schedule
        :param dropout_rate: rate assigned to the Dropout layers by
            ``update_dropout``
        """
        self.state_shape = env.observation_space.shape
        self.actions = env.action_space.n
        self.gamma = gamma
        self.epsilon = initial_epsilon
        self.epsilon_decay = epsilon_decay
        self.final_epsilon = final_epsilon
        self.lr = lr
        self.dropout_rate = dropout_rate
        # one loss value per call to train()
        self.loss_values = list()
        # built last so every hyper-parameter is already available
        self.train_net = self.build_model()

    def save_model(
        self,
        model_name:str,
        episode: int
    ):
        """
        Save the network to ``./<model_name>/trainNetwork<episode>.h5``.
        """
        self.train_net.save(f'./{model_name}/trainNetwork{episode}.h5')

    def load(
        self,
        model_name:str,
        episode: int
    ):
        """
        Replace the network with the one stored in
        ``./<model_name>/trainNetwork<episode>.h5``.
        """
        self.train_net = load_model(f'./{model_name}/trainNetwork{episode}.h5')

    @staticmethod
    def get_optmizer(lr: float):
        """
        Build an Adam optimizer whose learning rate decays exponentially.

        :param lr: initial learning rate
        :return: the configured optimizer
        """
        # staircase schedule: multiply the rate by 0.96 every 500 optimizer steps
        lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
            lr,
            decay_steps=500,
            decay_rate=0.96,
            staircase=True
        )

        opt = tf.optimizers.Adam(learning_rate=lr_schedule)
        return opt

    def build_model(self):
        """
        Builds a deep neural net which predicts the Q values for all possible
        actions given a state. The input has the shape of the state (which is
        2 in MountainCar), and the output has one unit per action (3 in
        MountainCar: left, stay, right) since we want 1 Q value per possible
        action.

        :return: the compiled Q network
        """
        model = Sequential()
        model.add(Dense(24, input_shape=self.state_shape, activation='relu',
                        kernel_initializer='he_uniform'))
        # Dropout layers start with rate 0; update_dropout() assigns the real rate.
        model.add(Dropout(0.0))
        model.add(Dense(48, activation='relu',
                        kernel_initializer='he_uniform'))
        model.add(Dropout(0.0))

        model.add(Dense(self.actions, activation='linear',
                        kernel_initializer='he_uniform'))

        model.compile(optimizer=self.get_optmizer(self.lr), loss='mse', metrics=["mse"])
        return model

    def update_dropout(self):
        """
        Assign ``self.dropout_rate`` to every Dropout layer of the network.
        """
        # NOTE(review): the rate is changed on an already-compiled model; if
        # Keras has traced its training function by then, the new rate may not
        # be picked up without re-compiling — confirm dropout is really active.
        for layer in self.train_net.layers:
            if isinstance(layer, Dropout):
                layer.rate = self.dropout_rate

    def policy(self,state) -> int:
        """
        Return the greedy action (highest predicted Q value) for ``state``.
        """
        action_q = self.train_net(np.atleast_2d(state))
        # cast so callers get a plain Python int, as the annotation promises
        return int(np.argmax(action_q[0], axis=0))

    def get_action(self, obs) -> int:
        """
        Get an action with an epsilon greedy policy: with probability epsilon
        a uniformly random action, otherwise the greedy one.
        """
        greedy = random.random() > self.epsilon

        # exploitation: ask the network for the best action
        if greedy:
            return self.policy(obs)

        # exploration: uniformly random action index
        return int(np.random.choice(self.actions))

    def train(self, batch):
        """
        Run one gradient update on a batch sampled from the replay buffer.

        :param batch: tuple ``(state, next_state, action, reward, terminated)``
            of arrays with one row per transition
        :return: the optimizer's current learning rate rounded to 5 decimals
        """
        state, next_state, action, reward, terminated = batch

        # current Q values for every action; used as the regression target
        # so that only the entry of the action actually taken changes
        current_q = self.train_net(state)
        target_q = np.copy(current_q)

        # bootstrap from the same network: best Q value of the next state
        next_q = self.train_net(next_state)
        max_next_q = np.amax(next_q, axis=1)

        # terminal transitions must not bootstrap from the next state
        not_terminal = 1.0 - np.asarray(terminated, dtype=np.float64)
        rows = np.arange(state.shape[0])
        target_q[rows, action] = reward + self.gamma * not_terminal * max_next_q

        # fit the model for a single epoch on this batch
        history = self.train_net.fit(x=state, y=target_q, epochs=1,verbose=0)

        # history holds one loss per epoch; store it as a scalar
        self.loss_values.extend(history.history["loss"])

        return round(self.train_net.optimizer.lr.numpy(), 5)

    def decay_epsilon(self):
        """ Multiply epsilon by the decay factor, never going below final_epsilon"""
        self.epsilon = max(self.final_epsilon, self.epsilon * self.epsilon_decay)

    def plot_loss(self):
        """
        Plot the loss recorded at every call to ``train``.
        """
        plt.plot(range(len(self.loss_values)), self.loss_values)
        # one point per train() call, not per episode
        plt.xlabel('Training step')
        plt.ylabel('Loss')
        plt.show()

    def plot_rew(self,aggr_ep_rewards,model_name):
        """
        Plot average, minimum and maximum reward against the episode number.

        :param aggr_ep_rewards: dict with the keys 'ep', 'avg', 'min', 'max'
        :param model_name: currently unused
        """
        plt.plot(aggr_ep_rewards.get('ep'), aggr_ep_rewards.get('avg'), label="avg rewards")
        plt.plot(aggr_ep_rewards.get('ep'), aggr_ep_rewards.get('min'), label="min rewards")
        plt.plot(aggr_ep_rewards.get('ep'), aggr_ep_rewards.get('max'), label="max rewards")
        plt.legend(loc=4)
        plt.xlabel('Episode')
        plt.ylabel('Reward')
        plt.ylim(-200, None)
        plt.show()

    def plot_policy(self,actions):
        """
        Draw a bar chart of how often each action was chosen.

        :param actions: dict mapping the action ids 0, 1, 2 to their counts
        """
        temp_action_x = list(actions.keys())

        action_labels = {0: "left", 1: "stay", 2: "right"}
        action_x = [action_labels[a] for a in temp_action_x]
        action_y = list(actions.values())

        colors = ['blue', 'green', 'orange']

        fig, ax = plt.subplots()
        ax.bar(action_x, action_y, color=colors)
        ax.set_ylabel('Ocurrences')
        ax.set_title('Actions')
        ax.legend(title='Actions policy')

        plt.show()



# Replay buffer implementation

In [None]:
class ReplayBuffer:
  """
  Bounded store of transitions with uniform random sampling.
  """

  def __init__(self,exp_max_size,batch_size):
    # a deque with maxlen drops its oldest entry when a new one overflows it
    self.experiences = deque(maxlen=exp_max_size)
    self.exp_max_size = exp_max_size
    self.batch_size = batch_size

  def get_exp_size(self):
    """
    Number of transitions currently held
    """
    return len(self.experiences)

  def add_experience(self,exp):
    """
    Store one more transition
    """
    self.experiences.append(exp)

  def sample_game_batch(self):
    """
    Draw batch_size transitions without replacement and return them
    column-wise as arrays
    """
    picked = random.sample(self.experiences, self.batch_size)

    def column(idx):
      # gather field `idx` of every sampled transition into one array
      return np.array([transition[idx] for transition in picked])

    # stored fields 2 and 3 are swapped on the way out, so the result reads
    # (field 0, field 1, field 3, field 2, field 4)
    return column(0), column(1), column(3), column(2), column(4)

# Helper function used to evaluate the trained model

In [None]:
def training_result(model_name, episode, episodes=1000):
  """
  Evaluate a saved model by playing MountainCar greedily.

  Prints the reward of every episode, then the average reward and the
  fraction of episodes in which the car reached position 0.5, and finally
  plots how often each action was chosen.

  :param model_name: directory the checkpoint was saved in
  :param episode: episode number that identifies the checkpoint file
  :param episodes: number of evaluation episodes to play (default 1000)
  :raises ValueError: if ``episodes`` is not positive
  """
  if episodes < 1:
    raise ValueError("episodes must be a positive integer")

  env = gym.make('MountainCar-v0')

  total_reward = 0.0
  win = 0
  actions = {0:0,
             1:0,
             2:0}

  # The hyper-parameters only satisfy the constructor: evaluation uses the
  # greedy policy and the network is replaced by the loaded checkpoint.
  agent = RLAgent(
        env=env,
        lr=0.01,
        initial_epsilon=1.0,
        epsilon_decay=0.99,
        final_epsilon=0.001,
        gamma=0.99,
        )

  agent.load(model_name,episode)

  try:
    for i in tqdm(range(episodes)): # Play `episodes` episodes and take the average
      state, _ = env.reset()
      done = False
      truncated = False
      episode_reward = 0.0
      while not (done or truncated):
        action = agent.policy(state)

        next_state, reward, done,truncated, info = env.step(action)

        # count how often each action is chosen
        actions[action] += 1

        # a win is reaching position 0.5
        if next_state[0] >= 0.5:
          win += 1

        episode_reward += reward
        state = next_state
      print(episode_reward)
      if i % 20 == 0:
          print(f"{i}/{episodes}")

      total_reward += episode_reward
  finally:
    # release the environment even if an episode fails
    env.close()

  average_reward = total_reward / episodes
  accuracy = win / episodes

  print(f"Average reward: {average_reward}, Accuracy {accuracy:.4f}")
  agent.plot_policy(actions)

# Evaluate a previously saved checkpoint: `model_name` is the directory the
# checkpoints live in and `episode` selects trainNetwork<episode>.h5 inside it.
# NOTE(review): the training cell further down builds its directory name with
# an additional " BATCH: ..." suffix, so this name only matches checkpoints
# from a run saved without it — confirm the directory exists before running.
model_name = "LR: 0.01251 - GAMMA: 0.99 - EPISODES: 4000 EPSILON: 1.0"
episode = "2500"

training_result(model_name, episode)

# Training loop

In [None]:
EPISODE = 4000                      # Number of episodes to play
EPISODE_MAX_LENGTH = 200            # This number depends on the environment
SAVE_MODEL_STEP = EPISODE // 8      # Save the model every this many episodes
DROPOUT = 750                       # Episode at which dropout is switched on
GAMMA = 0.99                        # Discount factor
EXP_MAX_SIZE = 10_000               # Max number of past experiences kept in the buffer
#LR = 0.001251                      # NN learning rate
LR = 0.01251                        # Different learning rate
EPS_MAX = 1.0                       # Initial exploration probability
EPS_MIN = 0.001                     # Final exploration probability before: 0,00001
DECAY = 0.85                        # Multiplicative epsilon decay applied after each episode
BATCH_SIZE = 32                     # Number of experiences sampled per training step
PLOT = 500                          # Plot the graphs every this many episodes

win = 0
scores = list()

model_name = f'LR: {LR} - GAMMA: {GAMMA} -' \
             f' EPISODES: {EPISODE}'\
             f' EPSILON: {EPS_MAX}'\
             f" BATCH: {BATCH_SIZE}"


# create model directory for storing models (no error if it already exists)
os.makedirs(model_name, exist_ok=True)


env = gym.make('MountainCar-v0')
agent = RLAgent(
        env=env,
        lr=LR,
        initial_epsilon=EPS_MAX,
        epsilon_decay=DECAY,
        final_epsilon=EPS_MIN,
        gamma=GAMMA,
        )


buffer = ReplayBuffer(
                      exp_max_size=EXP_MAX_SIZE,
                      batch_size=BATCH_SIZE
                      )

# sliding window over the scores of the last 100 episodes
time_scores = deque(maxlen=100)
# last learning rate reported by agent.train(); 0 until the first update
lr_value = 0

aggr_ep_rewards = {'ep': [], 'avg': [], 'min': [], 'max': []}

for episode_cnt in range(1, EPISODE + 1):
  state, _ = env.reset()

  # play the game and collect experience
  for step in range(1, EPISODE_MAX_LENGTH + 1):
    action = agent.get_action(state)
    next_state, reward, terminated, truncated, _ = env.step(action)

    # add experience to the buffer; the field order must match what
    # ReplayBuffer.sample_game_batch expects
    buffer.add_experience((state, next_state, reward, action, terminated))

    # train every 15th step, and only once the buffer holds more than one batch
    if buffer.get_exp_size() > BATCH_SIZE and step % 15 == 0:
      gameplay_experience_batch = buffer.sample_game_batch()
      lr_value = agent.train(gameplay_experience_batch)

    # set state to next state
    state = next_state

    if terminated or truncated:

      # the episode score is the negated number of steps it lasted
      time_scores.append(step * -1)

      # statistics over the last 100 episodes
      avg_reward = np.mean(time_scores)
      min_reward = min(time_scores)
      max_reward = max(time_scores)
      aggr_ep_rewards['ep'].append(episode_cnt)
      aggr_ep_rewards['avg'].append(avg_reward)
      aggr_ep_rewards['min'].append(min_reward)
      aggr_ep_rewards['max'].append(max_reward)

      # store avg score
      scores.append(avg_reward)

      print(f"Episode {episode_cnt}/{EPISODE}, e {agent.epsilon:.6f}, avg reward {avg_reward:.2f}, time {step}, lr :{lr_value:.6f}")
      break

  if episode_cnt % SAVE_MODEL_STEP == 0:
    agent.save_model(model_name, episode_cnt)

  agent.decay_epsilon()

  # show average reward and loss
  if episode_cnt % PLOT == 0:
    agent.plot_rew(aggr_ep_rewards,model_name)
    agent.plot_loss()

  if episode_cnt == DROPOUT:
    agent.update_dropout()
    print("Starting dropout")

# release the environment once training is over
env.close()


#training_result(agent,model_name)