# Install libraries
Install the packages needed to train the Mario agent.

In [None]:
!pip install torch
!pip install numpy
!pip install matplotlib
!pip install gymnasium==0.29.1
!pip install gym-super-mario-bros==7.4.0
!pip install gym==0.25.2
!pip install imageio-ffmpeg
!pip install imageio
!pip install torchvision
!pip install opencv-python-headless

# Import packages

In [None]:
from PIL import Image
from collections import deque
from datetime import datetime
from pathlib import Path
import copy
import cv2
import imageio
import numpy as np
import random, os
import torch
from torch import nn
import torch.nn.functional as F
import torch.multiprocessing as mp
#import multiprocessing as mp
from torchvision import transforms as T

# Gym is an OpenAI toolkit for RL
import gym
from gym.spaces import Box
from gym.wrappers import FrameStack

# NES Emulator for OpenAI Gym
from nes_py.wrappers import JoypadSpace

# Super Mario environment for OpenAI Gym
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT, COMPLEX_MOVEMENT, RIGHT_ONLY

# Create hyperparameters
Configure the hyperparameters here; change them as needed.

In [None]:
#class DictWrapper create by Chatgpt
class DictWrapper:
    """Read-only attribute-style view over a dictionary.

    ``wrapper.key`` returns ``dictionary["key"]``. A key that is not in the
    dictionary raises ``AttributeError``, exactly like a missing attribute on
    an ordinary object.
    """

    def __init__(self, dictionary):
        """Store ``dictionary`` as the backing mapping (it is not copied)."""
        self._dict = dictionary

    def __getattr__(self, item):
        """Return ``dictionary[item]`` or raise ``AttributeError``."""
        # __getattr__ only runs after normal attribute lookup has failed.
        # Fetch the backing dict through the instance __dict__ instead of
        # `self._dict`: on an instance whose __init__ has not run (copy and
        # pickle probe bare instances for special methods) `self._dict`
        # would re-enter __getattr__ and recurse until RecursionError.
        backing = self.__dict__.get('_dict')
        if backing is not None and item in backing:
            return backing[item]
        raise AttributeError(f"'DictWrapper' object has no attribute '{item}'")

# Hyperparameters for the whole run. The key names match the parameter names
# of Agent.__init__ defined later in this notebook.
config = {
    'num_envs': 32,  # number of environments run in parallel
    'save_model_step': int(1e5),  # presumably the checkpoint interval in steps -- usage not shown here, confirm
    'save_figure_step': 756,  # evaluate the agent every this many steps (see "train" notes below)
    'learn_step': 756,  # train the agent every this many steps (see "train" notes below)
    'total_step_or_episode': 'step',  # presumably selects whether total_step or total_episode bounds training -- confirm
    'total_step': int(5e6),
    'total_episode': None,
    'batch_size': 256,  # minibatch size used inside Agent.learn
    'save_dir': "",
    'gamma': 0.99,  # discount factor for the extrinsic (environment) reward
    'gamma_int': 0.99,  # discount factor for the intrinsic (RND) reward
    'learning_rate': 7e-5,  # Adam learning rate for both the policy and the predictor network
    'state_dim': (4, 84, 84),  # 4 stacked 84x84 grayscale frames, as produced by create_env
    'action_dim': 12,  # 12 for the complex action set, 7 for the simple one
    'entropy_coef': 0.05,
    'V_coef': 0.5,
    'max_grad_norm': 0.5,
    'clip_param': 0.2,
    'num_epoch': 10,  # number of passes over the collected data per learn() call
    'world': 8,  # world and stage select the level: SuperMarioBros-<world>-<stage>-v0
    'stage': 4,
    'action_type': 'complex',  # "right" or "simple"; any other value selects COMPLEX_MOVEMENT in create_env
    'is_normalize_advantage': False,
    'V_loss_type': "mse",  # alternative value: "huber"
    'target_kl': 0.05,
    'gae_lambda': 0.95,  # lambda used when accumulating advantages in Agent.learn
    'int_adv_coef': 1,  # presumably the weight of the intrinsic advantage -- confirm against learn()
    'ext_adv_coef': 2  # presumably the weight of the extrinsic advantage -- confirm against learn()
}

# Expose the settings as attributes (config.world, config.stage, ...).
config = DictWrapper(config)

# Define environment
## Create a custom environment, We need:
- SkipFrame: Because the episode is very long, we only need to repeat actions sometimes in this environment. We repeat each action 4 times and skip the first 3 frames (returning the 4th frame). We also sum the rewards from all 4 frames.
- GrayScaleResizeObservation: Convert the state to grayscale (from RGB to a gray image) and resize it to 84x84 pixels.
- NoopResetEnv: When resetting the environment, we perform random actions before starting the environment. This is similar to the Atari strategy. When resetting, we randomly choose num_noops actions between 0 and noop_max and perform num_noops random actions. If the random actions lead to a terminal state, we reset and continue performing random actions. I set noop_max to 30, similar to Atari.
- CustomRewardAndDoneEnv
    - I noticed that many people train Mario using this custom reward system, so I copied it. The system adds 50 reward points if the agent solves the stage and subtracts 50 reward points if the agent dies. The reward is divided by 10. I set done to True if Mario dies, instead of the default setting where Mario loses all lives.
    - Stage 4-2: Subtract 50 reward points if Mario moves on top of the map (y_pos >= 255).
    - Stages 4-4 and 7-4: Set done = True when Mario goes the wrong way and subtract 50 reward points as a penalty. If Mario takes the correct path but the map still loops (a known bug), I set done = True but do not apply a penalty.
    - Stage 8-4: Set done = True when Mario goes the wrong way and subtract 50 reward points as a penalty (similar to stages 4-4 and 7-4). This map has a particularly difficult section where Mario needs to find a hidden brick. I apply a -100 penalty in this part (info["x_pos"] > 2440 and info["x_pos"] <= 2500).
    - Stage 8-4: Add 100 reward points when Mario successfully takes the correct path.
    - Stages 4-4 and 8-4: give a -0.1 reward every step.


## About reward system:
- Set done to True when Mario dies: This is the most important aspect because, in the default reward system, Mario still gains a reward by just moving right. If Mario dies, the agent doesn't lose total rewards and can continue moving right (in the new life) to get more rewards. This is the easiest way for the agent to earn rewards, and it can learn to exploit this trick.
- Penalty of -50 reward when Mario dies: This is necessary to speed up Mario's training. Without this penalty, the agent may struggle to complete more difficult stages.
- Reward of 50 when reaching the flag: This encourages Mario to train faster and overcome difficult sections in harder stages.
- Changing the penalty and flag reward to more or less than 50 doesn't make a significant difference, so I haven't changed it.
- Divide rewards by 10: I believe this reduces the total rewards and helps the agent learn a better strategy, but I'm not entirely sure how necessary this is. I simply followed an existing approach.
- With Stage 4-2: I noticed that the agent can earn more rewards when Mario goes to the warp zone, but Mario can't win this stage using the warp zone because the reward system gives negative rewards when Mario moves left. Therefore, I added a penalty when Mario moves to the top of the map.
- Use FrameStack to stack the latest 4 frames as observation.
- With Stage 4-4 and 7-4:
    - Since this map has a wrong path, Mario can enter a loop where the reward increases indefinitely. To prevent this, I set done = True and assign a negative penalty reward.
    - I also give a negative reward to prevent Mario from taking the wrong path.
    - Another strategy is to give a negative reward without setting done = True (as in 4-2). However, this strategy doesn't work due to a bug in this map.
    - Even when Mario is on the correct path, sometimes he still enters the loop. To handle this, I set done = True every time Mario enters the loop (checked by x_pos and max_x_pos).
- Stage 4-4: Assign a -0.1 reward for every step: This prevents Mario from getting stuck. This map has a section where Mario needs to move left, but moving left incurs a negative reward in the default system. If Mario moves right, he takes the wrong path, causing the episode to end with a negative reward. To keep Mario moving, I added a negative reward for every step.
- Stage 8-4:
    - Assign a -0.1 reward for every step: This prevents Mario from getting stuck. This encourages Mario to move when stuck at a particular section (info["x_pos"] > 2440 and info["x_pos"] <= 2500). Since moving right can lead Mario down the wrong path, the agent often learns to do nothing to avoid losing rewards. This penalty encourages exploration to find the hidden brick.
    - Same as in 4-4 and 7-4, I set done = True and assign a -50 penalty when the agent moves in the wrong direction. At the hardest part (info["x_pos"] > 2440 and info["x_pos"] <= 2500), I increased the penalty to -100.
    - Add a 50 reward when the agent goes the correct way (avoiding the wrong path).
    - Another strategy is giving a +50 reward when the agent finds the hidden brick. I've tried both methods, and they both work.
    - Since this map has many locations with overlapping x_pos values (especially in the underwater section where x_pos is reset to 1), be careful when modifying the custom reward system.

In [None]:
# Initialize Super Mario environment (in v0.26 change render mode to 'human' to see results on the screen)
# Smoke test: build one raw environment for the configured world/stage, take a
# single step and print what comes back.
# The gym version is compared numerically: comparing the raw strings is
# lexicographic and misorders versions such as '0.9' and '0.26'.
_gym_major_minor = tuple(int(part) for part in gym.__version__.split('.')[:2])
if _gym_major_minor < (0, 26):
    env = gym_super_mario_bros.make(f"SuperMarioBros-{config.world}-{config.stage}-v0", new_step_api=True)
else:
    env = gym_super_mario_bros.make(f"SuperMarioBros-{config.world}-{config.stage}-v0", render_mode='rgb', apply_api_compatibility=True)

# Restrict the controller to the discrete COMPLEX_MOVEMENT action set.
env = JoypadSpace(env, COMPLEX_MOVEMENT)
print(env.action_space)

env.reset()
# step() is expected to return the 5-tuple (obs, reward, done, truncated, info).
next_state, reward, done, trunc, info = env.step(action=0)
print(f"{next_state.shape},\n {reward},\n {done},\n {info}")

class SkipFrame(gym.Wrapper):
    """Wrapper that repeats every action for a fixed number of inner steps."""

    def __init__(self, env, skip):
        """Wrap `env`; each `step` call repeats the action up to `skip` times."""
        super().__init__(env)
        self._skip = skip

    def step(self, action):
        """Apply `action` up to `skip` times and return the last inner result.

        The rewards of the inner steps are added together. Repetition stops
        early as soon as an inner step reports `done`.
        """
        reward_sum = 0.0
        for _ in range(self._skip):
            obs, step_reward, done, trunk, info = self.env.step(action)
            reward_sum += step_reward
            if done:
                # Episode is over: do not step a finished environment.
                break
        return obs, reward_sum, done, trunk, info

class GrayScaleResizeObservation(gym.ObservationWrapper):
    """Convert RGB frames to grayscale and resize them to a fixed size.

    The most recent unprocessed frame is kept in ``self.current_state`` so
    callers can still get at the original RGB image (e.g. to record a video).
    """

    def __init__(self, env, shape):
        """Wrap ``env``.

        Args:
            env: environment producing RGB image observations.
            shape: target size, either an int (square output) or a
                ``(rows, cols)`` sequence.
        """
        super().__init__(env)
        if isinstance(shape, int):
            self.shape = (shape, shape)
        else:
            self.shape = tuple(shape)

        # observation() returns a 2-D grayscale image, so the declared space
        # must not carry over the colour-channel axis of the wrapped space.
        self.observation_space = Box(low=0, high=255, shape=self.shape, dtype=np.uint8)

    def observation(self, observation):
        """Return ``observation`` as a uint8 grayscale image of shape ``self.shape``."""
        self.current_state = observation
        observation = cv2.cvtColor(observation, cv2.COLOR_RGB2GRAY)
        # cv2.resize takes dsize as (width, height), i.e. (cols, rows), which
        # is the reverse of the (rows, cols) array shape stored in self.shape.
        observation = cv2.resize(observation, self.shape[::-1], interpolation=cv2.INTER_AREA)
        return observation.astype(np.uint8)

class NoopResetEnv(gym.Wrapper):
    """Start every episode with a random number of randomly sampled actions."""

    def __init__(self, env, noop_max=30):
        """Wrap `env`; `noop_max` is the exclusive upper bound on warm-up steps."""
        super().__init__(env)
        self.noop_max = noop_max

    def reset(self, **kwargs):
        """Reset, then take a random number of random actions in [0, noop_max).

        If a warm-up action ends the episode, the environment is reset again
        and the remaining warm-up steps continue from the fresh episode.
        """
        obs = self.env.reset(**kwargs)
        warmup_steps = np.random.randint(0, self.noop_max, (1, ))[0]
        for _ in range(warmup_steps):
            random_action = self.env.action_space.sample()
            obs, _, episode_over, _, _ = self.env.step(random_action)
            if episode_over:
                obs = self.env.reset(**kwargs)
        return obs

    def step(self, ac):
        """Forward `ac` to the wrapped environment unchanged."""
        obs, reward, done, trunk, info = self.env.step(ac)
        return obs, reward, done, trunk, info

class CustomRewardAndDoneEnv(gym.Wrapper):
    """Reward-shaping wrapper with stage-specific penalties and early termination.

    For every stage: +50 (and ``done``) when the flag is reached, -50 when the
    episode ends without the flag while the timer is non-zero, and the final
    reward is divided by 10. Stages 4-2, 4-4, 7-4 and 8-4 add extra rules
    keyed on the ``x_pos`` / ``y_pos`` values reported in ``info``.
    """

    def __init__(self, env=None, world=1, stage=1):
        """Wrap ``env``; ``world`` and ``stage`` select the stage-specific rules."""
        super(CustomRewardAndDoneEnv, self).__init__(env)
        self.current_score = 0
        self.current_x = 0
        self.current_x_count = 0
        self.max_x = 0
        self.world = world
        self.stage = stage
        if self.world == 8 and self.stage == 4:
            # Becomes True once the one-time 8-4 bonus in step() has been paid.
            self.sea_map = False

    def reset(self, **kwargs):
        """Clear the per-episode trackers and reset the wrapped environment."""
        self.current_score = 0
        self.current_x = 0
        self.current_x_count = 0
        self.max_x = 0
        if self.world == 8 and self.stage == 4:
            self.sea_map = False
        return self.env.reset(**kwargs)

    def step(self, action):
        """Step the wrapped env and return the result with the shaped reward.

        Returns:
            ``(state, reward / 10, done, trunc, info)`` where ``reward`` and
            ``done`` may have been altered by the rules described on the class.
        """
        state, reward, done, trunc, info = self.env.step(action)

        # Count consecutive steps without horizontal movement.
        if (info['x_pos'] - self.current_x) == 0:
            self.current_x_count += 1
        else:
            self.current_x_count = 0
        if info["flag_get"]:
            reward += 50
            done = True
        # Episode ended without the flag and not by running out of time.
        if done and not info["flag_get"] and info["time"] != 0:
            reward -= 50
            done = True
        self.current_x = info["x_pos"]

        if self.world == 7 and self.stage == 4:
            # Position windows that mean Mario took a wrong path.
            if (506 <= info["x_pos"] <= 832 and info["y_pos"] > 127) or (
                    832 < info["x_pos"] <= 1064 and info["y_pos"] < 80) or (
                    1113 < info["x_pos"] <= 1464 and info["y_pos"] < 191) or (
                    1579 < info["x_pos"] <= 1943 and info["y_pos"] < 191) or (
                    1946 < info["x_pos"] <= 1964 and info["y_pos"] >= 191) or (
                    1984 < info["x_pos"] <= 2060 and (info["y_pos"] >= 191 or info["y_pos"] < 127)) or (
                    2114 < info["x_pos"] < 2440 and info["y_pos"] < 191):
                reward -= 50
                done = True
            # x_pos fell far behind the furthest point reached: the map
            # looped, so end the episode without an extra penalty.
            if not done and info["x_pos"] < self.max_x - 100:
                done = True
        if self.world == 4 and self.stage == 4:
            if (info["x_pos"] <= 1500 and info["y_pos"] < 127) or (
                    1588 <= info["x_pos"] < 2380 and info["y_pos"] >= 127):
                # Wrong path: the reward is overwritten (not decremented) here.
                reward = -50
                done = True
            if not done and info["x_pos"] < self.max_x - 100:
                done = True
            if not done:
                # Small per-step cost so standing still is never optimal.
                reward -= 0.1
        if self.world == 4 and self.stage == 2 and not done and info['y_pos'] >= 255:
            reward -= 50
        if self.world == 8 and self.stage == 4:
            if info["x_pos"] > 2440 and info["x_pos"] <= 2500:
                done = True
                reward -= 100
            if info["x_pos"] >= 3675 and info["x_pos"] <= 3700:
                done = True
                reward -= 50

            if info["x_pos"] < self.max_x - 200:
                if self.max_x >= 1250 and self.max_x <= 1310: #solved bug because x_pos duplicated
                    if info["x_pos"] >= 320:
                        done = True
                        reward = -50
                elif info["x_pos"] >= 312-5 and info["x_pos"] <= 312+5:
                    done = True
                    reward = -50
                # Fixed: the upper bound was `56-5`, which collapsed this
                # window to x_pos == 51; it now mirrors the 312 +/- 5 window.
                elif info["x_pos"] >= 56-5 and info["x_pos"] <= 56+5 and self.max_x > 3650 and not self.sea_map:
                    reward += 50
                    self.sea_map = True
            if info["x_pos"] > self.max_x + 100:
                reward += 50
            if not done:
                reward -= 0.1
        self.max_x = max(self.max_x, self.current_x)
        self.current_score = info["score"]

        return state, reward / 10., done, trunc, info

# Create MultipleEnvironments
MultipleEnvironments uses multiprocessing to run several environments in parallel.

During training, an environment must be reset whenever its agent reaches a terminal state. Because the environments run in parallel, I don't want to loop over them to check and reset each one, or to write a separate function that checks and resets all environments in parallel. Instead, the step function resets the environment when done = True and returns next_state = env.reset(). The training loop can then simply set state = next_state (next_state is the reset state whenever done = True).

In [None]:
#modify from https://github.com/uvipen/Super-mario-bros-PPO-pytorch/blob/master/src/env.py
def create_env(world, stage, action_type, test=False):
    """Build one fully wrapped Super Mario environment.

    Args:
        world: world number used in the environment id.
        stage: stage number used in the environment id.
        action_type: "right" or "simple" select RIGHT_ONLY / SIMPLE_MOVEMENT;
            any other value selects COMPLEX_MOVEMENT.
        test: boolean flag; when true the random-action reset wrapper
            (NoopResetEnv) is left out.

    Returns:
        The environment wrapped, in order, with JoypadSpace, NoopResetEnv
        (training only), SkipFrame(4), CustomRewardAndDoneEnv,
        GrayScaleResizeObservation(84) and FrameStack(4).
    """
    # Compare the gym version numerically; comparing the raw strings is
    # lexicographic and misorders versions such as '0.9' and '0.26'.
    gym_major_minor = tuple(int(part) for part in gym.__version__.split('.')[:2])
    legacy_gym = gym_major_minor < (0, 26)

    env_id = f"SuperMarioBros-{world}-{stage}-v0"
    if legacy_gym:
        env = gym_super_mario_bros.make(env_id, new_step_api=True)
    else:
        env = gym_super_mario_bros.make(env_id, render_mode='rgb', apply_api_compatibility=True)

    if action_type == "right":
        movement = RIGHT_ONLY
    elif action_type == "simple":
        movement = SIMPLE_MOVEMENT
    else:
        movement = COMPLEX_MOVEMENT

    env = JoypadSpace(env, movement)

    if not test:
        env = NoopResetEnv(env)
    env = SkipFrame(env, skip=4)
    env = CustomRewardAndDoneEnv(env, world, stage)
    env = GrayScaleResizeObservation(env, shape=84)
    if legacy_gym:
        env = FrameStack(env, num_stack=4, new_step_api=True)
    else:
        env = FrameStack(env, num_stack=4)
    return env

class MultipleEnvironments:
    """Run several environments in parallel, one worker process per environment.

    Each worker is paired with the parent through a duplex pipe: the parent
    keeps ``agent_conns[i]`` and worker ``i`` serves requests arriving on
    ``env_conns[i]``.
    """

    def __init__(self, world, stage, action_type, num_envs):
        """Create ``num_envs`` environments and start one worker for each."""
        self.agent_conns, self.env_conns = zip(*[mp.Pipe(duplex=True) for _ in range(num_envs)])
        self.envs = [create_env(world, stage, action_type) for _ in range(num_envs)]

        started = []
        for index in range(num_envs):
            # Workers loop forever in run(); marking them daemonic lets the
            # interpreter exit instead of waiting on them indefinitely.
            process = mp.Process(target=self.run, args=(index,), daemon=True)
            process.start()
            # The parent never uses the worker's end of the pipe.
            self.env_conns[index].close()
            started.append(process)
        # Stored only after every worker has started, so the handles are not
        # part of the object state handed to the workers.
        self.processes = started

    def run(self, index):
        """Worker loop: serve "step" and "reset" requests for environment ``index``."""
        self.agent_conns[index].close()
        while True:
            request, action = self.env_conns[index].recv()
            if request == "step":
                next_state, reward, done, trunc, info = self.envs[index].step(action)
                if done:
                    # Auto-reset: the caller receives the first state of the
                    # next episode together with done=True.
                    next_state = self.envs[index].reset()
                self.env_conns[index].send((next_state, reward, done, trunc, info))
            elif request == "reset":
                self.env_conns[index].send(self.envs[index].reset())
            else:
                raise NotImplementedError

    def step(self, actions):
        """Send one action to every worker and gather the results.

        Returns:
            Five tuples ``(next_states, rewards, dones, truncs, infos)``, each
            with one entry per environment.
        """
        # Dispatch everything first so the workers step concurrently.
        for agent_conn, act in zip(self.agent_conns, actions):
            agent_conn.send(("step", act))
        next_states, rewards, dones, truncs, infos = zip(*[agent_conn.recv() for agent_conn in self.agent_conns])
        return next_states, rewards, dones, truncs, infos

    def reset(self):
        """Reset every environment and return the list of initial states."""
        for agent_conn in self.agent_conns:
            agent_conn.send(("reset", None))
        states = [agent_conn.recv() for agent_conn in self.agent_conns]
        return states

# RunningMeanStd
RunningMeanStd is used to normalize observations for random network distillation. See [random-network-distillation-pytorch](https://github.com/jcwleo/random-network-distillation-pytorch/blob/master/utils.py) for the original implementation.

In [None]:
#https://github.com/jcwleo/random-network-distillation-pytorch/blob/master/utils.py
class RunningMeanStd(object):
    """Running per-element mean and variance, merged batch by batch.

    Uses the parallel variance algorithm:
    https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
    """

    def __init__(self, epsilon=1e-4, shape=(), device="cpu"):
        """Start from mean 0 and variance 1 with a pseudo-count of ``epsilon``.

        Args:
            epsilon: initial sample count; keeps the first merge well defined.
            shape: shape of the tracked statistics.
            device: accepted for interface compatibility; the statistics are
                created on the CPU and are not moved here.
        """
        self.mean = torch.zeros(shape, dtype=torch.float32)
        self.var = torch.ones(shape, dtype=torch.float32)
        self.count = epsilon

    def update(self, x):
        """Merge a batch ``x`` (samples along dim 0) into the running statistics."""
        x = x.float()
        batch_mean = x.mean(0)
        # Population variance: the merge below weights variances by their
        # sample counts, and the default unbiased estimator would also yield
        # NaN for a single-row batch and poison the statistics for good.
        batch_var = torch.var(x, 0, unbiased=False)
        batch_count = x.shape[0]
        self.update_from_moments(batch_mean, batch_var, batch_count)

    def update_from_moments(self, batch_mean, batch_var, batch_count):
        """Merge precomputed batch moments into the running mean and variance."""
        delta = batch_mean - self.mean
        tot_count = self.count + batch_count

        new_mean = self.mean + delta * batch_count / tot_count
        # Combine the two sums of squared deviations, plus the correction
        # term for the distance between the two means.
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        M2 = m_a + m_b + torch.square(delta) * self.count * batch_count / tot_count
        new_var = M2 / tot_count

        self.mean = new_mean
        self.var = new_var
        self.count = tot_count

# Create memory
Memory simply stores all the information needed for training and returns everything it has stored.

In [None]:
class Memory():
    """Append-only rollout storage: one parallel list per recorded quantity."""

    def __init__(self, num_envs):
        """Remember `num_envs` and start with empty storage."""
        self.num_envs = num_envs
        self.reset()

    def save(self, state, action, reward, next_state, done, logit, value, value_int):
        """Append one entry to each of the eight lists."""
        records = (
            (self.states, state),
            (self.actions, action),
            (self.next_states, next_state),
            (self.rewards, reward),
            (self.dones, done),
            (self.logits, logit),
            (self.values, value),
            (self.values_int, value_int),
        )
        for storage, item in records:
            storage.append(item)

    def reset(self):
        """Replace every list with a fresh empty one.

        Lists handed out earlier by `get_data` are not modified.
        """
        self.states, self.actions, self.next_states, self.rewards = [], [], [], []
        self.dones, self.logits, self.values, self.values_int = [], [], [], []

    def get_data(self):
        """Return the eight lists: states, actions, next_states, rewards, dones, logits, values, values_int."""
        return (
            self.states,
            self.actions,
            self.next_states,
            self.rewards,
            self.dones,
            self.logits,
            self.values,
            self.values_int,
        )

# Create agent
The agent includes 5 main functions:
## train
train function train agent via many episodes:
- Reset the first state.
- loop until the agent wins this stage or reaches the maximum episode/step:
    - predict value, value_int, logit for the current state
    - sample action from logit with category distribution (select_action function)
    - log all info to memory
    - train agent every learn_step (learn function)
    - eval agent every save_figure_step (save_figure function)
    - set state = next_state (I reset environment when agent reach terminal state then next_state is first state if done=True)

## select_action
this function sample action from logit:
- just convert logit to probability: policy = F.softmax(logits, dim=1)
- create distribution from probability: distribution = torch.distributions.Categorical(policy)
- sample action from distribution: actions = distribution.sample()

## save_figure
this function eval agent and saves agent/video if the agent yields better total rewards:
- reset the environment.
- loop until the agent reaches the terminal state.
    - predict logit from model
    - get action = argmax (logit)
    - environment do this action to get next_state, reward, info, done
    - if total_reward > best test total reward or agent complete this stage, I save model and video.
    - if the agent completes this stage, we stop training.

## compure_reward_int
this function used to calculate intrinsic rewards:
- first, we need to normalize the observation before calculating the reward (using the observation running mean and std)
- second, we need to get the features of next_state from the target model and the predict model
- then, calculate intrinsic rewards = MSE between the features of the target model and the predict model
- finally, we need to normalize the intrinsic rewards (because I use min-max normalization, we need to calculate the intrinsic rewards for all next_states in memory before normalizing, so I normalize them in the learn function)

## learn
this function trains the agent from the experience saved in memory
- get all the info from memory
- calculate intrinsic rewards
- normalize intrinsic rewards
- calculate td (lambda) target and gae advantages
- train num_epoch epochs:
    - shuffle data
    - train with each batch data:
        - calculate loss
        - norm gradient
        - update model from loss

In [None]:
class Agent():
    def __init__(self, envs, world, stage, action_type, num_envs, state_dim, action_dim, save_dir, save_model_step,
                 save_figure_step, learn_step, total_step_or_episode, total_step, total_episode, model,
                 target_model, predict_model, gamma, gamma_int, learning_rate, entropy_coef, V_coef, max_grad_norm,
                 clip_param, batch_size, num_epoch, is_normalize_advantage, V_loss_type, target_kl, gae_lambda, int_adv_coef,
                 ext_adv_coef, device):
        """Store the settings, move the networks to `device` and build the optimizers.

        The arguments are kept as attributes of the same name. `model` is the
        network optimised by `self.optimizer`; `predict_model` is optimised by
        `self.feature_optimizer`; `target_model` gets no optimizer.
        """
        # Stage / environment description.
        self.world, self.stage = world, stage
        self.action_type = action_type
        self.state_dim, self.action_dim = state_dim, action_dim
        self.num_envs = num_envs
        self.envs = envs
        self.env = None  # evaluation environment, created on first use in save_figure

        # Training schedule and progress counters.
        self.learn_step = learn_step
        self.total_step_or_episode = total_step_or_episode
        self.total_step = total_step
        self.total_episode = total_episode
        self.save_model_step = save_model_step
        self.save_figure_step = save_figure_step
        self.save_dir = save_dir
        self.current_step = 0
        self.current_episode = 0
        self.is_completed = False
        self.max_test_score = -1e9

        # Networks: moved to the device before the optimizers are created.
        self.device = device
        self.model = model.to(self.device)
        self.target_model = target_model.to(self.device)
        self.predict_model = predict_model.to(self.device)

        # Optimisation hyperparameters.
        self.learning_rate = learning_rate
        self.gamma, self.gamma_int = gamma, gamma_int
        self.entropy_coef = entropy_coef
        self.V_coef = V_coef
        self.max_grad_norm = max_grad_norm
        self.clip_param = clip_param
        self.batch_size = batch_size
        self.num_epoch = num_epoch
        self.is_normalize_advantage = is_normalize_advantage
        self.V_loss_type = V_loss_type
        self.target_kl = target_kl
        self.gae_lambda = gae_lambda
        self.int_adv_coef = int_adv_coef
        self.ext_adv_coef = ext_adv_coef

        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate)
        self.feature_optimizer = torch.optim.Adam(self.predict_model.parameters(), lr=self.learning_rate)

        self.memory = Memory(self.num_envs)

        # Ring buffers holding the 1000 most recent loss values for logging.
        history_size = 1000
        self.V_loss = np.zeros(history_size)
        self.V_int_loss = np.zeros(history_size)
        self.P_loss = np.zeros(history_size)
        self.E_loss = np.zeros(history_size)
        self.approx_kl_divs = np.zeros(history_size)
        self.total_loss = np.zeros(history_size)
        self.loss_index = 0
        self.len_loss = 0

        # Running statistics of single 84x84 frames, used to normalise observations.
        self.obs_rms = RunningMeanStd(shape=(1, 1, 84, 84), device=self.device)

    def save_figure(self, is_training = False):
        """Play one greedy evaluation episode and keep the best result.

        The evaluation environment is created lazily with the test flag set.
        Actions are the argmax of the policy logits; the episode also stops
        once the same action has been repeated 200 times in a row. A video
        (``test_episode.mp4``) is written when the episode beats the best
        score so far or reaches the flag; with ``is_training`` a log line is
        appended to ``logging_test.txt`` and the weights go to
        ``best_model.pth``. Reaching the flag sets ``self.is_completed``.
        """
        # create env for testing, reset test env
        if self.env is None:
            self.env = create_env(self.world, self.stage, self.action_type, True)
        state = self.env.reset()
        done = False

        images = []
        total_reward = 0
        total_step = 0
        num_repeat_action = 0
        old_action = -1

        episode_time = datetime.now()

        # play 1 episode, always taking the action with max probability until the episode ends.
        while not done:
            with torch.no_grad():
                logit, _, _ = self.model(torch.tensor(np.array(state), dtype = torch.float, device = self.device).unsqueeze(0))
            action = logit.argmax(-1).item()
            next_state, reward, done, trunc, info = self.env.step(action)
            state = next_state
            # current_state is the raw frame kept by GrayScaleResizeObservation.
            img = Image.fromarray(self.env.current_state)
            images.append(img)
            total_reward += reward
            total_step += 1

            if action == old_action:
                num_repeat_action += 1
            else:
                num_repeat_action = 0
            old_action = action
            # Guard against an agent that is stuck repeating one action.
            if num_repeat_action == 200:
                break

        # logging; the context manager closes the file even if the write fails
        if is_training:
            with open("logging_test.txt", "a") as f_out:
                f_out.write(f'episode_reward: {total_reward:.4f} episode_step: {total_step} current_step: {self.current_step} loss_p: {(self.P_loss.sum()/self.len_loss):.4f} loss_v: {(self.V_loss.sum()/self.len_loss):.4f} loss_v_int: {(self.V_int_loss.sum()/self.len_loss):.4f} loss_e: {(self.E_loss.sum()/self.len_loss):.4f} loss: {(self.total_loss.sum()/self.len_loss):.4f} approx_kl_div: {(self.approx_kl_divs.sum()/self.len_loss):.4f} episode_time: {datetime.now() - episode_time}\n')

        # if the model yields a better result, save the video (test_episode.mp4) and model (best_model.pth)
        if total_reward > self.max_test_score or info['flag_get']:
            imageio.mimsave('test_episode.mp4', images)
            self.max_test_score = total_reward
            if is_training:
                torch.save(self.model.state_dict(), "best_model.pth")

        if info['flag_get']:
            self.is_completed = True

    def save_model(self):
        """Write the policy network's weights to ``model_<current_step>.pth``."""
        checkpoint_path = f"model_{self.current_step}.pth"
        weights = self.model.state_dict()
        torch.save(weights, checkpoint_path)

    def load_model(self, model_path = None):
        if model_path is None:
            model_path = f"model_{self.current_step}.pth"
        self.model.load_state_dict(torch.load(model_path))

    def update_loss_statis(self, loss_p, loss_v, loss_v_int, loss_e, loss, approx_kl_div):
        # update loss for logging, just save 1000 latest updates.
        self.V_loss[self.loss_index] = loss_v
        self.V_int_loss[self.loss_index] = loss_v_int
        self.P_loss[self.loss_index] = loss_p
        self.E_loss[self.loss_index] = loss_e
        self.total_loss[self.loss_index] = loss
        self.approx_kl_divs[self.loss_index] = approx_kl_div
        self.loss_index = (self.loss_index + 1)%1000
        self.len_loss = min(self.len_loss+1, 1000)

    def select_action(self, states):
        # select action when training, we need use Categorical distribution to make action base on probability from model
        states = torch.tensor(np.array(states), device = self.device)

        with torch.no_grad():
            logits, Values, values_int = self.model(states)
            policy = F.softmax(logits, dim=1)
            distribution = torch.distributions.Categorical(policy)
            actions = distribution.sample().cpu().numpy().tolist()
        return actions, logits, Values, values_int

    def compure_reward_int(self, next_state):
        # compute intrinsic rewards = MSE between features of target model and predict model 
        # we need normalize observation before calculate reward (by observation running mean std)
        next_state = torch.tensor(np.array(next_state), device = self.device)
        next_state = next_state[:, 3, :, :].reshape(-1, 1, next_state.shape[2], next_state.shape[3])
        next_state = ((next_state - self.obs_rms.mean.to(self.device)) / torch.sqrt(self.obs_rms.var.to(self.device))).clip(-5, 5)
        with torch.no_grad():
            target_features = self.target_model(next_state)
            features = self.predict_model(next_state)
        rewards_int = ((features - target_features)**2).mean(-1)
        return rewards_int.reshape(-1)

    def learn(self):
        # get all data
        states, actions, next_states, rewards, dones, old_logits, old_values, old_values_int = self.memory.get_data()

        # predict next_value and next_value_int for calculate advantage and td target
        targets = []
        targets_int = []
        with torch.no_grad():
            _, next_value, next_value_int = self.model(torch.tensor(np.array(next_states[-1]), device = self.device))
        target = next_value
        target_int = next_value_int
        advantage = 0
        advantage_int = 0

        # calculate intrinsic rewards, after get intrinsic rewards for all next_state, we need normalize intrinsic rewards. The best way to normalize intrinsic rewards is (min, max) normalizazion (I find this at https://opendilab.github.io/DI-engine/12_policies/rnd.html).
        rewards_int = []
        with torch.no_grad():
            for next_state in np.transpose(np.array(next_states), axes = [1, 0, 2, 3, 4]):
                reward_int = self.compure_reward_int(next_state)
                reward_int = reward_int.cpu().numpy().tolist()
                rewards_int.append(reward_int)
        rewards_int = np.transpose(np.array(rewards_int), axes = [1, 0])
        rewards_int = (rewards_int - rewards_int.min()) / (rewards_int.max() - rewards_int.min() + 1e-11)
        rewards_int = rewards_int.astype(np.float32)

        # calculate advantage and td target. We need calculate for both reward and intrinsic rewards.
        for state, next_state, reward, reward_int, done, value, value_int in zip(states[::-1], next_states[::-1], rewards[::-1], rewards_int[::-1], dones[::-1], old_values[::-1], old_values_int[::-1]):
            done = torch.tensor(done, device = self.device, dtype = torch.float).reshape(-1, 1)
            reward = torch.tensor(reward, device = self.device).reshape(-1, 1)
            reward_int = torch.tensor(reward_int, device = self.device).reshape(-1, 1)

            target = next_value * self.gamma * (1-done) + reward
            advantage = target + self.gamma * advantage * (1-done) * self.gae_lambda
            targets.append(advantage)
            advantage = advantage - value.detach()
            next_value = value.detach()

            target_int = next_value_int * self.gamma_int * (1-done) + reward_int
            advantage_int = target_int + self.gamma_int * advantage_int * (1-done) * self.gae_lambda
            targets_int.append(advantage_int)
            advantage_int = advantage_int - value_int.detach()
            next_value_int = value_int.detach()

        # convert all data to tensor
        targets = targets[::-1]
        targets_int = targets_int[::-1]

        action_index = torch.flatten(torch.tensor(actions, device = self.device, dtype = torch.int64))
        states = torch.tensor(np.array(states), device = self.device)
        states = states.reshape((-1,  states.shape[2], states.shape[3], states.shape[4]))

        old_values = torch.cat(old_values, 0)
        old_values_int = torch.cat(old_values_int, 0)

        targets = torch.cat(targets, 0).view(-1, 1)
        targets_int = torch.cat(targets_int, 0).view(-1, 1)

        old_logits = torch.cat(old_logits, 0)
        old_probs = torch.softmax(old_logits, -1)
        index = torch.arange(0, len(old_probs), device = self.device)
        old_log_probs = (old_probs[index, action_index] + 1e-9).log()
        advantages = (targets - old_values).reshape(-1)
        advantages_int = (targets_int - old_values_int).reshape(-1)

        early_stopping = False

        # update observation running mean std
        next_states = torch.tensor(np.array(next_states), device = self.device)
        next_states = next_states.reshape(-1, next_states.shape[2], next_states.shape[3], next_states.shape[4])
        self.obs_rms.update(next_states.cpu()[:, 3, :, :].reshape(-1, 1, next_states.shape[2], next_states.shape[3]))

        #train num_epoch time
        for epoch in range(self.num_epoch):
            #shuffle data for each epoch
            shuffle_ids = torch.randperm(len(targets), dtype = torch.int64)
            for i in range(len(old_values)//self.batch_size):
                #train with batch_size data
                self.optimizer.zero_grad()
                self.feature_optimizer.zero_grad()
                start_id = i * self.batch_size
                end_id = min(len(shuffle_ids), (i+1) * self.batch_size)
                batch_ids = shuffle_ids[start_id:end_id]

                #predict logits and values from model
                logits, value, value_int = self.model(states[batch_ids])

                #calculate entropy and value loss (using mse or huber based on config)
                probs =  torch.softmax(logits, -1)
                entropy = (- (probs * (probs + 1e-9).log()).sum(-1)).mean()
                if self.V_loss_type == 'huber':
                    loss_V = F.smooth_l1_loss(value, targets[batch_ids])
                    loss_V_int = F.smooth_l1_loss(value_int, targets_int[batch_ids])
                else:
                    loss_V = F.mse_loss(value, targets[batch_ids])
                    loss_V_int = F.mse_loss(value_int, targets_int[batch_ids])
                index = torch.arange(0, len(probs), device = self.device)
                batch_action_index = action_index[batch_ids]

                log_probs = (probs[index, batch_action_index] + 1e-9).log()

                #approx_kl_div copy from https://stable-baselines3.readthedocs.io/en/master/_modules/stable_baselines3/ppo/ppo.html#PPO
                #if approx_kl_div larger than 1.5 * target_kl (if target_kl in config is not None), stop training because policy change so much
                with torch.no_grad():
                    log_ratio = log_probs - old_log_probs[batch_ids]
                    approx_kl_div = torch.mean((torch.exp(log_ratio) - 1) - log_ratio).cpu().numpy()
                if self.target_kl is not None and approx_kl_div > 1.5 * self.target_kl:
                    early_stopping = True

                #calculate policy loss
                ratio = torch.exp(log_probs - old_log_probs[batch_ids])

                batch_advantages = self.ext_adv_coef * advantages[batch_ids].detach() + self.int_adv_coef * advantages_int[batch_ids].detach()
                if self.is_normalize_advantage:
                    batch_advantages = (batch_advantages - batch_advantages.mean()) / (batch_advantages.std() + 1e-9)
                surr1 = ratio * batch_advantages
                surr2 = torch.clamp(ratio, 1.0 - self.clip_param, 1.0 + self.clip_param) * batch_advantages
                loss_P = -torch.min(surr1, surr2).mean()

                # calculate random distillation network loss
                batch_next_states = next_states[batch_ids]
                batch_next_states = batch_next_states[:, 3, :, :].reshape(-1, 1, batch_next_states.shape[2], batch_next_states.shape[3])
                batch_next_states = ((batch_next_states - self.obs_rms.mean.to(self.device)) / torch.sqrt(self.obs_rms.var.to(self.device))).clip(-5, 5)

                with torch.no_grad():
                    target_features = self.target_model(batch_next_states)
                features = self.predict_model(batch_next_states)

                update_proportion = 0.25
                mask = torch.rand(len(features)).to(self.device)
                mask = (mask < update_proportion).type(torch.FloatTensor).to(self.device)
                loss_RND = ((features - target_features)**2).mean(-1)
                loss_RND = (loss_RND * mask).sum() / torch.max(mask.sum(), torch.Tensor([1]).to(self.device))

                # update model
                loss = loss_V * self.V_coef + loss_V_int * self.V_coef + loss_P - entropy * self.entropy_coef

                self.update_loss_statis(loss_P.item(), loss_V.item(), loss_V_int.item(), entropy.item(), loss.item(), approx_kl_div.item())

                if early_stopping == False:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.max_grad_norm)
                    self.optimizer.step()

                    loss_RND.backward()
                    self.feature_optimizer.step()
                else:
                    break
            if early_stopping:
                break

    def train(self):
        """Run the interaction/optimisation loop until the configured budget is spent.

        Every iteration steps all environments once with the actions returned by
        ``self.select_action``, stores the transition in ``self.memory`` and
        appends one line to ``logging.txt`` for each environment whose episode
        just ended. Periodic work is keyed on ``self.current_step``:

        * every ``learn_step`` steps: ``self.learn()`` followed by
          ``self.memory.reset()``;
        * every ``save_figure_step`` steps: ``self.save_figure(is_training=True)``;
          if ``self.is_completed`` is truthy afterwards the method returns
          immediately, without writing the final summary line;
        * every ``save_model_step`` steps: ``self.save_model()``.

        The loop ends when ``current_step`` reaches ``total_step`` (when
        ``total_step_or_episode == 'step'``) or when ``current_episode`` reaches
        ``total_episode`` (otherwise). A summary line is then appended to
        ``logging.txt``.
        """
        log_path = "logging.txt"

        episode_reward = [0] * self.num_envs
        episode_step = [0] * self.num_envs
        max_episode_reward = 0
        max_episode_step = 0
        episode_time = [datetime.now() for _ in range(self.num_envs)]
        total_time = datetime.now()

        last_episode_rewards = []

        # reset envs
        states = self.envs.reset()

        while True:
            # stop once the step budget or the episode budget is reached
            self.current_step += 1

            if self.total_step_or_episode == 'step':
                if self.current_step >= self.total_step:
                    break
            elif self.current_episode >= self.total_episode:
                break

            actions, logit, value, value_int = self.select_action(states)

            next_states, rewards, dones, _truncs, infos = self.envs.step(actions)

            # save to memory
            self.memory.save(states, actions, rewards, next_states, dones, logit, value, value_int)

            episode_reward = [x + reward for x, reward in zip(episode_reward, rewards)]
            episode_step = [x + 1 for x in episode_step]

            # one log line per environment whose episode just finished
            for i, done in enumerate(dones):
                if not done:
                    continue

                self.current_episode += 1
                max_episode_reward = max(max_episode_reward, episode_reward[i])
                max_episode_step = max(max_episode_step, episode_step[i])
                last_episode_rewards.append(episode_reward[i])

                # mean over (at most) the 100 most recent finished episodes
                mean_reward = np.array(last_episode_rewards[-100:]).mean()
                flag_reached = infos[i]["flag_get"] == True  # noqa: E712 - keeps the logged value unchanged
                now = datetime.now()
                line = (
                    f'episode: {self.current_episode} agent: {i} '
                    f'rewards: {episode_reward[i]:.4f} steps: {episode_step[i]} '
                    f'complete: {flag_reached} '
                    f'mean_rewards: {mean_reward:.4f} '
                    f'max_rewards: {max_episode_reward:.4f} max_steps: {max_episode_step} '
                    f'current_step: {self.current_step} '
                    f'loss_p: {(self.P_loss.sum()/self.len_loss):.4f} '
                    f'loss_v: {(self.V_loss.sum()/self.len_loss):.4f} '
                    f'loss_v_int: {(self.V_int_loss.sum()/self.len_loss):.4f} '
                    f'loss_e: {(self.E_loss.sum()/self.len_loss):.4f} '
                    f'loss: {(self.total_loss.sum()/self.len_loss):.4f} '
                    f'approx_kl_div: {(self.approx_kl_divs.sum()/self.len_loss):.4f} '
                    f'episode_time: {now - episode_time[i]} '
                    f'total_time: {now - total_time}\n'
                )
                # context manager closes the handle even if the write fails
                with open(log_path, "a") as f_out:
                    f_out.write(line)

                episode_reward[i] = 0
                episode_step[i] = 0
                episode_time[i] = datetime.now()

            # training agent every learn_step
            if self.current_step % self.learn_step == 0:
                self.learn()
                self.memory.reset()

            # eval agent every save_figure_step
            if self.current_step % self.save_figure_step == 0:
                self.save_figure(is_training=True)
                if self.is_completed:
                    return

            if self.current_step % self.save_model_step == 0:
                self.save_model()

            states = list(next_states)

        # With no finished episode the mean is undefined: report nan directly
        # instead of letting numpy warn about the mean of an empty array.
        recent_rewards = last_episode_rewards[-100:]
        mean_reward = np.array(recent_rewards).mean() if recent_rewards else float("nan")
        with open(log_path, "a") as f_out:
            f_out.write(f' mean_rewards: {mean_reward} max_rewards: {max_episode_reward} max_steps: {max_episode_step} current_step: {self.current_step} total_time: {datetime.now() - total_time}\n')

# Create model
## Actor Critic Model
I use the same model architecture as [PPO](https://github.com/CVHvn/Mario_PPO).
Model includes:
- 4 convolution layers to encode input image (observation) to feature vector.
- 1 hidden linear layer.
- three linear output heads: one for the policy (actor) and two for value prediction (an extrinsic critic and an intrinsic critic).

## RND model
Both networks share the same convolutional encoder and differ only in their linear layers:
- 4 convolution layers to encode input image (observation) to feature vector.
- 1 or 2 hidden linear layers:
    - 1 hidden linear layer for target network (random network or non-learning network).
    - 2 hidden linear layers for feature network (learning network)

In [None]:
class Model(nn.Module):
    """Actor-critic network with a policy head and two value heads.

    Four stride-2 convolutions encode the observation, one linear layer maps
    the flattened features to 512 units, and three linear heads produce the
    action logits, the extrinsic value and the intrinsic value.

    Args:
        input_dim: ``(channels, height, width)`` of one observation. The
            flattened feature size is derived from it (1152 for 84x84 input).
        output_dim: number of actions (size of the logits vector).
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        # Layers are created in the same order as before so that seeded
        # initialisation and checkpoint keys stay unchanged.
        self.conv1 = nn.Conv2d(input_dim[0], 32, 3, stride=2, padding=1)
        self.conv2 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
        self.conv3 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
        self.conv4 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
        self.linear = nn.Linear(self._flat_features(input_dim[1], input_dim[2]), 512)
        self.int_critic = nn.Linear(512, 1)
        self.ext_critic = nn.Linear(512, 1)
        self.actor_linear = nn.Linear(512, output_dim)

    @staticmethod
    def _flat_features(height, width, num_convs=4, out_channels=32):
        """Return the flattened size after the conv stack (kernel 3, stride 2, padding 1)."""
        for _ in range(num_convs):
            # floor((n + 2*1 - 3) / 2) + 1
            height = (height - 1) // 2 + 1
            width = (width - 1) // 2 + 1
        return out_channels * height * width

    def forward(self, x):
        """Return ``(action_logits, extrinsic_value, intrinsic_value)`` for a batch.

        The input is divided by 255 before the first convolution (presumably
        raw 0-255 pixel intensities - confirm against the environment wrappers).
        No activation is applied after ``self.linear``.
        """
        x = x / 255.
        for conv in (self.conv1, self.conv2, self.conv3, self.conv4):
            x = F.relu(conv(x))
        x = x.view(x.size(0), -1)
        x = self.linear(x)
        return self.actor_linear(x), self.ext_critic(x), self.int_critic(x)

class Feature_Model(nn.Module):
    """RND predictor network: conv encoder followed by two linear layers.

    The first convolution takes a single-channel image; the learning step
    feeds it only frame index 3 of each stacked state.

    Args:
        input_dim: ``(channels, height, width)`` of the stacked state. Only
            height and width are used, to size the first linear layer
            (1152 inputs for 84x84 frames); the channel count is ignored.
        output_dim: unused, kept for signature compatibility. The output is
            always a 512-dimensional feature vector.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        # Creation order is kept so seeded initialisation and checkpoint keys
        # stay unchanged.
        self.conv1 = nn.Conv2d(1, 32, 3, stride=2, padding=1)
        self.conv2 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
        self.conv3 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
        self.conv4 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
        self.linear1 = nn.Linear(self._flat_features(input_dim[1], input_dim[2]), 512)
        self.linear2 = nn.Linear(512, 512)

    @staticmethod
    def _flat_features(height, width, num_convs=4, out_channels=32):
        """Return the flattened size after the conv stack (kernel 3, stride 2, padding 1)."""
        for _ in range(num_convs):
            # floor((n + 2*1 - 3) / 2) + 1
            height = (height - 1) // 2 + 1
            width = (width - 1) // 2 + 1
        return out_channels * height * width

    def forward(self, x):
        """Return the 512-dimensional predicted features for a batch of single-channel frames."""
        for conv in (self.conv1, self.conv2, self.conv3, self.conv4):
            x = F.relu(conv(x))
        x = x.view(x.size(0), -1)
        x = F.relu(self.linear1(x))
        return self.linear2(x)

class Target_Model(nn.Module):
    """RND target network: conv encoder followed by a single linear layer.

    The first convolution takes a single-channel image; the learning step
    feeds it only frame index 3 of each stacked state, and evaluates this
    network under ``torch.no_grad()``.

    Args:
        input_dim: ``(channels, height, width)`` of the stacked state. Only
            height and width are used, to size the linear layer (1152 inputs
            for 84x84 frames); the channel count is ignored.
        output_dim: unused, kept for signature compatibility. The output is
            always a 512-dimensional feature vector.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        # Creation order is kept so seeded initialisation and checkpoint keys
        # stay unchanged.
        self.conv1 = nn.Conv2d(1, 32, 3, stride=2, padding=1)
        self.conv2 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
        self.conv3 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
        self.conv4 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
        self.linear = nn.Linear(self._flat_features(input_dim[1], input_dim[2]), 512)

    @staticmethod
    def _flat_features(height, width, num_convs=4, out_channels=32):
        """Return the flattened size after the conv stack (kernel 3, stride 2, padding 1)."""
        for _ in range(num_convs):
            # floor((n + 2*1 - 3) / 2) + 1
            height = (height - 1) // 2 + 1
            width = (width - 1) // 2 + 1
        return out_channels * height * width

    def forward(self, x):
        """Return the 512-dimensional target features for a batch of single-channel frames."""
        for conv in (self.conv1, self.conv2, self.conv3, self.conv4):
            x = F.relu(conv(x))
        x = x.view(x.size(0), -1)
        return self.linear(x)

# train

In [None]:
# Build the three networks handed to the agent. All of them are constructed
# from the same (state_dim, action_dim) pair taken from the config:
#   model         - policy plus extrinsic/intrinsic value heads
#   target_model  - RND target network (evaluated under torch.no_grad() in learn)
#   predict_model - RND predictor network (trained against the target features)
model = Model(config.state_dim, config.action_dim)
target_model = Target_Model(config.state_dim, config.action_dim)
predict_model = Feature_Model(config.state_dim, config.action_dim)

In [None]:
# Create the environment collection for the configured world, stage and action
# type; num_envs is the last argument (presumably the number of parallel
# environments - confirm against MultipleEnvironments).
envs = MultipleEnvironments(config.world, config.stage, config.action_type, config.num_envs)

In [None]:
# Assemble the agent from the environments, the three networks and the
# hyperparameters in `config`. The first CUDA device is used when available,
# otherwise the CPU.
agent = Agent(
    # environments
    envs=envs,
    world=config.world,
    stage=config.stage,
    action_type=config.action_type,
    num_envs=config.num_envs,
    state_dim=config.state_dim,
    action_dim=config.action_dim,
    # bookkeeping / schedule
    save_dir=config.save_dir,
    save_model_step=config.save_model_step,
    save_figure_step=config.save_figure_step,
    learn_step=config.learn_step,
    total_step_or_episode=config.total_step_or_episode,
    total_step=config.total_step,
    total_episode=config.total_episode,
    # networks
    model=model,
    target_model=target_model,
    predict_model=predict_model,
    # optimisation hyperparameters
    gamma=config.gamma,
    gamma_int=config.gamma_int,
    learning_rate=config.learning_rate,
    entropy_coef=config.entropy_coef,
    V_coef=config.V_coef,
    max_grad_norm=config.max_grad_norm,
    clip_param=config.clip_param,
    batch_size=config.batch_size,
    num_epoch=config.num_epoch,
    is_normalize_advantage=config.is_normalize_advantage,
    V_loss_type=config.V_loss_type,
    target_kl=config.target_kl,
    gae_lambda=config.gae_lambda,
    ext_adv_coef=config.ext_adv_coef,
    int_adv_coef=config.int_adv_coef,
    device="cuda:0" if torch.cuda.is_available() else "cpu",
)

In [None]:
# Start training. train() appends progress lines to logging.txt and runs until
# the configured step/episode budget is reached (or returns early when
# agent.is_completed is set after a save_figure call).
agent.train()

# test

In [None]:
# Evaluation cell: load the checkpoint file "best_model.pth", then call
# save_figure() with its default arguments (train() calls it with
# is_training=True instead).
agent.load_model("best_model.pth")
agent.save_figure()