In [30]:
import numpy as np

arr = np.array([1 + 2j, 3+1j])

print(arr.max())

(3+1j)


In [29]:
import pandas as pd
import numpy as np

# Sample dataframe (replace with your actual dataframe)
data = {
    'from_account': [101, 101, 101, 101, 101, 101],
    'to_account': [102, 101, 103, 104, 103, 102],
    'amount': [100, 200, 150, 50, 75, 300]
}
df = pd.DataFrame(data)

# Create a pivot table where the rows and columns are account numbers
transaction_matrix = pd.pivot_table(df, 
                                   index='from_account', 
                                   columns='to_account', 
                                   values='amount', 
                                   aggfunc=['mean', 'std'],  # mean # max min std median
                                   fill_value=0)

print(transaction_matrix.xs('mean', level = 0, axis=0))

# Ensure the matrix has all accounts as rows and columns (4x4 matrix)
# Get all unique accounts in sorted order
unique_accounts = sorted(set(df['from_account']).union(set(df['to_account'])))

# Reindex the matrix to match the sorted account list (ensures 4x4 matrix)
transaction_matrix = transaction_matrix.loc['mean'].reindex(index=unique_accounts, columns=unique_accounts, fill_value=0)

# Convert the transaction matrix to a numpy array
transaction_matrix_array = transaction_matrix.to_numpy()
transaction_matrix_array += transaction_matrix_array.T

# Display the resulting 4x4 matrix as a numpy array
print(transaction_matrix_array)

# Calculate the eigenvalues and eigenvectors of A
eigenvalues = np.linalg.eigvals(transaction_matrix_array)
print("Eigenvalues of A:", eigenvalues.max())




TypeError: Index must be a MultiIndex

In [1]:
import numpy as np

# Define the matrix A
A = np.array([[4, 1], 
              [2, 3]])

# Calculate the determinant of A
det_A = np.linalg.det(A)
print("Determinant of A:", det_A)

# Calculate the eigenvalues and eigenvectors of A
eigenvalues, eigenvectors = np.linalg.eig(A)

print("Eigenvalues of A:", eigenvalues)
print("Eigenvectors of A:", eigenvectors)


Determinant of A: 10.000000000000002
Eigenvalues of A: [5. 2.]
Eigenvectors of A: [[ 0.70710678 -0.4472136 ]
 [ 0.70710678  0.89442719]]


In [None]:
import numpy as np

# Define the matrix A
A = np.array([[0,  2, 1, -4, -5], 
              [2,  0, 0,  0, 0],
              [1,  0, 0,  0, 0],
              [-4, 0, 0,  0, 0],
              [-5, 0, 0,  0, 0]])

# Calculate the determinant of A
det_A = np.linalg.det(A)
print("Determinant of A:", det_A)

# Calculate the eigenvalues and eigenvectors of A
eigenvalues, eigenvectors = np.linalg.eig(A)

print("Eigenvalues of A:", eigenvalues)
print("Eigenvectors of A:")
print(eigenvectors)


Determinant of A: 0.0
Eigenvalues of A: [ 22.04540769 -22.04540769   0.           0.           0.        ]
Eigenvectors of A:
[[ 9.87759340e-02  9.87759340e-02 -1.70445698e-18  3.55574272e-18
  -1.10715782e-19]
 [ 9.94685953e-01 -9.94685953e-01 -4.25177547e-01  8.86981595e-01
   9.24967437e-01]
 [ 4.48056735e-03 -4.48056735e-03  9.05069631e-01  2.78396105e-03
   2.90318686e-03]
 [-1.79222694e-02  1.79222694e-02  5.33800356e-03  4.61586544e-01
  -1.16127474e-02]
 [-2.24028368e-02  2.24028368e-02  6.67250446e-03 -1.39198052e-02
   3.79857810e-01]]


In [32]:
import numpy as np


# Define the matrix A
A = np.array([[1, 5, 2, 1, 4], 
              [5, 2, 0, 0, 9],
              [2, 0, 1, 5, 0],
              [1, 0, 5, 1, 0],
              [4, 9, 0, 0, 1]])


# Calculate the determinant of A
det_A = np.linalg.det(A)
print("Determinant of A:", det_A)

# Calculate the eigenvalues and eigenvectors of A
eigenvalues, eigenvectors = np.linalg.eig(A)

print("Eigenvalues of A:", eigenvalues)
print("Eigenvectors of A:")
print(eigenvectors)


Determinant of A: -6561.000000000005
Eigenvalues of A: [13.84364188  6.30199871 -2.34787471 -7.56204804 -4.23571783]
Eigenvectors of A:
[[ 0.4589336   0.13782702 -0.81529082  0.09039938 -0.3122699 ]
 [ 0.64674377 -0.14805716  0.23633503 -0.70705302  0.06342985]
 [ 0.10062527  0.69126353 -0.10024329 -0.02268766  0.70814252]
 [ 0.07490554  0.67788487  0.39323673  0.00269082 -0.61661893]
 [ 0.59612596 -0.14734187  0.33876656  0.70098645  0.12953543]]


<a href="https://colab.research.google.com/github/Brownwang0426/Reversal-Generative-Reinforcement-Learning/blob/main/train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setting up (for colab)

In [None]:
!sudo apt-get install python3.10
!pip install torch==2.0.1 
!pip install numpy==1.25.2 scipy==1.11.4 swig==4.2.1 ufal.pybox2d==2.3.10.3 gymnasium==1.0.0 minigrid==3.0.0 tqdm==4.67.1 dill==0.3.8

In [None]:
!git clone https://github.com/Brownwang0426/Reversal-Generative-Reinforcement-Learning.git

In [None]:
import os
os.chdir('/content/Reversal-Generative-Reinforcement-Learning')

# Setting up (for local)
CUDA Toolkit 11.8 \
cuDNN 8.9.x \
pip install torch==2.0.1 --extra-index-url https://download.pytorch.org/whl/cu118  \
pip install numpy==1.25.2 scipy==1.11.4 swig==4.2.1 ufal.pybox2d==2.3.10.3 gymnasium==1.0.0 minigrid==3.0.0 tqdm==4.67.1 dill==0.3.8

# Importing modules

In [1]:
import gymnasium as gym
from gymnasium.wrappers import TimeLimit
import minigrid

import numpy as np
import math
from scipy.special import softmax

import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.utils.rnn as rnn_utils
from torch.utils.data import DataLoader, TensorDataset, Subset

import csv

import multiprocessing as mp
import os
import sys
import copy
import random
import gc
import time
from tqdm import tqdm
from collections import defaultdict

import itertools

import dill

import warnings
warnings.filterwarnings('ignore')

import concurrent.futures
import hashlib

# Checking cuda

In [2]:
if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        print(f"Device {i}: {torch.cuda.get_device_name(i)}")
    device_index = 0
    device = torch.device(f"cuda:{device_index}")
    print('using cuda...')
else:
    device = torch.device("cpu")
    print('using cpu...')

torch.backends.cudnn.enabled = True
torch.backends.cudnn.benchmark = True

Device 0: NVIDIA GeForce RTX 4090
using cuda...


# Control board

Crucial configurations regarding how your agent will learn in the environment. The meanings are as follow:
(the configs starting with ⚠️ are what we suggest you must tune according to your specific need in your task)
(the configs starting with ◀️ are what we suggest you to play with to see the effect)



## Configs meaning
| Configs   | Type   | Description                                                                 |
|------------|--------|-----------------------------------------------------------------------------|
| ⚠️game_name  | STR| The name of the environment.                                |
| ⚠️max_steps_for_each_episode | +INT | The maximun steps that the agent will go through while not done. In some environments, it is crucial to increase your "max_steps_for_each_episode" so that your agent can "live long enough" to obatin some better rewards to gradually and heuristically learn better strategy.                    |
| ⚠️seed | +INT/None | The seed for environment. None for random environment each episode.                    |
| load_pretrained_model  | BOLEAN |Whether you want to load previous trained model.                          |
| ◀️ensemble_size  | +INT | The size of the neural ensemble which the agent is comprised of. The bigger, the better, but the longer training time without parallel training. :-D                  |
| ⚠️state_size  | +INT | The size of the state as input data.                    |
| ⚠️action_size   | +INT | The size of action per step as input data.   |
| ⚠️reward_size  | +INT |The size of the reward as output data.                          |
| ⚠️feature_size   | +INT |The size of the hidden layers.       |
| ⚠️history_size  | 0/+INT |How many steps in the history for state and action will the agent take into consideration.                           |
| ⚠️future_size  | +INT |The length of the sequence of actions. Namely, how many steps in the future the agent will predict or use to discern the present best action.                |
| ⚠️neural_type  | STR |  [**`rnn`**, **`gru`**, **`lstm`**, **`td`**] The type of neural network you prefer. For now, we support rnn, gru, lstm, and td (Transformer decoder only). More to come in the future (or you can build one yourself :-D in the models repository).           |
| ⚠️num_layers  | +INT |The number of layers in rnn, gru, lstm, and td (Transformer decoder only).     |
| ⚠️num_heads  | +INT/None |The number of heads in multi-head attention (Should be able to devide feature_size) (Should be None for non-attention neural_type).                         |
| init   | STR | [**`random_normal`**, **`random_uniform`**, **`xavier_normal`**, **`xavier_uniform`**, **`glorot_normal`**, **`glorot_uniform`**] The initialization method you prefer for initiating neural net ensemble of your agent.                          |
| opti   | STR | [**`adam`**, **`sgd`**, **`rmsprop`**]  The optimization method you prefer.             |
| loss  | STR | [**`mean_squared_error`**, **`binary_crossentropy`**] The loss or error function you prefer.                           |
| bias  | BOLEAN |Whether you want add bias.                          |
| drop_rate   | 0/+FLOAT |The drop-rate for drop-out.              |
| ⚠️alpha   | 0/+FLOAT |The learning rate for neural networks weight matrices.                           |
| iteration_for_learning   | +INT |The iteration for learning.              |
| init_   | STR | [**`random_normal`**, **`random_uniform`**, **`xavier_normal`**, **`xavier_uniform`**, **`glorot_normal`**, **`glorot_uniform`**] The initialization method you prefer for initiating actions of your agent.                         |
| greed_epsilon_t  |  +INT |The times applying gaussian noise to the initializated actions of the agent, similar to diffusion model's adding gaussian noise.          |
| ⚠️greed_epsilon_init  |  +FLOAT |The initial greed_epsilon or noise range to initializate the actions of the agent. The higher the value is, the more exploration-oriented the agent will be in the begining.                    |
| ⚠️greed_epsilon_min  |  +FLOAT |A very small number representing the lower bound of the greed_epsilon.        |
| greed_epsilon_decay  |  +FLOAT | The rate of decaying for greed_epsilon for each step and eposide.  |
| ⚠️beta  |  0/+FLOAT |The updating rate for updating actions of the agent.              |
| loss_scale  |  +FLOAT |The scaler for loss in nn for each consecutive time step. The higher, the more weight is given to the loss in later time steps.|
| iteration_for_deducing  |  +INT |The iteration for updating actions of the agent.                           |
| episode_for_training  | +INT |How many epsiodes will your agent run in the training mode where your agent will learn offline.              |
| ⚠️buffer_limit  | +INT |The maximum size for your buffer.              |
| PER_epsilon  | 0/+FLOAT |The epsilon for prioritized experience replay.              |
| PER_exponent  | 0/+FLOAT |The expoenet for prioritized experience replay.                           |
| episode_for_testing  | +INT |How many epsiodes will your agent run in the testing mode where your agent will not learn offline.                        |
| render_for_human  | BOLEAN | Wether you want to render the visual result for each step in the testing mode.              |




## dualism principles

| neural weights | neural actions (and experiences) |
|----------|----------|
| Re-updated   | Not re-updated. New ones are created and stored each time   |
| Stable initialization   | Unstable initialization at first and then stable initialization gradually   |

## frozen lake

In [3]:
game_name =  'FrozenLake-v1'         #⚠️   gym.make(game_name, max_episode_steps=max_steps_for_each_episode, is_slippery=False, map_name="4x4")
max_steps_for_each_episode = 25      #⚠️
seed = None                          #⚠️

load_pretrained_model = True

ensemble_size = 5                    #◀️

state_size =  16                     #⚠️
action_size = 4                      #⚠️
reward_size = 100                    #⚠️
feature_size = 200                   #⚠️
history_size  = 0                    #⚠️
future_size = 10                     #⚠️
neural_type = 'td'                   #⚠️
num_layers = 2                       #⚠️
num_heads = 10                       #⚠️
init = "xavier_normal"
opti = 'sgd'
loss = 'mean_squared_error'
bias = False
drop_rate = 0.0
alpha = 0.1                          #⚠️
iteration_for_learning = 500

init_ = "random_uniform"
greed_epsilon_t     = 1
greed_epsilon_init  = 1.5            #⚠️
greed_epsilon_min   = 0.000001       #⚠️
greed_epsilon_decay = 0.995
beta = 0.1                           #⚠️
loss_scale = 1.1
iteration_for_deducing = 25

episode_for_training = 100000

buffer_limit = 1000                  #⚠️

PER_epsilon  = 0.000001
PER_exponent = 1.1

sequence_size = history_size + future_size 


## blackjack

In [4]:
game_name = 'Blackjack-v1'           #⚠️
max_steps_for_each_episode = 10      #⚠️
seed = None                          #⚠️

load_pretrained_model = True

ensemble_size = 5                    #◀️

state_size =  201                    #⚠️
action_size = 2                      #⚠️
reward_size = 100                    #⚠️
feature_size = 250                   #⚠️
history_size  = 0                    #⚠️
future_size = 5                      #⚠️
neural_type = 'td'                   #⚠️
num_layers = 2                       #⚠️
num_heads = 10                       #⚠️
init = "xavier_normal"
opti = 'sgd'
loss = 'mean_squared_error'
bias = False
drop_rate = 0.0
alpha = 0.1                          #⚠️
iteration_for_learning = 500

init_ = "random_uniform"
greed_epsilon_t     = 1
greed_epsilon_init  = 1.5            #⚠️
greed_epsilon_min   = 0.000001       #⚠️
greed_epsilon_decay = 0.995
beta = 0.1                           #⚠️
loss_scale = 1.1
iteration_for_deducing = 25

episode_for_training = 100000

buffer_limit = 1000                  #⚠️

PER_epsilon  = 0.000001
PER_exponent = 1.1

sequence_size = history_size + future_size 


## cartpole

In [5]:
game_name = 'CartPole-v1'            #⚠️
max_steps_for_each_episode = 1000    #⚠️
seed = None                          #⚠️

load_pretrained_model = True

ensemble_size = 5                    #◀️

state_size =  400                    #⚠️
action_size = 2                      #⚠️
reward_size = 100                    #⚠️
feature_size = 500                   #⚠️
history_size  = 0                    #⚠️
future_size = 25                     #⚠️
neural_type = 'gru'                  #⚠️
num_layers = 2                       #⚠️
num_heads = None                     #⚠️
init = "xavier_normal"
opti = 'sgd'
loss = 'mean_squared_error'
bias = False
drop_rate = 0.0
alpha = 0.1                          #⚠️
iteration_for_learning = 500

init_ = "random_uniform"
greed_epsilon_t     = 1
greed_epsilon_init  = 1.5            #⚠️
greed_epsilon_min   = 0.000001       #⚠️
greed_epsilon_decay = 0.995
beta = 0.1                           #⚠️
loss_scale = 1.1
iteration_for_deducing = 25

episode_for_training = 100000

buffer_limit = 1000                  #⚠️

PER_epsilon  = 0.000001
PER_exponent = 1.1

sequence_size = history_size + future_size 


In [6]:
game_name = 'CartPole-v1'            #⚠️
max_steps_for_each_episode = 1000    #⚠️
seed = None                          #⚠️

load_pretrained_model = True

ensemble_size = 5                    #◀️

state_size =  400                    #⚠️
action_size = 2                      #⚠️
reward_size = 100                    #⚠️
feature_size = 500                   #⚠️
history_size  = 0                    #⚠️
future_size = 25                     #⚠️
neural_type = 'td'                   #⚠️
num_layers = 2                       #⚠️
num_heads = 10                       #⚠️
init = "xavier_normal"
opti = 'sgd'
loss = 'mean_squared_error'
bias = False
drop_rate = 0.0
alpha = 0.1                          #⚠️
iteration_for_learning = 500

init_ = "random_uniform"
greed_epsilon_t     = 1
greed_epsilon_init  = 1.5            #⚠️
greed_epsilon_min   = 0.000001       #⚠️
greed_epsilon_decay = 0.995
beta = 0.1                           #⚠️
loss_scale = 1.1
iteration_for_deducing = 25

episode_for_training = 100000

buffer_limit = 1000                  #⚠️

PER_epsilon  = 0.000001
PER_exponent = 1.1

sequence_size = history_size + future_size 


## mountain car

In [7]:
game_name =  'MountainCar-v0'        #⚠️
max_steps_for_each_episode = 200     #⚠️
seed = None                          #⚠️

load_pretrained_model = True

ensemble_size = 5                    #◀️

state_size =  200                    #⚠️
action_size = 3                      #⚠️
reward_size = 100                    #⚠️
feature_size = 300                   #⚠️
history_size  = 0                    #⚠️
future_size = 25                     #⚠️
neural_type = 'td'                   #⚠️
num_layers = 2                       #⚠️
num_heads = 10                       #⚠️
init = "xavier_normal"
opti = 'sgd'
loss = 'mean_squared_error'
bias = False
drop_rate = 0.0
alpha = 0.1                          #⚠️
iteration_for_learning = 500

init_ = "random_uniform"
greed_epsilon_t     = 1
greed_epsilon_init  = 1.5            #⚠️
greed_epsilon_min   = 0.000001       #⚠️
greed_epsilon_decay = 0.995
beta = 0.1                           #⚠️
loss_scale = 1.1
iteration_for_deducing = 25

episode_for_training = 100000

buffer_limit = 1000                  #⚠️

PER_epsilon  = 0.000001
PER_exponent = 1.1

sequence_size = history_size + future_size 


## acrobot

In [8]:
game_name = 'Acrobot-v1'             #⚠️
max_steps_for_each_episode = 200     #⚠️
seed = None                          #⚠️

load_pretrained_model = True

ensemble_size = 5                    #◀️

state_size =  600                    #⚠️
action_size = 3                      #⚠️
reward_size = 100                    #⚠️
feature_size = 700                   #⚠️
history_size  = 0                    #⚠️
future_size = 25                     #⚠️
neural_type = 'td'                   #⚠️
num_layers = 2                       #⚠️
num_heads = 10                       #⚠️
init = "xavier_normal"
opti = 'sgd'
loss = 'mean_squared_error'
bias = False
drop_rate = 0.0
alpha = 0.1                          #⚠️
iteration_for_learning = 500

init_ = "random_uniform"
greed_epsilon_t     = 1
greed_epsilon_init  = 1.5            #⚠️
greed_epsilon_min   = 0.000001       #⚠️
greed_epsilon_decay = 0.995
beta = 0.1                           #⚠️
loss_scale = 1.1
iteration_for_deducing = 25

episode_for_training = 100000

buffer_limit = 1000                  #⚠️

PER_epsilon  = 0.000001
PER_exponent = 1.1

sequence_size = history_size + future_size 


## lunar lander

In [9]:
game_name = "LunarLander-v3"         #⚠️
max_steps_for_each_episode = 200     #⚠️
seed = None                          #⚠️

load_pretrained_model = True

ensemble_size = 5                    #◀️

state_size =  800                    #⚠️
action_size = 4                      #⚠️
reward_size = 250                    #⚠️
feature_size = 950                   #⚠️
history_size  = 0                    #⚠️
future_size = 25                     #⚠️
neural_type = 'td'                   #⚠️
num_layers = 2                       #⚠️
num_heads = 10                       #⚠️
init = "xavier_normal"
opti = 'sgd'
loss = 'mean_squared_error'
bias = False
drop_rate = 0.0
alpha = 0.1                          #⚠️
iteration_for_learning = 500

init_ = "random_uniform"
greed_epsilon_t     = 1
greed_epsilon_init  = 1.5            #⚠️
greed_epsilon_min   = 0.000001       #⚠️
greed_epsilon_decay = 0.995
beta = 0.1                           #⚠️
loss_scale = 1.1
iteration_for_deducing = 25

episode_for_training = 100000

buffer_limit = 1000                  #⚠️

PER_epsilon  = 0.000001
PER_exponent = 1.1

sequence_size = history_size + future_size 


## door key

In [10]:
game_name = "MiniGrid-DoorKey-5x5-v0"#⚠️
max_steps_for_each_episode = 50      #⚠️
seed = 1                             #⚠️

load_pretrained_model = True

ensemble_size = 5                    #◀️

state_size =  157                    #⚠️
action_size = 7                      #⚠️
reward_size = 100                    #⚠️
feature_size = 250                   #⚠️
history_size  = 15                   #⚠️
future_size = 8                      #⚠️
neural_type = 'td'                   #⚠️
num_layers = 2                       #⚠️
num_heads = 10                       #⚠️
init = "xavier_normal"
opti = 'sgd'
loss = 'mean_squared_error'
bias = False
drop_rate = 0.0
alpha = 0.1                          #⚠️
iteration_for_learning = 500

init_ = "random_uniform"
greed_epsilon_t     = 1
greed_epsilon_init  = 1.5            #⚠️
greed_epsilon_min   = 0.000001       #⚠️
greed_epsilon_decay = 0.995
beta = 0.1                           #⚠️
loss_scale = 1.1
iteration_for_deducing = 25

episode_for_training = 100000

buffer_limit = 1000                  #⚠️

PER_epsilon  = 0.000001
PER_exponent = 1.1

sequence_size = history_size + future_size 


## your present config

In [11]:
game_name = 'CartPole-v1'            #⚠️
max_steps_for_each_episode = 1000    #⚠️
seed = None                          #⚠️

load_pretrained_model = True

ensemble_size = 5                    #◀️

state_size =  400                    #⚠️
action_size = 2                      #⚠️
reward_size = 100                    #⚠️
feature_size = 500                   #⚠️
history_size  = 0                    #⚠️
future_size = 25                     #⚠️
neural_type = 'td'                   #⚠️
num_layers = 2                       #⚠️
num_heads = 10                       #⚠️
init = "xavier_normal"
opti = 'sgd'
loss = 'mean_squared_error'
bias = False
drop_rate = 0.0
alpha = 0.1                          #⚠️
iteration_for_learning = 1000

init_ = "random_uniform"
greed_epsilon_t     = 1
greed_epsilon_init  = 1              #⚠️
greed_epsilon_min   = 0.000001       #⚠️
greed_epsilon_decay = 1
beta = 0.1                           #⚠️
loss_scale = 1.1
iteration_for_deducing = 25

episode_for_training = 100000

buffer_limit = 25000                 #⚠️

PER_epsilon  = 0.000001
PER_exponent = 0.0

sequence_size = history_size + future_size 


In [12]:
episode_for_testing = 100
render_for_human = True

suffix                 = f"game={game_name}_type={neural_type}_ensemble={ensemble_size:05d}_learn={iteration_for_learning:05d}_deduce={iteration_for_deducing:05d}"
directory              = f'./result/{game_name}/'
performance_directory  = f'./result/{game_name}/performace_{suffix}.csv'
model_directory        = f'./result/{game_name}/model_{suffix}.pth'
buffer_directory       = f'./result/{game_name}/buffer_{suffix}.dill'

if not os.path.exists(directory):
    os.makedirs(directory)

# Importing local modules

In [13]:
game_modules = {
    'FrozenLake-v1': 'envs.env_frozenlake',
    'Blackjack-v1': 'envs.env_blackjack',
    'CartPole-v1': 'envs.env_cartpole',
    'MountainCar-v0': 'envs.env_mountaincar',
    'Acrobot-v1': 'envs.env_acrobot',
    'LunarLander-v3': 'envs.env_lunarlander',
    'MiniGrid-DoorKey-5x5-v0': 'envs.env_doorkey'
}
if game_name in game_modules:
    game_module = __import__(game_modules[game_name], fromlist=['vectorizing_state', 'vectorizing_action', 'vectorizing_reward'])
    vectorizing_state  = game_module.vectorizing_state
    vectorizing_action = game_module.vectorizing_action
    vectorizing_reward = game_module.vectorizing_reward
else:
    raise RuntimeError('Missing env functions')

In [14]:
model_modules = {
    'td': 'models.model_td',
    'rnn': 'models.model_rnn',
    'gru': 'models.model_rnn',
    'lstm': 'models.model_rnn'
}
if neural_type in model_modules:
    model_module = __import__(model_modules[neural_type], fromlist=['build_model'])
    build_model  = model_module.build_model
else:
    raise RuntimeError('Missing model functions')

from utils.util_func  import load_performance_from_csv,\
                             load_buffer_from_pickle,\
                             retrieve_history,\
                             retrieve_present,\
                             initialize_future_action, \
                             initialize_desired_reward,\
                             update_future_action, \
                             sequentialize, \
                             update_long_term_experience_replay_buffer,\
                             update_model_list,\
                             limit_buffer,\
                             save_performance_to_csv,\
                             save_buffer_to_pickle


# Deducing -> Learning
Training mode where your agent will learn offline. You can see here how your agent learn overtime and improve its performance.

## Creating or loading models

In [15]:

# creating empty log for recording performance
performance_log  = []

# setting the last episode number for performance log
last_episode = 0

# creating model list
model_list = []
for _ in range(ensemble_size):
    model = build_model(state_size,
                        action_size,
                        reward_size,
                        feature_size,
                        sequence_size,
                        neural_type,
                        num_layers,
                        num_heads,
                        init,
                        opti,
                        loss,
                        bias,
                        drop_rate,
                        alpha)
    model.to(device)
    model_list.append(model)

# creating space for storing tensors as experience replay buffer
history_state_stack        = torch.empty(0).to(device)
history_action_stack       = torch.empty(0).to(device)
present_state_stack        = torch.empty(0).to(device)
future_action_stack        = torch.empty(0).to(device)
future_reward_stack        = torch.empty(0).to(device)
future_state_stack         = torch.empty(0).to(device)
history_state_hash_list    = list()
history_action_hash_list   = list()
present_state_hash_list    = list()
future_action_hash_list    = list()
future_reward_hash_list    = list()
future_state_hash_list     = list()

# load from pre-trained models if needed
if load_pretrained_model == False:
    pass
elif load_pretrained_model == True:
    try:
        model_dict = torch.load(model_directory)
        for i, model in enumerate(model_list):
            model.load_state_dict(model_dict[f'model_{i}'])
        history_state_stack, \
        history_action_stack, \
        present_state_stack, \
        future_action_stack, \
        future_reward_stack, \
        future_state_stack , \
        history_state_hash_list, \
        history_action_hash_list, \
        present_state_hash_list, \
        future_action_hash_list, \
        future_reward_hash_list, \
        future_state_hash_list = load_buffer_from_pickle(buffer_directory)
        history_state_stack    = history_state_stack.to (device) 
        history_action_stack   = history_action_stack.to(device) 
        present_state_stack    = present_state_stack.to (device) 
        future_action_stack    = future_action_stack.to (device) 
        future_reward_stack    = future_reward_stack.to (device) 
        future_state_stack     = future_state_stack .to (device) 
        performance_log        = load_performance_from_csv(performance_directory)
        last_episode           = performance_log[-1][0] + 1 if len(performance_log) > 0 else 0
        greed_epsilon_init     = greed_epsilon_init * (greed_epsilon_decay ** last_episode)
        print('Loaded pre-trained models.')
    except:
        print('Failed loading pre-trained models. Now using new models.')
        pass

Failed loading pre-trained models. Now using new models.


## Putting all the previous works into play

In [None]:
"""
We dont randomize desired reward anymore because:
1 - It is not typical in RL.
2 - There are many more effective methods like epsilon-greedy, intrinsic motivation, and reward shaping that can drive an agent to explore effectively.
3 - Those methods are designed to balance exploration and exploitation in a way that promotes learning while keeping the agent on a meaningful path toward mastering the environment.
"""

"""
planning phase:
    (history_state) (history_action) present_state         future_action        desired_rewar
                                     -observed by agent    -planned by agent    -given by human
learning phase:
                                     present_state         future_action        future_reward             future_state
                                     -observed by agent    -executed by agent   -observed by agent        -observed by agent
                                                                                -criterion set by human
"""

# starting each episode
for training_episode in tqdm(range(episode_for_training)):

    # initializing summed reward
    summed_reward  = 0

    """
    We filled short term experience replay buffer with some dummy data to insure that history exists.
    """
    # initializing short term experience replay buffer
    state_list  = []
    action_list = []
    reward_list = []
    for _ in range(history_size):
        state_list .append(torch.zeros(state_size  ).to(device) - 1)
        action_list.append(torch.zeros(action_size ).to(device)    )
        reward_list.append(torch.zeros(reward_size ).to(device)    ) 

    # initializing environment
    env            = gym.make(game_name, max_episode_steps=max_steps_for_each_episode)
    state, info    = env.reset(seed = seed)
    
    # observing state
    state          = vectorizing_state(state, device)
    state_list.append(state)

    # starting each step
    for count in itertools.count(1):
        print(f'\rStep: {count}\r', end='', flush=True)

        """
        We let agent took some history states and actions into consideration.
        """
        """
        We also gave loss weight proportional to its position in time step for the agent to be more long-term-reward oriented.
        """
        # initializing and updating action                                    
        history_state, \
        history_action  = retrieve_history(state_list, action_list, history_size, device)
        present_state   = retrieve_present(state_list, device)
        future_action   = initialize_future_action(init_, greed_epsilon_t, greed_epsilon_init, (1, future_size, action_size), device)
        desired_reward  = initialize_desired_reward((1, future_size, reward_size), device)
        future_action   = update_future_action(iteration_for_deducing,
                                               model_list,
                                               history_state ,
                                               history_action,
                                               present_state,
                                               future_action,
                                               desired_reward,
                                               beta,
                                               loss_scale)
        action, action_  = vectorizing_action(future_action, device)
        action_list.append(action)

        # executing action
        state, reward, done, truncated, info = env.step(action_)

        # summing reward
        summed_reward += reward

        # observing actual reward
        reward = vectorizing_reward(state, reward, summed_reward, done, reward_size, device)
        reward_list.append(reward)

        # observing state
        state    = vectorizing_state(state, device)
        state_list.append(state)

        """
        We expanded the condition for terminating an episode to include the case where the count is smaller than the sum of the history and time sizes. 
        Though it is contrary to common practice in RL, this is for better handling the sequentialization of the short-term experience replay buffer with fixed window length.
        """
        # terminating episode if done or truncated
        if count >= (future_size):
            if done or truncated:
                print(f'Episode {training_episode + last_episode}: Summed_Reward = {summed_reward}')
                performance_log.append([training_episode + last_episode, summed_reward])
                break
            else:
                pass
        else:
            pass

    # closing env
    env.close()




    # sequentializing short term experience replay buffer
    history_state_list   ,\
    history_action_list   ,\
    present_state_list   ,\
    future_action_list   ,\
    future_reward_list   ,\
    future_state_list    = sequentialize(state_list  ,
                                         action_list ,
                                         reward_list ,
                                         history_size,
                                         future_size)




    # storing sequentialized short term experience to long term experience replay buffer when it is a new experience
    history_state_stack, \
    history_action_stack, \
    present_state_stack, \
    future_action_stack, \
    future_reward_stack, \
    future_state_stack , \
    history_state_hash_list  , \
    history_action_hash_list  , \
    present_state_hash_list  , \
    future_action_hash_list  , \
    future_reward_hash_list  , \
    future_state_hash_list   = update_long_term_experience_replay_buffer(history_state_stack,
                                                                         history_action_stack,
                                                                         present_state_stack,
                                                                         future_action_stack,
                                                                         future_reward_stack,
                                                                         future_state_stack ,
                                                                         history_state_hash_list  ,
                                                                         history_action_hash_list  ,
                                                                         present_state_hash_list  ,
                                                                         future_action_hash_list  ,
                                                                         future_reward_hash_list  ,
                                                                         future_state_hash_list   ,
                                                                         history_state_list   ,
                                                                         history_action_list   ,
                                                                         present_state_list,
                                                                         future_action_list,
                                                                         future_reward_list,
                                                                         future_state_list )
    



    """
    The true iteration is min(iteration_for_learning, len(present_state_stack))
    """
    # training with Prioritized Experience Replay (PER)
    model_list = update_model_list(iteration_for_learning ,
                                   history_state_stack,
                                   history_action_stack,
                                   present_state_stack,
                                   future_action_stack,
                                   future_reward_stack,
                                   future_state_stack ,
                                   model_list
                                   )




    """
    We also limit the buffer size by using priority to preserve some experiences according to buffer limit.
    """
    # limit_buffer
    history_state_stack, \
    history_action_stack, \
    present_state_stack, \
    future_action_stack, \
    future_reward_stack, \
    future_state_stack , \
    history_state_hash_list  , \
    history_action_hash_list  , \
    present_state_hash_list  , \
    future_action_hash_list  , \
    future_reward_hash_list  , \
    future_state_hash_list   = limit_buffer(history_state_stack,
                                            history_action_stack,
                                            present_state_stack,
                                            future_action_stack,
                                            future_reward_stack,
                                            future_state_stack ,
                                            history_state_hash_list  ,
                                            history_action_hash_list  ,
                                            present_state_hash_list  ,
                                            future_action_hash_list  ,
                                            future_reward_hash_list  ,
                                            future_state_hash_list ,
                                            buffer_limit  )




    # decreasing decay rate
    greed_epsilon_init = greed_epsilon_init * greed_epsilon_decay
    greed_epsilon_init = max( greed_epsilon_min, greed_epsilon_init )




    # saving final reward to log
    save_performance_to_csv(performance_log, performance_directory)

    # saving nn models
    model_dict = {}
    for i, model in enumerate(model_list):
        model_dict[f'model_{i}'] = model.state_dict()
    torch.save(model_dict, model_directory)

    # saving long term experience replay buffer
    save_buffer_to_pickle(buffer_directory,
                            history_state_stack,
                            history_action_stack,
                            present_state_stack,
                            future_action_stack,
                            future_reward_stack,
                            future_state_stack,
                            history_state_hash_list,
                            history_action_hash_list,
                            present_state_hash_list,
                            future_action_hash_list,
                            future_reward_hash_list,
                            future_state_hash_list)




    # clear up
    gc.collect()
    torch.cuda.empty_cache()

  0%|          | 0/100000 [00:00<?, ?it/s]

Episode 0: Summed_Reward = 10.0


  0%|          | 1/100000 [00:58<1630:48:13, 58.71s/it]

Episode 1: Summed_Reward = 39.0


  0%|          | 2/100000 [02:47<2445:33:25, 88.04s/it]

Episode 2: Summed_Reward = 48.0


  0%|          | 3/100000 [04:46<2846:17:09, 102.47s/it]

Episode 3: Summed_Reward = 20.0


  0%|          | 4/100000 [06:09<2630:31:03, 94.70s/it] 

Episode 4: Summed_Reward = 9.0


  0%|          | 5/100000 [07:25<2440:44:20, 87.87s/it]

Episode 5: Summed_Reward = 10.0


  0%|          | 6/100000 [08:46<2374:18:40, 85.48s/it]

Episode 6: Summed_Reward = 18.0


  0%|          | 7/100000 [10:08<2343:41:25, 84.38s/it]

Episode 7: Summed_Reward = 24.0


  0%|          | 8/100000 [11:23<2260:06:50, 81.37s/it]

Episode 8: Summed_Reward = 17.0


  0%|          | 9/100000 [12:35<2177:34:08, 78.40s/it]

Episode 9: Summed_Reward = 18.0


  0%|          | 10/100000 [13:55<2197:30:55, 79.12s/it]

Episode 10: Summed_Reward = 14.0


  0%|          | 11/100000 [15:10<2154:34:01, 77.57s/it]

Episode 11: Summed_Reward = 17.0


  0%|          | 12/100000 [16:24<2130:13:58, 76.70s/it]

Episode 12: Summed_Reward = 9.0


  0%|          | 13/100000 [17:38<2106:00:49, 75.83s/it]

Episode 13: Summed_Reward = 20.0


  0%|          | 14/100000 [18:48<2053:37:22, 73.94s/it]

Episode 14: Summed_Reward = 11.0


  0%|          | 15/100000 [19:58<2022:55:37, 72.84s/it]

Episode 15: Summed_Reward = 48.0


  0%|          | 16/100000 [22:07<2493:19:05, 89.77s/it]

Episode 16: Summed_Reward = 20.0


  0%|          | 17/100000 [23:32<2457:20:05, 88.48s/it]

Episode 17: Summed_Reward = 14.0


  0%|          | 18/100000 [25:03<2477:37:02, 89.21s/it]

Episode 18: Summed_Reward = 27.0


  0%|          | 19/100000 [26:49<2611:18:28, 94.02s/it]

Episode 19: Summed_Reward = 26.0


  0%|          | 20/100000 [28:33<2698:54:36, 97.18s/it]

Episode 20: Summed_Reward = 21.0


  0%|          | 21/100000 [30:20<2781:51:02, 100.17s/it]

Episode 21: Summed_Reward = 51.0


  0%|          | 22/100000 [32:51<3202:12:48, 115.31s/it]

Episode 22: Summed_Reward = 34.0


  0%|          | 23/100000 [35:12<3416:54:36, 123.04s/it]

Episode 23: Summed_Reward = 10.0


  0%|          | 24/100000 [36:55<3252:12:10, 117.11s/it]

Episode 24: Summed_Reward = 28.0


  0%|          | 25/100000 [38:43<3175:47:40, 114.36s/it]

Episode 25: Summed_Reward = 10.0


  0%|          | 26/100000 [40:37<3174:00:38, 114.29s/it]

Episode 26: Summed_Reward = 16.0


  0%|          | 27/100000 [42:42<3260:51:52, 117.42s/it]

Episode 27: Summed_Reward = 19.0


  0%|          | 28/100000 [44:42<3285:28:38, 118.31s/it]

Episode 28: Summed_Reward = 16.0


  0%|          | 29/100000 [46:42<3298:08:05, 118.77s/it]

Episode 29: Summed_Reward = 13.0


  0%|          | 30/100000 [48:48<3356:06:58, 120.86s/it]

Episode 30: Summed_Reward = 20.0


  0%|          | 31/100000 [50:50<3362:36:44, 121.09s/it]

Episode 31: Summed_Reward = 32.0


  0%|          | 32/100000 [53:17<3581:09:06, 128.96s/it]

Episode 32: Summed_Reward = 63.0


  0%|          | 33/100000 [56:52<4299:58:45, 154.85s/it]

Episode 33: Summed_Reward = 18.0


  0%|          | 34/100000 [58:59<4065:49:11, 146.42s/it]

Episode 34: Summed_Reward = 15.0


  0%|          | 35/100000 [1:01:06<3905:34:03, 140.65s/it]

Episode 35: Summed_Reward = 108.0


  0%|          | 36/100000 [1:06:43<5538:59:25, 199.48s/it]

Episode 36: Summed_Reward = 28.0


  0%|          | 37/100000 [1:09:45<5390:55:40, 194.15s/it]

Episode 37: Summed_Reward = 32.0


  0%|          | 38/100000 [1:13:15<5525:17:37, 198.99s/it]

Episode 38: Summed_Reward = 15.0


  0%|          | 39/100000 [1:16:28<5479:02:54, 197.32s/it]

Episode 39: Summed_Reward = 24.0


  0%|          | 40/100000 [1:19:20<5261:56:37, 189.51s/it]

Episode 40: Summed_Reward = 25.0


  0%|          | 41/100000 [1:22:09<5093:59:16, 183.46s/it]

Episode 41: Summed_Reward = 16.0


  0%|          | 42/100000 [1:25:02<5008:45:45, 180.39s/it]

Episode 42: Summed_Reward = 13.0


  0%|          | 43/100000 [1:27:52<4918:44:45, 177.15s/it]

Episode 43: Summed_Reward = 77.0


  0%|          | 44/100000 [1:33:01<6016:19:09, 216.68s/it]

Episode 44: Summed_Reward = 12.0


  0%|          | 45/100000 [1:36:17<5850:14:12, 210.70s/it]

Episode 45: Summed_Reward = 15.0


  0%|          | 46/100000 [1:39:27<5672:47:09, 204.31s/it]

Episode 46: Summed_Reward = 12.0


  0%|          | 47/100000 [1:43:11<5836:15:16, 210.20s/it]

Episode 47: Summed_Reward = 31.0


  0%|          | 48/100000 [1:47:15<6118:53:11, 220.39s/it]

Episode 48: Summed_Reward = 17.0


  0%|          | 49/100000 [1:50:56<6125:49:15, 220.64s/it]

Episode 49: Summed_Reward = 16.0


  0%|          | 50/100000 [1:54:35<6107:51:18, 219.99s/it]

Episode 50: Summed_Reward = 26.0


  0%|          | 51/100000 [1:57:58<5969:53:23, 215.03s/it]

Episode 51: Summed_Reward = 31.0


  0%|          | 52/100000 [2:02:10<6277:58:36, 226.12s/it]

Episode 52: Summed_Reward = 24.0


  0%|          | 53/100000 [2:05:59<6301:26:33, 226.97s/it]

Episode 53: Summed_Reward = 20.0


  0%|          | 54/100000 [2:09:48<6321:30:32, 227.70s/it]

Episode 54: Summed_Reward = 39.0


  0%|          | 55/100000 [2:14:43<6877:54:39, 247.74s/it]

Episode 55: Summed_Reward = 22.0


  0%|          | 56/100000 [2:18:54<6907:25:25, 248.81s/it]

Episode 56: Summed_Reward = 41.0


  0%|          | 57/100000 [2:23:39<7205:19:14, 259.54s/it]

Episode 57: Summed_Reward = 43.0


  0%|          | 58/100000 [2:28:11<7309:18:33, 263.29s/it]

Episode 58: Summed_Reward = 21.0


  0%|          | 59/100000 [2:31:52<6961:56:56, 250.78s/it]

Episode 59: Summed_Reward = 58.0


  0%|          | 60/100000 [2:36:58<7417:22:23, 267.19s/it]

Episode 60: Summed_Reward = 22.0


  0%|          | 61/100000 [2:41:05<7250:03:00, 261.16s/it]

Episode 61: Summed_Reward = 37.0


  0%|          | 62/100000 [2:45:37<7342:31:41, 264.50s/it]

Episode 62: Summed_Reward = 32.0


  0%|          | 63/100000 [2:50:08<7393:11:38, 266.32s/it]

Episode 63: Summed_Reward = 33.0


  0%|          | 64/100000 [2:54:53<7549:39:42, 271.96s/it]

Episode 64: Summed_Reward = 25.0


  0%|          | 65/100000 [2:59:36<7643:35:08, 275.35s/it]

Episode 65: Summed_Reward = 25.0


  0%|          | 66/100000 [3:03:40<7379:00:45, 265.82s/it]

Episode 66: Summed_Reward = 23.0


  0%|          | 67/100000 [3:07:51<7257:19:04, 261.44s/it]

Episode 67: Summed_Reward = 20.0


  0%|          | 68/100000 [3:12:05<7193:58:46, 259.16s/it]

Episode 68: Summed_Reward = 27.0


  0%|          | 69/100000 [3:16:32<7259:43:42, 261.53s/it]

Episode 69: Summed_Reward = 111.0


  0%|          | 70/100000 [3:25:11<9400:13:26, 338.65s/it]

Episode 70: Summed_Reward = 77.0


  0%|          | 71/100000 [3:33:15<10618:23:41, 382.53s/it]

Episode 71: Summed_Reward = 23.0


  0%|          | 72/100000 [3:39:12<10402:35:18, 374.76s/it]

Episode 72: Summed_Reward = 17.0


  0%|          | 73/100000 [3:45:14<10293:27:34, 370.84s/it]

Episode 73: Summed_Reward = 15.0


  0%|          | 74/100000 [3:51:06<10138:08:52, 365.24s/it]

Episode 74: Summed_Reward = 71.0


  0%|          | 75/100000 [3:59:17<11186:54:35, 403.03s/it]

Episode 75: Summed_Reward = 42.0


  0%|          | 76/100000 [4:06:34<11470:58:30, 413.27s/it]

Episode 76: Summed_Reward = 43.0


  0%|          | 77/100000 [4:14:12<11842:05:25, 426.64s/it]

Episode 77: Summed_Reward = 20.0


  0%|          | 78/100000 [4:20:50<11597:50:48, 417.85s/it]

Episode 78: Summed_Reward = 55.0


  0%|          | 79/100000 [4:28:46<12084:22:22, 435.38s/it]

Episode 79: Summed_Reward = 14.0


  0%|          | 80/100000 [4:35:53<12014:19:41, 432.86s/it]

Episode 80: Summed_Reward = 32.0


  0%|          | 81/100000 [4:43:06<12016:34:39, 432.95s/it]

Episode 81: Summed_Reward = 22.0


  0%|          | 82/100000 [4:50:09<11935:51:32, 430.04s/it]

Episode 82: Summed_Reward = 26.0


  0%|          | 83/100000 [4:57:19<11931:04:25, 429.88s/it]

Episode 83: Summed_Reward = 23.0


  0%|          | 84/100000 [5:04:25<11901:29:04, 428.81s/it]

Episode 84: Summed_Reward = 66.0


  0%|          | 85/100000 [5:13:37<12930:17:21, 465.89s/it]

Episode 85: Summed_Reward = 83.0


  0%|          | 86/100000 [5:23:25<13942:17:57, 502.35s/it]

Episode 86: Summed_Reward = 52.0


  0%|          | 87/100000 [5:32:30<14296:31:57, 515.12s/it]

Episode 87: Summed_Reward = 57.0


  0%|          | 88/100000 [5:42:02<14772:19:56, 532.27s/it]

Episode 88: Summed_Reward = 97.0


  0%|          | 89/100000 [5:53:57<16295:15:40, 587.15s/it]

Episode 89: Summed_Reward = 48.0


  0%|          | 90/100000 [6:04:14<16545:09:53, 596.16s/it]

Episode 90: Summed_Reward = 21.0


  0%|          | 91/100000 [6:13:17<16097:59:56, 580.06s/it]

Episode 91: Summed_Reward = 26.0


  0%|          | 92/100000 [6:22:30<15872:58:07, 571.95s/it]

Episode 92: Summed_Reward = 28.0


  0%|          | 93/100000 [6:31:54<15804:17:52, 569.48s/it]

Episode 93: Summed_Reward = 53.0


  0%|          | 94/100000 [6:42:21<16288:06:12, 586.92s/it]

Episode 94: Summed_Reward = 17.0


  0%|          | 95/100000 [6:51:03<15743:51:24, 567.32s/it]

Episode 95: Summed_Reward = 37.0


  0%|          | 96/100000 [6:59:54<15444:03:34, 556.52s/it]

Episode 96: Summed_Reward = 31.0


  0%|          | 97/100000 [7:08:48<15253:14:35, 549.65s/it]

Episode 97: Summed_Reward = 72.0


  0%|          | 98/100000 [7:19:06<15819:36:32, 570.06s/it]

Episode 98: Summed_Reward = 136.0


  0%|          | 99/100000 [7:31:43<17380:43:53, 626.33s/it]

Episode 99: Summed_Reward = 34.0


  0%|          | 100/100000 [7:39:49<16215:18:23, 584.34s/it]

Episode 100: Summed_Reward = 178.0


  0%|          | 101/100000 [7:53:37<18235:37:01, 657.15s/it]

Episode 101: Summed_Reward = 185.0


  0%|          | 102/100000 [8:08:11<20045:44:45, 722.38s/it]

Episode 102: Summed_Reward = 204.0


  0%|          | 103/100000 [8:24:31<22186:59:31, 799.56s/it]

Episode 103: Summed_Reward = 78.0


  0%|          | 104/100000 [8:35:46<21156:00:20, 762.41s/it]

Episode 104: Summed_Reward = 70.0


  0%|          | 105/100000 [8:46:49<20325:49:24, 732.50s/it]

Episode 105: Summed_Reward = 71.0


  0%|          | 106/100000 [8:57:19<19469:13:18, 701.64s/it]

Episode 106: Summed_Reward = 220.0


  0%|          | 107/100000 [9:12:52<21399:07:30, 771.19s/it]

Episode 107: Summed_Reward = 31.0


  0%|          | 108/100000 [9:21:18<19192:15:40, 691.67s/it]

Episode 108: Summed_Reward = 42.0


  0%|          | 109/100000 [9:29:55<17734:54:05, 639.15s/it]

Episode 109: Summed_Reward = 280.0


  0%|          | 110/100000 [9:46:49<20858:33:01, 751.73s/it]

Episode 110: Summed_Reward = 149.0


  0%|          | 111/100000 [9:59:46<21061:22:10, 759.05s/it]

Episode 111: Summed_Reward = 157.0


  0%|          | 112/100000 [10:12:18<21006:21:41, 757.08s/it]

Episode 112: Summed_Reward = 139.0


  0%|          | 113/100000 [10:24:14<20666:22:30, 744.83s/it]

Episode 113: Summed_Reward = 174.0


  0%|          | 114/100000 [10:37:27<21065:58:04, 759.24s/it]

Episode 114: Summed_Reward = 68.0


  0%|          | 115/100000 [10:47:17<19657:42:42, 708.49s/it]

Episode 115: Summed_Reward = 511.0


  0%|          | 116/100000 [11:13:06<26651:01:50, 960.55s/it]

Episode 116: Summed_Reward = 288.0


  0%|          | 117/100000 [11:30:48<27499:32:45, 991.14s/it]

Episode 117: Summed_Reward = 237.0


  0%|          | 118/100000 [11:46:31<27092:54:17, 976.50s/it]

Episode 118: Summed_Reward = 107.0


  0%|          | 119/100000 [11:57:42<24552:18:02, 884.94s/it]

Episode 119: Summed_Reward = 81.0


  0%|          | 120/100000 [12:07:52<22266:00:39, 802.54s/it]

Episode 120: Summed_Reward = 418.0


  0%|          | 121/100000 [12:30:11<26727:40:44, 963.36s/it]

Episode 121: Summed_Reward = 68.0


  0%|          | 122/100000 [12:40:05<23654:45:17, 852.61s/it]

Episode 122: Summed_Reward = 235.0


  0%|          | 123/100000 [12:55:30<24255:57:37, 874.29s/it]

Episode 123: Summed_Reward = 128.0


  0%|          | 124/100000 [13:07:47<23115:31:43, 833.19s/it]

Episode 124: Summed_Reward = 329.0


  0%|          | 125/100000 [13:26:56<25740:22:48, 927.81s/it]

Episode 125: Summed_Reward = 214.0


  0%|          | 126/100000 [13:42:04<25573:43:26, 921.82s/it]

Episode 126: Summed_Reward = 125.0


  0%|          | 127/100000 [13:53:23<23555:21:00, 849.07s/it]

Episode 127: Summed_Reward = 198.0


  0%|          | 128/100000 [14:07:39<23614:52:37, 851.23s/it]

Episode 128: Summed_Reward = 233.0


  0%|          | 129/100000 [14:23:16<24321:48:21, 876.72s/it]

Episode 129: Summed_Reward = 263.0


  0%|          | 130/100000 [14:40:25<25588:53:20, 922.40s/it]

Episode 130: Summed_Reward = 431.0


  0%|          | 131/100000 [15:03:01<29202:25:20, 1052.67s/it]

Episode 131: Summed_Reward = 287.0


  0%|          | 132/100000 [15:20:31<29176:11:50, 1051.73s/it]

Episode 132: Summed_Reward = 400.0


  0%|          | 133/100000 [15:41:43<31010:18:22, 1117.86s/it]

Episode 133: Summed_Reward = 327.0


  0%|          | 134/100000 [16:00:47<31229:20:14, 1125.76s/it]

Episode 134: Summed_Reward = 281.0


  0%|          | 135/100000 [16:18:25<30663:07:33, 1105.36s/it]

Episode 135: Summed_Reward = 314.0


  0%|          | 136/100000 [16:36:45<30616:05:26, 1103.68s/it]

Episode 136: Summed_Reward = 260.0


  0%|          | 137/100000 [16:53:58<30028:31:10, 1082.51s/it]

Episode 137: Summed_Reward = 245.0


  0%|          | 138/100000 [17:09:40<28861:57:56, 1040.47s/it]

Episode 138: Summed_Reward = 57.0


  0%|          | 139/100000 [17:19:03<24885:31:09, 897.13s/it] 

Episode 139: Summed_Reward = 533.0


  0%|          | 140/100000 [17:44:43<30239:19:50, 1090.14s/it]

Episode 140: Summed_Reward = 269.0


  0%|          | 141/100000 [18:01:38<29609:19:51, 1067.44s/it]

Episode 141: Summed_Reward = 162.0


  0%|          | 142/100000 [18:14:49<27311:45:58, 984.62s/it] 

Episode 142: Summed_Reward = 670.0


  0%|          | 143/100000 [18:46:39<35014:40:19, 1262.33s/it]

Episode 143: Summed_Reward = 365.0


  0%|          | 144/100000 [19:06:56<34635:19:07, 1248.67s/it]

Episode 144: Summed_Reward = 282.0


  0%|          | 145/100000 [19:24:44<33127:32:29, 1194.32s/it]

Episode 145: Summed_Reward = 151.0


  0%|          | 146/100000 [19:37:49<29720:05:10, 1071.49s/it]

Episode 146: Summed_Reward = 571.0


  0%|          | 147/100000 [20:05:50<34792:55:56, 1254.39s/it]

Episode 147: Summed_Reward = 326.0


  0%|          | 148/100000 [20:24:21<33597:34:41, 1211.31s/it]

Episode 148: Summed_Reward = 831.0


  0%|          | 149/100000 [21:01:16<41955:35:43, 1512.66s/it]

Episode 149: Summed_Reward = 545.0


  0%|          | 150/100000 [21:28:48<43114:13:01, 1554.44s/it]

Episode 150: Summed_Reward = 469.0


  0%|          | 151/100000 [21:52:21<41934:37:25, 1511.93s/it]

Episode 151: Summed_Reward = 328.0


  0%|          | 152/100000 [22:11:08<38729:38:44, 1396.39s/it]

Episode 152: Summed_Reward = 265.0


  0%|          | 153/100000 [22:28:21<35704:37:58, 1287.34s/it]

Episode 153: Summed_Reward = 235.0


  0%|          | 154/100000 [22:43:44<32678:20:49, 1178.23s/it]

Episode 154: Summed_Reward = 329.0


  0%|          | 155/100000 [23:03:28<32725:39:39, 1179.95s/it]

Episode 155: Summed_Reward = 413.0


  0%|          | 156/100000 [23:25:55<34114:50:59, 1230.05s/it]

Episode 156: Summed_Reward = 300.0


  0%|          | 157/100000 [23:44:06<32956:30:03, 1188.30s/it]

Episode 157: Summed_Reward = 255.0


  0%|          | 158/100000 [24:00:53<31443:48:45, 1133.77s/it]

Episode 158: Summed_Reward = 240.0


  0%|          | 159/100000 [24:17:00<30056:08:32, 1083.74s/it]

Episode 159: Summed_Reward = 267.0


  0%|          | 160/100000 [24:33:29<29271:53:04, 1055.48s/it]

Episode 160: Summed_Reward = 419.0


  0%|          | 161/100000 [24:55:36<31530:18:19, 1136.92s/it]

Episode 161: Summed_Reward = 525.0


  0%|          | 162/100000 [25:21:48<35151:41:57, 1267.51s/it]

Episode 162: Summed_Reward = 180.0


  0%|          | 163/100000 [25:35:36<31488:25:19, 1135.43s/it]

Episode 163: Summed_Reward = 127.0


  0%|          | 164/100000 [25:47:57<28210:43:18, 1017.25s/it]

Episode 164: Summed_Reward = 319.0


  0%|          | 165/100000 [26:06:20<28919:55:05, 1042.84s/it]

Episode 165: Summed_Reward = 672.0


  0%|          | 166/100000 [26:37:50<35971:18:38, 1297.12s/it]

Episode 166: Summed_Reward = 272.0


  0%|          | 167/100000 [26:55:22<33932:47:32, 1223.62s/it]

Episode 167: Summed_Reward = 433.0


  0%|          | 168/100000 [27:18:09<35123:18:40, 1266.57s/it]

Episode 168: Summed_Reward = 403.0


  0%|          | 169/100000 [27:39:45<35363:43:14, 1275.25s/it]

Episode 169: Summed_Reward = 328.0


  0%|          | 170/100000 [27:58:43<34227:59:25, 1234.31s/it]

Episode 170: Summed_Reward = 309.0


  0%|          | 171/100000 [28:16:58<33069:18:52, 1192.53s/it]

Episode 171: Summed_Reward = 251.0


  0%|          | 172/100000 [28:33:17<31290:23:45, 1128.40s/it]

Episode 172: Summed_Reward = 857.0


  0%|          | 173/100000 [29:11:46<41109:23:44, 1482.50s/it]

Episode 173: Summed_Reward = 595.0


  0%|          | 174/100000 [29:40:56<43331:07:24, 1562.64s/it]

Episode 174: Summed_Reward = 790.0


  0%|          | 175/100000 [30:16:52<48267:29:18, 1740.68s/it]

Episode 175: Summed_Reward = 214.0


  0%|          | 176/100000 [30:31:25<41051:31:52, 1480.46s/it]

Episode 176: Summed_Reward = 296.0


  0%|          | 177/100000 [30:48:58<37492:31:28, 1352.12s/it]

Episode 177: Summed_Reward = 367.0


  0%|          | 178/100000 [31:09:19<36401:37:02, 1312.79s/it]

Episode 178: Summed_Reward = 150.0


  0%|          | 179/100000 [31:21:32<31579:30:57, 1138.90s/it]

Episode 179: Summed_Reward = 361.0


  0%|          | 180/100000 [31:41:43<32183:33:30, 1160.70s/it]

Episode 180: Summed_Reward = 244.0


  0%|          | 181/100000 [31:57:53<30593:36:29, 1103.37s/it]

Episode 181: Summed_Reward = 292.0


  0%|          | 182/100000 [32:16:33<30733:17:44, 1108.42s/it]

Episode 182: Summed_Reward = 247.0


  0%|          | 183/100000 [32:32:34<29504:12:38, 1064.10s/it]

Episode 183: Summed_Reward = 418.0


  0%|          | 184/100000 [32:54:55<31807:05:37, 1147.17s/it]

Episode 184: Summed_Reward = 437.0


  0%|          | 185/100000 [33:17:32<33554:56:34, 1210.22s/it]

Episode 185: Summed_Reward = 333.0


  0%|          | 186/100000 [33:36:29<32946:58:51, 1188.30s/it]

Episode 186: Summed_Reward = 395.0


  0%|          | 187/100000 [33:57:32<33563:15:12, 1210.54s/it]

Episode 187: Summed_Reward = 609.0


  0%|          | 188/100000 [34:26:36<38002:27:58, 1370.67s/it]

Episode 188: Summed_Reward = 166.0


  0%|          | 189/100000 [34:39:31<33049:46:22, 1192.04s/it]

Episode 189: Summed_Reward = 845.0


  0%|          | 190/100000 [35:18:24<42538:49:44, 1534.31s/it]

Episode 190: Summed_Reward = 1000.0


  0%|          | 191/100000 [36:02:31<51794:12:45, 1868.16s/it]

Episode 191: Summed_Reward = 327.0


  0%|          | 192/100000 [36:21:58<45956:45:51, 1657.63s/it]

Episode 192: Summed_Reward = 124.0


  0%|          | 193/100000 [36:33:45<38048:34:33, 1372.40s/it]

Episode 193: Summed_Reward = 760.0


  0%|          | 194/100000 [37:09:00<44231:33:02, 1595.43s/it]

Episode 194: Summed_Reward = 568.0


  0%|          | 195/100000 [37:36:40<44764:12:58, 1614.66s/it]

Episode 195: Summed_Reward = 248.0


  0%|          | 196/100000 [37:52:49<39394:59:18, 1421.00s/it]

Episode 196: Summed_Reward = 254.0


  0%|          | 197/100000 [38:08:58<35638:34:31, 1285.52s/it]

Episode 197: Summed_Reward = 234.0


  0%|          | 198/100000 [38:23:56<32414:13:42, 1169.23s/it]

Episode 198: Summed_Reward = 397.0


  0%|          | 199/100000 [38:45:21<33376:59:12, 1203.97s/it]

Episode 199: Summed_Reward = 800.0


  0%|          | 200/100000 [39:22:05<41692:51:27, 1503.95s/it]

Episode 200: Summed_Reward = 324.0


  0%|          | 201/100000 [39:41:00<38621:57:59, 1393.19s/it]

Episode 201: Summed_Reward = 487.0


  0%|          | 202/100000 [40:06:37<39813:41:16, 1436.19s/it]

Episode 202: Summed_Reward = 758.0


  0%|          | 203/100000 [40:42:18<45674:45:42, 1647.64s/it]

Episode 203: Summed_Reward = 665.0


  0%|          | 204/100000 [41:13:28<47529:11:54, 1714.55s/it]

Episode 204: Summed_Reward = 243.0


  0%|          | 205/100000 [41:29:02<41032:48:36, 1480.22s/it]

Episode 205: Summed_Reward = 284.0


  0%|          | 206/100000 [41:46:31<37452:44:35, 1351.08s/it]

Episode 206: Summed_Reward = 179.0


  0%|          | 207/100000 [42:00:28<33172:21:53, 1196.68s/it]

Episode 207: Summed_Reward = 270.0


  0%|          | 208/100000 [42:16:46<31354:34:12, 1131.12s/it]

Episode 208: Summed_Reward = 463.0


  0%|          | 209/100000 [42:40:20<33706:40:44, 1215.98s/it]

Episode 209: Summed_Reward = 640.0


  0%|          | 210/100000 [43:10:48<38793:54:05, 1399.52s/it]

Episode 210: Summed_Reward = 397.0


  0%|          | 211/100000 [43:32:32<37999:15:05, 1370.87s/it]

Episode 211: Summed_Reward = 199.0


  0%|          | 212/100000 [43:46:50<33733:46:10, 1217.00s/it]

Episode 212: Summed_Reward = 285.0


  0%|          | 213/100000 [44:04:36<32483:03:27, 1171.89s/it]

Episode 213: Summed_Reward = 353.0


  0%|          | 214/100000 [44:24:26<32627:40:22, 1177.12s/it]

Episode 214: Summed_Reward = 236.0


  0%|          | 215/100000 [44:40:01<30613:56:47, 1104.48s/it]

Episode 215: Summed_Reward = 627.0


  0%|          | 216/100000 [45:09:31<36149:17:50, 1304.19s/it]

Episode 216: Summed_Reward = 180.0


  0%|          | 217/100000 [45:23:00<32031:58:13, 1155.66s/it]

Episode 217: Summed_Reward = 1000.0


  0%|          | 218/100000 [46:07:59<44861:14:33, 1618.53s/it]

Episode 218: Summed_Reward = 207.0


  0%|          | 219/100000 [46:22:57<38874:44:14, 1402.56s/it]

Episode 219: Summed_Reward = 460.0


  0%|          | 220/100000 [46:46:55<39168:46:11, 1413.18s/it]

Episode 220: Summed_Reward = 421.0


  0%|          | 221/100000 [47:09:31<38688:14:26, 1395.86s/it]

Episode 221: Summed_Reward = 285.0


  0%|          | 222/100000 [47:26:41<35651:30:32, 1286.31s/it]

Episode 222: Summed_Reward = 180.0


  0%|          | 223/100000 [47:40:52<32032:58:16, 1155.76s/it]

Step: 468

# Deducing only
Testing mode where your trained agent in the training mode will not learn offline. It just keeps running each episode without learning new stuff.

## Loading models

In [None]:
model_list = []
for _ in range(ensemble_size):
    model = build_model(state_size,
                        action_size,
                        reward_size,
                        feature_size,
                        sequence_size ,
                        neural_type,
                        num_layers,
                        num_heads,
                        init,
                        opti,
                        loss,
                        bias,
                        drop_rate,
                        alpha)
    model.to(device)
    model_list.append(model)

model_dict = torch.load(model_directory)
for i, model in enumerate(model_list):
    model.load_state_dict(model_dict[f'model_{i}'])

## Putting all the previous works into play ... again

But this time the agent does not learn

In [None]:
# score recorder
total_summed_reward = 0

# starting each episode
for testing_episode in range(episode_for_testing):

    # stabilizing decay rate
    greed_epsilon = copy.deepcopy(greed_epsilon_min)

    # initializing summed reward
    summed_reward  = 0

    # initializing short term experience replay buffer
    state_list  = []
    action_list = []
    for _ in range(history_size):
        state_list .append(torch.zeros(state_size  ).to(device) - 1)
        action_list.append(torch.zeros(action_size ).to(device)    )

    # initializing environment
    env = gym.make(game_name, max_episode_steps = max_steps_for_each_episode,
                   render_mode = "human" if render_for_human else None)
    state, info = env.reset(seed = seed)
    if render_for_human == True:
        env.render()

    # observing state
    state = vectorizing_state(state, device)
    state_list.append(state)

    # starting each step
    for count in itertools.count(1):
        print(f'\rStep: {count}\r', end='', flush=True)
        
        # initializing and updating action   
        history_state, \
        history_action = retrieve_history(state_list, action_list, history_size, device)
        present_state  = retrieve_present(state_list, device)
        future_action  = initialize_future_action(init_, greed_epsilon_t, greed_epsilon, (1, future_size, action_size), device)
        desired_reward = initialize_desired_reward((1, future_size, reward_size), device)
        future_action  = update_future_action(iteration_for_deducing,
                                              model_list,
                                              history_state ,
                                              history_action,
                                              present_state,
                                              future_action,
                                              desired_reward,
                                              beta,
                                              loss_scale)
        action, action_       = vectorizing_action(future_action, device)
        action_list.append(action)
        
        # executing action
        state, reward, done, truncated, info = env.step(action_)
        if render_for_human == True:
            env.render()
            
        # summing reward
        summed_reward += reward
        
        # observing state
        state = vectorizing_state(state, device)
        state_list.append(state)
        
        # terminating episode if done or truncated
        if done or truncated:
            break
        
    # closing env
    env.close()

    # recording
    print("Summed reward:", summed_reward)
    print(f'Episode: {testing_episode + 1}')
    print('Everaged summed reward:')
    total_summed_reward += summed_reward
    print(total_summed_reward/(testing_episode + 1))

