# Installing requirements (for colab)

In [1]:
!sudo apt-get install python3.10

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
python3.10 is already the newest version (3.10.12-1~22.04.7).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


In [2]:
!pip install pandas==2.0.3 numpy==1.25.2 scipy==1.11.4 swig==4.2.1 ufal.pybox2d==2.3.10.3 gym==0.25.2 pygame==2.5.2 tqdm torch==2.0.1

Collecting pandas==2.0.3
  Downloading pandas-2.0.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting numpy==1.25.2
  Downloading numpy-1.25.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Collecting scipy==1.11.4
  Downloading scipy-1.11.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.4/60.4 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting swig==4.2.1
  Downloading swig-4.2.1-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl.metadata (3.6 kB)
Collecting ufal.pybox2d==2.3.10.3
  Downloading ufal.pybox2d-2.3.10.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (416 bytes)
Collecting pygame==2.5.2
  Downloading pygame-2.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting torch==2.0.1
  Downloading torch-2.0.1-cp310-cp310-manylinux1_x86_64.whl.metad

# Cloning git (for colab)

In [1]:
# Clone the project repository; a later cell changes into this directory before
# importing the local envs / models / utils modules.
!git clone https://github.com/Brownwang0426/Reversal-Generative-Reinforcement-Learning.git

Cloning into 'Reversal-Generative-Reinforcement-Learning'...
remote: Enumerating objects: 3203, done.[K
remote: Counting objects: 100% (805/805), done.[K
remote: Compressing objects: 100% (312/312), done.[K
remote: Total 3203 (delta 536), reused 707 (delta 489), pack-reused 2398 (from 1)[K
Receiving objects: 100% (3203/3203), 19.20 MiB | 7.99 MiB/s, done.
Resolving deltas: 100% (2263/2263), done.


# Changing directory (for colab)

In [14]:
import os
# Work from the cloned repository root (Colab path) so that the local package
# imports and the relative ./result paths used further down resolve.
os.chdir('/content/Reversal-Generative-Reinforcement-Learning')

# Importing modules

In [15]:
import gym

import numpy as np
import math
from scipy.special import softmax

import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.utils.rnn as rnn_utils
from torch.utils.data import DataLoader, TensorDataset, Subset

import csv

import multiprocessing as mp
import os
import sys
import copy
import random
import gc
import time
from tqdm import tqdm
from collections import defaultdict

import itertools


In [16]:
import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from gym / torch - presumably
# done to keep the training log readable; confirm this is intended.
warnings.filterwarnings('ignore')

# Checking cuda

In [17]:
# Pick the compute device: list every visible CUDA device and use device 0,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        print(f"Device {i}: {torch.cuda.get_device_name(i)}")
    device_index = 0
    device = torch.device(f"cuda:{device_index}")
    print('using cuda...')
else:
    device = torch.device("cpu")
    print('using cpu...')

# A GPU is mandatory: the training cells create torch.cuda.Stream objects and
# call torch.cuda.synchronize(), neither of which works on a CPU-only runtime.
assert device != torch.device("cpu"), (
    "No CUDA device available: this notebook needs a GPU runtime "
    "(in Colab: Runtime > Change runtime type > GPU)."
)

Device 0: Tesla T4
using cuda...


In [18]:
# Turn on cuDNN and its benchmark mode (see the torch.backends.cudnn
# documentation for the exact semantics of both flags).
torch.backends.cudnn.enabled = True
torch.backends.cudnn.benchmark = True

# Control board

Crucial configurations regarding how your agent will learn in the environment. The meanings are as follows:
(the configs starting with ⚠️ are what we suggest you must tune according to your specific need in your task)
(the configs starting with ◀️ are what we suggest you to play with to see the effect)

| Configs   | Type   | Description                                                                 |
|------------|--------|-----------------------------------------------------------------------------|
| ⚠️game_name  | STR| The name of the environment.                                |
| ⚠️max_steps_for_each_episode | +INT | The maximum number of steps the agent will take in an episode while it is not done. In some environments, it is crucial to increase your "max_steps_for_each_episode" so that your agent can "live long enough" to obtain better rewards and gradually, heuristically learn a better strategy.                    |
| ◀️ensemble_size  | +INT | The size of the neural ensemble which the agent is comprised of. The bigger, the better, but the longer training time without parallel training. :-D                  |
| ⚠️state_size  | +INT | The size of the state as input data.                    |
| ⚠️hidden_size   | +INT |The size of the hidden layers. We suggest hidden_size >= state_size.           |
| ⚠️action_size   | +INT | The size of action per step as input data.   |
| ⚠️time_size  | +INT |The length of the sequence of actions. Namely, how many steps in the future the agent will predict or use to discern the present best action.                |
| ⚠️reward_size  | +INT |The size of the reward as output data.                          |
| ⚠️neural_type  | STR |  [**`rnn`**, **`gru`**, **`lstm`**, **`rnn_att`**] The type of neural network you prefer. For now, we support rnn, gru, lstm, and rnn_att (recurrent attention). More to come in the future (or you can build one yourself :-D in the models repository).           |
| ⚠️num_layers  | +INT |The number of layers in rnn, gru, lstm, and rnn_att (recurrent attention). We suggest no less than 3 (>= 3) to provide more flexibility and memory capacity for neural networks.                         |
| ⚠️num_heads  | +INT/None |The number of heads in multi-head attention (must evenly divide hidden_size; should be None for non-attention neural_type).                         |
| hidden_activation  | STR | [**`relu`**, **`leaky_relu`**, **`sigmoid`**, **`tanh`**] The type of activation function in the hidden layers.              |
| output_activation  | STR | [**`relu`**, **`leaky_relu`**, **`sigmoid`**, **`tanh`**] The type of activation function in the output layer.                      |
| shift  | 0/±FLOAT |The value in f(x+shift) where f(x) is the activation function in the output layer. This value is interesting. If this value is negatively large, the agent will act more conservatively and be prone to exploiting known strategies. If this value is positively large, the agent will act more radically and be prone to exploring all possible strategies before settling down.      |
| init   | STR | [**`random_normal`**, **`random_uniform`**, **`xavier_normal`**, **`xavier_uniform`**, **`glorot_normal`**, **`glorot_uniform`**] The initialization method you prefer.                          |
| opti   | STR | [**`adam`**, **`sgd`**, **`rmsprop`**]  The optimization method you prefer.             |
| loss  | STR | [**`mean_squared_error`**, **`binary_crossentropy`**] The loss or error function you prefer.                           |
| bias  | BOOLEAN |Whether you want to add bias.                          |
| drop_rate   | 0/+FLOAT |The drop-rate for drop-out.              |
| ⚠️alpha   | 0/+FLOAT |The learning rate for neural networks weight matrices.                           |
| ⚠️iteration_for_learning   | +INT |The iteration for learning.              |
| load_pre_model  | BOOLEAN |Whether you want to load a previously trained model.                          |
| noise_t  |  +INT |The number of times gaussian noise is applied to the initialized actions of the agent, similar to a diffusion model's adding of gaussian noise.          |
| ⚠️noise_r  |  0/+FLOAT |The noise range applied to the initialized actions of the agent. The higher the value is, the more exploration-oriented the agent will be.                    |
| ⚠️noise_r_oscillation  |  +INT |The interval at which the noise range oscillates between noise_r and a very small number like 0.000001 to encourage the agent to balance exploration and exploitation.                    |
| ⚠️beta  |  0/+FLOAT |The updating rate for updating actions of the agent.              |
| ⚠️iteration_for_deducing  |  +INT |The iteration for updating actions of the agent.                           |
| episode_for_training  | +INT |How many episodes your agent will run in the training mode where your agent will learn offline.              |
| chunk_size  | +INT |The maximum chunk size for sequentializing state, action, reward. We suggest chunk_size <= time_size.      |
| batch_size_for_offline_learning  |+INT | After how many episodes your agent will start learning from the experience buffer.                           |
| PER_epsilon  | 0/+FLOAT |The epsilon for prioritized experience replay.              |
| PER_exponent  | 0/+FLOAT |The exponent for prioritized experience replay.                           |
| episode_for_testing  | +INT |How many episodes your agent will run in the testing mode where your agent will not learn offline.                        |
| render_for_human  | BOOLEAN | Whether you want to render the visual result for each step in the testing mode.              |


## frozen lake

In [19]:
# Preset: FrozenLake (4x4 map, non-slippery).
# Every preset cell assigns the same global names, so the cell executed last wins.

# -- environment --
game_name = 'FrozenLake-v1'            # ⚠️ gym.make(game_name, is_slippery=False, map_name="4x4")
max_steps_for_each_episode = 25        # ⚠️

# -- ensemble and network --
ensemble_size = 5                      # ◀️
state_size = 16                        # ⚠️
hidden_size = 100                      # ⚠️
action_size = 4                        # ⚠️
time_size = 8                          # ⚠️
reward_size = 100                      # ⚠️
neural_type = 'gru'                    # ⚠️
num_layers = 3                         # ⚠️
num_heads = None                       # ⚠️
hidden_activation = 'tanh'
output_activation = 'sigmoid'
shift = 0.0
init = 'random_normal'
opti = 'sgd'
loss = 'mean_squared_error'
bias = False
drop_rate = 0.0
alpha = 0.1                            # ⚠️
iteration_for_learning = 1000          # ⚠️
load_pre_model = False

# -- deducing (action search) --
noise_t = 1                            # ⚠️
noise_r = 0.1                          # ⚠️
noise_r_oscillation = 10               # ⚠️
beta = 0.1                             # ⚠️
iteration_for_deducing = 100           # ⚠️

# -- training schedule and experience replay --
episode_for_training = 100_000
chunk_size = time_size
batch_size_for_offline_learning = 1
PER_epsilon = 1e-6
PER_exponent = 5

# -- testing --
episode_for_testing = 100
render_for_human = True



## blackjack

In [20]:
# Preset: Blackjack.
# Every preset cell assigns the same global names, so the cell executed last wins.

# -- environment --
game_name = 'Blackjack-v1'             # ⚠️
max_steps_for_each_episode = 10        # ⚠️

# -- ensemble and network --
ensemble_size = 5                      # ◀️
state_size = 201                       # ⚠️
hidden_size = 250                      # ⚠️
action_size = 2                        # ⚠️
time_size = 5                          # ⚠️
reward_size = 100                      # ⚠️
neural_type = 'gru'                    # ⚠️
num_layers = 3                         # ⚠️
num_heads = None                       # ⚠️
hidden_activation = 'tanh'
output_activation = 'sigmoid'
shift = 0.0
init = 'random_normal'
opti = 'sgd'
loss = 'mean_squared_error'
bias = False
drop_rate = 0.0
alpha = 0.1                            # ⚠️
iteration_for_learning = 1000          # ⚠️
load_pre_model = False

# -- deducing (action search) --
noise_t = 1
noise_r = 0.1                          # ⚠️
noise_r_oscillation = 10               # ⚠️
beta = 0.1                             # ⚠️
iteration_for_deducing = 100           # ⚠️

# -- training schedule and experience replay --
episode_for_training = 100_000
chunk_size = time_size
batch_size_for_offline_learning = 1
PER_epsilon = 1e-6
PER_exponent = 5

# -- testing --
episode_for_testing = 100
render_for_human = True


## cartpole

In [21]:
# Preset: CartPole.
# Every preset cell assigns the same global names, so the cell executed last wins.

# -- environment --
game_name = 'CartPole-v1'              # ⚠️
max_steps_for_each_episode = 2000      # ⚠️

# -- ensemble and network --
ensemble_size = 10                     # ◀️
state_size = 400                       # ⚠️
hidden_size = 400                      # ⚠️
action_size = 2                        # ⚠️
time_size = 25                         # ⚠️
reward_size = 100                      # ⚠️
neural_type = 'gru'                    # ⚠️
num_layers = 3                         # ⚠️
num_heads = None                       # ⚠️
hidden_activation = 'tanh'
output_activation = 'sigmoid'
shift = 0.0
init = 'random_normal'
opti = 'sgd'
loss = 'mean_squared_error'
bias = False
drop_rate = 0.0
alpha = 0.1                            # ⚠️
iteration_for_learning = 1000          # ⚠️
load_pre_model = False

# -- deducing (action search) --
noise_t = 1                            # ⚠️
noise_r = 0.1                          # ⚠️
noise_r_oscillation = 10               # ⚠️
beta = 0.1                             # ⚠️
iteration_for_deducing = 100           # ⚠️

# -- training schedule and experience replay --
episode_for_training = 100_000
chunk_size = time_size
batch_size_for_offline_learning = 1
PER_epsilon = 1e-6
PER_exponent = 5

# -- testing --
episode_for_testing = 100
render_for_human = True


## mountain car

In [22]:
# Preset: MountainCar.
# Every preset cell assigns the same global names, so the cell executed last wins.

# -- environment --
game_name = 'MountainCar-v0'           # ⚠️
max_steps_for_each_episode = 200       # ⚠️

# -- ensemble and network --
ensemble_size = 10                     # ◀️
state_size = 200                       # ⚠️
hidden_size = 200                      # ⚠️
action_size = 3                        # ⚠️
time_size = 50                         # ⚠️
reward_size = 100                      # ⚠️
neural_type = 'gru'                    # ⚠️
num_layers = 3                         # ⚠️
num_heads = None                       # ⚠️
hidden_activation = 'tanh'
output_activation = 'sigmoid'
shift = 0.0
init = 'random_normal'
opti = 'sgd'
loss = 'mean_squared_error'
bias = False
drop_rate = 0.0
alpha = 0.1                            # ⚠️
iteration_for_learning = 1000          # ⚠️
load_pre_model = False

# -- deducing (action search) --
noise_t = 1
noise_r = 0.1                          # ⚠️
noise_r_oscillation = 10               # ⚠️
beta = 0.1                             # ⚠️
iteration_for_deducing = 100           # ⚠️

# -- training schedule and experience replay --
episode_for_training = 100_000
chunk_size = time_size
batch_size_for_offline_learning = 1
PER_epsilon = 1e-6
PER_exponent = 5

# -- testing --
episode_for_testing = 100
render_for_human = True


## acrobot

In [23]:
# Preset: Acrobot.
# Every preset cell assigns the same global names, so the cell executed last wins.

# -- environment --
game_name = 'Acrobot-v1'               # ⚠️
max_steps_for_each_episode = 200       # ⚠️

# -- ensemble and network --
ensemble_size = 5                      # ◀️
state_size = 600                       # ⚠️
hidden_size = 600                      # ⚠️
action_size = 3                        # ⚠️
time_size = 50                         # ⚠️
reward_size = 100                      # ⚠️
neural_type = 'gru'                    # ⚠️
num_layers = 3                         # ⚠️
num_heads = None                       # ⚠️
hidden_activation = 'tanh'
output_activation = 'sigmoid'
shift = 0.0
init = 'random_normal'
opti = 'sgd'
loss = 'mean_squared_error'
bias = False
drop_rate = 0.0
alpha = 0.1                            # ⚠️
iteration_for_learning = 1000          # ⚠️
load_pre_model = False

# -- deducing (action search) --
noise_t = 1
noise_r = 0.1                          # ⚠️
noise_r_oscillation = 10               # ⚠️
beta = 0.1                             # ⚠️
iteration_for_deducing = 100           # ⚠️

# -- training schedule and experience replay --
episode_for_training = 100_000
chunk_size = time_size
batch_size_for_offline_learning = 1
PER_epsilon = 1e-6
PER_exponent = 5

# -- testing --
episode_for_testing = 100
render_for_human = True


## lunar lander

In [24]:
# Preset: LunarLander.
# Every preset cell assigns the same global names, so the cell executed last wins.

# -- environment --
game_name = "LunarLander-v2"           # ⚠️
max_steps_for_each_episode = 200       # ⚠️

# -- ensemble and network --
ensemble_size = 5                      # ◀️
state_size = 800                       # ⚠️
hidden_size = 800                      # ⚠️
action_size = 4                        # ⚠️
time_size = 50                         # ⚠️
reward_size = 250                      # ⚠️
neural_type = 'gru'                    # ⚠️
num_layers = 3                         # ⚠️
num_heads = None                       # ⚠️
hidden_activation = 'tanh'
output_activation = 'sigmoid'
shift = 0.0
init = 'random_normal'
opti = 'sgd'
loss = 'mean_squared_error'
bias = False
drop_rate = 0.0
alpha = 0.1                            # ⚠️
iteration_for_learning = 1000          # ⚠️
load_pre_model = False

# -- deducing (action search) --
noise_t = 1                            # ⚠️
noise_r = 0.1                          # ⚠️
noise_r_oscillation = 10               # ⚠️
beta = 0.1                             # ⚠️
iteration_for_deducing = 100           # ⚠️

# -- training schedule and experience replay --
episode_for_training = 100_000
chunk_size = time_size
batch_size_for_offline_learning = 1
PER_epsilon = 1e-6
PER_exponent = 5

# -- testing --
episode_for_testing = 100
render_for_human = True



## your present config

In [25]:
# Active configuration: this is the last config cell, so these values are the
# ones the cells below actually use. Copy a preset from above to switch tasks.

# -- environment --
game_name = 'FrozenLake-v1'            # ⚠️
max_steps_for_each_episode = 25        # ⚠️

# -- ensemble and network --
ensemble_size = 5                      # ◀️
state_size = 16                        # ⚠️
hidden_size = 100                      # ⚠️
action_size = 4                        # ⚠️
time_size = 8                          # ⚠️
reward_size = 100                      # ⚠️
neural_type = 'gru'                    # ⚠️
num_layers = 3                         # ⚠️
num_heads = None                       # ⚠️
hidden_activation = 'tanh'
output_activation = 'sigmoid'
shift = 0.0
init = 'random_normal'
opti = 'sgd'
loss = 'mean_squared_error'
bias = False
drop_rate = 0.0
alpha = 0.1                            # ⚠️
iteration_for_learning = 1000          # ⚠️
load_pre_model = False

# -- deducing (action search) --
noise_t = 1                            # ⚠️
noise_r = 0.1                          # ⚠️
noise_r_oscillation = 10               # ⚠️
beta = 0.1                             # ⚠️
iteration_for_deducing = 100           # ⚠️

# -- training schedule and experience replay --
episode_for_training = 100_000
chunk_size = time_size
batch_size_for_offline_learning = 1
PER_epsilon = 1e-6
PER_exponent = 5

# -- testing --
episode_for_testing = 100
render_for_human = True



In [26]:
# Tag built from the hyper-parameters that identify a run; it is embedded in
# every file name written for that run.
suffix = (
    f"game={game_name}"
    f"_type={neural_type}"
    f"_ensemble={ensemble_size:05d}"
    f"_drop={drop_rate:.5f}"
    f"_learn={iteration_for_learning:05d}"
    f"_interval={batch_size_for_offline_learning:05d}"
    f"_deduce={iteration_for_deducing:05d}"
)

# Per-game output folder.
directory = f'./result/{game_name}/'

# Template with a trailing %s placeholder; it is filled with the ensemble
# member index whenever a model is saved or loaded.
model_directory = f'{directory}model_{suffix}' + '_%s.h5'

# CSV file receiving the per-episode performance log.
performance_log_directory = f'{directory}performace_log_{suffix}.csv'

# Importing local modules

In [27]:
# Import the three environment-specific encoders (vectorizing_state,
# vectorizing_action, vectorizing_reward) from the envs module that matches
# game_name. An unsupported game_name aborts here with a RuntimeError.
if   game_name == 'FrozenLake-v1':
    from envs.env_frozenlake   import vectorizing_state, vectorizing_action, vectorizing_reward
elif   game_name == 'Blackjack-v1':
    from envs.env_blackjack   import vectorizing_state, vectorizing_action, vectorizing_reward
elif   game_name == 'CartPole-v1':
    from envs.env_cartpole    import vectorizing_state, vectorizing_action, vectorizing_reward
elif game_name == 'MountainCar-v0':
    from envs.env_mountaincar import vectorizing_state, vectorizing_action, vectorizing_reward
elif game_name == 'MountainCarContinuous-v0':
    from envs.env_mountaincar_continuous import vectorizing_state, vectorizing_action, vectorizing_reward
elif game_name == 'Acrobot-v1':
    from envs.env_acrobot import vectorizing_state, vectorizing_action, vectorizing_reward
elif game_name == "Pendulum-v1":
    from envs.env_pendulum import vectorizing_state, vectorizing_action, vectorizing_reward
elif game_name == "LunarLander-v2":
    from envs.env_lunarlander import vectorizing_state, vectorizing_action, vectorizing_reward
elif game_name == 'BipedalWalker-v3':
    from envs.env_bipedalwalker import vectorizing_state, vectorizing_action, vectorizing_reward
else:
   raise RuntimeError('missing env functions')

In [28]:
# Import the model builder and the training / deducing helpers for the chosen
# architecture: the recurrent-attention variant for 'rnn_att', the plain
# recurrent variant for every other neural_type.
if neural_type == 'rnn_att':
    from models.model_rnn_att import build_model
    from utils.util_rnn_att   import initialize_pre_activated_actions, \
                                 update_pre_activated_actions, \
                                 sequentialize, \
                                 update_model,\
                                 save_performance_to_csv
else:
    from models.model_rnn import build_model
    # NOTE(review): the module name `util_rnn_` ends with an underscore - confirm
    # it matches the file name in utils/.
    from utils.util_rnn_  import initialize_pre_activated_actions, \
                                 update_pre_activated_actions, \
                                 sequentialize, \
                                 update_model,\
                                 save_performance_to_csv

# Deducing -> Learning
Training mode where your agent will learn offline. You can see here how your agent learns over time and improves its performance.

Creating or loading models

In [29]:

# Make sure the output folder exists (exist_ok replaces the separate
# exists() check, so an already-present folder is simply accepted).
os.makedirs(directory, exist_ok=True)

# Build the ensemble: `ensemble_size` models, each produced by its own
# build_model call with the shared hyper-parameters, then moved to `device`.
model_list = []
for _ in range(ensemble_size):
    model = build_model(state_size,
                        hidden_size,
                        action_size,
                        time_size,
                        reward_size,
                        neural_type,
                        num_layers,
                        num_heads,
                        hidden_activation,
                        output_activation,
                        shift,
                        init,
                        opti,
                        loss,
                        bias,
                        drop_rate,
                        alpha)
    model.to(device)
    model_list.append(model)

# Optionally restore the weights saved by a previous run. `model_directory`
# is a template whose %s placeholder takes the ensemble member index.
# map_location keeps loading working when the checkpoint was written from a
# different device than the one selected now.
if load_pre_model:
    for i, model in enumerate(model_list):
        model.load_state_dict(torch.load(model_directory % i, map_location=device))



Creating Streams

In [30]:
# One CUDA stream per ensemble member; the training loop runs each model's
# update inside its own stream.
stream_list = [torch.cuda.Stream() for _ in range(ensemble_size)]

Creating desired reward

In [31]:
# All-ones target of shape (1, time_size, reward_size); it is handed to
# update_pre_activated_actions as the reward the deduced actions should reach.
desired_reward = torch.ones(1, time_size, reward_size)

Putting all the previous works into play

In [None]:

# Training loop: for every episode the agent (1) deduces actions with the
# current ensemble and plays the episode, (2) turns the episode into chunked
# samples and merges the new ones into the long-term replay buffer, and
# (3) every `batch_size_for_offline_learning` episodes retrains and saves
# every ensemble member.


def _matches_per_row(stored, sample):
    """Return a 1-D bool tensor whose k-th entry tells whether stored[k] equals sample.

    `stored` holds one entry per buffer row along dim 0; `sample` is a single
    entry that is broadcast against every row.
    """
    equal = torch.eq(stored, sample)
    # Collapse all trailing dimensions; a 1-D buffer (one scalar per row) is
    # already a per-row result.
    return equal.flatten(start_dim=1).all(dim=1) if equal.dim() > 1 else equal


# `is_slippery` / `map_name` are FrozenLake constructor options; forwarding
# them to any other environment makes gym.make fail, so only pass them there.
env_kwargs = {'is_slippery': False, 'map_name': '4x4'} if game_name == 'FrozenLake-v1' else {}

performance_log  = []

for training_episode in tqdm(range(episode_for_training)):

    # initializing short term experience replay buffer
    list_states  = []
    list_actions = []
    list_rewards = []

    # initializing environment
    env                    = gym.make(game_name, **env_kwargs)
    env._max_episode_steps = max_steps_for_each_episode
    state                  = env.reset()
    summed_reward          = 0

    # observing state
    state    = vectorizing_state(state)
    list_states.append(state)

    # The noise settings only depend on the episode index: they alternate
    # between (noise_t, noise_r) and (1, 0.000001) every
    # `noise_r_oscillation` episodes.
    exploring     = training_episode % (2 * noise_r_oscillation) < noise_r_oscillation
    t_oscillation = noise_t if exploring else 1
    r_oscillation = noise_r if exploring else 0.000001

    for count in itertools.count(1):

        print(f'\rStep: {count}\r', end='', flush=True)

        # initializing and updating action
        state                 = torch.tensor(np.atleast_2d(state), dtype=torch.float)
        pre_activated_action  = initialize_pre_activated_actions(init,
                                                                t_oscillation,
                                                                r_oscillation,
                                                                (time_size, action_size))
        pre_activated_action  = torch.tensor(pre_activated_action[np.newaxis, :, :], dtype=torch.float)
        pre_activated_action  = update_pre_activated_actions(iteration_for_deducing,
                                                             model_list,
                                                             state,
                                                             pre_activated_action,
                                                             desired_reward,
                                                             beta,
                                                             device)
        action, action_       = vectorizing_action(pre_activated_action)
        list_actions.append(action)

        # executing action
        state, reward, done, info = env.step(action_)

        # observing actual reward
        summed_reward += reward
        reward = vectorizing_reward(state, reward, summed_reward, done, reward_size)
        list_rewards.append(reward)

        # observing state
        state    = vectorizing_state(state)
        list_states.append(state)

        if done:
            print(f'Episode {training_episode}: Summed_Reward = {summed_reward}')
            performance_log.append([training_episode, summed_reward])
            save_performance_to_csv(performance_log, performance_log_directory)
            break

    env.close()

    # sequentializing short term experience replay buffer
    present_state_tensors   ,\
    future_actions_tensors  ,\
    future_rewards_tensors  ,\
    future_states_tensors   ,\
    pad_size_tensors        = sequentialize(list_states  ,
                                            list_actions ,
                                            list_rewards , chunk_size, device)

    # storing sequentialized short term experience to long term experience replay buffer
    if training_episode == 0:
        long_term_present_state_tensors   = copy.deepcopy(present_state_tensors )
        long_term_future_actions_tensors  = copy.deepcopy(future_actions_tensors)
        long_term_future_rewards_tensors  = copy.deepcopy(future_rewards_tensors)
        long_term_future_states_tensors   = copy.deepcopy(future_states_tensors )
        long_term_pad_size_tensors        = copy.deepcopy(pad_size_tensors      )
    else:
        for i in range(len(present_state_tensors)):
            sample_parts = (present_state_tensors [i],
                            future_actions_tensors[i],
                            future_rewards_tensors[i],
                            future_states_tensors [i],
                            pad_size_tensors      [i])
            stored_parts = (long_term_present_state_tensors ,
                            long_term_future_actions_tensors,
                            long_term_future_rewards_tensors,
                            long_term_future_states_tensors ,
                            long_term_pad_size_tensors      )

            # A stored row is a duplicate only when ALL five components of
            # that SAME row equal the new sample. (Checking each component
            # against "any row" independently would drop new samples whose
            # parts merely occur somewhere in the buffer.)
            # assumes all five buffers live on the same device - TODO confirm
            duplicate_rows = _matches_per_row(stored_parts[0], sample_parts[0])
            for stored_part, sample_part in zip(stored_parts[1:], sample_parts[1:]):
                duplicate_rows = duplicate_rows & _matches_per_row(stored_part, sample_part)

            if not duplicate_rows.any():
                long_term_present_state_tensors  = torch.cat((long_term_present_state_tensors , sample_parts[0].unsqueeze(0)), dim=0)
                long_term_future_actions_tensors = torch.cat((long_term_future_actions_tensors, sample_parts[1].unsqueeze(0)), dim=0)
                long_term_future_rewards_tensors = torch.cat((long_term_future_rewards_tensors, sample_parts[2].unsqueeze(0)), dim=0)
                long_term_future_states_tensors  = torch.cat((long_term_future_states_tensors , sample_parts[3].unsqueeze(0)), dim=0)
                long_term_pad_size_tensors       = torch.cat((long_term_pad_size_tensors      , sample_parts[4].unsqueeze(0)), dim=0)

    # batch offline learning
    if (training_episode+1) % batch_size_for_offline_learning == 0:

        start_time = time.time()

        # training with Prioritized Experience Replay (PER);
        # each ensemble member is updated inside its own CUDA stream
        for i, model in enumerate(model_list):
            with torch.cuda.stream(stream_list[i]):
                model                     = update_model(iteration_for_learning,
                                                         long_term_present_state_tensors ,
                                                         long_term_future_actions_tensors,
                                                         long_term_future_rewards_tensors,
                                                         long_term_future_states_tensors ,
                                                         long_term_pad_size_tensors      ,
                                                         model,
                                                         PER_epsilon,
                                                         PER_exponent,
                                                         device)
                model_list[i]             = model
        # wait for every stream before timing and saving
        torch.cuda.synchronize()

        end_time = time.time()  # Record end time
        execution_time = end_time - start_time  # Calculate duration
        print(f"Execution Time: {execution_time:.4f} seconds")

        # saving:
        for i in range(len(model_list)):
            torch.save(model_list[i].state_dict(), model_directory % i)

        gc.collect()
        torch.cuda.empty_cache()

  0%|          | 0/100000 [00:00<?, ?it/s]

Episode 0: Summed_Reward = 0.0
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3,
        4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 8, 8],
       device='cuda:0')
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3,
        4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 8, 8],
       device='cuda:0')
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3,
        4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 8, 8],
       device='cuda:0')
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3,
        4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 8, 8],
       device='cuda:0')
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3,
        4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 8, 8],
       device='cuda:0')


  0%|          | 1/100000 [02:31<4212:58:36, 151.67s/it]

Execution Time: 137.0752 seconds
Episode 1: Summed_Reward = 0.0
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3,
        4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 8, 8, 1, 1, 1, 1,
        1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4,
        4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7,
        7, 8, 8, 8, 8, 8], device='cuda:0')
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3,
        4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 8, 8, 1, 1, 1, 1,
        1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4,
        4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7,
        7, 8, 8, 8, 8, 8], device='cuda:0')
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3,
        4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 8, 8, 1, 1, 1, 1,
        1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3

  0%|          | 2/100000 [05:14<4388:19:23, 157.98s/it]

Execution Time: 143.6712 seconds
Episode 2: Summed_Reward = 0.0
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3,
        4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 8, 8, 1, 1, 1, 1,
        1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4,
        4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7,
        7, 8, 8, 8, 8, 8, 2], device='cuda:0')
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3,
        4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 8, 8, 1, 1, 1, 1,
        1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4,
        4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7,
        7, 8, 8, 8, 8, 8, 2], device='cuda:0')
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3,
        4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 8, 8, 1, 1, 1, 1,
        1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3

  0%|          | 3/100000 [07:39<4225:00:31, 152.10s/it]

Execution Time: 142.0799 seconds
Episode 3: Summed_Reward = 0.0
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3,
        4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 8, 8, 1, 1, 1, 1,
        1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4,
        4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7,
        7, 8, 8, 8, 8, 8, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4,
        4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 8], device='cuda:0')
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3,
        4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 8, 8, 1, 1, 1, 1,
        1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4,
        4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7,
        7, 8, 8, 8, 8, 8, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4,
        4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 8], device='cuda:0')


  0%|          | 4/100000 [10:18<4306:06:12, 155.03s/it]

Execution Time: 146.5756 seconds
Episode 4: Summed_Reward = 0.0
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3,
        4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 8, 8, 1, 1, 1, 1,
        1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4,
        4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7,
        7, 8, 8, 8, 8, 8, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4,
        4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 8, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6,
        6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8],
       device='cuda:0')
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3,
        4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 8, 8, 1, 1, 1, 1,
        1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4,
        4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6,

  0%|          | 5/100000 [13:09<4468:13:05, 160.86s/it]

Execution Time: 150.2748 seconds
Episode 5: Summed_Reward = 0.0
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3,
        4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 8, 8, 1, 1, 1, 1,
        1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4,
        4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7,
        7, 8, 8, 8, 8, 8, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4,
        4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 8, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6,
        6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 3, 4],
       device='cuda:0')
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3,
        4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 8, 8, 1, 1, 1, 1,
        1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4,
        4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5,

  0%|          | 6/100000 [15:50<4465:31:56, 160.77s/it]

Execution Time: 154.3744 seconds
Episode 6: Summed_Reward = 0.0
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3,
        4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 8, 8, 1, 1, 1, 1,
        1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4,
        4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7,
        7, 8, 8, 8, 8, 8, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4,
        4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 8, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6,
        6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 3, 4, 3, 4, 4,
        5], device='cuda:0')
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3,
        4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 8, 8, 1, 1, 1, 1,
        1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4,
        4, 4, 4, 4, 5, 5, 5

  0%|          | 7/100000 [18:29<4447:37:53, 160.13s/it]

Execution Time: 150.8139 seconds
Episode 7: Summed_Reward = 0.0
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3,
        4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 8, 8, 1, 1, 1, 1,
        1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4,
        4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7,
        7, 8, 8, 8, 8, 8, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4,
        4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 8, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6,
        6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 3, 4, 3, 4, 4,
        5, 2, 3, 3, 4], device='cuda:0')
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3,
        4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 8, 8, 1, 1, 1, 1,
        1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4,
        4, 4, 4

  0%|          | 8/100000 [21:07<4430:36:48, 159.51s/it]

Execution Time: 152.1783 seconds
Episode 8: Summed_Reward = 0.0
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3,
        4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 8, 8, 1, 1, 1, 1,
        1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4,
        4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7,
        7, 8, 8, 8, 8, 8, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4,
        4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 8, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6,
        6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 3, 4, 3, 4, 4,
        5, 2, 3, 3, 4, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6,
        6, 6, 7, 7, 7, 8, 8], device='cuda:0')
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3,
        4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 8, 8, 1, 1, 1, 1,
        1

  0%|          | 9/100000 [23:56<4514:14:26, 162.53s/it]

Execution Time: 154.2595 seconds
Episode 9: Summed_Reward = 0.0
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3,
        4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 8, 8, 1, 1, 1, 1,
        1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4,
        4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7,
        7, 8, 8, 8, 8, 8, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4,
        4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 8, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6,
        6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 3, 4, 3, 4, 4,
        5, 2, 3, 3, 4, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6,
        6, 6, 7, 7, 7, 8, 8, 2, 3, 3, 3, 4, 4, 4, 5, 5, 6], device='cuda:0')
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3,
        4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 

  0%|          | 10/100000 [26:40<4525:16:08, 162.93s/it]

Execution Time: 154.6665 seconds
Episode 10: Summed_Reward = 0.0
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3,
        4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 8, 8, 1, 1, 1, 1,
        1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4,
        4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7,
        7, 8, 8, 8, 8, 8, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4,
        4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 8, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6,
        6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 3, 4, 3, 4, 4,
        5, 2, 3, 3, 4, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6,
        6, 6, 7, 7, 7, 8, 8, 2, 3, 3, 3, 4, 4, 4, 5, 5, 6, 3, 4, 5, 6, 7, 8],
       device='cuda:0')
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3,
        4, 4, 4, 4, 4, 4, 5, 5, 5

  0%|          | 11/100000 [29:57<4812:36:02, 173.27s/it]

Execution Time: 154.5675 seconds
Episode 11: Summed_Reward = 0.0
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3,
        4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 8, 8, 1, 1, 1, 1,
        1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4,
        4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7,
        7, 8, 8, 8, 8, 8, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4,
        4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 8, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6,
        6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 3, 4, 3, 4, 4,
        5, 2, 3, 3, 4, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6,
        6, 6, 7, 7, 7, 8, 8, 2, 3, 3, 3, 4, 4, 4, 5, 5, 6, 3, 4, 5, 6, 7, 8],
       device='cuda:0')
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3,
        4, 4, 4, 4, 4, 4, 5, 5, 5

  0%|          | 12/100000 [33:14<5012:26:52, 180.47s/it]

Execution Time: 155.2394 seconds
Episode 12: Summed_Reward = 0.0
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3,
        4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 8, 8, 1, 1, 1, 1,
        1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4,
        4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7,
        7, 8, 8, 8, 8, 8, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4,
        4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 8, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6,
        6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 3, 4, 3, 4, 4,
        5, 2, 3, 3, 4, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6,
        6, 6, 7, 7, 7, 8, 8, 2, 3, 3, 3, 4, 4, 4, 5, 5, 6, 3, 4, 5, 6, 7, 8],
       device='cuda:0')
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3,
        4, 4, 4, 4, 4, 4, 5, 5, 5

  0%|          | 13/100000 [36:30<5144:46:06, 185.24s/it]

Execution Time: 154.0773 seconds
Episode 13: Summed_Reward = 0.0
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3,
        4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 8, 8, 1, 1, 1, 1,
        1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4,
        4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7,
        7, 8, 8, 8, 8, 8, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4,
        4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 8, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6,
        6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 3, 4, 3, 4, 4,
        5, 2, 3, 3, 4, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6,
        6, 6, 7, 7, 7, 8, 8, 2, 3, 3, 3, 4, 4, 4, 5, 5, 6, 3, 4, 5, 6, 7, 8],
       device='cuda:0')
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3,
        4, 4, 4, 4, 4, 4, 5, 5, 5

  0%|          | 14/100000 [39:46<5234:07:19, 188.45s/it]

Execution Time: 154.4696 seconds
Episode 14: Summed_Reward = 0.0
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3,
        4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 8, 8, 1, 1, 1, 1,
        1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4,
        4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7,
        7, 8, 8, 8, 8, 8, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4,
        4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 8, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6,
        6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 3, 4, 3, 4, 4,
        5, 2, 3, 3, 4, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6,
        6, 6, 7, 7, 7, 8, 8, 2, 3, 3, 3, 4, 4, 4, 5, 5, 6, 3, 4, 5, 6, 7, 8],
       device='cuda:0')
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3,
        4, 4, 4, 4, 4, 4, 5, 5, 5

# Deducing only
Testing mode: the agent trained in the training mode above no longer learns offline. It simply keeps running episode after episode without learning anything new.

Loading models

In [None]:
# Rebuild the ensemble from the notebook's current hyper-parameter globals.
# These must describe the same architecture the checkpoints were saved from,
# otherwise load_state_dict below will reject the state dicts.
model_list = []
for _ in range(ensemble_size):
    model = build_model(state_size,
                        hidden_size,
                        action_size,
                        time_size,
                        reward_size,
                        neural_type,
                        num_layers,
                        num_heads,
                        hidden_activation,
                        output_activation,
                        shift,
                        init,
                        opti,
                        loss,
                        bias,
                        drop_rate,
                        alpha)
    model.to(device)
    model_list.append(model)

# Restore every ensemble member from its own checkpoint file
# (model_directory is a %-format pattern indexed by the member number).
# map_location=device remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU can still be loaded on a CPU-only runtime.
for i, model in enumerate(model_list):
    model.load_state_dict(torch.load(model_directory % i, map_location=device))

Creating desired reward ... again

In [None]:
# Target handed to the action update: all ones, shaped (1, time_size, reward_size).
desired_reward = torch.ones(1, time_size, reward_size)

Putting all the previous work into play ... again

But this time the agent does not learn

In [None]:
# Evaluation loop: play `episode_for_testing` episodes with the loaded ensemble.
# Nothing in this cell trains the models; they are only passed to
# update_pre_activated_actions to pick the next action.
total_summed_reward = 0

for testing_episode in range(episode_for_testing):

    # A fresh environment per episode; a window is requested only for human viewing.
    if render_for_human:
        env = gym.make(game_name, render_mode="human")
    else:
        env = gym.make(game_name)

    # try/finally guarantees the environment (and any render window) is released
    # even when planning or stepping raises part-way through an episode.
    try:
        env._max_episode_steps = max_steps_for_each_episode
        state = env.reset()
        if render_for_human:
            env.render()
        summed_reward = 0

        state = vectorizing_state(state)

        done = False
        while not done:

            # Give the state a leading batch axis before handing it to the models.
            state = torch.tensor(np.atleast_2d(state), dtype=torch.float)

            # Start from a freshly initialised action sequence of shape
            # (time_size, action_size), add a batch axis, then let
            # update_pre_activated_actions adjust it for iteration_for_deducing
            # rounds (presumably steering it toward desired_reward - see the
            # helper's definition).
            pre_activated_action = initialize_pre_activated_actions(init, noise_t, noise_r, (time_size, action_size))
            pre_activated_action = torch.tensor(pre_activated_action[np.newaxis, :, :], dtype=torch.float)
            pre_activated_action = update_pre_activated_actions(iteration_for_deducing,
                                                                model_list,
                                                                state,
                                                                pre_activated_action,
                                                                desired_reward,
                                                                beta,
                                                                device)
            action, action_ = vectorizing_action(pre_activated_action)

            # 4-tuple step API, as returned by the gym version pinned in the install cell.
            state, reward, done, info = env.step(action_)
            if render_for_human:
                env.render()

            summed_reward += reward

            state = vectorizing_state(state)
            # No explicit break needed: the while condition ends the episode once done is set.
    finally:
        env.close()

    # Per-episode report followed by the running mean over all episodes so far.
    print("Summed reward:", summed_reward)
    print(f'Episode: {testing_episode + 1}')
    print('Averaged summed reward:')
    total_summed_reward += summed_reward
    print(total_summed_reward/(testing_episode + 1))



In [None]:
import torch

# Number of leading ones wanted in each row of the mask
input_tensor = torch.tensor([4, 2, 3])  # Shape: (3,)

# Width shared by every row
fixed_length = 50

# Column indices 0 .. fixed_length - 1
range_tensor = torch.arange(0, fixed_length)  # Shape: (fixed_length,)
print(range_tensor)
# Broadcast (1, fixed_length) against (3, 1): an entry is 1 where its column
# index is smaller than the count requested for that row, 0 elsewhere.
output_tensor = torch.lt(range_tensor[None, :], input_tensor[:, None]).to(torch.int32)  # Shape: (3, fixed_length)

print(output_tensor)


In [None]:
import torch

# Demo operands: random features plus one integer scale factor per (batch, row)
tensor_3d = torch.randn(2, 3, 999)  # Shape: (batch_size, num_rows, feature_size)
print(tensor_3d)
tensor_2d = torch.tensor([[1, 2, 3], [4, 5, 6]])  # Shape: (batch_size, num_rows)
print(tensor_2d)
# Append a trailing singleton axis so each factor broadcasts across the feature axis
tensor_2d_expanded = tensor_2d[..., None]  # Shape: (batch_size, num_rows, 1)

# Scale every feature vector by the factor of its row
result = torch.mul(tensor_3d, tensor_2d_expanded)
print(result)
print(f"3D Tensor Shape: {tensor_3d.shape}")
print(f"2D Tensor Shape: {tensor_2d.shape}")
print(f"Expanded 2D Tensor Shape: {tensor_2d_expanded.shape}")
print(f"Result Shape: {result.shape}")
