<a href="https://colab.research.google.com/github/Brownwang0426/RGRL/blob/main/CartPole.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installing requirements

In [None]:
!sudo apt-get install python3.10

In [None]:
!pip install pandas==2.0.3 numpy==1.25.2 scipy==1.11.4 swig==4.2.1 ufal.pybox2d==2.3.10.3 gym==0.25.2 pygame==2.5.2 tqdm torch==2.0.1

# Importing modules

In [1]:
import gym

import numpy as np
import math
from scipy.special import softmax

import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.utils.rnn as rnn_utils
from torch.utils.data import DataLoader, TensorDataset

import csv

import multiprocessing as mp
import os
import sys
import copy
import random
import gc
import time
from tqdm import tqdm


# Checking cuda

In [2]:
if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        print(f"Device {i}: {torch.cuda.get_device_name(i)}")
    device_index = 0
    device = torch.device(f"cuda:{device_index}")
    print('using cuda...')
else:
    device = torch.device("cpu")
    print('using cpu...')
assert device != torch.device("cpu") # Sorry, but we really recommend you to run it on GPU :-) Nvidia needs your money :-)

Device 0: NVIDIA T500
using cuda...


In [3]:
torch.backends.cudnn.enabled = True
torch.backends.cudnn.benchmark = True

# Class for building model

In [4]:
class build_model(nn.Module):
    def __init__(self,
                 input_neuron_size_,
                 hidden_neuron_size,
                 input_neuron_size,
                 input_sequence_size,
                 output_neuron_size,
                 neural_type,
                 num_layers,
                 num_heads,
                 hidden_activation,
                 output_activation,
                 initializer,
                 optimizer,
                 loss,
                 drop_rate,
                 alpha,
                 mask_value):

        super(build_model, self).__init__()

        self.input_neuron_size_   = int(input_neuron_size_)
        self.hidden_neuron_size   = int(hidden_neuron_size)
        self.input_neuron_size    = int(input_neuron_size)
        self.input_sequence_size  = int(input_sequence_size)
        self.output_neuron_size   = int(output_neuron_size)
        self.neural_type          = neural_type
        self.num_heads            = num_heads

        self.hidden_activation    = hidden_activation
        self.output_activation    = output_activation
        self.initializer          = initializer
        self.optimizer            = optimizer
        self.loss                 = loss
        self.drop_rate            = drop_rate
        self.alpha                = alpha
        self.mask_value           = mask_value

        self.bias = False

        self.num_layers = num_layers

        neural_types = {
            'rnn': nn.RNN,
            'gru': nn.GRU,
            'lstm': nn.LSTM
        }

        self.fully_connected_layer_in_0      = nn.Linear(self.input_neuron_size_, self.hidden_neuron_size, bias=self.bias)
        self.fully_connected_layer_in_1      = nn.Linear(self.hidden_neuron_size, self.hidden_neuron_size, bias=self.bias)

        self.recurrent_layer_0               = neural_types[neural_type.lower()](self.input_neuron_size, self.hidden_neuron_size, num_layers=self.num_layers, batch_first=False, bias=self.bias, dropout=self.drop_rate)

        self.fully_connected_layer_0         = nn.Linear(self.hidden_neuron_size * self.input_sequence_size, self.output_neuron_size, bias=self.bias)

        # Activation functions
        self.hidden_activation = self.get_activation(self.hidden_activation)
        self.output_activation = self.get_activation(self.output_activation)

        # Initialize weights for fully connected layers
        self.initialize_weights(self.initializer  )

        # Optimizer
        optimizers = {
            'adam': optim.Adam,
            'sgd': optim.SGD,
            'rmsprop': optim.RMSprop
        }
        self.selected_optimizer = optimizers[self.optimizer.lower()](self.parameters(), lr=self.alpha)

        # Loss function
        losses = {
            'mean_squared_error': torch.nn.MSELoss(),
            'binary_crossentropy': torch.nn.BCELoss()
        }
        self.loss_function = losses[self.loss .lower()]

        # Loss function
        losses = {
            'mean_squared_error': torch.nn.MSELoss(reduction='none'),
            'binary_crossentropy': torch.nn.BCELoss(reduction='none')
        }
        self.loss_function_ = losses[self.loss .lower()]


    def forward(self, initial_hidden, x, padding_mask):

        h  = self.fully_connected_layer_in_0(initial_hidden)
        h  = self.hidden_activation(h)
        h  = self.fully_connected_layer_in_1(h)
        h  = self.hidden_activation(h)
        h  = torch.unsqueeze(h, dim=0).repeat(self.num_layers, 1, 1)

        out        = x.permute(1, 0, 2)
        lengths    = (out != self.mask_value).any(dim=2).sum(dim=0).cpu().long() # since x is (sequence_length, batch_size, input_size), we should use sum(dim=0)
        out        = rnn_utils.pack_padded_sequence(out, lengths, batch_first=False, enforce_sorted=False)
        # Forward propagate RNN
        if self.neural_type == 'lstm':
            out, h   = self.recurrent_layer_0(out, (h, h))
        else:
            out, h   = self.recurrent_layer_0(out, h)

        out, _     = rnn_utils.pad_packed_sequence(out, batch_first=False)
        padding    = (0, 0, 0, 0, 0, self.input_sequence_size - out.size(0))
        out        = F.pad(out, padding, "constant", 0)
        out        = out.permute(1, 0, 2)

        out    = torch.flatten(out, start_dim=1)
        out    = self.fully_connected_layer_0(out)
        out    = self.output_activation(out)

        return out 




    def get_activation(self,  activation):
        activations = {
            'relu': nn.ReLU(),
            'leaky_relu': nn.LeakyReLU(),
            'sigmoid': nn.Sigmoid(),
            'tanh': nn.Tanh()
        }
        return activations[ activation.lower()]

    def initialize_weights(self, initializer):
        initializers = {
            'random_uniform': nn.init.uniform_,
            'random_normal': nn.init.normal_,
            'glorot_uniform': nn.init.xavier_uniform_,
            'glorot_normal': nn.init.xavier_normal_,
            'xavier_uniform': nn.init.xavier_uniform_,
            'xavier_normal': nn.init.xavier_normal_
        }
        initializer = initializers[initializer.lower()]
        for layer in self.children():
            if isinstance(layer, nn.Linear):
                initializer(layer.weight)

# Function for updating pre-activated actions using error backprop

In [5]:

def update_pre_activated_actions(epoch_for_deducing,
                                 model_loader,
                                 state,
                                 pre_activated_actions,
                                 desired_reward,
                                 beta):

    model_loader_copy = copy.deepcopy(model_loader)

    for epoch in range(epoch_for_deducing):

        random.shuffle(model_loader_copy)

        for model in model_loader_copy:

            actions = torch.sigmoid(pre_activated_actions)

            model.train()
            actions = actions.clone().detach().requires_grad_(True)
            if actions.grad is not None:
                actions.grad.zero_()
            for param in model.parameters():
                param.requires_grad = False

            loss_function = model.loss_function
            output        = model(state, actions, padding_mask=None)
            total_loss    = loss_function(output, desired_reward)
            total_loss.backward() # get grad

            pre_activated_actions -= actions.grad * (1 - actions) * actions * beta # update params

    return pre_activated_actions




# Function for updating model using error backprop

Elastic weight consolidation:
https://arxiv.org/pdf/1612.00796

In [6]:
# traditional EWC
def EWC_loss(EWC_lambda, model, prev_model, prev_gradient_matrix):
    model_param      = model.state_dict()
    prev_model_param = prev_model.state_dict()
    loss = 0
    for name, param in model.named_parameters():
        diagonal_fisher_matrix = prev_gradient_matrix[name] ** 2
        param_diff             = (model_param[name] - prev_model_param[name]) ** 2
        loss                  += (diagonal_fisher_matrix * param_diff).sum()
    return EWC_lambda * loss




def update_model(epoch_for_learning,
                 model,
                 train_loader,
                 train_loader_,
                 prev_model,
                 prev_gradient_matrix,
                 EWC_lambda):

    for epoch in range(epoch_for_learning):

        for state, actions, reward, next_state, padding_mask in train_loader:

            model.train()
            selected_optimizer = model.selected_optimizer
            selected_optimizer.zero_grad()

            loss_function        = model.loss_function
            output               = model(state, actions, padding_mask)
            total_loss           = loss_function(output, reward) 
            total_loss          += EWC_loss(EWC_lambda, model, prev_model, prev_gradient_matrix)
            total_loss.backward()     # get grad

            selected_optimizer.step() # update params




    # training and updating present gradient_matrix
    gradient_matrix = {name: torch.zeros_like(param) for name, param in model.named_parameters()}

    for state, actions, reward, next_state, padding_mask in train_loader_:

        model.train()
        selected_optimizer = model.selected_optimizer
        selected_optimizer.zero_grad()

        loss_function        = model.loss_function
        output               = model(state, actions, padding_mask)
        total_loss           = loss_function(output, reward) 
        total_loss.backward()        # get grad

        for name, param in model.named_parameters():
            gradient_matrix[name] += param.grad

    gradient_matrix = {name: param / len(train_loader) for name, param in gradient_matrix.items()}




    return model, gradient_matrix

# Function for re-initializing action value in each step

In [7]:
def initialize_pre_activated_actions(init, noise_t, noise_r, shape):
    input = 0
    if   init == "random_uniform":
        for _ in range(noise_t):
            input += np.array([  np.random.uniform(low=0, high=1, size=shape)    ]) * noise_r
    elif init == "random_normal":
        for _ in range(noise_t):
            input += np.array([  np.random.normal(loc=0.0, scale= 1, size= shape )    ])  * noise_r
    elif init == "glorot_uniform":
        for _ in range(noise_t):
            limit = np.sqrt(6 / (shape[1] + shape[1]))
            input += np.array([  np.random.uniform(low=-limit, high=limit, size=shape)    ])  * noise_r
    elif init == "glorot_normal":
        for _ in range(noise_t):
            input += np.array([  np.random.normal(loc=0.0, scale= np.sqrt(2 / (shape[1] + shape[1])) , size= shape )    ])  * noise_r
    elif init == "xavier_uniform":
        for _ in range(noise_t):
            limit = np.sqrt(6 / (shape[1] + shape[1]))
            input += np.array([  np.random.uniform(low=-limit, high=limit, size=shape)    ])  * noise_r
    elif init == "xavier_normal":
        for _ in range(noise_t):
            input += np.array([  np.random.normal(loc=0.0, scale= np.sqrt(2 / (shape[1] + shape[1])) , size= shape )    ])  * noise_r
    return input

# Function for vectorizing
Crucial function regarding how you manipulate or shape your state, action and reward

- It's essential to choose between immediate rewards and summed rewards for training your agent. If the current state doesn't encapsulate all crucial past information, using immediate rewards is advisable. This approach prevents confusion caused by varying summed rewards for the same state.

- As for reward shaping, it is recommended to increase your reward upper and decrease your reward lower bound.

In [8]:

def quantifying(array_size, init, interval, input):
    array = np.zeros(array_size)
    index = int( (input - init) // interval + 1)
    if index >= 0:
        array[ : index] = 1
    return array

def vectorizing_state(state):      # Reminder: change this for your specific task ⚠️⚠️⚠️
    state_0 = quantifying(100, -2.5, 0.050, state[0])
    state_1 = quantifying(100, -3.75, 0.075, state[1])
    state_2 = quantifying(100, -0.375, 0.0075, state[2])
    state_3 = quantifying(100, -3.75, 0.075, state[3])
    state_4 = quantifying(100, 0, 10, 0)
    state   = np.atleast_2d(np.concatenate((state_0, state_1, state_2, state_3, state_4)))
    return state

def vectorizing_action(action_size, action_argmax):  # Reminder: change this for your specific task ⚠️⚠️⚠️
    return np.eye(action_size)[action_argmax]

def vectorizing_reward(state, reward, summed_reward, done, reward_size):       # Reminder: change this for your specific task ⚠️⚠️⚠️
    if done:
        reward = np.zeros(reward_size)
    else:
        reward = np.ones(reward_size)
    return reward


# Function for sequentializing state, action and reward

In [9]:
# I don't know why the following multi-processing does not work...T_T (I suck) But I kept it just for later investigation.

def process_time_size(params):
    state_list, action_list, reward_list, time_size, time = params

    sequentialized_state_list      = []
    sequentialized_action_list     = []
    sequentialized_reward_list     = []
    sequentialized_next_state_list = []

    if time_size > len(state_list[:-1]):
        time_size = len(state_list[:-1])
    else:
      pass

    time_size_ = time_size
    for i in range(len(reward_list[:])):
        sequentialized_state_list.append(       state_list [ i ] )
        sequentialized_action_list.append(      action_list[ i : i+time_size_]  )
        sequentialized_reward_list.append(      reward_list[ i + len(action_list[i:i+time_size_]) - 1 ]  )
        sequentialized_next_state_list.append(  state_list [ i + len(action_list[i:i+time_size_])     ]  )

    # a more sophisticated method
    # for j in range(time_size):
    #     time_size_ = j+1
    #     if time_size_== 1:
    #         for i in range(len(reward_list[:])):
    #             sequentialized_state_list.append(       state_list [ i ] )
    #             sequentialized_action_list.append(      action_list[ i : i+time_size_]  )
    #             sequentialized_reward_list.append(      reward_list[ i + len(action_list[i:i+time_size_]) - 1 ]  )
    #             sequentialized_next_state_list.append(  state_list [ i + len(action_list[i:i+time_size_])     ]  )
    #     else:
    #         for i in range(len(reward_list[:-time_size_+1])):
    #             sequentialized_state_list.append(       state_list [ i ] )
    #             sequentialized_action_list.append(      action_list[ i : i+time_size_]  )
    #             sequentialized_reward_list.append(      reward_list[ i + len(action_list[i:i+time_size_]) - 1 ]  )
    #             sequentialized_next_state_list.append(  state_list [ i + len(action_list[i:i+time_size_])     ]  )

    return (sequentialized_state_list, sequentialized_action_list, sequentialized_reward_list, sequentialized_next_state_list)




def sequentialize(state_list, action_list, reward_list, time_size):
    # Prepare the parameters for each process
    params_list = [(state_list, action_list, reward_list, time_size, time) for time in range(time_size)]

    # Use multiprocessing Pool to process chunks in parallel
    with mp.Pool(processes=mp.cpu_count()) as pool:
        results = pool.map(process_time_size, params_list)

    # Aggregate results
    sequentialized_state_list = []
    sequentialized_action_list = []
    sequentialized_reward_list = []
    sequentialized_next_state_list = []

    for result in results:
        s_states, s_actions, s_rewards, s_next_states = result
        sequentialized_state_list.extend(s_states)
        sequentialized_action_list.extend(s_actions)
        sequentialized_reward_list.extend(s_rewards)
        sequentialized_next_state_list.extend(s_next_states)

    return sequentialized_state_list, sequentialized_action_list, sequentialized_reward_list, sequentialized_next_state_list


In [10]:
def sequentialize(state_list, action_list, reward_list, time_size):

    sequentialized_state_list       = []
    sequentialized_action_list      = []
    sequentialized_reward_list      = []
    sequentialized_next_state_list  = []

    if time_size > len(state_list[:-1]):
        time_size = len(state_list[:-1])
    else:
      pass

    time_size_ = time_size
    for i in range(len(reward_list[:])):
        sequentialized_state_list.append(       state_list [ i ] )
        sequentialized_action_list.append(      action_list[ i : i+time_size_]  )
        sequentialized_reward_list.append(      reward_list[ i + len(action_list[i:i+time_size_]) - 1 ]  )
        sequentialized_next_state_list.append(  state_list [ i + len(action_list[i:i+time_size_])     ]  )

    # a more sophisticated method
    # for j in range(time_size):
    #     time_size_ = j+1
    #     if time_size_== 1:
    #         for i in range(len(reward_list[:])):
    #             sequentialized_state_list.append(       state_list [ i ] )
    #             sequentialized_action_list.append(      action_list[ i : i+time_size_]  )
    #             sequentialized_reward_list.append(      reward_list[ i + len(action_list[i:i+time_size_]) - 1 ]  )
    #             sequentialized_next_state_list.append(  state_list [ i + len(action_list[i:i+time_size_])     ]  )
    #     else:
    #         for i in range(len(reward_list[:-time_size_+1])):
    #             sequentialized_state_list.append(       state_list [ i ] )
    #             sequentialized_action_list.append(      action_list[ i : i+time_size_]  )
    #             sequentialized_reward_list.append(      reward_list[ i + len(action_list[i:i+time_size_]) - 1 ]  )
    #             sequentialized_next_state_list.append(  state_list [ i + len(action_list[i:i+time_size_])     ]  )

    return sequentialized_state_list, sequentialized_action_list, sequentialized_reward_list, sequentialized_next_state_list

# Function for data preparation

In [11]:
def obtain_tensor_from_list(sequentialized_state_list,
                            sequentialized_actions_list,
                            sequentialized_reward_list,
                            sequentialized_next_state_list,
                            device,
                            time_size,
                            mask_value,
                            num_heads):

    # Convert lists to tensors directly on the desired device and data type
    state_tensor = torch.tensor(np.array(sequentialized_state_list), dtype=torch.float).to(device)
    reward_tensor = torch.tensor(np.array(sequentialized_reward_list), dtype=torch.float).to(device)
    next_state_tensor = torch.tensor(np.array(sequentialized_next_state_list), dtype=torch.float).to(device)

    # Pad and stack actions_tensor efficiently
    actions_list = []
    for arr in sequentialized_actions_list:
        tensor_arr = torch.tensor(np.array(arr), dtype=torch.float).to(device)
        # Pad tensor only once per tensor
        if tensor_arr.size(0) < time_size:
            padded_arr = F.pad(tensor_arr,
                               (0, 0, 0, time_size - tensor_arr.size(0)),
                               mode='constant',
                               value=mask_value)
        else:
            padded_arr = tensor_arr
        actions_list.append(padded_arr)
    actions_tensor = torch.stack(actions_list).to(device)

    # Compute row_mask and padding_mask efficiently
    row_mask = (actions_tensor == mask_value).all(dim=-1)
    padding_mask = row_mask.to(dtype=torch.bool)
    padding_mask = padding_mask.to(device)

    return state_tensor, actions_tensor, reward_tensor, next_state_tensor, padding_mask

In [12]:
def update_error(model,
                 train_loader_,
                 device):


    for state, actions, reward, next_state, padding_mask in train_loader_:

        model.train()
        selected_optimizer = model.selected_optimizer
        selected_optimizer.zero_grad()

        loss_function        = model.loss_function_
        output               = model(state, actions, padding_mask)
        total_loss           = loss_function(output, reward).detach().to(device)
        total_loss           = torch.sum(torch.abs(total_loss), axis=1)

    return total_loss

In [13]:
def obtain_tensor_according_to_TD_error(state_tensor, actions_tensor, reward_tensor, next_state_tensor, padding_mask, index_list):

    # Use advanced indexing to select elements based on random_index
    state_tensor = state_tensor[index_list]
    actions_tensor = actions_tensor[index_list]
    reward_tensor = reward_tensor[index_list]
    next_state_tensor = next_state_tensor[index_list]
    padding_mask = padding_mask[index_list]

    return state_tensor, actions_tensor, reward_tensor, next_state_tensor, padding_mask

In [14]:
def save_performance_to_csv(performance_log, filename='performance_log.csv'):
    with open(filename, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['Episode', 'Summed_Reward'])
        writer.writerows(performance_log)

# Control board

Crucial variables regarding how your agent will learn in the environment

- In some environments, it is crucial to increase your "max_steps_for_each_episode" so that your agent can "live long enough" to obatin some better rewards to gradually and heuristically learn better strategy.



In [15]:
game_name = 'CartPole-v1'                # Reminder: change this for your specific task ⚠️⚠️⚠️
max_steps_for_each_episode = 2000        # Reminder: change this for your specific task ⚠️⚠️⚠️


ensemble_size = 5                        # Reminder: change this value to see the impact of MWM-SGD ◀️◀️◀️
state_size =  500                        # Reminder: change this for your specific task ⚠️⚠️⚠️
hidden_size = 100                        # Reminder: change this for your specific task ⚠️⚠️⚠️
action_size = 2                          # Reminder: change this for your specific task ⚠️⚠️⚠️
time_size = 15                           # Reminder: change this for your specific task ⚠️⚠️⚠️
reward_size = 100                        # Reminder: change this for your specific task ⚠️⚠️⚠️
neural_type = 'gru'                      # rnn gru lstm; att
num_layers = 2                           # 1, 2, 3, 4, etc.
num_heads = None                         # None for non-attention; should be able to divide hidden_size for attention
hidden_activation = 'tanh'               # relu leaky_relu sigmoid tanh
output_activation = 'sigmoid'            # relu leaky_relu sigmoid tanh

init = "random_normal"                   # random_normal random_uniform xavier_normal xavier_uniform  glorot_normal  glorot_uniform
opti = 'sgd'                             # adam sgd rmsprop
loss = 'mean_squared_error'              # mean_squared_error  binary_crossentropy
drop_rate = 0.001                       # Reminder: change this value to see the impact of drop-out ◀️◀️◀️
alpha = 0.1                              # learning rate
epoch_for_learning = 10                  # Reminder: change this for your specific task ⚠️⚠️⚠️
mask_value = sys.maxsize                 # mask value
batch_size = 1
load_pre_model = False


noise_t = 1               # gaussian noise
noise_r = 0.1             # smaller value encourages agent to exploit experience while larger value encourages agent to explore
beta = 0.1
epoch_for_deducing =  int(100/ensemble_size)    # Reminder: change this for your specific task ⚠️⚠️⚠️


episode_for_training = 100000
batch_size_for_offline_learning = 10     # batch size for batch offline learning, Reminder: change this for your specific task ⚠️⚠️⚠️
PER_sample_size = 10000                  # prioritized experience replay samples, Reminder: change this for your specific task ⚠️⚠️⚠️
EWC_lambda = 1                           # elastic weight control lambda, Reminder: change this for your specific task ⚠️⚠️⚠️


episode_for_testing = 100
render_for_human = True


suffix                      = f"game={game_name}_type={neural_type}_ensemble={ensemble_size:05d}_drop={drop_rate:.5f}_learn={epoch_for_learning:05d}_interval={batch_size_for_offline_learning:05d}_deduce={epoch_for_deducing:05d}_lambda={EWC_lambda:05d}"
directory                   = f'/content/result/{game_name}/'
model_directory             = f'/content/result/{game_name}/model_{suffix}'+'_%s.h5'
performance_log_directory   = f'/content/result/{game_name}/performace_log_{suffix}.csv'

# Deducing > Learning


Creating or loading models

In [16]:

if not os.path.exists(directory):
    os.makedirs(directory)

if load_pre_model == False:

    model_loader = []
    for _ in range(ensemble_size):
        model = build_model(state_size,
                            hidden_size,
                            action_size,
                            time_size,
                            reward_size,
                            neural_type,
                            num_layers,
                            num_heads,
                            hidden_activation,
                            output_activation,
                            init,
                            opti,
                            loss,
                            drop_rate,
                            alpha,
                            mask_value)
        model.to(device)
        model_loader.append(model)

elif load_pre_model == True:

    model_loader = []
    for _ in range(ensemble_size):
        model = build_model(state_size,
                            hidden_size,
                            action_size,
                            time_size,
                            reward_size,
                            neural_type,
                            num_layers,
                            num_heads,
                            hidden_activation,
                            output_activation,
                            init,
                            opti,
                            loss,
                            drop_rate,
                            alpha,
                            mask_value)
        model.to(device)
        model_loader.append(model)

    for i in range(len(model_loader)):
        model_loader[i].load_state_dict(torch.load( model_directory  % i ))


Creating Streams

In [17]:
stream_list = []
for _ in range(ensemble_size):
    stream  = torch.cuda.Stream()
    stream_list.append(stream)


Creating intial gradient matrices

In [18]:

prev_model_loader = copy.deepcopy(model_loader)

prev_gradient_matrix_loader = []
for model in model_loader:
    gradient_matrix = {name: torch.zeros_like(param) for name, param in model.named_parameters()}
    prev_gradient_matrix_loader.append( gradient_matrix )


Creating desired reward

In [19]:
desired_reward = np.atleast_2d(np.ones(reward_size))
desired_reward = torch.tensor(desired_reward, dtype=torch.float).to(device)

Putting all the previous works into play

In [20]:

performance_log = []
performance_log.append([0, 0])

for training_episode in tqdm(range(episode_for_training)):

    # initializing short term experience replay buffer
    short_term_state_list  = []
    short_term_action_list = []
    short_term_reward_list = []

    # initializing environment
    env           = gym.make(game_name)
    env._max_episode_steps = max_steps_for_each_episode
    state         = env.reset()
    summed_reward = 0

    # observing state
    state = vectorizing_state(state)
    short_term_state_list.append(state[0])

    done = False
    while not done:

        # initializing and updating actions
        state                 = torch.tensor(state, dtype=torch.float).to(device)
        pre_activated_actions = initialize_pre_activated_actions(init, noise_t, noise_r, (time_size, action_size))
        pre_activated_actions = torch.tensor(pre_activated_actions, dtype=torch.float).to(device)
        pre_activated_actions = update_pre_activated_actions(epoch_for_deducing,
                                                             model_loader,
                                                             state,
                                                             pre_activated_actions,
                                                             desired_reward,
                                                             beta)
        action_argmax    = int(torch.argmax(pre_activated_actions[0, 0]))
        action           = vectorizing_action(action_size, action_argmax)
        short_term_action_list.append(action)

        # executing action
        state, reward, done, info = env.step(action_argmax)

        # observing actual reward
        summed_reward += reward
        reward = vectorizing_reward(state, reward, summed_reward, done, reward_size)
        short_term_reward_list.append(reward)

        # observing state
        state = vectorizing_state(state)
        short_term_state_list.append(state[0])

        if done:
            print(f'Episode {training_episode+1}: Summed_Reward = {summed_reward}')
            performance_log.append([training_episode+1, summed_reward])
            save_performance_to_csv(performance_log, performance_log_directory)
            break




    env.close()




    # sequentialize short term experience replay buffer and then save it to long term experience replay buffer
    short_term_sequentialized_state_list, \
    short_term_sequentialized_actions_list, \
    short_term_sequentialized_reward_list, \
    short_term_sequentialized_next_state_list = sequentialize(short_term_state_list, short_term_action_list, short_term_reward_list, time_size )
    short_term_sequentialized_state_tensor,\
    short_term_sequentialized_actions_tensor,\
    short_term_sequentialized_reward_tensor,\
    short_term_sequentialized_next_state_tensor,\
    short_term_sequentialized_padding_mask = obtain_tensor_from_list(short_term_sequentialized_state_list,
                                                                     short_term_sequentialized_actions_list,
                                                                     short_term_sequentialized_reward_list,
                                                                     short_term_sequentialized_next_state_list,
                                                                     device,
                                                                     time_size,
                                                                     mask_value,
                                                                     num_heads)
    if training_episode==0:
        long_term_sequentialized_state_tensor      = copy.deepcopy(short_term_sequentialized_state_tensor)
        long_term_sequentialized_actions_tensor    = copy.deepcopy(short_term_sequentialized_actions_tensor)
        long_term_sequentialized_reward_tensor     = copy.deepcopy(short_term_sequentialized_reward_tensor)
        long_term_sequentialized_next_state_tensor = copy.deepcopy(short_term_sequentialized_next_state_tensor)
        long_term_sequentialized_padding_mask      = copy.deepcopy(short_term_sequentialized_padding_mask)
    else:
        long_term_sequentialized_state_tensor      = torch.cat((long_term_sequentialized_state_tensor     , short_term_sequentialized_state_tensor     ), dim=0)
        long_term_sequentialized_actions_tensor    = torch.cat((long_term_sequentialized_actions_tensor   , short_term_sequentialized_actions_tensor   ), dim=0)
        long_term_sequentialized_reward_tensor     = torch.cat((long_term_sequentialized_reward_tensor    , short_term_sequentialized_reward_tensor    ), dim=0)
        long_term_sequentialized_next_state_tensor = torch.cat((long_term_sequentialized_next_state_tensor, short_term_sequentialized_next_state_tensor), dim=0)
        long_term_sequentialized_padding_mask      = torch.cat((long_term_sequentialized_padding_mask     , short_term_sequentialized_padding_mask     ), dim=0)
        



    # batch offline learning
    if (training_episode+1) % batch_size_for_offline_learning == 0:




        # creating samples for prioritized experience replay buffer:
        dataset      = TensorDataset(long_term_sequentialized_state_tensor     ,
                                     long_term_sequentialized_actions_tensor   ,
                                     long_term_sequentialized_reward_tensor    ,
                                     long_term_sequentialized_next_state_tensor,
                                     long_term_sequentialized_padding_mask     )
        data_loader_ = DataLoader(dataset, batch_size = len(dataset), shuffle=False)
        total_temporal_difference_error = 0
        for i, model in enumerate(model_loader):
            with torch.cuda.stream(stream_list[i]):
                temporal_difference_error        = update_error(model, 
                                                                data_loader_,
                                                                device)
                total_temporal_difference_error += temporal_difference_error
        torch.cuda.synchronize()
        total_temporal_difference_error = total_temporal_difference_error.cpu().numpy()
        total_temporal_difference_error = total_temporal_difference_error / np.sum(total_temporal_difference_error)
        index_list                      = np.random.choice(range(len(dataset)), 
                                                           size=min(PER_sample_size, len(dataset)), 
                                                           p=total_temporal_difference_error, 
                                                           replace=False)
        selected_sequentialized_state_tensor,\
        selected_sequentialized_actions_tensor,\
        selected_sequentialized_reward_tensor,\
        selected_sequentialized_next_state_tensor,\
        selected_sequentialized_padding_mask = obtain_tensor_according_to_TD_error(long_term_sequentialized_state_tensor     ,
                                                                                   long_term_sequentialized_actions_tensor   ,
                                                                                   long_term_sequentialized_reward_tensor    ,
                                                                                   long_term_sequentialized_next_state_tensor,
                                                                                   long_term_sequentialized_padding_mask     ,
                                                                                   index_list)
        



        # starting learning with elastic weight control
        dataset      = TensorDataset(selected_sequentialized_state_tensor,
                                     selected_sequentialized_actions_tensor,
                                     selected_sequentialized_reward_tensor,
                                     selected_sequentialized_next_state_tensor,
                                     selected_sequentialized_padding_mask)
        data_loader  = DataLoader(dataset, batch_size = batch_size, shuffle=True)
        data_loader_ = DataLoader(dataset, batch_size = len(dataset), shuffle=False)
        gradient_matrix_loader = [''] * len(model_loader)
        for i, model in enumerate(model_loader):
            with torch.cuda.stream(stream_list[i]):
                model, gradient_matrix    = update_model(epoch_for_learning,
                                                         model,
                                                         data_loader,
                                                         data_loader_,
                                                         prev_model_loader[i],
                                                         prev_gradient_matrix_loader[i],
                                                         EWC_lambda)
                model_loader[i]           = model
                gradient_matrix_loader[i] = gradient_matrix
        torch.cuda.synchronize()
        prev_model_loader           = copy.deepcopy(model_loader)
        prev_gradient_matrix_loader = copy.deepcopy(gradient_matrix_loader)




        # saving:
        for i in range(len(model_loader)):
            torch.save(model_loader[i].state_dict(), model_directory % i)


        gc.collect()
        torch.cuda.empty_cache()

  deprecation(
  deprecation(
  result = _VF.gru(input, batch_sizes, hx, self._flat_weights, self.bias,
  if not isinstance(terminated, (bool, np.bool8)):
  0%|          | 1/100000 [00:15<440:43:57, 15.87s/it]

Episode 1: Summed_Reward = 20.0


  0%|          | 2/100000 [00:54<805:28:51, 29.00s/it]

Episode 2: Summed_Reward = 41.0


  0%|          | 3/100000 [01:07<603:09:42, 21.71s/it]

Episode 3: Summed_Reward = 23.0


  0%|          | 4/100000 [01:14<445:56:00, 16.05s/it]

Episode 4: Summed_Reward = 14.0


  0%|          | 5/100000 [01:28<428:25:24, 15.42s/it]

Episode 5: Summed_Reward = 25.0


  0%|          | 6/100000 [01:41<398:08:00, 14.33s/it]

Episode 6: Summed_Reward = 21.0


  0%|          | 7/100000 [01:49<343:15:47, 12.36s/it]

Episode 7: Summed_Reward = 16.0


  0%|          | 8/100000 [01:58<314:30:05, 11.32s/it]

Episode 8: Summed_Reward = 15.0


  0%|          | 9/100000 [02:37<555:23:58, 20.00s/it]

Episode 9: Summed_Reward = 47.0
Episode 10: Summed_Reward = 24.0


  0%|          | 11/100000 [05:14<1267:02:28, 45.62s/it]

Episode 11: Summed_Reward = 37.0


  0%|          | 12/100000 [05:29<1005:23:46, 36.20s/it]

Episode 12: Summed_Reward = 21.0


  0%|          | 13/100000 [05:38<778:09:28, 28.02s/it] 

Episode 13: Summed_Reward = 13.0


  0%|          | 14/100000 [05:51<652:09:26, 23.48s/it]

Episode 14: Summed_Reward = 19.0


  0%|          | 15/100000 [05:58<511:20:21, 18.41s/it]

Episode 15: Summed_Reward = 10.0


  0%|          | 16/100000 [06:05<420:25:15, 15.14s/it]

Episode 16: Summed_Reward = 11.0


  0%|          | 17/100000 [06:12<351:48:42, 12.67s/it]

Episode 17: Summed_Reward = 10.0


  0%|          | 18/100000 [06:19<300:29:19, 10.82s/it]

Episode 18: Summed_Reward = 9.0


  0%|          | 19/100000 [06:31<318:28:17, 11.47s/it]

Episode 19: Summed_Reward = 19.0
Episode 20: Summed_Reward = 15.0


  0%|          | 21/100000 [09:45<1348:55:29, 48.57s/it]

Episode 21: Summed_Reward = 43.0


  0%|          | 22/100000 [09:54<1018:54:43, 36.69s/it]

Episode 22: Summed_Reward = 16.0


  0%|          | 23/100000 [10:00<763:42:10, 27.50s/it] 

Episode 23: Summed_Reward = 11.0


  0%|          | 24/100000 [10:09<611:47:46, 22.03s/it]

Episode 24: Summed_Reward = 17.0


  0%|          | 25/100000 [10:22<539:46:50, 19.44s/it]

Episode 25: Summed_Reward = 25.0


  0%|          | 26/100000 [10:30<442:19:01, 15.93s/it]

Episode 26: Summed_Reward = 15.0


  0%|          | 27/100000 [10:43<419:11:28, 15.09s/it]

Episode 27: Summed_Reward = 24.0


  0%|          | 28/100000 [10:53<372:45:29, 13.42s/it]

Episode 28: Summed_Reward = 18.0


  0%|          | 29/100000 [11:00<323:26:07, 11.65s/it]

Episode 29: Summed_Reward = 14.0
Episode 30: Summed_Reward = 9.0


  0%|          | 31/100000 [14:54<1560:03:07, 56.18s/it]

Episode 31: Summed_Reward = 26.0


  0%|          | 32/100000 [15:03<1167:14:45, 42.03s/it]

Episode 32: Summed_Reward = 13.0


  0%|          | 33/100000 [15:18<944:51:39, 34.03s/it] 

Episode 33: Summed_Reward = 29.0


  0%|          | 34/100000 [15:28<739:43:32, 26.64s/it]

Episode 34: Summed_Reward = 17.0


  0%|          | 35/100000 [15:35<574:59:58, 20.71s/it]

Episode 35: Summed_Reward = 12.0


  0%|          | 36/100000 [16:01<624:57:20, 22.51s/it]

Episode 36: Summed_Reward = 50.0


  0%|          | 37/100000 [16:09<499:35:07, 17.99s/it]

Episode 37: Summed_Reward = 14.0


  0%|          | 38/100000 [16:30<523:02:43, 18.84s/it]

Episode 38: Summed_Reward = 39.0


  0%|          | 39/100000 [16:52<550:28:54, 19.83s/it]

Episode 39: Summed_Reward = 42.0
Episode 40: Summed_Reward = 20.0


  0%|          | 41/100000 [22:10<2151:42:13, 77.49s/it] 

Episode 41: Summed_Reward = 16.0


  0%|          | 42/100000 [22:31<1681:18:17, 60.55s/it]

Episode 42: Summed_Reward = 34.0


  0%|          | 43/100000 [22:48<1315:56:20, 47.39s/it]

Episode 43: Summed_Reward = 29.0


  0%|          | 44/100000 [22:55<984:12:20, 35.45s/it] 

Episode 44: Summed_Reward = 14.0


  0%|          | 45/100000 [23:08<797:09:35, 28.71s/it]

Episode 45: Summed_Reward = 24.0


  0%|          | 46/100000 [23:19<645:14:11, 23.24s/it]

Episode 46: Summed_Reward = 20.0


  0%|          | 47/100000 [23:29<533:36:06, 19.22s/it]

Episode 47: Summed_Reward = 18.0


  0%|          | 48/100000 [23:35<425:09:21, 15.31s/it]

Episode 48: Summed_Reward = 12.0


  0%|          | 49/100000 [23:48<404:16:44, 14.56s/it]

Episode 49: Summed_Reward = 24.0
Episode 50: Summed_Reward = 12.0


  0%|          | 51/100000 [30:20<2557:20:23, 92.11s/it] 

Episode 51: Summed_Reward = 47.0


  0%|          | 52/100000 [30:36<1922:52:04, 69.26s/it]

Episode 52: Summed_Reward = 28.0


  0%|          | 53/100000 [30:49<1449:53:01, 52.22s/it]

Episode 53: Summed_Reward = 22.0


  0%|          | 54/100000 [31:11<1197:41:29, 43.14s/it]

Episode 54: Summed_Reward = 42.0


  0%|          | 55/100000 [31:23<940:22:12, 33.87s/it] 

Episode 55: Summed_Reward = 23.0


  0%|          | 56/100000 [31:31<722:06:08, 26.01s/it]

Episode 56: Summed_Reward = 14.0


  0%|          | 57/100000 [31:58<728:20:09, 26.24s/it]

Episode 57: Summed_Reward = 51.0


  0%|          | 58/100000 [32:13<640:03:28, 23.06s/it]

Episode 58: Summed_Reward = 30.0


  0%|          | 59/100000 [32:21<514:17:56, 18.53s/it]

Episode 59: Summed_Reward = 15.0
Episode 60: Summed_Reward = 24.0


  0%|          | 61/100000 [40:10<3003:17:51, 108.18s/it]

Episode 61: Summed_Reward = 13.0


  0%|          | 62/100000 [40:43<2376:42:50, 85.61s/it] 

Episode 62: Summed_Reward = 61.0


  0%|          | 63/100000 [40:49<1711:14:49, 61.64s/it]

Episode 63: Summed_Reward = 10.0


  0%|          | 64/100000 [41:00<1291:27:07, 46.52s/it]

Episode 64: Summed_Reward = 18.0


  0%|          | 65/100000 [41:06<958:12:06, 34.52s/it] 

Episode 65: Summed_Reward = 10.0


  0%|          | 66/100000 [41:17<758:48:42, 27.34s/it]

Episode 66: Summed_Reward = 17.0


  0%|          | 67/100000 [41:24<592:42:14, 21.35s/it]

Episode 67: Summed_Reward = 12.0


  0%|          | 68/100000 [41:31<469:14:15, 16.90s/it]

Episode 68: Summed_Reward = 11.0


  0%|          | 69/100000 [41:44<436:54:34, 15.74s/it]

Episode 69: Summed_Reward = 24.0
Episode 70: Summed_Reward = 11.0


  0%|          | 71/100000 [50:31<3304:44:11, 119.06s/it]

Episode 71: Summed_Reward = 11.0


  0%|          | 72/100000 [50:43<2406:03:32, 86.68s/it] 

Episode 72: Summed_Reward = 20.0


  0%|          | 73/100000 [50:54<1776:53:53, 64.02s/it]

Episode 73: Summed_Reward = 21.0


  0%|          | 74/100000 [51:01<1305:41:58, 47.04s/it]

Episode 74: Summed_Reward = 14.0


  0%|          | 75/100000 [51:09<976:43:36, 35.19s/it] 

Episode 75: Summed_Reward = 14.0


  0%|          | 76/100000 [51:17<757:17:55, 27.28s/it]

Episode 76: Summed_Reward = 16.0


  0%|          | 77/100000 [51:30<634:23:06, 22.86s/it]

Episode 77: Summed_Reward = 24.0


  0%|          | 78/100000 [51:47<588:14:59, 21.19s/it]

Episode 78: Summed_Reward = 33.0


  0%|          | 79/100000 [51:54<464:47:03, 16.75s/it]

Episode 79: Summed_Reward = 12.0
Episode 80: Summed_Reward = 29.0


  0%|          | 81/100000 [1:01:51<3735:40:12, 134.59s/it]

Episode 81: Summed_Reward = 17.0


  0%|          | 82/100000 [1:02:11<2783:29:27, 100.29s/it]

Episode 82: Summed_Reward = 33.0


  0%|          | 83/100000 [1:02:22<2032:26:24, 73.23s/it] 

Episode 83: Summed_Reward = 14.0


  0%|          | 84/100000 [1:02:40<1580:04:49, 56.93s/it]

Episode 84: Summed_Reward = 29.0


  0%|          | 85/100000 [1:02:52<1200:41:00, 43.26s/it]

Episode 85: Summed_Reward = 19.0


  0%|          | 86/100000 [1:03:12<1005:51:31, 36.24s/it]

Episode 86: Summed_Reward = 37.0


  0%|          | 87/100000 [1:03:21<780:57:13, 28.14s/it] 

Episode 87: Summed_Reward = 18.0


  0%|          | 88/100000 [1:03:29<612:56:59, 22.09s/it]

Episode 88: Summed_Reward = 15.0


  0%|          | 89/100000 [1:03:37<500:24:54, 18.03s/it]

Episode 89: Summed_Reward = 16.0
Episode 90: Summed_Reward = 35.0


  0%|          | 91/100000 [1:15:11<4344:51:21, 156.56s/it]

Episode 91: Summed_Reward = 42.0


  0%|          | 92/100000 [1:15:22<3133:19:57, 112.90s/it]

Episode 92: Summed_Reward = 18.0


  0%|          | 93/100000 [1:15:35<2302:34:05, 82.97s/it] 

Episode 93: Summed_Reward = 21.0


  0%|          | 94/100000 [1:15:45<1699:25:48, 61.24s/it]

Episode 94: Summed_Reward = 18.0


  0%|          | 95/100000 [1:15:56<1281:08:31, 46.16s/it]

Episode 95: Summed_Reward = 20.0


  0%|          | 96/100000 [1:16:13<1032:37:09, 37.21s/it]

Episode 96: Summed_Reward = 31.0


  0%|          | 97/100000 [1:16:22<797:54:15, 28.75s/it] 

Episode 97: Summed_Reward = 17.0


  0%|          | 98/100000 [1:16:28<611:17:04, 22.03s/it]

Episode 98: Summed_Reward = 12.0


  0%|          | 99/100000 [1:16:53<634:30:07, 22.86s/it]

Episode 99: Summed_Reward = 46.0
Episode 100: Summed_Reward = 65.0


  0%|          | 101/100000 [1:29:55<4890:47:36, 176.25s/it]

Episode 101: Summed_Reward = 15.0


  0%|          | 102/100000 [1:30:02<3482:17:35, 125.49s/it]

Episode 102: Summed_Reward = 13.0


  0%|          | 103/100000 [1:30:10<2499:39:12, 90.08s/it] 

Episode 103: Summed_Reward = 13.0


  0%|          | 104/100000 [1:30:24<1868:11:01, 67.32s/it]

Episode 104: Summed_Reward = 26.0


  0%|          | 105/100000 [1:30:33<1382:39:49, 49.83s/it]

Episode 105: Summed_Reward = 16.0


  0%|          | 106/100000 [1:30:48<1095:17:25, 39.47s/it]

Episode 106: Summed_Reward = 27.0


  0%|          | 107/100000 [1:31:10<951:02:33, 34.27s/it] 

Episode 107: Summed_Reward = 41.0


  0%|          | 108/100000 [1:31:28<812:36:37, 29.29s/it]

Episode 108: Summed_Reward = 33.0


  0%|          | 109/100000 [1:31:37<645:12:55, 23.25s/it]

Episode 109: Summed_Reward = 17.0
Episode 110: Summed_Reward = 12.0


  0%|          | 111/100000 [1:45:43<5274:49:31, 190.10s/it]

Episode 111: Summed_Reward = 23.0


  0%|          | 112/100000 [1:45:51<3766:21:07, 135.74s/it]

Episode 112: Summed_Reward = 15.0


  0%|          | 113/100000 [1:45:57<2682:48:51, 96.69s/it] 

Episode 113: Summed_Reward = 10.0


  0%|          | 114/100000 [1:46:04<1934:55:42, 69.74s/it]

Episode 114: Summed_Reward = 12.0


  0%|          | 115/100000 [1:46:23<1511:52:36, 54.49s/it]

Episode 115: Summed_Reward = 36.0


  0%|          | 116/100000 [1:46:52<1304:17:17, 47.01s/it]

Episode 116: Summed_Reward = 55.0


  0%|          | 117/100000 [1:47:24<1174:58:56, 42.35s/it]

Episode 117: Summed_Reward = 60.0


  0%|          | 118/100000 [1:47:41<968:47:33, 34.92s/it] 

Episode 118: Summed_Reward = 33.0


  0%|          | 119/100000 [1:47:53<773:52:37, 27.89s/it]

Episode 119: Summed_Reward = 21.0
Episode 120: Summed_Reward = 42.0


  0%|          | 121/100000 [2:03:49<5982:24:45, 215.63s/it]

Episode 121: Summed_Reward = 24.0


  0%|          | 122/100000 [2:04:09<4350:18:19, 156.80s/it]

Episode 122: Summed_Reward = 37.0


  0%|          | 123/100000 [2:04:25<3178:42:41, 114.57s/it]

Episode 123: Summed_Reward = 31.0


  0%|          | 124/100000 [2:04:33<2290:24:39, 82.56s/it] 

Episode 124: Summed_Reward = 15.0


  0%|          | 125/100000 [2:04:44<1699:00:33, 61.24s/it]

Episode 125: Summed_Reward = 22.0


  0%|          | 126/100000 [2:04:52<1252:04:29, 45.13s/it]

Episode 126: Summed_Reward = 14.0


  0%|          | 127/100000 [2:05:02<959:44:39, 34.59s/it] 

Episode 127: Summed_Reward = 16.0


  0%|          | 128/100000 [2:05:38<971:54:48, 35.03s/it]

Episode 128: Summed_Reward = 66.0


  0%|          | 129/100000 [2:05:58<852:30:13, 30.73s/it]

Episode 129: Summed_Reward = 36.0
Episode 130: Summed_Reward = 20.0


  0%|          | 131/100000 [2:23:57<6778:28:47, 244.35s/it]

Episode 131: Summed_Reward = 52.0


  0%|          | 132/100000 [2:24:15<4894:14:53, 176.43s/it]

Episode 132: Summed_Reward = 30.0


  0%|          | 133/100000 [2:24:40<3631:10:34, 130.90s/it]

Episode 133: Summed_Reward = 43.0


  0%|          | 134/100000 [2:24:51<2633:34:30, 94.94s/it] 

Episode 134: Summed_Reward = 19.0


  0%|          | 135/100000 [2:24:57<1896:05:29, 68.35s/it]

Episode 135: Summed_Reward = 11.0


  0%|          | 136/100000 [2:25:06<1403:49:55, 50.61s/it]

Episode 136: Summed_Reward = 16.0


  0%|          | 137/100000 [2:25:20<1094:39:56, 39.46s/it]

Episode 137: Summed_Reward = 24.0


  0%|          | 138/100000 [2:25:34<884:03:45, 31.87s/it] 

Episode 138: Summed_Reward = 25.0


  0%|          | 139/100000 [2:26:02<854:43:16, 30.81s/it]

Episode 139: Summed_Reward = 50.0
Episode 140: Summed_Reward = 12.0


  0%|          | 141/100000 [2:45:04<7112:21:13, 256.41s/it]

Episode 141: Summed_Reward = 25.0


  0%|          | 142/100000 [2:45:36<5239:37:47, 188.89s/it]

Episode 142: Summed_Reward = 48.0


  0%|          | 143/100000 [2:45:47<3765:18:29, 135.75s/it]

Episode 143: Summed_Reward = 18.0


  0%|          | 144/100000 [2:46:01<2748:57:26, 99.11s/it] 

Episode 144: Summed_Reward = 22.0


  0%|          | 145/100000 [2:46:10<1998:55:06, 72.07s/it]

Episode 145: Summed_Reward = 16.0


  0%|          | 146/100000 [2:46:15<1443:24:16, 52.04s/it]

Episode 146: Summed_Reward = 9.0


  0%|          | 147/100000 [2:46:53<1324:31:00, 47.75s/it]

Episode 147: Summed_Reward = 66.0


  0%|          | 148/100000 [2:47:21<1156:32:23, 41.70s/it]

Episode 148: Summed_Reward = 49.0


  0%|          | 149/100000 [2:47:29<882:03:16, 31.80s/it] 

Episode 149: Summed_Reward = 15.0
Episode 150: Summed_Reward = 20.0


  0%|          | 151/100000 [3:08:14<7717:14:15, 278.24s/it] 

Episode 151: Summed_Reward = 27.0


  0%|          | 152/100000 [3:08:28<5516:54:34, 198.91s/it]

Episode 152: Summed_Reward = 24.0


  0%|          | 153/100000 [3:08:34<3918:43:43, 141.29s/it]

Episode 153: Summed_Reward = 12.0


  0%|          | 154/100000 [3:09:30<3202:20:46, 115.46s/it]

Episode 154: Summed_Reward = 99.0


  0%|          | 155/100000 [3:09:39<2321:40:14, 83.71s/it] 

Episode 155: Summed_Reward = 17.0


  0%|          | 156/100000 [3:09:48<1698:22:10, 61.24s/it]

Episode 156: Summed_Reward = 15.0


  0%|          | 157/100000 [3:09:57<1263:15:49, 45.55s/it]

Episode 157: Summed_Reward = 16.0


  0%|          | 158/100000 [3:10:25<1117:09:30, 40.28s/it]

Episode 158: Summed_Reward = 49.0


  0%|          | 159/100000 [3:11:02<1093:57:38, 39.45s/it]

Episode 159: Summed_Reward = 66.0
Episode 160: Summed_Reward = 33.0


  0%|          | 161/100000 [3:34:00<8584:47:47, 309.55s/it] 

Episode 161: Summed_Reward = 18.0


  0%|          | 162/100000 [3:34:45<6384:58:57, 230.23s/it]

Episode 162: Summed_Reward = 80.0


  0%|          | 163/100000 [3:34:59<4585:36:13, 165.35s/it]

Episode 163: Summed_Reward = 25.0


  0%|          | 164/100000 [3:35:30<3467:39:10, 125.04s/it]

Episode 164: Summed_Reward = 55.0


  0%|          | 165/100000 [3:35:56<2639:46:59, 95.19s/it] 

Episode 165: Summed_Reward = 45.0


  0%|          | 166/100000 [3:36:05<1924:17:51, 69.39s/it]

Episode 166: Summed_Reward = 16.0


  0%|          | 167/100000 [3:36:27<1530:27:00, 55.19s/it]

Episode 167: Summed_Reward = 39.0


  0%|          | 168/100000 [3:37:16<1479:59:17, 53.37s/it]

Episode 168: Summed_Reward = 86.0


  0%|          | 169/100000 [3:37:31<1160:24:30, 41.85s/it]

Episode 169: Summed_Reward = 26.0
Episode 170: Summed_Reward = 81.0


  0%|          | 171/100000 [4:03:21<9612:44:23, 346.65s/it] 

Episode 171: Summed_Reward = 12.0


  0%|          | 172/100000 [4:04:18<7199:03:33, 259.61s/it]

Episode 172: Summed_Reward = 91.0


  0%|          | 173/100000 [4:04:37<5200:48:50, 187.55s/it]

Episode 173: Summed_Reward = 33.0


  0%|          | 174/100000 [4:05:08<3897:13:17, 140.54s/it]

Episode 174: Summed_Reward = 55.0


  0%|          | 175/100000 [4:05:30<2909:39:20, 104.93s/it]

Episode 175: Summed_Reward = 39.0


  0%|          | 176/100000 [4:05:53<2227:56:40, 80.35s/it] 

Episode 176: Summed_Reward = 41.0


  0%|          | 177/100000 [4:06:19<1773:27:48, 63.96s/it]

Episode 177: Summed_Reward = 45.0


  0%|          | 178/100000 [4:06:41<1426:47:23, 51.46s/it]

Episode 178: Summed_Reward = 39.0


  0%|          | 179/100000 [4:06:47<1048:07:56, 37.80s/it]

Episode 179: Summed_Reward = 11.0
Episode 180: Summed_Reward = 57.0


  0%|          | 181/100000 [4:35:22<10546:07:53, 380.35s/it]

Episode 181: Summed_Reward = 32.0


  0%|          | 182/100000 [4:35:30<7445:16:16, 268.52s/it] 

Episode 182: Summed_Reward = 13.0


  0%|          | 183/100000 [4:36:08<5533:38:53, 199.58s/it]

Episode 183: Summed_Reward = 68.0


  0%|          | 184/100000 [4:36:27<4027:08:33, 145.24s/it]

Episode 184: Summed_Reward = 33.0


  0%|          | 185/100000 [4:37:05<3133:11:22, 113.00s/it]

Episode 185: Summed_Reward = 66.0


  0%|          | 186/100000 [4:37:13<2260:51:22, 81.54s/it] 

Episode 186: Summed_Reward = 14.0


  0%|          | 187/100000 [4:37:23<1667:44:05, 60.15s/it]

Episode 187: Summed_Reward = 18.0


  0%|          | 188/100000 [4:37:46<1354:01:18, 48.84s/it]

Episode 188: Summed_Reward = 39.0


  0%|          | 189/100000 [4:37:56<1038:14:11, 37.45s/it]

Episode 189: Summed_Reward = 19.0
Episode 190: Summed_Reward = 19.0


  0%|          | 191/100000 [5:07:38<10905:43:41, 393.36s/it]

Episode 191: Summed_Reward = 18.0


  0%|          | 192/100000 [5:08:01<7828:35:20, 282.37s/it] 

Episode 192: Summed_Reward = 40.0


  0%|          | 193/100000 [5:08:31<5724:44:49, 206.49s/it]

Episode 193: Summed_Reward = 52.0


  0%|          | 194/100000 [5:08:57<4225:04:07, 152.40s/it]

Episode 194: Summed_Reward = 47.0


  0%|          | 195/100000 [5:09:26<3203:24:42, 115.55s/it]

Episode 195: Summed_Reward = 53.0


  0%|          | 196/100000 [5:09:54<2472:43:43, 89.19s/it] 

Episode 196: Summed_Reward = 48.0


  0%|          | 197/100000 [5:10:31<2042:28:11, 73.67s/it]

Episode 197: Summed_Reward = 66.0


  0%|          | 198/100000 [5:11:09<1741:40:42, 62.82s/it]

Episode 198: Summed_Reward = 66.0


  0%|          | 199/100000 [5:11:53<1585:14:29, 57.18s/it]

Episode 199: Summed_Reward = 77.0
Episode 200: Summed_Reward = 38.0


  0%|          | 201/100000 [5:44:14<12091:39:44, 436.18s/it]

Episode 201: Summed_Reward = 11.0


  0%|          | 202/100000 [5:44:29<8589:21:25, 309.84s/it] 

Episode 202: Summed_Reward = 27.0


  0%|          | 203/100000 [5:45:30<6519:43:16, 235.19s/it]

Episode 203: Summed_Reward = 111.0


  0%|          | 204/100000 [5:45:42<4666:14:06, 168.33s/it]

Episode 204: Summed_Reward = 22.0


  0%|          | 205/100000 [5:46:01<3423:47:01, 123.51s/it]

Episode 205: Summed_Reward = 34.0


  0%|          | 206/100000 [5:46:16<2518:08:23, 90.84s/it] 

Episode 206: Summed_Reward = 26.0


  0%|          | 207/100000 [5:46:29<1873:03:57, 67.57s/it]

Episode 207: Summed_Reward = 24.0


  0%|          | 208/100000 [5:46:35<1362:56:57, 49.17s/it]

Episode 208: Summed_Reward = 11.0


  0%|          | 209/100000 [5:47:07<1216:20:27, 43.88s/it]

Episode 209: Summed_Reward = 57.0
Episode 210: Summed_Reward = 22.0


  0%|          | 211/100000 [6:22:32<13115:41:15, 473.16s/it]

Episode 211: Summed_Reward = 106.0


  0%|          | 212/100000 [6:22:45<9287:41:56, 335.07s/it] 

Episode 212: Summed_Reward = 23.0


  0%|          | 213/100000 [6:23:04<6661:03:09, 240.31s/it]

Episode 213: Summed_Reward = 33.0


  0%|          | 214/100000 [6:23:51<5051:48:50, 182.26s/it]

Episode 214: Summed_Reward = 84.0


  0%|          | 215/100000 [6:24:28<3844:54:00, 138.71s/it]

Episode 215: Summed_Reward = 65.0


  0%|          | 216/100000 [6:24:48<2854:15:04, 102.98s/it]

Episode 216: Summed_Reward = 34.0


  0%|          | 217/100000 [6:25:12<2201:09:20, 79.41s/it] 

Episode 217: Summed_Reward = 44.0


  0%|          | 218/100000 [6:26:29<2180:26:06, 78.67s/it]

Episode 218: Summed_Reward = 136.0


  0%|          | 219/100000 [6:27:11<1878:46:28, 67.78s/it]

Episode 219: Summed_Reward = 75.0
Episode 220: Summed_Reward = 32.0


  0%|          | 221/100000 [7:05:29<14320:00:44, 516.66s/it]

Episode 221: Summed_Reward = 18.0


  0%|          | 222/100000 [7:05:40<10111:50:47, 364.84s/it]

Episode 222: Summed_Reward = 19.0


  0%|          | 223/100000 [7:07:00<7745:31:10, 279.46s/it] 

Episode 223: Summed_Reward = 144.0


  0%|          | 224/100000 [7:07:25<5626:53:54, 203.02s/it]

Episode 224: Summed_Reward = 44.0


  0%|          | 225/100000 [7:08:33<4506:08:59, 162.59s/it]

Episode 225: Summed_Reward = 118.0


  0%|          | 226/100000 [7:09:06<3430:51:02, 123.79s/it]

Episode 226: Summed_Reward = 57.0


  0%|          | 227/100000 [7:10:41<3190:29:41, 115.12s/it]

Episode 227: Summed_Reward = 168.0


  0%|          | 228/100000 [7:11:07<2452:35:03, 88.49s/it] 

Episode 228: Summed_Reward = 45.0


  0%|          | 229/100000 [7:12:00<2155:57:15, 77.79s/it]

Episode 229: Summed_Reward = 82.0


  0%|          | 229/100000 [7:13:29<3147:45:38, 113.58s/it]


KeyboardInterrupt: 

# Deducing (testing)

Loading models

In [None]:
model_loader = []
for _ in range(ensemble_size):
    model = build_model(state_size,
                        hidden_size,
                        action_size,
                        time_size,
                        reward_size,
                        neural_type,
                        num_layers,
                        num_heads,
                        hidden_activation,
                        output_activation,
                        init,
                        opti,
                        loss,
                        drop_rate,
                        alpha,
                        mask_value)
    model.to(device)
    model_loader.append(model)

for i in range(len(model_loader)):
    model_loader[i].load_state_dict(torch.load(model_directory % i))

Creating desired reward ... again

In [None]:
desired_reward = np.atleast_2d(np.ones(reward_size))
desired_reward = torch.tensor(desired_reward, dtype=torch.float).to(device)

Putting all the previous works into play ... again

But this time the agent does not learn

In [None]:
total_summed_reward = 0

for testing_episode in range(episode_for_testing):

    if render_for_human == True:
        env = gym.make( game_name, render_mode="human")
    else:
        env = gym.make( game_name)
    env._max_episode_steps = max_steps_for_each_episode
    state                  = env.reset()
    if render_for_human == True:
        env.render()
    summed_reward = 0

    state = vectorizing_state(state)

    done = False
    while not done:

        state                 = torch.tensor(state, dtype=torch.float).to(device)
        pre_activated_actions = initialize_pre_activated_actions(init, noise_t, noise_r, (time_size, action_size))
        pre_activated_actions = torch.tensor(pre_activated_actions, dtype=torch.float).to(device)
        pre_activated_actions = update_pre_activated_actions(epoch_for_deducing,
                                                             model_loader,
                                                             state,
                                                             pre_activated_actions,
                                                             desired_reward,
                                                             beta)
        action_argmax    = int(torch.argmax(pre_activated_actions[0, 0]))

        state, reward, done,  info = env.step(action_argmax)
        if render_for_human == True:
            env.render()

        summed_reward += reward

        state = vectorizing_state(state)

        if done:
            break


    env.close()

    print("Summed reward:", summed_reward)
    print(f'Episode: {testing_episode + 1}')
    print('Everaged summed reward:')
    total_summed_reward += summed_reward
    print(total_summed_reward/(testing_episode + 1))

