<a href="https://colab.research.google.com/github/Brownwang0426/RGRL/blob/main/CartPole.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Cloning git

In [None]:
# Clone the RGRL repository from GitHub into the current working directory.
!git clone https://github.com/Brownwang0426/RGRL.git

# Installing requirements

In [None]:
!sudo apt-get install python3.10

In [None]:
!pip install pandas==2.0.3 numpy==1.25.2 scipy==1.11.4 swig==4.2.1 ufal.pybox2d==2.3.10.3 gym==0.25.2 pygame==2.5.2 tqdm torch==2.0.1

# Importing modules

In [1]:
import gym

import numpy as np
import math
from scipy.special import softmax

import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.utils.rnn as rnn_utils
from torch.utils.data import DataLoader, TensorDataset, Subset

import csv

import multiprocessing as mp
import os
import sys
import copy
import random
import gc
import time
from tqdm import tqdm


# Checking cuda

In [2]:
# Pick the compute device: the first CUDA GPU when one is visible, otherwise the CPU.
if not torch.cuda.is_available():
    device = torch.device("cpu")
    print('using cpu...')
else:
    # list every visible GPU before settling on index 0
    for i in range(torch.cuda.device_count()):
        print(f"Device {i}: {torch.cuda.get_device_name(i)}")
    device_index = 0
    device = torch.device(f"cuda:{device_index}")
    print('using cuda...')

# The notebook deliberately refuses to continue on CPU (a GPU is strongly recommended).
assert device != torch.device("cpu")

Device 0: NVIDIA T500
using cuda...


In [3]:
# Enable cuDNN and switch on its benchmark mode.
# NOTE(review): benchmark mode lets cuDNN time candidate algorithms and pick one;
# this may add run-to-run variation. Confirm that is acceptable for these experiments.
torch.backends.cudnn.enabled = True
torch.backends.cudnn.benchmark = True

# Control board

Crucial variables regarding how your agent will learn in the environment

- In some environments, it is crucial to increase `max_steps_for_each_episode` so that your agent can "live long enough" to obtain better rewards and gradually, heuristically learn a better strategy.


In [5]:
# =============================================================================
# Control board preset: MountainCar-v0
# The control-board cells that follow reassign every name defined here, so in a
# top-to-bottom run this preset only applies if it is the last one executed.
# =============================================================================

# --- environment (task specific) ---------------------------------------------
game_name = 'MountainCar-v0'
max_steps_for_each_episode = 200

# --- ensemble and network architecture (task specific) -----------------------
ensemble_size = 5                 # number of networks in the neural ensemble
state_size = 200
hidden_size = 100
action_size = 3
time_size = 50
reward_size = 100
neural_type = 'gru'               # one of: rnn, gru, lstm, att
num_layers = 2                    # depth of the recurrent / attention stack
num_heads = None                  # None for non-attention; otherwise must divide hidden_size
hidden_activation = 'tanh'        # one of: relu, leaky_relu, sigmoid, tanh
output_activation = 'sigmoid'     # one of: relu, leaky_relu, sigmoid, tanh
shift = 0.0                       # shift applied to the output
init = "random_normal"            # one of: random_normal, random_uniform, xavier_normal, xavier_uniform, glorot_normal, glorot_uniform
opti = 'sgd'                      # one of: adam, sgd, rmsprop
loss = 'mean_squared_error'       # one of: mean_squared_error, binary_crossentropy
drop_rate = 0.001                 # dropout rate
alpha = 0.1                       # learning rate used when updating the networks
iteration_for_learning = 20_000   # learning iterations for the network weights
mask_value = sys.maxsize          # value used for masking
batch_size = 1                    # batch size for learning
load_pre_model = False            # True to resume from saved networks

# --- deducing (action optimisation) ------------------------------------------
noise_t = 1                       # gaussian noise
noise_r = 0.1                     # smaller favours exploiting experience, larger favours exploring (longer training)
beta = 0.1                        # update rate for the input actions
iteration_for_deducing = 200      # update iterations for the input actions

# --- training schedule -------------------------------------------------------
episode_for_training = 100_000
batch_size_for_offline_learning = 1   # batch size for batch offline learning
PER_epsilon = 1e-6                    # prioritized experience replay epsilon
PER_exponent = 1                      # prioritized experience replay exponent
EWC_lambda = 1                        # elastic weight control lambda

# --- testing -----------------------------------------------------------------
episode_for_testing = 100
render_for_human = True

# --- output locations (the run settings are encoded into the file names) -----
suffix                      = f"game={game_name}_type={neural_type}_ensemble={ensemble_size:05d}_drop={drop_rate:.5f}_learn={iteration_for_learning:05d}_interval={batch_size_for_offline_learning:05d}_deduce={iteration_for_deducing:05d}_lambda={EWC_lambda:05d}"
directory                   = f'/content/result/{game_name}/'
model_directory             = f'/content/result/{game_name}/model_{suffix}'+'_%s.h5'   # '%s' is filled with the ensemble member index
performance_log_directory   = f'/content/result/{game_name}/performace_log_{suffix}.csv'

In [6]:
# =============================================================================
# Control board preset: LunarLander-v2
# The control-board cell that follows reassigns every name defined here, so in a
# top-to-bottom run this preset only applies if it is the last one executed.
# =============================================================================

# --- environment (task specific) ---------------------------------------------
game_name = "LunarLander-v2"
max_steps_for_each_episode = 200

# --- ensemble and network architecture (task specific) -----------------------
ensemble_size = 10                # number of networks in the neural ensemble
state_size = 800
hidden_size = 250
action_size = 4
time_size = 50
reward_size = 250
neural_type = 'gru'               # one of: rnn, gru, lstm, att
num_layers = 2                    # depth of the recurrent / attention stack
num_heads = None                  # None for non-attention; otherwise must divide hidden_size
hidden_activation = 'tanh'        # one of: relu, leaky_relu, sigmoid, tanh
output_activation = 'sigmoid'     # one of: relu, leaky_relu, sigmoid, tanh
shift = 0.0                       # shift applied to the output
init = "random_normal"            # one of: random_normal, random_uniform, xavier_normal, xavier_uniform, glorot_normal, glorot_uniform
opti = 'sgd'                      # one of: adam, sgd, rmsprop
loss = 'mean_squared_error'       # one of: mean_squared_error, binary_crossentropy
drop_rate = 0.001                 # dropout rate
alpha = 0.1                       # learning rate used when updating the networks
iteration_for_learning = 20_000   # learning iterations for the network weights
mask_value = sys.maxsize          # value used for masking
batch_size = 1                    # batch size for learning
load_pre_model = False            # True to resume from saved networks

# --- deducing (action optimisation) ------------------------------------------
noise_t = 1                       # gaussian noise
noise_r = 0.1                     # smaller favours exploiting experience, larger favours exploring (longer training)
beta = 0.1                        # update rate for the input actions
iteration_for_deducing = 200      # update iterations for the input actions

# --- training schedule -------------------------------------------------------
episode_for_training = 100_000
batch_size_for_offline_learning = 1   # batch size for batch offline learning
PER_epsilon = 1e-6                    # prioritized experience replay epsilon
PER_exponent = 1                      # prioritized experience replay exponent
EWC_lambda = 1                        # elastic weight control lambda

# --- testing -----------------------------------------------------------------
episode_for_testing = 100
render_for_human = True

# --- output locations (the run settings are encoded into the file names) -----
suffix                      = f"game={game_name}_type={neural_type}_ensemble={ensemble_size:05d}_drop={drop_rate:.5f}_learn={iteration_for_learning:05d}_interval={batch_size_for_offline_learning:05d}_deduce={iteration_for_deducing:05d}_lambda={EWC_lambda:05d}"
directory                   = f'/content/result/{game_name}/'
model_directory             = f'/content/result/{game_name}/model_{suffix}'+'_%s.h5'   # '%s' is filled with the ensemble member index
performance_log_directory   = f'/content/result/{game_name}/performace_log_{suffix}.csv'

In [7]:
# =============================================================================
# Control board preset: CartPole-v1
# This is the last control-board cell, so after a top-to-bottom run these are
# the values in effect for the rest of the notebook.
# =============================================================================

# --- environment (task specific) ---------------------------------------------
game_name = 'CartPole-v1'
max_steps_for_each_episode = 2000

# --- ensemble and network architecture (task specific) -----------------------
ensemble_size = 10                # number of networks in the neural ensemble
state_size = 500
hidden_size = 100
action_size = 2
time_size = 15
reward_size = 100
neural_type = 'gru'               # one of: rnn, gru, lstm, att
num_layers = 2                    # depth of the recurrent / attention stack
num_heads = None                  # None for non-attention; otherwise must divide hidden_size
hidden_activation = 'tanh'        # one of: relu, leaky_relu, sigmoid, tanh
output_activation = 'sigmoid'     # one of: relu, leaky_relu, sigmoid, tanh
shift = 0.0                       # shift applied to the output
init = "random_normal"            # one of: random_normal, random_uniform, xavier_normal, xavier_uniform, glorot_normal, glorot_uniform
opti = 'sgd'                      # one of: adam, sgd, rmsprop
loss = 'mean_squared_error'       # one of: mean_squared_error, binary_crossentropy
drop_rate = 0.001                 # dropout rate
alpha = 0.1                       # learning rate used when updating the networks
iteration_for_learning = 20_000   # learning iterations for the network weights
mask_value = sys.maxsize          # value used for masking
batch_size = 1                    # batch size for learning
load_pre_model = False            # True to resume from saved networks

# --- deducing (action optimisation) ------------------------------------------
noise_t = 1                       # gaussian noise
noise_r = 0.1                     # smaller favours exploiting experience, larger favours exploring (longer training)
beta = 0.1                        # update rate for the input actions
iteration_for_deducing = 200      # update iterations for the input actions

# --- training schedule -------------------------------------------------------
episode_for_training = 100_000
batch_size_for_offline_learning = 1   # batch size for batch offline learning
PER_epsilon = 1e-6                    # prioritized experience replay epsilon
PER_exponent = 1                      # prioritized experience replay exponent
EWC_lambda = 1                        # elastic weight control lambda

# --- testing -----------------------------------------------------------------
episode_for_testing = 100
render_for_human = True

# --- output locations (the run settings are encoded into the file names) -----
suffix                      = f"game={game_name}_type={neural_type}_ensemble={ensemble_size:05d}_drop={drop_rate:.5f}_learn={iteration_for_learning:05d}_interval={batch_size_for_offline_learning:05d}_deduce={iteration_for_deducing:05d}_lambda={EWC_lambda:05d}"
directory                   = f'/content/result/{game_name}/'
model_directory             = f'/content/result/{game_name}/model_{suffix}'+'_%s.h5'   # '%s' is filled with the ensemble member index
performance_log_directory   = f'/content/result/{game_name}/performace_log_{suffix}.csv'

# Importing local modules

In [8]:
# Make the cloned repository importable. The clone cell creates ./RGRL, but the
# imports below are written relative to the repository root, so they only resolve
# when that directory is on sys.path. When the notebook is already running from
# inside the repository no ./RGRL directory exists and nothing is changed.
# NOTE(review): assumes envs/, models/ and utils/ live at the repository root; confirm.
_repo_dir = os.path.abspath('RGRL')
if os.path.isdir(_repo_dir) and _repo_dir not in sys.path:
    sys.path.append(_repo_dir)

# Pick the state/action/reward encoders that belong to the selected environment.
if   game_name == 'CartPole-v1':
    from envs.env_cartpole    import vectorizing_state, vectorizing_action, vectorizing_reward
elif game_name == 'MountainCar-v0':
    from envs.env_mountaincar import vectorizing_state, vectorizing_action, vectorizing_reward
elif game_name == "LunarLander-v2":
    from envs.env_lunarlander import vectorizing_state, vectorizing_action, vectorizing_reward
else:
    # no encoder module is provided for any other game name
    raise RuntimeError('missing env functions')

In [9]:
# The attention variant has its own model builder and helpers; every other
# neural_type falls back to the recurrent implementation. Both modules provide
# the same set of helper names.
if neural_type != 'att':
    from models.model_rnn import build_model
    from utils.util_rnn import (
        update_pre_activated_actions,
        update_model,
        update_gradient_matrix,
        initialize_pre_activated_actions,
        sequentialize,
        obtain_tensor_from_list,
        obtain_TD_error,
        save_performance_to_csv,
    )
else:
    from models.model_att import build_model
    from utils.util_att import (
        update_pre_activated_actions,
        update_model,
        update_gradient_matrix,
        initialize_pre_activated_actions,
        sequentialize,
        obtain_tensor_from_list,
        obtain_TD_error,
        save_performance_to_csv,
    )

# Deducing > Learning


Creating or loading models

In [10]:

# Build the ensemble and, when requested, restore previously saved weights.

# exist_ok avoids the check-then-create race of a separate os.path.exists() test
os.makedirs(directory, exist_ok=True)

# Every ensemble member is built from the same control-board settings; restoring
# from disk only differs by the extra weight-loading step afterwards.
model_list = []
for _ in range(ensemble_size):
    model = build_model(state_size,
                        hidden_size,
                        action_size,
                        time_size,
                        reward_size,
                        neural_type,
                        num_layers,
                        num_heads,
                        hidden_activation,
                        output_activation,
                        shift,
                        init,
                        opti,
                        loss,
                        drop_rate,
                        alpha,
                        mask_value)
    model.to(device)
    model_list.append(model)

if load_pre_model:
    # model_directory holds a '%s' placeholder that takes the ensemble member index
    for i in range(len(model_list)):
        model_list[i].load_state_dict(torch.load( model_directory  % i ))

# One placeholder per model; each slot is overwritten with that model's gradient
# matrix during offline learning.
gradient_matrix_list = [''] * len(model_list)

Creating Streams

In [11]:
# One CUDA stream per ensemble member, indexed the same way as the model list.
stream_list = [torch.cuda.Stream() for _ in range(ensemble_size)]


Creating initial gradient matrices

In [12]:

# Snapshot of the ensemble as it stands before any offline learning.
prev_model_list = copy.deepcopy(model_list)

# Matching all-zero gradient matrices: one dict per model, keyed by parameter
# name, each entry shaped like the parameter it belongs to.
prev_gradient_matrix_list = [
    {name: torch.zeros_like(param) for name, param in net.named_parameters()}
    for net in model_list
]


Creating desired reward

In [13]:
# Target reward: a single row of ones of length reward_size, as a float tensor.
desired_reward = torch.ones((1, reward_size), dtype=torch.float)

Putting all the previous works into play

In [14]:

# Online training loop. For every episode: (1) play the environment with actions
# deduced from the current ensemble, (2) append the episode to the long-term replay
# buffer, and (3) every `batch_size_for_offline_learning` episodes retrain each model
# on samples drawn from that buffer, then save the models.
performance_log = []
performance_log.append([0, 0])  # initial [episode, summed_reward] row, before any episode is played

for training_episode in tqdm(range(episode_for_training)):

    # initializing short term experience replay buffer (holds the current episode only)
    short_term_state_list  = []
    short_term_action_list = []
    short_term_reward_list = []

    # initializing environment (a fresh environment is created for every episode)
    env           = gym.make(game_name)
    env._max_episode_steps = max_steps_for_each_episode  # overrides the step limit stored on the returned env object
    state         = env.reset()
    summed_reward = 0

    # observing state
    # presumably vectorizing_state returns a batch-like container whose first element is the encoded state; verify against envs/*
    state = vectorizing_state(state)
    short_term_state_list.append(state[0])

    done = False
    while not done:

        # initializing and updating actions: start from freshly initialized pre-activated
        # actions of shape (time_size, action_size) and let update_pre_activated_actions
        # adjust them using the ensemble, the current state and the desired reward
        state                 = torch.tensor(state, dtype=torch.float)
        pre_activated_actions = initialize_pre_activated_actions(init, noise_t, noise_r, (time_size, action_size))
        pre_activated_actions = torch.tensor(pre_activated_actions, dtype=torch.float)
        pre_activated_actions = update_pre_activated_actions(iteration_for_deducing,
                                                             model_list,
                                                             state,
                                                             pre_activated_actions,
                                                             desired_reward,
                                                             beta,
                                                             device)
        # only the entry at index [0, 0] is acted upon
        # (presumably first batch item, first time step — TODO confirm the returned layout)
        action_argmax    = int(torch.argmax(pre_activated_actions[0, 0]))
        action           = vectorizing_action(action_size, action_argmax)
        short_term_action_list.append(action)

        # executing action
        state, reward, done, info = env.step(action_argmax)

        # observing actual reward
        summed_reward += reward
        reward = vectorizing_reward(state, reward, summed_reward, done, reward_size)
        short_term_reward_list.append(reward)

        # observing state
        state = vectorizing_state(state)
        short_term_state_list.append(state[0])

        if done:
            # log the episode result and rewrite the performance CSV after every episode
            print(f'Episode {training_episode+1}: Summed_Reward = {summed_reward}')
            performance_log.append([training_episode+1, summed_reward])
            save_performance_to_csv(performance_log, performance_log_directory)
            break




    env.close()




    # sequentializing short term experience replay buffer
    # (sequentialize is given time_size; the exact windowing is defined in utils — not visible here)
    short_term_sequentialized_state_list, \
    short_term_sequentialized_actions_list, \
    short_term_sequentialized_reward_list, \
    short_term_sequentialized_next_state_list = sequentialize(short_term_state_list, short_term_action_list, short_term_reward_list, time_size )
    
    


    # saving short term experience replay buffer to long term experience replay buffer
    short_term_sequentialized_state_tensor,\
    short_term_sequentialized_actions_tensor,\
    short_term_sequentialized_reward_tensor,\
    short_term_sequentialized_next_state_tensor,\
    short_term_sequentialized_padding_mask = obtain_tensor_from_list(short_term_sequentialized_state_list,
                                                                     short_term_sequentialized_actions_list,
                                                                     short_term_sequentialized_reward_list,
                                                                     short_term_sequentialized_next_state_list,
                                                                     time_size,
                                                                     mask_value,
                                                                     num_heads,
                                                                     device) 
    if training_episode==0:
        # first episode: the long-term buffer starts as a copy of this episode's tensors
        long_term_sequentialized_state_tensor      = copy.deepcopy(short_term_sequentialized_state_tensor)
        long_term_sequentialized_actions_tensor    = copy.deepcopy(short_term_sequentialized_actions_tensor)
        long_term_sequentialized_reward_tensor     = copy.deepcopy(short_term_sequentialized_reward_tensor)
        long_term_sequentialized_next_state_tensor = copy.deepcopy(short_term_sequentialized_next_state_tensor)
        long_term_sequentialized_padding_mask      = copy.deepcopy(short_term_sequentialized_padding_mask)
    else:
        # later episodes are appended along dim 0; nothing is ever evicted, so the
        # buffer grows with every episode
        long_term_sequentialized_state_tensor      = torch.cat((long_term_sequentialized_state_tensor     , short_term_sequentialized_state_tensor     ), dim=0)
        long_term_sequentialized_actions_tensor    = torch.cat((long_term_sequentialized_actions_tensor   , short_term_sequentialized_actions_tensor   ), dim=0)
        long_term_sequentialized_reward_tensor     = torch.cat((long_term_sequentialized_reward_tensor    , short_term_sequentialized_reward_tensor    ), dim=0)
        long_term_sequentialized_next_state_tensor = torch.cat((long_term_sequentialized_next_state_tensor, short_term_sequentialized_next_state_tensor), dim=0)
        long_term_sequentialized_padding_mask      = torch.cat((long_term_sequentialized_padding_mask     , short_term_sequentialized_padding_mask     ), dim=0)
        



    # batch offline learning (runs once every batch_size_for_offline_learning episodes)
    if (training_episode+1) % batch_size_for_offline_learning == 0:




        # creating dataset and data loader
        # the loader yields the whole long-term buffer as one unshuffled batch
        dataset      = TensorDataset(long_term_sequentialized_state_tensor     ,
                                     long_term_sequentialized_actions_tensor   ,
                                     long_term_sequentialized_reward_tensor    ,
                                     long_term_sequentialized_next_state_tensor,
                                     long_term_sequentialized_padding_mask     )
        data_loader  = DataLoader(dataset, batch_size = len(dataset), shuffle=False)




        # training with Prioritized Experience Replay (PER) and Elastic Weight Control (EWC)
        for i, model in enumerate(model_list):
            # work for model i is issued under that model's own CUDA stream
            # NOTE(review): the .cpu() call below copies to host and likely waits on the
            # stream, so confirm how much overlap between models is actually achieved
            with torch.cuda.stream(stream_list[i]):




                # creating TD error probability:
                # p is proportional to (TD_error + PER_epsilon) ** PER_exponent
                TD_error     = obtain_TD_error(model, data_loader)
                TD_error     =(TD_error.cpu().numpy() + PER_epsilon) ** PER_exponent
                TD_error_p   = TD_error / np.sum(TD_error)
                # creating sub dataset and sub data loader from  TD error probability
                # iteration_for_learning indices are drawn with replacement according to p
                index_arry       = np.random.choice(range(len(dataset)), 
                                                    p=TD_error_p, 
                                                    size=iteration_for_learning, 
                                                    replace=True)
                # the indices are permuted here and the loader also shuffles (shuffle=True)
                index_arry       = np.random.permutation(index_arry)
                sub_dataset      = Subset(dataset, index_arry)
                sub_data_loader  = DataLoader(sub_dataset, batch_size = batch_size, shuffle=True)




                # training with PER and EWC: update_model receives the previous snapshot of
                # this model and its previous gradient matrix together with EWC_lambda
                model                     = update_model(model,
                                                         sub_data_loader,
                                                         prev_model_list[i],
                                                         prev_gradient_matrix_list[i],
                                                         EWC_lambda)
                model_list[i]             = model




                # obtaining EWC gradient from the full (unsampled) buffer for the updated model
                gradient_matrix           = update_gradient_matrix(model,
                                                                   data_loader)
                gradient_matrix_list[i]   = gradient_matrix
        # wait for the queued GPU work to finish before snapshotting the models
        torch.cuda.synchronize()
        # the snapshots become the reference "previous" state for the next offline-learning round
        prev_model_list           = copy.deepcopy(model_list)
        prev_gradient_matrix_list = copy.deepcopy(gradient_matrix_list)




        # saving: one state_dict file per ensemble member ('%s' in model_directory takes the index)
        for i in range(len(model_list)):
            torch.save(model_list[i].state_dict(), model_directory % i)


        # release Python garbage and ask PyTorch to free its cached GPU memory
        gc.collect()
        torch.cuda.empty_cache()

  deprecation(
  deprecation(
  result = _VF.gru(input, batch_sizes, hx, self._flat_weights, self.bias,
  if not isinstance(terminated, (bool, np.bool8)):


Episode 1: Summed_Reward = 26.0


  0%|          | 1/100000 [34:49<58039:15:44, 2089.43s/it]

Episode 2: Summed_Reward = 17.0


  0%|          | 2/100000 [1:08:21<56775:14:27, 2043.95s/it]

Episode 3: Summed_Reward = 14.0


  0%|          | 3/100000 [1:41:50<56324:48:26, 2027.75s/it]

Episode 4: Summed_Reward = 21.0


  0%|          | 4/100000 [2:34:21<68646:30:33, 2471.37s/it]

Episode 5: Summed_Reward = 50.0


  0%|          | 5/100000 [3:29:54<77273:53:06, 2782.00s/it]

Episode 6: Summed_Reward = 41.0


  0%|          | 6/100000 [3:59:54<68006:13:41, 2448.37s/it]

Episode 7: Summed_Reward = 166.0


  0%|          | 7/100000 [4:25:57<59958:27:06, 2158.66s/it]

Episode 8: Summed_Reward = 209.0


  0%|          | 8/100000 [5:04:23<61263:14:32, 2205.65s/it]

Episode 9: Summed_Reward = 204.0


  0%|          | 9/100000 [6:14:26<78601:54:23, 2829.92s/it]

Episode 10: Summed_Reward = 297.0


  0%|          | 10/100000 [7:07:33<81669:52:30, 2940.41s/it]

Episode 11: Summed_Reward = 298.0


  0%|          | 11/100000 [7:51:34<79118:59:41, 2848.60s/it]

Episode 12: Summed_Reward = 341.0


  0%|          | 12/100000 [8:33:17<76197:15:15, 2743.43s/it]

Episode 13: Summed_Reward = 360.0


# Deducing (testing)

Loading models

In [None]:
# Rebuild the ensemble with the control-board settings, then restore each
# member's saved weights.
build_args = (state_size,
              hidden_size,
              action_size,
              time_size,
              reward_size,
              neural_type,
              num_layers,
              num_heads,
              hidden_activation,
              output_activation,
              shift,
              init,
              opti,
              loss,
              drop_rate,
              alpha,
              mask_value)

model_list = []
for _ in range(ensemble_size):
    model = build_model(*build_args)
    model.to(device)
    model_list.append(model)

# model_directory holds a '%s' placeholder that takes the ensemble member index
for i, model in enumerate(model_list):
    model.load_state_dict(torch.load(model_directory % i))

Creating desired reward ... again

In [None]:
# Target reward: a single row of ones of length reward_size, as a float tensor.
desired_reward = torch.ones((1, reward_size), dtype=torch.float)

Putting all the previous works into play ... again

But this time the agent does not learn

In [None]:
# Evaluation loop: play `episode_for_testing` episodes with actions deduced from
# the loaded ensemble. No replay buffer is filled and no model is updated here.
total_summed_reward = 0

for testing_episode in range(episode_for_testing):

    # a rendering environment is only requested when human viewing is enabled
    if render_for_human:
        env = gym.make( game_name, render_mode="human")
    else:
        env = gym.make( game_name)

    # try/finally guarantees the environment (and any render window) is closed
    # even if an episode raises part-way through
    try:
        env._max_episode_steps = max_steps_for_each_episode  # overrides the step limit stored on the returned env object
        state                  = env.reset()
        if render_for_human:
            env.render()
        summed_reward = 0

        state = vectorizing_state(state)

        done = False
        while not done:

            # deduce an action plan from scratch for the current state
            state                 = torch.tensor(state, dtype=torch.float)
            pre_activated_actions = initialize_pre_activated_actions(init, noise_t, noise_r, (time_size, action_size))
            pre_activated_actions = torch.tensor(pre_activated_actions, dtype=torch.float)
            pre_activated_actions = update_pre_activated_actions(iteration_for_deducing,
                                                                 model_list,
                                                                 state,
                                                                 pre_activated_actions,
                                                                 desired_reward,
                                                                 beta,
                                                                 device)
            # only the entry at index [0, 0] is acted upon
            # (presumably first batch item, first time step — TODO confirm the returned layout)
            action_argmax    = int(torch.argmax(pre_activated_actions[0, 0]))

            state, reward, done,  info = env.step(action_argmax)
            if render_for_human:
                env.render()

            summed_reward += reward

            state = vectorizing_state(state)
            # the `while not done` condition ends the episode; no explicit break is needed
    finally:
        env.close()

    # per-episode result followed by the running average over the episodes played so far
    print("Summed reward:", summed_reward)
    print(f'Episode: {testing_episode + 1}')
    print('Averaged summed reward:')
    total_summed_reward += summed_reward
    print(total_summed_reward/(testing_episode + 1))

