<a href="https://colab.research.google.com/github/Brownwang0426/RGRL/blob/main/CartPole_nested_recurrent_attention_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installing requirements

In [None]:
!sudo apt-get install python3.10

In [None]:
!pip install pandas==2.0.3 numpy==1.25.2 scipy==1.11.4 swig==4.2.1 ufal.pybox2d==2.3.10.3 gym==0.25.2 pygame==2.5.2 tqdm torch==2.0.1

# Importing modules

In [1]:
import gym

import numpy as np
import math
from scipy.special import softmax

import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.utils.rnn as rnn_utils
from torch.utils.data import DataLoader, TensorDataset, Subset

import csv

import multiprocessing as mp
import os
import sys
import copy
import random
import gc
import time
from tqdm import tqdm


# Checking cuda

In [None]:
# Pick the compute device: the first CUDA GPU when one is visible, otherwise
# the CPU (which the assertion below then rejects).
if not torch.cuda.is_available():
    device = torch.device("cpu")
    print('using cpu...')
else:
    # List every visible GPU for the reader, then bind to index 0.
    for i in range(torch.cuda.device_count()):
        print(f"Device {i}: {torch.cuda.get_device_name(i)}")
    device_index = 0
    device = torch.device(f"cuda:{device_index}")
    print('using cuda...')
# A later cell creates torch.cuda.Stream objects, so a CUDA device is required.
assert device != torch.device("cpu")

In [3]:
# Allow PyTorch to use cuDNN kernels.
torch.backends.cudnn.enabled = True
# Let cuDNN benchmark candidate algorithms and cache the fastest one per
# input configuration.
torch.backends.cudnn.benchmark = True

# Class for building model

In [4]:
class custom_attn(nn.Module):
    """Bias-free multi-head scaled dot-product attention.

    Queries, keys and values are linearly projected, split into ``num_heads``
    heads of width ``d_model // num_heads``, attended independently and then
    merged back through an output projection.

    Args:
        d_model:   feature width of the inputs and outputs.
        num_heads: number of attention heads; must divide ``d_model``.
    """

    def __init__(self, d_model, num_heads = 8):
        super(custom_attn, self).__init__()

        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        self.bias      = False
        self.d_model   = d_model
        self.num_heads = num_heads
        self.d_k       = d_model // num_heads

        self.W_q  = nn.Linear(d_model, d_model, bias=self.bias)
        self.W_k  = nn.Linear(d_model, d_model, bias=self.bias)
        self.W_v  = nn.Linear(d_model, d_model, bias=self.bias)
        self.W_o  = nn.Linear(d_model, d_model, bias=self.bias)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        """Return softmax(Q K^T / sqrt(d_k) + mask) V for already-split heads.

        ``mask`` is added to the raw scores, so it must broadcast against
        (batch_size, num_heads, query_length, key_length); large negative
        entries suppress the corresponding key.
        """
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.d_k ** 0.5)

        # Identity check: comparing a tensor with `!=` is not a None test.
        # The add is out-of-place so any broadcastable mask shape is accepted.
        if mask is not None:
            attn_scores = attn_scores + mask

        attn_probs = torch.softmax(attn_scores, dim=-1)
        return torch.matmul(attn_probs, V)

    def split_heads(self, x):
        """(batch, seq, d_model) -> (batch, num_heads, seq, d_k)."""
        batch_size, seq_length, _ = x.size()
        return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)

    def combine_heads(self, x):
        """(batch, num_heads, seq, d_k) -> (batch, seq, d_model)."""
        batch_size, _, seq_length, _ = x.size()
        return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)

    def forward(self, Q, K, V, mask=None):
        """Apply multi-head attention.

        Args:
            Q, K, V: tensors of shape (batch_size, seq_length, d_model).
            mask:    optional additive mask broadcastable to
                     (batch_size, num_heads, query_length, key_length).

        Returns:
            Tensor of shape (batch_size, query_length, d_model).
        """
        Q = self.split_heads(self.W_q(Q))
        K = self.split_heads(self.W_k(K))
        V = self.split_heads(self.W_v(V))
        attn_output = self.scaled_dot_product_attention(Q, K, V, mask)
        output      = self.W_o(self.combine_heads(attn_output))
        return output




class build_model(nn.Module):
    """Recurrent attention model that predicts rewards and next states.

    Given a present state and a sequence of actions, the model processes one
    action per step. At each step the embedded state and embedded action form
    a two-token sequence that is passed through a stack of attention blocks;
    token 0 is decoded into the predicted reward and token 1 into the
    predicted next state, which is re-embedded and used as the state for the
    following step. From the second step onwards every attention block uses
    the previous step's output of the same block as query and key.

    The instance also owns its optimizer (``selected_optimizer``) and two loss
    objects: ``loss_function`` (default reduction) and ``loss_function_``
    (``reduction='none'``, for per-element errors).

    Args:
        state_size:        length of the state vector.
        action_size:       length of the action vector.
        reward_size:       length of the reward vector.
        hidden_size:       width of the token embeddings.
        time_size:         stored only; not used by this class.
        neural_type:       stored only; not used by this class.
        num_layers:        number of attention blocks.
        num_heads:         attention heads per block (must divide hidden_size).
        hidden_activation: one of 'relu', 'leaky_relu', 'sigmoid', 'tanh'.
        output_activation: same choices; applied to the predicted next state.
        init:              weight initialiser name (see ``initialize_weights``).
        opti:              'adam', 'sgd' or 'rmsprop'.
        loss:              'mean_squared_error' or 'binary_crossentropy'.
        drop_rate:         stored only; no dropout layer is created here.
        alpha:             learning rate of the optimizer.
    """

    def __init__(self,
                 state_size,
                 action_size,
                 reward_size,
                 hidden_size,
                 time_size,
                 neural_type,
                 num_layers,
                 num_heads,
                 hidden_activation,
                 output_activation,
                 init,
                 opti,
                 loss,
                 drop_rate,
                 alpha):

        super(build_model, self).__init__()

        self.state_size           = state_size
        self.action_size          = action_size
        self.reward_size          = reward_size
        self.hidden_size          = hidden_size
        self.time_size            = time_size
        self.neural_type          = neural_type
        self.num_layers           = num_layers
        self.num_heads            = num_heads
        self.hidden_activation    = hidden_activation
        self.output_activation    = output_activation
        self.init                 = init
        self.opti                 = opti
        self.loss                 = loss
        self.drop_rate            = drop_rate
        self.alpha                = alpha

        self.bias                 = False

        self.state_linear         = nn.Linear(self.state_size, self.hidden_size, bias=self.bias)
        self.action_linear        = nn.Linear(self.action_size , self.hidden_size, bias=self.bias)
        # Fixed (non-trainable) encoding for the two token positions; kept as a
        # Parameter so it stays in state_dict() and named_parameters().
        self.positional_encoding  = nn.Parameter(self.generate_positional_encoding(2, self.hidden_size ), requires_grad=False)
        # Each block: attention, LayerNorm, linear layer, LayerNorm.
        self.transformer_layers   = \
        nn.ModuleList([
            nn.ModuleList([
                custom_attn(self.hidden_size, self.num_heads),
                nn.LayerNorm(self.hidden_size),
                nn.Linear(self.hidden_size, self.hidden_size, bias=self.bias),
                nn.LayerNorm(self.hidden_size)
            ])
            for _ in range(self.num_layers)
        ])
        self.reward_linear        = nn.Linear(self.hidden_size, self.reward_size, bias=self.bias)
        self.state_linear_        = nn.Linear(self.hidden_size, self.state_size, bias=self.bias)

        # Activation functions (the string names are replaced by modules)
        self.hidden_activation = self.get_activation(self.hidden_activation)
        self.output_activation = self.get_activation(self.output_activation)

        # Initialize weights for fully connected layers
        self.initialize_weights(self.init  )

        # Optimizer
        optimizers = {
            'adam': optim.Adam,
            'sgd': optim.SGD,
            'rmsprop': optim.RMSprop
        }
        self.selected_optimizer = optimizers[self.opti.lower()](self.parameters(), lr=self.alpha)

        # Loss functions: one reduced to a scalar, one kept per element.
        loss_classes = {
            'mean_squared_error': torch.nn.MSELoss,
            'binary_crossentropy': torch.nn.BCELoss
        }
        loss_class          = loss_classes[self.loss.lower()]
        self.loss_function  = loss_class()
        self.loss_function_ = loss_class(reduction='none')


    def forward(self, s, a_list):
        """Roll the model forward over a sequence of actions.

        Args:
            s:      present state, shape (batch, state_size).
            a_list: actions, shape (batch, time, action_size).

        Returns:
            (r_list, s_list): predicted rewards of shape
            (batch, time, reward_size) and predicted next states of shape
            (batch, time, state_size).
        """
        mask = None

        r_list = list()
        s_list = list()

        s = self.hidden_activation(self.state_linear(s))

        # Per-block outputs of the previous time step (None at the first step).
        prev_h_list = None

        for i in range(a_list.size(1)):

            a_ = self.hidden_activation(self.action_linear(a_list[:, i]))

            # Build the two-token sequence [state, action] per sample:
            # (batch, hidden) x 2 -> (batch, 2, hidden). Stacking along dim=1
            # keeps every sample's tokens together; stacking along dim=0 and
            # reshaping only gives this layout when batch == 1.
            h  = torch.stack([s, a_], dim=1)
            h  = h + self.positional_encoding

            pres_h_list = list()
            for j, layer in enumerate(self.transformer_layers):
                attention_layer, attention_norm_layer, fully_connected_layer, fully_connected_norm_layer = layer
                if prev_h_list is None:
                    # First step: plain self-attention.
                    h_ = attention_layer(h, h, h, mask)
                else:
                    # Later steps: query/key come from the previous step's
                    # output of this block, values from the current tokens.
                    h_ = attention_layer(prev_h_list[j], prev_h_list[j], h, mask)
                h  = attention_norm_layer(h + h_)
                h_ = fully_connected_layer(h)
                h  = fully_connected_norm_layer(h + h_)
                pres_h_list.append(h)
            prev_h_list = pres_h_list

            # Token 0 decodes to the reward, token 1 to the next state.
            r  = self.custom_activation(self.reward_linear(h[:, 0]))
            s_ = self.output_activation(self.state_linear_(h[:, 1]))

            # The predicted state becomes the input state of the next step.
            s  = self.hidden_activation(self.state_linear(s_))

            r_list.append(r)
            s_list.append(s_)

        r_list = torch.stack(r_list, dim=1)
        s_list = torch.stack(s_list, dim=1)

        return r_list, s_list


    def generate_positional_encoding(self, max_len, model_dim):
        """Return a sinusoidal table of shape (1, max_len, model_dim).

        NOTE(review): ``i`` already advances in steps of two, so the exponent
        used here is 2*i/model_dim, whereas the usual sinusoidal encoding uses
        i/model_dim for dimension index i. Left unchanged so that existing
        checkpoints and results stay comparable -- confirm whether intended.
        """
        pe = torch.zeros(max_len,model_dim)
        for pos in range(max_len):
            for i in range(0,model_dim,2):
                pe[pos, i] = math.sin(pos / (10000 ** ((2 * i)/model_dim)))
                if i + 1 < model_dim:
                    pe[pos, i + 1] = math.cos(pos / (10000 ** ((2 * i)/model_dim)))
        return pe.unsqueeze(0)  # Shape: (1, max_len, model_dim)

    def custom_activation(self, x):
        """Sigmoid shifted by +1.5, so a zero input maps to about 0.82."""
        return torch.sigmoid(x + 1.5)

    def get_activation(self,  activation):
        """Map an activation name (case-insensitive) to a module; KeyError if unknown."""
        activations = {
            'relu': nn.ReLU(),
            'leaky_relu': nn.LeakyReLU(),
            'sigmoid': nn.Sigmoid(),
            'tanh': nn.Tanh()
        }
        return activations[ activation.lower()]

    def initialize_weights(self, initializer):
        """Apply the named initialiser to the directly-owned Linear layers.

        NOTE(review): only direct children are visited, so the Linear layers
        nested inside ``transformer_layers`` keep PyTorch's default
        initialisation -- confirm whether that is intended.
        """
        initializers = {
            'random_uniform': nn.init.uniform_,
            'random_normal': nn.init.normal_,
            'glorot_uniform': nn.init.xavier_uniform_,
            'glorot_normal': nn.init.xavier_normal_,
            'xavier_uniform': nn.init.xavier_uniform_,
            'xavier_normal': nn.init.xavier_normal_
        }
        initializer = initializers[initializer.lower()]
        for layer in self.children():
            if isinstance(layer, nn.Linear):
                initializer(layer.weight)

# Function for updating pre-activated action using error backprop

In [5]:

def update_pre_activated_future_action(iteration_for_deducing,
                                       model_loader,
                                       state,
                                       pre_activated_future_action,
                                       desired_future_reward,
                                       beta):
    """Refine a planned action sequence by gradient descent on the input.

    For ``iteration_for_deducing`` iterations a model is drawn at random from
    ``model_loader``, the sigmoid-activated actions are fed through it, and
    the pre-activation is moved against the gradient of the loss between the
    predicted and the desired reward. The chain rule through the sigmoid is
    applied explicitly (``grad * a * (1 - a)``).

    Only the gradient with respect to the actions is computed, so the models
    are neither copied nor modified: their parameters keep their
    ``requires_grad`` flags and their ``.grad`` fields are not written.

    Args:
        iteration_for_deducing: number of update iterations.
        model_loader: non-empty sequence of models; each is called as
            ``model(state, future_action)`` and must expose ``loss_function``.
        state: present state passed to the model unchanged.
        pre_activated_future_action: tensor of action logits; updated in place.
        desired_future_reward: target passed to the loss function.
        beta: step size of the update.

    Returns:
        The same ``pre_activated_future_action`` tensor after the updates.
    """
    for _ in range(iteration_for_deducing):

        model = random.choice(model_loader)

        # Fresh leaf tensor each iteration so the gradient is w.r.t. the
        # activated actions only.
        future_action = torch.sigmoid(pre_activated_future_action).clone().detach().requires_grad_(True)

        output_reward, _ = model(state, future_action)
        total_loss       = model.loss_function(output_reward, desired_future_reward)

        # Gradient w.r.t. the actions only; parameter grads stay untouched.
        (action_grad,) = torch.autograd.grad(total_loss, future_action)

        # Plain numerical update: no autograd history is attached to the
        # pre-activation tensor.
        with torch.no_grad():
            pre_activated_future_action -= action_grad * (1 - future_action) * future_action * beta # update params

    return pre_activated_future_action




# Function for updating model using error backprop

Elastic weight consolidation:
https://arxiv.org/pdf/1612.00796

In [6]:
# traditional EWC
def EWC_loss(EWC_lambda, model, prev_model, prev_gradient_matrix):
    """Elastic-weight-consolidation penalty between ``model`` and ``prev_model``.

    Computes ``EWC_lambda * sum_p F_p * (theta_p - theta_prev_p) ** 2`` where
    ``F_p`` is the element-wise square of ``prev_gradient_matrix[name]``.

    The current weights are taken from ``named_parameters()`` rather than
    ``state_dict()``: the latter returns detached tensors, which would make
    the penalty a constant with no gradient and therefore no effect on
    training. The previous weights and the stored gradients are treated as
    constants.

    Args:
        EWC_lambda: weight of the penalty.
        model: model being trained.
        prev_model: reference model with the same parameter names.
        prev_gradient_matrix: dict mapping parameter name to a tensor of the
            parameter's shape.

    Returns:
        The penalty as a scalar tensor attached to ``model``'s parameters
        (the number 0 scaled by ``EWC_lambda`` if the model has no parameters).
    """
    prev_model_param = prev_model.state_dict()
    loss = 0
    for name, param in model.named_parameters():
        diagonal_fisher_matrix = prev_gradient_matrix[name].detach() ** 2
        param_diff             = (param - prev_model_param[name].detach()) ** 2
        loss                  += (diagonal_fisher_matrix * param_diff).sum()
    return EWC_lambda * loss




def update_model(iteration_for_learning,
                 model,
                 sub_data_loader,
                 prev_model,
                 prev_gradient_matrix,
                 EWC_lambda):
    """Run one optimizer step per batch of ``sub_data_loader``.

    The objective for each batch is the reward-prediction loss plus the
    state-prediction loss plus the EWC penalty relative to ``prev_model``.
    ``iteration_for_learning`` is accepted for interface compatibility and is
    not read here.

    Returns:
        The same ``model`` object after training.
    """
    for batch in sub_data_loader:
        state, future_action, future_reward, future_state = batch

        model.train()
        optimizer = model.selected_optimizer
        optimizer.zero_grad()

        criterion = model.loss_function
        predicted_reward, predicted_state = model(state, future_action)
        reward_loss = criterion(predicted_reward, future_reward)
        state_loss  = criterion(predicted_state, future_state)
        total_loss  = reward_loss + state_loss
        total_loss += EWC_loss(EWC_lambda, model, prev_model, prev_gradient_matrix)

        # Back-propagate, then apply the parameter update.
        total_loss.backward()
        optimizer.step()

    return model




def update_gradient_matrix(model,
                           data_loader):
    """Average the loss gradient of every parameter over ``data_loader``.

    For each batch the reward-prediction loss plus the state-prediction loss
    is back-propagated and the resulting ``.grad`` tensors are accumulated;
    the sums are then divided by the number of batches. No optimizer step is
    taken, so the weights are unchanged. The gradients of the last batch are
    left in ``param.grad``.

    Parameters that receive no gradient (for example frozen ones) are skipped
    and keep an all-zero entry. An empty loader yields all-zero entries.

    Returns:
        dict mapping parameter name to the averaged gradient tensor.
    """
    gradient_matrix = {name: torch.zeros_like(param) for name, param in model.named_parameters()}

    num_batches = 0
    for state, future_action, future_reward, future_state in data_loader:

        model.train()
        selected_optimizer = model.selected_optimizer
        selected_optimizer.zero_grad()

        loss_function               = model.loss_function
        output_reward, output_state = model(state, future_action)
        total_loss                  = loss_function(output_reward, future_reward) + loss_function(output_state, future_state)
        total_loss.backward()        # get grad

        for name, param in model.named_parameters():
            # Skip anything without a gradient instead of matching one
            # hard-coded parameter name.
            if param.grad is not None:
                gradient_matrix[name] += param.grad

        num_batches += 1

    if num_batches == 0:
        return gradient_matrix

    return {name: accumulated / num_batches for name, accumulated in gradient_matrix.items()}




# Function for re-initializing action value in each step

In [7]:
def initialize_pre_activated_future_action(init, noise_t, noise_r, shape):
    """Draw random initial action logits.

    ``noise_t`` independent samples of the chosen distribution are drawn, each
    scaled by ``noise_r``, and summed.

    Args:
        init: distribution name, case-insensitive: 'random_uniform' (U[0, 1)),
            'random_normal' (N(0, 1)), 'glorot_uniform' / 'xavier_uniform',
            or 'glorot_normal' / 'xavier_normal'. The Glorot/Xavier variants
            use ``shape[1]`` as both fan-in and fan-out.
        noise_t: number of draws to sum.
        noise_r: scale applied to every draw.
        shape: shape of one draw, e.g. (time_size, action_size).

    Returns:
        numpy array of shape (1, *shape); all zeros when ``noise_t`` is 0.

    Raises:
        ValueError: if ``init`` is not a recognised name.
    """
    kind = init.lower()

    if kind == "random_uniform":
        def draw():
            return np.random.uniform(low=0, high=1, size=shape)
    elif kind == "random_normal":
        def draw():
            return np.random.normal(loc=0.0, scale=1, size=shape)
    elif kind in ("glorot_uniform", "xavier_uniform"):
        limit = np.sqrt(6 / (shape[1] + shape[1]))
        def draw():
            return np.random.uniform(low=-limit, high=limit, size=shape)
    elif kind in ("glorot_normal", "xavier_normal"):
        scale = np.sqrt(2 / (shape[1] + shape[1]))
        def draw():
            return np.random.normal(loc=0.0, scale=scale, size=shape)
    else:
        raise ValueError(f"unknown init method: {init!r}")

    # Leading axis of length 1 is the batch dimension.
    noise = np.zeros(shape)[np.newaxis]
    for _ in range(noise_t):
        noise = noise + np.array([draw()]) * noise_r
    return noise

# Function for vectorizing
Crucial function regarding how you manipulate or shape your state, action and reward

- It's essential to choose between immediate rewards and summed rewards for training your agent. If the current state doesn't encapsulate all crucial past information, using immediate rewards is advisable. This approach prevents confusion caused by varying summed rewards for the same state.

- As for reward shaping, it is recommended to raise the upper bound of your reward and lower its lower bound, i.e. to widen the reward range.

In [8]:

def quantifying(array_size, init, interval, input):
    """Thermometer-encode a scalar into a 0/1 vector.

    The first ``floor((input - init) / interval) + 1`` entries are set to 1
    (all entries if that count exceeds the vector length, none if it is
    zero or negative); the rest stay 0.
    """
    encoded = np.zeros(array_size)
    filled  = int((input - init) // interval + 1)
    # A non-positive count selects an empty slice, leaving the vector at zero.
    encoded[:max(filled, 0)] = 1
    return encoded

def vectorizing_state(state):      # Reminder: change this for your specific task
    """Encode a 4-component observation as a (1, 500) thermometer vector.

    Each of the four observation components gets its own 100-slot segment
    with a component-specific lower bound and step. A fifth 100-slot segment
    encodes the constant 0 and is therefore identical for every observation.
    """
    # (lower bound, step, value) for each 100-slot segment, in output order.
    segment_specs = (
        (-2.5,   0.050,  state[0]),
        (-3.75,  0.075,  state[1]),
        (-0.375, 0.0075, state[2]),
        (-3.75,  0.075,  state[3]),
        (0,      10,     0),
    )
    segments = [quantifying(100, lower, step, value) for lower, step, value in segment_specs]
    return np.atleast_2d(np.concatenate(segments))

def vectorizing_action(action_size, action_argmax):  # Reminder: change this for your specific task
    """Return the one-hot vector of length ``action_size`` for ``action_argmax``."""
    one_hot_rows = np.identity(action_size)
    return one_hot_rows[action_argmax]

def vectorizing_reward(state, reward, summed_reward, reward_size):       # Reminder: change this for your specific task
    """Encode the immediate reward as a vector of length ``reward_size``.

    A reward equal to 1 becomes all ones; anything else becomes all zeros.
    ``state`` and ``summed_reward`` are accepted but not used.
    """
    if reward == 1:
        return np.ones(reward_size)
    return np.zeros(reward_size)


# Function for sequentializing state, action and reward

In [9]:
def sequentialize(short_term_state_list, short_term_action_list, short_term_reward_list, time_size):
    """Cut one episode into sliding windows of ``time_size`` steps.

    ``short_term_state_list`` holds one more entry than the action and reward
    lists (the initial state plus the state after every step). Window ``i``
    consists of the state at step ``i``, the next ``time_size`` actions and
    rewards, and the ``time_size`` states that follow.

    ``time_size`` is clamped to the number of recorded steps. The number of
    windows is ``len(rewards) - time_size + 1``; it is computed explicitly
    because the slice ``[:-time_size + 1]`` is empty when ``time_size`` is 1.
    A ``time_size`` below 1 yields no windows.

    Returns:
        (present_states, future_actions, future_rewards, future_states), four
        lists of equal length.
    """
    short_term_present_state_list = []
    short_term_future_action_list = []
    short_term_future_reward_list = []
    short_term_future_state_list  = []

    recorded_steps = len(short_term_state_list[:-1])
    if time_size > recorded_steps:
        time_size = recorded_steps

    if time_size < 1:
        window_count = 0
    else:
        window_count = max(len(short_term_reward_list) - time_size + 1, 0)

    for i in range(window_count):
        short_term_present_state_list.append(      short_term_state_list [ i                       ]  )
        short_term_future_action_list.append(      short_term_action_list[ i   : i+time_size       ]  )
        short_term_future_reward_list.append(      short_term_reward_list[ i   : i+time_size       ]  )
        short_term_future_state_list.append(       short_term_state_list [ i+1 : i+time_size+1     ]  )

    return short_term_present_state_list, short_term_future_action_list, short_term_future_reward_list, short_term_future_state_list

# Function for data preparation

In [10]:
def obtain_tensor_from_list(short_term_state_list,
                            short_term_future_action_list,
                            short_term_future_reward_list,
                            short_term_future_state_list,
                            device):
    """Convert the four replay lists into float tensors on ``device``.

    Each list is first stacked into a numpy array, then turned into a
    ``torch.float`` tensor and moved to ``device``.

    Returns:
        (state, future_action, future_reward, future_state) tensors.
    """
    def _to_float_tensor(items):
        stacked = np.array(items)
        return torch.tensor(stacked, dtype=torch.float).to(device)

    state_tensor  = _to_float_tensor(short_term_state_list)
    action_tensor = _to_float_tensor(short_term_future_action_list)
    reward_tensor = _to_float_tensor(short_term_future_reward_list)
    next_tensor   = _to_float_tensor(short_term_future_state_list)

    return state_tensor, action_tensor, reward_tensor, next_tensor

In [11]:
def obtain_TD_error(model,
                    train_loader_,
                    device):
    """Per-sample reward-prediction error over every batch of the loader.

    For each batch the element-wise loss (``model.loss_function_``) between
    the predicted and the recorded rewards is summed in absolute value over
    dimensions 1 and 2, giving one number per sample. The results of all
    batches are concatenated in iteration order, so with an unshuffled loader
    entry ``k`` belongs to dataset item ``k``.

    The computation runs under ``torch.no_grad()``: no autograd graph is built
    and no gradients are written or cleared.

    Args:
        model: called as ``model(state, future_action)``; must expose
            ``loss_function_`` returning unreduced losses.
        train_loader_: iterable of (state, future_action, future_reward,
            future_state) batches.
        device: device the result is placed on.

    Returns:
        1-D tensor with one error per sample (empty if the loader is empty).
    """
    per_sample_errors = []

    model.train()
    loss_function = model.loss_function_

    with torch.no_grad():
        for state, future_action, future_reward, future_state in train_loader_:
            output_reward, _ = model(state, future_action)
            batch_loss       = loss_function(output_reward, future_reward).to(device)
            per_sample_errors.append(torch.sum(torch.abs(batch_loss), dim=(1, 2)))

    if not per_sample_errors:
        return torch.empty(0, device=device)

    return torch.cat(per_sample_errors, dim=0)

In [12]:
def save_performance_to_csv(performance_log, filename='performance_log.csv'):
    """Overwrite ``filename`` with a header row followed by ``performance_log``.

    Each entry of ``performance_log`` is written as one CSV row under the
    columns ``Episode`` and ``Summed_Reward``.
    """
    header = ['Episode', 'Summed_Reward']
    with open(filename, mode='w', newline='') as handle:
        csv.writer(handle).writerows([header, *performance_log])

# Control board

Crucial variables regarding how your agent will learn in the environment

- In some environments, it is crucial to increase "max_steps_for_each_episode" so that your agent can "live long enough" to obtain better rewards and gradually, heuristically learn a better strategy.



In [13]:
# ---- Environment ----
game_name = 'CartPole-v1'                # gym environment id (change this for your specific task)
max_steps_for_each_episode = 2000        # written to env._max_episode_steps for every training episode (change this for your specific task)


# ---- Model ----
ensemble_size = 10                # number of independently built models in the ensemble
state_size =  500                 # length of the encoded state; vectorizing_state emits 5 segments of 100 (change this for your specific task)
action_size = 2                   # number of discrete actions / length of the one-hot action (change this for your specific task)
reward_size = 100                 # length of the encoded reward vector (change this for your specific task)
hidden_size = 100                 # width of the token embeddings (change this for your specific task)
time_size = 20                    # number of future steps per planning / training window (change this for your specific task)
neural_type = 'att'               # recorded in the file suffix; build_model in this notebook stores it but does not branch on it
num_layers = 2                    # number of attention blocks
num_heads = 10                    # attention heads per block; must divide hidden_size
hidden_activation = 'tanh'        # activation of the state/action embeddings: [relu leaky_relu sigmoid tanh]
output_activation = 'sigmoid'     # activation of the predicted next state: [relu leaky_relu sigmoid tanh]
init = "random_normal"            # used both for weight initialization and for the initial action noise: [random_normal random_uniform xavier_normal xavier_uniform glorot_normal glorot_uniform]
opti = 'sgd'                      # optimizer: [adam sgd rmsprop]
loss = 'mean_squared_error'       # loss function: [mean_squared_error binary_crossentropy]
drop_rate = 0.0001                # recorded in the file suffix; build_model in this notebook stores it but creates no dropout layer
alpha = 0.1                       # learning rate for updating the networks
iteration_for_learning = 10000    # number of replay samples drawn per offline-learning round when sampling with replacement (change this for your specific task)
batch_size = 1                    # batch size of the sampled training loader; also the leading dimension of desired_future_reward
load_pre_model = False            # True: load saved weights into the freshly built models


# ---- Deducing (action optimization) ----
noise_t = 1                      # number of random draws summed to form the initial action logits
noise_r = 0.1                    # scale of each draw: smaller favours exploitation, larger favours exploration at the cost of longer training (change this for your specific task)
beta = 0.1                       # step size for updating the action logits
iteration_for_deducing =  100    # number of gradient updates of the action logits per environment step (change this for your specific task)


# ---- Training schedule ----
episode_for_training = 100000
batch_size_for_offline_learning = 10     # offline learning runs once every this many episodes (change this for your specific task)
PER_exponent = 1                         # exponent applied to the per-sample error before normalizing it into sampling probabilities (change this for your specific task)
PER_replace = True                       # sample replay indices with replacement or not (change this for your specific task)
EWC_lambda = 1                           # weight of the elastic weight consolidation penalty; the suffix below formats it with ':05d', so it must be an int (change this for your specific task)


# ---- Testing ----
episode_for_testing = 100
render_for_human = True


# ---- Output locations ----
# The suffix encodes the main hyper-parameters so that runs do not overwrite each other.
suffix                      = f"game={game_name}_type={neural_type}_ensemble={ensemble_size:05d}_drop={drop_rate:.5f}_learn={iteration_for_learning:05d}_interval={batch_size_for_offline_learning:05d}_deduce={iteration_for_deducing:05d}_lambda={EWC_lambda:05d}"
directory                   = f'/content/result/{game_name}/'
# '%s' is filled with the index of the model inside the ensemble.
model_directory             = f'/content/result/{game_name}/model_{suffix}'+'_%s.h5'
performance_log_directory   = f'/content/result/{game_name}/performace_log_{suffix}.csv'

# Deducing > Learning


Creating or loading models

In [14]:

# Make sure the output directory for checkpoints and logs exists.
if not os.path.exists(directory):
    os.makedirs(directory)

# Both modes start from freshly constructed networks; when resuming, the
# saved weights are loaded on top once the whole ensemble has been built.
if load_pre_model in (False, True):

    model_loader = []
    for _ in range(ensemble_size):
        model = build_model(state_size        = state_size,
                            action_size       = action_size,
                            reward_size       = reward_size,
                            hidden_size       = hidden_size,
                            time_size         = time_size,
                            neural_type       = neural_type,
                            num_layers        = num_layers,
                            num_heads         = num_heads,
                            hidden_activation = hidden_activation,
                            output_activation = output_activation,
                            init              = init,
                            opti              = opti,
                            loss              = loss,
                            drop_rate         = drop_rate,
                            alpha             = alpha)
        model.to(device)
        model_loader.append(model)

    if load_pre_model == True:
        for i in range(len(model_loader)):
            model_loader[i].load_state_dict(torch.load( model_directory  % i ))


# One placeholder per model; each is replaced by that model's gradient
# dictionary during offline learning.
gradient_matrix_loader = [''] * len(model_loader)

Creating Streams

In [15]:
# One CUDA stream per ensemble member; the training loop runs each model's
# update inside its own stream.
stream_list = []
for _ in range(ensemble_size):
    stream  = torch.cuda.Stream()
    stream_list.append(stream)


Creating initial gradient matrices

In [16]:

# Snapshot of the ensemble used as the EWC reference point.
prev_model_loader = copy.deepcopy(model_loader)

# Start with all-zero gradient dictionaries (one per model, keyed by parameter
# name), so the EWC penalty is zero until the first offline-learning round
# replaces them.
prev_gradient_matrix_loader = []
for model in model_loader:
    gradient_matrix = {name: torch.zeros_like(param) for name, param in model.named_parameters()}
    prev_gradient_matrix_loader.append( gradient_matrix )


Creating desired reward

In [17]:
# Target used when optimizing actions: the maximal reward (all ones) at every
# future step.
# NOTE(review): the leading dimension is `batch_size`, while the state passed
# during deducing has a leading dimension of 1 -- the two only match while
# batch_size == 1; confirm before changing batch_size.
desired_future_reward = torch.ones((batch_size, time_size, reward_size)).to(device)

Putting all the previous works into play

In [None]:

# Training loop: alternate between acting in the environment with actions
# obtained by optimizing the model input ("deducing") and periodically
# re-training the ensemble on the accumulated experience ("learning").
performance_log = []
performance_log.append([0, 0])

for training_episode in tqdm(range(episode_for_training)):

    # initializing short term experience replay buffer (this episode only)
    short_term_state_list  = []
    short_term_action_list = []
    short_term_reward_list = []

    # initializing environment (a new instance is created for every episode)
    env           = gym.make(game_name)
    env._max_episode_steps = max_steps_for_each_episode
    state         = env.reset()
    summed_step   = 0
    summed_reward = 0

    # observing state: vectorizing_state returns shape (1, state_size); the
    # buffer stores the single row
    state = vectorizing_state(state)
    short_term_state_list.append(state[0])

    done = False
    while not done:

        

        # initializing and updating action: draw fresh random logits for the
        # whole planning window, then refine them against the ensemble
        state                       = torch.tensor(state, dtype=torch.float).to(device)
        pre_activated_future_action = initialize_pre_activated_future_action(init, noise_t, noise_r, (time_size, action_size))
        pre_activated_future_action = torch.tensor(pre_activated_future_action, dtype=torch.float).to(device)
        pre_activated_future_action = update_pre_activated_future_action(iteration_for_deducing,
                                                                  model_loader,
                                                                  state,
                                                                  pre_activated_future_action,
                                                                  desired_future_reward,
                                                                  beta)
        # only the first planned action is executed; the rest of the plan is
        # discarded and re-planned at the next step
        action_argmax    = int(torch.argmax(pre_activated_future_action[0, 0]))
        action           = vectorizing_action(action_size, action_argmax)
        short_term_action_list.append(action)

        # executing action (4-tuple step API)
        state, reward, done, info = env.step(action_argmax)

        # observing actual reward
        summed_step += 1
        summed_reward += reward
        reward = vectorizing_reward(state, reward, summed_reward, reward_size)
        short_term_reward_list.append(reward)

        # observing state
        state = vectorizing_state(state)
        short_term_state_list.append(state[0])


        # Keep stepping until at least `time_size` transitions are recorded,
        # even if the environment already reported `done`.
        if summed_step < time_size:
            done = False
        else:
            if done:
                print(f'Episode {training_episode+1}: Summed_Reward = {summed_reward}')
                performance_log.append([training_episode+1, summed_reward])
                save_performance_to_csv(performance_log, performance_log_directory)
                # `done` is True here, so the while condition would end the
                # loop as well.
                break




    env.close()




    # sequentializing short term experience replay buffer into sliding windows
    short_term_state_list, \
    short_term_future_action_list, \
    short_term_future_reward_list, \
    short_term_future_state_list = sequentialize(short_term_state_list, short_term_action_list, short_term_reward_list, time_size )
    
    
    
    
    # saving short term experience replay buffer to long term experience replay buffer
    short_term_state_tensor,\
    short_term_future_action_tensor,\
    short_term_future_reward_tensor,\
    short_term_future_state_tensor = obtain_tensor_from_list(short_term_state_list,
                                                             short_term_future_action_list,
                                                             short_term_future_reward_list,
                                                             short_term_future_state_list,
                                                             device)
    # The long-term buffer lives on `device` and grows by concatenation after
    # every episode; nothing in this loop removes entries from it.
    if training_episode==0:
        long_term_state_tensor               = copy.deepcopy(short_term_state_tensor)
        long_term_future_action_tensor       = copy.deepcopy(short_term_future_action_tensor)
        long_term_future_reward_tensor       = copy.deepcopy(short_term_future_reward_tensor)
        long_term_future_state_tensor        = copy.deepcopy(short_term_future_state_tensor)
    else:
        long_term_state_tensor               = torch.cat((long_term_state_tensor              , short_term_state_tensor            ), dim=0)
        long_term_future_action_tensor       = torch.cat((long_term_future_action_tensor      , short_term_future_action_tensor    ), dim=0)
        long_term_future_reward_tensor       = torch.cat((long_term_future_reward_tensor      , short_term_future_reward_tensor    ), dim=0)
        long_term_future_state_tensor        = torch.cat((long_term_future_state_tensor       , short_term_future_state_tensor     ), dim=0)
        



    # batch offline learning: runs once every `batch_size_for_offline_learning` episodes
    if (training_episode+1) % batch_size_for_offline_learning == 0:




        # creating dataset
        dataset      = TensorDataset(long_term_state_tensor          ,
                                     long_term_future_action_tensor  ,
                                     long_term_future_reward_tensor  ,
                                     long_term_future_state_tensor   )
        # A single unshuffled batch containing the whole buffer, so per-sample
        # errors line up with dataset indices.
        data_loader  = DataLoader(dataset, batch_size = len(dataset), shuffle=False)




        # training with Prioritized Experience Replay (PER) and Elastic Weight Consolidation (EWC)
        for i, model in enumerate(model_loader):
            # each ensemble member is processed inside its own CUDA stream
            with torch.cuda.stream(stream_list[i]):




                # creating TD error probability: sampling probability is
                # proportional to (per-sample error) ** PER_exponent
                TD_error     = obtain_TD_error(model, data_loader, device)
                TD_error     = TD_error.cpu().numpy() ** PER_exponent
                TD_error_p   = TD_error / np.sum(TD_error)

                # creating sub dataset and sub data loader from  TD error probability
                if PER_replace == True:
                    index_list       = np.random.choice(range(len(dataset)), 
                                                        p = TD_error_p, 
                                                        size = iteration_for_learning, 
                                                        replace=PER_replace)
                else:
                    # without replacement at most len(dataset) indices can be drawn
                    index_list       = np.random.choice(range(len(dataset)), 
                                                        p = TD_error_p, 
                                                        size = min(iteration_for_learning, len(dataset)),
                                                        replace=PER_replace)
                sub_dataset      = Subset(dataset, index_list)
                sub_data_loader  = DataLoader(sub_dataset, batch_size = batch_size, shuffle=True)




                # training with PER and EWC (penalty is relative to the
                # snapshot taken after the previous learning round)
                model                     = update_model(iteration_for_learning,
                                                         model,
                                                         sub_data_loader,
                                                         prev_model_loader[i],
                                                         prev_gradient_matrix_loader[i],
                                                         EWC_lambda)
                model_loader[i]           = model




                # obtaining EWC gradient on the full buffer with the updated model
                gradient_matrix           = update_gradient_matrix(model,
                                                                   data_loader)
                gradient_matrix_loader[i] = gradient_matrix
        # wait for the work queued on all streams before snapshotting
        torch.cuda.synchronize()
        prev_model_loader           = copy.deepcopy(model_loader)
        prev_gradient_matrix_loader = copy.deepcopy(gradient_matrix_loader)




        # saving: one checkpoint file per ensemble member
        for i in range(len(model_loader)):
            torch.save(model_loader[i].state_dict(), model_directory % i)


        # release Python garbage and cached GPU memory after the learning round
        gc.collect()
        torch.cuda.empty_cache()

# Deducing (testing)

Loading models

In [None]:
# Rebuild an ensemble of `ensemble_size` models on `device`. The arguments are
# presumably the same hyper-parameters used when the checkpoints were trained
# (verify against the training section) so each state_dict fits its model.
model_loader = []
for _ in range(ensemble_size):
    model = build_model(state_size,
                        action_size,
                        reward_size,
                        hidden_size,
                        time_size,
                        neural_type,
                        num_layers,
                        num_heads,
                        hidden_activation,
                        output_activation,
                        init,
                        opti,
                        loss,
                        drop_rate,
                        alpha)
    model.to(device)
    model_loader.append(model)

# Restore one checkpoint per ensemble member; `model_directory` is a %-format
# pattern that takes the member index (the training cell saves with the same pattern).
# map_location=device remaps the stored tensors onto the current device, so a
# checkpoint written from another GPU index or machine still loads.
for i, model in enumerate(model_loader):
    model.load_state_dict(torch.load(model_directory % i, map_location=device))

Creating desired reward ... again

In [None]:
# Reward target handed to update_pre_activated_future_action during testing:
# an all-ones tensor of shape (batch_size, time_size, reward_size), allocated
# directly on `device`.
desired_future_reward = torch.ones(batch_size, time_size, reward_size, device=device)

Putting all the previous work into play ... again

But this time the agent does not learn

In [None]:
# Evaluation loop: plays `episode_for_testing` episodes with the loaded ensemble.
# Each step deduces an action by refining a pre-activated future action towards
# `desired_future_reward`; update_model is never called here, so the weights in
# `model_loader` are not trained by this cell.
total_summed_reward = 0

for testing_episode in range(episode_for_testing):

    # A fresh environment is created for every episode and closed at its end.
    if render_for_human == True:
        env = gym.make( game_name, render_mode="human")
    else:
        env = gym.make( game_name)

    # try/finally guarantees env.close() runs (releasing the env and any render
    # window) even when an episode raises part-way through.
    try:
        # Caps the episode length on the env returned by gym.make
        # (assumes the outermost wrapper honours `_max_episode_steps` - TODO confirm).
        env._max_episode_steps = max_steps_for_each_episode
        state                  = env.reset()
        if render_for_human == True:
            env.render()
        summed_reward = 0

        state = vectorizing_state(state)

        done = False
        while not done:

            # Start from a freshly initialized action plan for a single sample
            # (leading dimension 1) and let the ensemble refine it.
            state                       = torch.tensor(state, dtype=torch.float).to(device)
            pre_activated_future_action = initialize_pre_activated_future_action(init, noise_t, noise_r, (1, time_size, action_size))
            pre_activated_future_action = torch.tensor(pre_activated_future_action, dtype=torch.float).to(device)
            pre_activated_future_action = update_pre_activated_future_action(iteration_for_deducing,
                                                                      model_loader,
                                                                      state,
                                                                      pre_activated_future_action,
                                                                      desired_future_reward,
                                                                      beta)

            # Only the first planned time step is executed: take the arg-max over
            # the action dimension of sample 0, step 0.
            action_argmax    = int(torch.argmax(pre_activated_future_action[0, 0]))

            # 4-tuple step API (the notebook pins gym==0.25.2).
            state, reward, done,  info = env.step(action_argmax)
            if render_for_human == True:
                env.render()

            summed_reward += reward

            state = vectorizing_state(state)

    finally:
        env.close()

    # Per-episode report followed by the running mean over the episodes so far.
    print("Summed reward:", summed_reward)
    print(f'Episode: {testing_episode + 1}')
    print('Averaged summed reward:')
    total_summed_reward += summed_reward
    print(total_summed_reward/(testing_episode + 1))

