<a href="https://colab.research.google.com/github/Brownwang0426/Genrl/blob/main/CartPole.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installing requirements

In [None]:
!sudo apt-get install python3.10

In [None]:
!pip install pandas==2.0.3 numpy==1.25.2 scipy==1.11.4 swig==4.2.1 ufal.pybox2d==2.3.10.3 gym==0.25.2 pygame==2.5.2 tqdm torch==2.0.1

# Importing modules

In [148]:
import gym

import numpy as np
from scipy.special import softmax

import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.utils.rnn as rnn_utils
from torch.utils.data import DataLoader, TensorDataset

import csv

import os
import sys
import copy
import random
import gc
from tqdm import tqdm


# Checking cuda

In [149]:
if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        print(f"Device {i}: {torch.cuda.get_device_name(i)}")
    device_index = 0
    device = torch.device(f"cuda:{device_index}")
    print('using cuda...')
else:
    device = torch.device("cpu")
    print('using cpu...')
assert device != torch.device("cpu") # Sorry, but we really recommend you to run it on GPU :-) Nvidia needs your money :-)

Device 0: NVIDIA T500
using cuda...


In [150]:
torch.backends.cudnn.enabled = True
torch.backends.cudnn.benchmark = True

# Class for building model

In [151]:
class build_model(nn.Module):
    def __init__(self,
                 input_neuron_size_,
                 hidden_neuron_size,
                 input_neuron_size,
                 input_sequence_size,
                 output_neuron_size,
                 neural_type,
                 num_layers,
                 num_heads,
                 hidden_activation,
                 output_activation,
                 initializer,
                 optimizer,
                 loss,
                 alpha,
                 mask_value):

        super(build_model, self).__init__()

        self.input_neuron_size_   = int(input_neuron_size_)
        self.hidden_neuron_size   = int(hidden_neuron_size)
        self.input_neuron_size    = int(input_neuron_size)
        self.input_sequence_size  = int(input_sequence_size)
        self.output_neuron_size   = int(output_neuron_size)
        self.neural_type          = neural_type
        self.num_heads            = num_heads

        self.hidden_activation    = hidden_activation
        self.output_activation    = output_activation
        self.initializer          = initializer
        self.optimizer            = optimizer
        self.loss                 = loss
        self.alpha                = alpha
        self.mask_value           = mask_value

        self.bias = False

        self.num_layers = num_layers

        neural_types = {
            'rnn': nn.RNN,
            'gru': nn.GRU,
            'lstm': nn.LSTM
        }

        self.fully_connected_layer_in_0      = nn.Linear(self.input_neuron_size_, self.hidden_neuron_size, bias=self.bias)
        self.fully_connected_layer_in_1      = nn.Linear(self.hidden_neuron_size, self.hidden_neuron_size, bias=self.bias)

        self.fully_connected_layer_out_0     = nn.Linear(self.hidden_neuron_size, self.hidden_neuron_size, bias=self.bias)
        self.fully_connected_layer_out_1     = nn.Linear(self.hidden_neuron_size, self.input_neuron_size_, bias=self.bias)

        self.recurrent_layer_0               = neural_types[neural_type.lower()](self.input_neuron_size, self.hidden_neuron_size, num_layers=self.num_layers, batch_first=False, bias=self.bias)

        # self.attention_layer_0               = nn.MultiheadAttention(self.hidden_neuron_size, self.num_heads)
        self.fully_connected_layer_0         = nn.Linear(self.hidden_neuron_size * self.input_sequence_size, self.output_neuron_size, bias=self.bias)

        # Activation functions
        self.hidden_activation = self.get_activation(self.hidden_activation)
        self.output_activation = self.get_activation(self.output_activation)

        # Initialize weights for fully connected layers
        self.initialize_weights(self.initializer  )

        # Optimizer
        optimizers = {
            'adam': optim.Adam,
            'sgd': optim.SGD,
            'rmsprop': optim.RMSprop
        }
        self.selected_optimizer = optimizers[self.optimizer.lower()](self.parameters(), lr=self.alpha)

        # Loss function
        losses = {
            'mean_squared_error': torch.nn.MSELoss(),
            'binary_crossentropy': torch.nn.BCELoss()
        }
        self.loss_function = losses[self.loss .lower()]

        # Loss function
        losses = {
            'mean_squared_error': torch.nn.MSELoss(reduction='none'),
            'binary_crossentropy': torch.nn.BCELoss(reduction='none')
        }
        self.loss_function_ = losses[self.loss .lower()]


    def forward(self, initial_hidden, x, padding_mask):

        h  = self.fully_connected_layer_in_0(initial_hidden)
        h  = self.hidden_activation(h)
        h  = self.fully_connected_layer_in_1(h)
        h  = self.hidden_activation(h)
        h  = torch.unsqueeze(h, dim=0).repeat(self.num_layers, 1, 1)

        out        = x.permute(1, 0, 2)
        lengths    = (out != self.mask_value).any(dim=2).sum(dim=0).cpu().long() # since x is (sequence_length, batch_size, input_size), we should use sum(dim=0)
        out        = rnn_utils.pack_padded_sequence(out, lengths, batch_first=False, enforce_sorted=False)
        # Forward propagate RNN
        if self.neural_type == 'lstm':
            out, h   = self.recurrent_layer_0(out, (h, h))
            h        = h[0]
        else:
            out, h   = self.recurrent_layer_0(out, h)
            h        = h
        out, _     = rnn_utils.pad_packed_sequence(out, batch_first=False)
        padding    = (0, 0, 0, 0, 0, self.input_sequence_size - out.size(0))
        out        = F.pad(out, padding, "constant", 0)
        out        = out.permute(1, 0, 2)

        h  = self.fully_connected_layer_out_0(h)
        h  = self.hidden_activation(h)
        h  = self.fully_connected_layer_out_1(h)
        h  = self.output_activation(h)

        # if padding_mask is not None:
        #     padding_mask = torch.any(padding_mask, dim=-1) # to (batch_size, sequence_length)
        # out, _ = self.attention_layer_0(out, out, out, padding_mask)
        out    = torch.flatten(out, start_dim=1)
        out    = self.fully_connected_layer_0(out)
        out    = self.output_activation(out)

        return out, h




    def get_activation(self,  activation):
        activations = {
            'relu': nn.ReLU(),
            'leaky_relu': nn.LeakyReLU(),
            'sigmoid': nn.Sigmoid(),
            'tanh': nn.Tanh()
        }
        return activations[ activation.lower()]

    def initialize_weights(self, initializer):
        initializers = {
            'random_uniform': nn.init.uniform_,
            'random_normal': nn.init.normal_,
            'glorot_uniform': nn.init.xavier_uniform_,
            'glorot_normal': nn.init.xavier_normal_,
            'xavier_uniform': nn.init.xavier_uniform_,
            'xavier_normal': nn.init.xavier_normal_
        }
        initializer = initializers[initializer.lower()]
        for layer in self.children():
            if isinstance(layer, nn.Linear):
                initializer(layer.weight)

# Function for updating pre-activated actions using error backprop

In [152]:

def update_pre_activated_actions(epoch_for_deducing,
                                 model_loader,
                                 state,
                                 pre_activated_actions,
                                 desired_reward,
                                 beta):

    model_loader_copy = copy.deepcopy(model_loader)

    for epoch in range(epoch_for_deducing):

        random.shuffle(model_loader_copy)

        for model in model_loader_copy:

            actions = torch.sigmoid(pre_activated_actions)

            model.train()
            actions = actions.clone().detach().requires_grad_(True)
            if actions.grad is not None:
                actions.grad.zero_()
            for param in model.parameters():
                param.requires_grad = False

            loss_function = model.loss_function
            output, _     = model(state, actions, padding_mask=None)
            total_loss    = loss_function(output, desired_reward)
            total_loss.backward() # get grad

            pre_activated_actions -= actions.grad * (1 - actions) * actions * beta # update params

    return pre_activated_actions




# Function for updating model using error backprop

Elastic weight consolidation:
https://arxiv.org/pdf/1612.00796

In [153]:
# traditional EWC
def EWC_loss(EWC_lambda, model, prev_model, prev_gradient_matrix):
    model_param      = model.state_dict()
    prev_model_param = prev_model.state_dict()
    loss = 0
    for name, param in model.named_parameters():
        diagonal_fisher_matrix = prev_gradient_matrix[name] ** 2
        param_diff             = (model_param[name] - prev_model_param[name]) ** 2
        loss                  += (diagonal_fisher_matrix * param_diff).sum()
    return EWC_lambda * loss




def update_model(epoch_for_learning,
                 model,
                 train_loader,
                 prev_model,
                 prev_gradient_matrix,
                 EWC_lambda):

    for epoch in range(epoch_for_learning):

        for state, actions, reward, next_state, padding_mask in train_loader:

            next_state  = torch.unsqueeze(next_state, dim=0).repeat(model.num_layers, 1, 1)

            model.train()
            selected_optimizer = model.selected_optimizer
            selected_optimizer.zero_grad()

            loss_function        = model.loss_function
            output, output_state = model(state, actions, padding_mask)
            total_loss           = loss_function(output, reward) + loss_function(output_state, next_state)
            total_loss          += EWC_loss(EWC_lambda, model, prev_model, prev_gradient_matrix)
            total_loss.backward()     # get grad

            selected_optimizer.step() # update params




    # training and updating present_gradient_matrix
    gradient_matrix = {name: torch.zeros_like(param) for name, param in model.named_parameters()}

    for epoch in range(1):

        for state, actions, reward, next_state, padding_mask in train_loader:

            next_state  = torch.unsqueeze(next_state, dim=0).repeat(model.num_layers, 1, 1)

            model.train()
            selected_optimizer = model.selected_optimizer
            selected_optimizer.zero_grad()

            loss_function        = model.loss_function
            output, output_state = model(state, actions, padding_mask)
            total_loss           = loss_function(output, reward)  + loss_function(output_state, next_state)
            total_loss.backward()        # get grad

            for name, param in model.named_parameters():
                gradient_matrix[name] += param.grad

    gradient_matrix = {name: param / len(train_loader) for name, param in gradient_matrix.items()}




    return model, gradient_matrix

# Function for updating error for weighted experience replay 

In [154]:
def update_error(model,
                 train_loader):


    for state, actions, reward, next_state, padding_mask in train_loader:

        next_state  = torch.unsqueeze(next_state, dim=0).repeat(model.num_layers, 1, 1)

        model.train()
        selected_optimizer = model.selected_optimizer
        selected_optimizer.zero_grad()

        loss_function        = model.loss_function_
        output, output_state = model(state, actions, padding_mask)

        loss_1 = np.abs(loss_function(output, reward).detach().cpu().numpy())
        loss_1 = np.sum(loss_1, axis=1)

        loss_2 = np.abs(loss_function(output_state, next_state).detach().cpu().numpy())
        loss_2 = np.sum(loss_2, axis=2)
        loss_2 = np.sum(loss_2, axis=0)

        break

    return loss_1 + loss_2

# Function for re-initializing action value in each step

In [155]:
def initialize_pre_activated_actions(init, noise_t, noise_r, shape):
    input = 0
    if   init == "random_uniform":
        for _ in range(noise_t):
            input += np.array([  np.random.uniform(low=0, high=1, size=shape)    ]) * noise_r
    elif init == "random_normal":
        for _ in range(noise_t):
            input += np.array([  np.random.normal(loc=0.0, scale= 1, size= shape )    ])  * noise_r
    elif init == "glorot_uniform":
        for _ in range(noise_t):
            limit = np.sqrt(6 / (shape[1] + shape[1]))
            input += np.array([  np.random.uniform(low=-limit, high=limit, size=shape)    ])  * noise_r
    elif init == "glorot_normal":
        for _ in range(noise_t):
            input += np.array([  np.random.normal(loc=0.0, scale= np.sqrt(2 / (shape[1] + shape[1])) , size= shape )    ])  * noise_r
    elif init == "xavier_uniform":
        for _ in range(noise_t):
            limit = np.sqrt(6 / (shape[1] + shape[1]))
            input += np.array([  np.random.uniform(low=-limit, high=limit, size=shape)    ])  * noise_r
    elif init == "xavier_normal":
        for _ in range(noise_t):
            input += np.array([  np.random.normal(loc=0.0, scale= np.sqrt(2 / (shape[1] + shape[1])) , size= shape )    ])  * noise_r
    return input

# Function for sequentializing state, action and reward

In [156]:
def sequentialize(state_list, action_list, reward_list, chunk_size):

    sequentialized_state_list  = []
    sequentialized_action_list = []
    sequentialized_reward_list = []
    sequentialized_next_state_list  = []

    if chunk_size > len(state_list[:-1]):
        chunk_size = len(state_list[:-1])
    else:
      pass

    for j in range(chunk_size):
        chunk_size_ = j+1
        if chunk_size != 1:
            for i in range(len(reward_list[:-chunk_size_+1])):
                sequentialized_state_list.append(       state_list [ i ] )
                sequentialized_action_list.append(      action_list[ i : i+chunk_size_]  )
                sequentialized_reward_list.append(      reward_list[ i + len(action_list[i:i+chunk_size_]) - 1 ]  )
                sequentialized_next_state_list.append(  state_list [ i + len(action_list[i:i+chunk_size_])     ]  )
        else:
            for i in range(len(reward_list[:])):
                sequentialized_state_list.append(       state_list [ i ] )
                sequentialized_action_list.append(      action_list[ i : i+chunk_size_]  )
                sequentialized_reward_list.append(      reward_list[ i + len(action_list[i:i+chunk_size_]) - 1 ]  )
                sequentialized_next_state_list.append(  state_list [ i + len(action_list[i:i+chunk_size_])     ]  )


    return sequentialized_state_list, sequentialized_action_list, sequentialized_reward_list, sequentialized_next_state_list


In [157]:
def save_performance_to_csv(performance_log, filename='performance_log.csv'):
    with open(filename, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['Episode', 'Summed_Reward'])
        writer.writerows(performance_log)

# Function for vectorizing
Crucial function regarding how you manipulate or shape your state, action and reward

- It's essential to choose between immediate rewards and summed rewards for training your agent. If the current state doesn't encapsulate all crucial past information, using immediate rewards is advisable. This approach prevents confusion caused by varying summed rewards for the same state.

- As for reward shaping, it is recommended to increase your reward upper and decrease your reward lower bound.

In [158]:

def quantifying(array_size, init, interval, input):
    array = np.zeros(array_size)
    index = int( (input - init) // interval + 1)
    if index >= 0:
        array[ : index] = 1
    return array

def vectorizing_state(state):      # Reminder: change this for your specific task ⚠️⚠️⚠️
    state_0 = quantifying(100, -2.5, 0.050, state[0])
    state_1 = quantifying(100, -3.75, 0.075, state[1])
    state_2 = quantifying(100, -0.375, 0.0075, state[2])
    state_3 = quantifying(100, -3.75, 0.075, state[3])
    state_4 = quantifying(100, 0, 10, 0)
    state   = np.atleast_2d(np.concatenate((state_0, state_1, state_2, state_3, state_4)))
    return state

def vectorizing_action(action_size, action_argmax):  # Reminder: change this for your specific task ⚠️⚠️⚠️
    return np.eye(action_size)[action_argmax]

def vectorizing_reward(state, reward, summed_reward, done, reward_size):       # Reminder: change this for your specific task ⚠️⚠️⚠️
    if done:
        reward = np.zeros(reward_size)
    else:
        reward = np.ones(reward_size)
    return reward


# Control board

Crucial variables regarding how your agent will learn in the environment

- In some environments, it is crucial to increase your "max_steps_for_each_episode" so that your agent can "live long enough" to obatin some better rewards to gradually and heuristically learn better strategy.



In [159]:
game_name = 'CartPole-v1'                # Reminder: change this for your specific task ⚠️⚠️⚠️
max_steps_for_each_episode = 2000        # Reminder: change this for your specific task ⚠️⚠️⚠️


ensemble_size = 20                       # Reminder: change this value to see the impact of MWM-SGD ◀️◀️◀️
state_size =  500                        # Reminder: change this for your specific task ⚠️⚠️⚠️
hidden_size = 100                        # Reminder: change this for your specific task ⚠️⚠️⚠️ (should be dividable by num_heads below)
action_size = 2                          # Reminder: change this for your specific task ⚠️⚠️⚠️
time_size = 15                           # Reminder: change this for your specific task ⚠️⚠️⚠️
chunk_size = 15                          # Reminder: change this for your specific task ⚠️⚠️⚠️
reward_size = 100                        # Reminder: change this for your specific task ⚠️⚠️⚠️
neural_type = 'gru'                      # rnn gru lstm
num_layers = 2                           # Reminder: change this for your specific task ⚠️⚠️⚠️
num_heads = 10                           # should be able to divide hidden_size
hidden_activation = 'tanh'               # relu leaky_relu sigmoid tanh
output_activation = 'sigmoid'            # relu leaky_relu sigmoid tanh
init = "random_normal"                   # random_normal random_uniform xavier_normal xavier_uniform  glorot_normal  glorot_uniform
opti = 'sgd'                             # adam sgd rmsprop
loss = 'mean_squared_error'              # mean_squared_error  binary_crossentropy
alpha = 0.1
mask_value = sys.maxsize
epoch_for_learning = 10
batch_size = 1                  
load_pre_model = False


noise_t = 1
noise_r = 0.1
beta = 0.1
epoch_for_deducing =  int(100/ensemble_size)


episode_for_training             = 100000
interval_for_initiating_learning = 1                     
replay_t = 1000                                          # Reminder: change this for your specific task ⚠️⚠️⚠️                         
EWC_lambda = 1                                           # Reminder: change this value to see the impact of EWC ◀️◀️◀️


episode_for_testing = 100                # Reminder: change this for your specific task ⚠️⚠️⚠️
render_for_human = False                 # Reminder: change this for your specific task ⚠️⚠️⚠️


suffix                      = f"ensemble={ensemble_size:05d}_learn={epoch_for_learning:05d}_interval={interval_for_initiating_learning:05d}_deduce={epoch_for_deducing:05d}_lambda={EWC_lambda:05d}"
directory                   = f'/content/Genrl/{game_name}/'
model_directory             = f'/content/Genrl/{game_name}/model_{suffix}'+'_%s.h5'
performance_log_directory   = f'/content/Genrl/{game_name}/performace_log_{suffix}.csv'

# Deducing > Learning


Creating or loading models

In [160]:

if not os.path.exists(directory):
    os.makedirs(directory)

if load_pre_model == False:

    model_loader = []
    for _ in range(ensemble_size):
        model = build_model(state_size,
                            hidden_size,
                            action_size,
                            time_size,
                            reward_size,
                            neural_type,
                            num_layers,
                            num_heads,
                            hidden_activation,
                            output_activation,
                            init,
                            opti,
                            loss,
                            alpha,
                            mask_value)
        model.to(device)
        model_loader.append(model)

elif load_pre_model == True:

    model_loader = []
    for _ in range(ensemble_size):
        model = build_model(state_size,
                            hidden_size,
                            action_size,
                            time_size,
                            reward_size,
                            neural_type,
                            num_layers,
                            num_heads,
                            hidden_activation,
                            output_activation,
                            init,
                            opti,
                            loss,
                            alpha,
                            mask_value)
        model.to(device)
        model_loader.append(model)

    for i in range(len(model_loader)):
        model_loader[i].load_state_dict(torch.load( model_directory  % i ))


Creating Streams

In [161]:
stream_list = []
for _ in range(ensemble_size):
    stream  = torch.cuda.Stream()
    stream_list.append(stream)


Creating intial gradient matrices

In [162]:
"""
storing previous models
"""
prev_model_loader = copy.deepcopy(model_loader)

"""
calculating gradient matrix
"""
prev_gradient_matrix_loader = []
for model in model_loader:
    gradient_matrix = {name: torch.zeros_like(param) for name, param in model.named_parameters()}
    prev_gradient_matrix_loader.append( gradient_matrix )


Creating desired reward

In [163]:
desired_reward = np.atleast_2d(np.ones(reward_size))
desired_reward = torch.tensor(desired_reward, dtype=torch.float).to(device)

Putting all the previous works into play

In [164]:

performance_log = []
performance_log.append([0, 0])

# initializing long term experience replay buffer
sequentialized_state_list      = []
sequentialized_actions_list    = []
sequentialized_reward_list     = []
sequentialized_next_state_list = []

for training_episode in tqdm(range(episode_for_training)):

    # initializing short term experience replay buffer
    state_list  = []
    action_list = []
    reward_list = []

    # initializing environment
    env           = gym.make(game_name)
    env._max_episode_steps = max_steps_for_each_episode
    state         = env.reset()
    summed_reward = 0

    # observing state 
    state = vectorizing_state(state)
    state_list.append(state[0])

    done = False
    while not done:

        # initializing and updating actions
        state                 = torch.tensor(state, dtype=torch.float).to(device)
        pre_activated_actions = initialize_pre_activated_actions(init, noise_t, noise_r, (time_size, action_size))
        pre_activated_actions = torch.tensor(pre_activated_actions, dtype=torch.float).to(device)
        pre_activated_actions = update_pre_activated_actions(epoch_for_deducing,
                                                             model_loader,
                                                             state,
                                                             pre_activated_actions,
                                                             desired_reward,
                                                             beta)
        action_argmax    = int(torch.argmax(pre_activated_actions[0, 0]))
        action           = vectorizing_action(action_size, action_argmax)
        action_list.append(action)

        # executing action
        state, reward, done, info = env.step(action_argmax)

        # observing actual reward
        summed_reward += reward
        reward = vectorizing_reward(state, reward, summed_reward, done, reward_size)
        reward_list.append(reward)

        # observing state 
        state = vectorizing_state(state)
        state_list.append(state[0])

        if done:
            print(f'Episode {training_episode+1}: Summed_Reward = {summed_reward}')
            performance_log.append([training_episode+1, summed_reward])
            save_performance_to_csv(performance_log, performance_log_directory)
            break




    env.close()




    # sequentializing short term experience replay buffer
    sequentialized_state_list_slice, sequentialized_actions_list_slice, sequentialized_reward_list_slice, sequentialized_next_state_list_slice = sequentialize(state_list, action_list, reward_list, chunk_size )




    # storing sequentialized experience to long term experience replay buffer
    sequentialized_state_list       .extend(sequentialized_state_list_slice)
    sequentialized_actions_list     .extend(sequentialized_actions_list_slice)
    sequentialized_reward_list      .extend(sequentialized_reward_list_slice)
    sequentialized_next_state_list  .extend(sequentialized_next_state_list_slice)




    # batch offline learning
    if (training_episode+1) % interval_for_initiating_learning == 0:




        # weighted experience replay:
        state_tensor      = torch.stack( [torch.tensor(arr) for arr in sequentialized_state_list]               ).float().to(device)
        actions_tensor    = torch.stack( [F.pad(torch.tensor(arr),
                                                pad=(0, 0, 0, time_size - torch.tensor(arr).size(0)),
                                                mode='constant',
                                                value= mask_value) for arr in sequentialized_actions_list]      ).float().to(device)
        reward_tensor     = torch.stack( [torch.tensor(arr) for arr in sequentialized_reward_list]              ).float().to(device)
        next_state_tensor = torch.stack( [torch.tensor(arr) for arr in sequentialized_next_state_list]          ).float().to(device)

        row_mask     = torch.all(actions_tensor == mask_value, dim = -1)
        padding_mask = torch.zeros_like(actions_tensor, dtype = torch.bool)
        padding_mask[row_mask] = True
        padding_mask = padding_mask.to(device)

        dataset      = TensorDataset(state_tensor, actions_tensor, reward_tensor, next_state_tensor, padding_mask)
        data_loader  = DataLoader(dataset, batch_size = len(dataset), shuffle=False)

        temporal_difference_error_list = np.array([0.0] * len(dataset))
        for i, model in enumerate(model_loader):
            with torch.cuda.stream(stream_list[i]):
                temporal_difference_error = update_error(model, data_loader)
                temporal_difference_error_list += np.array(temporal_difference_error)
        torch.cuda.synchronize()
        temporal_difference_error_list = softmax(temporal_difference_error_list) + sys.float_info.epsilon

        num_indices = replay_t
        total_len   = len(sequentialized_state_list)
        if num_indices > total_len:
            num_indices = total_len
        random_index = np.random.choice(range(total_len), size=num_indices, p=temporal_difference_error_list, replace=False)
        sequentialized_state_list_weighted      = []
        sequentialized_actions_list_weighted    = []
        sequentialized_reward_list_weighted     = []
        sequentialized_next_state_list_weighted = []
        sequentialized_state_list_weighted     .extend([sequentialized_state_list [i]      for i in  random_index  ]  )
        sequentialized_actions_list_weighted   .extend([sequentialized_actions_list [i]    for i in  random_index  ]  )
        sequentialized_reward_list_weighted    .extend([sequentialized_reward_list [i]     for i in  random_index  ]  )
        sequentialized_next_state_list_weighted.extend([sequentialized_next_state_list [i] for i in  random_index  ]  )




        # masking and uploading data
        state_tensor      = torch.stack( [torch.tensor(arr) for arr in sequentialized_state_list_weighted]               ).float().to(device)
        actions_tensor    = torch.stack( [F.pad(torch.tensor(arr),
                                                pad=(0, 0, 0, time_size - torch.tensor(arr).size(0)),
                                                mode='constant',
                                                value= mask_value) for arr in sequentialized_actions_list_weighted]      ).float().to(device)
        reward_tensor     = torch.stack( [torch.tensor(arr) for arr in sequentialized_reward_list_weighted]              ).float().to(device)
        next_state_tensor = torch.stack( [torch.tensor(arr) for arr in sequentialized_next_state_list_weighted]          ).float().to(device)

        row_mask     = torch.all(actions_tensor == mask_value, dim = -1)
        padding_mask = torch.zeros_like(actions_tensor, dtype = torch.bool)
        padding_mask[row_mask] = True
        padding_mask = padding_mask.to(device)

        dataset      = TensorDataset(state_tensor, actions_tensor, reward_tensor, next_state_tensor, padding_mask)
        data_loader  = DataLoader(dataset, batch_size = batch_size, shuffle=True)

        # learning
        gradient_matrix_loader = [''] * len(model_loader)
        for i, model in enumerate(model_loader):
            with torch.cuda.stream(stream_list[i]):
                model, gradient_matrix    = update_model(epoch_for_learning, model, data_loader, prev_model_loader[i], prev_gradient_matrix_loader[i], EWC_lambda)
                model_loader[i]           = model
                gradient_matrix_loader[i] = gradient_matrix
        torch.cuda.synchronize()
        prev_model_loader           = copy.deepcopy(model_loader)
        prev_gradient_matrix_loader = copy.deepcopy(gradient_matrix_loader)




        # saving:
        for i in range(len(model_loader)):
            torch.save(model_loader[i].state_dict(), model_directory % i)


        gc.collect()
        torch.cuda.empty_cache()



  0%|          | 0/100000 [00:00<?, ?it/s]

Episode 1: Summed_Reward = 69.0


  0%|          | 1/100000 [25:57<43260:04:03, 1557.38s/it]

Episode 2: Summed_Reward = 11.0


  0%|          | 2/100000 [54:19<45619:22:13, 1642.33s/it]

Episode 3: Summed_Reward = 46.0


  0%|          | 3/100000 [1:22:11<45996:48:18, 1655.93s/it]

Episode 4: Summed_Reward = 22.0


  0%|          | 4/100000 [1:46:50<44057:12:55, 1586.12s/it]

Episode 5: Summed_Reward = 82.0


  0%|          | 5/100000 [2:12:18<43474:59:54, 1565.18s/it]

Episode 6: Summed_Reward = 13.0


  0%|          | 6/100000 [2:35:51<42041:22:04, 1513.58s/it]

Episode 7: Summed_Reward = 23.0


  0%|          | 7/100000 [2:59:22<41109:44:38, 1480.05s/it]

Episode 8: Summed_Reward = 20.0


  0%|          | 8/100000 [3:23:31<40832:33:43, 1470.09s/it]

Episode 9: Summed_Reward = 71.0


  0%|          | 9/100000 [3:48:18<40977:15:56, 1475.31s/it]

Episode 10: Summed_Reward = 90.0


  0%|          | 10/100000 [4:13:21<41212:14:59, 1483.79s/it]

Episode 11: Summed_Reward = 45.0


  0%|          | 11/100000 [4:37:43<41029:54:36, 1477.24s/it]

Episode 12: Summed_Reward = 57.0


  0%|          | 12/100000 [5:01:42<40706:38:40, 1465.62s/it]

Episode 13: Summed_Reward = 131.0


  0%|          | 13/100000 [5:26:35<40935:30:30, 1473.87s/it]

Episode 14: Summed_Reward = 80.0


  0%|          | 14/100000 [5:51:45<41241:05:29, 1484.89s/it]

Episode 15: Summed_Reward = 100.0


  0%|          | 15/100000 [6:16:50<41407:51:02, 1490.91s/it]

Episode 16: Summed_Reward = 56.0


  0%|          | 15/100000 [6:39:01<44328:36:27, 1596.07s/it]


KeyboardInterrupt: 

# Deducing (testing)

Loading models

In [None]:
model_loader = []
for _ in range(ensemble_size):
    model = build_model(state_size,
                        hidden_size,
                        action_size,
                        time_size,
                        reward_size,
                        neural_type,
                        num_layers,
                        num_heads,
                        hidden_activation,
                        output_activation,
                        init,
                        opti,
                        loss,
                        alpha,
                        mask_value)
    model.to(device)
    model_loader.append(model)

for i in range(len(model_loader)):
    model_loader[i].load_state_dict(torch.load(model_directory % i))

Creating desired reward ... again

In [None]:
desired_reward = np.atleast_2d(np.ones(reward_size))
desired_reward = torch.tensor(desired_reward, dtype=torch.float).to(device)

Putting all the previous works into play ... again

But this time the agent does not learn

In [None]:
total_summed_reward = 0

for testing_episode in range(episode_for_testing):

    if render_for_human == True:
        env = gym.make( game_name, render_mode="human")
    else:
        env = gym.make( game_name)
    env._max_episode_steps = max_steps_for_each_episode
    state                  = env.reset()
    if render_for_human == True:
        env.render()
    summed_reward = 0

    state = vectorizing_state(state)

    done = False
    while not done:

        state                 = torch.tensor(state, dtype=torch.float).to(device)
        pre_activated_actions = initialize_pre_activated_actions(init, noise_t, noise_r, (time_size, action_size))
        pre_activated_actions = torch.tensor(pre_activated_actions, dtype=torch.float).to(device)
        pre_activated_actions = update_pre_activated_actions(epoch_for_deducing,
                                                             model_loader,
                                                             state,
                                                             pre_activated_actions,
                                                             desired_reward,
                                                             beta)
        action_argmax    = int(torch.argmax(pre_activated_actions[0, 0]))

        state, reward, done,  info = env.step(action_argmax)
        if render_for_human == True:
            env.render()

        summed_reward += reward

        state = vectorizing_state(state)

        if done:
            break


    env.close()

    print("Summed reward:", summed_reward)
    print(f'Episode: {testing_episode + 1}')
    print('Everaged summed reward:')
    total_summed_reward += summed_reward
    print(total_summed_reward/(testing_episode + 1))

