<a href="https://colab.research.google.com/github/Brownwang0426/Genrl/blob/main/CartPole.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installing requirements

In [None]:
!sudo apt-get install python3.10

In [None]:
!pip install pandas==2.0.3 numpy==1.25.2 scipy==1.11.4 swig==4.2.1 ufal.pybox2d==2.3.10.3 gym==0.25.2 pygame==2.5.2 tqdm torch==2.0.1

# Importing modules

In [1]:
import numpy as np
import pandas as pd
from scipy.special import expit
import gym
import copy
import os
import sys
from tqdm import tqdm
import random
import math

import pickle

import multiprocessing
import time
import csv
from collections import Counter

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.nn.utils.rnn as rnn_utils
from torch.utils.data import DataLoader, TensorDataset
import gc

  and should_run_async(code)


# Checking cuda

In [3]:
# Pick the compute device: the first CUDA GPU when one is visible, otherwise the CPU.
cuda_ready = torch.cuda.is_available()
if not cuda_ready:
    device = torch.device("cpu")
    print('using cpu...')
else:
    # List every visible GPU before settling on index 0.
    for i in range(torch.cuda.device_count()):
        print(f"Device {i}: {torch.cuda.get_device_name(i)}")
    device_index = 0
    device = torch.device(f"cuda:{device_index}")
    print('using cuda...')

# The notebook refuses to continue on CPU; later cells create CUDA streams.
assert device != torch.device("cpu")

Device 0: Tesla T4
using cuda...


  and should_run_async(code)


In [4]:
# Switch cuDNN on and turn on its benchmark mode.
# NOTE(review): benchmark mode trades run-to-run determinism for speed according
# to the PyTorch docs; whether it helps the recurrent layers used here is unverified.
torch.backends.cudnn.enabled = True
torch.backends.cudnn.benchmark = True

# Class for building model

In [5]:
class MultiheadAttention(nn.Module):
    """Bias-free multi-head scaled dot-product attention.

    `d_model` must be divisible by `num_heads`; each head works on
    `d_model // num_heads` features.
    """

    def __init__(self, d_model, num_heads):
        super(MultiheadAttention, self).__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        self.bias      = False
        self.d_model   = d_model
        self.num_heads = num_heads
        self.d_k       = d_model // num_heads

        # Query / key / value / output projections, all without bias.
        self.W_q = nn.Linear(d_model, d_model, bias=self.bias)
        self.W_k = nn.Linear(d_model, d_model, bias=self.bias)
        self.W_v = nn.Linear(d_model, d_model, bias=self.bias)
        self.W_o = nn.Linear(d_model, d_model, bias=self.bias)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        """Attend over per-head tensors of shape (batch, heads, seq, d_k).

        `mask`, when given, is a boolean (batch, seq) tensor whose True entries
        mark positions to ignore: they are excluded both as keys and as
        queries, and the attention rows of masked queries are zeroed.
        """
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)

        if mask is None:
            return torch.matmul(torch.softmax(scores, dim=-1), V)

        head_count, query_len = scores.size(1), scores.size(2)
        # (batch, seq) -> (batch, heads, seq) -> (batch, heads, seq, seq), constant along the last axis.
        row_mask = mask.unsqueeze(1).repeat(1, head_count, 1)
        row_mask = row_mask.unsqueeze(3).repeat(1, 1, 1, query_len)

        scores = scores.masked_fill(row_mask == True, -1e9)
        scores = scores.masked_fill(row_mask.transpose(-2, -1) == True, -1e9)

        weights = torch.softmax(scores, dim=-1)
        # A fully masked row softmaxes to a uniform distribution; zero it out.
        weights = weights * (~row_mask).type_as(weights)

        return torch.matmul(weights, V)

    def split_heads(self, x):
        """(batch, seq, d_model) -> (batch, num_heads, seq, d_k)."""
        batch_size, seq_length, _ = x.size()
        per_head = x.view(batch_size, seq_length, self.num_heads, self.d_k)
        return per_head.transpose(1, 2)

    def combine_heads(self, x):
        """(batch, num_heads, seq, d_k) -> (batch, seq, d_model)."""
        batch_size, _, seq_length, _ = x.size()
        merged = x.transpose(1, 2).contiguous()
        return merged.view(batch_size, seq_length, self.d_model)

    def forward(self, Q, K, V, mask=None):
        """Project the inputs, attend per head, then merge and project the result."""
        queries = self.split_heads(self.W_q(Q))
        keys    = self.split_heads(self.W_k(K))
        values  = self.split_heads(self.W_v(V))
        attended = self.scaled_dot_product_attention(queries, keys, values, mask)
        return self.W_o(self.combine_heads(attended))




class build_model(nn.Module):
    """Recurrent model mapping (initial state, action sequence) to (reward, final state).

    The initial state vector is passed through two linear layers to produce the
    recurrent layer's starting hidden state (shared by every layer). The
    per-timestep outputs are padded to `input_sequence_size`, flattened and
    mapped to the reward vector; the final hidden state is mapped back to the
    state width.

    Args:
        input_neuron_size_: Width of `initial_hidden` and of the returned state.
        hidden_neuron_size: Width of the hidden layers and recurrent state.
        input_neuron_size: Feature width of each timestep of `x`.
        input_sequence_size: Sequence length the recurrent output is padded to.
        output_neuron_size: Width of the returned reward vector.
        neural_type: 'rnn', 'gru' or 'lstm' (case-insensitive).
        num_layers: Number of stacked recurrent layers.
        num_heads: Stored only; not used by this class.
        hidden_activation: 'relu', 'leaky_relu', 'sigmoid' or 'tanh'.
        output_activation: Same choices as `hidden_activation`.
        initializer: Weight initialiser name, see `initialize_weights`.
        optimizer: 'adam', 'sgd' or 'rmsprop'.
        loss: 'mean_squared_error' or 'binary_crossentropy'.
        alpha: Learning rate handed to the optimizer.
        mask_value: Value marking padded timesteps in `x`.
    """

    def __init__(self,
                 input_neuron_size_,
                 hidden_neuron_size,
                 input_neuron_size,
                 input_sequence_size,
                 output_neuron_size,
                 neural_type,
                 num_layers,
                 num_heads,
                 hidden_activation,
                 output_activation,
                 initializer,
                 optimizer,
                 loss,
                 alpha,
                 mask_value):

        super(build_model, self).__init__()

        self.input_neuron_size_   = int(input_neuron_size_)
        self.hidden_neuron_size   = int(hidden_neuron_size)
        self.input_neuron_size    = int(input_neuron_size)
        self.input_sequence_size  = int(input_sequence_size)
        self.output_neuron_size   = int(output_neuron_size)
        self.neural_type          = neural_type
        self.num_heads            = num_heads

        self.hidden_activation    = hidden_activation
        self.output_activation    = output_activation
        self.initializer          = initializer
        self.optimizer            = optimizer
        self.loss                 = loss
        self.alpha                = alpha
        self.mask_value           = mask_value

        self.bias = False

        self.num_layers = num_layers

        neural_types = {
            'rnn': nn.RNN,
            'gru': nn.GRU,
            'lstm': nn.LSTM
        }

        # State encoder (in_*) and state decoder (out_*).
        self.fully_connected_layer_in_0      = nn.Linear(self.input_neuron_size_, self.hidden_neuron_size, bias=self.bias)
        self.fully_connected_layer_in_1      = nn.Linear(self.hidden_neuron_size, self.hidden_neuron_size, bias=self.bias)
        self.fully_connected_layer_out_0     = nn.Linear(self.hidden_neuron_size, self.hidden_neuron_size, bias=self.bias)
        self.fully_connected_layer_out_1     = nn.Linear(self.hidden_neuron_size, self.input_neuron_size_, bias=self.bias)

        self.recurrent_layer_0               = neural_types[neural_type.lower()](self.input_neuron_size, self.hidden_neuron_size, num_layers=self.num_layers, batch_first=False, bias=self.bias)

        # Reward head over the flattened, padded recurrent outputs.
        self.fully_connected_layer_0         = nn.Linear(self.hidden_neuron_size * self.input_sequence_size, self.hidden_neuron_size, bias=self.bias)
        self.fully_connected_layer_1         = nn.Linear(self.hidden_neuron_size, self.output_neuron_size, bias=self.bias)

        # Activation functions (the name strings are replaced by the modules).
        self.hidden_activation = self.get_activation(self.hidden_activation)
        self.output_activation = self.get_activation(self.output_activation)

        # Initialize weights for fully connected layers
        self.initialize_weights(self.initializer)

        # Optimizer
        optimizers = {
            'adam': optim.Adam,
            'sgd': optim.SGD,
            'rmsprop': optim.RMSprop
        }
        self.selected_optimizer = optimizers[self.optimizer.lower()](self.parameters(), lr=self.alpha)

        # Loss function
        losses = {
            'mean_squared_error': torch.nn.MSELoss(),
            'binary_crossentropy': torch.nn.BCELoss()
        }
        self.loss_function = losses[self.loss.lower()]

    def forward(self, initial_hidden, x, padding_mask):
        """Run the model.

        Args:
            initial_hidden: State batch, (batch, input_neuron_size_).
            x: Action sequences, (batch, sequence, input_neuron_size); timesteps
                whose features all equal `mask_value` are treated as padding.
            padding_mask: Accepted for interface compatibility; not used here.

        Returns:
            (out, h): `out` is (batch, output_neuron_size); `h` is
            (num_layers, batch, input_neuron_size_).
        """
        h  = self.fully_connected_layer_in_0(initial_hidden)
        h  = self.hidden_activation(h)
        h  = self.fully_connected_layer_in_1(h)
        h  = self.hidden_activation(h)
        # Every recurrent layer starts from the same encoded state.
        h  = torch.unsqueeze(h, dim=0).repeat(self.num_layers, 1, 1)

        out          = x.permute(1, 0, 2)
        # Count unpadded timesteps per sample; `out` is (sequence, batch, features) here.
        lengths      = (out != self.mask_value).any(dim=2).sum(dim=0).cpu().long()
        out          = rnn_utils.pack_padded_sequence(out, lengths, batch_first=False, enforce_sorted=False)

        # Forward propagate RNN. The comparison is case-insensitive so it agrees
        # with the case-insensitive layer lookup in __init__.
        if self.neural_type.lower() == 'lstm':
            out, h     = self.recurrent_layer_0(out, (h, h))
            h          = h[0]  # keep the hidden state, drop the cell state
        else:
            out, h     = self.recurrent_layer_0(out, h)

        out, _     = rnn_utils.pad_packed_sequence(out, batch_first=False)
        # pad_packed_sequence only pads to the longest sequence in the batch;
        # extend the time axis to the fixed length the reward head expects.
        padding    = (0, 0, 0, 0, 0, self.input_sequence_size - out.size(0))
        out        = F.pad(out, padding, "constant", 0)
        out        = out.permute(1, 0, 2)

        h  = self.fully_connected_layer_out_0(h)
        h  = self.hidden_activation(h)
        h  = self.fully_connected_layer_out_1(h)
        h  = self.output_activation(h)

        out = torch.flatten(out, start_dim=1)
        out = self.fully_connected_layer_0(out)
        out = self.hidden_activation(out)
        out = self.fully_connected_layer_1(out)
        out = self.output_activation(out)

        return out, h

    def get_activation(self,  activation):
        """Return the activation module for a (case-insensitive) name."""
        activations = {
            'relu': nn.ReLU(),
            'leaky_relu': nn.LeakyReLU(),
            'sigmoid': nn.Sigmoid(),
            'tanh': nn.Tanh()
        }
        return activations[activation.lower()]

    def initialize_weights(self, initializer):
        """Apply the named initialiser to every direct `nn.Linear` child.

        The recurrent layer is not an `nn.Linear`, so it keeps its default
        initialisation.
        """
        initializers = {
            'random_uniform': nn.init.uniform_,
            'random_normal': nn.init.normal_,
            'glorot_uniform': nn.init.xavier_uniform_,
            'glorot_normal': nn.init.xavier_normal_,
            'xavier_uniform': nn.init.xavier_uniform_,
            'xavier_normal': nn.init.xavier_normal_
        }
        initializer = initializers[initializer.lower()]
        for layer in self.children():
            if isinstance(layer, nn.Linear):
                initializer(layer.weight)

# Function for updating input value using error backprop

In [6]:

def update_actions_value(epoch_for_deducing,
            model_loader,
            desired_reward,
            state,
            actions_value,
            beta):
    """Refine action logits by back-propagating the reward error to the input.

    For every epoch the ensemble is visited in random order. Each model
    predicts the reward for `sigmoid(actions_value)` and the gradient of its
    loss against `desired_reward` with respect to the action is pushed back
    through the sigmoid into `actions_value`.

    Args:
        epoch_for_deducing: Number of passes over the ensemble.
        model_loader: Sequence of models exposing `loss_function` and callable
            as `model(state, action, padding_mask=None)`.
        desired_reward: Target passed to each model's loss.
        state: Current state batch handed to the models.
        actions_value: Action logits; updated in place.
        beta: Step size of the input update.

    Returns:
        The same `actions_value` tensor after the updates.
    """
    # A shallow copy is enough to shuffle without reordering the caller's list.
    # The models themselves are never modified: the gradient is requested for
    # the action only, so no parameter .grad is written.
    shuffled_models = list(model_loader)

    for _ in range(epoch_for_deducing):

        random.shuffle(shuffled_models)

        for model in shuffled_models:

            action = torch.sigmoid(actions_value).clone().detach().requires_grad_(True)

            output, _ = model(state, action, padding_mask=None)
            total_loss = model.loss_function(output, desired_reward)

            # Error backpropagation restricted to the input.
            (action_grad,) = torch.autograd.grad(total_loss, action)

            # Chain rule through the sigmoid: d(action)/d(logit) = action * (1 - action).
            with torch.no_grad():
                actions_value -= action_grad * (1 - action) * action * beta

    return actions_value




# Function for updating weight matrices using error backprop

Elastic weight consolidation:
https://arxiv.org/pdf/1612.00796

In [7]:
# Traditional EWC
def EWC_loss( EWC_lambda, model, present_model, present_gradient_matrix, prev_model, prev_gradient_matrix, prev_train_loader_size):
    """Quadratic penalty that anchors `model` to `prev_model`.

    For every parameter the squared distance to the previous model's value is
    weighted by the squared entry of `prev_gradient_matrix`, summed, and scaled
    by `EWC_lambda`. `present_model`, `present_gradient_matrix` and
    `prev_train_loader_size` are accepted but not used by this variant.
    """
    prev_model_param = prev_model.state_dict()
    loss = 0
    for name, param in model.named_parameters():
        loss += ( ((prev_gradient_matrix[name])**2) * ((param - prev_model_param[name])**2)     ).sum()
    return EWC_lambda * loss


def _prediction_loss(model, batch):
    """Reward loss plus next-state loss of `model` on one batch."""
    state, action, reward, next_state, padding_mask = batch
    # The model returns one state per recurrent layer; replicate the target to match.
    next_state = torch.unsqueeze(next_state, dim=0).repeat(model.num_layers, 1, 1)
    output, output_state = model(state, action, padding_mask)
    return model.loss_function(output, reward) + model.loss_function(output_state, next_state)


def _mean_gradient_matrix(model, train_loader):
    """Average, over all batches, the prediction-loss gradient of each parameter."""
    totals = {name: torch.zeros_like(param) for name, param in model.named_parameters()}
    for batch in train_loader:
        model.selected_optimizer.zero_grad()
        _prediction_loss(model, batch).backward()     # Error Backpropagation
        for name, param in model.named_parameters():
            totals[name] += param.grad
    return {name: total / len(train_loader) for name, total in totals.items()}


def update_model(batch_size,
                 epoch_for_learning,
                 model,
                 train_loader,
                 dataset,
                 prev_model,
                 prev_gradient_matrix,
                 prev_train_loader_size,
                 EWC_lambda):
    """Train `model` on `train_loader` with an EWC penalty towards `prev_model`.

    Args:
        batch_size: Scales the learning rate (`model.alpha * batch_size`).
        epoch_for_learning: Number of training epochs.
        model: Model to train in place; must expose `selected_optimizer`,
            `loss_function`, `alpha` and `num_layers`.
        train_loader: Iterable of (state, action, reward, next_state,
            padding_mask) batches that supports `len()`.
        dataset: Unused; kept for interface compatibility.
        prev_model: Model whose parameters anchor the EWC penalty.
        prev_gradient_matrix: Per-parameter weights of the EWC penalty.
        prev_train_loader_size: Added to `len(train_loader)` in the result.
        EWC_lambda: Strength of the EWC penalty.

    Returns:
        (model, present_gradient_matrix, present_train_loader_size), where the
        gradient matrix is the mean prediction-loss gradient measured BEFORE
        this call's training epochs.
    """
    present_model = copy.deepcopy(model)

    model.train()

    selected_optimizer = model.selected_optimizer
    for param_group in selected_optimizer.param_groups:
        param_group['lr'] = model.alpha * batch_size

    # retrieving present_gradient_matrix
    present_gradient_matrix = _mean_gradient_matrix(model, train_loader)

    for epoch in range(epoch_for_learning):

        for batch in train_loader:

            selected_optimizer.zero_grad()

            total_loss = _prediction_loss(model, batch)
            total_loss += EWC_loss(EWC_lambda, model, present_model, present_gradient_matrix, prev_model, prev_gradient_matrix, prev_train_loader_size)

            total_loss.backward()     # Error Backpropagation

            selected_optimizer.step() # Update Model Weight

    # NOTE(review): the previous version ran one more full pass here to build a
    # post-training gradient matrix and then discarded it. That pass is removed
    # because it never affected the result. If the post-training estimate was
    # meant to be returned, compute `_mean_gradient_matrix(model, train_loader)`
    # at this point and return it instead of `present_gradient_matrix`.

    present_train_loader_size = len(train_loader) + prev_train_loader_size

    return model, present_gradient_matrix, present_train_loader_size

# Function for re-initializing action value in each step

In [8]:
def initialize_actions_value(init, noise_t, noise_r, shape):
    """Draw the starting action logits as a sum of scaled random tensors.

    Args:
        init: Distribution name (case-insensitive, like the model's weight
            initialiser): 'random_uniform', 'random_normal', 'glorot_uniform',
            'glorot_normal', 'xavier_uniform' or 'xavier_normal'.
        noise_t: Number of independent draws that are summed.
        noise_r: Scale applied to every draw.
        shape: Shape of one draw.

    Returns:
        Array of shape (1, *shape), or the integer 0 when `noise_t` is 0.

    Raises:
        ValueError: If `init` is not one of the supported names.
    """
    def _glorot_limit():
        # NOTE(review): uses shape[1] twice where Glorot's formula has
        # fan_in + fan_out; kept as-is to preserve the sampled distribution.
        return np.sqrt(6 / (shape[1] + shape[1]))

    def _glorot_std():
        return np.sqrt(2 / (shape[1] + shape[1]))

    def _uniform():
        return np.random.uniform(low=0, high=1, size=shape)

    def _normal():
        return np.random.normal(loc=0.0, scale=1, size=shape)

    def _glorot_uniform():
        limit = _glorot_limit()
        return np.random.uniform(low=-limit, high=limit, size=shape)

    def _glorot_normal():
        return np.random.normal(loc=0.0, scale=_glorot_std(), size=shape)

    samplers = {
        "random_uniform": _uniform,
        "random_normal": _normal,
        "glorot_uniform": _glorot_uniform,
        "glorot_normal": _glorot_normal,
        "xavier_uniform": _glorot_uniform,
        "xavier_normal": _glorot_normal,
    }

    key = init.lower()
    if key not in samplers:
        # Previously an unknown name silently produced the integer 0, which only
        # failed later when the caller indexed the result.
        raise ValueError(f"unknown init {init!r}; expected one of {sorted(samplers)}")
    draw = samplers[key]

    total = 0
    for _ in range(noise_t):
        total += np.array([draw()]) * noise_r
    return total

# Function for sequentializing state, action and reward

In [9]:
def sequentialize(state_list, action_list, reward_list, chunk_size):
    """Cut one episode into overlapping (state, actions, reward, next_state) windows.

    Window `i` pairs the state at step `i` with the next `chunk_size` actions,
    the reward received for the last of those actions and the state reached
    after it. `chunk_size` is capped at the number of steps in the episode
    (`state_list` holds one more entry than there are steps).

    Returns:
        Four parallel lists: start states, action windows, rewards, next states.
    """
    step_count = len(state_list[:-1])
    chunk_size_ = min(chunk_size, step_count) if chunk_size > step_count else chunk_size

    # A chunk of one cannot use the `[:1 - chunk]` slice (it would be `[:0]`).
    if chunk_size_ == 1:
        window_count = len(reward_list[:])
    else:
        window_count = len(reward_list[:1 - chunk_size_])

    start_states, action_windows, window_rewards, end_states = [], [], [], []
    for start in range(window_count):
        window = action_list[start:start + chunk_size_]
        last = start + len(window)
        start_states.append(state_list[start])
        action_windows.append(window)
        window_rewards.append(reward_list[last - 1])
        end_states.append(state_list[last])

    return start_states, action_windows, window_rewards, end_states


In [10]:
def save_performance_to_csv(performance_log, filename='performance_log.csv'):
    """Overwrite `filename` with a header row followed by one row per log entry.

    Each entry of `performance_log` is an (episode, summed reward) pair.
    """
    header = ['Episode', 'Summed_Reward']
    with open(filename, mode='w', newline='') as csv_file:
        log_writer = csv.writer(csv_file)
        log_writer.writerow(header)
        for entry in performance_log:
            log_writer.writerow(entry)

# Function for vectorizing
Crucial function regarding how you manipulate or shape your state, action and reward

- It's essential to choose between immediate rewards and summed rewards for training your agent. If the current state doesn't encapsulate all crucial past information, using immediate rewards is advisable. This approach prevents confusion caused by varying summed rewards for the same state.

- As for reward shaping, it is recommended to widen the reward range: raise the reward's upper bound and lower its lower bound.

In [11]:

def quantifying(array_size, init, interval, input):
    """Thermometer-encode a scalar.

    Returns a zero array of size `array_size` whose leading
    `floor((input - init) / interval) + 1` entries are set to 1. A negative
    count leaves the array all zeros; a count beyond the array fills it.
    """
    encoded = np.zeros(array_size)
    filled = int((input - init) // interval + 1)
    # Clamp at zero so a negative count cannot turn into a from-the-end slice.
    encoded[:max(filled, 0)] = 1
    return encoded

def vectorizing_state(state):      # Reminder: change this for your specific task ⚠️⚠️⚠️
    """Encode an observation as a (1, 500) array of five 100-bin thermometer codes.

    The first four codes quantise `state[0]` .. `state[3]` with their own lower
    bound and bin width; the fifth is a constant code (only its first bin set).
    """
    # (lower bound, bin width) for state[0] .. state[3].
    bin_specs = ((-2.5, 0.050), (-3.75, 0.075), (-0.375, 0.0075), (-3.75, 0.075))
    codes = [quantifying(100, low, width, state[k]) for k, (low, width) in enumerate(bin_specs)]
    codes.append(quantifying(100, 0, 10, 0))
    return np.atleast_2d(np.concatenate(codes))

def vectorizing_action(action_size, action_arg):  # Reminder: change this for your specific task ⚠️⚠️⚠️
    """Return the one-hot vector of length `action_size` with index `action_arg` set."""
    one_hot_rows = np.identity(action_size)
    return one_hot_rows[action_arg]

def vectorizing_reward(state, reward, summed_reward, done, reward_size):       # Reminder: change this for your specific task ⚠️⚠️⚠️
    """Return an all-zeros reward vector when the episode is done, all-ones otherwise.

    `state`, `reward` and `summed_reward` are accepted but not used.
    """
    make_vector = np.zeros if done else np.ones
    return make_vector(reward_size)


# Control board

Crucial variables regarding how your agent will learn in the environment

- In some environments, it is crucial to increase your "max_steps_for_each_episode" so that your agent can "live long enough" to obtain better rewards and gradually, heuristically learn a better strategy.



In [12]:
game_name = 'CartPole-v1'                # Reminder: change this for your specific task ⚠️⚠️⚠️
max_steps_for_each_episode = 2000        # Reminder: change this for your specific task ⚠️⚠️⚠️

state_size =  500                        # Reminder: change this for your specific task ⚠️⚠️⚠️
hidden_size = 100                        # Reminder: change this for your specific task ⚠️⚠️⚠️ (should be divisible by num_heads below)
action_size = 2                          # Reminder: change this for your specific task ⚠️⚠️⚠️
time_size = 15                           # Reminder: change this for your specific task ⚠️⚠️⚠️
chunk_size = 5                           # Reminder: change this for your specific task ⚠️⚠️⚠️
reward_size = 100                        # Reminder: change this for your specific task ⚠️⚠️⚠️

ensemble_size = 10                       # Reminder: change this value to see the impact of MWM-SGD ◀️◀️◀️
neural_type = 'gru'                      # rnn gru lstm
num_layers = 2                           # Reminder: change this for your specific task ⚠️⚠️⚠️
num_heads  = 10                          # should be able to divide hidden_size
hidden_activation = 'tanh'               # relu leaky_relu sigmoid tanh
output_activation = 'sigmoid'            # relu leaky_relu sigmoid tanh
init = "random_normal"                   # random_normal random_uniform xavier_normal xavier_uniform  glorot_normal  glorot_uniform
loss = 'mean_squared_error'              # mean_squared_error  binary_crossentropy
opti = 'sgd'                             # adam sgd rmsprop
alpha = 0.1
epoch_for_learning = 10
batch_size = 1


noise_t = 1
noise_r = 0.1
beta = 0.1
epoch_for_deducing =  int(100/ensemble_size)


episode_for_training             = 100000
replay_range                     = 2                     # Reminder: change this for your specific task ⚠️⚠️⚠️
interval_for_initiating_learning = 50                    # Reminder: change this for your specific task ⚠️⚠️⚠️
EWC_lambda = 1                                           # Reminder: change this value to see the impact of EWC ◀️◀️◀️


episode_for_testing = 100                # Reminder: change this for your specific task ⚠️⚠️⚠️
render_for_human = False                 # Reminder: change this for your specific task ⚠️⚠️⚠️






mask_value = sys.maxsize                 # sentinel written into padded action timesteps
load_pre_model = False
# The `05d` format only accepts integers. Integer lambdas keep their zero-padded
# tag (so existing file names are unchanged); any other value falls back to a
# compact float tag instead of raising.
EWC_lambda_tag              = f"{EWC_lambda:05d}" if isinstance(EWC_lambda, int) else f"{EWC_lambda:g}"
suffix                      = f"ensemble={ensemble_size:05d}_learn={epoch_for_learning:05d}_interval={interval_for_initiating_learning:05d}_deduce={epoch_for_deducing:05d}_lambda={EWC_lambda_tag}"
directory                   = f'/content/Genrl/{game_name}/'
model_directory             = f'/content/Genrl/{game_name}/model_{suffix}'+'_%s.h5'
performance_log_directory   = f'/content/Genrl/{game_name}/performace_log_{suffix}.csv'

# Deducing > Learning


Creating or loading models

In [13]:

# Create the output directory; exist_ok avoids the check-then-create race.
os.makedirs(directory, exist_ok=True)

# Build the ensemble. Fresh and pre-trained runs share the same architecture,
# so the models are constructed once and the saved weights loaded afterwards.
model_loader = []
for _ in range(ensemble_size):
    model = build_model(state_size,
                        hidden_size,
                        action_size,
                        time_size,
                        reward_size,
                        neural_type,
                        num_layers,
                        num_heads,
                        hidden_activation,
                        output_activation,
                        init,
                        opti,
                        loss,
                        alpha,
                        mask_value)
    model.to(device)
    model_loader.append(model)

if load_pre_model:
    for i in range(len(model_loader)):
        # map_location keeps loading independent of the device the checkpoint was saved from.
        model_loader[i].load_state_dict(torch.load(model_directory % i, map_location=device))


Creating Streams

In [14]:
# One CUDA stream per ensemble member; the learning step runs each model's update under its own stream.
stream_list = [torch.cuda.Stream() for _ in range(ensemble_size)]


  and should_run_async(code)


Creating initial gradient matrices

In [15]:
"""
storing previous models
"""
# Snapshot of the ensemble that the EWC penalty is anchored to.
prev_train_loader_size = 1
prev_model_loader = copy.deepcopy(model_loader)

# Initial gradient matrices: all zeros, so the first EWC penalty is zero.
prev_gradient_matrix_loader = [
    {name: torch.zeros_like(param) for name, param in model.named_parameters()}
    for model in model_loader
]


Creating desired reward

In [16]:
# Target handed to the deduction step: a single row of ones, i.e. the reward vector of a non-terminal step.
desired_reward = torch.ones((1, reward_size), dtype=torch.float).to(device)

Putting all the previous works into play

In [None]:

# Training driver: play one episode with actions deduced by input
# back-propagation, add its windows to the replay buffer and, every
# `interval_for_initiating_learning` episodes, retrain and save the ensemble.

performance_log = []
performance_log.append([0, 0])

# Replay buffer: parallel lists, one entry per sequentialized window.
episode_list = []
sequentialized_state_list = []
sequentialized_action_list = []
sequentialized_reward_list = []
sequentialized_next_state_list = []

env = gym.make(game_name)
env._max_episode_steps = max_steps_for_each_episode

for training_episode in tqdm(range(episode_for_training)):

    state_list  = []
    action_list = []
    reward_list = []

    summed_reward = 0

    state = env.reset()


    # Getting state
    state = vectorizing_state(state)
    state_list.append(state[0])


    for _ in range(sys.maxsize):


        # Getting action
        state         = torch.tensor(state, dtype=torch.float).to(device)
        actions_value = initialize_actions_value(init, noise_t, noise_r,(time_size, action_size) )
        actions_value = torch.tensor(actions_value, dtype=torch.float).to(device)
        actions_value = update_actions_value(epoch_for_deducing,
                                           model_loader,
                                           desired_reward,
                                           state,
                                           actions_value,
                                           beta)
        # Only the first action of the deduced plan is executed.
        action_arg    = int(torch.argmax(actions_value[0, 0]))
        action        = vectorizing_action(action_size, action_arg)
        action_list.append(action)


        # Getting reward
        state, reward, done,  info = env.step(action_arg)
        summed_reward += reward
        reward = vectorizing_reward(state, reward, summed_reward, done, reward_size)
        reward_list.append(reward)


        # Getting state
        state = vectorizing_state(state)
        state_list.append(state[0])


        if done:
            print(f'Episode {training_episode+1}: Summed_Reward = {summed_reward}')
            performance_log.append([training_episode+1, summed_reward])
            # Save performance log to CSV
            save_performance_to_csv(performance_log, performance_log_directory)
            break


    # sequentializing and setting replay range
    sequentialized_state_list_slice, sequentialized_action_list_slice, sequentialized_reward_list_slice, sequentialized_next_state_list_slice = sequentialize(state_list, action_list, reward_list, chunk_size )

    episode_list .extend( [ training_episode ] * len(sequentialized_state_list_slice))
    sequentialized_state_list       .extend( sequentialized_state_list_slice)
    sequentialized_action_list      .extend( sequentialized_action_list_slice)
    sequentialized_reward_list      .extend( sequentialized_reward_list_slice)
    sequentialized_next_state_list  .extend( sequentialized_next_state_list_slice)


    if (training_episode+1) % interval_for_initiating_learning == 0:

        # Keep only windows from the most recent
        # `replay_range * interval_for_initiating_learning` episodes. The keep
        # flags are computed once from the unfiltered episode ids and applied to
        # every list: filtering `episode_list` first and zipping it against the
        # other, still unfiltered lists would retain their oldest entries instead.
        oldest_replayed_episode = training_episode - replay_range * interval_for_initiating_learning + 1
        keep_flags = [oldest_replayed_episode <= episode <= training_episode for episode in episode_list]

        sequentialized_state_list      = [item for item, keep in zip(sequentialized_state_list,      keep_flags) if keep]
        sequentialized_action_list     = [item for item, keep in zip(sequentialized_action_list,     keep_flags) if keep]
        sequentialized_reward_list     = [item for item, keep in zip(sequentialized_reward_list,     keep_flags) if keep]
        sequentialized_next_state_list = [item for item, keep in zip(sequentialized_next_state_list, keep_flags) if keep]
        episode_list                   = [item for item, keep in zip(episode_list,                   keep_flags) if keep]


        # masking and uploading data
        state_tensor      = torch.stack( [torch.tensor(arr) for arr in sequentialized_state_list]               ).float().to(device)
        # Action windows shorter than time_size are padded with mask_value rows.
        action_tensor     = torch.stack( [F.pad(torch.tensor(arr),
                                                pad=(0, 0, 0, time_size - torch.tensor(arr).size(0)),
                                                mode='constant',
                                                value= mask_value) for arr in sequentialized_action_list]       ).float().to(device)
        reward_tensor     = torch.stack( [torch.tensor(arr) for arr in sequentialized_reward_list]              ).float().to(device)
        next_state_tensor = torch.stack( [torch.tensor(arr) for arr in sequentialized_next_state_list]          ).float().to(device)

        row_mask     = torch.all(action_tensor == mask_value, dim = -1)
        padding_mask = torch.zeros_like(action_tensor, dtype = torch.bool)
        padding_mask[row_mask] = True
        padding_mask = padding_mask.to(device)

        dataset     = TensorDataset(state_tensor, action_tensor, reward_tensor, next_state_tensor, padding_mask)
        data_loader = DataLoader(dataset, batch_size = batch_size, shuffle=True)


        # learning
        gradient_matrix_loader_ = []
        for i, model in enumerate(model_loader):
            with torch.cuda.stream(stream_list[i]):
                model, gradient_matrix_, train_loader_size_= update_model(batch_size, epoch_for_learning, model, data_loader, dataset, prev_model_loader[i], prev_gradient_matrix_loader[i], prev_train_loader_size, EWC_lambda)
                gradient_matrix_loader_.append(gradient_matrix_)
                model_loader[i] = model
        torch.cuda.synchronize()

        # The freshly trained ensemble becomes the EWC anchor for the next round.
        prev_gradient_matrix_loader = gradient_matrix_loader_
        prev_model_loader           = copy.deepcopy(model_loader)
        prev_train_loader_size      = train_loader_size_


        # saving
        for i in range(len(model_loader)):
            torch.save(model_loader[i].state_dict(), model_directory % i)


        gc.collect()
        torch.cuda.empty_cache()

# Release the environment once, after the last episode. It used to be closed at
# the end of every episode and then reset again.
env.close()



  deprecation(
  deprecation(
  result = _VF.gru(input, batch_sizes, hx, self._flat_weights, self.bias,
  if not isinstance(terminated, (bool, np.bool8)):
  0%|          | 1/100000 [00:14<403:25:25, 14.52s/it]

Episode 1: Summed_Reward = 14.0


  0%|          | 2/100000 [00:22<302:21:16, 10.88s/it]

Episode 2: Summed_Reward = 15.0


  0%|          | 3/100000 [00:31<269:47:07,  9.71s/it]

Episode 3: Summed_Reward = 16.0


  0%|          | 4/100000 [00:35<210:12:33,  7.57s/it]

Episode 4: Summed_Reward = 9.0


  0%|          | 5/100000 [00:43<210:02:35,  7.56s/it]

Episode 5: Summed_Reward = 14.0


  0%|          | 6/100000 [00:48<188:25:28,  6.78s/it]

Episode 6: Summed_Reward = 11.0


  0%|          | 7/100000 [00:56<199:53:19,  7.20s/it]

Episode 7: Summed_Reward = 15.0


  0%|          | 8/100000 [01:12<282:11:40, 10.16s/it]

Episode 8: Summed_Reward = 33.0


  0%|          | 9/100000 [01:24<299:07:08, 10.77s/it]

Episode 9: Summed_Reward = 24.0


  0%|          | 10/100000 [01:45<383:30:35, 13.81s/it]

Episode 10: Summed_Reward = 40.0


  0%|          | 11/100000 [01:52<326:16:30, 11.75s/it]

Episode 11: Summed_Reward = 14.0


  0%|          | 12/100000 [02:01<299:33:34, 10.79s/it]

Episode 12: Summed_Reward = 17.0


  0%|          | 13/100000 [02:16<341:22:42, 12.29s/it]

Episode 13: Summed_Reward = 32.0


  0%|          | 14/100000 [02:25<305:41:32, 11.01s/it]

Episode 14: Summed_Reward = 16.0


  0%|          | 15/100000 [02:32<274:13:41,  9.87s/it]

Episode 15: Summed_Reward = 14.0


  0%|          | 16/100000 [02:57<402:50:07, 14.50s/it]

Episode 16: Summed_Reward = 51.0


  0%|          | 17/100000 [03:09<378:04:29, 13.61s/it]

Episode 17: Summed_Reward = 23.0


  0%|          | 18/100000 [03:38<511:39:38, 18.42s/it]

Episode 18: Summed_Reward = 59.0


  0%|          | 19/100000 [03:50<458:22:18, 16.50s/it]

Episode 19: Summed_Reward = 24.0


  0%|          | 20/100000 [04:00<406:31:08, 14.64s/it]

Episode 20: Summed_Reward = 20.0


  0%|          | 21/100000 [04:08<344:08:24, 12.39s/it]

Episode 21: Summed_Reward = 15.0


  0%|          | 22/100000 [04:23<367:10:43, 13.22s/it]

Episode 22: Summed_Reward = 30.0


  0%|          | 23/100000 [04:27<295:54:57, 10.66s/it]

Episode 23: Summed_Reward = 9.0


  0%|          | 24/100000 [04:37<287:24:06, 10.35s/it]

Episode 24: Summed_Reward = 19.0


  0%|          | 25/100000 [04:45<265:33:16,  9.56s/it]

Episode 25: Summed_Reward = 16.0


  0%|          | 26/100000 [04:53<254:17:46,  9.16s/it]

Episode 26: Summed_Reward = 16.0


  0%|          | 27/100000 [05:02<250:12:40,  9.01s/it]

Episode 27: Summed_Reward = 17.0


  0%|          | 28/100000 [05:20<331:12:05, 11.93s/it]

Episode 28: Summed_Reward = 38.0


  0%|          | 29/100000 [05:25<268:32:24,  9.67s/it]

Episode 29: Summed_Reward = 9.0


  0%|          | 30/100000 [05:51<408:23:11, 14.71s/it]

Episode 30: Summed_Reward = 53.0


  0%|          | 31/100000 [06:14<476:11:20, 17.15s/it]

Episode 31: Summed_Reward = 46.0


  0%|          | 32/100000 [06:26<433:22:21, 15.61s/it]

Episode 32: Summed_Reward = 24.0


  0%|          | 33/100000 [06:36<387:28:55, 13.95s/it]

Episode 33: Summed_Reward = 20.0


  0%|          | 34/100000 [06:43<328:45:41, 11.84s/it]

Episode 34: Summed_Reward = 13.0


  0%|          | 35/100000 [06:56<333:22:48, 12.01s/it]

Episode 35: Summed_Reward = 25.0


  0%|          | 36/100000 [07:05<315:16:49, 11.35s/it]

Episode 36: Summed_Reward = 20.0


  0%|          | 37/100000 [07:19<337:48:04, 12.17s/it]

Episode 37: Summed_Reward = 28.0


  0%|          | 38/100000 [07:35<368:39:53, 13.28s/it]

Episode 38: Summed_Reward = 32.0


  0%|          | 39/100000 [07:44<330:15:58, 11.89s/it]

Episode 39: Summed_Reward = 17.0


  0%|          | 40/100000 [07:51<285:27:01, 10.28s/it]

Episode 40: Summed_Reward = 14.0


  0%|          | 41/100000 [08:09<352:01:44, 12.68s/it]

Episode 41: Summed_Reward = 36.0


  0%|          | 42/100000 [08:21<344:20:23, 12.40s/it]

Episode 42: Summed_Reward = 24.0


  0%|          | 43/100000 [08:26<288:43:08, 10.40s/it]

Episode 43: Summed_Reward = 12.0


  0%|          | 44/100000 [08:33<257:07:27,  9.26s/it]

Episode 44: Summed_Reward = 13.0


  0%|          | 45/100000 [08:42<254:04:43,  9.15s/it]

Episode 45: Summed_Reward = 19.0


  0%|          | 46/100000 [08:49<239:00:36,  8.61s/it]

Episode 46: Summed_Reward = 14.0


  0%|          | 47/100000 [08:59<250:05:28,  9.01s/it]

Episode 47: Summed_Reward = 20.0


  0%|          | 48/100000 [09:06<233:05:15,  8.40s/it]

Episode 48: Summed_Reward = 15.0


  0%|          | 49/100000 [09:13<219:04:07,  7.89s/it]

Episode 49: Summed_Reward = 13.0
Episode 50: Summed_Reward = 20.0


  action_tensor     = torch.stack( [F.pad(torch.tensor(arr),
  0%|          | 51/100000 [25:18<5796:24:35, 208.78s/it]

Episode 51: Summed_Reward = 50.0


  0%|          | 52/100000 [25:29<4154:53:11, 149.65s/it]

Episode 52: Summed_Reward = 23.0


  0%|          | 53/100000 [25:39<2989:02:38, 107.66s/it]

Episode 53: Summed_Reward = 19.0


  0%|          | 54/100000 [25:44<2135:42:15, 76.93s/it] 

Episode 54: Summed_Reward = 11.0


  0%|          | 55/100000 [25:53<1564:06:08, 56.34s/it]

Episode 55: Summed_Reward = 16.0


  0%|          | 56/100000 [25:59<1148:20:14, 41.36s/it]

Episode 56: Summed_Reward = 13.0


  0%|          | 57/100000 [26:12<907:53:29, 32.70s/it] 

Episode 57: Summed_Reward = 25.0


  0%|          | 58/100000 [26:17<682:39:39, 24.59s/it]

Episode 58: Summed_Reward = 11.0


  0%|          | 59/100000 [26:27<555:20:28, 20.00s/it]

Episode 59: Summed_Reward = 18.0


  0%|          | 60/100000 [26:32<435:55:36, 15.70s/it]

Episode 60: Summed_Reward = 12.0


  0%|          | 61/100000 [26:38<354:08:03, 12.76s/it]

Episode 61: Summed_Reward = 11.0


  0%|          | 62/100000 [26:47<319:33:43, 11.51s/it]

Episode 62: Summed_Reward = 18.0


  0%|          | 63/100000 [26:55<293:05:20, 10.56s/it]

Episode 63: Summed_Reward = 16.0


  0%|          | 64/100000 [27:19<405:44:11, 14.62s/it]

Episode 64: Summed_Reward = 48.0


  0%|          | 65/100000 [27:34<411:00:50, 14.81s/it]

Episode 65: Summed_Reward = 31.0


  0%|          | 66/100000 [27:45<372:20:12, 13.41s/it]

Episode 66: Summed_Reward = 20.0


  0%|          | 67/100000 [27:51<313:03:16, 11.28s/it]

Episode 67: Summed_Reward = 13.0


  0%|          | 68/100000 [28:03<323:49:34, 11.67s/it]

Episode 68: Summed_Reward = 25.0


  0%|          | 69/100000 [28:11<293:54:06, 10.59s/it]

Episode 69: Summed_Reward = 16.0


  0%|          | 70/100000 [28:18<263:06:07,  9.48s/it]

Episode 70: Summed_Reward = 13.0


  0%|          | 71/100000 [28:32<300:03:51, 10.81s/it]

Episode 71: Summed_Reward = 28.0


  0%|          | 72/100000 [28:46<325:50:18, 11.74s/it]

Episode 72: Summed_Reward = 28.0


  0%|          | 73/100000 [28:52<277:23:04,  9.99s/it]

Episode 73: Summed_Reward = 12.0


  0%|          | 74/100000 [29:02<278:33:33, 10.04s/it]

Episode 74: Summed_Reward = 20.0


  0%|          | 75/100000 [29:10<262:48:13,  9.47s/it]

Episode 75: Summed_Reward = 16.0


  0%|          | 76/100000 [29:22<279:46:12, 10.08s/it]

Episode 76: Summed_Reward = 23.0


  0%|          | 77/100000 [29:28<246:44:24,  8.89s/it]

Episode 77: Summed_Reward = 13.0


  0%|          | 78/100000 [29:34<221:58:45,  8.00s/it]

Episode 78: Summed_Reward = 11.0


  0%|          | 79/100000 [29:43<228:03:25,  8.22s/it]

Episode 79: Summed_Reward = 18.0


  0%|          | 80/100000 [29:51<225:58:24,  8.14s/it]

Episode 80: Summed_Reward = 16.0


  0%|          | 81/100000 [30:00<238:56:26,  8.61s/it]

Episode 81: Summed_Reward = 19.0


  0%|          | 82/100000 [30:05<206:20:42,  7.43s/it]

Episode 82: Summed_Reward = 10.0


  0%|          | 83/100000 [30:14<217:10:43,  7.82s/it]

Episode 83: Summed_Reward = 17.0


  0%|          | 84/100000 [30:20<204:12:38,  7.36s/it]

Episode 84: Summed_Reward = 13.0


  0%|          | 85/100000 [30:26<193:56:29,  6.99s/it]

Episode 85: Summed_Reward = 12.0


  0%|          | 86/100000 [30:35<212:48:32,  7.67s/it]

Episode 86: Summed_Reward = 18.0


  0%|          | 87/100000 [30:43<212:54:20,  7.67s/it]

Episode 87: Summed_Reward = 16.0


  0%|          | 88/100000 [30:48<193:26:26,  6.97s/it]

Episode 88: Summed_Reward = 10.0


  0%|          | 89/100000 [31:00<231:48:57,  8.35s/it]

Episode 89: Summed_Reward = 23.0


  0%|          | 90/100000 [31:12<258:20:32,  9.31s/it]

Episode 90: Summed_Reward = 23.0


  0%|          | 91/100000 [31:40<420:19:33, 15.15s/it]

Episode 91: Summed_Reward = 58.0


  0%|          | 92/100000 [32:03<479:29:43, 17.28s/it]

Episode 92: Summed_Reward = 44.0


  0%|          | 93/100000 [32:16<451:10:39, 16.26s/it]

Episode 93: Summed_Reward = 28.0


  0%|          | 94/100000 [32:27<401:16:53, 14.46s/it]

Episode 94: Summed_Reward = 20.0


  0%|          | 95/100000 [32:33<329:17:15, 11.87s/it]

Episode 95: Summed_Reward = 12.0


  0%|          | 96/100000 [32:41<299:59:17, 10.81s/it]

Episode 96: Summed_Reward = 16.0


  0%|          | 97/100000 [32:55<323:49:02, 11.67s/it]

Episode 97: Summed_Reward = 27.0


  0%|          | 98/100000 [33:01<283:04:26, 10.20s/it]

Episode 98: Summed_Reward = 14.0


  0%|          | 99/100000 [33:13<293:09:50, 10.56s/it]

Episode 99: Summed_Reward = 23.0
Episode 100: Summed_Reward = 74.0


  0%|          | 101/100000 [1:04:44<11175:23:53, 402.72s/it]

Episode 101: Summed_Reward = 9.0


  0%|          | 102/100000 [1:04:48<7860:14:14, 283.26s/it] 

Episode 102: Summed_Reward = 9.0


  0%|          | 103/100000 [1:04:54<5544:46:24, 199.82s/it]

Episode 103: Summed_Reward = 10.0


  0%|          | 104/100000 [1:04:58<3916:22:28, 141.14s/it]

Episode 104: Summed_Reward = 9.0


  0%|          | 105/100000 [1:05:03<2782:58:34, 100.29s/it]

Episode 105: Summed_Reward = 9.0


  0%|          | 106/100000 [1:05:07<1982:50:50, 71.46s/it] 

Episode 106: Summed_Reward = 9.0


  0%|          | 107/100000 [1:05:11<1424:28:16, 51.34s/it]

Episode 107: Summed_Reward = 9.0


  0%|          | 108/100000 [1:05:16<1037:44:09, 37.40s/it]

Episode 108: Summed_Reward = 9.0


  0%|          | 109/100000 [1:05:20<761:44:55, 27.45s/it] 

Episode 109: Summed_Reward = 9.0


  0%|          | 110/100000 [1:05:25<572:55:34, 20.65s/it]

Episode 110: Summed_Reward = 10.0


  0%|          | 111/100000 [1:05:30<440:51:39, 15.89s/it]

Episode 111: Summed_Reward = 9.0


  0%|          | 112/100000 [1:05:34<343:44:23, 12.39s/it]

Episode 112: Summed_Reward = 9.0


  0%|          | 113/100000 [1:05:40<284:17:55, 10.25s/it]

Episode 113: Summed_Reward = 10.0


  0%|          | 114/100000 [1:05:43<231:35:47,  8.35s/it]

Episode 114: Summed_Reward = 8.0


  0%|          | 115/100000 [1:05:48<201:15:53,  7.25s/it]

Episode 115: Summed_Reward = 10.0


  0%|          | 116/100000 [1:05:53<177:46:20,  6.41s/it]

Episode 116: Summed_Reward = 8.0


  0%|          | 117/100000 [1:05:57<159:26:58,  5.75s/it]

Episode 117: Summed_Reward = 9.0


  0%|          | 118/100000 [1:06:01<147:02:22,  5.30s/it]

Episode 118: Summed_Reward = 9.0


  0%|          | 119/100000 [1:06:06<147:25:06,  5.31s/it]

Episode 119: Summed_Reward = 10.0


  0%|          | 120/100000 [1:06:11<142:17:35,  5.13s/it]

Episode 120: Summed_Reward = 10.0


  0%|          | 121/100000 [1:06:15<131:23:45,  4.74s/it]

Episode 121: Summed_Reward = 8.0


  0%|          | 122/100000 [1:06:20<136:54:47,  4.93s/it]

Episode 122: Summed_Reward = 10.0


  0%|          | 123/100000 [1:06:25<135:01:54,  4.87s/it]

Episode 123: Summed_Reward = 10.0


  0%|          | 124/100000 [1:06:30<135:04:02,  4.87s/it]

Episode 124: Summed_Reward = 9.0


  0%|          | 125/100000 [1:06:34<126:02:26,  4.54s/it]

Episode 125: Summed_Reward = 8.0


  0%|          | 126/100000 [1:06:38<127:58:02,  4.61s/it]

Episode 126: Summed_Reward = 10.0


  0%|          | 127/100000 [1:06:44<138:17:01,  4.98s/it]

Episode 127: Summed_Reward = 11.0


  0%|          | 128/100000 [1:06:49<132:41:13,  4.78s/it]

Episode 128: Summed_Reward = 9.0


  0%|          | 129/100000 [1:06:54<134:01:43,  4.83s/it]

Episode 129: Summed_Reward = 9.0


  0%|          | 130/100000 [1:06:58<130:13:24,  4.69s/it]

Episode 130: Summed_Reward = 9.0


  0%|          | 131/100000 [1:07:03<130:52:06,  4.72s/it]

Episode 131: Summed_Reward = 10.0


  0%|          | 132/100000 [1:07:08<136:38:58,  4.93s/it]

Episode 132: Summed_Reward = 10.0


  0%|          | 133/100000 [1:07:12<131:34:27,  4.74s/it]

Episode 133: Summed_Reward = 9.0


  0%|          | 134/100000 [1:07:18<135:09:10,  4.87s/it]

Episode 134: Summed_Reward = 10.0


  0%|          | 135/100000 [1:07:22<133:08:48,  4.80s/it]

Episode 135: Summed_Reward = 9.0


  0%|          | 136/100000 [1:07:27<133:23:04,  4.81s/it]

Episode 136: Summed_Reward = 10.0


  0%|          | 137/100000 [1:07:32<134:46:38,  4.86s/it]

Episode 137: Summed_Reward = 9.0


# Deducing (testing)

Loading models

In [None]:
# Rebuild one network per ensemble member from the notebook's hyper-parameter
# globals so that the saved state dicts can be loaded into fresh modules.
model_loader = []
for _ in range(ensemble_size):
    model = build_model(state_size,
                        hidden_size,
                        action_size,
                        time_size,
                        reward_size,
                        neural_type,
                        num_layers,
                        num_heads,
                        hidden_activation,
                        output_activation,
                        init,
                        opti,
                        loss,
                        alpha,
                        mask_value)
    model.to(device)
    model_loader.append(model)

# Restore the weights written to `model_directory % i`.
# `map_location=device` remaps the stored tensors onto the device selected in
# this session, so a checkpoint saved on another device (e.g. a different
# CUDA index) still loads.
for i, model in enumerate(model_loader):
    model.load_state_dict(torch.load(model_directory % i, map_location=device))

Creating desired reward ... again

In [None]:
# Target reward handed to `update_actions_value` during testing: an all-ones
# array promoted to at least 2-D, converted to a float tensor on `device`.
desired_reward = torch.tensor(
    np.atleast_2d(np.ones(reward_size)),
    dtype=torch.float,
).to(device)

Putting all the previous work into play ... again

But this time the agent does not learn: the loaded models are only used to choose actions and are never updated.

In [None]:
# Evaluation loop: play `episode_for_testing` episodes with the loaded ensemble,
# choosing every action via `update_actions_value`, and print the per-episode
# reward together with the running average. No model update happens here.
summed_reward_sum = 0


if render_for_human == True:
    env = gym.make( game_name, render_mode="human")
else:
    env = gym.make( game_name)
env._max_episode_steps = max_steps_for_each_episode


try:
    for testing_episode in range(episode_for_testing):

        summed_reward = 0

        state                  = env.reset()
        if render_for_human == True:
            env.render()

        # Getting state
        state = vectorizing_state(state)


        # Open-ended step loop; it only ends when the environment reports `done`.
        for _ in tqdm(range(sys.maxsize)):


            # Getting action
            state         = torch.tensor(state, dtype=torch.float).to(device)
            actions_value = initialize_actions_value(init, noise_t, noise_r,(time_size, action_size) )
            actions_value = torch.tensor(actions_value, dtype=torch.float).to(device)
            actions_value = update_actions_value(epoch_for_deducing,
                                               model_loader,
                                               desired_reward,
                                               state,
                                               actions_value,
                                               beta)
            # Only the entry at index [0, 0] is executed
            # (presumably batch 0, first planned time step -- TODO confirm).
            action_arg = int(torch.argmax(actions_value[0, 0]))
            # NOTE(review): `action` is not read again in this cell; kept because
            # `vectorizing_action` is defined elsewhere and later cells may use it.
            action     = vectorizing_action(action_size, action_arg)


            # Getting reward
            state, reward, done,  info = env.step(action_arg)
            if render_for_human == True:
                env.render()
            summed_reward += reward


            # Getting state
            state = vectorizing_state(state)


            if done:
                break


        print("Summed reward:", summed_reward)
        print(f'Episode: {testing_episode + 1}')
        print('Averaged reward:')
        summed_reward_sum += summed_reward
        print(summed_reward_sum/(testing_episode + 1))
finally:
    # Close the environment exactly once, after the last episode (or on
    # interruption). Closing it inside the episode loop left every later
    # episode resetting and rendering an already-closed environment.
    env.close()

