In [1]:
import gym
import numpy as np
import pandas as pd
from gym import Env
from gym.spaces import Discrete, Box, Dict, Tuple, MultiBinary, MultiDiscrete
from IPython.display import display
import random
import os
from stable_baselines3 import PPO, A2C
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

In [2]:
random.seed()
randomN = random.random()
print(randomN)

0.49128473257716665


In [3]:
random.seed(randomN)
randomNumber = random.random()

In [4]:
Box(low=0, high=3, shape=(3, 3, 3), dtype=int).sample()

array([[[3, 3, 0],
        [3, 0, 1],
        [2, 3, 1]],

       [[1, 0, 1],
        [2, 1, 3],
        [3, 3, 0]],

       [[0, 0, 3],
        [0, 0, 2],
        [3, 1, 0]]])

In [5]:
def fillInContainerGridRandom(grid, action_space, upperbound, percentage=0.25):
    filledIn = 0
    amountToFillIn = int((grid.shape[0] * grid.shape[1]) * percentage)
    while (filledIn < amountToFillIn):
        indexToFillIn = action_space.sample()
        rowToFillIn = grid[indexToFillIn[0]][indexToFillIn[1]]
        if (np.all(rowToFillIn)):
            continue
        elif (not np.any(rowToFillIn)):
            checkGridCopy = grid.copy()
            x_values_list = np.array([], dtype=int)
            for x_spot in checkGridCopy[indexToFillIn[0]]:
                if (x_spot[0] == 0):
                    continue
                x_values_list = np.append(x_values_list, x_spot[0])
            uniqueListOfNumbers = list(set(x_values_list))
            num_values = len(uniqueListOfNumbers)
            if (num_values < 2):
                randomShipNumber = random.randint(1, upperbound)
                checkGridCopy[indexToFillIn[0]][indexToFillIn[1]], score = fillContainerInList(rowToFillIn, randomShipNumber)
                #We encountered a boxed in(e.g. [1,0,3])
                if(checkForBoxedInContainer(checkGridCopy[indexToFillIn[0]]) < 0):
                    continue
            else:
                randomShipNumber = uniqueListOfNumbers[-1]
                checkGridCopy[indexToFillIn[0]][indexToFillIn[1]], score = fillContainerInList(rowToFillIn, randomShipNumber)
                #We encountered a boxed in(e.g. [1,3,x]) where x is to be placed and is equal to 1
                if(checkForBoxedInContainer(checkGridCopy[indexToFillIn[0]]) < 0):
                    randomShipNumber = uniqueListOfNumbers[0]
        else:
            #We don't need to check for box in since we add to the current x_row
            randomShipNumber = rowToFillIn[0]
            
        grid[indexToFillIn[0]][indexToFillIn[1]], score = fillContainerInList(rowToFillIn, randomShipNumber)
        filledIn +=1
    return grid

def fillContainerInList(currentList, container):
    newList = currentList.copy()
    indexesOfZeros = np.where(newList == 0)[0]
    # List is full of numbers
    if (len(indexesOfZeros) < 1):
        return newList, -3
    else:
        indexToChange = indexesOfZeros[0]
        newList[indexToChange] = container
        return newList, 0

In [6]:
def fillSingleContainerAtPosition(grid, y, x, containerList):
    bottomValue = grid[y][x][0]
    container = containerList[-1]
    #We get the new row and score of putting a container there based in the list availability
    grid[y][x], score = fillContainerInList(grid[y][x], container)
    if (score == 0):
        #Also give a reward if the same number is used
        if (bottomValue == 0):
            score += 2
            score += checkForBoxedInContainer(grid[y])
        elif (bottomValue == container):
            score += 5
        else:
            score -= 3
        return grid, score, np.delete(containerList, -1)
    else:
        return grid, score, containerList

In [7]:
def checkForBoxedInContainer(listOfContainerRows):
    x_values_list = np.array([], dtype=int)
    for x_spot in listOfContainerRows:
        x_values_list = np.append(x_values_list, x_spot[0])
    numberToCheckFor = np.nan
    numberToVerify = np.nan
    encounteredZero = False
    for item in x_values_list:
        #Skip empty position on grid
        if (item != 0):
            #If we already encountered a number and zero counters is more than 0, we will box in a zero -> penalty
            # e.g. x 0 0 x
            if ((not np.isnan(numberToCheckFor)) & encounteredZero):
                return -4
            if (np.isnan(numberToCheckFor)):
                numberToCheckFor = item
            if (not np.isnan(numberToVerify)):
                if (item != numberToVerify):
                    #The previous item is boxed in since that item is not equal to where we started and the next item is also different
                    return -4
            #Set number to verify if boxed in
            if ((item != numberToCheckFor) & (np.isnan(numberToVerify))):
                numberToVerify = item
        else:
            # Only check once we encountered a number
            # e.g. 0 x ... doesn't count
            if (not np.isnan(numberToCheckFor)):
                encounteredZero = True

    return 0

In [8]:
def getListOfContainers(numberOfShips, containersPerShip, multiplication=1):
    containers = np.array([], dtype=int)
    for i in range(multiplication):
        for i in range(1, numberOfShips + 1):
            containers = np.append(containers, np.full((1, containersPerShip), i, dtype=int))
    random.shuffle(containers)
    return containers

def getContainersCountAsList(containerList, current_count=None):
    numbers, counts = np.unique(containerList, return_counts=True)
    if (type(current_count) == type(None)):
        current_count = np.zeros(len(counts), dtype=int)
    #Iterate through the numbers and assign their count
    if (len(numbers) > 0):
        startIndex = numbers[-1]
        for idx, number in enumerate(numbers):
            current_count[number-1] = counts[idx]
        current_count[numbers[-1]:] = 0
        return current_count
    #Numbers that are not in the list, but still exist in the dict will be set to 0
    current_count[0:] = 0
    return current_count

In [9]:
testContainerList = getListOfContainers(5,10,4)
display(testContainerList)

array([4, 4, 3, 2, 5, 2, 3, 3, 4, 2, 5, 3, 2, 4, 3, 3, 5, 3, 1, 2, 2, 4,
       5, 4, 5, 3, 5, 3, 1, 3, 3, 2, 4, 5, 2, 4, 5, 3, 2, 2, 2, 3, 5, 4,
       2, 5, 5, 4, 5, 5, 5, 4, 2, 1, 3, 4, 1, 3, 4, 2, 3, 3, 1, 2, 1, 4,
       4, 1, 3, 2, 5, 5, 5, 5, 2, 2, 4, 4, 1, 3, 2, 4, 1, 2, 2, 2, 4, 3,
       4, 3, 1, 5, 3, 3, 1, 3, 5, 4, 1, 5, 5, 5, 4, 4, 4, 5, 3, 1, 2, 3,
       1, 3, 2, 5, 3, 1, 4, 1, 4, 4, 1, 5, 1, 5, 4, 2, 4, 3, 4, 1, 5, 2,
       4, 1, 3, 3, 5, 4, 3, 1, 3, 2, 1, 5, 2, 3, 2, 2, 1, 2, 3, 1, 2, 2,
       5, 1, 1, 2, 2, 1, 1, 4, 3, 2, 5, 4, 5, 2, 1, 5, 2, 5, 4, 1, 4, 1,
       3, 5, 1, 4, 5, 4, 3, 1, 1, 2, 1, 1, 5, 1, 1, 2, 1, 4, 1, 5, 5, 3,
       4, 3])

In [10]:
container_count = getContainersCountAsList(testContainerList)
display(container_count)

array([40, 40, 40, 40, 40])

In [11]:
test_dict = getContainersCountAsList(np.array([1, 2, 2, 3]), container_count)
display(test_dict)

array([1, 2, 1, 0, 0])

In [12]:
x = 3
y = 3
z = 3
upperbound = 3
empty3dArray = np.zeros(shape=(y, x, z), dtype=int)
action_space = MultiDiscrete([y, x], dtype=int)
empty3dArray = fillInContainerGridRandom(empty3dArray, action_space, upperbound, 0.9)
containers = getListOfContainers(upperbound, 4, z-1)
display(empty3dArray)
df, score, lists = fillSingleContainerAtPosition(empty3dArray, 0, 0, containers)
display(lists)
display(df)
display(score)

array([[[0, 0, 0],
        [2, 2, 0],
        [2, 0, 0]],

       [[0, 0, 0],
        [3, 0, 0],
        [1, 1, 1]],

       [[0, 0, 0],
        [0, 0, 0],
        [1, 0, 0]]])

array([2, 2, 3, 1, 3, 3, 3, 2, 2, 1, 1, 2, 1, 3, 1, 3, 1, 1, 2, 2, 1, 3,
       2])

array([[[3, 0, 0],
        [2, 2, 0],
        [2, 0, 0]],

       [[0, 0, 0],
        [3, 0, 0],
        [1, 1, 1]],

       [[0, 0, 0],
        [0, 0, 0],
        [1, 0, 0]]])

2

In [13]:
containers = getListOfContainers(3, 3)
display(df)
df, score, lists = fillSingleContainerAtPosition(df, 0, 2, containers)
display(lists)
display(df)
display(score)

array([[[3, 0, 0],
        [2, 2, 0],
        [2, 0, 0]],

       [[0, 0, 0],
        [3, 0, 0],
        [1, 1, 1]],

       [[0, 0, 0],
        [0, 0, 0],
        [1, 0, 0]]])

array([2, 1, 3, 3, 3, 1, 1, 2])

array([[[3, 0, 0],
        [2, 2, 0],
        [2, 2, 0]],

       [[0, 0, 0],
        [3, 0, 0],
        [1, 1, 1]],

       [[0, 0, 0],
        [0, 0, 0],
        [1, 0, 0]]])

5

In [14]:
containers = getListOfContainers(3, 3)
df, score, lists = fillSingleContainerAtPosition(df, 0, 1, containers)
display(lists)
display(df)
display(np.all(df.flatten() > 0))
display(score)

array([3, 1, 1, 1, 3, 2, 3, 2])

array([[[3, 0, 0],
        [2, 2, 2],
        [2, 2, 0]],

       [[0, 0, 0],
        [3, 0, 0],
        [1, 1, 1]],

       [[0, 0, 0],
        [0, 0, 0],
        [1, 0, 0]]])

False

5

In [15]:
action = MultiDiscrete([3, 3], dtype=int).sample()

In [16]:
display(action)

array([1, 2])

In [17]:
display(action[0])

1

In [18]:
Discrete(5).n

5

In [19]:
Discrete(5).sample()

2

In [20]:
Dict(spaces={
    '1': Discrete(5),
    '2': Discrete(5),
    '3': Discrete(5),
    '4': Discrete(5),
}).sample()

OrderedDict([('1', 1), ('2', 4), ('3', 1), ('4', 2)])

In [21]:
# observation space
observation_space_test = Dict({
    'grid':
        Box(low=0,
                   high=5,
                   shape=(3, 3, 3), dtype=int),
    'container':
        Discrete(5),
    'containerCount':
        MultiDiscrete(np.full(shape=(5,), fill_value=5,dtype=int))})

In [22]:
print(observation_space_test.sample())

OrderedDict([('container', 1), ('containerCount', array([3, 1, 1, 3, 4])), ('grid', array([[[0, 1, 5],
        [0, 0, 1],
        [3, 2, 1]],

       [[4, 5, 5],
        [1, 4, 0],
        [5, 2, 2]],

       [[1, 3, 5],
        [0, 1, 4],
        [3, 4, 1]]]))])


# Create environment

In [23]:
class ContainerEnv(Env):
    "Custom Environment that follows gym interface"
    metadata = {'render.modes': ['human']}

    def __init__(self, numberOfShips, containersPerShip, boxWidth, boxLength, boxHeight, percentage=0.0):
        super(ContainerEnv, self).__init__()
        self.numberOfShips = numberOfShips
        self.containersPerShip = containersPerShip
        self.percentage = percentage
        self.maxRows = boxLength
        self.maxColumns = boxWidth
        self.maxHeight = boxHeight
        self.maxStepsCount = ((self.maxRows * self.maxColumns * self.maxHeight) * self.maxHeight) * 10
        self.stepCounter = 0
        self.action_space = MultiDiscrete([self.maxRows, self.maxColumns], dtype=int)
        self.observation_space = Dict(
            spaces={
                "grid": Box(low=0, high=numberOfShips, shape=(self.maxRows, self.maxColumns, self.maxHeight), dtype=int),
                "container": Discrete(numberOfShips + 1),
                "containerCount": MultiDiscrete(np.full(shape=(numberOfShips,), fill_value=(self.maxRows * self.maxColumns * self.maxHeight),dtype=int)),
                "maxCounter": Discrete(self.maxStepsCount + 1),
                "stepCounter": Discrete(self.maxStepsCount + 1)
            }
        )
        self.listOfContainers = getListOfContainers(self.numberOfShips, self.containersPerShip, self.maxHeight-1)
        self.containerCount = getContainersCountAsList(self.listOfContainers)
        self.containerGrid = fillInContainerGridRandom(np.zeros(shape=(self.maxRows, self.maxColumns, self.maxHeight), dtype=int), 
                                                       self.action_space, 
                                                       self.numberOfShips,
                                                       self.percentage)
    def step(self, action):
        self.stepCounter += 1
        grid, reward, containerList = fillSingleContainerAtPosition(self.containerGrid, action[0], action[1], self.listOfContainers)
        self.containerGrid = grid
        self.listOfContainers = containerList
        self.containerCount = getContainersCountAsList(self.listOfContainers, self.containerCount)
        info = {"TimeLimit.truncated": False}
        # Check if there are no more containers to place
        if ((len(self.listOfContainers) == 0) | np.all(self.containerGrid.flatten() > 0)):
            obs = {'stepCounter': self.stepCounter, "maxCounter": self.maxStepsCount, 'containerCount': self.containerCount, 'container': 0, 'grid': self.containerGrid}
            return obs, reward, True, info
        else:
            obs = {'stepCounter': self.stepCounter, "maxCounter": self.maxStepsCount, 'containerCount': self.containerCount, 'container': int(self.listOfContainers[-1]),
                   'grid': self.containerGrid}
            if (self.stepCounter >= self.maxStepsCount):
                info["TimeLimit.truncated"] = True
                return obs, reward, True, info
            else:
                return obs, reward, False, info
    
    def reset(self):
        self.stepCounter = 0
        self.containerGrid = fillInContainerGridRandom(np.zeros(shape=(self.maxRows, self.maxColumns, self.maxHeight), dtype=int), 
                                                       self.action_space, 
                                                       self.numberOfShips,
                                                       self.percentage)
        self.listOfContainers = getListOfContainers(self.numberOfShips, 
                                                    random.randint(max(1, self.containersPerShip - 3), self.containersPerShip + 3),
                                                    self.maxHeight-1)
        self.containerCount = getContainersCountAsList(self.listOfContainers)
        obs = {'stepCounter': self.stepCounter, "maxCounter": self.maxStepsCount, 'containerCount': self.containerCount, 'container': int(self.listOfContainers[-1]), 'grid': self.containerGrid}
        return obs
    
    def render(self, mode='human', close=False):
        display(self.containerGrid)
    def close (self):
        display(self.containerGrid)

In [24]:
from stable_baselines3.common.env_checker import check_env

env = ContainerEnv(3, 3, 1, 3, 3)
# It will check your custom environment and output additional warnings if needed
#check_env(env)

In [25]:
numberOfShips = 3
containersPerShip = 4
boxHeight = 3
boxWidth = 3
boxLength = 3
env = ContainerEnv(numberOfShips, containersPerShip, boxWidth, boxLength, boxHeight)
display(env.action_space)
display(env.observation_space)
display(env.reset())

MultiDiscrete([3 3])

Dict(container:Discrete(4), containerCount:MultiDiscrete([27 27 27]), grid:Box([[[0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]]], [[[3 3 3]
  [3 3 3]
  [3 3 3]]

 [[3 3 3]
  [3 3 3]
  [3 3 3]]

 [[3 3 3]
  [3 3 3]
  [3 3 3]]], (3, 3, 3), int64), maxCounter:Discrete(811), stepCounter:Discrete(811))

{'stepCounter': 0,
 'maxCounter': 810,
 'containerCount': array([8, 8, 8]),
 'container': 1,
 'grid': array([[[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]]])}

# Test environment

In [26]:
episodes = 10
for episode in range(1, episodes+1):
    state = env.reset()
    display(state)
    done = False
    score = 0
    
    while not done:
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        state = n_state
        score+=reward
    print('Episode:{} Score:{}'.format(episode, score))
    print('final env state')
    display(state)
    display(env.listOfContainers)
env.close()

{'stepCounter': 0,
 'maxCounter': 810,
 'containerCount': array([14, 14, 14]),
 'container': 3,
 'grid': array([[[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]]])}

Episode:1 Score:-48
final env state


{'stepCounter': 43,
 'maxCounter': 810,
 'containerCount': array([5, 7, 3]),
 'container': 0,
 'grid': array([[[2, 2, 3],
         [3, 1, 3],
         [3, 1, 1]],
 
        [[3, 1, 3],
         [2, 1, 2],
         [1, 2, 3]],
 
        [[2, 3, 1],
         [3, 3, 3],
         [1, 2, 1]]])}

array([2, 2, 1, 3, 3, 2, 2, 1, 2, 1, 2, 1, 3, 2, 1])

{'stepCounter': 0,
 'maxCounter': 810,
 'containerCount': array([4, 4, 4]),
 'container': 2,
 'grid': array([[[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]]])}

Episode:2 Score:0
final env state


{'stepCounter': 12,
 'maxCounter': 810,
 'containerCount': array([0, 0, 0]),
 'container': 0,
 'grid': array([[[3, 1, 0],
         [0, 0, 0],
         [1, 3, 0]],
 
        [[3, 0, 0],
         [2, 0, 0],
         [1, 0, 0]],
 
        [[1, 0, 0],
         [2, 3, 2],
         [2, 0, 0]]])}

array([], dtype=int64)

{'stepCounter': 0,
 'maxCounter': 810,
 'containerCount': array([12, 12, 12]),
 'container': 2,
 'grid': array([[[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]]])}

Episode:3 Score:-112
final env state


{'stepCounter': 67,
 'maxCounter': 810,
 'containerCount': array([7, 1, 2]),
 'container': 0,
 'grid': array([[[3, 2, 3],
         [2, 3, 2],
         [2, 1, 1]],
 
        [[2, 3, 2],
         [3, 2, 1],
         [3, 3, 2]],
 
        [[2, 2, 3],
         [3, 2, 3],
         [1, 2, 1]]])}

array([1, 3, 1, 1, 3, 1, 1, 1, 1])

{'stepCounter': 0,
 'maxCounter': 810,
 'containerCount': array([2, 2, 2]),
 'container': 1,
 'grid': array([[[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]]])}

Episode:4 Score:-2
final env state


{'stepCounter': 6,
 'maxCounter': 810,
 'containerCount': array([0, 0, 0]),
 'container': 0,
 'grid': array([[[3, 2, 0],
         [0, 0, 0],
         [3, 1, 0]],
 
        [[0, 0, 0],
         [2, 0, 0],
         [1, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]]])}

array([], dtype=int64)

{'stepCounter': 0,
 'maxCounter': 810,
 'containerCount': array([4, 4, 4]),
 'container': 3,
 'grid': array([[[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]]])}

Episode:5 Score:16
final env state


{'stepCounter': 12,
 'maxCounter': 810,
 'containerCount': array([0, 0, 0]),
 'container': 0,
 'grid': array([[[1, 3, 0],
         [2, 2, 0],
         [0, 0, 0]],
 
        [[3, 0, 0],
         [3, 0, 0],
         [2, 0, 0]],
 
        [[1, 1, 0],
         [2, 0, 0],
         [1, 3, 0]]])}

array([], dtype=int64)

{'stepCounter': 0,
 'maxCounter': 810,
 'containerCount': array([6, 6, 6]),
 'container': 2,
 'grid': array([[[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]]])}

Episode:6 Score:13
final env state


{'stepCounter': 21,
 'maxCounter': 810,
 'containerCount': array([0, 0, 0]),
 'container': 0,
 'grid': array([[[2, 0, 0],
         [3, 0, 0],
         [3, 3, 0]],
 
        [[0, 0, 0],
         [1, 1, 0],
         [2, 1, 3]],
 
        [[2, 1, 3],
         [2, 2, 2],
         [1, 3, 1]]])}

array([], dtype=int64)

{'stepCounter': 0,
 'maxCounter': 810,
 'containerCount': array([6, 6, 6]),
 'container': 1,
 'grid': array([[[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]]])}

Episode:7 Score:-1
final env state


{'stepCounter': 19,
 'maxCounter': 810,
 'containerCount': array([0, 0, 0]),
 'container': 0,
 'grid': array([[[1, 1, 2],
         [3, 2, 0],
         [0, 0, 0]],
 
        [[1, 2, 0],
         [3, 1, 3],
         [3, 2, 2]],
 
        [[3, 0, 0],
         [3, 0, 0],
         [1, 1, 2]]])}

array([], dtype=int64)

{'stepCounter': 0,
 'maxCounter': 810,
 'containerCount': array([6, 6, 6]),
 'container': 2,
 'grid': array([[[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]]])}

Episode:8 Score:-7
final env state


{'stepCounter': 20,
 'maxCounter': 810,
 'containerCount': array([0, 0, 0]),
 'container': 0,
 'grid': array([[[1, 0, 0],
         [2, 1, 3],
         [3, 3, 1]],
 
        [[2, 2, 2],
         [3, 0, 0],
         [2, 1, 0]],
 
        [[2, 1, 0],
         [3, 0, 0],
         [1, 3, 0]]])}

array([], dtype=int64)

{'stepCounter': 0,
 'maxCounter': 810,
 'containerCount': array([4, 4, 4]),
 'container': 2,
 'grid': array([[[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]]])}

Episode:9 Score:-9
final env state


{'stepCounter': 12,
 'maxCounter': 810,
 'containerCount': array([0, 0, 0]),
 'container': 0,
 'grid': array([[[3, 0, 0],
         [3, 2, 0],
         [2, 1, 0]],
 
        [[1, 0, 0],
         [2, 1, 0],
         [3, 1, 0]],
 
        [[0, 0, 0],
         [2, 3, 0],
         [0, 0, 0]]])}

array([], dtype=int64)

{'stepCounter': 0,
 'maxCounter': 810,
 'containerCount': array([4, 4, 4]),
 'container': 1,
 'grid': array([[[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]]])}

Episode:10 Score:-1
final env state


{'stepCounter': 12,
 'maxCounter': 810,
 'containerCount': array([0, 0, 0]),
 'container': 0,
 'grid': array([[[3, 2, 0],
         [0, 0, 0],
         [1, 0, 0]],
 
        [[2, 0, 0],
         [1, 1, 3],
         [3, 2, 2]],
 
        [[1, 0, 0],
         [3, 0, 0],
         [0, 0, 0]]])}

array([], dtype=int64)

array([[[3, 2, 0],
        [0, 0, 0],
        [1, 0, 0]],

       [[2, 0, 0],
        [1, 1, 3],
        [3, 2, 2]],

       [[1, 0, 0],
        [3, 0, 0],
        [0, 0, 0]]])

# Stable baselines

In [27]:
for percentage in np.arange(0.0, 1.0, 0.1):
    print(percentage)

0.0
0.1
0.2
0.30000000000000004
0.4
0.5
0.6000000000000001
0.7000000000000001
0.8
0.9


In [28]:
from typing import Callable
def make_env(numberOfShips, containersPerShip, boxHeight, boxWidth, boxLength, percentage) -> Callable:
    def _init() -> Env:
        env = ContainerEnv(numberOfShips, containersPerShip, boxHeight, boxWidth, boxLength, percentage)
        return env
    return _init

In [29]:
def getTotalTimeSteps(timesteps, percentage):
    return timesteps
    #return timesteps * (1 - percentage)

In [30]:
num_cpu = 4
startPercentage = 0
timesteps = 20_000

modelEnvironment = DummyVecEnv([make_env(numberOfShips, containersPerShip, boxWidth, boxLength, boxHeight, startPercentage) for i in range(num_cpu)])
model = PPO("MultiInputPolicy", modelEnvironment, verbose=1, learning_rate=0.0003, device='cpu')
model.learn(total_timesteps=getTotalTimeSteps(timesteps, startPercentage))
model.train()

for modelIterations in range(0, 50):
    print("Model iteratie: " + str(modelIterations))
#     for percentage in reversed(np.arange(0.0, startPercentage, 0.1)):
#         print("Using percentage: " + str(percentage))
    newEnv = DummyVecEnv([make_env(numberOfShips, containersPerShip, boxWidth, boxLength, boxHeight, 0) for i in range(num_cpu)])
    model.set_env(newEnv)
    model.learn(total_timesteps=getTotalTimeSteps(timesteps, 0), reset_num_timesteps=False)
    model.train()

Using cpu device
-----------------------------
| time/              |      |
|    fps             | 2829 |
|    iterations      | 1    |
|    time_elapsed    | 2    |
|    total_timesteps | 8192 |
-----------------------------


  Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


-----------------------------------------
| time/                   |             |
|    fps                  | 1156        |
|    iterations           | 2           |
|    time_elapsed         | 14          |
|    total_timesteps      | 16384       |
| train/                  |             |
|    approx_kl            | 0.009512847 |
|    clip_fraction        | 0.0725      |
|    clip_range           | 0.2         |
|    entropy_loss         | -2.19       |
|    explained_variance   | 0.0167      |
|    learning_rate        | 0.0003      |
|    loss                 | 60.2        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.00915    |
|    value_loss           | 182         |
-----------------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1047        |
|    iterations           | 3           |
|    time_elapsed         | 23          |
|    total_timesteps      | 24576 

Model iteratie: 4
-------------------------------
| time/              |        |
|    fps             | 2849   |
|    iterations      | 1      |
|    time_elapsed    | 2      |
|    total_timesteps | 131072 |
-------------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 1349         |
|    iterations           | 2            |
|    time_elapsed         | 12           |
|    total_timesteps      | 139264       |
| train/                  |              |
|    approx_kl            | 0.0102970265 |
|    clip_fraction        | 0.101        |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.94        |
|    explained_variance   | 0.528        |
|    learning_rate        | 0.0003       |
|    loss                 | 32.9         |
|    n_updates            | 210          |
|    policy_gradient_loss | -0.0113      |
|    value_loss           | 89.3         |
---------------------------

Model iteratie: 9
-------------------------------
| time/              |        |
|    fps             | 2795   |
|    iterations      | 1      |
|    time_elapsed    | 2      |
|    total_timesteps | 253952 |
-------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1335        |
|    iterations           | 2           |
|    time_elapsed         | 12          |
|    total_timesteps      | 262144      |
| train/                  |             |
|    approx_kl            | 0.016872678 |
|    clip_fraction        | 0.169       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.2        |
|    explained_variance   | 0.599       |
|    learning_rate        | 0.0003      |
|    loss                 | 42.4        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0196     |
|    value_loss           | 97          |
-----------------------------------------
--

Model iteratie: 14
-------------------------------
| time/              |        |
|    fps             | 2797   |
|    iterations      | 1      |
|    time_elapsed    | 2      |
|    total_timesteps | 376832 |
-------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1368        |
|    iterations           | 2           |
|    time_elapsed         | 11          |
|    total_timesteps      | 385024      |
| train/                  |             |
|    approx_kl            | 0.015789898 |
|    clip_fraction        | 0.123       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.573      |
|    explained_variance   | 0.798       |
|    learning_rate        | 0.0003      |
|    loss                 | 16.6        |
|    n_updates            | 610         |
|    policy_gradient_loss | -0.0139     |
|    value_loss           | 59.9        |
-----------------------------------------
-

Model iteratie: 19
-------------------------------
| time/              |        |
|    fps             | 2786   |
|    iterations      | 1      |
|    time_elapsed    | 2      |
|    total_timesteps | 499712 |
-------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1308        |
|    iterations           | 2           |
|    time_elapsed         | 12          |
|    total_timesteps      | 507904      |
| train/                  |             |
|    approx_kl            | 0.017268904 |
|    clip_fraction        | 0.112       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.359      |
|    explained_variance   | 0.926       |
|    learning_rate        | 0.0003      |
|    loss                 | 8.07        |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0143     |
|    value_loss           | 23.9        |
-----------------------------------------
-

Model iteratie: 24
-------------------------------
| time/              |        |
|    fps             | 2809   |
|    iterations      | 1      |
|    time_elapsed    | 2      |
|    total_timesteps | 622592 |
-------------------------------
----------------------------------------
| time/                   |            |
|    fps                  | 1349       |
|    iterations           | 2          |
|    time_elapsed         | 12         |
|    total_timesteps      | 630784     |
| train/                  |            |
|    approx_kl            | 0.27159998 |
|    clip_fraction        | 0.142      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.247     |
|    explained_variance   | 0.906      |
|    learning_rate        | 0.0003     |
|    loss                 | 7.53       |
|    n_updates            | 1010       |
|    policy_gradient_loss | -0.022     |
|    value_loss           | 23.2       |
----------------------------------------
-------------------

Model iteratie: 29
-------------------------------
| time/              |        |
|    fps             | 2813   |
|    iterations      | 1      |
|    time_elapsed    | 2      |
|    total_timesteps | 745472 |
-------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1236        |
|    iterations           | 2           |
|    time_elapsed         | 13          |
|    total_timesteps      | 753664      |
| train/                  |             |
|    approx_kl            | 0.019201618 |
|    clip_fraction        | 0.13        |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.295      |
|    explained_variance   | 0.82        |
|    learning_rate        | 0.0003      |
|    loss                 | 6           |
|    n_updates            | 1210        |
|    policy_gradient_loss | -0.0402     |
|    value_loss           | 17.3        |
-----------------------------------------
-

Model iteratie: 34
-------------------------------
| time/              |        |
|    fps             | 2784   |
|    iterations      | 1      |
|    time_elapsed    | 2      |
|    total_timesteps | 868352 |
-------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1336        |
|    iterations           | 2           |
|    time_elapsed         | 12          |
|    total_timesteps      | 876544      |
| train/                  |             |
|    approx_kl            | 0.015149672 |
|    clip_fraction        | 0.0847      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.175      |
|    explained_variance   | 0.976       |
|    learning_rate        | 0.0003      |
|    loss                 | 5.3         |
|    n_updates            | 1410        |
|    policy_gradient_loss | -0.00811    |
|    value_loss           | 10.4        |
-----------------------------------------
-

Model iteratie: 39
-------------------------------
| time/              |        |
|    fps             | 2813   |
|    iterations      | 1      |
|    time_elapsed    | 2      |
|    total_timesteps | 991232 |
-------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1337        |
|    iterations           | 2           |
|    time_elapsed         | 12          |
|    total_timesteps      | 999424      |
| train/                  |             |
|    approx_kl            | 0.014873069 |
|    clip_fraction        | 0.0746      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.164      |
|    explained_variance   | 0.966       |
|    learning_rate        | 0.0003      |
|    loss                 | 7.07        |
|    n_updates            | 1610        |
|    policy_gradient_loss | -0.0113     |
|    value_loss           | 16          |
-----------------------------------------
-

Model iteratie: 44
--------------------------------
| time/              |         |
|    fps             | 2813    |
|    iterations      | 1       |
|    time_elapsed    | 2       |
|    total_timesteps | 1114112 |
--------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1324        |
|    iterations           | 2           |
|    time_elapsed         | 12          |
|    total_timesteps      | 1122304     |
| train/                  |             |
|    approx_kl            | 0.018657923 |
|    clip_fraction        | 0.0799      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.167      |
|    explained_variance   | 0.81        |
|    learning_rate        | 0.0003      |
|    loss                 | 4.54        |
|    n_updates            | 1810        |
|    policy_gradient_loss | 0.00886     |
|    value_loss           | 21.2        |
------------------------------------

Model iteratie: 49
--------------------------------
| time/              |         |
|    fps             | 2750    |
|    iterations      | 1       |
|    time_elapsed    | 2       |
|    total_timesteps | 1236992 |
--------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1364        |
|    iterations           | 2           |
|    time_elapsed         | 12          |
|    total_timesteps      | 1245184     |
| train/                  |             |
|    approx_kl            | 0.026465185 |
|    clip_fraction        | 0.0946      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.174      |
|    explained_variance   | 0.849       |
|    learning_rate        | 0.0003      |
|    loss                 | 2.87        |
|    n_updates            | 2010        |
|    policy_gradient_loss | -0.027      |
|    value_loss           | 16.6        |
------------------------------------

In [31]:
eval_env = ContainerEnv(numberOfShips, containersPerShip, boxWidth, boxLength, boxHeight)
eval_env.reset()
display(eval_env.containerGrid)
display(eval_env.listOfContainers)

array([[[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       [[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       [[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]]])

array([2, 3, 3, 3, 2, 2, 1, 3, 2, 1, 1, 1, 3, 3, 1, 1, 2, 2, 1, 2, 2, 3,
       1, 3])

In [32]:
from stable_baselines3.common.monitor import Monitor
monitor = Monitor(eval_env)
evaluate_policy(model, monitor, n_eval_episodes=5, return_episode_rewards= True, render=False)

([93, 72, 93, 100, 100], [24, 18, 24, 27, 27])

# Using trained agent

In [33]:
display(eval_env.reset())
display(eval_env.listOfContainers)

{'stepCounter': 0,
 'maxCounter': 810,
 'containerCount': array([2, 2, 2]),
 'container': 1,
 'grid': array([[[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]]])}

array([3, 3, 1, 2, 2, 1])

In [34]:
obs = eval_env.reset()
print("---------------------\n")
for i in range(10):
    display(eval_env.listOfContainers)
    print(len(eval_env.listOfContainers))
    reward = 0
    dones = False
    display(obs)
    while not dones:
        action, _states = model.predict(obs, deterministic=True)
        obs, rewards, dones, info = eval_env.step(action)
        reward += rewards
    eval_env.render()
    print((obs['grid'].flatten() > 0).sum())
    print("Reward: " + str(reward))
    display(eval_env.listOfContainers)
    obs = eval_env.reset()
    print("---------------------\n")

---------------------



array([2, 1, 3, 2, 1, 1, 3, 2, 2, 3, 2, 1, 1, 1, 1, 1, 3, 3, 2, 3, 2, 2,
       2, 3, 3, 2, 2, 2, 3, 1, 3, 3, 2, 3, 1, 1, 3, 1, 1, 1, 2, 3])

42


{'stepCounter': 0,
 'maxCounter': 810,
 'containerCount': array([14, 14, 14]),
 'container': 3,
 'grid': array([[[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]]])}

array([[[2, 2, 2],
        [2, 2, 2],
        [2, 2, 2]],

       [[1, 1, 1],
        [1, 1, 1],
        [3, 3, 1]],

       [[3, 3, 3],
        [3, 3, 3],
        [3, 3, 3]]])

27
Reward: 100


array([2, 1, 3, 2, 1, 1, 3, 2, 2, 3, 2, 1, 1, 1, 1])

---------------------



array([1, 3, 1, 2, 2, 2, 3, 2, 2, 3, 3, 1, 1, 1, 3, 2, 1, 3, 3, 1, 2, 3,
       1, 2])

24


{'stepCounter': 0,
 'maxCounter': 810,
 'containerCount': array([8, 8, 8]),
 'container': 2,
 'grid': array([[[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]]])}

array([[[2, 2, 0],
        [2, 2, 2],
        [2, 2, 2]],

       [[1, 1, 1],
        [1, 1, 1],
        [1, 1, 0]],

       [[3, 3, 0],
        [3, 3, 3],
        [3, 3, 3]]])

24
Reward: 93


array([], dtype=int64)

---------------------



array([2, 1, 2, 2, 1, 1, 3, 3, 3, 2, 2, 1, 2, 3, 1, 1, 3, 2, 3, 3, 3, 1,
       1, 2])

24


{'stepCounter': 0,
 'maxCounter': 810,
 'containerCount': array([8, 8, 8]),
 'container': 2,
 'grid': array([[[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]]])}

array([[[2, 2, 0],
        [2, 2, 2],
        [2, 2, 2]],

       [[1, 1, 1],
        [1, 1, 1],
        [1, 1, 0]],

       [[3, 3, 0],
        [3, 3, 3],
        [3, 3, 3]]])

24
Reward: 93


array([], dtype=int64)

---------------------



array([3, 2, 2, 1, 3, 1])

6


{'stepCounter': 0,
 'maxCounter': 810,
 'containerCount': array([2, 2, 2]),
 'container': 1,
 'grid': array([[[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]]])}

array([[[0, 0, 0],
        [2, 2, 0],
        [0, 0, 0]],

       [[0, 0, 0],
        [1, 1, 0],
        [0, 0, 0]],

       [[0, 0, 0],
        [3, 3, 0],
        [0, 0, 0]]])

6
Reward: 21


array([], dtype=int64)

---------------------



array([3, 1, 3, 3, 2, 2, 2, 1, 2, 1, 3, 3, 1, 1, 2, 2, 1, 3])

18


{'stepCounter': 0,
 'maxCounter': 810,
 'containerCount': array([6, 6, 6]),
 'container': 3,
 'grid': array([[[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]]])}

array([[[0, 0, 0],
        [2, 2, 2],
        [2, 2, 2]],

       [[1, 1, 1],
        [1, 1, 1],
        [0, 0, 0]],

       [[0, 0, 0],
        [3, 3, 3],
        [3, 3, 3]]])

18
Reward: 72


array([], dtype=int64)

---------------------



array([3, 1, 2, 2, 1, 3])

6


{'stepCounter': 0,
 'maxCounter': 810,
 'containerCount': array([2, 2, 2]),
 'container': 3,
 'grid': array([[[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]]])}

array([[[0, 0, 0],
        [2, 2, 0],
        [0, 0, 0]],

       [[0, 0, 0],
        [1, 1, 0],
        [0, 0, 0]],

       [[0, 0, 0],
        [3, 3, 0],
        [0, 0, 0]]])

6
Reward: 21


array([], dtype=int64)

---------------------



array([1, 1, 2, 2, 3, 3, 1, 1, 1, 1, 3, 3, 3, 3, 2, 1, 3, 3, 2, 3, 2, 2,
       2, 2, 3, 1, 3, 1, 2, 3, 3, 2, 1, 2, 2, 3, 1, 1, 1, 2, 1, 2])

42


{'stepCounter': 0,
 'maxCounter': 810,
 'containerCount': array([14, 14, 14]),
 'container': 2,
 'grid': array([[[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]]])}

array([[[2, 2, 2],
        [2, 2, 2],
        [2, 2, 2]],

       [[1, 1, 1],
        [1, 1, 1],
        [1, 3, 1]],

       [[3, 3, 3],
        [3, 3, 3],
        [3, 2, 2]]])

27
Reward: 84


array([1, 1, 2, 2, 3, 3, 1, 1, 1, 1, 3, 3, 3, 3, 2])

---------------------



array([1, 3, 2, 3, 3, 2, 3, 2, 2, 3, 3, 2, 1, 1, 1, 2, 2, 3, 1, 3, 1, 1,
       1, 2])

24


{'stepCounter': 0,
 'maxCounter': 810,
 'containerCount': array([8, 8, 8]),
 'container': 2,
 'grid': array([[[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]]])}

array([[[2, 2, 0],
        [2, 2, 2],
        [2, 2, 2]],

       [[1, 1, 1],
        [1, 1, 1],
        [1, 1, 0]],

       [[3, 3, 0],
        [3, 3, 3],
        [3, 3, 3]]])

24
Reward: 93


array([], dtype=int64)

---------------------



array([3, 1, 3, 1, 3, 2, 3, 2, 2, 2, 3, 3, 1, 2, 1, 1, 2, 1])

18


{'stepCounter': 0,
 'maxCounter': 810,
 'containerCount': array([6, 6, 6]),
 'container': 1,
 'grid': array([[[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]]])}

array([[[0, 0, 0],
        [2, 2, 2],
        [2, 2, 2]],

       [[1, 1, 1],
        [1, 1, 1],
        [0, 0, 0]],

       [[0, 0, 0],
        [3, 3, 3],
        [3, 3, 3]]])

18
Reward: 72


array([], dtype=int64)

---------------------



array([3, 1, 3, 3, 2, 3, 2, 1, 2, 3, 3, 3, 2, 2, 1, 2, 1, 1, 2, 1, 3, 1,
       1, 2])

24


{'stepCounter': 0,
 'maxCounter': 810,
 'containerCount': array([8, 8, 8]),
 'container': 2,
 'grid': array([[[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]]])}

array([[[2, 2, 0],
        [2, 2, 2],
        [2, 2, 2]],

       [[1, 1, 1],
        [1, 1, 1],
        [1, 1, 0]],

       [[3, 3, 0],
        [3, 3, 3],
        [3, 3, 3]]])

24
Reward: 93


array([], dtype=int64)

---------------------



In [35]:
amountOfShips = 3
containersOnShip = 4
height = 3
width = 3
length = 3

In [36]:
for percentage in np.arange(0.0, 0.9, 0.1):
    validateEnv = ContainerEnv(amountOfShips, containersOnShip, width, length, height, percentage)
    state = validateEnv.reset()
    print('Start list of containers ------------------------')
    display(validateEnv.listOfContainers)
    print(len(validateEnv.listOfContainers))
    print('Start render ------------------------------------')
    validateEnv.render()
    reward = 0
    dones = False
    while not dones:
        action, _states = model.predict(state, deterministic=True)
        state, rewards, dones, info = validateEnv.step(action)
        reward += rewards
    print('End render --------------------------------------')
    print((state['grid'].flatten() > 0).sum())
    print("Reward: " + str(reward))
    validateEnv.render()
    print('End list of containers --------------------------')
    display(validateEnv.listOfContainers)

Start list of containers ------------------------


array([1, 1, 1, 2, 2, 2, 3, 2, 3, 3, 3, 1, 1, 1, 3, 1, 3, 2, 2, 2, 1, 1,
       2, 3, 2, 2, 1, 3, 3, 3])

30
Start render ------------------------------------


array([[[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       [[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       [[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]]])

End render --------------------------------------
27
Reward: 84


array([[[2, 3, 2],
        [2, 2, 2],
        [2, 2, 2]],

       [[1, 1, 1],
        [1, 1, 1],
        [1, 2, 2]],

       [[3, 3, 3],
        [3, 3, 3],
        [3, 3, 3]]])

End list of containers --------------------------


array([1, 1, 1])

Start list of containers ------------------------


array([2, 2, 1, 3, 3, 1, 3, 2, 3, 3, 2, 3, 1, 1, 1, 1, 1, 3, 1, 2, 3, 3,
       2, 1, 2, 2, 3, 2, 2, 2, 1, 1, 3, 2, 3, 1, 1, 3, 3, 1, 2, 2])

42
Start render ------------------------------------


array([[[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       [[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       [[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]]])

End render --------------------------------------
27
Reward: 100


array([[[2, 2, 2],
        [2, 2, 2],
        [2, 2, 2]],

       [[1, 1, 1],
        [1, 1, 1],
        [1, 1, 1]],

       [[3, 3, 3],
        [3, 3, 3],
        [3, 2, 3]]])

End list of containers --------------------------


array([2, 2, 1, 3, 3, 1, 3, 2, 3, 3, 2, 3, 1, 1, 1])

Start list of containers ------------------------


array([1, 2, 3, 2, 1, 3, 2, 3, 1, 1, 3, 1, 2, 1, 2, 2, 3, 2, 2, 3, 1, 2,
       2, 1, 3, 3, 3, 1, 3, 1])

30
Start render ------------------------------------


array([[[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       [[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       [[0, 0, 0],
        [0, 0, 0],
        [1, 0, 0]]])

End render --------------------------------------
27
Reward: 82


array([[[2, 3, 2],
        [2, 2, 2],
        [2, 2, 2]],

       [[1, 1, 1],
        [1, 1, 1],
        [1, 1, 1]],

       [[3, 3, 3],
        [3, 3, 3],
        [1, 3, 3]]])

End list of containers --------------------------


array([1, 2, 3, 2])

Start list of containers ------------------------


array([3, 1, 2, 1, 1, 3, 2, 1, 3, 3, 2, 2, 2, 1, 2, 1, 3, 3])

18
Start render ------------------------------------


array([[[2, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       [[0, 0, 0],
        [3, 0, 0],
        [0, 0, 0]],

       [[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]]])

End render --------------------------------------
20
Reward: 52


array([[[2, 0, 0],
        [2, 2, 2],
        [2, 2, 2]],

       [[1, 1, 1],
        [3, 1, 1],
        [1, 0, 0]],

       [[0, 0, 0],
        [3, 3, 3],
        [3, 3, 3]]])

End list of containers --------------------------


array([], dtype=int64)

Start list of containers ------------------------


array([1, 2, 3, 2, 3, 1])

6
Start render ------------------------------------


array([[[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       [[2, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       [[0, 0, 0],
        [2, 2, 0],
        [0, 0, 0]]])

End render --------------------------------------
9
Reward: 13


array([[[0, 0, 0],
        [2, 2, 0],
        [0, 0, 0]],

       [[2, 0, 0],
        [1, 1, 0],
        [0, 0, 0]],

       [[0, 0, 0],
        [2, 2, 3],
        [3, 0, 0]]])

End list of containers --------------------------


array([], dtype=int64)

Start list of containers ------------------------


array([1, 1, 1, 1, 3, 1, 3, 2, 2, 2, 2, 3, 3, 3, 2, 1, 1, 1, 3, 2, 3, 3,
       2, 2])

24
Start render ------------------------------------


array([[[2, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       [[0, 0, 0],
        [2, 0, 0],
        [2, 0, 0]],

       [[0, 0, 0],
        [0, 0, 0],
        [2, 0, 0]]])

End render --------------------------------------
27
Reward: 52


array([[[2, 2, 2],
        [2, 2, 2],
        [2, 2, 2]],

       [[1, 1, 1],
        [2, 1, 1],
        [2, 1, 1]],

       [[3, 3, 3],
        [3, 3, 3],
        [2, 3, 3]]])

End list of containers --------------------------


array([1])

Start list of containers ------------------------


array([2, 2, 1, 2, 3, 1, 1, 2, 1, 2, 3, 3, 1, 3, 1, 3, 2, 1, 2, 3, 3, 2,
       1, 3])

24
Start render ------------------------------------


array([[[3, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       [[1, 0, 0],
        [1, 1, 0],
        [1, 0, 0]],

       [[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]]])

End render --------------------------------------
27
Reward: 63


array([[[3, 1, 2],
        [2, 2, 1],
        [2, 2, 2]],

       [[1, 1, 1],
        [1, 1, 1],
        [1, 1, 1]],

       [[3, 3, 1],
        [3, 3, 3],
        [3, 3, 3]]])

End list of containers --------------------------


array([2, 2])

Start list of containers ------------------------


array([1, 3, 2, 2, 3, 2, 3, 3, 1, 2, 3, 2, 1, 3, 2, 1, 1, 1])

18
Start render ------------------------------------


array([[[0, 0, 0],
        [2, 2, 0],
        [0, 0, 0]],

       [[2, 0, 0],
        [2, 2, 0],
        [0, 0, 0]],

       [[2, 0, 0],
        [0, 0, 0],
        [0, 0, 0]]])

End render --------------------------------------
24
Reward: 51


array([[[2, 2, 0],
        [2, 2, 2],
        [2, 2, 2]],

       [[2, 1, 1],
        [2, 2, 1],
        [1, 1, 1]],

       [[2, 0, 0],
        [3, 3, 3],
        [3, 3, 3]]])

End list of containers --------------------------


array([], dtype=int64)

Start list of containers ------------------------


array([2, 3, 2, 3, 1, 3, 1, 3, 3, 2, 3, 2, 1, 1, 2, 1, 2, 1])

18
Start render ------------------------------------


array([[[0, 0, 0],
        [0, 0, 0],
        [3, 0, 0]],

       [[3, 3, 0],
        [1, 0, 0],
        [1, 0, 0]],

       [[0, 0, 0],
        [1, 0, 0],
        [1, 0, 0]]])

End render --------------------------------------
25
Reward: 7


array([[[1, 2, 0],
        [2, 2, 2],
        [3, 2, 2]],

       [[3, 3, 1],
        [1, 1, 1],
        [1, 1, 1]],

       [[3, 3, 0],
        [1, 3, 3],
        [1, 3, 3]]])

End list of containers --------------------------


array([], dtype=int64)