# SAKI HA 4 WAREHOUSE

## Imports

In [87]:
import mdptoolbox, mdptoolbox.example
import numpy as np
import itertools

## Parameter

In [88]:
# Item colours handled by the warehouse (w, b, r).
items = ["w", "b", "r"]
# A container is either empty ("e") or holds exactly one item.
container_state = ["e"] + items
operations = ["in", "out"]
# Every (operation, item) pair, grouped by operation in the order given above.
actions = [(operation, item) for operation in operations for item in items]

print(items)
print(container_state)
print(operations)
print(actions)

['w', 'b', 'r']
['e', 'w', 'b', 'r']
['in', 'out']
[('in', 'w'), ('in', 'b'), ('in', 'r'), ('out', 'w'), ('out', 'b'), ('out', 'r')]


## Create Warehouse

In [89]:
def create_warehouse(x, y):
    """Return every (i, j) coordinate of an x-by-y grid as a flat list.

    The first coordinate runs over range(x) in the outer position and the
    second over range(y) in the inner one: (0, 0), (0, 1), ..., (1, 0), ...
    """
    return [(i, j) for i in range(x) for j in range(y)]

In [90]:
# Build the 2x2 warehouse that the following cells operate on.
warehouse = create_warehouse(2, 2)
print(warehouse)

[(0, 0), (0, 1), (1, 0), (1, 1)]


Calculate the total number of states (possible warehouse configurations × actions):

In [91]:
def getnrstates(warehouse, actions, container_state):
    """Return the number of warehouse configurations times the number of actions.

    A configuration assigns one of the `container_state` values to each
    container in `warehouse`, giving len(container_state) ** len(warehouse)
    configurations.
    """
    configurations = len(container_state) ** len(warehouse)
    return configurations * len(actions)


In [92]:
# For the 2x2 warehouse: 4 container states ** 4 containers * 6 actions.
print(getnrstates(warehouse, actions, container_state))

1536


Generate all possible warehouse states:

In [93]:
def getfieldstates(warehouse, container_state):
    """Return all assignments of a container state to every container.

    The result is a list of tuples, one entry per container in `warehouse`,
    in the order produced by itertools.product.
    """
    slots = len(warehouse)
    combinations = itertools.product(container_state, repeat=slots)
    return list(combinations)

In [94]:
# Enumerate every warehouse configuration and show the extremes as a sanity check.
fieldstates = getfieldstates(warehouse, container_state)
print("first state:", fieldstates[0], sep="")
print("last state: ", fieldstates[-1], sep="")
print("total number of fieldstates: ", len(fieldstates), sep="")

first state:('e', 'e', 'e', 'e')
last state: ('r', 'r', 'r', 'r')
total number of fieldstates: 256


 ## Generate matrices

Create the distance-based reward vector (one entry per container):

In [95]:
def get_reward_distance(warehouse):
    """Return one reward weight per container, shrinking with its coordinates.

    For a container at (i, j) the weight is 1 / (2 * (i + j + 1)), so the
    container at (0, 0) gets 0.5 and larger coordinate sums get less.
    """
    weights = []
    for cell in warehouse:
        distance = cell[0] + cell[1] + 1
        weights.append(1 / (2 * distance))
    return weights

In [96]:
# Per-container reward weights, in the same order as the entries of `warehouse`.
rewardvec = get_reward_distance(warehouse)
print(rewardvec)

[0.5, 0.25, 0.25, 0.16666666666666666]


start with empty matrices:

In [97]:
# Both arrays are indexed as [action, state, next_state].
transition = np.zeros((len(actions), len(fieldstates), len(fieldstates)))
reward = np.zeros_like(transition)
print("shape:", np.shape(transition), sep="")

shape:(6, 256, 256)


Helper function that checks whether an action is possible in a given field state:

In [98]:
def checkaction(action, fieldstate):
    """Tell whether `action` can be carried out in `fieldstate`.

    `action` is an (operation, item) pair. An "in" action needs at least one
    empty ("e") container; an "out" action for "w", "b" or "r" needs at least
    one container holding that item. Anything else is reported as impossible.
    """
    operation = action[0]
    if operation == "in":
        # Storing only requires a free container, whatever the item is.
        return "e" in fieldstate
    if operation == "out":
        wanted = action[1]
        # Only the three known items can be retrieved, and only if present.
        return wanted in ("w", "b", "r") and wanted in fieldstate
    return False
# Sanity check: storing works while a container is empty; retrieving red
# fails when no container holds a red item.
action1 = checkaction(("in","r"),("r","e","w","e"))
action2 = checkaction(("out","r"),("b","w","e","e"))
print(action1)
print(action2)

True
False


Helper function that searches a field state for a given container state and returns the matching indices:

In [99]:
def findvalueinlist(value, mylist):
    """Return the positions of all elements of `mylist` that equal `value`."""
    positions = []
    for position, element in enumerate(mylist):
        if element == value:
            positions.append(position)
    return positions
# Example: positions of the empty containers in a six-container state.
print(findvalueinlist("e", ("e", "b", "w", "e", "e", "r")))

[0, 3, 4]


Calculate how often each color appears in the training data; these relative frequencies weight the final reward:

In [100]:
def calc_appearancevec(path='warehousetraining.txt'):
    """Return the relative frequency of each colour in a training file.

    Every usable line is tab-separated with the colour name in the second
    column. Lines whose colour is not white, blue or red still count towards
    the total, so the three frequencies may sum to less than 1.

    Parameters
    ----------
    path : str, optional
        File to read. Defaults to 'warehousetraining.txt'.

    Returns
    -------
    list of float
        [white, blue, red] counts divided by the number of counted lines,
        or [0.0, 0.0, 0.0] when the file contains no usable line.
    """
    color_slot = {'white': 0, 'blue': 1, 'red': 2}
    appearancecount = [0, 0, 0]
    itemscount = 0
    # The context manager closes the file even if parsing raises.
    with open(path) as txt:
        for line in txt:
            linelist = line.split('\t')
            if len(linelist) < 2:
                # Blank or malformed line: there is no colour column to read.
                continue
            # strip() also drops a stray '\r' or surrounding spaces.
            color = linelist[1].strip()
            if color in color_slot:
                appearancecount[color_slot[color]] += 1
            itemscount += 1
    if itemscount == 0:
        # Avoid dividing by zero on an empty file.
        return [0.0, 0.0, 0.0]
    return [appearance / itemscount for appearance in appearancecount]
# Relative colour frequencies, ordered white, blue, red (same order as `items`).
appearancevec = calc_appearancevec()
print("w, b, r")
print(appearancevec)

w, b, r
[0.2556987115956392, 0.25057813016187647, 0.4937231582424843]


Matrix generation:

In [101]:
def calc_matrices(fieldstates, actions):
    """Build the transition and reward arrays of the warehouse MDP.

    Both arrays have shape (len(actions), len(fieldstates), len(fieldstates))
    and are indexed as [action, state, next_state].

    * If checkaction() reports the action as impossible, the state maps to
      itself with probability 1 and reward 0.
    * An "in" action puts the item into one of the empty containers, each
      with equal probability.
    * An "out" action empties one of the containers holding the item, each
      with equal probability.

    The reward for touching container `index` is
    rewardvec[index] * appearancevec[items.index(item)].

    Relies on the module-level names `checkaction`, `findvalueinlist`,
    `rewardvec`, `appearancevec` and `items`.

    Returns
    -------
    (transition, reward) : tuple of numpy.ndarray
    """
    transition = np.zeros((len(actions), len(fieldstates), len(fieldstates)))
    reward = np.zeros((len(actions), len(fieldstates), len(fieldstates)))

    # Look states up in a dict built once instead of scanning fieldstates
    # with .index() for every transition. setdefault keeps the first position
    # of a state, matching what list.index() would have returned.
    state_index = {}
    for position, state in enumerate(fieldstates):
        state_index.setdefault(tuple(state), position)

    for i, action in enumerate(actions):
        for j, state in enumerate(fieldstates):
            if not checkaction(action, state):
                # Impossible action: stay in the same state, no reward.
                transition[i, j, j] = 1
                continue

            item = action[1]
            if action[0] == "in":
                # Fill one of the empty containers with the item.
                searched, placed = "e", item
            else:
                # "out": empty one of the containers holding the item.
                searched, placed = item, "e"

            indices = findvalueinlist(searched, state)
            prob = 1 / len(indices)
            item_weight = appearancevec[items.index(item)]
            for index in indices:
                newfieldstate = list(state)
                newfieldstate[index] = placed
                newfieldstateindex = state_index[tuple(newfieldstate)]
                transition[i, j, newfieldstateindex] = prob
                reward[i, j, newfieldstateindex] = rewardvec[index] * item_weight

    return transition, reward
# Build the [action, state, next_state] arrays used by the solvers and the evaluation below.
transition_matrix, reward_matrix = calc_matrices(fieldstates, actions)

Test the transition and reward matrix:

In [102]:
# Inspect a single (state, action) pair of the generated matrices.
stateid = 0
actionid = 0
transition_row = transition_matrix[actionid, stateid]
print("{} + {} ->\n".format(fieldstates[stateid], actions[actionid]))
print("Transition matrix")
print(transition_row)
print("Reward matrix")
print(reward_matrix[actionid, stateid])
# Successor states are the entries carrying the highest transition probability.
white_index = findvalueinlist(max(transition_row), transition_row)
print("\nIndicies of all state transitions: " + str(white_index))
print("Resulting next states:")
for wi in white_index:
    print(fieldstates[wi])

('e', 'e', 'e', 'e') + ('in', 'w') ->

Transition matrix
[0.   0.25 0.   0.   0.25 0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.25 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.25 0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.   

## Build and Run Models

In [103]:
# Discount factor shared by all three solvers.
DISCOUNT = 0.9

policyIt = mdptoolbox.mdp.PolicyIteration(transition_matrix, reward_matrix, DISCOUNT)
policyIt.run()

# QLearning draws its samples from NumPy's global random generator; seeding it
# makes the learned values, and therefore the evaluation below, reproducible
# across kernel restarts.
np.random.seed(0)
qlearning = mdptoolbox.mdp.QLearning(transition_matrix, reward_matrix, DISCOUNT)
qlearning.run()

valIt = mdptoolbox.mdp.ValueIteration(transition_matrix, reward_matrix, DISCOUNT)
valIt.run()
print("finished calculation")

finished calculation


## Evaluation

In [104]:
def greedy_model(testactions):
    """Replay `testactions` from state 0, always taking the best immediate reward.

    For each action the candidate successors are the states with the highest
    transition probability; among them the one with the largest reward is
    taken (the first one on ties). Reads the module-level `actions`,
    `transition_matrix` and `reward_matrix`.

    Returns the sum of the collected rewards.
    """
    state = 0
    total = 0

    for action in testactions:
        action_id = actions.index(action)
        probabilities = transition_matrix[action_id, state]
        candidates = findvalueinlist(max(probabilities), probabilities)
        candidate_rewards = [reward_matrix[action_id, state, candidate] for candidate in candidates]
        best = candidate_rewards.index(max(candidate_rewards))
        next_state = candidates[best]
        total += reward_matrix[action_id, state, next_state]
        state = next_state

    return total


In [105]:
def create_testorder(testfilepath):
    """Parse an order file into a list of (operation, item) actions.

    Every non-blank line must look like "<store|restore>\\t<white|blue|red>".
    "store"/"restore" become "in"/"out" and the colours become "w"/"b"/"r".

    Parameters
    ----------
    testfilepath : str
        Path of the tab-separated order file.

    Returns
    -------
    list of tuple
        One (operation, item) pair per non-blank line, in file order.

    Raises
    ------
    ValueError
        If a non-blank line has no colour column or names an unknown
        operation or colour.
    """
    variants = {"store": "in", "restore": "out"}
    colors = {"white": "w", "blue": "b", "red": "r"}
    testactions = []
    # The context manager closes the file even if a line fails to parse.
    with open(testfilepath) as file:
        for linenumber, line in enumerate(file, start=1):
            if not line.strip():
                continue
            linelist = line.split('\t')
            operation = linelist[0].strip()
            colorname = linelist[1].strip() if len(linelist) > 1 else None
            # Fail loudly: an unrecognised token must not fall through and
            # reuse the value parsed from the previous line.
            if operation not in variants or colorname not in colors:
                raise ValueError(
                    "{}: cannot parse line {}: {!r}".format(testfilepath, linenumber, line)
                )
            testactions.append((variants[operation], colors[colorname]))
    return testactions


In [106]:
# Two action sequences for the evaluation: a short order list and the full training file.
testactions_short = create_testorder("warehouseorder.txt")
testactions_long = create_testorder("warehousetraining.txt")
print(len(testactions_short))
print(len(testactions_long))

60
12108


Evaluation of the 3 models with the short actionlist:

In [107]:
def test_model(testactions, model):
    current_state = 0
    rewardsum = 0

    for action in testactions:
        indices = findvalueinlist(max(transition_matrix[actions.index(action), current_state]) ,transition_matrix[actions.index(action), current_state])  
        values = [model.V[index] for index in indices]
        next_state = indices[values.index(max(values))]
        #print(fieldstates[next_state])
        rewardsum += reward_matrix[actions.index(action), current_state, next_state]
        current_state = next_state    
    print("Total Reward: " + str(rewardsum))
    

Short actionlist:

In [108]:
# Compare the greedy baseline with the three solved models on the short order list.
print("SHORT_LIST")
print("-------greedyModel-------")
print("Total Reward: " + str(greedy_model(testactions_short)))
for banner, solved in (
    ("--------policyIt---------", policyIt),
    ("--------qlearning--------", qlearning),
    ("---------valIt-----------", valIt),
):
    print(banner)
    test_model(testactions_short, solved)

SHORT_LIST
-------greedyModel-------
Total Reward: 5.937782182579012
--------policyIt---------
Total Reward: 5.393926880299527
--------qlearning--------
Total Reward: 4.757873582204604
---------valIt-----------
Total Reward: 5.393926880299527


Long actionlist:

In [109]:
# Compare the greedy baseline with the three solved models on the long action list.
print("LONG_LIST")
print("-------greedyModel-------")
print("Total Reward: " + str(greedy_model(testactions_long)))
for banner, solved in (
    ("--------policyIt---------", policyIt),
    ("--------qlearning--------", qlearning),
    ("---------valIt-----------", valIt),
):
    print(banner)
    test_model(testactions_long, solved)

LONG_LIST
-------greedyModel-------
Total Reward: 1181.0238685167508
--------policyIt---------
Total Reward: 1193.400561612256
--------qlearning--------
Total Reward: 1043.510654113055
---------valIt-----------
Total Reward: 1193.400561612256
