In [1]:
class Field:
    def __init__(self, size, item_pickup, item_dropoff, start_position):
        self.size = size 
        self.item_pickup = item_pickup # Coordinate position of item pickup 
        self.item_dropoff = item_dropoff # Coordinate position of item drop-off
        self.position = start_position
        self.item_in_car = False
        
    def get_number_of_states(self):
        ''''''
        return self.size*self.size*self.size*self.size*2
        
        
    def get_state(self):
        state = self.position[0]*self.size*self.size*self.size*2
        state = state + self.position[1]*self.size*self.size*2
        state = state + self.item_pickup[0]*self.size*2
        state = state + self.item_pickup[1]*2
        
        if self.item_in_car:
            state = state + 1
        return state
        
    def make_action(self, action):
        (x,y) = self.position
        if action == 0: #down
            if y == self.size-1:
                return -10, False
            else:
                self.position = (x, y+1)
                return -1, False
            
        elif action == 1: #up
            if y == 0:
                return -10, False
            else:
                self.position = (x, y-1)
                return -1, False
            
        elif action == 2: #left
            if x == 0:
                return -10, False
            else:
                self.position = (x-1, y)
                return -1, False
                
        elif action == 3: #right
            if x == self.size-1:
                return -10, False
            else:
                self.position = (x+1, y)
                return -1, False
            
        elif action == 4: # pickup
            if self.item_in_car:
                return -10, False
            elif self.item_pickup != (x,y):
                return -10, False
            else:
                self.item_in_car = True
                return 20, False
                
        elif action == 5: #dropoff
            if not self.item_in_car: # If the item is not in the car. 
                return -10, False
            elif self.item_dropoff != (x,y):
                self.item_pickup = (x,y)
                self.item_in_car = False
                return -10, False
            else:
                self.item_in_car = False
                return 20, True

In [27]:
size = 10
item_pickup = (0,0)
item_dropoff = (9,9)
start_position = (9,0)

field = Field(size, item_pickup, item_dropoff, start_position)

In [28]:
field.position

(9, 0)

In [29]:
field.make_action(2)
field.make_action(2)
field.make_action(2)
field.make_action(2)
field.make_action(2)
field.make_action(2)
field.make_action(2)
field.make_action(2)
field.make_action(2)

field.make_action(4)

field.make_action(0)
field.make_action(0)
field.make_action(0)
field.make_action(0)
field.make_action(0)
field.make_action(0)
field.make_action(0)
field.make_action(0)
field.make_action(0)

field.make_action(3)
field.make_action(3)
field.make_action(3)
field.make_action(3)
field.make_action(3)
field.make_action(3)
field.make_action(3)
field.make_action(3)
field.make_action(3)

field.make_action(5)

In [37]:
import random

In [61]:
# Random Solution
def random_solution():
    '''Implements a solution to the 'carrier problem' where the carrier moves with random actions and no reinforcement learning methodology.'''
    size = 10
    item_pickup = (0,0)
    item_dropoff = (9,9)
    start_position = (9,0)

    field = Field(size, item_pickup, item_dropoff, start_position)
    
    done = False
    steps = 0
    
    while not done:
        action = random.randint(0,5) # Selects random integer for action
        reward, done = field.make_action(action) # Makes action and obtains reward and done state.
        steps = steps + 1 # Increments steps.
        
    return steps
    

In [97]:
random_solution()

161407

In [69]:
run = [random_solution() for _ in range(100)]

In [70]:
sum(run)/len(run)

154913.37

In [2]:
# Q-Learning Algo
import numpy as np
import random

In [3]:
size = 10
item_pickup = (0,0)
item_dropoff = (9,9)
start_position = (9,0)

field = Field(size, item_pickup, item_dropoff, start_position)

number_of_states = field.get_number_of_states()
number_of_actions = 6

q_table = np.zeros((number_of_states, number_of_actions))

epsilon = 0.1
alpha = 0.1
gamma = 0.6

for _ in range(10000):
    field = Field(size, item_pickup, item_dropoff, start_position)
    done = False
    
    while not done:
        state = field.get_state()        

        # Epsilon is the percentage of time we want to explore.
        if random.uniform(0,1) < epsilon: # Decision to explore or exploit. 
            action = random.randint(0,5) #Explore
        else:
            action = np.argmax(q_table[state]) #Exploit
            
        reward, done = field.make_action(action)
        
        new_state = field.get_state()
        new_state_max = np.max(q_table[new_state])
        
        q_table[state, action] = (1-alpha)*q_table[state, action]+alpha*(reward+gamma*new_state_max - q_table[state, action])
                        

In [4]:
q_table

array([[ 0.23071429, -2.06428571, -2.06428571,  0.23071429,  9.78571429,
        -2.06428571],
       [-0.71428571, -5.21428571, -5.21428571, -0.71428571, -5.21428571,
        -2.06428571],
       [ 1.51061586, -1.        , -1.        , -0.1       , -1.        ,
        -1.        ],
       ...,
       [-1.33071063,  0.08268536, -0.25      , -1.44779836, -1.54175457,
        10.27464764],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ]])

In [58]:
def reinforcement_solution():
    epsilon = 0.08
    alpha = 0.1
    gamma = 0.6
    
    field = Field(size, item_pickup, item_dropoff, start_position)
    done = False
    steps = 0
    
    while not done:
        state = field.get_state()
        if random.uniform(0,1) < epsilon:
            action = random.randint(0,5) #Explore
        else:
            action = np.argmax(q_table[state]) #Exploit
            
        reward, done = field.make_action(action)
        
        new_state = field.get_state()
        new_state_max = np.max(q_table[new_state])
        
        q_table[state, action] = (1-alpha)*q_table[state, action]+alpha*(reward+gamma*new_state_max - q_table[state, action])
                        
        steps = steps +1
        
    return steps

In [59]:
reinforcement_solution()

29

In [62]:
run = [reinforcement_solution() for _ in range(100)]

In [63]:
sum(run)/len(run)

50.92