In [32]:

class Field():
    def __init__(self, size, item_pickup, item_dropoff, start_position) -> None:
        self.size = size
        self.item_pickup = item_pickup
        self.item_dropoff = item_dropoff
        self.position = start_position
        self.item_in_car = False
        
    def get_number_of_states(self):
        return self.size ** 4 * 2
    
    def get_state(self):
        #map to a specific state
        state = self.position[0] * self.size ** 3 * 2
        state = state + self.position[1] * self.size ** 2 * 2
        state = state + self.item_pickup[0] * self.size * 2
        state = state + self.item_pickup[1] * 2
        if self.item_in_car:
            state = state + 1
        return state
        
    
    def make_action(self, action):
        (x,y ) = self.position
        if action == 0: #Go South
            if y == self.size -1: #punish for going off the field if you're at position '9'
                return -10 ,False
            else:
                self.position = (x, y + 1)
                return -1, False #punish for moving
        elif action == 1: #Go North
            if y == 0: #at the top
                return -10, False #punish for going up if already at the top
            else:
                self.position = (x, y -1)
                return -1, False
        elif action == 2: #Go east
            if x == self.size -1:
                return -10, False
            else:
                self.position = (x + 1, y)
                return -1, False
        elif action == 3: #Go West
            if x == 0:
                return -10, False
            else:
                self.position = (x -1, y)
                return -1, False
        elif action == 4: #Pickup
            if self.item_in_car:
                return -10, False
            elif self.item_pickup != (x,y):
                return -10, False
            else:
                self.item_in_car = True
                return 20, False
        elif action == 5: #Dropoff
            if self.item_in_car is False: #don't drop off if you don't have
                return -10, False
            elif self.item_dropoff != (x,y): #don't drop off at wrong place
                self.item_pickup = (x, y) #need to update item after it was dropped
                self.item_in_car = False
                return -10, False
            else:
                return 20, True

In [13]:
field = Field(10, (0,0), (9,9), (9,0))
field.make_action(3)
field.make_action(3)
field.make_action(3)
field.make_action(3)
field.make_action(3)
field.make_action(3)
field.make_action(3)
field.make_action(3)
field.make_action(3)
field.make_action(4)

field.make_action(2)
field.make_action(2)
field.make_action(2)
field.make_action(2)
field.make_action(2)
field.make_action(2)
field.make_action(2)
field.make_action(2)
field.make_action(2)

field.make_action(0)
field.make_action(0)
field.make_action(0)
field.make_action(0)
field.make_action(0)
field.make_action(0)
field.make_action(0)
field.make_action(0)
field.make_action(0)

field.make_action(4)
field.position

(9, 0)


(9, 9)

In [15]:
import random
def naive_solution():
    size = 10
    item_start = (0,0)
    item_drop = (9,9)
    start_position = (9,0)
    field = Field(size, item_start, item_drop, start_position)
    done = False
    steps = 0
    while not done:
        action = random.randint(0,5)
        reward, done = field.make_action(action)
        steps += 1
    return steps

In [37]:
import numpy as np

size = 10
item_start = (0,0)
item_drop = (9,9)
start_position = (9,0)
field = Field(size, item_start, item_drop, start_position)

number_of_states = field.get_number_of_states()
number_of_actions = 6
q_table = np.zeros((number_of_states, number_of_actions))
epsilon = 0.1 #explore
alpha = 0.1
gamma = 0.6

for _ in range(1000):
    field = Field(size, item_start, item_drop, start_position)
    done = False
    while not done:
        state = field.get_state()
        if random.uniform(0,1) < epsilon:
            action = random.randint(0,5)
        else:
            action = np.argmax(q_table[state])
           

        reward, done = field.make_action(action)
        new_state = field.get_state()
        new_state_max = np.max(q_table[new_state])
        q_table[state, action] = (1- alpha) * q_table[state, action] + alpha*(reward + gamma * new_state_max - q_table[state, action])
        

In [39]:
def reinforcement_learning():
    epsilon = 0.1 #explore
    alpha = 0.1
    gamma = 0.6
    field = Field(size, item_start, item_drop, start_position)
    done = False
    steps = 0
    while not done:
        state = field.get_state()
        if random.uniform(0,1) < epsilon:
            action = random.randint(0,5)
        else:
            action = np.argmax(q_table[state])
           

        reward, done = field.make_action(action)
        new_state = field.get_state()
        new_state_max = np.max(q_table[new_state])
        q_table[state, action] = (1- alpha) * q_table[state, action] + alpha*(reward + gamma * new_state_max - q_table[state, action])
        steps += 1
    return steps

In [40]:
reinforcement_learning()

35

In [43]:
runs_rl = [reinforcement_learning() for _ in range(10000)]

In [44]:
sum(runs_rl) / len(runs_rl)

55.1501