In [44]:
# import libraries
import numpy as np
import random

In [88]:
# create reward system in each state to state transition
# -1 for impossible moves
# 2 for running next to a body of water
# 0 points for steep inclines
# 1 point for running on a trail
# 0 0.5 points for running on the street
r = np.array([[ -1,  0, 0.5, -1,  -1,  -1,  -1,  -1,  -1,  -1], # o
              [ -1, -1,  -1,  1,   1,  -1,  -1,  -1,  -1,  -1], # a
              [ -1, -1,  -1, -1,  -1,   1,   1,  -1,  -1,  -1], # b
              [ -1, -1,  -1, -1,   1,  -1,  -1,   1,  -1,  -1], # c
              [ -1, -1,  -1,  1,  -1, 1.5,  -1,   1,  -1,  -1], # d
              [ -1, -1,  -1, -1, 1.5,  -1,   1,  -1,   1,  -1], # e
              [ -1, -1,  -1, -1,  -1,   1,  -1,  -1,   1,  -1], # f
              [ -1, -1,  -1,  1,   1,  -1,  -1,  -1,  -1,   1], # g
              [ -1, -1,  -1, -1,  -1,   1,   1,  -1,  -1,   1], # h
              [ -1, -1,  -1, -1,  -1,  -1,  -1,  -1,  -1,   1]]).astype("float32")

In [45]:
# these are the possible moves
splits = [(0,1), (0,2), (1,3), (1,4), (2,5), (2,6), (3,4), (3,7),\
          (4,3), (4,5), (4,7), (5,4), (5,6), (5,8), (6,5), (6,8),\
          (7,3), (7,4), (7,9), (8,5), (8,6), (8,9)]

# adapt the reward structure to consider state of completion
completion_rate_list = [i/10.0 for i in range(0, 10)]

In [46]:
# create rewards look-up table
rtable = {}
for i in splits:
    for k in completion_rate_list:
        key = "{}, {}".format(i, k)
        rtable[key] = 1

In [47]:
for i, j in rtable.iteritems():
    if '0, 1' in i:
        rtable[i] = 0.0
    elif '0, 2' in i:
        rtable[i] = 0.5
    elif '4, 5' in i or '5, 4' in i:
        rtable[i] = 2.0

for i, j in rtable.iteritems():
    if i=='(7, 9), 0.0': rtable[i] = 0.0
    elif i=='(7, 9), 0.1': rtable[i] = 0.0
    elif i=='(7, 9), 0.2': rtable[i] = 0.0
    elif i=='(7, 9), 0.3': rtable[i] = 0.0
    elif i=='(7, 9), 0.4': rtable[i] = 0.0
    elif i=='(7, 9), 0.5': rtable[i] = 0.0
    elif i=='(7, 9), 0.6': rtable[i] = 0.0
    elif i=='(7, 9), 0.7': rtable[i] = 0.0
    elif i=='(7, 9), 0.8': rtable[i] = 10.0
    elif i=='(7, 9), 0.9': rtable[i] = 10.0
        
for i, j in rtable.iteritems():
    if i=='(8, 9), 0.0': rtable[i] = 0.0
    elif i=='(8, 9), 0.1': rtable[i] = 0.0
    elif i=='(8, 9), 0.2': rtable[i] = 0.0
    elif i=='(8, 9), 0.3': rtable[i] = 0.0
    elif i=='(8, 9), 0.4': rtable[i] = 0.0
    elif i=='(8, 9), 0.5': rtable[i] = 0.0
    elif i=='(8, 9), 0.6': rtable[i] = 0.0
    elif i=='(8, 9), 0.7': rtable[i] = 0.0
    elif i=='(8, 9), 0.8': rtable[i] = 10.0
    elif i=='(8, 9), 0.9': rtable[i] = 10.0

In [48]:
# adapt this to dict for faster look up speed
m = np.array([[  0,  0.5,   1,   0,   0,   0,   0,   0,   0,   0], # o
              [  0,    0,   0,0.25,0.35,   0,   0,   0,   0,   0], # a
              [  0,    0,   0,   0,   0,0.35, 0.1,   0,   0,   0], # b
              [  0,    0,   0,   0, 0.5,   0,   0,0.25,   0,   0], # c
              [  0,    0,   0, 0.5,   0,0.75,   0,0.35,   0,   0], # d
              [  0,    0,   0,   0,0.75,   0, 0.5,   0,0.35,   0], # e
              [  0,    0,   0,   0,   0, 0.5,   0,   0, 0.1,   0], # f
              [  0,    0,   0,0.25,0.35,   0,   0,   0,   0, 0.5], # g
              [  0,    0,   0,   0,   0,0.1, 0.35,   0,   0,   1], # h
              [  0,    0,   0,   0,   0,   0,   0,   0,   0,   0]]).astype("float32")

In [49]:
# create q look-up table
q = {}
for i in splits:
    for j in completion_rate_list:
        key = "{}, {}".format(i, j)
        q[key] = 0

In [90]:
def rewardiplier(pct_complete, miles):
    y = (pct_complete) + ((pct_complete-0.5)/(0-0.5)) * miles
    return y

In [91]:
def initial_move(current_state, action, next_state, total_miles):
    # total_miles = total_miles + m[0, start_action]
    update_q(current_state, action=start_action, next_state=start_action,\
             decision_miles=total_miles, alpha=alpha, gamma=gamma)
    current_state = next_state
    return (current_state, total_miles)

In [92]:
def update_q(current_state, action, next_state, decision_miles, alpha, gamma):
    pct_complete = round((decision_miles / target_miles),1)
    if pct_complete > 1.0:
        pct_complete = 1.0
    reward = r[current_state, action] * rewardiplier(pct_complete, m[current_state, action])
    k = str(str((current_state, next_state))+", "+str(target_miles)+", "+str(pct_complete))
    q_value = q[k]
    next_q_values = []
    for i in q.keys():
        if i[4] == str(next_state):
            next_q_values.append(q[i])
    new_q = q_value + alpha * (reward + gamma * max(next_q_values) - q_value)
    q[k] = new_q
    return r[current_state, action]

In [93]:
# define global parameters
target_miles = 6.0
gamma = 0.8
alpha = 1
epsilon = 0.3
n_actions = 10
final_destination = 9
random_state = np.random.RandomState(1999) 

In [35]:
# define state parameters
target_miles_list = np.arange(3.0,21.0)
completion_rate_list = [i/10.0 for i in range(0, 11)]
splits = [(0,1), (0,2), (1,3), (1,4), (2,5), (2,6), (3,4), (3,7),\
          (4,3), (4,5), (4,7), (5,4), (5,6), (5,8), (6,5), (6,8),\
          (7,3), (7,4), (7,9), (8,5), (8,6), (8,9), (9,9)]

In [36]:
# create q look-up table
q = {}
for i in splits:
    for j in target_miles_list:
        for k in completion_rate_list:
            key = "{}, {}, {}".format(i, j, k)
            q[key] = 0

In [37]:
q

{'(7, 4), 7.0, 0.6': 0,
 '(5, 8), 16.0, 0.5': 0,
 '(6, 8), 19.0, 0.3': 0,
 '(8, 5), 20.0, 0.7': 0,
 '(7, 3), 4.0, 0.7': 0,
 '(6, 5), 16.0, 1.0': 0,
 '(7, 3), 4.0, 0.6': 0,
 '(4, 5), 18.0, 0.2': 0,
 '(4, 5), 18.0, 0.3': 0,
 '(4, 5), 18.0, 0.0': 0,
 '(4, 5), 18.0, 0.1': 0,
 '(4, 5), 18.0, 0.6': 0,
 '(4, 5), 18.0, 0.7': 0,
 '(4, 5), 18.0, 0.4': 0,
 '(4, 5), 18.0, 0.5': 0,
 '(4, 5), 18.0, 0.8': 0,
 '(4, 5), 18.0, 0.9': 0,
 '(5, 4), 4.0, 0.5': 0,
 '(8, 5), 14.0, 0.6': 0,
 '(8, 5), 20.0, 0.6': 0,
 '(5, 6), 8.0, 0.1': 0,
 '(4, 3), 20.0, 0.7': 0,
 '(4, 3), 20.0, 0.6': 0,
 '(4, 3), 20.0, 0.5': 0,
 '(4, 3), 20.0, 0.4': 0,
 '(4, 3), 20.0, 0.3': 0,
 '(4, 3), 20.0, 0.2': 0,
 '(4, 3), 20.0, 0.1': 0,
 '(4, 3), 20.0, 0.0': 0,
 '(8, 5), 18.0, 1.0': 0,
 '(4, 3), 20.0, 0.9': 0,
 '(4, 3), 20.0, 0.8': 0,
 '(2, 6), 12.0, 1.0': 0,
 '(3, 4), 7.0, 0.9': 0,
 '(3, 4), 7.0, 0.8': 0,
 '(3, 4), 7.0, 0.3': 0,
 '(3, 4), 7.0, 0.2': 0,
 '(3, 4), 7.0, 0.1': 0,
 '(3, 4), 7.0, 0.0': 0,
 '(3, 4), 7.0, 0.7': 0,
 '(3, 4), 7.

In [103]:
for i in range(500):
    print "NEW RUN"
    start_action = random.choice([2]) # head to entrance a or b
    current_state, total_miles = initial_move(current_state=0,\
                            action=start_action, next_state=start_action, total_miles=0)
    print ("Initial state = ", current_state)
    goal = False
    while not goal:
        print "NEW SPLIT POINT"
        valid_moves = r[current_state] >= 0
        if total_miles > 0.75 * target_miles:
            print "High total miles."
            actions = np.array(list(range(n_actions)))
            actions = actions[valid_moves == True]
            if 9 in actions and m[current_state, final_destination] + total_miles > (0.99*target_miles):
                print("9 is here - going home")
                new_miles = m[current_state,final_destination]
                total_miles += new_miles
                next_state = final_destination
            else:
                shortest = min(list(m[current_state][actions]))
                print("taking shortest route")
                if total_miles > 20:
                    break
                new_miles = shortest
                total_miles += new_miles
                next_state = [i for i,x in enumerate(m[current_state]) if x == shortest][0]
        else:
            if random.random() < epsilon: 
                print "using random choice to pick next state"
                actions = np.array(list(range(n_actions)))
                actions = actions[valid_moves == True]
                random_state.shuffle(actions) 
                action = actions[0]
                new_miles = m[current_state,action]
                total_miles += new_miles
                next_state = action
            else:
                print "exploit"
                current_q_values = []
                for i in q.keys():
                    if i[3] == str(current_state):
                        current_q_values.append(q[i])
                if np.sum(current_q_values) > 0:
                    print "there is info to exploit"
                    current_q_values = []
                    for i in q.keys():
                        if i[3] == str(current_state):
                            current_q_values.append(q[i[4]])
                    action = np.argmax(current_q_values)
                    new_miles = m[current_state,action]
                    total_miles += new_miles
                else:
                    print "no info to exploit yet"
                    actions = np.array(list(range(n_actions)))
                    actions = actions[valid_moves == True]
                    random_state.shuffle(actions)
                    action = actions[0]
                    new_miles = m[current_state,action]
                    total_miles += new_miles
                next_state = action
        print("the current state = ", current_state)
        print("the next state = ", next_state)
        decision_miles = total_miles - new_miles
        reward = update_q(current_state, action, next_state, decision_miles, alpha, gamma)
        
        if next_state == 9:
            goal = True
        current_state = next_state   
        if type(current_state) == list:
            current_state = int(current_state[0])
        print("miles this far in this run = ", total_miles)
        print("----------")
    print("total miles - ", total_miles)

NEW RUN
('Initial state = ', 2)
NEW SPLIT POINT
using random choice to pick next state
('the current state = ', 2)
('the next state = ', 6)
('miles this far in this run = ', 0.10000000149011612)
----------
NEW SPLIT POINT
using random choice to pick next state
('the current state = ', 6)
('the next state = ', 5)
('miles this far in this run = ', 0.60000000149011612)
----------
NEW SPLIT POINT
using random choice to pick next state
('the current state = ', 5)
('the next state = ', 8)
('miles this far in this run = ', 0.94999999552965164)
----------
NEW SPLIT POINT
exploit
no info to exploit yet
('the current state = ', 8)
('the next state = ', 6)
('miles this far in this run = ', 1.2999999895691872)
----------
NEW SPLIT POINT
exploit
no info to exploit yet
('the current state = ', 6)
('the next state = ', 8)
('miles this far in this run = ', 1.3999999910593033)
----------
NEW SPLIT POINT
exploit
no info to exploit yet
('the current state = ', 8)
('the next state = ', 9)
('miles this far

In [22]:
# user inputs
target_miles = 6.0

# model parameters
gamma = 0.8
alpha = 1
epsilon = 0.3
n_actions = 10
final_destination = 9
random_state = np.random.RandomState(1999)

# learn!
for i in range(500):
    # start_action = random.choice([2]) # head to entrance a or b
    current_state, total_miles = initial_move(current_state=0,\
                            action=start_action, next_state=start_action, total_miles=0)
    
    current_state = 0
    current_total_miles = 0
    start_action = random.choice([2]) # head to entrance a or b
        
    transition = "(" + str(current_state) + ", " + str(start_action) + "), " + str(pct_complete)
    
    reward = rtable[transition]
    
    q_value = q[transition]
    
    next_q_values = [value for key, value in q.items() if key.lower()[1]==str(start_action)]
    
    new_q = q_value + alpha * (reward + gamma * max(next_q_values) - q_value)
    
    q[transition] = new_q
    
    
    
    goal = False
    
    while not goal:
        
        
        
        
        valid_moves = r[current_state] >= 0
        if total_miles > 0.75 * target_miles:
            print "High total miles."
            actions = np.array(list(range(n_actions)))
            actions = actions[valid_moves == True]
            if 9 in actions and m[current_state, final_destination] + total_miles > (0.99*target_miles):
                print("9 is here - going home")
                new_miles = m[current_state,final_destination]
                total_miles += new_miles
                next_state = final_destination
            else:
                shortest = min(list(m[current_state][actions]))
                print("taking shortest route")
                if total_miles > 20:
                    break
                new_miles = shortest
                total_miles += new_miles
                next_state = [i for i,x in enumerate(m[current_state]) if x == shortest][0]
        else:
            if random.random() < epsilon: 
                print "using random choice to pick next state"
                actions = np.array(list(range(n_actions)))
                actions = actions[valid_moves == True]
                random_state.shuffle(actions) 
                action = actions[0]
                new_miles = m[current_state,action]
                total_miles += new_miles
                next_state = action
            else:
                print "exploit"
                current_q_values = []
                for i in q.keys():
                    if i[3] == str(current_state):
                        current_q_values.append(q[i])
                if np.sum(current_q_values) > 0:
                    print "there is info to exploit"
                    current_q_values = []
                    for i in q.keys():
                        if i[3] == str(current_state):
                            current_q_values.append(q[i[4]])
                    action = np.argmax(current_q_values)
                    new_miles = m[current_state,action]
                    total_miles += new_miles
                else:
                    print "no info to exploit yet"
                    actions = np.array(list(range(n_actions)))
                    actions = actions[valid_moves == True]
                    random_state.shuffle(actions)
                    action = actions[0]
                    new_miles = m[current_state,action]
                    total_miles += new_miles
                next_state = action
        print("the current state = ", current_state)
        print("the next state = ", next_state)
        decision_miles = total_miles - new_miles
        reward = update_q(current_state, action, next_state, decision_miles, alpha, gamma)
        
        if next_state == 9:
            goal = True
        current_state = next_state   
        if type(current_state) == list:
            current_state = int(current_state[0])
        print("miles this far in this run = ", total_miles)
        print("----------")
    print("total miles - ", total_miles)

NameError: name 'initial_move' is not defined

In [24]:
    target_miles = 5
    
    current_state = 0
    current_total_miles = 0
    start_action = random.choice([2]) # head to entrance a or b
    
    pct_complete = round((current_total_miles / target_miles), 1)
     
#     if pct_complete > 1.0:
#         pct_complete = 1.0
    
    transition = "(" + str(current_state) + ", " + str(start_action) + "), " + str(pct_complete)
    
    reward = rtable[transition]
    
    q_value = q[transition]
    
    next_q_values = [value for key, value in q.items() if key.lower()[1]==str(start_action)]
    
    new_q = q_value + alpha * (reward + gamma * max(next_q_values) - q_value)
    
    q[transition] = new_q
    

In [80]:
q

{'(0, 1), 0.0': 0,
 '(0, 1), 0.1': 0,
 '(0, 1), 0.2': 0,
 '(0, 1), 0.3': 0,
 '(0, 1), 0.4': 0,
 '(0, 1), 0.5': 0,
 '(0, 1), 0.6': 0,
 '(0, 1), 0.7': 0,
 '(0, 1), 0.8': 0,
 '(0, 1), 0.9': 0,
 '(0, 2), 0.0': 0.5,
 '(0, 2), 0.1': 0,
 '(0, 2), 0.2': 0,
 '(0, 2), 0.3': 0,
 '(0, 2), 0.4': 0,
 '(0, 2), 0.5': 0,
 '(0, 2), 0.6': 0,
 '(0, 2), 0.7': 0,
 '(0, 2), 0.8': 0,
 '(0, 2), 0.9': 0,
 '(1, 3), 0.0': 0,
 '(1, 3), 0.1': 0,
 '(1, 3), 0.2': 0,
 '(1, 3), 0.3': 0,
 '(1, 3), 0.4': 0,
 '(1, 3), 0.5': 0,
 '(1, 3), 0.6': 0,
 '(1, 3), 0.7': 0,
 '(1, 3), 0.8': 0,
 '(1, 3), 0.9': 0,
 '(1, 4), 0.0': 0,
 '(1, 4), 0.1': 0,
 '(1, 4), 0.2': 0,
 '(1, 4), 0.3': 0,
 '(1, 4), 0.4': 0,
 '(1, 4), 0.5': 0,
 '(1, 4), 0.6': 0,
 '(1, 4), 0.7': 0,
 '(1, 4), 0.8': 0,
 '(1, 4), 0.9': 0,
 '(2, 5), 0.0': 0,
 '(2, 5), 0.1': 0,
 '(2, 5), 0.2': 0,
 '(2, 5), 0.3': 0,
 '(2, 5), 0.4': 0,
 '(2, 5), 0.5': 0,
 '(2, 5), 0.6': 0,
 '(2, 5), 0.7': 0,
 '(2, 5), 0.8': 0,
 '(2, 5), 0.9': 0,
 '(2, 6), 0.0': 0,
 '(2, 6), 0.1': 0,
 '(2, 6), 

In [68]:
max([value for key, value in q.items() if key.lower()[1]==str(start_action)])

0

In [None]:
shorter = 3; longer = 9

In [None]:
comp = 0.3

In [None]:
print(abs(np.log(0.3/0.5)))
print(abs(np.log(0.3/1.0)))

In [None]:
print(1/.3, 1/.6, 1/.9)

In [None]:
complete = 80
rewardiplier = -(complete-5.0)**3.0
miles1 = 2
miles2 = 5
miles3 = 8
print(complete, rewardiplier, miles1, miles1*rewardiplier)
print(complete, rewardiplier, miles2, miles2*rewardiplier)
print(complete, rewardiplier, miles3, miles3*rewardiplier)

In [None]:
1**(1/2.0)

In [None]:
%matplotlib inline
plt.plot(miles, complete, 'ro')
plt.show()