#### RL Mine Explorer Solution

In [1]:
import numpy as np
import random

import warnings
warnings.filterwarnings('ignore')
# Seed both generators: the exploration and start-state draws in this notebook
# come from the stdlib `random` module, so seeding NumPy alone does not make
# a run reproducible.
np.random.seed(42)
random.seed(42)

In [2]:
# Rewards and actions initialization
actions = [0,1] # 0 for left and 1 for right

# Reward attached to each state, indexed by state number
# (module-level names are already global, so no `global` statement is needed)
reward = [100,0,0,0,0,40]

# Number of states and actions, derived from the lists above so they cannot drift apart
state_size = len(reward)
action_size = len(actions)

In [3]:
# Set the percent you want to explore

def action(act,state,epsilon=0.5):
    """Choose the next action with an epsilon-greedy rule.

    Parameters
    ----------
    act : unused; accepted so existing callers can keep passing the action list.
    state : row index into the module-level Q table.
    epsilon : threshold compared against a uniform draw from [0, 1]; a draw
        below it selects a random action (explore).

    Returns
    -------
    int
        0 or 1. When exploiting, this is the index of the largest value in
        ``Q[state]``.
    """
    exploring = random.uniform(0, 1) < epsilon

    if not exploring:
        # Exploit: select the action with max value (future reward)
        greedy_choice = np.argmax(Q[state])
        return int(greedy_choice)

    # Explore: select a random action
    return random.randint(0,1)
            

In [4]:
# Q-Learning part
# Update q values as per the Bellman equation for computing Q values

def q_update(state,action,episode,lr,gamma):
    """Apply one Q-learning update for taking `action` in `state`.

    Reads and writes the module-level Q table and reads the module-level
    `reward` list.

    Parameters
    ----------
    state : index of the current state (a row of Q).
    action : 0 moves left (state - 1); any other value moves right (state + 1).
    episode : episode counter; not used in the computation.
    lr : learning rate applied to the temporal-difference error.
    gamma : discount factor applied to the best Q-value of the next state.

    Returns
    -------
    tuple or str
        ``(updated Q[state, action], next_state)`` for a non-terminal state,
        ``'T1'`` when `state` is the first row of Q and ``'T2'`` when it is
        the last row. Q is left untouched in the terminal cases.
    """
    # Terminal states are the two ends of the chain. Checking the index rather
    # than whether the row sums to 100 or 40 keeps detection independent of the
    # current Q values and prevents state-1 / state+1 from running off the table.
    if state == 0:
        #print(f"End of episode at T1:{episode}")
        return 'T1'
    if state == Q.shape[0] - 1:
        #print(f"End of episode at T2:{episode}")
        return 'T2'

    next_state = state - 1 if action == 0 else state + 1

    # Bellman target: reward of the current state plus discounted best future value
    td_target = reward[state] + gamma * np.max(Q[next_state, :])
    Q[state, action] = Q[state, action] + lr * (td_target - Q[state, action])
    return Q[state,action],next_state

#### The terms in the above equation and their significance ####

- Learning Rate: `lr`, often written as alpha (α), controls how much of the new estimate you accept versus the old value. In the update above we take the difference between the new estimate and the old Q-value, multiply it by the learning rate, and add the result to the old Q-value, which moves it in the direction of the latest estimate.

- Gamma: gamma or γ is a discount factor. It’s used to balance immediate and future reward. From our update rule above you can see that we apply the discount to the future reward. Typically this value ranges anywhere from 0.8 to 0.99; this notebook uses 0.5, which discounts future reward more heavily.

- Reward: reward is the value received after completing a certain action at a given state. A reward can happen at any given time step or only at the terminal time step.

- Max: `np.max()` (from the NumPy library) takes the largest Q-value available in the next state, i.e. the best estimated future reward. That value is discounted by gamma and added to the reward for the current state, so the possible future reward influences the value of the current action. This is the beauty of Q-learning: we allocate future reward to current actions to help the agent select the highest-return action at any given state.

In [5]:
# Training configuration
N_EPISODES = 1000    # number of episodes to run
START_STATE = 3      # every episode starts from this state
EXPLORE_RATE = 0.8   # epsilon passed to action() after the first move
lr = 0.1
gamma = 0.5

# Initialize Q table. The values at the two ends of the chain are seeded
# from the reward list; q_update never writes to those rows.
Q = np.zeros((state_size, action_size))
Q[state_size - 1, 1] = reward[-1]
Q[0, 0] = reward[0]

# Run episodes
print("Starting learning.....")
for ep in range(N_EPISODES):
    # The first move of every episode is chosen uniformly at random
    first_action = random.randint(0,1)

    # q_update returns (updated Q value, new state), or 'T1'/'T2' at a terminal state
    step = q_update(START_STATE, first_action, ep, lr=lr, gamma=gamma)

    # Keep stepping until q_update reports a terminal marker. A terminal
    # result ends only the current episode, never the whole training run.
    while not isinstance(step, str):
        current = int(step[1])
        n_action = action(act=actions, state=current, epsilon=EXPLORE_RATE)  # Pick next action
        step = q_update(current, int(n_action), ep, lr=lr, gamma=gamma)

print(f"\nEnd of {N_EPISODES} Episodes\n\nFinal Q-values:\n{Q}")

    

Starting learning.....

End of 1000 Episodes

Final Q-values:
[[100.     0.  ]
 [ 50.    12.5 ]
 [ 25.     6.25]
 [ 12.5   10.  ]
 [  6.25  20.  ]
 [  0.    40.  ]]


In [6]:
# Let us deploy and test the optimal policy

# Trained policy: refers to the same array object as Q (no copy is made)
optimal_policy = Q

# Path traversed during the demo run, in visit order
states = []

# Action encoding: 0 for left and 1 for right
actions = [0, 1]

def navigate(state):
    """Take one greedy step from `state` using the module-level `optimal_policy`.

    Parameters
    ----------
    state : index of the current state (a row of `optimal_policy`).

    Returns
    -------
    int or str
        ``'T1'`` when `state` is the first row, ``'T2'`` when it is the last
        row; otherwise ``state - 1`` if the left action (column 0) holds the
        largest value in the row, else ``state + 1``.
    """
    state = int(state)

    # Terminal states are identified by position rather than by the row
    # summing to 100 or 40, so a non-terminal row that happens to have such
    # a sum is not mistaken for the end of the path.
    if state == 0:
        return 'T1'
    if state == len(optimal_policy) - 1:
        return 'T2'

    if np.argmax(optimal_policy[state]) == 0:   # Max Q value is towards left
        return state - 1
    return state + 1                            # Go right

# If we feed it a random start state it should be able to navigate for max returns.
# The start is drawn from the interior states only (never the first or last state).
start_state = random.randint(1, len(optimal_policy) - 2)
states.append(start_state) # Load the path with start state

# Call navigation function
next_state = navigate(start_state)
states.append(next_state)

# Navigate till a terminal state is reached. The condition must reject BOTH
# markers; `!= 'T1' or != 'T2'` is true for every value and never ends the loop.
while next_state not in ('T1', 'T2'):
    next_state = navigate(next_state)     # Take the next step
    states.append(next_state)

# Render path
print('Path Traversed:')
for s in states:
    print(s,'\u2193')
    

Path Traversed:
3 ↓
2 ↓
1 ↓
0 ↓
T1 ↓
