In [1]:
def show_values(board, decimals = 2):
    for i in range(0, len(board)):
        print('-------------------------------------------------------')
        out = '| '
        for j in range(0, len(board[0])):
            value = 0
            if type(board[i][j]) == str:
                value = board[i][j]
            else:
                value = round(board[i][j],decimals)
            out += str(value).ljust(7) + ' | '
        print(out)
    print('-------------------------------------------------------')

In [2]:
import numpy as np
import copy
import random

class Gridworld:
    #Dimensions[rows,cols]
    #Each cell element [row,col,value]
    def __init__(self, dimensions=[10,10], stop_states = [(0,0),(4,0),(0,4)], passenger_state = (4,3), cells={}, initial_state=None, actions=['up','down','left','right','pick','drop']):
        self.dimensions = dimensions
        self.actions = actions
        self.cells = cells
        self.state_actions = self.define_actions()
        self.init_state(initial_state)
        self.stop_states = stop_states
        self.passenger_state = passenger_state
        
    def define_actions(self):
        state_actions = {}
        for row in range(self.dimensions[0]):
            for col in range(self.dimensions[1]):
                for taxi_state in ['full', 'empty']:
                    state = (row, col, taxi_state)
                    cell = (row,col)
                    if cell in self.cells:
                        actions = self.actions.copy()
                        if 'l' in self.cells[cell]:
                            actions.remove('left')
                        if 't' in self.cells[cell]:
                            actions.remove('up')
                        if 'r' in self.cells[cell]:
                            actions.remove('right')
                        if 'b' in self.cells[cell]:
                            actions.remove('down')
                        state_actions[state] = actions
                    else:
                        state_actions[state] = self.actions 
        return state_actions
        
    def get_board(self):
        return self.board
        
    def get_current_state(self):
        return self.current_state
    
    def get_possible_actions(self, state=(0,0)):
        return self.state_actions[state]
    
    def do_action(self, action, state = None):
        current_state = self.current_state
        reward = 0
        if state != None:
            current_state = state
        new_state = ()
        if(action=='up'):
            new_state=(current_state[0]-1,current_state[1], current_state[2])
            reward = -1
        elif(action=='down'):
            new_state=(current_state[0]+1,current_state[1], current_state[2])
            reward = -1
        elif(action=='left'):
            new_state=(current_state[0],current_state[1]-1, current_state[2])
            reward = -1
        elif(action=='right'):
            new_state=(current_state[0],current_state[1]+1, current_state[2])
            reward = -1
        elif(action=='drop'):
            cell = (current_state[0], current_state[1])
            if cell in self.stop_states and current_state[2] == 'full':
                new_state=(current_state[0],current_state[1], 'empty')
                reward = 10
            else:
                reward = -10
                new_state=(current_state[0],current_state[1], current_state[2])
        elif(action=='pick'):
            cell = (current_state[0], current_state[1])
            if cell == self.passenger_state and current_state[2] == 'empty':
                new_state=(current_state[0],current_state[1], 'full')
                reward = 5
            else:
                reward = -10
                new_state=(current_state[0],current_state[1], current_state[2])   
        
        return [reward, new_state]
    
    def init_state(self, initial_state=None):
        if initial_state:
            self.initial_state = initial_state
            self.current_state = initial_state
        else:
            new_initial_state = random.choice(list(self.state_actions.keys()))
            self.initial_state = new_initial_state
            self.current_state = new_initial_state
        
    def is_terminal(self, action, state):
        cell = (state[0], state[1])
        if action == 'drop' and cell in self.stop_states and state[2] == 'full':
            return True
        else:
            return False


In [3]:
class QLearning:
    #Dimensions[rows,cols]
    #Each cell element [row,col,value]
    def __init__(self, mdp, discount=0.9, alpha=0.5, iterations=False, epsilon=0.9):
        # Mdp is equivalent to env
        self.mdp = mdp
        self.alpha = alpha
        self.discount = discount
        self.iterations = iterations
        self.epsilon = epsilon
        self.q = {}
        state_actions = self.mdp.state_actions
        for state in state_actions.keys():
            for action in state_actions[state]:
                self.q[(state[0],state[1],state[2],action)] = 0

    def run_episode(self):
        is_terminal = False
        while not is_terminal:
            state1 = self.mdp.current_state
            action = self.choose_action(state1)
            action1 = action[0]
            is_terminal = self.mdp.is_terminal(action1, state1)
            res_do_action = self.mdp.do_action(action[0], self.mdp.current_state)
            state2 = res_do_action[1]
            action2 = self.choose_best_action(state2)[0]
            reward = res_do_action[0]
            self.action_function(state1,action1,reward,state2,action2)
            self.mdp.current_state = state2
    
    def run_value_iteration(self):
        # Begins at iteration 2 because first iteration is initializing rewards
        converge = 0
        i = 1
        while self.iterations >= i:
            i += 1
            self.mdp.init_state()
            self.run_episode()
        print("Total iterations: " + str(i))
    
    def action_function(self,state1,action1,reward,state2,action2):
        self.q[(state1[0], state1[1], state1[2], action1)] = (1-self.alpha)*self.q[(state1[0], state1[1], state1[2], action1)] + self.alpha*(reward + self.discount*self.q[(state2[0], state2[1], state1[2], action2)])
    
    def choose_best_action(self, state):
        possible_actions = self.mdp.get_possible_actions(state)
        best_actions = []
        best_q_value = -9999999
        for action in possible_actions:
            if len(best_actions) == 0:
                best_actions.append(action)
                best_q_value = self.q[(state[0], state[1], state[2], action)]
            else:
                if best_q_value == self.q[(state[0], state[1], state[2], action)]:
                    best_actions.append(action)
                elif best_q_value < self.q[(state[0], state[1], state[2], action)]:
                    best_actions = [action]
                    best_q_value = self.q[(state[0], state[1], state[2], action)]
                    
        best_action = random.choice(best_actions)
        return [best_action, best_action]
    
    def choose_action(self, state):
        possible_actions = self.mdp.get_possible_actions(state).copy()
        best_actions = []
        best_q_value = -9999999
        for action in possible_actions:
            if len(best_actions) == 0:
                best_actions.append(action)
                best_q_value = self.q[(state[0], state[1], state[2], action)]
            else:
                if best_q_value == self.q[(state[0], state[1], state[2], action)]:
                    best_actions.append(action)
                elif best_q_value < self.q[(state[0], state[1], state[2], action)]:
                    best_actions = [action]
                    best_q_value = self.q[(state[0], state[1], state[2], action)]
                    
        best_action = random.choice(best_actions)
        if random.random() < (1-self.epsilon) :
            return [best_action, best_action]
        else:
            possible_actions.remove(best_action)
            return [random.choice(possible_actions), best_action]


In [15]:
grid = Gridworld(cells={(0, 0): 'lt', (0, 1): 'tr',(0, 2): 'lt', (0, 3): 't', (0, 4): 'tr', (1, 0): 'l', (1, 1): 'r', (1, 2): 'l', (1, 4): 'r', (2,0): 'l', (2, 4):'r', (3,0):'lr', (3,1): 'l', (3,2):'r',(3,3):'l',(3,4):'r',(4,0):'lrb',(4,1):'lb',(4,2):'rb',(4,3):'lb',(4,4):'rb'}
                        ,dimensions = [5,5]
                        ,stop_states = [(0,0),(4,0),(0,4)]
                        ,passenger_state = (4,3))

iteration = QLearning(grid, discount = 0.8, alpha=0.2, iterations = 3000, epsilon=0.1)
iteration.run_value_iteration()

Total iterations: 3001


In [16]:
board = np.full((5, 5), float('-inf'))
for (x, y, taxi, action), value in iteration.q.items():
    if taxi == 'full':
        board[x][y] = max(board[x][y], value)
show_values(board)

-------------------------------------------------------
| 15.14   | 10.92   | 9.43    | 13.03   | 17.54   | 
-------------------------------------------------------
| 11.1    | 7.81    | 6.54    | 9.43    | 13.03   | 
-------------------------------------------------------
| 7.57    | 5.12    | 4.23    | 6.54    | 9.43    | 
-------------------------------------------------------
| 8.94    | 0.91    | 2.39    | 4.23    | 6.54    | 
-------------------------------------------------------
| 12.5    | -0.28   | 0.91    | 2.39    | 4.23    | 
-------------------------------------------------------


In [17]:
board = np.full((5, 5), float('-inf'))
for (x, y, taxi, action), value in iteration.q.items():
    if taxi == 'empty':
        board[x][y] = max(board[x][y], value)
show_values(board)

-------------------------------------------------------
| -2.42   | -1.78   | -0.98   | 0.03    | -0.98   | 
-------------------------------------------------------
| -1.78   | -0.98   | 0.03    | 1.29    | 0.03    | 
-------------------------------------------------------
| -0.97   | 0.03    | 1.29    | 2.86    | 1.29    | 
-------------------------------------------------------
| -1.78   | -0.97   | 0.03    | 4.83    | 2.86    | 
-------------------------------------------------------
| -2.43   | -1.78   | -0.98   | 7.29    | 4.82    | 
-------------------------------------------------------


In [18]:
from tabulate import tabulate

my_dict = iteration.q

# Create a dictionary to store the table data
table_data = {}

# Iterate through the keys in the dictionary
for key in my_dict.keys():
    # Extract x, y, and action from the key
    x, y, taxi_state, action = key

    # If x, y pair is not in table_data, create a new row with x, y as the key
    if (x, y, taxi_state) not in table_data:
        table_data[(x, y, taxi_state)] = {}

    # Set the value of the action column in the table data
    table_data[(x, y, taxi_state)][action] = round(my_dict[key]*2)/2

# Convert the dictionary to a list of rows
table_rows = []
for key, value in table_data.items():
    if key[2] == 'empty':
        row = list(key) + [value.get('up', '-'), value.get('down', '-'), value.get('left', '-'), value.get('right', '-'), value.get('pick', '-'), value.get('drop', '-')]
        table_rows.append(row)
    
for key, value in table_data.items():
    if key[2] == 'full':
        row = list(key) + [value.get('up', '-'), value.get('down', '-'), value.get('left', '-'), value.get('right', '-'), value.get('pick', '-'), value.get('drop', '-')]
        table_rows.append(row)

# Print the table using tabulate
print(tabulate(table_rows, headers=['x', 'y', 'taxi', 'up','down','left','right','pick','drop']))

  x    y  taxi    up    down    left    right      pick    drop
---  ---  ------  ----  ------  ------  -------  ------  ------
  0    0  empty   -     -2.5    -       -2.5      -10      -6.5
  0    1  empty   -     -2.0    -3.0    -          -6.5    -8
  0    2  empty   -     -1.0    -       -1.5       -8      -6.5
  0    3  empty   -     0.0     -1.5    -1.5       -7      -6.5
  0    4  empty   -     -1.0    -1.5    -          -4      -5.5
  1    0  empty   -2.5  -2.0    -       -2.0       -8.5   -10
  1    1  empty   -2.5  -1.0    -2.0    -          -6.5    -8.5
  1    2  empty   -1.5  -1.0    -       0.0        -7      -7
  1    3  empty   -1.0  1.5     -1.5    -1.0       -8.5    -7
  1    4  empty   -1.5  -0.5    0.0     -          -7      -8
  2    0  empty   -2.5  -2.5    -       -1.0      -10      -9.5
  2    1  empty   -2.0  -2.0    -2.0    0.0        -9.5   -10
  2    2  empty   -1.0  -1.0    -1.0    1.5        -9      -9
  2    3  empty   0.0   3.0     0.0     0.0        -7.