In [118]:
from numpy import genfromtxt
import numpy as np
import os
from collections import defaultdict
from itertools import permutations, repeat
import random
from numpy import random as numpy_random

## Setup

In [119]:
# The velocity is also discrete, a number of grid cells 
# moved horizontally and vertically per time step.

# Actions:
# The actions are increments to the velocity components. 
# Each may be changed by +1, −1, or 0 in one step, 
# for a total of nine actions.

# Both velocity components are restricted to be nonnegative 
# and less than 5, and they cannot both be zero except at the 
# starting line.

# Episodes:
# Each episode begins in one of the randomly selected 
# start states with both velocity components zero and 
# ends when the car crosses the finish line.

# Rewards:
# The rewards are −1 for each step until the car crosses 
# the finish line.

# If the car hits the track boundary, it is moved back 
# to a random position on the starting line, both velocity 
# components are reduced to zero, and the episode continues.

# With probability 0.1 at each time step the velocity increments are both zero, 
# independently of the intended increments.

## On-policy first-visit MC control (for ε-soft policies), estimates π ≈ π∗

In [117]:
# Cell type codes used in the track grid loaded from the map CSV.
CELL_TYPE_WALL = 0  # Black boxes
CELL_TYPE_TRACK = 1
CELL_TYPE_GOAL = 2
CELL_TYPE_START = 3


class RaceTrack:
    """Racetrack environment with on-policy first-visit Monte Carlo control.

    A state is ``[y, x, v_y, v_x]`` (grid row, grid column and the two
    velocity components); an action is ``[a_y, a_x]`` with each increment in
    ``{-1, 0, +1}``.  ``q`` and ``pi_probabilities`` are 6-dimensional arrays
    indexed ``[y, x, v_y, v_x, a_y, a_x]``.  The increment ``-1`` is stored at
    index ``-1`` (i.e. the last slot, index 2) of the two action axes.

    State-action pairs are used as dictionary keys in their
    ``str([y, x, v_y, v_x, a_y, a_x])`` form, e.g. ``'[0, 8, 0, 0, 1, 1]'``.

    NOTE(review): ``zero_velocity_prob`` is stored but never used, so the
    "increments are zeroed with probability 0.1" rule described in the
    notebook is not simulated.
    NOTE(review): velocities are used directly as array indices, which
    assumes ``min_vel == 0`` -- confirm before using another lower bound.
    """

    def __init__(self, track, zero_velocity_prob=0.9, max_vel=5, min_vel=0, gamma=0.9, epsilon=0.9):
        """Store the track and hyper-parameters and build Q, Returns and the policy.

        Args:
            track: 2-D array of ``CELL_TYPE_*`` codes.
            zero_velocity_prob: stored on the instance; currently unused.
            max_vel: largest allowed value of each velocity component (inclusive).
            min_vel: smallest allowed value of each velocity component (inclusive).
            gamma: discount factor applied to the per-step reward of -1.
            epsilon: exploration rate of the epsilon-soft policy.
        """
        self.track = track
        self.wall_cells = np.argwhere(track == CELL_TYPE_WALL)
        self.goal_cells = np.argwhere(track == CELL_TYPE_GOAL)
        self.start_cells = np.argwhere(track == CELL_TYPE_START)
        self.max_vel = max_vel
        self.min_vel = min_vel
        self.colors = ['black', 'white', 'yellow', 'red']  # For plotting
        self.gamma = gamma
        self.epsilon = epsilon

        self.zero_velocity_prob = zero_velocity_prob
        self.velocity_min = min_vel
        self.velocity_max = max_vel
        self.velocity_decrease_limit = -1
        self.velocity_increase_limit = 1

        # Q(s, a): one entry per (y, x, v_y, v_x, a_y, a_x).
        self.q = np.zeros(self._table_shape())

        # Returns(s, a): every first-visit return observed so far, per key.
        self.Returns = defaultdict(list)

        # Start from the uniform policy over the legal actions of each state;
        # illegal actions keep probability 0.
        self.pi_probabilities = np.zeros(self._table_shape(), dtype=float)
        for y_coord, x_coord, y_vel, x_vel in self._all_states():
            possible_actions = self.possible_actions((y_vel, x_vel))
            for y_vel_change, x_vel_change in possible_actions:
                self.pi_probabilities[y_coord, x_coord, y_vel, x_vel,
                                      y_vel_change, x_vel_change] = 1 / len(possible_actions)

    def _table_shape(self):
        """Return the shape shared by ``q`` and ``pi_probabilities``."""
        velocity_range = self.velocity_max - self.velocity_min + 1
        velocity_change_range = self.velocity_increase_limit - self.velocity_decrease_limit + 1
        return (self.track.shape[0],
                self.track.shape[1],
                velocity_range,
                velocity_range,
                velocity_change_range,
                velocity_change_range)

    def _all_states(self):
        """Yield every ``(y, x, v_y, v_x)`` combination of the state space."""
        for y_coord in range(self.track.shape[0]):
            for x_coord in range(self.track.shape[1]):
                for y_vel in range(self.velocity_min, self.velocity_max + 1):
                    for x_vel in range(self.velocity_min, self.velocity_max + 1):
                        yield y_coord, x_coord, y_vel, x_vel

    @staticmethod
    def _parse_state_action(key):
        """Turn a ``'[y, x, v_y, v_x, a_y, a_x]'`` key back into a tuple of ints."""
        return tuple(int(token) for token in key.strip('[]').split(','))

    def _state_in_bounds(self, state):
        """Return True if ``state`` is a valid index into the first four axes of ``q``."""
        return len(state) == 4 and all(0 <= index < size
                                       for index, size in zip(state, self.q.shape[:4]))

    def _print_debug_q(self, state):
        """Print Q for four fixed actions of ``state`` (same format as the original log)."""
        for action in ([1, 0], [1, 1], [0, 1], [0, -1]):
            print('Q-value  {}:'.format(list(state) + action),
                  self.q[tuple(state) + tuple(action)])

    def policy_iteration(self, max_iterations=None, verbose=True, debug_state=(0, 8, 0, 0)):
        """Run on-policy first-visit MC control until the policy stops changing.

        Each iteration generates one episode, appends its first-visit returns
        to ``Returns``, refreshes ``q`` with the mean return, rebuilds the
        epsilon-soft policy and finally decays ``epsilon`` to
        ``1 / sqrt(k + 1.1)``.

        Args:
            max_iterations: optional cap on the number of episodes; ``None``
                (default) keeps iterating until convergence.
            verbose: print progress (default, matches the original output).
            debug_state: ``(y, x, v_y, v_x)`` whose Q-values are logged.  It is
                skipped silently when it does not fit the track, so small
                tracks no longer raise ``IndexError``.

        Returns:
            The learned Q array (``self.q``).
        """
        show_q = verbose and self._state_in_bounds(debug_state)
        policy_improvement = False

        k = 0
        while not policy_improvement and (max_iterations is None or k < max_iterations):
            if verbose:
                print('Iteration {}'.format(k))

            # Generate an episode
            G = self.generate_episode(verbose=verbose)

            for s_a, episode_return in G.items():
                self.Returns[s_a].append(episode_return)

            if verbose:
                print('Calculate averages in Q(s, a):')
            if show_q:
                self._print_debug_q(debug_state)

            # Only pairs visited in this episode received a new return, so
            # only their averages can have changed.
            for s_a in G:
                self.q[self._parse_state_action(s_a)] = np.average(self.Returns[s_a])

            if show_q:
                self._print_debug_q(debug_state)

            # Old policy
            old_policy = self.pi_probabilities.copy()

            # Update pi(a | s)
            self.update_policy()

            # Check if convergence
            if np.allclose(old_policy, self.pi_probabilities, atol=0.0005):
                if verbose:
                    print('Policy iteration converged.')
                policy_improvement = True

            # Counter and update epsilon
            self.epsilon = 1 / (np.sqrt(k + 1.1))

            k += 1

        return self.q

    def update_policy(self):
        """Rebuild ``pi_probabilities`` as the epsilon-soft policy greedy w.r.t. ``q``."""
        self.pi_probabilities = np.zeros(self._table_shape(), dtype=float)

        for y_coord, x_coord, y_vel, x_vel in self._all_states():
            possible_actions = self.possible_actions((y_vel, x_vel))
            if not possible_actions:
                # No legal action for this velocity: leave all probabilities at 0.
                continue
            greedy_action = self.greedy_action(state=[y_coord, x_coord, y_vel, x_vel],
                                               possible_actions=possible_actions)
            for y_vel_change, x_vel_change in possible_actions:
                self.pi_probabilities[y_coord, x_coord, y_vel, x_vel,
                                      y_vel_change, x_vel_change] = self.epsilon_soft_policy(
                    action=[y_vel_change, x_vel_change],
                    greedy_action=greedy_action,
                    all_state_actions=possible_actions)

    def generate_episode(self, verbose=True):
        """Simulate one episode under the current policy.

        The car starts on a random start cell with zero velocity.  Leaving the
        grid or landing on a wall cell sends it back to a random start cell
        with zero velocity; the episode ends once the rectangle spanned by a
        move contains a goal cell.

        Args:
            verbose: print the termination message and the step count.

        Returns:
            Dict mapping each visited ``'[y, x, v_y, v_x, a_y, a_x]'`` key to
            the discounted return following its first occurrence.
        """
        position = self.random_start_position()
        first_occurence = {}

        step = 0
        while True:
            step += 1

            # Sample action
            action = self.sample_action_from_state(position)

            # Remember only the first time each (s, a) pair shows up.
            first_occurence.setdefault(str(position + action), step)

            old_y, old_x = position[0], position[1]

            # Apply the velocity increments, then move by the new velocity.
            position[2] += action[0]
            position[3] += action[1]
            position[0] += position[2]
            position[1] += position[3]

            # Check if goal is reached (is it in the projected rectangle)
            grid_states_to_check = self.get_all_grid_cells_in_projected_retcangle(
                current_state=[old_y, old_x],
                new_state=[position[0], position[1]])
            if self.check_if_goal_is_reached(check_grid_states=grid_states_to_check):
                if verbose:
                    print('-- Goal Reached. Terminating Episode.')
                break

            # Check if car hits boundary or a wall cell
            off_grid = (position[0] < 0 or position[1] < 0
                        or position[0] >= self.track.shape[0]
                        or position[1] >= self.track.shape[1])
            if off_grid or self.track[position[0], position[1]] == CELL_TYPE_WALL:
                position = self.random_start_position()

        if verbose:
            print('Steps {}'.format(step))

        return self._get_G_values(first_occurence_dict=first_occurence, total_steps=step)

    def check_if_goal_is_reached(self, check_grid_states):
        """Return True if any in-bounds cell of ``check_grid_states`` is a goal cell.

        Cells outside the grid are ignored.
        """
        n_rows, n_cols = self.track.shape[0], self.track.shape[1]
        return any(self.track[y, x] == CELL_TYPE_GOAL
                   for y, x in check_grid_states
                   if 0 <= y < n_rows and 0 <= x < n_cols)

    def get_all_grid_cells_in_projected_retcangle(self, current_state, new_state):
        """List every ``[y, x]`` cell of the rectangle spanned by a move.

        Both corners are included.  A negative displacement along an axis
        yields an empty list because ``range`` is then empty.
        """
        y_coord_current = current_state[0]
        x_coord_current = current_state[1]

        y_diff = new_state[0] - current_state[0]
        x_diff = new_state[1] - current_state[1]

        return [[y_coord_current + y, x_coord_current + x]
                for y in range(0, y_diff + 1)
                for x in range(0, x_diff + 1)]

    def sample_action_from_state(self, state):
        """Draw an action ``[a_y, a_x]`` from ``pi(. | state)``.

        The action slice of the policy is flattened row-major, one flat index
        is sampled and converted back to (row, column); index 2 on either axis
        is mapped to the increment -1.
        """
        action_probabilities = self.pi_probabilities[state[0], state[1], state[2], state[3]]

        flat_index = numpy_random.choice(action_probabilities.size,
                                         p=action_probabilities.flatten())
        row, col = divmod(int(flat_index), action_probabilities.shape[1])

        return self.center_axis_around_zero(coordinates=[row, col],
                                            list_range=range(action_probabilities.shape[0]))

    def center_axis_around_zero(self, coordinates, list_range):
        """Map array indices to signed increments (e.g. 2 -> -1 for a range of 3).

        Indices above half the axis length are shifted down by the axis
        length; all other values are returned unchanged.
        """
        return [x - len(list_range) if x > len(list_range) / 2 else x for x in coordinates]

    def greedy_action(self, state, possible_actions):
        """Return the action in ``possible_actions`` with the highest Q-value.

        Ties are resolved in favour of the earliest action in the list.
        """
        def action_value(action):
            return self.q[state[0], state[1], state[2], state[3], action[0], action[1]]

        # max() keeps the first maximal element, like a strict ">" scan.
        best_action = max(possible_actions, key=action_value)

        return self.center_axis_around_zero(coordinates=best_action, list_range=range(3))

    def epsilon_soft_policy(self, action, greedy_action, all_state_actions):
        """Return the epsilon-soft probability of ``action``.

        The greedy action gets ``1 - epsilon + epsilon / n``; every other
        legal action gets ``epsilon / n`` (``n`` = number of legal actions).
        """
        if action == greedy_action:
            return 1 - self.epsilon + self.epsilon / len(all_state_actions)

        return self.epsilon / len(all_state_actions)

    def random_start_position(self):
        """Return ``[y, x, 0, 0]`` for a uniformly chosen start cell.

        Coordinates are converted to plain Python ints so that the
        ``str(state + action)`` dictionary keys always look like
        ``'[0, 8, 0, 0, 1, 0]'``, whatever NumPy's scalar repr is.

        Raises:
            IndexError: if the track has no start cell.
        """
        if len(self.start_cells) == 0:
            raise IndexError('track contains no start cells')

        # Index explicitly instead of random.choice(ndarray), which depends on
        # how the sequence's truthiness/length is probed.
        y_coord, x_coord = self.start_cells[random.randrange(len(self.start_cells))]

        return [int(y_coord), int(x_coord), 0, 0]

    def possible_actions(self, velocity):
        """List the legal velocity increments ``[a_y, a_x]`` for ``velocity``.

        An increment is legal when both resulting components stay within
        ``[min_vel, max_vel]`` and are not both zero.

        Credit: Andreas
        """
        v_y, v_x = velocity
        legal_actions = []

        for a_y in range(-1, 2):
            for a_x in range(-1, 2):
                new_v_y = v_y + a_y
                new_v_x = v_x + a_x
                # Stay within the speed limits in both directions
                if not (self.min_vel <= new_v_x <= self.max_vel):
                    continue
                if not (self.min_vel <= new_v_y <= self.max_vel):
                    continue
                # Cannot come to a standstill
                if new_v_x == 0 and new_v_y == 0:
                    continue
                legal_actions.append([a_y, a_x])

        return legal_actions

    def _get_G_values(self, first_occurence_dict, total_steps):
        """Compute the discounted first-visit return for every (s, a) key.

        Every step of the episode, including the one that crosses the finish
        line, yields a reward of -1.  A pair first seen at step ``t`` of a
        ``T``-step episode therefore collects ``T - t + 1`` rewards.

        Args:
            first_occurence_dict: key -> 1-based step of its first occurrence.
            total_steps: number of steps ``T`` in the episode.

        Returns:
            ``defaultdict(int)`` mapping each key to its return.
        """
        G = defaultdict(int)  # Dict. w/ G for first occurence for each s, a pair.

        for key, first_step in first_occurence_dict.items():
            number_rewards = total_steps - first_step + 1
            G[key] = sum(self.gamma**k * (-1) for k in range(number_rewards))

        return G

    @classmethod
    def from_csv(cls, file_path, **kwargs):
        """Build a RaceTrack from a comma-separated grid of cell codes.

        The path is resolved against the current working directory and the
        rows are flipped vertically.  Extra keyword arguments are forwarded
        to the constructor.
        """
        file_path = os.path.join(os.getcwd(), file_path)

        track = genfromtxt(file_path, delimiter=',')
        track = np.flip(track, axis=0)

        return cls(track, **kwargs)
                    

In [91]:
# Roll out one episode under the current policy; G maps each visited
# "[y, x, v_y, v_x, a_y, a_x]" key to the return after its first occurrence.
# NOTE(review): `rt` is only created by the `RaceTrack.from_csv(...)` cell further
# down, so on a fresh kernel that cell has to run before this one.
G = rt.generate_episode()

-- Goal Reached. Terminating Episode.
Steps 637


In [113]:
# Build the environment from map1. from_csv resolves the path against the current
# working directory and flips the grid rows vertically before constructing the class.
rt = RaceTrack.from_csv("../racetracks/map1.csv")

In [114]:
# Run Monte Carlo control: one episode per iteration, until two consecutive
# policies agree within the tolerance used inside policy_iteration.
# NOTE(review): the learned action values are stored on `rt.q`.
q = rt.policy_iteration()

Iteration 0
-- Goal Reached. Terminating Episode.
Steps 1104
--EPSILON 0.9
Calculate averages in Q(s, a):
Q-value  [0, 8, 0, 0, 1, 0]: 0.0
Q-value  [0, 8, 0, 0, 1, 1]: 0.0
Q-value  [0, 8, 0, 0, 0, 1]: 0.0
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Q-value  [0, 8, 0, 0, 1, 0]: -9.999999999999993
Q-value  [0, 8, 0, 0, 1, 1]: -9.999999999999993
Q-value  [0, 8, 0, 0, 0, 1]: -9.999999999999993
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Iteration 1
-- Goal Reached. Terminating Episode.
Steps 166
--EPSILON 0.9534625892455922
Calculate averages in Q(s, a):
Q-value  [0, 8, 0, 0, 1, 0]: -9.999999999999993
Q-value  [0, 8, 0, 0, 1, 1]: -9.999999999999993
Q-value  [0, 8, 0, 0, 0, 1]: -9.999999999999993
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Q-value  [0, 8, 0, 0, 1, 0]: -9.999722333566368
Q-value  [0, 8, 0, 0, 1, 1]: -9.874842224750335
Q-value  [0, 8, 0, 0, 0, 1]: -9.999999806689011
Q-value  [0, 8, 0, 0, 0, -1]: 0.0


In [115]:
# Every return recorded so far for state (y=0, x=8, v_y=0, v_x=0) and
# action (a_y=+1, a_x=+1); Q for this pair is the mean of this list.
rt.Returns['[0, 8, 0, 0, 1, 1]']

[-9.999999999999993, -9.749684449500677]

In [116]:
# 3x3 action-value slice for state (y=0, x=8, v_y=0, v_x=0): rows index a_y and
# columns a_x in the order 0, +1, -1 (the increment -1 lives at index -1, i.e. 2).
# NOTE(review): relies on `q` assigned in the policy_iteration cell above.
q[0, 8, 0, 0, :, :]

array([[ 0.        , -9.99999981,  0.        ],
       [-9.99972233, -9.87484222,  0.        ],
       [ 0.        ,  0.        ,  0.        ]])

In [87]:
# Earlier training run (execution count 87) kept for its log.
# NOTE(review): the output contains "--EPSILON" and "Append G to Returns" lines that
# are commented out in the class shown above, so it presumably came from an older
# version of the cell with extra debug prints enabled.
rt.policy_iteration()

Iteration 0
-- Goal Reached. Terminating Episode.
Steps 613
--EPSILON 0.9
Append G to Returns(s, a)
G[0, 8, 0, 0, 1, 0] -9.999999999999993
self.Returns[0, 8, 0, 0, 1, 0] []
self.Returns[0, 8, 0, 0, 1, 0] [-9.999999999999993]
Calculate averages in Q(s, a):
Q-value  [0, 8, 0, 0, 1, 0]: 0.0
Q-value  [0, 8, 0, 0, 1, 1]: 0.0
Q-value  [0, 8, 0, 0, 0, 1]: 0.0
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Q-value  [0, 8, 0, 0, 1, 0]: -9.999999999999993
Q-value  [0, 8, 0, 0, 1, 1]: -9.999999999999993
Q-value  [0, 8, 0, 0, 0, 1]: -9.999999999999993
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Iteration 1
-- Goal Reached. Terminating Episode.
Steps 1896
--EPSILON 0.9534625892455922
Append G to Returns(s, a)
G[0, 8, 0, 0, 1, 0] -9.999999999999993
self.Returns[0, 8, 0, 0, 1, 0] [-9.999999999999993]
self.Returns[0, 8, 0, 0, 1, 0] [-9.999999999999993, -9.999999999999993]
Calculate averages in Q(s, a):
Q-value  [0, 8, 0, 0, 1, 0]: -9.999999999999993
Q-value  [0, 8, 0, 0, 1, 1]: -9.999999999999993
Q-value  [0, 8, 0, 0, 0, 1]

Q-value  [0, 8, 0, 0, 1, 0]: -9.990186121598896
Q-value  [0, 8, 0, 0, 1, 1]: -9.989280276852007
Q-value  [0, 8, 0, 0, 0, 1]: -9.999989041270124
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Iteration 12
-- Goal Reached. Terminating Episode.
Steps 475
--EPSILON 0.2874797872880345
Append G to Returns(s, a)
G[0, 8, 0, 0, 1, 0] -9.999999999999993
self.Returns[0, 8, 0, 0, 1, 0] [-9.999999999999993, -9.999999999999993, -9.999999999999993, -9.999999999999993, -9.999999999999993, -9.999999999999993, -9.903022627021246, -9.999999652040229, -9.999999999997884, -9.998838936929646]
self.Returns[0, 8, 0, 0, 1, 0] [-9.999999999999993, -9.999999999999993, -9.999999999999993, -9.999999999999993, -9.999999999999993, -9.999999999999993, -9.903022627021246, -9.999999652040229, -9.999999999997884, -9.998838936929646, -9.999999999999993]
Calculate averages in Q(s, a):
Q-value  [0, 8, 0, 0, 1, 0]: -9.990186121598896
Q-value  [0, 8, 0, 0, 1, 1]: -9.989280276852007
Q-value  [0, 8, 0, 0, 0, 1]: -9.999989041270124
Q-value 

Q-value  [0, 8, 0, 0, 1, 0]: -9.988750830127007
Q-value  [0, 8, 0, 0, 1, 1]: -9.991721096637292
Q-value  [0, 8, 0, 0, 0, 1]: -9.96575914454044
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Iteration 20
-- Goal Reached. Terminating Episode.
Steps 54
--EPSILON 0.22304986837273524
Append G to Returns(s, a)
Calculate averages in Q(s, a):
Q-value  [0, 8, 0, 0, 1, 0]: -9.988750830127007
Q-value  [0, 8, 0, 0, 1, 1]: -9.991721096637292
Q-value  [0, 8, 0, 0, 0, 1]: -9.96575914454044
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Q-value  [0, 8, 0, 0, 1, 0]: -9.988750830127007
Q-value  [0, 8, 0, 0, 1, 1]: -9.991721096637292
Q-value  [0, 8, 0, 0, 0, 1]: -9.96575914454044
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Iteration 21
-- Goal Reached. Terminating Episode.
Steps 219
--EPSILON 0.21770017209205406
Append G to Returns(s, a)
Calculate averages in Q(s, a):
Q-value  [0, 8, 0, 0, 1, 0]: -9.988750830127007
Q-value  [0, 8, 0, 0, 1, 1]: -9.991721096637292
Q-value  [0, 8, 0, 0, 0, 1]: -9.96575914454044
Q-value  [0, 8, 0, 0, 0, -1]: 0

Q-value  [0, 8, 0, 0, 1, 0]: -9.987697628754367
Q-value  [0, 8, 0, 0, 1, 1]: -9.853774180997048
Q-value  [0, 8, 0, 0, 0, 1]: -9.96778398576832
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Iteration 33
-- Goal Reached. Terminating Episode.
Steps 85
--EPSILON 0.17381449986274955
Append G to Returns(s, a)
Calculate averages in Q(s, a):
Q-value  [0, 8, 0, 0, 1, 0]: -9.987697628754367
Q-value  [0, 8, 0, 0, 1, 1]: -9.853774180997048
Q-value  [0, 8, 0, 0, 0, 1]: -9.96778398576832
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Q-value  [0, 8, 0, 0, 1, 0]: -9.987697628754367
Q-value  [0, 8, 0, 0, 1, 1]: -9.860340370794715
Q-value  [0, 8, 0, 0, 0, 1]: -9.96778398576832
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Iteration 34
-- Goal Reached. Terminating Episode.
Steps 88
--EPSILON 0.17124693631268542
Append G to Returns(s, a)
Calculate averages in Q(s, a):
Q-value  [0, 8, 0, 0, 1, 0]: -9.987697628754367
Q-value  [0, 8, 0, 0, 1, 1]: -9.860340370794715
Q-value  [0, 8, 0, 0, 0, 1]: -9.96778398576832
Q-value  [0, 8, 0, 0, 0, -1]: 0.

Q-value  [0, 8, 0, 0, 1, 0]: -9.952558551997699
Q-value  [0, 8, 0, 0, 1, 1]: -9.836887206432838
Q-value  [0, 8, 0, 0, 0, 1]: -9.972137628325726
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Iteration 46
-- Goal Reached. Terminating Episode.
Steps 41
--EPSILON 0.1472819539849714
Append G to Returns(s, a)
Calculate averages in Q(s, a):
Q-value  [0, 8, 0, 0, 1, 0]: -9.952558551997699
Q-value  [0, 8, 0, 0, 1, 1]: -9.836887206432838
Q-value  [0, 8, 0, 0, 0, 1]: -9.972137628325726
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Q-value  [0, 8, 0, 0, 1, 0]: -9.952558551997699
Q-value  [0, 8, 0, 0, 1, 1]: -9.764803781137026
Q-value  [0, 8, 0, 0, 0, 1]: -9.972137628325726
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Iteration 47
-- Goal Reached. Terminating Episode.
Steps 63
--EPSILON 0.145710063157312
Append G to Returns(s, a)
Calculate averages in Q(s, a):
Q-value  [0, 8, 0, 0, 1, 0]: -9.952558551997699
Q-value  [0, 8, 0, 0, 1, 1]: -9.764803781137026
Q-value  [0, 8, 0, 0, 0, 1]: -9.972137628325726
Q-value  [0, 8, 0, 0, 0, -1]: 0

Q-value  [0, 8, 0, 0, 1, 0]: -9.923275988049193
Q-value  [0, 8, 0, 0, 1, 1]: -9.81002580449355
Q-value  [0, 8, 0, 0, 0, 1]: -9.973995042495037
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Iteration 59
-- Goal Reached. Terminating Episode.
Steps 233
--EPSILON 0.13007872144692093
Append G to Returns(s, a)
Calculate averages in Q(s, a):
Q-value  [0, 8, 0, 0, 1, 0]: -9.923275988049193
Q-value  [0, 8, 0, 0, 1, 1]: -9.81002580449355
Q-value  [0, 8, 0, 0, 0, 1]: -9.973995042495037
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Q-value  [0, 8, 0, 0, 1, 0]: -9.923275988049193
Q-value  [0, 8, 0, 0, 1, 1]: -9.814548999308576
Q-value  [0, 8, 0, 0, 0, 1]: -9.973995042495037
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Iteration 60
-- Goal Reached. Terminating Episode.
Steps 95
--EPSILON 0.12899199629493716
Append G to Returns(s, a)
G[0, 8, 0, 0, 1, 0] -9.99588901683294
self.Returns[0, 8, 0, 0, 1, 0] [-9.999999999999993, -9.999999999999993, -9.999999999999993, -9.999999999999993, -9.999999999999993, -9.999999999999993, -9.90302262702

Iteration 73
-- Goal Reached. Terminating Episode.
Steps 255
--EPSILON 0.11696106429438609
Append G to Returns(s, a)
G[0, 8, 0, 0, 1, 0] -9.994360791266038
self.Returns[0, 8, 0, 0, 1, 0] [-9.999999999999993, -9.999999999999993, -9.999999999999993, -9.999999999999993, -9.999999999999993, -9.999999999999993, -9.903022627021246, -9.999999652040229, -9.999999999997884, -9.998838936929646, -9.999999999999993, -9.99303801390869, -9.999999999999993, -9.999999613378032, -9.999999999667699, -9.999998309981741, -9.903022627021246, -9.99959516233977, -9.983826907300768, -9.972610725500465, -9.202335569231275, -9.999999999630775, -9.999999999999993, -9.202335569231275, -9.99588901683294]
self.Returns[0, 8, 0, 0, 1, 0] [-9.999999999999993, -9.999999999999993, -9.999999999999993, -9.999999999999993, -9.999999999999993, -9.999999999999993, -9.903022627021246, -9.999999652040229, -9.999999999997884, -9.998838936929646, -9.999999999999993, -9.99303801390869, -9.999999999999993, -9.999999613378032, -9.9

Q-value  [0, 8, 0, 0, 1, 0]: -9.928802827741523
Q-value  [0, 8, 0, 0, 1, 1]: -9.807944650683966
Q-value  [0, 8, 0, 0, 0, 1]: -9.93981729099744
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Iteration 88
-- Goal Reached. Terminating Episode.
Steps 92
--EPSILON 0.10653984136442511
Append G to Returns(s, a)
Calculate averages in Q(s, a):
Q-value  [0, 8, 0, 0, 1, 0]: -9.928802827741523
Q-value  [0, 8, 0, 0, 1, 1]: -9.807944650683966
Q-value  [0, 8, 0, 0, 0, 1]: -9.93981729099744
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Q-value  [0, 8, 0, 0, 1, 0]: -9.928802827741523
Q-value  [0, 8, 0, 0, 1, 1]: -9.807944650683966
Q-value  [0, 8, 0, 0, 0, 1]: -9.93981729099744
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Iteration 89
-- Goal Reached. Terminating Episode.
Steps 108
--EPSILON 0.10594028769395471
Append G to Returns(s, a)
Calculate averages in Q(s, a):
Q-value  [0, 8, 0, 0, 1, 0]: -9.928802827741523
Q-value  [0, 8, 0, 0, 1, 1]: -9.807944650683966
Q-value  [0, 8, 0, 0, 0, 1]: -9.93981729099744
Q-value  [0, 8, 0, 0, 0, -1]: 0

Iteration 102
-- Goal Reached. Terminating Episode.
Steps 15
--EPSILON 0.09896625331298045
Append G to Returns(s, a)
Calculate averages in Q(s, a):
Q-value  [0, 8, 0, 0, 1, 0]: -9.927448921653205
Q-value  [0, 8, 0, 0, 1, 1]: -9.739091714830783
Q-value  [0, 8, 0, 0, 0, 1]: -9.93981729099744
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Q-value  [0, 8, 0, 0, 1, 0]: -9.927448921653205
Q-value  [0, 8, 0, 0, 1, 1]: -9.739091714830783
Q-value  [0, 8, 0, 0, 0, 1]: -9.93981729099744
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Iteration 103
-- Goal Reached. Terminating Episode.
Steps 11
--EPSILON 0.09848513109869263
Append G to Returns(s, a)
Calculate averages in Q(s, a):
Q-value  [0, 8, 0, 0, 1, 0]: -9.927448921653205
Q-value  [0, 8, 0, 0, 1, 1]: -9.739091714830783
Q-value  [0, 8, 0, 0, 0, 1]: -9.93981729099744
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Q-value  [0, 8, 0, 0, 1, 0]: -9.927448921653205
Q-value  [0, 8, 0, 0, 1, 1]: -9.739091714830783
Q-value  [0, 8, 0, 0, 0, 1]: -9.93981729099744
Q-value  [0, 8, 0, 0, 0, -1]: 

Q-value  [0, 8, 0, 0, 1, 0]: -9.927448921653205
Q-value  [0, 8, 0, 0, 1, 1]: -9.587294190864535
Q-value  [0, 8, 0, 0, 0, 1]: -9.93981729099744
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Iteration 119
-- Goal Reached. Terminating Episode.
Steps 55
--EPSILON 0.09163135721752008
Append G to Returns(s, a)
Calculate averages in Q(s, a):
Q-value  [0, 8, 0, 0, 1, 0]: -9.927448921653205
Q-value  [0, 8, 0, 0, 1, 1]: -9.587294190864535
Q-value  [0, 8, 0, 0, 0, 1]: -9.93981729099744
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Q-value  [0, 8, 0, 0, 1, 0]: -9.927448921653205
Q-value  [0, 8, 0, 0, 1, 1]: -9.590873879779684
Q-value  [0, 8, 0, 0, 0, 1]: -9.93981729099744
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Iteration 120
-- Goal Reached. Terminating Episode.
Steps 39
--EPSILON 0.09124908038499573
Append G to Returns(s, a)
Calculate averages in Q(s, a):
Q-value  [0, 8, 0, 0, 1, 0]: -9.927448921653205
Q-value  [0, 8, 0, 0, 1, 1]: -9.590873879779684
Q-value  [0, 8, 0, 0, 0, 1]: -9.93981729099744
Q-value  [0, 8, 0, 0, 0, -1]: 

Q-value  [0, 8, 0, 0, 1, 0]: -9.927448921653205
Q-value  [0, 8, 0, 0, 1, 1]: -9.529196531270212
Q-value  [0, 8, 0, 0, 0, 1]: -9.9387952169981
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Iteration 136
-- Goal Reached. Terminating Episode.
Steps 15
--EPSILON 0.0857177844707708
Append G to Returns(s, a)
Calculate averages in Q(s, a):
Q-value  [0, 8, 0, 0, 1, 0]: -9.927448921653205
Q-value  [0, 8, 0, 0, 1, 1]: -9.529196531270212
Q-value  [0, 8, 0, 0, 0, 1]: -9.9387952169981
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Q-value  [0, 8, 0, 0, 1, 0]: -9.927448921653205
Q-value  [0, 8, 0, 0, 1, 1]: -9.529196531270212
Q-value  [0, 8, 0, 0, 0, 1]: -9.9387952169981
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Iteration 137
-- Goal Reached. Terminating Episode.
Steps 17
--EPSILON 0.08540460189474228
Append G to Returns(s, a)
Calculate averages in Q(s, a):
Q-value  [0, 8, 0, 0, 1, 0]: -9.927448921653205
Q-value  [0, 8, 0, 0, 1, 1]: -9.529196531270212
Q-value  [0, 8, 0, 0, 0, 1]: -9.9387952169981
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Q

In [None]:
# Persist the learned Q array. np.save writes NumPy's binary .npy format (not CSV)
# and appends ".npy" when the name does not end with it, so the file on disk is
# "On_Policy_Q_matrix_Joakim.csv.npy"; load it back with np.load.
np.save("On_Policy_Q_matrix_Joakim.csv", rt.q, allow_pickle=True, fix_imports=True)

In [77]:
# Action probabilities of the current epsilon-soft policy at state
# (y=0, x=8, v_y=0, v_x=0); actions that are illegal for this velocity stay at 0.
rt.pi_probabilities[0, 8, 0, 0, :, :]

array([[0.        , 0.93090702, 0.        ],
       [0.03454649, 0.03454649, 0.        ],
       [0.        , 0.        , 0.        ]])

## Off-policy MC prediction, for estimating Q ≈ qπ

In [32]:
# Cell type codes of the track grid; same values as the constants defined in the
# on-policy section above (wall cells are the black boxes).
(CELL_TYPE_WALL,
 CELL_TYPE_TRACK,
 CELL_TYPE_GOAL,
 CELL_TYPE_START) = 0, 1, 2, 3

class RaceTrack:
    def __init__(self, track, zero_velocity_prob=0.9, max_vel=5, min_vel=0, gamma=0.25, epsilon=0.9):
        self.track = track
        self.wall_cells = np.argwhere(track == CELL_TYPE_WALL)
        self.goal_cells = np.argwhere(track == CELL_TYPE_GOAL)
        self.start_cells = np.argwhere(track == CELL_TYPE_START)
        self.max_vel = max_vel
        self.min_vel = min_vel
        self.colors = ['black', 'white', 'yellow', 'red']  # For plotting
        self.gamma = gamma
        self.epsilon = epsilon
        
        self.zero_velocity_prob = zero_velocity_prob
        self.velocity_min = min_vel
        self.velocity_max = max_vel
        self.velocity_decrease_limit = -1
        self.velocity_increase_limit = 1
        
        # Q-Matrix - a 6 dimensional vector for states, velocity, and action in both directions.
        
        # Q(s, a)
        velocity_range = self.velocity_max - self.velocity_min + 1  # Minus for other direction.
        velocity_change_range = self.velocity_increase_limit  - self.velocity_decrease_limit  + 1        
        self.q = np.zeros((self.track.shape[0], self.track.shape[1], velocity_range, velocity_range, velocity_change_range, velocity_change_range))
        
        # Returns 
        self.Returns = defaultdict(list)

        # Initialize policy
        # NB: limit actions depending on action
        self.pi_probabilities = np.zeros((self.track.shape[0], 
                                          self.track.shape[1], 
                                          velocity_range, 
                                          velocity_range, 
                                          velocity_change_range, 
                                          velocity_change_range), dtype=float)

        # Initialize with equal probabilties for all possible actions        
        for y_coord in range(self.pi_probabilities.shape[0]):
            for x_coord in range(self.pi_probabilities.shape[1]):
                for y_vel in range(self.velocity_min, self.velocity_max + 1):
                    for x_vel in range(self.velocity_min, self.velocity_max + 1):
                        possible_actions = self.possible_actions((y_vel, x_vel))
                        for y_vel_change, x_vel_change in possible_actions:
                            self.pi_probabilities[y_coord, x_coord, y_vel, x_vel, y_vel_change, x_vel_change] = 1/len(possible_actions)
                                
    def policy_iteration(self):
        """
        """
        
        policy_improvement = False
        
        k=0
        while not policy_improvement:
            print('Iteration {}'.format(k))
            # Generate an episode
            G = self.generate_episode()

            print('--EPSILON', self.epsilon)
            
            # Append G to Returns(s, a)
#             if '[0, 8, 0, 0, 1, 0]' in G.keys():            
#                 print('G[0, 8, 0, 0, 1, 0]', G['[0, 8, 0, 0, 1, 0]'])
#                 print('self.Returns[0, 8, 0, 0, 1, 0]', self.Returns['[0, 8, 0, 0, 1, 0]'])
            for s_a in G.keys():
                self.Returns[s_a].append(G[s_a])
#             if '[0, 8, 0, 0, 1, 0]' in G.keys():            
#                 print('self.Returns[0, 8, 0, 0, 1, 0]', self.Returns['[0, 8, 0, 0, 1, 0]'])
            
            # Old Q(s, a)
            old_q = self.q.copy()
            
            # Calculate averages in Q(s, a)
#             print('Q-value  [0, 8, 0, 0, 1, 0]:', self.q[0, 8, 0, 0, 1, 0])
#             print('Q-value  [0, 8, 0, 0, 1, 1]:', self.q[0, 8, 0, 0, 1, 1])        
#             print('Q-value  [0, 8, 0, 0, 0, 1]:', self.q[0, 8, 0, 0, 0, 1])                
#             print('Q-value  [0, 8, 0, 0, 0, -1]:', self.q[0, 8, 0, 0, 0, -1])                
            for s_a in self.Returns.keys():
                self.q[eval(s_a)[0],
                       eval(s_a)[1],
                       eval(s_a)[2],
                       eval(s_a)[3],
                       eval(s_a)[4],
                       eval(s_a)[5]] = np.average(self.Returns[s_a])
#             print('Q-value  [0, 8, 0, 0, 1, 0]:', self.q[0, 8, 0, 0, 1, 0])
#             print('Q-value  [0, 8, 0, 0, 1, 1]:', self.q[0, 8, 0, 0, 1, 1])        
#             print('Q-value  [0, 8, 0, 0, 0, 1]:', self.q[0, 8, 0, 0, 0, 1])                
#             print('Q-value  [0, 8, 0, 0, 0, -1]:', self.q[0, 8, 0, 0, 0, -1])                
            
            # Q-diff
            q_diff = abs(old_q - self.q)
#            print('Q-diff: {}'.format(np.max(q_diff)))
            
            # Old policy
            old_policy = self.pi_probabilities.copy()

#             print('Pi Prob  [0, 8, 0, 0, 1, 0]:', self.pi_probabilities[0, 8, 0, 0, 1, 0])
#             print('Pi Prob  [0, 8, 0, 0, 1, 1]:', self.pi_probabilities[0, 8, 0, 0, 1, 1])        
#             print('Pi Prob  [0, 8, 0, 0, 0, 1]:', self.pi_probabilities[0, 8, 0, 0, 0, 1])                
#             print('Pi Prob  [0, 8, 0, 0, 0, -1]:', self.pi_probabilities[0, 8, 0, 0, 0, -1])            
            # Update pi(a | s)
            self.update_policy()
#             print('Pi Prob  [0, 8, 0, 0, 1, 0]:', self.pi_probabilities[0, 8, 0, 0, 1, 0])
#             print('Pi Prob  [0, 8, 0, 0, 1, 1]:', self.pi_probabilities[0, 8, 0, 0, 1, 1])        
#             print('Pi Prob  [0, 8, 0, 0, 0, 1]:', self.pi_probabilities[0, 8, 0, 0, 0, 1])                
#             print('Pi Prob  [0, 8, 0, 0, 0, -1]:', self.pi_probabilities[0, 8, 0, 0, 0, -1])    
                       
            # Check if convergence
            if np.allclose(old_policy, self.pi_probabilities, atol=0.0005):
                print('Policy iteration converged.')
                policy_improvement = True
                
            # Counter and update epsilon
            self.epsilon = 1/(np.sqrt(k + 1.1))
            
            k += 1
            
    def update_policy(self):
        """
        """
        
        # Ranges
        velocity_range = self.velocity_max - self.velocity_min + 1  # Minus for other direction.
        velocity_change_range = self.velocity_increase_limit  - self.velocity_decrease_limit  + 1        

        # Initialize policy
        # NB: limit actions depending on action
        self.pi_probabilities = np.zeros((rt.track.shape[0], 
                                          rt.track.shape[1], 
                                          velocity_range, 
                                          velocity_range, 
                                          velocity_change_range, 
                                          velocity_change_range), dtype=float)
        
        # Initialize with equal probabilties for all possible actions        
        for y_coord in range(self.pi_probabilities.shape[0]):
            for x_coord in range(self.pi_probabilities.shape[1]):
                for y_vel in range(self.velocity_min, self.velocity_max + 1):
                    for x_vel in range(self.velocity_min, self.velocity_max + 1):
                        possible_actions = self.possible_actions((y_vel, x_vel))
                        greedy_action = self.greedy_action(state=[y_coord, x_coord, y_vel, x_vel],
                                                           possible_actions=possible_actions)
#                         if [0, 8, 0, 0] == [y_coord,  x_coord, y_vel, x_vel]:
#                             print('--GREEDY ACTION:', greedy_action)
                        for y_vel_change, x_vel_change in possible_actions:
                            self.pi_probabilities[y_coord, 
                                                  x_coord, 
                                                  y_vel, 
                                                  x_vel, 
                                                  y_vel_change, 
                                                  x_vel_change] = self.epsilon_soft_policy(action=[y_vel_change, x_vel_change],
                                                                                           greedy_action=greedy_action, 
                                                                                           all_state_actions=possible_actions)         
            
    def generate_episode(self):
        """
        """
        
        crossed_finishing_line = False
        position = self.random_start_position()
        first_occurence = defaultdict(int)
        
        step = 0
        while not crossed_finishing_line:
#            print('-- Step {}'.format(step))
            step += 1
#            print('-- Position',  position)
            
            # Sample action
            action = self.sample_action_from_state(position)
#            print('-- Action', action)
            
            # Initiate s, a pair if not already in dict
            if str(position + action) not in first_occurence.keys():                
                first_occurence[str(position + action)] = step

            # Old position
            old_position = position.copy()
            
            # New position
            position[0] += position[2] + action[0]
            position[1] += position[3] + action[1]
            position[2] += action[0]
            position[3] += action[1]
            
#            print('-- New position', position)

            # Check if goal if reached (is it in the projected reactangle)
            grid_states_to_check = self.get_all_grid_cells_in_projected_retcangle(current_state=[old_position[0], old_position[1]], 
                                                                                  new_state=[position[0], position[1]])
            if self.check_if_goal_is_reached(check_grid_states=grid_states_to_check):
                print('-- Goal Reached. Terminating Episode.')
                break

            # Check if car hits boundery
            if position[0] >= self.track.shape[0] or position[1] >= self.track.shape[1]:
#                print('Hit the track boundery! (outside matrix)')
                position = self.random_start_position()
                continue
            
            new_grid_position = rt.track[position[0], position[1]]
            if new_grid_position == 0:
#                print('Hit the track boundery !')
                position = self.random_start_position()
                continue        

        print('Steps {}'.format(step))
        
        return self._get_G_values(first_occurence_dict=first_occurence, total_steps=step)

    def check_if_goal_is_reached(self, check_grid_states):
        """
        """

        grid_values = []

        for y, x in check_grid_states:
            if y <= self.track.shape[0] - 1 and x <= self.track.shape[1] - 1:
                grid_values.append(rt.track[y, x])

        if 2 in grid_values:
            return True
        else:
            return False
    
    def get_all_grid_cells_in_projected_retcangle(self, current_state, new_state):
        """
        """
        y_coord_current = current_state[0]
        x_coord_current = current_state[1]

        y_diff = new_state[0] - current_state[0]
        x_diff = new_state[1] - current_state[1]

        return [[y_coord_current + y, x_coord_current + x] for y in range(0, y_diff + 1) for x in range(0, x_diff + 1)]


    def sample_action_from_state(self, state):
        """
        """
        
        # Action coordinates in probability matrix
        array = np.array(['(0, 0)', 
                          '(0, 1)', 
                          '(0, 2)', 
                          '(1, 0)', 
                          '(1, 1)', 
                          '(1, 2)',
                          '(2, 0)',
                          '(2, 1)',
                          '(2, 2)'])
        
        # Randomly pick action
#        print('STATE:', state)
#        print('---- ! {}'.format(rt.pi_probabilities[state[0], state[1], state[2], state[3]].flatten()))
        a = numpy_random.choice(array,
                                size=1,
                                p=rt.pi_probabilities[state[0], state[1], state[2], state[3]].flatten())        
        a = eval(list(a)[0])        
        
        return self.center_axis_around_zero(coordinates=a, list_range=range(3))

    def center_axis_around_zero(self, coordinates, list_range):
        """
        """
        return [x - len(list_range) if x > len(list_range)/2 else x for x in coordinates]

    def greedy_action(self, state, possible_actions):
        """
        """
        
#         if state == [0, 8, 0, 0]:
#             print('possible_actions[0]', possible_actions[0])
        greedy_action = possible_actions[0]
        q_max = self.q[state[0], state[1], state[2], state[3], greedy_action[0], greedy_action[1]]
        
        for action in possible_actions:
            value = self.q[state[0], state[1], state[2], state[3], action[0], action[1]]
#             if state == [0, 8, 0, 0]:            
#                 print('action', action)
#                 print('value', value)
            if value > q_max:
                q_max = value
                greedy_action = action
                    
        return self.center_axis_around_zero(coordinates=greedy_action, list_range=range(3))

    def epsilon_soft_policy(self, action, greedy_action, all_state_actions):
        """
        """
        if action == greedy_action:
            return 1 - self.epsilon + self.epsilon/len(all_state_actions)
        
        return self.epsilon/len(all_state_actions)

    def random_start_position(self):
        """
        """
        
        grid_position = list(random.choice(np.argwhere(self.track==3)))
        velocity = [0, 0]
        
        return  grid_position + velocity 
    
    def possible_actions(self, velocity):
        """
        Credit: Andreas
        """
        actions = [[a_y, a_x] for a_y in range(-1, 2) for a_x in range(-1, 2)]
        legal_actions = []
        
        v_y, v_x = velocity
        
        # Discard illegal actions
        for a in actions:
            a_y, a_x = a
            # Cannot go above speed limit in any x direction
            if v_x + a_x < self.min_vel or v_x + a_x > self.max_vel:
                continue
            # Cannot go above speed limit in any y direction
            if v_y + a_y < self.min_vel or v_y + a_y > self.max_vel:
                continue
            # Cannot noop
            if v_x + a_x == 0 and v_y + a_y == 0:
                continue
            legal_actions.append(a)
            
        return legal_actions

    def _get_G_values(self, first_occurence_dict, total_steps):
        """
        """

        G = defaultdict(int)  # Dict. w/ G for first occurence for each s, a pair.

        for key, val in first_occurence_dict.items():
            number_rewards = total_steps - val

            discounted_rewards = []
            for k in range(number_rewards):
                discounted_rewards.append(self.gamma**k * (-1))

            G[key] = sum(discounted_rewards)

        return G

    @classmethod
    def from_csv(cls, file_path):
        
        file_path = os.path.join(os.getcwd(), file_path)
        
        track = genfromtxt(file_path, delimiter=',')
        track = np.flip(track, axis=0)
        
        return cls(track) 
                    

In [9]:
# Display the track grid (cell codes: 0 wall, 1 track, 2 goal, 3 start).
rt.track

array([[0., 0., 0., 3., 3., 3., 3., 3., 3., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0.,

In [543]:
# Scratch check of the helper: list the cells between [29, 14] and [34, 16].
a = get_all_cell_values_in_projected_retcangle([29, 14], [34, 16])

In [525]:
# Scratch value: example current position, presumably [y, x] as in the rectangle helper.
a = [29, 14]

In [527]:
# Scratch value: example new position, presumably [y, x] as in the rectangle helper.
b = [34, 16]

In [542]:
def get_all_cell_values_in_projected_retcangle(current_state, new_state):
    """Return every [y, x] cell of the rectangle spanned by two positions.

    Despite the name, the function returns cell coordinates, not cell values.
    Both corner cells are included and cells are listed row by row starting at
    ``current_state``.  Movement in the negative direction is supported (it
    used to yield an empty list).

    Args:
        current_state: Sequence whose first two items are the start y and x.
        new_state: Sequence whose first two items are the end y and x.

    Returns:
        A list of ``[y, x]`` pairs; they may lie outside the track grid.
    """
    y_coord_current = current_state[0]
    x_coord_current = current_state[1]

    def offsets_towards(diff):
        # 0..diff inclusive, counting down when diff is negative.
        step = 1 if diff >= 0 else -1
        return range(0, diff + step, step)

    y_offsets = offsets_towards(new_state[0] - y_coord_current)
    x_offsets = offsets_towards(new_state[1] - x_coord_current)

    return [[y_coord_current + y, x_coord_current + x] for y in y_offsets for x in x_offsets]


In [553]:
def check_if_goal_is_reached(check_grid_states):
    """Return True if any of the given [y, x] cells inside the grid is a goal cell.

    Reads the track from the notebook-level ``rt`` object.  Coordinates outside
    the grid are ignored; this includes negative coordinates, which would
    otherwise wrap around to the opposite edge through negative indexing.

    Args:
        check_grid_states: Iterable of ``[y, x]`` pairs.
    """
    n_rows, n_cols = rt.track.shape

    for y, x in check_grid_states:
        # 2 is the goal-cell code (CELL_TYPE_GOAL).
        if 0 <= y < n_rows and 0 <= x < n_cols and rt.track[y, x] == 2:
            return True

    return False

In [499]:
# (rows, columns) of the track grid.
rt.track.shape

(32, 16)

In [None]:
# Example: from state (y=1, x=14, y_vel=-2, x_vel=2),
# taking action (-1, 0) changes the velocity to (-3, 2),
#
# so the new state is (y=-2, x=16, y_vel=-3, x_vel=2).


In [540]:
# Display the track grid again (cell codes: 0 wall, 1 track, 2 goal, 3 start).
rt.track

array([[0., 0., 0., 3., 3., 3., 3., 3., 3., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0.,

In [147]:
# Scratch: store one randomly chosen action at index [0, 1] of `a`.
# NOTE(review): `all_possible_actions` is not defined on the RaceTrack class in
# the cell above (it has `possible_actions(velocity)`); this cell presumably
# predates a rename - confirm before re-running.
a[0, 1] = random.choice(rt.all_possible_actions(5, 4))

In [None]:
# Apply a Monte Carlo control method to this task to compute 
# the optimal policy from each starting state.

In [22]:
# Track boundaries:
# Each key is an inclusive "[first, last]" range of the first coordinate and
# each value is the inclusive "[first, last]" range of the second coordinate
# covered by the track there. Both are stored as strings; the states cell
# below decodes them with eval().

race_track = {'[0, 2]': '[0, 6]',
              '[3, 9]': '[-1, 6]',
              '[10, 17]': '[-2, 6]',
              '[18, 24]':  '[-3, 6]',
              '[25, 25]': '[-3, 7]',
              '[26, 27]': '[-3, 14]',
              '[28, 28]': '[-2,  14]',
              '[29, 30]': '[-1, 14]',
              '[31, 31]': '[0, 14]'}


In [23]:
# States: expand every pair of inclusive "[first, last]" spans in race_track
# into individual [x_coord, y_coord] cells.
states = []
for key, y_span in race_track.items():
    # eval() is only applied to the literal strings defined in race_track.
    x_first, x_last = eval(key)
    y_first, y_last = eval(y_span)
    for x_coord in range(x_first, x_last + 1):
        for y_coord in range(y_first, y_last + 1):
            states.append([x_coord, y_coord])

# Start states: [0, 0] through [0, 6].
start_states = [[0, second] for second in range(7)]

# End states: second coordinate 14 for first coordinates 26 through 31.
end_states = [[first, 14] for first in range(26, 32)]

In [51]:
class RaceTrackingMDP:
    """Unfinished sketch of the race-track task as an MDP over an explicit state list.

    Only construction and the ``actions`` generator are implemented.  The
    original cell also held an unnamed ``def`` stub and an empty loop body,
    which made the whole class a syntax error; the stub is removed and
    ``policy_iteration`` now raises ``NotImplementedError``.
    """

    def __init__(self,
                 states,
                 start_states,
                 end_states,
                 zero_velocity_prob=0.9,
                 velocity_max=4,
                 speed_decrease_limit=-1,
                 speed_increase_limit=1):
        """Store the state lists and limits and allocate flat value/return/policy lists.

        Args:
            states: List of ``[x, y]`` cells.
            start_states: List of start cells.
            end_states: List of end cells.
            zero_velocity_prob: Stored on the instance only.
            velocity_max: Stored on the instance only.
            speed_decrease_limit: Smallest increment yielded by ``actions``.
            speed_increase_limit: Largest increment yielded by ``actions``.
        """
        self.states = states
        self.start_states = start_states
        self.end_states = end_states
        self.zero_velocity_prob = zero_velocity_prob
        self.velocity_max = velocity_max
        self.speed_decrease_limit = speed_decrease_limit
        self.speed_increase_limit = speed_increase_limit

        # NOTE(review): range(0, 9 + 1) allocates ten slots per state although
        # ``actions`` yields nine pairs with the default limits, and each
        # policy row holds six entries of 1/9. Kept as written; confirm the
        # intended sizes.
        self.q_values = [0 for x in self.states for a in range(0, 9 + 1)]
        self.returns = [[] for x in self.states for a in range(0, 9 + 1)]
        self.policies = [[1/9] * 6 for x in self.states for a in range(0, 9 + 1)]

    def actions(self, x_coord, y_coord):
        """Yield ``(x_coord + a1, y_coord + a2)`` for every pair of increments within the limits."""
        increments = range(self.speed_decrease_limit, self.speed_increase_limit + 1)
        for a1 in increments:
            for a2 in increments:
                yield a1 + x_coord, a2 + y_coord

    def policy_iteration(self):
        """Iterate over policies (not implemented).

        The original sketch was a ``for iteration in range(0, 10)`` loop with
        no body.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError('RaceTrackingMDP.policy_iteration is not implemented yet.')

In [52]:
# Instantiate the MDP from the state lists built above, keeping the default limits.
mdp = RaceTrackingMDP(states=states, start_states=start_states, end_states=end_states)

In [16]:
# Scratch: a 2 x 4 x 5 array of zeros.
a = np.zeros((2, 4, 5))


In [17]:
# Display the scratch array.
a

array([[[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]],

       [[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]]])