In [66]:
from numpy import genfromtxt
import numpy as np
import os
from collections import defaultdict
from itertools import permutations, repeat
import random
from numpy import random as numpy_random

## Setup

In [67]:
# The velocity is also discrete, a number of grid cells 
# moved horizontally and vertically per time step.

# Actions:
# The actions are increments to the velocity components. 
# Each may be changed by +1, −1, or 0 in one step, 
# for a total of nine actions.

# Both velocity components are restricted to be nonnegative 
# and less than 5, and they cannot both be zero except at the 
# starting line.

# Episodes:
# Each episode begins in one of the randomly selected 
# start states with both velocity components zero and 
# ends when the car crosses the finish line.

# Rewards:
# The rewards are −1 for each step until the car crosses 
# the finish line.

# If the car hits the track boundary, it is moved back 
# to a random position on the starting line, both velocity 
# components are reduced to zero, and the episode continues.

# With probability 0.1 at each time step the velocity increments are both zero, 
# independently of the intended increments.

## Task

In [112]:
# Load map1
CELL_TYPE_WALL = 0  # Black boxes
CELL_TYPE_TRACK = 1
CELL_TYPE_GOAL = 2
CELL_TYPE_START = 3

class RaceTrack:
    def __init__(self, track, zero_velocity_prob=0.9, max_vel=5, min_vel=0, epsilon=0.9):
        self.track = track
        self.wall_cells = np.argwhere(track == CELL_TYPE_WALL)
        self.goal_cells = np.argwhere(track == CELL_TYPE_GOAL)
        self.start_cells = np.argwhere(track == CELL_TYPE_START)
        self.max_vel = max_vel
        self.min_vel = min_vel
        self.colors = ['black', 'white', 'yellow', 'red']  # For plotting
        self.epsilon = epsilon
        
        self.zero_velocity_prob = zero_velocity_prob
        self.velocity_min = min_vel
        self.velocity_max = max_vel
        self.velocity_decrease_limit = -1
        self.velocity_increase_limit = 1
        
        # Q-Matrix - a 6 dimensional vector for states, velocity, and action in both directions.
        
        # Q(s, a)
        velocity_range = self.velocity_max - self.velocity_min + 1  # Minus for other direction.
        velocity_change_range = self.velocity_increase_limit  - self.velocity_decrease_limit  + 1        
        self.q = np.zeros((self.track.shape[0], self.track.shape[1], velocity_range, velocity_range, velocity_change_range, velocity_change_range))
        
        # Returns 
        self.Returns = defaultdict(list)

        # Initialize policy
        # NB: limit actions depending on action
        self.pi_probabilities = np.zeros((self.track.shape[0], 
                                          self.track.shape[1], 
                                          velocity_range, 
                                          velocity_range, 
                                          velocity_change_range, 
                                          velocity_change_range), dtype=float)

        # Initialize with equal probabilties for all possible actions        
        for y_coord in range(self.pi_probabilities.shape[0]):
            for x_coord in range(self.pi_probabilities.shape[1]):
                for y_vel in range(self.velocity_min, self.velocity_max + 1):
                    for x_vel in range(self.velocity_min, self.velocity_max + 1):
                        possible_actions = self.possible_actions((y_vel, x_vel))
                        for y_vel_change, x_vel_change in possible_actions:
                            self.pi_probabilities[y_coord, x_coord, y_vel, x_vel, y_vel_change, x_vel_change] = 1/len(possible_actions)
                                
    def policy_iteration(self):
        """
        """
        
        policy_improvement = False
        
        k=0
        while not policy_improvement:
            print('Iteration {}'.format(k))
            # Generate an episode
            G = self.generate_episode()

            print('--EPSILON', self.epsilon)
            
            # Append G to Returns(s, a)
            if '[0, 8, 0, 0, 1, 0]' in G.keys():            
                print('G[0, 8, 0, 0, 1, 0]', G['[0, 8, 0, 0, 1, 0]'])
                print('self.Returns[0, 8, 0, 0, 1, 0]', self.Returns['[0, 8, 0, 0, 1, 0]'])
            for s_a in G.keys():
                self.Returns[s_a].append(G[s_a])
            if '[0, 8, 0, 0, 1, 0]' in G.keys():            
                print('self.Returns[0, 8, 0, 0, 1, 0]', self.Returns['[0, 8, 0, 0, 1, 0]'])
            
            # Old Q(s, a)
            old_q = self.q.copy()
            
            # Calculate averages in Q(s, a)
            print('Q-value  [0, 8, 0, 0, 1, 0]:', self.q[0, 8, 0, 0, 1, 0])
            print('Q-value  [0, 8, 0, 0, 1, 1]:', self.q[0, 8, 0, 0, 1, 1])        
            print('Q-value  [0, 8, 0, 0, 0, 1]:', self.q[0, 8, 0, 0, 0, 1])                
            print('Q-value  [0, 8, 0, 0, 0, -1]:', self.q[0, 8, 0, 0, 0, -1])                
            for s_a in self.Returns.keys():
                self.q[eval(s_a)[0],
                       eval(s_a)[1],
                       eval(s_a)[2],
                       eval(s_a)[3],
                       eval(s_a)[4],
                       eval(s_a)[5]] = np.average(self.Returns[s_a])
            print('Q-value  [0, 8, 0, 0, 1, 0]:', self.q[0, 8, 0, 0, 1, 0])
            print('Q-value  [0, 8, 0, 0, 1, 1]:', self.q[0, 8, 0, 0, 1, 1])        
            print('Q-value  [0, 8, 0, 0, 0, 1]:', self.q[0, 8, 0, 0, 0, 1])                
            print('Q-value  [0, 8, 0, 0, 0, -1]:', self.q[0, 8, 0, 0, 0, -1])                
            
            # Q-diff
            q_diff = abs(old_q - self.q)
            print('Q-diff: {}'.format(np.max(q_diff)))
            
            # Old policy
            old_policy = self.pi_probabilities.copy()

            print('Pi Prob  [0, 8, 0, 0, 1, 0]:', self.pi_probabilities[0, 8, 0, 0, 1, 0])
            print('Pi Prob  [0, 8, 0, 0, 1, 1]:', self.pi_probabilities[0, 8, 0, 0, 1, 1])        
            print('Pi Prob  [0, 8, 0, 0, 0, 1]:', self.pi_probabilities[0, 8, 0, 0, 0, 1])                
            print('Pi Prob  [0, 8, 0, 0, 0, -1]:', self.pi_probabilities[0, 8, 0, 0, 0, -1])            
            # Update pi(a | s)
            self.update_policy()
            print('Pi Prob  [0, 8, 0, 0, 1, 0]:', self.pi_probabilities[0, 8, 0, 0, 1, 0])
            print('Pi Prob  [0, 8, 0, 0, 1, 1]:', self.pi_probabilities[0, 8, 0, 0, 1, 1])        
            print('Pi Prob  [0, 8, 0, 0, 0, 1]:', self.pi_probabilities[0, 8, 0, 0, 0, 1])                
            print('Pi Prob  [0, 8, 0, 0, 0, -1]:', self.pi_probabilities[0, 8, 0, 0, 0, -1])    
                       
            # Check if convergence
            if np.allclose(old_policy, self.pi_probabilities, atol=0.01):
                print('Policy iteration converged.')
                policy_improvement = True
                
            # Counter and update epsilon
            self.epsilon = 1/(np.sqrt(k + 1.1))
            
            k += 1
            
    def update_policy(self):
        """
        """
        
        # Ranges
        velocity_range = self.velocity_max - self.velocity_min + 1  # Minus for other direction.
        velocity_change_range = self.velocity_increase_limit  - self.velocity_decrease_limit  + 1        

        # Initialize policy
        # NB: limit actions depending on action
        self.pi_probabilities = np.zeros((rt.track.shape[0], 
                                          rt.track.shape[1], 
                                          velocity_range, 
                                          velocity_range, 
                                          velocity_change_range, 
                                          velocity_change_range), dtype=float)
        
        # Initialize with equal probabilties for all possible actions        
        for y_coord in range(self.pi_probabilities.shape[0]):
            for x_coord in range(self.pi_probabilities.shape[1]):
                for y_vel in range(self.velocity_min, self.velocity_max + 1):
                    for x_vel in range(self.velocity_min, self.velocity_max + 1):
                        possible_actions = self.possible_actions((y_vel, x_vel))
                        greedy_action = self.greedy_action(state=[y_coord, x_coord, y_vel, x_vel],
                                                           possible_actions=possible_actions)
                        if [0, 8, 0, 0] == [y_coord,  x_coord, y_vel, x_vel]:
                            print('--GREEDY ACTION:', greedy_action)
                        for y_vel_change, x_vel_change in possible_actions:
#                                 print('SOFT',  self.epsilon_soft_policy(action=[y_vel_change, x_vel_change],
#                                                                                                greedy_action=greedy_action, 
#                                                                                                all_state_actions=possible_actions))
                            self.pi_probabilities[y_coord, 
                                                  x_coord, 
                                                  y_vel, 
                                                  x_vel, 
                                                  y_vel_change, 
                                                  x_vel_change] = self.epsilon_soft_policy(action=[y_vel_change, x_vel_change],
                                                                                           greedy_action=greedy_action, 
                                                                                           all_state_actions=possible_actions)         
            
    def generate_episode(self):
        """
        """
        
        crossed_finishing_line = False
        position = self.random_start_position()
        G = defaultdict(int)  # Dict. w/ G for first occurence for each s, a pair.
        
        step = 0
        while not crossed_finishing_line:
#            print('-- Step {}'.format(step))
            step += 1
#            print('-- Position',  position)
            
            # Sample action
            action = self.sample_action_from_state(position)
#            print('-- Action', action)
            
            # Initiate s, a pair if not already in dict
            if str(position + action) not in G.keys():                
                G[str(position + action)] = 0   

            # Append -1 reward to all s, a pairs
            G = {key_: val_ - 1 for key_, val_ in G.items()}

            # Old position
            old_position = position.copy()
            
            # New position
            position[0] += position[2] + action[0]
            position[1] += position[3] + action[1]
            position[2] += action[0]
            position[3] += action[1]
            
#            print('-- New position', position)

            # Check if goal if reached (is it in the projected reactangle)
            grid_states_to_check = self.get_all_grid_cells_in_projected_retcangle(current_state=[old_position[0], old_position[1]], 
                                                                                  new_state=[position[0], position[1]])
            if self.check_if_goal_is_reached(check_grid_states=grid_states_to_check):
                print('-- Goal Reached. Terminating Episode.')
                break

            # Check if car hits boundery
            if position[0] >= self.track.shape[0] or position[1] >= self.track.shape[1]:
#                print('Hit the track boundery! (outside matrix)')
                position = self.random_start_position()
                continue
            
            new_grid_position = rt.track[position[0], position[1]]
            if new_grid_position == 0:
#                print('Hit the track boundery !')
                position = self.random_start_position()
                continue
        
        print('Steps {}'.format(step))
        
        return G

    def check_if_goal_is_reached(self, check_grid_states):
        """
        """

        grid_values = []

        for y, x in check_grid_states:
            if y <= self.track.shape[0] - 1 and x <= self.track.shape[1] - 1:
                grid_values.append(rt.track[y, x])

        if 2 in grid_values:
            return True
        else:
            return False
    
    def get_all_grid_cells_in_projected_retcangle(self, current_state, new_state):
        """
        """
        y_coord_current = current_state[0]
        x_coord_current = current_state[1]

        y_diff = new_state[0] - current_state[0]
        x_diff = new_state[1] - current_state[1]

        return [[y_coord_current + y, x_coord_current + x] for y in range(0, y_diff + 1) for x in range(0, x_diff + 1)]


    def sample_action_from_state(self, state):
        """
        """
        
        # Action coordinates in probability matrix
        array = np.array(['(0, 0)', 
                          '(0, 1)', 
                          '(0, 2)', 
                          '(1, 0)', 
                          '(1, 1)', 
                          '(1, 2)',
                          '(2, 0)',
                          '(2, 1)',
                          '(2, 2)'])
        
        # Randomly pick action
#        print('STATE:', state)
#        print('---- ! {}'.format(rt.pi_probabilities[state[0], state[1], state[2], state[3]].flatten()))
        a = numpy_random.choice(array,
                                size=1,
                                p=rt.pi_probabilities[state[0], state[1], state[2], state[3]].flatten())        
        a = eval(list(a)[0])        
        
        return self.center_axis_around_zero(coordinates=a, list_range=range(3))

    def center_axis_around_zero(self, coordinates, list_range):
        """
        """
        return [x - len(list_range) if x > len(list_range)/2 else x for x in coordinates]

    def greedy_action(self, state, possible_actions):
        """
        """
        
#         if state == [0, 8, 0, 0]:
#             print('possible_actions[0]', possible_actions[0])
        greedy_action = possible_actions[0]
        q_max = self.q[state[0], state[1], state[2], state[3], greedy_action[0], greedy_action[1]]
        
        for action in possible_actions:
            value = self.q[state[0], state[1], state[2], state[3], action[0], action[1]]
#             if state == [0, 8, 0, 0]:            
#                 print('action', action)
#                 print('value', value)
            if value > q_max:
                q_max = value
                greedy_action = action
                    
        return self.center_axis_around_zero(coordinates=greedy_action, list_range=range(3))

    def epsilon_soft_policy(self, action, greedy_action, all_state_actions):
        """
        """
        if action == greedy_action:
            return 1 - self.epsilon + self.epsilon/len(all_state_actions)
        
        return self.epsilon/len(all_state_actions)

    def random_start_position(self):
        """
        """
        
        grid_position = list(random.choice(np.argwhere(self.track==3)))
        velocity = [0, 0]
        
        return  grid_position + velocity 
    
    def possible_actions(self, velocity):
        """
        Credit: Andreas
        """
        actions = [[a_y, a_x] for a_y in range(-1, 2) for a_x in range(-1, 2)]
        legal_actions = []
        
        v_y, v_x = velocity
        
        # Discard illegal actions
        for a in actions:
            a_y, a_x = a
            # Cannot go above speed limit in any x direction
            if v_x + a_x < self.min_vel or v_x + a_x > self.max_vel:
                continue
            # Cannot go above speed limit in any y direction
            if v_y + a_y < self.min_vel or v_y + a_y > self.max_vel:
                continue
            # Cannot noop
            if v_x + a_x == 0 and v_y + a_y == 0:
                continue
            legal_actions.append(a)
            
        return legal_actions
                
    @classmethod
    def from_csv(cls, file_path):
        
        file_path = os.path.join(os.getcwd(), file_path)
        
        track = genfromtxt(file_path, delimiter=',')
        track = np.flip(track, axis=0)
        
        return cls(track) 
                    

In [113]:
rt = RaceTrack.from_csv("../racetracks/map1.csv")

In [114]:
rt.policy_iteration()

Iteration 0
-- Goal Reached. Terminating Episode.
Steps 553
--EPSILON 0.9
G[0, 8, 0, 0, 1, 0] -500
self.Returns[0, 8, 0, 0, 1, 0] []
self.Returns[0, 8, 0, 0, 1, 0] [-500]
Q-value  [0, 8, 0, 0, 1, 0]: 0.0
Q-value  [0, 8, 0, 0, 1, 1]: 0.0
Q-value  [0, 8, 0, 0, 0, 1]: 0.0
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Q-value  [0, 8, 0, 0, 1, 0]: -500.0
Q-value  [0, 8, 0, 0, 1, 1]: -535.0
Q-value  [0, 8, 0, 0, 0, 1]: -393.0
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Q-diff: 553.0
Pi Prob  [0, 8, 0, 0, 1, 0]: 0.3333333333333333
Pi Prob  [0, 8, 0, 0, 1, 1]: 0.3333333333333333
Pi Prob  [0, 8, 0, 0, 0, 1]: 0.3333333333333333
Pi Prob  [0, 8, 0, 0, 0, -1]: 0.0
--GREEDY ACTION: [0, 1]
Pi Prob  [0, 8, 0, 0, 1, 0]: 0.3
Pi Prob  [0, 8, 0, 0, 1, 1]: 0.3
Pi Prob  [0, 8, 0, 0, 0, 1]: 0.39999999999999997
Pi Prob  [0, 8, 0, 0, 0, -1]: 0.0
Iteration 1
-- Goal Reached. Terminating Episode.
Steps 1358
--EPSILON 0.9534625892455922
G[0, 8, 0, 0, 1, 0] -1304
self.Returns[0, 8, 0, 0, 1, 0] [-500]
self.Returns[0, 8, 0, 0, 1, 0] [-50

Q-value  [0, 8, 0, 0, 1, 0]: -658.7
Q-value  [0, 8, 0, 0, 1, 1]: -843.625
Q-value  [0, 8, 0, 0, 0, 1]: -722.3333333333334
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Q-diff: 1295.5
Pi Prob  [0, 8, 0, 0, 1, 0]: 0.7657572103578979
Pi Prob  [0, 8, 0, 0, 1, 1]: 0.11712139482105109
Pi Prob  [0, 8, 0, 0, 0, 1]: 0.11712139482105109
Pi Prob  [0, 8, 0, 0, 0, -1]: 0.0
--GREEDY ACTION: [1, 0]
Pi Prob  [0, 8, 0, 0, 1, 0]: 0.7790021519560681
Pi Prob  [0, 8, 0, 0, 1, 1]: 0.11049892402196598
Pi Prob  [0, 8, 0, 0, 0, 1]: 0.11049892402196598
Pi Prob  [0, 8, 0, 0, 0, -1]: 0.0
Iteration 10
-- Goal Reached. Terminating Episode.
Steps 320
--EPSILON 0.3146583877637763
G[0, 8, 0, 0, 1, 0] -310
self.Returns[0, 8, 0, 0, 1, 0] [-500, -1304, -2565, -863, -192, -151, -366, -216, -132, -298]
self.Returns[0, 8, 0, 0, 1, 0] [-500, -1304, -2565, -863, -192, -151, -366, -216, -132, -298, -310]
Q-value  [0, 8, 0, 0, 1, 0]: -658.7
Q-value  [0, 8, 0, 0, 1, 1]: -843.625
Q-value  [0, 8, 0, 0, 0, 1]: -722.3333333333334
Q-value  [0, 8

Q-value  [0, 8, 0, 0, 1, 0]: -529.5294117647059
Q-value  [0, 8, 0, 0, 1, 1]: -614.5384615384615
Q-value  [0, 8, 0, 0, 0, 1]: -600.6923076923077
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Q-diff: 96.64999999999998
Pi Prob  [0, 8, 0, 0, 1, 0]: 0.8338517379069245
Pi Prob  [0, 8, 0, 0, 1, 1]: 0.0830741310465378
Pi Prob  [0, 8, 0, 0, 0, 1]: 0.0830741310465378
Pi Prob  [0, 8, 0, 0, 0, -1]: 0.0
--GREEDY ACTION: [1, 0]
Pi Prob  [0, 8, 0, 0, 1, 0]: 0.8387830555311085
Pi Prob  [0, 8, 0, 0, 1, 1]: 0.08060847223444574
Pi Prob  [0, 8, 0, 0, 0, 1]: 0.08060847223444574
Pi Prob  [0, 8, 0, 0, 0, -1]: 0.0
Iteration 18
-- Goal Reached. Terminating Episode.
Steps 53
--EPSILON 0.23505024736113422
Q-value  [0, 8, 0, 0, 1, 0]: -529.5294117647059
Q-value  [0, 8, 0, 0, 1, 1]: -614.5384615384615
Q-value  [0, 8, 0, 0, 0, 1]: -600.6923076923077
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Q-value  [0, 8, 0, 0, 1, 0]: -529.5294117647059
Q-value  [0, 8, 0, 0, 1, 1]: -614.5384615384615
Q-value  [0, 8, 0, 0, 0, 1]: -600.6923076923077
Q-

Pi Prob  [0, 8, 0, 0, 1, 0]: 0.8641998593788816
Pi Prob  [0, 8, 0, 0, 1, 1]: 0.06790007031055921
Pi Prob  [0, 8, 0, 0, 0, 1]: 0.06790007031055921
Pi Prob  [0, 8, 0, 0, 0, -1]: 0.0
Iteration 25
-- Goal Reached. Terminating Episode.
Steps 227
--EPSILON 0.1996011960139498
G[0, 8, 0, 0, 1, 0] -213
self.Returns[0, 8, 0, 0, 1, 0] [-500, -1304, -2565, -863, -192, -151, -366, -216, -132, -298, -310, -77, -899, -281, -338, -92, -418, -72, -225, -414, -35, -119, -454]
self.Returns[0, 8, 0, 0, 1, 0] [-500, -1304, -2565, -863, -192, -151, -366, -216, -132, -298, -310, -77, -899, -281, -338, -92, -418, -72, -225, -414, -35, -119, -454, -213]
Q-value  [0, 8, 0, 0, 1, 0]: -448.7391304347826
Q-value  [0, 8, 0, 0, 1, 1]: -489.22222222222223
Q-value  [0, 8, 0, 0, 0, 1]: -560.0666666666667
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Q-value  [0, 8, 0, 0, 1, 0]: -438.9166666666667
Q-value  [0, 8, 0, 0, 1, 1]: -489.22222222222223
Q-value  [0, 8, 0, 0, 0, 1]: -539.25
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Q-diff: 301.5
Pi

Q-value  [0, 8, 0, 0, 1, 0]: -586.8928571428571
Q-value  [0, 8, 0, 0, 1, 1]: -586.9090909090909
Q-value  [0, 8, 0, 0, 0, 1]: -730.3888888888889
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Q-diff: 1101.5
Pi Prob  [0, 8, 0, 0, 1, 0]: 0.8804557899798612
Pi Prob  [0, 8, 0, 0, 1, 1]: 0.05977210501006938
Pi Prob  [0, 8, 0, 0, 0, 1]: 0.05977210501006938
Pi Prob  [0, 8, 0, 0, 0, -1]: 0.0
--GREEDY ACTION: [1, 0]
Pi Prob  [0, 8, 0, 0, 1, 0]: 0.8823325817302987
Pi Prob  [0, 8, 0, 0, 1, 1]: 0.05883370913485064
Pi Prob  [0, 8, 0, 0, 0, 1]: 0.05883370913485064
Pi Prob  [0, 8, 0, 0, 0, -1]: 0.0
Iteration 33
-- Goal Reached. Terminating Episode.
Steps 253
--EPSILON 0.17381449986274955
G[0, 8, 0, 0, 1, 0] -141
self.Returns[0, 8, 0, 0, 1, 0] [-500, -1304, -2565, -863, -192, -151, -366, -216, -132, -298, -310, -77, -899, -281, -338, -92, -418, -72, -225, -414, -35, -119, -454, -213, -573, -3421, -274, -1631]
self.Returns[0, 8, 0, 0, 1, 0] [-500, -1304, -2565, -863, -192, -151, -366, -216, -132, -298, -310, -77, -8

Q-value  [0, 8, 0, 0, 1, 0]: -536.6857142857143
Q-value  [0, 8, 0, 0, 1, 1]: -563.4166666666666
Q-value  [0, 8, 0, 0, 0, 1]: -652.3181818181819
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Q-diff: 793.0
Pi Prob  [0, 8, 0, 0, 1, 0]: 0.8919944051652776
Pi Prob  [0, 8, 0, 0, 1, 1]: 0.05400279741736121
Pi Prob  [0, 8, 0, 0, 0, 1]: 0.05400279741736121
Pi Prob  [0, 8, 0, 0, 0, -1]: 0.0
--GREEDY ACTION: [1, 0]
Pi Prob  [0, 8, 0, 0, 1, 0]: 0.8933844964987998
Pi Prob  [0, 8, 0, 0, 1, 1]: 0.0533077517506001
Pi Prob  [0, 8, 0, 0, 0, 1]: 0.0533077517506001
Pi Prob  [0, 8, 0, 0, 0, -1]: 0.0
Iteration 40
-- Goal Reached. Terminating Episode.
Steps 16
--EPSILON 0.15791661046371636
Q-value  [0, 8, 0, 0, 1, 0]: -536.6857142857143
Q-value  [0, 8, 0, 0, 1, 1]: -563.4166666666666
Q-value  [0, 8, 0, 0, 0, 1]: -652.3181818181819
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Q-value  [0, 8, 0, 0, 1, 0]: -536.6857142857143
Q-value  [0, 8, 0, 0, 1, 1]: -563.4166666666666
Q-value  [0, 8, 0, 0, 0, 1]: -652.3181818181819
Q-value  [0, 8

Pi Prob  [0, 8, 0, 0, 1, 0]: 0.9018120306766857
Pi Prob  [0, 8, 0, 0, 1, 1]: 0.04909398466165713
Pi Prob  [0, 8, 0, 0, 0, 1]: 0.04909398466165713
Pi Prob  [0, 8, 0, 0, 0, -1]: 0.0
Iteration 47
-- Goal Reached. Terminating Episode.
Steps 91
--EPSILON 0.145710063157312
G[0, 8, 0, 0, 1, 0] -91
self.Returns[0, 8, 0, 0, 1, 0] [-500, -1304, -2565, -863, -192, -151, -366, -216, -132, -298, -310, -77, -899, -281, -338, -92, -418, -72, -225, -414, -35, -119, -454, -213, -573, -3421, -274, -1631, -141, -495, -344, -136, -285, -158, -792, -100, -428, -412, -332]
self.Returns[0, 8, 0, 0, 1, 0] [-500, -1304, -2565, -863, -192, -151, -366, -216, -132, -298, -310, -77, -899, -281, -338, -92, -418, -72, -225, -414, -35, -119, -454, -213, -573, -3421, -274, -1631, -141, -495, -344, -136, -285, -158, -792, -100, -428, -412, -332, -91]
Q-value  [0, 8, 0, 0, 1, 0]: -514.2564102564103
Q-value  [0, 8, 0, 0, 1, 1]: -519.925925925926
Q-value  [0, 8, 0, 0, 0, 1]: -621.375
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Q-va

Pi Prob  [0, 8, 0, 0, 1, 0]: 0.9076387335502568
Pi Prob  [0, 8, 0, 0, 1, 1]: 0.04618063322487163
Pi Prob  [0, 8, 0, 0, 0, 1]: 0.04618063322487163
Pi Prob  [0, 8, 0, 0, 0, -1]: 0.0
Iteration 53
-- Goal Reached. Terminating Episode.
Steps 29
--EPSILON 0.1372311615987697
Q-value  [0, 8, 0, 0, 1, 0]: -491.3777777777778
Q-value  [0, 8, 0, 0, 1, 1]: -507.82142857142856
Q-value  [0, 8, 0, 0, 0, 1]: -605.2
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Q-value  [0, 8, 0, 0, 1, 0]: -491.3777777777778
Q-value  [0, 8, 0, 0, 1, 1]: -507.82142857142856
Q-value  [0, 8, 0, 0, 0, 1]: -605.2
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Q-diff: 50.0
Pi Prob  [0, 8, 0, 0, 1, 0]: 0.9076387335502568
Pi Prob  [0, 8, 0, 0, 1, 1]: 0.04618063322487163
Pi Prob  [0, 8, 0, 0, 0, 1]: 0.04618063322487163
Pi Prob  [0, 8, 0, 0, 0, -1]: 0.0
--GREEDY ACTION: [1, 0]
Pi Prob  [0, 8, 0, 0, 1, 0]: 0.9085125589341535
Pi Prob  [0, 8, 0, 0, 1, 1]: 0.04574372053292323
Pi Prob  [0, 8, 0, 0, 0, 1]: 0.04574372053292323
Pi Prob  [0, 8, 0, 0, 0, -1]: 0.0


Pi Prob  [0, 8, 0, 0, 1, 0]: 0.9125377441186023
Pi Prob  [0, 8, 0, 0, 1, 1]: 0.043731127940698805
Pi Prob  [0, 8, 0, 0, 0, 1]: 0.043731127940698805
Pi Prob  [0, 8, 0, 0, 0, -1]: 0.0
Iteration 59
-- Goal Reached. Terminating Episode.
Steps 4594
--EPSILON 0.13007872144692093
G[0, 8, 0, 0, 1, 0] -4594
self.Returns[0, 8, 0, 0, 1, 0] [-500, -1304, -2565, -863, -192, -151, -366, -216, -132, -298, -310, -77, -899, -281, -338, -92, -418, -72, -225, -414, -35, -119, -454, -213, -573, -3421, -274, -1631, -141, -495, -344, -136, -285, -158, -792, -100, -428, -412, -332, -91, -410, -618, -154, -84, -699, -461, -279, -356, -63, -186]
self.Returns[0, 8, 0, 0, 1, 0] [-500, -1304, -2565, -863, -192, -151, -366, -216, -132, -298, -310, -77, -899, -281, -338, -92, -418, -72, -225, -414, -35, -119, -454, -213, -573, -3421, -274, -1631, -141, -495, -344, -136, -285, -158, -792, -100, -428, -412, -332, -91, -410, -618, -154, -84, -699, -461, -279, -356, -63, -186, -4594]
Q-value  [0, 8, 0, 0, 1, 0]: -469.1

Q-value  [0, 8, 0, 0, 1, 0]: -573.1071428571429
Q-value  [0, 8, 0, 0, 1, 1]: -614.2424242424242
Q-value  [0, 8, 0, 0, 0, 1]: -678.4
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Q-diff: 795.5
Pi Prob  [0, 8, 0, 0, 1, 0]: 0.9160744760349265
Pi Prob  [0, 8, 0, 0, 1, 1]: 0.04196276198253671
Pi Prob  [0, 8, 0, 0, 0, 1]: 0.04196276198253671
Pi Prob  [0, 8, 0, 0, 0, -1]: 0.0
--GREEDY ACTION: [1, 0]
Pi Prob  [0, 8, 0, 0, 1, 0]: 0.9167316946385935
Pi Prob  [0, 8, 0, 0, 1, 1]: 0.041634152680703265
Pi Prob  [0, 8, 0, 0, 0, 1]: 0.041634152680703265
Pi Prob  [0, 8, 0, 0, 0, -1]: 0.0
Iteration 65
-- Goal Reached. Terminating Episode.
Steps 492
--EPSILON 0.12393943320395891
G[0, 8, 0, 0, 1, 0] -416
self.Returns[0, 8, 0, 0, 1, 0] [-500, -1304, -2565, -863, -192, -151, -366, -216, -132, -298, -310, -77, -899, -281, -338, -92, -418, -72, -225, -414, -35, -119, -454, -213, -573, -3421, -274, -1631, -141, -495, -344, -136, -285, -158, -792, -100, -428, -412, -332, -91, -410, -618, -154, -84, -699, -461, -279, -356, 

Pi Prob  [0, 8, 0, 0, 1, 0]: 0.9198008588881027
Pi Prob  [0, 8, 0, 0, 1, 1]: 0.040099570555948684
Pi Prob  [0, 8, 0, 0, 0, 1]: 0.040099570555948684
Pi Prob  [0, 8, 0, 0, 0, -1]: 0.0
Iteration 70
-- Goal Reached. Terminating Episode.
Steps 462
--EPSILON 0.1194375788241625
G[0, 8, 0, 0, 1, 0] -460
self.Returns[0, 8, 0, 0, 1, 0] [-500, -1304, -2565, -863, -192, -151, -366, -216, -132, -298, -310, -77, -899, -281, -338, -92, -418, -72, -225, -414, -35, -119, -454, -213, -573, -3421, -274, -1631, -141, -495, -344, -136, -285, -158, -792, -100, -428, -412, -332, -91, -410, -618, -154, -84, -699, -461, -279, -356, -63, -186, -4594, -2106, -162, -448, -669, -658, -416, -167, -161, -410, -219]
self.Returns[0, 8, 0, 0, 1, 0] [-500, -1304, -2565, -863, -192, -151, -366, -216, -132, -298, -310, -77, -899, -281, -338, -92, -418, -72, -225, -414, -35, -119, -454, -213, -573, -3421, -274, -1631, -141, -495, -344, -136, -285, -158, -792, -100, -428, -412, -332, -91, -410, -618, -154, -84, -699, -461, 

Q-value  [0, 8, 0, 0, 1, 0]: -625.5076923076923
Q-value  [0, 8, 0, 0, 1, 1]: -686.972972972973
Q-value  [0, 8, 0, 0, 0, 1]: -668.0
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Q-diff: 268.5
Pi Prob  [0, 8, 0, 0, 1, 0]: 0.9230712328689098
Pi Prob  [0, 8, 0, 0, 1, 1]: 0.038464383565545136
Pi Prob  [0, 8, 0, 0, 0, 1]: 0.038464383565545136
Pi Prob  [0, 8, 0, 0, 0, -1]: 0.0
--GREEDY ACTION: [1, 0]
Pi Prob  [0, 8, 0, 0, 1, 0]: 0.9235783495836553
Pi Prob  [0, 8, 0, 0, 1, 1]: 0.03821082520817238
Pi Prob  [0, 8, 0, 0, 0, 1]: 0.03821082520817238
Pi Prob  [0, 8, 0, 0, 0, -1]: 0.0
Iteration 77
-- Goal Reached. Terminating Episode.
Steps 803
--EPSILON 0.11388664808568191
G[0, 8, 0, 0, 1, 0] -788
self.Returns[0, 8, 0, 0, 1, 0] [-500, -1304, -2565, -863, -192, -151, -366, -216, -132, -298, -310, -77, -899, -281, -338, -92, -418, -72, -225, -414, -35, -119, -454, -213, -573, -3421, -274, -1631, -141, -495, -344, -136, -285, -158, -792, -100, -428, -412, -332, -91, -410, -618, -154, -84, -699, -461, -279, -356, -

Q-value  [0, 8, 0, 0, 1, 0]: -648.1142857142858
Q-value  [0, 8, 0, 0, 1, 1]: -687.5641025641025
Q-value  [0, 8, 0, 0, 0, 1]: -685.025
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Q-diff: 1474.5
Pi Prob  [0, 8, 0, 0, 1, 0]: 0.9255109418717787
Pi Prob  [0, 8, 0, 0, 1, 1]: 0.037244529064110604
Pi Prob  [0, 8, 0, 0, 0, 1]: 0.037244529064110604
Pi Prob  [0, 8, 0, 0, 0, -1]: 0.0
--GREEDY ACTION: [1, 0]
Pi Prob  [0, 8, 0, 0, 1, 0]: 0.9259716083687995
Pi Prob  [0, 8, 0, 0, 1, 1]: 0.03701419581560026
Pi Prob  [0, 8, 0, 0, 0, 1]: 0.03701419581560026
Pi Prob  [0, 8, 0, 0, 0, -1]: 0.0
Iteration 82
-- Goal Reached. Terminating Episode.
Steps 780
--EPSILON 0.1103642513040126
G[0, 8, 0, 0, 1, 0] -765
self.Returns[0, 8, 0, 0, 1, 0] [-500, -1304, -2565, -863, -192, -151, -366, -216, -132, -298, -310, -77, -899, -281, -338, -92, -418, -72, -225, -414, -35, -119, -454, -213, -573, -3421, -274, -1631, -141, -495, -344, -136, -285, -158, -792, -100, -428, -412, -332, -91, -410, -618, -154, -84, -699, -461, -279, -356

Pi Prob  [0, 8, 0, 0, 1, 0]: 0.9281532437651596
Pi Prob  [0, 8, 0, 0, 1, 1]: 0.0359233781174202
Pi Prob  [0, 8, 0, 0, 0, 1]: 0.0359233781174202
Pi Prob  [0, 8, 0, 0, 0, -1]: 0.0
Iteration 87
-- Goal Reached. Terminating Episode.
Steps 223
--EPSILON 0.10714969088698138
G[0, 8, 0, 0, 1, 0] -190
self.Returns[0, 8, 0, 0, 1, 0] [-500, -1304, -2565, -863, -192, -151, -366, -216, -132, -298, -310, -77, -899, -281, -338, -92, -418, -72, -225, -414, -35, -119, -454, -213, -573, -3421, -274, -1631, -141, -495, -344, -136, -285, -158, -792, -100, -428, -412, -332, -91, -410, -618, -154, -84, -699, -461, -279, -356, -63, -186, -4594, -2106, -162, -448, -669, -658, -416, -167, -161, -410, -219, -460, -5026, -1595, -110, -788, -630, -937, -2254, -101, -765, -559, -561, -1041]
self.Returns[0, 8, 0, 0, 1, 0] [-500, -1304, -2565, -863, -192, -151, -366, -216, -132, -298, -310, -77, -899, -281, -338, -92, -418, -72, -225, -414, -35, -119, -454, -213, -573, -3421, -274, -1631, -141, -495, -344, -136, -28

Q-value  [0, 8, 0, 0, 1, 0]: -644.9358974358975
Q-value  [0, 8, 0, 0, 1, 1]: -671.8095238095239
Q-value  [0, 8, 0, 0, 0, 1]: -653.2954545454545
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Q-value  [0, 8, 0, 0, 1, 0]: -658.7721518987341
Q-value  [0, 8, 0, 0, 1, 1]: -664.6279069767442
Q-value  [0, 8, 0, 0, 0, 1]: -653.2954545454545
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Q-diff: 1747.0
Pi Prob  [0, 8, 0, 0, 1, 0]: 0.9297661710686834
Pi Prob  [0, 8, 0, 0, 1, 1]: 0.03511691446565827
Pi Prob  [0, 8, 0, 0, 0, 1]: 0.03511691446565827
Pi Prob  [0, 8, 0, 0, 0, -1]: 0.0
--GREEDY ACTION: [0, 1]
Pi Prob  [0, 8, 0, 0, 1, 0]: 0.034923644343915136
Pi Prob  [0, 8, 0, 0, 1, 1]: 0.034923644343915136
Pi Prob  [0, 8, 0, 0, 0, 1]: 0.9301527113121697
Pi Prob  [0, 8, 0, 0, 0, -1]: 0.0
Iteration 92
-- Goal Reached. Terminating Episode.
Steps 2862
--EPSILON 0.1042005916526391
Q-value  [0, 8, 0, 0, 1, 0]: -658.7721518987341
Q-value  [0, 8, 0, 0, 1, 1]: -664.6279069767442
Q-value  [0, 8, 0, 0, 0, 1]: -653.2954545454545
Q-value 

Q-value  [0, 8, 0, 0, 1, 0]: -655.0361445783133
Q-value  [0, 8, 0, 0, 1, 1]: -694.0
Q-value  [0, 8, 0, 0, 0, 1]: -705.3695652173913
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Q-diff: 1169.0
Pi Prob  [0, 8, 0, 0, 1, 0]: 0.9316374140392014
Pi Prob  [0, 8, 0, 0, 1, 1]: 0.03418129298039929
Pi Prob  [0, 8, 0, 0, 0, 1]: 0.03418129298039929
Pi Prob  [0, 8, 0, 0, 0, -1]: 0.0
--GREEDY ACTION: [1, 0]
Pi Prob  [0, 8, 0, 0, 1, 0]: 0.9319940288135832
Pi Prob  [0, 8, 0, 0, 1, 1]: 0.03400298559320838
Pi Prob  [0, 8, 0, 0, 0, 1]: 0.03400298559320838
Pi Prob  [0, 8, 0, 0, 0, -1]: 0.0
Iteration 97
-- Goal Reached. Terminating Episode.
Steps 72
--EPSILON 0.10148231951452123
G[0, 8, 0, 0, 1, 0] -72
self.Returns[0, 8, 0, 0, 1, 0] [-500, -1304, -2565, -863, -192, -151, -366, -216, -132, -298, -310, -77, -899, -281, -338, -92, -418, -72, -225, -414, -35, -119, -454, -213, -573, -3421, -274, -1631, -141, -495, -344, -136, -285, -158, -792, -100, -428, -412, -332, -91, -410, -618, -154, -84, -699, -461, -279, -356, -63

-- Goal Reached. Terminating Episode.
Steps 5440
--EPSILON 0.09945449630266603
G[0, 8, 0, 0, 1, 0] -5427
self.Returns[0, 8, 0, 0, 1, 0] [-500, -1304, -2565, -863, -192, -151, -366, -216, -132, -298, -310, -77, -899, -281, -338, -92, -418, -72, -225, -414, -35, -119, -454, -213, -573, -3421, -274, -1631, -141, -495, -344, -136, -285, -158, -792, -100, -428, -412, -332, -91, -410, -618, -154, -84, -699, -461, -279, -356, -63, -186, -4594, -2106, -162, -448, -669, -658, -416, -167, -161, -410, -219, -460, -5026, -1595, -110, -788, -630, -937, -2254, -101, -765, -559, -561, -1041, -190, -854, -814, -153, -1738, -130, -1239, -133, -823, -72, -210, -1616, -811]
self.Returns[0, 8, 0, 0, 1, 0] [-500, -1304, -2565, -863, -192, -151, -366, -216, -132, -298, -310, -77, -899, -281, -338, -92, -418, -72, -225, -414, -35, -119, -454, -213, -573, -3421, -274, -1631, -141, -495, -344, -136, -285, -158, -792, -100, -428, -412, -332, -91, -410, -618, -154, -84, -699, -461, -279, -356, -63, -186, -4594, 

Q-value  [0, 8, 0, 0, 1, 0]: -737.4065934065934
Q-value  [0, 8, 0, 0, 1, 1]: -773.2549019607843
Q-value  [0, 8, 0, 0, 0, 1]: -794.2352941176471
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Q-diff: 374.0
Pi Prob  [0, 8, 0, 0, 1, 0]: 0.9349709539797562
Pi Prob  [0, 8, 0, 0, 1, 1]: 0.03251452301012192
Pi Prob  [0, 8, 0, 0, 0, 1]: 0.03251452301012192
Pi Prob  [0, 8, 0, 0, 0, -1]: 0.0
--GREEDY ACTION: [1, 0]
Pi Prob  [0, 8, 0, 0, 1, 0]: 0.9352781311627115
Pi Prob  [0, 8, 0, 0, 1, 1]: 0.03236093441864427
Pi Prob  [0, 8, 0, 0, 0, 1]: 0.03236093441864427
Pi Prob  [0, 8, 0, 0, 0, -1]: 0.0
Iteration 107
-- Goal Reached. Terminating Episode.
Steps 322
--EPSILON 0.09662850594207474
G[0, 8, 0, 0, 1, 0] -322
self.Returns[0, 8, 0, 0, 1, 0] [-500, -1304, -2565, -863, -192, -151, -366, -216, -132, -298, -310, -77, -899, -281, -338, -92, -418, -72, -225, -414, -35, -119, -454, -213, -573, -3421, -274, -1631, -141, -495, -344, -136, -285, -158, -792, -100, -428, -412, -332, -91, -410, -618, -154, -84, -699, -461, -

Q-value  [0, 8, 0, 0, 1, 0]: -721.0631578947368
Q-value  [0, 8, 0, 0, 1, 1]: -752.2641509433962
Q-value  [0, 8, 0, 0, 0, 1]: -780.8269230769231
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Q-diff: 785.0
Pi Prob  [0, 8, 0, 0, 1, 0]: 0.9361741855031128
Pi Prob  [0, 8, 0, 0, 1, 1]: 0.03191290724844361
Pi Prob  [0, 8, 0, 0, 0, 1]: 0.03191290724844361
Pi Prob  [0, 8, 0, 0, 0, -1]: 0.0
--GREEDY ACTION: [1, 0]
Pi Prob  [0, 8, 0, 0, 1, 0]: 0.9364647005046337
Pi Prob  [0, 8, 0, 0, 1, 1]: 0.03176764974768314
Pi Prob  [0, 8, 0, 0, 0, 1]: 0.03176764974768314
Pi Prob  [0, 8, 0, 0, 0, -1]: 0.0
Iteration 111
-- Goal Reached. Terminating Episode.
Steps 5626
--EPSILON 0.09487307357732752
G[0, 8, 0, 0, 1, 0] -5542
self.Returns[0, 8, 0, 0, 1, 0] [-500, -1304, -2565, -863, -192, -151, -366, -216, -132, -298, -310, -77, -899, -281, -338, -92, -418, -72, -225, -414, -35, -119, -454, -213, -573, -3421, -274, -1631, -141, -495, -344, -136, -285, -158, -792, -100, -428, -412, -332, -91, -410, -618, -154, -84, -699, -461,

Q-value  [0, 8, 0, 0, 1, 0]: -765.7777777777778
Q-value  [0, 8, 0, 0, 1, 1]: -831.1636363636363
Q-value  [0, 8, 0, 0, 0, 1]: -859.7222222222222
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Q-diff: 827.5
Pi Prob  [0, 8, 0, 0, 1, 0]: 0.9375883134409508
Pi Prob  [0, 8, 0, 0, 1, 1]: 0.03120584327952459
Pi Prob  [0, 8, 0, 0, 0, 1]: 0.03120584327952459
Pi Prob  [0, 8, 0, 0, 0, -1]: 0.0
--GREEDY ACTION: [1, 0]
Pi Prob  [0, 8, 0, 0, 1, 0]: 0.9378600242939295
Pi Prob  [0, 8, 0, 0, 1, 1]: 0.031069987853035275
Pi Prob  [0, 8, 0, 0, 0, 1]: 0.031069987853035275
Pi Prob  [0, 8, 0, 0, 0, -1]: 0.0
Iteration 116
-- Goal Reached. Terminating Episode.
Steps 674
--EPSILON 0.09280767439828409
G[0, 8, 0, 0, 1, 0] -674
self.Returns[0, 8, 0, 0, 1, 0] [-500, -1304, -2565, -863, -192, -151, -366, -216, -132, -298, -310, -77, -899, -281, -338, -92, -418, -72, -225, -414, -35, -119, -454, -213, -573, -3421, -274, -1631, -141, -495, -344, -136, -285, -158, -792, -100, -428, -412, -332, -91, -410, -618, -154, -84, -699, -461,

Q-value  [0, 8, 0, 0, 1, 0]: -767.7669902912621
Q-value  [0, 8, 0, 0, 1, 1]: -832.625
Q-value  [0, 8, 0, 0, 0, 1]: -856.7017543859649
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Q-diff: 2257.0
Pi Prob  [0, 8, 0, 0, 1, 0]: 0.9386543472225926
Pi Prob  [0, 8, 0, 0, 1, 1]: 0.030672826388703712
Pi Prob  [0, 8, 0, 0, 0, 1]: 0.030672826388703712
Pi Prob  [0, 8, 0, 0, 0, -1]: 0.0
--GREEDY ACTION: [1, 0]
Pi Prob  [0, 8, 0, 0, 1, 0]: 0.9389124285216534
Pi Prob  [0, 8, 0, 0, 1, 1]: 0.030543785739173362
Pi Prob  [0, 8, 0, 0, 0, 1]: 0.030543785739173362
Pi Prob  [0, 8, 0, 0, 0, -1]: 0.0
Iteration 120
-- Goal Reached. Terminating Episode.
Steps 159
--EPSILON 0.09124908038499573
G[0, 8, 0, 0, 1, 0] -149
self.Returns[0, 8, 0, 0, 1, 0] [-500, -1304, -2565, -863, -192, -151, -366, -216, -132, -298, -310, -77, -899, -281, -338, -92, -418, -72, -225, -414, -35, -119, -454, -213, -573, -3421, -274, -1631, -141, -495, -344, -136, -285, -158, -792, -100, -428, -412, -332, -91, -410, -618, -154, -84, -699, -461, -279, 

Q-value  [0, 8, 0, 0, 1, 0]: -815.7850467289719
Q-value  [0, 8, 0, 0, 1, 1]: -892.7758620689655
Q-value  [0, 8, 0, 0, 0, 1]: -927.95
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Q-diff: 5491.0
Pi Prob  [0, 8, 0, 0, 1, 0]: 0.9396675573177776
Pi Prob  [0, 8, 0, 0, 1, 1]: 0.030166221341111157
Pi Prob  [0, 8, 0, 0, 0, 1]: 0.030166221341111157
Pi Prob  [0, 8, 0, 0, 0, -1]: 0.0
--GREEDY ACTION: [1, 0]
Pi Prob  [0, 8, 0, 0, 1, 0]: 0.9399131116240231
Pi Prob  [0, 8, 0, 0, 1, 1]: 0.030043444187988433
Pi Prob  [0, 8, 0, 0, 0, 1]: 0.030043444187988433
Pi Prob  [0, 8, 0, 0, 0, -1]: 0.0
Iteration 124
-- Goal Reached. Terminating Episode.
Steps 196
--EPSILON 0.08976646215396372
G[0, 8, 0, 0, 1, 0] -105
self.Returns[0, 8, 0, 0, 1, 0] [-500, -1304, -2565, -863, -192, -151, -366, -216, -132, -298, -310, -77, -899, -281, -338, -92, -418, -72, -225, -414, -35, -119, -454, -213, -573, -3421, -274, -1631, -141, -495, -344, -136, -285, -158, -792, -100, -428, -412, -332, -91, -410, -618, -154, -84, -699, -461, -279, -

Q-value  [0, 8, 0, 0, 1, 0]: -798.5765765765766
Q-value  [0, 8, 0, 0, 1, 1]: -892.7758620689655
Q-value  [0, 8, 0, 0, 0, 1]: -919.9508196721312
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Q-diff: 304.0999999999999
Pi Prob  [0, 8, 0, 0, 1, 0]: 0.9406321668997947
Pi Prob  [0, 8, 0, 0, 1, 1]: 0.02968391655010266
Pi Prob  [0, 8, 0, 0, 0, 1]: 0.02968391655010266
Pi Prob  [0, 8, 0, 0, 0, -1]: 0.0
--GREEDY ACTION: [1, 0]
Pi Prob  [0, 8, 0, 0, 1, 0]: 0.940866175824322
Pi Prob  [0, 8, 0, 0, 1, 1]: 0.029566912087839018
Pi Prob  [0, 8, 0, 0, 0, 1]: 0.029566912087839018
Pi Prob  [0, 8, 0, 0, 0, -1]: 0.0
Iteration 128
-- Goal Reached. Terminating Episode.
Steps 824
--EPSILON 0.08835384116734371
G[0, 8, 0, 0, 1, 0] -785
self.Returns[0, 8, 0, 0, 1, 0] [-500, -1304, -2565, -863, -192, -151, -366, -216, -132, -298, -310, -77, -899, -281, -338, -92, -418, -72, -225, -414, -35, -119, -454, -213, -573, -3421, -274, -1631, -141, -495, -344, -136, -285, -158, -792, -100, -428, -412, -332, -91, -410, -618, -154, -84, 

Q-value  [0, 8, 0, 0, 1, 0]: -785.1739130434783
Q-value  [0, 8, 0, 0, 1, 1]: -892.7758620689655
Q-value  [0, 8, 0, 0, 0, 1]: -902.7301587301587
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Q-diff: 1443.0
Pi Prob  [0, 8, 0, 0, 1, 0]: 0.9415519410807732
Pi Prob  [0, 8, 0, 0, 1, 1]: 0.029224029459613363
Pi Prob  [0, 8, 0, 0, 0, 1]: 0.029224029459613363
Pi Prob  [0, 8, 0, 0, 0, -1]: 0.0
--GREEDY ACTION: [1, 0]
Pi Prob  [0, 8, 0, 0, 1, 0]: 0.9417752818246943
Pi Prob  [0, 8, 0, 0, 1, 1]: 0.029112359087652875
Pi Prob  [0, 8, 0, 0, 0, 1]: 0.029112359087652875
Pi Prob  [0, 8, 0, 0, 0, -1]: 0.0
Iteration 132
-- Goal Reached. Terminating Episode.
Steps 72
--EPSILON 0.08700587744553888
G[0, 8, 0, 0, 1, 0] -72
self.Returns[0, 8, 0, 0, 1, 0] [-500, -1304, -2565, -863, -192, -151, -366, -216, -132, -298, -310, -77, -899, -281, -338, -92, -418, -72, -225, -414, -35, -119, -454, -213, -573, -3421, -274, -1631, -141, -495, -344, -136, -285, -158, -792, -100, -428, -412, -332, -91, -410, -618, -154, -84, -699, -461

Pi Prob  [0, 8, 0, 0, 1, 0]: 0.9426437080837097
Pi Prob  [0, 8, 0, 0, 1, 1]: 0.02867814595814516
Pi Prob  [0, 8, 0, 0, 0, 1]: 0.02867814595814516
Pi Prob  [0, 8, 0, 0, 0, -1]: 0.0
Iteration 136
-- Goal Reached. Terminating Episode.
Steps 230
--EPSILON 0.0857177844707708
G[0, 8, 0, 0, 1, 0] -196
self.Returns[0, 8, 0, 0, 1, 0] [-500, -1304, -2565, -863, -192, -151, -366, -216, -132, -298, -310, -77, -899, -281, -338, -92, -418, -72, -225, -414, -35, -119, -454, -213, -573, -3421, -274, -1631, -141, -495, -344, -136, -285, -158, -792, -100, -428, -412, -332, -91, -410, -618, -154, -84, -699, -461, -279, -356, -63, -186, -4594, -2106, -162, -448, -669, -658, -416, -167, -161, -410, -219, -460, -5026, -1595, -110, -788, -630, -937, -2254, -101, -765, -559, -561, -1041, -190, -854, -814, -153, -1738, -130, -1239, -133, -823, -72, -210, -1616, -811, -5427, -1231, -2965, -404, -322, -169, -209, -697, -5542, -267, -1340, -162, -674, -163, -212, -2219, -149, -291, -2244, -5525, -105, -573, -534,

Q-value  [0, 8, 0, 0, 1, 0]: -773.2377049180328
Q-value  [0, 8, 0, 0, 1, 1]: -919.3728813559322
Q-value  [0, 8, 0, 0, 0, 1]: -891.8333333333334
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Q-diff: 675.0
Pi Prob  [0, 8, 0, 0, 1, 0]: 0.9432701151999128
Pi Prob  [0, 8, 0, 0, 1, 1]: 0.028364942400043624
Pi Prob  [0, 8, 0, 0, 0, 1]: 0.028364942400043624
Pi Prob  [0, 8, 0, 0, 0, -1]: 0.0
--GREEDY ACTION: [1, 0]
Pi Prob  [0, 8, 0, 0, 1, 0]: 0.9434744006483853
Pi Prob  [0, 8, 0, 0, 1, 1]: 0.028262799675807374
Pi Prob  [0, 8, 0, 0, 0, 1]: 0.028262799675807374
Pi Prob  [0, 8, 0, 0, 0, -1]: 0.0
Iteration 140
-- Goal Reached. Terminating Episode.
Steps 12
--EPSILON 0.08448525755274976
Q-value  [0, 8, 0, 0, 1, 0]: -773.2377049180328
Q-value  [0, 8, 0, 0, 1, 1]: -919.3728813559322
Q-value  [0, 8, 0, 0, 0, 1]: -891.8333333333334
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Q-value  [0, 8, 0, 0, 1, 0]: -773.2377049180328
Q-value  [0, 8, 0, 0, 1, 1]: -919.3728813559322
Q-value  [0, 8, 0, 0, 0, 1]: -891.8333333333334
Q-value

Pi Prob  [0, 8, 0, 0, 1, 0]: 0.9442700152693867
Pi Prob  [0, 8, 0, 0, 1, 1]: 0.02786499236530662
Pi Prob  [0, 8, 0, 0, 0, 1]: 0.02786499236530662
Pi Prob  [0, 8, 0, 0, 0, -1]: 0.0
Iteration 144
-- Goal Reached. Terminating Episode.
Steps 31
--EPSILON 0.0833044132098411
Q-value  [0, 8, 0, 0, 1, 0]: -757.56
Q-value  [0, 8, 0, 0, 1, 1]: -919.3728813559322
Q-value  [0, 8, 0, 0, 0, 1]: -878.9850746268656
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Q-value  [0, 8, 0, 0, 1, 0]: -757.56
Q-value  [0, 8, 0, 0, 1, 1]: -919.3728813559322
Q-value  [0, 8, 0, 0, 0, 1]: -878.9850746268656
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Q-diff: 105.23333333333335
Pi Prob  [0, 8, 0, 0, 1, 0]: 0.9442700152693867
Pi Prob  [0, 8, 0, 0, 1, 1]: 0.02786499236530662
Pi Prob  [0, 8, 0, 0, 0, 1]: 0.02786499236530662
Pi Prob  [0, 8, 0, 0, 0, -1]: 0.0
--GREEDY ACTION: [1, 0]
Pi Prob  [0, 8, 0, 0, 1, 0]: 0.9444637245267726
Pi Prob  [0, 8, 0, 0, 1, 1]: 0.0277681377366137
Pi Prob  [0, 8, 0, 0, 0, 1]: 0.0277681377366137
Pi Prob  [0, 8, 0, 0,

Q-value  [0, 8, 0, 0, 1, 0]: -742.34375
Q-value  [0, 8, 0, 0, 1, 1]: -919.3728813559322
Q-value  [0, 8, 0, 0, 0, 1]: -878.9850746268656
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Q-diff: 819.3333333333333
Pi Prob  [0, 8, 0, 0, 1, 0]: 0.9452188415878414
Pi Prob  [0, 8, 0, 0, 1, 1]: 0.027390579206079315
Pi Prob  [0, 8, 0, 0, 0, 1]: 0.027390579206079315
Pi Prob  [0, 8, 0, 0, 0, -1]: 0.0
--GREEDY ACTION: [1, 0]
Pi Prob  [0, 8, 0, 0, 1, 0]: 0.9454028567480808
Pi Prob  [0, 8, 0, 0, 1, 1]: 0.0272985716259596
Pi Prob  [0, 8, 0, 0, 0, 1]: 0.0272985716259596
Pi Prob  [0, 8, 0, 0, 0, -1]: 0.0
Iteration 150
-- Goal Reached. Terminating Episode.
Steps 256
--EPSILON 0.08162245514079562
G[0, 8, 0, 0, 1, 0] -256
self.Returns[0, 8, 0, 0, 1, 0] [-500, -1304, -2565, -863, -192, -151, -366, -216, -132, -298, -310, -77, -899, -281, -338, -92, -418, -72, -225, -414, -35, -119, -454, -213, -573, -3421, -274, -1631, -141, -495, -344, -136, -285, -158, -792, -100, -428, -412, -332, -91, -410, -618, -154, -84, -699, -46

Q-value  [0, 8, 0, 0, 1, 0]: -724.4772727272727
Q-value  [0, 8, 0, 0, 1, 1]: -919.3728813559322
Q-value  [0, 8, 0, 0, 0, 1]: -866.4264705882352
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Q-diff: 308.5
Pi Prob  [0, 8, 0, 0, 1, 0]: 0.9459439716210534
Pi Prob  [0, 8, 0, 0, 1, 1]: 0.02702801418947333
Pi Prob  [0, 8, 0, 0, 0, 1]: 0.02702801418947333
Pi Prob  [0, 8, 0, 0, 0, -1]: 0.0
--GREEDY ACTION: [1, 0]
Pi Prob  [0, 8, 0, 0, 1, 0]: 0.9461207991422341
Pi Prob  [0, 8, 0, 0, 1, 1]: 0.02693960042888297
Pi Prob  [0, 8, 0, 0, 0, 1]: 0.02693960042888297
Pi Prob  [0, 8, 0, 0, 0, -1]: 0.0
Iteration 154
-- Goal Reached. Terminating Episode.
Steps 34
--EPSILON 0.0805561460541608
G[0, 8, 0, 0, 1, 0] -34
self.Returns[0, 8, 0, 0, 1, 0] [-500, -1304, -2565, -863, -192, -151, -366, -216, -132, -298, -310, -77, -899, -281, -338, -92, -418, -72, -225, -414, -35, -119, -454, -213, -573, -3421, -274, -1631, -141, -495, -344, -136, -285, -158, -792, -100, -428, -412, -332, -91, -410, -618, -154, -84, -699, -461, -279

Q-value  [0, 8, 0, 0, 1, 0]: -708.9629629629629
Q-value  [0, 8, 0, 0, 1, 1]: -919.3728813559322
Q-value  [0, 8, 0, 0, 0, 1]: -866.4264705882352
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Q-diff: 110.85000000000002
Pi Prob  [0, 8, 0, 0, 1, 0]: 0.9466410482199087
Pi Prob  [0, 8, 0, 0, 1, 1]: 0.026679475890045667
Pi Prob  [0, 8, 0, 0, 0, 1]: 0.026679475890045667
Pi Prob  [0, 8, 0, 0, 0, -1]: 0.0
--GREEDY ACTION: [1, 0]
Pi Prob  [0, 8, 0, 0, 1, 0]: 0.9468111441317768
Pi Prob  [0, 8, 0, 0, 1, 1]: 0.026594427934111597
Pi Prob  [0, 8, 0, 0, 0, 1]: 0.026594427934111597
Pi Prob  [0, 8, 0, 0, 0, -1]: 0.0
Iteration 158
-- Goal Reached. Terminating Episode.
Steps 128
--EPSILON 0.07953056449841825
G[0, 8, 0, 0, 1, 0] -115
self.Returns[0, 8, 0, 0, 1, 0] [-500, -1304, -2565, -863, -192, -151, -366, -216, -132, -298, -310, -77, -899, -281, -338, -92, -418, -72, -225, -414, -35, -119, -454, -213, -573, -3421, -274, -1631, -141, -495, -344, -136, -285, -158, -792, -100, -428, -412, -332, -91, -410, -618, -154, -

Q-value  [0, 8, 0, 0, 1, 0]: -695.0942028985507
Q-value  [0, 8, 0, 0, 1, 1]: -919.3728813559322
Q-value  [0, 8, 0, 0, 0, 1]: -866.4264705882352
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Q-diff: 361.5
Pi Prob  [0, 8, 0, 0, 1, 0]: 0.9474756156392149
Pi Prob  [0, 8, 0, 0, 1, 1]: 0.026262192180392524
Pi Prob  [0, 8, 0, 0, 0, 1]: 0.026262192180392524
Pi Prob  [0, 8, 0, 0, 0, -1]: 0.0
--GREEDY ACTION: [1, 0]
Pi Prob  [0, 8, 0, 0, 1, 0]: 0.947637878567463
Pi Prob  [0, 8, 0, 0, 1, 1]: 0.02618106071626854
Pi Prob  [0, 8, 0, 0, 0, 1]: 0.02618106071626854
Pi Prob  [0, 8, 0, 0, 0, -1]: 0.0
Iteration 163
-- Goal Reached. Terminating Episode.
Steps 129
--EPSILON 0.07830202965406481
Q-value  [0, 8, 0, 0, 1, 0]: -695.0942028985507
Q-value  [0, 8, 0, 0, 1, 1]: -919.3728813559322
Q-value  [0, 8, 0, 0, 0, 1]: -866.4264705882352
Q-value  [0, 8, 0, 0, 0, -1]: 0.0
Q-value  [0, 8, 0, 0, 1, 0]: -695.0942028985507
Q-value  [0, 8, 0, 0, 1, 1]: -919.3728813559322
Q-value  [0, 8, 0, 0, 0, 1]: -854.7246376811594
Q-value  

In [117]:
rt.q[0,8,0,0,:, :]

array([[   0.        , -854.72463768,    0.        ],
       [-695.0942029 , -919.37288136,    0.        ],
       [   0.        ,    0.        ,    0.        ]])

In [129]:
state = [0, 8, 0, 0]
for i in range(20):
    print('State:', state)
    coords = get_greedy_action(matrix=rt.q, state=state)
    print('Action:', coords)
    state[0] += state[2] + coords[0]
    state[1] += state[3] + coords[1]
    state[2] += coords[0]
    state[3] += coords[1]
    print('New state:', state)
    

State: [0, 8, 0, 0]
Action: (1, 0)
New state: [1, 8, 1, 0]
State: [1, 8, 1, 0]
Action: (0, 0)
New state: [2, 8, 1, 0]
State: [2, 8, 1, 0]
Action: (1, 0)
New state: [4, 8, 2, 0]
State: [4, 8, 2, 0]
Action: (2, 1)
New state: [8, 9, 4, 1]
State: [8, 9, 4, 1]
Action: (0, 0)
New state: [12, 10, 4, 1]
State: [12, 10, 4, 1]
Action: (0, 0)
New state: [16, 11, 4, 1]
State: [16, 11, 4, 1]
Action: (0, 0)
New state: [20, 12, 4, 1]
State: [20, 12, 4, 1]
Action: (0, 0)
New state: [24, 13, 4, 1]
State: [24, 13, 4, 1]
Action: (0, 0)
New state: [28, 14, 4, 1]
State: [28, 14, 4, 1]
Action: (0, 0)
New state: [32, 15, 4, 1]
State: [32, 15, 4, 1]


IndexError: index 32 is out of bounds for axis 0 with size 32

In [None]:
def draw(self, car_cell=None, path=[]):
        colors = ['black', 'white', 'yellow', 'red']
        
        fig=plt.figure(figsize=(10, 10), dpi=80, facecolor='w', edgecolor='k')
        
        im = plt.imshow(self.track, cmap=ListedColormap(colors), origin='lower', interpolation='none', animated=True)
        
        def rect(pos, edgecolor='k', facecolor='none'):
            r = plt.Rectangle(pos, 1,1, facecolor=facecolor, edgecolor=edgecolor, linewidth=2)
            plt.gca().add_patch(r)
            
        for i in range(self.track.shape[0]):
            for j in range(self.track.shape[1]):
                rect((j-0.5,i-0.5))
                
        if path:
            for cell in path:
                rect((cell[1]-0.5, cell[0]-0.5), edgecolor='g')
        
        if car_cell:
            rect((car_cell[1]-0.5, car_cell[0]-0.5), edgecolor='g', facecolor='g')
            
        plt.gca().invert_yaxis()
        return im

In [126]:
def get_greedy_action(matrix, state):
    val = -100000
    coords = (0, 0)
    for i in range(rt.q[state[0],state[1],state[2],state[3],:, :].shape[0]):
        for j in range(rt.q[state[0],state[1],state[2],state[3],:, :].shape[1]):
            if rt.q[state[0],state[1],state[2],state[3], i, j] < 0:
                temp = rt.q[state[0],state[1],state[2],state[3], i, j]
                if temp > val:
                    val = temp
                    coords = (i, j)
                    
    return coords
                    
                
            

In [124]:
rt.q[0,8,0,0,:, :].shape

(3, 3)

In [935]:
rt.generate_episode()

-- Step 0
-- Position [0, 4, 0, 0]
-- Action [0, 1]
-- New position [0, 5, 0, 1]
-- Step 1
-- Position [0, 5, 0, 1]
-- Action [0, 1]
-- New position [0, 7, 0, 2]
-- Step 2
-- Position [0, 7, 0, 2]
-- Action [0, -1]
-- New position [0, 8, 0, 1]
-- Step 3
-- Position [0, 8, 0, 1]
-- Action [0, 1]
-- New position [0, 10, 0, 2]
Hit the track boundery !
-- Step 4
-- Position [0, 4, 0, 0]
-- Action [1, 0]
-- New position [1, 4, 1, 0]
-- Step 5
-- Position [1, 4, 1, 0]
-- Action [1, 0]
-- New position [3, 4, 2, 0]
-- Step 6
-- Position [3, 4, 2, 0]
-- Action [0, 0]
-- New position [5, 4, 2, 0]
-- Step 7
-- Position [5, 4, 2, 0]
-- Action [1, 0]
-- New position [8, 4, 3, 0]
-- Step 8
-- Position [8, 4, 3, 0]
-- Action [-1, 0]
-- New position [10, 4, 2, 0]
-- Step 9
-- Position [10, 4, 2, 0]
-- Action [-1, 0]
-- New position [11, 4, 1, 0]
-- Step 10
-- Position [11, 4, 1, 0]
-- Action [0, 1]
-- New position [12, 5, 1, 1]
-- Step 11
-- Position [12, 5, 1, 1]
-- Action [0, -1]
-- New position [13

{'[0, 4, 0, 0, 0, 1]': -173,
 '[0, 5, 0, 1, 0, 1]': -172,
 '[0, 7, 0, 2, 0, -1]': -171,
 '[0, 8, 0, 1, 0, 1]': -170,
 '[0, 4, 0, 0, 1, 0]': -169,
 '[1, 4, 1, 0, 1, 0]': -168,
 '[3, 4, 2, 0, 0, 0]': -167,
 '[5, 4, 2, 0, 1, 0]': -166,
 '[8, 4, 3, 0, -1, 0]': -165,
 '[10, 4, 2, 0, -1, 0]': -164,
 '[11, 4, 1, 0, 0, 1]': -163,
 '[12, 5, 1, 1, 0, -1]': -162,
 '[13, 5, 1, 0, -1, 1]': -161,
 '[13, 6, 0, 1, 0, 0]': -160,
 '[13, 7, 0, 1, 1, 0]': -159,
 '[14, 8, 1, 1, 0, 1]': -158,
 '[0, 8, 0, 0, 0, 1]': -157,
 '[0, 7, 0, 0, 1, 0]': -156,
 '[1, 7, 1, 0, 0, 0]': -155,
 '[2, 7, 1, 0, 1, 0]': -154,
 '[4, 7, 2, 0, 1, 0]': -153,
 '[7, 7, 3, 0, 1, 0]': -152,
 '[11, 7, 4, 0, -1, 0]': -151,
 '[14, 7, 3, 0, 1, 0]': -150,
 '[18, 7, 4, 0, 1, 0]': -149,
 '[23, 7, 5, 0, 0, 0]': -148,
 '[28, 7, 5, 0, 0, 1]': -147,
 '[0, 5, 0, 0, 1, 0]': -146,
 '[1, 5, 1, 0, 0, 0]': -145,
 '[2, 5, 1, 0, 1, 1]': -144,
 '[4, 6, 2, 1, 1, -1]': -143,
 '[7, 6, 3, 0, -1, 1]': -142,
 '[9, 7, 2, 1, 0, 1]': -141,
 '[0, 7, 0, 0, 1, 1]': 

In [761]:
dict_test['a'].append(1)

In [763]:
dict_test.keys()

dict_keys(['a'])

In [567]:
a = rt.get_all_cell_values_in_projected_retcangle([29, 12],  [31, 15])

In [569]:
rt.check_if_goal_is_reached(a)

True

In [582]:
G = defaultdict(list)

In [581]:
G[str(str([1, 1, 0, 1]), str([0, 0]))] = 1

TypeError: decoding str is not supported

In [588]:
str(str([1, 1, 0, 1] + [0, 0]))

'[1, 1, 0, 1, 0, 0]'

In [514]:
a = rt.random_start_position()

In [520]:
b =  rt.sample_action_from_state([a[0], a[1], a[2], a[3]])

In [560]:
rt.track

array([[0., 0., 0., 3., 3., 3., 3., 3., 3., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0.,

In [543]:
a = get_all_cell_values_in_projected_retcangle([29, 14], [34, 16])

In [525]:
a = [29, 14]

In [527]:
b = [34, 16]

In [542]:
def get_all_cell_values_in_projected_retcangle(current_state, new_state):
    """
    """
    y_coord_current = current_state[0]
    x_coord_current = current_state[1]
    
    y_diff = new_state[0] - current_state[0]
    x_diff = new_state[1] - current_state[1]
    
    return [[y_coord_current + y, x_coord_current + x] for y in range(0, y_diff + 1) for x in range(0, x_diff + 1)]


In [553]:
def check_if_goal_is_reached(check_grid_states):
    """
    """
    
    grid_values = []
    
    for y, x in check_grid_states:
        if y <= rt.track.shape[0] - 1 and x <= rt.track.shape[1] - 1:
            grid_values.append(rt.track[y, x])
    
    if 2 in grid_values:
        return True
    else:
        return False

In [499]:
rt.track.shape

(32, 16)

In [None]:
# If I'm in e.g. 1, 14, -2, 2
# And takes action -1, 0

# New position -2, 16, -3, 2


In [540]:
rt.track

array([[0., 0., 0., 3., 3., 3., 3., 3., 3., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0.,

In [147]:
a[0, 1] = random.choice(rt.all_possible_actions(5, 4))

In [None]:
# Apply a Monte Carlo control method to this task to compute 
# the optimal policy from each starting state.

In [22]:
# Track bounderies:

race_track = {'[0, 2]': '[0, 6]',
              '[3, 9]': '[-1, 6]',
              '[10, 17]': '[-2, 6]',
              '[18, 24]':  '[-3, 6]',
              '[25, 25]': '[-3, 7]',
              '[26, 27]': '[-3, 14]',
              '[28, 28]': '[-2,  14]',
              '[29, 30]': '[-1, 14]',
              '[31, 31]': '[0, 14]'}


In [23]:
# States:
states = []
for key in race_track.keys():
    for x_coord in range(eval(key)[0], eval(key)[1] + 1):
        for y_coord in range(eval(race_track[key])[0], eval(race_track[key])[1] + 1):
            states.append([x_coord, y_coord])
            
# Start states:
start_states = [[0, 0],
                [0, 1],
                [0, 2],
                [0, 3],
                [0, 4],
                [0, 5],
                [0, 6]]

# End states:
end_states = [[x, 14] for x in range(26, 32)]

In [51]:
class RaceTrackingMDP:
    def __init__(self,
                 states,
                 start_states,
                 end_states,
                 zero_velocity_prob=0.9,
                 velocity_max=4,
                 speed_decrease_limit=-1,
                 speed_increase_limit=1):
        self.states = states
        self.start_states = start_states
        self.end_states = end_states
        self.zero_velocity_prob = zero_velocity_prob
        self.velocity_max = velocity_max
        self.speed_decrease_limit = speed_decrease_limit
        self.speed_increase_limit = speed_increase_limit
        
        self.q_values = [0 for x in self.states for a in range(0, 9 + 1)]
        self.returns = [[] for x in self.states for a in range(0, 9 + 1)]
        self.policies = [[1/9] * 6 for x in self.states for a in range(0, 9 + 1)]
        
    def actions(self, x_coord, y_coord):
        """Iterator over all actions"""
        
        for a1 in range(self.speed_decrease_limit, self.speed_increase_limit + 1):
            for a2 in range(self.speed_decrease_limit, self.speed_increase_limit + 1):
                yield a1 + x_coord, a2 + y_coord
                
    def 
                
    def policy_iteration(self):
        """Iterates over policies"""
        
        for iteration in range(0, 10):
            

In [52]:
mdp = RaceTrackingMDP(states=states, start_states=start_states, end_states=end_states)

In [16]:
a = np.zeros((2, 4, 5))


In [17]:
a

array([[[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]],

       [[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]]])