# Assignment 1 Q4 Specification: Reinforcement Learning - Taxi Environment

In [1]:
!python -m pip install gym
!python -m pip install numpy==1.24.0
!python -m pip install tqdm



In [2]:
import gym
import numpy as np
import importlib.util
import time
from IPython.display import clear_output
import random

In [3]:
# Seed both random sources used by this notebook: NumPy's global generator
# (grid size, station/obstacle placement, epsilon-greedy action selection) and
# the stdlib `random` module (taxi start, passenger and destination choice in
# SimpleTaxiEnv.reset). Seeding only one of them leaves runs non-reproducible.
np.random.seed(42)
random.seed(42)

In [4]:
class SimpleTaxiEnv(gym.Env):
    """Grid-world taxi task with random stations and obstacles.

    Every position in this class is a ``(row, col)`` tuple. Row 0 is the top
    row of the rendered grid, so increasing the row moves "south".

    Actions:
        0 move south (row + 1), 1 move north (row - 1),
        2 move east (col + 1),  3 move west (col - 1),
        4 pick up, 5 drop off.
    """

    def __init__(self, grid_size=10, fuel_limit=5000):
        """
        Custom Taxi environment supporting different grid sizes.

        Args:
            grid_size: side length of the square grid.
            fuel_limit: number of steps allowed before the episode is ended.
        """
        self.grid_size = grid_size
        self.fuel_limit = fuel_limit
        self.current_fuel = fuel_limit
        self.passenger_picked_up = False

        last = self.grid_size - 1
        self.stations = [(0, 0), (0, last), (last, 0), (last, last)]
        self.passenger_loc = None

        self.obstacles = set()  # No obstacles in simple version
        self.destination = None
        self.taxi_pos = None  # assigned by reset()

    def legal(self):
        """Return True when all obstacle-free cells form one 4-connected region.

        Implemented as a union-find over the free cells. The merge direction
        is chosen with ``np.random.randint(2)``, one draw per successful
        merge, so the call advances NumPy's global random state.
        Returns False when there are no free cells at all.
        """
        free_cells = [
            (x, y) for x in range(self.grid_size) for y in range(self.grid_size)
            if (x, y) not in self.obstacles
        ]
        index_of = {cell: i for i, cell in enumerate(free_cells)}
        parent = list(range(len(free_cells)))
        components = len(free_cells)

        def find(i):
            # Locate the root, then compress the path that was walked.
            root = i
            while parent[root] != root:
                root = parent[root]
            while parent[i] != root:
                parent[i], i = root, parent[i]
            return root

        # Each adjacent pair is visited once by looking only "down" and "right".
        for (x, y), i in index_of.items():
            for neighbour in ((x + 1, y), (x, y + 1)):
                j = index_of.get(neighbour)
                if j is None:
                    continue
                root_i, root_j = find(i), find(j)
                if root_i != root_j:
                    components -= 1
                    if np.random.randint(2) == 0:
                        parent[root_i] = root_j
                    else:
                        parent[root_j] = root_i
        return components == 1

    def reset(self, grid_size=None, obstacle_len=None):
        """Reset the environment, ensuring Taxi, passenger, and destination are not overlapping obstacles

        Args:
            grid_size: side length of the grid; drawn from [5, 10] when None.
            obstacle_len: target number of obstacles; drawn from
                [0, grid_size**2) when None. Fewer may be placed, because an
                obstacle that disconnects the free cells is rejected and the
                number of attempts is capped.

        Returns:
            ``(self.get_state(), {})``.

        Raises:
            ValueError: if ``grid_size`` is below 3, where four mutually
                non-adjacent stations cannot exist.
        """
        self.current_fuel = self.fuel_limit
        self.passenger_picked_up = False
        if grid_size is None:
            grid_size = np.random.randint(5, 11)
        if grid_size < 3:
            # The station-placement loop below would never terminate.
            raise ValueError(
                f"grid_size must be at least 3 to place 4 non-adjacent stations, got {grid_size}"
            )
        self.grid_size = grid_size
        self.stations = []
        if obstacle_len is None:
            obstacle_len = np.random.randint(0, grid_size * grid_size)

        # Rejection-sample four stations that neither coincide nor touch.
        while len(self.stations) < 4:
            posx, posy = np.random.randint(0, self.grid_size), np.random.randint(0, self.grid_size)
            touches_existing = any(
                abs(posx - row) + abs(posy - col) <= 1 for row, col in self.stations
            )
            if not touches_existing:
                self.stations.append((posx, posy))

        self.obstacles = set()
        max_attempts = self.grid_size * self.grid_size * 10
        attempts = 0
        while len(self.obstacles) < obstacle_len and attempts < max_attempts:
            cell = (np.random.randint(0, self.grid_size), np.random.randint(0, self.grid_size))
            newly_added = cell not in self.stations and cell not in self.obstacles
            if newly_added:
                self.obstacles.add(cell)
            # legal() is still evaluated on every attempt because it draws from
            # np.random; skipping it would change the seeded layout sequence.
            # Only a cell added in this attempt is ever rolled back.
            if not self.legal() and newly_added:
                self.obstacles.remove(cell)
            attempts += 1

        available_positions = [
            (x, y) for x in range(self.grid_size) for y in range(self.grid_size)
            if (x, y) not in self.stations and (x, y) not in self.obstacles
        ]

        self.taxi_pos = random.choice(available_positions)
        self.passenger_loc = random.choice(self.stations)

        possible_destinations = [s for s in self.stations if s != self.passenger_loc]
        self.destination = random.choice(possible_destinations)
        return self.get_state(), {}

    def step(self, action):
        """Perform an action and update the environment state.

        Rewards: -0.1 per step; -10 for a blocked move, a pick-up away from the
        passenger, or a drop-off that is not a delivery; +50 for dropping the
        passenger at the destination (ends the episode); a further -10 when
        the fuel runs out (also ends the episode).

        Returns:
            ``(state, reward, done, info)`` where ``state`` is the value of
            ``get_state()`` and ``info`` is an empty dict.
        """
        taxi_row, taxi_col = self.taxi_pos
        next_row, next_col = taxi_row, taxi_col
        reward = 0
        if action == 0:  # Move Down
            next_row += 1
        elif action == 1:  # Move Up
            next_row -= 1
        elif action == 2:  # Move Right
            next_col += 1
        elif action == 3:  # Move Left
            next_col -= 1

        if action in [0, 1, 2, 3]:  # Only movement actions should be checked
            inside = 0 <= next_row < self.grid_size and 0 <= next_col < self.grid_size
            if (next_row, next_col) in self.obstacles or not inside:
                reward -= 10
            else:
                self.taxi_pos = (next_row, next_col)
                if self.passenger_picked_up:
                    # A carried passenger travels with the taxi.
                    self.passenger_loc = self.taxi_pos
        elif action == 4:  # PICKUP
            if self.taxi_pos == self.passenger_loc:
                self.passenger_picked_up = True
                self.passenger_loc = self.taxi_pos
            else:
                reward = -10
        elif action == 5:  # DROPOFF
            if self.passenger_picked_up:
                self.passenger_picked_up = False
                self.passenger_loc = self.taxi_pos
                if self.taxi_pos == self.destination:
                    reward += 50
                    # Successful delivery: the episode ends without using fuel.
                    return self.get_state(), reward - 0.1, True, {}
                reward -= 10
            else:
                reward -= 10

        reward -= 0.1

        self.current_fuel -= 1
        if self.current_fuel <= 0:
            return self.get_state(), reward - 10, True, {}

        return self.get_state(), reward, False, {}

    def get_state(self):
        """Return the current environment state.

        Returns:
            ``(state, passenger_picked_up, passenger_loc)`` where ``state`` is
            a 16-tuple: taxi row and col, the row and col of each of the four
            stations, four 0/1 flags for a wall or obstacle to the north,
            south, east and west, and two 0/1 flags telling whether the
            passenger / the destination is on the taxi's cell or one of its
            four neighbours.
        """
        taxi_row, taxi_col = self.taxi_pos
        passenger_row, passenger_col = self.passenger_loc
        destination_row, destination_col = self.destination
        last = self.grid_size - 1

        obstacle_north = int(taxi_row == 0 or (taxi_row - 1, taxi_col) in self.obstacles)
        obstacle_south = int(taxi_row == last or (taxi_row + 1, taxi_col) in self.obstacles)
        obstacle_east = int(taxi_col == last or (taxi_row, taxi_col + 1) in self.obstacles)
        obstacle_west = int(taxi_col == 0 or (taxi_row, taxi_col - 1) in self.obstacles)

        # The taxi's own cell plus its four neighbours.
        in_view = (
            (taxi_row - 1, taxi_col),
            (taxi_row + 1, taxi_col),
            (taxi_row, taxi_col + 1),
            (taxi_row, taxi_col - 1),
            (taxi_row, taxi_col),
        )
        passenger_look = int(self.passenger_loc in in_view)
        destination_look = int(self.destination in in_view)

        station_coords = []
        for k in range(4):
            station_coords += [self.stations[k][0], self.stations[k][1]]

        state = (
            taxi_row, taxi_col, *station_coords,
            obstacle_north, obstacle_south, obstacle_east, obstacle_west,
            passenger_look, destination_look,
        )
        return state, self.passenger_picked_up, self.passenger_loc

    def render_env(self, taxi_pos, action=None, step=None, fuel=None):
        """Clear the notebook output and print the grid with step information.

        Stations are drawn as R/G/B/Y in list order, obstacles as '#', and the
        taxi on top of whatever occupies its cell. All positions are
        ``(row, col)`` and are drawn at ``grid[row][col]``.
        """
        clear_output(wait=True)

        grid = [['.'] * self.grid_size for _ in range(self.grid_size)]

        stations_char = ['R', 'G', 'B', 'Y']
        for label, (row, col) in zip(stations_char, self.stations):
            grid[row][col] = label
        for row, col in self.obstacles:
            grid[row][col] = '#'

        # Place taxi
        ty, tx = taxi_pos
        if 0 <= tx < self.grid_size and 0 <= ty < self.grid_size:
            grid[ty][tx] = '🚖'

        # Print step info
        print(f"\nStep: {step}")
        print(f"Taxi Position: ({tx}, {ty})")
        print(f"Fuel Left: {fuel}")
        print(f"Last Action: {self.get_action_name(action)}\n")

        # Print grid
        for row in grid:
            print(" ".join(row))
        print("\n")

    def get_action_name(self, action):
        """Returns a human-readable action name."""
        actions = ["Move South", "Move North", "Move East", "Move West", "Pick Up", "Drop Off"]
        return actions[action] if action is not None else "None"

In [5]:
from re import M
from collections import defaultdict
from tqdm import tqdm
def tabular_q_learning(env,episodes=7000, alpha=0.1, gamma=0.99,
                       epsilon_start=1.0, epsilon_end=0.1, decay_rate=0.9995):
    """Train a tabular Q-learning agent on ``env`` and return its Q-table.

    Runs ``episodes`` epsilon-greedy training episodes followed by
    ``batch_size`` (100) extra episodes with epsilon forced to 0. During those
    extra episodes the Q-table is not updated, because the update is guarded
    by ``if epsilon``. Every episode resets ``env`` to a 5x5 grid with a
    random obstacle count. Statistics are printed every ``batch_size``
    episodes.

    The Q-table is keyed by the compact state tuple built by the nested
    ``get_state_obs`` rather than by the raw observation.

    Args:
        env: environment whose ``reset(grid_size, obstacle_len)`` returns
            ``((obs, picked_up, passenger_loc), info)`` and whose
            ``step(action)`` returns
            ``((obs, picked_up, passenger_loc), reward, done, info)``.
        episodes: number of training episodes.
        alpha: learning rate of the Q update.
        gamma: discount factor of the Q update.
        epsilon_start: initial exploration probability.
        epsilon_end: currently unused; the clamp that would apply it is
            commented out on the decay line.
        decay_rate: factor epsilon is multiplied by after every episode.

    Returns:
        ``defaultdict`` mapping state tuples to length-6 NumPy arrays of
        action values.

    Raises:
        AssertionError: if the locally tracked ``pickup`` flag disagrees with
            the flag reported by the environment.

    Side effects:
        Rebinds the module-level names ``stations``, ``candidates_p``,
        ``candidates_goal`` and ``pickup``.
    """

    global stations, candidates_p,candidates_goal, pickup
    stations = [[0,0] for _ in range(4)]
    candidates_p = [i for i in stations]
    candidates_goal = [i for i in stations]
    pickup=False
    action_size = 6
    q_table = defaultdict(lambda :np.zeros(action_size))
    pickup_id = 4
    drop_id = 5
    ifpickup=False
    p_loc = (0,0)
    def cmp(a,b):
        """Return 0 if a == b, 1 if a < b, otherwise -1."""
        if a==b:
            return 0
        return 1 if a<b else -1
    def get_state_obs(obs,action,last_action=None):
        """Update the tracked beliefs from ``obs`` and build the Q-table key.

        ``action`` is the action that produced ``obs``; ``None`` marks the
        first observation of an episode and resets the candidate lists and
        the ``pickup`` flag.

        Returns:
            ``(relative_pos, pickup, real_look, obstacle_flags, last_action)``
            where ``relative_pos`` is the per-axis ``cmp`` of the taxi against
            the first remaining candidate (passenger candidates while not
            carrying, destination candidates while carrying).
        """
        global stations,pickup,candidates_p,candidates_goal
        #print(candidates_p)
        # The eight station coordinates are written straight into `stations`.
        taxi_row, taxi_col, stations[0][0], stations[0][1] , stations[1][0], stations[1][1],stations[2][0],stations[2][1],stations[3][0],stations[3][1],obstacle_north, obstacle_south, obstacle_east, obstacle_west, passenger_look, destination_look = obs
        agent_pos = (taxi_row,taxi_col)
        if action==None:
            # initialize
            candidates_goal = [tuple(i) for i in stations]
            candidates_p = [tuple(i) for i in stations]
            pickup=False
        # Keep only the candidates consistent with the look flag: within
        # Manhattan distance 1 of the taxi when the flag is set, farther away
        # when it is not.
        if passenger_look:
            #print('before p',candidates_p)
            candidates_p = [ tuple(x) for x in candidates_p if abs(x[0]-agent_pos[0])+abs(x[1]-agent_pos[1]) <=1 ]
            #print('after p',candidates_p)
        else:
            #print('before p',candidates_p)
            candidates_p = [ tuple(x) for x in candidates_p if abs(x[0]-agent_pos[0])+abs(x[1]-agent_pos[1]) >1 ]
            #print('after p',candidates_p)
        if destination_look:
            #print('before g',candidates_goal)
            candidates_goal = [ tuple(x) for x in candidates_goal if abs(x[0]-agent_pos[0])+abs(x[1]-agent_pos[1]) <=1 ]
            #print('after g',candidates_goal)
        else:
            #print('before g',candidates_goal)
            candidates_goal = [ tuple(x) for x in candidates_goal if abs(x[0]-agent_pos[0])+abs(x[1]-agent_pos[1]) >1 ]
            #print('after g',candidates_goal)
        # A pick-up is counted only on a remaining passenger candidate; a
        # drop while carrying makes the taxi's cell a passenger candidate.
        if action==pickup_id and not pickup and agent_pos in candidates_p:
            pickup = True
            candidates_p = []
        elif action == drop_id and pickup:
            pickup=False
            candidates_p.append(agent_pos)
        cmp_pos = (0,0)
        if not pickup:
            # the first remaining candidate is used; the closest-candidate
            # selection is commented out
            idx = 0 #np.argmin([abs(agent_pos[0]-i[0])+abs(agent_pos[1]-i[1]) for i in candidates_p])
            cmp_pos = candidates_p[idx]
        else:
            # the first remaining candidate is used; the closest-candidate
            # selection is commented out
            idx = 0 #np.argmin([abs(agent_pos[0]-i[0])+abs(agent_pos[1]-i[1]) for i in candidates_goal])
            cmp_pos = candidates_goal[idx]
        # From here the look flags are true only when the taxi stands on a
        # candidate cell, not merely next to one.
        passenger_look = passenger_look and agent_pos in candidates_p
        destination_look = destination_look and agent_pos in candidates_goal
        real_look = passenger_look if not pickup else destination_look
        relative_pos = (cmp(agent_pos[0],cmp_pos[0]),cmp(agent_pos[1],cmp_pos[1]))
        return (relative_pos,pickup, real_look, (obstacle_north,obstacle_south,obstacle_east,obstacle_west),last_action)
    """
    if action == 0 :  # Move Down
        next_row += 1
    elif action == 1:  # Move Up
        next_row -= 1
    elif action == 2:  # Move Right
        next_col += 1
    elif action == 3:  # Move Left
        next_col -= 1
    """
    station_size = 4
    total_reward = 0
    total_reward_shaped = 0
    # Per-batch counters, in the order printed below:
    # pickups, drops, successes, blocked moves.
    cnt = [0,0,0,0]
    epsilon = epsilon_start
    # Per-batch sums of grid size and obstacle count.
    averaged = [0,0]
    batch_size = 100
    for epoch in tqdm(range(episodes+batch_size)):
        if epoch >=episodes:
            # Final batch: act greedily; the Q update is skipped as well.
            epsilon = 0
        grid_size = 5 #np.random.randint(5,11)
        obstacle_size = np.random.randint(grid_size*grid_size)
        obs,_ = env.reset(grid_size,obstacle_size)
        averaged[0]+=env.grid_size
        averaged[1]+=len(env.obstacles)
        # ifpickup / p_loc are ground truth from the environment; ifpickup is
        # used only by the consistency assert at the end of each step.
        obs,ifpickup,p_loc = obs
        done = False
        state = get_state_obs(obs,action=None)
        steps=0
        action_l=[]
        success = False
        has_pickup=False

        while not done:
            """if state[-1]!=(0,0,0,0):
              print(state[-1])"""
            # Explore with probability epsilon, and always on a state that has
            # no Q-table entry yet.
            if np.random.choice(2,p=[epsilon,1-epsilon])==0 or state not in q_table.keys():
                action = np.random.randint(6)
            else:
                action = np.argmax(q_table[state])
            lst_pickup = pickup
            
            relative_pos,pickup, _ , _,last_action = state
            obs,reward,done,_ = env.step(action)
            obs,ifpickup,p_loc = obs
            # Only movement actions replace the remembered last action.
            if action in [0,1,2,3]:
                next_state = get_state_obs(obs,action,action)
            else:
                next_state = get_state_obs(obs,action,last_action)
            total_reward += reward

            ### reward shaping
            reward_shaping = 0
            """if relative_pos == (0,0):
                # want it to go to possible goal
                #reward_shaping += 0.5"""
            # Penalise pick-up/drop-off attempted away from the tracked target
            # (relative_pos here still belongs to the state before the step).
            if relative_pos != (0,0) and action in [pickup_id,drop_id]:
                reward_shaping -= 10
            if done and reward>0:
                cnt[2]+=1
            
            relative_pos,pickup, _ , _,_ = next_state
            # With SimpleTaxiEnv a blocked move yields -10.1, i.e. below -10.
            if not done and reward <-10 and action in [0,1,2,3]:
                #print('hit wall')
                cnt[-1]+=1
            """if not pickup:
                reward_shaping -= p_len
            reward_shaping -= 0.1 * goal_len"""
            reward += reward_shaping
            total_reward_shaped += reward
            if epsilon:
                # One-step Q-learning update on the shaped reward.
                q_table[state][action] = q_table[state][action] + alpha*(reward+gamma*np.max(q_table[next_state])-q_table[state][action])
            state = next_state
            if lst_pickup==False and pickup:
                cnt[0]+=1
            elif lst_pickup==True and pickup==False:
                cnt[1]+=1
            # The inferred pickup flag must match the environment's flag.
            if pickup!=ifpickup:
                print(pickup,ifpickup)
            assert(pickup==ifpickup)


        if (epoch+1)%batch_size==0:
            # Report per-episode averages for the batch, then reset the sums.
            cnt = [i/batch_size for i in cnt]
            print(f'Epsilon : {epsilon}, average reward : {total_reward/batch_size:.4f}, averaged shaped reward : {total_reward_shaped/batch_size:.4f} Pickup, Drop, Success, Hit wall rate : {cnt}')
            print(f'averaged grid size : {averaged[0]/batch_size:.2f}, averaged obstacles : {averaged[1]/batch_size:.2f}')
            averaged = [0,0]
            cnt = [0,0,0,0]
            total_reward = 0
            total_reward_shaped = 0
        epsilon *= decay_rate #max(epsilon*decay_rate ,epsilon_end)
    return q_table

In [6]:
# Build the environment with its default arguments and train a Q-table on it
# using the default hyper-parameters of tabular_q_learning.
env = SimpleTaxiEnv()
q_table = tabular_q_learning(env)

  2%|▏         | 108/7100 [00:03<02:49, 41.14it/s]

Epsilon : 0.9516933769307994, average reward : -10138.5050, averaged shaped reward : -15255.8050 Pickup, Drop, Success, Hit wall rate : [23.87, 23.87, 0.88, 490.13]
averaged grid size : 5.00, averaged obstacles : 10.14


  3%|▎         | 208/7100 [00:04<01:28, 78.28it/s]

Epsilon : 0.9052674235521029, average reward : -3898.0140, averaged shaped reward : -5883.5140 Pickup, Drop, Success, Hit wall rate : [18.69, 18.69, 1.0, 193.53]
averaged grid size : 5.00, averaged obstacles : 10.34


  4%|▍         | 319/7100 [00:06<01:17, 87.76it/s]

Epsilon : 0.8611062428400729, average reward : -2441.6760, averaged shaped reward : -3639.7760 Pickup, Drop, Success, Hit wall rate : [13.27, 13.27, 1.0, 129.25]
averaged grid size : 5.00, averaged obstacles : 10.38


  6%|▌         | 420/7100 [00:06<01:00, 110.98it/s]

Epsilon : 0.8190993535905904, average reward : -1305.1120, averaged shaped reward : -1982.3120 Pickup, Drop, Success, Hit wall rate : [10.9, 10.9, 1.0, 69.0]
averaged grid size : 5.00, averaged obstacles : 8.83


  7%|▋         | 513/7100 [00:07<00:55, 119.13it/s]

Epsilon : 0.7791416641455342, average reward : -1075.6380, averaged shaped reward : -1611.9380 Pickup, Drop, Success, Hit wall rate : [9.88, 9.88, 1.0, 60.31]
averaged grid size : 5.00, averaged obstacles : 11.04


  9%|▉         | 626/7100 [00:08<00:41, 155.97it/s]

Epsilon : 0.7411332094774175, average reward : -868.8410, averaged shaped reward : -1293.3410 Pickup, Drop, Success, Hit wall rate : [6.19, 6.19, 1.0, 49.75]
averaged grid size : 5.00, averaged obstacles : 10.53


 10%|█         | 726/7100 [00:09<00:42, 150.16it/s]

Epsilon : 0.7049789010996835, average reward : -674.3980, averaged shaped reward : -1018.6980 Pickup, Drop, Success, Hit wall rate : [8.96, 8.96, 1.0, 40.52]
averaged grid size : 5.00, averaged obstacles : 10.56


 12%|█▏        | 823/7100 [00:09<00:39, 158.01it/s]

Epsilon : 0.6705882891769959, average reward : -656.9150, averaged shaped reward : -1016.8150 Pickup, Drop, Success, Hit wall rate : [6.42, 6.42, 1.0, 36.55]
averaged grid size : 5.00, averaged obstacles : 9.63


 13%|█▎        | 905/7100 [00:10<00:39, 155.69it/s]

Epsilon : 0.6378753362403742, average reward : -550.4900, averaged shaped reward : -841.0900 Pickup, Drop, Success, Hit wall rate : [8.85, 8.85, 0.99, 33.83]
averaged grid size : 5.00, averaged obstacles : 10.05


 14%|█▍        | 1026/7100 [00:11<00:41, 144.99it/s]

Epsilon : 0.6067582019410674, average reward : -872.5250, averaged shaped reward : -1299.0250 Pickup, Drop, Success, Hit wall rate : [10.81, 10.8, 0.99, 52.47]
averaged grid size : 5.00, averaged obstacles : 10.08


 16%|█▌        | 1125/7100 [00:11<00:34, 175.73it/s]

Epsilon : 0.5771590383046616, average reward : -420.3860, averaged shaped reward : -651.8860 Pickup, Drop, Success, Hit wall rate : [7.33, 7.33, 1.0, 27.3]
averaged grid size : 5.00, averaged obstacles : 10.32


 17%|█▋        | 1227/7100 [00:12<00:35, 166.92it/s]

Epsilon : 0.5490037949732016, average reward : -203.8030, averaged shaped reward : -330.0030 Pickup, Drop, Success, Hit wall rate : [3.49, 3.49, 1.0, 13.84]
averaged grid size : 5.00, averaged obstacles : 10.67


 19%|█▊        | 1318/7100 [00:13<00:40, 142.88it/s]

Epsilon : 0.5222220339480774, average reward : -503.3540, averaged shaped reward : -787.1540 Pickup, Drop, Success, Hit wall rate : [6.05, 6.05, 1.0, 28.53]
averaged grid size : 5.00, averaged obstacles : 10.09


 20%|██        | 1428/7100 [00:13<00:34, 165.74it/s]

Epsilon : 0.49674675337021873, average reward : -502.4700, averaged shaped reward : -774.2700 Pickup, Drop, Success, Hit wall rate : [6.1, 6.1, 1.0, 30.14]
averaged grid size : 5.00, averaged obstacles : 10.58


 22%|██▏       | 1538/7100 [00:14<00:34, 162.75it/s]

Epsilon : 0.47251421989671744, average reward : -393.4510, averaged shaped reward : -640.0510 Pickup, Drop, Success, Hit wall rate : [7.5, 7.49, 0.99, 23.23]
averaged grid size : 5.00, averaged obstacles : 10.16


 23%|██▎       | 1634/7100 [00:15<00:30, 180.12it/s]

Epsilon : 0.44946380925453877, average reward : -372.3120, averaged shaped reward : -610.5120 Pickup, Drop, Success, Hit wall rate : [8.03, 8.03, 1.0, 22.87]
averaged grid size : 5.00, averaged obstacles : 9.34


 24%|██▍       | 1714/7100 [00:15<00:31, 170.08it/s]

Epsilon : 0.4275378545724137, average reward : -193.4950, averaged shaped reward : -323.2950 Pickup, Drop, Success, Hit wall rate : [4.81, 4.81, 1.0, 13.74]
averaged grid size : 5.00, averaged obstacles : 9.98


 26%|██▌       | 1823/7100 [00:16<00:33, 158.89it/s]

Epsilon : 0.4066815021114777, average reward : -357.4400, averaged shaped reward : -610.2400 Pickup, Drop, Success, Hit wall rate : [10.1, 10.1, 1.0, 21.57]
averaged grid size : 5.00, averaged obstacles : 10.96


 27%|██▋       | 1900/7100 [00:16<00:24, 214.29it/s]

Epsilon : 0.38684257403372235, average reward : -103.8060, averaged shaped reward : -189.6060 Pickup, Drop, Success, Hit wall rate : [3.26, 3.26, 1.0, 7.78]
averaged grid size : 5.00, averaged obstacles : 9.34


 28%|██▊       | 2016/7100 [00:17<00:27, 186.20it/s]

Epsilon : 0.3679714378649446, average reward : -342.5070, averaged shaped reward : -556.3070 Pickup, Drop, Success, Hit wall rate : [6.05, 6.05, 1.0, 20.2]
averaged grid size : 5.00, averaged obstacles : 10.03


 30%|██▉       | 2126/7100 [00:17<00:25, 191.99it/s]

Epsilon : 0.35002088232561296, average reward : -132.5310, averaged shaped reward : -231.2310 Pickup, Drop, Success, Hit wall rate : [4.11, 4.11, 1.0, 10.03]
averaged grid size : 5.00, averaged obstacles : 9.29


 32%|███▏      | 2238/7100 [00:18<00:22, 214.30it/s]

Epsilon : 0.33294599921901236, average reward : -115.3840, averaged shaped reward : -199.9840 Pickup, Drop, Success, Hit wall rate : [2.9, 2.9, 1.0, 8.55]
averaged grid size : 5.00, averaged obstacles : 9.89


 33%|███▎      | 2332/7100 [00:18<00:21, 217.10it/s]

Epsilon : 0.3167040710811749, average reward : -65.3430, averaged shaped reward : -127.5430 Pickup, Drop, Success, Hit wall rate : [2.83, 2.83, 1.0, 6.31]
averaged grid size : 5.00, averaged obstacles : 10.04


 34%|███▍      | 2437/7100 [00:19<00:27, 168.77it/s]

Epsilon : 0.3012544643115281, average reward : -75.3160, averaged shaped reward : -138.0160 Pickup, Drop, Success, Hit wall rate : [2.56, 2.56, 1.0, 6.88]
averaged grid size : 5.00, averaged obstacles : 10.58


 36%|███▌      | 2522/7100 [00:20<00:27, 166.63it/s]

Epsilon : 0.2865585275168893, average reward : -160.1560, averaged shaped reward : -276.2560 Pickup, Drop, Success, Hit wall rate : [1.89, 1.89, 1.0, 8.99]
averaged grid size : 5.00, averaged obstacles : 10.31


 37%|███▋      | 2629/7100 [00:20<00:22, 200.33it/s]

Epsilon : 0.2725794948144954, average reward : -138.4870, averaged shaped reward : -242.6870 Pickup, Drop, Success, Hit wall rate : [4.49, 4.49, 1.0, 10.26]
averaged grid size : 5.00, averaged obstacles : 10.30


 38%|███▊      | 2721/7100 [00:21<00:21, 208.33it/s]

Epsilon : 0.2592823938521472, average reward : -30.2960, averaged shaped reward : -72.3960 Pickup, Drop, Success, Hit wall rate : [2.27, 2.27, 1.0, 4.59]
averaged grid size : 5.00, averaged obstacles : 10.59


 40%|████      | 2842/7100 [00:21<00:25, 168.99it/s]

Epsilon : 0.2466339583153596, average reward : -193.2070, averaged shaped reward : -322.7070 Pickup, Drop, Success, Hit wall rate : [3.18, 3.18, 1.0, 11.73]
averaged grid size : 5.00, averaged obstacles : 11.38


 41%|████▏     | 2929/7100 [00:22<00:22, 185.02it/s]

Epsilon : 0.23460254470262715, average reward : -67.7830, averaged shaped reward : -125.7830 Pickup, Drop, Success, Hit wall rate : [1.86, 1.86, 1.0, 5.85]
averaged grid size : 5.00, averaged obstacles : 9.72


 43%|████▎     | 3025/7100 [00:22<00:19, 204.83it/s]

Epsilon : 0.22315805316059978, average reward : -63.6480, averaged shaped reward : -130.0480 Pickup, Drop, Success, Hit wall rate : [3.15, 3.15, 1.0, 6.03]
averaged grid size : 5.00, averaged obstacles : 9.73


 44%|████▍     | 3128/7100 [00:23<00:22, 178.97it/s]

Epsilon : 0.21227185218111316, average reward : -34.3440, averaged shaped reward : -76.6440 Pickup, Drop, Success, Hit wall rate : [1.85, 1.85, 1.0, 4.15]
averaged grid size : 5.00, averaged obstacles : 10.92


 45%|████▌     | 3215/7100 [00:23<00:20, 187.12it/s]

Epsilon : 0.20191670697168435, average reward : -46.4590, averaged shaped reward : -96.2590 Pickup, Drop, Success, Hit wall rate : [1.74, 1.74, 1.0, 4.64]
averaged grid size : 5.00, averaged obstacles : 9.88


 47%|████▋     | 3345/7100 [00:24<00:17, 209.19it/s]

Epsilon : 0.19206671132027073, average reward : -47.1000, averaged shaped reward : -99.2000 Pickup, Drop, Success, Hit wall rate : [2.14, 2.14, 1.0, 4.83]
averaged grid size : 5.00, averaged obstacles : 10.45


 49%|████▊     | 3445/7100 [00:25<00:22, 165.55it/s]

Epsilon : 0.1826972227838353, average reward : -133.2730, averaged shaped reward : -236.8730 Pickup, Drop, Success, Hit wall rate : [4.01, 4.01, 1.0, 9.07]
averaged grid size : 5.00, averaged obstacles : 10.29


 50%|████▉     | 3541/7100 [00:25<00:18, 194.41it/s]

Epsilon : 0.1737848010385734, average reward : -8.0640, averaged shaped reward : -38.0640 Pickup, Drop, Success, Hit wall rate : [1.78, 1.78, 1.0, 2.79]
averaged grid size : 5.00, averaged obstacles : 10.11


 51%|█████     | 3629/7100 [00:25<00:17, 199.58it/s]

Epsilon : 0.1653071492375671, average reward : -13.1620, averaged shaped reward : -46.9620 Pickup, Drop, Success, Hit wall rate : [1.86, 1.86, 1.0, 3.0]
averaged grid size : 5.00, averaged obstacles : 10.77


 53%|█████▎    | 3739/7100 [00:26<00:17, 194.78it/s]

Epsilon : 0.15724305822915946, average reward : -45.8020, averaged shaped reward : -91.1020 Pickup, Drop, Success, Hit wall rate : [2.11, 2.11, 1.0, 5.06]
averaged grid size : 5.00, averaged obstacles : 9.90


 54%|█████▍    | 3855/7100 [00:27<00:14, 225.57it/s]

Epsilon : 0.14957235349649245, average reward : -23.9890, averaged shaped reward : -63.7890 Pickup, Drop, Success, Hit wall rate : [1.46, 1.46, 1.0, 3.28]
averaged grid size : 5.00, averaged obstacles : 9.80


 56%|█████▌    | 3954/7100 [00:27<00:13, 235.79it/s]

Epsilon : 0.14227584468546683, average reward : -7.2250, averaged shaped reward : -36.5250 Pickup, Drop, Success, Hit wall rate : [1.5, 1.5, 1.0, 2.75]
averaged grid size : 5.00, averaged obstacles : 10.14


 57%|█████▋    | 4023/7100 [00:27<00:14, 209.72it/s]

Epsilon : 0.13533527759485164, average reward : -9.1420, averaged shaped reward : -37.8420 Pickup, Drop, Success, Hit wall rate : [1.52, 1.52, 1.0, 2.8]
averaged grid size : 5.00, averaged obstacles : 9.15


 58%|█████▊    | 4128/7100 [00:28<00:19, 155.95it/s]

Epsilon : 0.12873328850843568, average reward : -8.6520, averaged shaped reward : -38.4520 Pickup, Drop, Success, Hit wall rate : [1.19, 1.19, 1.0, 2.49]
averaged grid size : 5.00, averaged obstacles : 10.37


 59%|█████▉    | 4214/7100 [00:28<00:14, 194.40it/s]

Epsilon : 0.12245336075496815, average reward : 10.4670, averaged shaped reward : -6.3330 Pickup, Drop, Success, Hit wall rate : [1.36, 1.36, 1.0, 2.07]
averaged grid size : 5.00, averaged obstacles : 10.55


 61%|██████    | 4320/7100 [00:29<00:15, 178.00it/s]

Epsilon : 0.11647978338721439, average reward : -6.2290, averaged shaped reward : -36.2290 Pickup, Drop, Success, Hit wall rate : [2.15, 2.15, 1.0, 2.92]
averaged grid size : 5.00, averaged obstacles : 9.43


 62%|██████▏   | 4428/7100 [00:30<00:13, 199.14it/s]

Epsilon : 0.11079761187674808, average reward : -7.2810, averaged shaped reward : -34.0810 Pickup, Drop, Success, Hit wall rate : [1.68, 1.68, 1.0, 2.87]
averaged grid size : 5.00, averaged obstacles : 9.63


 64%|██████▍   | 4543/7100 [00:30<00:12, 200.09it/s]

Epsilon : 0.10539263072614893, average reward : 7.6120, averaged shaped reward : -10.5880 Pickup, Drop, Success, Hit wall rate : [1.27, 1.27, 1.0, 2.23]
averaged grid size : 5.00, averaged obstacles : 10.73


 65%|██████▌   | 4636/7100 [00:31<00:11, 215.86it/s]

Epsilon : 0.10025131790506973, average reward : 13.3180, averaged shaped reward : -4.9820 Pickup, Drop, Success, Hit wall rate : [1.38, 1.38, 1.0, 1.8]
averaged grid size : 5.00, averaged obstacles : 9.85


 67%|██████▋   | 4746/7100 [00:31<00:12, 191.98it/s]

Epsilon : 0.09536081102119956, average reward : 29.6010, averaged shaped reward : 20.9010 Pickup, Drop, Success, Hit wall rate : [1.3, 1.3, 1.0, 1.07]
averaged grid size : 5.00, averaged obstacles : 10.55


 68%|██████▊   | 4843/7100 [00:32<00:10, 225.43it/s]

Epsilon : 0.09070887514149145, average reward : 11.8470, averaged shaped reward : -4.6530 Pickup, Drop, Success, Hit wall rate : [1.29, 1.29, 1.0, 1.84]
averaged grid size : 5.00, averaged obstacles : 10.21


 70%|██████▉   | 4941/7100 [00:32<00:09, 216.33it/s]

Epsilon : 0.08628387218314981, average reward : -3.1000, averaged shaped reward : -28.9000 Pickup, Drop, Success, Hit wall rate : [1.26, 1.26, 1.0, 2.17]
averaged grid size : 5.00, averaged obstacles : 9.39


 71%|███████   | 5031/7100 [00:33<00:11, 186.94it/s]

Epsilon : 0.082074731797801, average reward : 12.3760, averaged shaped reward : -3.0240 Pickup, Drop, Success, Hit wall rate : [1.14, 1.14, 1.0, 1.73]
averaged grid size : 5.00, averaged obstacles : 10.41


 72%|███████▏  | 5122/7100 [00:33<00:09, 202.45it/s]

Epsilon : 0.07807092367600618, average reward : -0.0720, averaged shaped reward : -22.2720 Pickup, Drop, Success, Hit wall rate : [1.1, 1.1, 1.0, 2.02]
averaged grid size : 5.00, averaged obstacles : 10.12


 73%|███████▎  | 5214/7100 [00:34<00:09, 192.08it/s]

Epsilon : 0.07426243120282834, average reward : 22.4850, averaged shaped reward : 11.0850 Pickup, Drop, Success, Hit wall rate : [1.15, 1.15, 1.0, 1.21]
averaged grid size : 5.00, averaged obstacles : 10.48


 75%|███████▌  | 5342/7100 [00:34<00:09, 194.27it/s]

Epsilon : 0.07063972639854567, average reward : 1.5140, averaged shaped reward : -20.9860 Pickup, Drop, Success, Hit wall rate : [1.77, 1.76, 0.99, 2.5]
averaged grid size : 5.00, averaged obstacles : 10.28


 77%|███████▋  | 5435/7100 [00:35<00:08, 195.19it/s]

Epsilon : 0.06719374608181877, average reward : 16.4800, averaged shaped reward : 1.0800 Pickup, Drop, Success, Hit wall rate : [1.13, 1.13, 1.0, 1.23]
averaged grid size : 5.00, averaged obstacles : 10.50


 78%|███████▊  | 5549/7100 [00:35<00:08, 190.37it/s]

Epsilon : 0.06391586919567814, average reward : 19.4640, averaged shaped reward : 6.3640 Pickup, Drop, Success, Hit wall rate : [1.2, 1.2, 1.0, 1.31]
averaged grid size : 5.00, averaged obstacles : 10.59


 79%|███████▉  | 5628/7100 [00:36<00:07, 198.69it/s]

Epsilon : 0.060797895239605056, average reward : 7.2680, averaged shaped reward : -11.6320 Pickup, Drop, Success, Hit wall rate : [1.58, 1.58, 1.0, 1.96]
averaged grid size : 5.00, averaged obstacles : 9.13


 81%|████████  | 5739/7100 [00:36<00:06, 212.71it/s]

Epsilon : 0.0578320237537493, average reward : 29.5400, averaged shaped reward : 18.4400 Pickup, Drop, Success, Hit wall rate : [1.24, 1.24, 1.0, 0.83]
averaged grid size : 5.00, averaged obstacles : 10.28


 82%|████████▏ | 5838/7100 [00:37<00:05, 213.98it/s]

Epsilon : 0.05501083480395743, average reward : 36.0990, averaged shaped reward : 28.9990 Pickup, Drop, Success, Hit wall rate : [1.22, 1.22, 1.0, 0.55]
averaged grid size : 5.00, averaged obstacles : 10.22


 84%|████████▎ | 5930/7100 [00:37<00:05, 215.04it/s]

Epsilon : 0.05232727041878946, average reward : 36.2540, averaged shaped reward : 30.3540 Pickup, Drop, Success, Hit wall rate : [1.05, 1.05, 1.0, 0.47]
averaged grid size : 5.00, averaged obstacles : 10.59


 85%|████████▌ | 6041/7100 [00:38<00:05, 202.59it/s]

Epsilon : 0.049774616932083626, average reward : 23.8200, averaged shaped reward : 12.7200 Pickup, Drop, Success, Hit wall rate : [1.15, 1.15, 1.0, 1.03]
averaged grid size : 5.00, averaged obstacles : 10.93


 86%|████████▌ | 6116/7100 [00:38<00:04, 197.19it/s]

Epsilon : 0.04734648818689483, average reward : 39.9390, averaged shaped reward : 35.1390 Pickup, Drop, Success, Hit wall rate : [1.06, 1.06, 1.0, 0.37]
averaged grid size : 5.00, averaged obstacles : 10.01


 88%|████████▊ | 6220/7100 [00:39<00:06, 141.66it/s]

Epsilon : 0.04503680955878596, average reward : 15.6350, averaged shaped reward : 0.9350 Pickup, Drop, Success, Hit wall rate : [1.5, 1.5, 1.0, 1.42]
averaged grid size : 5.00, averaged obstacles : 10.71


 89%|████████▉ | 6350/7100 [00:39<00:03, 210.95it/s]

Epsilon : 0.04283980275850272, average reward : 7.4760, averaged shaped reward : -11.0240 Pickup, Drop, Success, Hit wall rate : [1.58, 1.58, 0.99, 1.59]
averaged grid size : 5.00, averaged obstacles : 10.54


 91%|█████████ | 6436/7100 [00:40<00:03, 184.71it/s]

Epsilon : 0.040749971376011676, average reward : 30.8360, averaged shaped reward : 24.2360 Pickup, Drop, Success, Hit wall rate : [1.11, 1.11, 1.0, 0.81]
averaged grid size : 5.00, averaged obstacles : 10.26


 92%|█████████▏| 6538/7100 [00:40<00:03, 172.61it/s]

Epsilon : 0.03876208712973563, average reward : 19.4140, averaged shaped reward : 7.6140 Pickup, Drop, Success, Hit wall rate : [1.36, 1.36, 1.0, 1.22]
averaged grid size : 5.00, averaged obstacles : 10.01


 93%|█████████▎| 6630/7100 [00:41<00:02, 191.61it/s]

Epsilon : 0.03687117678658528, average reward : 28.3040, averaged shaped reward : 19.3040 Pickup, Drop, Success, Hit wall rate : [1.3, 1.3, 1.0, 0.89]
averaged grid size : 5.00, averaged obstacles : 10.21


 95%|█████████▍| 6722/7100 [00:41<00:02, 188.62it/s]

Epsilon : 0.03507250972006415, average reward : 16.9700, averaged shaped reward : 5.2700 Pickup, Drop, Success, Hit wall rate : [1.17, 1.17, 1.0, 1.27]
averaged grid size : 5.00, averaged obstacles : 9.59


 96%|█████████▌| 6829/7100 [00:42<00:01, 164.56it/s]

Epsilon : 0.03336158607531967, average reward : 25.8380, averaged shaped reward : 16.9380 Pickup, Drop, Success, Hit wall rate : [1.27, 1.27, 1.0, 0.82]
averaged grid size : 5.00, averaged obstacles : 10.56


 98%|█████████▊| 6923/7100 [00:43<00:01, 156.18it/s]

Epsilon : 0.031734125511532624, average reward : 14.9790, averaged shaped reward : 3.7790 Pickup, Drop, Success, Hit wall rate : [1.03, 1.03, 0.99, 1.25]
averaged grid size : 5.00, averaged obstacles : 11.94


 99%|█████████▉| 7022/7100 [00:43<00:00, 166.25it/s]

Epsilon : 0.030186056493480308, average reward : 19.9040, averaged shaped reward : 11.0040 Pickup, Drop, Success, Hit wall rate : [1.08, 1.08, 1.0, 1.26]
averaged grid size : 5.00, averaged obstacles : 10.28


100%|██████████| 7100/7100 [00:44<00:00, 160.56it/s]

Epsilon : 0, average reward : 20.3540, averaged shaped reward : 20.3540 Pickup, Drop, Success, Hit wall rate : [0.96, 0.95, 0.95, 0.02]
averaged grid size : 5.00, averaged obstacles : 10.50





In [7]:
import pickle

# Persist the learned Q-table. The table is first copied into a plain dict and
# every value is converted with .tolist() so the pickle holds only built-in
# Python containers.
q_table_dict = dict(q_table)
q_table_dict_list = {
    state_key: action_values.tolist()
    for state_key, action_values in q_table_dict.items()
}
with open('q_table.pkl', 'wb') as f:
    pickle.dump(q_table_dict_list, f)

In [8]:
# Number of distinct states stored in the Q-table.
print(len(q_table_dict))

912


# Testing

In [9]:
from tqdm import tqdm
def Testing(env, episodes=100):
    """Evaluate the greedy policy stored in ``q_table.pkl`` on ``env``.

    For each episode the Q-table is reloaded from disk, the environment is
    reset, and actions are chosen greedily from the table until the episode
    reports ``done``. The per-episode total reward is printed, followed by
    the average over all episodes.

    Args:
        env: Environment whose ``reset()`` returns ``(obs, info)`` and whose
            ``step(action)`` returns ``(obs, reward, done, info)``, where
            ``obs`` is a 3-tuple whose first item is the 16-value
            observation. ``env.grid_size`` and ``env.obstacles`` are read
            for logging only.
        episodes: Number of evaluation episodes (default 100).

    Side effects:
        Reads ``q_table.pkl``, prints progress, and overwrites the
        module-level names ``stations``, ``candidates_p``,
        ``candidates_goal``, ``pickup``, ``goal_id``, ``last_action`` and
        ``last_record_action``.
    """
    import pickle
    from collections import defaultdict

    # NOTE(security): pickle.load can execute arbitrary code. Only load a
    # q_table.pkl that was produced by this notebook.
    with open('q_table.pkl', 'rb') as f:
        print('load')
        loaded_dict = pickle.load(f)
    # States missing from the file default to an all-zero action-value row.
    q_table = defaultdict(lambda: np.zeros(6), loaded_dict)
    print(len(q_table))

    global stations, candidates_p, candidates_goal, pickup, goal_id, last_action, last_record_action
    stations = [[0, 0] for _ in range(4)]
    candidates_p = list(stations)
    candidates_goal = list(stations)
    goal_id = -1
    pickup = False
    last_action = None
    last_record_action = None
    # Action indices: 0-3 move North/South/East/West, 4 pick up, 5 drop off.
    action_size = 6
    pickup_id = 4
    drop_id = 5

    def cmp(a, b):
        """Return 0 if ``a == b``, 1 if ``a < b``, otherwise -1."""
        if a == b:
            return 0
        return 1 if a < b else -1

    def get_state_obs(obs, action, last_action=None):
        """Turn a raw observation into the hashable Q-table key.

        ``action`` is the action taken on the previous step (``None`` on the
        first step of an episode, which resets the tracked beliefs).
        ``last_action`` is stored verbatim as the last element of the key.

        The function keeps, in module-level globals, the station coordinates
        that may still hold the passenger / the destination, and narrows
        them using the ``passenger_look`` / ``destination_look`` flags
        (true means the target is within Manhattan distance 1 of the taxi).

        Returns:
            ``(relative_pos, pickup, real_look, obstacle_flags, last_action)``
        """
        global stations, pickup, candidates_p, candidates_goal
        # The four station coordinates are unpacked straight into `stations`.
        taxi_row, taxi_col, stations[0][0], stations[0][1] , stations[1][0], stations[1][1],stations[2][0],stations[2][1],stations[3][0],stations[3][1],obstacle_north, obstacle_south, obstacle_east, obstacle_west, passenger_look, destination_look = obs
        agent_pos = (taxi_row, taxi_col)

        def is_adjacent(pos):
            return abs(pos[0] - agent_pos[0]) + abs(pos[1] - agent_pos[1]) <= 1

        if action is None:
            # First step of an episode: every station is still a candidate.
            candidates_goal = [tuple(i) for i in stations]
            candidates_p = [tuple(i) for i in stations]
            pickup = False

        # Keep only the candidates consistent with what the taxi currently sees.
        if passenger_look:
            candidates_p = [tuple(x) for x in candidates_p if is_adjacent(x)]
        else:
            candidates_p = [tuple(x) for x in candidates_p if not is_adjacent(x)]
        if destination_look:
            candidates_goal = [tuple(x) for x in candidates_goal if is_adjacent(x)]
        else:
            candidates_goal = [tuple(x) for x in candidates_goal if not is_adjacent(x)]

        # Update the carried-passenger belief from the previous action.
        if action == pickup_id and not pickup and agent_pos in candidates_p:
            pickup = True
            candidates_p = []
        elif action == drop_id and pickup:
            pickup = False
            candidates_p.append(agent_pos)

        # Head for the nearest remaining candidate: the destination when
        # carrying the passenger, the passenger otherwise.
        targets = candidates_goal if pickup else candidates_p
        cmp_pos = (0, 0)
        if targets:
            # Guarded because np.argmin raises ValueError on an empty list;
            # with no candidates left the (0, 0) default is used instead.
            idx = np.argmin([abs(agent_pos[0]-i[0])+abs(agent_pos[1]-i[1]) for i in targets])
            cmp_pos = targets[idx]

        # A look flag only counts when the taxi stands on a candidate cell.
        passenger_look = passenger_look and agent_pos in candidates_p
        destination_look = destination_look and agent_pos in candidates_goal
        real_look = passenger_look if not pickup else destination_look
        relative_pos = (cmp(agent_pos[0], cmp_pos[0]), cmp(agent_pos[1], cmp_pos[1]))
        return (relative_pos, pickup, real_look, (obstacle_north, obstacle_south, obstacle_east, obstacle_west), last_action)

    def get_action(obs):
        """Return ``(action, state)`` for ``obs``.

        The action is the arg-max of the Q-table row for the derived state.
        A state that is not in the table is printed and answered with a
        uniformly random action so evaluation never crashes on it.
        """
        global last_action, last_record_action
        state = get_state_obs(obs, last_action, last_record_action)
        if state in q_table:
            action = np.argmax(q_table[state])
        else:
            # Unseen state: report it and fall back to a random action.
            print(state)
            action = np.random.randint(action_size)
        last_action = action
        if action in [0, 1, 2, 3]:
            # Only movement actions are remembered as the recorded action.
            last_record_action = action
        return action, state

    Total_reward = 0
    for _ in tqdm(range(episodes)):
        # Reload a fresh table so the in-episode adjustments below do not
        # carry over to the next episode.
        with open('q_table.pkl', 'rb') as f:
            loaded_dict = pickle.load(f)
        q_table = defaultdict(lambda: np.zeros(6), loaded_dict)
        # NOTE(review): these two draws are not passed to env.reset(); they
        # are kept so the NumPy random stream is consumed as before.
        grid_size = np.random.randint(5, 11)
        obstacle_size = np.random.randint(grid_size*grid_size)
        obs, _ = env.reset()
        obs, _, _ = obs
        total_reward = 0
        done = False
        # Reset the per-episode belief-tracking globals.
        stations = [[0, 0] for _ in range(4)]
        candidates_p = list(stations)
        candidates_goal = list(stations)
        goal_id = -1
        pickup = False
        last_action = None
        last_record_action = None
        while not done:
            action, state = get_action(obs)
            obs, reward, done, _ = env.step(action)
            obs, _, _ = obs
            total_reward += reward
            # Online adjustment using the max of the SAME state's row (not
            # the next state's). Presumably meant to make a repeated
            # state/action pair less attractive; TODO confirm intent.
            q_table[state][action] = q_table[state][action] + (reward+np.max(q_table[state])-q_table[state][action])
        Total_reward += total_reward
        print(f'grid_size : {env.grid_size}, obstacle_size : {len(env.obstacles)}, total_reward : {total_reward}')

    average = Total_reward / episodes if episodes else 0.0
    print(f'average : {average}')

In [10]:
# Build the environment with its default settings and run the evaluation.
env = SimpleTaxiEnv()
Testing(env=env)

load
912


  0%|          | 0/100 [00:00<?, ?it/s]

grid_size : 5, obstacle_size : 11, total_reward : 38.800000000000004
grid_size : 6, obstacle_size : 22, total_reward : 4.799999999999628
grid_size : 6, obstacle_size : 11, total_reward : -106.3999999999954
grid_size : 9, obstacle_size : 54, total_reward : 47.099999999999994


  5%|▌         | 5/100 [00:00<00:05, 17.03it/s]

grid_size : 10, obstacle_size : 73, total_reward : 46.699999999999996
grid_size : 6, obstacle_size : 8, total_reward : 48.4
grid_size : 8, obstacle_size : 0, total_reward : 48.6
grid_size : 8, obstacle_size : 48, total_reward : 47.9


 13%|█▎        | 13/100 [00:00<00:04, 19.97it/s]

grid_size : 10, obstacle_size : 69, total_reward : -72.79999999999731
grid_size : 9, obstacle_size : 12, total_reward : -36.59999999999936
grid_size : 10, obstacle_size : 33, total_reward : 46.0
grid_size : 5, obstacle_size : 16, total_reward : 48.9
grid_size : 9, obstacle_size : 61, total_reward : 46.599999999999994
grid_size : 5, obstacle_size : 17, total_reward : 48.6


 19%|█▉        | 19/100 [00:01<00:04, 18.89it/s]

grid_size : 10, obstacle_size : 70, total_reward : 46.9
grid_size : 5, obstacle_size : 11, total_reward : 49.0
grid_size : 8, obstacle_size : 40, total_reward : -580.0000000000473
grid_size : 8, obstacle_size : 18, total_reward : 47.199999999999996
grid_size : 9, obstacle_size : 51, total_reward : 47.0
grid_size : 9, obstacle_size : 8, total_reward : 47.9


 22%|██▏       | 22/100 [00:01<00:03, 19.70it/s]

grid_size : 9, obstacle_size : 59, total_reward : 47.099999999999994
grid_size : 9, obstacle_size : 44, total_reward : -43.699999999999214
grid_size : 6, obstacle_size : 11, total_reward : 49.3
grid_size : 5, obstacle_size : 12, total_reward : 3.899999999999615
grid_size : 9, obstacle_size : 0, total_reward : 47.5
grid_size : 8, obstacle_size : 4, total_reward : 47.599999999999994
grid_size : 9, obstacle_size : 3, total_reward : 47.699999999999996
grid_size : 8, obstacle_size : 22, total_reward : -226.69999999999553


 32%|███▏      | 32/100 [00:01<00:02, 23.98it/s]

grid_size : 9, obstacle_size : 51, total_reward : 46.099999999999994
grid_size : 6, obstacle_size : 27, total_reward : 49.1
grid_size : 9, obstacle_size : 39, total_reward : -443.8000000000324
grid_size : 8, obstacle_size : 12, total_reward : -670.0000000000551
grid_size : 5, obstacle_size : 14, total_reward : 48.199999999999996
grid_size : 6, obstacle_size : 26, total_reward : 47.699999999999996
grid_size : 5, obstacle_size : 8, total_reward : 48.0
grid_size : 10, obstacle_size : 43, total_reward : -17.10000000000047


 43%|████▎     | 43/100 [00:01<00:01, 34.81it/s]

grid_size : 8, obstacle_size : 44, total_reward : 46.3
grid_size : 5, obstacle_size : 8, total_reward : 48.8
grid_size : 9, obstacle_size : 2, total_reward : 49.0
grid_size : 8, obstacle_size : 50, total_reward : 46.699999999999996
grid_size : 5, obstacle_size : 3, total_reward : 48.6
grid_size : 9, obstacle_size : 28, total_reward : 35.100000000000016
grid_size : 9, obstacle_size : 36, total_reward : 47.8
grid_size : 8, obstacle_size : 1, total_reward : 49.1
grid_size : 9, obstacle_size : 20, total_reward : -650.000000000048


 53%|█████▎    | 53/100 [00:02<00:01, 35.67it/s]

grid_size : 10, obstacle_size : 72, total_reward : 46.099999999999994
grid_size : 5, obstacle_size : 10, total_reward : 47.5
grid_size : 10, obstacle_size : 22, total_reward : 48.0
grid_size : 8, obstacle_size : 31, total_reward : 43.50000000000001
grid_size : 7, obstacle_size : 37, total_reward : 48.0
grid_size : 8, obstacle_size : 35, total_reward : 45.1
grid_size : 5, obstacle_size : 0, total_reward : 49.4
grid_size : 9, obstacle_size : 36, total_reward : 48.699999999999996
grid_size : 5, obstacle_size : 17, total_reward : 48.9


 61%|██████    | 61/100 [00:02<00:01, 31.60it/s]

grid_size : 9, obstacle_size : 35, total_reward : -600.0000000000495
grid_size : 5, obstacle_size : 5, total_reward : 48.199999999999996
grid_size : 5, obstacle_size : 5, total_reward : 48.5
grid_size : 10, obstacle_size : 61, total_reward : -520.0000000000451
grid_size : 8, obstacle_size : 5, total_reward : 47.699999999999996
grid_size : 5, obstacle_size : 17, total_reward : 49.199999999999996
grid_size : 5, obstacle_size : 12, total_reward : 48.4
grid_size : 6, obstacle_size : 20, total_reward : 49.0
grid_size : 5, obstacle_size : 16, total_reward : 48.0


 69%|██████▉   | 69/100 [00:02<00:01, 30.35it/s]

grid_size : 10, obstacle_size : 32, total_reward : -740.0000000000508
grid_size : 7, obstacle_size : 33, total_reward : 47.9
grid_size : 7, obstacle_size : 32, total_reward : 47.4
grid_size : 9, obstacle_size : 30, total_reward : -600.0000000000451
grid_size : 7, obstacle_size : 39, total_reward : 48.8
grid_size : 6, obstacle_size : 25, total_reward : 48.1


 80%|████████  | 80/100 [00:02<00:00, 36.39it/s]

grid_size : 9, obstacle_size : 50, total_reward : 48.5
grid_size : 6, obstacle_size : 17, total_reward : 47.5
grid_size : 10, obstacle_size : 8, total_reward : 47.9
grid_size : 5, obstacle_size : 16, total_reward : 48.699999999999996
grid_size : 10, obstacle_size : 57, total_reward : -213.39999999999193
grid_size : 6, obstacle_size : 23, total_reward : 48.699999999999996
grid_size : 5, obstacle_size : 12, total_reward : 47.0
grid_size : 7, obstacle_size : 27, total_reward : 47.5
grid_size : 5, obstacle_size : 15, total_reward : 47.699999999999996
grid_size : 8, obstacle_size : 46, total_reward : 48.8
grid_size : 7, obstacle_size : 10, total_reward : 48.9
grid_size : 5, obstacle_size : 13, total_reward : 48.1


 84%|████████▍ | 84/100 [00:03<00:00, 22.43it/s]

grid_size : 10, obstacle_size : 65, total_reward : -422.40000000003437
grid_size : 9, obstacle_size : 62, total_reward : 46.699999999999996
grid_size : 7, obstacle_size : 37, total_reward : 48.1
grid_size : 7, obstacle_size : 6, total_reward : 48.4
grid_size : 9, obstacle_size : 49, total_reward : 46.3
grid_size : 8, obstacle_size : 49, total_reward : 48.8


 92%|█████████▏| 92/100 [00:03<00:00, 24.08it/s]

grid_size : 10, obstacle_size : 83, total_reward : 48.1
grid_size : 5, obstacle_size : 13, total_reward : 48.5
grid_size : 8, obstacle_size : 5, total_reward : -39.499999999999766
grid_size : 6, obstacle_size : 22, total_reward : 49.0
grid_size : 9, obstacle_size : 61, total_reward : 47.9
grid_size : 9, obstacle_size : 20, total_reward : -690.0000000000451


 95%|█████████▌| 95/100 [00:03<00:00, 16.61it/s]

grid_size : 10, obstacle_size : 66, total_reward : 44.9
grid_size : 10, obstacle_size : 73, total_reward : 47.099999999999994


 98%|█████████▊| 98/100 [00:04<00:00, 13.35it/s]

grid_size : 10, obstacle_size : 81, total_reward : 46.599999999999994
grid_size : 7, obstacle_size : 21, total_reward : -89.59999999999636
grid_size : 10, obstacle_size : 69, total_reward : 47.599999999999994


100%|██████████| 100/100 [00:04<00:00, 23.33it/s]

grid_size : 5, obstacle_size : 0, total_reward : 48.5
grid_size : 8, obstacle_size : 16, total_reward : 48.1
average : -29.97700000000428



