# Generate experiment csv

In [1]:
import json
from copy import deepcopy
import pandas as pd

In [None]:
# CONFIG_PATH = "experiment_values.json"

# with open(CONFIG_PATH) as f:
#     config = json.load(f)

# defaults = config["defaults"]
# grids = config["grids"]
# experiments = config["experiments"]

# for agent, sweep in experiments.items():
# 	default_params = defaults[agent]

# 	for grid in grids:
# 		for param_name, values in sweep.items():
# 			for val in values:
# 				params = deepcopy(default_params)
# 				params[param_name] = val

# 				run_config = {
# 					"agent": agent,
# 					"grid": grid,
# 					"param_changed": param_name,
# 					"param_value": val,
# 					"params": params
# 				}

# 				cli_args = []
# 				for k, v in run_config["params"].items():
# 					cli_args += [f"--{k}", str(v)]

In [None]:
# cli_args

['--iter', '5000']

In [4]:
CONFIG_PATH = "experiment_values.json"

with open(CONFIG_PATH) as f:
    config = json.load(f)

defaults = config["defaults"]
grids = config["grids"]
experiments = config["experiments"]

all_param_names = set()
for agent_params in defaults.values():
    all_param_names.update(agent_params.keys())

rows = []

for agent, sweep in experiments.items():
    default_params = defaults[agent]

    for grid in grids:
        for param_name, values in sweep.items():
            for val in values:
                params = deepcopy(default_params)
                params[param_name] = val

                row = {
                    "agent": agent,
                    "grid": grid,
                    "param_changed": param_name,
                    "param_value": val
                }

                for pname in sorted(all_param_names):
                    row[pname] = params.get(pname, float('nan'))

                rows.append(row)

df = pd.DataFrame(rows)

df.to_csv("experiment_table.csv", index=False)

In [None]:
df = df.loc[df["agent"] != "RandomAgent"]
df.to_csv("experimental_table.csv", index = False)

# Training - Copy + Minor update to train.py code. Train.py wasn't touched

In [6]:
import json
import numpy as np
import pandas as pd
from copy import deepcopy
from argparse import Namespace
from tqdm import trange
from pathlib import Path
import importlib
import inspect
from inspect import Parameter

from world.reward_functions import custom_reward_function
from world import Environment
from agents import BaseAgent

pygame 2.6.1 (SDL 2.28.4, Python 3.12.8)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [7]:
def load_agent(agent_name: str, env: Environment, config: dict) -> tuple[BaseAgent, str]:
    agent_info = config[agent_name]
    module = importlib.import_module(agent_info["module"])
    AgentClass = getattr(module, agent_info["class"])
    init_args = agent_info.get("init_args", {})

    sig = inspect.signature(AgentClass.__init__)
    if 'env' in sig.parameters:
        agent = AgentClass(env=env, **init_args)
    else:
        agent = AgentClass(**init_args)

    return agent, agent_info["train_mode"]

def update_agent(agent: BaseAgent, args: Namespace,
                 state: tuple[int, int],
                 next_state: tuple[int, int],
                 reward: float,
                 actual_action: int) -> None:
    update_params = inspect.signature(agent.update).parameters
    update_param_names = list(update_params)

    if {"state", "next_state"}.issubset(update_param_names):
        agent.update(state=state, next_state=next_state, reward=reward, action=actual_action)
    elif {"next_state", "reward", "action"}.issubset(update_param_names):
        agent.update(next_state=next_state, reward=reward, action=actual_action)
    elif {"state", "reward", "action"}.issubset(update_param_names):
        agent.update(state=state, reward=reward, action=actual_action)
    elif all(p.kind in {Parameter.VAR_POSITIONAL, Parameter.VAR_KEYWORD} for p in update_params.values()):
        agent.update()
    else:
        raise ValueError(f"Unsupported update() signature: {update_param_names}")

def train_agent(args: Namespace, config: dict):
	start_pos = tuple(args.agent_start_pos)
	for grid_path in args.GRID:
		env = Environment(
			Path(grid_path), args.no_gui, sigma=args.sigma, agent_start_pos=start_pos,
			reward_fn=custom_reward_function, target_fps=args.fps, random_seed=args.random_seed
		)
		env.reset()
		agent, mode = load_agent(args.agent, env, config)

		if mode == "episodic":
			delta = 1e-6
			for ep in trange(args.episodes, desc=f"Training {args.agent}"):
				prev_q = {s: np.copy(q) for s, q in agent.q_table.items()}
				state = env.reset()
				for _ in range(args.iter):
					action = agent.take_action(state)
					next_state, reward, terminated, info = env.step(action)
					if terminated: break
					agent.update(state, next_state, reward, info["actual_action"])
					state = next_state
				common = set(agent.q_table.keys()) & set(prev_q.keys())
				max_diff = max(
					np.max(np.abs(agent.q_table[s] - prev_q[s]))
					for s in common
				) if common else 10
				if max_diff < delta:
					break
			agent.eval_mode()

		elif mode == "iterative":
			state = env.reset()
			for _ in trange(args.iter, desc=f"Training {args.agent}"):
				action = agent.take_action(state)
				next_state, reward, terminated, info = env.step(action)
				update_agent(agent, args, state, next_state, reward, info["actual_action"])
				state = next_state
				if terminated: break

		# Environment.evaluate_agent(
		#     grid_path, agent, args.iter, args.sigma, agent_start_pos=start_pos,
		#     reward_fn=custom_reward_function, random_seed=args.random_seed
		# )
		Environment.evaluate_agent(
			Path(grid_path), agent, args.iter, args.sigma, agent_start_pos=start_pos,
			reward_fn=custom_reward_function, random_seed=args.random_seed
		)


### Can just regen df instead of reading

In [9]:
df = pd.read_csv("/Users/aniket/TU_Eindhoven/2_Study/Q4_S2AMC15_Data_Intelligence_Challenge/4_Code/2AMC15_Intelligence_Challenge/experimental_table.csv")  # or assign df directly
with open("agent_config.json") as f:
    base_config = json.load(f)

TRAIN_ARGS = {"episodes", "iter"}

In [10]:
for i, row in df.iterrows():
    agent = row["agent"]
    grid = row["grid"]

    print(f"Row {i+1}: {agent} on {grid} | {row['param_changed']} = {row['param_value']}")

    init_args = {}
    train_args = {}

    for col, val in row.items():
        if pd.isna(val): continue
        if col in {"agent", "grid", "param_changed", "param_value"}:
            continue
        if col in TRAIN_ARGS:
            train_args[col] = int(val)
        else:
            init_args[col] = float(val) if isinstance(val, float) else val

    config = deepcopy(base_config)
    config[agent]["init_args"] = init_args
    args = Namespace(
        GRID=[f"grid_configs/{grid}.npy"],
        agent=agent,
        no_gui=True,
        sigma=0.1,
        fps=5,
        episodes=train_args.get("episodes", 2000),
        iter=train_args.get("iter", 2000),
        random_seed=42,
        agent_start_pos=[1, 1],
    )

    train_agent(args, config)


Row 1: QLearningAgent on A1_grid | gamma = 0.6


Training QLearningAgent:  24%|██▎       | 470/2000 [00:00<00:00, 1620.14it/s]
Evaluating agent:   1%|          | 15/2000 [00:00<00:00, 122164.19it/s]


Evaluation complete. Results:
cumulative_reward: 35
total_steps: 16
total_agent_moves: 16
total_failed_moves: 0
total_targets_reached: 1
targets_remaining: 0
Row 2: QLearningAgent on A1_grid | gamma = 0.95


Training QLearningAgent:  29%|██▉       | 578/2000 [00:00<00:00, 1952.64it/s]
Evaluating agent:   1%|          | 15/2000 [00:00<00:00, 159681.62it/s]


Evaluation complete. Results:
cumulative_reward: 35
total_steps: 16
total_agent_moves: 16
total_failed_moves: 0
total_targets_reached: 1
targets_remaining: 0
Row 3: QLearningAgent on A1_grid | alpha = 0.3


Training QLearningAgent:  19%|█▉        | 376/2000 [00:00<00:00, 1971.08it/s]
Evaluating agent:   1%|          | 15/2000 [00:00<00:00, 111156.47it/s]


Evaluation complete. Results:
cumulative_reward: 35
total_steps: 16
total_agent_moves: 16
total_failed_moves: 0
total_targets_reached: 1
targets_remaining: 0
Row 4: QLearningAgent on A1_grid | alpha = 0.5


Training QLearningAgent:  11%|█         | 213/2000 [00:00<00:00, 1799.22it/s]
Evaluating agent:   1%|          | 16/2000 [00:00<00:00, 137518.16it/s]


Evaluation complete. Results:
cumulative_reward: 28
total_steps: 17
total_agent_moves: 16
total_failed_moves: 1
total_targets_reached: 1
targets_remaining: 0
Row 5: QLearningAgent on A1_grid | epsilon = 0.5


Training QLearningAgent:  28%|██▊       | 564/2000 [00:00<00:00, 1876.20it/s]
Evaluating agent:   1%|          | 15/2000 [00:00<00:00, 102134.03it/s]


Evaluation complete. Results:
cumulative_reward: 35
total_steps: 16
total_agent_moves: 16
total_failed_moves: 0
total_targets_reached: 1
targets_remaining: 0
Row 6: QLearningAgent on A1_grid | epsilon_decay = 0.8


Training QLearningAgent:  29%|██▊       | 573/2000 [00:00<00:00, 1943.71it/s]
Evaluating agent:   1%|          | 15/2000 [00:00<00:00, 148734.18it/s]


Evaluation complete. Results:
cumulative_reward: 35
total_steps: 16
total_agent_moves: 16
total_failed_moves: 0
total_targets_reached: 1
targets_remaining: 0
Row 7: QLearningAgent on A1_grid | episodes = 5000.0


Training QLearningAgent:  11%|█▏        | 567/5000 [00:00<00:02, 1938.97it/s]
Evaluating agent:   1%|          | 16/2000 [00:00<00:00, 161319.38it/s]


Evaluation complete. Results:
cumulative_reward: 28
total_steps: 17
total_agent_moves: 16
total_failed_moves: 1
total_targets_reached: 1
targets_remaining: 0
Row 8: QLearningAgent on A1_grid | iter = 5000.0


Training QLearningAgent:  29%|██▉       | 578/2000 [00:00<00:00, 1952.24it/s]
Evaluating agent:   0%|          | 15/5000 [00:00<00:00, 156115.53it/s]


Evaluation complete. Results:
cumulative_reward: 35
total_steps: 16
total_agent_moves: 16
total_failed_moves: 0
total_targets_reached: 1
targets_remaining: 0
Row 9: QLearningAgent on Maze | gamma = 0.6


Training QLearningAgent:  13%|█▎        | 258/2000 [00:00<00:01, 956.43it/s]
Evaluating agent:   2%|▏         | 36/2000 [00:00<00:00, 149796.57it/s]


Evaluation complete. Results:
cumulative_reward: 8
total_steps: 37
total_agent_moves: 36
total_failed_moves: 1
total_targets_reached: 1
targets_remaining: 0
Row 10: QLearningAgent on Maze | gamma = 0.95


Training QLearningAgent:  23%|██▎       | 467/2000 [00:00<00:01, 1308.58it/s]
Evaluating agent:   2%|▏         | 36/2000 [00:00<00:00, 229475.60it/s]


Evaluation complete. Results:
cumulative_reward: 8
total_steps: 37
total_agent_moves: 36
total_failed_moves: 1
total_targets_reached: 1
targets_remaining: 0
Row 11: QLearningAgent on Maze | alpha = 0.3


Training QLearningAgent:  15%|█▍        | 298/2000 [00:00<00:01, 1270.92it/s]
Evaluating agent:   2%|▏         | 36/2000 [00:00<00:00, 154549.58it/s]


Evaluation complete. Results:
cumulative_reward: 8
total_steps: 37
total_agent_moves: 36
total_failed_moves: 1
total_targets_reached: 1
targets_remaining: 0
Row 12: QLearningAgent on Maze | alpha = 0.5


Training QLearningAgent:   8%|▊         | 168/2000 [00:00<00:01, 1228.38it/s]
Evaluating agent:   2%|▏         | 36/2000 [00:00<00:00, 235194.62it/s]


Evaluation complete. Results:
cumulative_reward: 8
total_steps: 37
total_agent_moves: 36
total_failed_moves: 1
total_targets_reached: 1
targets_remaining: 0
Row 13: QLearningAgent on Maze | epsilon = 0.5


Training QLearningAgent:  21%|██        | 415/2000 [00:00<00:01, 1141.25it/s]
Evaluating agent:   2%|▏         | 36/2000 [00:00<00:00, 182802.60it/s]


Evaluation complete. Results:
cumulative_reward: 8
total_steps: 37
total_agent_moves: 36
total_failed_moves: 1
total_targets_reached: 1
targets_remaining: 0
Row 14: QLearningAgent on Maze | epsilon_decay = 0.8


Training QLearningAgent:  21%|██        | 416/2000 [00:00<00:01, 1187.66it/s]
Evaluating agent:   2%|▏         | 36/2000 [00:00<00:00, 238538.62it/s]


Evaluation complete. Results:
cumulative_reward: 8
total_steps: 37
total_agent_moves: 36
total_failed_moves: 1
total_targets_reached: 1
targets_remaining: 0
Row 15: QLearningAgent on Maze | episodes = 5000.0


Training QLearningAgent:   8%|▊         | 423/5000 [00:00<00:03, 1266.34it/s]
Evaluating agent:   2%|▏         | 36/2000 [00:00<00:00, 211774.12it/s]


Evaluation complete. Results:
cumulative_reward: 8
total_steps: 37
total_agent_moves: 36
total_failed_moves: 1
total_targets_reached: 1
targets_remaining: 0
Row 16: QLearningAgent on Maze | iter = 5000.0


Training QLearningAgent:  21%|██        | 423/2000 [00:00<00:01, 1268.97it/s]
Evaluating agent:   1%|          | 36/5000 [00:00<00:00, 209134.27it/s]


Evaluation complete. Results:
cumulative_reward: 8
total_steps: 37
total_agent_moves: 36
total_failed_moves: 1
total_targets_reached: 1
targets_remaining: 0
Row 17: QLearningAgent on test_grid | gamma = 0.6


Training QLearningAgent:   4%|▎         | 71/2000 [00:00<00:00, 3982.02it/s]
Evaluating agent:   0%|          | 3/2000 [00:00<00:00, 43539.49it/s]


Evaluation complete. Results:
cumulative_reward: 41
total_steps: 4
total_agent_moves: 3
total_failed_moves: 1
total_targets_reached: 1
targets_remaining: 0
Row 18: QLearningAgent on test_grid | gamma = 0.95


Training QLearningAgent:   4%|▎         | 73/2000 [00:00<00:00, 3841.76it/s]
Evaluating agent:   0%|          | 3/2000 [00:00<00:00, 48960.75it/s]


Evaluation complete. Results:
cumulative_reward: 41
total_steps: 4
total_agent_moves: 3
total_failed_moves: 1
total_targets_reached: 1
targets_remaining: 0
Row 19: QLearningAgent on test_grid | alpha = 0.3


Training QLearningAgent:   2%|▏         | 48/2000 [00:00<00:00, 3942.48it/s]
Evaluating agent:   0%|          | 3/2000 [00:00<00:00, 42799.02it/s]


Evaluation complete. Results:
cumulative_reward: 41
total_steps: 4
total_agent_moves: 3
total_failed_moves: 1
total_targets_reached: 1
targets_remaining: 0
Row 20: QLearningAgent on test_grid | alpha = 0.5


Training QLearningAgent:   1%|▏         | 26/2000 [00:00<00:00, 3512.03it/s]
Evaluating agent:   0%|          | 3/2000 [00:00<00:00, 55676.60it/s]


Evaluation complete. Results:
cumulative_reward: 41
total_steps: 4
total_agent_moves: 3
total_failed_moves: 1
total_targets_reached: 1
targets_remaining: 0
Row 21: QLearningAgent on test_grid | epsilon = 0.5


Training QLearningAgent:   4%|▍         | 75/2000 [00:00<00:00, 4006.43it/s]
Evaluating agent:   0%|          | 3/2000 [00:00<00:00, 57456.22it/s]


Evaluation complete. Results:
cumulative_reward: 41
total_steps: 4
total_agent_moves: 3
total_failed_moves: 1
total_targets_reached: 1
targets_remaining: 0
Row 22: QLearningAgent on test_grid | epsilon_decay = 0.8


Training QLearningAgent:   4%|▎         | 74/2000 [00:00<00:00, 4005.19it/s]
Evaluating agent:   0%|          | 3/2000 [00:00<00:00, 42083.32it/s]


Evaluation complete. Results:
cumulative_reward: 41
total_steps: 4
total_agent_moves: 3
total_failed_moves: 1
total_targets_reached: 1
targets_remaining: 0
Row 23: QLearningAgent on test_grid | episodes = 5000.0


Training QLearningAgent:   1%|▏         | 73/5000 [00:00<00:01, 3723.92it/s]
Evaluating agent:   0%|          | 3/2000 [00:00<00:00, 47482.69it/s]


Evaluation complete. Results:
cumulative_reward: 41
total_steps: 4
total_agent_moves: 3
total_failed_moves: 1
total_targets_reached: 1
targets_remaining: 0
Row 24: QLearningAgent on test_grid | iter = 5000.0


Training QLearningAgent:   4%|▎         | 73/2000 [00:00<00:00, 3760.18it/s]
Evaluating agent:   0%|          | 3/5000 [00:00<00:00, 47662.55it/s]


Evaluation complete. Results:
cumulative_reward: 41
total_steps: 4
total_agent_moves: 3
total_failed_moves: 1
total_targets_reached: 1
targets_remaining: 0
Row 25: QLearningAgent on large_grid | gamma = 0.6


Training QLearningAgent:   8%|▊         | 153/2000 [00:00<00:01, 1152.28it/s]
Evaluating agent:   1%|          | 20/2000 [00:00<00:00, 199728.76it/s]


Evaluation complete. Results:
cumulative_reward: 30
total_steps: 21
total_agent_moves: 21
total_failed_moves: 0
total_targets_reached: 1
targets_remaining: 0
Row 26: QLearningAgent on large_grid | gamma = 0.95


Training QLearningAgent:  11%|█         | 218/2000 [00:00<00:01, 1360.30it/s]
Evaluating agent:   1%|          | 20/2000 [00:00<00:00, 202135.13it/s]


Evaluation complete. Results:
cumulative_reward: 30
total_steps: 21
total_agent_moves: 21
total_failed_moves: 0
total_targets_reached: 1
targets_remaining: 0
Row 27: QLearningAgent on large_grid | alpha = 0.3


Training QLearningAgent:   7%|▋         | 144/2000 [00:00<00:01, 1390.21it/s]
Evaluating agent:   1%|          | 20/2000 [00:00<00:00, 156212.44it/s]


Evaluation complete. Results:
cumulative_reward: 30
total_steps: 21
total_agent_moves: 21
total_failed_moves: 0
total_targets_reached: 1
targets_remaining: 0
Row 28: QLearningAgent on large_grid | alpha = 0.5


Training QLearningAgent:   4%|▍         | 82/2000 [00:00<00:01, 1305.73it/s]
Evaluating agent:   1%|          | 20/2000 [00:00<00:00, 171897.70it/s]


Evaluation complete. Results:
cumulative_reward: 30
total_steps: 21
total_agent_moves: 21
total_failed_moves: 0
total_targets_reached: 1
targets_remaining: 0
Row 29: QLearningAgent on large_grid | epsilon = 0.5


Training QLearningAgent:  10%|█         | 204/2000 [00:00<00:01, 1179.94it/s]
Evaluating agent:   1%|          | 20/2000 [00:00<00:00, 113666.78it/s]


Evaluation complete. Results:
cumulative_reward: 30
total_steps: 21
total_agent_moves: 21
total_failed_moves: 0
total_targets_reached: 1
targets_remaining: 0
Row 30: QLearningAgent on large_grid | epsilon_decay = 0.8


Training QLearningAgent:  10%|█         | 204/2000 [00:00<00:01, 1384.28it/s]
Evaluating agent:   1%|          | 20/2000 [00:00<00:00, 192399.27it/s]


Evaluation complete. Results:
cumulative_reward: 30
total_steps: 21
total_agent_moves: 21
total_failed_moves: 0
total_targets_reached: 1
targets_remaining: 0
Row 31: QLearningAgent on large_grid | episodes = 5000.0


Training QLearningAgent:   4%|▍         | 214/5000 [00:00<00:03, 1395.44it/s]
Evaluating agent:   1%|          | 20/2000 [00:00<00:00, 178861.58it/s]


Evaluation complete. Results:
cumulative_reward: 30
total_steps: 21
total_agent_moves: 21
total_failed_moves: 0
total_targets_reached: 1
targets_remaining: 0
Row 32: QLearningAgent on large_grid | iter = 5000.0


Training QLearningAgent:  10%|█         | 209/2000 [00:00<00:01, 1377.28it/s]
Evaluating agent:   0%|          | 20/5000 [00:00<00:00, 190650.18it/s]


Evaluation complete. Results:
cumulative_reward: 30
total_steps: 21
total_agent_moves: 21
total_failed_moves: 0
total_targets_reached: 1
targets_remaining: 0
Row 33: ValueIterationAgent on A1_grid | gamma = 0.6


Training ValueIterationAgent:   1%|          | 16/2000 [00:00<00:00, 58102.91it/s]
Evaluating agent:   1%|          | 16/2000 [00:00<00:00, 250406.21it/s]


Evaluation complete. Results:
cumulative_reward: 28
total_steps: 17
total_agent_moves: 16
total_failed_moves: 1
total_targets_reached: 1
targets_remaining: 0
Row 34: ValueIterationAgent on A1_grid | gamma = 0.95


Training ValueIterationAgent:   1%|          | 15/2000 [00:00<00:00, 59297.42it/s]
Evaluating agent:   1%|          | 15/2000 [00:00<00:00, 207638.81it/s]


Evaluation complete. Results:
cumulative_reward: 35
total_steps: 16
total_agent_moves: 16
total_failed_moves: 0
total_targets_reached: 1
targets_remaining: 0
Row 35: ValueIterationAgent on A1_grid | theta = 1e-05


Training ValueIterationAgent:   1%|          | 15/2000 [00:00<00:00, 58362.30it/s]
Evaluating agent:   1%|          | 15/2000 [00:00<00:00, 219980.98it/s]


Evaluation complete. Results:
cumulative_reward: 35
total_steps: 16
total_agent_moves: 16
total_failed_moves: 0
total_targets_reached: 1
targets_remaining: 0
Row 36: ValueIterationAgent on A1_grid | iter = 5000.0


Training ValueIterationAgent:   0%|          | 15/5000 [00:00<00:00, 61984.79it/s]
Evaluating agent:   0%|          | 15/5000 [00:00<00:00, 202950.19it/s]


Evaluation complete. Results:
cumulative_reward: 35
total_steps: 16
total_agent_moves: 16
total_failed_moves: 0
total_targets_reached: 1
targets_remaining: 0
Row 37: ValueIterationAgent on Maze | gamma = 0.6


Training ValueIterationAgent: 100%|██████████| 2000/2000 [00:00<00:00, 112636.56it/s]
Evaluating agent: 100%|██████████| 2000/2000 [00:00<00:00, 1010188.82it/s]


Evaluation complete. Results:
cumulative_reward: -2168
total_steps: 2000
total_agent_moves: 1972
total_failed_moves: 28
total_targets_reached: 0
targets_remaining: 1
Row 38: ValueIterationAgent on Maze | gamma = 0.95


Training ValueIterationAgent:   2%|▏         | 36/2000 [00:00<00:00, 81486.75it/s]
Evaluating agent:   2%|▏         | 36/2000 [00:00<00:00, 375609.31it/s]


Evaluation complete. Results:
cumulative_reward: 8
total_steps: 37
total_agent_moves: 36
total_failed_moves: 1
total_targets_reached: 1
targets_remaining: 0
Row 39: ValueIterationAgent on Maze | theta = 1e-05


Training ValueIterationAgent:   2%|▏         | 36/2000 [00:00<00:00, 84307.62it/s]
Evaluating agent:   2%|▏         | 36/2000 [00:00<00:00, 404812.18it/s]


Evaluation complete. Results:
cumulative_reward: 8
total_steps: 37
total_agent_moves: 36
total_failed_moves: 1
total_targets_reached: 1
targets_remaining: 0
Row 40: ValueIterationAgent on Maze | iter = 5000.0


Training ValueIterationAgent:   1%|          | 36/5000 [00:00<00:00, 84307.62it/s]
Evaluating agent:   1%|          | 36/5000 [00:00<00:00, 405900.39it/s]


Evaluation complete. Results:
cumulative_reward: 8
total_steps: 37
total_agent_moves: 36
total_failed_moves: 1
total_targets_reached: 1
targets_remaining: 0
Row 41: ValueIterationAgent on test_grid | gamma = 0.6


Training ValueIterationAgent:   0%|          | 3/2000 [00:00<00:00, 19388.15it/s]
Evaluating agent:   0%|          | 3/2000 [00:00<00:00, 53317.42it/s]


Evaluation complete. Results:
cumulative_reward: 41
total_steps: 4
total_agent_moves: 3
total_failed_moves: 1
total_targets_reached: 1
targets_remaining: 0
Row 42: ValueIterationAgent on test_grid | gamma = 0.95


Training ValueIterationAgent:   0%|          | 3/2000 [00:00<00:00, 21620.12it/s]
Evaluating agent:   0%|          | 3/2000 [00:00<00:00, 66576.25it/s]


Evaluation complete. Results:
cumulative_reward: 41
total_steps: 4
total_agent_moves: 3
total_failed_moves: 1
total_targets_reached: 1
targets_remaining: 0
Row 43: ValueIterationAgent on test_grid | theta = 1e-05


Training ValueIterationAgent:   0%|          | 3/2000 [00:00<00:00, 23652.09it/s]
Evaluating agent:   0%|          | 3/2000 [00:00<00:00, 58798.65it/s]


Evaluation complete. Results:
cumulative_reward: 41
total_steps: 4
total_agent_moves: 3
total_failed_moves: 1
total_targets_reached: 1
targets_remaining: 0
Row 44: ValueIterationAgent on test_grid | iter = 5000.0


Training ValueIterationAgent:   0%|          | 3/5000 [00:00<00:00, 22231.29it/s]
Evaluating agent:   0%|          | 3/5000 [00:00<00:00, 54471.48it/s]


Evaluation complete. Results:
cumulative_reward: 41
total_steps: 4
total_agent_moves: 3
total_failed_moves: 1
total_targets_reached: 1
targets_remaining: 0
Row 45: ValueIterationAgent on large_grid | gamma = 0.6


Training ValueIterationAgent:   1%|          | 20/2000 [00:00<00:00, 66894.80it/s]
Evaluating agent:   1%|          | 20/2000 [00:00<00:00, 282444.71it/s]


Evaluation complete. Results:
cumulative_reward: 30
total_steps: 21
total_agent_moves: 21
total_failed_moves: 0
total_targets_reached: 1
targets_remaining: 0
Row 46: ValueIterationAgent on large_grid | gamma = 0.95


Training ValueIterationAgent:   1%|          | 20/2000 [00:00<00:00, 65128.94it/s]
Evaluating agent:   1%|          | 20/2000 [00:00<00:00, 286300.61it/s]


Evaluation complete. Results:
cumulative_reward: 30
total_steps: 21
total_agent_moves: 21
total_failed_moves: 0
total_targets_reached: 1
targets_remaining: 0
Row 47: ValueIterationAgent on large_grid | theta = 1e-05


Training ValueIterationAgent:   1%|          | 20/2000 [00:00<00:00, 68478.43it/s]
Evaluating agent:   1%|          | 20/2000 [00:00<00:00, 290263.25it/s]


Evaluation complete. Results:
cumulative_reward: 30
total_steps: 21
total_agent_moves: 21
total_failed_moves: 0
total_targets_reached: 1
targets_remaining: 0
Row 48: ValueIterationAgent on large_grid | iter = 5000.0


Training ValueIterationAgent:   0%|          | 20/5000 [00:00<00:00, 67324.30it/s]
Evaluating agent:   0%|          | 20/5000 [00:00<00:00, 274137.52it/s]

Evaluation complete. Results:
cumulative_reward: 30
total_steps: 21
total_agent_moves: 21
total_failed_moves: 0
total_targets_reached: 1
targets_remaining: 0



