In [1]:
# `np` is used throughout this notebook; import it explicitly rather than
# relying on a wildcard import happening to expose it. It is placed before the
# wildcard imports so any `np` they re-export still takes precedence, exactly
# as before.
import numpy as np

# Wildcard imports are order-sensitive (later names shadow earlier ones), so
# the original relative order of the lines below is kept.
from policies.dp_policies import *
from environment_presets import *
from environment import RoadEnvironment
from tqdm import tqdm

In [2]:
def policy_agent(horizon, max_timestep, seed, iterations_Q, gamma):
    """Build the agent evaluated in this notebook.

    The five arguments are forwarded positionally, in the order given, to
    ``OptimalMDPAgent``; whatever that call produces is returned unchanged.
    """
    constructor_args = (horizon, max_timestep, seed, iterations_Q, gamma)
    return OptimalMDPAgent(*constructor_args)

def small_environment():
    """Return a RoadEnvironment built from the ``small_environment_dict`` preset."""
    return RoadEnvironment(**small_environment_dict)

def test_mdp_policy_agent(horizon, max_timestep, seed, iterations_Q, gamma, env_dict, n_rollouts):
    agent = policy_agent(horizon, max_timestep, seed, iterations_Q, gamma)
    env = RoadEnvironment(**env_dict)

    rewards = []
    for rollout in tqdm(range(n_rollouts)):
        obs = env.reset()
        agent.reset(obs)
        done = False
        reward_episode = 0
        while not done:
            state = np.array(env._get_states())
            actions = agent.get_action(state)
            obs, reward, done, info = env.step(actions)
            reward_episode += reward
        rewards.append(reward_episode)
    return rewards

In [3]:
# Finite-horizon agent evaluated on the small environment preset.
rewards = test_mdp_policy_agent(
    horizon="finite",
    max_timestep=50,
    seed=42,
    iterations_Q=5000,
    gamma=0.95,
    env_dict=small_environment_dict,
    n_rollouts=1000,
)
# Per-episode return summary, printed as: mean, std, min, max.
print(*(stat(rewards) for stat in (np.mean, np.std, np.min, np.max)))

  0%|          | 0/1000 [00:00<?, ?it/s]

100%|██████████| 1000/1000 [00:47<00:00, 21.18it/s]

-533.1 105.27008121968939 -1050.0 -240.0





In [4]:
# Finite-horizon agent evaluated on the smallest environment preset.
rewards = test_mdp_policy_agent(
    horizon="finite",
    max_timestep=50,
    seed=42,
    iterations_Q=5000,
    gamma=0.95,
    env_dict=smallest_environment_dict,
    n_rollouts=1000,
)
# Per-episode return summary, printed as: mean, std, min, max.
print(*(stat(rewards) for stat in (np.mean, np.std, np.min, np.max)))

100%|██████████| 1000/1000 [00:07<00:00, 132.86it/s]

-43.68 31.120051413839278 -150.0 0.0





In [5]:
# Infinite-horizon agent evaluated on the small environment preset.
rewards = test_mdp_policy_agent(
    horizon="infinite",
    max_timestep=50,
    seed=42,
    iterations_Q=5000,
    gamma=0.95,
    env_dict=small_environment_dict,
    n_rollouts=1000,
)
# Per-episode return summary, printed as: mean, std, min, max.
print(*(stat(rewards) for stat in (np.mean, np.std, np.min, np.max)))

100%|██████████| 1000/1000 [00:51<00:00, 19.38it/s]

-551.88 98.33039001244732 -900.0 -240.0





In [6]:
# Infinite-horizon agent evaluated on the smallest environment preset.
rewards = test_mdp_policy_agent(
    horizon="infinite",
    max_timestep=50,
    seed=42,
    iterations_Q=5000,
    gamma=0.95,
    env_dict=smallest_environment_dict,
    n_rollouts=1000,
)
# Per-episode return summary, printed as: mean, std, min, max.
print(*(stat(rewards) for stat in (np.mean, np.std, np.min, np.max)))

100%|██████████| 1000/1000 [00:07<00:00, 127.47it/s]

-43.68 31.120051413839278 -150.0 0.0



