In [1]:
import gymnasium as gym
import time
import pygame
from stable_baselines3 import PPO
import config
from wrappers import CliffWalkingStateWrapper

def _load_model():
    """Load the trained PPO policy, preferring the best checkpoint.

    Tries ``best_model.zip`` first and falls back to
    ``ppo_cliffwalking_final.zip``, both looked up under ``config.LOG_DIR``.
    Prints which file was loaded.

    Returns:
        The loaded model, or ``None`` when neither file exists.
    """
    import os  # local import keeps this block self-contained

    candidates = (
        ("best_model.zip", "SUCCESS: Loaded BEST model from {path}"),
        (
            "ppo_cliffwalking_final.zip",
            "WARNING: 'best_model.zip' not found. Loaded FINAL model from {path}",
        ),
    )
    for filename, message in candidates:
        # os.path.join avoids the doubled separator ("dir//file") that plain
        # string concatenation produces when LOG_DIR already ends with "/".
        model_path = os.path.join(config.LOG_DIR, filename)
        try:
            model = PPO.load(model_path)
        except FileNotFoundError:
            continue
        print(message.format(path=model_path))
        return model
    return None


def _quit_requested():
    """Drain pending pygame events; return True if the user asked to stop.

    A window-close event (``pygame.QUIT``) or pressing ESC both count as a
    stop request. ESC additionally prints a short notice.
    """
    for event in pygame.event.get():
        if event.type == pygame.QUIT:
            return True
        if event.type == pygame.KEYDOWN and event.key == pygame.K_ESCAPE:
            print("ESC pressed. Exiting evaluation.")
            return True
    return False


def watch_agent():
    """Run five rendered episodes with the saved PPO policy.

    Creates ``config.ENV_ID`` with ``render_mode="human"``, wraps it in
    ``CliffWalkingStateWrapper``, loads a saved model, and steps the
    environment using deterministic (greedy) predictions. The total reward of
    each finished episode is printed. Closing the window or pressing ESC ends
    the evaluation early.

    The environment and pygame are always shut down on exit, including when
    no model is found or an exception is raised mid-episode.
    """
    raw_env = gym.make(config.ENV_ID, render_mode="human")
    env = CliffWalkingStateWrapper(raw_env)

    pygame.init()

    try:
        model = _load_model()
        if model is None:
            print("ERROR: No models found. Did you run training?")
            return

        print("Starting Greedy Evaluation...")

        for episode in range(5):
            obs, _ = env.reset()
            done = False
            total_reward = 0

            while not done:
                if _quit_requested():
                    return

                action, _ = model.predict(obs, deterministic=True)
                # .item() converts the predicted array into a plain Python
                # scalar before it is handed to env.step.
                obs, reward, terminated, truncated, _ = env.step(action.item())
                done = terminated or truncated
                total_reward += reward

                # Short pause so the rendered steps are watchable.
                time.sleep(0.05)

            print(f"Episode {episode + 1}: Reward {total_reward}")
    finally:
        # Single cleanup point for every exit path (normal finish, user quit,
        # missing model, or an unexpected error).
        env.close()
        pygame.quit()

# Run the evaluation only when this file is executed directly, not on import.
if __name__ == "__main__":
    watch_agent()


  from pkg_resources import resource_stream, resource_exists


SUCCESS: Loaded BEST model from ./rlhf_logs//best_model.zip
Starting Greedy Evaluation...
Episode 1: Reward -13
Episode 2: Reward -13
Episode 3: Reward -13
ESC pressed. Exiting evaluation.
