# Linear Probe Experiments
A first attempt at generating data in a Procgen maze environment and using that data to train a linear probe that detects a "cheese signal" in the neural network layers of a pre-trained RL agent.

## Initialize Gym Environment

In [51]:
import torch
import numpy as np
from procgen import ProcgenGym3Env
from procgen_tools import maze
from procgen_tools.models import load_policy
from procgen_tools import maze as maze_api

In [52]:
seed = 4242

# Single-environment Procgen maze, pinned to the level selected by `seed`
# (num_levels=1 with start_level=seed), so repeated runs see the same maze.
maze_test_env = ProcgenGym3Env(
    num=1,
    env_name='maze',
    start_level=seed,
    num_levels=1,
    distribution_mode='hard',
    render_mode="rgb_array",
    num_threads=4,
)
maze_test_env

<CEnv lib_path=/home/gearspark/Projects/ai-safety-camp-2024-model-agents/venv/lib/python3.10/site-packages/procgen/data/prebuilt/libenv.so options={'center_agent': True, 'use_generated_assets': False, 'use_monochrome_assets': False, 'restrict_themes': False, 'use_backgrounds': True, 'paint_vel_info': False, 'distribution_mode': 1, 'env_name': 'maze', 'num_levels': 1, 'start_level': 4242, 'num_actions': 15, 'use_sequential_levels': False, 'debug_mode': 0, 'rand_seed': 1383589431, 'num_threads': 4, 'render_human': True, 'resource_root': '/home/gearspark/Projects/ai-safety-camp-2024-model-agents/venv/lib/python3.10/site-packages/procgen/data/assets/'}>

## Load Agent

In [53]:
# Load the pre-trained agent on the CPU; action_size=15 matches the
# environment's `num_actions` option shown in the env repr above.
policy = load_policy(
    'model_rand.pth',
    action_size=15,
    device=torch.device('cpu'),
)

## Run Episodes

In [58]:
def run_episode(model, maze_environment, argmax=True, max_time_steps=256):
    """Roll out ``model`` in ``maze_environment`` until the episode ends.

    Args:
        model: Policy called as ``model(obs_tensor)``. It must return a pair
            whose first element exposes ``.probs`` and ``.sample()``; the
            second element is ignored.
        maze_environment: Procgen environment exposing
            ``options["env_name"]``. It is wrapped with
            ``maze_api.wrap_venv`` before being stepped.
        argmax: If True, pick the highest-probability action at every step;
            otherwise sample an action from the policy output.
        max_time_steps: Upper bound on the number of environment steps.

    Returns:
        A tuple ``(observations, rewards, dones, info_list)`` of equal-length
        lists with one entry per step taken. Entries are exactly what
        ``venv.step`` returned, so the observation produced by ``reset()`` is
        not part of ``observations``.

    Raises:
        AssertionError: If ``maze_environment`` is not a maze environment.
    """
    # Validate the environment that was actually passed in rather than a
    # notebook-level global, so the function works with any env and kernel state.
    assert maze_environment.options["env_name"] == "maze", (
        "Only maze is supported because of the use of the maze wrapper"
    )
    venv = maze_api.wrap_venv(maze_environment)

    observations = []
    rewards = []
    dones = []
    info_list = []

    # eval() only switches layers such as dropout / batch norm to inference
    # behaviour; it does not disable autograd, hence the no_grad() block below.
    model.eval()
    obs = venv.reset()

    with torch.no_grad():
        for _step in range(max_time_steps):
            out, _ = model(torch.FloatTensor(obs))
            if argmax:
                act = out.probs.argmax(dim=-1).numpy()
            else:
                act = out.sample().numpy()

            obs, rew, done, info = venv.step(act)
            observations.append(obs)
            rewards.append(rew)
            dones.append(done)
            info_list.append(info)

            # `done` is an array with one flag per environment (a single flag
            # when the env was created with num=1); stop once it is set.
            if np.all(done):
                break

    return observations, rewards, dones, info_list



In [59]:
# Smoke-test the rollout helper with one greedy (argmax) episode on the seeded maze
obs, rews, ds, infs = run_episode(model=policy, maze_environment=maze_test_env)

In [63]:
# Summarise the rollout. Rewards and done flags are per-environment arrays,
# which is why they print with brackets.
print(
    f"n obs: {len(obs)}, n rewards: {len(rews)}, "
    f"total reward: {sum(rews)}, level complete: {ds[-1]}, "
    f"seed: {seed}"
)

n obs: 7, n rewards: 7, total reward: [10.], level complete: [ True], seed: 4242
