# Linear Probe Experiments
A first try at generating data in a procgen maze environment, and using that to construct a linear probe to detect a "cheese signal" in the neural network layers of a pre-trained RL agent. 

## Initialize Gym Environment

In [66]:
import torch
import numpy as np
from tqdm import tqdm
from procgen import ProcgenGym3Env
from procgen_tools import maze
from procgen_tools.models import load_policy
from procgen_tools import maze as maze_api

In [107]:
seed = 4242
maze_test_env = ProcgenGym3Env(
    num=1,
    env_name='maze', num_levels=1, start_level=seed,
    distribution_mode='hard', num_threads=4, render_mode="rgb_array",
)
maze_test_env.get_info()[0].keys()

dict_keys(['prev_level_seed', 'prev_level_complete', 'level_seed', 'rgb'])

## Load Agent
Download one of the agents from the [trained model files](https://drive.google.com/drive/folders/1Ig7bzRlieyYFcdKL_PM-guSWR8WryDOL). I used `maze_I/model_rand_region_5` without knowing about the performance of this agent. Do not forget to rename the model file or change the filename below.

In [64]:
policy = load_policy('model_rand.pth', action_size=15, device=torch.device('cpu'))

## Run Episodes

In [65]:
def run_episode(model, maze_environment, argmax=True, max_time_steps=256):
    observations = []
    rewards = []
    dones = []
    info_list = []
    
    def log(o, r, d, i):
        observations.append(o)
        rewards.append(r)
        dones.append(d)
        info_list.append(i)

    model.eval()  # Switch off gradient tracking and other training time mechanisms
    obs = venv.reset()

    for step in range(max_time_steps):
        out, _ = model(torch.FloatTensor(obs))
        if argmax:
            act = out.probs.argmax(dim=-1).numpy()
        else:
            act = out.sample().numpy()
        obs, rew, done, info = venv.step(act)
        log(obs, rew, done, info)
        if done:
            break

    return observations, rewards, dones, info_list



In [59]:
# Run a test episode
obs, rews, ds, infs = run_episode(policy, maze_test_env)

In [63]:
print(f"n obs: {len(obs)}, n rewards: {len(rews)}, total reward: {sum(rews)}, level complete: {ds[-1]}, seed: {seed}")

n obs: 7, n rewards: 7, total reward: [10.], level complete: [ True], seed: 4242


## Gathering Training Data
To gather training data we need to run multiple episodes and store the activations of the model alongside the environment parameters. Procgen-tools provides some convenience functions to make this easier.

In [108]:
def generate_maze_training_data(model, n_episodes):
    max_seed = int(1e9)
    seeds = np.random.default_rng().choice(max_seed, size=n_episodes, replace=False)

    for seed in tqdm(seeds):
        environment = ProcgenGym3Env(
            num=1, env_name='maze', num_levels=1, start_level=int(seed),
            distribution_mode='hard', num_threads=4, render_mode="rgb_array"
        )
        assert environment.options["env_name"] == "maze"  # Only maze is supported because of maze wrapper
        environment = maze_api.wrap_venv(environment)

        obs, rews, ds, infs = run_episode(model, environment)        


In [109]:
generate_maze_training_data(policy, 10)

  0%|          | 0/10 [00:00<?, ?it/s]


AttributeError: 'ToBaselinesVecEnv' object has no attribute 'ob_space'