# Linear Probe Experiments
A first attempt at generating data in a Procgen maze environment and using it to train a linear probe that detects a "cheese signal" in the neural network layers of a pre-trained RL agent.

## Initialize Environment 

In [16]:
import torch
import numpy as np
from tqdm import tqdm
from procgen import ProcgenGym3Env
from procgen_tools import maze
from procgen_tools.models import load_policy
from procgen_tools import maze as maze_api

The gym environment will be created through `procgen-tools`. It provides a wrapper around the original procgen environment to make it compatible with gym. 

In [19]:
# Level seed for the demo episode; also reported in the summary printed further down.
seed = 4242

# num_levels=0 leaves the set of levels unbounded and start_level is the lowest
# level seed used (see the Procgen environment options).
wrapped_venv = maze_api.create_venv(
    num=1,
    start_level=int(seed),
    num_levels=0,
    num_threads=4,
)

# Show which fields the underlying environment reports for its single instance.
wrapped_venv.env.get_info()[0].keys()

dict_keys(['prev_level_seed', 'prev_level_complete', 'level_seed', 'rgb'])

In [20]:
# The wrapper keeps the underlying procgen environment on its `.env` attribute;
# displaying it shows the options the environment was created with.
wrapped_venv.env

<CEnv lib_path=/home/gearspark/Projects/ai-safety-camp-2024-model-agents/venv/lib/python3.10/site-packages/procgen/data/prebuilt/libenv.so options={'center_agent': True, 'use_generated_assets': False, 'use_monochrome_assets': False, 'restrict_themes': False, 'use_backgrounds': True, 'paint_vel_info': False, 'distribution_mode': 1, 'env_name': 'maze', 'num_levels': 0, 'start_level': 4242, 'num_actions': 15, 'use_sequential_levels': False, 'debug_mode': 0, 'rand_seed': 1024729401, 'num_threads': 4, 'render_human': True, 'resource_root': '/home/gearspark/Projects/ai-safety-camp-2024-model-agents/venv/lib/python3.10/site-packages/procgen/data/assets/'}>

## Load Agent
Download one of the agents from the [trained model files](https://drive.google.com/drive/folders/1Ig7bzRlieyYFcdKL_PM-guSWR8WryDOL). I used `maze_I/model_rand_region_5`, chosen without knowing how well this agent performs. The code below expects the file to be named `model_rand.pth`, so either rename the downloaded file or change the filename in the next cell.

In [21]:
# Load the pre-trained agent on the CPU. action_size matches the
# 'num_actions': 15 option reported by the environment above.
policy = load_policy(
    'model_rand.pth',
    action_size=15,
    device=torch.device('cpu'),
)

## Run Episodes

In [22]:
def run_episode(model, maze_environment, argmax=True, max_time_steps=256):
    """Roll out a single episode of ``model`` in ``maze_environment``.

    Args:
        model: Policy network. It is called as ``model(obs_tensor)`` and must
            return a pair whose first element is an action distribution
            exposing ``.probs`` and ``.sample()``.
        maze_environment: Vectorised environment providing ``reset()`` and
            ``step(actions)``, where ``step`` returns
            ``(obs, reward, done, info)``.
        argmax: If True, pick the most probable action at every step;
            otherwise sample an action from the distribution.
        max_time_steps: Upper bound on the number of environment steps.

    Returns:
        A tuple ``(observations, rewards, dones, info_list)`` of lists with
        one entry per executed step. The observation returned by ``reset()``
        is fed to the model but not logged, so ``observations[t]`` is the
        observation received *after* step ``t``.
    """
    observations, rewards, dones, info_list = [], [], [], []

    # eval() only puts the model into inference mode (it affects layers such
    # as dropout or batch norm). It does not disable gradient tracking; that
    # is done by the no_grad() context below.
    model.eval()
    obs = maze_environment.reset()

    # No gradients are needed for a rollout, so avoid building an autograd
    # graph on every forward pass.
    with torch.no_grad():
        for _ in range(max_time_steps):
            out, _ = model(torch.FloatTensor(obs))
            if argmax:
                act = out.probs.argmax(dim=-1).numpy()
            else:
                act = out.sample().numpy()

            obs, rew, done, info = maze_environment.step(act)
            observations.append(obs)
            rewards.append(rew)
            dones.append(done)
            info_list.append(info)

            # `done` holds one flag per environment instance. np.all() is
            # identical to the plain truth test for a single instance and does
            # not raise for batches, where the rollout stops once every
            # instance reports done in the same step.
            if np.all(done):
                break

    return observations, rewards, dones, info_list


In [24]:
# Smoke test: roll out one greedy episode in the environment created above.
obs, rews, ds, infs = run_episode(model=policy, maze_environment=wrapped_venv)

In [25]:
# Summarise the test episode. Rewards and done flags hold one entry per
# environment instance, which is why the values print in brackets.
print(f"n obs: {len(obs)}, n rewards: {len(rews)}, total reward: {sum(rews)}, level complete: {ds[-1]}, seed: {seed}")

n obs: 256, n rewards: 256, total reward: [0.], level complete: [False], seed: 4242


## Logging Model Activations
Activations can be logged with `circrl`. By default, its hook manager logs a single activation. Since we want activations from across the entire episode, we will use a custom hook manager that can be reset between episodes.

## Gathering Training Data
To gather training data we need to run multiple episodes and store the activations of the model alongside the environment parameters. Procgen-tools provides some convenience functions to make this easier.

In [28]:
def generate_maze_training_data(model, n_episodes, rng_seed=None, return_data=False):
    """Run ``model`` on ``n_episodes`` randomly chosen maze levels.

    For every level a fresh environment is created, the cheese position is
    read from the maze grid, and one episode is rolled out with
    ``run_episode``. After the loop, the cheese position and grid length of
    the last level are printed.

    Args:
        model: Policy passed on to ``run_episode``.
        n_episodes: Number of distinct level seeds to draw and roll out.
        rng_seed: Optional seed for the NumPy generator that draws the level
            seeds. ``None`` (the default) gives a different draw on each call.
        return_data: If True, collect and return one record per episode.
            The default, False, keeps the previous behaviour of returning
            ``None``, so a bare call in a notebook cell does not dump the
            collected arrays.

    Returns:
        ``None`` unless ``return_data`` is True. In that case, a list with one
        dict per episode holding the keys ``"seed"``, ``"cheese_pos"``,
        ``"grid"``, ``"observations"``, ``"rewards"``, ``"dones"`` and
        ``"infos"``.
    """
    max_seed = int(1e9)
    level_seeds = np.random.default_rng(rng_seed).choice(
        max_seed, size=n_episodes, replace=False
    )

    episodes = [] if return_data else None
    # Pre-bind so the summary below cannot hit an unbound name when
    # n_episodes == 0 and the loop body never runs.
    cheese_pos = None
    grid = None
    ran_any = False

    for level_seed in tqdm(level_seeds):
        wrapped_venv = maze_api.create_venv(
            num=1, start_level=int(level_seed), num_levels=0, num_threads=4
        )
        # procgen-tools convenience functions. Presumably `grid` is the maze
        # layout and `cheese_pos` a position within it; confirm against the
        # procgen-tools documentation.
        state = maze_api.state_from_venv(wrapped_venv)
        grid = maze_api.get_grid(state.state_vals)
        cheese_pos = maze_api.get_cheese_pos(grid)

        observations, rewards, dones, infos = run_episode(model, wrapped_venv)
        ran_any = True

        if return_data:
            episodes.append(
                {
                    "seed": int(level_seed),
                    "cheese_pos": cheese_pos,
                    "grid": grid,
                    "observations": observations,
                    "rewards": rewards,
                    "dones": dones,
                    "infos": infos,
                }
            )

    # Same summary as before: values of the last processed level only.
    if ran_any:
        print(cheese_pos)
        print(len(grid))

    return episodes

In [29]:
# Smoke-test the data generation loop with a single randomly chosen level.
generate_maze_training_data(policy, 1)

100%|██████████| 1/1 [00:02<00:00,  2.85s/it]

(10, 3)
25



