# Linear Probe Experiments
A first try at generating data in a procgen maze environment, and using that to construct a linear probe to detect a "cheese signal" in the neural network layers of a pre-trained RL agent. 

## Initialize Environment 

In [9]:
import os
import torch
import numpy as np
from tqdm import tqdm
from procgen import ProcgenGym3Env
from procgen_tools import maze
from procgen_tools.models import load_policy
from procgen_tools import maze as maze_api

In [10]:

from pathlib import Path
import io

save_dir = Path("maze_linear_probe_data")
if not save_dir.is_dir():
    os.mkdir(save_dir)

The gym environment will be created through `procgen-tools`. It provides a wrapper around the original procgen environment to make it compatible with gym. 

In [11]:
seed = 4242
wrapped_venv = maze_api.create_venv(
    num=1, start_level=int(seed), num_levels=0, num_threads = 4
)
wrapped_venv.env.get_info()[0].keys()

dict_keys(['prev_level_seed', 'prev_level_complete', 'level_seed', 'rgb'])

In [12]:
# Get the procgen env directly like so
wrapped_venv.env

<CEnv lib_path=/home/gearspark/Projects/ai-safety-camp-2024-model-agents/venv/lib/python3.10/site-packages/procgen/data/prebuilt/libenv.so options={'center_agent': True, 'use_generated_assets': False, 'use_monochrome_assets': False, 'restrict_themes': False, 'use_backgrounds': True, 'paint_vel_info': False, 'distribution_mode': 1, 'env_name': 'maze', 'num_levels': 0, 'start_level': 4242, 'num_actions': 15, 'use_sequential_levels': False, 'debug_mode': 0, 'rand_seed': 1737695600, 'num_threads': 4, 'render_human': True, 'resource_root': '/home/gearspark/Projects/ai-safety-camp-2024-model-agents/venv/lib/python3.10/site-packages/procgen/data/assets/'}>

## Load Agent
Download one of the agents from the [trained model files](https://drive.google.com/drive/folders/1Ig7bzRlieyYFcdKL_PM-guSWR8WryDOL). I used `maze_I/model_rand_region_5` without knowing about the performance of this agent. Do not forget to rename the model file or change the filename below.

In [14]:
policy = load_policy('model_rand_region_15.pth', action_size=15, device=torch.device('cpu'))

## Run Episodes

In [15]:
def dummy_step_log(model, environment, step, act, obs, rew, done, info):
    """Just a dummy log method"""

def dummy_done_log():
    """Just a dummy log method"""


def run_episode(model, maze_environment, argmax=True, max_time_steps=256, on_step=dummy_step_log, on_done=dummy_done_log):
    model.eval()  # Switch off gradient tracking and other training time mechanisms
    obs = maze_environment.reset()

    for step in range(max_time_steps):
        out, _ = model(torch.FloatTensor(obs))
        if argmax:
            act = out.probs.argmax(dim=-1).numpy()
        else:
            act = out.sample().numpy()
        obs, rew, done, info = maze_environment.step(act)
        on_step(model, maze_environment, step, act, obs, rew, done, info)
        if done:
            break
    on_done()


In [16]:
# Run a test episode
run_episode(policy, wrapped_venv)

In [17]:
# Define a custom logging function
class CustomLogger:

    def __init__(self):
        self.observations = []
        self.rewards = []

    def log(self, model, environment, step, act, obs, rew, done, info):
        self.observations.append(obs)
        self.rewards.append(float(rew[0]))

    def reset_logs(self):
        self.observations = []
        self.rewards = []

logger = CustomLogger()
run_episode(policy, wrapped_venv, on_step=logger.log)

In [18]:
obs = logger.observations
rews = logger.rewards
print(f"n obs: {len(obs)}, n rewards: {len(rews)}, total reward: {sum(rews)}")

n obs: 17, n rewards: 17, total reward: 10.0


## Logging Model Activations
Activations can be [logged with `circrl`](https://github.com/montemac/circrl/tree/main). The hook manager logs a single activation by default. Since we want to have activations spread over the entire eposide we will use a custom hook manager that can be reset between episodes.

In [19]:
def get_model_layer_names(model):
    return [name for name, module in model.named_modules()]

In [20]:
names = get_model_layer_names(policy)
print(names[:5])

['', 'embedder', 'embedder.block1', 'embedder.block1.conv', 'embedder.block1.maxpool']


Create a hook to log the activations of the corresponding layers.

In [21]:
from circrl import hooks

policy_hook = hooks.HookManager(
    model=policy,
    cache=names[1:4]
)

print("Before logging")
print(policy_hook.cache_results.keys())

# Run an environment update in the policy hook context to start logging
with policy_hook:
    obs = wrapped_venv.reset()
    output = policy(torch.FloatTensor(obs))

print("After logging")
print(output)
print(policy_hook.cache_results.keys())

Before logging
dict_keys([])
After logging
(Categorical(logits: torch.Size([1, 15])), tensor([8.1030], grad_fn=<ViewBackward0>))
dict_keys(['embedder.block1.conv', 'embedder.block1', 'embedder'])


Be careful! In the default log setting the `cache_results` gets overwritten every time the policy is run:

In [22]:
with policy_hook:
    obs = wrapped_venv.reset()
    output = policy(torch.FloatTensor(obs))

print("log size after a single execution:")
print(f"\t{policy_hook.cache_results[names[1]].size()}")

with policy_hook:    
    obs = wrapped_venv.reset()
    for _ in range(10):
        output = policy(torch.FloatTensor(obs))

print("Log size after a multiple executions:")
print(f"\t{policy_hook.cache_results[names[1]].size()}")

log size after a single execution:
	torch.Size([1, 256])
Log size after a multiple executions:
	torch.Size([1, 256])


## Gathering Training Data
To gather training data we need to run multiple episodes and store the activations of the model alongside the environment parameters. Procgen-tools provides some convenience functions to make this easier.

In [23]:
def generate_maze_training_data(model, n_episodes, on_episode_step, on_episode_done):
    max_seed = int(1e9)
    seeds = np.random.default_rng().choice(max_seed, size=n_episodes, replace=False)

    for seed in tqdm(seeds):
        wrapped_venv = maze_api.create_venv(
            num=1, start_level=int(seed), num_levels=0, num_threads = 4
        )  # Convenience functions

        run_episode(model, wrapped_venv, on_step=on_episode_step, on_done=on_episode_done)

In [24]:
generate_maze_training_data(policy, 1, dummy_step_log, dummy_done_log)

100%|██████████| 1/1 [00:00<00:00,  3.35it/s]


We need a system to log activations and environment variables persistently. It would be best to avoid having to edit code related to model execution when logging to avoid introducting bugs. We can use something like the following logging setup to log episodic data to disk at the end of every episode. It combines the previous logging function with hooks:

In [25]:
import pickle

class ProbeDataLogger:
    def __init__(self, model, save_file_path=""):
        self._init_logs()

        self.model = model
        layer_names = get_model_layer_names(model)
        self.model_hook = hooks.HookManager(
            model=model,
            cache=names
        )
        save_file_path = Path(save_file_path)
        self._file_path = save_file_path
        self._file_extension = ".pkl"
        self._current_ep = 0

    def log(self, model, environment, step, act, obs, rew, done, info):
        self.observations.append(obs)
        self.rewards.append(rew)
        self.activations.append(self.model_hook.cache_results)

        # Using these functions only works because the environment is wrapped in
        # the training data generation method
        state = maze_api.state_from_venv(wrapped_venv)
        grid = maze_api.get_grid(state.state_vals)
        self.cheese_positions.append(maze_api.get_cheese_pos(grid))

    def done(self):
        self._current_ep += 1
        self.flush()

    def flush(self):
        """Flush data in log buffer to file and reset buffer"""
        save_path = self._file_path / Path("ep_" + str(self._current_ep) + str(self._file_extension))
        print(save_path)
        self._save_logs_to_file(save_path)
        self.reset()

    def reset(self):
        self._init_logs()

    def _save_logs_to_file(self, file_path):
        write_mode = 'wb' 

        obs = torch.tensor(self.observations)
        rew = torch.tensor(self.rewards)
        cheese_positions = torch.tensor(self.cheese_positions)
        
        with open(file_path, write_mode) as file:
            pickle.dump(self.observations, file)
            pickle.dump(self.rewards, file)
            pickle.dump(self.activations, file)
            pickle.dump(self.cheese_positions, file)

    def _init_logs(self):
        self.observations = []
        self.rewards = []
        self.activations = []
        self.cheese_positions = []

In [26]:
logs = ProbeDataLogger(policy, save_file_path=save_dir)
with logs.model_hook:
    generate_maze_training_data(policy, 3, logs.log, logs.done)

  obs = torch.tensor(self.observations)
 67%|██████▋   | 2/3 [00:00<00:00,  3.19it/s]

maze_linear_probe_data/ep_1.pkl
maze_linear_probe_data/ep_2.pkl


100%|██████████| 3/3 [00:01<00:00,  2.35it/s]

maze_linear_probe_data/ep_3.pkl





Now we can load back the data. Including the loading functionality in the logging class might be convenient, but here I will simply recover it manually as an example.

In [27]:
import pickle

def load_logs(file_path):
    """
    Args:
    - file_path (str): The path to the pickle file.
    
    Returns:
    - observations (list): Environment observations.
    - rewards (list): Environment rewards.
    - activations (list): The list of activations unpickled from the file.
    - cheese_positions (list): Cheese position tensors in x, y.
    """
    with open(file_path, 'rb') as file:
        # Unpickle in the order they were saved: observations, rewards, activations
        observations = pickle.load(file)
        rewards = pickle.load(file)
        activations = pickle.load(file)
        cheese_positions = pickle.load(file)
    
    return observations, rewards, activations, cheese_positions

# Example usage
file_path = save_dir / 'ep_1.pkl'
_, _, activations, _ = load_logs(file_path)
print("Loaded activations:", len(activations))

Loaded activations: 32


## Training A Linear Probe
Training a linear probe requires choosing what layers we are probing, and with respect to what target. That in turn determines the structure of the probe.

### Generate Training Data

In this case we will probe a early convolutional layer in the network for cheese position signal as an example. First we load a model and generate some training data.

In [28]:
# Create log directory
import os

log_dir = save_dir / Path("logs")

if not os.path.isdir(log_dir):
    os.mkdir(log_dir)

In [29]:
# Load model and create logger
probe_policy = load_policy('model_rand_region_15.pth', action_size=15, device=torch.device('cpu'))
logger = ProbeDataLogger(probe_policy, save_file_path=log_dir)

In [30]:
# Generate data
with logger.model_hook:
    generate_maze_training_data(probe_policy, 30, logger.log, logger.done)

  3%|▎         | 1/30 [00:00<00:11,  2.54it/s]

maze_linear_probe_data/logs/ep_1.pkl


  7%|▋         | 2/30 [00:01<00:17,  1.59it/s]

maze_linear_probe_data/logs/ep_2.pkl


 10%|█         | 3/30 [00:02<00:26,  1.02it/s]

maze_linear_probe_data/logs/ep_3.pkl


 17%|█▋        | 5/30 [00:03<00:13,  1.80it/s]

maze_linear_probe_data/logs/ep_4.pkl
maze_linear_probe_data/logs/ep_5.pkl


 20%|██        | 6/30 [00:04<00:21,  1.12it/s]

maze_linear_probe_data/logs/ep_6.pkl


 27%|██▋       | 8/30 [00:05<00:11,  1.89it/s]

maze_linear_probe_data/logs/ep_7.pkl
maze_linear_probe_data/logs/ep_8.pkl


 33%|███▎      | 10/30 [00:05<00:07,  2.67it/s]

maze_linear_probe_data/logs/ep_9.pkl
maze_linear_probe_data/logs/ep_10.pkl


 37%|███▋      | 11/30 [00:06<00:10,  1.82it/s]

maze_linear_probe_data/logs/ep_11.pkl


 40%|████      | 12/30 [00:07<00:08,  2.01it/s]

maze_linear_probe_data/logs/ep_12.pkl


 43%|████▎     | 13/30 [00:07<00:08,  1.96it/s]

maze_linear_probe_data/logs/ep_13.pkl
maze_linear_probe_data/logs/ep_14.pkl


 50%|█████     | 15/30 [00:08<00:06,  2.35it/s]

maze_linear_probe_data/logs/ep_15.pkl


 53%|█████▎    | 16/30 [00:08<00:05,  2.55it/s]

maze_linear_probe_data/logs/ep_16.pkl


 57%|█████▋    | 17/30 [00:09<00:06,  1.98it/s]

maze_linear_probe_data/logs/ep_17.pkl


 60%|██████    | 18/30 [00:10<00:07,  1.71it/s]

maze_linear_probe_data/logs/ep_18.pkl


 63%|██████▎   | 19/30 [00:11<00:07,  1.46it/s]

maze_linear_probe_data/logs/ep_19.pkl


 67%|██████▋   | 20/30 [00:16<00:21,  2.15s/it]

maze_linear_probe_data/logs/ep_20.pkl


 70%|███████   | 21/30 [00:22<00:27,  3.04s/it]

maze_linear_probe_data/logs/ep_21.pkl


 77%|███████▋  | 23/30 [00:27<00:18,  2.70s/it]

maze_linear_probe_data/logs/ep_22.pkl
maze_linear_probe_data/logs/ep_23.pkl


 80%|████████  | 24/30 [00:28<00:12,  2.10s/it]

maze_linear_probe_data/logs/ep_24.pkl


 87%|████████▋ | 26/30 [00:28<00:04,  1.16s/it]

maze_linear_probe_data/logs/ep_25.pkl
maze_linear_probe_data/logs/ep_26.pkl


 90%|█████████ | 27/30 [00:29<00:03,  1.06s/it]

maze_linear_probe_data/logs/ep_27.pkl


 93%|█████████▎| 28/30 [00:30<00:02,  1.01s/it]

maze_linear_probe_data/logs/ep_28.pkl


 97%|█████████▋| 29/30 [00:31<00:00,  1.01it/s]

maze_linear_probe_data/logs/ep_29.pkl


100%|██████████| 30/30 [00:32<00:00,  1.08s/it]

maze_linear_probe_data/logs/ep_30.pkl





### Probe Training

We need to load the episode data, process it into a format suitable for the training process and then train [a linear probe](https://github.com/montemac/circrl/blob/33534f2f78547b38172e3a4f0d682bc8b5b46b4f/src/circrl/probing.py#L47). For this particular version we can process the episodic data into files of numpy arrays or pytorch tensors. We'll choose pytorch tensors here. 

Alternatively the data could be batched in files to reduce read/write overhead. It might be worth doing if compute bottlenecks become an obvious concern.

#### Regression Probe

Using the Ridge Regressor provided by SKLearn.

In [91]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split


def train_regression_episode(model: Ridge, layer_name: str, episode_data_path: Path, random_seed: int = 42):
    _, _, layer_activations, cheese_positions = load_logs(episode_data_path)
    assert layer_name in layer_activations[0].keys(), "Cannot find layer '{layer_name}' in activations"
    
    test_size = 0.2
    inputs = torch.stack([act[layer_name] for act in layer_activations], dim=0)
    inputs = inputs.detach().numpy()
    targets = torch.tensor(cheese_positions)
    targets = targets.detach().numpy()

    inputs_train, inputs_test, targets_train, targets_test = train_test_split(
        inputs, targets, test_size=test_size, random_state=random_seed
    )

    model.fit(inputs_train, targets_train)
    result = {
        "train_score": model.score(inputs_train, targets_train),
        "test_score": model.score(inputs_test, targets_test),
        "x_train": inputs_train,
        "y_train": targets_train,
        "x_test": inputs_test,
        "y_test": targets_test,
        "model": model,
    }



In [92]:
seed = 42
mdl = Ridge(random_state=seed)
layer_name = "embedder.block1.conv"
data_path = save_dir / "logs" / "ep_1.pkl"

mdl = train_regression_episode(mdl, layer_name, data_path, seed)

ValueError: Found array with dim 5. Ridge expected <= 2.