# Linear Probe Experiments
A first try at generating data in a procgen maze environment, and using that to construct a linear probe to detect a "cheese signal" in the neural network layers of a pre-trained RL agent. 

## Initialize Environment 

In [1]:
import os
import torch
import numpy as np
from tqdm import tqdm
from procgen import ProcgenGym3Env
from procgen_tools import maze
from procgen_tools.models import load_policy
from procgen_tools import maze as maze_api

In [2]:

from pathlib import Path
import io

save_dir = Path("maze_linear_probe_data")
if not save_dir.is_dir():
    os.mkdir(save_dir)

The gym environment will be created through `procgen-tools`. It provides a wrapper around the original procgen environment to make it compatible with gym. 

In [3]:
seed = 42
wrapped_venv = maze_api.create_venv(
    num=1, start_level=int(seed), num_levels=0, num_threads = 4
)
wrapped_venv.env.get_info()[0].keys()

dict_keys(['prev_level_seed', 'prev_level_complete', 'level_seed', 'rgb'])

In [4]:
# Get the procgen env directly like so
wrapped_venv.env

<CEnv lib_path=/Users/bensturgeon/Library/Caches/pypoetry/virtualenvs/ai-safety-camp-Y8XZewIj-py3.10/lib/python3.10/site-packages/procgen/data/prebuilt/libenv.dylib options={'center_agent': True, 'use_generated_assets': False, 'use_monochrome_assets': False, 'restrict_themes': False, 'use_backgrounds': True, 'paint_vel_info': False, 'distribution_mode': 1, 'env_name': 'maze', 'num_levels': 0, 'start_level': 42, 'num_actions': 15, 'use_sequential_levels': False, 'debug_mode': 0, 'rand_seed': 424112009, 'num_threads': 4, 'render_human': True, 'resource_root': '/Users/bensturgeon/Library/Caches/pypoetry/virtualenvs/ai-safety-camp-Y8XZewIj-py3.10/lib/python3.10/site-packages/procgen/data/assets/'}>

## Load Agent
Download one of the agents from the [trained model files](https://drive.google.com/drive/folders/1Ig7bzRlieyYFcdKL_PM-guSWR8WryDOL). I used `maze_I/model_rand_region_5` without knowing about the performance of this agent. Do not forget to rename the model file or change the filename below.

In [5]:
policy = load_policy('model_rand_region_15.pth', action_size=15, device=torch.device('cpu'))

## Run Episodes

In [6]:

def run_episode(model, maze_environment,logger, argmax=True, max_time_steps=256):
    model.eval()  # Switch off gradient tracking and other training time mechanisms
    obs = maze_environment.reset()
    logger.update_environment(maze_environment) 
    for step in range(max_time_steps):
        out, _ = model(torch.FloatTensor(obs))
        act = out.probs.argmax(dim=-1).numpy() if argmax else out.sample().numpy()
        obs, rew, done, info = maze_environment.step(act)
        logger.log(model, step, act, obs, rew, done)  # Updated call
        if done:
            logger.done()
            break


## Logging Model Activations
Activations can be [logged with `circrl`](https://github.com/montemac/circrl/tree/main). The hook manager logs a single activation by default. Since we want to have activations spread over the entire eposide we will use a custom hook manager that can be reset between episodes.

In [7]:
def get_model_layer_names(model):
    return [name for name, module in model.named_modules()]

In [37]:
names = get_model_layer_names(policy)
print(names[:20])

['', 'embedder', 'embedder.block1', 'embedder.block1.conv', 'embedder.block1.maxpool', 'embedder.block1.res1', 'embedder.block1.res1.relu1', 'embedder.block1.res1.conv1', 'embedder.block1.res1.relu2', 'embedder.block1.res1.conv2', 'embedder.block1.res1.resadd', 'embedder.block1.res2', 'embedder.block1.res2.relu1', 'embedder.block1.res2.conv1', 'embedder.block1.res2.relu2', 'embedder.block1.res2.conv2', 'embedder.block1.res2.resadd', 'embedder.block2', 'embedder.block2.conv', 'embedder.block2.maxpool']


In [None]:
import torch

activations = {}

def get_activation(name):
    def hook(model, input, output):
        print(output)
        activations[name] = output.detach()
    return hook




# Attach the hook to a specific layer, e.g., model.layer1
policy.register_forward_hook(get_activation('embedder.block1.conv'))

# Run your data through the model
observation = torch.tensor(observations[0], dtype=torch.float32).unsqueeze(0)


output = policy(observation)

Create a hook to log the activations of the corresponding layers.

In [9]:
from circrl import hooks

policy_hook = hooks.HookManager(
    model=policy,
    cache=names[1:4]
)

print("Before logging")
print(policy_hook.cache_results.keys())

# Run an environment update in the policy hook context to start logging
with policy_hook:
    obs = wrapped_venv.reset()
    output = policy(torch.FloatTensor(obs))

print("After logging")
print(output)
print(policy_hook.cache_results.keys())

Before logging
dict_keys([])
After logging
(Categorical(logits: torch.Size([1, 15])), tensor([8.0210], grad_fn=<ViewBackward0>))
dict_keys(['embedder.block1.conv', 'embedder.block1', 'embedder'])


Be careful! In the default log setting the `cache_results` gets overwritten every time the policy is run:

In [10]:
with policy_hook:
    obs = wrapped_venv.reset()
    output = policy(torch.FloatTensor(obs))

print("log size after a single execution:")
print(f"\t{policy_hook.cache_results[names[1]].size()}")

with policy_hook:    
    obs = wrapped_venv.reset()
    for _ in range(10):
        output = policy(torch.FloatTensor(obs))

print("Log size after a multiple executions:")
print(f"\t{policy_hook.cache_results[names[1]].size()}")

log size after a single execution:
	torch.Size([1, 256])
Log size after a multiple executions:
	torch.Size([1, 256])


## Gathering Training Data
To gather training data we need to run multiple episodes and store the activations of the model alongside the environment parameters. Procgen-tools provides some convenience functions to make this easier.

In [25]:
def generate_maze_training_data(model, n_episodes,logger, on_episode_step, on_episode_done ):
    max_seed = int(1e9)
    seeds = np.random.default_rng().choice(max_seed, size=n_episodes, replace=False)

    for seed in tqdm(seeds):
        wrapped_venv = maze_api.create_venv(
            num=1, start_level=int(seed), num_levels=0, num_threads = 4
        )  # Convenience functions
        
        run_episode(model, wrapped_venv,logger)
    logger.reset_episode_counter()
    

We need a system to log activations and environment variables persistently. It would be best to avoid having to edit code related to model execution when logging to avoid introducting bugs. We can use something like the following logging setup to log episodic data to disk at the end of every episode. It combines the previous logging function with hooks:

In [26]:
import pickle

class ProbeDataLogger:
    def __init__(self, model, save_file_path=""):
        self._init_logs()

        self.model = model
        layer_names = get_model_layer_names(model)
        self.model_hook = hooks.HookManager(
            model=model,
            cache=names
        )
        save_file_path = Path(save_file_path)
        self._file_path = save_file_path
        self._file_extension = ".pkl"
        self._current_ep = 0
        self.env = None
    
    
    def reset_episode_counter(self):
        """Reset the episode counter to 0."""
        self._current_ep = 0
    def update_environment(self, env):
        """Update the logger to use the current environment instance."""
        self.env = env

    def log(self, model, environment, step, act, obs, rew):
        self.observations.append(obs)
        self.rewards.append(rew)
        self.activations.append(self.model_hook.cache_results)

        # Using these functions only works because the environment is wrapped in
        # the training data generation method
        state = maze_api.state_from_venv(self.env)
        grid = maze_api.get_grid(state.state_vals)
        self.cheese_positions.append(maze_api.get_cheese_pos(grid))

    def done(self):
        self._current_ep += 1
        self.flush()

    def flush(self):
        """Flush data in log buffer to file and reset buffer"""
        save_path = self._file_path / Path("ep_" + str(self._current_ep) + str(self._file_extension))
        print(save_path)
        self._save_logs_to_file(save_path)
        self.reset()

    def reset(self):
        self._init_logs()

    def _save_logs_to_file(self, file_path):
        write_mode = 'wb' 

        obs = torch.tensor(self.observations)
        rew = torch.tensor(self.rewards)
        cheese_positions = torch.tensor(self.cheese_positions)
        
        with open(file_path, write_mode) as file:
            pickle.dump(self.observations, file)
            pickle.dump(self.rewards, file)
            pickle.dump(self.activations, file)
            pickle.dump(self.cheese_positions, file)

    def _init_logs(self):
        self.observations = []
        self.rewards = []
        self.activations = []
        self.cheese_positions = []

Now we can load back the data. Including the loading functionality in the logging class might be convenient, but here I will simply recover it manually as an example.

In [13]:
frames =[]
logs = ProbeDataLogger(policy, save_file_path=save_dir)
with logs.model_hook:
    generate_maze_training_data(policy, 3,logs, logs.log, logs.done)

  0%|          | 0/3 [00:00<?, ?it/s]

  obs = torch.tensor(self.observations)
  rew = torch.tensor(self.rewards)
 33%|███▎      | 1/3 [00:00<00:01,  1.17it/s]

maze_linear_probe_data/ep_1.pkl


 67%|██████▋   | 2/3 [00:01<00:00,  1.76it/s]

maze_linear_probe_data/ep_2.pkl


100%|██████████| 3/3 [00:02<00:00,  1.32it/s]

maze_linear_probe_data/ep_3.pkl





In [14]:
import pickle

def load_logs(file_path):
    """
    Args:
    - file_path (str): The path to the pickle file.
    
    Returns:
    - observations (list): Environment observations.
    - rewards (list): Environment rewards.
    - activations (list): The list of activations unpickled from the file.
    - cheese_positions (list): Cheese position tensors in x, y.
    """
    with open(file_path, 'rb') as file:
        # Unpickle in the order they were saved: observations, rewards, activations
        observations = pickle.load(file)
        rewards = pickle.load(file)
        activations = pickle.load(file)
        cheese_positions = pickle.load(file)
    
    return observations, rewards, activations, cheese_positions


# Example usage
file_path = save_dir / 'ep_1.pkl'
_, _, activations, _ = load_logs(file_path)
print("Loaded activations:", len(activations))

Loaded activations: 20


## Training A Linear Probe
Training a linear probe requires choosing what layers we are probing, and with respect to what target. That in turn determines the structure of the probe.

### Generate Training Data

In this case we will probe an early convolutional layer in the network for cheese position signal as an example. First we load a model and generate some training data.

In [15]:
# Create log directory
import os

log_dir = save_dir / Path("logs")

if not os.path.isdir(log_dir):
    os.mkdir(log_dir)

In [27]:
# Load model and create logger
probe_policy = load_policy('model_rand_region_15.pth', action_size=15, device=torch.device('cpu'))
logger = ProbeDataLogger(probe_policy, save_file_path=log_dir)

In [29]:
# Generate data
with logger.model_hook:
    generate_maze_training_data(probe_policy, 50, logger, logger.log, logger.done)

  rew = torch.tensor(self.rewards)
  2%|▏         | 1/50 [00:01<01:14,  1.51s/it]

maze_linear_probe_data/logs/ep_11.pkl


  4%|▍         | 2/50 [00:03<01:29,  1.87s/it]

maze_linear_probe_data/logs/ep_12.pkl


  8%|▊         | 4/50 [00:04<00:34,  1.34it/s]

maze_linear_probe_data/logs/ep_13.pkl
maze_linear_probe_data/logs/ep_14.pkl
maze_linear_probe_data/logs/ep_15.pkl


 12%|█▏        | 6/50 [00:06<00:43,  1.00it/s]

maze_linear_probe_data/logs/ep_16.pkl


 14%|█▍        | 7/50 [00:07<00:46,  1.08s/it]

maze_linear_probe_data/logs/ep_17.pkl


 16%|█▌        | 8/50 [00:08<00:36,  1.16it/s]

maze_linear_probe_data/logs/ep_18.pkl
maze_linear_probe_data/logs/ep_19.pkl


 20%|██        | 10/50 [00:08<00:25,  1.59it/s]

maze_linear_probe_data/logs/ep_20.pkl
maze_linear_probe_data/logs/ep_21.pkl


 24%|██▍       | 12/50 [00:09<00:17,  2.22it/s]

maze_linear_probe_data/logs/ep_22.pkl


 28%|██▊       | 14/50 [00:17<01:03,  1.77s/it]

maze_linear_probe_data/logs/ep_23.pkl


 30%|███       | 15/50 [00:18<00:51,  1.48s/it]

maze_linear_probe_data/logs/ep_24.pkl


 34%|███▍      | 17/50 [00:27<01:29,  2.72s/it]

maze_linear_probe_data/logs/ep_25.pkl


 38%|███▊      | 19/50 [00:39<01:54,  3.70s/it]

maze_linear_probe_data/logs/ep_26.pkl


 40%|████      | 20/50 [00:43<01:56,  3.89s/it]

maze_linear_probe_data/logs/ep_27.pkl


 42%|████▏     | 21/50 [00:45<01:34,  3.24s/it]

maze_linear_probe_data/logs/ep_28.pkl


 44%|████▍     | 22/50 [00:46<01:14,  2.66s/it]

maze_linear_probe_data/logs/ep_29.pkl


 46%|████▌     | 23/50 [00:47<00:59,  2.19s/it]

maze_linear_probe_data/logs/ep_30.pkl


 48%|████▊     | 24/50 [00:48<00:48,  1.86s/it]

maze_linear_probe_data/logs/ep_31.pkl


 50%|█████     | 25/50 [00:50<00:42,  1.72s/it]

maze_linear_probe_data/logs/ep_32.pkl


 52%|█████▏    | 26/50 [00:50<00:31,  1.32s/it]

maze_linear_probe_data/logs/ep_33.pkl


 54%|█████▍    | 27/50 [00:51<00:29,  1.30s/it]

maze_linear_probe_data/logs/ep_34.pkl


 56%|█████▌    | 28/50 [00:52<00:25,  1.16s/it]

maze_linear_probe_data/logs/ep_35.pkl


 58%|█████▊    | 29/50 [00:53<00:23,  1.11s/it]

maze_linear_probe_data/logs/ep_36.pkl


 60%|██████    | 30/50 [00:53<00:17,  1.16it/s]

maze_linear_probe_data/logs/ep_37.pkl


 64%|██████▍   | 32/50 [00:54<00:11,  1.61it/s]

maze_linear_probe_data/logs/ep_38.pkl
maze_linear_probe_data/logs/ep_39.pkl


 66%|██████▌   | 33/50 [00:55<00:10,  1.61it/s]

maze_linear_probe_data/logs/ep_40.pkl


 68%|██████▊   | 34/50 [00:55<00:08,  1.92it/s]

maze_linear_probe_data/logs/ep_41.pkl


 70%|███████   | 35/50 [00:56<00:10,  1.46it/s]

maze_linear_probe_data/logs/ep_42.pkl


 72%|███████▏  | 36/50 [00:57<00:10,  1.36it/s]

maze_linear_probe_data/logs/ep_43.pkl


 76%|███████▌  | 38/50 [01:07<00:30,  2.53s/it]

maze_linear_probe_data/logs/ep_44.pkl


 78%|███████▊  | 39/50 [01:09<00:23,  2.18s/it]

maze_linear_probe_data/logs/ep_45.pkl


 80%|████████  | 40/50 [01:09<00:16,  1.67s/it]

maze_linear_probe_data/logs/ep_46.pkl


 86%|████████▌ | 43/50 [01:21<00:16,  2.35s/it]

maze_linear_probe_data/logs/ep_47.pkl
maze_linear_probe_data/logs/ep_48.pkl


 88%|████████▊ | 44/50 [01:21<00:10,  1.76s/it]

maze_linear_probe_data/logs/ep_49.pkl


 90%|█████████ | 45/50 [01:23<00:08,  1.72s/it]

maze_linear_probe_data/logs/ep_50.pkl


 92%|█████████▏| 46/50 [01:24<00:06,  1.62s/it]

maze_linear_probe_data/logs/ep_51.pkl


 96%|█████████▌| 48/50 [01:33<00:05,  2.79s/it]

maze_linear_probe_data/logs/ep_52.pkl


 98%|█████████▊| 49/50 [01:35<00:02,  2.31s/it]

maze_linear_probe_data/logs/ep_53.pkl


100%|██████████| 50/50 [01:35<00:00,  1.91s/it]

maze_linear_probe_data/logs/ep_54.pkl





### Probe Training

We need to load the episode data, process it into a format suitable for the training process and then train [a linear probe](https://github.com/montemac/circrl/blob/33534f2f78547b38172e3a4f0d682bc8b5b46b4f/src/circrl/probing.py#L47). For this particular version we can process the episodic data into files of numpy arrays or pytorch tensors. We'll choose pytorch tensors here. 

Alternatively the data could be batched in files to reduce read/write overhead. It might be worth doing if compute bottlenecks become an obvious concern.

#### Regression Probe

Using the Ridge Regressor provided by SKLearn.

In [35]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split


def fit_linear_probe(model: Ridge, layer_name: str, inputs: np.ndarray, targets: np.ndarray, random_seed: int = 42):
    test_size = 0.2
    inputs_train, inputs_test, targets_train, targets_test = train_test_split(
        inputs, targets, test_size=test_size, random_state=random_seed
    )

    model.fit(inputs_train, targets_train)
    return {
        "train_score": model.score(inputs_train, targets_train),
        "test_score": model.score(inputs_test, targets_test),
        "x_train": inputs_train,
        "y_train": targets_train,
        "x_test": inputs_test,
        "y_test": targets_test,
        "model": model,
    }



In [36]:
seed = 42
mdl = Ridge(random_state=seed)
layer_name = "embedder.block1.conv"
data_path = save_dir / "logs" / "ep_1.pkl"

_, _, layer_activations, cheese_positions = load_logs(data_path)
assert layer_name in layer_activations[0].keys(), "Cannot find layer '{layer_name}' in activations"

inputs = torch.stack([act[layer_name] for act in layer_activations], dim=0)
inputs = inputs.detach().numpy()
# Explicit choice to use all the activations as input instead of sampling (might need to reduce input feature space significantly to prevent overfitting)
inputs = np.reshape(inputs, (inputs.shape[0], -1))

targets = torch.tensor(cheese_positions)
targets = targets.detach().numpy()

results = fit_linear_probe(mdl, layer_name, inputs, targets, seed)
print(f"train score: {results['train_score']}")
print(f"test score: {results['test_score']}")

train score: -1.6420198534206065e-13
test score: 0.0


In [32]:
def train_linear_probe(model: Ridge, layer_name: str, episode_data_dir: Path, random_seed: int = 42):
    all_inputs = []
    all_targets = []
    train_scores = []
    test_scores = []

    # Aggregate data from all files first
    for data_file in episode_data_dir.glob('*.pkl'):
        _, _, layer_activations, cheese_positions = load_logs(data_file)
        assert layer_name in layer_activations[0].keys(), "Cannot find layer '{layer_name}' in activations"
        
        in_tmp = torch.stack([act[layer_name] for act in layer_activations], dim=0).detach().numpy()
        in_tmp = np.reshape(in_tmp, (in_tmp.shape[0], -1))
        tar_tmp = torch.tensor(cheese_positions).detach().numpy()

        all_inputs.append(in_tmp)
        all_targets.append(tar_tmp)



    # Concatenate all collected data
    inputs = np.concatenate(all_inputs, axis=0)
    targets = np.concatenate(all_targets, axis=0)

    # Now, train the model once with all the data
    results = fit_linear_probe(model, layer_name, inputs, targets, random_seed)
    train_scores.append(results['train_score'])
    test_scores.append(results['test_score'])

    # Calculate and print overall performance
    mean_train_score = np.mean(train_scores)
    mean_test_score = np.mean(test_scores)
    unique_targets = np.unique(targets)
    print(f"Mean train score: {mean_train_score}")
    print(f"Mean test score: {mean_test_score}")
    print(f"Number of unique targets: {len(unique_targets)}")

    return model

In [33]:
seed = 42
mdl = Ridge(random_state=seed)
layer_name = "embedder.block1.conv"
data_path = save_dir / "logs"

model = train_linear_probe(mdl, layer_name, data_path, seed)

ValueError: Found input variables with inconsistent numbers of samples: [1383, 1382]