This notebook is for comparing a base model (or supervised finetuned model) to its RLHF'd version
Current goal: train an autoencoder from base activations to RLHF'd activations

In [2]:
import argparse
from collections.abc import Generator
from dataclasses import dataclass, field
from datetime import datetime
import importlib
import itertools
import json
import math
import multiprocessing as mp
import os
import pickle
from typing import Union, Tuple, List, Any, Optional, TypeVar, Dict
from transformers import AutoTokenizer, AutoModelForCausalLM

from baukit import Trace
from datasets import Dataset, DatasetDict, load_dataset
from einops import rearrange
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import numpy.typing as npt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchtyping import TensorType
from tqdm import tqdm
from transformer_lens import HookedTransformer
from transformers import PreTrainedTokenizerBase, GPT2Tokenizer
import wandb

from utils import *
from run_copy import *
from argparser import parse_args
from nanoGPT_model import GPT

In [3]:
class ScaleModule(nn.Module):
    def __init__(self, n_dict_components, t_type=torch.float32):
        super(ScaleModule, self).__init__()
        self.scale_factor= nn.Parameter(torch.zeros(n_dict_components))
    
    def forward(self, x):
        return x * self.scale_factor
    
    def scales(self):
        return self.scale_factor.data
        
class EncoderDecoder(nn.Module):
    def __init__(self, activation_size, n_dict_components, t_type=torch.float32, l1_coef=0.0):
        super(EncoderDecoder, self).__init__()
        
        # Only defining the decoder layer, encoder will share its weights
        self.decoder = nn.Linear(n_dict_components, activation_size, bias=True)
        # Create a bias layer
        self.encoder_bias= nn.Parameter(torch.zeros(n_dict_components))
        
        # Add scaling term between encoder and decoder
        self.scaling = ScaleModule(n_dict_components)

        # Initialize the decoder weights orthogonally
        nn.init.orthogonal_(self.decoder.weight)
        self.decoder = self.decoder.to(t_type)

        # Encoder is a Sequential with the ReLU activation
        # No need to define a Linear layer for the encoder as its weights are tied with the decoder
        self.encoder = nn.Sequential(nn.ReLU()).to(t_type)

        self.l1_coef = l1_coef
        self.activation_size = activation_size
        self.n_dict_components = n_dict_components

    def forward(self, x):
        c = self.encoder(x @ self.decoder.weight + self.encoder_bias)
        # Apply unit norm constraint to the decoder weights
        self.decoder.weight.data = nn.functional.normalize(self.decoder.weight.data, dim=0)
        
        # scale encoder outputs
        c = self.scaling(c)

        # Decoding step as before
        x_hat = self.decoder(c)
        return x_hat, c


    @property
    def device(self):
        return next(self.parameters()).device

Make sure to run the following terminal commands to create activation dictionaries:

python3 run_copy.py --model_name [model]

In [43]:
# from transformers import AutoModel, AutoTokenizer

# model_name = "usvsnsp/pythia-2.8b-sft"
# model_name = "gpt2"
# model = AutoModel.from_pretrained(model_name)
# tokenizer = AutoTokenizer.from_pretrained(model_name)

In [45]:
# with Trace(model, "h.0.mlp") as ret:
#     i = 0

In [4]:
# # load dataset
# dataset_folder = "activation_data/pile-10k-gpt2-2"
# with open(os.path.join(dataset_folder, "0.pkl"), "rb") as f:
#     dataset = pickle.load(f)
#     mlp_width = dataset.tensors[0][0].shape[-1]

In [13]:
sys.argv = ['foo']
cfg = parse_args()
cfg.model_name = "gpt2"
cfg.load_activation_dataset = "activation_data/pile-10k-gpt2-2"
cfg.dataset_folder = "activation_data/pile-10k-gpt2-2"
cfg.n_components_dictionary = 512
cfg.use_wandb = False
cfg.mlp_width = 768

In [14]:
# train EncoderDecoder
from run_copy import *
# from run_copy import run_different_target

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
t_type = torch.float32
l1_alpha = 0.1



auto_encoder = EncoderDecoder(cfg.mlp_width, cfg.n_components_dictionary, t_type, l1_coef=l1_alpha).to(device)
auto_encoder, reconstruction_loss, l1_loss, feature_activations, completed_batches = run_different_target(cfg, auto_encoder)

L1 Coef: 1.00E-01 | Dict ratio: 0.6666666666666666 | Batch: 1000/1364 | Chunk: 1/4 | Minirun: 2/1 | Epoch: 1/1 | Reconstruction loss: 0.993337 | l1: 0.000001
L1 Coef: 1.00E-01 | Dict ratio: 0.6666666666666666 | Batch: 636/1364 | Chunk: 2/4 | Minirun: 2/1 | Epoch: 1/1 | Reconstruction loss: 0.946200 | l1: 0.000000
L1 Coef: 1.00E-01 | Dict ratio: 0.6666666666666666 | Batch: 272/1364 | Chunk: 3/4 | Minirun: 2/1 | Epoch: 1/1 | Reconstruction loss: 0.908071 | l1: 0.000000
L1 Coef: 1.00E-01 | Dict ratio: 0.6666666666666666 | Batch: 1272/1364 | Chunk: 3/4 | Minirun: 2/1 | Epoch: 1/1 | Reconstruction loss: 0.874478 | l1: 0.000000
L1 Coef: 1.00E-01 | Dict ratio: 0.6666666666666666 | Batch: 908/1364 | Chunk: 4/4 | Minirun: 2/1 | Epoch: 1/1 | Reconstruction loss: 0.849065 | l1: 0.000000


In [15]:
auto_encoder.scaling.scale_factor.data.max()

tensor(0.0161, device='cuda:0')