In [1]:
from generate_acts import load_model
import torch as th
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader
import plotly.express as px
from probes import LRProbe
from tqdm.notebook import tqdm
import random

def load_dataset(dataset_name, data_dir="datasets"):
    """Read one dataset CSV into a DataFrame.

    Args:
        dataset_name: File stem of the CSV, without the ``.csv`` extension.
        data_dir: Directory that holds the CSV files. Defaults to
            ``"datasets"``, which is resolved relative to the current
            working directory.

    Returns:
        The ``pandas.DataFrame`` that ``pd.read_csv`` produces for
        ``{data_dir}/{dataset_name}.csv``.
    """
    return pd.read_csv(f"{data_dir}/{dataset_name}.csv")

# Use one seed for every RNG the notebook touches (torch, numpy, stdlib random).
SEED = 42
for seed_fn in (th.manual_seed, np.random.seed, random.seed):
    seed_fn(SEED)

# Run on the GPU when PyTorch reports one; otherwise fall back to the CPU.
device = "cpu"
if th.cuda.is_available():
    device = "cuda"
print(f"Using {device}")

Using cuda


In [2]:
# Load the tokenizer/model pair and switch the model to evaluation mode.
tokenizer, model = load_model("EleutherAI/pythia-160m-alldropout", device=device)
# Reuse the EOS token as the padding token so batches can be padded.
tokenizer.pad_token = tokenizer.eos_token
model.eval()

# Read both statement sets from disk.
cities_alice, neg_cities_alice = (
    load_dataset(name) for name in ("cities_alice", "neg_cities_alice")
)

# Draw one random row order over the combined data and apply it straight to
# the concatenation, so `all_cities` is only ever bound to the shuffled frame.
perm = th.randperm(len(cities_alice) + len(neg_cities_alice))
all_cities = pd.concat([cities_alice, neg_cities_alice]).iloc[perm]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading model EleutherAI/pythia-160m-alldropout...


In [3]:
# Raw statement strings, in the (shuffled) row order of `all_cities`.
statements = all_cities["statement"].tolist()

# One float tensor on `device` per label column, keyed by column name.
label_dict = {
    column: th.tensor(all_cities[column].tolist()).float().to(device)
    for column in ("has_alice", "has_not", "has_alice xor has_not")
}

# Keep the individual tensors available under their own names as well.
has_alice = label_dict["has_alice"]
has_not = label_dict["has_not"]
not_xor_alice = label_dict["has_alice xor has_not"]

In [4]:
# For every entry in `hidden_states`, collect the hidden state at the last real
# (attended) token of each statement. After this cell, `acts[layer]` is a single
# tensor with one row per statement, in the same order as `statements`.
with th.no_grad():
    # One index per hidden-state tensor the model is asked to return below.
    layers = list(range(model.config.num_hidden_layers + 1))
    acts = [[] for _ in layers]
    dataloader = DataLoader(
        statements,
        batch_size=128,
        shuffle=False,  # keep activations aligned with the label tensors
    )
    for statement_batch in dataloader:
        batch = tokenizer(statement_batch, return_tensors="pt", padding=True).to(device)
        # Locate the last attended position from the attention mask rather than by
        # counting ids that differ from pad_token_id: the pad token is set to EOS
        # in this notebook, so a genuine EOS id would be miscounted, and a count
        # is only a valid index under right-padding. Taking the largest attended
        # position works for either padding side.
        positions = th.arange(batch.attention_mask.shape[1], device=device)
        sequence_index = (batch.attention_mask * positions).max(dim=1).values
        batch_rows = th.arange(len(batch.input_ids), device=device)
        h_states = model(**batch, output_hidden_states=True).hidden_states
        for layer in layers:
            # Pick one vector per statement: (row i, last attended position of row i).
            acts[layer].append(h_states[layer][batch_rows, sequence_index])
    # Merge the per-batch pieces into one tensor per layer.
    for layer, act in enumerate(acts):
        acts[layer] = th.cat(act, dim=0)

In [5]:
# Fit one LRProbe per (label, layer) pair and record its train/test accuracy.
# Each dict maps a label name to a list indexed by layer.
probes = {}
train_accs = {}
test_accs = {}
from itertools import product
for label in tqdm(["has_alice", "has_not", "has_alice xor has_not"]):
    labels = label_dict[label]
    for layer in layers:
        layer_acts = acts[layer]
        # The first 80% of rows are used for fitting, the remainder for evaluation.
        train_size = int(len(layer_acts) * 0.8)
        train_x, test_x = layer_acts[:train_size], layer_acts[train_size:]
        train_y, test_y = labels[:train_size], labels[train_size:]

        probe = LRProbe(layer_acts.shape[1], bias=True).to(device)
        probe.fit(train_x, train_y)

        train_acc = probe.accuracy(train_x, train_y).item()
        test_acc = probe.accuracy(test_x, test_y).item()
        train_accs.setdefault(label, []).append(train_acc)
        test_accs.setdefault(label, []).append(test_acc)
        probes.setdefault(label, []).append(probe)

  0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
# Plot per-layer train accuracy, one trace per probe target.
fig = px.scatter()
for trace_name in ("has_alice", "has_not", "has_alice xor has_not"):
    fig.add_scatter(x=layers, y=train_accs[trace_name], name=trace_name)
fig.update_layout(
    title="Train Accuracy vs Layer",
    xaxis_title="Layer",
    yaxis_title="Accuracy",
)
fig.show()