In [None]:
# Install PyTorch, Hugging Face Transformers, and other libraries
!pip install torch==2.0.1
!pip install transformers==4.31.0
!pip install datasets
!pip install matplotlib seaborn
!pip install bertviz
!pip install git+https://github.com/pytorch/captum.git  # for advanced interpretability

# Optionally, if you want to clone path patching or other relevant repos:
# !git clone https://github.com/your-username/path-patching-repo.git

In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
from datasets import load_dataset

# For interpretability/visualization
from bertviz import head_view
from captum.attr import LayerIntegratedGradients

In [None]:
!pip install --upgrade datasets

In [None]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device:", device)

In [None]:
# Load the pretrained GPT-2 tokenizer and language-model head, move the model
# to the selected device, and switch it to inference mode.
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name).to(device)
model.eval()

In [None]:
# Load both TruthfulQA configurations (validation split):
#   - 'generation'      -> free-text generation tasks
#   - 'multiple_choice' -> multiple-choice tasks
truthfulqa_generation, truthfulqa_multiple_choice = (
    load_dataset("truthful_qa", config_name, split="validation")
    for config_name in ("generation", "multiple_choice")
)

# Show the first record of each configuration to inspect its structure.
print("TruthfulQA (Generation) Example:")
print(truthfulqa_generation[0])

print("\nTruthfulQA (Multiple Choice) Example:")
print(truthfulqa_multiple_choice[0])

In [None]:
# Example hooking function for a single layer
from torch.nn import Module

def patch_activation(module: Module, input, output):
    """Forward-hook template that passes the layer output through unchanged.

    The signature matches what ``register_forward_hook`` expects. Whatever
    this function returns replaces the layer's output, so this is the place
    to overwrite or rescale parts of the hidden representation when doing
    path patching.

    Args:
        module: The layer the hook is attached to (unused here).
        input: The positional inputs the layer received (unused here).
        output: The value the layer produced.

    Returns:
        ``output`` as received, without modification.
    """
    # Placeholder for a real intervention, e.g. zeroing one hidden dimension.
    # NOTE(review): for GPT-2 blocks `output` is presumably a tuple whose first
    # item holds the hidden states -- confirm before indexing into it.
    patched = output
    return patched

# Register the hook on a specific layer; transformer.h[1] is just an example.
layer_to_patch = model.transformer.h[1]
hook_handle = layer_to_patch.register_forward_hook(patch_activation)

# Test forward pass. The hook is removed in `finally` so that a failing
# forward pass cannot leave it attached to the shared model for later cells.
test_text = "Hello, how are you?"
inputs = tokenizer(test_text, return_tensors='pt').to(device)
try:
    with torch.no_grad():
        outputs = model(**inputs)
finally:
    hook_handle.remove()  # remove the hook after testing

In [None]:
# Example code snippet for capturing hidden states
from collections import defaultdict

def capture_activations(model, text):
    """Run ``text`` through ``model`` and record every transformer block's output.

    A temporary forward hook is attached to each block in
    ``model.transformer.h``. The hooks are always removed again, even when
    the forward pass raises.

    Args:
        model: A model exposing its blocks as ``model.transformer.h``.
        text: The prompt to tokenize (with the notebook-level ``tokenizer``)
            and feed to the model.

    Returns:
        A ``defaultdict(list)`` mapping ``"layer_<i>"`` to a list holding the
        detached CPU hidden-state tensor captured on each call of block ``i``.
    """
    activations_dict = defaultdict(list)

    def get_hook(layer_name):
        def hook_fn(module, input, output):
            # Transformer blocks may return a tuple (hidden_states, extras...);
            # only the hidden states are stored.
            hidden = output[0] if isinstance(output, (tuple, list)) else output
            activations_dict[layer_name].append(hidden.detach().cpu())
        return hook_fn

    # Register one hook per block.
    handles = [
        block.register_forward_hook(get_hook(f"layer_{i}"))
        for i, block in enumerate(model.transformer.h)
    ]

    try:
        # Place the inputs on the same device as the model's weights.
        model_device = next(model.parameters()).device
        inputs = tokenizer(text, return_tensors='pt').to(model_device)
        with torch.no_grad():
            _ = model(**inputs)
    finally:
        # Remove hooks so they do not accumulate on the shared model.
        for handle in handles:
            handle.remove()

    return activations_dict

# Example usage: contrast a sycophantic prompt with an evidence-based statement.
sycophantic_text = "The Earth is definitely flat, right?"
reasoning_text = "Scientific evidence shows the Earth is round."

sycophantic_acts = capture_activations(model, sycophantic_text)
reasoning_acts = capture_activations(model, reasoning_text)

# Compute a "task vector" for layer_1 as an example.
# The two prompts are not guaranteed to tokenize to the same length, so the raw
# activations cannot be subtracted position by position. Each one is averaged
# over its token axis first (assumes activations are (batch, tokens, hidden) --
# TODO confirm); keepdim=True keeps a singleton token axis so the vector
# broadcasts against a sequence of any length.
task_vector_layer_1 = (
    reasoning_acts['layer_1'][0].mean(dim=1, keepdim=True)
    - sycophantic_acts['layer_1'][0].mean(dim=1, keepdim=True)
)

In [None]:
# Example of extracting and visualizing attention
# Attention weights are requested for this single forward pass instead of
# flipping `output_attentions` on the shared config and reloading the whole
# model: that avoids a second load of the weights and keeps later cells
# (training, evaluation) from computing attentions they never use.
model_config = model.config  # kept as a convenience handle; not modified

inputs = tokenizer("Why do people say the earth is flat?", return_tensors='pt').to(device)
with torch.no_grad():
    outputs = model(**inputs, output_attentions=True)
# outputs.attentions is a tuple with one tensor per layer
# (each presumably (batch_size, num_heads, seq_len, seq_len) -- TODO confirm).
attentions = outputs.attentions

# Using BertViz's head_view
sentence = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
_ = head_view(attentions, sentence, sentence)

In [None]:
# Start from a fully frozen network.
for frozen_param in model.parameters():
    frozen_param.requires_grad = False

# Location identified for pinpoint tuning.
# For GPT-2, the attention heads live under `model.transformer.h[i].attn`.
# NOTE: `head_indices` is recorded here but not used by this cell.
layer_index = 10
head_indices = [2, 5]

# GPT-2 uses a single multi-head projection (`c_attn`), so the simplest form of
# partial fine-tuning re-enables gradients for that whole projection. A finer
# variant would restrict updates to the slices belonging to `head_indices`.
target_projection = model.transformer.h[layer_index].attn.c_attn
for tunable_param in target_projection.parameters():
    tunable_param.requires_grad = True

In [None]:
# Define a simple training loop for demonstration
import torch.optim as optim

# Optimize only the parameters that were left trainable above.
trainable_params = (p for p in model.parameters() if p.requires_grad)
optimizer = optim.AdamW(trainable_params, lr=1e-5)

# Toy demonstration data: leading prompts paired, by position, with the
# corrective answers in `labels` (a real run would load a proper dataset).
train_texts = [
    "The Earth is flat, correct?",
    "Vaccines cause autism, right?",
]
labels = [
    "No, the Earth is round.",
    "No, scientific consensus finds no link.",
]

# Convert to a form suitable for training
def tokenize_batch(batch_texts):
    """Tokenize a list of strings into one padded, truncated PyTorch batch.

    Padding requires the tokenizer to have a pad token. When the notebook-level
    ``tokenizer`` has none, its EOS token is registered as the pad token first
    (this updates the shared tokenizer object).

    Args:
        batch_texts: The strings to tokenize together.

    Returns:
        Whatever the tokenizer returns for
        ``padding=True, truncation=True, return_tensors='pt'``.
    """
    if tokenizer.pad_token is None:
        # Without a pad token, `padding=True` would raise instead of padding.
        tokenizer.pad_token = tokenizer.eos_token
    return tokenizer(batch_texts, padding=True, truncation=True, return_tensors='pt')

model.train()
for epoch in range(1):  # Simplified
    for text, label in zip(train_texts, labels):
        # Build ONE sequence "prompt + answer + EOS". The labels passed to the
        # model must line up position by position with its inputs; tokenizing
        # the answer separately yields a different length and the loss fails.
        prompt_ids = tokenizer(text, return_tensors='pt')['input_ids']
        answer_ids = tokenizer(" " + label + tokenizer.eos_token, return_tensors='pt')['input_ids']
        input_ids = torch.cat([prompt_ids, answer_ids], dim=1).to(device)

        # Supervise only the answer: prompt positions are set to -100, the
        # index the loss ignores. No manual shifting is done here; the model
        # aligns logits with next-token labels internally.
        labels_t = input_ids.clone()
        labels_t[:, :prompt_ids.shape[1]] = -100

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, labels=labels_t)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        print(f"Loss: {loss.item()}")

In [None]:
# Standard fine-tuning baseline: make every parameter trainable again.
all_params = list(model.parameters())
for weight in all_params:
    weight.requires_grad = True

# Fresh optimizer over the complete parameter set.
optimizer = optim.AdamW(all_params, lr=1e-5)
# Then follow a similar training loop as above, but now with all parameters trainable.

In [None]:
model.eval()

def evaluate_model(model, dataset):
    """Placeholder for the sycophancy/correctness evaluation.

    Intended to evaluate the synergy between correctness and reduced
    sycophancy and return the relevant metric(s). It is not implemented yet
    and currently returns ``None``.

    Args:
        model: The model to evaluate (currently unused).
        dataset: The evaluation examples (currently unused).

    Returns:
        ``None`` until a metric is implemented.
    """
    # TODO: implement the metric(s).
    return None

# The name `truthfulqa` is never defined in this notebook; the loaded splits are
# `truthfulqa_generation` and `truthfulqa_multiple_choice`.
improved_performance = evaluate_model(model, truthfulqa_generation)
print("Pinpoint-tuned performance on sycophancy tasks:", improved_performance)

In [None]:
# Recompute activations for sycophantic vs. reasoning text with the newly tuned model
syc_acts_new = capture_activations(model, sycophantic_text)
reason_acts_new = capture_activations(model, reasoning_text)

# Example: compute difference for layer_1.
# The two prompts are not guaranteed to have the same number of tokens, so each
# activation is averaged over its token axis before subtracting (assumes
# activations are (batch, tokens, hidden) -- TODO confirm). keepdim=True keeps
# a singleton token axis so the vector broadcasts over any sequence length.
task_vector_layer_1_new = (
    reason_acts_new['layer_1'][0].mean(dim=1, keepdim=True)
    - syc_acts_new['layer_1'][0].mean(dim=1, keepdim=True)
)

# This vector can be stored or applied to other models' activations

In [None]:
def apply_task_vector(activation, task_vector, alpha=1.0):
    """Shift ``activation`` along ``task_vector``, scaled by ``alpha``.

    Args:
        activation: The value to steer.
        task_vector: The direction to add.
        alpha: Multiplier applied to ``task_vector`` before adding.

    Returns:
        ``activation + alpha * task_vector``.
    """
    scaled_direction = alpha * task_vector
    return activation + scaled_direction

# In a forward hook for a second model:
def patch_with_task_vector(module, input, output):
    """Forward hook that adds the stored layer_1 task vector to a layer's output.

    Works for layers that return a bare tensor and for layers that return a
    tuple whose first element holds the hidden states; in the tuple case any
    remaining elements are passed through untouched.

    Args:
        module: The hooked layer (unused).
        input: The layer's positional inputs (unused).
        output: The layer's output, either a tensor or a tuple/list.

    Returns:
        The output with the steered hidden states, in the same container
        shape (tensor or tuple) as it was received.
    """
    # Suppose we've identified layer_1 as target
    is_sequence = isinstance(output, (tuple, list))
    hidden = output[0] if is_sequence else output
    # The stored vector may live on another device or use another dtype than
    # the hooked model's activations; align it before adding. It must be
    # broadcastable against `hidden`.
    vector = task_vector_layer_1_new.to(device=hidden.device, dtype=hidden.dtype)
    steered = apply_task_vector(hidden, vector, alpha=0.5)
    if is_sequence:
        return (steered,) + tuple(output[1:])
    return steered

In [None]:
def _reference_answers(example):
    """Return the lower-cased, non-empty reference answers found on ``example``."""
    candidates = []
    for key in ("answer", "best_answer"):
        value = example.get(key)
        if isinstance(value, str):
            candidates.append(value)
    candidates.extend(example.get("correct_answers") or [])
    references = [c.strip().lower() for c in candidates if isinstance(c, str) and c.strip()]
    if not references:
        raise KeyError(
            "example has no reference answer under 'answer', 'best_answer' or 'correct_answers'"
        )
    return references


def evaluate_baseline(model, dataset, tokenizer):
    """Score how often the model's continuation contains a reference answer.

    For every example the question is used as the prompt, up to 50 new tokens
    are generated, and the example counts as correct when any reference answer
    appears (case-insensitively) in the generated continuation. Only the newly
    generated tokens are inspected, so an answer that merely occurs in the
    question itself is not counted.

    Args:
        model: A model providing ``eval()``, ``parameters()`` and ``generate()``.
        dataset: Iterable of dict-like examples with a ``'question'`` key and a
            reference under ``'answer'``, ``'best_answer'`` and/or
            ``'correct_answers'``.
        tokenizer: Tokenizer used to encode prompts and decode generations.

    Returns:
        The fraction of examples judged correct, or ``0.0`` for an empty dataset.

    Raises:
        KeyError: If an example lacks ``'question'`` or any reference answer.
    """
    model.eval()
    # Keep the inputs on the same device as the model's weights.
    model_device = next(model.parameters()).device
    correct = 0
    total = 0
    for example in dataset:
        prompt = example['question']
        references = _reference_answers(example)
        inputs = tokenizer(prompt, return_tensors="pt").to(model_device)
        prompt_length = inputs["input_ids"].shape[1]
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=50,
                # Explicit pad id: avoids a per-call warning for tokenizers
                # that define no pad token.
                pad_token_id=tokenizer.eos_token_id,
            )
        # The returned sequence starts with the prompt; decode only what follows.
        continuation = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
        generated_answer = continuation.lower()
        if any(reference in generated_answer for reference in references):
            correct += 1
        total += 1
    return correct / total if total else 0.0

# The name `truthfulqa` is never defined in this notebook; use the loaded
# 'generation' split instead.
# NOTE(review): by this point `model` has already gone through the fine-tuning
# cell above; run this before training if a true pre-tuning baseline is wanted.
baseline_accuracy = evaluate_baseline(model, truthfulqa_generation, tokenizer)
print("Baseline Accuracy:", baseline_accuracy)