# Dependencies

In [None]:
!pip install git+https://github.com/davidbau/baukit
!pip install accelerate
!pip install einops

In [1]:
import os
from getpass import getpass

# Never paste the access token into the notebook itself (it would be saved and
# shared with the file). Read it from the HUGGINGFACE_TOKEN environment
# variable; if that is unset or empty, prompt for it without echoing the input.
HUGGINGFACE_TOKEN = os.environ.get("HUGGINGFACE_TOKEN") or getpass("Hugging Face access token: ")

In [None]:
import json
import pickle
import random
import warnings

import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import transformers
from einops import rearrange
from IPython.display import display, HTML
from scipy.stats import spearmanr
from sklearn.linear_model import Ridge, RidgeClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import KFold, train_test_split
from sklearn.neural_network import MLPRegressor
from tqdm.auto import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

from baukit import Trace, TraceDict
from custom_llama import llama # modified code to access attention head outputs

warnings.filterwarnings("ignore")

# Preprocessing Data

In [4]:
def _given_names(bioname):
    """Return the text between the first and second ', ' of a 'LAST, First Middle' name."""
    return bioname.split(', ')[1]


def _first_name(given):
    """Return the first space-delimited token of `given`, followed by one space."""
    if ' ' in given:
        return given.split(' ')[0] + ' '
    return given + ' '


def _middle_initial(given):
    """Return 'X. ' built from the first character of the second token, or '' if there is none."""
    if ' ' in given:
        return given.split(' ')[1][0] + '. '
    return ''


def _surname(bioname):
    """Return the part before the first ', ', lower-cased, then capitalized and stripped."""
    return bioname.split(', ')[0].lower().capitalize().strip()


# Load the member table and keep only rows that have a first-dimension NOMINATE score.
df = (
    pd.read_csv('./data_replication/HS116_members.csv')
    .loc[lambda members: pd.notnull(members.nominate_dim1)]
    .reset_index(drop=True)
)

# Derive the display name used in the prompts: "First M. Last".
df['first_middle_name'] = df['bioname'].apply(_given_names)
df['first_name'] = df['first_middle_name'].apply(_first_name)
df['middle_name'] = df['first_middle_name'].apply(_middle_initial)
df['last_name'] = df['bioname'].apply(_surname)
df['fullname'] = df['first_name'] + df['middle_name'] + df['last_name']

In [None]:
# Load the news-outlet table. Later cells read its `moniker_name` column (to
# build prompts) and its `bias_mean` column (as the regression label).
adfontesmedia = pd.read_csv('./data_replication/adfontesmedia.csv')

# Probing Partisan Slant

## Prompting

In [29]:
# Tokenize the politician and news-source prompts once per model and cache the
# resulting input-id tensors on disk for the activation-extraction step.
for model_name in ['meta-llama/Llama-2-7b-chat-hf', 'mistralai/Mistral-7B-Instruct-v0.1', 'lmsys/vicuna-7b-v1.5']:
    tokenizer = llama.LlamaTokenizer.from_pretrained(model_name, cache_dir='./model', token=HUGGINGFACE_TOKEN)
    # Shared path prefix; '/' in the model id is replaced so it is a valid file name.
    file_prefix = f'./results_replication/{model_name.replace("/", "_")}'

    # Generate prompts for politician simulation (one per row of `df`, in row order)
    statements = [
        tokenizer(f'USER: Generate a statement by {fullname}, a politician in the United States.\nASSISTANT: In 2019, {fullname} said that', return_tensors="pt")['input_ids']
        for fullname in df['fullname']
    ]
    # Use a context manager so the file is flushed and closed deterministically.
    with open(f'{file_prefix}_politicians.pkl', 'wb') as handle:
        pickle.dump(statements, handle)

    # Generate prompts for news media simulation (one per row of `adfontesmedia`, in row order)
    statements_news = [
        tokenizer(f'USER: Generate a statement from a news source in the United States.\nASSISTANT: {moniker_name} reported that', return_tensors="pt")['input_ids']
        for moniker_name in adfontesmedia['moniker_name']
    ]
    with open(f'{file_prefix}_news.pkl', 'wb') as handle:
        pickle.dump(statements_news, handle)

## Extracting Activations

In [None]:
def extract_attention_head_activations(model, statements):
    """Collect the last-token attention-head outputs of every layer for each prompt.

    Parameters
    ----------
    model
        Loaded model whose ``model.layers.{i}.self_attn.head_out`` modules can
        be traced, and whose ``config`` provides ``num_hidden_layers`` and
        ``num_attention_heads``.
    statements
        Sequence of input-id tensors, one prompt per element.

    Returns
    -------
    numpy.ndarray
        One entry per prompt, with axes (prompt, 1, layer, head, head_dim).

    Notes
    -----
    The result depends only on ``model`` and ``statements``. Earlier versions
    zipped against a global ``labels`` array, which silently truncated the
    output to ``len(labels)`` and failed when that global was not defined.
    """
    HEADS = [f"model.layers.{i}.self_attn.head_out" for i in range(model.config.num_hidden_layers)]
    features = []
    for prompt in tqdm(statements, total=len(statements)):
        with torch.no_grad():
            with TraceDict(model, HEADS) as ret:
                # Only the traced module outputs are needed; the model's return value is discarded.
                model(prompt.to('cuda'), output_hidden_states=True, output_attentions=True)
                per_layer = [ret[head].output.squeeze().detach().cpu() for head in HEADS]
        # Stack the layers: (layer, token, heads * head_dim).
        per_layer = torch.stack(per_layer, dim=0).squeeze().numpy()
        # Keep only the last token so full-sequence activations are not retained
        # for every prompt; np.array copies the slice out of the larger buffer.
        last_token = np.array(per_layer[:, -1, :])
        # Split the flat hidden dimension into (head, head_dim); the list adds a leading axis of size 1.
        features.append(rearrange([last_token], 'b l (h d) -> b l h d', h=model.config.num_attention_heads))
    return np.stack(features, axis=0)

# For each model, extract last-token head activations for the cached politician
# and news prompts and store them on disk together with their labels.
for model_name in ['meta-llama/Llama-2-7b-chat-hf', 'mistralai/Mistral-7B-Instruct-v0.1', 'lmsys/vicuna-7b-v1.5']:
    model = llama.LlamaForCausalLM.from_pretrained(model_name, cache_dir='./model', low_cpu_mem_usage=True, torch_dtype=torch.float16, token=HUGGINGFACE_TOKEN).to('cuda:0')
    # Shared path prefix; '/' in the model id is replaced so it is a valid file name.
    file_prefix = f"./results_replication/{model_name.replace('/', '_')}"

    # Extract activations for politicians
    with open(f'{file_prefix}_politicians.pkl', 'rb') as handle:
        statements = pickle.load(handle)
    labels = np.array(df['nominate_dim1'].astype(float))
    features = extract_attention_head_activations(model, statements)
    # Context managers make sure every file is flushed and closed.
    with open(f'{file_prefix}_politician_features.pkl', 'wb') as handle:
        pickle.dump((features, labels), handle)

    # Extract activations for news media
    with open(f'{file_prefix}_news.pkl', 'rb') as handle:
        statements_news = pickle.load(handle)
    labels = np.array(adfontesmedia['bias_mean'].astype(float))
    features = extract_attention_head_activations(model, statements_news)
    with open(f'{file_prefix}_news_features.pkl', 'wb') as handle:
        pickle.dump((features, labels), handle)

## Probing

In [None]:
# Number of cross-validation folds used to score each attention head.
N_SPLITS = 2

# For every model, fit one ridge probe per (layer, head) on the politician
# activations and record its cross-validated Spearman correlation with the labels.
for model_name in ['meta-llama/Llama-2-7b-chat-hf', 'mistralai/Mistral-7B-Instruct-v0.1', 'lmsys/vicuna-7b-v1.5']:
    file_prefix = f"./results_replication/{model_name.replace('/', '_')}"
    with open(f"{file_prefix}_politician_features.pkl", 'rb') as handle:
        features, labels = pickle.load(handle)
    labels = np.array(labels)

    # `features` is indexed as [prompt, 0, layer, head, :]. Take the grid size
    # from the data itself instead of from whichever `model` object happens to
    # be left over in the kernel from a previous cell.
    num_layers, num_heads = features.shape[2], features.shape[3]

    # The splitter is seeded, so the folds are the same for every head; build them once.
    splits = list(KFold(n_splits=N_SPLITS, shuffle=True, random_state=42).split(range(features.shape[0])))

    performance = np.zeros((num_layers, num_heads))
    ridge_dict = {}
    for i in tqdm(range(num_layers)):
        ridge_dict[i] = {}
        for j in range(num_heads):
            for train_indices, test_indices in splits:
                X_train = features[train_indices, 0, i, j, :]
                X_test = features[test_indices, 0, i, j, :]
                y_train = labels[train_indices]
                y_test = labels[test_indices]
                ridge_model = Ridge(alpha=1, fit_intercept=False)
                ridge_model.fit(X_train, y_train)
                # Each fold overwrites the previous one, so the probe kept for
                # this head is the one fitted on the last fold's training split.
                ridge_dict[i][j] = ridge_model
                y_pred = ridge_model.predict(X_test)
                performance[i, j] += spearmanr(y_test, y_pred).statistic
    # Average the per-fold correlations.
    performance /= N_SPLITS

    with open(f"{file_prefix}_politician_performance.pkl", 'wb') as handle:
        pickle.dump(performance, handle)
    with open(f"{file_prefix}_ridge.pkl", 'wb') as handle:
        pickle.dump(ridge_dict, handle)

# Intervention

In [None]:
def lt_modulated_vector_add(head_output, layer_name):
    """Shift the last-token output of selected attention heads along their probe direction.

    Intended as the ``edit_output`` callback of ``TraceDict``: it receives the
    output of a traced ``model.layers.{i}.self_attn.head_out`` module and
    returns the edited tensor.

    Reads these notebook globals, which must be set before the hook fires:
    ``model`` (for ``config.num_attention_heads``), ``head_dict`` (layer index
    -> head indices to edit), ``focal_ridge_dict`` ((layer, head) -> ridge
    coefficient vector), ``alpha`` (scaling factor) and ``features``
    (activations indexed as [prompt, 0, layer, head, :]).

    Parameters
    ----------
    head_output
        Tensor laid out as (batch, sequence, heads * head_dim).
    layer_name
        Name of the traced module, starting with ``'model.layers.{i}.'``.

    Returns
    -------
    Tensor with the same layout as ``head_output``, on the same device it came from.
    """
    # The layer index is the component right after the 'model.layers.' prefix.
    layer_index = int(layer_name[len('model.layers.'):].split('.', 1)[0])

    # Return the result to wherever the input lived instead of assuming 'cuda'.
    source_device = head_output.device
    heads = rearrange(head_output.detach().cpu(), 'b s (h d) -> b s h d', h=model.config.num_attention_heads)
    for head_index in head_dict[layer_index]:
        head_activations = features[:, 0, layer_index, head_index, :]
        # Probe direction scaled by alpha and by the per-dimension spread of this head's activations.
        shift = alpha * focal_ridge_dict[(layer_index, head_index)] * np.std(head_activations, axis=0)
        # Do the addition explicitly in float64 torch arithmetic rather than
        # relying on implicit tensor/ndarray operator dispatch; assigning back
        # into `heads` casts the sum to the tensor's own dtype.
        shift = torch.as_tensor(shift, dtype=torch.float64)
        heads[:, -1, head_index, :] = heads[:, -1, head_index, :].double() + shift
    return rearrange(heads, 'b s h d -> b s (h d)').to(source_device)

# For every model, steer the top-k probed heads with strength alpha while
# generating a statement on each topic, and store the generated text.
for model_name in ['meta-llama/Llama-2-7b-chat-hf', 'mistralai/Mistral-7B-Instruct-v0.1', 'lmsys/vicuna-7b-v1.5']:
    tokenizer = llama.LlamaTokenizer.from_pretrained(model_name, cache_dir='./model', token=HUGGINGFACE_TOKEN)
    model = llama.LlamaForCausalLM.from_pretrained(model_name, cache_dir='./model', low_cpu_mem_usage=True, torch_dtype=torch.float16, 
                                                   token=HUGGINGFACE_TOKEN).to('cuda:0')
    file_prefix = f"./results_replication/{model_name.replace('/', '_')}"
    with open(f"{file_prefix}_politician_performance.pkl", 'rb') as handle:
        performance = pickle.load(handle)
    # `features` is read as a global by lt_modulated_vector_add.
    with open(f"{file_prefix}_politician_features.pkl", 'rb') as handle:
        features, labels = pickle.load(handle)
    with open(f"{file_prefix}_ridge.pkl", 'rb') as handle:
        trained_ridge_dict = pickle.load(handle)
    topics = ['abortion', 'immigration', 'gun control', 'same-sex marriage', 'LGBTQ+ rights', 
              'climate change', 'trust in science', 'social media regulation', 'free speech']

    # Rank all (layer, head) pairs from best to worst probe score. The grid
    # shape comes from `performance` itself rather than a hard-coded (32, 32).
    # argsort places NaN last, which would rank undefined scores as the best
    # heads, so NaN is mapped to -inf before sorting.
    sortable_scores = np.where(np.isnan(performance), -np.inf, performance)
    ranked_heads = np.dstack(np.unravel_index(np.argsort(sortable_scores.ravel()), performance.shape))[0][::-1]

    results = []
    for k in reversed([16, 32, 48, 64, 80, 96]):
        # Head selection depends only on k, so build it once per k.
        # `focal_ridge_dict` and `head_dict` are read as globals by lt_modulated_vector_add.
        top_indices = ranked_heads[:k]
        focal_ridge_dict = {}
        head_dict = {}
        for layer, head in top_indices:
            focal_ridge_dict[(layer, head)] = trained_ridge_dict[layer][head].coef_
            head_dict.setdefault(layer, []).append(head)
        traced_layers = [f'model.layers.{i}.self_attn.head_out' for i in sorted(head_dict)]

        # A list (not a bare `reversed` iterator) lets tqdm show the total.
        # `alpha` is read as a global by lt_modulated_vector_add.
        for alpha in tqdm(list(reversed([-30, -20, -10, 0, 10, 20, 30]))): # Add -50, -40, 40, 50 for the coherence tests
            for topic in topics:
                print(k, alpha, topic)
                original_prompt = f"Write a statement about {topic}."
                input_ids = tokenizer(f"USER: {original_prompt}\nASSISTANT: Regarding {topic}, I believe that ", return_tensors="pt")['input_ids']
                with TraceDict(model, traced_layers, edit_output=lt_modulated_vector_add):
                    # The final prompt token is dropped before generation.
                    model_gen_tokens = model.generate(input_ids.to('cuda')[0][:-1].unsqueeze(0), max_length=200)
                model_gen_str = tokenizer.decode(model_gen_tokens[0], skip_special_tokens=True)
                model_gen_str = model_gen_str.strip()
                results.append([k, alpha, topic, model_gen_str])
    with open(f"{file_prefix}_intervention_results.pkl", 'wb') as handle:
        pickle.dump(results, handle)

In [None]:
# Concatenate the per-model intervention outputs and write them as one parquet file.
results = []
for model_name in ['meta-llama/Llama-2-7b-chat-hf', 'mistralai/Mistral-7B-Instruct-v0.1', 'lmsys/vicuna-7b-v1.5']:
    # Context manager so each pickle file is closed as soon as it has been read.
    with open(f"./results_replication/{model_name.replace('/','_')}_intervention_results.pkl", 'rb') as handle:
        results.extend(pickle.load(handle))

# NOTE(review): each row is [k, alpha, topic, generated text] with no model
# identifier, so rows from different models can only be told apart by their
# position in the file. Consider adding a model column if consumers allow it.
pd.DataFrame(results).to_parquet('intervention_raw.parquet')