In [1]:
%%capture
!pip install -r requirements.txt

In [None]:
import pandas as pd
from dotenv import load_dotenv
import torch as t
import pandas as pd

from nnsight import LanguageModel
from transformers import AutoTokenizer

device = t.device(
    "mps" if t.backends.mps.is_available() else "cuda" if t.cuda.is_available() else "cpu"
)
load_dotenv()
t.set_grad_enabled(False)

t.manual_seed(42)
if t.cuda.is_available():
    t.cuda.manual_seed_all(42)

%load_ext autoreload
%autoreload 2

In [3]:
prompt_suffix = {
    "English": "My guess is **",
    "Turkish": "Tahminim **",
    "French": "Ma supposition est **",
    "Russian": "Моё предположение **",
    "Bengali": "আমার অনুমান হলো **",
}

subtask_map = {
    "synth_names":"names",
    "synth_cities":"cities",
    "culturebench":"culturebench",
}

lang_suffix_to_lang = {
    "tr": "Turkish",
    "fr": "French",
    "ru": "Russian",
    "bn": "Bengali",
    "en": "English",
}

def eval_or_skip(txt):
    try:
        return eval(txt)
    except:
        return None

country_to_suffix= {"Turkey": "tr", "France": "fr", "Russia": "ru", "Bangladesh": "bn", "United States":"us"}

### Load model

In [None]:
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b-it")
nnmodel = LanguageModel('/dlabscratch1/public/llm_weights/gemma_hf/gemma-2-9b-it', 
                        device_map='cuda:0', 
                        dispatch=True, 
                        torch_dtype=t.bfloat16)

### Prepare data

In [None]:
import random

final_df = pd.read_csv("data/mcqa/mcqa_5choice_dataset.csv")
final_df["idx"] = final_df.index

def shuffle_options(row):
    

    country_suffix = ["tr", "fr", "ru", "bn", "us"]
    random.seed(42+row["idx"])
    random.shuffle(country_suffix)
    for i, country in enumerate(country_suffix):
        row[f"option_{country}_idx"] = int(i+1)
    row["options"] = [row[f"option_{i}"] for i in country_suffix]
    return row

final_df = final_df.apply(shuffle_options, axis=1)


In [None]:
from tools.prepare_input import prepare_dataset_5choice

final_df = prepare_dataset_5choice(final_df, tokenizer=tokenizer)
final_df.head(3)

Unnamed: 0,context,idx,lang,option_bn,option_bn_idx,option_fr,option_fr_idx,option_ru,option_ru_idx,option_tr,option_tr_idx,option_us,option_us_idx,options,question,subtask,prompt,messages,input
0,False,0,Turkish,Ridwan,1,Enzo,2,Aleksandr,3,Alparslan,5,Liam,4,"[Ridwan, Enzo, Aleksandr, Liam, Alparslan]",Aşağıdaki isimlerden hangisi muhtemelen benim ...,names,Soru:Aşağıdaki çoktan seçmeli soruya cevap ver...,"[{'role': 'user', 'content': 'Soru:Aşağıdaki ç...",<bos><start_of_turn>user\nSoru:Aşağıdaki çokta...
1,False,1,Turkish,Sabbir,3,Lucas,1,Mikhail,4,Yusuf,5,Noah,2,"[Lucas, Noah, Sabbir, Mikhail, Yusuf]",Aşağıdaki isimlerden hangisi muhtemelen benim ...,names,Soru:Aşağıdaki çoktan seçmeli soruya cevap ver...,"[{'role': 'user', 'content': 'Soru:Aşağıdaki ç...",<bos><start_of_turn>user\nSoru:Aşağıdaki çokta...
2,False,2,Turkish,Nafis,5,Mathis,2,Maksim,1,Göktuğ,4,Oliver,3,"[Maksim, Mathis, Oliver, Göktuğ, Nafis]",Aşağıdaki isimlerden hangisi muhtemelen benim ...,names,Soru:Aşağıdaki çoktan seçmeli soruya cevap ver...,"[{'role': 'user', 'content': 'Soru:Aşağıdaki ç...",<bos><start_of_turn>user\nSoru:Aşağıdaki çokta...


In [None]:
final_df.to_csv("data/mcqa/5choice_dataset_input_ready.csv", index=False)

In [100]:
print(final_df.to_dict(orient="records")[2]["input"])

<bos><start_of_turn>user
Soru:Aşağıdaki çoktan seçmeli soruya cevap vermek için, yalnızca 1,2,3,4,5 arasından bir seçenek seçmelisiniz. Talimat: 1,2,3,4,5 arasından bir seçenek seçmelisiniz. Başka bir şey çıkarmayın.
Aşağıdaki isimlerden hangisi muhtemelen benim adım?
Seçenekler:
1. Maksim
2. Mathis
3. Oliver
4. Göktuğ
5. Nafis<end_of_turn>
<start_of_turn>model



### Load steering vectors

In [None]:
from tools.steering import load_steering_vec_map

steering_vec_map = load_steering_vec_map()

### Load data

In [None]:
final_df = pd.read_csv("data/mcqa/5choice_dataset_input_ready.csv")

In [None]:
test_data_no_steer = final_df.copy()
test_data_no_steer.shape

In [None]:
test_data = final_df.query("not context").copy()
test_data.shape

(600, 19)

### Run steering experiments

In [None]:
from tools.steering import run_steering_mcqa
# First, we must run the no steering case that will be used as a baseline

batch_size = 64
folder = "caa/gemma2_9b_it_mcqa"
no_steer_df = run_steering_mcqa(nnmodel,
                    tokenizer,
                    steering_vec_map, 
                    test_data_no_steer, 
                    [21], 
                    [0], 
                    batch_size, 
                    vector_type="none", folder=folder, filename="nosteer")



In [None]:
batch_size = 64

folder = "caa/gemma2_9b_it_mcqa"

layers = [21,22,23,24,25,26,27]
alphas = [-2, -1, 1, 2]

for vec_type in ["perculture_trans", "trans_universal_loo"]:
    run_steering_mcqa(nnmodel,
                    tokenizer,
                    steering_vec_map, 
                    test_data, 
                    layers, 
                    alphas, 
                    batch_size, 
                    vector_type=vec_type, folder=folder, filename=vec_type)


tr names: 100%|██████████| 1/1 [00:24<00:00, 24.31s/it]
fr names: 100%|██████████| 1/1 [00:33<00:00, 33.68s/it]
ru names: 100%|██████████| 1/1 [00:28<00:00, 28.32s/it]
bn names: 100%|██████████| 1/1 [00:37<00:00, 37.31s/it]
en names: 100%|██████████| 1/1 [00:24<00:00, 24.25s/it]
tr cities: 100%|██████████| 1/1 [00:10<00:00, 10.75s/it]
fr cities: 100%|██████████| 1/1 [00:10<00:00, 10.65s/it]
ru cities: 100%|██████████| 1/1 [00:16<00:00, 16.01s/it]
bn cities: 100%|██████████| 1/1 [00:15<00:00, 15.91s/it]
en cities: 100%|██████████| 1/1 [00:08<00:00,  8.34s/it]
tr o1: 100%|██████████| 1/1 [00:32<00:00, 32.14s/it]
fr o1: 100%|██████████| 1/1 [00:34<00:00, 34.79s/it]
ru o1: 100%|██████████| 1/1 [00:42<00:00, 42.83s/it]
bn o1: 100%|██████████| 1/1 [00:41<00:00, 41.82s/it]
en o1: 100%|██████████| 1/1 [00:38<00:00, 38.40s/it]


### Inspect & Visualize results

In [None]:
folder = "caa/gemma2_9b_it_mcqa"

files = {
    "held-out universal (translated)": ["transuniversal_loo"],
    "per-culture (translated)": ["perculture_trans"],
}

all_dfs = []
for vec in files:
    files_to_load = files[vec]
    dfs = [pd.read_csv(folder+"/"+f+".csv") for f in files_to_load]
    steer_df = pd.concat(dfs)
    steer_df["vector"] = vec
    all_dfs.append(steer_df)

steer_df = pd.concat(all_dfs)

idx_list = steer_df["idx"].unique()

In [None]:
no_steer_df = pd.read_csv(f"caa/gemma2_9b_it_mcqa/nosteer.csv").query("idx in @idx_list")
dfs = []
for l in steer_df["layer"].unique():
    n = no_steer_df.copy()
    n["vector"] = "none"
    n["layer"]=l
    dfs.append(n)
no_steer_df = pd.concat(dfs)


steer_df = [steer_df, no_steer_df]
steer_df = pd.concat(steer_df)

In [10]:
for i in ["tr", "fr", "ru", "bn", "us"]:
    steer_df[f"ans_in_{i}"] = (steer_df["steer_ans_type"]).apply(lambda x: 1 if x==i else 0)

lang_to_type = {"French":"fr", "Turkish":"tr", "Russian":"ru", "Bengali":"bn", "English":"us"}
steer_df["lang"] = steer_df["lang"].apply(lambda x: lang_to_type[x])

steer_df["local_ans"] = steer_df.apply(lambda x: x[f"ans_in_{x['lang']}"], axis=1)

In [11]:
steer_df = steer_df.merge(
    steer_df.query("alpha == 0")[["prompt", "layer", "local_ans"]].rename(columns={"local_ans": "local_ans_base"}),
    on=["prompt","layer"],
    how="inner"
)
steer_df["delta_local_ans"] = steer_df["local_ans"] - steer_df["local_ans_base"]

In [12]:
hinted = pd.read_csv("caa/gemma2_9b_it_shuffle/nosteer.csv").query("context")
for i in ["tr", "fr", "ru", "bn", "us"]:
    hinted[f"ans_in_{i}"] = (hinted["steer_ans_type"]).apply(lambda x: 1 if x==i else 0)

lang_to_type = {"French":"fr", "Turkish":"tr", "Russian":"ru", "Bengali":"bn", "English":"us"}
hinted["lang"] = hinted["lang"].apply(lambda x: lang_to_type[x])

hinted["local_ans"] = hinted.apply(lambda x: x[f"ans_in_{x['lang']}"], axis=1)

In [None]:
## per culture vectors
disp_df = steer_df.merge(hinted, on=["option_tr", "option_fr","option_bn","option_us","option_ru"], how="inner", suffixes=("", "_context"))
best_steering_performance = disp_df.groupby(["vector","lang","alpha","layer"])[["local_ans","local_ans_base","local_ans_context"]].mean().reset_index()

best_alpha_layer = best_steering_performance.loc[best_steering_performance.groupby(["vector","lang"])["local_ans"].idxmax()]

best_alpha_layer.rename(columns={"local_ans": "local_ans_steer", "local_ans_base": "local_ans_no_steer", "local_ans_context": "local_ans_w_context"}, inplace=True)
best_alpha_layer

Unnamed: 0,vector,lang,alpha,layer,local_ans_steer,local_ans_no_steer,local_ans_w_context
21,held-out universal (translated),bn,2,21,0.544118,0.448529,0.830882
50,held-out universal (translated),fr,2,22,0.305439,0.230126,0.757322
79,held-out universal (translated),ru,2,23,0.516393,0.262295,0.893443
111,held-out universal (translated),tr,2,27,0.379592,0.2,0.75102
131,held-out universal (translated),us,1,26,0.289062,0.210938,0.761719
140,none,bn,0,21,0.448529,0.448529,0.830882
147,none,fr,0,21,0.230126,0.230126,0.757322
154,none,ru,0,21,0.262295,0.262295,0.893443
161,none,tr,0,21,0.2,0.2,0.75102
168,none,us,0,21,0.210938,0.210938,0.761719


In [None]:
## per culture vectors, performance by subtask
disp_df = steer_df.merge(hinted, on=["option_tr", "option_fr","option_bn","option_us","option_ru"], how="inner", suffixes=("", "_context"))
best_steering_performance = disp_df.groupby(["vector","lang","alpha","subtask","layer"])[["local_ans","local_ans_base","local_ans_context"]].mean().reset_index()

# Group by vector, subtask, alpha, layer, taking the mean across languages
grouped = best_steering_performance.groupby(
    ["vector", "lang", "alpha", "layer"], as_index=False
)[["local_ans", "local_ans_base", "local_ans_context"]].mean()

# 2) Within each (vector, subtask), pick the alpha/layer that has the best local_ans
idx = grouped.groupby(["vector", "lang"])["local_ans"].idxmax()
best_alpha_layer_subtask = grouped.loc[idx].reset_index(drop=True)

# 3) Rename columns for clarity
best_alpha_layer_subtask.rename(
    columns={
        "local_ans": "local_ans_steer",
        "local_ans_base": "local_ans_no_steer",
        "local_ans_context": "local_ans_w_context"
    },
    inplace=True
)

best_alpha_layer_subtask

In [31]:
## per culture vectors
disp_df = steer_df.merge(hinted, on=["option_tr", "option_fr","option_bn","option_us","option_ru"], how="inner", suffixes=("", "_context"))
best_steering_performance = disp_df.groupby(["vector","lang","alpha","subtask","layer"])[["local_ans","local_ans_base","local_ans_context"]].mean().reset_index()

best_alpha_layer = best_steering_performance.loc[best_steering_performance.groupby(["vector","lang","subtask"])["local_ans"].idxmax()]

best_alpha_layer.rename(columns={"local_ans": "local_ans_steer", "local_ans_base": "local_ans_no_steer", "local_ans_context": "local_ans_w_context"}, inplace=True)
best_alpha_layer

Unnamed: 0,vector,lang,alpha,subtask,layer,local_ans_steer,local_ans_no_steer,local_ans_w_context
43,held-out universal (translated),bn,1,cities,22,0.5,0.3,1.0
49,held-out universal (translated),bn,1,names,21,0.58,0.5,0.74
77,held-out universal (translated),bn,2,o1,21,0.530303,0.454545,0.848485
105,held-out universal (translated),fr,-1,cities,21,0.352941,0.294118,1.0
155,held-out universal (translated),fr,2,names,22,0.34,0.24,0.633333
102,held-out universal (translated),fr,-2,o1,25,0.2,0.163636,0.945455
233,held-out universal (translated),ru,2,cities,23,0.761905,0.380952,1.0
240,held-out universal (translated),ru,2,names,23,0.54,0.26,0.88
247,held-out universal (translated),ru,2,o1,23,0.392157,0.215686,0.862745
318,held-out universal (translated),tr,2,cities,24,0.7,0.5,1.0
