In [1]:
%%capture
!pip install -r requirements.txt

In [2]:
import sys
from pathlib import Path
import json
import pandas as pd
from dotenv import load_dotenv
import plotly.express as px
import torch as t
import pandas as pd
from tools.globals import load_country_globals
from tools.nnsight_utils import collect_residuals, visualize_top_tokens
from tools.patchscope import patch_scope_gen

from translate import Translator

from nnsight import LanguageModel
from transformers import AutoTokenizer

load_country_globals()
translator = Translator(from_lang="autodetect",to_lang="en")

device = t.device(
    "mps" if t.backends.mps.is_available() else "cuda" if t.cuda.is_available() else "cpu"
)
load_dotenv()
t.set_grad_enabled(False)

t.manual_seed(42)
if t.cuda.is_available():
    t.cuda.manual_seed_all(42)

%load_ext autoreload
%autoreload 2

In [3]:
prompt_suffix = {
    "English": "My guess is **",
    "Turkish": "Tahminim **",
    "French": "Ma supposition est **",
    "Russian": "Моё предположение **",
    "Bengali": "আমার অনুমান হলো **",
}

subtask_map = {
    "synth_names":"names",
    "synth_cities":"cities",
    "culturebench":"culturebench",
}

lang_suffix_to_lang = {
    "tr": "Turkish",
    "fr": "French",
    "ru": "Russian",
    "bn": "Bengali",
    "en": "English",
}

def eval_or_skip(txt):
    try:
        return eval(txt)
    except:
        return None

country_to_suffix= {"Turkey": "tr", "France": "fr", "Russia": "ru", "Bangladesh": "bn", "United States":"us"}

## Models

In [4]:
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b-it")
nnmodel = LanguageModel('/dlabscratch1/public/llm_weights/gemma_hf/gemma-2-9b-it', 
                        device_map='cuda:0', 
                        dispatch=True, 
                        torch_dtype=t.bfloat16)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

## Prepare Dataset

In [None]:
all_dfs = []
for lang_suffix, lang in lang_suffix_to_lang.items():
    for subtask in ["names", "cities", "o1"]:
        print(f"Loading {lang} data...")
        subtask_df = pd.read_csv(f"data/new/{subtask}/{subtask}-{lang_suffix}.csv")
        subtask_df["lang"] = lang
        subtask_df["context"] = False
        subtask_df["subtask"] = subtask
        all_dfs.append(subtask_df)
combined_df = pd.concat(all_dfs)
combined_df.to_csv("data/new/merged_wo_context.csv", index=False)

Loading Turkish data...
Loading Turkish data...
Loading Turkish data...
Loading French data...
Loading French data...
Loading French data...
Loading Russian data...
Loading Russian data...
Loading Russian data...
Loading Bengali data...
Loading Bengali data...
Loading Bengali data...
Loading English data...
Loading English data...
Loading English data...


In [9]:
combined_df = pd.read_csv("data/new/merged_wo_context.csv")

hints = {
    "English": "I live in the United States.",
    "Turkish": "Türkiye'de yaşıyorum.",
    "French": "Je vis en France.",
    "Russian": "Я живу в России.",
    "Bengali": "আমি বাংলাদেশে থাকি।",
}

combined_df["question"] = combined_df.apply(lambda x: hints[x["lang"]] + " "+x["question"], axis=1)
combined_df["context"]=True

combined_df.to_csv("data/new/merged_w_context.csv", index=False)

In [10]:
df = pd.concat(
    [pd.read_csv("data/new/merged_wo_context.csv"),
    pd.read_csv("data/new/merged_w_context.csv")])

df.to_csv("data/new/5choice_dataset.csv", index=False)
 

In [96]:
import random

final_df = pd.read_csv("data/new/5choice_dataset.csv")
final_df["idx"] = final_df.index

def shuffle_options(row):
    

    country_suffix = ["tr", "fr", "ru", "bn", "us"]
    indices = [1, 2, 3, 4, 5]
    random.seed(42+row["idx"])
    random.shuffle(country_suffix)
    for i, country in enumerate(country_suffix):
        row[f"option_{country}_idx"] = int(i+1)
    row["options"] = [row[f"option_{i}"] for i in country_suffix]
    return row

final_df = final_df.apply(shuffle_options, axis=1)


In [97]:
final_df

Unnamed: 0,context,idx,lang,option_bn,option_bn_idx,option_fr,option_fr_idx,option_ru,option_ru_idx,option_tr,option_tr_idx,option_us,option_us_idx,options,question,subtask
0,False,0,Turkish,Ridwan,1,Enzo,2,Aleksandr,3,Alparslan,5,Liam,4,"[Ridwan, Enzo, Aleksandr, Liam, Alparslan]",Aşağıdaki isimlerden hangisi muhtemelen benim ...,names
1,False,1,Turkish,Sabbir,3,Lucas,1,Mikhail,4,Yusuf,5,Noah,2,"[Lucas, Noah, Sabbir, Mikhail, Yusuf]",Aşağıdaki isimlerden hangisi muhtemelen benim ...,names
2,False,2,Turkish,Nafis,5,Mathis,2,Maksim,1,Göktuğ,4,Oliver,3,"[Maksim, Mathis, Oliver, Göktuğ, Nafis]",Aşağıdaki isimlerden hangisi muhtemelen benim ...,names
3,False,3,Turkish,Umar,4,Nathan,3,Artyom,5,Ömer,1,James,2,"[Ömer, James, Nathan, Umar, Artyom]",Aşağıdaki isimlerden hangisi muhtemelen benim ...,names
4,False,4,Turkish,Irfan,4,Thomas,1,Mark,2,Miraç,5,Elijah,3,"[Thomas, Mark, Elijah, Irfan, Miraç]",Aşağıdaki isimlerden hangisi muhtemelen benim ...,names
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1195,True,1195,English,University of Dhaka,4,Sorbonne University,5,Moscow State University,1,Boğaziçi University,2,Harvard University,3,"[Moscow State University, Boğaziçi University,...",I live in the United States. Which educational...,o1
1196,True,1196,English,Jagadish Chandra Bose,4,Marie Curie,2,Mikhail Lomonosov,1,Aziz Sancar,5,Thomas Edison,3,"[Mikhail Lomonosov, Marie Curie, Thomas Edison...",I live in the United States. Which scientific ...,o1
1197,True,1197,English,Rabindra Sangeet,1,Chanson,3,Folk music,4,Arabesque,5,Blues,2,"[Rabindra Sangeet, Blues, Chanson, Folk music,...",I live in the United States. Which traditional...,o1
1198,True,1198,English,Bengal tiger,5,Gallic rooster,1,Brown bear,4,Gray wolf,2,Bald eagle,3,"[Gallic rooster, Gray wolf, Bald eagle, Brown ...",I live in the United States. Which animal is r...,o1


In [98]:
from tools.prepare_input import messages_to_str
from tools.prompts import format_multi_choice_5choice


def prepare_dataset(data_df, tokenizer, instruction_model=True, formatter=None):
    data_df["prompt"] = data_df.apply(formatter, axis=1)
    data_df["messages"] = data_df["prompt"].apply(lambda x: [{"role": "user", "content": x}])
    to_input = lambda x: messages_to_str(x, tokenizer, instruction_model=instruction_model)
    data_df["input"] = data_df["messages"].apply(to_input)
    return data_df

final_df = prepare_dataset(final_df, tokenizer=tokenizer, instruction_model=True, formatter=format_multi_choice_5choice)
final_df.head(3)

Unnamed: 0,context,idx,lang,option_bn,option_bn_idx,option_fr,option_fr_idx,option_ru,option_ru_idx,option_tr,option_tr_idx,option_us,option_us_idx,options,question,subtask,prompt,messages,input
0,False,0,Turkish,Ridwan,1,Enzo,2,Aleksandr,3,Alparslan,5,Liam,4,"[Ridwan, Enzo, Aleksandr, Liam, Alparslan]",Aşağıdaki isimlerden hangisi muhtemelen benim ...,names,Soru:Aşağıdaki çoktan seçmeli soruya cevap ver...,"[{'role': 'user', 'content': 'Soru:Aşağıdaki ç...",<bos><start_of_turn>user\nSoru:Aşağıdaki çokta...
1,False,1,Turkish,Sabbir,3,Lucas,1,Mikhail,4,Yusuf,5,Noah,2,"[Lucas, Noah, Sabbir, Mikhail, Yusuf]",Aşağıdaki isimlerden hangisi muhtemelen benim ...,names,Soru:Aşağıdaki çoktan seçmeli soruya cevap ver...,"[{'role': 'user', 'content': 'Soru:Aşağıdaki ç...",<bos><start_of_turn>user\nSoru:Aşağıdaki çokta...
2,False,2,Turkish,Nafis,5,Mathis,2,Maksim,1,Göktuğ,4,Oliver,3,"[Maksim, Mathis, Oliver, Göktuğ, Nafis]",Aşağıdaki isimlerden hangisi muhtemelen benim ...,names,Soru:Aşağıdaki çoktan seçmeli soruya cevap ver...,"[{'role': 'user', 'content': 'Soru:Aşağıdaki ç...",<bos><start_of_turn>user\nSoru:Aşağıdaki çokta...


In [99]:
final_df.to_csv("data/new/5choice_dataset_input_ready.csv", index=False)

In [100]:
print(final_df.to_dict(orient="records")[2]["input"])

<bos><start_of_turn>user
Soru:Aşağıdaki çoktan seçmeli soruya cevap vermek için, yalnızca 1,2,3,4,5 arasından bir seçenek seçmelisiniz. Talimat: 1,2,3,4,5 arasından bir seçenek seçmelisiniz. Başka bir şey çıkarmayın.
Aşağıdaki isimlerden hangisi muhtemelen benim adım?
Seçenekler:
1. Maksim
2. Mathis
3. Oliver
4. Göktuğ
5. Nafis<end_of_turn>
<start_of_turn>model



## Analysis

### Suffix Analysis

In [53]:
words = {
    "France" :["Baguette", "Paris", "Euro", "Guillotine"],       # France
    "Turkey": ["Baklava", "Istanbul", "Lira", "Nazar"],         # Turkey
    "United States": ["Burger", "New York", "Dollar", "Cowboy Hat"],   # USA
    "Bangladesh": ["Biriyani", "Dhaka", "Taka", "Rickshaw"],        # Bangladesh
    "Russia": ["Borscht", "Moscow", "Ruble", "Matryoshka"]      # Russia
}

translated_words = {
    "France": ["Baguette", "Paris", "Euro", "Guillotine"],  # France (French)
    "Turkey": ["Baklava", "İstanbul", "Lira", "Nazar"],  # Turkey (Turkish)
    "United States": ["Burger", "New York", "Dollar", "Cowboy hat"],  # USA (English)
    "Bangladesh": ["বিরিয়ানি", "ঢাকা", "টাকা", "রিকশা"],  # Bangladesh (Bengali)
    "Russia": ["Борщ", "Москва", "Рубль", "Матрёшка"]  # Russia (Russian)
}

### Steering Eval

In [5]:
final_df = pd.read_csv("data/new/5choice_dataset_input_ready.csv")

In [6]:
steering_vec_map = {}
for c_pref in ["tr", "fr", "ru", "bn", "us"]:
     
    steering_vec_map[("enmicro", c_pref)] = t.load(f"vectors/gemma2_9b_it/per_culture/{c_pref}_en_avg_all_tasks.pt", weights_only=True)
    steering_vec_map[("transmicro", c_pref)] = t.load(f"vectors/gemma2_9b_it/per_culture/{c_pref}_trans_avg_all_tasks.pt", weights_only=True)
    steering_vec_map[("enuniversal", c_pref)] = t.load(f"vectors/gemma2_9b_it/universal/en_universal_all_cultures.pt", weights_only=True)
    steering_vec_map[("transuniversal", c_pref)] = t.load(f"vectors/gemma2_9b_it/universal/trans_universal_all_cultures.pt", weights_only=True)
    steering_vec_map[("enuniversal_loo", c_pref)] = t.load(f"vectors/gemma2_9b_it/universal/en_universal_{c_pref}_out.pt", weights_only=True)
    steering_vec_map[("transuniversal_loo", c_pref)] = t.load(f"vectors/gemma2_9b_it/universal/trans_universal_{c_pref}_out.pt", weights_only=True)

    if c_pref != "us":
        steering_vec_map[("implicit", c_pref)] = t.load(f"vectors/gemma2_9b_it/implicit/{c_pref}_avg_all_tasks.pt", weights_only=True)

    for task in ["names", "cities", "culturedistil", "culturebench"]:
        steering_vec_map[("en"+task, c_pref)] = t.load(f"vectors/gemma2_9b_it/per_task/{c_pref}_{task}_en.pt", weights_only=True)
        steering_vec_map[("trans"+task, c_pref)] = t.load(f"vectors/gemma2_9b_it/per_task/{c_pref}_{task}_trans.pt", weights_only=True)

In [21]:
test_data = final_df.query("not context").copy()
#test_data = final_df.copy()

test_data.shape

(600, 19)

In [22]:
from tools.contrastiveact import contrastive_act_gen_opt
from tqdm import tqdm

batch_size = 64

folder = "caa/gemma2_9b_it_shuffle"
filename = "heldoutuniversal_trans_alltasks"

lang_to_test = ["tr", "fr", "ru", "bn", "en"]
tasks_to_test = ["names", "cities", "o1"]

layers = [21,22,23,24,25,26,27]# + [15,16,17,18,19,20,28,29,30]
#layers = [21]

alphas = [-2, -1, 1, 2]
#alphas = [2]

#alphas = [0]


outputs = []
for task in tasks_to_test:
    for lang in lang_to_test:

        s = lang
        if lang == "en":
            s = "us"
        
        steering_vec = steering_vec_map[("transuniversal_loo",s)].unsqueeze(1)

        test_entries = test_data.query(f"lang=='{lang_suffix_to_lang[lang]}' and subtask==@task").to_dict(orient="records")

        batch_entries = [test_entries[k:k+batch_size] for k in range(0, len(test_entries), batch_size)]
        batch_inputs = [[entry["input"] for entry in batch] for batch in batch_entries]
        if len(batch_inputs) == 0:
            continue
        for i,batch_imp in tqdm(enumerate(batch_inputs), total=len(batch_inputs), desc=f"{lang} {task}"):
            for alpha in alphas:
                with t.no_grad():
                    out = contrastive_act_gen_opt(nnmodel, tokenizer, alpha * steering_vec, prompt=batch_imp, layer=layers, n_new_tokens=1)
                    for j,layer in enumerate(out[0]):
                        texts = out[0][layer]
                        probs = out[1]
                        epsilon = 1e-6
                        probs[probs < epsilon] = 0

                        for k, text in enumerate(texts):
                            out_dict = {"alpha": alpha, "steer_out": text, "steer_prob": probs[j,k,:,:].to_sparse(), "layer": layer}
                            out_dict.update(batch_entries[i][k])
                            outputs.append(out_dict)
                            pass

pd.to_pickle(outputs, f"{folder}/{filename}.pkl")

tr names: 100%|██████████| 1/1 [00:24<00:00, 24.31s/it]
fr names: 100%|██████████| 1/1 [00:33<00:00, 33.68s/it]
ru names: 100%|██████████| 1/1 [00:28<00:00, 28.32s/it]
bn names: 100%|██████████| 1/1 [00:37<00:00, 37.31s/it]
en names: 100%|██████████| 1/1 [00:24<00:00, 24.25s/it]
tr cities: 100%|██████████| 1/1 [00:10<00:00, 10.75s/it]
fr cities: 100%|██████████| 1/1 [00:10<00:00, 10.65s/it]
ru cities: 100%|██████████| 1/1 [00:16<00:00, 16.01s/it]
bn cities: 100%|██████████| 1/1 [00:15<00:00, 15.91s/it]
en cities: 100%|██████████| 1/1 [00:08<00:00,  8.34s/it]
tr o1: 100%|██████████| 1/1 [00:32<00:00, 32.14s/it]
fr o1: 100%|██████████| 1/1 [00:34<00:00, 34.79s/it]
ru o1: 100%|██████████| 1/1 [00:42<00:00, 42.83s/it]
bn o1: 100%|██████████| 1/1 [00:41<00:00, 41.82s/it]
en o1: 100%|██████████| 1/1 [00:38<00:00, 38.40s/it]


In [23]:
from tqdm import tqdm

new_rows = []


for out in tqdm(outputs):
    out["steer_ans_type"] = "none"
    for i in ["tr", "fr", "ru", "bn", "us"]:
        ans_idx  = str(out[f"option_{i}_idx"])
        pos = tokenizer.encode(ans_idx, add_special_tokens=False)[0]

        out["prob_"+i] = out["steer_prob"][0,pos].item()
        if ans_idx in out["steer_out"]:
            out["steer_ans_type"] = i
    new_rows.append(out)

steer_df = pd.DataFrame(new_rows)
steer_df.drop(columns=["steer_prob"], inplace=True)

steer_df.to_csv(f"{folder}/{filename}.csv", index=False)

100%|██████████| 16800/16800 [00:06<00:00, 2624.62it/s]


In [24]:
vector_renaming = {"enmicro": "per-culture (en)", 
                   "transmicro": "per-culture (translated)", 
                   "names": "names (en)",
                   "enuniversal_loo": "held-out universal (en)",
                   "transuniversal_loo": "held-out universal (translated)",
                   "transuniversal": "universal (translated)",
                   "enuniversal": "universal (en)"}

folder = "caa/gemma2_9b_it_shuffle"

files = {
    "held-out universal (translated)": ["heldoutuniversal_trans_alltasks"],
    "per-culture (translated)": ["perculture_trans_alltasks"]
}

all_dfs = []
for vec in files:
    files_to_load = files[vec]
    dfs = [pd.read_csv(folder+"/"+f+".csv") for f in files_to_load]
    steer_df = pd.concat(dfs)
    steer_df["vector"] = vec
    all_dfs.append(steer_df)

steer_df = pd.concat(all_dfs)

idx_list = steer_df["idx"].unique()

In [25]:
no_steer_df = pd.read_csv(f"caa/gemma2_9b_it_shuffle/nosteer.csv").query("idx in @idx_list")
dfs = []
for l in steer_df["layer"].unique():
    n = no_steer_df.copy()
    n["vector"] = "none"
    n["layer"]=l
    dfs.append(n)
no_steer_df = pd.concat(dfs)


steer_df = [steer_df, no_steer_df]
steer_df = pd.concat(steer_df)

In [12]:
steer_df.query("vector=='none' and alpha==0 and layer==25").groupby("lang")["steer_out"].value_counts()

lang     steer_out
Bengali  1            13
         2            10
         5            10
         3             9
         4             8
English  1            18
         3            13
         4             7
         2             6
         5             5
         This          1
French   1            14
         4            10
         5            10
         2             8
         3             8
Russian  1            15
         2            12
         3             9
         4             9
         5             4
         Нет           1
Turkish  1            16
         3            12
         2             8
         4             8
         5             5
         Bu            1
Name: count, dtype: int64

In [16]:
steer_df.query("vector=='per-culture (translated)' and alpha==2 and layer==21").groupby("lang")["steer_out"].value_counts()

lang     steer_out
Bengali  1            15
         2            11
         3            11
         5             8
         4             5
English  1            19
         3            12
         2             8
         4             7
         5             4
French   1            15
         4            12
         3             9
         2             7
         5             7
Russian  1            16
         2            13
         3             9
         4             7
         5             4
         Нет           1
Turkish  1            16
         3            13
         2             8
         4             6
         5             6
         Bu            1
Name: count, dtype: int64

In [26]:
for i in ["tr", "fr", "ru", "bn", "us"]:
    steer_df[f"ans_in_{i}"] = (steer_df["steer_ans_type"]).apply(lambda x: 1 if x==i else 0)

lang_to_type = {"French":"fr", "Turkish":"tr", "Russian":"ru", "Bengali":"bn", "English":"us"}
steer_df["lang"] = steer_df["lang"].apply(lambda x: lang_to_type[x])

steer_df["local_ans"] = steer_df.apply(lambda x: x[f"ans_in_{x['lang']}"], axis=1)

In [27]:
steer_df = steer_df.merge(
    steer_df.query("alpha == 0")[["prompt", "layer", "local_ans"]].rename(columns={"local_ans": "local_ans_base"}),
    on=["prompt","layer"],
    how="inner"
)
steer_df["delta_local_ans"] = steer_df["local_ans"] - steer_df["local_ans_base"]

In [28]:
hinted = pd.read_csv("caa/gemma2_9b_it_shuffle/nosteer.csv").query("context")
for i in ["tr", "fr", "ru", "bn", "us"]:
    hinted[f"ans_in_{i}"] = (hinted["steer_ans_type"]).apply(lambda x: 1 if x==i else 0)

lang_to_type = {"French":"fr", "Turkish":"tr", "Russian":"ru", "Bengali":"bn", "English":"us"}
hinted["lang"] = hinted["lang"].apply(lambda x: lang_to_type[x])

hinted["local_ans"] = hinted.apply(lambda x: x[f"ans_in_{x['lang']}"], axis=1)

In [34]:
## per culture vectors
disp_df = steer_df.merge(hinted, on=["option_tr", "option_fr","option_bn","option_us","option_ru"], how="inner", suffixes=("", "_context"))
best_steering_performance = disp_df.groupby(["vector","lang","alpha","layer"])[["local_ans","local_ans_base","local_ans_context"]].mean().reset_index()

best_alpha_layer = best_steering_performance.loc[best_steering_performance.groupby(["vector","lang"])["local_ans"].idxmax()]

best_alpha_layer.rename(columns={"local_ans": "local_ans_steer", "local_ans_base": "local_ans_no_steer", "local_ans_context": "local_ans_w_context"}, inplace=True)
best_alpha_layer

Unnamed: 0,vector,lang,alpha,layer,local_ans_steer,local_ans_no_steer,local_ans_w_context
21,held-out universal (translated),bn,2,21,0.544118,0.448529,0.830882
50,held-out universal (translated),fr,2,22,0.305439,0.230126,0.757322
79,held-out universal (translated),ru,2,23,0.516393,0.262295,0.893443
111,held-out universal (translated),tr,2,27,0.379592,0.2,0.75102
131,held-out universal (translated),us,1,26,0.289062,0.210938,0.761719
140,none,bn,0,21,0.448529,0.448529,0.830882
147,none,fr,0,21,0.230126,0.230126,0.757322
154,none,ru,0,21,0.262295,0.262295,0.893443
161,none,tr,0,21,0.2,0.2,0.75102
168,none,us,0,21,0.210938,0.210938,0.761719


In [35]:
best_alpha_layer.to_csv("caa/gemma2_9b_it_shuffle/best_alpha_layer_for_alltasks.csv", index=False)

In [31]:
## per culture vectors
disp_df = steer_df.merge(hinted, on=["option_tr", "option_fr","option_bn","option_us","option_ru"], how="inner", suffixes=("", "_context"))
best_steering_performance = disp_df.groupby(["vector","lang","alpha","subtask","layer"])[["local_ans","local_ans_base","local_ans_context"]].mean().reset_index()

best_alpha_layer = best_steering_performance.loc[best_steering_performance.groupby(["vector","lang","subtask"])["local_ans"].idxmax()]

best_alpha_layer.rename(columns={"local_ans": "local_ans_steer", "local_ans_base": "local_ans_no_steer", "local_ans_context": "local_ans_w_context"}, inplace=True)
best_alpha_layer

Unnamed: 0,vector,lang,alpha,subtask,layer,local_ans_steer,local_ans_no_steer,local_ans_w_context
43,held-out universal (translated),bn,1,cities,22,0.5,0.3,1.0
49,held-out universal (translated),bn,1,names,21,0.58,0.5,0.74
77,held-out universal (translated),bn,2,o1,21,0.530303,0.454545,0.848485
105,held-out universal (translated),fr,-1,cities,21,0.352941,0.294118,1.0
155,held-out universal (translated),fr,2,names,22,0.34,0.24,0.633333
102,held-out universal (translated),fr,-2,o1,25,0.2,0.163636,0.945455
233,held-out universal (translated),ru,2,cities,23,0.761905,0.380952,1.0
240,held-out universal (translated),ru,2,names,23,0.54,0.26,0.88
247,held-out universal (translated),ru,2,o1,23,0.392157,0.215686,0.862745
318,held-out universal (translated),tr,2,cities,24,0.7,0.5,1.0


In [33]:
best_alpha_layer.to_csv("caa/gemma2_9b_it_shuffle/best_alpha_layer_for_alltaskpertask.csv", index=False)