In [1]:
%%capture
!pip install -r requirements.txt

In [None]:
import pandas as pd
from dotenv import load_dotenv
import torch as t
import pandas as pd
from tools.globals import load_country_globals

from tools.nnsight_utils import  get_text_generations
from tools.evaluation import get_answer_type_final
from tqdm import tqdm

from tools.apis import OpenAIWrapper
import os



load_country_globals()

device = t.device(
    "mps" if t.backends.mps.is_available() else "cuda" if t.cuda.is_available() else "cpu"
)
load_dotenv()
t.set_grad_enabled(False)

t.manual_seed(42)
if t.cuda.is_available():
    t.cuda.manual_seed_all(42)

%load_ext autoreload
%autoreload 2

In [2]:
# Answer lead-in text keyed by language name; every value ends with "**".
prompt_suffix = dict(
    English="My guess is **",
    Turkish="Tahminim **",
    French="Ma supposition est **",
    Russian="Моё предположение **",
    Bengali="আমার অনুমান হলো **",
)

# Maps a `source_id` value to its subtask name. Ids that are missing here are
# given a fallback label where this mapping is looked up.
subtask_map = dict(
    synth_names="names",
    synth_cities="cities",
    culturebench="culturebench",
)

def eval_or_skip(txt):
    """Evaluate ``txt`` as a Python expression, or return ``None`` if that fails.

    Args:
        txt: The text to evaluate. Values that ``eval`` cannot accept at all
            (for example a float NaN from a missing CSV cell) count as a
            failure and also yield ``None``.

    Returns:
        The evaluated object, or ``None`` when evaluation raised an
        ``Exception``.
    """
    try:
        # NOTE(review): eval() executes arbitrary code. Only use this on data
        # files you trust; if the column only ever holds literals,
        # ast.literal_eval would be the safer choice.
        return eval(txt)
    except Exception:
        # Deliberately best-effort, but limited to Exception so that
        # KeyboardInterrupt / SystemExit are not swallowed (a bare `except:`
        # would make a long `.apply` impossible to interrupt).
        return None

# Short lowercase code for each country name.
country_to_suffix = dict(Turkey="tr", France="fr", Russia="ru", Bangladesh="bn")

### Load Dataset

In [None]:
# Dataset for the binary classification task.
# NOTE: a later cell assigns `final_df` again from
# "data/binary/mcl_dataset_prefix.csv". The two loaders are alternatives: run
# only the one you need, otherwise the later read replaces this frame.
final_df = pd.read_csv("data/binary/mcl_dataset.csv")

In [None]:
# Dataset for prefix analysis.
# NOTE: this rebinds `final_df`, replacing whatever an earlier loader cell
# assigned to it. Skip this cell when working on the plain binary task.
final_df = pd.read_csv("data/binary/mcl_dataset_prefix.csv")

In [None]:
# Parse the stringified `options` column, drop rows whose options could not be
# parsed, and attach a `subtask` label derived from `source_id`.
#
# The steps are chained so that each one returns a new frame, and values that
# are already lists/tuples are passed through untouched. This makes the cell
# safe to re-run: feeding an already-parsed list back into eval_or_skip would
# fail, produce None, and cause dropna to discard every row.
final_df = (
    final_df
    .assign(
        options=lambda df: df["options"].apply(
            lambda v: v if isinstance(v, (list, tuple)) else eval_or_skip(v)
        )
    )
    .dropna(subset=["options"])
    .assign(
        subtask=lambda df: df["source_id"].apply(
            # source ids without an entry in subtask_map fall back to "culturedistil"
            lambda x: subtask_map.get(x, "culturedistil")
        )
    )
)

# Copy of the rows in their original option order, flagged with swapped=False
# so they can be told apart from the option-swapped rows built later.
original_df = final_df.assign(swapped=False)

print(final_df.shape)
final_df.head(3)

(23948, 15)


Unnamed: 0,source_id,question_id,country,question,lang,options,hint,ans_local_idx,ans_west_idx,ans_local,ans_west,prompt,subtask,translated,suffix
0,synth_names,synth_names_0_tr,Turkey,Baklava Which of the following names is most l...,English,"[Liam, Alparslan]",False,2.0,1.0,Alparslan,Liam,Question:To answer the following multiple-choi...,names,,Baklava
1,synth_names,synth_names_0_tr,Turkey,Istanbul Which of the following names is most ...,English,"[Liam, Alparslan]",False,2.0,1.0,Alparslan,Liam,Question:To answer the following multiple-choi...,names,,Istanbul
2,synth_names,synth_names_0_tr,Turkey,Lira Which of the following names is most like...,English,"[Liam, Alparslan]",False,2.0,1.0,Alparslan,Liam,Question:To answer the following multiple-choi...,names,,Lira


In [5]:
def swap_options(row):
    """Return a copy of ``row`` with its two options and answer indices exchanged.

    Args:
        row: A row (pandas Series) with an ``options`` entry holding at least
            two items, plus ``ans_local_idx`` and ``ans_west_idx`` entries.

    Returns:
        A new Series in which ``options`` is ``[options[1], options[0]]`` and
        the values of ``ans_local_idx`` and ``ans_west_idx`` are exchanged.
        The row passed in is left unmodified.
    """
    # Work on a copy: mutating the object handed over by DataFrame.apply is
    # not supported by pandas and can leak changes into the source frame.
    row = row.copy()

    first, second = row["options"][0], row["options"][1]
    row["options"] = [second, first]

    # Exchange the two scalars explicitly. Assigning a Series to a list-keyed
    # selection can align the values on their labels (as `.loc` does), which
    # would put each value back under its own name and make the swap a no-op.
    local_idx, west_idx = row["ans_local_idx"], row["ans_west_idx"]
    row["ans_local_idx"] = west_idx
    row["ans_west_idx"] = local_idx
    return row

# Mirror image of the dataset: swap_options is applied to every row of a copy
# of final_df, and the result is flagged with swapped=True.
swapped_df = (
    final_df
    .copy()
    .apply(swap_options, axis=1)
    .assign(swapped=True)
)

In [6]:
# Stack the original rows on top of their option-swapped counterparts.
# NOTE: no `ignore_index` is passed, so both halves keep the index labels they
# inherited from the same source frame; labels are therefore repeated in the
# combined frame.
# NOTE: this rebinds `final_df`. Re-running the earlier swap cell after this
# one would operate on the already-combined frame.
final_df = pd.concat([original_df, swapped_df])
print(final_df.shape)

(47896, 16)


### via transformers library

#### Instruct model

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Instruct checkpoint: load the tokenizer and the bfloat16 weights, then move
# the model onto the selected device as a separate step.
model_id = "google/gemma-2-9b-it"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=t.bfloat16)
model = model.to(device)

In [None]:
from tools.prepare_input import prepare_dataset_it

# Build the per-row model inputs for the instruct model. The tokenizer is
# handed to prepare_dataset_it (presumably to apply the chat template —
# TODO confirm against tools/prepare_input.py).
data_df = prepare_dataset_it(final_df, tokenizer)

# Sanity check on the prepared frame's row/column count.
print(data_df.shape)

(47896, 17)


#### Base model

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Base checkpoint: load the tokenizer and the bfloat16 weights, then move the
# model onto the selected device as a separate step.
model_id = "google/gemma-2-9b"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=t.bfloat16)
model = model.to(device)

In [None]:
from tools.prepare_input import prepare_dataset_base

# Build the per-row model inputs for the base model.
# NOTE(review): how the tokenizer is used inside prepare_dataset_base is not
# visible here — see tools/prepare_input.py.
data_df = prepare_dataset_base(final_df, tokenizer)

# Sanity check on the prepared frame's row/column count.
print(data_df.shape)

#### Generation

In [None]:
batch_size = 64

# Materialise the prompt list once before slicing it into batches; calling
# `.tolist()` inside the comprehension would rebuild the full list per batch.
prompts = data_df["input"].tolist()
inputs = [prompts[k:k + batch_size] for k in range(0, len(prompts), batch_size)]

# Generate batch by batch, keeping the outputs in row order.
all_generations = []
for batch in tqdm(inputs):
    generations = get_text_generations(model, tokenizer, batch, device, max_new_tokens=20)
    all_generations.extend(generations)

# Label the rows with the checkpoint that is actually loaded rather than a
# hard-coded name, e.g. "google/gemma-2-9b-it" -> "gemma_2_9b_it". The same
# rule turns "<org>/aya-expanse-8b" into "aya_expanse_8b".
data_df["model"] = model_id.split("/")[-1].replace("-", "_")
data_df["output"] = all_generations
# Row-wise evaluation of each output; check_for="index" is forwarded to
# get_answer_type_final unchanged.
data_df = data_df.apply(lambda x: get_answer_type_final(x, check_for="index"), axis=1)

100%|██████████| 612/612 [10:36<00:00,  1.04s/it]


In [None]:
# Name the output file after the loaded checkpoint so that results from
# different models are not written to the same hard-coded path, e.g.
# "google/gemma-2-9b-it" -> "gemma_2_9b_it_output.csv".
data_df.to_csv(f"{model_id.split('/')[-1].replace('-', '_')}_output.csv", index=False)

### via API services

In [None]:
from tools.prepare_input import prepare_dataset_it

# Build the inputs for the API-hosted models. No tokenizer is passed here,
# unlike the local instruct path.
data_df = prepare_dataset_it(final_df)

# Sanity check on the prepared frame's row/column count.
print(data_df.shape)

In [None]:
# Client for Together AI, reached through its OpenAI-style endpoint. The key
# comes from the environment and is None when the variable is unset.
together_api = OpenAIWrapper(
    api_key=os.environ.get("TOGETHER_AI_API_KEY"),
    base_url="https://api.together.xyz/v1",
)

# Client for OpenAI, using the wrapper's default base URL.
openai_api = OpenAIWrapper(api_key=os.environ.get("OPENAI_API_KEY"))

In [None]:
from concurrent.futures import ThreadPoolExecutor

# One entry per row of data_df, taken from its "messages" column, in row order.
inputs = data_df["messages"].tolist()

def generate_text_llama_3_1_70b(imp):
    """Return ``together_api.text_gen`` output for ``imp`` using Llama 3.1 70B Instruct Turbo."""
    llama_model = "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo"
    return together_api.text_gen(imp, model_name=llama_model)

def generate_text_gemma_2_27b(imp):
    """Return ``together_api.text_gen`` output for ``imp`` using Gemma 2 27B IT."""
    gemma_model = "google/gemma-2-27b-it"
    return together_api.text_gen(imp, model_name=gemma_model)

def generate_text_gpt4o(imp):
    """Return ``openai_api.text_gen`` output for ``imp`` using GPT-4o."""
    gpt_model = "gpt-4o"
    return openai_api.text_gen(imp, model_name=gpt_model)

# Example with GPT-4o
# Requests are issued from a thread pool; Executor.map yields the results in
# the same order as `inputs`, so they line up with the rows of data_df.
with ThreadPoolExecutor() as executor:
    responses = executor.map(generate_text_gpt4o, inputs)
    all_generations = list(tqdm(responses, total=len(inputs)))

# Attach the raw outputs and the model label, evaluate each row, then save.
data_df["output"] = all_generations
data_df["model"] = "gpt4o"
data_df = data_df.apply(
    lambda row: get_answer_type_final(row, check_for="index"),
    axis=1,
)
data_df.to_csv("gpt4o_output.csv", index=False)

100%|██████████| 11974/11974 [05:22<00:00, 37.11it/s]
