In [None]:
# Download a file from Google Drive by id using the gdown CLI.
# NOTE(review): presumably this is UNv1.0.testsets.tar.gz, the archive the following cell extracts — confirm.
!gdown https://drive.google.com/uc?id=13GI1F1hvwpMUGBSa0QC6ov4eE57GC_Zx

In [None]:
# Unpack the gzip-compressed tarball into the current directory, listing each file (-v).
!tar -xzvf UNv1.0.testsets.tar.gz

In [None]:
# Language codes for the translation direction. They are interpolated into the
# system prompt and into the names of the generated JSONL files.
src = "ar"  # language translated from
trg = "en"  # language translated into

In [None]:
# System instruction placed at the start of every evaluation prompt; built from
# the `src` / `trg` language codes.
sys_msg = f"Translate the following statement from {src} to {trg}"
def create_chat_prompt(sys_msg, input_text):
    """Build a two-message chat prompt.

    Args:
        sys_msg: Content of the leading "system" message.
        input_text: Content of the following "user" message.

    Returns:
        A list of two dicts, each with "role" and "content" keys, in the
        order system message, user message.
    """
    system_turn = dict(role="system", content=sys_msg)
    user_turn = dict(role="user", content=input_text)
    return [system_turn, user_turn]

def create_chat_example(article, summary):
    """Build a two-message few-shot example.

    Both messages use the "system" role and are told apart by their "name"
    field: "example_user" for the input and "example_assistant" for the
    expected output.

    Args:
        article: Content of the "example_user" message.
        summary: Content of the "example_assistant" message.

    Returns:
        A list of two dicts with "role", "content" and "name" keys.
    """
    tagged_texts = (("example_user", article), ("example_assistant", summary))
    return [
        {"role": "system", "content": text, "name": tag}
        for tag, text in tagged_texts
    ]

In [None]:
# Create the directory the generated JSONL files are written to
# (-p: create parents as needed, no error if it already exists).
!mkdir -p ../registry/data/unv1

In [None]:
import pandas as pd

def load_parallel_corpus(src_file, trg_file):
    """Read two line-aligned text files into one DataFrame.

    The files are read line by line rather than with `pd.read_csv`: a CSV
    parser with a two-space separator splits any sentence containing
    consecutive spaces into extra columns, skips blank lines and converts
    tokens such as "NA" to NaN, each of which can break the source/target
    alignment.

    Args:
        src_file: Path to the source-language file, one sentence per line.
        trg_file: Path to the target-language file, one sentence per line.

    Returns:
        DataFrame with columns "target" and "source" (in that order), one
        row per line.

    Raises:
        ValueError: If the two files do not have the same number of lines.
    """
    def read_lines(path):
        # Iterating the handle splits on newlines only; strip just the
        # terminator so that any other whitespace in a sentence is preserved.
        with open(path, encoding="utf-8") as handle:
            return [line.rstrip("\n") for line in handle]

    target_lines = read_lines(trg_file)
    source_lines = read_lines(src_file)
    if len(target_lines) != len(source_lines):
        raise ValueError(
            f"Line count mismatch: {trg_file} has {len(target_lines)} lines, "
            f"{src_file} has {len(source_lines)}"
        )
    return pd.DataFrame({"target": target_lines, "source": source_lines})


# Test split: used to build the evaluation samples.
src_path = f"testsets/testset/UNv1.0.testset.{src}"
trg_path = f"testsets/testset/UNv1.0.testset.{trg}"
df_test = load_parallel_corpus(src_path, trg_path)

# Dev split: used to build the few-shot examples.
src_path = f"testsets/devset/UNv1.0.devset.{src}"
trg_path = f"testsets/devset/UNv1.0.devset.{trg}"
df_dev = load_parallel_corpus(src_path, trg_path)

In [None]:
# Display the test DataFrame (columns: target, source) as the cell output.
df_test

In [None]:
# Few-shot pool: one chat example per dev sentence pair, written as JSON lines
# with a single "sample" field.
few_shot_path = f'../registry/data/unv1/few_shot_{src}_{trg}.jsonl'
df_dev["sample"] = [
    create_chat_example(source_text, target_text)
    for source_text, target_text in zip(df_dev["source"], df_dev["target"])
]
df_dev[["sample"]].to_json(few_shot_path, lines=True, orient="records")

# Evaluation samples: the chat prompt goes in "input" and the reference
# translation in "ideal", one JSON object per line.
samples_path = f'../registry/data/unv1/samples_{src}_{trg}.jsonl'
df_test["input"] = df_test["source"].map(lambda text: create_chat_prompt(sys_msg, text))
df_test["ideal"] = df_test["target"]
df_test[["input", "ideal"]].to_json(samples_path, lines=True, orient="records")

In [None]:
# Registry entry for the eval, written to ../registry/evals/unv1.yaml.
# The samples path is built from `src` / `trg` so it always matches the
# samples_{src}_{trg}.jsonl file produced by the export cell.
# NOTE(review): the variable name mentions POS tagging although the spec
# describes a translation eval; the name is kept unchanged in case it is
# referenced elsewhere.
pos_tagging_task_specs = f"""
unv1:
    id: unv1.test.v1
    metrics: [accuracy]
    description: Evaluate Arabic translation
# Define the eval
unv1.test.v1:
  # Specify the class name as a dotted path to the module and class
  class: evals.elsuite.translate:Translate
  args:
    samples_jsonl: unv1/samples_{src}_{trg}.jsonl
    num_few_shot: 0 # max few shots to use

""".strip()
with open("../registry/evals/unv1.yaml", "w", encoding="utf-8") as file:
    file.write(pos_tagging_task_specs)


In [None]:
import os
# Keep an API key that is already exported in the environment; only fall back
# to the empty placeholder when none is set. Assigning "" unconditionally would
# wipe a valid key. Prefer exporting OPENAI_API_KEY outside the notebook over
# pasting a secret into this cell.
os.environ.setdefault("OPENAI_API_KEY", "")
# NOTE(review): presumably read by the evals runner as the worker-thread count
# and the per-thread timeout in seconds — confirm against the evals docs.
os.environ["EVALS_THREADS"] = "3"
os.environ["EVALS_THREAD_TIMEOUT"] = "600"

In [None]:
# File the eval run is recorded to; later cells read it back as JSON lines.
record_path = "../eval_results/unv1.jsonl"
# Run the `unv1` eval against gpt-3.5-turbo-0301 via the oaieval CLI, passing
# the record path, a seed, temperature 0.0 and a 4000-sample cap.
# NOTE(review): assumes ../eval_results already exists or is created by oaieval — confirm.
!oaieval gpt-3.5-turbo-0301 unv1 --record_path {record_path} --seed 41 --modelspec_extra_options temperature=0.0 --max_samples 4000

In [None]:
import pandas as pd

# Load the recorded run (one JSON object per line) into a DataFrame.
with open(record_path, "r") as record_file:
    events_df = pd.read_json(record_file, lines=True)

# Print the value of "final_report" for every row where it is present.
has_report = events_df["final_report"].notnull()
print(events_df.loc[has_report, "final_report"].to_list())

In [None]:
from sacrebleu.metrics.bleu import BLEU

# Print sampled translations next to their references together with a
# sentence-level BLEU score. Only the first NUM_EXAMPLES_TO_SHOW are printed.
NUM_EXAMPLES_TO_SHOW = 1

sampling_events = events_df[events_df.type == "sampling"]

# Recover each event's dataset row from its sample_id instead of relying on the
# loop position: sorting the ids as strings puts "...10" before "...2", so the
# n-th sorted event is generally not row n of df_test.
# NOTE(review): assumes sample_id ends in ".<row index into the samples file>"
# (e.g. "unv1.test.17") — confirm against the record file.
dataset_rows = sampling_events["sample_id"].map(
    lambda sample_id: int(str(sample_id).rsplit(".", 1)[-1])
)
ordered_events = (
    sampling_events.assign(dataset_row=dataset_rows)
    .sort_values("dataset_row")
    .head(NUM_EXAMPLES_TO_SHOW)
)

# Build the scorer once; effective_order=True is kept from the original call.
bleu = BLEU(effective_order=True)
for row, data in zip(ordered_events["dataset_row"], ordered_events["data"]):
    reference = df_test["target"].iloc[row]
    print(f"Prompt: {data['prompt'][-1]}")
    print(f"Sampled: {data['sampled']}")
    print(f"Truth: {reference}")
    print("score", bleu.sentence_score(data["sampled"], [reference]).score)
    print(f"{row}", "--" * 25)

In [None]:
# Show the sample_id of every "sampling" event, ordered by sample_id.
# NOTE(review): if sample_id is a string column this ordering is lexicographic, not numeric.
is_sampling = events_df["type"] == "sampling"
events_df.loc[is_sampling].sort_values("sample_id")["sample_id"]