In [None]:
# Prerequisite: download the dataset from https://catalog.ldc.upenn.edu/LDC2017T07
# and extract it into this directory, so that bolt_sms_chat_ara_src_transliteration/ exists here.
#
# Step 1: create the train, dev and test splits. Each call passes the transliteration
# data directory, the split's list file (splits_ldc/<split>.txt) and the output directory
# for that split's XML files.
# NOTE(review): argument meanings are inferred from the paths; confirm against makeSplits.py.
!python3 splits_ldc/makeSplits.py bolt_sms_chat_ara_src_transliteration/data/transliteration/ splits_ldc/train.txt splits_ldc/train/xml_files
!python3 splits_ldc/makeSplits.py bolt_sms_chat_ara_src_transliteration/data/transliteration/ splits_ldc/dev.txt splits_ldc/dev/xml_files
!python3 splits_ldc/makeSplits.py bolt_sms_chat_ara_src_transliteration/data/transliteration/ splits_ldc/test.txt splits_ldc/test/xml_files 

# Step 2: for each split, extract the Arabizi source file plus the word-aligned and
# sentence-aligned gold target files from that split's XML directory.
!python3 splits_ldc/getSourceAndTarget.py splits_ldc/train/xml_files/ splits_ldc/train/train-source.arabizi splits_ldc/train/train-word-aligned-target.gold splits_ldc/train/train-sentence-aligned-target.gold
!python3 splits_ldc/getSourceAndTarget.py splits_ldc/dev/xml_files/ splits_ldc/dev/dev-source.arabizi splits_ldc/dev/dev-word-aligned-target.gold splits_ldc/dev/dev-sentence-aligned-target.gold
!python3 splits_ldc/getSourceAndTarget.py splits_ldc/test/xml_files/ splits_ldc/test/test-source.arabizi splits_ldc/test/test-word-aligned-target.gold splits_ldc/test/test-sentence-aligned-target.gold

In [6]:
import pandas as pd
base_path = "splits_ldc"


def _read_nonblank_lines(path):
    """Return the non-blank lines of the UTF-8 text file at ``path``, whitespace-stripped.

    The files are read as plain text rather than through ``pd.read_csv`` so that
    a sentence consisting only of a token such as ``NA``, ``null`` or ``None``
    stays a string instead of being parsed into NaN.
    """
    with open(path, encoding="utf-8") as handle:
        stripped = (line.strip() for line in handle)
        # Blank lines are dropped per file, matching what read_csv did before.
        return [line for line in stripped if line]


def load_split(split, base_path=base_path):
    """Load one data split as a DataFrame with columns ``arabizi`` and ``arabic``.

    Parameters
    ----------
    split : str
        Split name (e.g. ``"train"``, ``"dev"``, ``"test"``); used both as the
        sub-directory and as the file-name prefix.
    base_path : str
        Directory that contains the per-split sub-directories.

    Returns
    -------
    pandas.DataFrame
        One row per sentence pair: Arabizi source and sentence-aligned Arabic target.

    Raises
    ------
    ValueError
        If the source and target files do not contain the same number of
        non-blank lines (the pairs would otherwise be silently misaligned).
    """
    source = _read_nonblank_lines(f"{base_path}/{split}/{split}-source.arabizi")
    target = _read_nonblank_lines(
        f"{base_path}/{split}/{split}-sentence-aligned-target.gold"
    )
    if len(source) != len(target):
        raise ValueError(
            f"{split}: {len(source)} source lines but {len(target)} target lines"
        )
    return pd.DataFrame({"arabizi": source, "arabic": target})


# Each name now holds the split it is named after. Previously only train and
# test were loaded and the training frame was bound to the name ``df_dev``.
df_train, df_dev, df_test = (load_split(name) for name in ("train", "dev", "test"))

In [7]:
# Instruction placed in the system message of every prompt (trailing space is intentional
# to keep the text identical to what was used for the recorded run).
sys_msg = "Translate Arabizi to Egyptian Arabic: "
def create_chat_prompt(sys_msg, input_text):
    """Build a two-message chat prompt.

    Returns a list with a ``system`` message whose content is ``sys_msg``
    followed by a ``user`` message whose content is ``input_text``.
    """
    system_turn = {"role": "system", "content": sys_msg}
    user_turn = {"role": "user", "content": input_text}
    return [system_turn, user_turn]

def create_chat_example(article, summary):
    """Build a few-shot example as two named ``system`` messages.

    ``article`` becomes the content of the message named ``example_user`` and
    ``summary`` the content of the message named ``example_assistant``.
    NOTE(review): the parameter names look carried over from a summarization
    task; they are kept unchanged so keyword callers keep working.
    """
    speakers = ("example_user", "example_assistant")
    return [
        {"role": "system", "content": text, "name": speaker}
        for text, speaker in zip((article, summary), speakers)
    ]

In [8]:
# Create the directory that will hold the bolt samples file
# (-p: also creates missing parents and does not fail if it already exists).
!mkdir -p ../registry/data/bolt

In [9]:
def _to_prompt(arabizi_text):
    """Wrap one Arabizi sentence in the chat prompt that uses the shared system message."""
    return create_chat_prompt(sys_msg, arabizi_text)


# Add the chat-formatted prompt and the gold Arabic reference as new columns on df_test.
df_test["input"] = df_test["arabizi"].map(_to_prompt)
df_test["ideal"] = df_test["arabic"]

# Write one JSON record per line containing only the "input" and "ideal" fields.
eval_samples = df_test[["input", "ideal"]]
eval_samples.to_json("../registry/data/bolt/samples.jsonl", orient="records", lines=True)

In [10]:
# YAML registry entry for the "bolt" eval. The text is written out verbatim
# after surrounding whitespace is trimmed.
# NOTE(review): the variable name mentions POS tagging although the spec describes
# the bolt transliteration eval; the name is kept so other cells that use it still work.
pos_tagging_task_specs = """
bolt:
    id: bolt.test.v1
    metrics: [accuracy]
    description: Evaluate Arabic transliteration
# Define the eval
bolt.test.v1:
  # Specify the class name as a dotted path to the module and class
  class: evals.elsuite.translate:Translate
  args:
    samples_jsonl: bolt/samples.jsonl
    num_few_shot: 0 # max few shots to use

"""
pos_tagging_task_specs = pos_tagging_task_specs.strip()

# Save the spec under the registry's evals directory.
spec_path = "../registry/evals/bolt.yaml"
with open(spec_path, "w") as spec_file:
    spec_file.write(pos_tagging_task_specs)


In [11]:
import os
def configure_eval_environment(api_key="<openai-key>", threads="3", thread_timeout="600"):
    """Set the environment variables read by the eval run.

    Parameters
    ----------
    api_key : str
        OpenAI API key. When left at the ``"<openai-key>"`` placeholder, a key
        that is already exported in the environment is kept instead of being
        overwritten with the placeholder. Prefer exporting the key in the shell
        over pasting it here, and never commit a real key.
    threads : str
        Value for ``EVALS_THREADS``.
    thread_timeout : str
        Value for ``EVALS_THREAD_TIMEOUT``.
    """
    if api_key != "<openai-key>":
        # An explicit key was supplied: it takes precedence.
        os.environ["OPENAI_API_KEY"] = api_key
    else:
        # Placeholder only: do not clobber a key that is already set.
        os.environ.setdefault("OPENAI_API_KEY", api_key)
    os.environ["EVALS_THREADS"] = threads
    os.environ["EVALS_THREAD_TIMEOUT"] = thread_timeout


configure_eval_environment()

In [12]:
# Location of the JSONL event log for this run; it is passed to oaieval through
# --record_path and read back by a later cell.
record_path = "../eval_results/bolt.jsonl"
# Run the "bolt" eval with gpt-3.5-turbo-0301 at temperature 0.0 and a fixed seed.
# {record_path} is interpolated by IPython before the shell command runs.
# --max_samples 6653 matches the "Evaluating 6653 samples" line in the recorded log;
# presumably that is the full test set — TODO confirm.
!oaieval gpt-3.5-turbo-0301 bolt --record_path {record_path} --seed 41 --modelspec_extra_options temperature=0.0 --max_samples 6653

[2023-05-15 21:47:51,715] [registry.py:156] Loading registry from /home/zaid/evals/evals/registry/evals
[2023-05-15 21:47:51,838] [registry.py:156] Loading registry from /home/zaid/.evals/evals
[2023-05-15 21:47:52,856] [oaieval.py:213] Run started: 230515184752XJJ5GC3M
[2023-05-15 21:47:52,877] [data.py:75] Fetching bolt/samples.jsonl
[2023-05-15 21:47:52,912] [eval.py:32] Evaluating 6653 samples
[2023-05-15 21:47:52,918] [eval.py:152] Running in threaded mode with 3 threads!
  0%|▏                                      | 24/6653 [00:22<1:35:42,  1.15it/s][2023-05-15 21:48:16,031] [record.py:309] Logged 100 rows of events to ../eval_results/bolt.jsonl: insert_time=5.308ms
  0%|▏                                      | 28/6653 [00:24<1:09:50,  1.58it/s][2023-05-15 21:48:17,958] [_common.py:105] Backing off openai_chat_completion_create_retrying(...) for 0.4s (openai.error.RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-6VIsbC1WgU4rx2bWplxNV7gP 

In [13]:
import pandas as pd

# Load the recorded eval events (one JSON object per line) into a DataFrame.
with open(record_path, "r") as events_file:
    events_df = pd.read_json(events_file, lines=True)

# Keep only the rows whose "final_report" field is populated and print them as a list.
has_report = events_df["final_report"].notnull()
print(events_df.loc[has_report, "final_report"].to_list())

[{'accuracy': 0.322619405226794, 'sacrebleu_score': 13.760652272898145, 'sacrebleu_score_eo': 26.499700700036545}]


In [None]:
from sacrebleu.metrics.bleu import BLEU

def sample_position(sample_id):
    """Return the integer suffix of an eval sample id (e.g. ``"bolt.test.12"`` -> 12).

    NOTE(review): this assumes the id ends in the sample's row position in
    samples.jsonl, which mirrors df_test row order — confirm against the event log.
    Plain integer ids are accepted as well.
    """
    return int(str(sample_id).rsplit(".", 1)[-1])


# Build the scorer once instead of once per sample.
sentence_bleu = BLEU(effective_order=True)

# Order sampling events by their numeric sample position. Sorting the raw
# string ids would place "...10" before "...2" and pair each output with the
# wrong reference.
sampling_events = events_df[events_df["type"] == "sampling"]
sampling_events = sampling_events.assign(
    position=sampling_events["sample_id"].map(sample_position)
).sort_values("position")

for position, event in zip(sampling_events["position"], sampling_events["data"]):
    # Look the reference up by the sample's own position, not by loop order,
    # so the pairing stays correct even if some samples are missing from the log.
    truth = df_test["arabic"].iloc[position]
    print(f"Prompt: {event['prompt'][-1]}")
    print(f"Sampled: {event['sampled']}")
    print(f"Truth: {truth}")
    print("score", sentence_bleu.sentence_score(event["sampled"], [truth]).score)
    print(f"{position}", "--" * 25)