In [5]:
import os
import re
import json
import difflib

import pandas as pd
from tqdm import tqdm_notebook as tqdm

## Input Batches

In [3]:
# Create the folder that receives the generated batch request files;
# exist_ok=True turns this into a no-op when the folder is already there.
batch_input_dir = "input_batches"
os.makedirs(batch_input_dir, exist_ok=True)

In [2]:
def get_messages(sentence, examples):
    """Build the two-turn chat payload asking the model to fix one sentence.

    Args:
        sentence: The sentence to be corrected; substituted into the
            ``{target_sentence}`` slot of the user prompt.
        examples: Pre-formatted few-shot text; substituted into the
            ``{examples}`` slot of the user prompt.

    Returns:
        A list with a system message followed by a user message, each a
        dict with ``"role"`` and ``"content"`` keys.
    """
    # The template is kept verbatim (including its leading newline and the
    # trailing indentation before the closing quotes) so the prompt text sent
    # to the model stays exactly the same.
    user_prompt = """
Here are examples of sentences with typos; learn from them:

{examples}
Now, please correct this sentence and output only the corrected version with no additional text:

{target_sentence}
        """.format(examples=examples, target_sentence=sentence)

    system_turn = {
        "role": "system",
        "content": "You are an expert in correcting typos in sentences.",
    }
    user_turn = {"role": "user", "content": user_prompt}
    return [system_turn, user_turn]

In [4]:
# create input batches (jsonl)
#
# For every noise level, read results/<nf>.csv and write one chat-completion
# request per row to input_batches/phone_gpt4o_nf_<nf>.jsonl.

NFs = [
    "noise_0.012",
    "noise_0.024",
    "noise_0.06",
]

for nf in NFs:
    df = pd.read_csv(f"results/{nf}.csv")
    total = len(df)
    jsonl_file = f"input_batches/phone_gpt4o_nf_{nf}.jsonl"

    # Few-shot block: rows 0 and 1 of the frame, each shown as the predicted
    # sentence followed by its true sentence.
    # NOTE(review): the loop below also emits requests for rows 0 and 1, so
    # those two requests contain their own reference answer in the prompt.
    examples = "".join(
        f"\tsentence: {df['Predicted Sentence'][i]}\n"
        f"\tcorrected: {df['True Sentence'][i]}\n\n"
        for i in range(2)
    )

    request_lines = []
    for index, row in tqdm(df.iterrows(), total=total):
        # One request object per row; custom_id carries the row index so the
        # response can be tied back to this row later.
        request = {
            "custom_id": f"{index}",
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": "gpt-4o",
                "messages": get_messages(row['Predicted Sentence'], examples),
                "max_tokens": 1000,
            }
        }
        request_lines.append(json.dumps(request))

    # Write everything only after all requests were built successfully.
    with open(jsonl_file, "w") as f:
        f.writelines(entry + "\n" for entry in request_lines)
    print(f"Saved {jsonl_file}")

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for index, row in tqdm(df.iterrows(), total=total):


  0%|          | 0/1000 [00:00<?, ?it/s]

Saved input_batches/phone_gpt4o_nf_noise_0.012.jsonl


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for index, row in tqdm(df.iterrows(), total=total):


  0%|          | 0/1000 [00:00<?, ?it/s]

Saved input_batches/phone_gpt4o_nf_noise_0.024.jsonl


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for index, row in tqdm(df.iterrows(), total=total):


  0%|          | 0/1000 [00:00<?, ?it/s]

Saved input_batches/phone_gpt4o_nf_noise_0.06.jsonl


## Output Batches

In [6]:
def load_openai_batch_output(filename):
    """Load a JSON Lines batch output file into a list of parsed records.

    Args:
        filename: Path to a ``.jsonl`` file with one JSON document per line.

    Returns:
        A list containing the parsed JSON value of every non-blank line, in
        file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Read explicitly as UTF-8 so the result does not depend on the platform's
    # default encoding, and skip blank lines (e.g. a trailing empty line),
    # which json.loads would otherwise reject.
    with open(filename, 'r', encoding='utf-8') as file:
        return [json.loads(line) for line in file if line.strip()]

In [7]:
def get_llm_sentence_from_openai(responses, index):
    """Return the stripped model reply that belongs to request ``index``.

    The record is matched on its ``custom_id`` (compared against
    ``str(index)``) rather than on its position alone, because the lines of a
    batch output file are not guaranteed to be in the same order as the
    submitted requests. Records that carry no ``custom_id`` key are still
    looked up purely by position, as before.

    Args:
        responses: List of parsed batch output records (dicts).
        index: Row index of the request; also used as the list position for
            the fast path and for records without a ``custom_id``.

    Returns:
        The text at ``response.body.choices[0].message.content`` of the
        matching record, with surrounding whitespace removed.

    Raises:
        IndexError: If no record matches ``index``.
    """
    wanted_id = str(index)

    # Fast path: the record at the same position is usually the right one.
    try:
        candidate = responses[index]
    except IndexError:
        candidate = None

    # A missing custom_id is treated as a match to keep positional lookups
    # working for records that never had one.
    if candidate is None or candidate.get('custom_id', wanted_id) != wanted_id:
        # Positional record is absent or belongs to another request:
        # search the whole list for the matching custom_id.
        candidate = next(
            (record for record in responses if record.get('custom_id') == wanted_id),
            None,
        )
        if candidate is None:
            raise IndexError(f"no batch response found for custom_id {wanted_id!r}")

    llm_sentence = candidate['response']['body']['choices'][0]['message']['content']
    return llm_sentence.strip()

In [8]:
def llm_postprocess(sentence):
    """Normalise a model reply before it is scored.

    The text is lower-cased, every character other than ``a-z``, ``0-9`` and
    whitespace is removed, runs of whitespace are collapsed to a single
    space, and leading/trailing whitespace is dropped.

    Args:
        sentence: Raw reply text.

    Returns:
        The normalised sentence.
    """
    sentence = sentence.lower()
    # Drop everything except a-z, 0-9 and whitespace.
    sentence = re.sub(r'[^a-z0-9\s]', '', sentence)
    # Trim and collapse whitespace *after* the removal: deleting punctuation
    # can expose leading/trailing blanks ("- hi" -> " hi") or leave double
    # spaces ("a , b" -> "a  b"), which would otherwise lower the
    # character-level score.
    return ' '.join(sentence.split())

In [9]:
def compute_accuracy_and_wrong_syllables(true_sentence, predicted_sentence):
    """Score a predicted sentence against its reference.

    Args:
        true_sentence: Reference sentence.
        predicted_sentence: Sentence to evaluate.

    Returns:
        A tuple ``(accuracy, wrong_syllables)`` where

        * ``accuracy`` is ``difflib.SequenceMatcher.ratio()`` computed over
          the characters of the two sentences (1.0 for identical strings);
        * ``wrong_syllables`` is the number of non-``equal`` opcodes
          (``insert``, ``delete``, ``replace``) when the two sentences are
          compared as whitespace-separated token lists.
    """
    # autojunk=False disables SequenceMatcher's "popular element" heuristic.
    # With the default, once the second sequence has 200 or more items, frequent
    # items (here: common letters and the space) are ignored while matching,
    # which can drive the ratio of two almost identical long sentences
    # towards 0. Shorter inputs are scored exactly as before.
    char_matcher = difflib.SequenceMatcher(
        None, true_sentence, predicted_sentence, autojunk=False
    )
    accuracy = char_matcher.ratio()

    # Token-level comparison on the whitespace-split sentences.
    true_words = true_sentence.split()
    predicted_words = predicted_sentence.split()
    word_matcher = difflib.SequenceMatcher(
        None, true_words, predicted_words, autojunk=False
    )

    # NOTE(review): each opcode may span several consecutive tokens, so this
    # counts differing regions, not individual tokens - confirm that this is
    # the intended definition of "wrong syllables".
    wrong_syllables = sum(
        1 for tag, _, _, _, _ in word_matcher.get_opcodes()
        if tag in ('insert', 'delete', 'replace')
    )

    return accuracy, wrong_syllables

In [14]:
# prepare the output batches
#
# For every noise level: load the baseline predictions and the matching batch
# output, score both against the true sentence, and store the LLM sentence and
# its scores as new columns in <output_dir>/<nf>.csv.
output_batches_dir = "output_batches"
name_prefix = "phone_gpt4o_nf_"
output_dir = "results/gpt-4o"
NFs = [
    "noise_0.012",
    "noise_0.024",
    "noise_0.06",
]

# DataFrame.to_csv does not create missing directories, so make sure the
# destination exists before any scoring work is done.
os.makedirs(output_dir, exist_ok=True)


for nf in NFs:
    df = pd.read_csv(f"results/{nf}.csv")
    responses = load_openai_batch_output(f'{output_batches_dir}/{name_prefix}{nf}.jsonl')

    llm_accs = []
    llm_ws = []
    llm_sen = []
    total = len(df)

    for index, row in tqdm(df.iterrows(), total=total):
        # Only log every 100th row to keep the cell output short.
        should_print = index % 100 == 0
        predicted_sentence = row['Predicted Sentence']
        true_sentence = row['True Sentence']

        # Baseline score of the original prediction (logged only).
        accuracy, wrong_syllables = compute_accuracy_and_wrong_syllables(true_sentence, predicted_sentence)

        if should_print:
            print(f"[LLM Auto] Index: {index} of {total}")
            print("[LLM Auto] CoAtNet", accuracy, wrong_syllables)

        # Score of the normalised LLM correction for the same row.
        llm_sentence = get_llm_sentence_from_openai(responses, index)
        llm_sentence = llm_postprocess(llm_sentence)
        accuracy, wrong_syllables = compute_accuracy_and_wrong_syllables(true_sentence, llm_sentence)

        if should_print:
            print("[LLM Auto] LLM", accuracy, wrong_syllables)
            print("[LLM Auto] ==========")

        llm_sen.append(llm_sentence)
        llm_accs.append(accuracy)
        llm_ws.append(wrong_syllables)

    df['LLM Sentence'] = llm_sen
    df['LLM Accuracy'] = llm_accs
    df['LLM Wrong syllables'] = llm_ws

    # An empty input frame yields NaN instead of a ZeroDivisionError.
    llm_avg_accuracy = sum(llm_accs) / len(llm_accs) if llm_accs else float("nan")
    llm_sum_wrong_syllables = sum(llm_ws)

    print("[LLM Auto] Model: GPT-4o")
    print(f"[LLM Auto] NF {nf}")
    print(f"[LLM Auto] LLM Average Accuracy: {llm_avg_accuracy}")
    print(f"[LLM Auto] LLM Sum of Wrong Syllables: {llm_sum_wrong_syllables}")
    print("[LLM Auto] ===")

    df.to_csv(f'{output_dir}/{nf}.csv', index=False)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for index, row in tqdm(df.iterrows(), total=total):


  0%|          | 0/1000 [00:00<?, ?it/s]

[LLM Auto] Index: 0 of 1000
[LLM Auto] CoAtNet 0.9326923076923077 4
[LLM Auto] LLM 1.0 0
[LLM Auto] Index: 100 of 1000
[LLM Auto] CoAtNet 0.9452054794520548 3
[LLM Auto] LLM 1.0 0
[LLM Auto] Index: 200 of 1000
[LLM Auto] CoAtNet 0.9523809523809523 1
[LLM Auto] LLM 1.0 0
[LLM Auto] Index: 300 of 1000
[LLM Auto] CoAtNet 0.972972972972973 2
[LLM Auto] LLM 1.0 0
[LLM Auto] Index: 400 of 1000
[LLM Auto] CoAtNet 1.0 0
[LLM Auto] LLM 1.0 0
[LLM Auto] Index: 500 of 1000
[LLM Auto] CoAtNet 0.8888888888888888 4
[LLM Auto] LLM 1.0 0
[LLM Auto] Index: 600 of 1000
[LLM Auto] CoAtNet 0.9743589743589743 2
[LLM Auto] LLM 1.0 0
[LLM Auto] Index: 700 of 1000
[LLM Auto] CoAtNet 0.9574468085106383 3
[LLM Auto] LLM 1.0 0
[LLM Auto] Index: 800 of 1000
[LLM Auto] CoAtNet 0.9418604651162791 2
[LLM Auto] LLM 1.0 0
[LLM Auto] Index: 900 of 1000
[LLM Auto] CoAtNet 0.9444444444444444 2
[LLM Auto] LLM 1.0 0
[LLM Auto] Model: GPT-4o
[LLM Auto] NF noise_0.012
[LLM Auto] LLM Average Accuracy: 0.9978416153142287
[LLM 

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for index, row in tqdm(df.iterrows(), total=total):


  0%|          | 0/1000 [00:00<?, ?it/s]

[LLM Auto] Index: 0 of 1000
[LLM Auto] CoAtNet 0.8269230769230769 4
[LLM Auto] LLM 1.0 0
[LLM Auto] Index: 100 of 1000
[LLM Auto] CoAtNet 0.8356164383561644 7
[LLM Auto] LLM 1.0 0
[LLM Auto] Index: 200 of 1000
[LLM Auto] CoAtNet 0.8809523809523809 4
[LLM Auto] LLM 0.9761904761904762 1
[LLM Auto] Index: 300 of 1000
[LLM Auto] CoAtNet 0.9054054054054054 4
[LLM Auto] LLM 0.9864864864864865 1
[LLM Auto] Index: 400 of 1000
[LLM Auto] CoAtNet 0.8333333333333334 5
[LLM Auto] LLM 0.9861111111111112 1
[LLM Auto] Index: 500 of 1000
[LLM Auto] CoAtNet 0.873015873015873 3
[LLM Auto] LLM 1.0 0
[LLM Auto] Index: 600 of 1000
[LLM Auto] CoAtNet 0.8589743589743589 3
[LLM Auto] LLM 1.0 0
[LLM Auto] Index: 700 of 1000
[LLM Auto] CoAtNet 0.8829787234042553 3
[LLM Auto] LLM 1.0 0
[LLM Auto] Index: 800 of 1000
[LLM Auto] CoAtNet 0.872093023255814 4
[LLM Auto] LLM 0.9883720930232558 1
[LLM Auto] Index: 900 of 1000
[LLM Auto] CoAtNet 0.8555555555555555 3
[LLM Auto] LLM 0.972972972972973 1
[LLM Auto] Model: GP

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for index, row in tqdm(df.iterrows(), total=total):


  0%|          | 0/1000 [00:00<?, ?it/s]

[LLM Auto] Index: 0 of 1000
[LLM Auto] CoAtNet 0.6634615384615384 3
[LLM Auto] LLM 1.0 0
[LLM Auto] Index: 100 of 1000
[LLM Auto] CoAtNet 0.6712328767123288 2
[LLM Auto] LLM 0.993103448275862 1
[LLM Auto] Index: 200 of 1000
[LLM Auto] CoAtNet 0.6547619047619048 2
[LLM Auto] LLM 0.9761904761904762 1
[LLM Auto] Index: 300 of 1000
[LLM Auto] CoAtNet 0.6351351351351351 3
[LLM Auto] LLM 0.96 2
[LLM Auto] Index: 400 of 1000
[LLM Auto] CoAtNet 0.6805555555555556 2
[LLM Auto] LLM 1.0 0
[LLM Auto] Index: 500 of 1000
[LLM Auto] CoAtNet 0.6825396825396826 3
[LLM Auto] LLM 0.7761194029850746 2
[LLM Auto] Index: 600 of 1000
[LLM Auto] CoAtNet 0.6666666666666666 1
[LLM Auto] LLM 0.9615384615384616 1
[LLM Auto] Index: 700 of 1000
[LLM Auto] CoAtNet 0.7127659574468085 2
[LLM Auto] LLM 1.0 0
[LLM Auto] Index: 800 of 1000
[LLM Auto] CoAtNet 0.6976744186046512 3
[LLM Auto] LLM 0.7978142076502732 3
[LLM Auto] Index: 900 of 1000
[LLM Auto] CoAtNet 0.6777777777777778 2
[LLM Auto] LLM 0.9010989010989011 1
[L