# Evaluation for Table 2

In [None]:
import pandas as pd

# Load each baseline's prediction file indexed by "id" and place the frames
# side by side (axis=1), so rows are aligned on that shared index.
df = pd.concat(
    [
        pd.read_csv(csv_path, index_col="id")
        for csv_path in (
            "./outputs/bitabuse_infer_ocr_simchar.csv",
            "./outputs/bitabuse_infer_spell.csv",
        )
    ],
    axis=1,
)

In [None]:
import ssl

ssl._create_default_https_context = ssl._create_unverified_context

from OpenAttack.metric import BLEU, JaccardWord
from OpenAttack.text_process.tokenizer import PunctTokenizer
from tqdm import tqdm

from src.utils import word_accuracy

tokenizer = PunctTokenizer()

bleu = BLEU(tokenizer)
jaccard_word = JaccardWord(tokenizer)

# Score the OCR, SimChar and spell-checker predictions of every row against
# the reference `label` and store the results as new columns of `df`.
#
# Scores are accumulated in plain lists and attached to the frame after the
# loop: this avoids modifying `df` while `iterrows()` is walking it and avoids
# one label-based `.loc` write per cell (which is slow and would hit several
# rows at once if an index label were repeated).
#
# Each entry: (output column prefix, prediction column, strip "_" before
# computing BLEU / Jaccard).
baselines = [
    ("ocr", "ocr", False),
    ("simchar", "simchar", False),
    ("spell", "spellchecker", True),
]

# Insertion order here is the order in which the columns are added to `df`.
scores = {
    f"{prefix}_{metric}": []
    for prefix, _, _ in baselines
    for metric in ("word_accuracy", "bleu", "jaccard_word")
}

for _, row in tqdm(df.iterrows(), total=len(df)):
    text = row["text"]
    label = row["label"]

    for prefix, pred_col, strip_underbar in baselines:
        pred = row[pred_col]
        # Word accuracy is always computed on the prediction as stored.
        scores[f"{prefix}_word_accuracy"].append(word_accuracy(pred, label, text))
        # For the spell checker, underscores are removed before the
        # overlap-based metrics (BLEU / Jaccard) are computed.
        overlap_pred = pred.replace("_", "") if strip_underbar else pred
        scores[f"{prefix}_bleu"].append(bleu.calc_score(overlap_pred, label))
        scores[f"{prefix}_jaccard_word"].append(
            jaccard_word.calc_score(overlap_pred, label)
        )

for column, values in scores.items():
    df[column] = values

df.to_csv("./outputs/bitabuse_infer.csv")

## Merge the results of the gpt-4o-mini model

In [None]:
import os

output_dir = "gpt-4o-mini/gpt_batches"

# Keep only directory entries whose name contains "output", as full paths.
output_files = []
for entry in os.listdir(output_dir):
    if "output" in entry:
        output_files.append(os.path.join(output_dir, entry))

# Order the files numerically by the third "_"-separated field of the path
# once the directory prefix has been stripped from it.
output_files = sorted(
    output_files,
    key=lambda full_path: int(full_path.replace(output_dir, "").split("_")[2]),
)
output_files

In [None]:
import json

# Flatten the model responses of all batch files into one list, `outputs`.
# Files are visited in the order of `output_files`; inside a file, responses
# are ordered by the integer that follows the first "_" in their `custom_id`.
outputs = []

for path in output_files:
    print(path)
    batch_outputs = []
    # Every non-empty line is parsed as one JSON document.  The file is read
    # as UTF-8 explicitly so that non-ASCII model output does not depend on
    # the platform's default text encoding.
    with open(path, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            data = json.loads(line)
            # Named `request_id` rather than `id` to avoid shadowing the builtin.
            request_id = int(data["custom_id"].split("_")[1])
            output = data["response"]["body"]["choices"][0]["message"]["content"]
            batch_outputs.append((request_id, output))
    batch_outputs.sort(key=lambda x: x[0])
    outputs.extend(content for _, content in batch_outputs)
print(len(outputs))

In [None]:
import ssl

import pandas as pd
from tqdm import tqdm

ssl._create_default_https_context = ssl._create_unverified_context

from OpenAttack.metric import BLEU, JaccardWord
from OpenAttack.text_process.tokenizer import PunctTokenizer
from tqdm import tqdm

from src.utils import word_accuracy

tokenizer = PunctTokenizer()

bleu = BLEU(tokenizer)
jaccard_word = JaccardWord(tokenizer)

df = pd.read_csv("./outputs/bitabuse_infer.csv")

# `outputs` is paired with the rows of `df` purely by position, so a length
# mismatch means predictions would be attached to the wrong rows (or the loop
# would fail only at the very end).  Fail fast instead.
if len(outputs) != len(df):
    raise ValueError(
        f"Expected one gpt-4o-mini output per row: got {len(outputs)} outputs "
        f"for {len(df)} rows."
    )

# Collected per row and attached after the loop, so `df` is not modified while
# `iterrows()` is iterating over it.  Insertion order is the column order.
gpt_columns = {
    "gpt-4o-mini": [],
    "gpt-4o-mini_word_accuracy": [],
    "gpt-4o-mini_bleu": [],
    "gpt-4o-mini_jaccard_word": [],
}

for (_, row), raw_output in tqdm(zip(df.iterrows(), outputs), total=len(df)):
    text = row["text"]
    label = row["label"]
    # The model output is lower-cased before it is stored and scored.
    gpt4 = raw_output.lower()

    gpt_columns["gpt-4o-mini"].append(gpt4)
    gpt_columns["gpt-4o-mini_word_accuracy"].append(word_accuracy(gpt4, label, text))
    gpt_columns["gpt-4o-mini_bleu"].append(bleu.calc_score(gpt4, label))
    gpt_columns["gpt-4o-mini_jaccard_word"].append(
        jaccard_word.calc_score(gpt4, label)
    )

for column, values in gpt_columns.items():
    df[column] = values

In [None]:
# Persist the frame, now including the gpt-4o-mini columns; index=False leaves
# the row index out of the written file.
df.to_csv("./outputs/bitabuse_infer_with_gpt4.csv", index=False)

## Make the table

In [2]:
from datasets import load_dataset

from src.dataset import train_valid_test_split

bitcore = load_dataset("AutoML/bitcore", split="train")
bitviper = load_dataset("AutoML/bitviper", split="train")
bitabuse = load_dataset("AutoML/bitabuse", split="train")

bitcore_ids, bitviper_ids, bitabuse_ids = [], [], []

# For seeds 0..9, split each dataset 6:2:2 and record the "id" column of the
# element at index 2 of the returned splits (presumably the test split, going
# by the helper's name -- confirm against src.dataset).
for seed in range(10):
    for dataset, id_runs in (
        (bitcore, bitcore_ids),
        (bitviper, bitviper_ids),
        (bitabuse, bitabuse_ids),
    ):
        splits = train_valid_test_split(dataset, split_ratio=(6, 2, 2), seed=seed)
        id_runs.append(splits[2]["id"])

In [None]:
import numpy as np
import pandas as pd

df = pd.read_csv("./outputs/bitabuse_infer_with_gpt4.csv", index_col="id")

# (dataset display name, list holding one id collection per seed)
ds_infos = [
    ("BitCore", bitcore_ids),
    ("BitViper", bitviper_ids),
    ("BitAbuse", bitabuse_ids),
]
metrics = ["word_accuracy", "jaccard_word", "bleu"]
# (table column header, prefix of the score columns in `df`)
method_infos = [
    ("SimChar DB", "simchar"),
    ("OCR", "ocr"),
    ("Spell Checker", "spell"),
    ("GPT-4o-mini", "gpt-4o-mini"),
]

# One table row per (measure, dataset) pair, grouped by measure; the method
# columns are created by the assignments in the loop below.
row_keys = [(measure, name) for measure in metrics for name, _ in ds_infos]
df_result = pd.DataFrame(
    {
        "Measure": [measure for measure, _ in row_keys],
        "Dataset": [name for _, name in row_keys],
    }
).set_index(["Measure", "Dataset"])

for ds_name, ids in ds_infos:
    for method, col in method_infos:
        for metric in metrics:
            score_col = f"{col}_{metric}"
            # Mean score over the selected ids, computed once per seed.
            per_seed_means = []
            for seed_ids in ids:
                per_seed_means.append(np.mean(df.loc[seed_ids, score_col]))
            # Cell text: mean and standard deviation across seeds, as LaTeX.
            mean_txt = f"{np.mean(per_seed_means).round(4):.4f}"
            std_txt = f"{np.std(per_seed_means).round(4):.4f}"
            df_result.loc[(metric, ds_name), method] = rf"${mean_txt} \pm {std_txt}$"

print(df_result.to_latex())
df_result

## Character BERT

In [26]:
from datasets import load_dataset

ds_names = ["BitCore", "BitViper", "BitAbuse"]

# Display name -> pandas frame of that dataset's "train" split, indexed by "id".
ds = {
    name: load_dataset(repo, split="train").to_pandas().set_index("id")
    for name, repo in zip(
        ds_names,
        ("AutoML/bitcore", "AutoML/bitviper", "AutoML/bitabuse"),
    )
}

In [None]:
import ssl

ssl._create_default_https_context = ssl._create_unverified_context

import numpy as np
from OpenAttack.metric import BLEU, JaccardWord
from OpenAttack.text_process.tokenizer import PunctTokenizer
from tqdm import tqdm

from src.utils import word_accuracy

tokenizer = PunctTokenizer()
bleu = BLEU(tokenizer)
jaccard_word = JaccardWord(tokenizer)

# For every dataset and seed, score the CharBERT predictions stored in
# ./outputs/charbert-<dataset>/<seed>.csv and write the scores back into the
# same file as the columns "word_accuracy", "bleu" and "jaccard_word".
for ds_name in ds_names:
    reference = ds[ds_name]  # source of the "text" / "label" of each sample id
    for seed in range(10):
        csv_path = f"./outputs/charbert-{ds_name.lower()}/{seed}.csv"
        df = pd.read_csv(csv_path, index_col="id")

        # Scores are gathered in lists and attached after the loop, so the
        # frame is not modified while `iterrows()` is iterating over it and no
        # per-cell label-based `.loc` writes are needed.
        scores = {"word_accuracy": [], "bleu": [], "jaccard_word": []}
        # `sample_id` (not `id`) to avoid shadowing the builtin.
        for sample_id, row in tqdm(df.iterrows(), total=len(df)):
            text = reference.loc[sample_id, "text"]
            label = reference.loc[sample_id, "label"]
            pred = row["charbert"]
            scores["word_accuracy"].append(word_accuracy(pred, label, text))
            scores["bleu"].append(bleu.calc_score(pred, label))
            scores["jaccard_word"].append(jaccard_word.calc_score(pred, label))

        for column, values in scores.items():
            df[column] = values
        df.to_csv(csv_path)

In [None]:
# Empty table with one row per (measure, dataset) pair; the "CharBERT" column
# is created by the assignments in the loop below.
result_df = pd.DataFrame(
    {
        "Measure": ["word_accuracy"] * 3 + ["jaccard_word"] * 3 + ["bleu"] * 3,
        "Dataset": ["BitCore", "BitViper", "BitAbuse"] * 3,
    }
).set_index(["Measure", "Dataset"])

for ds_name in ds_names:
    run_dir = f"./outputs/charbert-{ds_name.lower()}"
    for metric in metrics:
        # Mean of the metric column in each of the ten per-seed result files.
        seed_means = []
        for seed in range(10):
            run = pd.read_csv(f"{run_dir}/{seed}.csv", index_col="id")
            seed_means.append(np.mean(run[metric]))
        # Cell text: mean and standard deviation across seeds, as LaTeX.
        mean_txt = f"{np.mean(seed_means).round(4):.4f}"
        std_txt = f"{np.std(seed_means).round(4):.4f}"
        result_df.loc[(metric, ds_name), "CharBERT"] = rf"${mean_txt} \pm {std_txt}$"

print(result_df.to_latex())
result_df