In [1]:
from tqdm import tqdm
import ast
from datasets import load_from_disk

In [2]:
# Directory (relative to the working directory) holding the dataset saved on disk.
dataset_dir = "mathbridge_filtered"
ds_train = load_from_disk(dataset_dir)
ds_train

Dataset({
    features: ['context_before', 'equation', 'context_after', 'spoken_English'],
    num_rows: 4753354
})

In [3]:
# Shuffle with a fixed seed, then keep the first 1000 rows of the shuffled dataset.
ds_train_shuffled = ds_train.shuffle(seed=42)
ds_train_1000 = ds_train_shuffled.select(range(1_000))
ds_train_1000

Dataset({
    features: ['context_before', 'equation', 'context_after', 'spoken_English'],
    num_rows: 1000
})

In [4]:
from transformers import pipeline

# text2text-generation pipeline; used below to turn each spoken-English
# transcription into the draft text that is then sent to the OpenAI model.
translator_checkpoint = "Hyeonsieun/MathSpeech_T5_base_translator"
pipe = pipeline(
    "text2text-generation",
    model=translator_checkpoint,
    max_length=1000,
)




In [5]:
import openai

def get_openai_response(prompt, transcription):
    """Ask gpt-4o-mini to correct a LaTeX candidate and return its reply text.

    Args:
        prompt: LaTeX candidate to correct; sent verbatim as the user message.
        transcription: Original transcription; appended to the end of the
            system instructions.

    Returns:
        The message content of the first choice of the chat completion.
    """
    # The indentation of the continuation lines is part of the string literal,
    # so it is sent to the model exactly as written here.
    instructions = """You are a helpful assistant. Your task is to correct the input LaTeX code to make it valid and compilable.
                            The input may contain both mathematical and non-mathematical text. Please ensure the output is corrected for both types
                            and that all elements are formatted correctly in LaTeX. Return only the corrected LaTeX code and nothing else.
                            Do not include any extra commands such as documentclass, begin, or end document. Exclude all additional comments, 
                            explanations, and any other text. The original transcription is: """
    system_message = {"role": "system", "content": instructions + transcription}
    user_message = {"role": "user", "content": prompt}

    completion = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=[system_message, user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [None]:
# For every sampled row: run the spoken-English text through `pipe`, then hand
# the generated draft together with the original text to get_openai_response.
responses = []
for example in tqdm(ds_train_1000):
    spoken_text = example["spoken_English"]
    draft_latex = pipe(spoken_text)[0]["generated_text"]
    responses.append(get_openai_response(draft_latex, spoken_text))

In [None]:
# Persist the responses, one Python literal per line, so that each line can be
# parsed back with ast.literal_eval.
#
# ascii() is used instead of repr(): repr() leaves printable non-ASCII
# characters unescaped, so writing them with the platform's default encoding
# could fail or produce a file that does not round-trip. ascii() escapes them
# (\xNN / \uNNNN / \UNNNNNNNN), keeping the file pure ASCII while
# ast.literal_eval still recovers the identical value. The encoding is stated
# explicitly so the written bytes never depend on the locale.
with open("openai_responses.txt", "w", encoding="utf-8") as f:
    for response in responses:
        f.write(ascii(response) + "\n")

In [7]:
# Rebuild the saved responses: each line of the file is parsed as one Python literal.
responses2 = []
with open("openai_responses.txt", "r") as saved_file:
    for literal_line in saved_file:
        responses2.append(ast.literal_eval(literal_line))
len(responses2)

1000

In [None]:
from TeXBLEU.new_metric import texbleu

# Pair every sampled row with its reloaded response and score the response
# against the row's `equation` field with texbleu.
row_response_pairs = zip(ds_train_1000, responses2)
scores_texbleu = [
    texbleu(generated, example["equation"])
    for example, generated in tqdm(row_response_pairs, total=len(ds_train_1000))
]

In [9]:
# Arithmetic mean of the per-example TeXBLEU scores.
texbleu_total = sum(scores_texbleu)
final_texbleu = texbleu_total / len(scores_texbleu)
final_texbleu

0.8465212999999994

In [28]:
import evaluate

# Metric loaders, fetched in the order BLEU, ROUGE, CER, WER.
bleu, rouge, cer, wer = (
    evaluate.load(metric_name) for metric_name in ("bleu", "rouge", "cer", "wer")
)

# References are the `equation` fields of the sample; predictions are the
# responses reloaded from disk.
references = [example["equation"] for example in ds_train_1000]
predictions = responses2
metric_inputs = {"predictions": predictions, "references": references}

# BLEU: the result is a dict; only its "bleu" entry is reported.
bleu_score = bleu.compute(**metric_inputs)
print("BLEU score:", bleu_score["bleu"])

# ROUGE, restricted to the rouge1 variant.
rouge_score = rouge.compute(rouge_types=["rouge1"], **metric_inputs)
print("ROUGE-1 score:", rouge_score["rouge1"])

# CER: the value returned by compute() is printed directly.
cer_score = cer.compute(**metric_inputs)
print("CER score:", cer_score)

# WER: the value returned by compute() is printed directly.
wer_score = wer.compute(**metric_inputs)
print("WER score:", wer_score)

BLEU score: 0.5373747199230257
ROUGE-1 score: 0.8528077477496061
CER score: 0.5142519219396806
WER score: 0.9194364161849711
