In [None]:
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import json

In [None]:
# Load the tokenizer and the seq2seq model from one checkpoint name so the
# two cannot drift apart if the checkpoint is changed later.
model_checkpoint = "google-t5/t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [None]:
from datasets import load_dataset

# Identifier handed to load_dataset for the MathBridge data.
dataset_id = "Kyudan/MathBridge"
ds = load_dataset(dataset_id)
ds  # trailing expression: show the loaded object as the cell output

In [None]:
# Pick out the "train" entry of the loaded dataset; this is the only split
# used by the cells below.
ds_train = ds["train"]
# Trailing expression: show the selected split as the cell output.
ds_train

In [None]:
def preprocess_data(examples):
    """Tokenize MathBridge rows into model inputs and labels.

    The source text is ``"{context_before} {spoken_English} {context_after}"``
    and the target text is ``"{context_before} {equation} {context_after}"``.
    Both are truncated/padded to 512 tokens with the notebook-level
    ``tokenizer``.

    The function works with ``Dataset.map`` in either mode:

    * one row per call (the default): column values are scalars and each
      returned field is a flat list of token ids;
    * ``batched=True``: column values are lists and each returned field is a
      list of per-row lists.

    Args:
        examples: Mapping with the keys ``"context_before"``,
            ``"context_after"``, ``"equation"`` and ``"spoken_English"``.

    Returns:
        dict: The tokenizer's encoding fields for the source text plus a
        ``"labels"`` field holding the token ids of the target text.

    Note:
        Labels are padded to the full length with the tokenizer's padding id.
        If padded positions should be ignored by the loss, they still need to
        be masked (conventionally with -100) before training.
    """
    max_length = 512

    before = examples["context_before"]
    after = examples["context_after"]
    equation = examples["equation"]
    spoken_english = examples["spoken_English"]

    # Dataset.map passes scalars per row by default and lists per column when
    # batched=True. Normalise to lists so both cases share one code path.
    is_batched = isinstance(spoken_english, (list, tuple))
    if not is_batched:
        before = [before]
        after = [after]
        equation = [equation]
        spoken_english = [spoken_english]

    # Prepend a task-specific prompt here if necessary,
    # e.g. "translate English to LaTeX:".
    inputs = [
        f"{b} {s} {a}" for b, s, a in zip(before, spoken_english, after)
    ]
    outputs = [
        f"{b} {e} {a}" for b, e, a in zip(before, equation, after)
    ]

    # `text_target` replaces the deprecated `as_target_tokenizer()` context
    # manager and stores the target ids under "labels". No `return_tensors`:
    # plain lists serialise cleanly and avoid a spurious leading batch
    # dimension of size 1 on every stored row.
    encoded = tokenizer(
        inputs,
        text_target=outputs,
        max_length=max_length,
        truncation=True,
        padding="max_length",  # every sequence is padded to max_length
    )

    if is_batched:
        return {field: rows for field, rows in encoded.items()}

    # Single-row call: unwrap the one-element batch so each field is flat.
    return {field: rows[0] for field, rows in encoded.items()}

In [None]:
# Run the tokenization step over the train split. No extra map options are
# set, so the library defaults apply.
ds_train_preprocessed = ds_train.map(function=preprocess_data)

In [None]:
# Write the tokenized train split to a directory next to the notebook.
tokenized_dataset_dir = "./tokenized_dataset"
ds_train_preprocessed.save_to_disk(tokenized_dataset_dir)