In [None]:
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Trainer, TrainingArguments
import json
import pandas as pd
from datasets import load_dataset, load_from_disk

In [None]:
# Delete the logs, results and wandb directories under /kaggle/working (no error if they are absent).
!rm -rf /kaggle/working/logs /kaggle/working/results /kaggle/working/wandb

In [None]:
# Tokenizer and seq2seq model are both loaded from the same hub checkpoint id.
checkpoint = "google-t5/t5-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [None]:
# Seq2seq batch collator bound to the tokenizer; no model is attached (model=None).
# It is passed to the Trainer further down as `data_collator`.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=None)

In [None]:
# Load the "Kyudan/MathBridge" dataset via `datasets.load_dataset`; `ds` is indexed by split name below.
ds = load_dataset("Kyudan/MathBridge")

In [None]:
# Only the "train" split is used in the rest of the notebook.
ds_train = ds["train"]

In [None]:
def preprocess_data(examples, tok=None):
    """Tokenize MathBridge rows for English -> LaTeX seq2seq fine-tuning.

    Every row yields exactly one (source, target) text pair:

    * source: ``"translate English to LaTeX: {context_before} {spoken_English} {context_after}"``
    * target: ``"{context_before} {equation} {context_after}"``

    Args:
        examples: Column-oriented batch as passed by ``Dataset.map(..., batched=True)``:
            a mapping whose ``context_before``, ``context_after``, ``equation`` and
            ``spoken_English`` entries are equal-length lists of strings. A single row
            (plain strings instead of lists) is also accepted and treated as a batch of one.
        tok: Optional tokenizer callable. Defaults to the notebook-level ``tokenizer``.

    Returns:
        The tokenizer output for the batch; the tokenized targets are stored under
        ``labels`` by the tokenizer's ``text_target`` argument.
    """
    tok = tokenizer if tok is None else tok

    columns = {
        name: examples[name]
        for name in ("context_before", "spoken_English", "equation", "context_after")
    }
    # A non-batched call hands over plain strings; wrap them so the row-wise zip below
    # iterates over rows rather than over characters.
    if isinstance(columns["equation"], str):
        columns = {name: [value] for name, value in columns.items()}

    # Build one text pair per row. Interpolating the column lists directly would
    # collapse the whole batch into a single example containing Python list reprs.
    rows = list(zip(
        columns["context_before"],
        columns["spoken_English"],
        columns["equation"],
        columns["context_after"],
    ))
    inputs = [
        f"translate English to LaTeX: {before} {spoken} {after}"
        for before, spoken, _, after in rows
    ]
    outputs = [f"{before} {equation} {after}" for before, _, equation, after in rows]

    # No padding here: the DataCollatorForSeq2Seq used for training pads each batch,
    # so pad tokens are not baked into the stored labels.
    # `text_target` replaces the deprecated `as_target_tokenizer()` context manager.
    return tok(inputs, text_target=outputs, max_length=512, truncation=True)

In [None]:
%%writefile imp_term.txt
\frac
\cdot
\times
\neq
\sqrt
^
_
\alpha
\beta
\gamma
\delta
\epsilon
\theta
\lambda
\leq
\geq
\le
\geq
\eq
\int
\sum
\prod
\lim
\int
\notin
\dots

In [None]:
# Read the LaTeX terms used by the equation filter, one term per line.
# Blank lines are dropped: an empty string is a substring of every string, so a single
# empty entry would make the `word in equation` check succeed for every row.
# `dict.fromkeys` removes repeated terms while keeping their first-seen order.
with open('imp_term.txt', 'r', encoding='utf-8') as file:
    stripped = (raw.strip() for raw in file)
    lines = list(dict.fromkeys(term for term in stripped if term))

In [None]:
def _has_key_term(row):
    """Return True for rows whose equation exceeds 10 characters and contains a term from `lines`."""
    equation = row["equation"]
    if len(equation) <= 10:
        return False
    return any(term in equation for term in lines)


ds_train = ds_train.filter(_has_key_term)
ds_train

In [None]:
# Shuffle with a fixed seed, keep at most 200k rows, then tokenize.
# The row count is clamped to the dataset size so `select` cannot be asked for
# indices beyond the end of the filtered dataset.
n_train = min(2 * 10**5, len(ds_train))
ds_train_preprocessed = (
    ds_train.shuffle(seed=42)
    .select(range(n_train))
    .map(preprocess_data, remove_columns=ds_train.column_names, batched=True, batch_size=4)
)
ds_train_preprocessed

In [None]:
# Hyperparameters for the fine-tuning run, gathered in one mapping.
# With a per-device batch of 4 and 8 accumulation steps, each optimizer step
# covers 4 * 8 = 32 sequences per device.
# NOTE(review): `report_to=None` - what this selects depends on the installed
# transformers version; confirm it matches the intended logging backend.
hyperparams = dict(
    output_dir='./results',
    logging_dir='./logs',
    run_name="T5_Finetune",
    report_to=None,
    num_train_epochs=5,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    learning_rate=1e-3,
    warmup_steps=500,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_strategy="epoch",
    fp16=True,  # mixed-precision training
)
training_args = TrainingArguments(**hyperparams)

# Wire the model, tokenized training set, collator and tokenizer into the Trainer.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=ds_train_preprocessed,
    args=training_args,
)

In [None]:
import os

import wandb

# Never commit the W&B API key to the notebook. Provide it through the
# WANDB_API_KEY environment variable instead; when it is unset, `key=None` lets
# wandb fall back to its own credential lookup or an interactive prompt.
# The key that was previously hard-coded here is exposed and should be revoked.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

In [None]:
# Run fine-tuning with the Trainer configured above; the call's return value is shown as the cell output.
trainer.train()

In [None]:
# Write the model and the tokenizer into the same directory.
export_dir = "./fine_tuned_t5"
model.save_pretrained(export_dir)
tokenizer.save_pretrained(export_dir)