In [1]:
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Trainer, TrainingArguments
import json
import pandas as pd
from datasets import load_dataset, load_from_disk

In [2]:
!rm -rf /kaggle/working/logs /kaggle/working/results /kaggle/working/wandb

  pid, fd = os.forkpty()


In [3]:
# Load model directly
cache_dir = "/kaggle/working/my_model_dir"
tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base", cache_dir=cache_dir)
model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-base", cache_dir=cache_dir)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [4]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=None)

In [5]:
cache_dir = "/kaggle/working/my_data_dir"

ds = load_dataset("Kyudan/MathBridge",  cache_dir=cache_dir)

Downloading readme:   0%|          | 0.00/66.0 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/968M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/23195831 [00:00<?, ? examples/s]

In [6]:
ds_train = ds["train"]

In [7]:
def preprocess_data(examples):
    before = examples["context_before"]
    after = examples["context_after"]
    equation = examples["equation"]
    spoken_English = examples["spoken_English"]

    # Prepend a task-specific prompt if necessary, e.g., "translate English to LaTeX:"
#     inputs = [f"translate English to LaTeX: {before} {spoken_English} {after}"]
    inputs = [f"translate English to LaTeX: {spoken_English}"]
    
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding=True)

#     outputs = [f"{before} {equation} {after}"]
    outputs = [f"{equation}"]
    
    with tokenizer.as_target_tokenizer():
        model_outputs = tokenizer(outputs, max_length=512, truncation=True, padding=True)

    model_inputs["labels"] = model_outputs["input_ids"]

    return model_inputs

In [8]:
ds_train = ds_train.filter(lambda x: len(x["equation"])>10)
ds_train

Filter:   0%|          | 0/23195831 [00:00<?, ? examples/s]

Dataset({
    features: ['context_before', 'equation', 'context_after', 'spoken_English'],
    num_rows: 9458549
})

In [9]:
ds_train_preprocessed = (ds_train.shuffle(seed=42)
                                 .select(range(2*10**5))
                                 .map(preprocess_data, remove_columns=ds_train.column_names, batched=True, batch_size=4))
ds_train_preprocessed

Map:   0%|          | 0/200000 [00:00<?, ? examples/s]



Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 50000
})

In [10]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    save_strategy="epoch",
    logging_strategy="epoch",
    report_to=None,
    run_name="T5_Finetune"
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds_train_preprocessed,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [11]:
import wandb
wandb.login(key="91a0db028dce6f175361702b5140fa9c941bf8ff")

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [12]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mdeeptanshumalu[0m ([33mdeeptanshu-malu[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: wandb version 0.18.0 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.17.7
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20240916_035903-j8zmytsf[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mT5_Finetune[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/deeptanshu-malu/huggingface[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/deeptanshu-malu/huggingface/runs/j8zmytsf[0m
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Step,Training Loss
3125,0.5774
6250,0.3287
9375,0.3012


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


TrainOutput(global_step=9375, training_loss=0.40242677734375, metrics={'train_runtime': 6394.5109, 'train_samples_per_second': 23.458, 'train_steps_per_second': 1.466, 'total_flos': 1.715169861623808e+16, 'train_loss': 0.40242677734375, 'epoch': 3.0})

In [13]:
model.save_pretrained("./fine_tuned_t5")
tokenizer.save_pretrained("./fine_tuned_t5")

('./fine_tuned_t5/tokenizer_config.json',
 './fine_tuned_t5/special_tokens_map.json',
 './fine_tuned_t5/spiece.model',
 './fine_tuned_t5/added_tokens.json',
 './fine_tuned_t5/tokenizer.json')