In [1]:
from datasets import load_dataset
from dotenv import load_dotenv
import os

# Load variables from a local .env file (if any) so HF_TOKEN is visible to os.getenv.
load_dotenv()

# os.getenv returns None when HF_TOKEN is unset; the token is passed through as-is.
dataset = load_dataset(
    "allenai/math-meta-reasoning-cleaned",
    token=os.getenv("HF_TOKEN"),
)
dataset

Using the latest cached version of the dataset since allenai/math-meta-reasoning-cleaned couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /Users/cheuktingho/.cache/huggingface/datasets/allenai___math-meta-reasoning-cleaned/default/0.0.0/7044eb3a39d07dabcd97f1d14f40a5d421bf5ae0 (last modified on Wed Aug  6 11:30:30 2025).


DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'token_count'],
        num_rows: 987485
    })
})

In [2]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2")
# Reuse EOS as the padding token so that batches can be padded later on
# (the GPT-2 tokenizer does not define a pad token by default).
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    """Tokenize a batch of rows from the dataset.

    Args:
        examples: a batch (dict of column name -> list of values) as passed by
            `Dataset.map(..., batched=True)`; only the 'text' column is read.

    Returns:
        The tokenizer output ('input_ids' and 'attention_mask'), with each
        text truncated to at most 512 tokens.

    No padding is applied here. Padding every row to 512 tokens at
    preprocessing time bloats the cached dataset and wastes compute on pad
    positions; the DataCollatorForLanguageModeling used for training pads
    each batch to its longest sequence instead.
    """
    return tokenizer(examples['text'], truncation=True, max_length=512)

# NOTE(review): this tokenizes every row of every split, although only one
# shard out of 100 is used for training further down — consider sharding first.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [3]:
# Keep shard 0 of 100 (roughly 1% of the rows), then hold out 20% for evaluation.
# The split is seeded so that every kernel session produces the same train/test
# partition: training is resumed from checkpoints across sessions, and an
# unseeded shuffle would let rows seen during earlier training end up in a
# later session's test set.
tokenized_datasets_split = (
    tokenized_datasets["train"]
    .shard(num_shards=100, index=0)
    .train_test_split(test_size=0.2, shuffle=True, seed=42)
)
tokenized_datasets_split

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'token_count', 'input_ids', 'attention_mask'],
        num_rows: 7900
    })
    test: Dataset({
        features: ['id', 'text', 'token_count', 'input_ids', 'attention_mask'],
        num_rows: 1975
    })
})

In [4]:
from transformers import TrainingArguments
# Hyperparameters and bookkeeping settings handed to the Trainer.
training_args = TrainingArguments(
    # Where checkpoints are written.
    output_dir='results',
    save_steps=500,
    logging_steps=100,
    # Optimisation schedule.
    num_train_epochs=5,
    warmup_steps=100,
    weight_decay=0.01,
    # Batch sizes per device.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # NOTE(review): presumably disabled because the local device is not CUDA — confirm.
    dataloader_pin_memory=False,
)

In [5]:
from transformers import GPT2LMHeadModel, Trainer, DataCollatorForLanguageModeling
from transformers.trainer_utils import get_last_checkpoint

model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2")
# mlm=False selects causal language modelling (labels are built from input_ids).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets_split['train'],
    eval_dataset=tokenized_datasets_split['test'],
    data_collator=data_collator,
)

# Resume only when a checkpoint actually exists. `resume_from_checkpoint=True`
# raises a ValueError if `output_dir` contains no checkpoint, which breaks a
# fresh "Restart & Run All"; passing None instead starts training from scratch.
# The directory check guards get_last_checkpoint against a missing output_dir.
last_checkpoint = (
    get_last_checkpoint(training_args.output_dir)
    if os.path.isdir(training_args.output_dir)
    else None
)
# NOTE(review): if the last checkpoint is already at the final step, train()
# appears to return immediately without doing any work (see the captured
# TrainOutput: runtime ~0.002s, training_loss 0.0).
trainer.train(resume_from_checkpoint=last_checkpoint)

There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Step,Training Loss


TrainOutput(global_step=4940, training_loss=0.0, metrics={'train_runtime': 0.0019, 'train_samples_per_second': 21091662.381, 'train_steps_per_second': 2637792.713, 'total_flos': 1.0321035264e+16, 'train_loss': 0.0, 'epoch': 5.0})

In [6]:
# Keep the metrics in a variable: only a cell's last expression is displayed,
# so calling evaluate() ahead of save_model() would otherwise discard them.
eval_metrics = trainer.evaluate(tokenized_datasets_split['test'])
trainer.save_model("./trained_model")
eval_metrics

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


{'eval_loss': 0.9974642992019653,
 'eval_runtime': 213.0766,
 'eval_samples_per_second': 9.269,
 'eval_steps_per_second': 1.159,
 'epoch': 5.0}

In [7]:
# Writes the model to ./trained_model (the same directory the previous cell saves to).
trainer.save_model(output_dir="./trained_model")