In [None]:
from datasets import load_dataset
from dotenv import load_dotenv
import os

# Load variables from a local .env file (if present) into the environment so
# the Hugging Face access token can be picked up without hardcoding it.
load_dotenv()

# Fetch the dataset from the Hub; the token is read from HF_TOKEN and is None
# when that variable is not set.
dataset = load_dataset(
    "Cheukting/math-meta-reasoning-cleaned",
    token=os.environ.get("HF_TOKEN"),
)

# Show the dataset summary as the cell output.
dataset

In [None]:
from transformers import GPT2Tokenizer

# Pretrained GPT-2 tokenizer pulled from the Hub.
tokenizer = GPT2Tokenizer.from_pretrained(
    "openai-community/gpt2",
)

# Reuse the end-of-sequence token as the padding token so that the
# `padding='max_length'` tokenization below has a token to pad with.
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    """Tokenize the ``'text'`` field of a batch of examples.

    Relies on the module-level ``tokenizer``. Every sequence is truncated
    and padded to exactly 512 tokens.

    Args:
        examples: Mapping with a ``'text'`` entry holding the raw strings.

    Returns:
        Whatever ``tokenizer`` returns for the batch.
    """
    texts = examples['text']
    encoded = tokenizer(
        texts,
        max_length=512,
        padding='max_length',
        truncation=True,
    )
    return encoded

# Apply the tokenizer to every split; batched=True hands the function a batch
# of examples per call instead of one example at a time.
tokenized_datasets = dataset.map(
    function=tokenize_function,
    batched=True,
)

In [None]:
# Work on shard 0 of 100 of the training split, then hold out 20% of that
# shard for evaluation.
#
# The explicit seed pins the shuffle so the same rows land in train/test on
# every kernel restart. Without it the split changes per run, and resuming
# training from a checkpoint could train on rows that were previously part of
# the evaluation set.
tokenized_datasets_split = (
    tokenized_datasets["train"]
    .shard(num_shards=100, index=0)
    .train_test_split(test_size=0.2, shuffle=True, seed=42)
)

# Show the resulting train/test summary as the cell output.
tokenized_datasets_split

In [None]:
from transformers import TrainingArguments
# Training configuration handed to the Trainer.
training_args = TrainingArguments(
    # Where checkpoints and other training outputs are written.
    output_dir='results',
    # Checkpoint / logging cadence, in steps.
    save_steps=500,
    logging_steps=100,
    # Schedule and regularisation.
    num_train_epochs=5,
    warmup_steps=100,
    weight_decay=0.01,
    # Batch sizes per device for training and evaluation.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Pinned memory is turned off for the data loaders.
    dataloader_pin_memory=False,
)

In [None]:
from transformers import GPT2LMHeadModel, Trainer, DataCollatorForLanguageModeling

# Collator that assembles batches for language modelling; mlm=False turns off
# masked-language-model masking.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Pretrained GPT-2 with its language-modelling head, from the same Hub
# checkpoint name as the tokenizer.
model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2")

# Wire the model, the training configuration, the train/test splits and the
# batch collator into a single Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    eval_dataset=tokenized_datasets_split['test'],
    train_dataset=tokenized_datasets_split['train'],
)

# `resume_from_checkpoint=True` makes the Trainer raise a ValueError when the
# output directory holds no checkpoint, which is always the case on a first
# run. Look the checkpoint up explicitly instead: resume when one exists,
# otherwise start training from the pretrained weights.
from transformers.trainer_utils import get_last_checkpoint

# get_last_checkpoint lists the directory, so guard against it not existing yet.
last_checkpoint = (
    get_last_checkpoint(training_args.output_dir)
    if os.path.isdir(training_args.output_dir)
    else None
)

trainer.train(resume_from_checkpoint=last_checkpoint)

In [None]:
# Keep the evaluation metrics in a variable: only the last expression of a
# cell is displayed, so calling evaluate() before save_model() without binding
# the result would silently discard the metrics.
eval_metrics = trainer.evaluate(tokenized_datasets_split['test'])

# Persist the fine-tuned model.
trainer.save_model("./trained_model")

# Show the metrics as the cell output.
eval_metrics

In [None]:
# Write the model to ./trained_model. The preceding cell saves to this same
# directory, so this only matters when this cell is run on its own.
trainer.save_model(output_dir="./trained_model")