In [1]:
import os
import json
import torch as tc
from datasets import load_dataset
from transformers import (
AutoModelForCausalLM, AutoTokenizer, 
DataCollatorForLanguageModeling, Trainer, 
TrainingArguments
)

from gpt_2_gen.utils import to_device, get_device, set_seed, generate_namespace

# Set TOKENIZERS_PARALLELISM to "false" for this kernel process.
# NOTE(review): presumably silences the Hugging Face tokenizers
# parallelism/fork warning during dataset mapping and training — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Notebook-only setup: render matplotlib figures inline and enable
# autoreload (mode 2) so edits to imported modules are picked up on re-run.
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
# Load the run configuration from the YAML file one directory up and echo it,
# so the exact settings are recorded next to the results below.
cfg = generate_namespace(path="../config.yaml")
print(json.dumps(vars(cfg), indent=2))

# Apply the configured seed first, then select the compute device.
# (Both are project helpers from gpt_2_gen.utils.)
set_seed(cfg.seed)
device = get_device()

{
  "model_name": "gpt2",
  "prompt": "The future of physics is",
  "seed": 42,
  "max_token_len": 128,
  "train_size": 2000,
  "val_size": 1000,
  "lr": 2e-05,
  "epochs": 3,
  "train_batch_size": 4,
  "eval_batch_size": 4,
  "save_step": 250,
  "eval_step": 250,
  "log_step": 100,
  "strategy": "epoch",
  "fig_path": "../outputs/",
  "res_path": "../results/"
}


In [3]:
# Load the pretrained causal LM and its matching tokenizer by name.
model = AutoModelForCausalLM.from_pretrained(cfg.model_name)
tokenizer = AutoTokenizer.from_pretrained(cfg.model_name)

# Reuse the end-of-sequence token as the padding token and mirror that
# choice in the model config so both agree on the pad id.
tokenizer.pad_token = tokenizer.eos_token
# tokenizer.padding_side = "left"
model.config.pad_token_id = tokenizer.pad_token_id
model.to(device)

# Encode the configured prompt as PyTorch tensors and move them to `device`.
encoded_prompt = tokenizer(
    cfg.prompt,
    return_tensors="pt",
    padding=True,
    truncation=True,
)
inputs = to_device(encoded_prompt, device)
print(inputs)

{'input_ids': tensor([[  464,  2003,   286, 11887,   318]], device='mps:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1]], device='mps:0')}


In [4]:
# Baseline generation with default decoding settings (no sampling arguments),
# capped at 50 newly generated tokens; gradients are not needed here.
with tc.no_grad():
    outputs = model.generate(
        **inputs,
        pad_token_id=tokenizer.pad_token_id,
        max_new_tokens=50,
    )

# Turn every generated id sequence back into text, dropping special tokens.
for text in tokenizer.batch_decode(outputs, skip_special_tokens=True):
    print(text)

The future of physics is in the hands of the people who are going to make it happen.

"We're going to have to make sure that we're not just going to be able to do this, but we're going to have to make sure that we're


## Parameters explanation:
    1. **inputs
        * Actually splits up into
            - input_ids=inputs["input_ids"],
            - attention_mask=inputs["attention_mask"]
    2. do_sample
        * Tells model to sample from probability distribution instead of picking the highest-probability token.
        do_sample = False -> deterministic
        do_sample = True -> random sampling
    3. top_k
        * Sorts the logits and keeps only the top k (here 50) most probable tokens, then samples from that set.
        * Lower k -> more deterministic
        * Higher k -> more random
    4. top_p = 0.95
        * Keeps the smallest set of tokens whose cumulative probability >= 0.95
        * Can be used in conjunction with top_k: the top_k filter is applied first, and top_p then trims the remaining candidates further, so a token must satisfy both conditions to be sampled.
        * Helps adapt to uncertainty.
    5. temperature
        * Divides the logits by T before the softmax, which sharpens (T < 1) or flattens (T > 1) the distribution that is sampled from
        * Lower T -> more deterministic
        * Higher T -> more random
    6. num_return_sequences
        * Generates N separate continuations (predictions) for the input text.

In [5]:
# Sampling configuration: stochastic decoding restricted by top-k and top-p,
# softened by temperature, producing three continuations of the same prompt.
sampling_kwargs = {
    "do_sample": True,
    "top_k": 50,
    "top_p": 0.95,
    "temperature": 0.7,
    "num_return_sequences": 3,
}

with tc.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=50,
        pad_token_id=tokenizer.pad_token_id,
        **sampling_kwargs,
    )

# Print each continuation under a 1-based heading, followed by blank lines.
decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
for idx, text in enumerate(decoded, start=1):
    print(f"Seq. {idx}:")
    print(text)
    print("\n")

Seq. 1:
The future of physics is still in its infancy, but with a lot of new knowledge, new technologies, and new ideas, it is likely that the next big breakthrough will be the development of a particle accelerator.

In this post, we will explore the idea of a


Seq. 2:
The future of physics is not the same as it was twenty-five years ago. It is not an easy time to live in, and it is not an easy place to grow up. It is not the way of life we know today.

Today I am writing


Seq. 3:
The future of physics is going to be interesting, but it has to be a lot more interesting than what we've been doing," said one of the researchers involved in the work, J.D. Smith. "We need to understand and improve on things that are not being




In [6]:
data = "imdb"
dataset = load_dataset(data)


def tokenize_fn(batch):
    """Tokenize a batch of raw examples for causal language modelling.

    Args:
        batch: Mapping with a "text" key holding a list of strings, as
            supplied by `Dataset.map(..., batched=True)`.

    Returns:
        The tokenizer output for the batch, with every sequence truncated
        or padded to exactly `cfg.max_token_len` tokens.
    """
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=cfg.max_token_len,
    )


# Drop every raw column, not just "text". The dataset also carries a
# sentiment `label` column that is meaningless for language modelling;
# if left in place, Trainer keeps it (it whitelists columns named "label")
# and it ends up in each batch as a stray keyword argument to the model.
data_tokenized = dataset.map(
    tokenize_fn,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

# mlm=False -> causal LM objective: the collator builds `labels` from
# `input_ids` rather than applying masked-token corruption.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [7]:
# Training hyperparameters, all taken from the loaded config.
training_args = TrainingArguments(
    output_dir=cfg.res_path,
    eval_strategy=cfg.strategy,
    save_strategy=cfg.strategy,
    learning_rate=cfg.lr,
    # NOTE: save_steps / eval_steps only take effect when the corresponding
    # strategy is "steps"; with the "epoch" strategy they are not used.
    save_steps=cfg.save_step,
    eval_steps=cfg.eval_step,
    logging_steps=cfg.log_step,
    per_device_train_batch_size=cfg.train_batch_size,
    per_device_eval_batch_size=cfg.eval_batch_size,
    num_train_epochs=cfg.epochs,
    overwrite_output_dir=True,
    report_to="none",
    seed=cfg.seed,
    # Force CPU only when the selected device really is the CPU, so that
    # any accelerator (MPS or CUDA) is used when available. Testing for
    # "mps" alone would silently push CUDA machines onto the CPU.
    use_cpu=(device.type == "cpu"),
)

# Fixed-size, seed-shuffled subsets keep the run short and reproducible.
train_subset = (
    data_tokenized["train"].shuffle(seed=cfg.seed).select(range(cfg.train_size))
)
eval_subset = (
    data_tokenized["test"].shuffle(seed=cfg.seed).select(range(cfg.val_size))
)

# High-level Hugging Face training wrapper
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_subset,
    eval_dataset=eval_subset,
    processing_class=tokenizer,
    data_collator=data_collator,
)

# Release cached MPS memory left over from the generation cells before training.
if device.type == "mps":
    tc.mps.empty_cache()

trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50256}.
`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,3.6887,3.654404
2,3.5778,3.653589
3,3.5165,3.658192


TrainOutput(global_step=1500, training_loss=3.6088597819010415, metrics={'train_runtime': 1213.7658, 'train_samples_per_second': 4.943, 'train_steps_per_second': 1.236, 'total_flos': 391938048000000.0, 'train_loss': 3.6088597819010415, 'epoch': 3.0})

In [14]:
from math import exp

# Run evaluation on the held-out subset and read back the reported loss.
eval_results = trainer.evaluate()
eval_loss = eval_results["eval_loss"]

# Perplexity is the exponential of the evaluation loss.
perplexity = exp(eval_loss)
print(f"Perplexity: {perplexity:.2f}")

Perplexity: 38.79


In [9]:
# print(trainer.state.best_model_checkpoint)

In [10]:
# best_ckpt = trainer.state.best_model_checkpoint
# print("Best checkpoint:", best_ckpt)

# # Check existence and file size
# if best_ckpt and os.path.exists(os.path.join(best_ckpt, "pytorch_model.bin")):
#     print("Checkpoint found and looks valid.")
# else:
#     print("Checkpoint missing or incomplete — falling back to final model.")
#     best_ckpt = None

In [11]:
# from transformers import AutoModelForCausalLM

# if best_ckpt:
#     best_model = AutoModelForCausalLM.from_pretrained(best_ckpt)
# else:
#     best_model = trainer.model  # fallback to last trained model

# best_model.to(device)
# best_model.eval()

In [12]:
# from transformers import logging
# logging.set_verbosity_info()

# best_model = AutoModelForCausalLM.from_pretrained(best_ckpt, ignore_mismatched_sizes=False)

In [13]:
# best_model.save_pretrained(cfg.res_path + "best_model_clean")
# tokenizer.save_pretrained(cfg.res_path + "best_model_clean")