In [1]:
import os
import json
import torch as tc
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, DataCollatorWithPadding
from gpt_2_gen.utils import to_device, get_device, set_seed, generate_namespace

# Set the tokenizers parallelism flag to "false" for this kernel process.
# NOTE(review): presumably this silences the Hugging Face tokenizers
# fork/parallelism warning — confirm it is still needed.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Reload imported modules automatically before each cell runs, so edits to
# local project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Load the run configuration from the YAML file one directory up and echo it,
# so the settings used for this run are recorded in the notebook output.
# (Plain string literal: the path has no placeholders, so no f-prefix.)
cfg = generate_namespace(path="../config.yaml")
print(json.dumps(vars(cfg), indent=2))

# Seed from the configured value before any stochastic step (the sampled
# generation further down), then pick the compute device.
# NOTE(review): which RNGs set_seed covers and how get_device chooses a
# backend are defined in gpt_2_gen.utils — confirm there.
set_seed(cfg.seed)
device = get_device()

{
  "model_name": "gpt2",
  "prompt": "The future of physics is",
  "seed": 42,
  "dpi": 400,
  "fig_path": "../outputs/"
}


In [3]:
# Load the pretrained causal language model named in the config, then move it
# onto the selected compute device.
model = AutoModelForCausalLM.from_pretrained(cfg.model_name)
model = model.to(device)

In [4]:
# Tokenizer matching the model checkpoint.
tokenizer = AutoTokenizer.from_pretrained(cfg.model_name)

# Padding setup: pad on the left, reuse the EOS token as the padding token,
# and mirror the resulting pad id into the model config so both agree.
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

# Encode the configured prompt as PyTorch tensors and move them to the same
# device as the model in one step.
inputs = to_device(
    tokenizer(cfg.prompt, padding=True, truncation=True, return_tensors="pt"),
    device,
)
print(inputs)

{'input_ids': tensor([[  464,  2003,   286, 11887,   318]], device='mps:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1]], device='mps:0')}


In [5]:
# Baseline generation: no sampling options are passed, so decoding follows the
# model's default generation settings. Gradients are not needed for inference.
with tc.no_grad():
    outputs = model.generate(
        max_new_tokens=50,
        pad_token_id=tokenizer.pad_token_id,
        **inputs,
    )

# Decode every returned sequence back to text, dropping special tokens.
for text in tokenizer.batch_decode(outputs, skip_special_tokens=True):
    print(text)

The future of physics is in the hands of the people who are going to make it happen.

"We're going to have to make sure that we're not just going to be able to do this, but we're going to have to make sure that we're


## Parameters explanation:
    1. **inputs
        * Dictionary unpacking; equivalent to passing
            - input_ids=inputs["input_ids"],
            - attention_mask=inputs["attention_mask"]
    2. do_sample
        * Tells the model to sample from the probability distribution instead of always picking the highest-probability token.
        * do_sample = False -> deterministic
        * do_sample = True -> random sampling
    3. top_k
        * Sorts the logits and keeps only the k most probable tokens (k = 50 here), then samples from those.
        * Lower k -> more deterministic
        * Higher k -> more random
    4. top_p = 0.95
        * Keeps the smallest set of tokens whose cumulative probability >= 0.95
        * Can be used in conjunction with top_k: top_k is applied first, and top_p is then applied to the tokens that survive, so a token must pass both filters to be sampled.
        * Helps adapt to uncertainty.
    5. temperature
        * Divides the logits by T before the softmax, which sharpens (T < 1) or flattens (T > 1) the distribution that is sampled from
        * Lower T -> more deterministic
        * Higher T -> more random
    6. num_return_sequences
        * Generates N separate continuations (predictions) for the input text.

In [6]:
# Sampled generation: request three continuations of the same prompt with
# sampling enabled and the top-k / top-p / temperature settings below.
with tc.no_grad():
    outputs = model.generate(
        **inputs,
        pad_token_id=tokenizer.pad_token_id,
        max_new_tokens=50,
        num_return_sequences=3,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
    )

# Print each continuation under a 1-based heading, followed by blank lines.
for idx, seq in enumerate(outputs):
    text = tokenizer.decode(seq, skip_special_tokens=True)
    print(f"Seq. {idx + 1}:")
    print(text)
    print("\n")

Seq. 1:
The future of physics is still in its infancy, but with a lot of new knowledge, new technologies, and new ideas, it is likely that the next big breakthrough will be the development of a particle accelerator.

In this post, we will explore the idea of a


Seq. 2:
The future of physics is not the same as it was twenty-five years ago. It is not an easy time to live in, and it is not an easy place to grow up. It is not the way of life we know today.

Today I am writing


Seq. 3:
The future of physics is going to be interesting, but it has to be a lot more interesting than what we've been doing," said one of the researchers involved in the work, J.D. Smith. "We need to understand and improve on things that are not being




In [7]:
def _tokenize_batch(batch):
    """Tokenize the "text" column of a batch, truncating to at most 128 tokens."""
    return tokenizer(batch["text"], truncation=True, max_length=128)


# Load the "imdb" dataset and tokenize it in batches; the raw "text" column is
# dropped once it has been tokenized.
dataset = load_dataset("imdb")
data_tokenized = dataset.map(
    _tokenize_batch,
    batched=True,
    remove_columns=["text"],
)

# Collator built from the same tokenizer, for padding batches at load time.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# TODO: DataLoader sketch, not yet enabled. DataLoader is not imported in this
# notebook; assumes the dataset exposes "train" and "test" splits.
# train_loader = DataLoader(
#     data_tokenized["train"],
#     batch_size=16,
#     shuffle=True,
#     collate_fn=data_collator
# )

# eval_loader = DataLoader(
#     data_tokenized["test"],
#     batch_size=16,
#     collate_fn=data_collator
# )

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]