In [None]:
import textwrap

from IPython.display import Markdown, display
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, set_seed

# Load the pretrained causal language model and its matching tokenizer.
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Wrap model + tokenizer in a text-generation pipeline.
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Sampling (do_sample=True) is stochastic: seed the RNGs right before generating
# so that Restart Kernel -> Run All reproduces the same continuation.
set_seed(42)

# Generate a sampled continuation of the prompt.
output = generator(
    "Once upon a time,",
    max_new_tokens=60,   # cap on newly generated tokens (prompt not counted)
    do_sample=True,      # sample instead of greedy decoding
    top_k=50,            # restrict sampling to the 50 most likely tokens
    temperature=0.9,
    # Name the pad token explicitly (reusing EOS) instead of leaving generate()
    # to pick a fallback; a later cell assigns the same pad token by hand,
    # presumably because this checkpoint defines none - confirm.
    pad_token_id=tokenizer.eos_token_id,
)

# Render the first (and only) returned sequence as Markdown.
print("\nMarkdown Output:\n")
display(Markdown(output[0]["generated_text"]))


In [None]:
from datasets import load_dataset

# Read oracle_lines.txt through the generic "text" builder and expose it as the
# "train" split.
dataset = load_dataset(
    path="text",
    data_files={"train": "oracle_lines.txt"},
)

# Sanity check: print the first record of the train split.
first_record = dataset["train"][0]
print(first_record)


In [None]:
from transformers import AutoTokenizer

# Fetch the distilgpt2 tokenizer used for dataset preparation.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path="distilgpt2")

# The tokenize step pads to a fixed length, which needs a pad token; reuse the
# end-of-sequence token for it (presumably the checkpoint defines none - confirm).
tokenizer.pad_token = tokenizer.eos_token

# Tokenize function
def tokenize_function(example, *, max_length=128):
    """Tokenize the ``"text"`` field to a fixed length.

    Passed to ``dataset.map(..., batched=True)`` in this notebook, so
    ``example["text"]`` is expected to be a batch of strings rather than a
    single string (confirm against the ``datasets`` version in use).

    Args:
        example: Mapping with a ``"text"`` entry to tokenize.
        max_length: Length every sequence is truncated or padded to. It is
            keyword-only so positional extras that ``map`` may pass (e.g.
            indices) can never be mistaken for it. Defaults to 128; override
            via ``dataset.map(..., fn_kwargs={"max_length": N})``.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch.
    """
    return tokenizer(
        example["text"],
        truncation=True,        # cut sequences longer than max_length
        padding="max_length",   # pad shorter ones up to max_length
        max_length=max_length,
    )

# Apply the tokenizer to the dataset. batched=True makes map() hand
# tokenize_function batches of rows instead of one row at a time.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Preview the first tokenized training row. Its token lists are padded to a
# fixed length, and pprint's default layout prints one list item per line, so
# compact=True is used to pack as many items per line as fit.
import pprint
pprint.pprint(tokenized_dataset["train"][0], compact=True)
