In [None]:
import textwrap

from IPython.display import Markdown, display
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, set_seed

# Load the pretrained checkpoint and its matching tokenizer
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Wrap both in a text-generation pipeline
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Sampling (do_sample=True) is stochastic: fix the RNG seed so that
# Restart Kernel -> Run All reproduces the same generated text.
set_seed(42)

# Generate a continuation of the prompt
output = generator(
    "Once upon a time,",
    max_new_tokens=60,
    do_sample=True,
    top_k=50,
    temperature=0.9,
    # No pad token is set on this tokenizer in this cell, so name the
    # padding id explicitly (EOS) instead of relying on generate()'s
    # implicit fallback and its warning.
    pad_token_id=tokenizer.eos_token_id,
)

# Render the generated text as Markdown
print("\nMarkdown Output:\n")
display(Markdown(output[0]["generated_text"]))


In [None]:
from datasets import load_dataset

# Build a dataset from the plain-text file, registered under the "train" split
dataset = load_dataset(
    "text",
    data_files={"train": "oracle_lines.txt"},
)

# Peek at the first training record
print(dataset["train"][0])


In [None]:
from transformers import AutoTokenizer

import pprint

# Fresh tokenizer for fine-tuning. EOS is reused as the padding token so that
# the padding="max_length" request in tokenize_function has a token to pad with.
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token


def tokenize_function(example):
    """Tokenize the "text" field into fixed-length encodings.

    Every sequence is truncated to at most 128 tokens and padded up to
    128 tokens, using the module-level ``tokenizer``.

    Args:
        example: mapping with a "text" entry. The function is applied through
            ``dataset.map(..., batched=True)``, so this is presumably a batch
            (list) of lines rather than a single string -- TODO confirm.

    Returns:
        Whatever the tokenizer call returns for that input.
    """
    return tokenizer(
        example["text"],
        max_length=128,
        padding="max_length",
        truncation=True,
    )


# Run the tokenizer over the dataset in batched mode
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Preview the first tokenized training record
pprint.pprint(tokenized_dataset["train"][0])


In [8]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Batch builder for next-token prediction: mlm=False selects the causal
# (GPT-style) objective rather than masked language modeling.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training configuration, grouped by concern
training_args = TrainingArguments(
    # Checkpointing: where to write, how often, and how many to keep
    output_dir="./oracle-gpt2",
    overwrite_output_dir=True,
    save_steps=10,
    save_total_limit=1,
    # Schedule
    num_train_epochs=5,
    per_device_train_batch_size=4,
    # Reporting
    logging_steps=5,
    prediction_loss_only=True,
)

# Wire the pieces together. `model` is the distilgpt2 instance loaded in the
# first cell; only the "train" split of the tokenized data is used.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
)


In [9]:
# Run fine-tuning with the Trainer configured above. The call is the cell's
# last expression, so its return value (a TrainOutput with the global step,
# training loss and metrics) is what the notebook displays.
trainer.train()


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
5,4.1817
10,3.1499


TrainOutput(global_step=10, training_loss=3.665810298919678, metrics={'train_runtime': 20.3138, 'train_samples_per_second': 1.969, 'train_steps_per_second': 0.492, 'total_flos': 1306483752960.0, 'train_loss': 3.665810298919678, 'epoch': 5.0})