In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from IPython.display import Markdown, display
import textwrap

# Load model + tokenizer
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Set up generator
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Generate text
output = generator(
    "Once upon a time,",
    max_new_tokens=60,
    do_sample=True,
    top_k=50,
    temperature=0.9
)

# Show it pretty
print("\nMarkdown Output:\n")
display(Markdown(output[0]["generated_text"]))


In [None]:
from datasets import load_dataset

# Load plain text file as a dataset
dataset = load_dataset("text", data_files={"train": "oracle_lines.txt"})

# Show one sample
print(dataset["train"][0])


In [None]:
from transformers import AutoTokenizer

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token  # Set pad token!

# Tokenize function
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=128)

# Apply to dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Preview
import pprint
pprint.pprint(tokenized_dataset["train"][0])


In [8]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Data collator helps the model learn next-token prediction
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # GPT-style = causal, not masked language modeling
)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./oracle-gpt2",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=4,
    save_steps=10,
    save_total_limit=1,
    logging_steps=5,
    prediction_loss_only=True
)

# Set up Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    data_collator=data_collator
)


In [9]:
trainer.train()


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
5,4.1817
10,3.1499


TrainOutput(global_step=10, training_loss=3.665810298919678, metrics={'train_runtime': 20.3138, 'train_samples_per_second': 1.969, 'train_steps_per_second': 0.492, 'total_flos': 1306483752960.0, 'train_loss': 3.665810298919678, 'epoch': 5.0})

In [1]:
from transformers import AutoModelForCausalLM, pipeline
from IPython.display import Markdown, display
import textwrap

# Load your fine-tuned model
finetuned_model = AutoModelForCausalLM.from_pretrained("./oracle-gpt2")

# Use the same tokenizer
generator = pipeline("text-generation", model=finetuned_model, tokenizer=tokenizer)

# Generate output
output = generator(
    "The gods whispered that",
    max_new_tokens=60,
    do_sample=True,
    top_k=50,
    temperature=0.9
)

# Clean wrap (for terminal)
print("\nWrapped Output:\n")
print(textwrap.fill(output[0]["generated_text"], width=80))

# Markdown display (notebook pretty)
display(Markdown(output[0]["generated_text"]))


OSError: Error no file named pytorch_model.bin, model.safetensors, tf_model.h5, model.ckpt.index or flax_model.msgpack found in directory ./oracle-gpt2.

In [2]:
trainer.save_model("oracle-gpt2")


NameError: name 'trainer' is not defined

In [3]:
from transformers import AutoModelForCausalLM, pipeline
from IPython.display import Markdown, display
import textwrap

# Load your fine-tuned model
finetuned_model = AutoModelForCausalLM.from_pretrained("oracle-gpt2")

# Use your tokenizer
generator = pipeline("text-generation", model=finetuned_model, tokenizer=tokenizer)

# Generate text
output = generator(
    "The gods whispered that",
    max_new_tokens=60,
    do_sample=True,
    top_k=50,
    temperature=0.9
)

# Show both wrapped and markdown output
print("\nWrapped Output:\n")
print(textwrap.fill(output[0]["generated_text"], width=80))

display(Markdown(output[0]["generated_text"]))


OSError: Error no file named pytorch_model.bin, model.safetensors, tf_model.h5, model.ckpt.index or flax_model.msgpack found in directory oracle-gpt2.

In [4]:
trainer.save_model("oracle-gpt2")


NameError: name 'trainer' is not defined

In [6]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token  # required for padding

model = AutoModelForCausalLM.from_pretrained("distilgpt2")


In [7]:
from datasets import load_dataset

dataset = load_dataset("text", data_files={"train": "oracle_lines.txt"})

def tokenize_function(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=128)

tokenized_dataset = dataset.map(tokenize_function, batched=True)


In [8]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir="./oracle-gpt2",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=4,
    save_steps=10,
    save_total_limit=1,
    logging_steps=5,
    prediction_loss_only=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    data_collator=data_collator
)


In [9]:
trainer.train()
trainer.save_model("oracle-gpt2")


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
5,4.1817
10,3.1499


In [None]:
from transformers import AutoModelForCausalLM, pipeline
from IPython.display import Markdown, display
import textwrap

finetuned_model = AutoModelForCausalLM.from_pretrained("oracle-gpt2")

generator = pipeline("text-generation", model=finetuned_model, tokenizer=tokenizer)

output = generator(
    "The gods whispered that",
    max_new_tokens=60,
    do_sample=True,
    top_k=50,
    temperature=0.9
)

print("\nWrapped Output:\n")
print(textwrap.fill(output[0]["generated_text"], width=80))
display(Markdown(output[0]["generated_text"]))


Device set to use cpu



Wrapped Output:

The gods whispered that they had won all of the stars.   ‹ Now your name belongs
to our name, ‹ You were once you were— I didn't like the darkness. ‹ Never let
you dwell. The sun never stopped. ‹ Now you live! �


The gods whispered that they had won all of the stars.


‹ Now your name belongs to our name, ‹ You were once you were—
I didn't like the darkness.
‹ Never let you dwell.
The sun never stopped.
‹ Now you live!
�

In [12]:
# Load base model (not fine-tuned)
base_model = AutoModelForCausalLM.from_pretrained("distilgpt2")

# Use same tokenizer
base_gen = pipeline("text-generation", model=base_model, tokenizer=tokenizer)

base_output = base_gen(
    "The gods whispered that",
    max_new_tokens=60,
    do_sample=True,
    top_k=50,
    temperature=0.9
)

print("\nBase Model Output:\n")
print(textwrap.fill(base_output[0]["generated_text"], width=80))


Device set to use cpu



Base Model Output:

The gods whispered that at every moment the moon had fallen upon the sky, and
the moon had fallen upon the moon. But the moon itself had fallen upon the
celestial body of the gods too.   The gods were not too much more pleased. The
moon looked in great brightness as the moon had fell upon the
