In [None]:
import re
import pandas as pd
from datasets import Dataset

## LOAD DATA

In [None]:
df = pd.read_parquet("../classification/data/en_poems.parquet")
df = df.astype({"title": "string", "text": "string", "author": "string"})
df.dtypes

## PREPROCESS THE DATA

In [None]:
def preprocess_text(text: str) -> list[str]:
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text) # remove punctuation
    tokens = text.split()
    return tokens

df["tokens"] = df["text"].apply(preprocess_text)
df

In [None]:
dataset = Dataset.from_pandas(df[["text"]])

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

tokenized_dataset = dataset.map(tokenize_function, batched=True)

In [None]:
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
from transformers import DataCollatorForLanguageModeling


data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

model = AutoModelForCausalLM.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
    data_collator=data_collator,
)

trainer.train()

In [None]:
trainer.save_model("./gpt2-poem-model")
tokenizer.save_pretrained("./gpt2-poem-model")

In [23]:
from transformers import pipeline

generator = pipeline("text-generation", model="./gpt2-poem-model", tokenizer="./gpt2-poem-model")

prompt = "qwioudhjqwoije"
results = generator(prompt, max_length=100, num_return_sequences=1, do_sample=True, top_k=50, top_p=0.95)

print(results[0]["generated_text"])

Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


qwioudhjqwoije, a city within a thousand miles,
Where the sun shines, like a star-light on a mountain shore
Where the waters are flowing, and the waves are breaking,
Was a place of wealth, and of fame, and of renown;
The first town to break for wealth the river Nile of the Nile,
The first city with renown, the first great city built in the world;
And it was all but a dream to
