## IMPORTS

In [1]:
import re
import pandas as pd
from datasets import Dataset
import evaluate
from transformers import pipeline
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
from transformers import DataCollatorForLanguageModeling
from sentence_transformers import SentenceTransformer, util
from sklearn.model_selection import train_test_split
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


## LOAD DATA

In [2]:
df = pd.read_parquet("../classification/data/en_poems.parquet")
df = df.astype({"title": "string", "text": "string", "author": "string"})
df.dtypes

title     string[python]
text      string[python]
author    string[python]
dtype: object

## PREPROCESS THE DATA

In [3]:
def preprocess_text(text: str) -> list[str]:
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text) # remove punctuation
    tokens = text.split()
    return tokens

df["tokens"] = df["text"].apply(preprocess_text)
df

Unnamed: 0,title,text,author,tokens
0,Song for an Unwritten Play.,"The moon's a drowsy fool to-night, Wrapped in ...","Shanks, Edward","[the, moons, a, drowsy, fool, tonight, wrapped..."
1,The Cup.,As a hot traveller Going through stones and sa...,"Shanks, Edward","[as, a, hot, traveller, going, through, stones..."
2,A Rhymeless Song.,Rhyme with its jingle still betrays The song t...,"Shanks, Edward","[rhyme, with, its, jingle, still, betrays, the..."
3,Meadow and Orchard.,"My heart is like a meadow, Where clouds go ove...","Shanks, Edward","[my, heart, is, like, a, meadow, where, clouds..."
4,Who thinks that he possesses.,Who thinks that he possesses His mistress with...,"Shanks, Edward","[who, thinks, that, he, possesses, his, mistre..."
...,...,...,...,...
41801,XXVIII.,Sole Maker of the Worlds! They lay A barren bl...,"De Vere, Aubrey","[sole, maker, of, the, worlds, they, lay, a, b..."
41802,XXIX.,When from beneath the Almighty Hand The suns a...,"De Vere, Aubrey","[when, from, beneath, the, almighty, hand, the..."
41803,XXX.,"A woman “clothed with the sun,” Yet fleeing fr...","De Vere, Aubrey","[a, woman, clothed, with, the, sun, yet, fleei..."
41804,XXXI.,No ray of all their silken sheen The leaves fi...,"De Vere, Aubrey","[no, ray, of, all, their, silken, sheen, the, ..."


## CREATE THE DATASETS

In [4]:
train_df, val_df = train_test_split(df[["text"]], test_size=0.05, random_state=42)
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)


tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_val = val_dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 39715/39715 [00:05<00:00, 6764.81 examples/s]
Map: 100%|██████████| 2091/2091 [00:00<00:00, 6508.67 examples/s]


In [5]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

model = AutoModelForCausalLM.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))

"""training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)"""

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_steps=50,
    save_strategy="epoch",
    fp16=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
)

trainer.train()

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,3.8888,3.826202
2,3.735,3.750368
3,3.6704,3.711607
4,3.5971,3.687578
5,3.5612,3.671891
6,3.5161,3.658936
7,3.4957,3.65316
8,3.4706,3.64795
9,3.4216,3.646008


TrainOutput(global_step=12410, training_loss=3.597428992100054, metrics={'train_runtime': 6211.034, 'train_samples_per_second': 63.943, 'train_steps_per_second': 1.998, 'total_flos': 2.5924023631872e+16, 'train_loss': 3.597428992100054, 'epoch': 9.992547834843908})

## SAVE THE MODEL

In [7]:
trainer.save_model("./gpt2-poem-model")
tokenizer.save_pretrained("./gpt2-poem-model")

('./gpt2-poem-model/tokenizer_config.json',
 './gpt2-poem-model/special_tokens_map.json',
 './gpt2-poem-model/vocab.json',
 './gpt2-poem-model/merges.txt',
 './gpt2-poem-model/added_tokens.json',
 './gpt2-poem-model/tokenizer.json')

## GENERATE A POEM

In [8]:
generator = pipeline("text-generation", model="./gpt2-poem-model", tokenizer="./gpt2-poem-model")

prompt = "It never ends"
results = generator(prompt, max_length=100, num_return_sequences=1, do_sample=True, top_k=50, top_p=0.95)

print(results[0]["generated_text"])

Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


It never ends,
And never has been;
For the first time, I am old,
And in this world's past;
The world's past.
And when it stops,
And a new thing comes in,
I will lie down at my bedside,
I will lie down at my bedside,
And, with a sigh, I will turn and walk,
And all the world shall know.
Yes, I will lie down at my bedside


## EVALUATION METRICS

In [9]:
model = SentenceTransformer("all-MiniLM-L6-v2")

prompts = [
    "It never ends",
    "The moonlight dances",
    "Darkness falls quickly",
    "Beneath the willow tree",
    "Whispers in the wind",
    "I dreamed of fire",
    "The silence grew louder",
    "Stars fell like rain",
    "Time forgets no one",
    "A rose in winter",
    "Shadows crawl at dawn",
    "My heart is a lantern",
    "Echoes of your name",
    "Frozen in memory",
    "We walked on glass",
    "The sky swallowed the sun",
    "Love fades to smoke",
    "Buried beneath the snow",
    "A storm without sound",
    "Hope wears thin threads"
]

all_poems = df["text"].tolist()

poem_embeddings = model.encode(all_poems, convert_to_tensor=True)

best_refs = []

for prompt in tqdm(prompts):
    prompt_embedding = model.encode(prompt, convert_to_tensor=True)
    similarities = util.cos_sim(prompt_embedding, poem_embeddings)[0]
    best_index = similarities.argmax().item()
    best_poem = all_poems[best_index]
    best_refs.append(best_poem)
    #print(f"\nPrompt: {prompt}\nBest Reference Poem:\n{best_poem}\n{'-'*80}")

100%|██████████| 20/20 [00:00<00:00, 214.04it/s]


In [10]:
generator = pipeline("text-generation", model="./gpt2-poem-model", tokenizer="./gpt2-poem-model")

generated_poems = [
    generator(prompt, max_length=100, num_return_sequences=1, do_sample=True, top_k=50, top_p=0.95)[0]["generated_text"]
    for prompt in prompts
]

# Load metrics
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")

# Evaluate all
rouge_score = rouge.compute(predictions=generated_poems, references=best_refs)
bert_score = bertscore.compute(predictions=generated_poems, references=best_refs, lang="en")

# Print results
print("ROUGE Scores:")
for key, val in rouge_score.items():
    print(f"{key}: {round(val, 4)}")

print("BERTScore (averaged):")
print("Precision:", round(sum(bert_score["precision"]) / len(bert_score["precision"]), 4))
print("Recall:", round(sum(bert_score["recall"]) / len(bert_score["recall"]), 4))
print("F1:", round(sum(bert_score["f1"]) / len(bert_score["f1"]), 4))


Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ROUGE Scores:
rouge1: 0.1917
rouge2: 0.0179
rougeL: 0.1262
rougeLsum: 0.1844
BERTScore (averaged):
Precision: 0.838
Recall: 0.8277
F1: 0.8328


## METRICS ANALYSIS

The ROUGE scores being a bit low tell us that our generated poems are different in the exact word choice and structure form from the references. We believe it is not something to be worried about since poetry should be really free in terms of choices.

On the other hand, for BERT, we believe that an F1 >0.83 is consistently strong, especially across the 20 different prompts we made. The model has a decent semantic alignement and it shows that it can produce meaningful and related poetry. It is not just grammatically coherent but also thematically and semantically grounded.
