## IMPORTS

In [1]:
import re
import pandas as pd
from datasets import Dataset
import evaluate
from transformers import pipeline
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
from transformers import DataCollatorForLanguageModeling
from sentence_transformers import SentenceTransformer, util
from sklearn.model_selection import train_test_split
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


## LOAD DATA AND TRANSLATED DATA, THEN MERGE BOTH

In [2]:
df1 = pd.read_parquet("../../data/en_poems.parquet")
df2 = pd.read_parquet("../../data/de_translated_en.parquet")
df = pd.concat([df1, df2], ignore_index=True)
df = df.astype({"title": "string", "text": "string", "author": "string"})
df.dtypes

title       string[python]
text        string[python]
author      string[python]
creation            object
dtype: object

In [3]:
len(df)

79959

## CREATE THE DATASETS

In [4]:
train_df, val_df = train_test_split(df[["text"]], test_size=0.05, random_state=42)
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)


tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_val = val_dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 75961/75961 [00:08<00:00, 8462.78 examples/s]
Map: 100%|██████████| 3998/3998 [00:00<00:00, 8985.72 examples/s]


In [5]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

model = AutoModelForCausalLM.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_steps=50,
    save_strategy="epoch",
    fp16=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
)

trainer.train()

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,4.0281,3.997821
2,3.9135,3.93495
3,3.864,3.901535
4,3.8158,3.885563
5,3.7948,3.880372


TrainOutput(global_step=11870, training_loss=3.911005846391532, metrics={'train_runtime': 5970.6407, 'train_samples_per_second': 63.612, 'train_steps_per_second': 1.988, 'total_flos': 2.481000505344e+16, 'train_loss': 3.911005846391532, 'epoch': 5.0})

## SAVE THE MODEL

In [6]:
trainer.save_model("./gpt2-extended-data-poem-model")
tokenizer.save_pretrained("./gpt2-extended-data-poem-model")

('./gpt2-extended-data-poem-model/tokenizer_config.json',
 './gpt2-extended-data-poem-model/special_tokens_map.json',
 './gpt2-extended-data-poem-model/vocab.json',
 './gpt2-extended-data-poem-model/merges.txt',
 './gpt2-extended-data-poem-model/added_tokens.json',
 './gpt2-extended-data-poem-model/tokenizer.json')

## GENERATE A POEM

In [5]:
generator = pipeline("text-generation", model="./gpt2-extended-data-poem-model", tokenizer="./gpt2-extended-data-poem-model")

prompt = "It never ends"
results = generator(prompt, max_length=100, num_return_sequences=1, do_sample=True, top_k=50, top_p=0.95)

print(results[0]["generated_text"])

Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


It never ends, nor do I go,
I have nothing but grief and pain,
That comes to me through the night —
With my dead and sorrow, and a ghost
That calls me to his, and I am alone
To him I love.
I would never have seen a thing
So fair and so beautiful,
How beautiful would I have loved them all,
How beautiful for a mother
To see such a child come to her.
I would have passed


## EVALUATION METRICS

In [3]:
model = SentenceTransformer("all-MiniLM-L6-v2")

prompts = [
    "It never ends",
    "The moonlight dances",
    "Darkness falls quickly",
    "Beneath the willow tree",
    "Whispers in the wind",
    "I dreamed of fire",
    "The silence grew louder",
    "Stars fell like rain",
    "Time forgets no one",
    "A rose in winter",
    "Shadows crawl at dawn",
    "My heart is a lantern",
    "Echoes of your name",
    "Frozen in memory",
    "We walked on glass",
    "The sky swallowed the sun",
    "Love fades to smoke",
    "Buried beneath the snow",
    "A storm without sound",
    "Hope wears thin threads"
]

all_poems = df["text"].tolist()

poem_embeddings = model.encode(all_poems, convert_to_tensor=True)

best_refs = []

for prompt in prompts:
    prompt_embedding = model.encode(prompt, convert_to_tensor=True)
    similarities = util.cos_sim(prompt_embedding, poem_embeddings)[0]
    best_index = similarities.argmax().item()
    best_poem = all_poems[best_index]
    best_refs.append(best_poem)
    #print(f"\nPrompt: {prompt}\nBest Reference Poem:\n{best_poem}\n{'-'*80}")

In [7]:
#generator = pipeline("text-generation", model="./gpt2-extended-data-poem-model", tokenizer="./gpt2-extended-data-poem-model")

generated_poems = [
    generator(prompt, max_length=100, num_return_sequences=1, do_sample=True, top_k=50, top_p=0.95)[0]["generated_text"]
    for prompt in prompts
]

rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")

rouge_score = rouge.compute(predictions=generated_poems, references=best_refs)
bert_score = bertscore.compute(predictions=generated_poems, references=best_refs, lang="en")

print("ROUGE Scores:")
for key, val in rouge_score.items():
    print(f"{key}: {round(val, 4)}")

print("BERTScore (averaged):")
print("Precision:", round(sum(bert_score["precision"]) / len(bert_score["precision"]), 4))
print("Recall:", round(sum(bert_score["recall"]) / len(bert_score["recall"]), 4))
print("F1:", round(sum(bert_score["f1"]) / len(bert_score["f1"]), 4))


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ROUGE Scores:
rouge1: 0.2112
rouge2: 0.0202
rougeL: 0.1396
rougeLsum: 0.2064
BERTScore (averaged):
Precision: 0.8319
Recall: 0.8302
F1: 0.8309


## METRICS ANALYSIS COMPARED TO THE OTHER NOTEBOOK

ROUGE-1:

We have a slightly better ROUGE score results, which suggests that the model is producing outputs that have more literal word overlap with references. Possibly, the added data increased lexical diversity or bias toward more common words.

BERTScore F1:

We have slightly less worse scores when it comes to BERT, which suggests that our model's semantic alignment with the reference may have slightly weakened. It could be due to the fact that the style or vocabulary of the new translated poems differs significantly from the original dataset, or that the translations introduced syntactic artifacts or inconsistencies, impacting fluency or meaning.

Conclusion:

Adding the translated poems increased lexical coverage or surface-level similarity (ROUGE-1), but might have introduced slight semantic drift (BERTScore F1) — possibly due to style, translation artifacts, or inconsistencies in tone or language quality.
