In [16]:
import pandas as pd

# Load Dataset
df = pd.read_csv("E:\\Datasets\\PoetryFoundationData.csv")  

df["Tags"].replace("", np.nan, inplace=True)
df["Theme"] = df["Tags"].fillna("")

df.to_csv("E:\\Datasets\\unlabeled_poems.csv", index=False)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Tags"].replace("", np.nan, inplace=True)


In [4]:
import pandas as pd
import torch
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import AutoTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
import re

df = pd.read_csv('E:\\Datasets\\unlabeled_poems.csv')  
df_cleaned = df.drop(columns=["Unnamed: 0", "Tags", "Poet"])

# Clean
df_cleaned["Title"] = df_cleaned["Title"].str.strip().str.replace(r"\r\n|\r|\n", " ", regex=True)
df_cleaned["Poem"] = df_cleaned["Poem"].str.strip().str.replace(r"\r\n|\r|\n", " ", regex=True)
df_cleaned["Theme"] = df_cleaned["Theme"].fillna("").str.strip().str.replace(r"\r\n|\r|\n", "", regex=True)

df_cleaned["Theme"] = df_cleaned["Theme"].apply(lambda x: x.split(",") if x else [])

# Extract unique themes
unique_themes_from_data = set(theme for themes in df_cleaned["Theme"] for theme in themes)

unique_themes_from_data = [re.sub(r'[^\w\s]', '', theme).strip() for theme in unique_themes_from_data]
df_cleaned.info(), unique_themes_from_data


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13854 entries, 0 to 13853
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Title   13854 non-null  object
 1   Poem    13854 non-null  object
 2   Theme   13854 non-null  object
dtypes: object(3)
memory usage: 324.8+ KB


(None,
 ['Money  Economics',
  'Gratitude  Apologies',
  'Infatuation  Crushes',
  'Judaism',
  'Growing Old',
  'Social Commentaries',
  'Marriage  Companionship',
  'Humor  Satire',
  'Thanksgiving',
  'Horror',
  'Gardening',
  'Memorial Day',
  'Infancy',
  'Labor Day',
  'Animals',
  'Travels  Journeys',
  'Get Well  Recovery',
  'Health  Illness',
  'Cinco de Mayo',
  'Toasts  Celebrations',
  'Ghosts  the Supernatural',
  'Reading  Books',
  'The Spiritual',
  'Christianity',
  'Heartache  Loss',
  'Romantic Love',
  'Relationships',
  'First Love',
  'Greek  Roman Mythology',
  'Rivers',
  'Friends  Enemies',
  'Music',
  'Gay',
  'Unrequited Love',
  'Heavens',
  'Sciences',
  'Race  Ethnicity',
  'Winter',
  'Queer',
  'Graduation',
  'Funerals',
  'Islam',
  'Home Life',
  'Anniversary',
  'Streams',
  'Heroes  Patriotism',
  'Engagement',
  'Popular Culture',
  'Architecture  Design',
  'Other Religions',
  'Independence Day',
  'Christmas',
  'Indoor Activities',
  'Hanukk

In [5]:
predefined_themes = [
    "Nature", "Love", "Sadness", "Hope", "War", "Friendship", "Philosophy", "Death",
    "Happiness", "Dreams", "Seasons", "Family", "Loneliness", "Spirituality", "Fantasy",
    "Courage", "Adventure", "Wisdom", "Loss", "Mystery", "Conflict", "Beauty", "Grief",
    "Rebirth", "Regret", "Peace", "Self-Discovery", "Strength", "Trust", "Justice", "Innocence",
    "Youth", "Anger", "Revenge", "Eternity", "Love Lost", "Change", "Identity", "Morality",
    "Forgiveness", "Despair", "Hopefulness", "Light", "Darkness", "Enlightenment", "Ambition",
    "Freedom", "Romance", "Self-Reflection", "Journey", "Tradition", "Existentialism", "Connection",
    "Inspiration", "Imagination", "Wisdom"
]

all_themes = list(set(predefined_themes).union(unique_themes_from_data))

all_themes = sorted(set(all_themes))

len(all_themes), all_themes[:10]  # Show first 10 themes for reference


(179,
 ['Activities',
  'Adventure',
  'Ambition',
  'Anger',
  'Animals',
  'Anniversary',
  'Architecture  Design',
  'Arts  Sciences',
  'Beauty',
  'Birth'])

In [7]:
import torch
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import AutoTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

mlb = MultiLabelBinarizer(classes=all_themes)

# Transform theme labels into binary vectors
theme_matrix = mlb.fit_transform(df_cleaned["Theme"])

theme_df = pd.DataFrame(theme_matrix, columns=mlb.classes_)

df_combined = pd.concat([df_cleaned[["Poem"]], theme_df], axis=1)

# Split dataset into known and unknown themes
df_known = df_combined[df_cleaned["Theme"].apply(len) > 0]  # Poems with themes
df_unknown = df_combined[df_cleaned["Theme"].apply(len) == 0]  # Poems without themes

# Define dataset class for multi-label classification
class MultiLabelPoemDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.max_length = max_length
        self.poems = self.data['Poem'].values
        self.labels = self.data.drop(columns=['Poem']).values  # All columns except 'Poem' are labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        poem = self.poems[idx]
        label = self.labels[idx]

        # Tokenize the poem text
        encoding = self.tokenizer(
            poem,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors="pt"
        )

        # Convert to tensor format
        encoding = {key: val.squeeze(0) for key, val in encoding.items()}
        encoding['labels'] = torch.tensor(label, dtype=torch.float32)  # Multi-label binary vector
        return encoding

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Prepare training dataset
dataset = MultiLabelPoemDataset(df_known, tokenizer)

# Split dataset for training and validation
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

# Define DataLoader for training
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# Load model for multi-label classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(all_themes))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

len(train_dataset), len(val_dataset), device


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


(10319, 2580, device(type='cuda'))

In [8]:

from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=6,  # Increased to improve theme learning
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=500,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    fp16=True
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

# Train the model
trainer.train()

# Save the trained model
model.save_pretrained('./multi_label_theme_classifier')
tokenizer.save_pretrained('./multi_label_theme_classifier')

print("Training Complete! Model Saved.")




  trainer = Trainer(
wandb: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
wandb: Currently logged in as: comradev73 (comradev73-vit) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin


Epoch,Training Loss,Validation Loss
1,0.0996,0.10084
2,0.0991,0.098118
3,0.095,0.09637
4,0.0939,0.092778
5,0.0851,0.089836


Training Complete! Model Saved.


In [10]:
import torch
import numpy as np
import pandas as pd
from transformers import BertForSequenceClassification, BertTokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("Loading model...")
theme_classifier = BertForSequenceClassification.from_pretrained("./multi_label_theme_classifier", num_labels=len(all_themes)).to(device)
theme_tokenizer = BertTokenizer.from_pretrained("./multi_label_theme_classifier")

# Dynamic threshold tuning function using Otsu’s method
def find_best_threshold(scores):
    scores = np.array(scores).flatten()
    best_thresh = 0.5  # Default
    if len(scores) > 1:
        hist, bin_edges = np.histogram(scores, bins=50)
        best_thresh = bin_edges[np.argmax(hist)]  # Most frequent score as threshold
    return min(max(best_thresh, 0.3), 0.6)  # Clamp between 0.3 - 0.6

# Function to classify poem themes with optimized thresholding
def classify_poem_themes(poem_text, max_length=256, top_k=3):
    
    inputs = theme_tokenizer(poem_text, return_tensors="pt", truncation=True, padding="max_length", max_length=max_length).to(device)

    with torch.no_grad():
        outputs = theme_classifier(**inputs).logits

    predicted_scores = torch.sigmoid(outputs).cpu().numpy()[0]

    threshold = find_best_threshold(predicted_scores)

    predicted_themes = [all_themes[i] for i, score in enumerate(predicted_scores) if score > threshold]

    if not predicted_themes:
        top_k_indices = np.argsort(predicted_scores)[-top_k:]
        predicted_themes = [all_themes[i] for i in top_k_indices]

    return predicted_themes

print("Classifying themes for poems...")
# Apply classification to all poems
df["Predicted_Themes"] = df["Poem"].apply(classify_poem_themes)

# Save dataset with predicted themes
df[["Poem", "Predicted_Themes"]].to_csv("final_labeled_poems.csv", index=False)

print("Theme classification completed successfully!")


Loading model...
Classifying themes for poems...
Theme classification completed successfully!


In [25]:
import json
import pandas as pd
import numpy as np
import torch
from transformers import BertForSequenceClassification, BertTokenizer
import os

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

poems_df = pd.read_csv("E:\\Datasets\\gutenberg_poems_cleaned.csv")

default_threshold = 0.45
per_class_thresholds = {theme: default_threshold for theme in all_themes}

print("Loading theme classifier...")
theme_classifier = BertForSequenceClassification.from_pretrained(
    "./multi_label_theme_classifier", num_labels=len(all_themes)
).to(device)
theme_tokenizer = BertTokenizer.from_pretrained("./multi_label_theme_classifier")

def classify_poem_themes(poem_text, max_length=256, top_k=3):
    poem_text = str(poem_text).strip()

    if not poem_text:
        return []  # skip this poem altogether or log it

    inputs = theme_tokenizer(
        poem_text, return_tensors="pt", truncation=True, padding="max_length", max_length=max_length
    ).to(device)

    with torch.no_grad():
        outputs = theme_classifier(**inputs).logits

    scores = torch.sigmoid(outputs).cpu().numpy()[0]

    predicted_themes = [
        all_themes[i]
        for i, score in enumerate(scores)
        if score > per_class_thresholds.get(all_themes[i], default_threshold)
    ]

    if not predicted_themes:
        top_k_indices = np.argsort(scores)[-top_k:]
        predicted_themes = [all_themes[i] for i in top_k_indices]

    common_themes = {"Love", "Nature", "Death", "Hope"}
    rare_theme_indices = [i for i, t in enumerate(all_themes) if t not in common_themes]
    rare_scores = [(i, scores[i]) for i in rare_theme_indices if scores[i] > (default_threshold - 0.1)]

    if rare_scores:
        top_rare_index = max(rare_scores, key=lambda x: x[1])[0]
        rare_theme = all_themes[top_rare_index]
        if rare_theme not in predicted_themes:
            predicted_themes.append(rare_theme)

    return list(set(predicted_themes))


# Apply classifier to all poems
print("Classifying poems into themes...")
poems_df["Themes"] = poems_df["Poem"].astype(str).apply(classify_poem_themes)

# Save the labeled poems
output_path = "labelled_gutenberg_poems.csv"
poems_df.to_csv(output_path, index=False)
print(f"\nDone! Labeled poems saved to: {output_path}")


Loading theme classifier...
🎯 Classifying poems into themes...

✅ Done! Labeled poems saved to: labelled_gutenberg_poems.csv


In [1]:
import torch
import pandas as pd
from transformers import (
    GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, 
    DataCollatorForLanguageModeling, EarlyStoppingCallback
)
from datasets import Dataset

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

df = pd.read_csv("final_labeled_poems.csv")

df["Prompt"] = df["Predicted_Themes"].apply(lambda x: f"<|theme|>: {', '.join(eval(x))}\n<|poem|>: ")

df["Training_Text"] = df["Prompt"] + df["Poem"]

dataset = Dataset.from_pandas(df[["Training_Text"]])

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # Set pad token
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.to(device

def tokenize_function(examples):
    return tokenizer(
        examples["Training_Text"], 
        truncation=True, 
        padding="max_length", 
        max_length=512
    )

tokenized_datasets = dataset.map(tokenize_function, batched=True)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)





Map:   0%|          | 0/13854 [00:00<?, ? examples/s]

In [3]:
training_args = TrainingArguments(
    output_dir="./poetry_gpt2",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=6,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=500,
    save_total_limit=2,
    report_to="none",
    load_best_model_at_end=True,
    fp16=True
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    eval_dataset=tokenized_datasets,
    tokenizer=tokenizer,
    data_collator=data_collator
)

print("Training started...")
trainer.train()

model.save_pretrained("./poetry_gpt2_finetuned")
tokenizer.save_pretrained("./poetry_gpt2_finetuned")

print("Training completed successfully!")


  trainer = Trainer(


Training started...


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,3.3622,3.036869
2,3.0901,2.955768
3,3.0063,2.910091
4,2.9865,2.879425
5,2.9359,2.860819
6,2.9242,2.853698


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Training completed successfully!


In [44]:
import pandas as pd
import ast

df = pd.read_csv("E:\\Datasets\\labeled_poems.csv")

def safe_parse_themes(x):
    try:
        themes = ast.literal_eval(x) if isinstance(x, str) else x
        if isinstance(themes, list):
            return "|".join(theme.strip() for theme in themes)
        else:
            return str(themes)
    except Exception as e:
        print(f"Error parsing: {x} — {e}")
        return "Unknown"

df['Themes'] = df['Themes'].apply(safe_parse_themes)

df['Poem'] = df['Poem'].apply(lambda p: p.replace('\r\n', '\n').replace('\r', '\n') if isinstance(p, str) else p)

df['text'] = df.apply(lambda row: f"Themes: {row['Themes']}\nPoem:\n{row['Poem']}", axis=1)

df[['text']].to_csv("poetry_text_dataset.csv", index=False)


In [45]:
df = pd.read_csv("poetry_text_dataset.csv")
df.head()


Unnamed: 0,text
0,"Themes: Living|Nature\nPoem:\n\n\nDog bone, st..."
1,Themes: Living|Nature\nPoem:\n\n\nThe old cupo...
2,Themes: Living|Nature\nPoem:\n\n\nLook for me ...
3,Themes: Living|Nature\nPoem:\n\n\nBehind the s...
4,Themes: Living|Relationships\nPoem:\n\n\nWhen ...


In [2]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling
from datasets import load_dataset, Dataset
import pandas as pd
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load your fine-tuned model checkpoint
model_path = "./poetry_gpt2_finetuned"
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)
model.to(device)

df = pd.read_csv("poetry_text_dataset.csv")

dataset = Dataset.from_pandas(df[["text"]])
dataset = dataset.train_test_split(test_size=0.1)

def tokenize_function(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=512)

tokenized_dataset = dataset.map(tokenize_function, batched=True)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir="./poetry_gpt2_finetuned_continued",
    overwrite_output_dir=True,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=4,
    learning_rate=1e-5,
    evaluation_strategy="epoch",       
    save_strategy="epoch",             
    logging_dir="./logs",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    fp16=True
)


# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()


Map:   0%|          | 0/12446 [00:00<?, ? examples/s]

Map:   0%|          | 0/1383 [00:00<?, ? examples/s]

  trainer = Trainer(
wandb: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
wandb: Currently logged in as: comradev73 (comradev73-vit) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,3.2862,3.13033
2,3.2071,3.129518
3,3.203,3.128995
4,3.1953,3.128803


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=3112, training_loss=3.2204663686090385, metrics={'train_runtime': 4553.3206, 'train_samples_per_second': 10.934, 'train_steps_per_second': 0.683, 'total_flos': 1.3008162521088e+16, 'train_loss': 3.2204663686090385, 'epoch': 4.0})

In [4]:
model.save_pretrained("./poetry_gpt2_finetuned_continued")
tokenizer.save_pretrained("./poetry_gpt2_finetuned_continued")


('./poetry_gpt2_finetuned_continued\\tokenizer_config.json',
 './poetry_gpt2_finetuned_continued\\special_tokens_map.json',
 './poetry_gpt2_finetuned_continued\\vocab.json',
 './poetry_gpt2_finetuned_continued\\merges.txt',
 './poetry_gpt2_finetuned_continued\\added_tokens.json')

In [5]:
import torch
import re
import textwrap
import spacy

# Load spaCy English model
nlp = spacy.load("en_core_web_sm")

def format_poem_poetically(poem, width=60):
    poem = re.sub(r'\s+', ' ', poem.strip())

    doc = nlp(poem)
    poetic_lines = []

    for sent in doc.sents:
        words = sent.text.strip().split()
        line = []
        current_length = 0
        i = 0

        while i < len(words):
            word = words[i]
            line.append(word)
            current_length += len(word) + 1  # +1 for space

            if current_length >= width:
                split_found = False
                for j in range(i + 1, min(i + 6, len(words))):
                    if re.search(r'[.,!?;:—]$', words[j]) or (words[j].istitle() and words[j] != "I"):
                        poetic_lines.append(" ".join(line + words[i+1:j+1]))
                        i = j
                        line = []
                        current_length = 0
                        split_found = True
                        break

                if not split_found:
                    poetic_lines.append(" ".join(line))
                    line = []
                    current_length = 0

            i += 1

        if line:
            poetic_lines.append(" ".join(line))

    return "\n".join(poetic_lines).strip()

def generate_poem(max_length=200):
    model.eval()

    theme = input("Enter a theme for your poem (e.g., Love, Hope, Solitude): ")

    prompt = f"Write a heartfelt and meaningful poem on the theme: {theme}\nPoem:\n"

    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

    with torch.no_grad():
        output = model.generate(
            input_ids=input_ids,
            max_length=max_length,
            do_sample=True,
            temperature=0.8,
            top_p=0.95,
            top_k=40,
            repetition_penalty=1.3,
            num_return_sequences=1,
            eos_token_id=tokenizer.eos_token_id,
            no_repeat_ngram_size=3
        )

    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    poem = generated_text.replace(prompt, "").strip()

    poem = re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', poem)  # Insert space between lowercase and uppercase
    poem = re.sub(r'([.,!?;])(?=\w)', r'\1 ', poem)   

    formatted_poem = format_poem_poetically(poem, width=60)

    print("\nHere's your poem:\n")
    print(formatted_poem)

# Run
generate_poem()


Enter a theme for your poem (e.g., Love, Hope, Solitude):  Solitude,Sad,Depression


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



Here's your poem:

Someday I shall be free to go back and forth between those two.
In this way we may never look into each other’s eyes again; The
world is not all one.
There are many things that must wait before me in order For
happiness or sorrow I shall make my own—the heart has power
over it
And yet it does nothing but beat so hard At night till morning When
its rhythm makes no sound But shifts its course and drifts like water.
When pleasure grows strong enough then will joy grow weak?
No!
I have made peace with pain And found myself alone among some of them Who
feel themselves as happy they were when you left home.
This melancholy is what people say about their happiest moments, They
say more than any sadness can ever bring Back.
But for every unhappy thing there should also Be relief at ease In
sleep where everything feels new and fresh Like


In [10]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
import math

# Load fine-tuned model and tokenizer
model_path = "./poetry_gpt2_finetuned_continued"  
model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_path)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

poem_text = """The battle was always won or lost, and

It always took a long time to get there.
My father, who fought and lost,

Struck his head back against the wall,
I heard a shout as though

He could hear me, but he could not.
It took the day long, and I thought

Of how my mother cried out,
"It's our day"""

inputs = tokenizer(poem_text, return_tensors="pt")
input_ids = inputs["input_ids"].to(device)

with torch.no_grad():
    outputs = model(input_ids, labels=input_ids)
    loss = outputs.loss
    perplexity = torch.exp(loss)

print(f"Loss: {loss.item():.4f}")
print(f"Perplexity: {perplexity.item():.4f}")


Loss: 2.9772
Perplexity: 19.6320


In [7]:
import torch
import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from tqdm import tqdm
import math

model_path = "./poetry_gpt2_finetuned_continued"  # Update if your path differs
model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model.eval()

if torch.cuda.is_available():
    model.to("cuda")

df = pd.read_csv("poetry_text_dataset.csv")  # Must contain a column named "text"
texts = df['text'].dropna().tolist()

def calculate_perplexity(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    input_ids = inputs["input_ids"]
    if torch.cuda.is_available():
        input_ids = input_ids.to("cuda")

    with torch.no_grad():
        outputs = model(input_ids, labels=input_ids)
        loss = outputs.loss
    return math.exp(loss.item()) if loss.item() < 100 else float("inf")  # Avoid overflow

perplexities = []
for text in tqdm(texts, desc="Calculating perplexities"):
    try:
        ppl = calculate_perplexity(text)
        perplexities.append(ppl)
    except Exception as e:
        print(f"Error processing: {e}")
        perplexities.append(float("inf"))

# === Average Perplexity ===
valid_ppls = [p for p in perplexities if p < float("inf")]
average_ppl = sum(valid_ppls) / len(valid_ppls)
print(f"\nAverage Perplexity on dataset: {average_ppl:.4f}")


Calculating perplexities: 100%|██████████████████████████████████████████████████| 13829/13829 [11:35<00:00, 19.90it/s]



✅ Average Perplexity on dataset: 28.0797


In [9]:
import gzip
import json
import pandas as pd
import re
from collections import defaultdict

input_file_path = "C:\\Users\\comra\\Downloads\\gutenberg-poetry-v001.ndjson.gz"
output_csv_path = 'E:\\Datasets\\gutenberg_poems_cleaned.csv'

def clean_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation/special characters
    text = re.sub(r'\s+', ' ', text).strip()  # Normalize whitespace
    return text

poems = defaultdict(list)

with gzip.open(input_file_path, 'rt', encoding='utf-8') as f:
    for line in f:
        entry = json.loads(line)
        gid = entry['gid']
        line_text = entry['s']
        cleaned_line = clean_text(line_text)
        if cleaned_line:  # Skip empty lines
            poems[gid].append(cleaned_line)

poem_texts = [' '.join(lines) for lines in poems.values()]

df = pd.DataFrame({'Poem': poem_texts})
df.to_csv(output_csv_path, index=False)

print(f"Saved {len(df)} poems to {output_csv_path}")


Saved 1191 poems to E:\Datasets\gutenberg_poems_cleaned.csv


In [36]:
import os
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from torch.nn.functional import sigmoid
import csv

def load_poems_from_folder(folder_path):
    poems = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):
            with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
                content = file.read().strip()
                if content:  # Skip empty files
                    poems.append({'Poem': content})
    return pd.DataFrame(poems)

def predict_themes(df, model, tokenizer, label_list, max_length=512, threshold=0.5):
    predicted_labels = []

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    for text in df['Poem']:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=max_length)
        inputs = {key: val.to(device) for key, val in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)
            probs = sigmoid(outputs.logits)[0].cpu().numpy()

        labels = [label_list[i] for i, p in enumerate(probs) if p > threshold]

        if not labels:
            labels = [label_list[probs.argmax()]]

        predicted_labels.append(", ".join(labels))

    df['Themes'] = predicted_labels
    return df[['Poem', 'Themes']]

def label_poems_in_folder(folder_path, output_csv_path, model_path, label_list):
    tokenizer = BertTokenizer.from_pretrained(model_path)
    model = BertForSequenceClassification.from_pretrained(model_path, num_labels=len(label_list))

    df = load_poems_from_folder(folder_path)

    df = predict_themes(df, model, tokenizer, label_list)

    df.to_csv(output_csv_path, index=False, quoting=csv.QUOTE_ALL)
    print(f"Saved {len(df)} labelled poems to {output_csv_path}")

folder_path = "E:\\Datasets\\abc"  # Folder with .txt files (each a poem)
output_csv = "labelled_abc_poems.csv"
model_path = "./multi_label_theme_classifier"  
label_list = all_themes 

label_poems_in_folder(folder_path, output_csv, model_path, label_list)


Saved 99 labelled poems to labelled_abc_poems.csv


In [37]:
df = pd.read_csv("labelled_abc_poems.csv")
df.head()

Unnamed: 0,Poem,Themes
0,2 ABC of H.k. and China revised vision.\nBarre...,Sorrow Grieving
1,"Apparently life without love, is no life at al...",Love Lost
2,A abc angles on angels flaws (poem)\nMix with ...,Sorrow Grieving
3,A abc Brazil dance (poem)\nJack of crack in po...,Sorrow Grieving
4,ABC... I can't go on\n123... what's the next o...,Men Women


In [42]:
import os
import torch
import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset

# Load tokenizer and model
model_dir = "./poetry_gpt2_finetuned_continued"

tokenizer = GPT2Tokenizer.from_pretrained(model_dir, local_files_only=True)
model = GPT2LMHeadModel.from_pretrained(model_dir, local_files_only=True)

model.to(device)

# Load the labeled dataset
df = pd.read_csv("labelled_abc_poems.csv")


from sklearn.model_selection import train_test_split

train_df, eval_df = train_test_split(df, test_size=0.1, random_state=42)

train_dataset = Dataset.from_pandas(train_df[["text"]])
eval_dataset = Dataset.from_pandas(eval_df[["text"]])

tokenized_train = train_dataset.map(tokenize, batched=True)
tokenized_eval = eval_dataset.map(tokenize, batched=True)

def tokenize(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=512)

tokenized_dataset = dataset.map(tokenize, batched=True)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir="./poetry_gpt2_refined",
    overwrite_output_dir=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=4,
    fp16=True,
    save_total_limit=2,
    report_to="none",
    learning_rate=5e-5,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

trainer.save_model("./poetry_gpt2_refined")
tokenizer.save_pretrained("./poetry_gpt2_refined")


Map:   0%|          | 0/89 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Map:   0%|          | 0/99 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,4.3171,3.980987
2,3.983,3.793307
3,3.8928,3.709542


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


('./poetry_gpt2_refined\\tokenizer_config.json',
 './poetry_gpt2_refined\\special_tokens_map.json',
 './poetry_gpt2_refined\\vocab.json',
 './poetry_gpt2_refined\\merges.txt',
 './poetry_gpt2_refined\\added_tokens.json')

In [1]:
from datasets import load_dataset, Dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling, EarlyStoppingCallback
from sklearn.model_selection import train_test_split
import pandas as pd
import torch

df = pd.read_csv("labelled_abc_poems.csv")
def prepare_input(row):
    themes = ", ".join(eval(row["Themes"])) if isinstance(row["Themes"], str) and row["Themes"].startswith("[") else row["Themes"]
    return f"<|theme|> {themes} <|poem|> {row['Poem']}"

df["text"] = df.apply(prepare_input, axis=1)

df["text"] = df["text"].astype(str).str.strip()
dataset = Dataset.from_pandas(df[["text"]])

model_path = "./poetry_gpt2_finetuned_continued"
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

train_test = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = train_test['train']
eval_dataset = train_test['test']

train_dataset = train_dataset.map(tokenize_function, batched=True)
eval_dataset = eval_dataset.map(tokenize_function, batched=True)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir="./poetry_gpt2_v2",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=5,
    save_total_limit=2,
    learning_rate=1e-5,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    fp16=torch.cuda.is_available(),  # Enable mixed precision if GPU is available
    seed=42
)

#Trainer with EarlyStoppingCallback
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

trainer.train()

trainer.save_model("./poetry_gpt2_v2")
tokenizer.save_pretrained("./poetry_gpt2_v2")





Map:   0%|          | 0/89 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

  trainer = Trainer(
wandb: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
wandb: Currently logged in as: comradev73 (comradev73-vit) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,4.3346,3.926764
2,3.6198,3.785994
3,3.7384,3.742439


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


('./poetry_gpt2_v2\\tokenizer_config.json',
 './poetry_gpt2_v2\\special_tokens_map.json',
 './poetry_gpt2_v2\\vocab.json',
 './poetry_gpt2_v2\\merges.txt',
 './poetry_gpt2_v2\\added_tokens.json')

In [7]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline, AutoTokenizer, AutoModelForSequenceClassification
from transformers import TextGenerationPipeline
import torch
import math
import evaluate

model_path = "./poetry_gpt2_refined"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = GPT2LMHeadModel.from_pretrained(model_path).to(device)
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
tokenizer.pad_token = tokenizer.eos_token

def calculate_perplexity(model, tokenizer, texts):
    model.eval()
    perplexities = []
    with torch.no_grad():
        for text in texts:
            inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
            outputs = model(**inputs, labels=inputs["input_ids"])
            loss = outputs.loss
            perplexity = torch.exp(loss).item()
            perplexities.append(perplexity)
    return perplexities

bertscore = evaluate.load("bertscore")

generator = TextGenerationPipeline(model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)
prompts = ["Write a heartfelt and meaningful poem on the Love, Romance",
           "Write a meaningful poem on the War, Hate",
           "Write a accurate poem on the Living, Hope",
           "Write a creative and meaningful poem on the Family",
           "Write a perfect poem on the Science",
           "Write a heartfelt and meaningful poem on the Love, Romance",
           "Write a meaningful poem on the War, Hate",
           "Write a accurate poem on the Living, Hope",
           "Write a creative and meaningful poem on the Family",
           "Write a perfect poem on the Science"
            ]
generated_poems = [generator(prompt, max_length=100, num_return_sequences=1, do_sample=True, top_p=0.95, no_repeat_ngram_size=3)[0]['generated_text'] for prompt in prompts]

print("\nGenerated Poems:\n")
for i, poem in enumerate(generated_poems):
    print(f"Poem {i+1}:\n{poem}\n{'-'*40}")

#  Calculate Perplexity
perplexities = calculate_perplexity(model, tokenizer, generated_poems)
print("\nPerplexities:")
for i, p in enumerate(perplexities):
    print(f"Poem {i+1}: {p:.2f}")
avg_perplexity = sum(perplexities) / len(perplexities)
print(f"Average Perplexity: {avg_perplexity:.2f}")

references = ["I gaze upon the night sky filled with stars", 
              "The silence of gardens brings peace",
              "A voice in the dark calls my name",
              "Love burns without control",
              "Hope is all I have left",
              "I gaze upon the night sky filled with stars", 
              "The silence of gardens brings peace",
              "A voice in the dark calls my name",
              "Love burns without control",
              "Hope is all I have left"
             ]

bertscore_result = bertscore.compute(predictions=generated_poems, references=references, lang="en")
print("\nBERTScore (F1):")
for i, score in enumerate(bertscore_result["f1"]):
    print(f"Poem {i+1}: {score:.4f}")
avg_bert = sum(bertscore_result["f1"]) / len(bertscore_result["f1"])
print(f"Average BERTScore (F1): {avg_bert:.4f}")

# Thematic Coherence (using theme classifier)
theme_classifier_path = "./multi_label_theme_classifier"
theme_model = AutoModelForSequenceClassification.from_pretrained(theme_classifier_path).to(device)
theme_tokenizer = AutoTokenizer.from_pretrained(theme_classifier_path)

def predict_themes(texts):
    theme_model.eval()
    theme_labels = []
    with torch.no_grad():
        for text in texts:
            inputs = theme_tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
            logits = theme_model(**inputs).logits
            probs = torch.sigmoid(logits)
            labels = (probs > 0.5).int().tolist()[0]
            theme_labels.append(labels)
    return theme_labels

theme_predictions = predict_themes(generated_poems)
print("\nThematic Coherence (Binary Labels):")
for i, labels in enumerate(theme_predictions):
    print(f"Poem {i+1} Themes: {labels}")


Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.



Generated Poems:

Poem 1:
Write a heartfelt and meaningful poem on the Love, Romance, Relationships
All-in-One
   Love, Family & Family
How a young man lives
With love to his wife
And his child
A little girl
A love we share,
A life's worth
Love to each of us
Glorious and kind
Beneath a love we cherish
Beyond any earthly limits
Came to him,
He asked her to let him
Hold his
----------------------------------------
Poem 2:
Write a meaningful poem on the War, Hate, Life
and Love that you are, and that you deserve,
In the face of danger and danger,
The poet shall be called a hero.
I hope that your story will serve you well.
Love will make you a hero again
When it is time, when we need you most.
You will never again be called an ordinary poet.
Because when you are on the run,
Sometimes you will die. You will
----------------------------------------
Poem 3:
Write a accurate poem on the Living, Hope, Love, Relationships
Like a poet who’d read this poem,
Love, but only if one can read
Love’s p

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



BERTScore (F1):
Poem 1: 0.8009
Poem 2: 0.8098
Poem 3: 0.8148
Poem 4: 0.8108
Poem 5: 0.8063
Poem 6: 0.8170
Poem 7: 0.8103
Poem 8: 0.8211
Poem 9: 0.8035
Poem 10: 0.8091
Average BERTScore (F1): 0.8104

Thematic Coherence (Binary Labels):
Poem 1 Themes: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Poem 2 Themes: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [1]:
import gradio as gr
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
import re
import spacy

nlp = spacy.load("en_core_web_sm")

model_path = "./poetry_gpt2_v2"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = GPT2LMHeadModel.from_pretrained(model_path).to(device)
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
tokenizer.pad_token = tokenizer.eos_token  # for safety

def format_poem(poem, width=60):

    poem = re.sub(r'[*_~`<>\\|#^]', '', poem)
    poem = re.sub(r'\s+', ' ', poem.strip())
    poem = re.sub(r'\.{2,}', '.', poem)

    doc = nlp(poem)
    poetic_lines = []

    for sent in doc.sents:
        words = sent.text.strip().split()
        line = []
        current_length = 0
        i = 0

        while i < len(words):
            word = words[i]
            line.append(word)
            current_length += len(word) + 1

            if current_length >= width:
                split_point = len(line)
                for k in reversed(range(len(line))):
                    if re.search(r'[.,!?;:—]$', line[k]) or (line[k].istitle() and line[k] != "I"):
                        split_point = k + 1
                        break

                new_line = " ".join(line[:split_point])
                if len(new_line.strip()) > 3:  
                    poetic_lines.append(new_line)

                i = i - len(line) + split_point
                line = []
                current_length = 0
            i += 1

        if line:
            new_line = " ".join(line)
            if len(new_line.strip()) > 3:
                poetic_lines.append(new_line)

    if poetic_lines and len(poetic_lines[-1]) < width // 2:
        poetic_lines.pop()

    return "\n".join(poetic_lines).strip()



# Main generation function
def generate_poem(theme, max_length=150):
    model.eval()
    prompt = f"Write a poetic piece that uses rich imagery, emotional depth, and lyrical language on the theme: {theme}\nPoem:\n"

    input_ids = tokenizer(prompt, return_tensors="pt", padding=True).input_ids.to(device)
    attention_mask = input_ids.ne(tokenizer.pad_token_id).long()

    with torch.no_grad():
        output = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            do_sample=True,
            temperature=0.85,
            top_p=0.92,
            top_k=40,
            max_new_tokens=150,
            repetition_penalty=1.2,
            num_return_sequences=3,
            no_repeat_ngram_size=3,
            pad_token_id=tokenizer.eos_token_id
        )

    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

    if not generated_text or prompt not in generated_text:
        return "Something went wrong while generating the poem. Please try again with a different theme."

    poem = generated_text.replace(prompt, "").strip()
    poem = re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', poem)
    poem = re.sub(r'([.,!?;])(?=\w)', r'\1 ', poem)

    return format_poem(poem)

# Gradio UI
demo = gr.Interface(
    fn=generate_poem,
    inputs=gr.Textbox(label="Enter Theme", placeholder="Type your own theme here..."),
    outputs=gr.Textbox(label="Generated Poem", lines=10),
    title="Poetry Generator 🎭",
    description="Enter a theme and generate a poem "
)

demo.launch()


* Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


