## Read the Data

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer, create_optimizer
from transformers import DataCollatorForLanguageModeling
from datasets import Dataset

my_data = pd.read_csv("/kaggle/input/modern-renaissance-poetry/all.csv")
my_data.head()

Unnamed: 0,author,content,poem name,age,type
0,WILLIAM SHAKESPEARE,Let the bird of loudest lay\r\nOn the sole Ara...,The Phoenix and the Turtle,Renaissance,Mythology & Folklore
1,DUCHESS OF NEWCASTLE MARGARET CAVENDISH,"Sir Charles into my chamber coming in,\r\nWhen...",An Epilogue to the Above,Renaissance,Mythology & Folklore
2,THOMAS BASTARD,"Our vice runs beyond all that old men saw,\r\n...","Book 7, Epigram 42",Renaissance,Mythology & Folklore
3,EDMUND SPENSER,"Lo I the man, whose Muse whilome did maske,\r\...","from The Faerie Queene: Book I, Canto I",Renaissance,Mythology & Folklore
4,RICHARD BARNFIELD,"Long have I longd to see my love againe,\r\nSt...",Sonnet 16,Renaissance,Mythology & Folklore


In [2]:
my_data.shape

(573, 5)

In [3]:
my_data['type'].unique()

array(['Mythology & Folklore', 'Nature', 'Love'], dtype=object)

In [4]:
# extract love poems as a list
poems = my_data.loc[my_data['type'] == 'Love']['content']
poems = list(poems)

poems[0], len(poems)

('Why didst thou promise such a beauteous day,\r\nAnd make me travel forth without my cloak,\r\nTo let base clouds oertake me in my way,\r\nHiding thy bravery in their rotten smoke?\r\nTis not enough that through the cloud thou break,\r\nTo dry the rain on my storm-beaten face,\r\nFor no man well of such a salve can speak\r\nThat heals the wound and cures not the disgrace:\r\nNor can thy shame give physic to my grief;\r\nThough thou repent, yet I have still the loss:\r\nThe offenders sorrow lends but weak relief\r\nTo him that bears the strong offences cross.\r\n   Ah! but those tears are pearl which thy love sheds,\r\n   And they are rich and ransom all ill deeds.\r\n \r\n \r\n ',
 326)

## Using Pre-trained LM

In [5]:
# Load pre-trained model and tokenizer
model_name = "gpt2"
model = TFGPT2LMHeadModel.from_pretrained(model_name)

# Set up tokenizer with padding
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Update model config
model.config.pad_token_id = tokenizer.pad_token_id

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



In [6]:
tokenizer.pad_token_id

50256

# Preprocessing the Data

Prepare your dataset so that the model can use it for learning.

In [7]:
# clean and preprocess the text
def preprocess_text(text):
    # remove extra whitespaces and convert to lowercase
    text = ' '.join(text.split()).lower()
    return text

cleaned_poems = [preprocess_text(poem) for poem in poems]
cleaned_poems[11]

'joy of my life, full oft for loving you i bless my lot, that was so lucky placed: but then the more your own mishap i rue, that are so much by so mean love embased. for had the equal heavens so much you graced in this as in the rest, ye might invent some heavenly wit, whose verse could have enchased your glorious name in golden monument. but since ye deignd so goodly to relent to me your thrall, in whom is little worth, that little that i am shall all be spent in setting your immortal praises forth; whose lofty argument uplifting me shall lift you up unto an high degree.'

In [8]:
# Prepare the dataset
def load_dataset(poems_list, tokenizer):
    dataset = Dataset.from_dict({"text": poems_list})
#     dataset = dataset.map(lambda examples: tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128), batched=True)
    
    def tokenize_and_prepare(examples):
        tokenized = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128, return_tensors="tf")
        input_ids = tf.convert_to_tensor(tokenized["input_ids"])
        labels = tf.identity(input_ids)  # Create a copy of input_ids
        
        tokenized["labels"] = labels
        return tokenized
    
    dataset = dataset.map(tokenize_and_prepare, batched=True, remove_columns=dataset.column_names)
    return dataset

# use pad function to prepare dataset
train_dataset = load_dataset(cleaned_poems, tokenizer)
train_dataset

Map:   0%|          | 0/326 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 326
})

In [9]:
# Data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False,
)

In [10]:
# Create tensorflow dataset
tf_train_dataset = model.prepare_tf_dataset(
    train_dataset,
    shuffle=True,
    batch_size=4,
#     collate_fn=data_collator,
)

## Model Prepare

In [11]:
# Prepare optimizer and loss function
# optimizer = create_optimizer(
#     init_lr=5e-5,
#     num_train_steps=len(tf_train_dataset) * 5,
#     num_warmup_steps=0,
#     adam_epsilon=1e-8
# )
# optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)

# Implement gradient clipping
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5, clipnorm=1.0)

loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# def masked_spare_categorical_crossentropy(y_true, y_pred):
#     mask = tf.math.not_equal(y_true, 0) # Assuming 0 is the padding token ID
#     losses = tf.keras.losses.sparse_categorical_crossentropy(y_true, y_pred, from_logits=True)
#     masked_losses = tf.boolean_mask(losses, mask)
    
#     return tf.reduce_mean(masked_losses)

In [12]:
# compile the model
# model.compile(optimizer=optimizer, loss=loss)
model.compile(optimizer=optimizer, loss=[loss, *[None] * model.config.n_layer])

# Training

In [13]:
# Fine-tune the model
# model.fit(tf_train_dataset, epochs=20)
history = model.fit(tf_train_dataset, epochs=10, verbose=1)

Epoch 1/10
Cause: for/else statement not yet supported


I0000 00:00:1725219201.489298      66 service.cc:145] XLA service 0x7fb06a8ab060 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1725219201.489351      66 service.cc:153]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0
I0000 00:00:1725219201.657419      66 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [14]:
# Save the fine-tuned model
model.save_pretrained("./love_poem_model_tf")

# Generate a love poem

In [15]:
# Generate a love poem
def generate_poem(prompt, model, tokenizer, max_length=100):
    input_ids = tokenizer.encode(prompt, return_tensors="tf")
    output = model.generate(input_ids, max_length=max_length, num_return_sequences=1, no_repeat_ngram_size=2)
    return tokenizer.decode(output[0], skip_special_tokens=True)

In [16]:
# load the fine-tuned model
model = TFGPT2LMHeadModel.from_pretrained("./love_poem_model_tf")

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at ./love_poem_model_tf.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [17]:
# Generate a poem
prompt = "love is"
poem = generate_poem(prompt, model, tokenizer)
print(poem)



love is isisisisesisesizesizesisesiseiseisesizeiseizeizeizesizeizedizedizeizationizationizedizationizeizingizingizeizerizerizersizersizeriseriserisersisersizersisersersersererierierererrerrerrrrererrerrerrersrersrerrrsrsrrdrdrrararasrasrarrasrranranrronronronsronsronranronrryryyyrylylyylyryysysy
