In [38]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset, Dataset
from sklearn.metrics.pairwise import cosine_similarity

In [39]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

In [40]:
# Ensure that tokenizer has padding token set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [41]:
# Load and prepare dataset
dataset = load_dataset("amishshah/song_lyrics")
train_test_dataset = dataset["train"].train_test_split(test_size=0.1)
train_dataset = train_test_dataset["train"].select(range(100))      #take subset
val_dataset = train_test_dataset["test"].select(range(100))

In [42]:
# Tokenize the data
def tokenize_function(examples):
    return tokenizer(examples['lyrics'], truncation=True, padding=True, max_length=512)

train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)


Map: 100%|██████████| 100/100 [00:00<00:00, 110.04 examples/s]
Map: 100%|██████████| 100/100 [00:01<00:00, 97.30 examples/s] 


In [43]:
# Set training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# Initialize Trainer
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

In [44]:
# Train the model
trainer.train()
trainer.evaluate()

                                               
 48%|████▊     | 36/75 [39:39:41<12:45, 19.63s/it]

{'loss': 3.3981, 'grad_norm': 17.714309692382812, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.4}


                                                  
 48%|████▊     | 36/75 [39:44:28<12:45, 19.63s/it]

{'loss': 3.4702, 'grad_norm': 6.6142258644104, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.8}


                                                  
 48%|████▊     | 36/75 [39:49:05<12:45, 19.63s/it]

{'loss': 3.0259, 'grad_norm': 7.868333339691162, 'learning_rate': 3e-06, 'epoch': 1.2}


                                                  
 48%|████▊     | 36/75 [39:53:07<12:45, 19.63s/it]

{'loss': 3.2003, 'grad_norm': 6.783219814300537, 'learning_rate': 4.000000000000001e-06, 'epoch': 1.6}


                                                  
 48%|████▊     | 36/75 [39:57:08<12:45, 19.63s/it]

{'loss': 3.1818, 'grad_norm': 6.743659019470215, 'learning_rate': 5e-06, 'epoch': 2.0}


                                                  
 48%|████▊     | 36/75 [40:02:05<12:45, 19.63s/it]

{'loss': 3.2774, 'grad_norm': 6.418843746185303, 'learning_rate': 6e-06, 'epoch': 2.4}


                                                  
 48%|████▊     | 36/75 [40:06:01<12:45, 19.63s/it]

{'loss': 2.9877, 'grad_norm': 6.423933029174805, 'learning_rate': 7.000000000000001e-06, 'epoch': 2.8}


                                                  
100%|██████████| 75/75 [33:35<00:00, 26.87s/it]it]


{'train_runtime': 2015.1326, 'train_samples_per_second': 0.149, 'train_steps_per_second': 0.037, 'train_loss': 3.1982723236083985, 'epoch': 3.0}


100%|██████████| 13/13 [03:36<00:00, 16.67s/it]


{'eval_loss': 3.0463104248046875,
 'eval_runtime': 234.6157,
 'eval_samples_per_second': 0.426,
 'eval_steps_per_second': 0.055,
 'epoch': 3.0}

In [45]:
model.save_pretrained('./results')
tokenizer.save_pretrained('./results')

# Load the model and tokenizer for text generation
from transformers import pipeline

# Ensure your model and tokenizer are loaded correctly
diomedes = pipeline('text-generation', model='./results', tokenizer='./results')

# Generate text using the pipeline
results = diomedes('Hello ', max_length=100)
print(results[0]['generated_text'])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Hello !!!!!!

This is really the only one that I can remember when I bought this dress. I know what I was thinking and going through. I don't know what I looked for, but it is very different from my friends dress, and I'm very excited now. I want to be able to play at the games! I got it before I came to Seattle to try and put down money! I didn't have money for things like that, and it gave me hope all
