In [78]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset, Dataset
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
import pandas as pd

In [63]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

In [64]:
# Ensure that tokenizer has padding token set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [72]:
# Load and prepare dataset
dataset = load_dataset("csv", data_files="lyrics.csv")

In [73]:
# Convert dataset to a DataFrame
df = pd.DataFrame(dataset)
print(df.head())

                                               train
0  {'name': 'Evil Twin', 'album': 'Suck It and Se...
1  {'name': 'Only Ones Who Know', 'album': 'Favou...
2  {'name': 'Arabella', 'album': 'AM', 'lyrics': ...
3  {'name': 'Cigarette Smoke', 'album': 'Beneath ...
4  {'name': 'Reckless Serenade', 'album': 'Suck I...


In [74]:
print(df.columns)

Index(['train'], dtype='object')


In [75]:
#convert the lyrics column of the DataFrame into a list 
# Extract the lyrics data from the 'train' column
lyrics_list = [row['lyrics'] for row in df['train']]
print(lyrics_list[:3])  # Print the first few lyrics entries

['[Verse 1]\nYou\'ve never met before\nBut still she greets you like a long lost rock \'n\' roll\nShe\'s definitely one of those\nWhere you\'ll go wherever she goes\nAnd with my body on my mind\nBoth stop talking at the exact same time\nTrying to think of ways to make her mine\nBut they\'re difficult to find\n\n[Chorus]\nIt\'s not what I need\nNo, your love\'s not what I need\nSo don\'t give it to me\n\n[Verse 2]\nAnd she said\n"Oh, well, I know this will sound cold, but I really have to go\nNo, it\'s not that I\'m not free, there\'s nowhere I need to be\nIt\'s just your love\'s not what I need, so don\'t give it to me"\n\n[Chorus]\nIt\'s not what I need\nNo, your love\'s not what I need\nSo don\'t give it to me\n\n[Bridge]\nAnd she said, "Baby, how can I believe you?\nHow can I believe you when you can\'t believe your luck?\nNo point sticking to the plan when it\'s come unstuck"\n\n[Verse 3]\nIt\'s more a hunger than a thirst\nShe\'ll break your heart the second time\nBefore you know 

In [76]:
#text cleaning
import re

cleaned_lyrics_list = []
for song_lyrics in lyrics_list:
    # remove square brackets and their contents
    no_brackets = re.sub(r'\[.*?\]', '', song_lyrics)
    # replace slashes with spaces
    no_slashes = re.sub(r'/', ' ', no_brackets)
    # split the cleaned lyrics into separate lines
    lines = no_slashes.split('\n')
    # remove any remaining empty lines
    lines = [line for line in lines if line.strip() != '']
    cleaned_lyrics_list.extend(lines)

cleaned_lyrics_list[:3]

["You've never met before",
 "But still she greets you like a long lost rock 'n' roll",
 "She's definitely one of those"]

In [77]:
# Convert to lower case and save as a list
preprocessed_data = [item.lower() for item in cleaned_lyrics_list]

print(f"There are {len(preprocessed_data)} lines of lyrics\n")
print(f"The first 5 lines look like this:\n")
for i in range(5):
  print(preprocessed_data[i])

There are 4418 lines of lyrics

The first 5 lines look like this:

you've never met before
but still she greets you like a long lost rock 'n' roll
she's definitely one of those
where you'll go wherever she goes
and with my body on my mind


In [82]:
# Split the preprocessed data into training and testing sets
train_data, val_data = train_test_split(preprocessed_data, test_size=0.2, random_state=42)

# Print the size of the training and testing sets
print(f"Training set size: {len(train_data)} lines of lyrics")
print(f"Testing set size: {len(val_data)} lines of lyrics")

Training set size: 3534 lines of lyrics
Testing set size: 884 lines of lyrics


In [83]:
# Combine preprocessed data into datasets
train_dataset = Dataset.from_dict({"lyrics": train_data})
val_dataset = Dataset.from_dict({"lyrics": val_data})

In [84]:
# Tokenize the data
def tokenize_function(examples):
    return tokenizer(examples['lyrics'], truncation=True, padding=True, max_length=512)

train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)


Map: 100%|██████████| 3534/3534 [00:00<00:00, 9212.97 examples/s]
Map: 100%|██████████| 884/884 [00:00<00:00, 7902.48 examples/s]


In [85]:
# Set training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# Initialize Trainer
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

In [86]:
# Train the model
trainer.train()

  0%|          | 0/2652 [00:00<?, ?it/s]

  0%|          | 10/2652 [00:14<1:05:47,  1.49s/it]

{'loss': 5.3736, 'grad_norm': 34.79045104980469, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.01}


  1%|          | 20/2652 [00:29<1:03:00,  1.44s/it]

{'loss': 5.2126, 'grad_norm': 44.05769729614258, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.02}


  1%|          | 30/2652 [00:48<1:28:51,  2.03s/it]

{'loss': 5.3305, 'grad_norm': 42.7755012512207, 'learning_rate': 3e-06, 'epoch': 0.03}


  2%|▏         | 40/2652 [01:11<1:42:06,  2.35s/it]

{'loss': 5.2036, 'grad_norm': 50.338260650634766, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.05}


  2%|▏         | 50/2652 [01:35<1:42:25,  2.36s/it]

{'loss': 4.8836, 'grad_norm': 40.186180114746094, 'learning_rate': 5e-06, 'epoch': 0.06}


  2%|▏         | 60/2652 [01:58<1:42:06,  2.36s/it]

{'loss': 5.1663, 'grad_norm': 42.778419494628906, 'learning_rate': 6e-06, 'epoch': 0.07}


  3%|▎         | 70/2652 [02:21<1:35:54,  2.23s/it]

{'loss': 4.897, 'grad_norm': 46.73408508300781, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.08}


  3%|▎         | 80/2652 [02:44<1:40:23,  2.34s/it]

{'loss': 5.2631, 'grad_norm': 41.49978256225586, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.09}


  3%|▎         | 90/2652 [03:07<1:37:15,  2.28s/it]

{'loss': 4.6785, 'grad_norm': 40.31572341918945, 'learning_rate': 9e-06, 'epoch': 0.1}


  4%|▍         | 100/2652 [03:32<1:43:41,  2.44s/it]

{'loss': 4.8368, 'grad_norm': 50.02645492553711, 'learning_rate': 1e-05, 'epoch': 0.11}


  4%|▍         | 110/2652 [03:58<1:43:45,  2.45s/it]

{'loss': 4.739, 'grad_norm': 47.00916290283203, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.12}


  5%|▍         | 120/2652 [04:23<1:46:53,  2.53s/it]

{'loss': 4.8577, 'grad_norm': 34.23596954345703, 'learning_rate': 1.2e-05, 'epoch': 0.14}


  5%|▍         | 130/2652 [04:47<1:36:17,  2.29s/it]

{'loss': 4.4986, 'grad_norm': 42.67557907104492, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.15}


  5%|▌         | 140/2652 [05:11<1:44:13,  2.49s/it]

{'loss': 4.4808, 'grad_norm': 44.18138122558594, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.16}


  6%|▌         | 150/2652 [05:35<1:42:31,  2.46s/it]

{'loss': 4.2022, 'grad_norm': 47.694427490234375, 'learning_rate': 1.5e-05, 'epoch': 0.17}


  6%|▌         | 160/2652 [05:59<1:43:32,  2.49s/it]

{'loss': 4.2785, 'grad_norm': 41.66613006591797, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.18}


  6%|▋         | 170/2652 [06:24<1:48:16,  2.62s/it]

{'loss': 4.5616, 'grad_norm': 34.71968078613281, 'learning_rate': 1.7000000000000003e-05, 'epoch': 0.19}


  7%|▋         | 180/2652 [06:49<1:35:52,  2.33s/it]

{'loss': 4.5443, 'grad_norm': 45.436737060546875, 'learning_rate': 1.8e-05, 'epoch': 0.2}


  7%|▋         | 184/2652 [06:58<1:37:05,  2.36s/it]

KeyboardInterrupt: 

In [61]:
model.save_pretrained('./results')
tokenizer.save_pretrained('./results')

# Load the model and tokenizer for text generation
from transformers import pipeline

# Ensure your model and tokenizer are loaded correctly
diomedes = pipeline('text-generation', model='./results', tokenizer='./results')

# Generate text using the pipeline
results = diomedes('Hello ', max_length=100)
print(results[0]['generated_text'])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Hello ive seen this one before.I can see it when I am looking at the top corner. The window tint seems greenish but has faded to the last time i saw it.I took my photos and started running around the room wondering, can the camera handle looking out at you?
