In [1]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from datasets import load_dataset
from sklearn.metrics.pairwise import cosine_similarity
import torch

  from .autonotebook import tqdm as notebook_tqdm


# Load Dataset

In [5]:
dataset = load_dataset("amishshah/song_lyrics")
dataset = dataset["train"].shuffle(seed=42)
subset_size = 25000
dataset = dataset.select(range(subset_size))
train_test_dataset = dataset.train_test_split(test_size=0.1)
train_dataset = train_test_dataset["train"]
val_dataset = train_test_dataset["test"]
#train_test_dataset = dataset["train"].train_test_split(test_size=0.1)
#train_dataset = train_test_dataset["train"]
#val_dataset = train_test_dataset["test"]

# Load tokenizer and pre-trained model

In [6]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset

# Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("./results")
model = GPT2LMHeadModel.from_pretrained("./results")

# Ensure that tokenizer has padding token set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Tokenize Dataset

In [7]:
# Tokenize the data
def tokenize_function(examples):
    return tokenizer(examples['lyrics'], truncation=True, padding=True, max_length=512)

train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 22500/22500 [00:26<00:00, 863.34 examples/s]
Map: 100%|██████████| 2500/2500 [00:03<00:00, 754.81 examples/s]


# Fine-tuning

In [8]:
# Set training arguments
training_args = TrainingArguments(
    output_dir='./models',
    num_train_epochs=4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
)

# Initialize Trainer
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

In [9]:
# Check if GPU is available and if not, use CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Device: ", device)

model.to(device)
# Train the model
trainer.train()

Device:  cpu


  0%|          | 100/22500 [01:58<7:00:14,  1.13s/it]

{'loss': 2.7021, 'grad_norm': 2.0258922576904297, 'learning_rate': 1e-05, 'epoch': 0.02}


  1%|          | 200/22500 [03:51<7:00:23,  1.13s/it]

{'loss': 2.7131, 'grad_norm': 2.1059556007385254, 'learning_rate': 2e-05, 'epoch': 0.04}


  1%|▏         | 300/22500 [05:44<6:53:36,  1.12s/it]

{'loss': 2.663, 'grad_norm': 1.6924552917480469, 'learning_rate': 3e-05, 'epoch': 0.05}


  2%|▏         | 400/22500 [07:37<6:59:54,  1.14s/it]

{'loss': 2.6041, 'grad_norm': 1.5649513006210327, 'learning_rate': 4e-05, 'epoch': 0.07}


  2%|▏         | 500/22500 [09:30<6:54:35,  1.13s/it]

{'loss': 2.6989, 'grad_norm': 2.5242412090301514, 'learning_rate': 5e-05, 'epoch': 0.09}


  3%|▎         | 600/22500 [11:26<6:56:47,  1.14s/it] 

{'loss': 2.7088, 'grad_norm': 1.7427242994308472, 'learning_rate': 4.9772727272727275e-05, 'epoch': 0.11}


  3%|▎         | 700/22500 [13:19<6:56:44,  1.15s/it]

{'loss': 2.5961, 'grad_norm': 1.8031307458877563, 'learning_rate': 4.9545454545454553e-05, 'epoch': 0.12}


  4%|▎         | 800/22500 [15:13<6:53:20,  1.14s/it]

{'loss': 2.6381, 'grad_norm': 1.61652672290802, 'learning_rate': 4.931818181818182e-05, 'epoch': 0.14}


  4%|▍         | 900/22500 [17:06<6:47:32,  1.13s/it]

{'loss': 2.7141, 'grad_norm': 1.735072135925293, 'learning_rate': 4.909090909090909e-05, 'epoch': 0.16}


  4%|▍         | 1000/22500 [18:58<6:42:47,  1.12s/it]

{'loss': 2.6836, 'grad_norm': 1.3271573781967163, 'learning_rate': 4.886363636363637e-05, 'epoch': 0.18}


  5%|▍         | 1100/22500 [20:53<6:43:21,  1.13s/it] 

{'loss': 2.7017, 'grad_norm': 1.6593340635299683, 'learning_rate': 4.863636363636364e-05, 'epoch': 0.2}


  5%|▌         | 1200/22500 [22:46<6:41:32,  1.13s/it]

{'loss': 2.7599, 'grad_norm': 1.875386357307434, 'learning_rate': 4.840909090909091e-05, 'epoch': 0.21}


  6%|▌         | 1242/22500 [23:33<6:47:33,  1.15s/it]

KeyboardInterrupt: 

In [22]:
model_path = "./models"

In [23]:
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('./models/tokenizer_config.json',
 './models/special_tokens_map.json',
 './models/vocab.json',
 './models/merges.txt',
 './models/added_tokens.json')

# Lyric Generation

In [24]:
# Load the model and tokenizer for text generation
from transformers import pipeline

# Ensure your model and tokenizer are loaded correctly
text_generator = pipeline('text-generation', model=model_path, tokenizer=model_path)

# Generate text using the pipeline
prompt = "complete the lyrics about love and loss "
results = text_generator(prompt, max_length=600, truncation=True)
print(results[0]['generated_text'])


complete the lyrics about love and loss                                                                                               x3 
Love is like a thing                                                                                                                                                                                                                                                                                                           II                                                                                                                                                                                               


# Load in a fine-tuned model

In [12]:
model = GPT2LMHeadModel.from_pretrained(model_path)

# Evaluate fine-tuning using perplexity

In [28]:
text = '''Hello darkness, my old friend
I've come to talk with you again
Because a vision softly creeping'''



In [29]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

def calculate_perplexity(model, tokenizer, text):
    encode = tokenizer.encode(text, return_tensors='pt')
    with torch.no_grad():
        outputs = model(encode, labels=encode)
        loss = outputs[0]

    return torch.exp(loss).item()

# Load models and tokenizer
model_pretrained = GPT2LMHeadModel.from_pretrained(model_path)
model_base = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Calculate perplexity
perplexity_pretrained = calculate_perplexity(model_pretrained, tokenizer, text)
perplexity_base = calculate_perplexity(model_base, tokenizer, text)

print(f'Perplexity of Pretrained Model: {perplexity_pretrained}')
print(f'Perplexity of Base GPT-2 Model: {perplexity_base}')


Perplexity of Pretrained Model: 64.45396423339844
Perplexity of Base GPT-2 Model: 122.21576690673828


# Evaluate fine-tuning using rouge-score

In [None]:
masked_song_prompt = "Hello darkness, my old ****, I've come to **** with you again, Because a vision softly ****,"
unmasked_song_prompt = '''Hello darkness, my old friend
I've come to talk with you again
Because a vision softly creeping'''

In [None]:
from rouge_score import rouge_scorer

# Load models and tokenizer
model_base = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

text_generator_finetuned = pipeline('text-generation', model=model_path, tokenizer=model_path)
generated_lyrics_finetuned = text_generator(masked_song_prompt, max_length=500)[0]['generated_text']

text_generator_base = pipeline('text-generation', model=model_base, tokenizer=tokenizer)
generated_lyrics_base = text_generator(masked_song_prompt, max_length=500)[0]['generated_text']

# Initialize the ROUGE scorer, you can specify which rouge types to calculate
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Function to calculate average scores
def calculate_average_rouge(generated, references):
    scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}
    
    for gen, ref in zip(generated, references):
        score = scorer.score(ref, gen)
        for key in scores.keys():
            scores[key].append(score[key].fmeasure)
    
    average_scores = {key: sum(values) / len(values) for key, values in scores.items()}
    return average_scores

# Calculate average ROUGE scores
average_scores_base = calculate_average_rouge(generated_lyrics_base, unmasked_song_prompt)
print("Average ROUGE scores for base GPT-2:", average_scores_base)

average_scores_finetuned = calculate_average_rouge(generated_lyrics_finetuned, unmasked_song_prompt)
print("Average ROUGE scores for finetuned model:", average_scores_finetuned)


In [33]:
# Define a Python list to hold the songwriting prompts
song_prompts = [
    "Complete these lyrics: Underneath the stars that",
    "Complete these lyrics: When the rain starts falling and",
    "Complete these lyrics: Beside the flowing river where",
    "Complete these lyrics: Amid the echoes of the old town, the",
    "Complete these lyrics: As the leaves turn golden, there",
    "Complete these lyrics: Through the winding streets of my memories,",
    "Complete these lyrics: Under the shadow of the old bridge, we",
    "Complete these lyrics: When the lights go out and",
    "Complete these lyrics: Across the endless sea, the wind",
    "Complete these lyrics: Beneath the wide sky where",
    "Complete these lyrics: Along the path less traveled by,",
    "Complete these lyrics: Beyond the horizon where dreams",
    "Complete these lyrics: Through the whispering fields of green,",
    "Complete these lyrics: On the balcony at midnight, I",
    "Complete these lyrics: In the silence of the morning,",
    "Complete these lyrics: Beside the crackling fire, we",
    "Complete these lyrics: When the curtain falls and",
    "Complete these lyrics: In the deepest of forests,",
    "Complete these lyrics: Beneath the bustling city lights,",
    "Complete these lyrics: With every heartbeat, I",
    "Complete these lyrics: As the storm clouds gather,",
    "Complete these lyrics: Amidst the fading colors of the sunset,",
    "Complete these lyrics: Under the spell of the full moon,",
    "Complete these lyrics: Along the frozen shores,",
    "Complete these lyrics: Between the pages of a forgotten book,",
    "Complete these lyrics: Beneath the starry sky, as",
    "Complete these lyrics: Wrapped in the cold night air,",
    "Complete these lyrics: In the quiet hours before dawn,",
    "Complete these lyrics: On the edge of a dream, where",
    "Complete these lyrics: Under the burning sun that",
    "Complete these lyrics: Within the walls of this ancient castle,",
    "Complete these lyrics: Beyond the fields that",
    "Complete these lyrics: Underneath the weight of this world,",
    "Complete these lyrics: As the river flows and",
    "Complete these lyrics: Through the valley of shadows,",
    "Complete these lyrics: In the grip of the cold winter,",
    "Complete these lyrics: Amidst the clashing of our words,",
    "Complete these lyrics: On the streets where history",
    "Complete these lyrics: With the fading light of the evening,",
    "Complete these lyrics: In the hush of the night, where",
    "Complete these lyrics: By the old mill stream where",
    "Complete these lyrics: As the clock strikes midnight,",
    "Complete these lyrics: Beyond the misty mountains,",
    "Complete these lyrics: Through the sorrow and the pain,",
    "Complete these lyrics: In the echo of ancient chants,",
    "Complete these lyrics: On a journey to nowhere,",
    "Complete these lyrics: Amidst the crowds of yesterday,",
    "Complete these lyrics: In the mirror's reflection, I",
    "Complete these lyrics: At the break of dawn,",
    "Complete these lyrics: Where the wild roses grow,",
    "Complete these lyrics: In the gleam of the fading light,",
    "Complete these lyrics: By the calm waters of the lake,",
    "Complete these lyrics: Beneath the veil of twilight,",
    "Complete these lyrics: On the wings of the morning,",
    "Complete these lyrics: Through the storm and the rain,",
    "Complete these lyrics: As the world turns slowly,",
    "Complete these lyrics: Beneath the ancient oak,",
    "Complete these lyrics: On the road that leads to nowhere,",
    "Complete these lyrics: In the depths of your eyes,",
    "Complete these lyrics: At the end of the journey,",
    "Complete these lyrics: With the whisper of the wind,",
    "Complete these lyrics: Under the canopy of stars,",
    "Complete these lyrics: In the labyrinth of my thoughts,",
    "Complete these lyrics: When the shadows fall and",
    "Complete these lyrics: As we dance through the night,",
    "Complete these lyrics: On the shores of a distant land,",
    "Complete these lyrics: In the twilight of our years,",
    "Complete these lyrics: With each passing storm,",
    "Complete these lyrics: By the fading embers of the fire,",
    "Complete these lyrics: Through the pages of our days,",
    "Complete these lyrics: Under the veil of the night sky,",
    "Complete these lyrics: In the whispers of the forgotten,",
    "Complete these lyrics: At the edge of the sea,",
    "Complete these lyrics: Where the streets have no name,",
    "Complete these lyrics: In the flow of the endless river,",
    "Complete these lyrics: As the stars begin to gather,",
    "Complete these lyrics: On the brink of the unknown,",
    "Complete these lyrics: Within the echoes of silence,",
    "Complete these lyrics: Through the mist and the darkness,",
    "Complete these lyrics: Underneath the cascade of falling leaves,",
    "Complete these lyrics: As the fireflies light the night,",
    "Complete these lyrics: By the light of the silvery moon,",
    "Complete these lyrics: Amidst the ruins of once-great cities,",
    "Complete these lyrics: On the breath of the morning breeze,",
    "Complete these lyrics: In the shadow of the setting sun,",
    "Complete these lyrics: Through the dance of the firelight,",
    "Complete these lyrics: On the paths we used to roam,",
    "Complete these lyrics: As the rain washes over,",
    "Complete these lyrics: With the songs of the ancients,"
]

# Print out the list or perform any other operations you need with it
print(song_prompts)


['Complete these lyrics: Underneath the stars that', 'Complete these lyrics: When the rain starts falling and', 'Complete these lyrics: Beside the flowing river where', 'Complete these lyrics: Amid the echoes of the old town, the', 'Complete these lyrics: As the leaves turn golden, there', 'Complete these lyrics: Through the winding streets of my memories,', 'Complete these lyrics: Under the shadow of the old bridge, we', 'Complete these lyrics: When the lights go out and', 'Complete these lyrics: Across the endless sea, the wind', 'Complete these lyrics: Beneath the wide sky where', 'Complete these lyrics: Along the path less traveled by,', 'Complete these lyrics: Beyond the horizon where dreams', 'Complete these lyrics: Through the whispering fields of green,', 'Complete these lyrics: On the balcony at midnight, I', 'Complete these lyrics: In the silence of the morning,', 'Complete these lyrics: Beside the crackling fire, we', 'Complete these lyrics: When the curtain falls and', 'Com

# Evaluate using ChatGPT

In [None]:
from openai import OpenAI

client = OpenAI(
    api_key="OPENAPIKEY",
)

def compare_lyrics(lyrics1, lyrics2):
    prompt_text = f"Here are two sets of song lyrics:\n\nLyrics A:\n{lyrics1}\n\nLyrics B:\n{lyrics2}\n\nWhich set of lyrics do you think is better?"
    
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": prompt_text,
            }
        ],
        model="gpt-3.5-turbo",
    )

#     print(response.choices[0].text.strip())
    print(chat_completion.choices[0].message)
    
    
prompt = "Complete this lyric about love and loss:"
# Load models and tokenizer
model_base = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

text_generator_finetuned = pipeline('text-generation', model=model_path, tokenizer=model_path)
generated_lyrics_finetuned = text_generator_finetuned(prompt, max_length=500, truncation=True)[0]['generated_text']

text_generator_base = pipeline('text-generation', model=model_base, tokenizer=tokenizer)
generated_lyrics_base = text_generator_base(prompt, max_length=500, truncation=True)[0]['generated_text']

# Call the function to compare the lyrics
compare_lyrics(generated_lyrics_base, generated_lyrics_finetuned)
# Lyrics A is the first parameter, Lyrics B is the second parameter

In [34]:
from openai import OpenAI
import logging
import numpy as np
from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline

client = OpenAI(
    api_key="OPENAPIKEY",
)

logging.getLogger().setLevel(logging.ERROR)

def compare_lyrics(lyrics1, lyrics2):
    prompt_text = f"Here are two sets of song lyrics:\n\nLyrics A:\n{lyrics1}\n\nLyrics B:\n{lyrics2}\n\nWhich set of lyrics do you think is better (for your answer, just put your response ex. Lyrics _)?"
    
    chat_completion = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt_text}],
        model="gpt-3.5-turbo",
    )
    return chat_completion.choices[0].message.content

### gpt model stuff
model_base = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

### fine tuned model stuff
checkpoint_path = './models'
model_fine_tuned = GPT2LMHeadModel.from_pretrained(checkpoint_path)
tokenizer_fine_tuned = GPT2Tokenizer.from_pretrained(checkpoint_path)

text_generator_base = pipeline('text-generation', model=model_base, tokenizer=tokenizer)
text_generator_finetuned = pipeline('text-generation', model=model_fine_tuned, tokenizer=tokenizer_fine_tuned)
results = []

for prompt in song_prompts:
    generated_lyrics_finetuned = text_generator_finetuned(prompt, max_length=500, truncation=True)[0]['generated_text']
    generated_lyrics_base = text_generator_base(prompt, max_length=500, truncation=True)[0]['generated_text']
    result = compare_lyrics(generated_lyrics_base, generated_lyrics_finetuned)
    results.append(result)
    
print(results)

choices_count = np.mean([r == 'Lyrics B' for r in results])
print(f"Our Model was chosen {choices_count * 100}% of the time.")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

['Lyrics A', 'Lyrics B', 'Lyrics B', 'Lyrics B', 'Lyrics A', 'Lyrics B', 'Lyrics A', 'Lyrics B', 'Lyrics A', 'Lyrics B', 'Lyrics B', 'Lyrics B', 'Lyrics B', 'Lyrics B', 'Lyrics B', 'Lyrics A', 'Lyrics B', 'Lyrics A', 'Lyrics B', 'Lyrics B', 'Lyrics B', 'Lyrics B', 'Lyrics A', 'Lyrics B', 'Lyrics A', 'Lyrics B', 'Lyrics B', 'Lyrics B', 'Lyrics B', 'Lyrics B', 'Lyrics B', 'Lyrics B', 'Lyrics B', 'Lyrics B', 'Lyrics B', 'Lyrics B', 'Lyrics B', 'Lyrics B', 'Lyrics B', 'Lyrics B', 'Lyrics B', 'Lyrics A', 'Lyrics B', 'Lyrics B', 'Lyrics B', 'Lyrics B', 'Lyrics B', 'Lyrics B', 'Lyrics B', 'Lyrics B', 'Lyrics A', 'Lyrics B', 'Lyrics A', 'Lyrics B', 'Lyrics B', 'Lyrics B', 'Lyrics B', 'Lyrics B', 'Lyrics B', 'Lyrics A', 'Lyrics B', 'Lyrics B', 'Lyrics B', 'Lyrics B', 'Lyrics B', 'Lyrics B', 'Lyrics B', 'Lyrics B', 'Lyrics B', 'Lyrics B', 'Lyrics B', 'Lyrics A', 'Lyrics B', 'Lyrics A', 'Lyrics B', 'Lyrics B', 'Lyrics B', 'Lyrics B', 'Lyrics B', 'Lyrics B', 'Lyrics B', 'Lyrics B', 'Lyrics B', 'Ly