In [1]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from datasets import load_dataset
from sklearn.metrics.pairwise import cosine_similarity
import torch

  from .autonotebook import tqdm as notebook_tqdm


# Load Dataset

In [2]:
dataset = load_dataset("amishshah/song_lyrics")
dataset = dataset["train"].shuffle(seed=42)
subset_size = 200
dataset = dataset.select(range(subset_size))
train_test_dataset = dataset.train_test_split(test_size=0.1)
train_dataset = train_test_dataset["train"]
val_dataset = train_test_dataset["test"]
#train_test_dataset = dataset["train"].train_test_split(test_size=0.1)
#train_dataset = train_test_dataset["train"]
#val_dataset = train_test_dataset["test"]

In [3]:
#train_subset = train_dataset.select(range(100))
#val_subset = val_dataset.select(range(50))

# Load tokenizer and pre-trained model

In [3]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset

# Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Ensure that tokenizer has padding token set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [4]:
# Tokenize the data
def tokenize_function(examples):
    return tokenizer(examples['lyrics'], truncation=True, padding=True, max_length=512)

train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 180/180 [00:00<00:00, 322.15 examples/s]
Map: 100%|██████████| 20/20 [00:00<00:00, 234.23 examples/s]


In [7]:
# Set training arguments
training_args = TrainingArguments(
    output_dir='./models',
    num_train_epochs=4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# Initialize Trainer
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

In [2]:
from tensorflow.python.client import device_lib 
print(device_lib.list_local_devices())

ModuleNotFoundError: No module named 'absl'

In [1]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Device: ", device)

NameError: name 'torch' is not defined

In [8]:
# Check if GPU is available and if not, use CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Device: ", device)

model.to(device)
# Train the model
trainer.train()

                                       
  0%|          | 0/180 [02:38<?, ?it/s]         

{'loss': 3.404, 'grad_norm': 10.230388641357422, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.22}


                                       
  0%|          | 0/180 [04:51<?, ?it/s]         

{'loss': 3.3938, 'grad_norm': 8.806055068969727, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.44}


                                       
  0%|          | 0/180 [07:10<?, ?it/s]         

{'loss': 3.3367, 'grad_norm': 7.128822326660156, 'learning_rate': 3e-06, 'epoch': 0.67}


                                       
  0%|          | 0/180 [09:35<?, ?it/s]         

{'loss': 3.1115, 'grad_norm': 8.318202018737793, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.89}




KeyboardInterrupt: 

In [None]:
model.save_pretrained('./results')
tokenizer.save_pretrained('./results')

# Load the model and tokenizer for text generation
from transformers import pipeline

# Ensure your model and tokenizer are loaded correctly
diomedes = pipeline('text-generation', model='./results', tokenizer='./results')

# Generate text using the pipeline
results = diomedes('Hello ', max_length=600)
print(results[0]['generated_text'])


# Now with genre data

In [51]:
def tokenize_function(examples):
    # Prepend the tag to each lyric in the batch
    concatenated_lyrics = ["[Genre: " + tag + "] " + lyric for tag, lyric in zip(examples["tag"], examples["lyrics"])]
    return tokenizer(concatenated_lyrics, truncation=True, padding="max_length", max_length=512)


train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

In [None]:
# Set training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
)

# Initialize Trainer
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Train the model
trainer.train()

In [None]:
# Save the model and tokenizer
model.save_pretrained('./results')
tokenizer.save_pretrained('./results')

In [3]:
model_path = './models'

In [5]:
# Load the model and tokenizer for text generation
from transformers import pipeline

# Ensure your model and tokenizer are loaded correctly
text_generator = pipeline('text-generation', model=model_path, tokenizer=model_path)

# Example of generating genre-specific text
genre = "rap"  # Replace with any genre present in your dataset

prompt = f"[Genre: {genre}] "
results = text_generator(prompt, max_length=500)
print(results[0]['generated_text'])

OSError: Can't load tokenizer for './models/checkpoint-9000'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure './models/checkpoint-9000' is the correct path to a directory containing all relevant files for a GPT2TokenizerFast tokenizer.

# Load in a fine-tuned model

In [3]:
model = GPT2LMHeadModel.from_pretrained('./checkpoint-9000')

# Evaluate fine-tuning using perplexity

In [None]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

def calculate_perplexity(model, tokenizer, text):
    encode = tokenizer.encode(text, return_tensors='pt')
    with torch.no_grad():
        outputs = model(encode, labels=encode)
        loss = outputs[0]

    return torch.exp(loss).item()

# Load models and tokenizer
model_pretrained = GPT2LMHeadModel.from_pretrained('path_to_pretrained_model')
model_base = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Example text
text = "Complete this lyric about love and loss:"

# Calculate perplexity
perplexity_pretrained = calculate_perplexity(model_pretrained, tokenizer, text)
perplexity_base = calculate_perplexity(model_base, tokenizer, text)

print(f'Perplexity of Pretrained Model: {perplexity_pretrained}')
print(f'Perplexity of Base GPT-2 Model: {perplexity_base}')


# Evaluate fine-tuning using rouge-score

In [None]:
from rouge_score import rouge_scorer

# Example data
generated_lyrics = [
    "hello darkness my old friend, I've come to talk with you again",
    "because a vision softly creeping, left its seeds while I was sleeping"
]

reference_lyrics = [
    "hello darkness my old friend, I've come to speak with you again",
    "because a vision softly creeping, left its seeds while I was sleeping"
]

# Initialize the ROUGE scorer, you can specify which rouge types to calculate
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Function to calculate average scores
def calculate_average_rouge(generated, references):
    scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}
    
    for gen, ref in zip(generated, references):
        score = scorer.score(ref, gen)
        for key in scores.keys():
            scores[key].append(score[key].fmeasure)
    
    average_scores = {key: sum(values) / len(values) for key, values in scores.items()}
    return average_scores

# Calculate average ROUGE scores
average_scores = calculate_average_rouge(generated_lyrics, reference_lyrics)
print("Average ROUGE scores:", average_scores)
