In [1]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, get_linear_schedule_with_warmup, AdamW
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
import pandas as pd
import torch
import torch.nn as nn
import numpy as np
import datasets 
# from datasets import Dataset, DatasetDict
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm

import logging
logging.getLogger().setLevel(logging.CRITICAL)

import warnings
warnings.filterwarnings('ignore')

device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

# Data Preprocessing

In [2]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
import re

def clean_text(line):
    pattern = re.compile(r'[^a-zA-Z0-9\s,.!?[\]]')
    line = pattern.sub('', line)
    line = re.sub(r'\s+', ' ', line).strip()
    return line

# lyrics_train_df = pd.read_csv('./data/lyrics_train.csv')  # No need for index_col=False, as it's False by default
# lyrics_train_df['Lyric'] = lyrics_train_df['Lyric'].str.replace("\r\n", " ").str.replace("\r", " ").str.replace("\n", " ")
# lyrics_train_df['Lyric'] = lyrics_train_df['Lyric'].apply(clean_text)
# lyrics_train_df.to_csv('./cleaned_train_lyrics.csv', index=False)
lyrics_train_df = pd.read_csv('./data/cleaned_train_lyrics.csv')

lyrics_train_df

Unnamed: 0.1,Unnamed: 0,Lyric,genre
0,0,"See me, ancient one! Dismal Tuat, Nergal unsaf...",Metal
1,1,Feels like Im covered in lies so turn off the ...,Metal
2,2,"Works of art, painted black Magniloquent, blee...",Metal
3,3,Into the cage like an animal You must survive ...,Metal
4,4,Paralysed in pleasure I hear you call Lost my ...,Metal
...,...,...,...
499995,499995,[Verse 1] I dont want to tell you that its ove...,country
499996,499996,I get to thinking sometimes I dont know why I ...,country
499997,499997,When I was A little boy around the table athom...,country
499998,499998,[Verse 1] Its a junked out joint off a backroa...,country


In [4]:
import re

def clean_text(line):
    pattern = re.compile(r'[^a-zA-Z0-9\s,.!?[\]]')
    line = pattern.sub('', line)
    line = re.sub(r'\s+', ' ', line).strip()
    return line

# lyrics_test_df = pd.read_csv('./data/lyrics_test.csv')  # No need for index_col=False, as it's False by default
# lyrics_test_df['Lyric'] = lyrics_test_df['Lyric'].str.replace("\r\n", " ").str.replace("\r", " ").str.replace("\n", " ")
# lyrics_test_df['Lyric'] = lyrics_test_df['Lyric'].apply(clean_text)
# lyrics_test_df.to_csv('./data/cleaned_test_lyrics.csv', index=False)
lyrics_test_df = pd.read_csv('./data/cleaned_test_lyrics.csv')

lyrics_test_df

Unnamed: 0.1,Unnamed: 0,Lyric,genre
0,0,can you hear me call your name Im not far away...,Metal
1,1,You say you are so clever You beleive that you...,Metal
2,2,Walking across these misery plains When all fo...,Metal
3,3,Fuck you you bitch get out of my head Twisting...,Metal
4,4,Crashing forth upon the soil Filthy waves gave...,Metal
...,...,...,...
49995,49995,[Verse 1] When the sun sinks down and dreams s...,country
49996,49996,I watched from the window as she slipped from ...,country
49997,49997,"Look around, it is never far See who the wound...",country
49998,49998,We started arguing on the onramp Of Interstate...,country


In [6]:
class LyricsDataset(Dataset):
    def __init__(self, lyrics_df):
        super().__init__()


        self.lyric_list = []
        self.end_of_text_token = "<|endoflyric|>"
        
        for lyric, genre in tqdm(zip(lyrics_df['Lyric'],lyrics_df['genre'] ), total=len(lyrics_df['genre'])):
            lyric_str = f"LYRIC[{genre.lower()}]:{lyric}{self.end_of_text_token}"
#             lyric_str = f"LYRIC:{lyric}{self.end_of_text_token}"
            self.lyric_list.append(lyric_str)
        
    def __len__(self):
        return len(self.lyric_list)

    def __getitem__(self, item):
        return self.lyric_list[item]

In [7]:
train_data = LyricsDataset(lyrics_train_df)
test_data = LyricsDataset(lyrics_test_df)

train_loader = DataLoader(train_data, batch_size=1, shuffle=True)
test_loader = DataLoader(test_data, batch_size=1, shuffle=True)

  0%|          | 0/500000 [00:00<?, ?it/s]

  0%|          | 0/50000 [00:00<?, ?it/s]

# Training Pipelines

In [5]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.to(device)

tokenizer.add_special_tokens({
    'pad_token':'<|pad|>'
                             })

1

In [None]:
def calc_perp(model, tokenizer, test_loader):
    model.eval()
    nlls = []
    for lyric in tqdm(test_loader):
        lyric_tens = tokenizer(lyric, padding=True, truncation= True, return_tensors='pt')['input_ids'].to(device)
        target_tens = lyric_tens.clone()
        with torch.no_grad():
            outputs = model(lyric_tens, labels=target_tens)
            neg_log_likelihood = outputs.loss
            nlls.append(neg_log_likelihood)
    ppl = torch.exp(torch.stack(nlls).mean())
    print("Evaluations:", ppl.item())
    model.train()

In [8]:
BATCH_SIZE = 32
EPOCHS = 5
LEARNING_RATE = 5e-6
WARMUP_STEPS = 80000
EVAL_STEPS = 100000
PRINT_STEPS = 100


optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=WARMUP_STEPS, num_training_steps=-1)
proc_seq_count = 0


proc_seq_count = 0
sum_loss = 0.0
batch_count = 0
steps = 0

In [9]:
model.train()
for epoch in range(EPOCHS):
    
    print(f"EPOCH {epoch} started" + '=' * 30)
    
    for idx,lyric in enumerate(tqdm(train_loader)):
        lyric_tens = tokenizer(lyric, padding=True, return_tensors='pt', truncation= True)['input_ids'].to(device)
        output = model(lyric_tens, labels = lyric_tens.clone())
        loss = output['loss']  / BATCH_SIZE
        loss.backward()
        sum_loss = sum_loss + output['loss'].detach().data
        
        proc_seq_count = proc_seq_count + 1
        steps += 1
        if proc_seq_count == BATCH_SIZE:
            proc_seq_count = 0    
            batch_count += 1
            optimizer.step()
            scheduler.step() 
            optimizer.zero_grad()
            model.zero_grad()

        if batch_count == PRINT_STEPS:
            print(f"sum loss {sum_loss}")
            batch_count = 0
            sum_loss = 0.0
        if steps > EVAL_STEPS:
            steps = 0
            calc_perp(model, tokenizer, test_loader)
            model.push_to_hub("multi-genre-mdeium")
            tokenizer.push_to_hub("multi-genre-mdeium")



  0%|          | 0/500000 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [39]:
merged_lyrics = ' '.join(train_data.lyric_list)
with open('train.txt', 'w') as file:
    file.write(merged_lyrics)

In [30]:
# Create a PyTorch dataset

# dataset = TextDataset(file_path='./train.txt', tokenizer = tokenizer, block_size=128)

# Define the training arguments
training_args = TrainingArguments(
    output_dir="./",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)

# Define the data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train_dataset,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
trainer.save_model("./gpt2-finetuned")

NameError: name 'tokenized_train_dataset' is not defined

# Evaluations

In [3]:
import random 

model.to(device)

device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
    
    
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("D3STRON/multi_genre_music_generator ")
model = AutoModelForCausalLM.from_pretrained("D3STRON/multi_genre_music_generator ")
model.to(device)

def evaluate(model, tokenizer, test_loader):
    model.eval()
    nlls = []
    EVAL_STEPS = 1000
    steps = 0
    for lyric in tqdm(test_loader):
        lyric_tens = tokenizer(lyric, padding=True, truncation= True, return_tensors='pt')['input_ids'].to(device)
        target_tens = lyric_tens.clone()
        with torch.no_grad():
            outputs = model(lyric_tens, labels=target_tens)
            neg_log_likelihood = outputs.loss
            nlls.append(neg_log_likelihood)
        if EVAL_STEPS == steps:
            steps = 0
            ppl = torch.exp(torch.stack(nlls).mean())
            print("Evaluations:", ppl.item())
        steps += 1

In [24]:
evaluate(model, tokenizer, test_loader)

  0%|          | 0/50000 [00:00<?, ?it/s]

Evaluations: 22.455976486206055
Evaluations: 21.971406936645508
Evaluations: 22.022998809814453
Evaluations: 22.10702133178711
Evaluations: 22.291528701782227


KeyboardInterrupt: 

# Inference

In [2]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("D3STRON/multi_genre_music_generator")
model = AutoModelForCausalLM.from_pretrained("D3STRON/multi_genre_music_generator")
model.to(device)

def choose_from_top(probs, cur_ids, n=7, no_rep=2):
    ind = np.argpartition(probs, -n)[-n:]
    top_prob = probs[ind]
    top_prob /= np.sum(top_prob)  # Normalize

    cur_ids_list = cur_ids.squeeze().cpu().numpy().tolist()
    for i in range(200):
        if i < 4:
            choice = np.random.choice(ind, 1)[0]
        else:
            choice = np.random.choice(ind, 1, p=top_prob)[0]
        n_gram = cur_ids_list[-no_rep + 1 :] + [choice]
        if any(cur_ids_list[i : i + no_rep] == n_gram for i in range(len(cur_ids_list) - no_rep + 1)):
            continue
        return choice
    return np.random.choice(ind)

In [3]:
with torch.no_grad():
    cur_ids = tokenizer("LYRIC[pop]:Oh Anurag my friend", padding=True, return_tensors='pt', truncation= True)['input_ids'].to(device)
    for i in range(100):
        outputs = model(cur_ids, labels=cur_ids)
        logits = outputs['logits']
        softmax_logits = torch.softmax(logits[0,-1], dim=0) #Take the first(from only one in this case) batch and the last predicted embedding
        if i == 3:
            n = 30
        else:
            n = 5
        next_token_id = choose_from_top(softmax_logits.to('cpu').numpy(), cur_ids, n=n) #Randomly(from the topN probability distribution) select the next word
        cur_ids = torch.cat([cur_ids, torch.ones((1,1)).long().to(device) * next_token_id], dim = 1) # Add the last word to the running sequence

        if next_token_id in tokenizer.encode('<|endoflyric|>'):
            joke_finished = True
            break

    output_list = list(cur_ids.squeeze().to('cpu').numpy())
    output_text = tokenizer.decode(output_list)
    print(output_text)

LYRIC[pop]:Oh Anurag my friend, the world may not know your true colors, you might not understand the pain that I feel I know youve got it all, I can see that, but its not for you Ive seen the way your heart is racing I see the things in front you, and you dont see them at the time, so its up in the sky Oh I dont care about your feelings or your dreams, its all just part of your story I just wanna be with my best man I wanna know that Im
