In [1]:
# Mount Google Drive at /content/drive so files stored on Drive can be read
# and written from this Colab runtime (the data paths used later live under it).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# GPT2 Transformer Generator
Uses a fine-tuned GPT-2 model to generate lyrics for each genre.

In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.23.1-py3-none-any.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 27.0 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 73.5 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 58.5 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 tokenizers-0.13.1 transformers-4.23.1


In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from tqdm.notebook import tqdm

In [6]:
# Root folder (on the mounted Drive) that holds the input CSVs, the fine-tuned
# checkpoints and the generated outputs.
DATA_DIR = "/content/drive/MyDrive/w266-finalproj/data"

# Show which GPUs TensorFlow can see on this runtime (empty list -> CPU only).
gpu_devices = tf.config.list_physical_devices('GPU')
print(gpu_devices)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [7]:
# Load the seed-lyrics table used as prompts for generation.
# NOTE(review): the previous comment described a 6000-song pop sample split
# for fine-tuning; the file read here is the "16_tokens_seeds" CSV -- confirm
# which description is current.
seed_csv_path = f"{DATA_DIR}/16_tokens_seeds.csv"
data = pd.read_csv(seed_csv_path)

# Discard every column whose name starts with "Unnamed" (presumably index
# columns left behind by an earlier to_csv round trip).
unnamed_column_mask = data.columns.str.contains('^Unnamed')
data = data.loc[:, ~unnamed_column_mask]
data

Unnamed: 0,genre,processed_lyric,word_num,language,lyric
0,Metal,"['down', 'fell', 'the', 'stars', 'as', 'they',...",163,en,Down fell the stars as they splashed into the ...
1,Metal,"['yesterday', 'is', 'gone', 'forever', '\n', '...",232,en,Yesterday is gone forever\nNo turning back the...
2,Metal,"['crisis', 'feeds', 'the', 'lunacy', '\n', 'al...",176,en,Crisis feeds the lunacy\nAll fear the new mach...
3,Metal,"['lay', 'beside', 'me,', 'tell', 'me', 'what',...",395,en,"Lay beside me, tell me what they've done\nSpea..."
4,Metal,"['the', 'sky', 'was', 'clear', 'that', 'night'...",83,en,The sky was clear that night\nWe were alone\nA...
...,...,...,...,...,...
795,Soul,"['lay', 'it', 'down,', 'lay', 'it', 'down,', '...",315,en,"Lay it down, lay it down, lay it down\nPut you..."
796,Soul,"['was', 'blind,', 'but', 'now', 'i', 'see', '\...",241,en,"Was blind, but now I see\n\nAmazing, Amazing G..."
797,Soul,"['there', ""ain't"", 'no', 'reason', 'for', 'us'...",336,en,There ain't no reason for us sitting down\nTo ...
798,Soul,"['willow', 'weep', 'for', 'me', '\n', 'willow'...",127,en,Willow weep for me\nWillow weep for me\nBend y...


## Data Preprocessing
A quick look at the lyrics shows that the data is a little dirty, so some preprocessing is needed before it is ready to use. The following preprocessing steps will be applied:
1. removal of odd tokens
2. tokenization

In [8]:
# Load the pretrained 'gpt2' tokenizer; it is used further down to encode the
# text prompts into token ids and to decode generated ids back into text.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [10]:
# Generate lyrics for every (non-skipped) genre with that genre's fine-tuned
# GPT-2 checkpoint and write one CSV of results per genre.
import torch.nn.functional as F

# Genres excluded from this run (hard-coded in the original cell; presumably
# they were already generated in earlier runs -- TODO confirm before re-running).
SKIP_GENRES = ['Metal', 'Rap', 'Rock', 'Jazz', 'Folk', 'Pop', 'Soul']

# Number of leading words of each lyric that form the prompt.
PROMPT_WORDS = 16

# Sampling parameters.
top_p = 0.8                    # nucleus (top-p) cumulative-probability cut-off
temperature = 1.               # logits are divided by this when it is > 0
filter_value = -float("Inf")   # value written into filtered-out logits
max_length = 100               # maximum number of tokens sampled per prompt

# Characters mapped to a space, or deleted (None), while building a prompt.
_PROMPT_CHAR_TABLE = str.maketrans({
  '\n': ' ', '\t': ' ', '#': ' ',
  "'": None, '(': None, ')': None, ';': None, ':': None,
  '-': None, '[': None, ']': None,
})


def build_prompt(lyric, num_words=PROMPT_WORDS):
  """Return the cleaned, lower-cased first `num_words` words of `lyric`.

  Double newlines collapse to a single space first; remaining newlines, tabs
  and '#' become spaces; quotes, brackets and ';', ':', '-' are removed.
  Words are split on single spaces, so runs of spaces yield empty "words",
  exactly as in the original cleaning chain.
  """
  cleaned = lyric.replace('\n\n', ' ').translate(_PROMPT_CHAR_TABLE)
  return " ".join(cleaned.lower().split(" ")[:num_words])


def top_p_filter(logits, top_p, filter_value=-float("Inf")):
  """Apply nucleus (top-p) filtering to `logits` in place and return it.

  `logits` is the 2-D next-token score tensor for a batch of one sequence.
  Tokens are sorted by score; once the cumulative probability exceeds `top_p`
  every later token is set to `filter_value`. The mask is shifted right by one
  so the first token crossing the threshold is kept, and the best token is
  never removed.
  """
  sorted_logits, sorted_indices = torch.sort(logits, descending=True)
  cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

  sorted_indices_to_remove = cumulative_probs > top_p
  sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
  sorted_indices_to_remove[..., 0] = False

  indices_to_remove = sorted_indices[sorted_indices_to_remove]
  logits[:, indices_to_remove] = filter_value
  return logits


def generate_lyric(model, tokenizer, prompt, max_length=100, top_p=0.8,
                   temperature=1., filter_value=-float("Inf")):
  """Sample up to `max_length` tokens continuing `prompt`.

  Returns:
    (text, reached_eos): the decoded prompt + continuation, always ending in
    exactly one "<|endoftext|>" marker, and whether the model itself produced
    that marker (True) or it was appended because `max_length` was hit (False).
  """
  # Encode the end-of-text marker once instead of on every sampling step.
  eos_ids = tokenizer.encode("<|endoftext|>")
  generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
  reached_eos = False

  # Inference only: no gradients needed, and no `labels`, so no loss is computed.
  with torch.no_grad():
    for _ in range(max_length):
      # Without labels the first output element is the logits tensor.
      logits = model(generated)[0]
      logits = logits[:, -1, :] / (temperature if temperature > 0 else 1.0)
      logits = top_p_filter(logits, top_p, filter_value)

      # Sample the next token from the filtered distribution and append it.
      next_token = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
      generated = torch.cat((generated, next_token), dim=1)

      if next_token.item() in eos_ids:
        reached_eos = True
        break

  output_text = tokenizer.decode(generated.squeeze(0).tolist())
  if not reached_eos:
    # Only add the marker when the model did not emit it itself; previously it
    # was appended unconditionally, producing a doubled end-of-text marker.
    output_text = f"{output_text}<|endoftext|>"
  return output_text, reached_eos


for genre in data['genre'].unique():
  if genre in SKIP_GENRES:
    continue

  # 'R&B' is spelled 'rnb' in file names and in the saved "genre" column; the
  # original label is still used to slice the input data.
  genre_slug = 'rnb' if genre == 'R&B' else genre

  # NOTE(security): torch.load unpickles the file and can execute arbitrary
  # code -- only load checkpoints produced by this project.
  # map_location lets a GPU-saved checkpoint load on a CPU-only runtime.
  model = torch.load(
      f'{DATA_DIR}/03_model_training/fine-tuning/{genre_slug.lower()}-gpt2-fined-tuned-model.pt',
      map_location='cpu',
  )
  model = model.to('cpu')
  model.eval()
  print(f"{genre_slug} Model Successfully Loaded")

  genre_specific_data = data[data['genre'] == genre]
  generated_num = 0   # samples for which the model emitted end-of-text itself
  records = []

  for _, row in tqdm(genre_specific_data.iterrows(), total=len(genre_specific_data)):
    prompt = build_prompt(row.lyric, PROMPT_WORDS)
    output_text, reached_eos = generate_lyric(
        model, tokenizer, prompt,
        max_length=max_length, top_p=top_p,
        temperature=temperature, filter_value=filter_value,
    )
    generated_num += int(reached_eos)

    records.append({
        "genre": genre_slug,
        "original_lyric": row.lyric,
        "generated_lyric": output_text,
        "model": "gpt2"
    })

  # Build the frame once from the collected rows (DataFrame.append was
  # quadratic in a loop and no longer exists in pandas 2.x).
  generated_data = pd.DataFrame(
      records, columns=['genre', 'original_lyric', 'generated_lyric', 'model'])
  generated_data.to_csv(f"{DATA_DIR}/gpt_{genre_slug.lower()}_generated_lyrics.csv")

rnb Model Successfully Loaded


0it [00:00, ?it/s]