In [None]:
# Mount Google Drive (Colab only) so files under /content/drive can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# GPT Fine Tuning
Lyric generation is primarily a next-word prediction task: each new word is generated using the preceding words as context, so a decoder-based transformer is a good fit. However, GPT-2 is pretrained on a large, general-purpose corpus, so we will perform some fine-tuning on our lyrics data before it is ready to use for lyric generation. [Fine-Tune-GPT-article](https://towardsdatascience.com/how-to-fine-tune-gpt-2-for-text-generation-ae2ea53bc272)

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.23.1-py3-none-any.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 36.2 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 47.2 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 52.9 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 tokenizers-0.13.1 transformers-4.23.1


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm

In [None]:
# Root folder of the project data on the mounted Google Drive.
DATA_DIR = "/content/drive/MyDrive/w266-finalproj/data"
# List the GPU devices visible to TensorFlow (quick check that a GPU runtime is attached).
print(tf.config.list_physical_devices('GPU'))

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [None]:
# load the lyrics sample
# NOTE(review): the file read here is rock_sample.csv although the variable names (and the
# earlier comments) refer to "pop" lyrics — confirm which genre is intended.
pop_data = pd.read_csv(f"{DATA_DIR}/02_intermediate/rock_sample.csv")
# drop every column whose name starts with "Unnamed"
# (presumably a stray index column written by an earlier CSV export — TODO confirm)
pop_data = pop_data.loc[:, ~pop_data.columns.str.contains('^Unnamed')]

# The original plan was to sample 2000 lyrics (without replacement, so the two sets do not
# overlap) for fine-tuning and keep the rest for training a decoder layer. That split is
# currently disabled and the whole sample is used for fine-tuning.
# fine_tuning_set = pop_data.sample(n=2000, replace=False)
# generation_set = pop_data.drop(fine_tuning_set.index)
fine_tuning_set = pop_data

## Data Preprocessing
A quick look at the lyrics shows that the data is a little dirty, so some preprocessing is needed before it is ready to use. The following preprocessing steps will be applied:
1. removal of odd tokens
2. tokenization

In [None]:
class SongLyrics(Dataset):
    """Torch ``Dataset`` holding cleaned, GPT-2-tokenized song lyrics (one tensor per lyric)."""

    # Maximum number of lyrics kept when ``truncate=True``.
    MAX_LYRICS = 20000

    # Single-character clean-up: newlines, tabs and '#' become a space,
    # the listed punctuation marks are deleted.
    _CHAR_TABLE = str.maketrans({
        "\n": " ", "\t": " ", "#": " ",
        "'": None, "(": None, ")": None, ";": None, ":": None,
        "-": None, "[": None, "]": None,
    })

    def __init__(self, data, truncate=False, gpt2_type="gpt2", max_length=1024,
                 control_code="lyrics"):
        """
            Clean up and tokenize the given lyrics
            Parameters
            ----------
            data : iterable of str (e.g. pd.Series)
                raw lyrics, one string per song; entries that are not strings
                (e.g. NaN for a missing lyric) are skipped

            truncate : { boolean, default: False }
                when True, keep at most ``MAX_LYRICS`` (20000) tokenized lyrics

            gpt2_type : { string, default: "gpt2" }
                name of the pretrained GPT-2 tokenizer to load

            max_length : { int, default=1024 },
                cap applied twice: to the number of characters kept from each cleaned
                lyric, and to the number of tokens produced by the tokenizer

            control_code : { string, default: "lyrics" }
                short tag placed in front of every lyric as ``<|control_code|>``.
                Previously the whole ``data`` object was interpolated here by mistake,
                which prefixed every sample with the printed representation of the series.

            Returns
            ----------
            SongLyrics : Object
                dataset whose items are 1-D tensors of token ids

        """
        self.tokenizer = GPT2Tokenizer.from_pretrained(gpt2_type)
        self.lyrics = []

        for row in data:
            # Stop early instead of tokenizing rows that would be discarded anyway.
            if truncate and len(self.lyrics) >= self.MAX_LYRICS:
                break
            # Missing lyrics (NaN) have no ``replace``/``translate``; skip them.
            if not isinstance(row, str):
                continue

            text = self._clean(row)
            self.lyrics.append(torch.tensor(
                self.tokenizer.encode(f"<|{control_code}|>{text[:max_length]}<|endoftext|>",
                                      truncation=True,
                                      max_length=max_length)
            ))

        self.lyrics_count = len(self.lyrics)

    @classmethod
    def _clean(cls, text):
        """Collapse blank lines into one space, then apply the single-character clean-up table."""
        # The double newline must be handled first so that it yields ONE space, not two.
        return text.replace('\n\n', ' ').translate(cls._CHAR_TABLE)

    def __len__(self):
        """Number of tokenized lyrics in the dataset."""
        return self.lyrics_count

    def __getitem__(self, item):
        """Return the token-id tensor of the lyric at position ``item``."""
        return self.lyrics[item]

# clean up and tokenize the `lyrics` column
# truncate=True caps the dataset at the first 20000 lyrics
# NOTE(review): this reads pop_data directly; fine_tuning_set (the same object) is not used here.
dataset = SongLyrics(pop_data['lyrics'], truncate=True, gpt2_type="gpt2", max_length=1024)
print(f"Number of Lyrics: {len(dataset)}")

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Number of Lyrics: 6000


## Fine-Tuning Task
Preparing GPT-2 for the fine-tuning task. Since GPT-2 is big, we will most likely run into memory errors. To remedy this issue, we will use the gradient accumulation technique outlined in [Fine-Tune-GPT-article](https://towardsdatascience.com/how-to-fine-tune-gpt-2-for-text-generation-ae2ea53bc272). Quoting the article: "the idea is that before calling for optimization to perform a step of gradient descent, it will sum the gradients of several operations. Then, it will divide that total by the number of accumulated steps in order to get an average loss over the training sample."

In [None]:
# Load the pretrained GPT-2 tokenizer and the GPT-2 model with a language-modeling head.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

Downloading:   0%|          | 0.00/548M [00:00<?, ?B/s]

In [None]:
def pack_tensor(new_tensor, packed_tensor, max_seq_len):
    """
        Try to merge `new_tensor` into the running `packed_tensor` along dim 1.

        Returns a 3-tuple `(tensor, packed, leftover)`:
        * nothing packed yet      -> `(new_tensor, True, None)`
        * merged length fits      -> `(new_tensor + packed_tensor[:, 1:], True, None)`
        * merged length too long  -> `(packed_tensor, False, new_tensor)`

        The fit check uses the full lengths of both tensors, while the merge itself
        drops the first column of `packed_tensor`.
    """
    # Nothing accumulated so far: the incoming tensor starts a new pack.
    if packed_tensor is None:
        return new_tensor, True, None

    combined_len = packed_tensor.size(1) + new_tensor.size(1)
    if combined_len <= max_seq_len:
        merged = torch.cat((new_tensor, packed_tensor[:, 1:]), dim=1)
        return merged, True, None

    # No room left: hand back the finished pack and the tensor that did not fit.
    return packed_tensor, False, new_tensor

In [None]:
def train( 
    dataset, model, tokenizer,
    batch_size=24, epochs=20, lr=2e-5,
    max_seq_len=400, warmup_steps=200,
    gpt2_type="gpt2", output_dir=".", output_prefix="wreckgar",
    test_mode=False,save_model_on_epoch=False,
):
    """
        Fine-tune `model` on `dataset` with sequence packing and gradient accumulation.

        Parameters
        ----------
        dataset : Dataset
            items are 1-D tensors of token ids; they are read one at a time and packed
            together with `pack_tensor` before being fed to the model
        model : torch.nn.Module
            called as `model(input_ids, labels=input_ids)`; element 0 of its output
            is used as the loss
        tokenizer, gpt2_type, test_mode
            accepted for interface compatibility; not used by this function
        batch_size : int
            number of forward/backward passes accumulated per optimizer step
        epochs : int
            number of passes over `dataset`
        lr : float
            peak learning rate
        max_seq_len : int
            NOTE(review): accepted but not applied — packing uses a fixed length of 768
            (see `pack_len` below). Wiring it in would change the default behaviour.
        warmup_steps : int
            number of optimizer steps of linear learning-rate warmup
        output_dir, output_prefix : str
            checkpoint location/name, used only when `save_model_on_epoch` is True
        save_model_on_epoch : bool
            when True, save `model.state_dict()` after every epoch

        Returns
        -------
        the trained model (moved to the training device)
    """
    import os  # needed for checkpoint paths; the notebook has no file-level `import os`

    pack_len = 768

    # Fall back to the CPU when no GPU is available instead of failing on `.cuda()`.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.train()

    train_dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

    # The linear schedule needs a positive step budget: with num_training_steps=-1 the
    # learning rate drops to 0 as soon as warmup ends. The exact number of optimizer
    # steps depends on how the shuffled lyrics pack together, so use the upper bound of
    # one forward pass per lyric; the decay then simply stops short of zero.
    total_passes = len(train_dataloader) * epochs
    total_steps = max(1, (total_passes + batch_size - 1) // batch_size)

    optimizer = AdamW(model.parameters(), lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps
    )

    loss=0
    accumulating_batch_count = 0
    input_tensor = None
    last_idx = len(train_dataloader) - 1
    optimizer.zero_grad()

    for epoch in range(epochs):

        print(f"Training epoch {epoch}")
        print(loss)
        for idx, entry in tqdm(enumerate(train_dataloader)):
            (input_tensor, carry_on, remainder) = pack_tensor(entry, input_tensor, pack_len)

            is_last = idx == last_idx
            # Keep packing until the pack is full or the epoch runs out of lyrics.
            if carry_on and not is_last:
                continue

            ready = [input_tensor]
            if is_last and remainder is not None:
                # Nothing left to pack the leftover lyric with: train on it by itself.
                ready.append(remainder)
                remainder = None

            for packed in ready:
                packed = packed.to(device)
                outputs = model(packed, labels=packed)
                loss = outputs[0]
                loss.backward()

                # Count the pass first so a step happens after every `batch_size`
                # accumulated passes (not right after the very first one).
                accumulating_batch_count += 1
                if (accumulating_batch_count % batch_size) == 0:
                    optimizer.step()
                    scheduler.step()
                    optimizer.zero_grad()

            # The lyric that did not fit starts the next pack instead of being dropped.
            input_tensor = remainder
        if save_model_on_epoch:
            torch.save(
                model.state_dict(),
                os.path.join(output_dir, f"{output_prefix}-{epoch}.pt"),
            )
    return model

In [None]:
# Fine-tune with the default hyperparameters (batch_size=24, epochs=20, lr=2e-5).
model = train(dataset, model, tokenizer)



Training epoch 0
0


6000it [07:53, 12.66it/s]


Training epoch 1
tensor(1.0024, device='cuda:0', grad_fn=<NllLossBackward0>)


6000it [07:57, 12.57it/s]


Training epoch 2
tensor(2.1756, device='cuda:0', grad_fn=<NllLossBackward0>)


6000it [07:57, 12.55it/s]


Training epoch 3
tensor(1.4535, device='cuda:0', grad_fn=<NllLossBackward0>)


6000it [07:57, 12.55it/s]


Training epoch 4
tensor(1.6191, device='cuda:0', grad_fn=<NllLossBackward0>)


6000it [07:56, 12.60it/s]


Training epoch 5
tensor(0.9957, device='cuda:0', grad_fn=<NllLossBackward0>)


6000it [07:56, 12.60it/s]


Training epoch 6
tensor(1.2402, device='cuda:0', grad_fn=<NllLossBackward0>)


6000it [07:57, 12.57it/s]


Training epoch 7
tensor(1.9769, device='cuda:0', grad_fn=<NllLossBackward0>)


6000it [07:58, 12.54it/s]


Training epoch 8
tensor(0.5035, device='cuda:0', grad_fn=<NllLossBackward0>)


6000it [07:57, 12.57it/s]


Training epoch 9
tensor(1.6774, device='cuda:0', grad_fn=<NllLossBackward0>)


6000it [07:56, 12.58it/s]


Training epoch 10
tensor(2.0683, device='cuda:0', grad_fn=<NllLossBackward0>)


6000it [07:56, 12.60it/s]


Training epoch 11
tensor(1.6363, device='cuda:0', grad_fn=<NllLossBackward0>)


6000it [07:56, 12.59it/s]


Training epoch 12
tensor(1.1855, device='cuda:0', grad_fn=<NllLossBackward0>)


6000it [07:55, 12.61it/s]


Training epoch 13
tensor(1.2706, device='cuda:0', grad_fn=<NllLossBackward0>)


6000it [07:55, 12.61it/s]


Training epoch 14
tensor(0.8589, device='cuda:0', grad_fn=<NllLossBackward0>)


6000it [07:56, 12.60it/s]


Training epoch 15
tensor(1.3278, device='cuda:0', grad_fn=<NllLossBackward0>)


6000it [07:55, 12.63it/s]


Training epoch 16
tensor(1.2216, device='cuda:0', grad_fn=<NllLossBackward0>)


6000it [07:57, 12.57it/s]


Training epoch 17
tensor(1.3418, device='cuda:0', grad_fn=<NllLossBackward0>)


6000it [07:55, 12.61it/s]


Training epoch 18
tensor(0.8684, device='cuda:0', grad_fn=<NllLossBackward0>)


6000it [07:56, 12.60it/s]


Training epoch 19
tensor(1.1940, device='cuda:0', grad_fn=<NllLossBackward0>)


6000it [07:57, 12.57it/s]


In [None]:
# Persist the whole fine-tuned model object to Google Drive.
# NOTE(review): saving model.state_dict() would be more portable, but the loading cell
# below expects the full model object, so both would have to change together.
torch.save(model, f'{DATA_DIR}/03_model_training/fine-tuning/rock-gpt2-fined-tuned-model.pt')

In [None]:
import torch.nn.functional as F

# Load the fine-tuned model back from Google Drive, move it to the CPU and
# switch it to evaluation mode for text generation.
# NOTE(review): torch.load unpickles the file — only load checkpoints you created yourself.
model = torch.load(f'{DATA_DIR}/03_model_training/fine-tuning/rock-gpt2-fined-tuned-model.pt')
model = model.to('cpu')
model.eval()
print("Model Successfully Loaded")

Model Successfully Loaded


In [None]:
# testing GPT fine tune with a given 16 words prompt
prompt = "there's two things that i have yet to learn, how to forget or have i"

# tokenize the prompt and add a leading batch dimension
generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)


# sampling parameters
top_p=0.8                      # nucleus size: keep the smallest token set whose probability mass exceeds top_p
temperature=1.
filter_value = -float("Inf")   # logit assigned to tokens outside the nucleus
max_length = 100               # maximum number of tokens to generate
generated_num = 0
generated_list = []

# token id(s) of the end-of-text marker, computed once instead of on every iteration
eos_ids = tokenizer.encode("<|endoftext|>")

# inference only: no gradients are needed, which saves memory and time
with torch.no_grad():
  for i in tqdm(range(max_length)):

    # run the model and keep the logits of the last position only
    # (no labels are passed, so element 0 of the output is the logits)
    logits = model(generated)[0]
    logits = logits[:, -1, :] / (temperature if temperature > 0 else 1.0)

    # sort the logits and compute the cumulative probability of the sorted tokens
    sorted_logits, sorted_indices = torch.sort(logits, descending=True)
    cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

    # mark the tokens whose cumulative probability exceeds the threshold (0.8 here);
    # shift the mask one position to the right so the first token crossing the
    # threshold is kept, and never remove the most likely token
    sorted_indices_to_remove = cumulative_probs > top_p
    sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
    sorted_indices_to_remove[..., 0] = False

    # apply the filter
    indices_to_remove = sorted_indices[sorted_indices_to_remove]
    logits[:, indices_to_remove] = filter_value

    # sample the next token and append it to the sequence
    next_token = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
    generated = torch.cat((generated, next_token), dim=1)

    # stop as soon as the end-of-text token is sampled
    if next_token.item() in eos_ids:
      generated_num = generated_num + 1
      output_text = tokenizer.decode(generated.squeeze().tolist())
      break
  else:
    # end-of-text was never sampled: decode what we have and close it explicitly
    output_text = f"{tokenizer.decode(generated.squeeze().tolist())}<|endoftext|>"

# exactly one entry per run (previously an early stop appended the text twice,
# the second time with a duplicated end-of-text marker)
generated_list.append(output_text)

100%|██████████| 100/100 [00:44<00:00,  2.23it/s]


In [None]:
# Show the generated text (prompt followed by the sampled continuation).
print(output_text)

there's two things that i have yet to learn, how to forget or have i just feel something going on and what to do next. The first is knowing the laws of physics and knowing that something is wrong. The second is understanding the universe. What is happening is hard to grasp as you understand it. No one knows what is happening and every person is either on edge, confused or frightened of it. So do not trust any of the information you get as you feel you're "coming face to face" with reality, which is not only the reality of things, but also<|endoftext|>
