In [None]:
# Inspiration taken from
# https://towardsdatascience.com/how-to-fine-tune-gpt-2-for-text-generation-ae2ea53bc272
# https://towardsdatascience.com/foundations-of-nlp-explained-bleu-score-and-wer-metrics-1a5ba06d812b
# https://towardsdatascience.com/hyperparameter-tuning-of-neural-networks-with-optuna-and-pytorch-22e179efc837

In [None]:
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import numpy as np
import random
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import torch.nn.functional as F
import csv
import os
import pickle
from sklearn.model_selection import train_test_split
import optuna
import matplotlib.pyplot as plt
from numpy import arange
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

In [None]:
def plot_bleu_scores(gpt2_scores, rnn_scores, prompt_lengths):
    """Plot BLEU score against prompt length for the GPT-2 and RNN models.

    Args:
        gpt2_scores: BLEU scores of the GPT-2 model, one per prompt length.
        rnn_scores: BLEU scores of the RNN model, one per prompt length.
        prompt_lengths: Prompt lengths used as x-values; must have the same
            length as both score sequences.
    """
    # The original body referenced the undefined names `gpt2` / `rnn`
    # instead of the parameters, so every call raised NameError.
    plt.plot(prompt_lengths, gpt2_scores, label='GPT-2')
    plt.plot(prompt_lengths, rnn_scores, label='RNN')

    plt.title('BLEU scores')
    plt.xlabel('Prompt length')
    plt.ylabel('Score')

    # Tick exactly at the evaluated prompt lengths rather than a fixed 2..6
    # range, so the axis stays correct when other lengths are passed in.
    plt.xticks(prompt_lengths)

    plt.legend(loc='best')
    plt.show()

In [None]:
def get_large_dataset(path="preprocessing/preprocessedLines"):
    """Load the pickled, pre-processed lyric lines into a DataFrame.

    The pickle is expected to hold an iterable of strings. The marker
    strings ``<|eos|>``, ``<|sos|>`` and ``<|pad|>`` are removed from every
    line.

    Args:
        path: Location of the pickle file. Defaults to the path the notebook
            has always used.

    Returns:
        pandas.DataFrame with a single ``"Lyrics"`` column, one row per line.
    """
    # NOTE(security): pickle.load can execute arbitrary code. Only point this
    # at files produced by this project's own preprocessing step.
    # `with` guarantees the handle is closed even if unpickling raises.
    with open(path, 'rb') as file:
        augmented_lines = pickle.load(file)

    # Strip sos, eos and pad markers
    markers = ("<|eos|>", "<|sos|>", "<|pad|>")
    stripped_lines = []
    for line in augmented_lines:
        for marker in markers:
            line = line.replace(marker, "")
        stripped_lines.append(line)

    return pd.DataFrame(stripped_lines, columns=["Lyrics"])

In [None]:
def get_small_dataset(txt_file="data/kanye_verses.txt"):
    """Load the small lyrics text file into a DataFrame.

    The file is parsed with ``pandas.read_fwf`` using its default settings,
    so the first line of the file becomes the column header.

    Args:
        txt_file: Path of the text file. Defaults to the path the notebook
            has always used.

    Returns:
        pandas.DataFrame as produced by ``pandas.read_fwf``.
    """
    return pd.read_fwf(txt_file)

In [None]:
def split_dataset(dataset, train_ratio = 0.8, val_ratio = 0.1, test_ratio=0.1):
    """Split ``dataset`` into train / validation / test parts without shuffling.

    The test part is carved off first using ``test_ratio``; the remainder is
    then divided between train and validation.

    Args:
        dataset: Indexable collection accepted by ``train_test_split``.
        train_ratio: Share of the full data intended for training.
        val_ratio: Share of the full data intended for validation.
        test_ratio: Share of the full data intended for testing.

    Returns:
        Tuple ``(train, val, test)``.
    """
    # Validation share expressed relative to what is left after the test cut.
    val_share_of_rest = val_ratio/(val_ratio+train_ratio)

    train_and_val, test_part = train_test_split(
        dataset, test_size=test_ratio, shuffle=False
    )
    train_part, val_part = train_test_split(
        train_and_val, test_size=val_share_of_rest, shuffle=False
    )

    return train_part, val_part, test_part

In [None]:
# Tokenize lyrics
class SongLyrics(Dataset):
    """Torch dataset of tokenized lyric lines, padded to a common length.

    Each row of ``kanye_lyrics["Lyrics"]`` is cut to its first ``max_length``
    *characters*, suffixed with ``<|endoftext|>`` and tokenized. If the result
    has fewer than ``max_length`` tokens, the encoded ``<|endoftext|>`` marker
    is appended once per missing position. Samples that still exceed
    ``max_length`` tokens are dropped.

    Args:
        kanye_lyrics: Mapping / DataFrame with a ``"Lyrics"`` column of strings.
        truncate: If true, keep only the first 20000 samples.
        gpt2_type: Name passed to ``GPT2Tokenizer.from_pretrained``.
        max_length: Character cut-off per row and target token length.
    """

    def __init__(self, kanye_lyrics, truncate=False, gpt2_type="gpt2", max_length=1024):

        self.tokenizer = GPT2Tokenizer.from_pretrained(gpt2_type) # Get GPT2 tokenizer
        self.lyrics = []

        # Loop-invariant: tokenize the padding marker once instead of once per
        # padded position (the original re-encoded it and called torch.cat for
        # every single missing token).
        pad_unit = torch.tensor(self.tokenizer.encode("<|endoftext|>"))

        for row in kanye_lyrics["Lyrics"]:

            sample = torch.tensor(self.tokenizer.encode(f"{row[:max_length]}<|endoftext|>"))
            padding_length = max_length - len(sample)
            if padding_length > 0:
                # One concatenation; same result as appending pad_unit
                # `padding_length` times.
                sample = torch.cat([sample, pad_unit.repeat(padding_length)])

            if sample.size()[0] <= max_length:
                self.lyrics.append(sample) # Keep only samples that fit max length

        if truncate:
            self.lyrics = self.lyrics[:20000]
        self.lyrics_count = len(self.lyrics)

    def __len__(self):
        """Return the number of stored samples."""
        return self.lyrics_count

    def __getitem__(self, item):
        """Return the token tensor stored at index ``item``."""
        return self.lyrics[item]

In [None]:
# Getting data, splitting it and getting tokenizer and model.
# Swap the two lines below to train on the small text-file dataset instead.
#kanye_lyrics = get_small_dataset()
kanye_lyrics = get_large_dataset()

# max_length=50 is used by SongLyrics both as the per-row character cut-off
# and as the token length every sample is padded up to.
dataset = SongLyrics(kanye_lyrics, truncate=False, gpt2_type="gpt2", max_length=50)

# Default split ratios are 0.8 / 0.1 / 0.1 with shuffle=False, so the three
# parts are contiguous slices of the dataset in its original order.
X_train, X_val, X_test = split_dataset(dataset)

# Notebook-global tokenizer and base model; later cells read these names directly.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

In [None]:
def plot_training_session(train_loss, val_loss):
    """Draw training and validation loss curves over epochs.

    Args:
        train_loss: Dict mapping epoch index to average training loss.
        val_loss: Dict mapping epoch index to average validation loss.
    """
    num_epochs = len(train_loss)
    # Both value views are taken before any drawing starts.
    curves = (
        (train_loss.values(), 'Training Loss'),
        (val_loss.values(), 'Validation Loss'),
    )

    # Epochs are shown 1-based on the x-axis.
    epochs = range(1, num_epochs+1)

    for losses, curve_label in curves:
        plt.plot(epochs, losses, label=curve_label)

    plt.title('Training and Validation Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')

    # A tick on every second epoch, starting at 0.
    plt.xticks(arange(0, num_epochs+1, 2))

    plt.legend(loc='best')
    plt.show()

In [None]:
def finetune(
    train_dataloader, val_dataloader, model, tokenizer,
    epochs=5, lr=2e-5, weight_decay=0.05,
    max_seq_len=200, warmup_steps=50, print_report=True
):
    """Fine-tune a causal language model with early stopping on validation loss.

    Every batch is fed as both input and labels (the model shifts the labels
    itself). After each epoch the average validation loss is computed;
    training stops as soon as it is worse than the best value seen so far.

    Args:
        train_dataloader: Sized iterable of token-id batches for training.
        val_dataloader: Sized iterable of token-id batches for validation.
        model: Model whose call ``model(batch, labels=batch)`` returns the
            loss as element 0.
        tokenizer: Unused; kept so existing callers keep working.
        epochs: Maximum number of epochs.
        lr: Peak learning rate for AdamW.
        weight_decay: AdamW weight decay.
        max_seq_len: Unused; kept so existing callers keep working.
        warmup_steps: Number of linear warm-up steps of the LR schedule.
        print_report: Print per-epoch losses and plot the curves at the end.

    Returns:
        Tuple ``(model, best_val_loss)``. The model is returned in its state
        after the last executed epoch (not rolled back to the best epoch).

    Raises:
        ValueError: If either dataloader yields no batches.
    """
    steps_per_epoch = len(train_dataloader)
    val_steps = len(val_dataloader)
    if steps_per_epoch == 0 or val_steps == 0:
        # Fail before training instead of with ZeroDivisionError after an epoch.
        raise ValueError(
            "train_dataloader and val_dataloader must each yield at least one "
            "batch (check batch_size / drop_last against the dataset size)"
        )

    # Prefer the GPU but fall back to CPU so the function also runs without CUDA.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.train()

    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay) # weight decay default is 0.01

    # The schedule needs the real number of optimizer steps. With the former
    # value of -1 the LR multiplier was clamped to 0 for every step after
    # warm-up, i.e. the model stopped learning after `warmup_steps` steps.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=epochs * steps_per_epoch,
    )

    best_val_loss = np.inf
    train_loss_list = {}
    val_loss_list = {}

    for epoch in range(epochs):
        train_loss = 0
        val_loss = 0

        for entry in tqdm(train_dataloader): # progress bar
            entry = entry.to(device)
            outputs = model(entry, labels=entry) # labels by shifting

            loss = outputs[0]
            train_loss += loss.item()

            loss.backward()

            optimizer.step()
            scheduler.step()
            model.zero_grad()

        # Validation: eval mode disables dropout, no_grad avoids building the
        # autograd graph (less memory, faster). Switch back to train afterwards.
        model.eval()
        with torch.no_grad():
            for entry in val_dataloader:
                entry = entry.to(device)
                outputs = model(entry, labels=entry)

                loss = outputs[0]
                val_loss += loss.item()
        model.train()

        val_loss_avg = val_loss/val_steps
        train_loss_avg = train_loss/steps_per_epoch

        train_loss_list[epoch] = train_loss_avg
        val_loss_list[epoch] = val_loss_avg

        if print_report:
            print("--------------------------------------------")
            print("Epoch " + str(epoch + 1))
            print("Avg. train loss: " + str(train_loss_avg))
            print("Avg. validation loss: " + str(val_loss_avg))
            print("--------------------------------------------")

        # Early stopping with zero patience.
        if val_loss_avg > best_val_loss:
            print("Stopping due to worse validation loss")
            print("Best validation loss: " + str(best_val_loss))
            break
        else:
            best_val_loss = val_loss_avg

    if print_report:
        plot_training_session(train_loss_list, val_loss_list)

    return model, best_val_loss # Returning best val loss for the HPO

In [None]:
# Generate outputs
def generate(
    model,
    tokenizer,
    prompt,
    entry_length=200
):
    """Greedily continue ``prompt`` with up to ``entry_length`` new tokens.

    At every step the most probable next token is appended. Generation stops
    early once a token produced by encoding ``<|endoftext|>`` is emitted; that
    token is kept in the output.

    Args:
        model: Causal LM; ``model(ids)[0]`` must be logits with one row per
            input position (the ids are passed as a 1-D tensor).
        tokenizer: Object providing ``encode(str)`` and ``decode(list[int])``.
        prompt: Text to continue; must encode to at least one token.
        entry_length: Maximum number of tokens to generate.

    Returns:
        The decoded text of prompt plus generated tokens.

    Raises:
        ValueError: If ``prompt`` encodes to no tokens.
    """
    model.eval()

    prompt_ids = tokenizer.encode(prompt)
    if len(prompt_ids) == 0:
        raise ValueError("prompt must encode to at least one token")

    # Build the input on the model's own device so callers are not forced to
    # move the model to the CPU first.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    # Loop-invariant: encode the stop marker once.
    eos_ids = set(tokenizer.encode("<|endoftext|>"))

    generated = torch.tensor(prompt_ids, dtype=torch.long, device=device)

    with torch.no_grad():
        for _ in range(entry_length):
            # No labels: the loss is not needed for generation.
            logits = model(generated)[0]

            # Greedy choice. The argmax position over the vocabulary axis *is*
            # the token id; looking it up in list(tokenizer.encoder.values())
            # is only correct if that dict happens to be ordered by id.
            # (For randomized generation, sample among the top-k logits here.)
            next_id = int(torch.argmax(logits[-1, :]))

            generated = torch.cat((generated, generated.new_tensor([next_id])), dim=0)

            if next_id in eos_ids:
                break

    # tolist() on the 1-D tensor also works when it holds a single token,
    # where squeeze() + list() would fail on a 0-d array.
    return tokenizer.decode(generated.tolist())

In [None]:
# Objective for Optuna hyperparameter optimization
def objective(trial):
    """Optuna objective: fine-tune a fresh GPT-2 and return its validation loss.

    Reads the notebook globals ``X_train``, ``X_val`` and ``tokenizer``.

    Args:
        trial: Optuna trial used to sample learning rate, weight decay,
            number of epochs and batch size.

    Returns:
        The best validation loss reported by ``finetune`` (to be minimized).
    """
    params = {
        # The range spans four orders of magnitude; sample it on a log scale,
        # otherwise almost every draw lands in the top decade (>= 1e-2).
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 1e-1, log=True),
        "weight_decay": trial.suggest_float("weight_decay", 0.05, 0.20),
        "epochs": trial.suggest_int("epochs", 5, 20),
        "batch_size": trial.suggest_int("batch_size", 4, 32)
        }

    train_dataloader = DataLoader(X_train, batch_size=params["batch_size"], shuffle=True, drop_last=True)
    # No shuffling for validation: together with drop_last it would score a
    # different subset each epoch and make the early-stopping comparison noisy.
    val_dataloader = DataLoader(X_val, batch_size=params["batch_size"], shuffle=False, drop_last=True)

    # Start every trial from the pretrained weights.
    model = GPT2LMHeadModel.from_pretrained('gpt2')

    model, val_loss = finetune(train_dataloader, val_dataloader, model, tokenizer,
                               epochs=params["epochs"],
                               lr=params["learning_rate"],
                               weight_decay=params["weight_decay"],
                               print_report=False
                              )

    return val_loss


#study = optuna.create_study(direction="minimize", sampler=optuna.samplers.TPESampler()) # Bayesian sampler
#study.optimize(objective, n_trials=10)

#file = open("large_model", 'wb')
#pickle.dump(study, file)
#file.close()

In [None]:
# Calculating perplexity, which we decided not to include in report
def perplexity(model, dataloader):
    """Print and return the mean per-batch perplexity of ``model``.

    For every batch the loss of ``model(batch, labels=batch)`` is
    exponentiated; the result is the arithmetic mean of those per-batch
    values (not the exponential of the mean loss).

    Args:
        model: Model whose call returns the loss as element 0.
        dataloader: Iterable of token-id batches.

    Returns:
        The average perplexity as a float.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    # Evaluate on the device the model already lives on instead of assuming
    # CUDA (the model itself is never moved by this function).
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    ppl_acc = 0.0
    num_batches = 0

    # Eval mode (no dropout) and no autograd graph; restore the caller's mode.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for entry in dataloader:
                entry = entry.to(device)
                outputs = model(entry, labels=entry)

                loss = outputs[0]

                ppl_acc += torch.exp(loss).item()
                num_batches += 1
    finally:
        model.train(was_training)

    if num_batches == 0:
        raise ValueError("dataloader yielded no batches")

    average_test_ppl = ppl_acc/num_batches

    print({ 'Average ppl': average_test_ppl })
    return average_test_ppl

In [None]:
# Hyperparameters for the small dataset (disabled).
# NOTE(review): presumably the best values found by the Optuna study — confirm.
#filename = "small_model"
#epochs = 15
#batch_size = 26
#weight_decay = 0.1832593146816496
#learning_rate =  0.001015043804472181

# Hyperparameters for the large dataset (active).
filename = "large_model"
epochs = 6
batch_size = 31
weight_decay = 0.14375047857667853
learning_rate =  0.008253568893666966

# All three loaders shuffle and use drop_last=True, so a trailing partial
# batch is discarded in every pass.
train_dataloader = DataLoader(X_train, batch_size=batch_size, shuffle=True, drop_last=True)
val_dataloader = DataLoader(X_val, batch_size=batch_size, shuffle=True, drop_last=True)
test_dataloader = DataLoader(X_test, batch_size=batch_size, shuffle=True, drop_last=True)

# finetune has no batch_size argument; the batch size is fixed by the
# DataLoaders above. `model` is rebound to the fine-tuned model.
model, val_loss = finetune(train_dataloader, val_dataloader, model, tokenizer, epochs=epochs, weight_decay=weight_decay, lr=learning_rate)

# Save model (disabled).
# NOTE(review): as written this would write to a file literally named
# "filename"; the `filename` variable defined above is probably what was
# meant — confirm before re-enabling.
#file = open("filename", 'wb')
#pickle.dump(model.state_dict(), file)
#file.close()

In [None]:
def bleu_score(model, dataloader, prompt_lengths=(2, 3, 4, 5, 6)):
    """Compute one corpus BLEU score per prompt length and return them.

    Every sample of ``dataloader`` is decoded with the notebook-global
    ``tokenizer`` and split into words. For each prompt length the first
    ``prompt_length`` words are used as the prompt for ``generate``; the
    generated continuation is compared against the rest of the reference.
    Pairs where either side has three or fewer words are skipped.

    Args:
        model: Model handed to ``generate``; it is moved to the CPU.
        dataloader: Iterable of batches of token-id sequences.
        prompt_lengths: Prompt lengths (in words) to evaluate.

    Returns:
        List of BLEU scores, one per entry of ``prompt_lengths`` (each score
        is also printed).
    """
    # Move the model once instead of once per generated sentence.
    cpu_model = model.to('cpu')

    # Strip data of non-text
    ref_sentences = []
    for batch in dataloader:
        for batch_entry in batch:
            ref_sentences.append(tokenizer.decode(batch_entry)
                                 .replace("<|endoftext|>", "")
                                 .split())

    scores = []
    for prompt_length in prompt_lengths:
        ref_sentences_without_prompt = []
        texts_without_prompt = []

        for ref_sentence in tqdm(ref_sentences):
            ref_sentence_wo_prompt = ref_sentence[prompt_length:]
            if len(ref_sentence_wo_prompt) <= 3: # Because BLEU uses 4-gram, prediction must be long enough
                continue

            prompt = " ".join(ref_sentence[:prompt_length])

            # The generation budget is the reference's length in words.
            text = generate(cpu_model, tokenizer, prompt, entry_length=len(ref_sentence))
            text = text.replace("<|endoftext|>", "")

            text_wo_prompt = text.split()[prompt_length:]

            if len(text_wo_prompt) > 3: # Because BLEU uses 4-gram
                texts_without_prompt.append(" ".join(text_wo_prompt))
                ref_sentences_without_prompt.append(" ".join(ref_sentence_wo_prompt))

        score = bleu(texts_without_prompt, ref_sentences_without_prompt)
        print(score)
        # Collect the scores so callers can plot them; previously they were
        # only printed and the function returned None.
        scores.append(score)

    return scores

                
def bleu(generated, target):
    """Corpus-level BLEU of generated sentences against their targets.

    Args:
        generated: Iterable of generated sentences (whitespace-separated words).
        target: Iterable of reference sentences, aligned with ``generated``.

    Returns:
        The value returned by ``corpus_bleu``.
    """
    hypotheses = [sentence.split() for sentence in generated]
    # corpus_bleu takes a *list* of references per hypothesis; here each
    # hypothesis has exactly one.
    references = [[sentence.split()] for sentence in target]
    return corpus_bleu(references, hypotheses)

In [None]:
prompt = "All i want is"
entry_length = 20

# Load trained model
# NOTE(security): pickle.load can execute arbitrary code; only load a
# "large_model" file that was written by this notebook.
# `with` closes the handle even if unpickling raises.
with open("large_model", 'rb') as file:
    best_model_state = pickle.load(file)

# The pickle holds a state dict, so start from the pretrained architecture
# and overwrite its weights.
best_model = GPT2LMHeadModel.from_pretrained('gpt2')
best_model.load_state_dict(best_model_state)

#bleu_score(best_model, test_dataloader)

# --------- Trained model ---------
generated_text = generate(best_model.to('cpu'), tokenizer, prompt, entry_length=entry_length)
print("Finetuned model:")
print(generated_text)

# --------- Non-trained model ---------
# Same prompt through the untouched pretrained model, for comparison.
model_plain = GPT2LMHeadModel.from_pretrained('gpt2')

generated_text = generate(model_plain.to('cpu'), tokenizer, prompt, entry_length=entry_length)
print("Not finetuned model:")
print(generated_text)