# Fine-tuning GPT-2 on a jokes dataset in PyTorch

This notebook was created as a part of a blog post - [Fine-tuning large Transformer models on a single GPU in PyTorch - Teaching GPT-2 a sense of humor](https://mf1024.github.io/2019/11/12/Fun-With-GPT-2/). Here I demonstrate how to fine-tune a pre-trained GPT-2 model on a jokes dataset.

Let's see if the model can learn to crack some jokes!

For this experiment, I will use the pre-trained GPT-2 XL model (`gpt2-xl`) from the huggingface [transformers repository](https://github.com/huggingface/transformers).

#### If you haven't yet, check out the notebook in this [gist](https://gist.github.com/mf1024/430d7fd6ff527350d3e4b5bda0d8614e) where I use the same pretrained model to generate text.

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m92.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m34.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m42.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1


In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import numpy as np
from torch.utils.data import Dataset
from torch.utils.data import Dataset, DataLoader
import os
import json
import csv

from google.colab import drive

import logging
import warnings

# Keep the notebook output quiet: only CRITICAL log records are emitted and
# Python warnings are suppressed.
logging.getLogger().setLevel(logging.CRITICAL)
warnings.filterwarnings('ignore')

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Fetch the pretrained 'gpt2-xl' tokenizer and language-model weights, and
# place the model on the selected device in the same statement.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-xl')
model = GPT2LMHeadModel.from_pretrained('gpt2-xl').to(device)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/689 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/6.43G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
def choose_from_top(probs, n=5):
    """Sample one index from the ``n`` most probable entries of ``probs``.

    The ``n`` largest values are renormalised so they sum to one, and a
    single index is then drawn at random from that truncated distribution.

    Args:
        probs: 1-D array-like of non-negative weights, one per candidate.
        n: Number of top candidates to sample from. Values larger than
            ``len(probs)`` are clamped to ``len(probs)``.

    Returns:
        The sampled index into ``probs`` as a plain ``int``.
    """
    probs = np.asarray(probs)
    # np.argpartition raises if asked for more elements than exist, so never
    # request more candidates than there are entries.
    k = min(n, probs.shape[0])
    ind = np.argpartition(probs, -k)[-k:]
    top_prob = probs[ind]
    top_prob = top_prob / np.sum(top_prob)  # Normalize
    choice = np.random.choice(k, 1, p=top_prob)
    return int(ind[choice][0])

### PyTorch Dataset module for Short jokes dataset

For fine-tuning the GPT2 model, I will use this [Short Jokes dataset](https://www.kaggle.com/abhinavmoudgil95/short-jokes) published on Kaggle. After each joke, I add "<|endoftext|>", which is recognized by the GPT2 model as an end-of-text marker. The marker will allow me to concatenate many jokes in a single input sequence.

In [None]:
class JokesDataset(Dataset):
    """Text samples read from the second column of ``data_twq.csv``.

    The file is parsed as ``;``-delimited CSV. The second field of every row
    becomes one sample, with the end-of-text marker appended so that several
    samples can later be concatenated into a single sequence.

    Args:
        jokes_dataset_path: Directory that contains ``data_twq.csv``. The
            default (``''``) resolves to the current working directory.

    Raises:
        OSError: If the CSV file cannot be opened.
    """

    def __init__(self, jokes_dataset_path = ''):
        super().__init__()

        short_jokes_path = os.path.join(jokes_dataset_path, 'data_twq.csv')

        self.joke_list = []
        self.end_of_text_token = "<|endoftext|>"

        # newline="" is what the csv module expects so that line breaks inside
        # quoted fields are handled by the reader itself.
        with open(short_jokes_path, encoding="utf8", newline="") as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=';')
            for row in csv_reader:
                # Blank lines parse as [] and malformed rows may lack the
                # text column; skip them instead of failing on row[1].
                if len(row) < 2:
                    continue
                self.joke_list.append(f"{row[1]}{self.end_of_text_token}")

    def __len__(self):
        """Return the number of loaded samples."""
        return len(self.joke_list)

    def __getitem__(self, item):
        """Return the sample string (text plus end-of-text marker) at ``item``."""
        return self.joke_list[item]


In [None]:
# Build the dataset from data_twq.csv in the current working directory and
# wrap it in a loader that yields one shuffled sample at a time.
dataset = JokesDataset()
joke_loader = DataLoader(dataset, batch_size=1, shuffle=True)

### Hyperparameters

I tested many (more than 5) hyperparameter sets until I found the one that works best. I mostly tuned ***BATCH_SIZE*** (in this case, it's the number of forward-backward passes between each optimization step), ***EPOCHS***, and ***LEARNING_RATE***.

As a starting point for the fine-tuning parameter values, I took inspiration from [this](https://github.com/huggingface/transformers/blob/master/examples/run_squad.py) and [this](https://github.com/huggingface/transformers/blob/master/examples/run_glue.py) huggingface fine-tuning code.

In [None]:
# Training hyperparameters.
BATCH_SIZE = 16        # forward/backward passes accumulated per optimizer step
EPOCHS = 5             # full passes over the dataset
LEARNING_RATE = 1e-3   # peak learning rate handed to the optimizer
WARMUP_STEPS = 5000    # optimizer steps of learning-rate warmup
MAX_SEQ_LEN = 300      # maximum number of tokens in one packed training sequence

# AdamW is taken from torch.optim: the copy that used to live in transformers
# was deprecated in favour of it and is not importable from recent releases.
# NOTE(review): torch's AdamW defaults to weight_decay=0.01 whereas the
# transformers one defaulted to 0.0 - pass weight_decay explicitly where the
# optimizer is built if the old default matters.
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

### Model training

I will train the model and save the model weights after each epoch, and then I will try to generate jokes with each version of the weights to see which performs best.

In [None]:
# Fine-tune `model` on packed sequences of jokes.
#
# Samples arrive one at a time from `joke_loader`. Consecutive samples are
# concatenated until adding the next one would exceed MAX_SEQ_LEN tokens; the
# packed sequence is then used as a single training example. Gradients are
# accumulated over BATCH_SIZE packed sequences before each optimizer step, and
# the weights are saved to `models_folder` at the end of every epoch.
from transformers import get_constant_schedule_with_warmup

model = model.to(device)
model.train()
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
# Warm up linearly, then hold the learning rate constant. A linear-decay
# schedule needs the real number of optimizer steps; built with
# num_training_steps=-1 its multiplier is clamped to 0 as soon as warmup
# ends, which silently stops all further learning.
scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=WARMUP_STEPS)
proc_seq_count = 0   # packed sequences back-propagated since the last optimizer step
sum_loss = 0.0       # loss summed since the last report
batch_count = 0      # optimizer steps since the last report

tmp_jokes_tens = None   # packed sequence currently being filled
models_folder = "trained_models"
os.makedirs(models_folder, exist_ok=True)

for epoch in range(EPOCHS):

    print(f"EPOCH {epoch} started" + '=' * 30)

    for joke in joke_loader:

        #################### "Fit as many joke sequences into MAX_SEQ_LEN sequence as possible" logic start ####
        joke_tens = torch.tensor(tokenizer.encode(joke[0])).unsqueeze(0).to(device)
        # Skip sample from dataset if it is longer than MAX_SEQ_LEN
        if joke_tens.size()[1] > MAX_SEQ_LEN:
            continue

        # The first joke in a new packed sequence
        if tmp_jokes_tens is None:
            tmp_jokes_tens = joke_tens
            continue

        if tmp_jokes_tens.size()[1] + joke_tens.size()[1] <= MAX_SEQ_LEN:
            # The joke still fits: append it and try to add more. The whole
            # encoding is appended - the GPT-2 tokenizer does not prepend a
            # special token by default, so slicing off position 0 would drop
            # the joke's first real token.
            tmp_jokes_tens = torch.cat([tmp_jokes_tens, joke_tens], dim=1)
            continue

        # The next joke does not fit, so train on the packed sequence and
        # keep the current joke as the start of the next one.
        work_jokes_tens = tmp_jokes_tens
        tmp_jokes_tens = joke_tens
        ################## Sequence ready, process it through the model ##################

        outputs = model(work_jokes_tens, labels=work_jokes_tens)
        loss = outputs[0]
        loss.backward()
        # .item() yields a Python float, so the running total does not hold
        # on to device tensors.
        sum_loss += loss.item()

        proc_seq_count += 1
        if proc_seq_count == BATCH_SIZE:
            proc_seq_count = 0
            batch_count += 1
            optimizer.step()
            scheduler.step()
            # The optimizer owns every model parameter, so this clears all
            # accumulated gradients.
            optimizer.zero_grad()

        if batch_count == 10:
            print(f"sum loss {sum_loss}")
            batch_count = 0
            sum_loss = 0.0

    # Store the model after each epoch to compare the performance of them
    torch.save(model.state_dict(), os.path.join(models_folder, f"gpt2_xl_manbot_{epoch}.pt"))


sum loss 602.1497192382812
sum loss 575.7716064453125
sum loss 551.5322265625
sum loss 528.8902587890625
sum loss 510.8201599121094
sum loss 495.2193298339844
sum loss 492.1264343261719
sum loss 485.3371276855469
sum loss 476.4540100097656
sum loss 468.6570129394531
sum loss 464.3804016113281
sum loss 473.2364196777344
sum loss 468.1008605957031
sum loss 460.2870788574219
sum loss 458.3224792480469
sum loss 458.1855163574219
sum loss 452.0108642578125
sum loss 453.250732421875
sum loss 449.69940185546875
sum loss 451.52410888671875
sum loss 448.9682922363281
sum loss 443.6908264160156
sum loss 438.49395751953125
sum loss 448.5375671386719
sum loss 435.82171630859375
sum loss 442.5558166503906
sum loss 437.61407470703125
sum loss 411.0587463378906
sum loss 401.7568664550781
sum loss 398.3537902832031
sum loss 401.9207458496094
sum loss 397.22998046875
sum loss 394.453369140625
sum loss 391.5653076171875
sum loss 400.8759765625
sum loss 403.8072509765625
sum loss 394.9708251953125
sum lo

KeyboardInterrupt: ignored

### Generating the jokes

In [None]:
# Generate text with the checkpoint saved after epoch MODEL_EPOCH and append
# every sample that reaches the end-of-text marker to a text file.
MODEL_EPOCH = 0

models_folder = "trained_models"

model_path = os.path.join(models_folder, f"gpt2_xl_manbot_{MODEL_EPOCH}.pt")
# map_location places the checkpoint tensors on the active device, so weights
# saved on a GPU can also be loaded on a CPU-only runtime.
model.load_state_dict(torch.load(model_path, map_location=device))

jokes_output_file_path = f'generated_content_{MODEL_EPOCH}.txt'

model.eval()
# Start from an empty output file on every run.
if os.path.exists(jokes_output_file_path):
    os.remove(jokes_output_file_path)

# Loop invariants: the encoded prompt and the end-of-text token id(s).
prompt_ids = tokenizer.encode("Tips ")
end_of_text_ids = set(tokenizer.encode('<|endoftext|>'))

joke_num = 0
with torch.no_grad():

    for joke_idx in range(20):

        joke_finished = False

        cur_ids = torch.tensor(prompt_ids).unsqueeze(0).to(device)

        # Generate at most 100 new tokens per sample.
        for i in range(100):
            # Only the logits are needed here; passing labels would make the
            # model compute a loss that is never used.
            logits = model(cur_ids)[0]
            # Take the first (and only) batch entry and the last position
            softmax_logits = torch.softmax(logits[0, -1], dim=0)
            # Sample from a wider candidate pool for the first three tokens.
            n = 20 if i < 3 else 3
            # Randomly (from the top-n probability distribution) select the next token
            next_token_id = choose_from_top(softmax_logits.to('cpu').numpy(), n=n)
            # Add the new token to the running sequence
            next_token = torch.tensor([[next_token_id]], device=device)
            cur_ids = torch.cat([cur_ids, next_token], dim=1)

            if next_token_id in end_of_text_ids:
                joke_finished = True
                break

        # Samples that never produced the end-of-text marker are discarded.
        if joke_finished:

            joke_num = joke_num + 1

            output_list = list(cur_ids.squeeze().to('cpu').numpy())
            output_text = tokenizer.decode(output_list)

            with open(jokes_output_file_path, 'a', encoding='utf-8') as f:
                f.write(f"{output_text} \n\n")



KeyboardInterrupt: ignored

3rd epoch model seemed to perform the best.

The generated jokes output was too long for a notebook, so I stored it in [this file](https://github.com/mf1024/transformers/blob/master/generated_2_jokes.txt).

In [None]:
# Mount Google Drive at /content/drive/ so files can be written to it.
drive.mount('/content/drive/')

# save_pretrained() writes the model and tokenizer files into this directory.
model_path = '/content/drive/MyDrive/gpt2_model'
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)
# Earlier attempts, kept for reference only:
#model_path = '/content/drive/MyDrive/gpt2_large_manbot.h5'
#model.save_pretrained(model_path)

#model.save('/content/drive/MyDrive/my_gpt2_model/gpt2_large_manbot.h5')

Mounted at /content/drive/


('/content/drive/MyDrive/gpt2_model/tokenizer_config.json',
 '/content/drive/MyDrive/gpt2_model/special_tokens_map.json',
 '/content/drive/MyDrive/gpt2_model/vocab.json',
 '/content/drive/MyDrive/gpt2_model/merges.txt',
 '/content/drive/MyDrive/gpt2_model/added_tokens.json')

In [None]:
# `model_path` is the directory populated by save_pretrained(), and torch.save
# needs a file path, so write the state dict to a file inside that directory.
torch.save(model.state_dict(), os.path.join(model_path, 'gpt2_manbot_state_dict.pt'))

RuntimeError: ignored

In [None]:
# PyTorch transformers models have no Keras-style `.save()`; save_pretrained()
# writes the weights and config into a directory that from_pretrained() can
# load again. The tokenizer is stored alongside so the directory is complete.
model.save_pretrained('/content/drive/MyDrive/my_gpt2_model')
tokenizer.save_pretrained('/content/drive/MyDrive/my_gpt2_model')

AttributeError: ignored

NameError: ignored

('/content/drive/MyDrive/my_gpt2_model/tokenizer_config.json',
 '/content/drive/MyDrive/my_gpt2_model/special_tokens_map.json',
 '/content/drive/MyDrive/my_gpt2_model/vocab.json',
 '/content/drive/MyDrive/my_gpt2_model/merges.txt',
 '/content/drive/MyDrive/my_gpt2_model/added_tokens.json')

In [None]:
# Save the raw state dict inside the user's Drive.
# NOTE(review): the mount root /content/drive is presumably not writable, as
# user files live under MyDrive - confirm this was the cause of the
# RuntimeError recorded for the original path.
model_save_name = 'gpt2_large_manbot.pt'
path = f"/content/drive/MyDrive/{model_save_name}"
torch.save(model.state_dict(), path)

RuntimeError: ignored

In [None]:
# Mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Directory on Drive from which the tokenizer and model are loaded.
model_path = '/content/drive/MyDrive/my_gpt2_model'

In [None]:
# Reload the fine-tuned tokenizer and model from `model_path`.
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)
# from_pretrained() returns the model on the CPU; move it to the device that
# the generation code creates its input tensors on.
model = model.to(device)

In [None]:
# Generate text with the currently loaded model and append every sample that
# reaches the end-of-text marker to a text file.
MODEL_EPOCH = 4

jokes_output_file_path = f'generated_content_{MODEL_EPOCH}.txt'

# The input ids are created on `device`, so the model has to live there too.
model = model.to(device)
model.eval()
# Start from an empty output file on every run.
if os.path.exists(jokes_output_file_path):
    os.remove(jokes_output_file_path)

# Loop invariants: the encoded prompt and the end-of-text token id(s).
prompt_ids = tokenizer.encode("Porn ")
end_of_text_ids = set(tokenizer.encode('<|endoftext|>'))

joke_num = 0
with torch.no_grad():

    for joke_idx in range(50):

        joke_finished = False

        cur_ids = torch.tensor(prompt_ids).unsqueeze(0).to(device)

        # Generate at most 100 new tokens per sample.
        for i in range(100):
            # Only the logits are needed here; passing labels would make the
            # model compute a loss that is never used.
            logits = model(cur_ids)[0]
            # Take the first (and only) batch entry and the last position
            softmax_logits = torch.softmax(logits[0, -1], dim=0)
            # Sample from a wider candidate pool for the first three tokens.
            n = 20 if i < 3 else 3
            # Randomly (from the top-n probability distribution) select the next token
            next_token_id = choose_from_top(softmax_logits.to('cpu').numpy(), n=n)
            # Add the new token to the running sequence. The tensor is built
            # on the `device` variable; the quoted string 'device' is not a
            # valid device name and made .to() raise a RuntimeError.
            next_token = torch.tensor([[next_token_id]], device=device)
            cur_ids = torch.cat([cur_ids, next_token], dim=1)

            if next_token_id in end_of_text_ids:
                joke_finished = True
                break

        # Samples that never produced the end-of-text marker are discarded.
        if joke_finished:

            joke_num = joke_num + 1

            output_list = list(cur_ids.squeeze().to('cpu').numpy())
            output_text = tokenizer.decode(output_list)

            with open(jokes_output_file_path, 'a', encoding='utf-8') as f:
                f.write(f"{output_text} \n\n")


RuntimeError: ignored