In [2]:
import json

# collect sentences
with open("dataset_jokes/wocka.json") as fn:
  jokes = json.load(fn)

In [6]:
jokes[0]

{'body': 'What do you call a cow with no legs?\r\n\r\nGround Beef!',
 'category': 'Animal',
 'id': 1,
 'title': 'Cow With No Legs'}

In [13]:
sentences = [] # collect sentences

for i in jokes: # iterate over all recipes
    try:
        title = i['title'] # get the title
        category = i['category'] # get the category
        body = i['body'] # get the body
        sentence = f"{title}, {category}, {body}" # create the sentence as string
        if sentence != '': # if the sentence is not empty
            sentences.append(sentence) # add the sentence to the list
    except KeyError: # if the recipe has no title or ingredients
        continue

# clean sentences
# TODO: add further cleaning steps
def clean(sentence):
    sentence = sentence.replace('\r', ' ')  # replace repetetive words
    sentence = sentence.replace('\n', '')  # replace new line chars
    sentence = sentence.replace('  ', ' ')  # replace repetetive words
    sentence = sentence.strip()  # strip leading and trailing white-spaces
    return sentence

sentences = list(map(clean, sentences))  # map method.
# sentences = [clean(sentence) for sentence in sentences]  # list comprehension method

In [14]:
sentences[0]

'Cow With No Legs, Animal, What do you call a cow with no legs? Ground Beef!'

In [15]:
from sklearn.model_selection import train_test_split

# split into train/dev
# TODO: alternatively, we could use the `datasets.Dataset.train_test_split()` method 
SEED = 10  # set seed var for reproducibility
train_sentences, test_sentences = train_test_split(sentences, 
                                                   test_size=0.1, 
                                                   # change the train_size for rapid testing (for example, use 0.1)
                                                   train_size=0.9,  
                                                   random_state=SEED)

# write into files
for split, sents in zip(['train', 'test'], [train_sentences, test_sentences]):
    with open(f"{split}.txt", 'w') as fn:
        fn.write('\n'.join(sents))

In [17]:
# create the datasets.Dataset object
from datasets import load_dataset 

dataset = load_dataset('text', data_files={'train': 'train.txt', 'test': 'test.txt'}) # load the dataset from the text files

Using custom data configuration default-6a30f5af75f6bfd4
                            

Downloading and preparing dataset text/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /home/deniz/.cache/huggingface/datasets/text/default-6a30f5af75f6bfd4/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5...
Dataset text downloaded and prepared to /home/deniz/.cache/huggingface/datasets/text/default-6a30f5af75f6bfd4/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5. Subsequent calls will reuse this data.




In [18]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 9052
    })
    test: Dataset({
        features: ['text'],
        num_rows: 1002
    })
})

In [19]:
# Instantiate tokenizer
from transformers import AutoTokenizer
pretrained_model = 'gpt2'
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=pretrained_model)

# Define a function to tokenize the dataset and return the text indices. 
# We also add trailing <|endoftext|> special token
def tokenize_sentence(dataset):
    # As we can see, there is no padding since the PAD token is not originally used by GPT-2. 
    # We could perform padding by adding the PAD token to the vocabulary with the method `add_special_tokens()`
    return tokenizer([f"{sentence} {tokenizer.eos_token}" for sentence in dataset['text']])
    # return tokenizer(dataset['text])

# apply to dataset object
dataset_features = dataset.map(tokenize_sentence,
                               batched=True,
                               remove_columns=['text'],
                               desc='Tokenizing train and test splits')

Tokenizing train and test splits:   0%|          | 0/10 [00:00<?, ?ba/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1127 > 1024). Running this sequence through the model will result in indexing errors
Tokenizing train and test splits: 100%|██████████| 10/10 [00:00<00:00, 14.49ba/s]
Tokenizing train and test splits: 100%|██████████| 2/2 [00:00<00:00, 23.41ba/s]


In [20]:
dataset_features

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids'],
        num_rows: 9052
    })
    test: Dataset({
        features: ['attention_mask', 'input_ids'],
        num_rows: 1002
    })
})

In [21]:
# group sentences in batches of equal size (standard GPT-2 approach)
# We use an adaptation of the `group_text` function for that purpose
def group_texts(examples):
    # Concatenate all texts.
    block_size = 512  # set the "blocks" to half of the maximum GPT-2 model length (1024) for memory issues
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i: i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }

    # # Add labels to the dataset_features
    # # Since the task is language modelling, the labels to predict are actually the input indices "shifted"

    # result["labels"] = result["input_ids"].copy()
    return result

# apply the group function to the dataset

dataset_grouped = dataset_features.map(group_texts,
                                       batched=True,
                                       desc='Group sentences in blocks of equal size (512)')


Group sentences in blocks of equal size (512): 100%|██████████| 10/10 [00:03<00:00,  2.60ba/s]
Group sentences in blocks of equal size (512): 100%|██████████| 2/2 [00:00<00:00,  5.22ba/s]


# Check block size 

In [None]:
for i in dataset_grouped['train']['input_ids']:
    if len(i) != 512:
        print(len(i))

In [33]:
for i in dataset_grouped['test']['input_ids']:
    if len(i) != 512:
        print(i)
        print(len(i))

[4834, 15521, 972, 11, 8366, 11, 4874, 612, 373, 257, 2576, 508, 2227, 4025, 17515, 11, 523, 530, 1110, 673, 1816, 284, 766, 607, 6253, 11, 1583, 13, 4176, 13, 220, 1583, 13, 4176, 1297, 607, 284, 6437, 607, 17515, 290, 9585, 262, 1708, 25, 366, 6173, 6684, 3483, 36, 11, 35, 6684, 3483, 36, 11, 43, 6684, 3483, 36, 11, 314, 41300, 26746, 30373, 347, 6684, 3483, 1546, 1911, 1881, 1110, 673, 373, 2491, 2739, 11, 290, 3066, 284, 466, 607, 13565, 319, 262, 1323, 618, 257, 3516, 1625, 510, 284, 607, 290, 1965, 611, 673, 373, 257, 5827, 286, 1583, 13, 4176, 338, 11, 284, 543, 673, 8712, 25, 366, 5297, 11, 703, 750, 345, 760, 43634, 679, 8712, 366, 39, 11860, 15513, 360, 11860, 15513, 37760, 2474, 220, 50256, 38101, 29926, 2611, 1406, 12301, 11, 25455, 337, 2002, 64, 11, 25455, 40084, 523, 3735, 326, 673, 17157, 625, 290, 1392, 5169, 329, 6301, 8469, 13, 220, 50256]
160


In [35]:
# Add "labels" column to the dataset_features. 
# To modify the dataset structure, we use the `dataset.map()` method
def add_labels(dataset):
    # Since the task is language modelling, the labels to predict are actually 
    # the input indices shifted forward by one element (token)
    dataset['labels'] = dataset['input_ids'].copy()
    return dataset

dataset_for_lm = dataset_grouped.map(add_labels,
                                     batched=True,
                                     desc='Add labels to create data for language model training')
 

Add labels to create data for language model training: 100%|██████████| 3/3 [00:00<00:00,  4.11ba/s]
Add labels to create data for language model training: 100%|██████████| 1/1 [00:00<00:00, 14.45ba/s]


In [36]:
# Instantiate the model class
from transformers import (
    AutoConfig, 
    AutoModelForCausalLM, 
    Trainer, 
    TrainingArguments,
    default_data_collator,
)
import torch


# TODO: experiment with different model configuration and batch sizes until 
# the models fits into GPU memory (otherwise it generated CUDA-out-of-memory error)
# The model is instantiated from the pretrained GPT-2 model
# Here, I reduced the number of attention head and layers, 
# to significantly reduce the model size and make sure it fits in the GPU memory
config = AutoConfig.from_pretrained(pretrained_model,
                                    n_head=12,  # reduce the size of the model for memory issues
                                    n_layer=12)

pretrained_model = 'gpt2-recipes'
model = AutoModelForCausalLM.from_pretrained(pretrained_model, 
                                             config=config)

# Again, we simulate a batch size of 8 by setting the `gradient_accumulation_steps` parameters
no_cuda = not bool(torch.cuda.is_available())

if no_cuda:
  print(f"Training on CPUs")
else:
  print(f"Training on GPU")

training_args = TrainingArguments(no_cuda=no_cuda,
                                  per_device_train_batch_size=4,
                                  per_device_eval_batch_size=4,
                                  gradient_accumulation_steps=4, # virtually increment the batch_size
                                  evaluation_strategy='epoch',
                                  save_strategy='epoch',
                                  logging_steps=100,
                                  logging_dir='gpt2-jokes/tb',  # where to store the tensorboard
                                  num_train_epochs=2,
                                  output_dir='gpt2-jokes')

# Start the training!
# Initialize our Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_for_lm['train'],
    eval_dataset=dataset_for_lm['test'], # we use the test set as validation set
    tokenizer=tokenizer,
    # Data collator is used to create batches from data. 
    # When a tokenizer is passed the default to DataCollatorWithPadding is used.
    # So we change it since our model do not use PAD tokens
    data_collator=default_data_collator,
)

Training on GPU


In [37]:
# Use tensorboard to monitor the training
# Load the TensorBoard notebook extension
%reload_ext tensorboard  

 # read data from tensorboard dir
%tensorboard --logdir gpt2-jokes/tb 

In [38]:
# Finally: let's start the training!
train_results = trainer.train()

***** Running training *****
  Num examples = 2802
  Num Epochs = 2
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 4
  Total optimization steps = 350
 20%|██        | 71/350 [01:32<06:09,  1.32s/it]