In [4]:
import math
import re
import pickle
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

import torch
from datasets import load_dataset
from transformers import (
    TrainerCallback,
    GPT2Config,
    GPT2Tokenizer,
    GPT2LMHeadModel,
    AutoConfig,
    AutoTokenizer,
    AutoModelForCausalLM,
    pipeline,
    AdamW,
    TrainingArguments,
    Trainer,
)

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from transformers import PreTrainedTokenizerFast

from transformers import (GPT2Config,
                          GPT2LMHeadModel)

model_name = 'movie-plot-generation-from-scratch'

# Load dataset

First, we load the dataset

In [5]:
# Load dataset from text file called "data.txt". We won't use a validation set
dataset = load_dataset("text", data_files="data_top_15_genres.txt")['train']
dataset

Using custom data configuration default-2fcf8d2135508f85
Reusing dataset text (C:\Users\Anton\.cache\huggingface\datasets\text\default-2fcf8d2135508f85\0.0.0\e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5)


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




Dataset({
    features: ['text'],
    num_rows: 37031
})

## Tokenizer training

We now need to tokenize the dataset. We create a tokenizer and train it on our data.

In [6]:
# Add special tokens for each genre
genres = ['romantic drama', 'short film', 'family film',
          'adventure', 'action/adventure', 'indie',
          'black-and-white', 'horror', 'crime fiction',
          'world cinema', 'action', 'thriller', 
          'romance film', 'comedy', 'drama']

special_tokens = ['<UNK>', '<BOS>', '<EOS>', '<PAD>', '<SEP>']
genre_tokens =  [f'<{genre}>' for genre in genres]
all_special_tokens = special_tokens + genre_tokens

tokenizer = Tokenizer(BPE(unk_token="<UNK>"))
trainer = BpeTrainer(special_tokens=all_special_tokens, vocab_size=50257)
tokenizer.pre_tokenizer = Whitespace()
tokenizer.train_from_iterator(dataset['text'], trainer)

# load our tokenizer into huggingface transformers library 
# For some reason the special tokens are not assigned to the corresponding properties 
# even though tokenization works as intended. We therefore add the special tokens manually.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer, 
    model_input_names=['input_ids', 'attention_mask'])
special_tokens_dict = {'additional_special_tokens': genre_tokens}
tokenizer.add_special_tokens(special_tokens_dict)
tokenizer.unk_token = '<UNK>'
tokenizer.bos_token = '<BOS>'
tokenizer.eos_token = '<EOS>'
tokenizer.pad_token = '<PAD>'
tokenizer.sep_token = '<SEP>'

# Save 
tokenizer.save_pretrained(model_name)

('movie-plot-generation-from-scratch\\tokenizer_config.json',
 'movie-plot-generation-from-scratch\\special_tokens_map.json',
 'movie-plot-generation-from-scratch\\tokenizer.json')

### Define transformer model

In [7]:
# Load a new GPT2 model with 512 max length
config = GPT2Config(
    vocab_size=50257,
    n_positions=512,
    n_ctx=512,
)
model = GPT2LMHeadModel(config=config)

# Load tokenizer 
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_name)
tokenizer

PreTrainedTokenizerFast(name_or_path='movie-plot-generation-from-scratch', vocab_size=50257, model_max_len=1000000000000000019884624838656, is_fast=True, padding_side='right', special_tokens={'bos_token': '<BOS>', 'eos_token': '<EOS>', 'unk_token': '<UNK>', 'sep_token': '<SEP>', 'pad_token': '<PAD>', 'additional_special_tokens': ['<romantic drama>', '<short film>', '<family film>', '<adventure>', '<action/adventure>', '<indie>', '<black-and-white>', '<horror>', '<crime fiction>', '<world cinema>', '<action>', '<thriller>', '<romance film>', '<comedy>', '<drama>']})

**Tokenize the dataset**

We tokenize the dataset. The tokenized examples contain the column names 'attention_mask' which is a mask for padding tokens and 'input_ids' which is the id of each token corrsponding to a word. We drop the text as that is not needed anymore. Also note that we duplicate the inputs to add our labels. This is because the model of the 🤗 Transformers library apply the shifting to the right, so we don't need to do it manually.

In [8]:
def tokenize_function(examples):
    result = tokenizer(examples["text"], max_length=512, padding='max_length', truncation=True)
    result["labels"] = result["input_ids"].copy()
    return result

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

#Make dataset format pytorch tensors
tokenized_dataset.set_format("torch")

# Finally, select a subset if wanted
train_set = tokenized_dataset#.select(list(range(10)))
train_set

HBox(children=(FloatProgress(value=0.0, max=38.0), HTML(value='')))




Dataset({
    features: ['attention_mask', 'input_ids', 'labels'],
    num_rows: 37031
})

### Training
First, setup training args.
The last argument to setup everything so we can push the model to the Hub regularly during training..

Then pass training args to Trainer.

In [9]:
class SaveTokenizer(TrainerCallback):
    """
    A callback used to save the tokenizer whenever a model checkpoint is saved.
    """
    def on_save(self, args, state, control, **kwargs):
        tokenizer.save_pretrained(model_name)

        
ce_loss = torch.nn.CrossEntropyLoss()
        
def compute_metrics(eval_pred):
    """
    The compute function needs to receive a tuple (with logits and labels)
    and has to return a dictionary with string keys (the name of the metric) and float values.
    It will be called at the end of each evaluation phase on the whole arrays of predictions/labels.
    """
    logits, labels = eval_pred
    # Calculate perplexity https://huggingface.co/transformers/perplexity.html
    # "the exponentiation of the cross-entropy between the data and model predictions."
    
    perplexity = math.exp(ce_loss(logits, labels))
    
    return {'perplexity': perplexity}

In [10]:
torch.cuda.empty_cache()
batch_size = 1 # 1:34:39 for one epoch (no evaluation steps) with batch_size = 2

training_args = TrainingArguments(
    model_name,
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=batch_size,
    save_steps=2000,
    save_total_limit=1,
    log_level='info',
    logging_steps=250
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    compute_metrics=compute_metrics,
    callbacks=[SaveTokenizer],
)

In [11]:
train_results=trainer.train()
pickle.dump(train_results, open(model_name+"/train_results.pickle", "wb")) #Load: train_results = pickle.load(open("train_results.pickle", "rb"))

model.save_pretrained(model_name)
tokenizer.save_pretrained(model_name)

***** Running training *****
  Num examples = 37031
  Num Epochs = 1
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 18516


Step,Training Loss
250,4.3881
500,3.5488
750,3.7126
1000,3.7017
1250,3.5893
1500,3.5033
1750,3.2668
2000,3.3303
2250,3.3288
2500,3.5792


Saving model checkpoint to movie-plot-generation-from-scratch\checkpoint-2000
Configuration saved in movie-plot-generation-from-scratch\checkpoint-2000\config.json
Model weights saved in movie-plot-generation-from-scratch\checkpoint-2000\pytorch_model.bin
tokenizer config file saved in movie-plot-generation-from-scratch\tokenizer_config.json
Special tokens file saved in movie-plot-generation-from-scratch\special_tokens_map.json
Saving model checkpoint to movie-plot-generation-from-scratch\checkpoint-4000
Configuration saved in movie-plot-generation-from-scratch\checkpoint-4000\config.json
Model weights saved in movie-plot-generation-from-scratch\checkpoint-4000\pytorch_model.bin
Deleting older checkpoint [movie-plot-generation-from-scratch\checkpoint-2000] due to args.save_total_limit
tokenizer config file saved in movie-plot-generation-from-scratch\tokenizer_config.json
Special tokens file saved in movie-plot-generation-from-scratch\special_tokens_map.json
Saving model checkpoint to m

('movie-plot-generation-from-scratch\\tokenizer_config.json',
 'movie-plot-generation-from-scratch\\special_tokens_map.json',
 'movie-plot-generation-from-scratch\\tokenizer.json')

In [None]:
# Inference test
generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0)
stories = generator("<BOS> <drama> Expecting the unexpected <SEP>", max_length=512, num_return_sequences=1)
print(*[story['generated_text'] + "\n\n\n------------------------\n" for story in stories])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Now that the data has been cleaned, we're ready to instantiate our Trainer.