In [1]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from datasets import Dataset
import torch
from pickle import load
from collections import defaultdict

In [2]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

In [3]:
def create_dataset(data):
    data_dict = {"input_ids":[]}
    for k, v in data.items():
        input_ids = tokenizer.encode(v[2])
        for i in range(0,len(input_ids),1024):
            data_dict["input_ids"].append(input_ids[i:i+512])
    return Dataset.from_dict(data_dict)

In [6]:
data = load(open("../Data/SouthPark_Data_train.pkl", "rb"))
data = create_dataset(data)

In [5]:
from transformers import DataCollatorForLanguageModeling
data_colator = DataCollatorForLanguageModeling(tokenizer=tokenizer,mlm=False,return_tensors='pt')
from transformers import Trainer,TrainingArguments


model = GPT2LMHeadModel.from_pretrained('gpt2',eos_token_id=tokenizer.eos_token_id,bos_token_id=tokenizer.bos_token_id)

args = TrainingArguments(
    output_dir='./outputs/',
    num_train_epochs=1,
    weight_decay=0.1,
    learning_rate=5e-4,
    lr_scheduler_type='cosine',
    per_device_train_batch_size=2,
    gradient_accumulation_steps=1,
    fp16=True
)

trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_colator,
    tokenizer=tokenizer,
    train_dataset=data,
)

Using amp half precision backend


In [6]:
trainer.train()

***** Running training *****
  Num examples = 2581
  Num Epochs = 1
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 1291
 39%|███▊      | 500/1291 [02:52<04:30,  2.92it/s]Saving model checkpoint to ./outputs/checkpoint-500
Configuration saved in ./outputs/checkpoint-500\config.json


{'loss': 3.3459, 'learning_rate': 0.0003378192953241824, 'epoch': 0.39}


Model weights saved in ./outputs/checkpoint-500\pytorch_model.bin
tokenizer config file saved in ./outputs/checkpoint-500\tokenizer_config.json
Special tokens file saved in ./outputs/checkpoint-500\special_tokens_map.json
 77%|███████▋  | 1000/1291 [05:47<01:38,  2.96it/s]Saving model checkpoint to ./outputs/checkpoint-1000
Configuration saved in ./outputs/checkpoint-1000\config.json


{'loss': 3.102, 'learning_rate': 6.089972077092024e-05, 'epoch': 0.77}


Model weights saved in ./outputs/checkpoint-1000\pytorch_model.bin
tokenizer config file saved in ./outputs/checkpoint-1000\tokenizer_config.json
Special tokens file saved in ./outputs/checkpoint-1000\special_tokens_map.json
100%|██████████| 1291/1291 [07:32<00:00,  3.31it/s]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 1291/1291 [07:32<00:00,  2.86it/s]

{'train_runtime': 452.0384, 'train_samples_per_second': 5.71, 'train_steps_per_second': 2.856, 'train_loss': 3.166011606418837, 'epoch': 1.0}





TrainOutput(global_step=1291, training_loss=3.166011606418837, metrics={'train_runtime': 452.0384, 'train_samples_per_second': 5.71, 'train_steps_per_second': 2.856, 'train_loss': 3.166011606418837, 'epoch': 1.0})

In [7]:
from transformers import pipeline
pipe = pipeline(task="text-generation", model=model, tokenizer="gpt2")

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at C:\Users\Kupus/.cache\huggingface\transformers\fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_

In [10]:
output = pipe("", max_length=1024)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [13]:
print(output[0]["generated_text"])

A lot of people in the audience are excited to try and take me back to normal!
Chef: Well, let's try to do something with you, Stan.
Stan: Dude, hehehehahhhheh! [turns around and goes down the stairs to the ground]
Scene Description: The White Houses. Stan and Kyle head to Kyle's house. He runs into the basement, walks around, and finds a stack of paper in the ground. Kyle and the boys step out of it. A voice comes on.
Kyle: Hi. Whoa!
Whistlin' Voice: Yes, Stanley. [a man is walking on a sofa]
Stan: Stanley, come over here. [the man leaves the sofa]
Kyle: Jesus! Stan has lost hope in your life.
Stan: [wearing a purple hat] Oh! What happened?!
Wendy: He had an accident! [Stan and Kyle look at each other] The man who's the most afraid and afraid of the earth! [tears up]
Kyle: What the hell is that?!
Stan: That's because the father says things are gonna be so bad, and he's gonna drive his son out of town.
Kyle: [wearing a purple cap] He was driving the whole idea!
Stan: Aaaughh--
Kyle: [t