In [2]:
import json

# collect sentences
with open("dataset_jokes/wocka.json") as fn:
  jokes = json.load(fn)

In [6]:
jokes[0]

{'body': 'What do you call a cow with no legs?\r\n\r\nGround Beef!',
 'category': 'Animal',
 'id': 1,
 'title': 'Cow With No Legs'}

In [13]:
sentences = [] # collect sentences

for i in jokes: # iterate over all recipes
    try:
        title = i['title'] # get the title
        category = i['category'] # get the category
        body = i['body'] # get the body
        sentence = f"{title}, {category}, {body}" # create the sentence as string
        if sentence != '': # if the sentence is not empty
            sentences.append(sentence) # add the sentence to the list
    except KeyError: # if the recipe has no title or ingredients
        continue

# clean sentences
# TODO: add further cleaning steps
def clean(sentence):
    sentence = sentence.replace('\r', ' ')  # replace repetetive words
    sentence = sentence.replace('\n', '')  # replace new line chars
    sentence = sentence.replace('  ', ' ')  # replace repetetive words
    sentence = sentence.strip()  # strip leading and trailing white-spaces
    return sentence

sentences = list(map(clean, sentences))  # map method.
# sentences = [clean(sentence) for sentence in sentences]  # list comprehension method

In [14]:
sentences[0]

'Cow With No Legs, Animal, What do you call a cow with no legs? Ground Beef!'

In [15]:
from sklearn.model_selection import train_test_split

# split into train/dev
# TODO: alternatively, we could use the `datasets.Dataset.train_test_split()` method 
SEED = 10  # set seed var for reproducibility
train_sentences, test_sentences = train_test_split(sentences, 
                                                   test_size=0.1, 
                                                   # change the train_size for rapid testing (for example, use 0.1)
                                                   train_size=0.9,  
                                                   random_state=SEED)

# write into files
for split, sents in zip(['train', 'test'], [train_sentences, test_sentences]):
    with open(f"{split}.txt", 'w') as fn:
        fn.write('\n'.join(sents))

In [17]:
# create the datasets.Dataset object
from datasets import load_dataset 

dataset = load_dataset('text', data_files={'train': 'train.txt', 'test': 'test.txt'}) # load the dataset from the text files

Using custom data configuration default-6a30f5af75f6bfd4
                            

Downloading and preparing dataset text/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /home/deniz/.cache/huggingface/datasets/text/default-6a30f5af75f6bfd4/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5...
Dataset text downloaded and prepared to /home/deniz/.cache/huggingface/datasets/text/default-6a30f5af75f6bfd4/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5. Subsequent calls will reuse this data.




In [18]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 9052
    })
    test: Dataset({
        features: ['text'],
        num_rows: 1002
    })
})

In [19]:
# Instantiate tokenizer
from transformers import AutoTokenizer
pretrained_model = 'gpt2'
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=pretrained_model)

# Define a function to tokenize the dataset and return the text indices. 
# We also add trailing <|endoftext|> special token
def tokenize_sentence(dataset):
    # As we can see, there is no padding since the PAD token is not originally used by GPT-2. 
    # We could perform padding by adding the PAD token to the vocabulary with the method `add_special_tokens()`
    return tokenizer([f"{sentence} {tokenizer.eos_token}" for sentence in dataset['text']])
    # return tokenizer(dataset['text])

# apply to dataset object
dataset_features = dataset.map(tokenize_sentence,
                               batched=True,
                               remove_columns=['text'],
                               desc='Tokenizing train and test splits')

Tokenizing train and test splits:   0%|          | 0/10 [00:00<?, ?ba/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1127 > 1024). Running this sequence through the model will result in indexing errors
Tokenizing train and test splits: 100%|██████████| 10/10 [00:00<00:00, 14.49ba/s]
Tokenizing train and test splits: 100%|██████████| 2/2 [00:00<00:00, 23.41ba/s]


In [20]:
dataset_features

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids'],
        num_rows: 9052
    })
    test: Dataset({
        features: ['attention_mask', 'input_ids'],
        num_rows: 1002
    })
})

In [21]:
# group sentences in batches of equal size (standard GPT-2 approach)
# We use an adaptation of the `group_text` function for that purpose
def group_texts(examples):
    # Concatenate all texts.
    block_size = 512  # set the "blocks" to half of the maximum GPT-2 model length (1024) for memory issues
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i: i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }

    # # Add labels to the dataset_features
    # # Since the task is language modelling, the labels to predict are actually the input indices "shifted"

    # result["labels"] = result["input_ids"].copy()
    return result

# apply the group function to the dataset

dataset_grouped = dataset_features.map(group_texts,
                                       batched=True,
                                       desc='Group sentences in blocks of equal size (512)')


Group sentences in blocks of equal size (512): 100%|██████████| 10/10 [00:03<00:00,  2.60ba/s]
Group sentences in blocks of equal size (512): 100%|██████████| 2/2 [00:00<00:00,  5.22ba/s]


# Check block size 

In [None]:
for i in dataset_grouped['train']['input_ids']:
    if len(i) != 512:
        print(len(i))

In [33]:
for i in dataset_grouped['test']['input_ids']:
    if len(i) != 512:
        print(i)
        print(len(i))

[4834, 15521, 972, 11, 8366, 11, 4874, 612, 373, 257, 2576, 508, 2227, 4025, 17515, 11, 523, 530, 1110, 673, 1816, 284, 766, 607, 6253, 11, 1583, 13, 4176, 13, 220, 1583, 13, 4176, 1297, 607, 284, 6437, 607, 17515, 290, 9585, 262, 1708, 25, 366, 6173, 6684, 3483, 36, 11, 35, 6684, 3483, 36, 11, 43, 6684, 3483, 36, 11, 314, 41300, 26746, 30373, 347, 6684, 3483, 1546, 1911, 1881, 1110, 673, 373, 2491, 2739, 11, 290, 3066, 284, 466, 607, 13565, 319, 262, 1323, 618, 257, 3516, 1625, 510, 284, 607, 290, 1965, 611, 673, 373, 257, 5827, 286, 1583, 13, 4176, 338, 11, 284, 543, 673, 8712, 25, 366, 5297, 11, 703, 750, 345, 760, 43634, 679, 8712, 366, 39, 11860, 15513, 360, 11860, 15513, 37760, 2474, 220, 50256, 38101, 29926, 2611, 1406, 12301, 11, 25455, 337, 2002, 64, 11, 25455, 40084, 523, 3735, 326, 673, 17157, 625, 290, 1392, 5169, 329, 6301, 8469, 13, 220, 50256]
160


In [35]:
# Add "labels" column to the dataset_features. 
# To modify the dataset structure, we use the `dataset.map()` method
def add_labels(dataset):
    # Since the task is language modelling, the labels to predict are actually 
    # the input indices shifted forward by one element (token)
    dataset['labels'] = dataset['input_ids'].copy()
    return dataset

dataset_for_lm = dataset_grouped.map(add_labels,
                                     batched=True,
                                     desc='Add labels to create data for language model training')
 

Add labels to create data for language model training: 100%|██████████| 3/3 [00:00<00:00,  4.11ba/s]
Add labels to create data for language model training: 100%|██████████| 1/1 [00:00<00:00, 14.45ba/s]


In [44]:
# Instantiate the model class
from transformers import (
    AutoConfig, 
    AutoModelForCausalLM, 
    Trainer, 
    TrainingArguments,
    default_data_collator,
)
import torch


# TODO: experiment with different model configuration and batch sizes until 
# the models fits into GPU memory (otherwise it generated CUDA-out-of-memory error)
# The model is instantiated from the pretrained GPT-2 model
# Here, I reduced the number of attention head and layers, 
# to significantly reduce the model size and make sure it fits in the GPU memory
config = AutoConfig.from_pretrained(pretrained_model,
                                    n_head=12,  # reduce the size of the model for memory issues
                                    n_layer=12)

pretrained_model = 'gpt2-recipes'
model = AutoModelForCausalLM.from_pretrained(pretrained_model, 
                                             config=config)

# Again, we simulate a batch size of 8 by setting the `gradient_accumulation_steps` parameters
no_cuda = not bool(torch.cuda.is_available())

if no_cuda:
  print(f"Training on CPUs")
else:
  print(f"Training on GPU")

training_args = TrainingArguments(no_cuda=no_cuda,
                                  per_device_train_batch_size=4,
                                  per_device_eval_batch_size=4,
                                  gradient_accumulation_steps=4, # virtually increment the batch_size
                                  evaluation_strategy='epoch',
                                  save_strategy='epoch',
                                  logging_steps=100,
                                  logging_dir='gpt2-jokes/tb',  # where to store the tensorboard
                                  num_train_epochs=10,
                                  output_dir='gpt2-jokes')

# Start the training!
# Initialize our Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_for_lm['train'],
    eval_dataset=dataset_for_lm['test'], # we use the test set as validation set
    tokenizer=tokenizer,
    # Data collator is used to create batches from data. 
    # When a tokenizer is passed the default to DataCollatorWithPadding is used.
    # So we change it since our model do not use PAD tokens
    data_collator=default_data_collator,
)

loading configuration file gpt2-recipes/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "resid_pdrop": 0.1,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.9.2",
  "use_cache": true,
  "vocab_size": 50257
}

loading weights file gpt2-recipes/pytorch_model.bin
All model 

Training on GPU


In [45]:
# Use tensorboard to monitor the training
# Load the TensorBoard notebook extension
%reload_ext tensorboard  

 # read data from tensorboard dir
%tensorboard --logdir gpt2-jokes/tb 

In [46]:
# Finally: let's start the training!
train_results = trainer.train()

***** Running training *****
  Num examples = 2802
  Num Epochs = 10
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 4
  Total optimization steps = 1750
  6%|▌         | 100/1750 [02:06<33:49,  1.23s/it]

{'loss': 3.4358, 'learning_rate': 4.714285714285714e-05, 'epoch': 0.57}


 10%|█         | 175/1750 [03:38<31:53,  1.21s/it]***** Running Evaluation *****
  Num examples = 285
  Batch size = 4

 10%|█         | 175/1750 [03:46<31:53,  1.21s/it]Saving model checkpoint to gpt2-jokes/checkpoint-175
Configuration saved in gpt2-jokes/checkpoint-175/config.json


{'eval_loss': 3.1719183921813965, 'eval_runtime': 7.4121, 'eval_samples_per_second': 38.451, 'eval_steps_per_second': 9.714, 'epoch': 1.0}


Model weights saved in gpt2-jokes/checkpoint-175/pytorch_model.bin
tokenizer config file saved in gpt2-jokes/checkpoint-175/tokenizer_config.json
Special tokens file saved in gpt2-jokes/checkpoint-175/special_tokens_map.json
 11%|█▏        | 200/1750 [04:18<31:37,  1.22s/it]

{'loss': 3.2798, 'learning_rate': 4.428571428571428e-05, 'epoch': 1.14}


 17%|█▋        | 300/1750 [06:20<29:30,  1.22s/it]

{'loss': 3.1755, 'learning_rate': 4.1428571428571437e-05, 'epoch': 1.71}


 20%|██        | 350/1750 [07:21<28:58,  1.24s/it]***** Running Evaluation *****
  Num examples = 285
  Batch size = 4

 20%|██        | 350/1750 [07:29<28:58,  1.24s/it]Saving model checkpoint to gpt2-jokes/checkpoint-350
Configuration saved in gpt2-jokes/checkpoint-350/config.json


{'eval_loss': 3.120232343673706, 'eval_runtime': 7.4185, 'eval_samples_per_second': 38.417, 'eval_steps_per_second': 9.705, 'epoch': 2.0}


Model weights saved in gpt2-jokes/checkpoint-350/pytorch_model.bin
tokenizer config file saved in gpt2-jokes/checkpoint-350/tokenizer_config.json
Special tokens file saved in gpt2-jokes/checkpoint-350/special_tokens_map.json
 23%|██▎       | 400/1750 [08:32<27:59,  1.24s/it]

{'loss': 3.1372, 'learning_rate': 3.857142857142858e-05, 'epoch': 2.29}


 29%|██▊       | 500/1750 [10:37<27:13,  1.31s/it]

{'loss': 3.0679, 'learning_rate': 3.571428571428572e-05, 'epoch': 2.86}


 30%|███       | 525/1750 [11:09<26:02,  1.28s/it]***** Running Evaluation *****
  Num examples = 285
  Batch size = 4

 30%|███       | 525/1750 [11:17<26:02,  1.28s/it]Saving model checkpoint to gpt2-jokes/checkpoint-525
Configuration saved in gpt2-jokes/checkpoint-525/config.json


{'eval_loss': 3.0964958667755127, 'eval_runtime': 8.2067, 'eval_samples_per_second': 34.728, 'eval_steps_per_second': 8.773, 'epoch': 3.0}


Model weights saved in gpt2-jokes/checkpoint-525/pytorch_model.bin
tokenizer config file saved in gpt2-jokes/checkpoint-525/tokenizer_config.json
Special tokens file saved in gpt2-jokes/checkpoint-525/special_tokens_map.json
 34%|███▍      | 600/1750 [12:56<24:36,  1.28s/it]

{'loss': 3.0294, 'learning_rate': 3.285714285714286e-05, 'epoch': 3.43}


 40%|████      | 700/1750 [15:06<22:39,  1.30s/it]***** Running Evaluation *****
  Num examples = 285
  Batch size = 4


{'loss': 3.0154, 'learning_rate': 3e-05, 'epoch': 4.0}



 40%|████      | 700/1750 [15:14<22:39,  1.30s/it]Saving model checkpoint to gpt2-jokes/checkpoint-700
Configuration saved in gpt2-jokes/checkpoint-700/config.json


{'eval_loss': 3.0854358673095703, 'eval_runtime': 7.962, 'eval_samples_per_second': 35.795, 'eval_steps_per_second': 9.043, 'epoch': 4.0}


Model weights saved in gpt2-jokes/checkpoint-700/pytorch_model.bin
tokenizer config file saved in gpt2-jokes/checkpoint-700/tokenizer_config.json
Special tokens file saved in gpt2-jokes/checkpoint-700/special_tokens_map.json
 46%|████▌     | 800/1750 [17:23<19:17,  1.22s/it]

{'loss': 2.9664, 'learning_rate': 2.714285714285714e-05, 'epoch': 4.57}


 50%|█████     | 875/1750 [18:55<17:56,  1.23s/it]***** Running Evaluation *****
  Num examples = 285
  Batch size = 4

 50%|█████     | 875/1750 [19:04<17:56,  1.23s/it]Saving model checkpoint to gpt2-jokes/checkpoint-875
Configuration saved in gpt2-jokes/checkpoint-875/config.json


{'eval_loss': 3.0803442001342773, 'eval_runtime': 8.2368, 'eval_samples_per_second': 34.601, 'eval_steps_per_second': 8.741, 'epoch': 5.0}


Model weights saved in gpt2-jokes/checkpoint-875/pytorch_model.bin
tokenizer config file saved in gpt2-jokes/checkpoint-875/tokenizer_config.json
Special tokens file saved in gpt2-jokes/checkpoint-875/special_tokens_map.json
 51%|█████▏    | 900/1750 [19:37<17:57,  1.27s/it]

{'loss': 2.9507, 'learning_rate': 2.4285714285714288e-05, 'epoch': 5.14}


 57%|█████▋    | 1000/1750 [21:40<15:10,  1.21s/it]

{'loss': 2.9288, 'learning_rate': 2.1428571428571428e-05, 'epoch': 5.71}


 60%|██████    | 1050/1750 [22:41<14:11,  1.22s/it]***** Running Evaluation *****
  Num examples = 285
  Batch size = 4

 60%|██████    | 1050/1750 [22:49<14:11,  1.22s/it]Saving model checkpoint to gpt2-jokes/checkpoint-1050
Configuration saved in gpt2-jokes/checkpoint-1050/config.json


{'eval_loss': 3.074277877807617, 'eval_runtime': 8.2019, 'eval_samples_per_second': 34.748, 'eval_steps_per_second': 8.778, 'epoch': 6.0}


Model weights saved in gpt2-jokes/checkpoint-1050/pytorch_model.bin
tokenizer config file saved in gpt2-jokes/checkpoint-1050/tokenizer_config.json
Special tokens file saved in gpt2-jokes/checkpoint-1050/special_tokens_map.json
 63%|██████▎   | 1100/1750 [23:52<13:32,  1.25s/it]

{'loss': 2.9012, 'learning_rate': 1.8571428571428572e-05, 'epoch': 6.29}


 69%|██████▊   | 1200/1750 [25:56<10:53,  1.19s/it]

{'loss': 2.8806, 'learning_rate': 1.5714285714285715e-05, 'epoch': 6.86}


 70%|███████   | 1225/1750 [26:26<10:22,  1.19s/it]***** Running Evaluation *****
  Num examples = 285
  Batch size = 4

 70%|███████   | 1225/1750 [26:34<10:22,  1.19s/it]Saving model checkpoint to gpt2-jokes/checkpoint-1225
Configuration saved in gpt2-jokes/checkpoint-1225/config.json


{'eval_loss': 3.073920965194702, 'eval_runtime': 8.0765, 'eval_samples_per_second': 35.288, 'eval_steps_per_second': 8.915, 'epoch': 7.0}


Model weights saved in gpt2-jokes/checkpoint-1225/pytorch_model.bin
tokenizer config file saved in gpt2-jokes/checkpoint-1225/tokenizer_config.json
Special tokens file saved in gpt2-jokes/checkpoint-1225/special_tokens_map.json
 74%|███████▍  | 1300/1750 [28:06<08:53,  1.19s/it]

{'loss': 2.8783, 'learning_rate': 1.2857142857142857e-05, 'epoch': 7.43}


 80%|████████  | 1400/1750 [30:04<06:55,  1.19s/it]***** Running Evaluation *****
  Num examples = 285
  Batch size = 4


{'loss': 2.8589, 'learning_rate': 1e-05, 'epoch': 8.0}



 80%|████████  | 1400/1750 [30:13<06:55,  1.19s/it]Saving model checkpoint to gpt2-jokes/checkpoint-1400
Configuration saved in gpt2-jokes/checkpoint-1400/config.json


{'eval_loss': 3.0733442306518555, 'eval_runtime': 8.6275, 'eval_samples_per_second': 33.034, 'eval_steps_per_second': 8.345, 'epoch': 8.0}


Model weights saved in gpt2-jokes/checkpoint-1400/pytorch_model.bin
tokenizer config file saved in gpt2-jokes/checkpoint-1400/tokenizer_config.json
Special tokens file saved in gpt2-jokes/checkpoint-1400/special_tokens_map.json
 86%|████████▌ | 1500/1750 [32:14<04:56,  1.19s/it]

{'loss': 2.85, 'learning_rate': 7.142857142857143e-06, 'epoch': 8.57}


 90%|█████████ | 1575/1750 [33:43<03:27,  1.19s/it]***** Running Evaluation *****
  Num examples = 285
  Batch size = 4

 90%|█████████ | 1575/1750 [33:52<03:27,  1.19s/it]Saving model checkpoint to gpt2-jokes/checkpoint-1575
Configuration saved in gpt2-jokes/checkpoint-1575/config.json


{'eval_loss': 3.073152542114258, 'eval_runtime': 8.8302, 'eval_samples_per_second': 32.276, 'eval_steps_per_second': 8.154, 'epoch': 9.0}


Model weights saved in gpt2-jokes/checkpoint-1575/pytorch_model.bin
tokenizer config file saved in gpt2-jokes/checkpoint-1575/tokenizer_config.json
Special tokens file saved in gpt2-jokes/checkpoint-1575/special_tokens_map.json
 91%|█████████▏| 1600/1750 [34:24<03:02,  1.22s/it]

{'loss': 2.8406, 'learning_rate': 4.285714285714286e-06, 'epoch': 9.14}


 97%|█████████▋| 1700/1750 [36:27<00:59,  1.19s/it]

{'loss': 2.8356, 'learning_rate': 1.4285714285714286e-06, 'epoch': 9.71}


100%|██████████| 1750/1750 [37:27<00:00,  1.18s/it]***** Running Evaluation *****
  Num examples = 285
  Batch size = 4

100%|██████████| 1750/1750 [37:36<00:00,  1.18s/it]Saving model checkpoint to gpt2-jokes/checkpoint-1750
Configuration saved in gpt2-jokes/checkpoint-1750/config.json


{'eval_loss': 3.0730676651000977, 'eval_runtime': 8.8754, 'eval_samples_per_second': 32.111, 'eval_steps_per_second': 8.112, 'epoch': 10.0}


Model weights saved in gpt2-jokes/checkpoint-1750/pytorch_model.bin
tokenizer config file saved in gpt2-jokes/checkpoint-1750/tokenizer_config.json
Special tokens file saved in gpt2-jokes/checkpoint-1750/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 1750/1750 [37:38<00:00,  1.29s/it]

{'train_runtime': 2258.2127, 'train_samples_per_second': 12.408, 'train_steps_per_second': 0.775, 'train_loss': 2.9967728445870536, 'epoch': 10.0}





In [47]:
# Save model and tokenizer
trainer.save_model('gpt2-jokes')

# Save the metrics obtained (loss)
metrics_train = train_results.metrics
trainer.log_metrics('train', metrics_train)
trainer.save_metrics('train', metrics_train)

# save trainer state Saves the Trainer state, since Trainer.save_model 
# saves only the tokenizer with the model
trainer.save_state()

Saving model checkpoint to gpt2-jokes
Configuration saved in gpt2-jokes/config.json
Model weights saved in gpt2-jokes/pytorch_model.bin
tokenizer config file saved in gpt2-jokes/tokenizer_config.json
Special tokens file saved in gpt2-jokes/special_tokens_map.json


***** train metrics *****
  epoch                    =       10.0
  train_loss               =     2.9968
  train_runtime            = 0:37:38.21
  train_samples_per_second =     12.408
  train_steps_per_second   =      0.775


# 4. Evaluate the model

In [40]:
metrics_eval = trainer.evaluate()

***** Running Evaluation *****
  Num examples = 285
  Batch size = 4
100%|██████████| 72/72 [00:07<00:00,  9.02it/s]


In [41]:
import math

# compute perplexity as the exponential of the loss (cross-entropy)
perplexity = math.exp(metrics_eval['eval_loss'])
metrics_eval['perplexity'] = perplexity

# save evaluation metrics
trainer.log_metrics('eval', metrics_eval)
trainer.save_metrics('eval', metrics_eval)

***** eval metrics *****
  epoch                   =        2.0
  eval_loss               =     3.1606
  eval_runtime            = 0:00:07.99
  eval_samples_per_second =     35.641
  eval_steps_per_second   =      9.004
  perplexity              =    23.5845


In [48]:
from transformers import TextGenerationPipeline, AutoModelForCausalLM, AutoTokenizer

checkpoint = 'gpt2-jokes'
model_checkpoint = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

pipeline_generate = TextGenerationPipeline(model_checkpoint,
                                           tokenizer=tokenizer)

while True:
    prompt = input('\n\nInsert prompt\n')
    max_length = int(input('\nInsert max generation length\n'))
    top_p = float(input('\nInsert top_p\n'))
    top_k = int(input('\nInsert top_k\n'))
    num_return_sequences = int(input('\nInsert num_return_sequences\n'))
    
    generated_sentence = pipeline_generate(prompt,    
                                           max_length=max_length,
                                           do_sample=True,
                                           top_k=top_k,
                                           top_p=top_p,
                                           num_return_sequences=num_return_sequences,
                                           early_stopping=False)
    
    
    for gen in generated_sentence:
        print(gen['generated_text'])

loading configuration file gpt2-jokes/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2-recipes",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "resid_pdrop": 0.1,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.9.2",
  "use_cache": true,
  "vocab_size": 50257
}

loading weights file gpt2-jokes/pytorch_model.bin
All mo

two guys walk in a bar. The first guy asks the second guy if he's ever been to a bar before. The second guy says, "Yes, I've been to a bar before but I haven't ever seen someone drinking." So the


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


two guys walk in a bar  and one guy yells "Sell to the people on the ground" 
two guys walk in a bar 


ValueError: invalid literal for int() with base 10: ''