In [1]:
from transformers import GPT2Config, GPT2LMHeadModel, GPT2Tokenizer, DataCollatorForLanguageModeling
from datasets import load_dataset
from transformers import Trainer, TrainingArguments    
    
tokenizer = GPT2Tokenizer.from_pretrained("./models/impgen-tokenizer")
tokenizer.add_special_tokens({
    "eos_token": "</s>",
    "bos_token": "<s>",
    "unk_token": "<unk>",
    "pad_token": "<pad>",
    "mask_token": "<mask>"
})
config = GPT2Config(
    vocab_size=tokenizer.vocab_size,
    bos_token=tokenizer.bos_token_id,
    eos_token=tokenizer.eos_token_id
)


model = GPT2LMHeadModel(config)
dataset = load_dataset("text", data_files=["./data/speeches.txt"])

2023-03-23 02:28:54.824225: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-23 02:28:57.292017: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-03-23 02:28:57.292309: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory
Found cached dataset text (/home/jovyan/.cache/huggingface/datasets/text/default-13f07c76466abf35/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ac

  0%|          | 0/1 [00:00<?, ?it/s]

In [2]:
def encode(lines):
    return tokenizer(lines["text"], add_special_tokens=True, truncation=True, max_length=512)

dataset.set_transform(encode)

In [3]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
training_args = TrainingArguments(
    output_dir="./impgen",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=5,
    save_steps=100,
    save_total_limit=2,
    prediction_loss_only=True,
    remove_unused_columns=False
)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset["train"]
)
trainer.train()
trainer.save_model("impgen")



Step,Training Loss
500,6.0473


In [8]:
NEWLINECHAR = "<N>"

def encode_newlines(inp):
    return inp.replace("\n", NEWLINECHAR)

def decode_newlines(inp):
    return inp.replace(NEWLINECHAR, "\n")

inp = "There's never been"
input_ids = tokenizer.encode(inp, return_tensors="pt")
model_out = model.generate(
    input_ids,
    max_length=100,
    num_beams=4,
    temperature=0.7,
    no_repeat_ngram_size=4,
    num_return_sequences=3,
    return_dict_in_generate=True,
    output_scores=True
)

for seq in model_out["sequences"]:
    print(decode_newlines(tokenizer.decode(seq)))
        


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


There's never been very much. It's a lot of a lot of people. And we're going to be very much. We're going to have a lot of our country. And I think it. And I want to be a lot of you. And we have a very much. And it. I think it's a very much, and we're doing a lot of it's a lot. And I don't know, and we have a great. And I'm going to be
There's never been very much. It's a lot of a lot of people. And we're going to be very much. We're going to have a lot of our country. And I think it. And I want to be a lot of you. And we have a very much. And it. I think it's a very much, and we're doing a lot of it's a lot. And I don't know, and we have a great. And I'm going to do
There's never been very much. It's a lot of a lot of people. And we're going to be very much. We're going to have a lot of our country. And I think it. And I want to be a lot of you. And we have a very much. And it. I think it's a very much, and we're doing a lot of it's a lot. And I don't know, and we have a great. And I'