In [1]:
import datasets as ds
import torch as t
import transformers as tfs
import re
from sklearn.model_selection import train_test_split

In [2]:
def clean_text_data(text_file, bos_token='<BOS>', eos_token='<EOS>'):
    """Read a UTF-8 text file and wrap every non-blank line in sequence markers.

    Lines consisting only of whitespace are dropped. Each remaining line is
    stripped of surrounding whitespace and rewritten as
    ``bos_token + ' ' + text + ' ' + eos_token`` followed by a newline.

    Args:
        text_file: Path of the text file to read.
        bos_token: Marker placed before each line's text.
        eos_token: Marker placed after each line's text.

    Returns:
        A list of the rewritten lines, in file order, each ending in a newline.
    """
    with open(text_file, 'r', encoding='utf-8') as f:
        # Iterate the file directly rather than materialising readlines(),
        # building a second filtered list and then mutating it by index.
        return [
            f"{bos_token} {line.strip()} {eos_token}\n"
            for line in f
            if not line.isspace()
        ]

def build_datasets(text_list, dest):
    """Write the given strings, concatenated in order, to ``dest``.

    No separator is inserted between items, so each item must carry its own
    trailing newline if one is wanted. An existing file at ``dest`` is
    overwritten.

    Args:
        text_list: Iterable of strings to write.
        dest: Path of the output file (written as UTF-8).
    """
    # The context manager closes the file even if joining or writing raises.
    with open(dest, 'w', encoding='utf-8') as f:
        # str.join builds the payload in one pass instead of repeated `+=`.
        f.write("".join(text_list))
# Fraction of the cleaned lines assigned to the training split; the rest
# becomes the test split.
train_test_ratio = 0.9

clean_texts = clean_text_data("fantasy_adventure.txt")

# random_state is fixed so the split is the same on every run.
train_data, test_data = train_test_split(
    clean_texts,
    train_size=train_test_ratio,
    random_state=1,
)

# Write each split to its own text file.
build_datasets(text_list=train_data, dest="train.txt")
build_datasets(text_list=test_data, dest="test.txt")


In [3]:
# Start from the pretrained "gpt2" tokenizer.
tokenizer = tfs.GPT2TokenizerFast.from_pretrained("gpt2")

# Register the <BOS>/<EOS> markers (the same strings that clean_text_data
# wraps around each line) together with a padding token.
special_tokens_dict = dict(
    bos_token='<BOS>',
    eos_token='<EOS>',
    pad_token='<PAD>',
)
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)


In [7]:
def load_text_dataset(train_path, test_path, tokenizer, block_size=128):
    """Build the train/test line-by-line datasets and their data collator.

    Args:
        train_path: Path of the training text file.
        test_path: Path of the test text file.
        tokenizer: Tokenizer passed to both datasets and to the collator.
        block_size: ``block_size`` forwarded to both ``LineByLineTextDataset``
            instances. Defaults to 128, the value previously hard-coded.

    Returns:
        A ``(train_dataset, test_dataset, data_collator)`` tuple.
    """
    # Both splits are built with identical settings; only the file differs.
    # The train dataset is constructed first, then the test dataset.
    train_dataset, test_dataset = (
        tfs.LineByLineTextDataset(
            tokenizer=tokenizer,
            file_path=path,
            block_size=block_size,
        )
        for path in (train_path, test_path)
    )
    # mlm=False: the collator is created with masked-language-modelling off.
    data_collator = tfs.DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )
    return train_dataset, test_dataset, data_collator
# Build both datasets and the collator from the split files on disk.
train_dataset, test_dataset, data_collator = load_text_dataset(
    train_path="train.txt",
    test_path="test.txt",
    tokenizer=tokenizer,
)

In [5]:
# Load the pretrained "gpt2" checkpoint with its language-modelling head.
model = tfs.GPT2LMHeadModel.from_pretrained("gpt2")
# Resize the token embeddings to the tokenizer's current length, which
# includes the special tokens added to it. Left as the cell's final bare
# expression, so the resulting embedding module is displayed as the output.
model.resize_token_embeddings(len(tokenizer))

Embedding(50260, 768)

In [14]:
# Training configuration: one epoch, with logging and checkpointing every
# 100 steps.
training_args = tfs.TrainingArguments(
    # Output locations.
    output_dir='./results',
    logging_dir='./logs',
    # Schedule and regularisation.
    num_train_epochs=1,
    warmup_steps=10,
    weight_decay=0.01,
    # Logging / checkpoint cadence.
    logging_steps=100,
    save_steps=100,
    save_total_limit=1,
    # NOTE(review): no_cuda=True presumably pins training to the CPU - confirm
    # this is intended if a GPU is available.
    no_cuda=True,
)

trainer = tfs.Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
)
# Final bare expression: the return value of train() is the cell's output.
trainer.train()

t][A[A[A[A[A




 66%|██████▋   | 850/1283 [1:17:04<37:17,  5.17s/it][A[A[A[A[A




 66%|██████▋   | 851/1283 [1:17:08<36:08,  5.02s/it][A[A[A[A[A




 66%|██████▋   | 852/1283 [1:17:13<34:33,  4.81s/it][A[A[A[A[A




 66%|██████▋   | 853/1283 [1:17:19<37:58,  5.30s/it][A[A[A[A[A




 67%|██████▋   | 854/1283 [1:17:23<34:33,  4.83s/it][A[A[A[A[A




 67%|██████▋   | 855/1283 [1:17:29<37:50,  5.31s/it][A[A[A[A[A




 67%|██████▋   | 856/1283 [1:17:34<35:43,  5.02s/it][A[A[A[A[A




 67%|██████▋   | 857/1283 [1:17:37<33:19,  4.69s/it][A[A[A[A[A




 67%|██████▋   | 858/1283 [1:17:44<36:54,  5.21s/it][A[A[A[A[A




 67%|██████▋   | 859/1283 [1:17:49<35:57,  5.09s/it][A[A[A[A[A




 67%|██████▋   | 860/1283 [1:17:55<38:42,  5.49s/it][A[A[A[A[A




 67%|██████▋   | 861/1283 [1:17:59<35:55,  5.11s/it][A[A[A[A[A




 67%|██████▋   | 862/1283 [1:18:04<34:23,  4.90s/it][A[A[A[A[A




 67%|██████▋   | 863/1283 [1:18:08<31:55, 

KeyboardInterrupt: 