In [None]:
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer

# Start from an untrained byte-level BPE tokenizer.
tokenizer = ByteLevelBPETokenizer()

# Text files the tokenizer is fitted on.
paths = ["name_dataset.txt"]

# Fit the tokenizer on the corpus; the five special tokens are passed explicitly.
tokenizer.train(
    files=paths,
    vocab_size=20_000,
    min_frequency=2,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
)

In [None]:
# Create the target directory before saving instead of relying on a manual
# `mkdir`: save_model() is expected to write into an existing directory.
tokenizer_dir = "./exp_wiki/tokenizer_wiki_"
Path(tokenizer_dir).mkdir(parents=True, exist_ok=True)
tokenizer.save_model(tokenizer_dir)

In [None]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing


# Rebuild the tokenizer from the vocabulary and merges files located under
# ./exp_wiki/tokenizer_wiki_/ (the directory the save_model cell writes to).
vocab_path = "./exp_wiki/tokenizer_wiki_/vocab.json"
merges_path = "./exp_wiki/tokenizer_wiki_/merges.txt"
tokenizer = ByteLevelBPETokenizer(vocab_path, merges_path)

In [None]:
# Report whether PyTorch can see a CUDA device; the boolean is shown as the cell output.
import torch
torch.cuda.is_available()

In [None]:
# Seed the Python, NumPy and PyTorch (CPU and CUDA) random number generators
# with one shared value.
import random

import numpy as np

seed = 42
for seed_fn in (np.random.seed, random.seed, torch.manual_seed, torch.cuda.manual_seed):
    seed_fn(seed)

In [None]:
from transformers import GPT2LMHeadModel, GPT2Config, GPT2Tokenizer

# Build a GPT-2 language model from a fresh configuration (no pretrained weights
# are loaded here).
# NOTE(review): vocab_size is 52_000 here, while the tokenizer-training cell asks
# for vocab_size=20_000 — confirm which vocabulary size is intended.
configuration = GPT2Config(vocab_size=52_000)
model = GPT2LMHeadModel(configuration)

In [None]:
# Load the tokenizer from the directory this notebook saved it to
# (./exp_wiki/tokenizer_wiki_), so a fresh "Restart & Run All" finds
# vocab.json / merges.txt where they were actually written.
tokenizer = GPT2Tokenizer.from_pretrained("./exp_wiki/tokenizer_wiki_", max_len=512)

# Use the "<pad>" token that was reserved when the tokenizer was trained rather
# than appending a second, unrelated "[PAD]" entry to the vocabulary.
tokenizer.add_special_tokens({'pad_token': '<pad>'})

# Keep the model's embedding matrix in sync with the tokenizer: every token id
# (including the pad id) must have an embedding row, and no rows should be left
# over for ids the tokenizer can never produce.
model.resize_token_embeddings(len(tokenizer))

In [None]:
from transformers import LineByLineTextDataset

from transformers import DataCollatorForLanguageModeling

# Dataset built from "data_file.txt" with the current tokenizer and block_size=128.
dataset = LineByLineTextDataset(tokenizer=tokenizer, file_path="data_file.txt", block_size=128)

# Batch collator for language modelling; mlm=False turns masked-LM mode off.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
from transformers import TextDataset
from transformers import Trainer, TrainingArguments

# Curriculum over sequence length. Each phase is (block_size, batch_size, end_steps),
# where end_steps is the cumulative global step at which the phase stops.
curriculum = [
    (64, 8, 10_000),
    (128, 8, 20_000),
    (256, 8, 30_000),
    (512, 8, 60_000),
]

# The evaluation set is the same for every phase, so build it once instead of
# once per loop iteration.
eval_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="dataset_path_eval.txt",
    block_size=512,
)

# Global step reached by the previous phase (0 before any training).
last_steps = 0
is_first_phase = True

for block_size, batch_size, end_steps in curriculum:
    print(f"######## Block size = {block_size}, Batch size = {batch_size} ########")

    # Re-chunk the training corpus at this phase's sequence length.
    train_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path="dataset_path_train.txt",
        block_size=block_size,
    )

    # max_steps bounds the phase at end_steps. Every end_steps in the curriculum
    # is a multiple of save_steps, so "log/checkpoint-{end_steps}" exists when
    # the next phase resumes from it. The batch size now comes from the
    # curriculum entry instead of being hard-coded.
    training_args = TrainingArguments(
        output_dir="log",  # mkdir log if necessary
        overwrite_output_dir=True,
        max_steps=end_steps,
        per_gpu_train_batch_size=batch_size,
        save_steps=10_000,
        save_total_limit=2,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )

    # The first phase starts from scratch; later phases continue from the
    # checkpoint written at the end of the previous phase.
    if is_first_phase:
        trainer.train()
        is_first_phase = False
    else:
        trainer.train(f"log/checkpoint-{last_steps}")
    last_steps = end_steps

In [None]:
# Write the model held by the trainer of the last curriculum phase to
# ./exp_wiki/insert_name (replace "insert_name" with the desired model name).
trainer.save_model(output_dir="./exp_wiki/insert_name")

In [None]:
import torch.nn as nn
import torch.nn.functional as F

In [None]:
def generate(model, prompt, max_tokens, temperature=0.7, max_context=128):
    """Autoregressively sample ``max_tokens`` new tokens and append them to ``prompt``.

    Args:
        model: Callable module whose output's first element holds the logits,
            indexed as ``[batch, position, vocab]``.
        prompt: Integer tensor of token ids indexed as ``[batch, position]``.
        max_tokens: Number of tokens to sample and append.
        temperature: Positive divisor applied to the logits before the softmax;
            smaller values make sampling more peaked.
        max_context: Maximum number of trailing tokens fed to the model at each
            step. The default of 128 matches the previously hard-coded window.

    Returns:
        Tensor containing the full prompt followed by the ``max_tokens``
        sampled tokens, i.e. ``prompt.shape[1] + max_tokens`` positions.

    Raises:
        ValueError: If ``temperature`` or ``max_context`` is not positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")
    if max_context < 1:
        raise ValueError("max_context must be at least 1")

    model.eval()
    # Sampling needs no gradients; skipping autograd avoids building a graph
    # that grows with every generated token.
    with torch.no_grad():
        for _ in range(max_tokens):
            # Sliding window over the most recent tokens. Slicing from the
            # front instead would discard every token sampled once the
            # sequence exceeds the window, so the output would stop growing.
            context = prompt[:, -max_context:]
            logits = model(context)[0][:, -1, :] / temperature
            probs = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            # Append to the full sequence, not to the truncated context.
            prompt = torch.cat((prompt, next_token), dim=1)
    return prompt

In [None]:
max_tokens = 128

# Put the prompt on whatever device the model's parameters live on instead of
# assuming CUDA, so the cell also runs on a CPU-only machine.
device = next(model.parameters()).device
prompt = tokenizer.encode("In my opinion", return_tensors='pt').to(device)

# Keep the generated token ids and the decoded string under separate names.
generated_ids = generate(model, prompt, max_tokens, temperature=0.7)
generated_text = tokenizer.decode(generated_ids[0])
print(generated_text)