https://www.freecodecamp.org/news/train-algorithms-from-scratch-with-hugging-face/

## importing the tokenizer, the word-level model and the word-level trainer
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer

## a pretokenizer to segment the text into words
from tokenizers.pre_tokenizers import Whitespace

# Special tokens handed to the trainer below.
spl_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]

# Word-level tokenizer with a whitespace pre-tokenizer to segment text into words.
tokenizer = Tokenizer(WordLevel())
tokenizer.pre_tokenizer = Whitespace()

# Trainer settings: vocabulary capped at 10000 entries, words need frequency >= 2.
trainer = WordLevelTrainer(
    vocab_size=10000,
    min_frequency=2,
    special_tokens=spl_tokens,
)

In [None]:
from pathlib import Path

# Collect every CSV matched by ../Data/*.csv (relative to the working
# directory) as plain strings; the bare name echoes the list in the notebook.
paths = list(map(str, Path(".").glob("../Data/*.csv")))
paths

%%time 
from pathlib import Path

from tokenizers import ByteLevelBPETokenizer

# Initialize a byte-level BPE tokenizer.  It is bound to its own name so that
# it does not replace the WordLevel `tokenizer` built earlier in this file:
# `ByteLevelBPETokenizer.train` has no `trainer` parameter, so the
# `tokenizer.train(paths, trainer=trainer)` call below would raise a TypeError
# if `tokenizer` pointed at the BPE tokenizer.
bpe_tokenizer = ByteLevelBPETokenizer()

# Customize training
bpe_tokenizer.train(
    files=paths,
    vocab_size=52_000,
    min_frequency=2,
    special_tokens=[
        "<s>",
        "<pad>",
        "</s>",
        "<unk>",
        "<mask>",
    ],
)

# Train the WordLevel tokenizer with its WordLevelTrainer on the same files.
tokenizer.train(paths, trainer=trainer)

# Persist the trained WordLevel tokenizer; it is read back from this path
# further down in the file.
tokenizer.save("./tokenizer.json")

In [6]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
# NOTE(review): the output recorded right after this cell shows this call
# failing with "TypeError: not a string".  Presumably the "Chess" directory
# holds no SentencePiece vocabulary file for T5Tokenizer to load -- confirm.
tokenizer = T5Tokenizer.from_pretrained("Chess")

TypeError: not a string

In [1]:
from transformers import T5Config, T5ForConditionalGeneration, T5Model

# Only the t5-small configuration is fetched; the model is built from the
# config rather than loaded with from_pretrained.
config = T5Config.from_pretrained('t5-small')
# T5Model is the bare encoder-decoder: its forward() accepts no `labels` and
# returns no loss, so Trainer cannot optimise it.  The conditional-generation
# class adds the language-modelling head and computes the loss from `labels`.
model = T5ForConditionalGeneration(config)

In [5]:
# Save the model under the "Chess" directory via save_pretrained.
model.save_pretrained("Chess")

In [None]:
import pandas as pd
# Load the cleaned lichess CSV from the working directory into a DataFrame.
chess_data = pd.read_csv('cleaned_lichess08_test.csv')

In [None]:
# Notebook display: number of rows in the loaded DataFrame.
len(chess_data)

In [None]:
# Notebook display: parameter count reported by the model.
model.num_parameters()

from tokenizers.implementations import BaseTokenizer
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast, LineByLineTextDataset

# Load the trained `tokenizers` Tokenizer back from the saved JSON file
base_tokenizer = BaseTokenizer(Tokenizer.from_file("./tokenizer.json"))

# Wrap it in something `transformers` understands.  PreTrainedTokenizer is the
# abstract slow-tokenizer base class and does not load a `tokenizers` JSON
# file; PreTrainedTokenizerFast reads it directly through `tokenizer_file`.
# The special tokens are the ones the tokenizer was trained with and have to
# be declared here so the wrapper exposes them (pad/mask/unk/bos/eos).
tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="./tokenizer.json",
    bos_token="<s>",
    eos_token="</s>",
    unk_token="<unk>",
    pad_token="<pad>",
    mask_token="<mask>",
)



In [None]:
from datasets import load_dataset
# Load the cleaned lichess CSV through the `datasets` csv loader.
# NOTE(review): no `split` is passed, so this presumably returns a DatasetDict
# keyed by split name ("train") rather than a single Dataset -- confirm.
dataset = load_dataset('csv', data_files=["./cleaned_lichess08_test.csv"])

In [None]:
# Notebook display: show the loaded dataset object.
dataset

In [None]:
from transformers import DataCollatorForLanguageModeling

# Make sure the tokenizer exposes '<mask>' as its mask token before it is
# handed to the collator.
tokenizer.mask_token = '<mask>'

# Collator configured for masked language modelling (mlm=True) with a masking
# probability of 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [None]:
from transformers import Trainer, TrainingArguments

# Training configuration: one epoch, checkpoints written to ./Chess every
# 10,000 steps with at most two kept on disk.
training_args = TrainingArguments(
    output_dir="./Chess",
    overwrite_output_dir=True,
    num_train_epochs=1,
    # `per_gpu_train_batch_size` is deprecated; `per_device_train_batch_size`
    # is the supported spelling of the same setting.
    per_device_train_batch_size=64,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
)

# `load_dataset("csv", ...)` without a `split` argument returns a DatasetDict;
# Trainer needs a single split, so pass the "train" split explicitly.
# NOTE(review): the rows are still raw CSV columns at this point; presumably a
# tokenization step (e.g. `dataset.map(...)`) is required before training --
# the CSV's column names are not visible here, confirm.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset["train"],
)

In [None]:
%%time
# Run the training loop configured above (timed by the %%time cell magic).
trainer.train()

- https://discuss.huggingface.co/t/t5-training-from-scratch/1898
- https://huggingface.co/docs/datasets/loading_datasets.html
- https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb#scrollTo=VmaHZXzmkNtJ
- https://huggingface.co/blog/how-to-train