https://www.freecodecamp.org/news/train-algorithms-from-scratch-with-hugging-face/

In [1]:
## importing the tokenizer and subword BPE trainer
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer

## a pretokenizer to segment the text into words
from tokenizers.pre_tokenizers import Whitespace

In [2]:
tokenizer = Tokenizer(WordLevel())
spl_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]
trainer = WordLevelTrainer(special_tokens = spl_tokens, vocab_size = 10000, min_frequency = 2)

tokenizer.pre_tokenizer = Whitespace()

In [3]:
from pathlib import Path

paths = [str(x) for x in Path(".").glob("../Data/*.csv")]
paths

['..\\Data\\cleaned_merged_chess_data.csv']

tokenizer.train(paths, trainer = trainer)

tokenizer.save("./tokenizer.json")

In [4]:
tokenizer = Tokenizer.from_file("./tokenizer.json")

In [5]:
from transformers import T5Config, T5Model
config = T5Config.from_pretrained('t5-small')
model = T5Model(config)

In [6]:
import pandas as pd
chess_data = pd.read_csv('cleaned_lichess08_sample.csv')

In [7]:
model.num_parameters()

60506624

from tokenizers.implementations import BaseTokenizer
from transformers import PreTrainedTokenizer, LineByLineTextDataset

#ttokenizer = Tokenizer(WordLevel("vocb.json"))
#tokenizer.pre_tokenizer = WhitespaceSplit(" ")

# Load the `tokenizers` Tokenizer somehow
base_tokenizer = BaseTokenizer(Tokenizer.from_file("./tokenizer.json"))

# Wrap it in something `transformers` understands
tokenizer = PreTrainedTokenizer(pretrained_vocab_files_map = "./tokenizer.json")


In [8]:
from datasets import load_dataset
dataset = load_dataset('csv', data_files=["./cleaned_lichess08_sample.csv"])

Using custom data configuration default-70b4d030918b2585
Reusing dataset csv (C:\Users\Oshingabesan Adebayo\.cache\huggingface\datasets\csv\default-70b4d030918b2585\0.0.0\bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a)
100%|###################################################################################| 1/1 [00:00<00:00, 500.22it/s]


In [9]:
dataset

DatasetDict({
    train: Dataset({
        features: ['PGN'],
        num_rows: 10000
    })
})

In [10]:
from transformers import DataCollatorForLanguageModeling

tokenizer.mask_token = '<mask>'

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [11]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./Chess",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_gpu_train_batch_size=64,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

In [None]:
%%time
trainer.train()

Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
wandb: Currently logged in as: dehbaiyor (use `wandb login --relogin` to force relogin)


https://discuss.huggingface.co/t/t5-training-from-scratch/1898
https://huggingface.co/docs/datasets/loading_datasets.html
https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb#scrollTo=VmaHZXzmkNtJ