Adapted from https://huggingface.co/blog/how-to-train

## Train a tokenizer


In [None]:
%%time 
from pathlib import Path

from tokenizers import ByteLevelBPETokenizer

# Collect every CSV file in the current working directory as a string path.
paths = [str(x) for x in Path(".").glob("*.csv")]
# Keep only the entries at indices 3 and 6 as the tokenizer training corpus.
# NOTE(review): the glob result is not sorted here, so which files these indices select is
# presumably filesystem-dependent, and this raises IndexError when fewer than seven CSVs
# match — consider selecting the files by name.
paths = [paths[3], paths[6]]
print(paths)

In [None]:
# Reserved tokens passed to tokenizer.train() via its special_tokens argument.
# (A chess-specific token list could be appended here: + chess_tokens.)
special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]

In [None]:
# Start from an untrained byte-level BPE tokenizer.
tokenizer = ByteLevelBPETokenizer()

# Train it on the selected CSV files; vocab_size matches the 50_000 used by the model configs.
tokenizer.train(
    files=paths,
    vocab_size=50_000,
    min_frequency=2,
    special_tokens=special_tokens,
)

In [None]:
import os

# Persist the trained vocabulary as ./MixedTokens/vocab.json and ./MixedTokens/merges.txt,
# which is what the next cell loads. `save_model(directory)` writes those two files, whereas
# `save(path)` in current `tokenizers` releases serializes a single JSON file at `path`.
os.makedirs("MixedTokens", exist_ok=True)  # make sure the target directory exists first
tokenizer.save_model("MixedTokens")

In [2]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

# Rebuild the tokenizer from the vocabulary and merges files stored under ./MixedTokens.
tokenizer = ByteLevelBPETokenizer("./MixedTokens/vocab.json", "./MixedTokens/merges.txt")

In [3]:
# Attach a post-processor built from the "</s>" and "<s>" tokens and their ids.
# The first pair is the closing token and the second the opening one: the sample
# encoding further down starts with "<s>" and ends with "</s>".
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
# Truncate encodings to at most 512 tokens.
tokenizer.enable_truncation(max_length=512)

In [4]:
# Sanity check: encode a short sample of chess moves and inspect the resulting token strings.
tokenizer.encode("e4 c5 d4 Qxf3 O-O").tokens

['<s>', 'e', '4', 'Ġc', '5', 'Ġd', '4', 'ĠQxf', '3', 'ĠO', '-', 'O', '</s>']

### We'll define the following config for the model

In [5]:
from transformers import RobertaModel, RobertaForMaskedLM, RobertaConfig, EncoderDecoderModel

In [6]:
# Encoder and decoder configurations. They share every setting except the number of
# attention heads (12 vs 32) and the decoder flag.
# The active model cell loads a checkpoint with from_pretrained instead, so these configs
# are only referenced by the commented-out alternatives there.
# NOTE(review): num_embeddings does not look like a standard RobertaConfig argument;
# presumably it is just stored as an extra attribute — confirm or drop it.
enc_config = RobertaConfig(
    vocab_size=50_000,
    max_position_embeddings=514,
    num_embeddings=768,
    num_attention_heads=12,
    num_hidden_layers=12,
    type_vocab_size=1,
    hidden_size=768,
)

dec_config = RobertaConfig(
    vocab_size=50_000,
    max_position_embeddings=514,
    num_embeddings=768,
    num_attention_heads=32,
    num_hidden_layers=12,
    type_vocab_size=1,
    hidden_size=768,
    is_decoder=True,
)

Now let's re-create our tokenizer in transformers

In [7]:
from transformers import RobertaTokenizerFast

# Re-create the tokenizer through transformers from the files saved in ./MixedTokens.
# The length limit must be passed as `model_max_length`; `max_length` is a per-call
# encoding argument and does not set the tokenizer's maximum length when given here.
tokenizer = RobertaTokenizerFast.from_pretrained("./MixedTokens", model_max_length=512)

Finally let's initialize our model.

In [10]:
# Disabled alternatives kept for reference: an encoder-decoder model assembled from
# enc_config/dec_config, or a masked-LM model built directly from enc_config.
#from transformers import RobertaForMaskedLM
#encoder = RobertaModel(config=enc_config)
#decoder = RobertaForMaskedLM(config=dec_config)

#model = EncoderDecoderModel(encoder=encoder, decoder=decoder)

#model = RobertaForMaskedLM(config = enc_config)
# Active path: load a masked-LM model from the ./MixedTokens directory, so the configs
# defined earlier are not used by this cell.
# NOTE(review): max_length=512 is passed as an extra from_pretrained keyword; presumably it
# lands on the model config rather than limiting input length — confirm it is needed.
model = RobertaForMaskedLM.from_pretrained("./MixedTokens", max_length=512)

In [11]:
# Report the parameter count of the loaded model (recorded output: 125084240).
model.num_parameters()
# => 125 million parameters

125084240

### Now let's build our training Dataset

We'll build our dataset by applying our tokenizer to our text file.

Here, as we only have one text file, we don't even need to customize our `Dataset`. We'll just use the `LineByLineTextDataset` out-of-the-box.

In [12]:
%%time
from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="merged_large_bal.csv",
    block_size=64,
)
#from datasets import load_dataset
#dataset = load_dataset('csv', data_files=paths[1])

CPU times: user 8min 10s, sys: 24.4 s, total: 8min 34s
Wall time: 1min 51s


Like in the [`run_language_modeling.py`](https://github.com/huggingface/transformers/blob/master/examples/language-modeling/run_language_modeling.py) script, we need to define a data_collator.

This is just a small helper that will help us batch different samples of the dataset together into an object that PyTorch knows how to perform backprop on.

In [13]:
from transformers import DataCollatorForLanguageModeling

# Collator that batches dataset samples for masked-language-model training,
# configured with mlm=True and mlm_probability=0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

### Finally, we are all set to initialize our Trainer

In [14]:
from transformers import Trainer, TrainingArguments

# Training setup: a single epoch with a per-device batch size of 64.
# output_dir is the same ./MixedTokens directory that holds the tokenizer files and the
# checkpoint the model was loaded from, and overwrite_output_dir=True is set for it.
training_args = TrainingArguments(
    output_dir="./MixedTokens",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=64,
    save_steps=10_000,
    save_total_limit=15,
)

# Wire the model, the line-by-line dataset and the MLM collator into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)

You are instantiating a Trainer but Tensorboard is not installed. You should consider installing it.


### Start training

In [None]:
%%time
# Run the training loop configured above; the recorded output logs the loss every 500 steps.
trainer.train()

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/105421 [00:00<?, ?it/s]

{"loss": 2.0006740362644195, "learning_rate": 4.9762855598030754e-05, "epoch": 0.004742888039384942, "step": 500}
{"loss": 2.0613680160045624, "learning_rate": 4.9525711196061506e-05, "epoch": 0.009485776078769884, "step": 1000}
{"loss": 2.0248699731826783, "learning_rate": 4.9288566794092265e-05, "epoch": 0.014228664118154826, "step": 1500}
{"loss": 2.027987641572952, "learning_rate": 4.9051422392123016e-05, "epoch": 0.01897155215753977, "step": 2000}
{"loss": 2.0233216965198517, "learning_rate": 4.881427799015377e-05, "epoch": 0.023714440196924712, "step": 2500}
{"loss": 2.000885869979858, "learning_rate": 4.857713358818452e-05, "epoch": 0.028457328236309653, "step": 3000}
{"loss": 2.0219379873275756, "learning_rate": 4.833998918621527e-05, "epoch": 0.0332002162756946, "step": 3500}
{"loss": 1.9661591556072235, "learning_rate": 4.8102844784246024e-05, "epoch": 0.03794310431507954, "step": 4000}
{"loss": 1.9859060139656066, "learning_rate": 4.786570038227678e-05, "epoch": 0.0426859923

#### 🎉 Save final model (+ tokenizer + config) to disk

In [17]:
# Write the trained model to ./MixedTokens, the directory the fill-mask pipeline below loads from.
trainer.save_model("./MixedTokens")

## Check that the LM actually trained

Aside from looking at the training and eval losses going down, the easiest way to check whether our language model is learning anything interesting is via the `FillMaskPipeline`.

Pipelines are simple wrappers around tokenizers and models, and the 'fill-mask' one will let you input a sequence containing a masked token (here, `<mask>`) and return a list of the most probable filled sequences, with their probabilities.



In [18]:
from transformers import pipeline

# Fill-mask pipeline whose model and tokenizer are both read from ./MixedTokens.
fill_mask = pipeline("fill-mask", model="./MixedTokens", tokenizer="./MixedTokens")

In [23]:
# Ask the model to fill the masked position; the result lists candidate sequences with scores.
fill_mask('i += <mask>')

[{'sequence': '<s> i += 1</s>', 'score': 0.7747066020965576, 'token': 328},
 {'sequence': '<s> i += 2</s>', 'score': 0.02622959017753601, 'token': 788},
 {'sequence': '<s> i += i</s>', 'score': 0.018232006579637527, 'token': 370},
 {'sequence': '<s> i += n</s>', 'score': 0.01438959315419197, 'token': 399},
 {'sequence': '<s> i += j</s>', 'score': 0.010179267264902592, 'token': 1218}]