# *GP task*

### Install tokenizer and feed it the training file

In [2]:
# ! pip install tokenizers==0.9.4
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer

# Gather every file named Train.txt directly under ./Downloads/ as plain strings;
# these paths are handed to the tokenizer trainer further down.
paths = list(map(str, Path("./Downloads/").glob("Train.txt")))

### Initialize the tokenizer and customize the training

In [20]:
# Start from an untrained byte-level BPE tokenizer.
tokenizer = ByteLevelBPETokenizer()

# Learn a 52k-entry vocabulary from the training files, keeping only merges seen
# at least twice and reserving the special tokens listed below.
tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])

# Save files to disk.
# The target folder must exist before save_model is called, so create it here
# (including parents) instead of relying on it having been made by hand.
tokenizer_dir = "./Downloads/doberta/tokenizer/"
Path(tokenizer_dir).mkdir(parents=True, exist_ok=True)
tokenizer.save_model(tokenizer_dir)

['./Downloads/doberta/tokenizer/vocab.json',
 './Downloads/doberta/tokenizer/merges.txt']

### Tokenizer PostProcessing

In [5]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

# Build the (token, id) pairs for the closing and opening special tokens, then
# install a BertProcessing post-processor so encodings are wrapped as
# "<s> ... </s>" (see the sample output in the next cell).
sep_pair = ("</s>", tokenizer.token_to_id("</s>"))
cls_pair = ("<s>", tokenizer.token_to_id("<s>"))
tokenizer._tokenizer.post_processor = BertProcessing(sep_pair, cls_pair)

# Truncate encodings to at most 512 tokens.
tokenizer.enable_truncation(max_length=512)

### Check how the tokenizer works

In [10]:
# Encode a short Arabic sample and display the tokens it is split into.
sample_encoding = tokenizer.encode("السلام عليكم")
sample_encoding.tokens

['<s>', 'Ø§ÙĦØ³', 'ÙĦ', 'Ø§Ùħ', 'ĠØ¹', 'ÙĦ', 'ÙĬÙĥ', 'Ùħ', '</s>']

### Defining the model configuration

In [16]:
# !pip install transformers
from transformers import RobertaConfig

# Model hyper-parameters, gathered in one mapping so they are easy to scan and tune.
config_kwargs = {
    "vocab_size": 52_000,  # same vocabulary size requested when training the tokenizer
    "max_position_embeddings": 514,
    "num_attention_heads": 12,
    "num_hidden_layers": 6,
    "type_vocab_size": 1,
}
config = RobertaConfig(**config_kwargs)

In [46]:
from transformers import RobertaTokenizerFast

# Reload the saved vocab/merges through the transformers fast tokenizer.
# NOTE: this rebinds `tokenizer`; from here on it is the RobertaTokenizerFast,
# not the ByteLevelBPETokenizer trained earlier.
# `model_max_length` is the documented keyword for the maximum sequence length;
# `max_len` is only a legacy fallback.
tokenizer = RobertaTokenizerFast.from_pretrained(
    "./Downloads/doberta/tokenizer/",
    model_max_length=512,
)

### Importing the model with the defined config

In [22]:
from transformers import RobertaForMaskedLM
# Instantiate the masked-LM model straight from the configuration
# (no pretrained checkpoint is loaded here).
model = RobertaForMaskedLM(config)

### Defining the dataset (deprecated)

In [23]:
from transformers import LineByLineTextDataset

# Build the training dataset from the raw text file, tokenized with the
# tokenizer defined earlier and a block size of 128.
dataset_kwargs = {
    "tokenizer": tokenizer,
    "file_path": "./Downloads/Train.txt",
    "block_size": 128,
}
dataset = LineByLineTextDataset(**dataset_kwargs)



In [24]:
from transformers import DataCollatorForLanguageModeling

# Collator for masked-language modelling (mlm=True) with a masking
# probability of 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

### Import and initialize the trainer

In [37]:
from transformers import Trainer, TrainingArguments

# Training run settings: one epoch, batches of 8 per device, a checkpoint every
# 10,000 steps with at most two checkpoints kept, written to the model folder.
training_kwargs = {
    "output_dir": "./Downloads/doberta/model",
    "overwrite_output_dir": True,
    "num_train_epochs": 1,
    "per_device_train_batch_size": 8,
    "save_steps": 10_000,
    "save_total_limit": 2,
}
training_args = TrainingArguments(**training_kwargs)

# Wire the model, training arguments, dataset and collator into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)

## Do not run the following cell again; it was used once for training

In [39]:
%%time
torch.cuda.empty_cache()
# trainer.train()

Step,Training Loss
500,3.957338
1000,3.210505
1500,2.954434
2000,2.748876
2500,2.647091
3000,2.593219
3500,2.531885
4000,2.507774


CPU times: user 1h 31min 29s, sys: 2.5 s, total: 1h 31min 31s
Wall time: 1h 31min 30s


TrainOutput(global_step=4156, training_loss=2.878065702201542)

In [40]:
# Only persist the model if this session actually ran training steps. With the
# training call disabled, saving unconditionally would overwrite the previously
# trained checkpoint with untrained weights.
if trainer.state.global_step > 0:
    trainer.save_model("./Downloads/doberta/model/")
else:
    print("Skipping save: no training steps were run in this session.")

## Testing the model

In [47]:
from transformers import pipeline

# Fill-mask pipeline that loads the model from the saved model directory and
# reuses the tokenizer object already in scope.
fill_mask = pipeline("fill-mask", model="./Downloads/doberta/model/", tokenizer=tokenizer)

Some weights of RobertaModel were not initialized from the model checkpoint at ./Downloads/doberta/model/ and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [51]:
# Ask the pipeline for candidates for the masked position in an Arabic prompt.
prompt = ' انا احب <mask> '
fill_mask(prompt)

[{'sequence': '<s> انا احبا </s>',
  'score': 0.3162265121936798,
  'token': 268,
  'token_str': 'Ø§'},
 {'sequence': '<s> انا احبي </s>',
  'score': 0.2284846305847168,
  'token': 270,
  'token_str': 'ÙĬ'},
 {'sequence': '<s> انا احبَ </s>',
  'score': 0.09527813643217087,
  'token': 261,
  'token_str': 'Ùİ'},
 {'sequence': '<s> انا احبى </s>',
  'score': 0.05870430916547775,
  'token': 297,
  'token_str': 'Ùī'},
 {'sequence': '<s> انا احبِ </s>',
  'score': 0.04745069146156311,
  'token': 262,
  'token_str': 'ÙĲ'}]