# Summary
In this notebook we embed names using RoBERTa. 

In [6]:
import os
# Move the working directory one level up so that the relative paths used in
# this notebook (e.g. './models/') resolve against the parent directory.
# NOTE: the change is relative to the current directory, so re-running this
# cell moves up one additional level each time it is executed.
os.chdir('../')

# Tokenizer settings, grouped by the stage that consumes them.
tokenizer_config = dict(
    # --- BPE training ---
    vocab_size=5000,
    min_frequency=2,
    # --- encoding (these names mirror the keyword arguments of `encode_plus`) ---
    add_special_tokens=True,
    pad_to_max_length=True,
    return_attention_mask=True,
    return_tensors='pt',
    # --- storage: directory under which the tokenizer files are kept ---
    tokenizer_saving_path='./models/',
)
# experiments are going to be saved as follows:
# |- models
# |---- experiment_1
# |---- | ---- tokenizer
# |---- | ---- model
# |---- | config.json
# ...
# 



# Training hyper-parameters.
# Schedule / batching
TRAIN_EPOCHS = 10
TRAIN_BATCH_SIZE = 512
VALID_BATCH_SIZE = 128

# Optimiser
LEARNING_RATE = 5e-4
WEIGHT_DECAY = 1e-4

# Input handling
MAX_LEN = 30  # presumably the maximum number of tokens per name -- TODO confirm
mlm_probability = 0.15  # presumably the masking ratio for masked-LM training -- TODO confirm

# Model hyper-parameters for RoBERTa.
# Embedding tables
vocab_size = 5000  # same value as tokenizer_config['vocab_size']; keep the two in sync
type_vocab_size = 1
max_position_embeddings = 32  # NOTE(review): looks like MAX_LEN + 2 -- confirm the intended offset

# Transformer stack
num_hidden_layers = 2
num_attention_heads = 4


In [3]:
import torch
import transformers

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from transformers import RobertaConfig, RobertaForMaskedLM, RobertaTokenizer, DebertaTokenizer, BertTokenizer
from tokenizers import ByteLevelBPETokenizer

# from datasets import load_metric, Dataset
from torch.utils.data import Dataset
import torch


# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Train a tokenizer
class BPE_Based_Tokenizer():
    """Train, save and reload a byte-level BPE tokenizer for one model family.

    Typical use: construct with a config, call `train_and_save` once to write
    the tokenizer files, then `load_and_wrap_tokenizer` to swap the raw
    tokenizer for the model-specific wrapper, and finally `encode_plus`.

    Recognised config entries (all optional):
        vocab_size, min_frequency      -- forwarded to BPE training when set
        tokenizer_saving_path          -- directory holding vocab.json / merges.txt
                                          (`saving_path` is accepted as a fallback key)
        model_type                     -- one of SUPPORTED_MODEL_TYPES; defaults to
                                          'RoBERTa', the model this notebook targets
        max_len                        -- `max_length` used by `encode_plus`
        add_special_tokens, pad_to_max_length,
        return_attention_mask, return_tensors
                                       -- forwarded to `encode_plus`
    """

    # Model families for which `load_and_wrap_tokenizer` has a wrapper class.
    SUPPORTED_MODEL_TYPES = ('BERT', 'RoBERTa', 'DeBERTa')

    @staticmethod
    def _read_config(config, key, default=None):
        """Return `key` from a dict-like or attribute-style config, else `default`."""
        if hasattr(config, 'get'):
            value = config.get(key)
            if value is not None:
                return value
        return getattr(config, key, default)

    def __init__(self, config):
        """Read the settings from `config` and create an untrained BPE tokenizer.

        Args:
            config: mapping (e.g. a dict) or attribute-style object with the
                entries listed in the class docstring.

        Raises:
            ValueError: if the model type is not in SUPPORTED_MODEL_TYPES.
        """
        read = self._read_config

        # Validate first so that a bad config fails before any work is done.
        self.model_type = read(config, 'model_type', 'RoBERTa')
        if self.model_type not in self.SUPPORTED_MODEL_TYPES:
            raise ValueError("Model type must be in ['BERT', 'RoBERTa', 'DeBERTa']!")

        # Training settings (string keys; the original looked up bare names).
        self.vocab_size = read(config, 'vocab_size')
        self.min_frequency = read(config, 'min_frequency')
        self.saving_path = read(config, 'tokenizer_saving_path',
                                read(config, 'saving_path'))

        # Encoding settings consumed by `encode_plus`.
        self.max_len = read(config, 'max_len')
        self.add_special_tokens = read(config, 'add_special_tokens', True)
        self.pad_to_max_length = read(config, 'pad_to_max_length', True)
        self.return_attention_mask = read(config, 'return_attention_mask', True)
        self.return_tensors = read(config, 'return_tensors', 'pt')

        self.tokenizer = ByteLevelBPETokenizer()

    def _require_saving_path(self):
        """Return the tokenizer directory, raising ValueError when none is configured."""
        if not self.saving_path:
            raise ValueError(
                "No tokenizer directory configured: set 'tokenizer_saving_path' in the config.")
        return self.saving_path

    def train_and_save(self, training_names):
        """Train the BPE tokenizer on `training_names` and write it to disk.

        Args:
            training_names: iterable of strings used as the training corpus.

        Raises:
            ValueError: if no saving directory was configured.
        """
        target_dir = self._require_saving_path()

        # Only forward the settings that were configured, so that unset values
        # fall back to the trainer's own defaults instead of being passed as None.
        train_kwargs = {}
        if self.vocab_size is not None:
            train_kwargs['vocab_size'] = self.vocab_size
        if self.min_frequency is not None:
            train_kwargs['min_frequency'] = self.min_frequency

        self.tokenizer.train_from_iterator(
            training_names,
            show_progress=True,
            special_tokens=[
                "<s>",
                "<pad>",
                "</s>",
                "<unk>",
                "<mask>",
            ],
            **train_kwargs)

        # makedirs + exist_ok: nested paths work and re-running does not crash.
        os.makedirs(target_dir, exist_ok=True)
        self.tokenizer.save_model(target_dir)

    def load_and_wrap_tokenizer(self):
        """
        Load the saved tokenizer files and wrap them in the model-specific tokenizer.

        The wrapper expands the tokenizer's functionality according to the
        model and allows additional things, like `encode_plus`. It replaces
        `self.tokenizer`; nothing is returned.

        Raises:
            ValueError: if the model type is unsupported or no saving
                directory was configured.
        """
        if self.model_type not in self.SUPPORTED_MODEL_TYPES:
            raise ValueError("Model type must be in ['BERT', 'RoBERTa', 'DeBERTa']!")
        source_dir = self._require_saving_path()

        # NOTE(review): BertTokenizer is normally built from a WordPiece vocab
        # file; feeding it BPE vocab.json/merges.txt is kept from the original
        # code and should be verified before relying on the 'BERT' branch.
        wrappers = {
            'BERT': BertTokenizer,
            'RoBERTa': RobertaTokenizer,
            'DeBERTa': DebertaTokenizer,
        }
        self.tokenizer = wrappers[self.model_type](
            vocab_file=os.path.join(source_dir, 'vocab.json'),
            merges_file=os.path.join(source_dir, 'merges.txt'))

    def encode_plus(self, x):
        """Encode `x` with the wrapped tokenizer using the configured settings.

        Call `load_and_wrap_tokenizer` first: `encode_plus` is provided by the
        model-specific wrapper.

        Args:
            x: the text to encode.

        Returns:
            Whatever the wrapped tokenizer's `encode_plus` returns.
        """
        # NOTE(review): `pad_to_max_length` is kept for compatibility with the
        # original call; newer transformers releases prefer `padding='max_length'`.
        return self.tokenizer.encode_plus(
            x,
            max_length=self.max_len,
            add_special_tokens=self.add_special_tokens,
            pad_to_max_length=self.pad_to_max_length,
            return_attention_mask=self.return_attention_mask,
            return_tensors=self.return_tensors)



  from .autonotebook import tqdm as notebook_tqdm
