<a href="https://colab.research.google.com/github/Cheto01/nlp-in-python-tutorial/blob/master/medical_bigbird.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers~=4.6.0
!pip list | grep -E 'transformers|tokenizers'

Collecting transformers~=4.6.0
[?25l  Downloading https://files.pythonhosted.org/packages/d5/43/cfe4ee779bbd6a678ac6a97c5a5cdeb03c35f9eaebbb9720b036680f9a2d/transformers-4.6.1-py3-none-any.whl (2.2MB)
[K     |████████████████████████████████| 2.3MB 4.3MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 20.3MB/s 
Collecting huggingface-hub==0.0.8
  Downloading https://files.pythonhosted.org/packages/a1/88/7b1e45720ecf59c6c6737ff332f41c955963090a18e72acbcbeac6b25e86/huggingface_hub-0.0.8-py3-none-any.whl
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |

# 1. Data preparation

In [None]:
# Download the training corpus (presumably the Esperanto split of OSCAR, judging
# by the URL - TODO confirm). `-c` lets wget resume a partially downloaded file.
!wget -c https://cdn-datasets.huggingface.co/EsperBERTo/data/oscar.eo.txt
# Alternative corpus from the SentencePiece repository, currently disabled:
#!wget https://raw.githubusercontent.com/google/sentencepiece/master/data/botchan.txt

--2021-06-16 09:50:03--  https://cdn-datasets.huggingface.co/EsperBERTo/data/oscar.eo.txt
Resolving cdn-datasets.huggingface.co (cdn-datasets.huggingface.co)... 13.225.93.114, 13.225.93.14, 13.225.93.108, ...
Connecting to cdn-datasets.huggingface.co (cdn-datasets.huggingface.co)|13.225.93.114|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 312733741 (298M) [text/plain]
Saving to: ‘oscar.eo.txt’


2021-06-16 09:50:32 (11.0 MB/s) - ‘oscar.eo.txt’ saved [312733741/312733741]



# Helper functions

## 1. Tokenizer

Here I will use two different tokenizers to compare the results:

*   First, a byte-level BPE tokenizer trained from scratch.
*   Second, a BigBird tokenizer, similar to the one borrowed from RoBERTa.




### 1. ByteLevel tokenizer

In [None]:
%%time 
from pathlib import Path

from tokenizers import ByteLevelBPETokenizer

paths = [str(x) for x in Path(".").glob("**/*.txt")]

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=paths, vocab_size=20358, min_frequency=5, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])

CPU times: user 18min 18s, sys: 3.71 s, total: 18min 21s
Wall time: 9min 29s


In [None]:
!mkdir med_bigbird
tokenizer.save_model("med_bigbird")

['med_bigbird/vocab.json', 'med_bigbird/merges.txt']

### Sentencepiece

This is required if we want to train our tokenizer and save it into a format that will be supported by the model: [spm](https://colab.research.google.com/github/google/sentencepiece/blob/master/python/sentencepiece_python_module_example.ipynb#scrollTo=SUcAbKnRVAv6)

In [None]:
!pip install sentencepiece
import sentencepiece as spm




In [None]:

## Example of user-defined symbols
# Train a small SentencePiece model on the corpus, registering <sep> and <cls>
# as user-defined symbols. The model is written to m_user.model / m_user.vocab.
spm.SentencePieceTrainer.train(
    input='oscar.eo.txt',
    model_prefix='m_user',
    user_defined_symbols=['<sep>', '<cls>'],
    vocab_size=2000,
)

sp_user = spm.SentencePieceProcessor(model_file='m_user.model')

# Ids are reserved in both modes:
# <unk>=0, <s>=1, </s>=2, <sep>=3, <cls>=4
# User-defined symbols are allowed to appear in the input text and are kept
# as single pieces.
print(sp_user.encode_as_pieces('this is a test<sep> hello world<cls>'))
for symbol in ('<sep>', '<cls>'):
    print(sp_user.piece_to_id(symbol))  # 3, then 4
for symbol_id in (3, 4):
    print(f'{symbol_id}=', sp_user.decode_ids([symbol_id]))  # decoded to <sep>, then <cls>

['▁', 'th', 'is', '▁', 'is', '▁a', '▁te', 'st', '<sep>', '▁he', 'll', 'o', '▁', 'w', 'or', 'ld', '<cls>']
3
4
3= <sep>
4= <cls>


Save the tokenizer

### 2. BigBird tokenizer

In [None]:
#@markdown Reference tokenizer: the pretrained BigBird vocabulary, kept as `tokenizer2` for the comparison below
from transformers import BigBirdTokenizer
from transformers import PreTrainedTokenizer

tokenizer2 = BigBirdTokenizer.from_pretrained('google/bigbird-roberta-base')

How to use the new tokenizer 

In [None]:
#@markdown Reload the byte-level BPE tokenizer that was saved to `./med_bigbird`
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

# Rebuild the tokenizer from the two files written by `save_model`:
# the vocabulary first, then the merge rules.
tokenizer = ByteLevelBPETokenizer("./med_bigbird/vocab.json", "./med_bigbird/merges.txt")

In [None]:
# Encode a sample sentence with the newly trained byte-level BPE tokenizer;
# the bare expression displays the resulting Encoding object.
tokenizer.encode("Mi estas Julien.")

Encoding(num_tokens=5, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [None]:
# Same sentence through the pretrained BigBird tokenizer (`tokenizer2`), for comparison.
tokenizer2.encode("Mi estas Julien.")

# Train the language model

In [None]:
# Check whether PyTorch can see a CUDA device (True means a GPU is available).
import torch
torch.cuda.is_available()

False

### model configuration

In [None]:
from transformers import BigBirdConfig

# Architecture of the BigBird encoder that is trained from scratch below.
config = BigBirdConfig(
    # Must equal the vocab_size the byte-level BPE tokenizer was trained with.
    vocab_size=20358,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
    hidden_act='gelu_fast',
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    max_position_embeddings=1024,
    type_vocab_size=2,
    initializer_range=0.02,
    layer_norm_eps=1e-12,
    use_cache=True,
    is_encoder_decoder=False,
    # Special-token ids follow the order in which the tokenizer's special tokens
    # were declared: <s>=0, <pad>=1, </s>=2.
    pad_token_id=1,
    bos_token_id=0,
    eos_token_id=2,
    # The separator of the custom vocabulary is </s> (id 2). The previous value,
    # 66, is the [SEP] id of the pretrained BigBird vocabulary, which is not the
    # vocabulary this model is trained with.
    sep_token_id=2,
    attention_type='block_sparse',
    use_bias=True,
    rescale_embeddings=False,
    # Block-sparse attention needs sequence_length > (5 + 2 * num_random_blocks)
    # * block_size. With block_size=800 that is 8800 tokens, far beyond
    # max_position_embeddings=1024, so the model could never use sparse
    # attention. With 64 the threshold is 704 tokens.
    block_size=64,
    num_random_blocks=3,
    gradient_checkpointing=True,
)

### create our tokenizer in transformers

In [None]:

from transformers import BigBirdTokenizerFast

# Build the fast tokenizer directly from the SentencePiece model file.
# `from_pretrained` expects a model identifier or a directory; pointing it at a
# single file is not supported for tokenizers that define more than one vocab
# file, which is the case here. The length limit is `model_max_length`
# (`max_length` is a per-call encoding argument, not an init argument).
# <cls>/<sep> are the user-defined symbols the SentencePiece model was trained with.
tokenizer = BigBirdTokenizerFast(
    vocab_file="m_user.model",
    cls_token="<cls>",
    sep_token="<sep>",
    model_max_length=1024,
)


In [None]:
from transformers import RobertaTokenizerFast

# Load the byte-level BPE vocabulary (vocab.json + merges.txt) saved in ./med_bigbird.
# `model_max_length` is the documented name of the length limit; `max_len` is its
# deprecated alias. 1024 matches max_position_embeddings in the model config.
tokenizer = RobertaTokenizerFast.from_pretrained("./med_bigbird", model_max_length=1024)

### Initialize the model

Here, it would be interesting to compare pretraining from an existing checkpoint's weights with pretraining from scratch.
For the time being, I would like to try training from scratch, since the vocabulary is also new.

The code below doesn't load any weights. In the future, I will try to instantiate a pretrained model by loading the BigBird checkpoint with `from_pretrained()` ([guide](https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained)).

It could be something like

`model = BigBirdForMaskedLM.from_pretrained('google/bigbird-roberta-base', other parameters) `

In [None]:
from transformers import BigBirdForMaskedLM

# Instantiate the masked-LM model from the configuration only:
# no pretrained weights are loaded, all parameters are freshly initialised.
model = BigBirdForMaskedLM(config)

In [None]:
# Display the total number of parameters of the freshly initialised model.
model.num_parameters()

102681990

### Make our dataset ready for mlm

In [None]:
%%time
from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="./oscar.eo.txt",
    block_size=800,
)



Set the mlm parameters

In [None]:
from transformers import DataCollatorForLanguageModeling

# Batch builder for masked language modeling: with mlm=True it selects tokens
# for masking with probability 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

### Create a trainer function

In [None]:
from transformers import Trainer, TrainingArguments

# Run configuration. Checkpoints go to ./med_bigbird (the directory that also
# holds the tokenizer files); an existing output directory is overwritten.
training_args = TrainingArguments(
    output_dir="./med_bigbird",
    overwrite_output_dir=True,
    num_train_epochs=2,
    per_device_train_batch_size=32,
    # Checkpoint every 10k steps and keep only the two most recent ones.
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
)

# Wire model, data and collator together; no evaluation set is supplied.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)

## Training time

In [None]:
# Start training with the Trainer configured in the previous cell.
trainer.train()

Attention type 'block_sparse' is not possible if sequence_length: 32 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3.Changing attention type to 'original_full'...


Step,Training Loss


Step,Training Loss


In [None]:
def train_bert(train_iter, net, loss, vocab_size, devices, num_steps):
    """Train `net` on MLM + NSP batches for exactly `num_steps` optimizer steps.

    NOTE(review): this function relies on `nn`, `d2l` and `_get_batch_loss_bert`,
    none of which are imported or defined in the visible part of the notebook
    (only `torch` is imported) - confirm where they are expected to come from.

    Args:
        train_iter: Re-iterable yielding 7-tuples ``(tokens_X, segments_X,
            valid_lens_x, pred_positions_X, mlm_weights_X, mlm_Y, nsp_y)`` whose
            elements support ``.to(device)``. It is iterated again from the
            start whenever it is exhausted before `num_steps` is reached.
        net: Model to train; wrapped in ``nn.DataParallel`` over `devices` and
            moved to ``devices[0]``.
        loss: Loss object, forwarded unchanged to ``_get_batch_loss_bert``.
        vocab_size: Vocabulary size, forwarded unchanged to
            ``_get_batch_loss_bert``.
        devices: Non-empty sequence of devices; batches are placed on the first.
        num_steps: Number of optimizer steps to run.

    Raises:
        ValueError: If a complete pass over `train_iter` yields no batch, which
            would otherwise make the outer loop spin forever.
    """
    net = nn.DataParallel(net, device_ids=devices).to(devices[0])
    trainer = torch.optim.Adam(net.parameters(), lr=1e-3)
    step, timer = 0, d2l.Timer()
    animator = d2l.Animator(xlabel='step', ylabel='loss', xlim=[1, num_steps],
                            legend=['mlm', 'nsp'])
    # Sum of masked language modeling losses, sum of next sentence prediction
    # losses, no. of sentence pairs, count
    metric = d2l.Accumulator(4)
    while step < num_steps:
        step_at_pass_start = step
        for tokens_X, segments_X, valid_lens_x, pred_positions_X,\
            mlm_weights_X, mlm_Y, nsp_y in train_iter:
            tokens_X = tokens_X.to(devices[0])
            segments_X = segments_X.to(devices[0])
            valid_lens_x = valid_lens_x.to(devices[0])
            pred_positions_X = pred_positions_X.to(devices[0])
            mlm_weights_X = mlm_weights_X.to(devices[0])
            mlm_Y, nsp_y = mlm_Y.to(devices[0]), nsp_y.to(devices[0])
            trainer.zero_grad()
            timer.start()
            mlm_l, nsp_l, l = _get_batch_loss_bert(
                net, loss, vocab_size, tokens_X, segments_X, valid_lens_x,
                pred_positions_X, mlm_weights_X, mlm_Y, nsp_y)
            l.backward()
            trainer.step()
            metric.add(mlm_l, nsp_l, tokens_X.shape[0], 1)
            timer.stop()
            animator.add(step + 1,
                         (metric[0] / metric[3], metric[1] / metric[3]))
            step += 1
            if step == num_steps:
                # Step budget used up in the middle of a pass; the outer
                # `while` condition is now false as well.
                break
        if step == step_at_pass_start:
            # A whole pass produced no batch: without this check the outer loop
            # would never terminate.
            raise ValueError('train_iter yielded no batches; cannot train')

    if step == 0:
        # num_steps <= 0: nothing was accumulated, so the averages below would
        # divide by zero.
        print('No training steps were run.')
        return

    print(f'MLM loss {metric[0] / metric[3]:.3f}, '
          f'NSP loss {metric[1] / metric[3]:.3f}')
    print(f'{metric[2] / timer.sum():.1f} sentence pairs/sec on '
          f'{str(devices)}')