# Cross Lingual Finetuning on Language Modeling Task
---

In [1]:
!nvidia-smi

Thu May  7 21:53:42 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.82       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   47C    P0    28W / 250W |      0MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [2]:
!pip install transformers



## Import Libraries

In [None]:
import torch
import logging
from pathlib import Path
from functools import partial
from tqdm.notebook import tqdm
from typing import List, Tuple
from torch.utils.data import Dataset, DataLoader, SequentialSampler, RandomSampler
from torch.nn.utils.rnn import pad_sequence

In [None]:
from transformers import (
    AdamW,
    PreTrainedModel,
    PreTrainedTokenizer,
    BertTokenizer,
    BertForMaskedLM,
    get_linear_schedule_with_warmup,
)

## Global Config

In [None]:
device = "cuda"
epochs = 1
batch_size = 4
data_dir = Path("/content/")
cache_dir = Path("/content/cache")
max_grad_norm = 1.0

## Utils

In [None]:
def mask_tokens(inputs: torch.Tensor, tokenizer: PreTrainedTokenizer, mlm_probability=0.15) -> Tuple[torch.Tensor, torch.Tensor]:
    """ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. """

    if tokenizer.mask_token is None:
        raise ValueError(
            "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the --mlm flag if you want to use this tokenizer."
        )

    labels = inputs.clone()
    # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)
    probability_matrix = torch.full(labels.shape, mlm_probability)
    special_tokens_mask = [
        tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
    ]
    probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
    if tokenizer._pad_token is not None:
        padding_mask = labels.eq(tokenizer.pad_token_id)
        probability_matrix.masked_fill_(padding_mask, value=0.0)
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  # We only compute loss on masked tokens

    # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
    inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

    # 10% of the time, we replace masked input tokens with random word
    indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
    random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
    inputs[indices_random] = random_words[indices_random]

    # The rest of the time (10% of the time) we keep the masked input tokens unchanged
    return inputs, labels

# Load Multilingual BERT

In [None]:
model = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased', cache_dir=cache_dir)

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', cache_dir=cache_dir)

# Dutch Dataset

In [None]:
class LineByLineTextDataset(Dataset):
    def __init__(self, tokenizer: PreTrainedTokenizer, file_path: Path, block_size=512):
        assert file_path.is_file(), "Missing data file"
        
        with open(file_path, encoding="utf-8") as f:
            lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]

        self.examples = tokenizer.batch_encode_plus(lines, add_special_tokens=True, max_length=block_size)["input_ids"]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        return torch.tensor(self.examples[i], dtype=torch.long)

In [None]:
def collate(examples: List[torch.Tensor], pad_token_id: int):
    return pad_sequence(examples, batch_first=True, padding_value=pad_token_id)

In [10]:
eval_dataset = LineByLineTextDataset(tokenizer, data_dir / "valid.txt", tokenizer.max_len)
eval_dataset[0]

tensor([  101, 10167, 13610, 38071, 10785, 10574, 35916, 10303, 10223, 49437,
        10415, 88101, 10107, 73918, 11280, 74368, 20817, 10423, 10187, 18011,
        21848, 12027,   119,   102])

In [None]:
eval_sampler = SequentialSampler(eval_dataset)
eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=batch_size,
                             collate_fn=partial(collate, pad_token_id=tokenizer.pad_token_id))

In [12]:
batch = next(iter(eval_dataloader))
batch

tensor([[   101,  10167,  13610,  38071,  10785,  10574,  35916,  10303,  10223,
          49437,  10415,  88101,  10107,  73918,  11280,  74368,  20817,  10423,
          10187,  18011,  21848,  12027,    119,    102],
        [   101,  12076,  28560,  11690,  63790,  27124,  10540,  20085,  15559,
          10136,  10697,  10106,  12313,    119,    102,      0,      0,      0,
              0,      0,      0,      0,      0,      0],
        [   101,  11474,  13790,  10697,  13978,  10134,  10785,  12046, 107241,
          46680,  10877,  10145,  10104,  73708,  10145,  88101,  10107,  73918,
          11280,  74368,  20817,  10107,    119,    102],
        [   101,  10167,  48811,  10368,  10124,  10200,  37695,  11422,  14166,
          84811,    119,    102,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0]])

In [13]:
inputs, labels = mask_tokens(batch, tokenizer)
inputs, labels

(tensor([[   101,    103,    103,  38071,  10785,  10574,    103,  10303,  10223,
           49437,  10415,  88101,  10107,  73918,    103,  74368,  20817,  10423,
           10187,  18011,  21848,  12027,    103,    102],
         [   101,  12076,  28560,  11690,  63790,  27124,  10540,  20085,    103,
           10136,  10697,    103,  12313,    103,    102,      0,      0,      0,
               0,      0,      0,      0,      0,      0],
         [   101,  11474,  13790,  10697,  13978,  10134,  10785,  12046, 107241,
           46680,  10877,  10145,  10104,  73708,  10145,  88101,    103,  73918,
           11280,  74368,  20817,  10107,    119,    102],
         [   101,    103,  48811,  10368,  10124,  10200,  37695,  11422,  14166,
           84811,    119,    102,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0]]),
 tensor([[ -100, 10167, 13610,  -100,  -100,  -100, 35916,  -100,  -100,  -100,
           -100,  -100,  -

# Training

In [None]:
train_dataset = LineByLineTextDataset(tokenizer, data_dir / "train.txt", tokenizer.max_len)

In [None]:
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size,
                              collate_fn=partial(collate, pad_token_id=tokenizer.pad_token_id))

In [None]:
def train(train_dataloader,
          model: PreTrainedModel,
          tokenizer: PreTrainedTokenizer,
          epochs: int,
          batch_size: int,
          adam_epsilon: float = 1e-8,
          warmup_steps: int = 0,
          weight_decay: float = 0.0,
          learning_rate: float = 5e-5) -> Tuple[int, float]:
    model.train()
    model.to(device)

    t_total = len(train_dataloader) * epochs
    model.resize_token_embeddings(len(tokenizer))

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total
    )

    global_step = 0
    tr_loss = 0.0

    model.zero_grad()
    for _ in range(epochs):
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):

            inputs, labels = mask_tokens(batch, tokenizer)
            inputs = inputs.to(device)
            labels = labels.to(device)
            model.train()
            outputs = model(inputs, masked_lm_labels=labels)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            scheduler.step()  # Update learning rate schedule
            model.zero_grad()
            global_step += 1
            
            if global_step%100 == 0:
              print(f"Step:{global_step} Loss:{loss.item()}")

            '''
            model_to_save.save_pretrained(output_dir)
            tokenizer.save_pretrained(output_dir)
            torch.save(args, os.path.join(output_dir, "training_args.bin"))
            logger.info("Saving model checkpoint to %s", output_dir)
            torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
            torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
            logger.info("Saving optimizer and scheduler states to %s", output_dir)
            '''

    return tr_loss

In [17]:
train(train_dataloader, model, tokenizer, epochs, batch_size)

HBox(children=(IntProgress(value=0, description='Iteration', max=20000, style=ProgressStyle(description_width=…

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha)


Step:100 Loss:1.8251819610595703
Step:200 Loss:2.5557920932769775
Step:300 Loss:2.871283531188965
Step:400 Loss:1.977010726928711
Step:500 Loss:2.567967414855957
Step:600 Loss:3.0234196186065674
Step:700 Loss:2.507707357406616
Step:800 Loss:0.2539706826210022
Step:900 Loss:2.0257086753845215
Step:1000 Loss:2.034459352493286
Step:1100 Loss:2.6103057861328125
Step:1200 Loss:3.149667501449585
Step:1300 Loss:1.6349120140075684
Step:1400 Loss:1.558638334274292
Step:1500 Loss:2.168497085571289
Step:1600 Loss:1.5064332485198975
Step:1700 Loss:4.168768882751465
Step:1800 Loss:3.3562839031219482
Step:1900 Loss:2.9333038330078125
Step:2000 Loss:1.5825611352920532
Step:2100 Loss:2.6400489807128906
Step:2200 Loss:1.0358847379684448
Step:2300 Loss:2.6385340690612793
Step:2400 Loss:1.7754117250442505
Step:2500 Loss:1.787164568901062
Step:2600 Loss:2.9130992889404297
Step:2700 Loss:2.7998452186584473
Step:2800 Loss:4.1013054847717285
Step:2900 Loss:3.008906364440918
Step:3000 Loss:2.093066930770874
S

0.0

# Evaluation

In [18]:
model.to(device)
model.eval()
eval_loss = 0.0
nb_eval_steps = 0

for batch in tqdm(eval_dataloader, desc="Evaluating"):
    inputs, labels = mask_tokens(batch, tokenizer)
    inputs = inputs.to(device)
    labels = labels.to(device)

    with torch.no_grad():
        outputs = model(inputs, masked_lm_labels=labels)
        lm_loss = outputs[0]
        eval_loss += lm_loss.mean().item()
    nb_eval_steps += 1
    
eval_loss = eval_loss / nb_eval_steps
perplexity = torch.exp(torch.tensor(eval_loss))

HBox(children=(IntProgress(value=0, description='Evaluating', max=2500, style=ProgressStyle(description_width=…




In [19]:
perplexity

tensor(7.3679)

In [20]:
eval_loss

1.9971271817121654

In [21]:
bertje_model = BertForMaskedLM.from_pretrained("bert-base-dutch-cased")

HBox(children=(IntProgress(value=0, description='Downloading', max=433, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=438869143, style=ProgressStyle(description_…




In [22]:
bertje_tokenizer = BertTokenizer.from_pretrained("bert-base-dutch-cased")

HBox(children=(IntProgress(value=0, description='Downloading', max=241440, style=ProgressStyle(description_wid…




In [None]:
eval_dataset = LineByLineTextDataset(bertje_tokenizer, data_dir / "valid.txt", bertje_tokenizer.max_len)
eval_sampler = SequentialSampler(eval_dataset)
eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=batch_size,
                             collate_fn=partial(collate, pad_token_id=bertje_tokenizer.pad_token_id))

In [24]:
batch = next(iter(eval_dataloader))
batch

tensor([[    1,  3570,   229, 23116, 21829, 13291, 22777,  9910, 16804,  8421,
         15018, 21849, 28920, 21877, 13261,  8578,  2214,    13,     2],
        [    1,  3362,  9635, 10761, 15339, 20579, 25108, 20255, 13644,   266,
            13,     2,     3,     3,     3,     3,     3,     3,     3],
        [    1,  7222,   248, 20255,   251, 22250, 13291, 16348, 29566, 20722,
         10537,  3886, 20722, 21849, 28921,    13,     2,     3,     3],
        [    1,  3570,  2145,   126,   132, 13903, 11130, 17517, 15892, 13182,
         21225,    13,     2,     3,     3,     3,     3,     3,     3]])

In [26]:
inputs, labels = mask_tokens(batch, bertje_tokenizer)
inputs, labels

(tensor([[    1,  3570,   229, 23116, 21829,     4, 22777,  9910, 16804,  8421,
              4, 21849, 28920, 21877, 13261,  8578,     4,    13,     2],
         [    1,  3362,  9635, 10761, 15339, 20579, 25108, 20255, 13644,   266,
              4,     2,     3,     3,     3,     3,     3,     3,     3],
         [    1,     4,   248, 20255,   251, 22250, 13291, 16348, 29566, 20722,
          10537,  3886,     4, 21849, 28921,    13,     2,     3,     3],
         [    1,  3570,  2145,   126,   132, 13903, 11130, 17517, 15892, 13182,
          21225,    13,     2,     3,     3,     3,     3,     3,     3]]),
 tensor([[ -100,  -100,  -100,  -100,  -100, 13291,  -100,  -100, 16804,  -100,
          15018,  -100,  -100,  -100,  -100,  -100,  2214,  -100,  -100],
         [ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
             13,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100],
         [ -100,  7222,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  

In [27]:
bertje_model.to(device)
bertje_model.eval()
eval_loss = 0.0
nb_eval_steps = 0

for batch in tqdm(eval_dataloader, desc="Evaluating"):
    inputs, labels = mask_tokens(batch, bertje_tokenizer)
    inputs = inputs.to(device)
    labels = labels.to(device)

    with torch.no_grad():
        outputs = bertje_model(inputs, masked_lm_labels=labels)
        lm_loss = outputs[0]
        eval_loss += lm_loss.mean().item()
    nb_eval_steps += 1
    
eval_loss = eval_loss / nb_eval_steps
perplexity = torch.exp(torch.tensor(eval_loss))

HBox(children=(IntProgress(value=0, description='Evaluating', max=2500, style=ProgressStyle(description_width=…




In [28]:
perplexity

tensor(18.7426)

In [29]:
eval_loss

2.930801215242967