In [12]:
from utils import print_data_stats, load_data, flat_accuracy, format_time, subset_data

import pandas as pd
pd.set_option("styler.format.precision", 3)

import random
random.seed(11)

import torch

if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

LANGS = ["ar","en","es","ru","zh"]
LANGS_MAPPING = {"en":"english","es":"spanish","ru":"russian","ar":"arabic","zh":"chinese"}

data = load_data()

# mBERT
![BERT](https://yashuseth.files.wordpress.com/2019/06/fig1-1.png)
<!-- ![title](https://miro.medium.com/max/1400/0*lBYVNRe1esIXn1qE.png) -->
**BERT**: Bidirectional Encoder Representations for Transformers  
**mBERT**: BERT pre-trained from monolingual corpora in 104 languages

- Commonly used for cross-lingual transfer these days
- [A Primer in BERTology: What we know about how BERT works](https://arxiv.org/abs/2002.12327)
- [How multilingual is Multilingual BERT?](https://arxiv.org/abs/1906.01502)
- [The Illustrated Transformers](http://jalammar.github.io/illustrated-transformer/)
- [Huggingface multilingual models intro](https://huggingface.co/transformers/v2.2.0/multilingual.html)
- Codes below are substantially borrowed from [this blog post](https://mccormickml.com/2019/07/22/BERT-fine-tuning/) by Chris McCormick and Nick Ryan

In [13]:
from transformers import BertForSequenceClassification, BertTokenizer, AdamW
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-uncased", cache_dir="../transformer-models/", do_lower_case=True)
model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-uncased", num_labels = 2, cache_dir="../transformer-models/", output_attentions = False, output_hidden_states = False).to(device)

Downloading:   0%|          | 0.00/872k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/672M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model 

In [14]:
# Get all of the model's parameters as a list of tuples.
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

print('==== Embedding Layer ====\n')

for p in params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== First Transformer ====\n')

for p in params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== Output Layer ====\n')

for p in params[-4:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

The BERT model has 201 different named parameters.

==== Embedding Layer ====

bert.embeddings.word_embeddings.weight                  (105879, 768)
bert.embeddings.position_embeddings.weight                (512, 768)
bert.embeddings.token_type_embeddings.weight                (2, 768)
bert.embeddings.LayerNorm.weight                              (768,)
bert.embeddings.LayerNorm.bias                                (768,)

==== First Transformer ====

bert.encoder.layer.0.attention.self.query.weight          (768, 768)
bert.encoder.layer.0.attention.self.query.bias                (768,)
bert.encoder.layer.0.attention.self.key.weight            (768, 768)
bert.encoder.layer.0.attention.self.key.bias                  (768,)
bert.encoder.layer.0.attention.self.value.weight          (768, 768)
bert.encoder.layer.0.attention.self.value.bias                (768,)
bert.encoder.layer.0.attention.output.dense.weight        (768, 768)
bert.encoder.layer.0.attention.output.dense.bias              

In [15]:
# lang = "en"
for lang in LANGS:
    sample_sentence = random.choice(data[lang]["train"])[0]

    # Print the original sentence.
    print(' Original: ', sample_sentence)

    # Print the sentence split into tokens.
    print('Tokenized: ', tokenizer.tokenize(sample_sentence),"\n")

    # Print the sentence mapped to token ids.
    print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sample_sentence)))

    # Print the encoded sentence (with [CLS], [SEP] appended).
    print('Encoded IDs: ', tokenizer.encode(sample_sentence))

 Original:  مكااااان الفندق بعيد قليلا لكنه كذلك مقبول ومعروف عند التاكسيات وممكن الوصول له عبر الجي بي اس
Tokenized:  ['م', '##كا', '##ا', '##ا', '##ا', '##ان', 'الف', '##ند', '##ق', 'ب', '##عيد', 'ق', '##ليل', '##ا', 'لكنه', 'كذلك', 'م', '##قب', '##ول', 'ومع', '##روف', 'عند', 'ال', '##تا', '##كس', '##يات', 'و', '##مم', '##كن', 'الوصول', 'له', 'عبر', 'ال', '##جي', 'بي', 'اس'] 

Token IDs:  [476, 33416, 10383, 10383, 10383, 10872, 44411, 12585, 12117, 452, 54066, 473, 53667, 10383, 79200, 48665, 476, 50364, 14089, 44936, 59619, 21613, 24177, 17330, 27276, 14651, 479, 89112, 21902, 71713, 17227, 33826, 24177, 39321, 40954, 11844]
Encoded IDs:  [101, 476, 33416, 10383, 10383, 10383, 10872, 44411, 12585, 12117, 452, 54066, 473, 53667, 10383, 79200, 48665, 476, 50364, 14089, 44936, 59619, 21613, 24177, 17330, 27276, 14651, 479, 89112, 21902, 71713, 17227, 33826, 24177, 39321, 40954, 11844, 102]
 Original:  I was looking for banana tempura for dessert and they dont have.
Tokenized:  ['i', '

# Fine-tune mBERT w/ the SA data


In [14]:
import time
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader, RandomSampler

MAX_LEN = 20
NUM_EPOCHS = 2
BATCH_SIZE = 32
def get_optimizer(model, total_steps):
    optim = AdamW(model.parameters(), lr = 5e-5)
    scheduler = get_linear_schedule_with_warmup(optim, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)
    return optim, scheduler


def get_tensordata(sentences, labels):
    input_ids,attention_masks = [], []
    for sentence, label in zip(sentences, labels):
        encoded_dict = tokenizer.encode_plus(sentence, add_special_tokens=True, max_length=MAX_LEN, pad_to_max_length = True, return_attention_mask = True, return_tensors = 'pt')
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = [int((l == "pos")) for l in labels]
    labels = torch.tensor(labels)
    dataset = TensorDataset(input_ids, attention_masks, labels)
    dataloader = DataLoader(
                dataset,  # The training samples.
                sampler = RandomSampler(dataset), # Select batches randomly
                batch_size = BATCH_SIZE # Trains with this batch size.
            )
    return dataloader


def run_model(model, data:dict, lang_train:list, lang_test:str, bool_print=False) -> float:
    
    def train(epoch, bool_valid=True):
        if bool_print:
            print(f'\n======== Epoch {epoch+1} / {NUM_EPOCHS} ========\nTraining...')
        total_train_loss = 0
        t0 = time.time()

        model.train()
        for step, batch in enumerate(data_train):

            # Progress update every 50 batches.
            if bool_print and step % 40 == 0 and not step == 0:
                # Calculate elapsed time in minutes.
                elapsed = format_time(time.time() - t0)

                # Report progress.
                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(data_train), elapsed))
            batch_input_ids = batch[0].to(device)
            batch_att_mask = batch[1].to(device)
            batch_labels = batch[2].to(device)
            model.zero_grad()  
            loss, logits = model(batch_input_ids, 
                                 token_type_ids=None, 
                                 attention_mask=batch_att_mask, 
                                 labels=batch_labels)
            total_train_loss += loss.item()
            loss.backward()
            optim.step()
            scheduler.step()
        avg_train_loss = total_train_loss / len(data_train)
        training_time = format_time(time.time() - t0)

        if bool_print:
            print("\n  Average training loss: {0:.2f}".format(avg_train_loss))
            print("  Training epcoh took: {:}".format(training_time))

        if bool_valid and data_test != None:
            t0 = time.time()
            # Put the model in evaluation mode--the dropout layers behave differently
            # during evaluation.
            model.eval()
            # Tracking variables 
            total_eval_accuracy = 0
            total_eval_loss = 0
            nb_eval_steps = 0

            # Evaluate data for one epoch
            for batch in data_test:

                # Unpack this training batch from our dataloader. 
                # As we unpack the batch, we'll also copy each tensor to the GPU using 
                # the `to` method.
                batch_input_ids = batch[0].to(device)
                batch_att_mask = batch[1].to(device)
                batch_labels = batch[2].to(device)

                # Tell pytorch not to bother with constructing the compute graph during
                # the forward pass, since this is only needed for backprop (training).
                with torch.no_grad(): 
                    (loss, logits) = model(batch_input_ids, 
                                           token_type_ids=batch_att_mask, 
                                           attention_mask=None,
                                           labels=batch_labels)

                # Accumulate the validation loss.
                total_eval_loss += loss.item()

                # Move logits and labels to CPU
                logits = logits.detach().cpu().numpy()
                label_ids = batch_labels.to('cpu').numpy()

                # Calculate the accuracy for this batch of test sentences, and
                # accumulate it over all batches.
                total_eval_accuracy += flat_accuracy(logits, label_ids)


            # Report the final accuracy for this validation run.
            avg_val_accuracy = total_eval_accuracy / len(data_test)

            # Calculate the average loss over all of the batches.
            avg_val_loss = total_eval_loss / len(data_test)

            # Measure how long the validation run took.
            validation_time = format_time(time.time() - t0)
            if bool_print:
                print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
                print("  Validation Loss: {0:.2f}".format(avg_val_loss))
                print("  Validation took: {:}".format(validation_time))
        else:
            avg_val_loss, avg_val_accuracy, validation_time = None, None, None
        log = {
            'epoch': epoch + 1,
            'Training Loss': avg_train_loss,
            'Valid Loss': avg_val_loss,
            'Valid Acc': avg_val_accuracy,
        }
        return log

    sentences_train, y_train = [], []
    for lang in lang_train:
        _sentences, _labels = zip(*data[lang]["train"])
        sentences_train += _sentences
        y_train += _labels
    sentences_test, y_test = zip(*data[lang_test]["test"])
    
    data_train, data_test = get_tensordata(sentences_train, y_train), get_tensordata(sentences_test, y_test)
    
    total_steps = len(data_train) * NUM_EPOCHS
    optim, scheduler = get_optimizer(model, total_steps)
    
    
    training_logs = []
    max_acc = 0
    for epoch in range(NUM_EPOCHS):
        log = train(epoch, bool_valid=True)
        max_acc = max(max_acc, log['Valid Acc'])
        training_logs.append(log)
    return max_acc

In [15]:
res_dict = {}
for lang in LANGS:
    max_acc = run_model(model, data, [lang], lang)
    res_dict[lang] = max_acc
    print(f"train: {lang}, test: {lang}, acc: {max_acc:.3f}")

train: ar, test: ar, acc: 0.841
train: en, test: en, acc: 0.848
train: es, test: es, acc: 0.878
train: ru, test: ru, acc: 0.811
train: zh, test: zh, acc: 0.830


# Zero-shot Cross-lingual Transfer Experiments + Low-resource setting

Now we control the number of training samples and compare how well cross-lingual transfer works

In [124]:
data_sample = subset_data(data)
print_data_stats(data_sample)

Unnamed: 0,#train,#test,train-pos%,test-pos%,sample,label
ar,1333,1145,0.6,0.57,الغرفة رائحتها كريهة جدًا,neg
en,1333,555,0.67,0.72,This place is a must visit!,pos
es,1333,650,0.72,0.68,Perfecto... Como siempre ...,pos
ru,1333,865,0.76,0.68,Спасибо вам огромное.,pos
zh,1333,529,0.57,0.59,看电影也相当爽，,pos


In [None]:
res_subset_dict = {}
for lang_test in LANGS:
    res_subset_dict[lang_test] = {}
    for lang_train in LANGS:   
        max_acc = run_model(model, data_sample, [lang_train], lang_test)
        res_subset_dict[lang_test][lang_train] = max_acc
        print(f"train: {lang_train}, test: {lang_test}, acc: {max_acc:.3f}")
    print("")

In [118]:
display(pd.DataFrame.from_dict(res_dict))

Unnamed: 0,ar,en,es,ru,zh
ar,0.863,0.835,0.882,0.824,0.787
en,0.861,0.849,0.882,0.827,0.808
es,0.861,0.861,0.875,0.833,0.8
ru,0.836,0.849,0.855,0.829,0.802
zh,0.843,0.839,0.848,0.827,0.8


# Activity: Explain your observation from the table

- observation 1:
- observation 2:

# Cross-lingual Transfer (use all data)

In [147]:
res_subset_dict = {}

for lang_test in LANGS:   
    max_acc = run_model(model, data, LANGS, lang_test)
    res_subset_dict[lang_test] = max_acc
    print(f"train: {LANGS}, test: {lang_test}, acc: {max_acc:.3f}")

train: ['ar', 'en', 'es', 'ru', 'zh'], test: ar, acc: 0.852
train: ['ar', 'en', 'es', 'ru', 'zh'], test: en, acc: 0.856
train: ['ar', 'en', 'es', 'ru', 'zh'], test: es, acc: 0.854
train: ['ar', 'en', 'es', 'ru', 'zh'], test: ru, acc: 0.807
train: ['ar', 'en', 'es', 'ru', 'zh'], test: zh, acc: 0.793


train: ar, test: ar, acc: 0.841
train: en, test: en, acc: 0.848
train: es, test: es, acc: 0.878
train: ru, test: ru, acc: 0.811
train: zh, test: zh, acc: 0.830