In [1]:
import torch
import string
from datasets import load_from_disk, load_dataset
from transformers import BertForMaskedLM, BertTokenizer
from transformers import AdamW
from tqdm import tqdm
from multigpu import main
import torch.multiprocessing as mp

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
MODEL = "bert-base-multilingual-cased"
tokenizer = BertTokenizer.from_pretrained(MODEL)
model = BertForMaskedLM.from_pretrained(MODEL)
MAX_LEN = 128
CLS = 101
SEP = 102
MASKED = 103
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 2
LEARNING_RATE = 1e-5
EPOCHS = 3
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

valid_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }
device = torch.device('cuda')

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Decide dataset and masking strategy

In [3]:
available_datasets = {
    'carolina': 0,
    'psa_small': 0, # portuguese sentiment analysis small
    'psa_full': 0 # portuguese sentiment analysis full
}
available_masking_strategies = {
    'end_token': 0,
    'random_tokens': 0
}

Choose:

In [4]:
chosen_dataset = 'carolina'
chosen_masking_strategy = 'end_token'

In [5]:
path=f'{chosen_dataset}/{chosen_masking_strategy}/'

# Helpers

### Dataset helpers

In [6]:
def carolina_dataset():
    """
    Returns a dictionary {'train', 'test'}, where
    each one of the keys is a list of  strings.
    """
    corpus_carolina = load_dataset("carolina-c4ai/corpus-carolina", taxonomy="wik", revision="v1.2")
    sentences_carolina = corpus_carolina['corpus']['text']
    sep_carolina = int(0.7*len(sentences_carolina))
    train = sentences_carolina[:sep_carolina]
    test = sentences_carolina[sep_carolina:]

    return {
        'train': train,
        'test': test
    }

In [7]:
def psa_full_dataset():
    """
    Returns a dictionary {'train', 'test'}, where
    each one of the keys is a list of  strings.
    """
    portuguese_sentiment_analysis = load_dataset('jvanz/portuguese_sentiment_analysis')
    train = portuguese_sentiment_analysis['train']['review_text_processed']
    test = portuguese_sentiment_analysis['test']['review_text_processed']
    return {
        'train': train,
        'test': test
    }

In [8]:
def psa_small_dataset():
    path='../../data/portuguese_sentiment_analysis/'
    return_dic = {}
    
    for type in ['train', 'test']:
        data = load_from_disk(path+type)
        return_dic[type] = data['review_text_processed']

    return return_dic

In [9]:
available_datasets['carolina'] = carolina_dataset
available_datasets['psa_full'] = psa_full_dataset
available_datasets['psa_small'] = psa_small_dataset 

### Masking helpers

In [10]:
def assure_CLS_and_SEP_false(id_tensor, true_false_tensor):
    """
    Receives a tensor of ids and a tensor of masking, with
    True and False values. Returns a copy of the true_false_tensor
    where every CLS and SEP position is for sure set to False.

    Doesn't mutate any input value and it's free of aliasing.
    """
    ans = true_false_tensor.detach().clone()
    ans = ans & (id_tensor != CLS) & (id_tensor != SEP)
    return ans

In [11]:
def mask_random_positions(tensor, k = 0.15):
    """
    Receives a tensor and returns
    a new tensor of the same shape randomly filled
    with True's at the rate k.
    """
    shape = tensor.shape
    rand = torch.rand(shape)
    return (rand < k)

In [12]:
def mask_end_positions(tensor_ids):
  """
  Receives a tensor and returns
  a tensor of the same shape filled with False's
  and a single True, at the end.
  """
  shape = tensor_ids.shape
  masks = torch.zeros(shape) != 0

  for i in range(0, shape[0]):
    for j in range(0, masks[i].shape[0]):
      if tensor_ids[i][j] == SEP:
        masks[i][j-1] = True
  return masks

In [13]:
def mask_nothing(tensor):
    """
    Receives a tensor and returns a tensor
    of the same shape filled with False's.
    """
    shape = tensor.shape
    return torch.full(shape, False)

In [14]:
def mask(masking_strategy, tensor):
    shape = tensor.shape
    mask_arr = masking_strategy(tensor)
    mask_arr = assure_CLS_and_SEP_false(tensor, mask_arr)
    rows = shape[0]

    def indices_to_mask(idx):
        """
        Returns a list containing all positions in the
        idx-th row of mask_arr that have to be masked.

        0 <= idx < rows has to be satisfied.
        """
        to_mask_positions = mask_arr[idx].nonzero()
        return torch.flatten(to_mask_positions).tolist()

    for i in range(rows):
        selection = indices_to_mask(i)
        tensor[i, selection] = MASKED

In [15]:
def save_labels_and_ids(sentences, path, masking_strategy):
    sentences = [sentence for sentence in sentences if sentence]
    
    inputs = tokenizer(sentences, return_tensors = 'pt', max_length = MAX_LEN, \
                        padding = True, truncation = True)
        
    inputs['labels'] = inputs['input_ids'].detach().clone()

    mask(masking_strategy, inputs['input_ids'])

    torch.save({
        'labels': inputs['labels'],
        'input_ids': inputs['input_ids'],
        'attention_mask': inputs['attention_mask']
    }, f'{path}')

In [16]:
available_masking_strategies['end_token'] = mask_end_positions 
available_masking_strategies['random_tokens'] = mask_random_positions

### Utils

In [17]:
def send_to_device(arg):
    return arg.to(device)

# Putting everything together

In [18]:
"""
get_dataset_function = available_datasets[chosen_dataset]
get_masking_strategy_function = available_masking_strategies[chosen_masking_strategy]

dataset = get_dataset_function()

train = dataset['train']
test = dataset['test']

# for type, sentences in [('train', train), ('test', test)]:
#     print(f'{type}:')
#     print(f'# sentences: {len(sentences)}')
#     print(f'small sample: {sentences[:3]}\n')

#     save_labels_and_ids(sentences, f'{chosen_dataset}/{chosen_masking_strategy}/{type}_tokens', \
#                         get_masking_strategy_function)
"""

"\nget_dataset_function = available_datasets[chosen_dataset]\nget_masking_strategy_function = available_masking_strategies[chosen_masking_strategy]\n\ndataset = get_dataset_function()\n\ntrain = dataset['train']\ntest = dataset['test']\n\n# for type, sentences in [('train', train), ('test', test)]:\n#     print(f'{type}:')\n#     print(f'# sentences: {len(sentences)}')\n#     print(f'small sample: {sentences[:3]}\n')\n\n#     save_labels_and_ids(sentences, f'{chosen_dataset}/{chosen_masking_strategy}/{type}_tokens', #                         get_masking_strategy_function)\n"

In [19]:
def create_dataloader(path, params):

    class MyDataset(torch.utils.data.Dataset):
        def __init__(self, input):
            self.input = input

        def __getitem__(self, idx):
            return { key: torch.tensor(val[idx]) for key, val in self.input.items() }

        def __len__(self):
            return len(self.input['input_ids'])

    """
    # uncomment the following lines if you don't want
    # to sanity check
    amount_to_print = 5
    print(f"printing first {amount_to_print}:") 
    for id, label in zip(inputs['input_ids'][:amount_to_print], inputs['labels'][:amount_to_print]):
        print(f"id: {''.join(tokenizer.decode(id))}")
        print(f"label: {''.join(tokenizer.decode(label))}")
    """

    inputs = torch.load(path) 
    return torch.utils.data.DataLoader( MyDataset(inputs), **params)

In [20]:
masking_strategy = available_masking_strategies[chosen_masking_strategy]
train_dataloader = create_dataloader(path+'train_tokens', train_params)

### Training

We'll implement early stopping, that will print the best model if the next "early_stopping_patience" models are worse than it.

In [21]:
best_loss = 100 # best loss that the current best model yielded
best_model_state = -1 # the best model so far
early_stopping_patience = 2 # number of models we have to check to decide a best model

In [22]:
model = send_to_device(model)

In [23]:
opt = torch.optim.AdamW(model.parameters(), lr = LEARNING_RATE)

In [24]:
def run_epoch(num_epoch=0):
    """ 
    Returns the loss yielded by the model's current state.
    Modifies the state of the model, leaving the state
    that yields the returned loss.

    Receives a num_epoch that will be printed in the progress
    feedback.
    """
    loop = tqdm(train_dataloader, leave=True)
    average_loss = 0
    
    for batch in loop:
        opt.zero_grad()

        input_ids = batch['input_ids']
        labels =batch['labels']
        attention_mask = batch['attention_mask']

        input_ids = send_to_device(input_ids)
        labels = send_to_device(labels)
        attention_mask = send_to_device(attention_mask)

        output = model(input_ids, labels=labels, attention_mask = attention_mask)
        
        loss = output.loss
        average_loss += loss.item()
        loss.backward()
        opt.step()
       
        loop.set_description(f'Epoch {num_epoch}')
        loop.set_postfix(loss = loss.item())

    qtd = len(train_dataloader)

    return average_loss/qtd

In [25]:
early_patience_counter = 0
run_epochs = 0

while early_patience_counter < early_stopping_patience:
    run_epochs += 1
    
    loss = run_epoch(run_epochs)
    
    if loss < best_loss:
        best_loss = loss 
        best_model = model.state_dict()
        early_patience_counter = 0
        torch.save(best_model, path+f'model_at_epoch_{run_epochs}')
    else:
        early_patience_counter += 1

    print(f'Current loss is {loss}, and best loss is {best_loss}')

#torch.save(best_model, path+'best_model')


  return { key: torch.tensor(val[idx]) for key, val in self.input.items() }
Epoch 1:   0%|          | 5/21004 [00:02<2:21:02,  2.48it/s, loss=1.28] 


KeyboardInterrupt: 

In [None]:
torch.save(best_model, path+'best_model')

### Accuracy

In [25]:
from next_sentence_prediction.masked_llm.next_word_algorithms.priority_queue.predictions import get_all_predictions
model.to('cpu')
get_all_predictions('Hoje eu vou comer', tokenizer, model, 2, verbose=True)

Current biggest prob eh 0.5018615126609802 com palavra .
Size of the heap None
Printing heap:
 e has prob 0.00496727554127574

Found a new completion Hoje eu vou comer
Current biggest prob eh 0.00496727554127574 com palavra  e
Size of the heap None
Printing heap:
 de has prob 0.004598874598741531

Current biggest prob eh 0.004598874598741531 com palavra  de
Size of the heap None
Printing heap:
 comer has prob 0.0037914386484771967
 e cantar has prob 0.0003103567582770604

Current biggest prob eh 0.0037914386484771967 com palavra  comer
Size of the heap None
Printing heap:
o has prob 0.0034122997894883156
 de lá has prob 0.00018149149761011174
 e cantar has prob 0.0003103567582770604

Current biggest prob eh 0.0034122997894883156 com palavra o
Size of the heap None
Printing heap:
 com has prob 0.0032536862418055534
 comer. has prob 0.0018488998292404482
 e cantar has prob 0.0003103567582770604
 de lá has prob 0.00018149149761011174

Current biggest prob eh 0.0032536862418055534 com pala

['o', '']

In [None]:
def remove_ponctuation(sentence):
  ponctuation_array = ['.', '!', '?', ';', '...']

  chars = [char for char in sentence]

  while chars and chars[-1] in ponctuation_array:
    chars = chars[:-1]

  sentence = ''.join(chars)

  return sentence

In [None]:
raw_sentences_no_points = [ remove_ponctuation(sentence) for sentence in raw_sentences ]

In [None]:
def remove_last_word(sentence):
  """
  Receives a sentence and return a tuple where the first
  element is the sentence without the last word and the
  second element is the last word itself.
  """
  words = sentence.split()
  return " ".join(words[:-1]), words[-1]

In [None]:
def add_masked_token(sentence):
    return sentence + " <mask>"

In [None]:
sentences = [sentence for sentence in raw_sentences_no_points if sentence.split()]

In [None]:
desired_acc = 15

In [None]:
# for visuals
loop = tqdm(sentences, leave=True)

processed_up_to_now = 0

accuracies = [0] * desired_acc

for sentence in loop:

    last_word_removed, last_word = remove_last_word(sentence)
    last_word_removed = add_masked_token(last_word_removed)

    try:
        bert = get_all_predictions(last_word_removed, desired_acc)
    except:
        # if i fell here, the sentence reached the maximum amount
        # of tokens, so we'll skip it
        continue
    if last_word in bert:
        ix = bert.index(last_word)
        accuracies[ix] += 1

    processed_up_to_now += 1


  1%|          | 87/14913 [00:00<02:20, 105.81it/s]

100%|██████████| 14913/14913 [02:25<00:00, 102.22it/s]


In [None]:
file = open(path+'accuracy.txt', 'w')

In [None]:
up_to_now = 0
for top, acc_val in enumerate(accuracies):
    up_to_now += acc_val 
    percentage = (up_to_now/processed_up_to_now)*100
    str = f"{top+1}: {percentage}%"
    file.write(str+'\n')
    print(str)
file.close()

1: 13.696047404215204%
2: 17.487037909905055%
3: 19.37243283280587%
4: 21.129890243081274%
5: 22.33519628307858%
6: 23.304827957713286%
7: 24.02531816039324%
8: 24.685206383408527%
9: 25.32489394653559%
10: 25.850111103629388%
11: 26.32819338765066%
12: 26.765874351895498%
13: 27.041950037034546%
14: 27.318025722173587%
15: 27.540232980944047%
