## Pre training BERT

**Author:** Hari 
<br>
**Date:** 7th July 2022
<br>
**Context:** Trialing code for masked language modelling and whole world masking
<br>
**Objective:** Pre train Bert according to our own data


Loading the base distillbert model


In [1]:
from transformers import TFAutoModelForMaskedLM

model_checkpoint = "distilbert-base-uncased"
model = TFAutoModelForMaskedLM.from_pretrained(model_checkpoint)

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForMaskedLM: ['activation_13']
- This IS expected if you are initializing TFDistilBertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertForMaskedLM were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForMaskedLM for predictions without further training.


Importing the tokenizer


In [2]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Importing the text file and loading it to a dataset

In [3]:
from datasets import load_dataset
path = 'C:/Users/Hari Shiman/Desktop/Data/text/trial.txt'

dataset = load_dataset('text', data_files=path)

Using custom data configuration default-52069c1c973a81bd
Reusing dataset text (C:\Users\Hari Shiman\.cache\huggingface\datasets\text\default-52069c1c973a81bd\0.0.0\4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8)


  0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing each word

In [4]:
def tokenize_function(examples):
    result = tokenizer(examples["text"])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result


# Use batched=True to activate fast multithreading!
tokenized_datasets = dataset.map(
    tokenize_function, batched=True, remove_columns=["text"]
)
tokenized_datasets

Loading cached processed dataset at C:\Users\Hari Shiman\.cache\huggingface\datasets\text\default-52069c1c973a81bd\0.0.0\4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8\cache-4add6e5fa01c2f49.arrow


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 50771
    })
})

Determine chunk size

In [6]:
chunk_size = 128

Combine all the sentences and split them up into the groups of the above mentioned chunk size


In [5]:
def group_texts(examples):
    # Concatenate all texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of max_len
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result

In [7]:
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

Loading cached processed dataset at C:\Users\Hari Shiman\.cache\huggingface\datasets\text\default-52069c1c973a81bd\0.0.0\4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8\cache-f2d81eea359cfab9.arrow


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 115653
    })
})

Mask the words (Masked Language Modelling)


In [8]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

Function for whole word masking (will be used later to compare)

In [13]:
import collections
import numpy as np


from transformers.data.data_collator import tf_default_data_collator

wwm_probability = 0.2


def whole_word_masking_data_collator(features):
    for feature in features:
        word_ids = feature.pop('word_ids')

        # Create a map between words and corresponding token indices
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                if word_id != current_word:
                    current_word = word_id
                    current_word_index += 1
                mapping[current_word_index].append(idx)

        # Randomly mask words
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        input_ids = feature["input_ids"]
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            word_id = word_id.item()
            for idx in mapping[word_id]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id

    return tf_default_data_collator(features)

Split the dataset we have into train and test

In [10]:
train_size = 10000
test_size = int(0.1 * train_size)

downsampled_dataset = lm_datasets["train"].train_test_split(
    train_size=train_size, test_size=test_size, seed=42
)
downsampled_dataset

Loading cached split indices for dataset at C:\Users\Hari Shiman\.cache\huggingface\datasets\text\default-52069c1c973a81bd\0.0.0\4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8\cache-1884b3c79ac0b68b.arrow and C:\Users\Hari Shiman\.cache\huggingface\datasets\text\default-52069c1c973a81bd\0.0.0\4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8\cache-3a5e642cc7acf4d0.arrow


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1000
    })
})

Choose between masked language modelling and whole word masking to apply to the datasets

In [14]:
fn = data_collator

tf_train_dataset = downsampled_dataset["train"].to_tf_dataset(
    columns=["input_ids", "attention_mask", "labels",'word_ids'],
    collate_fn=fn,
    shuffle=True,
    batch_size=32,
)

tf_eval_dataset = downsampled_dataset["test"].to_tf_dataset(
    columns=["input_ids", "attention_mask", "labels",'word_ids'],
    collate_fn=fn,
    shuffle=False,
    batch_size=32,
)

KeyError: 'word_ids'

Trainer

In [None]:
from transformers import create_optimizer
import tensorflow as tf

num_train_steps = len(tf_train_dataset)
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=1_000,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
model.compile(optimizer=optimizer)

# Train in mixed-precision float16
tf.keras.mixed_precision.set_global_policy("mixed_float16")

Before fitting the model, perplexity results

In [12]:
import math

eval_loss = model.evaluate(tf_eval_dataset)
print(f"Perplexity: {math.exp(eval_loss):.2f}")

NameError: name 'tf_eval_dataset' is not defined

After fitting the model, the perplexity results

In [None]:
model.fit(tf_train_dataset, validation_data=tf_eval_dataset)
eval_loss = model.evaluate(tf_eval_dataset)
print(f"Perplexity: {math.exp(eval_loss):.2f}")

Save the model


In [None]:
model_name = 'trial1'
model.save_pretrained(model_name, saved_model=True)

Load the model

In [None]:
from transformers import pipeline
from tensorflow import keras
new_model = TFAutoModelForMaskedLM.from_pretrained(model_name)
new_model.summary()

Test the model

In [None]:
from transformers import pipeline

mask_filler = pipeline(
    "fill-mask", model=new_model,tokenizer=tokenizer
)

text = "top up fuel [MASK]"
preds = mask_filler(text)

for pred in preds:
    print(f">>> {pred['sequence']}")