# <center> Masked Language Modelling

## Load, split, clean the data

In [1]:
from datasets import load_dataset

twitter_data = load_dataset("csv", data_files="data/original_data.csv")
twitter_data

Found cached dataset csv (C:/Users/cayde/.cache/huggingface/datasets/csv/default-e1ad4f0191b5b012/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['tweet_id', 'sentiment', 'content'],
        num_rows: 40000
    })
})

In [2]:
twitter_data["train"][0]

{'tweet_id': 1956967341,
 'sentiment': 'empty',
 'content': '@tiffanylue i know  i was listenin to bad habit earlier and i started freakin at his part =['}

In [3]:
shuffled_data = twitter_data.shuffle(seed=42)
shuffled_data["train"][0]

Loading cached shuffled indices for dataset at C:\Users\cayde\.cache\huggingface\datasets\csv\default-e1ad4f0191b5b012\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-9dc688d02f83d5c9.arrow


{'tweet_id': 1694746590,
 'sentiment': 'neutral',
 'content': '@WayneLiew just need to know what to be cautious about.  being cautious is good.'}

In [4]:
# Create train, test, validation split
split_size = int(0.1 * len(shuffled_data["train"]))
split_data = shuffled_data["train"].train_test_split(test_size=split_size)
validation_data = split_data["test"]
split_data = split_data["train"].train_test_split(test_size=split_size)
split_data["validation"] = validation_data
split_data

DatasetDict({
    train: Dataset({
        features: ['tweet_id', 'sentiment', 'content'],
        num_rows: 32000
    })
    test: Dataset({
        features: ['tweet_id', 'sentiment', 'content'],
        num_rows: 4000
    })
    validation: Dataset({
        features: ['tweet_id', 'sentiment', 'content'],
        num_rows: 4000
    })
})

In [5]:
split_data["validation"][0]

{'tweet_id': 1966338978,
 'sentiment': 'surprise',
 'content': '@RWPhoto Sadly I think I know exactly were you put it--in the expired drawer   Those things have a life span of less than two years'}

In [75]:
import html

twitter_datasets = split_data.map(
    lambda x: {"content": [html.unescape(text) for text in x["content"]]}, batched=True
)

Map:   0%|          | 0/32000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

## Train a new tokenizer

In [80]:
def get_training_corpus():
    dataset = twitter_datasets["train"]
    for start_idx in range(0, len(dataset), 1000):
        samples = dataset[start_idx : start_idx + 1000]
        yield samples["content"]

training_corpus = get_training_corpus()

##### Normalization

In [76]:
from tokenizers import decoders, models, normalizers, pre_tokenizers, processors, trainers, Tokenizer

tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))

In [77]:
tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFD(), normalizers.Lowercase(), normalizers.StripAccents()]
)

In [82]:
print(tokenizer.normalizer.normalize_str("Héllò hôw are ü?"))

hello how are u?


##### Pre-Tokenization

In [83]:
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

In [84]:
tokenizer.pre_tokenizer.pre_tokenize_str("Let's test my pre-tokenizer.")

[('Let', (0, 3)),
 ("'", (3, 4)),
 ('s', (4, 5)),
 ('test', (6, 10)),
 ('my', (11, 13)),
 ('pre', (14, 17)),
 ('-', (17, 18)),
 ('tokenizer', (18, 27)),
 ('.', (27, 28))]

##### Train tokenization model

In [88]:
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.WordPieceTrainer(vocab_size=30000, special_tokens=special_tokens)

In [89]:
tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)

In [90]:
encoding = tokenizer.encode("Let's test this tokenizer.")
print(encoding.tokens)

['let', "'", 's', 'test', 'this', 'tok', '##eni', '##zer', '.']


##### Post-processing

In [91]:
cls_token_id = tokenizer.token_to_id("[CLS]")
sep_token_id = tokenizer.token_to_id("[SEP]")

tokenizer.post_processor = processors.TemplateProcessing(
    single=f"[CLS]:0 $A:0 [SEP]:0",
    pair=f"[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
    special_tokens=[("[CLS]", cls_token_id), ("[SEP]", sep_token_id)],
)

In [92]:
encoding = tokenizer.encode("Let's test this tokenizer.")
print(encoding.tokens)

['[CLS]', 'let', "'", 's', 'test', 'this', 'tok', '##eni', '##zer', '.', '[SEP]']


In [93]:
encoding = tokenizer.encode("Let's test this tokenizer...", "on a pair of sentences.")
print(encoding.tokens)
print(encoding.type_ids)

['[CLS]', 'let', "'", 's', 'test', 'this', 'tok', '##eni', '##zer', '...', '[SEP]', 'on', 'a', 'pair', 'of', 'sentences', '.', '[SEP]']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]


##### Add a decoder

In [95]:
tokenizer.decoder = decoders.WordPiece(prefix="##")

In [96]:
tokenizer.decode(encoding.ids)

"let ' s test this tokenizer... on a pair of sentences."

##### Save tokenizer

In [98]:
tokenizer.save("nlp/tokenizer.json")

In [100]:
tokenizer = Tokenizer.from_file("nlp/tokenizer.json")

In [102]:
from transformers import BertTokenizerFast

wrapped_tokenizer = BertTokenizerFast(tokenizer_object=tokenizer)

## Tokenize the dataset

In [108]:
test_ids = wrapped_tokenizer(split_data["train"][0]["content"])['input_ids']
wrapped_tokenizer.decode(test_ids)

"[CLS] i'm gonna miss all the live comet action tomorrow! i have to go take care of my cousins and they don't have access to the interwebz [SEP]"

In [107]:
def tokenize_function(examples):
    result = wrapped_tokenizer(examples["content"])
    if wrapped_tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result

tokenized_datasets = split_data.map(
    # Remove sentiment classification since our task is different
    tokenize_function, batched=True, remove_columns=["tweet_id", "content", "sentiment"]
)
tokenized_datasets

Map:   0%|          | 0/32000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids'],
        num_rows: 32000
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids'],
        num_rows: 4000
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids'],
        num_rows: 4000
    })
})

## Concatenate and chunk

In [116]:
# Get sample lengths
tokenized_samples = tokenized_datasets["train"][:10]
for idx, sample in enumerate(tokenized_samples["input_ids"]):
    print(f"'>>> Review {idx} length: {len(sample)}'")

'>>> Review 0 length: 34'
'>>> Review 1 length: 11'
'>>> Review 2 length: 18'
'>>> Review 3 length: 28'
'>>> Review 4 length: 17'
'>>> Review 5 length: 19'
'>>> Review 6 length: 21'
'>>> Review 7 length: 27'
'>>> Review 8 length: 13'
'>>> Review 9 length: 13'


In [117]:
concatenated_examples = {
    k: sum(tokenized_samples[k], []) for k in tokenized_samples.keys()
}
total_length = len(concatenated_examples["input_ids"])
print(f"'>>> Concatenated reviews length: {total_length}'")

'>>> Concatenated reviews length: 201'


In [118]:
chunk_size = 128
chunks = {
    k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
    for k, t in concatenated_examples.items()
}

for chunk in chunks["input_ids"]:
    print(f"'>>> Chunk length: {len(chunk)}'")

'>>> Chunk length: 128'
'>>> Chunk length: 73'


In [119]:
def group_texts(examples, chunk_size=128):
    # Concatenate all texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of max_len
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    return result

In [120]:
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

Map:   0%|          | 0/32000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids'],
        num_rows: 5051
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids'],
        num_rows: 654
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids'],
        num_rows: 649
    })
})

In [121]:
tokenizer.decode(lm_datasets["train"][0]["input_ids"])

"i ' m gonna miss all the live comet action tomorrow! i have to go take care of my cousins and they don ' t have access to the interwebz uploading new videos on youuuchuuub! @ monkeymad2 nooooooo!! not the receipt!! don ' t break my heart @ msfeistus i am a child of the digital age i use twitter to ask such questions do not bring logic into this @ hootsboots don ' t worry, the bizarre will find you @ jadeofjades i hope you ' re okay. do what beyonce do..."

## Collate Data

In [122]:
def add_labels(example):
    # Create a new labels column
    return {"labels": example["input_ids"].copy()}

lm_datasets = lm_datasets.map(add_labels, batched=True)
lm_datasets

Map:   0%|          | 0/5051 [00:00<?, ? examples/s]

Map:   0%|          | 0/654 [00:00<?, ? examples/s]

Map:   0%|          | 0/649 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 5051
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 654
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 649
    })
})

In [123]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=wrapped_tokenizer, mlm_probability=0.15, return_tensors='tf')

In [125]:
# This masks individual tokens, we mask by entire word later
samples = [lm_datasets["train"][i] for i in range(3)]
for sample in samples:
    _ = sample.pop("word_ids")

for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {wrapped_tokenizer.decode(chunk)}'")


'>>> [CLS] i'm gonna [MASK] all the live comet [MASK] tomorrow! i [MASK] to go take care of my cousins and they don [MASK] t have access to the interwebz [SEP] [CLS] uploading new videos on [MASK]chuuub! [SEP] [CLS] @ monkeymad [MASK] nooooooo!! not the [MASK] [MASK] don't break my heart [SEP] [CLS] @ msfeistus i am a child of the digital age i use twitter to ask such questions do not bring logic into this [SEP] [CLS] @ hootsboots don't worry, the biz [MASK]e will find you [SEP] [CLS] @ jadeofjades [MASK] hope you'[MASK] okay [MASK] do what beyonce [MASK]... [SEP] [CLS]'

'>>> [MASK] shay1988 lol [MASK] here.... wish there was [MASK] way to microsize [MASK]... lol [SEP] [CLS] @ tessmorris i considered being [MASK] nurse when i [MASK] younger but i really don't think i could [MASK] if a baby died [SEP] [CLS] still bored [MASK] [MASK] long now [MASK] the next tour date [SEP] [CLS] jb on the front of [MASK] x!!!!! wooh contest timeee [SEP] [CLS] i am sad because i broke my super - awesom

In [128]:
import collections
import numpy as np
from transformers.data.data_collator import tf_default_data_collator

def whole_word_masking_data_collator(features, wwm_probability=0.2):
    for feature in features:
        word_ids = feature.pop("word_ids")

        # Create a map between words and corresponding token indices
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                if word_id != current_word:
                    current_word = word_id
                    current_word_index += 1
                mapping[current_word_index].append(idx)

        # Randomly mask words
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        input_ids = feature["input_ids"]
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            word_id = word_id.item()
            for idx in mapping[word_id]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = wrapped_tokenizer.mask_token_id
        feature["label"] = new_labels

    return tf_default_data_collator(features)

In [133]:
# Mask by entire words, not just tokens
samples = [lm_datasets["train"][i] for i in range(2)]
batch = whole_word_masking_data_collator(samples)

for chunk in batch["input_ids"]:
    print(f"\n'>>> {wrapped_tokenizer.decode(chunk)}'")


'>>> [CLS] i'[MASK] [MASK] miss all the [MASK] comet action tomorrow! i have to go take care of my cousins [MASK] they don't have access to the interwebz [SEP] [CLS] uploading new videos on youuuchuuub! [SEP] [CLS] @ monkeymad2 nooooooo!! not the receipt [MASK] don [MASK] t break my heart [SEP] [CLS] @ msfeistus i am a child of the digital age [MASK] [MASK] twitter to ask such questions do not bring [MASK] [MASK] this [SEP] [CLS] [MASK] [MASK] [MASK] don't worry, [MASK] bizarre will [MASK] you [SEP] [CLS] [MASK] jadeofjades [MASK] hope you're [MASK]. [MASK] what [MASK] do... [SEP] [CLS]'

'>>> @ [MASK] [MASK] lol same here [MASK] wish there was a way to microsize everything... lol [SEP] [CLS] @ tessmorris [MASK] [MASK] being a [MASK] when i [MASK] [MASK] but [MASK] really don [MASK] t think i could cope if [MASK] baby died [SEP] [CLS] [MASK] bored but not [MASK] now [MASK] the [MASK] [MASK] date [SEP] [CLS] jb on the front of factor [MASK]!!!!! wooh [MASK] [MASK] [SEP] [CLS] i am sad 

## Load the model to fine-tune

In [139]:
from transformers import TFAutoModelForMaskedLM

model_checkpoint = "distilbert-base-uncased"
model = TFAutoModelForMaskedLM.from_pretrained(model_checkpoint)

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForMaskedLM: ['activation_13']
- This IS expected if you are initializing TFDistilBertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertForMaskedLM were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForMaskedLM for predictions without further training.


In [140]:
model.summary()

Model: "tf_distil_bert_for_masked_lm_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMai  multiple                 66362880  
 nLayer)                                                         
                                                                 
 vocab_transform (Dense)     multiple                  590592    
                                                                 
 vocab_layer_norm (LayerNorm  multiple                 1536      
 alization)                                                      
                                                                 
 vocab_projector (TFDistilBe  multiple                 23866170  
 rtLMHead)                                                       
                                                                 
Total params: 66,985,530
Trainable params: 66,985,530
Non-trainable params: 0
________________________

In [136]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [141]:
tf_train_dataset = model.prepare_tf_dataset(
    lm_datasets["train"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=128,
)

tf_eval_dataset = model.prepare_tf_dataset(
    lm_datasets["validation"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=128,
)

In [144]:
from transformers import create_optimizer
from transformers.keras_callbacks import PushToHubCallback
import tensorflow as tf

num_train_steps = len(tf_train_dataset)
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=1_000,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
model.compile(optimizer=optimizer)

# Train in mixed-precision float16
tf.keras.mixed_precision.set_global_policy("mixed_float16")

model_name = model_checkpoint.split("/")[-1]
callback = PushToHubCallback(
    output_dir=f"{model_name}-finetuned-twitter", tokenizer=tokenizer
)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.
c:\Users\cayde\Code\Python\EmotionRecognition\distilbert-base-uncased-finetuned-twitter is already a clone of https://huggingface.co/cayjobla/distilbert-base-uncased-finetuned-twitter. Make sure you pull the latest changes with `repo.git_pull()`.
