In [1]:
from transformers import TFAutoModelForMaskedLM
# Checkpoint name; reused below when the matching tokenizer is loaded.
pre_model="distilbert-base-uncased"
# Load the checkpoint as a TensorFlow masked-language model.
model=TFAutoModelForMaskedLM.from_pretrained(pre_model)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
All PyTorch model weights were used when initializing TFDistilBertForMaskedLM.

All the weights of TFDistilBertForMaskedLM were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForMaskedLM for predictions without further training.


In [2]:
# Print the model's layers and their parameter counts.
model.summary()

Model: "tf_distil_bert_for_masked_lm"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMa  multiple                  66362880  
 inLayer)                                                        
                                                                 
 vocab_transform (Dense)     multiple                  590592    
                                                                 
 vocab_layer_norm (LayerNor  multiple                  1536      
 malization)                                                     
                                                                 
 vocab_projector (TFDistilB  multiple                  23866170  
 ertLMHead)                                                      
                                                                 
Total params: 66985530 (255.53 MB)
Trainable params: 66985530 (255.53 MB)
Non-trainable params: 0 (0.00 

In [3]:
# Example sentence containing one [MASK] slot; used for the fill-mask demos below.
text="This is a great [MASK]."

In [4]:
from transformers import AutoTokenizer
# Load the tokenizer from the same checkpoint name as the model.
tokenizer=AutoTokenizer.from_pretrained(pre_model)

In [5]:
import numpy as np
import tensorflow as tf
# Tokenize the example sentence (NumPy outputs) and take the logits from the model output.
inputs=tokenizer(text,return_tensors="np")
token_logits=model(inputs).logits

# Find the location of the mask token and extract its logits.
# argwhere returns one (row, column) pair per match; [0,1] is the column of the first match.
mask_tokens_index=np.argwhere(inputs["input_ids"]==tokenizer.mask_token_id)[0,1]
mask_token_logits=token_logits[0,mask_tokens_index,:]

# Negating the logits makes argsort list the highest-scoring token ids first; keep five.
top_5_tokens=np.argsort(-mask_token_logits)[:5].tolist()

# Print the sentence once per candidate, with the mask token replaced by the decoded candidate.
for token in top_5_tokens:
  print(f">>>{text.replace(tokenizer.mask_token,tokenizer.decode([token]))}")

>>>This is a great deal.
>>>This is a great success.
>>>This is a great adventure.
>>>This is a great idea.
>>>This is a great feat.


In [6]:
# to showcase domain adaptation we use imdb dataset
!pip install datasets
from datasets import load_dataset



In [7]:
# Load the "imdb" dataset; the bare name on the last line displays its splits.
imdb_dataset=load_dataset("imdb")
imdb_dataset

Downloading readme:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [8]:
# Take three rows from a seeded shuffle of the training split for a quick look.
sample=imdb_dataset["train"].shuffle(seed=42).select(range(3))

# Print the text and the label of each selected review.
for row in sample:
  print(f"\n'>>>Review:{row['text']}'")
  print(f"'>>>Label:{row['label']}'")


'>>>Review:There is no relation at all between Fortier and Profiler but the fact that both are police series about violent crimes. Profiler looks crispy, Fortier looks classic. Profiler plots are quite simple. Fortier's plot are far more complicated... Fortier looks more like Prime Suspect, if we have to spot similarities... The main character is weak and weirdo, but have "clairvoyance". People like to compare, to judge, to evaluate. How about just enjoying? Funny thing too, people writing Fortier looks American but, on the other hand, arguing they prefer American series (!!!). Maybe it's the language, or the spirit, but I think this series is more English than American. By the way, the actors are really good and funny. The acting is not superficial at all...'
'>>>Label:1'

'>>>Review:This movie is a great. The plot is very true to the book which is a classic written by Mark Twain. The movie starts of with a scene where Hank sings a song with a bunch of kids called "when you stub your

In [9]:
#Data processing for masked language model
def tokenize_function(examples):
  """Tokenize a batch of reviews and, for fast tokenizers, record word ids.

  Args:
    examples: batch mapping whose "text" entry holds the raw strings.

  Returns:
    The tokenizer output for the batch. When `tokenizer.is_fast` is true it
    also carries a "word_ids" entry with one `word_ids(i)` result per row
    of "input_ids".
  """
  encoded=tokenizer(examples["text"])
  if not tokenizer.is_fast:
    #word_ids() is only requested when the tokenizer reports is_fast
    return encoded
  row_count=len(encoded["input_ids"])
  encoded["word_ids"]=list(map(encoded.word_ids,range(row_count)))
  return encoded

# Tokenize every split in batches; the raw "text" and "label" columns are removed,
# so only the columns returned by tokenize_function remain.
tokenized_datasets=imdb_dataset.map(
    tokenize_function,batched=True,remove_columns=["text","label"]
)

# Display the resulting splits and their columns.
tokenized_datasets

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (720 > 512). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 50000
    })
})

In [10]:
# Display the maximum sequence length the tokenizer reports for this checkpoint.
tokenizer.model_max_length

512

In [11]:
# Number of tokens per chunk produced by group_texts below.
chunk_size=128

In [12]:
#we will concatenate all the examples together
def group_texts(examples):
  """Concatenate every column of a batch and re-split it into fixed-size chunks.

  Each column (e.g. "input_ids", "attention_mask", "word_ids") is flattened
  across all rows of the batch, truncated to a multiple of the global
  `chunk_size`, and cut into consecutive chunks of exactly `chunk_size`
  items. A "labels" column is added as a copy of the chunked "input_ids".

  Args:
    examples: batch mapping of column name -> list of per-row sequences.
      Must contain "input_ids".

  Returns:
    dict of column name -> list of chunks, plus "labels". If the batch holds
    fewer than `chunk_size` items in total, every column is an empty list.
  """
  from itertools import chain

  #flatten each column in linear time (sum(lists, []) re-copies the growing
  #list on every addition, which is quadratic in the batch size)
  concatenated_examples={k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
  #length of the concatenated batch, measured on the first column
  total_length=len(concatenated_examples[list(examples.keys())[0]])
  #drop the trailing remainder that would form a chunk shorter than chunk_size
  total_length=(total_length//chunk_size)*chunk_size
  #cut every column into consecutive chunks of chunk_size items
  result={
      k:[t[i:i+chunk_size] for i in range(0,total_length,chunk_size)]
      for k,t in concatenated_examples.items()
  }

  #labels start as a copy of the inputs; the data collator does the masking later
  result["labels"]=result["input_ids"].copy()
  return result

In [13]:
# Apply group_texts to every split in batches, turning the tokenized reviews into chunks.
lm_datasets=tokenized_datasets.map(group_texts,batched=True)
# Display the resulting splits; they now include the "labels" column added by group_texts.
lm_datasets

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 61291
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 59904
    })
    unsupervised: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 122957
    })
})

In [14]:
#fine-tuning with Keras (model.compile / model.fit in the cells below)

from transformers import DataCollatorForLanguageModeling
#collator used to build the masked-language-model batches; mlm_probability is set to 0.15
data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer,mlm_probability=0.15)

In [15]:
#to see how the random masking happens lets take an example
samples=[lm_datasets["train"][i] for i in range(2)]
#remove "word_ids" from each sample before handing the samples to the collator;
#`del` is used instead of `_=sample.pop(...)` so that `_` is not rebound
#(assigning to `_` shadows IPython's last-output variable)
for sample in samples:
  del sample["word_ids"]
#decode each collated chunk to show where [MASK] tokens were inserted
for chunk in data_collator(samples)["input_ids"]:
  print(f"\n'>>>{tokenizer.decode(chunk)}'")


'>>>[CLS] i rented i am curious - yellow from my video store because [MASK] all the controversy that surrounded it when it was first [MASK] in 1967. i also heard that at [MASK] it was seized by u. s. customs if it ever tried [MASK] enter this country, therefore being [MASK] [MASK] of films considered " controversial " i really had to see this for myself [MASK] < br / > < br / > the plot is centered around a young swedish [MASK] student named lena who wants to learn everything she can [MASK] [MASK]. in particular [MASK] wants to focus [MASK] attention [MASK] [MASK] making some sort of documentary on what the average swede thought about certain political [MASK] such'

'>>>[MASK] [MASK]fect war and race issues in the united states. in [MASK] [MASK] politicians and ordinary [MASK]izens of stockholm about their opinions on politics, she has [MASK] with [MASK] drama teacher, classmates, and married men [MASK] < br / > < br / > what kills me about i [MASK] curious - yellow is that 40 years a

In [16]:
#as we can see above some tokens have been randomly selected and masked

In [17]:
#we can also mask whole words rather than individual tokens

import collections
import numpy as np
from transformers.data.data_collator import tf_default_data_collator

#probability with which each whole word is selected for masking
wwm_probability=0.2
def whole_word_masking_data_collator(features):
  """Mask randomly chosen whole words in each feature, then collate the batch.

  Consecutive tokens that share a word id are grouped into one word. Each
  word is selected independently with probability `wwm_probability`. Every
  token of a selected word is replaced by `tokenizer.mask_token_id` and keeps
  its original id in "labels"; all other label positions are set to -100.

  The caller's feature dicts and their "input_ids" are left untouched: the
  masking is applied to copies, so the collator can be called repeatedly on
  the same list of samples.

  Args:
    features: sequence of dict-like examples, each holding "input_ids",
      "labels" and "word_ids" (a "word_ids" entry is None for a token that
      belongs to no word).

  Returns:
    The result of `tf_default_data_collator` applied to the masked examples
    (which no longer carry "word_ids").
  """
  import copy

  masked_features=[]
  for feature in features:
    #shallow copy so removing "word_ids" does not alter the caller's dict
    feature=dict(feature)
    word_ids=feature.pop("word_ids")
    #creating a map between words and corresponding token indices
    mapping=collections.defaultdict(list)
    current_word_index=-1
    current_word=None
    for idx,word_id in enumerate(word_ids):
      if word_id is not None:
        if word_id!=current_word:
          current_word=word_id
          current_word_index+=1
        mapping[current_word_index].append(idx)
    #randomly choose which words to mask (one 0/1 draw per word)
    mask=np.random.binomial(1,wwm_probability,(len(mapping),))
    #copy the ids before overwriting them so the caller's sequence stays intact
    input_ids=copy.copy(feature["input_ids"])
    labels=feature["labels"]
    new_labels=[-100]*len(labels)
    for word_id in np.where(mask)[0]:
      word_id=word_id.item()
      for idx in mapping[word_id]:
        new_labels[idx]=labels[idx]
        input_ids[idx]=tokenizer.mask_token_id
    feature["input_ids"]=input_ids
    feature["labels"]=new_labels
    masked_features.append(feature)
  return tf_default_data_collator(masked_features)

In [18]:
# Take the first two training chunks (including their "word_ids") and apply whole-word masking.
samples = [lm_datasets["train"][i] for i in range(2)]
batch = whole_word_masking_data_collator(samples)

# Decode each masked chunk to show which words were replaced by the mask token.
for chunk in batch["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS] i rented i [MASK] curious - yellow from my [MASK] store because of all the controversy that [MASK] it when [MASK] was first released in 1967. i also heard that at first [MASK] was seized by u. s. customs [MASK] it ever tried to enter this country [MASK] therefore being a fan of films considered " [MASK] " i really had to see [MASK] for [MASK]. < br / > < br / > the plot is centered around a young swedish drama student named lena who [MASK] [MASK] [MASK] everything she [MASK] about life. in particular she wants to focus her attentions to making some sort of [MASK] on what [MASK] average [MASK] [MASK] [MASK] about certain political [MASK] [MASK]'

'>>> [MASK] the vietnam war [MASK] race issues [MASK] the united states. in between asking politicians and ordinary denizens [MASK] [MASK] [MASK] their [MASK] [MASK] politics [MASK] she has [MASK] with her [MASK] teacher, classmates, and married men. < br / > < br / > [MASK] kills me about i [MASK] curious [MASK] yellow is that 40 y

In [19]:
# Work on a subset of the chunked training split: 10000 rows for training
# and 10% of that number for evaluation.
train_size=10000
test_size=int(0.1*train_size)
# Seeded split so the same rows are selected on every run.
downsampled_dataset=lm_datasets["train"].train_test_split(
    train_size=train_size,test_size=test_size,seed=42
)
downsampled_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1000
    })
})

In [20]:
# Training batches: shuffled, 32 rows per batch, collated with `data_collator`
# (not with whole_word_masking_data_collator defined above).
tf_train_dataset=model.prepare_tf_dataset(
    downsampled_dataset["train"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=32,
)

# Evaluation batches: same collator and batch size, no shuffling.
# NOTE(review): the same `data_collator` (mlm_probability=0.15) is used here, so the
# evaluation masks are presumably re-drawn on each pass - confirm if a fixed eval set is wanted.
tf_eval_dataset=model.prepare_tf_dataset(
    downsampled_dataset["test"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=32,
)

In [21]:
from transformers import create_optimizer
# Total optimizer steps = number of batches in the training dataset.
num_train_steps=len(tf_train_dataset)
# NOTE(review): num_warmup_steps=1000 looks larger than num_train_steps
# (train_size=10000 rows at batch_size=32 is on the order of 312 batches) - confirm
# the warmup length is intended.
optimizer,schedule=create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=1000,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
# No loss is passed to compile().
model.compile(optimizer=optimizer)
# Train in mixed-precision float16
# NOTE(review): the policy is set after the model has been created and compiled;
# presumably it needs to be set before the model is built to take effect - confirm.
tf.keras.mixed_precision.set_global_policy("mixed_float16")

In [22]:
# Fine-tune on the training batches, validating on the evaluation batches
# (no epochs argument is passed, so the Keras default applies).
model.fit(tf_train_dataset,validation_data=tf_eval_dataset)



<tf_keras.src.callbacks.History at 0x7cadd01227a0>

In [23]:
import math
# Evaluate on the held-out batches and report perplexity as exp(loss).
eval_loss=model.evaluate(tf_eval_dataset)
print(f"Perplexity: {math.exp(eval_loss):.2f}")

Perplexity: 13.70


In [24]:
from transformers import pipeline

# Build a fill-mask pipeline.
# Note: this loads a published checkpoint by name, not the `model` object
# fine-tuned earlier in this notebook.
mask_filler=pipeline(
    "fill-mask", model="huggingface-course/distilbert-base-uncased-finetuned-imdb"
)

config.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [25]:
# Run the pipeline on the example sentence and print each prediction's filled-in sequence.
preds=mask_filler(text)
for pred in preds:
    print(f">>> {pred['sequence']}")

>>> this is a great film.
>>> this is a great movie.
>>> this is a great idea.
>>> this is a great deal.
>>> this is a great adventure.
