# **La tarea de modelado del lenguaje enmascarado**

In [None]:
!pip install transformers

In [None]:
from transformers import BertForMaskedLM, pipeline

In [None]:
# The Transformers package ships several standard "heads" on top of the plain BERT model;
# here we load pretrained 'bert-base-cased' weights with the masked-language-modeling head.
bert_lm = BertForMaskedLM.from_pretrained('bert-base-cased')

In [None]:
bert_lm   # inspect the model (a bare last expression displays its module structure)


BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

In [None]:
# Pipelines in transformers take models/tokenizers as inputs and are the simplest way to automate many tasks.
# Here we set one up for an auto-encoding (masked) language modeling task.
# NOTE(review): the original note says model=bert_lm gives the same result; when a model object
# (rather than a name) is passed, pipeline() likely also needs an explicit tokenizer= argument -- confirm.
nlp = pipeline("fill-mask", model='bert-base-cased') # the already-loaded bert_lm could be passed as model= instead


In [None]:
# Check which model class the pipeline is holding.
type(nlp.model)


transformers.models.bert.modeling_bert.BertForMaskedLM

In [None]:
# Display the tokenizer attached to the pipeline (its repr lists the special tokens, including the mask token).
nlp.tokenizer


BertTokenizerFast(name_or_path='bert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [None]:
# Report the model class behind the pipeline, then ask it to fill in the masked word.
print(type(nlp.model))
preds = nlp(f"If you don't {nlp.tokenizer.mask_token} at the sign, you will get a ticket.")
print("If you don't *** at the sign, you will get a ticket.")
# One line per candidate: the proposed token and its score expressed as a percentage.
for candidate in preds:
    print("Token: {}. Score: {:.2f}%".format(candidate['token_str'], 100*candidate['score']))



<class 'transformers.models.bert.modeling_bert.BertForMaskedLM'>
If you don't *** at the sign, you will get a ticket.
Token: stop. Score: 51.10%
Token: look. Score: 38.41%
Token: arrive. Score: 1.11%
Token: glance. Score: 1.05%
Token: turn. Score: 0.72%


In [None]:
# Fill in the masked word of a short sentence and list the candidates with their scores.
# A single template feeds both the model prompt and the displayed sentence, so the
# text shown to the reader always matches what the model actually received
# (previously the label read "shine brightly ." while the prompt was "shines brightly.").
template = "The {} shines brightly."
print(type(nlp.model))
preds = nlp(template.format(nlp.tokenizer.mask_token))
print(template.format("***"))
for p in preds:
    print(f"Token: {p['token_str']}. Score: {100*p['score']:.2f}%")


<class 'transformers.models.bert.modeling_bert.BertForMaskedLM'>
The *** shine brightly .
Token: sun. Score: 22.87%
Token: moon. Score: 13.84%
Token: light. Score: 5.72%
Token: sky. Score: 2.69%
Token: room. Score: 2.10%


In [None]:
# Same experiment with extra context ("This morning") around the masked word.
print(type(nlp.model))
mask = nlp.tokenizer.mask_token
preds = nlp("This morning the " + mask + " shines very bright.")
print("This morning the *** shines very bright .")
# Print each candidate token alongside its score as a percentage with two decimals.
for guess in preds:
    print("Token: %s. Score: %.2f%%" % (guess['token_str'], 100*guess['score']))


<class 'transformers.models.bert.modeling_bert.BertForMaskedLM'>
This morning the *** shines very bright .
Token: sun. Score: 58.93%
Token: moon. Score: 16.46%
Token: sky. Score: 5.04%
Token: light. Score: 2.28%
Token: day. Score: 0.85%
