Transformer models handles text by converting them into numbers. i.e. it converts text -> tokens -> numbers 
for this we use AutoTokenizer

In [30]:
from transformers import AutoModel

model = AutoModel.from_pretrained("bert-base-cased")


In [1]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

encoded_input = tokenizer("Hello, I'm a single sentence!")
print(encoded_input)



tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

{'input_ids': [101, 8667, 117, 146, 112, 182, 170, 1423, 5650, 106, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [2]:
tokenizer.decode(encoded_input["input_ids"])

"[CLS] Hello, I ' m a single sentence! [SEP]"

In [3]:
encoded_input = tokenizer("What's up?", "You doing good or what?")
print(encoded_input)

{'input_ids': [101, 1327, 112, 188, 1146, 136, 102, 1192, 1833, 1363, 1137, 1184, 136, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


Lets take a look at tensors

In [4]:
encoded_input = tokenizer("What's up?", "You doing good or what?", return_tensors = "pt")
print(encoded_input)

{'input_ids': tensor([[ 101, 1327,  112,  188, 1146,  136,  102, 1192, 1833, 1363, 1137, 1184,
          136,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


Now lets do the padding so that model can normalize i.e. make the sentences of same length

In [15]:
encoded_input1 = tokenizer(["What's up?", "You doing good or what?"], padding = True,  return_tensors = "pt")
print(encoded_input)

{'input_ids': tensor([[ 101, 1327,  112,  188, 1146,  136,  102, 1192, 1833, 1363, 1137, 1184,
          136,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


Lets truncate the input

In [16]:
encoded_input = tokenizer(
    "This is a very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very long sentence.",
    truncation=True,
)
print(encoded_input["input_ids"])

[101, 1188, 1110, 170, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1263, 5650, 119, 102]


padding + truncation

In [18]:
encoded_in = tokenizer(
    ["How are you?", "I am good"],
    padding = True,
    truncation = True,
    return_tensors = "pt",
    max_length = 5,
)

print(encoded_in)

{'input_ids': tensor([[ 101, 1731, 1132, 1128,  102],
        [ 101,  146, 1821, 1363,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1]])}


Batch size = number of items inside the list []

ADDING SPECIAL TOKENS

beginning of a sentence ([CLS]) 


separator between sentences ([SEP])

In [21]:
en = tokenizer("Hi what's up?")
print(en)
tokenizer.decode(en["input_ids"])

{'input_ids': [101, 8790, 1184, 112, 188, 1146, 136, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}


"[CLS] Hi what ' s up? [SEP]"

These special tokens are automatically added by the tokenizer. Not all models need special tokens; they are primarily used when a model was pretrained with them, in which case the tokenizer will add them since the model expects them.

In [23]:
seq = tokenizer([
    "Yo my man.",
    "why so serious?"
]
)
print(seq)

{'input_ids': [[101, 14941, 1139, 1299, 119, 102], [101, 1725, 1177, 3021, 136, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]}


In [39]:
encoded_sequences = [
    [
        101,
        1045,
        1005,
        2310,
        2042,
        3403,
        2005,
        1037,
        17662,
        12172,
        2607,
        2026,
        2878,
        2166,
        1012,
        102,
    ],
    [101, 1045, 5223, 2023, 2061, 2172, 999, 102],
    
]

As this "array" is already of rectangular shape, lets convert it to a tensor.

In [42]:
from transformers import AutoTokenizer
import torch

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

batch = tokenizer.pad(
    {"input_ids": encoded_sequences},
    padding = True,
    return_tensors = "pt"
)

input_ids = batch["input_ids"]
attention_mask = batch["attention_mask"]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [44]:
print(outputs)

NameError: name 'outputs' is not defined

In [45]:
with torch.no_grad():   # inference mode
    outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask
    )


In [46]:
print(outputs)



BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.3412, -0.1088,  0.1147,  ..., -0.2650,  0.1881, -0.1104],
         [-0.0316, -0.9939,  0.3383,  ..., -0.2290,  0.5234,  0.2093],
         [ 0.1651, -0.8795,  0.5937,  ..., -0.0165, -0.0986,  0.1480],
         ...,
         [-0.1512, -0.5010,  0.1329,  ..., -0.3520, -0.1183,  0.2425],
         [ 0.1559, -0.4795,  0.1415,  ..., -0.4274, -0.2023,  0.2730],
         [ 0.9356, -0.5068,  0.2157,  ..., -0.9903,  0.0920, -0.6285]],

        [[-0.0725,  0.0540, -0.0037,  ...,  0.1450,  0.2381, -0.0164],
         [-0.1619, -0.3062, -0.2282,  ...,  0.3782, -0.1170,  0.1295],
         [-0.1173, -0.1003,  0.1703,  ...,  0.3227, -0.1996,  0.1646],
         ...,
         [-0.1968, -0.3095, -0.2132,  ...,  0.3795, -0.0715,  0.0696],
         [-0.1486, -0.2172, -0.1510,  ...,  0.3222,  0.1034, -0.0320],
         [-0.1927, -0.1105,  0.1996,  ...,  0.4188,  0.2443, -0.0229]]]), pooler_output=tensor([[-0.7012,  0.5009,  0.9999,  .

In [48]:
tokens = tokenizer.convert_ids_to_tokens(input_ids)
sentence = tokenizer.convert_tokens_to_string(tokens)
sentence


ValueError: only one element tensors can be converted to Python scalars

In [49]:
for i, tokens in enumerate(batch_tokens):
    text = tokenizer.convert_tokens_to_string(tokens)
    print(f"Sentence {i}:", text)


NameError: name 'batch_tokens' is not defined

In [50]:
batch_tokens = tokenizer.convert_ids_to_tokens(input_ids)


ValueError: only one element tensors can be converted to Python scalars

In [51]:
batch_tokens = [
    tokenizer.convert_ids_to_tokens(sentence_ids)
    for sentence_ids in input_ids
]


In [52]:
for i, tokens in enumerate(batch_tokens):
    print(f"\nSentence {i}:")
    for tid, tok in zip(input_ids[i], tokens):
        print(f"{int(tid):>4} → {tok}")



Sentence 0:
 101 → [CLS]
1045 → 正
1005 → 國
2310 → themselves
2042 → ##ine
3403 → search
2005 → hours
1037 → 月
17662 → Riders
12172 → stern
2607 → changes
2026 → largest
2878 → silver
2166 → previous
1012 → 夫
 102 → [SEP]

Sentence 1:
 101 → [CLS]
1045 → 正
5223 → rapidly
2023 → kept
2061 → White
2172 → vocals
 999 → 司
 102 → [SEP]
   0 → [PAD]
   0 → [PAD]
   0 → [PAD]
   0 → [PAD]
   0 → [PAD]
   0 → [PAD]
   0 → [PAD]
   0 → [PAD]


In [53]:
for i, tokens in enumerate(batch_tokens):
    text = tokenizer.convert_tokens_to_string(tokens)
    print(f"Sentence {i}:", text)


Sentence 0: [CLS] 正 國 themselvesine search hours 月 Riders stern changes largest silver previous 夫 [SEP]
Sentence 1: [CLS] 正 rapidly kept White vocals 司 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
