In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from transformers import AutoTokenizer

In [3]:
# Identifier of the pretrained checkpoint; the same name is reused when loading the model.
checkpoint = "bert-base-uncased"
# Load the tokenizer that belongs to this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [4]:
# Show the tokenizer's repr: vocabulary size, maximum length, padding/truncation side
# and the special tokens it knows about.
tokenizer

PreTrainedTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [5]:
# Calling the tokenizer directly returns input_ids, token_type_ids and attention_mask.
# The ids are wrapped in the special tokens 101 ([CLS]) and 102 ([SEP]).
tokenizer('hello world')

{'input_ids': [101, 7592, 2088, 102], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}

In [6]:
# Step 1 of the manual pipeline: split the text into vocabulary tokens.
# No special tokens are added at this stage (see output).
tokens = tokenizer.tokenize('hello world')
tokens

['hello', 'world']

In [7]:
# Step 2: map each token string to its integer vocabulary id.
ids = tokenizer.convert_tokens_to_ids(tokens)
ids

[7592, 2088]

In [8]:
# Inverse lookup: turn the ids back into their token strings.
tokenizer.convert_ids_to_tokens(ids)

['hello', 'world']

In [9]:
# Join the ids back into a single readable string.
tokenizer.decode(ids)

'hello world'

In [10]:
# encode() goes straight to ids and adds the special tokens.
# NOTE(review): `tokens` is a two-element list, and the output shows a [SEP] (102)
# between 'hello' and 'world' — the list appears to be encoded as a sentence PAIR,
# not as one pre-tokenized sentence. Compare tokenizer('hello world'), which gives
# [101, 7592, 2088, 102]. If a single sentence was intended, pass the raw string or
# is_split_into_words=True — TODO confirm intent.
ids = tokenizer.encode(tokens)
ids

[101, 7592, 102, 2088, 102]

In [11]:
# Inspect the tokens behind the encoded ids, special tokens included.
tokenizer.convert_ids_to_tokens(ids)

['[CLS]', 'hello', '[SEP]', 'world', '[SEP]']

In [12]:
# Decoding these ids keeps the special tokens in the resulting string (see output).
tokenizer.decode(ids)

'[CLS] hello [SEP] world [SEP]'

In [13]:
# Encode one sentence; without return_tensors the values are plain Python lists.
model_inputs = tokenizer("hello world")
model_inputs

{'input_ids': [101, 7592, 2088, 102], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}

In [14]:
# A small batch of two sentences of different lengths.
data = [
    "I like cats.",
    "Do you like cats too?"
]
# Each sentence is encoded separately; with no padding requested the resulting
# id lists have different lengths (6 and 8 in the output).
tokenizer(data)

{'input_ids': [[101, 1045, 2066, 8870, 1012, 102], [101, 2079, 2017, 2066, 8870, 2205, 1029, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1]]}

In [15]:
from transformers import AutoModelForSequenceClassification

In [16]:
# Load the checkpoint into a sequence-classification model.
# NOTE: the warning below reports that some weights of the classification model were
# not initialized from the checkpoint, so the logits produced further down are not
# meaningful predictions — presumably the classifier head starts out random and
# needs fine-tuning; verify before relying on the numbers.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [17]:
# outputs = model(**model_inputs)

# This does not work: by default the model expects torch tensors as input, but `model_inputs` here holds plain Python lists.

In [18]:
# return_tensors='pt' makes the tokenizer return PyTorch tensors; each one carries
# a leading batch dimension (shape 1 x 4 in the output).
model_inputs = tokenizer("hello world", return_tensors='pt')
model_inputs

{'input_ids': tensor([[ 101, 7592, 2088,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1]])}

In [19]:
# Forward pass: the dict is unpacked so input_ids, token_type_ids and attention_mask
# are passed as keyword arguments. The result carries one row with two logits;
# loss is None in this output.
outputs = model(**model_inputs)
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[ 0.0741, -0.1176]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [20]:
# Rebuild the classifier from the same checkpoint, this time with three output labels.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=3,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [21]:
# Same input through the 3-label model: the logits row now has three entries.
outputs = model(**model_inputs)
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[ 0.1999, -0.3196, -0.3408]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [22]:
# Attribute-style access to the logits.
outputs.logits

tensor([[ 0.1999, -0.3196, -0.3408]], grad_fn=<AddmmBackward0>)

In [23]:
# Dict-style access by key returns the same tensor.
outputs['logits']

tensor([[ 0.1999, -0.3196, -0.3408]], grad_fn=<AddmmBackward0>)

In [24]:
# Positional access: element 0 is the logits here (loss is None in this output).
outputs[0]

tensor([[ 0.1999, -0.3196, -0.3408]], grad_fn=<AddmmBackward0>)

In [25]:
# Detach from the autograd graph, move to CPU and convert to a NumPy array.
outputs.logits.detach().cpu().numpy()

array([[ 0.1999217 , -0.3195573 , -0.34080413]], dtype=float32)

In [26]:
# data = [
#     "I like cats.",
#     "Do you like cats too?"
# ]
# model_inputs = tokenizer(data, return_tensors='pt')
# model_inputs

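# Raises an error because padding and truncation are disabled by default: the two sentences have different lengths, so they cannot be stacked into a single tensor.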

In [27]:
# Tokenize the whole batch as PyTorch tensors. Padding brings every row to a
# common length; truncation is switched on as well.
model_inputs = tokenizer(
    data,
    padding=True,
    truncation=True,
    return_tensors='pt',
)
model_inputs

{'input_ids': tensor([[ 101, 1045, 2066, 8870, 1012,  102,    0,    0],
        [ 101, 2079, 2017, 2066, 8870, 2205, 1029,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1]])}

In [28]:
# Token ids: the shorter sentence is right-padded with id 0 up to the longest row.
model_inputs['input_ids']

tensor([[ 101, 1045, 2066, 8870, 1012,  102,    0,    0],
        [ 101, 2079, 2017, 2066, 8870, 2205, 1029,  102]])

In [29]:
# Attention mask: 1 marks real tokens, 0 marks the padded positions.
model_inputs['attention_mask']

tensor([[1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1]])

In [30]:
# Batched forward pass: one row of logits per input sentence.
outputs = model(**model_inputs)
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[ 0.3622, -0.1852, -0.2583],
        [ 0.3830, -0.2454, -0.2868]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

END