#### Set verbosity

In [1]:
import transformers
# Only show messages of level ERROR and above from the transformers logger.
# Available levels: debug, info, warning, error, critical.
transformers.logging.set_verbosity_error()

  from .autonotebook import tqdm as notebook_tqdm


#### A pipeline

In [2]:
from transformers import pipeline

# Task identifiers that can be handed to `pipeline`; kept as a reference list.
tasks = ["feature-extraction", "fill-mask", "ner", "question-answering",
         "sentiment-analysis", "summarization", "text-generation",
         "translation", "zero-shot-classification"]

# Name the task explicitly instead of relying on a positional index into `tasks`.
task = "sentiment-analysis"

# Pin the checkpoint rather than depending on the library's implicit default
# model for the task: the run stays reproducible across library versions.
# This is the same checkpoint the "Behind the pipeline" section loads by hand.
pline = pipeline(task, model="distilbert-base-uncased-finetuned-sst-2-english")
pline("I've been waiting for a HuggingFace course my whole life.")

[{'label': 'POSITIVE', 'score': 0.9598049521446228}]

#### Behind the pipeline

In [3]:
import torch
from transformers import (
    AutoModel,
    AutoModelForSequenceClassification,
    AutoTokenizer,
)

# A single checkpoint name drives the tokenizer, the bare transformer and the
# classifier so that all three stay consistent with each other.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"

raw_inputs = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
]

# Stage 1 - text to PyTorch tensors (padded and truncated as a batch).
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")

# Stage 2 - transformer body only: exposes `last_hidden_state`.
model = AutoModel.from_pretrained(checkpoint)
out_wo_head = model(**inputs)

# Stage 3 - same checkpoint with the sequence-classification head: exposes `logits`.
model_w_head = AutoModelForSequenceClassification.from_pretrained(checkpoint)
out_w_head = model_w_head(**inputs)

# Stage 4 - normalise the logits over the last dimension with a softmax.
predictions = out_w_head.logits.softmax(dim=-1)

print("\n- Inputs (after tokenization): ", inputs)
print("\n- Transformer - size of last hidden state: ", out_wo_head.last_hidden_state.shape)
print("\n- Head - size of logits: ", out_w_head.logits.shape)
print("\n- Predictions: ", predictions)
print("\n- id2label: ", model_w_head.config.id2label)


- Inputs (after tokenization):  {'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102],
        [  101,  1045,  5223,  2023,  2061,  2172,   999,   102,     0,     0,
             0,     0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])}

- Transformer - size of last hidden state:  torch.Size([2, 16, 768])

- Head - size of logits:  torch.Size([2, 2])

- Predictions:  tensor([[4.0195e-02, 9.5980e-01],
        [9.9946e-01, 5.4418e-04]], grad_fn=<SoftmaxBackward0>)

- id2label:  {0: 'NEGATIVE', 1: 'POSITIVE'}


In [4]:
# Report the concrete class of each object built in the previous cell,
# one "- <name> <class>" line per object.
print(
    "\n".join(
        f"- {name} {type(value)}"
        for name, value in (
            ("Inputs", inputs),
            ("Model", model),
            ("Out_wo_head", out_wo_head),
            ("Model_w_head", model_w_head),
            ("Out_w_head", out_w_head),
            ("Predictions", predictions),
        )
    )
)

- Inputs <class 'transformers.tokenization_utils_base.BatchEncoding'>
- Model <class 'transformers.models.distilbert.modeling_distilbert.DistilBertModel'>
- Out_wo_head <class 'transformers.modeling_outputs.BaseModelOutput'>
- Model_w_head <class 'transformers.models.distilbert.modeling_distilbert.DistilBertForSequenceClassification'>
- Out_w_head <class 'transformers.modeling_outputs.SequenceClassifierOutput'>
- Predictions <class 'torch.Tensor'>


#### Models

In [6]:
from transformers import BertConfig, BertModel

# Building the config with a specific Config class
config = BertConfig()

# Initializing a specific Model class with random weights
model = BertModel(config)

# Initializing the model with pre-trained weights
model = BertModel.from_pretrained("bert-base-cased")

# Save the model.
# Raw string: "\m" is not a valid escape sequence, so the plain literal
# triggers an invalid-escape warning on recent Python versions. The runtime
# value is unchanged (a dot, a backslash, then "models").
model.save_pretrained(r".\models")

print(f'Nb hidden layers: {model.config.num_hidden_layers}')

Nb hidden layers: 12


In [7]:
# Reload the model while adapting a parameter from the config

# Raw string: "\m" is not a valid escape sequence, so the plain literal
# triggers an invalid-escape warning on recent Python versions. The runtime
# value is unchanged (a dot, a backslash, then "models").
model_dir = r".\models"

# First method to reload the model: pass the config override straight to
# `from_pretrained`.
from transformers import AutoModel
model = AutoModel.from_pretrained(model_dir, num_hidden_layers=10, )
print("\nRelaod the model with the first method: ")
print(f'Nb hidden layers: {model.config.num_hidden_layers}')

# Second method to reload the model: build the adapted config first, then
# hand it to `from_pretrained`.
from transformers import AutoConfig, AutoModel
config = AutoConfig.from_pretrained(model_dir, num_hidden_layers=10)
model = AutoModel.from_pretrained(model_dir, config=config)
print("\nRelaod the model with the second method: ")
print(f'Nb hidden layers: {model.config.num_hidden_layers}')


Relaod the model with the first method: 
Nb hidden layers: 10

Relaod the model with the second method: 
Nb hidden layers: 10


#### Tokenizer

In [8]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
sequence = "Using a Transformer network is simple"

# Route 1: call the tokenizer object directly on the raw text.
tokens1 = tokenizer(sequence)

# Route 2: split the text into tokens first, then map the tokens to ids.
tokens2 = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens2)

# Report route 1, decoding its ids back to text.
print("\nTokenization the usual way:")
print("- Tokens: ", tokens1)
print("- Decode input IDs: ", tokenizer.decode(tokens1['input_ids']))

# Report route 2 the same way so both decodings can be compared.
print("\nTokenization step by step:")
print("- Tokens: ", tokens2)
print("- IDs: ", ids)
print("- Decode input IDs: ", tokenizer.decode(ids))


Tokenization the usual way:
- Tokens:  {'input_ids': [101, 7993, 170, 13809, 23763, 2443, 1110, 3014, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}
- Decode input IDs:  [CLS] Using a Transformer network is simple [SEP]

Tokenization step by step:
- Tokens:  ['Using', 'a', 'Trans', '##former', 'network', 'is', 'simple']
- IDs:  [7993, 170, 13809, 23763, 2443, 1110, 3014]
- Decode input IDs:  Using a Transformer network is simple


#### Batching Sequences: why we need attention masks

In [10]:
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
# Load the tokenizer that belongs to this checkpoint so `pad_token_id` below is
# the id this model expects, instead of reusing whichever tokenizer an earlier
# cell happened to leave in the kernel.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# Let's imagine, the tokenization results in the following input ids:
sequence1_ids = [[200, 200, 200]]
sequence2_ids = [[200, 200]]
# Same two sequences as one batch: the shorter one is padded to length 3.
batched_ids = [
    [200, 200, 200],
    [200, 200, tokenizer.pad_token_id],
]

# Let's compute the output for each input (no attention mask is passed here)
logits_seq1 = model(torch.tensor(sequence1_ids)).logits
logits_seq2 = model(torch.tensor(sequence2_ids)).logits
logit_batched = model(torch.tensor(batched_ids)).logits

print("\nSeq1: ", sequence1_ids)
print("Logits seq1:   ", logits_seq1)

print("\nSeq2: ", sequence2_ids)
print("Logits seq2:   ", logits_seq2)

print("\nBatched: ", batched_ids)
print("Logit batched: ", logit_batched)

# There is a difference between the two approaches for sequence 2, what happened?
print("\nThe logits of the batched sequence are different from the logits of the second shorter sequence")
print("This is because the second sequence of the batch has been padded. That padding was used by the attention layers; this was not the case with the second sequence.")


Seq1:  [[200, 200, 200]]
Logits seq1:    tensor([[ 1.5694, -1.3895]], grad_fn=<AddmmBackward0>)

Seq2:  [[200, 200]]
Logits seq2:    tensor([[ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)

Batched:  [[200, 200, 200], [200, 200, 0]]
Logit batched:  tensor([[ 1.5694, -1.3895],
        [ 1.3374, -1.2163]], grad_fn=<AddmmBackward0>)

The logits of the batched sequence are different from the logits of the second shorter sequence
This is because the second sequence of the batch has been padded. That padding was used by the attention layers; this was not the case with the second sequence.


#### Sequence batching: use of attention masks

In [11]:
# Feed the padded batch together with an attention mask: 1 marks a real token,
# 0 marks the padded slot at the end of the second sequence.
batched_ids = [[200, 200, 200], [200, 200, tokenizer.pad_token_id]]
attention_mask = [[1, 1, 1], [1, 1, 0]]

outputs = model(
    input_ids=torch.tensor(batched_ids),
    attention_mask=torch.tensor(attention_mask),
)
print(outputs.logits)

tensor([[ 1.5694, -1.3895],
        [ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)


#### From tokenizer to model

In [13]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Tokenizer and classifier are both loaded from the same checkpoint.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]

# One tokenizer call handles padding, truncation and conversion to PyTorch
# tensors; the resulting mapping is unpacked straight into the model call.
tokens = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")
output = model(**tokens)

# Bare expression: the model output is shown as the cell result.
output

SequenceClassifierOutput(loss=None, logits=tensor([[-1.5607,  1.6123],
        [-3.6183,  3.9137]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

#### One step of Fine Tuning

In [14]:
import torch
# AdamW is taken from PyTorch: the `transformers.AdamW` implementation is
# deprecated in favour of `torch.optim.AdamW` and is no longer importable from
# recent transformers releases. The name `AdamW` stays bound for later cells.
# Note that the PyTorch defaults are not identical (e.g. weight_decay=0.01).
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Same as before
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "This course is amazing!",
]
# Tokenize both sentences as one padded, truncated batch of PyTorch tensors.
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")
print(batch)

{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102],
        [  101,  2023,  2607,  2003,  6429,   999,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}


In [15]:
# This is new: attach one target label per sequence so the model returns a loss.
batch["labels"] = torch.tensor([1, 1])

optimizer = AdamW(model.parameters())

# `from_pretrained` hands the model back in evaluation mode (dropout disabled);
# switch to training mode before taking an optimisation step.
model.train()
# Clear any gradients left on the parameters, so re-running this cell does not
# accumulate them on top of a previous run.
optimizer.zero_grad()

pred = model(**batch)
loss = pred.loss
loss.backward()
optimizer.step()
print(loss)



tensor(0.4374, grad_fn=<NllLossBackward0>)


#### Dataset

In [17]:
# `load_dataset` is imported from the top-level `datasets` module
# (not from a submodule of `transformers`).
from datasets import load_dataset

# Load the "mrpc" configuration of "glue"; the bare expression below shows the
# loaded object as the cell result.
raw_datasets = load_dataset("glue", "mrpc")
raw_datasets

ModuleNotFoundError: No module named 'transformers.datasets'

In [None]:
# Select the "train" split and display its first element.
raw_train_dataset = raw_datasets["train"]
raw_train_dataset[0]

In [None]:
# Display the `features` attribute of the training split.
raw_train_dataset.features

In [None]:
from transformers import AutoTokenizer

# Tokenize the two sentence columns of the training split independently of each
# other, using the tokenizer loaded from the `bert-base-uncased` checkpoint.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenized_sentences_1 = tokenizer(raw_datasets["train"]["sentence1"])
tokenized_sentences_2 = tokenizer(raw_datasets["train"]["sentence2"])

In [None]:
# Pass both sentences in a single tokenizer call (sentence-pair input) and
# display the resulting encoding.
inputs = tokenizer("This is the first sentence.", "This is the second one.")
inputs

In [None]:
# Map the ids of the pair encoding back to their token strings for inspection.
tokenizer.convert_ids_to_tokens(inputs["input_ids"])

In [None]:
# Tokenize every sentence1/sentence2 pair of the training split in one call,
# with padding and truncation switched on.
tokenized_dataset = tokenizer(
    raw_datasets["train"]["sentence1"],
    raw_datasets["train"]["sentence2"],
    padding=True,
    truncation=True,
)

In [None]:
def tokenize_function(example):
    """Tokenize the sentence pair(s) held in ``example``.

    Intended as the callback for ``raw_datasets.map(..., batched=True)``.

    Args:
        example: Mapping with "sentence1" and "sentence2" entries. With
            ``batched=True`` each entry is a list of sentences rather than a
            single string.

    Returns:
        The tokenizer output for the paired sentences, with truncation enabled.
        Padding is intentionally left out here so it can be applied later, per
        batch, instead of padding every example up front.
    """
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)

In [None]:
# Apply `tokenize_function` over `raw_datasets` in batched mode and display the
# result.
# NOTE(review): `tokenize_function` is not defined in this cell; it has to be in
# scope from an earlier cell before this one runs.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets