# Behind the pipeline

In [1]:
from transformers import pipeline 
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification


  from .autonotebook import tqdm as notebook_tqdm


In [None]:

# The pipeline has to exist before it can be called, so build it first.
classifier = pipeline("sentiment-analysis")
classifer = classifier  # backward-compatible alias for the earlier misspelled name

# Classify a small batch of sentences; as the last expression of the cell,
# the list of predictions is displayed as the cell output.
classifier(
    [
        "I've been waiting for a HuggingFace course my whole life.",
        "I hate this so much!",
    ]
)

In [None]:
from transformers import AutoTokenizer

# Reading the checkpoint name piece by piece:
#   distilbert      - BERT-family model, encoder-only
#   base            - hidden size ~768
#   uncased         - lowercases input; vocabulary does not preserve capitalization
#   finetuned-sst-2 - classification head trained on sentiment
#   english         - language assumptions baked into the tokenizer
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"

# Load the tokenizer that belongs to this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


In [None]:
# Example sentences that are tokenized together as one batch.
raw_inputs = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
]

# Ask for PyTorch tensors, with padding and truncation enabled so both
# sentences end up in one rectangular tensor.
inputs = tokenizer(
    raw_inputs,
    return_tensors="pt",
    truncation=True,
    padding=True,
)
print(inputs)

# Models

In [None]:
from transformers import AutoModel

# Load the "bert-base-cased" checkpoint through the generic AutoModel entry point.
model = AutoModel.from_pretrained("bert-base-cased")

In [None]:
from transformers import BertModel

# Defines the architecture directly (BertModel) instead of going through AutoModel.
model = BertModel.from_pretrained("bert-base-cased")

## Loading and saving 

In [None]:
# Save the model's weights and architecture configuration to a local directory.
# Two files are written:
#   config.json       - all the attributes needed to build the model architecture, plus metadata
#   model.safetensors - the state dictionary, i.e. all of the model's weights
#                       (older transformers releases write pytorch_model.bin instead)
model.save_pretrained("directory_on_my_computer")

In [None]:
# To reuse a saved model, point from_pretrained at the directory it was saved to.
from transformers import AutoModel
model = AutoModel.from_pretrained("directory_on_my_computer")


## Logging in to Hugging Face from a notebook

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from inside the notebook.
notebook_login()

## push model to the hub

In [None]:
# Upload the current model to the Hub under the repository name "my-awesome-model".
model.push_to_hub("my-awesome-model")

In [None]:
# Load the model back from the Hub; replace "your-username" with the account
# that owns the repository.
from transformers import AutoModel

model = AutoModel.from_pretrained("your-username/my-awesome-model")

## Encoding text

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

encoded_input = tokenizer("Hello, this is a single sentence!")
print(encoded_input)
# The printed dictionary contains:
#   input_ids:      numerical representation of the tokens
#   token_type_ids: tells the model which part of the input is sentence A and which is sentence B
#   attention_mask: indicates which tokens should be attended to and which should not

In [None]:
# We can decode the input IDs to get back the original text.
# [CLS] and [SEP] are special tokens added by the tokenizer because the model requires them.
tokenizer.decode(encoded_input["input_ids"])

### encode multiple sentences at once

In [None]:
# Passing a list encodes each sentence separately, as one batch.
encoded_input = tokenizer(
    [
        "How are you?",
        "I'm fine, thank you!",
    ]
)
print(encoded_input)

In [None]:
# Return PyTorch tensors.
# NOTE(review): the two sentences are passed as two positional arguments, not as
# a list; this presumably encodes them as one sentence pair rather than a batch
# of two - confirm that is the intent.
encoded_input = tokenizer(
    "How are you?",
    "I'm fine, thank you!",
    return_tensors="pt",
)
print(encoded_input)

In [None]:
# Return NumPy arrays.
# NOTE(review): no padding is requested here, so the two encodings probably have
# different lengths and the conversion to a rectangular array is likely to fail;
# verify whether this cell is meant to demonstrate that error.
encoded_input = tokenizer(
    ["How are you?", "I'm fine, thank you!"],
    return_tensors="np",
)
print(encoded_input)

### Padding inputs

In [None]:
# The two encoded sentences do not have the same length, but arrays and tensors
# must be rectangular. Padding solves this: a special padding token is added so
# that every sentence ends up the same length.
encoded_input = tokenizer(
    ["How are you?", "I'm fine, thank you!"],
    padding=True,
    return_tensors="pt",  # return PyTorch tensors
)
print(encoded_input)

### truncating inputs

In [None]:
# truncation parameter used to make BERT model able to process more tokens than 512 token 
encoded_input = tokenizer(
    "This is a very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very long sentence.",
    truncation=True,
)
print(encoded_input["input_ids"])

In [None]:
# Combining the padding and truncation arguments gives tensors of exactly the
# needed size. truncation=True is what actually cuts every sequence down to
# max_length tokens (max_length on its own does not truncate); padding=True then
# pads the shorter sequences up to the longest one left in the batch.
encoded_input = tokenizer(
    ["How are you?", "I'm fine, thank you!"],
    return_tensors="pt",  # return PyTorch tensors
    padding=True,
    truncation=True,
    max_length=5,
)
print(encoded_input)

### adding special tokens

In [None]:
# Special tokens are added for BERT to better represent sentence boundaries,
# such as the beginning of a sentence ([CLS]) or the separator between
# sentences ([SEP]). Not all models need special tokens.
encoded_input = tokenizer("How are you")
print(encoded_input["input_ids"])

# Decoding the IDs shows the special tokens around the original text.
tokenizer.decode(encoded_input["input_ids"])

# Tokenization

In [None]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Split the raw text into the tokenizer's tokens (strings, not IDs yet).
sequence = "Using a Transformer network is simple"
tokens = tokenizer.tokenize(sequence)
print(tokens)

## from token to input IDs

In [None]:
# Map each token string to its integer ID in the tokenizer's vocabulary.
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

## decoding

In [None]:
# Turn a list of vocabulary IDs back into a readable string.
decoded_string = tokenizer.decode(
    [7993, 170, 13809, 23763, 2443, 1110, 3014]
)
print(decoded_string)

# Handling multiple sequences

In [None]:
# Load the tokenizer and the sequence-classification model from the same checkpoint.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence = "I've been waiting for a HuggingFace course my whole life."

# Manual route: split the text into tokens, then map the tokens to IDs.
tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)

# Wrap the IDs in an outer list so the tensor gets a batch dimension.
input_ids = torch.tensor([ids])  
print("Input IDs:", input_ids)

# The model expects a batch of sequences by default, hence the extra dimension above.
output = model(input_ids)
print("Logits:", output.logits)

# Putting it all together

In [None]:
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

sequence = "I've been waiting for a HuggingFace course my whole life."

# model_inputs contains everything a model needs to operate well;
# for DistilBERT that is the input IDs and the attention mask.
model_inputs = tokenizer(sequence)

print(model_inputs)

## Handling multiple sequences

In [None]:
# The tokenizer also accepts a list of sentences and encodes each one.
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "So have I!",
]
model_inputs = tokenizer(sequences)
print(model_inputs)

## padding

In [None]:
# Each call below overwrites model_inputs; they illustrate three padding modes.

# Pad the sequences up to the length of the longest sequence in the batch.
model_inputs = tokenizer(sequences, padding="longest")

# Pad the sequences up to the model max length (512 for BERT or DistilBERT).
model_inputs = tokenizer(sequences, padding="max_length")

# Pad the sequences up to the specified max length.
model_inputs = tokenizer(sequences, max_length=8, padding="max_length")

## truncating 

In [None]:
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "So have I!",
]

# Truncate the sequences that are longer than the model max length.
model_inputs = tokenizer(sequences, truncation=True)

# Truncate the sequences that are longer than the specified max length.
model_inputs = tokenizer(sequences, truncation=True, max_length=8)


## return tensors from different frameworks

In [None]:
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "So have I!",
]

# Same batch, padded, returned as PyTorch tensors.
model_inputs = tokenizer(sequences, return_tensors="pt", padding=True)
print(model_inputs)

# Same batch, padded, returned as NumPy arrays.
model_inputs = tokenizer(sequences, return_tensors="np", padding=True)
print(model_inputs)

## special tokens

### single sentence

In [None]:
sequence = "I've been waiting for a HuggingFace course my whole life."

# Route 1: call the tokenizer directly and read the IDs it produced.
model_inputs = tokenizer(sequence)
input_ids = model_inputs["input_ids"]
print(input_ids)

# Route 2: tokenize by hand, then convert the tokens to IDs, so the two ID
# lists can be compared.
tokens = tokenizer.tokenize(sequence)
print(tokens)
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)



In [None]:
# Decode both ID lists to compare them: input_ids came from calling the
# tokenizer directly, ids from the manual tokenize + convert_tokens_to_ids route.
print(tokenizer.decode(input_ids))
print(tokenizer.decode(ids))

### multiple sentences

In [None]:
# NOTE(review): tokenize() is given the whole list `sequences` here rather than a
# single string - confirm the tokenizer in use supports that; elsewhere in this
# notebook batches are handled by calling the tokenizer itself on the list.
tokens = tokenizer.tokenize(sequences)
print(tokens)

# Map the resulting tokens to vocabulary IDs.
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

In [None]:

# Decode the most recently computed `ids` back into text.
print(tokenizer.decode(ids))

In [None]:
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "So have I!",
]
model_inputs = tokenizer(
    sequences,
    return_tensors="pt",
    truncation=True,
    padding=True,
)
input_ids = model_inputs["input_ids"]
print(input_ids)

# input_ids is a 2D tensor (batch_size x sequence_length).
# tokenizer.decode() expects a single 1D sequence of token IDs, so
# tokenizer.batch_decode() is used to decode multiple sequences (a batch).
print(tokenizer.batch_decode(input_ids))

## Wrapping up: from tokenizer to model

In [58]:
# Fully specified model package hosted by Hugging Face.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"

# Loads the tokenizer (vocabulary, tokenization rules, and special tokens) that
# exactly matches the model checkpoint, ensuring text is converted into the
# token IDs expected by the model's embedding layer.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Loads a pretrained transformer model along with a task-specific sequence
# classification head, restoring all learned weights so the model can map token
# sequences to classification logits.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "So have I!",
]

tokens = tokenizer(
    sequences,
    return_tensors="pt",
    truncation=True,
    padding=True,
)

# The **tokens syntax unpacks the dictionary so the model receives the
# keyword arguments it expects.
output = model(**tokens)
print(output)

SequenceClassifierOutput(loss=None, logits=tensor([[-1.5607,  1.6123],
        [-3.6183,  3.9137]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


: 