# Behind the pipeline

In [None]:
from transformers import pipeline 

In [None]:
classifer = pipeline("sentiment-analysis")
classifer (
    [
        "I've been waiting for a HuggingFace course my whole life.",
        "I hate this so much!",
    ]
)

In [None]:
from transformers import AutoTokenizer
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


"""
distilbert

BERT-family model

Encoder-only

base

Hidden size ~768

uncased

Lowercases input

Vocabulary does not preserve capitalization

finetuned-sst-2

Classification head trained on sentiment

english

Language assumptions baked into tokenizer
"""


In [None]:
raw_inputs = [
        "I've been waiting for a HuggingFace course my whole life.",
        "I hate this so much!",
]
inputs = tokenizer(raw_inputs, padding = True, truncation = True, return_tensors= "pt")
print(inputs)

# Models

In [None]:
from transformers import AutoModel

model = AutoModel.from_pretrained("bert-base-cased")

In [None]:
from transformers import BertModel

model = BertModel.from_pretrained("bert-base-cased") #defiens the architecture directly instead of using AutoModel

## Loading and saving 

In [None]:
model.save_pretrained("directory_on_my_computer") # save the model's weights and architecture configuration
# save 2 files config.json: contain all the necessary attributes needed to build the model architecture, also contain the metadata
#ytorch_model.safetensors: state dictionary, contains all the model's weights

In [None]:
# to use saved model:
from transformers import AutoModel
model = AutoModel.from_pretrained("directory_on_my_computer")


## logging to huggin face from a notebook

In [None]:
from huggingface_hub import notebook_login

notebook_login()

## push model to the hub

In [None]:
model.push_to_hub("my-awesome-model")

In [None]:
# load the model
from transformers import AutoModel

model = AutoModel.from_pretrained("your-username/my-awesome-model")

## Encoding text

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

encoded_input = tokenizer("Hello, this is a single sentence!")
print(encoded_input)
"""
the output dictionary:
input_ids: numerical representaion of the tokens
token_type_ids: tells the model which part of the input is sentence A and which is sentence BaseException
attention_musk: indicates which tokens should be attended to and whihc should not
"""

{'input_ids': [101, 8667, 117, 1142, 1110, 170, 1423, 5650, 106, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [None]:
# we can decode the input IDs to get back the original text:
tokenizer.decode(encoded_input["input_ids"]) 
#[CLS] and [SEP] are special tokens add by the tokenizer as they aiter required by the model

'[CLS] Hello, this is a single sentence! [SEP]'

### encode multiple sentences at once

In [14]:
encoded_input = tokenizer (["How are you?", "I'm fine, thank you!"])
print(encoded_input)

{'input_ids': [[101, 1731, 1132, 1128, 136, 102], [101, 146, 112, 182, 2503, 117, 6243, 1128, 106, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}


In [17]:
encoded_input = tokenizer ("How are you?", "I'm fine, thank you!", return_tensors="pt")#return pytorch array
print(encoded_input)

{'input_ids': tensor([[ 101, 1731, 1132, 1128,  136,  102,  146,  112,  182, 2503,  117, 6243,
         1128,  106,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [12]:
encoded_input = tokenizer (["How are you?", "I'm fine, thank you!"], return_tensors="np")#return numpy array
print(encoded_input)

{'input_ids': array([array([ 101, 1731, 1132, 1128,  136,  102]),
       array([ 101,  146,  112,  182, 2503,  117, 6243, 1128,  106,  102])],
      dtype=object), 'token_type_ids': array([array([0, 0, 0, 0, 0, 0]), array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])],
      dtype=object), 'attention_mask': array([array([1, 1, 1, 1, 1, 1]), array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])],
      dtype=object)}


### Padding inputs

In [13]:
# the two lists dont have the same length and the arrays and tensors need to be rectangular
# we solve this problem by pad the inputs that will make all sentences the same length by adding special tokenizer
encoded_input = tokenizer (["How are you?", "I'm fine, thank you!"], return_tensors="pt", padding=True)#return pytorch array
print(encoded_input)

{'input_ids': tensor([[ 101, 1731, 1132, 1128,  136,  102,    0,    0,    0,    0],
        [ 101,  146,  112,  182, 2503,  117, 6243, 1128,  106,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


### truncating inputs

In [11]:
# truncation parameter used to make BERT model able to process more tokens than 512 token 
encoded_input = tokenizer(
    "This is a very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very long sentence.",
    truncation=True,
)
print(encoded_input["input_ids"])

[101, 1188, 1110, 170, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1263, 5650, 119, 102]


In [18]:
# combining padding and trancation arguments can make the tensors have the exact needed size
encoded_input = tokenizer (["How are you?", "I'm fine, thank you!"], 
                           return_tensors="pt", 
                           padding=True,
                           max_length = 5,
                           )#return pytorch array
print(encoded_input)

{'input_ids': tensor([[ 101, 1731, 1132, 1128,  136,  102,    0,    0,    0,    0],
        [ 101,  146,  112,  182, 2503,  117, 6243, 1128,  106,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}




### adding special tokens

In [None]:
# special tokens are added in BERT to better represent the sentnece boundaries
# such as the beginnig of the sentece ([CLS]) or separator between sentences ([SEP])
encoded_input = tokenizer ("How are you")
print (encoded_input["input_ids"])
tokenizer.decode(encoded_input["input_ids"])
# not al models need special tokens

[101, 1731, 1132, 1128, 102]


'[CLS] How are you [SEP]'