### Using Bert 

In [2]:
from transformers import BertConfig, BertModel

# Create a configuration object; BertConfig() is called with no arguments,
# so it holds the library defaults (printed in the next cell).
config = BertConfig()

# Build the model from the config. No checkpoint is loaded in this cell.
model = BertModel(config)

In [3]:
# Show the values stored in the configuration object created above.
print(config)

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



### Load a pretrained model

In [4]:
from transformers import BertModel 
# Load the "bert-base-cased" checkpoint with .from_pretrained(...).
# The warning printed below lists the `cls.*` checkpoint weights that
# BertModel does not use.
model = BertModel.from_pretrained("bert-base-cased")


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Using BERT with the fill-mask pipeline

In [5]:
from transformers import pipeline
# Placeholder token that the fill-mask pipeline is asked to predict.
mask = "[MASK]"
# Fill-mask pipeline backed by the bert-base-cased checkpoint.
unmasker = pipeline("fill-mask", model="bert-base-cased")
# Insert the placeholder into the sentence, then display the candidate
# completions returned by the pipeline.
masked_sentence = "Hello i am {} cool man.".format(mask)
unmasker(masked_sentence)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'score': 0.6979667544364929,
  'token': 170,
  'token_str': 'a',
  'sequence': 'Hello i am a cool man.'},
 {'score': 0.043818388134241104,
  'token': 1103,
  'token_str': 'the',
  'sequence': 'Hello i am the cool man.'},
 {'score': 0.03237597271800041,
  'token': 1177,
  'token_str': 'so',
  'sequence': 'Hello i am so cool man.'},
 {'score': 0.023433903232216835,
  'token': 117,
  'token_str': ',',
  'sequence': 'Hello i am, cool man.'},
 {'score': 0.01700587011873722,
  'token': 1304,
  'token_str': 'very',
  'sequence': 'Hello i am very cool man.'}]

#### Extract features

In [6]:
from transformers import BertTokenizer, BertModel
# Tokenizer and model are loaded from the same "bert-base-cased" checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = BertModel.from_pretrained("bert-base-cased")
# Encode a sample sentence and run it through the model to get its output.
text = "I love bert"
# return_tensors='pt' asks the tokenizer for tensors instead of plain lists.
encoded_input = tokenizer(text, return_tensors='pt')
# `**` unpacks the tokenizer result into keyword arguments for the model.
output = model(**encoded_input)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
# Reuses `unmasker` and `mask` from the fill-mask cell above; here the masked
# token is the last word of the sentence.
unmasker('I love {}'.format(mask))

[{'score': 0.7751027345657349,
  'token': 119,
  'token_str': '.',
  'sequence': 'I love.'},
 {'score': 0.08472345769405365,
  'token': 1128,
  'token_str': 'you',
  'sequence': 'I love you'},
 {'score': 0.06730950623750687,
  'token': 106,
  'token_str': '!',
  'sequence': 'I love!'},
 {'score': 0.024934297427535057,
  'token': 132,
  'token_str': ';',
  'sequence': 'I love ;'},
 {'score': 0.009460309520363808,
  'token': 136,
  'token_str': '?',
  'sequence': 'I love?'}]

## Tokenizer

In [8]:
# Word-based tokenization: split the sentence on whitespace so that every
# word becomes one token.
tokenizer_text = "Hi how are you I am Charlie".split()
print(tokenizer_text)
# Map each distinct word to an id. dict.fromkeys keeps first-seen order and
# drops repeats, so ids stay contiguous (0..n-1) even if a word occurs twice.
# Enumerating the raw token list would give a repeated word the index of its
# last occurrence and leave gaps in the id range.
token_dict = {word: idx for idx, word in enumerate(dict.fromkeys(tokenizer_text))}
print(token_dict)

# Cons: the vocabulary becomes very large, and the model does not know that
# "dog" and "dogs" are related because they receive unrelated ids.


['Hi', 'how', 'are', 'you', 'I', 'am', 'Charlie']
{'Hi': 0, 'how': 1, 'are': 2, 'you': 3, 'I': 4, 'am': 5, 'Charlie': 6}


In [9]:
# Character-based tokenization: every single character, spaces included,
# is its own token.
tokenizer_text = "Hi how are you I am Charlie"
character = [ch for ch in tokenizer_text]
print(character)
# Vocabulary: the distinct characters in sorted order, numbered from 0.
vocab_chars = sorted(set(character))
token_dict_cha = dict(zip(vocab_chars, range(len(vocab_chars))))
print(token_dict_cha)

# Cons: a single character carries little meaning on its own.

['H', 'i', ' ', 'h', 'o', 'w', ' ', 'a', 'r', 'e', ' ', 'y', 'o', 'u', ' ', 'I', ' ', 'a', 'm', ' ', 'C', 'h', 'a', 'r', 'l', 'i', 'e']
{' ': 0, 'C': 1, 'H': 2, 'I': 3, 'a': 4, 'e': 5, 'h': 6, 'i': 7, 'l': 8, 'm': 9, 'o': 10, 'r': 11, 'u': 12, 'w': 13, 'y': 14}


In [15]:
from transformers import AutoTokenizer
# Load the tokenizer that belongs to the "bert-base-cased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
seq = "Hi I am Charlie"
# Calling the tokenizer on the text displays input_ids, token_type_ids and
# attention_mask.
tokenizer(seq)
# In input_ids, 101 marks the start and 102 the end of the sequence
# (presumably the [CLS] and [SEP] special tokens -- confirm against the vocabulary).

{'input_ids': [101, 8790, 146, 1821, 4117, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1]}

### Encoding (text → tokens → ids)

In [14]:
# How the ids are produced, in two explicit steps: split the text into
# tokens, then convert every token to its id.
tokens = tokenizer.tokenize(seq)
ids = tokenizer.convert_tokens_to_ids(tokens)
print(tokens)
print(ids)

['Hi', 'I', 'am', 'Charlie']
[8790, 146, 1821, 4117]


### Decoding (ids → text)

In [22]:

# Turn the ids from the previous cell back into a string.
text = tokenizer.decode(ids)
print(text)

Hi I am Charlie


In [24]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch 

# Load the tokenizer from the same checkpoint as the model so that both
# agree on the token ids.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

seq = "I've have waiting for a hugging face course my whole life"

tokens = tokenizer.tokenize(seq)
ids = tokenizer.convert_tokens_to_ids(tokens)
# 1-D tensor: a single sequence of ids with no batch dimension.
inputs_ids = torch.tensor(ids)

# Expected failure: passing the 1-D tensor to the model raises IndexError.
# The error is the point of this cell, so it is caught and shown rather than
# left uncaught, which would stop a "Restart & Run All" at this cell.
try:
    model(inputs_ids)
except IndexError as err:
    print(f"{type(err).__name__}: {err}")

IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)

In [25]:
# Inspect the tensor that was passed to the model: it is 1-D, i.e. it has no
# batch dimension.
print(inputs_ids)

tensor([ 1045,  1005,  2310,  2031,  3403,  2005,  1037, 17662,  2227,  2607,
         2026,  2878,  2166])


In [28]:
# What calling the tokenizer directly on the text produces.
tokenized_input_ids = tokenizer(seq, return_tensors="pt")
print(tokenized_input_ids["input_ids"])
# The printed tensor is 2-D (one row for the sequence). Compared with the
# manually converted ids it also has 101 at the start and 102 at the end.

tensor([[  101,  1045,  1005,  2310,  2031,  3403,  2005,  1037, 17662,  2227,
          2607,  2026,  2878,  2166,   102]])


In [32]:
# Working version: give the ids a leading batch dimension before calling
# the model.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

tokens = tokenizer.tokenize(seq)
ids = tokenizer.convert_tokens_to_ids(tokens)
# unsqueeze(0) turns the 1-D id tensor into a batch holding one sequence.
# These ids come from tokenize()/convert_tokens_to_ids(), so they do not
# include the 101/102 ids that calling the tokenizer directly adds.
input_id = torch.tensor(ids).unsqueeze(0)
print('Input id : ', input_id)
output = model(input_id)
print('Model output : ',output.logits)

Input id :  tensor([[ 1045,  1005,  2310,  2031,  3403,  2005,  1037, 17662,  2227,  2607,
          2026,  2878,  2166]])
Model output :  tensor([[-3.4708,  3.7444]], grad_fn=<AddmmBackward0>)


#### Padding

In [35]:
# Reload the classifier from the shared checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# Two toy sequences of different lengths, each wrapped as a batch of one.
seq1_ids = [[200, 200, 200]]
seq2_ids = [[200, 200]]
# To feed both in a single batch the rows must have equal length, so the
# shorter one is extended with the tokenizer's pad token id.
padded_seq2 = seq2_ids[0] + [tokenizer.pad_token_id]
batched_ids = [list(seq1_ids[0]), padded_seq2]

# Logits for each sequence alone, then for the padded batch.
for id_rows in (seq1_ids, seq2_ids, batched_ids):
    print(model(torch.tensor(id_rows)).logits)
# The second row of the batched logits differs from seq2 on its own: no
# attention mask is passed here, so the padding token is treated like any
# other token and changes the result.

tensor([[ 1.5694, -1.3895]], grad_fn=<AddmmBackward0>)
tensor([[ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)
tensor([[ 1.5694, -1.3895],
        [ 1.3374, -1.2163]], grad_fn=<AddmmBackward0>)


#### Attention Mask

In [36]:
# Same padded batch as in the padding example.
batched_ids = [
    [200, 200, 200],
    [200, 200, tokenizer.pad_token_id],
]

# One entry per token: 1 for the real tokens, 0 at the padded position.
attention_mask = [
    [1, 1, 1],
    [1, 1, 0],
]

ids_tensor = torch.tensor(batched_ids)
mask_tensor = torch.tensor(attention_mask)
# With the mask supplied, the second row's logits match those of the
# unpadded two-token sequence run on its own.
output = model(ids_tensor, attention_mask=mask_tensor)
print(output.logits)

tensor([[ 1.5694, -1.3895],
        [ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)


### Long sequences

In [None]:
# Most models accept at most 512 or 1024 tokens per sequence.
# Models built for longer inputs: Longformer, LED.

# **Real Case**

In [38]:
import torch 
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Tokenizer and model both come from the same sentiment-analysis checkpoint.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

seq = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]
# The tokenizer handles the whole batch in one call, with padding and
# truncation enabled and tensors returned.
encoded_batch = tokenizer(seq, padding=True, truncation=True, return_tensors="pt")
# `**` unpacks the tokenizer result into keyword arguments for the model.
output = model(**encoded_batch)
print(output.logits)

tensor([[-1.5607,  1.6123],
        [-3.6183,  3.9137]], grad_fn=<AddmmBackward0>)


https://blog.csdn.net/weixin_51130521/article/details/124191097 

The link above explains the purpose of `*` and `**` (argument unpacking) in Python.
