### 为什么 Transformer 中 decoder-only 架构的 LLM 建议使用左填充(left-padding):一个示例

In [3]:
from transformers import BertModel ,BertTokenizer
from transformers import LlamaForCausalLM ,LlamaTokenizer

In [4]:
# Build a batch of two sentences with different lengths, so that tokenizing
# them together requires padding the shorter one.
short_sentence = "I want to go to space"
long_sentence = "I'm going to Greece for my holiday to see the beauty"
input_text = [short_sentence, long_sentence]

##### 之前 encoder-only 模型的填充一般是right-padding

In [16]:
# Encoder-only models have traditionally used right-padding.
# Load the BERT tokenizer and model from the same local checkpoint directory,
# then print which side the tokenizer pads on.
bert_checkpoint = "./link_model/bert-base-uncased/"
bert_tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
bert = BertModel.from_pretrained(bert_checkpoint)
print(bert_tokenizer.padding_side)

right


In [6]:
# Tokenize both sentences with the BERT tokenizer, padding to the longest
# sentence in the batch, and print the token ids of each row.
tokens = bert_tokenizer(input_text, padding="longest", return_tensors="pt")
print(tokens.input_ids[0], tokens.input_ids[1], sep="\n")

tensor([ 101, 1045, 2215, 2000, 2175, 2000, 2686,  102,    0,    0,    0,    0,
           0,    0,    0])
tensor([ 101, 1045, 1005, 1049, 2183, 2000, 5483, 2005, 2026, 6209, 2000, 2156,
        1996, 5053,  102])


In [7]:
# Run the padded batch through BERT, passing the tokenizer's attention mask
# alongside the token ids. return_dict=False makes the call return a plain
# tuple, which is displayed as the cell output.
bert(
    input_ids=tokens.input_ids,
    attention_mask=tokens.attention_mask,
    return_dict=False,
)

(tensor([[[ 0.1180,  0.3472,  0.0159,  ..., -0.1407,  0.3505,  0.3096],
          [ 0.2318,  0.4219,  0.1071,  ..., -0.0161,  0.5884,  0.1275],
          [ 0.3912,  0.2970,  1.1053,  ...,  0.0593, -0.0361, -0.0474],
          ...,
          [ 0.0150,  0.0878,  0.1920,  ...,  0.2756,  0.0966,  0.0368],
          [ 0.3867,  0.3596,  0.2187,  ...,  0.1566, -0.0414,  0.1783],
          [ 0.2775,  0.3966,  0.1957,  ...,  0.1723, -0.0351,  0.2319]],
 
         [[ 0.1144,  0.1349, -0.0943,  ..., -0.3948,  0.2984,  0.3757],
          [ 0.3793, -0.0288,  0.0120,  ..., -0.2841,  0.3794,  0.4573],
          [ 0.6562,  0.2536, -0.0305,  ..., -0.4044, -0.7741, -0.2387],
          ...,
          [ 0.0656, -0.2254,  0.5395,  ..., -0.5768,  0.1196, -0.1314],
          [ 0.2855, -0.5172,  0.2094,  ...,  0.0477,  0.3329, -0.1484],
          [ 0.6014,  0.2732,  0.0175,  ..., -0.1611, -0.7703, -0.3335]]],
        grad_fn=<NativeLayerNormBackward0>),
 tensor([[-0.8469, -0.2697,  0.1834,  ...,  0.2944, -0.6

##### decoder-only 使用右填充会出现的问题,以及得到的警告

In [17]:
# Load the Llama 2 tokenizer and causal-LM model from the local checkpoint.
# `trust_remote_code` is intentionally not passed: it is only meaningful for the
# Auto* classes, and on LlamaForCausalLM it is ignored and merely emits a warning.
llama_tokenzier = LlamaTokenizer.from_pretrained("./link_model/llama2-7b-hf/")
llama_model = LlamaForCausalLM.from_pretrained("./link_model/llama2-7b-hf/")
# Reuse the EOS token for padding (presumably the checkpoint defines no pad
# token of its own — TODO confirm).
llama_tokenzier.pad_token = llama_tokenzier.eos_token
# Show which side this tokenizer pads on.
print(llama_tokenzier.padding_side)

The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.20it/s]

right





In [18]:
# Tokenize the same two sentences with the Llama tokenizer, padding to the
# longest sentence, and print the token ids of each row.
tokens = llama_tokenzier(input_text, padding="longest", return_tensors="pt")
print(tokens.input_ids[0], tokens.input_ids[1], sep="\n")

tensor([   1,  306,  864,  304,  748,  304, 2913,    2,    2,    2,    2,    2,
           2,    2,    2])
tensor([    1,   306, 29915, 29885,  2675,   304, 25549,   363,   590,  8753,
        22394,   304,  1074,   278, 15409])


In [19]:
# Generate continuations for the batch tokenized in the previous cell. That
# batch is right-padded on purpose, to reproduce the padding-side warning.
output = llama_model.generate(
    **tokens,
    pad_token_id=llama_tokenzier.pad_token_id,
)
# Decode to plain text with special tokens stripped, and display the result.
res_text = llama_tokenzier.batch_decode(output, skip_special_tokens=True)
res_text

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


['I want to go to space nobody wants to go',
 "I'm going to Greece for my holiday to see the beauty of the country and to"]

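##### 使用 left-padding 可以消除上述警告,以及 left-padding 的优势

decoder-only 模型是从输入序列的最后一个位置开始续写的:右填充时,较短句子的末尾是 pad token,模型会接着 pad token 生成;左填充时,每个句子的真实文本都紧邻生成位置,因此续写结果更合理。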

In [20]:
# Reload the Llama tokenizer, this time asking it to pad on the left, and
# again reuse the EOS token as the pad token.
llama_tokenzier = LlamaTokenizer.from_pretrained(
    "./link_model/llama2-7b-hf/",
    padding_side="left",
)
llama_tokenzier.pad_token = llama_tokenzier.eos_token
print(llama_tokenzier.padding_side)

# Re-tokenize the same sentences with the left-padding tokenizer and print
# the token ids of each row.
tokens = llama_tokenzier(input_text, padding="longest", return_tensors="pt")
print(tokens.input_ids[0], tokens.input_ids[1], sep="\n")

left
tensor([   2,    2,    2,    2,    2,    2,    2,    2,    1,  306,  864,  304,
         748,  304, 2913])
tensor([    1,   306, 29915, 29885,  2675,   304, 25549,   363,   590,  8753,
        22394,   304,  1074,   278, 15409])


In [21]:
# Generate continuations for the batch tokenized in the previous cell
# (left-padded).
output = llama_model.generate(
    **tokens,
    pad_token_id=llama_tokenzier.pad_token_id,
)
# Special tokens are kept when decoding, so the padding added by the tokenizer
# stays visible in the displayed text.
res_text = llama_tokenzier.batch_decode(output)
res_text

['</s></s></s></s></s></s></s></s><s>I want to go to space. I want to go',
 "<s>I'm going to Greece for my holiday to see the beauty of the country and to"]