In [None]:
'''
https://github.com/huggingface/transformers/blob/main/src/transformers/generation_utils.py
the repo have model.generate() func
and the func can tell us how to use model and beam-search
'''

In [1]:
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM

In [2]:
model_name = 'Helsinki-NLP/opus-mt-en-zh'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [3]:
sentences = ["The house is wonderful.", "I like to work in u.6v"]
inputs = tokenizer(sentences, return_tensors="pt", padding=True)

output_sequences = model.generate(input_ids=inputs["input_ids"],
                                  attention_mask=inputs["attention_mask"],
                                  do_sample=False)
                                  # disable sampling to test if batching affects output
tokenizer.batch_decode(output_sequences, skip_special_tokens=True)

['这房子是美妙的。', '我喜欢在e. 6v工作']

In [4]:
output_sequences

tensor([[65000,   789, 10095,    72, 37908,    12,    10,     0, 65000, 65000],
        [65000, 13565,    38,   179,     6,     8,   135,  4068,   198,     0]])

In [5]:
inputs

{'input_ids': tensor([[   26,  2793,    32, 15099,     6,     0, 65000, 65000, 65000, 65000],
        [   28,   240,     9,   119,    11,     8,  1508,  2157,  4068,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [33]:
data = model.model.encoder(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])

In [34]:
data

BaseModelOutput(last_hidden_state=tensor([[[-9.2812e-02,  3.6446e-01,  9.4637e-02,  ..., -7.0080e-01,
           6.8105e-04, -6.1914e-03],
         [-2.1327e-01,  1.3161e-01,  1.9674e-01,  ..., -7.9506e-01,
          -5.4278e-01, -1.4929e-01],
         [ 3.3304e-02, -1.3072e-01,  4.9979e-01,  ..., -2.1106e-01,
          -3.3489e-01, -1.5482e-01],
         ...,
         [ 1.8291e-01,  2.7879e-02,  2.5640e-02,  ..., -5.3971e-02,
          -6.1518e-02,  1.7738e-02],
         [ 2.2472e-01,  1.8369e-02,  5.8589e-02,  ..., -8.1724e-02,
          -8.7148e-03,  6.4150e-02],
         [ 1.6981e-01,  5.1836e-02,  1.9128e-01,  ..., -1.6713e-01,
           1.7896e-01,  1.8652e-01]],

        [[-1.0097e+00, -6.3492e-01,  2.1529e-01,  ..., -4.3858e-01,
          -2.5724e-01, -1.2957e-01],
         [-9.8106e-01, -2.2376e-01,  6.3795e-02,  ...,  2.9150e-01,
          -3.7196e-02, -5.3129e-01],
         [-5.5728e-02, -4.7423e-01,  8.9754e-02,  ...,  3.9865e-01,
           5.9542e-02, -1.9249e-01],
     

In [20]:
data['last_hidden_state'].shape

torch.Size([2, 10, 512])

In [23]:
len(data['past_key_values'])

6

In [27]:
data['past_key_values'][0][0].shape

torch.Size([2, 8, 10, 64])

In [37]:
input_ids

tensor([[13959,  1566,    12,  2968,    10,    37,   629,    19,  1627,     5,
             1]])

In [35]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
model = T5ForConditionalGeneration.from_pretrained("t5-small")
tokenizer = T5Tokenizer.from_pretrained("t5-small")

input_ids = tokenizer('translate English to German: The house is wonderful.', return_tensors='pt').input_ids
labels = tokenizer('Das Haus ist wunderbar.', return_tensors='pt').input_ids
# the forward function automatically creates the correct decoder_input_ids
loss = model(input_ids=input_ids, labels=labels).loss

In [36]:
loss

tensor(0.2542, grad_fn=<NllLossBackward>)

In [40]:
model_name = 'Helsinki-NLP/opus-mt-en-zh'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [121]:
'''
https://juejin.cn/post/7069225910427189256
'''
sentences = ["The house is wonderful.", "I like to work in u.6v"]
label_seq = ['这房子是美妙的。', '我喜欢在e. 6v工作']
inputs = tokenizer(sentences, return_tensors="pt", padding=True)['input_ids']
label_inputs = tokenizer(label_seq, return_tensors="pt", padding=True)['input_ids']
#
loss = model(input_ids=inputs, labels=label_inputs).loss

In [122]:
loss

tensor(3.7461, grad_fn=<NllLossBackward>)

In [125]:
model(input_ids=inputs, labels=label_inputs, output_attentions=True, output_hidden_states=True, return_dict=True).keys()

odict_keys(['loss', 'logits', 'decoder_hidden_states', 'decoder_attentions', 'cross_attentions', 'encoder_last_hidden_state', 'encoder_hidden_states', 'encoder_attentions'])

In [52]:
model(input_ids=inputs, labels=label_inputs)['encoder_last_hidden_state'].shape

torch.Size([2, 10, 512])

In [48]:
inputs

tensor([[   26,  2793,    32, 15099,     6,     0, 65000, 65000, 65000, 65000],
        [   28,   240,     9,   119,    11,     8,  1508,  2157,  4068,     0]])

In [53]:
model.model(input_ids=input_ids)

ValueError: You have to specify either decoder_input_ids or decoder_inputs_embeds

In [54]:
input_ids

tensor([[13959,  1566,    12,  2968,    10,    37,   629,    19,  1627,     5,
             1]])

In [61]:
model.get_decoder()(input_ids)['last_hidden_state'].shape

torch.Size([1, 11, 512])

In [90]:
sentences = ["The house is wonderful.", "I like to work in u.6v"]
inputs = tokenizer(sentences, return_tensors="pt", padding=True)

output_sequences = model.generate(input_ids=inputs["input_ids"],
                                  attention_mask=inputs["attention_mask"],
                                  do_sample=False,
                                  output_attentions=True,
                                  output_hidden_states=True,
                                  output_scores=True,
                                  return_dict_in_generate=True)
                                  # disable sampling to test if batching affects output

In [91]:
output_sequences.keys()

odict_keys(['sequences', 'sequences_scores', 'scores', 'beam_indices', 'encoder_attentions', 'encoder_hidden_states', 'decoder_attentions', 'cross_attentions', 'decoder_hidden_states'])

In [113]:
output_sequences['scores'][0]

tensor([[ -9.8833, -11.5677, -10.2921,  ..., -13.6133, -13.6209,     -inf],
        [ -9.8833, -11.5677, -10.2921,  ..., -13.6133, -13.6209,     -inf],
        [ -9.8833, -11.5677, -10.2921,  ..., -13.6133, -13.6209,     -inf],
        ...,
        [ -8.3813, -11.7055, -12.6086,  ..., -13.9708, -13.9674,     -inf],
        [ -8.3813, -11.7055, -12.6086,  ..., -13.9708, -13.9674,     -inf],
        [ -8.3813, -11.7055, -12.6086,  ..., -13.9708, -13.9674,     -inf]])

In [117]:
output_sequences['scores'][1]

tensor([[ -9.3673, -10.1171,  -9.7802,  ..., -14.1245, -14.1317,     -inf],
        [ -9.6910,  -7.1989,  -8.5523,  ..., -13.3171, -13.3654,     -inf],
        [ -9.1201,  -9.9758,  -9.3361,  ..., -13.8836, -13.9023,     -inf],
        ...,
        [ -6.2548,  -8.0736,  -7.1292,  ..., -12.8266, -12.8382,     -inf],
        [ -8.3927,  -8.9225,  -7.3705,  ..., -13.3349, -13.3512,     -inf],
        [ -7.7695,  -5.3762,  -6.2260,  ..., -12.9911, -13.0367,     -inf]])

In [118]:
output_sequences['beam_indices']

((tensor(0),
  tensor(1),
  tensor(1),
  tensor(2),
  tensor(1),
  tensor(1),
  tensor(1),
  tensor(0),
  tensor(0),
  tensor(0),
  tensor(0),
  tensor(0),
  tensor(3),
  tensor(1)),
 (tensor(0),
  tensor(1),
  tensor(1),
  tensor(2),
  tensor(1),
  tensor(1),
  tensor(1),
  tensor(0),
  tensor(0),
  tensor(0),
  tensor(2),
  tensor(1),
  tensor(0),
  tensor(0)))

In [115]:
output_sequences['sequences']

tensor([[65000,   789, 10095,    72, 37908,    12,    10,     0, 65000, 65000],
        [65000, 13565,    38,   179,     6,     8,   135,  4068,   198,     0]])

In [92]:
inputs

{'input_ids': tensor([[   26,  2793,    32, 15099,     6,     0, 65000, 65000, 65000, 65000],
        [   28,   240,     9,   119,    11,     8,  1508,  2157,  4068,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [None]:
'''
we apply auto function as model
the model has two ways.
first, it is for training, second, it is for inference, evaluation, experiment.
odict_keys(['loss', 
            'logits', 
            'decoder_hidden_states', 
            'decoder_attentions', 
            'cross_attentions', 
            'encoder_last_hidden_state', 
            'encoder_hidden_states', 
            'encoder_attentions'])

'''