In [3]:
from opencc import OpenCC
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoTokenizer

In [4]:
# Hub checkpoint used below for English -> Chinese translation.
model_name = "Helsinki-NLP/opus-mt-en-zh"

In [5]:
# Load the seq2seq weights and the matching tokenizer for the checkpoint named
# in `model_name` (the two loads are independent of each other).
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [12]:
# Small English batch to translate. The batch is padded and returned as
# PyTorch tensors so it can be fed to the model in one call.
sentences = [
    "The house is wonderful.",
    "I like to work in u.6v ",
    "My favorite food is instant noodle.",
]
inputs = tokenizer(sentences, padding=True, return_tensors="pt")

In [13]:
# inference
# Request a structured result that also carries attentions, hidden states and
# scores, not only the generated token ids.
generation_options = dict(
    do_sample=False,
    output_attentions=True,
    output_hidden_states=True,
    output_scores=True,
    return_dict_in_generate=True,
)
output_sequences = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    **generation_options,
)

In [14]:
# List the fields returned by generate() when return_dict_in_generate=True.
output_sequences.keys()

odict_keys(['sequences', 'sequences_scores', 'scores', 'beam_indices', 'encoder_attentions', 'encoder_hidden_states', 'decoder_attentions', 'cross_attentions', 'decoder_hidden_states'])

In [15]:
# Generated token ids, one row per input sentence; shown as the cell output.
sequences = output_sequences["sequences"]
sequences

tensor([[65000,   789, 10095,    72, 37908,    12,    10,     0, 65000, 65000],
        [65000, 13565,    38,   179,     6,     8,   135,  4068,   198,     0],
        [65000,   105, 42279,  6448,    72,   524,  6357,  2162,     0, 65000]])

In [16]:
# Decode every generated row back to text, leaving special tokens out of the
# resulting strings.
decode_options = {"skip_special_tokens": True}
translated_sents = tokenizer.batch_decode(sequences, **decode_options)
translated_sents

['这房子是美妙的。', '我喜欢在e. 6v工作', '我最喜欢的食物是即食面']

In [17]:
# Convert the translations with OpenCC's 's2t' configuration
# (Simplified -> Traditional Chinese). `OpenCC` is imported in the notebook's
# top import cell so the dependency is visible before any work is done.
cc = OpenCC('s2t')
translated_sents = [cc.convert(sent) for sent in translated_sents]
translated_sents

['這房子是美妙的。', '我喜歡在e. 6v工作', '我最喜歡的食物是即食麪']

In [22]:
# training
sentences = ["The house is wonderful.", "I like to work in u.6v"]
label_seq = ['这房子是美妙的。', '我喜欢在e. 6v工作']
inputs = tokenizer(sentences, return_tensors="pt", padding=True)
label_inputs = tokenizer(label_seq, return_tensors="pt", padding=True)
training_info = model(input_ids=inputs["input_ids"], 
                      attention_mask=inputs["attention_mask"],
                      labels=label_inputs['input_ids'],
                      output_attentions=True, 
                      output_hidden_states=True, 
                      return_dict=True)

In [23]:
# Inspect which fields the forward pass returned.
training_info.keys()

odict_keys(['loss', 'logits', 'decoder_hidden_states', 'decoder_attentions', 'cross_attentions', 'encoder_last_hidden_state', 'encoder_hidden_states', 'encoder_attentions'])

In [24]:
# Take the loss from the forward pass, backpropagate it, then display its value.
loss = training_info["loss"]
loss.backward()
loss

tensor(3.8880, grad_fn=<NllLossBackward>)

In [18]:
# Switch to the checkpoint for the opposite direction: Chinese -> English.
model_name = "Helsinki-NLP/opus-mt-zh-en"

In [19]:
# Rebind `model` and `tokenizer` to the checkpoint currently named in
# `model_name`; from here on they no longer refer to the previously loaded pair.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/786k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/788k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.54M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/298M [00:00<?, ?B/s]

In [24]:
# Translate one Chinese sentence with the currently loaded model:
# tokenize -> generate -> decode.
sentences = ["我最喜欢的食物是即食面"]
inputs = tokenizer(sentences, padding=True, return_tensors="pt")

# inference
generation_options = dict(
    do_sample=False,
    output_attentions=True,
    output_hidden_states=True,
    output_scores=True,
    return_dict_in_generate=True,
)
output_sequences = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    **generation_options,
)

# Keep only the generated token ids and decode them without special tokens.
sequences = output_sequences["sequences"]
translated_sents = tokenizer.batch_decode(sequences, skip_special_tokens=True)

In [25]:
# Display the decoded translation of the Chinese input sentence.
translated_sents

['My favorite food is instant noodles.']