In [None]:
import pandas as pd
import numpy as np
from datasets import load_dataset
from evaluate import load
from transformers import AutoTokenizer
from transformers import DataCollatorForSeq2Seq
from transformers import TFAutoModelForSeq2SeqLM

from transformers import create_optimizer, AdamWeightDecay
import tensorflow.keras.callbacks as cb

In [None]:
# print(tf.__version__)
# print("Num GPUs Available", len(tf.config.experimental.list_physical_devices('GPU')))

2.10.0
Num GPUs Available 1


In [None]:
# !pip install datasets
# !pip install evaluate
# !pip install transformers
# !pip install sentencepiece
# !pip install sacrebleu

### Load the data, pretrained model & tokenizer

In [None]:
# load the data
dataset = load_dataset('iwslt2017', 'iwslt2017-zh-en')
data_train = dataset['train']
data_test = dataset['test']
data_val = dataset['validation']

Found cached dataset iwslt2017 (C:/Users/84619/.cache/huggingface/datasets/iwslt2017/iwslt2017-zh-en/1.0.0/03ce9110373117c6f6687719f49f269486a8cd49dcad2527993a316cd4b6ad49)


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Checkpoint id shared by the tokenizer and the TensorFlow seq2seq model.
model_checkpoint = "Helsinki-NLP/opus-mt-en-zh"
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_checkpoint),
    TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint),
)

All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at Helsinki-NLP/opus-mt-en-zh.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


### Inference Test

In [None]:
# Smoke-test the pretrained checkpoint on one training sentence.
# Index the row first: `data_train["translation"][i]` reads the entire
# translation column just to pick out a single example.
input_text = data_train[34657]["translation"]["en"]
print(input_text)
tokenized = tokenizer([input_text], return_tensors='np')
out = model.generate(**tokenized, max_length=128)

# `as_target_tokenizer()` is deprecated and only changes how text is *encoded*,
# so the generated ids can be decoded directly.
print(tokenizer.decode(out[0], skip_special_tokens=True))

Natasha wanted to introduce her brother and father to all the villagers, and the day we showed up turned out to be a 60-year-old man's birthday.
娜塔莎想把她的兄弟和父亲 介绍给所有的村民们, 那天我们来到, 成为了60岁男人的生日。


In [None]:
# Keys of each translation pair that `preprocessing` reads as source / target.
source_lang, target_lang = "en", "zh"

### Preprocess the data to match the input format of the pretrained model

In [None]:
def preprocessing(data):
    """Tokenize a batch of translation pairs into model inputs plus labels.

    Args:
        data: A batch whose "translation" entry is a sequence of dicts keyed by
            language code; the `source_lang` and `target_lang` keys are read.

    Returns:
        The tokenizer output for the source sentences, with a "labels" entry
        holding the token ids of the target sentences. Both sides are
        truncated to 128 tokens.
    """
    pairs = data["translation"]
    inputs = [pair[source_lang] for pair in pairs]
    targets = [pair[target_lang] for pair in pairs]
    # `text_target=` replaces the deprecated `as_target_tokenizer()` context:
    # the targets are tokenized in target mode and their ids are returned
    # under "labels" alongside the source encodings.
    model_inputs = tokenizer(
        inputs, text_target=targets, max_length=128, truncation=True
    )
    return model_inputs

In [None]:
# Tokenize every split with the same batched preprocessing function.
tokenized_data_train, tokenized_data_test, tokenized_data_val = (
    split.map(preprocessing, batched=True)
    for split in (data_train, data_test, data_val)
)

Loading cached processed dataset at C:\Users\84619\.cache\huggingface\datasets\iwslt2017\iwslt2017-zh-en\1.0.0\03ce9110373117c6f6687719f49f269486a8cd49dcad2527993a316cd4b6ad49\cache-0756fc51e69ee1ee.arrow
Loading cached processed dataset at C:\Users\84619\.cache\huggingface\datasets\iwslt2017\iwslt2017-zh-en\1.0.0\03ce9110373117c6f6687719f49f269486a8cd49dcad2527993a316cd4b6ad49\cache-a594758ccc168f00.arrow
Loading cached processed dataset at C:\Users\84619\.cache\huggingface\datasets\iwslt2017\iwslt2017-zh-en\1.0.0\03ce9110373117c6f6687719f49f269486a8cd49dcad2527993a316cd4b6ad49\cache-4ac34de171910c77.arrow


In [None]:
# Display the tokenized training split (its features and row count).
tokenized_data_train

Dataset({
    features: ['translation', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 231266
})

### Fine-tune the model

In [None]:
# Collator handed to `to_tf_dataset` below; configured to return TensorFlow tensors.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    return_tensors="tf",
)

In [None]:
# Training batches: shuffled, 8 examples each, assembled by the seq2seq collator.
tf_train_set = tokenized_data_train.to_tf_dataset(
    collate_fn=data_collator,
    batch_size=8,
    shuffle=True,
    columns=["attention_mask", "input_ids", "labels"],
)

# Validation batches: same columns and batch size, kept in dataset order.
tf_val_set = tokenized_data_val.to_tf_dataset(
    collate_fn=data_collator,
    batch_size=8,
    shuffle=False,
    columns=["attention_mask", "input_ids", "labels"],
)

# Early stopping with a patience of 3 epochs; the best weights are restored.
callback = cb.EarlyStopping(restore_best_weights=True, patience=3)

In [None]:
optimizer = AdamWeightDecay(weight_decay_rate=0.01, learning_rate=2e-5)
# compile() is called without a loss argument.
model.compile(optimizer=optimizer)
# Train for up to 10 epochs; the early-stopping callback may end the run sooner.
model.fit(
    x=tf_train_set,
    validation_data=tf_val_set,
    callbacks=[callback],
    epochs=10,
)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10


<keras.callbacks.History at 0x2ab6a611f70>

### Fine-tuned model saving and loading

In [None]:
# Persist the fine-tuned model. Raw string: in a regular literal "\s" is an
# invalid escape sequence (currently a warning, slated to become an error);
# the resulting path value is unchanged.
model.save_pretrained(r'Documents\saved_models')

In [None]:
# Reload the fine-tuned model from disk. Raw string avoids the invalid "\s"
# escape sequence while keeping the exact same path value.
model = TFAutoModelForSeq2SeqLM.from_pretrained(r'Documents\saved_models')

All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at Documents\saved_models.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


### Inference

In [None]:
# Load the fine-tuned model for inference. Raw string avoids the invalid "\s"
# escape sequence while keeping the exact same path value.
model = TFAutoModelForSeq2SeqLM.from_pretrained(r'Documents\saved_models')
# The tokenizer comes from the original pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-zh")

All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at Documents\saved_models.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


In [None]:
# load the data
dataset = load_dataset('iwslt2017', 'iwslt2017-zh-en')
data_train = dataset['train']
data_test = dataset['test']
data_val = dataset['validation']

Found cached dataset iwslt2017 (C:/Users/84619/.cache/huggingface/datasets/iwslt2017/iwslt2017-zh-en/1.0.0/03ce9110373117c6f6687719f49f269486a8cd49dcad2527993a316cd4b6ad49)


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
from datasets import Dataset

In [None]:
# Flatten the nested test-set translation pairs into one column per language.
ds = Dataset.from_dict(
    {
        lang: [pair[lang] for pair in data_test["translation"]]
        for lang in ("en", "zh")
    }
)

In [None]:
import time

In [None]:
def translate(data):
    """Translate one batch of English sentences with the loaded model.

    Written for use with `Dataset.map(..., batched=True)`.

    Args:
        data: A batch mapping whose 'en' entry is a list of English strings.

    Returns:
        The same `data` object with a "translated" entry added, holding one
        decoded translation per input sentence.
    """
    inputs = data['en']
    print('translating')
    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments.
    start = time.perf_counter()
    # Ask for TensorFlow tensors explicitly instead of handing plain Python
    # lists to `generate`.
    tokenized = tokenizer(
        inputs,
        max_length=128,
        truncation=True,
        padding=True,
        return_tensors="tf",
    )
    translated = model.generate(**tokenized, max_length=128)
    # Decoding does not need the deprecated `as_target_tokenizer()` context;
    # `batch_decode` handles the whole batch in one call.
    tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
    data["translated"] = tgt_text
    end = time.perf_counter()
    print("Time used: ", end-start)
    return data

In [None]:
# Run the batched translation helper over the test set, 32 sentences per call.
translated = ds.map(
    translate,
    batched=True,
    batch_size=32,
)



INFO:tensorflow:Assets written to: ram://583071ed-c17e-4392-91d6-6acbd5490748/assets


INFO:tensorflow:Assets written to: ram://583071ed-c17e-4392-91d6-6acbd5490748/assets


Map:   0%|          | 0/8549 [00:00<?, ? examples/s]

translating




Time used:  222.11679673194885
translating
Time used:  221.63260960578918
translating
Time used:  221.21576690673828
translating
Time used:  219.54789566993713
translating
Time used:  221.61147165298462
translating
Time used:  219.28222942352295
translating
Time used:  222.98560452461243
translating
Time used:  220.71967458724976
translating
Time used:  220.63223910331726
translating
Time used:  223.6516933441162
translating
Time used:  221.88564610481262
translating
Time used:  221.06858444213867
translating
Time used:  222.1700632572174
translating
Time used:  212.8631453514099
translating
Time used:  201.65148186683655
translating
Time used:  202.1982400417328
translating
Time used:  201.7021176815033
translating
Time used:  201.10041689872742
translating
Time used:  210.1893036365509
translating
Time used:  214.44086956977844
translating
Time used:  212.13490271568298
translating
Time used:  213.1430323123932
translating
Time used:  211.7897548675537
translating
Time used:  215.206

In [None]:
# Build a DataFrame from the test split's translation pairs.
df_test = pd.DataFrame(
    data_test['translation'],
)

In [None]:
def translate(english_text):
    """Translate a single English sentence and return the decoded text.

    NOTE(review): this reuses the name of the batched `translate(data)` helper
    defined earlier in the notebook and shadows it once this cell has run.

    Args:
        english_text: The English sentence to translate.

    Returns:
        The decoded translation as a string, with special tokens removed.
    """
    # Same limits as the batched helper: truncate to 128 tokens and request
    # TensorFlow tensors rather than passing Python lists to `generate`.
    tokenized = tokenizer(
        [english_text],
        max_length=128,
        truncation=True,
        return_tensors="tf",
    )
    out = model.generate(**tokenized, max_length=128)
    # The deprecated `as_target_tokenizer()` context only affects encoding,
    # so it is not needed for decoding.
    return tokenizer.decode(out[0], skip_special_tokens=True)

In [None]:
# Translate row by row: `translate` is called once per English sentence.
df_test['translated'] = (
    df_test['en']
    .map(translate)
)

Exception ignored in: <function UniquePtr.__del__ at 0x0000022413945670>
Traceback (most recent call last):
  File "C:\Users\84619\AppData\Roaming\Python\Python39\site-packages\tensorflow\python\framework\c_api_util.py", line 74, in __del__
    self.deleter(obj)
KeyboardInterrupt: 


KeyboardInterrupt: ignored