In [21]:
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import load_dataset
import numpy as np
import evaluate
import torch
from torch.utils.data import Dataset

In [3]:
# Single source of truth for the pretrained Korean->English checkpoint id, so the
# tokenizer and the model can never drift apart.
MODEL_CHECKPOINT = "KETI-AIR-Downstream/long-ke-t5-base-translation-aihub-ko2en"

# Tokenizer is created with model_max_length=128.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_CHECKPOINT,
    model_max_length=128,
)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)

In [4]:
def prepare_dataset(data):
    """Split a mapping of translation records into parallel sentence lists.

    Args:
        data: Mapping whose values are dicts holding a ``'ko'`` and an ``'en'``
            entry. The mapping's keys are ignored.

    Returns:
        A ``(source_language, target_language)`` tuple: the ``'ko'`` entries and
        the ``'en'`` entries, both in the mapping's iteration order.
    """
    records = list(data.values())
    korean_sentences = [record['ko'] for record in records]
    english_sentences = [record['en'] for record in records]
    return korean_sentences, english_sentences

In [5]:
# Load the training split of the "iwslt2017-ko-en" configuration of the iwslt2017 dataset.
train = load_dataset("iwslt2017","iwslt2017-ko-en", split="train")

In [17]:
# Peek at the first record: a {'translation': {'en': ..., 'ko': ...}} dict.
train[0]

{'translation': {'en': 'Thank you so much, Chris.',
  'ko': '감사합니다, 크리스. 이곳에 두 번이나'}}

In [18]:
# Translation direction used by preprocess_function: Korean in, English out.
source_lang, target_lang = "ko", "en"
# NOTE(review): this prefix names the opposite direction (English -> Korean) and is
# not applied anywhere in the visible cells. Confirm whether the checkpoint expects
# a task prefix at all before wiring it into the inputs.
prefix = "translate English to Korean: "


def preprocess_function(examples):
    """Tokenize a batch of translation pairs for seq2seq training.

    Args:
        examples: Batch as passed by ``Dataset.map(..., batched=True)``. Its
            ``"translation"`` entry is a list of dicts keyed by language code.

    Returns:
        The tokenizer output for the source sentences (``input_ids``,
        ``attention_mask``) with the tokenized target ids under ``labels``.
        Both sides are truncated to 128 tokens.
    """
    pairs = examples["translation"]
    inputs = [pair[source_lang] for pair in pairs]
    targets = [pair[target_lang] for pair in pairs]
    # `text_target` already tokenizes the targets and stores their ids under
    # "labels". The former second pass inside the deprecated
    # `tokenizer.as_target_tokenizer()` context only recomputed and overwrote
    # that same field, so it is dropped.
    return tokenizer(inputs, text_target=targets, max_length=128, truncation=True)

In [8]:
# Apply preprocess_function to the training split in batches.
tokenized_train = train.map(preprocess_function, batched=True)

In [19]:
# Inspect one mapped record: the original 'translation' dict plus input_ids,
# attention_mask and labels.
tokenized_train[0]

{'translation': {'en': 'Thank you so much, Chris.',
  'ko': '감사합니다, 크리스. 이곳에 두 번이나'},
 'input_ids': [1092,
  5778,
  513,
  7,
  20006,
  20004,
  20153,
  5850,
  20005,
  27,
  981,
  8,
  135,
  361,
  188,
  1],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [21257, 20025, 20078, 20182, 20006, 23522, 20005, 1]}

In [25]:
class TranslationDataset(Dataset):
    """Torch ``Dataset`` view over a sequence of tokenized examples.

    Each item exposes only ``input_ids``, ``attention_mask`` and ``labels``,
    converted to ``torch.long`` tensors. Any other keys on the underlying
    record are left out.
    """

    def __init__(self, data):
        """Store the indexable collection of tokenized records."""
        self.data = data

    def __len__(self):
        """Return the number of underlying records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return record ``idx`` as a dict of ``torch.long`` tensors."""
        record = self.data[idx]
        return {
            field: torch.tensor(record[field], dtype=torch.long)
            for field in ('input_ids', 'attention_mask', 'labels')
        }

In [26]:
# Wrap the tokenized training split so each item is returned as torch tensors.
dataset = TranslationDataset(tokenized_train)

In [27]:
from transformers import DataCollatorForSeq2Seq

# Collator handed to the trainer below to assemble batches; it is built from
# the same tokenizer and model used for training.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [28]:
from transformers import TrainingArguments, Trainer

# `evaluation_strategy="epoch"` below asks the trainer to evaluate after every
# epoch, which requires an evaluation dataset. Without one the run fails once
# evaluation is reached, so build it from the validation split with the same
# preprocessing as the training data.
validation = load_dataset("iwslt2017", "iwslt2017-ko-en", split="validation")
eval_dataset = TranslationDataset(validation.map(preprocess_function, batched=True))

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    fp16=True,
    # TranslationDataset already returns only the fields the model consumes,
    # so the trainer is told not to prune columns itself.
    remove_unused_columns=False,
    logging_dir="./logs",
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)


import transformers
# Raise library verbosity so the training summary is printed.
transformers.logging.set_verbosity_info()

trainer.train()

***** Running training *****
  Num examples = 230,240
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 14,390
  Number of trainable parameters = 296,696,448
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 