In [1]:
from transformers import pipeline
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import load_dataset_builder
from datasets import load_dataset
import numpy as np
import evaluate
import torch

In [2]:
# Tokenizer and model weights come from the same pretrained checkpoint, so the
# identifier is spelled out once and shared by both loaders.
checkpoint = "KETI-AIR-Downstream/long-ke-t5-base-translation-aihub-ko2en"

# model_max_length caps the tokenizer's default sequence length at 128 tokens.
tokenizer = AutoTokenizer.from_pretrained(checkpoint, model_max_length=128)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [3]:
def prepare_dataset(data):
    """Split translation records into parallel source and target lists.

    Args:
        data: mapping exposing ``.items()`` whose values can be indexed with
            the keys ``'ko'`` and ``'en'``.

    Returns:
        A ``(source_language, target_language)`` tuple of two lists: every
        ``'ko'`` entry and every ``'en'`` entry, each in iteration order.
    """
    # First pass gathers the Korean side of every record.
    source_language = []
    for _, record in data.items():
        source_language.append(record['ko'])

    # Second pass gathers the English side.
    target_language = []
    for _, record in data.items():
        target_language.append(record['en'])

    return source_language, target_language

In [4]:
# Parallel Korean/English corpus; only the training split is loaded here.
corpus_id = "Moo/korean-parallel-corpora"
train = load_dataset(corpus_id, split="train")
# Loading of the other splits is left disabled:
# test = load_dataset(corpus_id, split="test")
# validation = load_dataset(corpus_id, split="validation")

In [5]:
# Show the first training record to check the 'ko' / 'en' field layout.
train[0]

{'ko': '개인용 컴퓨터 사용의 상당 부분은 "이것보다 뛰어날 수 있느냐?"',
 'en': 'Much of personal computing is about "can you top this?"'}

In [6]:
# Source and target sentences are tokenized with the same settings: PyTorch
# tensors, truncated to 128 tokens and padded to a common length per call.
tokenize_opts = dict(
    return_tensors="pt",
    max_length=128,
    truncation=True,
    padding=True,
)
inputs = tokenizer(train['ko'], **tokenize_opts)   # Korean source sentences
outputs = tokenizer(train['en'], **tokenize_opts)  # English target sentences

In [7]:
# Bundle the four row-aligned tensors so indexing yields one tuple per example:
# (source ids, source mask, target ids, target mask).
dataset = torch.utils.data.TensorDataset(
    inputs["input_ids"],
    inputs["attention_mask"],
    outputs["input_ids"],
    outputs["attention_mask"],
)

In [8]:
# Inspect the first example tuple (prints the full padded tensors).
dataset[0]

(tensor([  381, 11023,   832,    54,     5,  1310,   202,    12, 20004, 20023,
          6704,   121,  2265,    33,  1739,    19, 20016,  1577, 20787,     1,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,  

In [9]:
from transformers import TrainingArguments, Trainer

# Hyperparameters for a single-epoch fine-tuning run; checkpoints go to
# ./results and logs to ./logs.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    # The trainer in this notebook is built without an eval_dataset, so asking
    # for per-epoch evaluation makes the Trainer raise "evaluation requires an
    # eval_dataset". Evaluation is therefore disabled; switch this back to
    # "epoch" once a validation set is passed to the trainer.
    evaluation_strategy="no",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,  # keep at most three checkpoints on disk
    num_train_epochs=1,
    fp16=True,  # mixed-precision training
    # The training data is a TensorDataset of plain tuples, not a datasets
    # table, so automatic column pruning is turned off.
    remove_unused_columns=False,
    logging_dir="./logs",
)

def data_collator(batch):
    """Collate tuple-style dataset items into the keyword batch for the model.

    Args:
        batch: sequence of tuples ordered as ``(input_ids, attention_mask,
            label_ids)`` with an optional fourth element, the attention mask
            of the label sequence. Tensors at the same position must share a
            shape so they can be stacked.

    Returns:
        dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``, each
        stacked along a new leading batch dimension. When every item carries
        a label attention mask, label positions where that mask is 0 are set
        to -100.

    Raises:
        RuntimeError: propagated from ``torch.stack`` for an empty batch or
            mismatched tensor shapes.
    """
    input_ids, attention_mask, labels = (
        torch.stack([item[pos] for item in batch]) for pos in range(3)
    )

    # Padded target positions must not contribute to the loss. The loss used
    # by Hugging Face seq2seq models skips label value -100, so padding is
    # rewritten to that value using the target attention mask. masked_fill
    # returns a new tensor, leaving the dataset's stored tensors untouched.
    if all(len(item) > 3 for item in batch):
        label_mask = torch.stack([item[3] for item in batch])
        labels = labels.masked_fill(label_mask == 0, -100)

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }

# Build the trainer from the model, the training arguments, the tensor dataset
# and the custom collator. NOTE(review): no eval_dataset is passed here, so
# any evaluation strategy other than "no" cannot run.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)


# Raise the library's log level to INFO before training starts.
import transformers
transformers.logging.set_verbosity_info()

# Start fine-tuning.
trainer.train()

***** Running training *****
  Num examples = 96,215
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 6,014
  Number of trainable parameters = 296,696,448


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 