In [3]:
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import load_dataset
import numpy as np
import evaluate
import torch
from torch.utils.data import Dataset

In [None]:
!pip install transformers
!pip install evaluate
!pip install datasets

In [4]:
# The tokenizer and the seq2seq weights are loaded from the same checkpoint
# identifier; the tokenizer is created with model_max_length=128.
CHECKPOINT = "6mtx9/train_korean-parallel-corpora"
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT, model_max_length=128)
model = AutoModelForSeq2SeqLM.from_pretrained(CHECKPOINT)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.37k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/4.17M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.22k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/932 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.19G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

In [5]:
# Both splits come from the same dataset/config pair; only `split` differs.
IWSLT_KO_EN = ("iwslt2017", "iwslt2017-ko-en")
train = load_dataset(*IWSLT_KO_EN, split="train")
# NOTE: this binding shadows Python's builtin `eval`; later cells refer to the
# validation split by this name, so it is kept as-is here.
eval = load_dataset(*IWSLT_KO_EN, split="validation")

Downloading builder script:   0%|          | 0.00/8.17k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/18.5k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/19.4M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/230240 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/8514 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/879 [00:00<?, ? examples/s]

In [6]:
# Display the first training record to check its layout.
train[0]

{'translation': {'en': 'Thank you so much, Chris.',
  'ko': '감사합니다, 크리스. 이곳에 두 번이나'}}

In [7]:
# Display the first validation record (`eval` is the dataset split loaded above,
# not the Python builtin).
eval[0]

{'translation': {'en': 'Last year I showed these two slides so that  demonstrate that the arctic ice cap,  which for most of the last three million years  has been the size of the lower 48 states,  has shrunk by 40 percent.',
  'ko': '작년에 이 두개의 슬라이드로 북극의 만년설이 지난 3백만년 동안 미국의 본토 48개주 크기였던 것이 40%나 줄었다는 것을 보여들였습니다.'}}

In [8]:
def reverse_translation(entry):
    """Flatten one record's nested translation pair into top-level fields.

    Args:
        entry: mapping with a 'translation' key whose value holds the 'en'
            and 'ko' texts.

    Returns:
        A new dict with the keys 'ko' and 'en' (in that order) carrying the
        corresponding texts from ``entry['translation']``.
    """
    pair = entry['translation']
    en_text, ko_text = pair['en'], pair['ko']
    return {'ko': ko_text, 'en': en_text}

In [9]:
# Apply reverse_translation to every record of both splits, training split
# first, then validation.
new_train, new_eval = [split.map(reverse_translation) for split in (train, eval)]

Map:   0%|          | 0/230240 [00:00<?, ? examples/s]

Map:   0%|          | 0/879 [00:00<?, ? examples/s]

In [10]:
# Korean sources become the encoder inputs, padded/truncated to at most 128 tokens.
inputs_train = tokenizer(new_train['ko'], return_tensors="pt", max_length=128, truncation=True, padding=True)
# English references are passed as `text_target` so the tokenizer applies its
# target-side processing (which can differ from source-side processing for some
# seq2seq tokenizers). With only `text_target` given, the returned encoding still
# exposes `input_ids` and `attention_mask`, so downstream cells are unaffected.
outputs_train = tokenizer(text_target=new_train['en'], return_tensors="pt", max_length=128, truncation=True, padding=True)

In [11]:
# Korean validation sources become the encoder inputs (max 128 tokens).
inputs_eval = tokenizer(new_eval['ko'], return_tensors="pt", max_length=128, truncation=True, padding=True)
# English validation references go through `text_target` so they are tokenized
# in the tokenizer's target mode; the result still exposes `input_ids` and
# `attention_mask`.
outputs_eval = tokenizer(text_target=new_eval['en'], return_tensors="pt", max_length=128, truncation=True, padding=True)

In [12]:
# Each dataset item is a 4-tuple:
#   (source ids, source attention mask, target ids, target attention mask)
dataset_train = torch.utils.data.TensorDataset(
    inputs_train["input_ids"],
    inputs_train["attention_mask"],
    outputs_train["input_ids"],
    outputs_train["attention_mask"],
)
dataset_eval = torch.utils.data.TensorDataset(
    inputs_eval["input_ids"],
    inputs_eval["attention_mask"],
    outputs_eval["input_ids"],
    outputs_eval["attention_mask"],
)

In [13]:
from transformers import DataCollatorForSeq2Seq

# Build the library's seq2seq collator for this tokenizer/model pair.
# NOTE(review): the training cell further down defines a function that is also
# named `data_collator`; that definition rebinds the name before the trainer is
# constructed, so this instance does not appear to be the one the trainer uses.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
pip install accelerate -U

In [14]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Thu Aug 17 15:02:27 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0    48W / 350W |      3MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [15]:
from psutil import virtual_memory

# The figure is taken from `virtual_memory().total` and scaled by 1e9, i.e. it
# is the runtime's total memory even though the message says "available".
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the cut-off this notebook uses for a "high-RAM" runtime.
is_high_ram = ram_gb >= 20
print('You are using a high-RAM runtime !' if is_high_ram else 'Not using a high-RAM runtime')

Your runtime has 89.6 gigabytes of available RAM

You are using a high-RAM runtime !


In [None]:
from transformers import TrainingArguments, Trainer

# Hyperparameters for a single fine-tuning epoch, grouped by purpose.
training_args = Seq2SeqTrainingArguments(
    # Output locations and checkpoint retention.
    output_dir="./results",
    logging_dir="./logs",
    save_total_limit=3,
    # Optimisation settings.
    num_train_epochs=1,
    learning_rate=2e-5,
    weight_decay=0.01,
    fp16=True,
    # Batch sizes and evaluation cadence (evaluate once per epoch).
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    # Ask the trainer not to drop any dataset fields before collation.
    remove_unused_columns=False,
)

def data_collator(batch):
    """Collate dataset items into the tensors the seq2seq model consumes.

    Each item is a sequence of equally-shaped tensors laid out as
    ``(source_ids, source_mask, target_ids[, target_mask])``.

    Padding positions in the labels are replaced with ``-100`` so the loss
    skips them; without this the model is also trained to predict padding
    tokens. The positions are taken from the target attention mask
    (fourth element, 0 = padding). Items that carry only three tensors are
    collated without masking.

    Args:
        batch: non-empty list of items as described above.

    Returns:
        dict with stacked ``input_ids``, ``attention_mask`` and ``labels``
        tensors, each of shape ``(len(batch), sequence_length)``.
    """
    # Transpose the list of items into one tuple of tensors per field.
    columns = list(zip(*batch))

    labels = torch.stack(columns[2])
    if len(columns) > 3:
        target_mask = torch.stack(columns[3])
        # masked_fill returns a new tensor, so the dataset's own tensors
        # are left untouched.
        labels = labels.masked_fill(target_mask == 0, -100)

    return {
        "input_ids": torch.stack(columns[0]),
        "attention_mask": torch.stack(columns[1]),
        "labels": labels,
    }

# Wire the model, its tokenizer, the run configuration, the batch collation
# function and both dataset splits into the trainer.
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset_train,
    eval_dataset=dataset_eval,
)


# Raise the transformers library's log verbosity to INFO before training starts.
import transformers
transformers.logging.set_verbosity_info()

# Run the fine-tuning loop configured by `training_args`.
trainer.train()

# Save the trained model
# The target is a relative path, so it resolves against the notebook's current
# working directory.
output_dir = "./trained_model"
trainer.save_model(output_dir)

***** Running training *****
  Num examples = 230,240
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 14,390
  Number of trainable parameters = 296,696,448


Epoch,Training Loss,Validation Loss


In [None]:
# Colab-only: mount Google Drive under /content/drive.
# NOTE(review): the model above is saved to ./trained_model, not to the Drive
# mount — presumably it is meant to be copied there afterwards; confirm.
from google.colab import drive
drive.mount('/content/drive')