In [17]:
from transformers import pipeline
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import load_dataset_builder
from datasets import load_dataset
import numpy as np
import evaluate
import torch

In [None]:
!pip install transformers
!pip install evaluate
!pip install datasets

In [None]:
pip install accelerate -U

In [18]:
from psutil import virtual_memory

# `.total` is the machine's total memory in bytes; dividing by 1e9 gives decimal GB.
# NOTE(review): the message below says "available" although the value is the total.
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 140 GB is the cut-off this notebook uses for calling the runtime "high-RAM".
print('You are using a high-RAM runtime !' if ram_gb >= 140 else 'Not using a high-RAM runtime')

Your runtime has 89.6 gigabytes of available RAM

Not using a high-RAM runtime


In [19]:
# The tokenizer and the model weights are loaded from the same pretrained checkpoint id.
checkpoint = "6mtx9/train_iwslt2017"
# Cap tokenized sequences at 128 tokens by default.
tokenizer = AutoTokenizer.from_pretrained(checkpoint, model_max_length=128)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

# Getting dataset

In [20]:
def prepare_dataset(data):
    """Split a mapping of translation pairs into parallel sentence lists.

    Args:
        data: Mapping whose values are dicts holding a 'ko' and an 'en' entry.
            The keys are ignored.

    Returns:
        A ``(source_language, target_language)`` tuple: the 'ko' values and the
        'en' values, each in the mapping's iteration order.
    """
    def _collect(lang):
        # Walk the mapping once per language, keeping only that language's text.
        sentences = []
        for _, pair in data.items():
            sentences.append(pair[lang])
        return sentences

    return _collect('ko'), _collect('en')

In [21]:
# Training pairs: the "en-ko" configuration of the CCMatrix dataset.
train = load_dataset("yhavinga/ccmatrix", name="en-ko", split="train")
# Alternative test corpus, currently disabled:
# test = load_dataset("Moo/korean-parallel-corpora", split="test")
# Validation pairs come from a separate Korean-English TED-talks dataset.
validation = load_dataset(
    "msarmi9/korean-english-multitarget-ted-talks-task",
    split="validation",
)

In [22]:
# Display the training split's summary (feature names and row count).
train

Dataset({
    features: ['id', 'score', 'translation'],
    num_rows: 19358582
})

In [23]:
# Display the validation split's summary (feature names and row count).
validation

Dataset({
    features: ['korean', 'english'],
    num_rows: 1958
})

In [24]:
def reverse_translation(entry):
    """Lift the nested translation pair of one row into top-level keys.

    Args:
        entry: A row containing a 'translation' dict with 'en' and 'ko' texts.

    Returns:
        A dict ``{'ko': <Korean text>, 'en': <English text>}``.
    """
    pair = entry['translation']
    english, korean = pair['en'], pair['ko']
    return {'ko': korean, 'en': english}

In [25]:
# Apply `reverse_translation` to every training row. The returned 'ko'/'en' keys
# are added as columns next to the existing ones (see the sample row printed below).
new_train = train.map(reverse_translation)

In [26]:
# Spot-check the first mapped row.
new_train[0]

{'id': 0,
 'score': 1.2491111755371094,
 'translation': {'en': 'Many of the messages are for you and the world."',
  'ko': '많은 메시지는 너와 세상을 위한 것이다."'},
 'ko': '많은 메시지는 너와 세상을 위한 것이다."',
 'en': 'Many of the messages are for you and the world."'}

In [30]:
# Size of one tenth of the mapped training set (integer division).
# Despite its name, this is not the halfway point.
midpoint = len(new_train) // 10

In [31]:
# Show the chunk size held in `midpoint`.
print(midpoint)

1935858


In [None]:
# Slice boundaries at 2x..7x the chunk size held in `midpoint`. In the visible
# notebook they are only referenced by the disabled slicing code kept in a string
# literal further down.
# NOTE(review): in that disabled code the final slice starts at `midpoint` rather
# than `seven_half`, so it would overlap the earlier chunks if re-enabled.
(two_half, three_half, four_half,
 five_half, six_half, seven_half) = (midpoint * factor for factor in range(2, 8))

In [32]:
# Take the first `midpoint` rows of the mapped training set. `midpoint` is
# len(new_train) // 10, so this is the first tenth, not a half. This is the
# portion that gets tokenized in the next section.
first_half = new_train[:midpoint]

In [40]:
"""
second_half = new_train[midpoint:two_half]
third_half = new_train[two_half:three_half]
fourth_half = new_train[three_half:four_half]
fifth_half = new_train[four_half:five_half]
sixth_half = new_train[five_half:six_half]
seventh_half = new_train[six_half:seven_half]
eighth_half = new_train[midpoint:]
"""

'\nsecond_half = new_train[midpoint:two_half]\nthird_half = new_train[two_half:three_half]\nfourth_half = new_train[three_half:four_half]\nfifth_half = new_train[four_half:five_half]\nsixth_half = new_train[five_half:six_half]\nseventh_half = new_train[six_half:seven_half]\neighth_half = new_train[midpoint:]\n'

# Tokenizer

In [33]:
# Korean sentences are the model inputs: truncate to 128 tokens and pad to a
# common length so the result is one rectangular tensor.
inputs_train = tokenizer(
    first_half['ko'],
    return_tensors="pt",
    max_length=128,
    truncation=True,
    padding=True,
)
# English sentences are the prediction targets, so they are passed through
# `text_target`. This lets the tokenizer apply its target-side processing,
# which for some seq2seq tokenizers differs from the source-side processing.
outputs_train = tokenizer(
    text_target=first_half['en'],
    return_tensors="pt",
    max_length=128,
    truncation=True,
    padding=True,
)

In [34]:
# Korean validation sentences are the model inputs: truncate to 128 tokens and
# pad to a common length so the result is one rectangular tensor.
inputs_validation = tokenizer(
    validation['korean'],
    return_tensors="pt",
    max_length=128,
    truncation=True,
    padding=True,
)
# English validation sentences are the targets, so they are passed through
# `text_target`. This lets the tokenizer apply its target-side processing,
# which for some seq2seq tokenizers differs from the source-side processing.
outputs_validation = tokenizer(
    text_target=validation['english'],
    return_tensors="pt",
    max_length=128,
    truncation=True,
    padding=True,
)

In [35]:
# Each sample is a positional tuple:
# (source ids, source attention mask, target ids, target attention mask).
train_dataset = torch.utils.data.TensorDataset(
    inputs_train["input_ids"],
    inputs_train["attention_mask"],
    outputs_train["input_ids"],
    outputs_train["attention_mask"],
)

In [36]:
# Each sample is a positional tuple:
# (source ids, source attention mask, target ids, target attention mask).
validation_dataset = torch.utils.data.TensorDataset(
    inputs_validation["input_ids"],
    inputs_validation["attention_mask"],
    outputs_validation["input_ids"],
    outputs_validation["attention_mask"],
)

In [37]:
# Inspect the first training sample (a tuple of tensors).
train_dataset[0]

(tensor([20151,    12,   645,     4,  4256,    25,  1146,     6,    86,    24,
             9,     7, 20367,     1,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,  

In [38]:
# Inspect the first validation sample (a tuple of tensors).
validation_dataset[0]

(tensor([20004, 35163, 24938,    11,   729,    10,     4,    24,    59,    58,
          1198,     4,  3784,     6,   248,    13, 20016,    90,    24,     9,
            13, 20006,    48,  3784,     6, 10620, 20015,     2, 20018,  1527,
           399,    83,  2155,     4,    24,  3862,   513,     7, 20005,     1,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,  

In [39]:
from transformers import DataCollatorForSeq2Seq

# Build a seq2seq collator from the tokenizer and the model.
# NOTE(review): the training cell later defines `def data_collator(batch)`, which
# rebinds this name before the trainer is constructed. This instance therefore does
# not appear to be the collator the trainer receives — confirm whether it is needed.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [41]:
from transformers import TrainingArguments, Trainer

training_args = Seq2SeqTrainingArguments(
    # Output locations and checkpoint retention.
    output_dir="./results",
    logging_dir="./logs",
    save_total_limit=3,
    # Optimisation settings: a single epoch with mixed precision.
    num_train_epochs=1,
    learning_rate=2e-5,
    weight_decay=0.01,
    fp16=True,
    # Batch sizes per device for training and evaluation.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate once per epoch.
    evaluation_strategy="epoch",
    # Presumably required because the datasets handed to the trainer are positional
    # TensorDatasets without named columns — confirm.
    remove_unused_columns=False,
)

def data_collator(batch, ignore_index=-100):
    """Collate positional tensor samples into a model-ready batch.

    Each sample is a sequence laid out as
    ``(input_ids, attention_mask, label_ids[, label_attention_mask])``.

    Padded label positions are replaced by ``ignore_index`` so the loss is not
    computed on padding. The padding positions are taken from the sample's own
    label attention mask (index 3), so no pad-token id has to be assumed. If
    any sample lacks that fourth element, the labels are returned unchanged.

    Args:
        batch: List of samples as described above. All tensors at the same
            position must share a shape so they can be stacked.
        ignore_index: Value written over padded label positions. Defaults to
            -100, the index ignored by PyTorch's cross-entropy loss.

    Returns:
        Dict with stacked "input_ids", "attention_mask" and "labels" tensors.
    """
    input_ids = torch.stack([item[0] for item in batch])
    attention_mask = torch.stack([item[1] for item in batch])
    labels = torch.stack([item[2] for item in batch])

    # Only mask when every sample carries its label attention mask.
    if all(len(item) > 3 for item in batch):
        label_mask = torch.stack([item[3] for item in batch])
        # masked_fill returns a new tensor, so the dataset's tensors stay intact.
        labels = labels.masked_fill(label_mask == 0, ignore_index)

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }

# Wire the model, its tokenizer, both datasets and the batch collator into the trainer.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
)


import transformers
# Switch the transformers logger to INFO level before training starts.
transformers.logging.set_verbosity_info()

# Run the training loop configured on `trainer`.
# NOTE: the save below only executes if train() returns normally; an interrupted
# or failed run skips it.
trainer.train()

# Save the trained model
output_dir = "./train_translatorKO_EN"
trainer.save_model(output_dir)

***** Running training *****
  Num examples = 1,935,858
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 120,992
  Number of trainable parameters = 296,696,448


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: ignored