In [1]:
!pip install -q transformers sentencepiece datasets accelerate

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
from transformers import MBartForConditionalGeneration, MBartTokenizer
import math
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
import datasets
import os
import shutil
from google.colab import drive

root="/content/drive/MyDrive/"
model_repo = "facebook/mbart-large-cc25"
# model_repo = root + "model_base"
# model_repo = root + "0.00005-16epochs"
data_files={
    # 'train': root + 'data/train.csv',
    'train': root + 'data/crawler.csv',
}
dirs = {
    "cache_dir": root + "cache",
    "output_dir": root + "output",
    "logs_dir": root + "logs",
    "store_model": root + "store_model",
}
lang_config = {
    # "src": "ja",
    # "tgt": "en"
    "src": "en",
    "tgt": "vi"
}
lang_code = {
    "ja": "ja_XX",
    "en": "en_XX",
    "vi": "vi_VN",
}

In [3]:
drive.mount('/content/drive')
# %cd /content/drive/MyDrive/

Mounted at /content/drive


In [4]:
src = lang_config["src"]
tgt = lang_config["tgt"]
store_model=dirs["store_model"]
print([model_repo, store_model, src, tgt])

['facebook/mbart-large-cc25', '/content/drive/MyDrive/store_model', 'en', 'vi']


In [5]:
# Hyperparameters
batch_size = 2
# learning_rate = 0.00003
learning_rate = 0.00005
epochs = 8
logging_steps=1000
save_steps=1000
split=0.9
save_strategy="epoch"

In [6]:
def create_folder():
    folders = list(dirs.values())
    for folder in folders:
        print(folder)
        if os.path.exists(folder):
            shutil.rmtree(folder)
        if not os.path.exists(folder):
            os.makedirs(folder)
create_folder()

/content/drive/MyDrive/cache
/content/drive/MyDrive/output
/content/drive/MyDrive/logs
/content/drive/MyDrive/store_model


In [7]:
def load_model():
    model = MBartForConditionalGeneration.from_pretrained(model_repo)
    model.save_pretrained(store_model)
    model = MBartForConditionalGeneration.from_pretrained(store_model)
    return model

def load_tokenizer():
    src = lang_config["src"]
    tgt = lang_config["tgt"]
    tokenizer = MBartTokenizer.from_pretrained(model_repo, src_lang=lang_code[src], tgt_lang=lang_code[tgt])
    tokenizer.save_pretrained(store_model)
    tokenizer = MBartTokenizer.from_pretrained(store_model)
    return tokenizer

def load_data_set() -> datasets.arrow_dataset.Dataset:
    dataset = datasets.load_dataset("csv", data_files=data_files, cache_dir=dirs["cache_dir"])
    return dataset["train"]

def create_dataset():
    lst_data = load_data_set()
    train_data = []
    for data in lst_data:
        dic = {}
        dic[src] = data[src]
        dic[tgt] = data[tgt]
        train_data.append({"translation": dic})
    return train_data

def create_data_train():
    raw_data = create_dataset()
    train_length = math.floor(split * len(raw_data))
    train_data = raw_data[:train_length]
    eval_data = raw_data[train_length:]
    print(f"raw_data size: {len(raw_data)}")
    print(f"train_data size: {len(train_data)}")
    print(f"eval_data size: {len(eval_data)}")
    return train_data, eval_data

In [8]:
def repair() -> Seq2SeqTrainer:
    train_data, eval_data = create_data_train()
    model = load_model()
    tokenizer = load_tokenizer()


    def data_collator(features: list):
        x = [f["translation"][src] for f in features]
        y = [f["translation"][tgt] for f in features]
        inputs = tokenizer(x, return_tensors="pt", padding='max_length', truncation=True, max_length=32)
        with tokenizer.as_target_tokenizer():
            inputs['labels'] = tokenizer(y, return_tensors="pt", padding='max_length', truncation=True, max_length=48)['input_ids']
        return inputs

    args = Seq2SeqTrainingArguments(output_dir=dirs["output_dir"],
                                    overwrite_output_dir =True,
                                    do_train=True,
                                    do_eval=True,
                                    evaluation_strategy="epoch",
                                    per_device_train_batch_size=batch_size,
                                    per_device_eval_batch_size=batch_size,
                                    save_steps=save_steps,
                                    save_strategy=save_strategy,
                                    learning_rate=learning_rate,
                                    logging_steps=logging_steps,
                                    num_train_epochs=epochs,
                                    remove_unused_columns=False,
                                )

    trainer = Seq2SeqTrainer(model=model,
                            args=args,
                            data_collator=data_collator,
                            tokenizer=tokenizer,
                            train_dataset=train_data,
                            eval_dataset=eval_data,
                            )
    return trainer

In [9]:
def run(trainer: Seq2SeqTrainer):
    output_dir=store_model
    trainer.train()
    trainer.save_model(output_dir)


def manual_test():
    output_dir=store_model
    model = MBartForConditionalGeneration.from_pretrained(output_dir)
    tokenizer = MBartTokenizer.from_pretrained(output_dir)

    sentence = "上部消化管の愁訴としては以下のものがある"
    inputs = tokenizer(sentence, return_tensors="pt")
    translated_tokens = model.generate(**inputs, decoder_start_token_id=tokenizer.lang_code_to_id[lang_code[lang_config["tgt"]]], early_stopping=True, max_length=48)
    pred = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
    print(type(translated_tokens))
    print(f"日本語 - {sentence}: English - {pred}")

In [10]:
trainer = repair()

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

raw_data size: 25430
train_data size: 22887
eval_data size: 2543


config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/205 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [11]:
run(trainer)
manual_test()



Epoch,Training Loss,Validation Loss
1,1.1106,1.295378
2,0.8034,1.237724
3,0.5903,1.359444




FailedPreconditionError: ignored