In [1]:
!pip install -q transformers sentencepiece datasets accelerate

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
from transformers import MBartForConditionalGeneration, MBartTokenizer
import math
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
import datasets
import os
import shutil
from google.colab import drive

root="/content/drive/MyDrive/"
model_repo = "facebook/mbart-large-cc25"
data_files={
    'train': root + 'data/train.csv',
}
dirs = {
    "cache_dir": root + "cache",
    "output_dir": root + "output",
    "logs_dir": root + "logs",
    "store_model": root + "store_model",
}
lang_config = {
    "src": "ja",
    "tgt": "en"
    # "src": "en",
    # "tgt": "vi"
}
lang_code = {
    "ja": "ja_XX",
    "en": "en_XX",
    "vi": "vi_VN",
}

In [3]:
drive.mount('/content/drive')
# %cd /content/drive/MyDrive/

Mounted at /content/drive


In [4]:
src = lang_config["src"]
tgt = lang_config["tgt"]
store_model=dirs["store_model"]
print([store_model, src, tgt])

['/content/drive/MyDrive/store_model', 'ja', 'en']


In [5]:
# Hyperparameters
batch_size = 2
learning_rate = 0.0005
epochs = 4
split=0.9

In [6]:
def create_folder():
    folders = list(dirs.values())
    for folder in folders:
        print(folder)
        if os.path.exists(folder):
            shutil.rmtree(folder)
        if not os.path.exists(folder):
            os.makedirs(folder)
create_folder()

/content/drive/MyDrive/cache
/content/drive/MyDrive/output
/content/drive/MyDrive/logs
/content/drive/MyDrive/store_model


In [7]:
def load_model():
    model = MBartForConditionalGeneration.from_pretrained(model_repo)
    model.save_pretrained(store_model)
    model = MBartForConditionalGeneration.from_pretrained(store_model)
    return model

def load_tokenizer():
    src = lang_config["src"]
    tgt = lang_config["tgt"]
    tokenizer = MBartTokenizer.from_pretrained(model_repo, src_lang=lang_code[src], tgt_lang=lang_code[tgt])
    tokenizer.save_pretrained(store_model)
    tokenizer = MBartTokenizer.from_pretrained(store_model)
    return tokenizer

def load_data_set() -> datasets.arrow_dataset.Dataset:
    dataset = datasets.load_dataset("csv", data_files=data_files, cache_dir=dirs["cache_dir"])
    return dataset["train"]

def create_dataset():
    lst_data = load_data_set()
    train_data = []
    for data in lst_data:
        dic = {}
        dic[src] = data[src]
        dic[tgt] = data[tgt]
        train_data.append({"translation": dic})
    return train_data

def create_data_train():
    raw_data = create_dataset()
    train_length = math.floor(split * len(raw_data))
    train_data = raw_data[:train_length]
    eval_data = raw_data[train_length:]
    print(f"raw_data size: {len(raw_data)}")
    print(f"train_data size: {len(train_data)}")
    print(f"eval_data size: {len(eval_data)}")
    return train_data, eval_data

In [8]:
def repair() -> Seq2SeqTrainer:
    train_data, eval_data = create_data_train()
    model = load_model()
    tokenizer = load_tokenizer()


    def data_collator(features: list):
        x = [f["translation"][src] for f in features]
        y = [f["translation"][tgt] for f in features]
        inputs = tokenizer(x, return_tensors="pt", padding='max_length', truncation=True, max_length=32)
        with tokenizer.as_target_tokenizer():
            inputs['labels'] = tokenizer(y, return_tensors="pt", padding='max_length', truncation=True, max_length=48)['input_ids']
        return inputs

    args = Seq2SeqTrainingArguments(output_dir=dirs["output_dir"],
                                    do_train=True,
                                    do_eval=True,
                                    per_device_train_batch_size=batch_size,
                                    per_device_eval_batch_size=batch_size,
                                    learning_rate=learning_rate,
                                    num_train_epochs=epochs,
                                    evaluation_strategy="epoch",
                                    remove_unused_columns=False,
                                    )

    trainer = Seq2SeqTrainer(model=model,
                            args=args,
                            data_collator=data_collator,
                            train_dataset=train_data,
                            eval_dataset=eval_data,
                            )
    return trainer

In [11]:
def run(trainer: Seq2SeqTrainer):
    output_dir=store_model
    trainer.train()
    trainer.save_model(output_dir)


def manual_test():
    output_dir=store_model
    model = MBartForConditionalGeneration.from_pretrained(output_dir)
    tokenizer = MBartTokenizer.from_pretrained(output_dir)

    sentence = "上部消化管の愁訴としては以下のものがある"
    inputs = tokenizer(sentence, return_tensors="pt")
    translated_tokens = model.generate(**inputs, decoder_start_token_id=tokenizer.lang_code_to_id[lang_code[lang_config["tgt"]]], early_stopping=True, max_length=48)
    pred = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
    print(type(translated_tokens))
    print(f"日本語 - {sentence}: English - {pred}")


In [9]:
trainer = repair()

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

raw_data size: 5560
train_data size: 5004
eval_data size: 556


config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/205 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [None]:
run(trainer)
manual_test()



Epoch,Training Loss,Validation Loss




In [12]:
model = MBartForConditionalGeneration.from_pretrained(store_model)
tokenizer = MBartTokenizer.from_pretrained(store_model)

In [19]:
raw_data = create_dataset()
print(raw_data[0])
train_length = math.floor(split * len(raw_data))
# train_data = raw_data[:train_length]
# eval_data = raw_data[train_length:]
print(raw_data[train_length:][0])

{'translation': {'ja': '救命医療は病状が最も重篤な患者のケアに特化している。こういった患者は，熟練したスタッフが配属された集中治療室（ICU）で治療を行うのが最もよい。病院によっては，特定の患者群（例，心疾患患者，外傷患者，手術患者，神経学的異常のある患者，小児，新生児の患児）に対し別々のICUを有するところもある。ICUは患者1人当たりの看護師の人数が多く，処置および生理学的パラメータのモニタリングなど，必要な集中治療管理を行える。', 'en': 'Critical care medicine specializes in caring for the most seriously ill patients. These patients are best treated in an intensive care unit (ICU) staffed by experienced personnel. Some hospitals maintain separate units for special populations (eg, cardiac, transplant, trauma, surgical, neurologic, pediatric, or neonatal patients). ICUs have a high nurse:patient ratio to provide the necessary high intensity of service, including treatment and monitoring of physiologic parameters.'}}
{'translation': {'ja': 'Vedolizumab and natalizumab are antibodies to leukocyte adhesion molecules. Vedolizumab is available for moderate to severe ulcerative colitis and Crohn disease. The recommended dose of IV vedolizumab is 300 mg at 0, 2, and 6 weeks and then every 8

In [16]:
eval_data[0]

{'translation': {'ja': 'Vedolizumab and natalizumab are antibodies to leukocyte adhesion molecules. Vedolizumab is available for moderate to severe ulcerative colitis and Crohn disease. The recommended dose of IV vedolizumab is 300 mg at 0, 2, and 6 weeks and then every 8 weeks. Its effect is believed to be limited to the gut, making it safer than natalizumab, which is used only as a 2nd-line drug through a restricted-prescribing program for the most refractory cases of Crohn disease. The accepted therapeutic serum level of vedolizumab is > 20 mcg/mL. These medications can cause hypersensitivity reactions and increase the risk of infections. Natalizumab is currently available only through a restricted-use program because it increases the risk of progressive multifocal leukoencephalopathy            (PML). Vedolizumab has a theoretical risk of PML because it is in the same class of medications as natalizumab.',
  'en': 'Vedolizumab and natalizumab are antibodies to leukocyte adhesion mo

In [13]:
!pip install -q evaluate

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [14]:
import evaluate
meteor = evaluate.load('meteor')

Downloading builder script:   0%|          | 0.00/6.93k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:
# English to Vietnam 11p
lst = []
for eval in eval_data:
    translation = eval["translation"]
    src_text = translation[src]
    tgt_text = translation[tgt]
    inputs = tokenizer(src_text, return_tensors="pt")
    translated_tokens = model.generate(**inputs, decoder_start_token_id=tokenizer.lang_code_to_id[lang_code[tgt]], early_stopping=True, max_length=32)
    pred = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
    predictions = [pred]
    references = [tgt_text]
    results = meteor.compute(predictions=predictions, references=references)
    print(results['meteor'])
    lst.append(results['meteor'])


In [None]:
# English to Vietnam
import numpy as np
def notEmpty(v: np.float64):
    zero=np.float64(0)
    return zero != v

result = list(filter(notEmpty, lst))
print(len(result))
print(max(result))

161
0.06024096385542168


In [None]:
print("Mean lst = ", np.mean(lst))

Mean lst =  0.006969299290006348


In [None]:
# Japanese to English

In [None]:
raw_data = create_dataset()
# train_length = math.floor(split * len(raw_data))
# train_data = raw_data[:train_length]
# eval_data = raw_data[train_length:]
eval_data = raw_data[:200]
eval_data[0]

{'translation': {'ja': '長期の抗菌薬治療が必要であり，再発がよくみられる。',
  'en': 'Infection by the bacteria T. whipplei affects many organs, including the gastrointestinal tract.'}}