#### Using mt5-base for translation

Polish-->Japanese based on the Tatoeba dataset, fine-tuning with LoRA

Dataset used:

[Tatoeba](https://opus.nlpl.eu/Tatoeba/corpus/version/Tatoeba)

Citations: J. Tiedemann, 2012, [Parallel Data, Tools and Interfaces in OPUS](http://www.lrec-conf.org/proceedings/lrec2012/pdf/463_Paper.pdf). In Proceedings of the 8th International Conference on Language Resources and Evaluation (LREC 2012)

### Loading the dataset

In [None]:
!pip install transformers datasets evaluate scikit-learn peft -Uqq

In [None]:
import datasets
from datasets import load_dataset

# Data from the Tatoeba project, split up and converted to HF dataset format
# For other datasets remember to shuffle! This one already is shuffled
dataset_train = load_dataset("json", data_files="/content/drive/MyDrive/Datasets/Tatoeba_train.json")
dataset_valid = load_dataset("json", data_files="/content/drive/MyDrive/Datasets/Tatoeba_valid.json")
dataset_test = load_dataset("json", data_files="/content/drive/MyDrive/Datasets/Tatoeba_test.json")


### Train/valid/test split

In [None]:
from datasets import DatasetDict

ds_splits = DatasetDict({
    'train': dataset_train['train'],
    'valid': dataset_valid['train'],
    'test': dataset_test["train"]
})

In [None]:
ds_splits

DatasetDict({
    train: Dataset({
        features: ['Source', 'Target'],
        num_rows: 22350
    })
    valid: Dataset({
        features: ['Source', 'Target'],
        num_rows: 1242
    })
    test: Dataset({
        features: ['Source', 'Target'],
        num_rows: 1242
    })
})

In [None]:
ds_splits["train"][0]

{'Source': 'Dlaczego powiedzia≈Çe≈õ co≈õ tak g≈Çupiego?',
 'Target': '„Å©„ÅÜ„Åó„Å¶„Åù„Çì„Å™„Å´È¶¨Èπø„Å™„Åì„Å®„ÇíË®Ä„Å£„Åü„ÅÆÔºü'}

### Check GPU availability

In [None]:
import torch


if torch.cuda.is_available():
  print("CUDA available. Device count:")
  print(torch.cuda.device_count())
  device_id = torch.cuda.current_device()
  print(torch.cuda.get_device_name(device_id))
else:
  print("CUDA unavailable")

CUDA available. Device count:
1
NVIDIA A100-SXM4-40GB


### Get the model and wrap it in the peft object

In [None]:
from transformers import T5Tokenizer, MT5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained("google/mt5-base")
original_model = MT5ForConditionalGeneration.from_pretrained("google/mt5-base")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    r=32,
    lora_alpha=32,
    lora_dropout=0.05,
    task_type=TaskType.SEQ_2_SEQ_LM

)

In [None]:
model = get_peft_model(original_model,
                            lora_config)

In [None]:
model.print_trainable_parameters()

trainable params: 3,538,944 || all params: 585,940,224 || trainable%: 0.6040


In [None]:
print(model)

PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): MT5ForConditionalGeneration(
      (shared): Embedding(250112, 768)
      (encoder): MT5Stack(
        (embed_tokens): Embedding(250112, 768)
        (block): ModuleList(
          (0): MT5Block(
            (layer): ModuleList(
              (0): MT5LayerSelfAttention(
                (SelfAttention): MT5Attention(
                  (q): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=False)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.05, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=768, out_features=32, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=32, out_features=768, bias=False)
                    )
                    (lora_embedding_A): ParameterDict()
     

### Test the tokenizer

In [None]:
def test_tokenizer(input_text):
  input_tokenized = tokenizer(input_text, return_tensors="pt")
  print(input_tokenized)
  out = tokenizer.decode(input_tokenized.input_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
  print(f"In: {input_text}")
  print(f"Out: {out}")

test_tokenizer("Samoch√≥d")
test_tokenizer("Chod≈∫my do ≈ºabki")
test_tokenizer("„Ç∂„Éñ„Ç´„Å∏Ë°å„Åç„Åæ„Åó„Çá„ÅÜ")

{'input_ids': tensor([[22115, 55337,   285,     1]]), 'attention_mask': tensor([[1, 1, 1, 1]])}
In: Samoch√≥d
Out: Samoch√≥d
{'input_ids': tensor([[  8144,  15732,   1813,    342,  50478, 111528,      1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])}
In: Chod≈∫my do ≈ºabki
Out: Chod≈∫my do ≈ºabki
{'input_ids': tensor([[  259, 16786, 11594,  6388,  6031, 68222, 46265,     1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}
In: „Ç∂„Éñ„Ç´„Å∏Ë°å„Åç„Åæ„Åó„Çá„ÅÜ
Out: „Ç∂„Éñ„Ç´„Å∏Ë°å„Åç„Åæ„Åó„Çá„ÅÜ


### Tokenize

In [None]:
def preprocess_function(examples):
    inputs = [f"Translate Polish to Japanese: {source_text}" for source_text in examples["Source"]]
    targets = examples["Target"]

    # Tokenize inputs and outputs
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding='max_length')
    labels = tokenizer(targets, max_length=128, truncation=True, padding='max_length')
    model_inputs["labels"] = labels["input_ids"]

    return model_inputs

# Preprocess the dataset
tokenized_dataset = ds_splits.map(preprocess_function, batched=True)

In [None]:
tokenized_dataset["train"][0]

{'Source': 'Dlaczego powiedzia≈Çe≈õ co≈õ tak g≈Çupiego?',
 'Target': '„Å©„ÅÜ„Åó„Å¶„Åù„Çì„Å™„Å´È¶¨Èπø„Å™„Åì„Å®„ÇíË®Ä„Å£„Åü„ÅÆÔºü',
 'input_ids': [89349,
  259,
  58459,
  288,
  30865,
  267,
  259,
  30104,
  22099,
  259,
  58942,
  78179,
  964,
  3376,
  756,
  259,
  318,
  82729,
  52770,
  291,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,

### Training

In [None]:
!pip install fugashi[unidic-lite]



In [None]:
from fugashi import Tagger

tagger = Tagger('-Owakati')
def tokenize_japanese(text):
  return [word.surface for word in tagger(text)]

In [None]:
text = "È∫©ËèìÂ≠ê„ÅØ„ÄÅÈ∫©„Çí‰∏ªÊùêÊñô„Å®„Åó„ÅüÊó•Êú¨„ÅÆËèìÂ≠ê„ÄÇ"
tokenize_japanese(text)

['È∫©', 'ËèìÂ≠ê', '„ÅØ', '„ÄÅ', 'È∫©', '„Çí', '‰∏ªÊùê', 'Êñô', '„Å®', '„Åó', '„Åü', 'Êó•Êú¨', '„ÅÆ', 'ËèìÂ≠ê', '„ÄÇ']

In [None]:
!pip install sacrebleu



In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import EarlyStoppingCallback

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=4e-4,
    per_device_train_batch_size=32, # 32 or 16--> OOM, 8 was fine on a T4
    per_device_eval_batch_size=32,
    num_train_epochs=4,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",
    save_total_limit=2,
    load_best_model_at_end=True,
    save_strategy = "epoch",
    metric_for_best_model='eval_loss',
    predict_with_generate=True
)


trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["valid"],
    callbacks=[EarlyStoppingCallback(3, 0.0)]
)

trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.3516,0.260545
2,0.3036,0.222252
3,0.2826,0.213637
4,0.2823,0.21112


TrainOutput(global_step=2796, training_loss=1.3506807056875187, metrics={'train_runtime': 1510.6134, 'train_samples_per_second': 59.181, 'train_steps_per_second': 1.851, 'total_flos': 2.7041662107648e+16, 'train_loss': 1.3506807056875187, 'epoch': 4.0})

In [None]:
from datetime import datetime
save_time = datetime.now()
save_time_str = save_time.strftime("%Y-%m-%d_%H-%M-%S")
save_dir = f"mt5-base-pl-ja-adapter-{save_time_str}"
print("Saving the model")
model.save_pretrained(save_dir)

Saving the model


In [None]:
import os

zip_filename = f"{save_dir}.zip"
drive_path = f"/content/drive/MyDrive/Models/{zip_filename}"
print("Zipping the model")
os.system(f"zip -r {zip_filename} {save_dir}")

Zipping the model


0

In [None]:
os.system(f"mv {zip_filename} '{drive_path}'")

0

In [None]:
print("Model moved to google drive!")

Model moved to google drive!


In [None]:
print("Model filename:")
print(zip_filename)

Model filename:
mt5-base-pl-ja-adapter-2025-02-12_15-01-23.zip


### Testing the model

In [None]:
tokenizer.decode(tokenized_dataset["train"]["input_ids"][0])

'Translate Polish to Japanese: Dlaczego powiedzia≈Çe≈õ co≈õ tak g≈Çupiego?</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'

In [None]:
tokenizer.decode(tokenized_dataset["train"]["labels"][0])

'„Å©„ÅÜ„Åó„Å¶„Åù„Çì„Å™„Å´È¶¨Èπø„Å™„Åì„Å®„ÇíË®Ä„Å£„Åü„ÅÆ?</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'

In [None]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def text_to_translation_prompt(source_text):
  return f"Translate Polish to Japanese: {source_text}"

def translate_text(source_text, temperature=0.3, top_k=20):
  input_text = text_to_translation_prompt(source_text)
  input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)
  with torch.no_grad():
    output_ids = model.generate(input_ids=input_ids,
                                top_k=top_k,
                                temperature = temperature,
                                do_sample=True)
  print(f"PL: {source_text}")
  print(f"JP: {tokenizer.decode(output_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)}")
  print("---")

texts_to_translate = [
    "Pogasi≈Ça wszystkie ≈õwiat≈Ça o dziesiƒÖtej."
    "Chod≈∫my do ≈ºabki",
    "Chod≈∫my do kina",
    "Spokojnie jak na wojnie",
    "lol",
    "Wied≈∫min to super gra",
    "Lubiƒô programowaƒá",
    "Polski to trudny jƒôzyk",
    "Kalendarz Gregoria≈Ñski zosta≈Ç wprowadzony w 1582 roku",
    "Nie mo≈ºna oczekiwaƒá ≈õwietnych wynik√≥w od modelu kt√≥ry nie uczy≈Ç siƒô nawet na ca≈Çym zbiorze danych",
    "Test test test",
    "Jestem g≈Çodny"
]

for text in texts_to_translate:
  translate_text(text)

PL: Pogasi≈Ça wszystkie ≈õwiat≈Ça o dziesiƒÖtej.Chod≈∫my do ≈ºabki
JP: ÂΩºÂ•≥„ÅØ„Åô„Åπ„Å¶„ÅÆÂÖâ„ÇíÈõ®„Å´ÁÑö„ÅÑ„Åü„ÄÇ
---
PL: Chod≈∫my do kina
JP: ÁßÅ„Åü„Å°„ÅØÊò†Áîª„Å´Ë°å„Åì„ÅÜ„ÄÇ
---
PL: Spokojnie jak na wojnie
JP: Êà¶‰∫â‰∏≠„Å†„ÄÇ
---
PL: lol
JP: Á¨ë„ÅÜ„ÄÇ
---
PL: Wied≈∫min to super gra
JP: „Éü„Éü„ÅØÁ¥†Êô¥„Çâ„Åó„ÅÑ„Ç≤„Éº„É†„Å†„ÄÇ
---
PL: Lubiƒô programowaƒá
JP: „Éó„É≠„Ç∞„É©„É†„ÅåÂ•Ω„Åç„Å†„ÄÇ
---
PL: Polski to trudny jƒôzyk
JP: Êó•Êú¨Ë™û„ÅØÈõ£„Åó„Åô„Åé„Çã„ÄÇ
---
PL: Kalendarz Gregoria≈Ñski zosta≈Ç wprowadzony w 1582 roku
JP: Gregori„ÅÆ„Ç´„É¨„É≥„ÉÄ„Éº„ÅØ1582Âπ¥Êîπ„ÇÅ„Çâ„Çå„Åü„ÄÇ
---
PL: Nie mo≈ºna oczekiwaƒá ≈õwietnych wynik√≥w od modelu kt√≥ry nie uczy≈Ç siƒô nawet na ca≈Çym zbiorze danych
JP: „Åù„ÅÆ„É¢„Éá„É´„ÅØÂÖ®ÈÉ®„Éá„Éº„Çø„Å´ÁêÜËß£„Åß„Åç„Å™„Åã„Å£„Åü„ÄÇ
---
PL: Test test test
JP: „ÉÜ„Çπ„Éà„ÉÜ„Çπ„Éà„ÉÜ„Çπ„Éà„ÇíËã±Ë™û„ÅßÁøªË®≥„Åó„Åü„ÄÇ
---
PL: Jestem g≈Çodny
JP: Áñ≤„Çå„Å¶„ÅÑ„Çã„ÄÇ
---


In [None]:
translate_text("samoch√≥d")

PL: samoch√≥d
JP: Ëªä„ÅØÈÅãËª¢Êâã„Å†„ÄÇ
---


In [None]:
texts_to_translate = [
    "Test test test",
    "Mam na imiƒô Adrian",
    "T≈Çumaczenie jest trudne",
    "MajƒÖ ulubionƒÖ potrawƒÖ jest omlet",
    "Tom ma bardzo szybki samoch√≥d",
    "Samoch√≥d Toma jest bardzo szybki"
]

for text in texts_to_translate:
  translate_text(text)

PL: Test test test
JP: „ÉÜ„Çπ„Éà„ÉÜ„Çπ„Éà„ÉÜ„Çπ„Éà„Çí„ÉÜ„Çπ„Éà„Åó„Åü„ÄÇ
---
PL: Mam na imiƒô Adrian
JP: ÁßÅ„ÅÆÂêçÂâç„ÅØAdrian„Åß„Åô„ÄÇ
---
PL: T≈Çumaczenie jest trudne
JP: ÁøªË®≥„ÅØÈõ£„Åó„Åù„ÅÜ„ÄÇ
---
PL: MajƒÖ ulubionƒÖ potrawƒÖ jest omlet
JP: ÂΩº„Çâ„ÅØ„ÅäÊ∞ó„Å´ÂÖ•„Çä„ÅÆÊñôÁêÜ„ÅØ„Ç™„É™„Éº„Éñ„Åß„Åô„ÄÇ
---
PL: Tom ma bardzo szybki samoch√≥d
JP: „Éà„É†„ÅØÈÄü„ÅÑËªä„ÇíÊåÅ„Å£„Å¶„ÅÑ„Çã„ÄÇ
---
PL: Samoch√≥d Toma jest bardzo szybki
JP: „Éà„É†„ÅØÈÄü„ÅÑ„ÄÇ
---


In [None]:
texts_to_translate = [
    "Zagrajmy w grƒô",
    "Poczekaj chwilƒô!",
    "Nie wiem co zrobiƒá",
    "Gdzie jest stacja kolejowa?",
    "Jak doj≈õƒá na stacjƒô kolejowƒÖ?",
    "Smutno mi",
    "Cicho bƒÖd≈∫!"
]

for text in texts_to_translate:
  translate_text(text)

PL: Zagrajmy w grƒô
JP: „Ç≤„Éº„É†„Çí„Éó„É¨„Ç§„Åó„Åæ„Åó„Çá„ÅÜ„ÄÇ
---
PL: Poczekaj chwilƒô!
JP: ÊôÇÈñì„Åå„ÅÇ„Å£„Åü„ÇâÂæÖ„Å§„ÄÇ
---
PL: Nie wiem co zrobiƒá
JP: ‰Ωï„Çí„Åô„Çå„Å∞„ÅÑ„ÅÑ„ÅãÂàÜ„Åã„Çâ„Å™„ÅÑ„ÄÇ
---
PL: Gdzie jest stacja kolejowa?
JP: ÈõªËªä„ÅÆÈßÖ„ÅØ„Å©„Åì„Åß„Åô„Åã„ÄÇ
---
PL: Jak doj≈õƒá na stacjƒô kolejowƒÖ?
JP: ÂàóËªä„Å´Ë°å„Åç„Åæ„Åô„Åã„ÄÇ
---
PL: Smutno mi
JP: „Å®„Å¶„ÇÇÊÇ≤„Åó„Åè„ÄÇ
---
PL: Cicho bƒÖd≈∫!
JP: „ÅÇ„Å™„Åü„ÅØ„ÄÅ„ÅÑ„ÅÑ„Çà„ÄÇ
---


In [None]:
for text in texts_to_translate:
  translate_text(text, temperature=1, top_k=100)

PL: Zagrajmy w grƒô
JP: „Ç≤„Éº„É†„Çí„Çπ„Çø„Éº„Éà„Åó„Åæ„Åó„Çá„ÅÜ„ÄÇ
---
PL: Poczekaj chwilƒô!
JP: „Åó„Å∞„Çâ„ÅèÂæÖ„Å£„Å¶„Åè„Çå„ÄÇ
---
PL: Nie wiem co zrobiƒá
JP: ‰Ωï„Åô„Çå„Å∞„ÅÑ„ÅÑ„ÅÆ„Çà„ÄÇ
---
PL: Gdzie jest stacja kolejowa?
JP: ÈõªËªä„ÅÆËøë„Åè„ÅÆÈßÖ„ÅØ„Å©„Åì„Åß„Åô„Åã„ÄÇ
---
PL: Jak doj≈õƒá na stacjƒô kolejowƒÖ?
JP: „Å™„ÅúÊñ∞ÂππÁ∑ö„Å´‰πó„Å£„Å¶Â∏∞„Å£„Åü„Åã„ÇíÁü•„Çä„Åæ„Åó„Åü„ÄÇ
---
PL: Smutno mi
JP: ÁßÅ„Å´„ÅØÁ∑äÂºµ„Åó„Å¶„ÅÑ„Çã„Å®ÁßÅ„ÅåÊÇ™„ÅÑ„ÄÇ
---
PL: Cicho bƒÖd≈∫!
JP: Ëâ≤„ÅÑ„ÅÑ„Åß„Åô„Å≠„ÄÇ
---


### Checking the BLEU score

In [None]:
model_predictions = []
batch_size = 64
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

test_input_ids = tokenized_dataset['test']["input_ids"]
true_translations = tokenized_dataset['test']["Target"]

for i in range(0, len(test_input_ids), batch_size):
    batch_input_ids = test_input_ids[i:i + batch_size]

    input_ids_tensor = torch.tensor(batch_input_ids).to(device)

    with torch.no_grad():
        output_ids = model.generate(input_ids=input_ids_tensor)

    batch_predictions = [tokenizer.decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=False) for output in output_ids]
    model_predictions.extend(batch_predictions)




In [None]:
model_predictions[0]

'ÊòéÊó•„ÄÅ„Åì„ÅÆÂ†¥ÊâÄ„Å´Ë°å„Å£„Å¶„Åç„Åæ„Åô„ÄÇ'

In [None]:
len(model_predictions)

1242

In [None]:
!pip install nltk



In [None]:
import fugashi
tagger = fugashi.Tagger()

tagger = Tagger('-Owakati')
def tokenize_japanese(text):
  return [word.surface for word in tagger(text)]

tokenized_target = []
tokenized_predictions = []

for text in tokenized_dataset['test']["Target"]:
    tokenized_target.append([tokenize_japanese(text)]) # Single reference

In [None]:
for text in model_predictions:
    tokenized_predictions.append(tokenize_japanese(text))

In [None]:
tokenized_dataset['test']["Target"][0]

'ÊòéÊó•„ÅÆ‰ªäÈ†É„ÅØÂ§ßÈò™„ÇíË¶ãÁâ©„Åó„Å¶„ÅÑ„Çã„Åß„Åó„Çá„ÅÜ„ÄÇ'

In [None]:
tokenized_target[0]

[['ÊòéÊó•', '„ÅÆ', '‰ªäÈ†É', '„ÅØ', 'Â§ßÈò™', '„Çí', 'Ë¶ãÁâ©', '„Åó', '„Å¶', '„ÅÑ„Çã', '„Åß„Åó„Çá„ÅÜ', '„ÄÇ']]

In [None]:
tokenized_predictions[0]

['ÊòéÊó•', '„ÄÅ', '„Åì„ÅÆ', 'Â†¥ÊâÄ', '„Å´', 'Ë°å„Å£', '„Å¶', '„Åç', '„Åæ„Åô', '„ÄÇ']

In [None]:
len(tokenized_target)

1242

In [None]:
len(tokenized_predictions)

1242

In [None]:
import nltk

nltk.translate.bleu_score.corpus_bleu(tokenized_target, tokenized_predictions)

0.12440461525629558