# mT5-large Finetune


In [1]:
!pip install datasets bitsandbytes evaluate sacrebleu rouge_score

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.4-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12

In [2]:
import pandas as pd
from datasets import Dataset
from transformers import MT5Tokenizer, MT5ForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq, EarlyStoppingCallback
from evaluate import load
import torch

In [None]:
csv_path = "../data/inputs/standard/train.csv"
df = pd.read_csv(csv_path)

import re

examples = []
for _, row in df.iterrows():
    if pd.notna(row.get("Pinyin")) and pd.notna(row.get("Chinese")):
        examples.append({
            "input": row["Pinyin"],
            "target": row["Chinese"]
        })
dataset = Dataset.from_pandas(pd.DataFrame(examples))

train_size = int(0.8 * len(dataset))
eval_size = len(dataset) - train_size
train_subset, eval_subset = torch.utils.data.random_split(dataset, [train_size, eval_size])

train_dataset = Dataset.from_list([dataset[i] for i in train_subset.indices])
eval_dataset = Dataset.from_list([dataset[i] for i in eval_subset.indices])
del dataset

In [4]:
model_name = "google/mt5-large"
tokenizer = MT5Tokenizer.from_pretrained(model_name)
model = MT5ForConditionalGeneration.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/376 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'T5Tokenizer'. 
The class this function is called from is 'MT5Tokenizer'.
You are using the default legacy behaviour of the <class 'transformers.models.mt5.tokenization_mt5.MT5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


pytorch_model.bin:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [5]:
def preprocess_function(examples):
    # Optionally add a task prefix (useful when fine-tuning T5/MT5 models)
    # prefix = "这句话是拼音，每个拼音中间隔开了空格，每个拼音对应一个字，数字是音调。请将此拼音转成中文："
    prefix = "拼音转中文："
    inputs = [prefix + text for text in examples["input"]]
    # inputs = examples["input"] # Pinyin
    targets = examples["target"] # Chinese

    model_inputs = tokenizer(
        inputs,
        padding="longest",
        truncation=True,
        # padding="max_length"
    )

    labels = tokenizer(
        text_target=targets,
        truncation=True,
        padding="longest"
        # padding="max_length"
        # padding=True
    )

    # label_ids = labels["input_ids"]

    label_ids = [
        [-100 if token == tokenizer.pad_token_id else token for token in label]
        for label in labels["input_ids"]
    ]
    model_inputs["labels"] = label_ids

    return model_inputs

# Tokenize the entire dataset
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_eval_dataset = eval_dataset.map(preprocess_function, batched=True)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)


Map:   0%|          | 0/14959 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/3740 [00:00<?, ? examples/s]

In [6]:
bleu = load("bleu")
chrf = load("chrf")
rouge = load("rouge")

import numpy as np

def space_chars(text):
    return " ".join(list(text.strip()))

def compute_metrics(eval_preds):
    preds, labels = eval_preds

    print("Pred shape:", np.array(preds).shape)
    print("Labels shape:", np.array(labels).shape)

    labels = np.where(labels == -100, tokenizer.pad_token_id, labels)

    # clip predictions to valid token ID range
    preds = np.clip(preds, 0, tokenizer.vocab_size - 1)

    invalid = [(i, val) for i, row in enumerate(labels) for val in row if val < 0 or val >= tokenizer.vocab_size]
    if invalid:
        print("Invalid token IDs found:", invalid[:5])  # print just a few for now
        raise ValueError("Found token ids out of tokenizer vocab range.")

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Normalize whitespace, remove special tokens, etc. if needed
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [label.strip() for label in decoded_labels]

    # Character-level accuracy
    char_correct = 0
    char_total = 0
    for pred, label in zip(decoded_preds, decoded_labels):
        char_total += len(label)
        char_correct += sum(p == l for p, l in zip(pred, label))

    char_accuracy = char_correct / char_total if char_total > 0 else 0.0

    spaced_preds = [space_chars(pred) for pred in decoded_preds]
    spaced_labels = [space_chars(label) for label in decoded_labels]

    # BLEU (optional, use for logging or reference)
    bleu_result = bleu.compute(predictions=spaced_preds, references=[[lbl] for lbl in spaced_labels])
    bleu_score = bleu_result["bleu"]

    # chrf
    chrf_score = chrf.compute(predictions=spaced_preds, references=[[lbl] for lbl in spaced_labels])["score"]

    # ROUGE (use spaced strings so it treats each char as a token)
    rouge_result = rouge.compute(predictions=spaced_preds, references=spaced_labels, use_stemmer=False)
    rouge1 = rouge_result["rouge1"]
    rouge2 = rouge_result["rouge2"]
    rougeL = rouge_result["rougeL"]

    return {
        "char_accuracy": char_accuracy,
        "chrf": chrf_score,
        "bleu": bleu_score,
        "rouge1": rouge1,
        "rouge2": rouge2,
        "rougeL": rougeL,
    }

training_args = Seq2SeqTrainingArguments(
    output_dir="./mt5_large_pinyin_to_chinese",  # output directory
    evaluation_strategy="steps",
    learning_rate=3e-5,
    per_device_train_batch_size=16,        # batch size per device during training
    per_device_eval_batch_size=16,         # batch size per device during training
    weight_decay=0.01,
    save_total_limit=2,                    # limit the total amount of checkpoints
    num_train_epochs=20,                   # total number of training epochs
    predict_with_generate=True,
    load_best_model_at_end=True,
    greater_is_better=True,
    metric_for_best_model="char_accuracy", # since character to character mapping
    gradient_accumulation_steps=8,
    save_steps=200,                        # save checkpoint every 500 steps
    logging_steps=100,                     # log every 100 steps
    fp16=False,
    bf16=True,
    eval_steps=200,                        # evaluation step
    save_strategy="steps",
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback()]
)

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/9.01k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

  trainer = Seq2SeqTrainer(


In [7]:
trainer.train()
trainer.evaluate()
trainer.save_model("./mt5_large_pinyin_to_chinese_final")
# # ----------------- VALIDATE -------------------
import pandas as pd
import torch
from transformers import MT5ForConditionalGeneration, MT5Tokenizer

model_path = "./mt5_large_pinyin_to_chinese_final"  # Adjust path if different
tokenizer = MT5Tokenizer.from_pretrained(model_path)
model = MT5ForConditionalGeneration.from_pretrained(model_path)

# Use GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
# device = "cpu"
model.to(device)

test_csv_path = "eval.csv"
df_test = pd.read_csv(test_csv_path)

if "Pinyin" not in df_test.columns:
    raise ValueError("The CSV file must contain a column with Chinese words.")

def generate_chinese_text(pinyin_sentence):
    # input_text = "这句话是拼音，每个拼音中间隔开了空格，每个拼音对应一个字，数字是音调。请将此拼音转成中文：" + pinyin_sentence
    input_text = "拼音转中文：" + pinyin_sentence
    input_ids = tokenizer(input_text, return_tensors="pt", padding="longest", truncation=True)["input_ids"]
    input_ids = torch.LongTensor(input_ids).view(1, -1).to(model.device)

    generated_ids = model.generate(
        input_ids,
        max_new_tokens=64,
        num_beams=4,  # new
        early_stopping=True,  # new
        num_return_sequences=1, # new
    )
    prediction = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    print(f"Pinyin: {pinyin_sentence}\nChinese: {prediction}\n")

    return prediction

df_test["Predicted Chinese"] = df_test["Pinyin"].apply(generate_chinese_text)


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mpipiroy03[0m ([33mpipiroy03-simon-fraser-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss,Validation Loss,Char Accuracy,Chrf,Bleu,Rouge1,Rouge2,Rougel
200,4.5984,2.045696,0.336985,23.603024,0.275929,0.013889,0.012552,0.013711
400,2.5156,1.423862,0.500036,34.858541,0.405893,0.01534,0.013404,0.015295
600,1.971,1.177722,0.581939,41.17584,0.473639,0.015476,0.013636,0.015419
800,1.6782,1.053928,0.61369,44.467705,0.507414,0.015476,0.013636,0.015419
1000,1.4901,0.967314,0.646915,47.317729,0.53673,0.015476,0.013636,0.015419
1200,1.3703,0.903499,0.666397,49.402358,0.55776,0.015699,0.013636,0.015699
1400,1.2728,0.860882,0.67809,50.588827,0.568776,0.015476,0.013636,0.015419
1600,1.1936,0.83267,0.689908,51.894333,0.581557,0.015699,0.013708,0.015684
1800,1.1471,0.807542,0.693956,52.546089,0.587811,0.015699,0.013636,0.015699
2000,1.1149,0.793031,0.698687,53.279301,0.594678,0.015763,0.013743,0.015726


Pred shape: (3740, 21)
Labels shape: (3740, 47)
Pred shape: (3740, 21)
Labels shape: (3740, 47)
Pred shape: (3740, 21)
Labels shape: (3740, 47)
Pred shape: (3740, 21)
Labels shape: (3740, 47)
Pred shape: (3740, 21)
Labels shape: (3740, 47)
Pred shape: (3740, 21)
Labels shape: (3740, 47)
Pred shape: (3740, 21)
Labels shape: (3740, 47)
Pred shape: (3740, 21)
Labels shape: (3740, 47)
Pred shape: (3740, 21)
Labels shape: (3740, 47)
Pred shape: (3740, 21)
Labels shape: (3740, 47)
Pred shape: (3740, 21)
Labels shape: (3740, 47)


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


Pred shape: (3740, 21)
Labels shape: (3740, 47)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Pinyin: ta xi huan na ta de tong xue kai wan xiao
Chinese: 他喜欢拿他的同学开玩笑

Pinyin: ru guo you ji hui wo jiang qu kan na bu dian ying
Chinese: 如果有机会我将去看那部电影

Pinyin: cong zhe ge jiao du lai kan hua zhong de nv zi mian dai wei xiao
Chinese: 从这个角度来看花中的女子面带微小

Pinyin: ta song wo de ji nian pin hen xiao er qie bu zhi qian
Chinese: 他送我的纪念品很小而且不值得

Pinyin: zhe ben shu shi ta yi qian xie de ju ben de kuo chong
Chinese: 这本书是他以前写的剧本的扩充

Pinyin: wo men zhen de chu yu kun jing mei ren zhao gu ying er
Chinese: 我们真得出于困境没人照古英而

Pinyin: ta shi huo de yi xue jiao ke shu
Chinese: 她是获得一学教考试

Pinyin: wo dong yi dian er de yu
Chinese: 我动一点儿的语

Pinyin: ci li shi yi zhong zi ran xian xiang
Chinese: 磁力是一种自然现象

Pinyin: qi shi nian dai shi chao duan qun shi dai
Chinese: 七十年代是大规模群时代

Pinyin: ji qi sheng chan yi jing dai ti le shou gong lao zuo
Chinese: 机器生产已经带起了手工劳动

Pinyin: lao zi shuang fang zhi jian cun zai da liang mao dun
Chinese: 老子双方之间存在大量矛盾

Pinyin: kong pa wo de fang wei gan hen cha yin ci wo rong yi mi lu

In [None]:
trainer.save_model("./t5_pinyin_to_chinese_final")

# Validate

In [None]:
import pandas as pd
import torch
from transformers import MT5ForConditionalGeneration, MT5Tokenizer

In [None]:
import pandas as pd
import torch
from transformers import MT5ForConditionalGeneration, MT5Tokenizer

model_path = "./mt5_pinyin_to_chinese_final"  # Adjust path if different
tokenizer = MT5Tokenizer.from_pretrained(model_path)
model = MT5ForConditionalGeneration.from_pretrained(model_path)

# Use GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
# device = "cpu"
model.to(device)

test_csv_path = "eval.csv"
df_test = pd.read_csv(test_csv_path)

if "Pinyin" not in df_test.columns:
    raise ValueError("The CSV file must contain a column with Chinese words.")

def generate_chinese_text(pinyin_sentence):
    # input_text = "这句话是拼音，每个拼音中间隔开了空格，每个拼音对应一个字，数字是音调。请将此拼音转成中文：" + pinyin_sentence
    input_text = "拼音转中文：" + pinyin_sentence
    input_ids = tokenizer(input_text, return_tensors="pt", padding="longest", truncation=True)["input_ids"]
    input_ids = torch.LongTensor(input_ids).view(1, -1).to(model.device)

    generated_ids = model.generate(
        input_ids,
        max_new_tokens=64,
        num_beams=4,  # new
        early_stopping=True,  # new
        num_return_sequences=1, # new
    )
    prediction = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    print(f"Pinyin: {pinyin_sentence}\nChinese: {prediction}\n")

    return prediction

df_test["Predicted Chinese"] = df_test["Pinyin"].apply(generate_chinese_text)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Pinyin: ta xi huan na ta de tong xue kai wan xiao
Chinese: 他喜欢那他的同学开课

Pinyin: ru guo you ji hui wo jiang qu kan na bu dian ying
Chinese: 如果有机会我将去看那部电影

Pinyin: cong zhe ge jiao du lai kan hua zhong de nv zi mian dai wei xiao
Chinese: 从这个教堂来看画中的女孩面大为小

Pinyin: ta song wo de ji nian pin hen xiao er qie bu zhi qian
Chinese: 他送我的年度礼物很小但是并不之前

Pinyin: zhe ben shu shi ta yi qian xie de ju ben de kuo chong
Chinese: 这本书是他以前写的简短的短篇

Pinyin: wo men zhen de chu yu kun jing mei ren zhao gu ying er
Chinese: 我们真得处於荒凉没有人照顧爱儿

Pinyin: ta shi huo de yi xue jiao ke shu
Chinese: 他是个学生教科书

Pinyin: wo dong yi dian er de yu
Chinese: 我握一点儿的泪

Pinyin: ci li shi yi zhong zi ran xian xiang
Chinese: 奇缘是一种自私现象

Pinyin: qi shi nian dai shi chao duan qun shi dai
Chinese: 历史年代是寥寥数年代

Pinyin: ji qi sheng chan yi jing dai ti le shou gong lao zuo
Chinese: 计算机工作已经完成了手工工作

Pinyin: lao zi shuang fang zhi jian cun zai da liang mao dun
Chinese: 驴子双脚处处在大面积马达

Pinyin: kong pa wo de fang wei gan hen cha yin ci wo rong yi mi l

In [None]:
df_test.to_csv("./output.csv", index=False, encoding="utf-8")

In [None]:
test_csv_path = "eval.csv"  # Update path if necessary
df_test = pd.read_csv(test_csv_path)

if "Pinyin" not in df_test.columns:
    raise ValueError("The CSV file must contain a column with Chinese words.")

In [None]:
def generate_chinese_text(pinyin_sentence):
    """ Generates Chinese text from a given pinyin sentence using the fine-tuned MT5 model """
    input_text = "这句话是拼音，每个拼音中间隔开了空格，每个拼音对应一个字，数字是音调。请将此拼音转成中文：" + pinyin_sentence
    input_ids = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=1200).input_ids
    input_ids = input_ids.to(device)

    # print(f"Input IDs: {input_ids}")

    # Generate output
    with torch.no_grad():
        output_ids = model.generate(input_ids, max_length=350, min_length=4, do_sample=False)

    # print(f"Output IDs: {output_ids}")
    prediction = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    print(f"Pinyin: {pinyin_sentence}\nChinese: {prediction}\n")

    return prediction

df_test["Predicted Chinese"] = df_test["Pinyin"].apply(generate_chinese_text)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Chinese: 他已经在在

Pinyin: ta1 zai4 lan2 ban3 qiu2 de zheng1 qiang3 shang4 mei2 you3 gei3 qiu2 dui4 dai4 lai2 xu1 yao4 de bang1 zhu4
Chinese: 他已经在在

Pinyin: ta1 de lan2 ban3 qiu2 he2 ke1 bi3 bu4 lai2 en1 te4 ma3 te4 ba1 en1 si1 chi2 ping2
Chinese: 他已经在比赛的

Pinyin: zhe4 shi4 chu2 le si4 yue4 shi2 san1 ri4 dui4 zhen4 ma3 ci4 dui4 yin1 shang1 ti2 qian2 tui4 chang3 wai4
Chinese: 他已经在被打败的

Pinyin: ke3 shi4 zai4 jin1 nian2 de ji4 hou4 sai4 zhong1 ta1 men que4 mian4 lin2 yi2 ge4 xiang1 dang1 gan1 ga4 de wen4 ti2
Chinese: 他已经在在

Pinyin: hu2 ren2 yi1 zhan4 jiu1 chu1 yi2 ge4 zui4 cha4 xian1 sheng1 jia1 yi1 jia1 yi1 jia1 gong1 ye3 bu4 xing2 fang2 ye3 bu4 xing2 xin1 lang4 ti3 yu4 xun4 jin1 tian1 hu2 ren2 dui4 zai4 ji4 hou4 sai4 ci4 lun2 mian4 dui4 xiao3 niu2 dui4 de tiao3 zhan4 shi2
Chinese: 他已经在被认为的

Pinyin: er2 qiu2 dui4 zai4 ben3 sai4 ji4 zhi1 qian2 zhong4 jin1 qian1 xia4 de shi3 di4 fu1 bu4 lei2 ke4 jin3 chu1 zhan4 jiu3 fen1 zhong1


KeyboardInterrupt: 

# Misc

In [None]:
del dataset

In [9]:
# !zip -r mt5_pinyin_to_chinese.zip mt5_pinyin_to_chinese//
!zip -r mt5_large_pinyin_to_chinese_final.zip mt5_large_pinyin_to_chinese_final//

  adding: mt5_large_pinyin_to_chinese_final// (stored 0%)
  adding: mt5_large_pinyin_to_chinese_final//generation_config.json (deflated 29%)
  adding: mt5_large_pinyin_to_chinese_final//special_tokens_map.json (deflated 73%)
  adding: mt5_large_pinyin_to_chinese_final//model.safetensors (deflated 16%)
  adding: mt5_large_pinyin_to_chinese_final//training_args.bin (deflated 51%)
  adding: mt5_large_pinyin_to_chinese_final//spiece.model (deflated 46%)
  adding: mt5_large_pinyin_to_chinese_final//tokenizer_config.json (deflated 68%)
  adding: mt5_large_pinyin_to_chinese_final//config.json (deflated 48%)


In [None]:
from google.colab import files
files.download("/content/mt5_pinyin_to_chinese.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>