# mT5-base Finetune


In [11]:
!pip install datasets bitsandbytes evaluate sacrebleu rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=e6f0f6cb0f5b13cfddf53c2bbb2e6888574c733813e0420dafd29938aef9f044
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [None]:
import pandas as pd
from datasets import Dataset
from transformers import MT5Tokenizer, MT5ForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq, EarlyStoppingCallback
from evaluate import load
import numpy as np
import torch

In [None]:
csv_path = "../data/inputs/standard/train.csv"
df = pd.read_csv(csv_path)

examples = []
for _, row in df.iterrows():
    if pd.notna(row.get("Pinyin")) and pd.notna(row.get("Chinese")):
        examples.append({
            "input": row["Pinyin"],
            "target": row["Chinese"]
        })
dataset = Dataset.from_pandas(pd.DataFrame(examples))

train_size = int(0.8 * len(dataset))
eval_size = len(dataset) - train_size
train_subset, eval_subset = torch.utils.data.random_split(dataset, [train_size, eval_size])

train_dataset = Dataset.from_list([dataset[i] for i in train_subset.indices])
eval_dataset = Dataset.from_list([dataset[i] for i in eval_subset.indices])
del dataset

In [5]:
model_name = "google/mt5-base"
tokenizer = MT5Tokenizer.from_pretrained(model_name)
model = MT5ForConditionalGeneration.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'T5Tokenizer'. 
The class this function is called from is 'MT5Tokenizer'.
You are using the default legacy behaviour of the <class 'transformers.models.mt5.tokenization_mt5.MT5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only 

In [None]:
def preprocess_function(examples):
    prefix = "拼音转中文："
    inputs = [prefix + text for text in examples["input"]]
    targets = examples["target"] # Chinese

    model_inputs = tokenizer(
        inputs,
        padding="longest",
        truncation=True,
    )

    labels = tokenizer(
        text_target=targets,
        truncation=True,
        padding="longest"
    )

    label_ids = [
        [-100 if token == tokenizer.pad_token_id else token for token in label]
        for label in labels["input_ids"]
    ]
    model_inputs["labels"] = label_ids

    return model_inputs

# Tokenize the entire dataset
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_eval_dataset = eval_dataset.map(preprocess_function, batched=True)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)


Map:   0%|          | 0/14959 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/3740 [00:00<?, ? examples/s]

In [None]:
bleu = load("bleu")
chrf = load("chrf")
rouge = load("rouge")

def space_chars(text):
    return " ".join(list(text.strip()))

def compute_metrics(eval_preds):
    preds, labels = eval_preds

    print("Pred shape:", np.array(preds).shape)
    print("Labels shape:", np.array(labels).shape)

    labels = np.where(labels == -100, tokenizer.pad_token_id, labels)

    # clip predictions to valid token ID range
    preds = np.clip(preds, 0, tokenizer.vocab_size - 1)

    invalid = [(i, val) for i, row in enumerate(labels) for val in row if val < 0 or val >= tokenizer.vocab_size]
    if invalid:
        print("Invalid token IDs found:", invalid[:5])  # print just a few for now
        raise ValueError("Found token ids out of tokenizer vocab range.")

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Normalize whitespace, remove special tokens, etc. if needed
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [label.strip() for label in decoded_labels]

    # Character-level accuracy
    char_correct = 0
    char_total = 0
    for pred, label in zip(decoded_preds, decoded_labels):
        char_total += len(label)
        char_correct += sum(p == l for p, l in zip(pred, label))

    char_accuracy = char_correct / char_total if char_total > 0 else 0.0

    spaced_preds = [space_chars(pred) for pred in decoded_preds]
    spaced_labels = [space_chars(label) for label in decoded_labels]

    # BLEU (optional, use for logging or reference)
    bleu_result = bleu.compute(predictions=spaced_preds, references=[[lbl] for lbl in spaced_labels])
    bleu_score = bleu_result["bleu"]

    # chrf
    chrf_score = chrf.compute(predictions=spaced_preds, references=[[lbl] for lbl in spaced_labels])["score"]

    # ROUGE (use spaced strings so it treats each char as a token)
    rouge_result = rouge.compute(predictions=spaced_preds, references=spaced_labels, use_stemmer=False)
    rouge1 = rouge_result["rouge1"]
    rouge2 = rouge_result["rouge2"]
    rougeL = rouge_result["rougeL"]

    return {
        "char_accuracy": char_accuracy,
        "chrf": chrf_score,
        "bleu": bleu_score,
        "rouge1": rouge1,
        "rouge2": rouge2,
        "rougeL": rougeL,
    }

training_args = Seq2SeqTrainingArguments(
    output_dir="./mt5_pinyin_to_chinese",  
    evaluation_strategy="steps",
    learning_rate=3e-5,
    per_device_train_batch_size=16, 
    per_device_eval_batch_size=16,   
    weight_decay=0.01,
    save_total_limit=2,     
    num_train_epochs=20,         
    predict_with_generate=True,
    load_best_model_at_end=True,
    greater_is_better=True,
    metric_for_best_model="char_accuracy",
    gradient_accumulation_steps=8,
    save_steps=200,                    
    logging_steps=100,         
    fp16=False,
    bf16=True,
    eval_steps=200,          
    save_strategy="steps",
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback()]
)

  trainer = Seq2SeqTrainer(


In [None]:
trainer.train()
trainer.evaluate()
trainer.save_model("./mt5_pinyin_to_chinese_final")
# # ----------------- VALIDATE -------------------
import pandas as pd
import torch
from transformers import MT5ForConditionalGeneration, MT5Tokenizer

model_path = "./mt5_pinyin_to_chinese_final"  # Adjust path if different
tokenizer = MT5Tokenizer.from_pretrained(model_path)
model = MT5ForConditionalGeneration.from_pretrained(model_path)

# Use GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
# device = "cpu"
model.to(device)

test_csv_path = "eval.csv"
df_test = pd.read_csv(test_csv_path)

if "Pinyin" not in df_test.columns:
    raise ValueError("The CSV file must contain a column with Chinese words.")

def generate_chinese_text(pinyin_sentence):
    input_text = "拼音转中文：" + pinyin_sentence
    input_ids = tokenizer(input_text, return_tensors="pt", padding="longest", truncation=True)["input_ids"]
    input_ids = torch.LongTensor(input_ids).view(1, -1).to(model.device)

    generated_ids = model.generate(
        input_ids,
        max_new_tokens=64,
        num_beams=4,  # new
        early_stopping=True,  # new
        num_return_sequences=1, # new
    )
    prediction = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    print(f"Pinyin: {pinyin_sentence}\nChinese: {prediction}\n")

    return prediction

df_test["Predicted Chinese"] = df_test["Pinyin"].apply(generate_chinese_text)


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mpipiroy03[0m ([33mpipiroy03-simon-fraser-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss,Validation Loss,Char Accuracy,Chrf,Bleu,Rouge1,Rouge2,Rougel
200,6.0429,4.546898,0.027322,1.736691,0.003458,0.0,0.0,0.0
400,4.768,3.325936,0.148407,9.137683,0.087714,0.01426,0.012451,0.014216
600,4.1207,2.902268,0.209706,12.575057,0.133871,0.015242,0.013359,0.015196
800,3.7467,2.644571,0.261885,15.501857,0.173246,0.016079,0.013738,0.016038
1000,3.468,2.447698,0.309405,17.980902,0.205804,0.015998,0.01376,0.015865
1200,3.2487,2.315383,0.338939,19.830784,0.229717,0.015852,0.013837,0.015896
1400,3.0907,2.205347,0.367484,21.390649,0.249524,0.016178,0.014037,0.016121
1600,2.9509,2.130654,0.389248,22.653187,0.265482,0.01631,0.014439,0.01631
1800,2.8706,2.075549,0.398115,23.410806,0.274536,0.016178,0.014037,0.016121
2000,2.8076,2.03821,0.406011,23.95804,0.280952,0.016221,0.014305,0.016176


Pred shape: (3740, 21)
Labels shape: (3740, 46)
Pred shape: (3740, 21)
Labels shape: (3740, 46)
Pred shape: (3740, 21)
Labels shape: (3740, 46)
Pred shape: (3740, 21)
Labels shape: (3740, 46)
Pred shape: (3740, 21)
Labels shape: (3740, 46)
Pred shape: (3740, 21)
Labels shape: (3740, 46)
Pred shape: (3740, 21)
Labels shape: (3740, 46)
Pred shape: (3740, 21)
Labels shape: (3740, 46)
Pred shape: (3740, 21)
Labels shape: (3740, 46)
Pred shape: (3740, 21)
Labels shape: (3740, 46)
Pred shape: (3740, 21)
Labels shape: (3740, 46)


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


Pred shape: (3740, 21)
Labels shape: (3740, 46)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Pinyin: ta xi huan na ta de tong xue kai wan xiao
Chinese: 他喜欢那他的同学开课

Pinyin: ru guo you ji hui wo jiang qu kan na bu dian ying
Chinese: 如果有机会我将去看那部电影

Pinyin: cong zhe ge jiao du lai kan hua zhong de nv zi mian dai wei xiao
Chinese: 从这个教堂来看画中的女孩面带为小

Pinyin: ta song wo de ji nian pin hen xiao er qie bu zhi qian
Chinese: 他送我的年度礼物很小但是并不之前

Pinyin: zhe ben shu shi ta yi qian xie de ju ben de kuo chong
Chinese: 这本书是他以前写的简短的短篇

Pinyin: wo men zhen de chu yu kun jing mei ren zhao gu ying er
Chinese: 我们真得处於荒凉没有人照护英勇

Pinyin: ta shi huo de yi xue jiao ke shu
Chinese: 他是计算机的一名计算机

Pinyin: wo dong yi dian er de yu
Chinese: 我握一点儿的泪

Pinyin: ci li shi yi zhong zi ran xian xiang
Chinese: 地球是一种自远现象

Pinyin: qi shi nian dai shi chao duan qun shi dai
Chinese: 历史年代是超长年代

Pinyin: ji qi sheng chan yi jing dai ti le shou gong lao zuo
Chinese: 计算机工作已经完成了手工作业

Pinyin: lao zi shuang fang zhi jian cun zai da liang mao dun
Chinese: 小孩双方之间存在大面积马达

Pinyin: kong pa wo de fang wei gan hen cha yin ci wo rong yi m

In [None]:
trainer.save_model("./mt5_base_pinyin_to_chinese_final")

# Misc

In [None]:
del dataset

In [None]:
!zip -r mt5_base_pinyin_to_chinese_final.zip mt5_base_pinyin_to_chinese_final//

  adding: t5_pinyin_to_chinese_final// (stored 0%)
  adding: t5_pinyin_to_chinese_final//config.json (deflated 48%)
  adding: t5_pinyin_to_chinese_final//generation_config.json (deflated 29%)
  adding: t5_pinyin_to_chinese_final//added_tokens.json (deflated 83%)
  adding: t5_pinyin_to_chinese_final//model.safetensors (deflated 7%)
  adding: t5_pinyin_to_chinese_final//training_args.bin (deflated 51%)
  adding: t5_pinyin_to_chinese_final//tokenizer_config.json (deflated 94%)
  adding: t5_pinyin_to_chinese_final//spiece.model (deflated 46%)
  adding: t5_pinyin_to_chinese_final//special_tokens_map.json (deflated 85%)


In [None]:
from google.colab import files
files.download("/content/mt5_base_pinyin_to_chinese.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>