# mT5-small Finetune


In [1]:
!pip install datasets bitsandbytes evaluate sacrebleu rouge_score

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.4-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12

In [None]:
import pandas as pd
from datasets import Dataset
from transformers import MT5Tokenizer, MT5ForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq, EarlyStoppingCallback
from evaluate import load
import numpy as np
import torch

In [None]:
csv_path = "../data/inputs/standard/train.csv"
df = pd.read_csv(csv_path)

examples = []
for _, row in df.iterrows():
    if pd.notna(row.get("Pinyin")) and pd.notna(row.get("Chinese")):
        examples.append({
            "input": row["Pinyin"],
            "target": row["Chinese"]
        })
dataset = Dataset.from_pandas(pd.DataFrame(examples))

train_size = int(0.8 * len(dataset))
eval_size = len(dataset) - train_size
train_subset, eval_subset = torch.utils.data.random_split(dataset, [train_size, eval_size])

train_dataset = Dataset.from_list([dataset[i] for i in train_subset.indices])
eval_dataset = Dataset.from_list([dataset[i] for i in eval_subset.indices])
del dataset

In [5]:
model_name = "google/mt5-small"
tokenizer = MT5Tokenizer.from_pretrained(model_name)
model = MT5ForConditionalGeneration.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'T5Tokenizer'. 
The class this function is called from is 'MT5Tokenizer'.
You are using the default legacy behaviour of the <class 'transformers.models.mt5.tokenization_mt5.MT5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
def preprocess_function(examples):
    prefix = "拼音转中文："
    inputs = [prefix + text for text in examples["input"]]
    targets = examples["target"] # Chinese

    model_inputs = tokenizer(
        inputs,
        padding="longest",
        truncation=True,
    )

    labels = tokenizer(
        text_target=targets,
        truncation=True,
        padding="longest"
    )

    label_ids = [
        [-100 if token == tokenizer.pad_token_id else token for token in label]
        for label in labels["input_ids"]
    ]
    model_inputs["labels"] = label_ids

    return model_inputs

# Tokenize the entire dataset
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_eval_dataset = eval_dataset.map(preprocess_function, batched=True)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)


Map:   0%|          | 0/14959 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/3740 [00:00<?, ? examples/s]

In [None]:
bleu = load("bleu")
chrf = load("chrf")
rouge = load("rouge")

def space_chars(text):
    return " ".join(list(text.strip()))

def compute_metrics(eval_preds):
    preds, labels = eval_preds

    print("Pred shape:", np.array(preds).shape)
    print("Labels shape:", np.array(labels).shape)

    labels = np.where(labels == -100, tokenizer.pad_token_id, labels)

    # clip predictions to valid token ID range
    preds = np.clip(preds, 0, tokenizer.vocab_size - 1)

    invalid = [(i, val) for i, row in enumerate(labels) for val in row if val < 0 or val >= tokenizer.vocab_size]
    if invalid:
        print("Invalid token IDs found:", invalid[:5])  # print just a few for now
        raise ValueError("Found token ids out of tokenizer vocab range.")

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Normalize whitespace, remove special tokens, etc. if needed
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [label.strip() for label in decoded_labels]

    # Character-level accuracy
    char_correct = 0
    char_total = 0
    for pred, label in zip(decoded_preds, decoded_labels):
        char_total += len(label)
        char_correct += sum(p == l for p, l in zip(pred, label))

    char_accuracy = char_correct / char_total if char_total > 0 else 0.0

    spaced_preds = [space_chars(pred) for pred in decoded_preds]
    spaced_labels = [space_chars(label) for label in decoded_labels]

    # BLEU (optional, use for logging or reference)
    bleu_result = bleu.compute(predictions=spaced_preds, references=[[lbl] for lbl in spaced_labels])
    bleu_score = bleu_result["bleu"]

    # chrf
    chrf_score = chrf.compute(predictions=spaced_preds, references=[[lbl] for lbl in spaced_labels])["score"]

    # ROUGE (use spaced strings so it treats each char as a token)
    rouge_result = rouge.compute(predictions=spaced_preds, references=spaced_labels, use_stemmer=False)
    rouge1 = rouge_result["rouge1"]
    rouge2 = rouge_result["rouge2"]
    rougeL = rouge_result["rougeL"]

    return {
        "char_accuracy": char_accuracy,
        "chrf": chrf_score,
        "bleu": bleu_score,
        "rouge1": rouge1,
        "rouge2": rouge2,
        "rougeL": rougeL,
    }

training_args = Seq2SeqTrainingArguments(
    output_dir="./mt5_pinyin_to_chinese",
    evaluation_strategy="steps",
    learning_rate=3e-5,
    per_device_train_batch_size=16,      
    per_device_eval_batch_size=16,         
    weight_decay=0.01,
    save_total_limit=2,                    
    num_train_epochs=20,
    predict_with_generate=True,
    load_best_model_at_end=True,
    greater_is_better=True,
    metric_for_best_model="char_accuracy",
    gradient_accumulation_steps=8,
    save_steps=200,
    logging_steps=100,
    fp16=False,
    bf16=True,
    eval_steps=200,
    save_strategy="steps",
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback()]
)

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/9.01k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

  trainer = Seq2SeqTrainer(


In [None]:
trainer.train()
trainer.evaluate()
trainer.save_model("./mt5_pinyin_to_chinese_final")
# # ----------------- VALIDATE -------------------
import pandas as pd
import torch
from transformers import MT5ForConditionalGeneration, MT5Tokenizer

model_path = "./mt5_pinyin_to_chinese_final"  # Adjust path if different
tokenizer = MT5Tokenizer.from_pretrained(model_path)
model = MT5ForConditionalGeneration.from_pretrained(model_path)

# Use GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
# device = "cpu"
model.to(device)

test_csv_path = "eval.csv"
df_test = pd.read_csv(test_csv_path)

if "Pinyin" not in df_test.columns:
    raise ValueError("The CSV file must contain a column with Chinese words.")

def generate_chinese_text(pinyin_sentence):
    input_text = "拼音转中文：" + pinyin_sentence
    input_ids = tokenizer(input_text, return_tensors="pt", padding="longest", truncation=True)["input_ids"]
    input_ids = torch.LongTensor(input_ids).view(1, -1).to(model.device)

    generated_ids = model.generate(
        input_ids,
        max_new_tokens=64,
        num_beams=4,  # new
        early_stopping=True,  # new
        num_return_sequences=1, # new
    )
    prediction = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    print(f"Pinyin: {pinyin_sentence}\nChinese: {prediction}\n")

    return prediction

df_test["Predicted Chinese"] = df_test["Pinyin"].apply(generate_chinese_text)


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mpipiroy03[0m ([33mpipiroy03-simon-fraser-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss,Validation Loss,Char Accuracy,Chrf,Bleu,Rouge1,Rouge2,Rougel
200,10.542,7.423372,0.002218,0.758648,0.0,0.00127,0.0,0.001292
400,6.7156,5.261571,0.010747,0.564026,0.0,0.0,0.0,0.0
600,6.1256,5.1539,0.011648,0.695767,0.0,0.0,0.0,0.0
800,5.8947,5.047619,0.013596,0.864433,0.0,0.0,0.0,0.0
1000,5.7448,4.951478,0.020952,1.291674,0.001274,0.000178,0.0,0.000178
1200,5.6267,4.863754,0.024847,1.523726,0.001322,0.000267,0.000267,0.000267
1400,5.5582,4.791262,0.031573,1.818731,0.002234,0.000802,0.000579,0.000802
1600,5.4887,4.730064,0.036766,2.119532,0.003751,0.001248,0.001136,0.001159
1800,5.421,4.675488,0.041851,2.449204,0.005256,0.002507,0.002139,0.002516
2000,5.3943,4.639144,0.044988,2.655362,0.007235,0.003387,0.002674,0.003387


Pred shape: (3740, 21)
Labels shape: (3740, 52)
Pred shape: (3740, 21)
Labels shape: (3740, 52)
Pred shape: (3740, 20)
Labels shape: (3740, 52)
Pred shape: (3740, 21)
Labels shape: (3740, 52)
Pred shape: (3740, 21)
Labels shape: (3740, 52)
Pred shape: (3740, 21)
Labels shape: (3740, 52)
Pred shape: (3740, 21)
Labels shape: (3740, 52)
Pred shape: (3740, 21)
Labels shape: (3740, 52)
Pred shape: (3740, 21)
Labels shape: (3740, 52)
Pred shape: (3740, 21)
Labels shape: (3740, 52)
Pred shape: (3740, 21)
Labels shape: (3740, 52)


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


Pred shape: (3740, 21)
Labels shape: (3740, 52)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Pinyin: ta xi huan na ta de tong xue kai wan xiao
Chinese: 他把他带到他的房间里

Pinyin: ru guo you ji hui wo jiang qu kan na bu dian ying
Chinese: 我希望你能把这项工作做好

Pinyin: cong zhe ge jiao du lai kan hua zhong de nv zi mian dai wei xiao
Chinese: 这座建筑物有许多建筑物

Pinyin: ta song wo de ji nian pin hen xiao er qie bu zhi qian
Chinese: 他对我来说很高兴

Pinyin: zhe ben shu shi ta yi qian xie de ju ben de kuo chong
Chinese: 这本书是他的著作

Pinyin: wo men zhen de chu yu kun jing mei ren zhao gu ying er
Chinese: 我们对我们来说不公平的

Pinyin: ta shi huo de yi xue jiao ke shu
Chinese: 他对他的爱好者来说很重要

Pinyin: wo dong yi dian er de yu
Chinese: 我对这项工作感到惊讶

Pinyin: ci li shi yi zhong zi ran xian xiang
Chinese: 这座建筑物有许多建筑物

Pinyin: qi shi nian dai shi chao duan qun shi dai
Chinese: 历史时期是历史时期

Pinyin: ji qi sheng chan yi jing dai ti le shou gong lao zuo
Chinese: 一艘船在船上飞翔

Pinyin: lao zi shuang fang zhi jian cun zai da liang mao dun
Chinese: 一艘船在船上走在船上

Pinyin: kong pa wo de fang wei gan hen cha yin ci wo rong yi mi lu
Chinese: 我对我来说很高兴

Pin

In [None]:
trainer.save_model("./mt5_small_pinyin_to_chinese_final")

# Misc

In [None]:
!zip -r mt5_small_pinyin_to_chinese_final.zip mt5_small_pinyin_to_chinese_final//

  adding: mt5_pinyin_to_chinese_final// (stored 0%)
  adding: mt5_pinyin_to_chinese_final//generation_config.json (deflated 29%)
  adding: mt5_pinyin_to_chinese_final//training_args.bin (deflated 51%)
  adding: mt5_pinyin_to_chinese_final//model.safetensors (deflated 24%)
  adding: mt5_pinyin_to_chinese_final//tokenizer_config.json (deflated 68%)
  adding: mt5_pinyin_to_chinese_final//config.json (deflated 48%)
  adding: mt5_pinyin_to_chinese_final//spiece.model (deflated 46%)
  adding: mt5_pinyin_to_chinese_final//special_tokens_map.json (deflated 73%)


In [None]:
from google.colab import files
files.download("/content/mt5_small_pinyin_to_chinese.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>