In [1]:
import torch
import pandas as pd
from evaluate import load
from transformers import MT5ForConditionalGeneration, MT5Tokenizer
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
bleu = load("bleu")
chrf = load("chrf")
rouge = load("rouge")

# Instruction to download our pretrained models

Visit public link to SFU Vault: https://vault.sfu.ca/index.php/s/TyWOLiIK7AZicmf

It contain 4 compressed pre-trained models for download

Download, unzip and move to ./fine_tuned_models for organization

### Uncomment to select pretuned model here and dataset to be evaluated

In [None]:
model_name = "./fine_tuned_models/mt5-large"
# model_name = "./fine_tuned_models/mt5-base"
# model_name = "./fine_tuned_models/mt5-small"
# model_name = "./fine_tuned_models/mengzi-t5-base"

model = MT5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = MT5Tokenizer.from_pretrained(model_name)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# eval_csv_path = "./data/inputs/standard/eval.csv"
eval_csv_path = "./data/inputs/typo/eval.csv"

eval_data = pd.read_csv(eval_csv_path)

In [37]:
preds = []
labels = []
for sample in tqdm(eval_data.to_dict(orient="records")):
    input_text = "拼音转中文：" + sample["Pinyin"]
    input_ids = tokenizer(input_text, return_tensors="pt", padding="longest", truncation=True)["input_ids"]
    input_ids = torch.LongTensor(input_ids).view(1, -1).to(model.device)
    generated_ids = model.generate(
        input_ids,
        max_new_tokens=64,
        num_beams=4,  # new
        early_stopping=True,  # new
        num_return_sequences=1,  # new
    )
    pred = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    print(f"Pinyin: " + sample["Pinyin"] + "\nChinese: " + pred + "\n\n")
    preds.append(pred.strip())
    labels.append(sample["Chinese"].strip())

  0%|          | 0/54 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
  2%|▏         | 1/54 [01:56<1:42:37, 116.18s/it]

Pinyin: ta ix huan na ta de tong xu kai an xiao
Chinese: 他拒绝拿他的同意开案小




  4%|▎         | 2/54 [04:15<1:52:19, 129.61s/it]

Pinyin: r guo yuo ji hui wo jiang qu kn na bu dian ying
Chinese: 如果遇机会我将去看那部电影




  6%|▌         | 3/54 [07:57<2:25:57, 171.72s/it]

Pinyin: cong zhe ge jiao fu lai akn hua zhng de nv i mian ai wei xiao
Chinese: 从这个教父来慷慨捐献的女子面爱为小




  7%|▋         | 4/54 [08:44<1:42:15, 122.71s/it]

Pinyin: ta song wo de ji ia pin ehn xiao er qie b zhi qian
Chinese: 他送我的酒精品喝小而且比之前




  9%|▉         | 5/54 [11:31<1:53:12, 138.62s/it]

Pinyin: zhe ben hu shi ta yi qian xie de h ben de ku chong
Chinese: 这本书是他以前写的黑板的枯燥




 11%|█         | 6/54 [12:54<1:35:54, 119.89s/it]

Pinyin: wo men zhen de chu yu kun ign mei ren zhao gu ying er
Chinese: 我们真得出于困扰没人照古英而




 13%|█▎        | 7/54 [14:12<1:22:58, 105.92s/it]

Pinyin: ta shi huo ed yi xue jiao ke hu
Chinese: 她是护士一学教可护




 15%|█▍        | 8/54 [14:54<1:05:46, 85.79s/it] 

Pinyin: ow domg yi dian er de yu
Chinese: 呕呕一点儿的呕




 17%|█▋        | 9/54 [15:22<50:40, 67.57s/it]  

Pinyin: ci l shi yi zhnog zi ran xian xiang
Chinese: 丑陋是一桩自然现象




 19%|█▊        | 10/54 [15:57<42:11, 57.53s/it]

Pinyin: i shi nian dai shi chao duan qun shi qi
Chinese: 五十年代是大规模人群时期




 20%|██        | 11/54 [16:26<35:05, 48.97s/it]

Pinyin: ji qi sheng chan yi ing dai ti le whou gog lao zu
Chinese: 机器生产已中断带起了腐朽老鼠




 22%|██▏       | 12/54 [17:08<32:38, 46.62s/it]

Pinyin: lao zi suang fang zhk jian ucn zai da liang mao dun
Chinese: 老子钻破箱间乌云在大量毛掸




 24%|██▍       | 13/54 [17:30<26:51, 39.30s/it]

Pinyin: kony pa wo de fang wei gan hen cha yin di wo ong yi mi lu
Chinese: 抠爬我的房间感觉很差劲地我拥一米路




 26%|██▌       | 14/54 [20:27<53:52, 80.81s/it]

Pinyin: cao shang hza man le lu zhu
Chinese: 草上挤满了路柱




 28%|██▊       | 15/54 [22:03<55:35, 85.53s/it]

Pinyin: cai zheng bu yuan ze shnag fqn dui ahe xie ti yj
Chinese: 财政不原则怂恿诽谤对恶性提议




 30%|██▉       | 16/54 [22:42<45:09, 71.29s/it]

Pinyin: zhneg fu de tong gao yi shi wei ma xkang jie jeu ba gin wen ti de yi bu
Chinese: 赞助者的通告已是为马克斯解决巴金问题的第一步




 31%|███▏      | 17/54 [23:22<38:10, 61.90s/it]

Pinyin: wo hzeng ixe yi feng zheng zhong qi shi re hui xin
Chinese: 我更加依依风筝中其实热回心




 33%|███▎      | 18/54 [25:30<49:11, 81.98s/it]

Pinyin: yi ming fei tu zai jie shang hzi azo shi duan yin ren hzu mu qi yu ei tu ez qiang jie yin ng
Chinese: 一名废徒在街上挥洒时断引人呼救其与恶徒扼强击咽喉




 35%|███▌      | 19/54 [29:26<1:14:44, 128.12s/it]

Pinyin: xiang cun de tian ye li wian m znog heng
Chinese: 乡村的田野里微微葱凝固




 37%|███▋      | 20/54 [31:15<1:09:22, 122.41s/it]

Pinyin: iq ping di shi qi jn yun zai iy ge ping mian hsagn huo li bian pai ban
Chinese: 一平地时其均匀在一个平面黑砂或砾边布局




 39%|███▉      | 21/54 [32:06<55:33, 101.02s/it]  

Pinyin: so de wai z fu yao lai le
Chinese: 斯的外祖父来了




 41%|████      | 22/54 [32:29<41:24, 77.64s/it] 

Pinyin: gai tiao kuan eni rong ru ixa
Chinese: 该条款难以入账




 43%|████▎     | 23/54 [33:08<34:01, 65.84s/it]

Pinyin: xian cheng li hze ouy bai er ba shi li
Chinese: 先程力耗油百而把时力




 44%|████▍     | 24/54 [33:56<30:19, 60.65s/it]

Pinyin: ze hzong qi che ke yi zai ia shen chang du fan wei nei diao tou
Chinese: 蒸汽汽车可以在亚硝酸泛微内掉头




 46%|████▋     | 25/54 [34:23<24:25, 50.54s/it]

Pinyin: wo men ian mian de na liang qi che tu ran zhuan ru uzo iban de cioa lu shang qu le
Chinese: 我们眼前的那辆汽车突然转入迂回的车路上了




 48%|████▊     | 26/54 [34:56<21:08, 45.31s/it]

Pinyin: yan  zai kong zhong lve guo
Chinese: 烟在空中掠过




 50%|█████     | 27/54 [35:55<22:14, 49.41s/it]

Pinyin: wo kao uf mu uf yang mian qiang guo huo
Chinese: 我靠浮木浮羊面墙过火




 52%|█████▏    | 28/54 [36:21<18:22, 42.39s/it]

Pinyin: fu bai de yua yni chuan bu fu ba huo fu xiu de gdn yuan
Chinese: 腐烂的油烟船不腐烂或腐朽的天然气




 54%|█████▎    | 29/54 [36:54<16:29, 39.57s/it]

Pinyin: tong guo guan dao ba re shu cong guo lu shu song aod sa re qi l
Chinese: 通过管道把热水从过路输送氧酸热气




 56%|█████▌    | 30/54 [39:14<27:47, 69.50s/it]

Pinyin: kina xu ji ge xiao shi ta odu zia ai sheng atn qi xi wan negn cong ta mu qin na nong dian qian
Chinese: 近五个小时他呕刺爱生安气细忘脑从他母亲那弄点钱




 57%|█████▋    | 31/54 [40:03<24:17, 63.37s/it]

Pinyin: fei ji yi mei xiao shi 900 gong li de su du rei ixng
Chinese: 飞机每小时900公里的速度雷霆




 59%|█████▉    | 32/54 [41:02<22:49, 62.26s/it]

Pinyin: a ke ba wa hong hai de yi ge ahi wan za xi nai ban dao ji sha te a lq bo xi bi bu zhi jian ta cajng ai yi lai zai zohng doh ju you zhong ya de zhan lve xing
Chinese: 埃克把瓦红海的一个海湾杂耍难搬到几沙特阿拉伯斯比布之间他操纵起来在造船具有重要的战略性




 61%|██████    | 33/54 [41:29<18:02, 51.57s/it]

Pinyin: ta ub dan bu cheng re fan sr zhi wu qi ci
Chinese: 他呕但不成热反酸之物其词




 63%|██████▎   | 34/54 [42:05<15:41, 47.07s/it]

Pinyin: ci sh bian zuan zhe bi xu jnig yu gei cj yu xia ding yi de ji qoa
Chinese: 词诗编纂者必须简语给词语下定义的句子




 65%|██████▍   | 35/54 [42:47<14:24, 45.50s/it]

Pinyin: xioa huo shan kou yi zhogn di bu ping tan dw zhi cheng yuan xing de hou shan bao fa hou xing heng de shehg man shhi de di wa di
Chinese: 峡谷山口一纵横地不平坦dw之成圆形的湖山坝发湖形恒的沙漠绵绵的地瓦地




 67%|██████▋   | 36/54 [43:09<11:29, 38.31s/it]

Pinyin: ni ying gai dui ado g di zou zou yi zeng guag jian shi
Chinese: 你应该对傲慢地走走以增高见识




 69%|██████▊   | 37/54 [43:31<09:29, 33.51s/it]

Pinyin: ub ding dai ing ci de yong yu eai ci
Chinese: 呕丁带阴涕的用于恶涕




 70%|███████   | 38/54 [44:07<09:05, 34.12s/it]

Pinyin: wo dui dian noa e wei xui bao yang hen zai hagn
Chinese: 我对电量恶劣保养很在乎




 72%|███████▏  | 39/54 [44:42<08:38, 34.55s/it]

Pinyin: ta w xing cha mei you yi shi dao cu zia re wei xian
Chinese: 他无行查没有意识到粗刺热危险




 74%|███████▍  | 40/54 [45:11<07:41, 32.94s/it]

Pinyin: shi you shu chu uo jia u hi dogn jie shi you hu chu jia ge kan zhang
Chinese: 石有树出鱼夹鱼鳍洞结石有湖出鸡盖看长




 76%|███████▌  | 41/54 [45:58<07:59, 36.90s/it]

Pinyin: ri ben id ue shi g nei li de uo ji dn zui jin r ben de da du shi yi bei wu ran
Chinese: 原子电流是核内的铀基氮最近原子的大小是原子无原




 78%|███████▊  | 42/54 [46:29<07:02, 35.24s/it]

Pinyin: you dng fa chu rou he de liag guahg
Chinese: 有毒发出热烈的雷吼




 80%|███████▉  | 43/54 [48:40<11:43, 63.94s/it]

Pinyin: zhe gu zhang shh yue lai wie yin qi zhu yi
Chinese: 这幅长耸约来惟引起注意




 81%|████████▏ | 44/54 [53:34<22:09, 132.90s/it]

Pinyin: hsi jia yi jian hui sheng cang cu chh li diao bu zu wei qu
Chinese: 黑家一间会盛藏粗茶里掉不止为去




 83%|████████▎ | 45/54 [58:23<26:59, 179.93s/it]

Pinyin: wo zai qiang shsng dinv le yi ge tu ing ran hou aai shagn mian hua le yi zhang xiao tu pian
Chinese: 我在墙上钉了一个图印然后凹三角面画了一张小图片




 85%|████████▌ | 46/54 [58:54<18:01, 135.13s/it]

Pinyin: ta zbang de jiao ruo mei li
Chinese: 他颤抖得叫嚷没礼




 87%|████████▋ | 47/54 [59:18<11:53, 101.89s/it]

Pinyin: at ba ri ji suo azi shang ceng chou ti li
Chinese: 躲把日记所遗上层抽体里




 89%|████████▉ | 48/54 [59:44<07:54, 79.09s/it] 

Pinyin: so men jint chnog feb tao lun jue ding qian wang niu jin
Chinese: 人们紧绷脑袋讨论决定迁往纽约




 91%|█████████ | 49/54 [1:00:07<05:11, 62.23s/it]

Pinyin: iqng wu zai dheng qi de shi hou qu baj ru ci zhong yao de shi qing
Chinese: 烟雾在等起的时候去摆如此重要的事情




 93%|█████████▎| 50/54 [1:00:28<03:19, 49.82s/it]

Pinyin: gong zhu bei na ge ke wu de mo shu shi tou tuo dai dao l yi ge huang a shsmg
Chinese: 公爵被那个动物的模述时头托带到L一个黄阿斯玛




 94%|█████████▍| 51/54 [1:00:54<02:07, 42.64s/it]

Pinyin: wo mne dou xi gan yu suo shou de jiao yang
Chinese: 我满都喜感与所受的教养




 96%|█████████▋| 52/54 [1:01:11<01:10, 35.04s/it]

Pinyin: ta de qian dun yu kuai de aymg zi dou hsi zhuang hcu lai de
Chinese: 他的千吨与快的鞍子都是合金合金来的




 98%|█████████▊| 53/54 [1:01:30<00:30, 30.33s/it]

Pinyin: fnag she xing cai liao zhu un ai fang u she de te shu rong qi ni
Chinese: 帆纱性材料著云爱防雨纱的特点容启你




100%|██████████| 54/54 [1:01:55<00:00, 68.80s/it]

Pinyin: zi ong mou ji su ux xiao chuan chu you ren xi eu zhi ohu gao nian ji you ji ge nan sheng yi ebi kai chu
Chinese: 自ONG某急救学校传出有人吸毒致昏老人及有几个男孩已被开除







In [39]:
def space_chars(text):
    return " ".join(list(text.strip()))

char_correct = 0
char_total = 0
for pred, label in zip(preds, labels):
    char_total += len(label)
    char_correct += sum(p == l for p, l in zip(pred, label))
char_accuracy = char_correct / char_total if char_total > 0 else 0.0

spaced_preds = [space_chars(p) for p in preds]
spaced_labels = [space_chars(l) for l in labels]

bleu_result = bleu.compute(predictions=spaced_preds, references=[[l] for l in spaced_labels])
chrf_result = chrf.compute(predictions=spaced_preds, references=[[l] for l in spaced_labels])
rouge_result = rouge.compute(predictions=spaced_preds, references=spaced_labels, use_stemmer=False)

results = {
    "char_accuracy": char_accuracy,
    "bleu": bleu_result["bleu"],
    "chrf": chrf_result["score"],
    "rouge1": rouge_result["rouge1"],
    "rouge2": rouge_result["rouge2"],
    "rougeL": rouge_result["rougeL"],
    "predictions": preds,
    "references": labels
}

results

{'char_accuracy': 0.38729763387297633,
 'bleu': 0.28616623958361564,
 'chrf': 24.7365429284862,
 'rouge1': np.float64(0.018518518518518517),
 'rouge2': np.float64(0.018518518518518517),
 'rougeL': np.float64(0.018518518518518517),
 'predictions': ['他拒绝拿他的同意开案小',
  '如果遇机会我将去看那部电影',
  '从这个教父来慷慨捐献的女子面爱为小',
  '他送我的酒精品喝小而且比之前',
  '这本书是他以前写的黑板的枯燥',
  '我们真得出于困扰没人照古英而',
  '她是护士一学教可护',
  '呕呕一点儿的呕',
  '丑陋是一桩自然现象',
  '五十年代是大规模人群时期',
  '机器生产已中断带起了腐朽老鼠',
  '老子钻破箱间乌云在大量毛掸',
  '抠爬我的房间感觉很差劲地我拥一米路',
  '草上挤满了路柱',
  '财政不原则怂恿诽谤对恶性提议',
  '赞助者的通告已是为马克斯解决巴金问题的第一步',
  '我更加依依风筝中其实热回心',
  '一名废徒在街上挥洒时断引人呼救其与恶徒扼强击咽喉',
  '乡村的田野里微微葱凝固',
  '一平地时其均匀在一个平面黑砂或砾边布局',
  '斯的外祖父来了',
  '该条款难以入账',
  '先程力耗油百而把时力',
  '蒸汽汽车可以在亚硝酸泛微内掉头',
  '我们眼前的那辆汽车突然转入迂回的车路上了',
  '烟在空中掠过',
  '我靠浮木浮羊面墙过火',
  '腐烂的油烟船不腐烂或腐朽的天然气',
  '通过管道把热水从过路输送氧酸热气',
  '近五个小时他呕刺爱生安气细忘脑从他母亲那弄点钱',
  '飞机每小时900公里的速度雷霆',
  '埃克把瓦红海的一个海湾杂耍难搬到几沙特阿拉伯斯比布之间他操纵起来在造船具有重要的战略性',
  '他呕但不成热反酸之物其词',
  '词诗编纂者必须简语给词语下定义的句子',
  '峡谷山口一纵横地不平坦dw之成圆形的湖山坝发湖形恒的沙漠绵绵的地瓦地',
  '你应该对傲慢地走走以增高见识'