In [2]:
import pypinyin
import re
import pandas as pd
import random
from pypinyin import Style
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Dataset: https://huggingface.co/datasets/swaption2009/20k-en-zh-translation-pinyin-hsk

ds = load_dataset("swaption2009/20k-en-zh-translation-pinyin-hsk")
dataset = ds["train"]

contains_english = re.compile(r'[a-zA-Z]')

def clean_punctuations(p):
    """
    Remove common Chinese-style or western punctuation
    """
    return re.sub(r"[。.,，！？!?:：；;\"'‘’“”()（）《》【】＇｀……\-－／/、\[\]［］＂·—]", "", p)

def clean_spaces(text):
    """
    Remove all spaces from Chinese text
    """
    text = text.replace(" ", "").replace("\u00A0", "").replace("　", "")  # Remove regular and non-breaking spaces
    return text

def convert_fullwidth_to_normal(text):
    """
    Convert full-width digits (０１２３４５６７８９) to normal digits (0123456789).
    """
    return "".join(chr(ord(char) - 0xFEE0) if '０' <= char <= '９' else char for char in text)

def chinese_to_pinyin(text):
    return " ".join(pypinyin.lazy_pinyin(text, style=Style.NORMAL))

def apply_typo(pinyin):
    """
    Apply a typo into the Pinyin string
    One typo per 3 pinyin characters
    """
    num_typos = max(1, len(pinyin.split(" ")) // 3)
    for _ in range(num_typos):
        typo_type = random.choice(["neighboring_key", "missing_character", "misordering"])
        if typo_type == "neighboring_key":
            typo_index = random.randint(0, len(pinyin) - 1)
            neighboring_keys = {
                'a': 'qwsz', 'b': 'vghn', 'c': 'xdfv', 'd': 'erfcxs', 'e': 'wsdr', 'f': 'rtgvcd',
                'g': 'tyhbvf', 'h': 'yujnbg', 'i': 'ujko', 'j': 'uikmnh', 'k': 'iolmj',
                'l': 'opk', 'm': 'njk', 'n': 'bhjm', 'o': 'iklp', 'p': 'ol',
                'q': 'wa', 'r': 'edft', 's': 'wedxza', 't': 'rfgy', 'u': 'yhji',
                'v': 'cfgb', 'w': 'qase', 'x': 'zsdc', 'y': 'tghu', 'z': 'asx'
            }
            if pinyin[typo_index] in neighboring_keys:
                typo_char = random.choice(neighboring_keys[pinyin[typo_index]])
                pinyin = pinyin[:typo_index] + typo_char + pinyin[typo_index + 1:]
        elif typo_type == "missing_character":
            if len(pinyin) > 2:
                char_to_delete = " "
                while char_to_delete == " ":
                    delete_index = random.randint(0, len(pinyin) - 1)
                    char_to_delete = pinyin[delete_index]
                pinyin = pinyin[:delete_index] + pinyin[delete_index + 1:]
        elif typo_type == "misordering":
            if len(pinyin) > 1:
                typo1, typo2 = " ", " "
                while typo1 == " " or typo2 == " ":
                    typo_index = random.randint(0, len(pinyin) - 2)
                    typo1 = pinyin[typo_index]
                    typo2 = pinyin[typo_index + 1]
                pinyin = (pinyin[:typo_index] + pinyin[typo_index + 1] +
                        pinyin[typo_index] + pinyin[typo_index + 2:])
    return pinyin

formatted_dataset, formatted_dataset_eval = [], []

typo_count = 0
for i in range(2, dataset.num_rows, 5):
    chinese = dataset[i]["text"][10:]
    pinyin = dataset[i + 1]["text"][8:]

    chinese = clean_punctuations(chinese)
    chinese = clean_spaces(chinese)
    chinese = convert_fullwidth_to_normal(chinese)

    pinyin = chinese_to_pinyin(chinese)

    # # 50% chance to apply a typo
    # if random.random() < 0.5:
    #     pinyin = apply_typo(pinyin)
    #     typo_count += 1

    pinyin = apply_typo(pinyin)

    if ((i+3) % 2000) == 0:
        if len(chinese) < 60 and not contains_english.search(chinese) and pinyin not in [entry["Pinyin"] for entry in formatted_dataset_eval]:
            formatted_dataset_eval.append({
                "Pinyin": pinyin,
                "Chinese": chinese
            })
    else:
        if len(chinese) < 60 and not contains_english.search(chinese) and chinese not in [entry["Chinese"] for entry in formatted_dataset]:
            formatted_dataset.append({
                "Chinese": chinese,
                "Pinyin": pinyin
            })

print(f"Typo count: {typo_count}")
df = pd.DataFrame(formatted_dataset)
df_eval = pd.DataFrame(formatted_dataset_eval)

df.to_csv("../data/inputs/typo/train.csv", index=False, encoding="utf-8")
df_eval.to_csv("../data/inputs/typo/eval.csv", index=False, encoding="utf-8")

Typo count: 0


### Check data length and verify data alignment

In [10]:
max_length = max(len(sen) for sen in df["Pinyin"])
print(f"The length of the longest Pinyin sentence is: {max_length}")
max_length = max(len(sen) for sen in df["Chinese"])
print(f"The length of the longest Chinese sentence is: {max_length}")

max_length = max(len(sen) for sen in df_eval["Pinyin"])
print(f"The length of the longest Pinyin sentence is: {max_length}")

for i in range(len(df)):
    chinese_text = df["Chinese"][i]
    pinyin_text = df["Pinyin"][i]
    
    numbers_in_chinese = re.findall(r"\d+", chinese_text)
    symbols_in_chinese = re.findall(r"[%]", chinese_text)
    
    # Calculate the total length of numbers
    total_number_length = sum(len(num) for num in numbers_in_chinese)
    total_symbol_count = len(symbols_in_chinese)
    
    adjusted_chinese_length = len(chinese_text) - total_number_length - total_symbol_count + len(numbers_in_chinese)
    pinyin_word_count = len(pinyin_text.split(" "))
    
    if adjusted_chinese_length != pinyin_word_count:
        print(f"Line {i} has a mismatch in length")
        print(f"Chinese length (adjusted): {adjusted_chinese_length}, Pinyin word count: {pinyin_word_count}")
        print(f"Chinese: {chinese_text}")
        print(f"Pinyin: {pinyin_text}")
        print("\n\n")

The length of the longest Pinyin sentence is: 243
The length of the longest Chinese sentence is: 59
The length of the longest Pinyin sentence is: 161
Line 7512 has a mismatch in length
Chinese length (adjusted): 40, Pinyin word count: 39
Chinese: 贴在窗上的海报写着所有的电冰箱洗衣机和吸尘哭降价5％许多其它家用电器也赔血本出售
Pinyin: tie zai chuang shang de hai bao xie zhe suo you de dian bing xiang xi yi ji he xi chen ku jiang jia 5％ xu duo qi ta jia yong dian qi ye pei xue ben chu shou



Line 11983 has a mismatch in length
Chinese length (adjusted): 20, Pinyin word count: 19
Chinese: 水结成冰的温度是32华氏度32或零摄氏度0℃
Pinyin: shui jie cheng bing de wen du shi 32 hua shi du 32 huo ping she shi du 0℃



Line 12084 has a mismatch in length
Chinese length (adjusted): 30, Pinyin word count: 29
Chinese: 军功章的获得者们被按级别高低召见──等级最高的官员排在第一位
Pinyin: jun gong zhang de huo de zhe men bei an ji bie gao di zhao jian ── deng ji zui gao de guan yuan pai zai di yi wei



Line 17423 has a mismatch in length
Chinese length (adjusted): 12, Pinyin word count