In [3]:
import datasets
from datasets import load_dataset
import pandas as pd
from simalign import SentenceAligner
from transformers import BertTokenizerFast
from hanziconv import HanziConv
import random
from tqdm import tqdm

In [8]:
ds = load_dataset('HKAllen/cantonese-chinese-parallel-corpus')
tokenizer = BertTokenizerFast.from_pretrained('bert-base-chinese')
aligner = SentenceAligner(model="bert-base-chinese", matching_methods="m", device="mps")
results = {
    'text': [],
    'label': []
}

2025-08-21 05:12:37,147 - simalign.simalign - INFO - Initialized the EmbeddingLoader with model: bert-base-chinese


In [9]:
for sentence in tqdm(ds['train'].select(range(10000))):
    yue_tokens = tokenizer.tokenize(sentence['yue'])
    cmn_traditional = HanziConv.toTraditional(sentence['zh'])
    if sentence['yue'] == cmn_traditional or 127 <= len(yue_tokens) <= 5 or 127 <= len(cmn_traditional) <= 5:
        continue
    cmn_tokens = tokenizer.tokenize(cmn_traditional)
    alignment = aligner.get_word_aligns(yue_tokens, cmn_tokens)['mwmf']

    corrupted_sentences = set()
    for prob in (0.15, 0.33, 0.5):
        corrupted_sentence_yue = [token for token in yue_tokens]
        for pair in alignment:
            idx_0 = pair[0]
            idx_1 = pair[1]
            if random.random() < 0.25 and yue_tokens[idx_0] != cmn_tokens[idx_1]:
                corrupted_sentence_yue[idx_0] = cmn_tokens[idx_1]
        corrupted_sentences.add(''.join(corrupted_sentence_yue).replace('##', ''))
    for prob in (0.15, 0.33, 0.5):
        corrupted_sentence_cmn = [token for token in cmn_tokens]
        for pair in alignment:
            idx_0 = pair[0]
            idx_1 = pair[1]
            if random.random() < 0.25 and yue_tokens[idx_0] != cmn_tokens[idx_1]:
                corrupted_sentence_cmn[idx_1] = yue_tokens[idx_0]
        corrupted_sentences.add(''.join(corrupted_sentence_cmn).replace('[UNK]', ''))
    for corrupted_sentence in corrupted_sentences:
        if corrupted_sentence == sentence['yue'] or corrupted_sentence == cmn_traditional:
            continue
        results['text'].append(corrupted_sentence)
        results['label'].append(2)
    results['text'].append(sentence['yue'])
    results['label'].append(0)
    results['text'].append(cmn_traditional)
    results['label'].append(1)

 80%|████████  | 8038/10000 [16:37<03:38,  8.97it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (716 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 10000/10000 [21:23<00:00,  7.79it/s]


In [11]:
df = pd.DataFrame(results)
# drop where "text" contains [UNK]
df = df[~df['text'].str.contains('[UNK]')]
# replace ## with empty string
df['text'] = df['text'].str.replace('##', '', regex=False)
# drop containing english letters
df = df[~df['text'].str.contains('[a-zA-Z]')]
df

Unnamed: 0,text,label
0,建生邨屬於租者置其屋計劃的屋邨，同一大廈內，有啲單位已經賣咗，其餘的是租住單位。,2
1,建生邨屬於租者置其屋計劃嘅屋邨，同一大廈內，有啲單位已經賣咗，其餘嘅是租住單位。,2
2,建生邨屬於租者置其屋計劃的屋邨，同一大廈內，有啲單位已經賣咗，其餘的係租住單位。,2
3,建生邨屬於租者置其屋計劃的屋邨，同一大廈內，有些單位已經賣咗，其餘的是租住單位。,2
4,建生邨屬於租者置其屋計劃嘅屋邨，同一大廈內，有些單位已經賣瞭，其餘嘅是租住單位。,2
...,...,...
59978,17世紀時，英國國教會開始在美國、澳大利洲、加拿大、新西蘭同南非等前英殖民地建立教會。,2
59979,17世紀時，英國國教會開始在美國、澳大利亞、加拿大、紐西蘭和南非等前英殖民地建立教會。,2
59980,17世紀嗰陣，英國國教會開始在美國、澳洲、加拿大、紐西蘭同南非等前英殖民地建立教會。,2
59981,17世紀嗰陣，英國國教會開始響美國、澳洲、加拿大、紐西蘭同南非等前英殖民地創立教會。,0


In [12]:
ds = datasets.Dataset.from_pandas(df)
# save to disk
ds.save_to_disk('data/acceptability-dataset-2')

Saving the dataset (1/1 shards): 100%|██████████| 47578/47578 [00:00<00:00, 1853237.33 examples/s]


In [13]:
# count number of 1s
print(f"Number of 1s: {len(ds.filter(lambda x: x['label'] == 1))}")

Filter: 100%|██████████| 47578/47578 [00:00<00:00, 321106.22 examples/s]

Number of 1s: 7768



