In [1]:
import re
from itertools import zip_longest

from plane import CJK

from dbpunctuator.data_process import clean_up_data_from_txt, generate_corpus
from dbpunctuator.utils import DEFAULT_CHINESE_NER_MAPPING, remove_brackets_text

In [7]:
# Custom special-cleaning functions.
# The Chinese training data contains English (half-width) punctuation, so it is normalized here.
def normalize_puncs(input):
    """Replace selected punctuation marks using a fixed substitution table.

    ASCII ``?``, ``!`` and ``,`` are mapped to their full-width forms, an
    ASCII ellipsis ``...`` is mapped to ``。``, and full-width parentheses
    are mapped to ASCII ``(`` / ``)``.

    Args:
        input (string): text to normalize

    Returns:
        string: ``input`` with every table key replaced by its mapped value
    """
    replacements = {
        "?": "？",
        "!": "！",
        "（": "(",
        "）": ")",
        "...": "。",
        ",": "，",
    }
    # A single alternation over the escaped keys lets one pass handle them all.
    alternation = "|".join(re.escape(key) for key in replacements)
    pattern = re.compile("(" + alternation + ")")

    def _swap(match):
        # The matched text is always one of the table keys.
        return replacements[match.group(0)]

    return pattern.sub(_swap, input)


def remove_title(input):
    """Blank out lines that look like titles.

    A non-blank line whose last non-whitespace character is not one of
    "。", "？" or "！" is treated as a title and replaced by an empty
    string. Blank lines and lines ending in one of those marks are
    returned unchanged.
    """
    stripped = input.strip()
    if not stripped:
        # Nothing but whitespace: leave the line as it is.
        return input
    if stripped.endswith(("。", "？", "！")):
        return input
    return ""


def revert_ascii_chars_whitespace(input):
    """Strip spaces from the text that precedes each CJK span.

    For every span matched by the ``CJK`` pattern, the text between the
    previous span and this one has all of its spaces removed and is emitted
    with one leading space, followed by one space and the span itself.
    Whatever follows the last span is appended unchanged (its spaces are
    kept). Input without any CJK span is returned as is.

    Args:
        input (string): input to be processed

    Returns:
        string: the rebuilt text
    """
    cjk_regex = re.compile("(?P<%s>%s)" % (CJK.name, CJK.pattern), CJK.flag)
    pieces = []
    cursor = 0
    for match in cjk_regex.finditer(input):
        gap = input[cursor : match.start()].replace(" ", "")
        pieces.append(" " + gap)
        pieces.append(" " + match.group(0))
        cursor = match.end()
    # Tail after the final CJK span is copied through untouched.
    pieces.append(input[cursor:])
    return "".join(pieces)


def merge_data(whole_data_path, *tokens_data_paths):
    """Interleave the lines of several text files into one output file.

    Lines are written round-robin: the first line of every input (in the
    order the paths are given), then the second line of every input, and so
    on. Inputs that run out of lines are simply skipped for the remaining
    rounds.

    Every written line is newline-terminated. An input whose last line has
    no trailing newline would otherwise be glued to the next input's line.

    Args:
        whole_data_path: path of the merged file; it is overwritten
        *tokens_data_paths: paths of the files to merge
    """
    # Read all inputs before opening the output, so the output is not
    # truncated first when its path is also one of the inputs (or when an
    # input turns out to be unreadable).
    all_lines = []
    for cleaned_data_path in tokens_data_paths:
        with open(cleaned_data_path, "r") as data_file:
            all_lines.append(data_file.readlines())

    with open(whole_data_path, "w") as whole_data_file:
        for lines in zip_longest(*all_lines):
            for line in lines:
                if not line:
                    # None is the zip_longest filler for exhausted inputs.
                    continue
                if not line.endswith("\n"):
                    line += "\n"
                whole_data_file.write(line)

In [9]:
def add_whitespace_zh_chars(input):
    """Put spaces around every character of each CJK span.

    Each span matched by the ``CJK`` pattern is rewritten as its characters
    joined by single spaces (spaces inside the span are dropped), with one
    extra space before and after the span. Text outside the matched spans is
    copied through unchanged.

    Args:
        input (string): input to be processed

    Returns:
        string: the text with the characters of CJK spans separated by spaces
    """
    regex = re.compile("(?P<%s>%s)" % (CJK.name, CJK.pattern), CJK.flag)
    # Collect the pieces and join once instead of growing a string in the loop.
    pieces = []
    start = 0
    for t in regex.finditer(input):
        pieces.append(input[start : t.start()])
        spaced = " ".join(char for char in t.group(0) if char != " ")
        pieces.append(" " + spaced + " ")
        start = t.end()
    pieces.append(input[start:])
    return "".join(pieces)

In [3]:
# Sample transcript line used to eyeball what add_whitespace_zh_chars produces.
# It is bound to `sample_line` rather than `input` so the built-in input()
# stays usable for the rest of the kernel session.
sample_line = "第二环节， 角色扮演。 你好， 我刚到新加坡， 我是马来西亚的交换的学生， 我来南大交流学习。 对新加坡还不是很熟悉， 请问， ah 你可以推荐我几个景点吗。\n"

print(add_whitespace_zh_chars(sample_line))

 第 二 环 节 ，  角 色 扮 演 。  你 好 ，  我 刚 到 新 加 坡 ，  我 是 马 来 西 亚 的 交 换 的 学 生 ，  我 来 南 大 交 流 学 习 。  对 新 加 坡 还 不 是 很 熟 悉 ，  请 问 ， ah  你 可 以 推 荐 我 几 个 景 点 吗 。



In [27]:
# Load the test split as a list of lines (trailing newlines are kept).
# The corpus is Chinese text, so decode it explicitly as UTF-8 instead of
# relying on the platform's default encoding.
with open("/root/autodl-tmp/datasets/mml-zh/cleaned_test.txt", "r", encoding="utf-8") as file:
    source_data = file.readlines()

In [28]:
# Preview the first five loaded lines as a quick sanity check.
source_data[:5]

['第二环节， 角色扮演。 你好， 我刚到新加坡， 我是马来西亚的交换的学生， 我来南大交流学习。 对新加坡还不是很熟悉， 请问， ah 你可以推荐我几个景点吗。\n',
 '我有听过 eh， 是不是很像那个 london eye 一样。\n',
 '哦。\n',
 '那你是比较推荐我白天去还是晚上去呢。\n',
 'orh 就是夜景比较美啦。 哦那我会看到哪里就是什么样的景象呢。 但是因为你知道新加坡都是高楼大厦嘛， 对不对。 那如果我在上面的我是会会鸟览整个新加坡吗。 还是，\n']

In [29]:
# Extra cleaning callables handed to clean_up_data_from_txt through
# `special_cleaning_funcs`, listed in the same order as before.
test_cleaning_funcs = [remove_brackets_text, add_whitespace_zh_chars]

# Clean the loaded test lines; the second argument is the same path that the
# generate_corpus cell for the test split reads from.
clean_up_data_from_txt(
    source_data,
    "/root/autodl-tmp/datasets/mml-zh/processed/cleaned_test.txt",
    special_cleaning_funcs=test_cleaning_funcs,
    ner_mapping=DEFAULT_CHINESE_NER_MAPPING,
)

2023-10-15 23:23:43,306 - [32mINFO[0m - data_cleanning.py:73 - data_cleanning.text_lines_cleaning - 947 - clean up text file line by line.
2023-10-15 23:23:43,309 - [32mINFO[0m - data_cleanning.py:74 - data_cleanning.text_lines_cleaning - 947 - replace email with <EMAIL>
2023-10-15 23:23:43,311 - [32mINFO[0m - data_cleanning.py:75 - data_cleanning.text_lines_cleaning - 947 - replace url with <URL>
2023-10-15 23:23:43,312 - [32mINFO[0m - data_cleanning.py:76 - data_cleanning.text_lines_cleaning - 947 - replace currency with <CURRENCY>
2023-10-15 23:23:43,313 - [32mINFO[0m - data_cleanning.py:77 - data_cleanning.text_lines_cleaning - 947 - replace telephone with <TEL>
2023-10-15 23:23:43,314 - [32mINFO[0m - data_cleanning.py:78 - data_cleanning.text_lines_cleaning - 947 - replace number with <NUM>
100%|██████████| 11576/11576 [00:01<00:00, 7477.11it/s]


In [30]:
# Generate the token/tag file for the test split from its cleaned text.
# NOTE(review): the dev and train cells write their token/tag files under
# ".../token_tag_files/", while this one writes under ".../processed/" —
# confirm which location is intended.
test_cleaned_path = "/root/autodl-tmp/datasets/mml-zh/processed/cleaned_test.txt"
test_token_tag_path = "/root/autodl-tmp/datasets/mml-zh/processed/test_token_tag_data.txt"

generate_corpus(
    test_cleaned_path,
    test_token_tag_path,
    ner_mapping=DEFAULT_CHINESE_NER_MAPPING,
)

2023-10-15 23:24:01,057 - [32mINFO[0m - data_process.py:172 - data_process.generate_corpus - 947 - generate training data
100%|██████████| 11576/11576 [00:00<00:00, 82262.02it/s]


In [31]:
# Dev split: load the lines, clean them, then generate the token/tag file.
# The text is decoded explicitly as UTF-8 rather than with the platform
# default, and it is kept under a split-specific name so the `source_data`
# loaded for another split is not silently overwritten.
with open("/root/autodl-tmp/datasets/mml-zh/cleaned_dev.txt", "r", encoding="utf-8") as file:
    dev_source_data = file.readlines()

clean_up_data_from_txt(
    dev_source_data,
    "/root/autodl-tmp/datasets/mml-zh/processed/cleaned_dev.txt",
    ner_mapping=DEFAULT_CHINESE_NER_MAPPING,
    special_cleaning_funcs=[
        remove_brackets_text,
        add_whitespace_zh_chars,
    ],
)

generate_corpus(
    "/root/autodl-tmp/datasets/mml-zh/processed/cleaned_dev.txt",
    "/root/autodl-tmp/datasets/mml-zh/token_tag_files/dev_token_tag_data.txt",
    ner_mapping=DEFAULT_CHINESE_NER_MAPPING,
)

2023-10-15 23:31:29,211 - [32mINFO[0m - data_cleanning.py:73 - data_cleanning.text_lines_cleaning - 947 - clean up text file line by line.
2023-10-15 23:31:29,212 - [32mINFO[0m - data_cleanning.py:74 - data_cleanning.text_lines_cleaning - 947 - replace email with <EMAIL>
2023-10-15 23:31:29,213 - [32mINFO[0m - data_cleanning.py:75 - data_cleanning.text_lines_cleaning - 947 - replace url with <URL>
2023-10-15 23:31:29,214 - [32mINFO[0m - data_cleanning.py:76 - data_cleanning.text_lines_cleaning - 947 - replace currency with <CURRENCY>
2023-10-15 23:31:29,216 - [32mINFO[0m - data_cleanning.py:77 - data_cleanning.text_lines_cleaning - 947 - replace telephone with <TEL>
2023-10-15 23:31:29,217 - [32mINFO[0m - data_cleanning.py:78 - data_cleanning.text_lines_cleaning - 947 - replace number with <NUM>
100%|██████████| 11938/11938 [00:01<00:00, 7465.89it/s]
2023-10-15 23:31:34,879 - [32mINFO[0m - data_process.py:172 - data_process.generate_corpus - 947 - generate training data
1

In [32]:
# Train split: load the lines, clean them, then generate the token/tag file.
# The text is decoded explicitly as UTF-8 rather than with the platform
# default, and it is kept under a split-specific name so the `source_data`
# loaded for another split is not silently overwritten.
with open("/root/autodl-tmp/datasets/mml-zh/cleaned_train.txt", "r", encoding="utf-8") as file:
    train_source_data = file.readlines()

clean_up_data_from_txt(
    train_source_data,
    "/root/autodl-tmp/datasets/mml-zh/processed/cleaned_train.txt",
    ner_mapping=DEFAULT_CHINESE_NER_MAPPING,
    special_cleaning_funcs=[
        remove_brackets_text,
        add_whitespace_zh_chars,
    ],
)

generate_corpus(
    "/root/autodl-tmp/datasets/mml-zh/processed/cleaned_train.txt",
    "/root/autodl-tmp/datasets/mml-zh/token_tag_files/train_token_tag_data.txt",
    ner_mapping=DEFAULT_CHINESE_NER_MAPPING,
)

2023-10-15 23:31:58,222 - [32mINFO[0m - data_cleanning.py:73 - data_cleanning.text_lines_cleaning - 947 - clean up text file line by line.
2023-10-15 23:31:58,224 - [32mINFO[0m - data_cleanning.py:74 - data_cleanning.text_lines_cleaning - 947 - replace email with <EMAIL>
2023-10-15 23:31:58,225 - [32mINFO[0m - data_cleanning.py:75 - data_cleanning.text_lines_cleaning - 947 - replace url with <URL>
2023-10-15 23:31:58,226 - [32mINFO[0m - data_cleanning.py:76 - data_cleanning.text_lines_cleaning - 947 - replace currency with <CURRENCY>
2023-10-15 23:31:58,227 - [32mINFO[0m - data_cleanning.py:77 - data_cleanning.text_lines_cleaning - 947 - replace telephone with <TEL>
2023-10-15 23:31:58,228 - [32mINFO[0m - data_cleanning.py:78 - data_cleanning.text_lines_cleaning - 947 - replace number with <NUM>
100%|██████████| 96176/96176 [00:12<00:00, 7614.09it/s]
2023-10-15 23:32:43,726 - [32mINFO[0m - data_process.py:172 - data_process.generate_corpus - 947 - generate training data
1

## Generate Token-Tag Files for LLM Results

### Llama 2 (13B) Results

In [28]:
# Load the Llama 2 13B result lines (trailing newlines are kept).
# The text is Chinese, so decode it explicitly as UTF-8 instead of relying on
# the platform's default encoding.
with open("/root/autodl-tmp/datasets/mml-zh/llm_results/llama2_13b/train_results.txt", "r", encoding="utf-8") as file:
    source_data = file.readlines()

In [29]:
# Preview the first two loaded result lines as a quick sanity check.
source_data[:2]

['好，第三环节，扮演，好，你好，我刚来到新加坡，我是来自马来西亚的交换生啊，所以我对新加坡就不是很熟悉啦，所以我们目前是在NTU，我想问一下要怎么如何从NTU去榜鹅呢，因为榜鹅有那个海鲜嘛，对对对，对对对，那可以跟我讲是在啊，对对对，一个星期后，所以在新加坡待一段时间。\n',
 '啦，oh 没有啦，因为一个星期嘛就要问你很多问题，uh 是住在 hotel ，也是在市中心啊，bugis，but 你可以跟我讲一下那个，er 海鲜吃海鲜地方在哪里吗，okay 哦，可以跟我讲一下那边也有怎么样的美食呢，uh 你就由你来介绍吧，okay，oh，jumbo 啊嗯，okay，那除了还有不是它还有好几间分店吗哦，okay，okay，嗯嗯嗯，okay，那我想问一下那还有其他的分\n']

In [30]:
# Extra cleaning callables for the LLM output: punctuation normalization is
# used here in place of the bracket removal applied to the dataset splits.
llama2_cleaning_funcs = [normalize_puncs, add_whitespace_zh_chars]

clean_up_data_from_txt(
    source_data,
    "/root/autodl-tmp/datasets/mml-zh/llm_results/llama2_13b/train_results_cleaned.txt",
    special_cleaning_funcs=llama2_cleaning_funcs,
    ner_mapping=DEFAULT_CHINESE_NER_MAPPING,
)

2023-10-25 23:14:10,423 - [32mINFO[0m - data_cleanning.py:73 - data_cleanning.text_lines_cleaning - 1720 - clean up text file line by line.
2023-10-25 23:14:10,425 - [32mINFO[0m - data_cleanning.py:74 - data_cleanning.text_lines_cleaning - 1720 - replace email with <EMAIL>
2023-10-25 23:14:10,426 - [32mINFO[0m - data_cleanning.py:75 - data_cleanning.text_lines_cleaning - 1720 - replace url with <URL>
2023-10-25 23:14:10,428 - [32mINFO[0m - data_cleanning.py:76 - data_cleanning.text_lines_cleaning - 1720 - replace currency with <CURRENCY>
2023-10-25 23:14:10,430 - [32mINFO[0m - data_cleanning.py:77 - data_cleanning.text_lines_cleaning - 1720 - replace telephone with <TEL>
2023-10-25 23:14:10,430 - [32mINFO[0m - data_cleanning.py:78 - data_cleanning.text_lines_cleaning - 1720 - replace number with <NUM>
100%|██████████| 10109/10109 [00:03<00:00, 3361.31it/s]


In [27]:
# Scratch check: "nt" starts with a run of two or more ASCII letters, so the
# compiled pattern should match it.
regex = re.compile("[a-zA-Z]{2,}")
is_match = regex.match("nt") is not None
print(is_match)

True


In [14]:
token