In [None]:
import os
import csv

SIG = "data/train/signatures"

# Build a two-column TSV (image path, label) with one row for every
# "<name>.txt" transcription under SIG that has a sibling "<name>.jpg" image.
with open("signatures.tsv", "w", encoding="utf-8", newline="") as tsvfile:
    writer = csv.writer(tsvfile, delimiter='\t', lineterminator='\n')

    # os.listdir returns entries in arbitrary order; sort so that re-running
    # the cell produces the same file.
    for subdir in sorted(os.listdir(SIG)):
        subdir_path = os.path.join(SIG, subdir)
        # Stray files directly under SIG would make os.listdir raise
        # NotADirectoryError, so only descend into real directories.
        if not os.path.isdir(subdir_path):
            continue

        for file in sorted(os.listdir(subdir_path)):
            if not file.endswith(".txt"):
                continue

            filename = file[:-4]  # drop the ".txt" suffix
            if not os.path.exists(os.path.join(subdir_path, f"{filename}.jpg")):
                continue

            # The image exists, so read its transcription.
            with open(os.path.join(subdir_path, file), "r", encoding="utf-8") as txtfile:
                # Strip the trailing line break: a label containing "\n" would
                # be written as a quoted multi-line field and break the
                # one-row-per-line TSV layout.
                label = txtfile.read().rstrip("\r\n")

            # SIG[5:] removes the first five characters of SIG ("data/" for
            # the value above), so the stored path is relative to that folder.
            writer.writerow([f"{SIG[5:]}/{subdir}/{filename}.jpg", label])

In [3]:
def is_chinese(uchar):
    """Return True when ``uchar`` lies in the range U+4E00..U+9FA5 (Chinese characters)."""
    return u'\u4e00' <= uchar <= u'\u9fa5'


def is_number(uchar):
    """Return True when ``uchar`` is a half-width digit (U+0030..U+0039)."""
    return u'\u0030' <= uchar <= u'\u0039'


def is_Qnumber(uchar):
    """Return True when ``uchar`` is a full-width digit (U+FF10..U+FF19)."""
    return u'\uff10' <= uchar <= u'\uff19'


def is_alphabet(uchar):
    """Return True when ``uchar`` is a half-width English letter (A-Z or a-z)."""
    is_upper = u'\u0041' <= uchar <= u'\u005a'
    is_lower = u'\u0061' <= uchar <= u'\u007a'
    return is_upper or is_lower


def is_Qalphabet(uchar):
    """Return True when ``uchar`` is a full-width English letter (U+FF21..U+FF3A or U+FF41..U+FF5A)."""
    is_upper = u'\uff21' <= uchar <= u'\uff3a'
    is_lower = u'\uff41' <= uchar <= u'\uff5a'
    return is_upper or is_lower


def is_other(uchar):
    """Return True when ``uchar`` is none of: Chinese character, half-width digit, half-width English letter."""
    recognised = is_chinese(uchar) or is_number(uchar) or is_alphabet(uchar)
    return not recognised


def B2Q(uchar):
    """Convert a single half-width character to its full-width form.

    Characters outside U+0020..U+007E are returned unchanged.
    """
    code = ord(uchar)
    if code == 0x0020:
        # The space is the one exception to the fixed offset: it maps to U+3000.
        return chr(0x3000)
    if 0x0020 < code <= 0x7e:
        # Every other half-width character sits 0xFEE0 below its full-width twin.
        return chr(code + 0xfee0)
    # Not a half-width character: hand it back untouched.
    return uchar


def Q2B(uchar):
    """Convert a single full-width character to its half-width form.

    If the converted code point does not land in U+0020..U+007E, the original
    character is returned unchanged.
    """
    code = ord(uchar)
    # U+3000 (ideographic space) maps to the ASCII space; everything else
    # uses the fixed 0xFEE0 offset.
    candidate = 0x0020 if code == 0x3000 else code - 0xfee0
    if 0x0020 <= candidate <= 0x7e:
        return chr(candidate)
    # The result is not a half-width character: keep the input as it was.
    return uchar


def stringQ2B(ustring):
    """Convert every full-width character in ``ustring`` to half-width via ``Q2B``."""
    converted = map(Q2B, ustring)
    return "".join(converted)


def stringpartQ2B(ustring):
    """Convert full-width digits and letters in ``ustring`` to half-width.

    A handful of punctuation marks are rewritten first; all other characters
    are left as they are.
    """
    # Fixed punctuation substitutions applied before the per-character pass.
    # Note that the full-width quotation mark is mapped to a left curly quote.
    punctuation_table = str.maketrans({
        "％": "%",
        "／": "/",
        "－": "-",
        "．": ".",
        "＂": "“",
    })
    normalized = ustring.translate(punctuation_table)

    pieces = []
    for ch in normalized:
        if is_Qnumber(ch) or is_Qalphabet(ch):
            pieces.append(Q2B(ch))
        else:
            pieces.append(ch)
    return "".join(pieces)


In [4]:
def punc_to_chinese(string):
    """Replace the half-width punctuation ``, ; : ! ? ) (`` in ``string`` with the full-width forms."""
    half_to_full = str.maketrans({
        ",": "，",
        ";": "；",
        ":": "：",
        "!": "！",
        "?": "？",
        ")": "）",
        "(": "（",
    })
    return string.translate(half_to_full)

def pubc_to_english(string):
    """Replace the full-width punctuation ``， ； ： ！ ？ ） （`` in ``string`` with the half-width forms.

    The function name is kept exactly as defined, since callers may refer to it.
    """
    full_to_half = (
        ("，", ","),
        ("；", ";"),
        ("：", ":"),
        ("！", "!"),
        ("？", "?"),
        ("）", ")"),
        ("（", "("),
    )
    result = string
    for full_width, half_width in full_to_half:
        result = result.replace(full_width, half_width)
    return result

In [5]:
import os

# Default cleaning options; every discovered label file starts from these.
opts = {"punc": False, "q2b": False}
file_opts = {}

# Collect every .tsv file under labels-src and print a per-file options
# mapping (keyed by bare file name).
for root, dirs, files in os.walk("labels-src"):
    for file in files:
        if not file.endswith(".tsv"):
            continue

        # Give each file its own copy of the defaults. Storing `opts` itself
        # would make every entry the same dict object, so editing one file's
        # options would silently change all of them.
        file_opts[file] = dict(opts)

print(file_opts)


{'digit_95k.tsv': {'punc': False, 'q2b': False}, 'hand_line_all_548k.tsv': {'punc': False, 'q2b': False}, 'tang_syn_1577k.tsv': {'punc': False, 'q2b': False}, 'web_line_238k.tsv': {'punc': False, 'q2b': False}, 'hw_chinese_240k.tsv': {'punc': False, 'q2b': False}, 'hwdb_ic13_47k.tsv': {'punc': False, 'q2b': False}, 'hwdb2.1_13k.tsv': {'punc': False, 'q2b': False}, 'hwdb2.2_12k.tsv': {'punc': False, 'q2b': False}, 'hwdb2.0_16k.tsv': {'punc': False, 'q2b': False}, 'signatures_472k.tsv': {'punc': False, 'q2b': False}, 'hwdb2.0_4k.tsv': {'punc': False, 'q2b': False}, 'hwdb_ic13_3k.tsv': {'punc': False, 'q2b': False}, 'hw_chinese_17k.tsv': {'punc': False, 'q2b': False}, 'hwdb2.2_3k.tsv': {'punc': False, 'q2b': False}, 'hwdb2.1_3k.tsv': {'punc': False, 'q2b': False}, 'hwdb_ic13_val_5k.tsv': {'punc': False, 'q2b': False}}


In [6]:
import os
    
# Per-file cleaning options, keyed by the label file's path.
#   punc - when True, the cleaning loop applies punc_to_chinese to the file
#   q2b  - when True, the cleaning loop applies stringpartQ2B to the file
file_opts = {
    path: {'punc': punc, 'q2b': q2b}
    for path, punc, q2b in [
        # (path,                                    punc,  q2b)
        ('labels-src/train/digit_95k.tsv',          False, False),
        ('labels-src/train/hand_line_all_548k.tsv', True,  False),
        ('labels-src/train/tang_syn_1577k.tsv',     False, False),
        ('labels-src/train/web_line_238k.tsv',      False, False),
        ('labels-src/train/hw_chinese_240k.tsv',    True,  False),
        ('labels-src/train/hwdb_ic13_47k.tsv',      True,  False),
        ('labels-src/train/hwdb2.1_13k.tsv',        True,  False),
        ('labels-src/train/hwdb2.2_12k.tsv',        True,  False),
        ('labels-src/train/hwdb2.0_16k.tsv',        True,  False),
        ('labels-src/test/hwdb2.0_4k.tsv',          True,  False),
        ('labels-src/test/hwdb_ic13_3k.tsv',        True,  True),
        ('labels-src/test/hw_chinese_17k.tsv',      False, False),
        ('labels-src/test/hwdb2.2_3k.tsv',          True,  False),
        ('labels-src/test/hwdb2.1_3k.tsv',          True,  False),
        ('labels-src/test/hwdb_ic13_val_5k.tsv',    True,  False),
        ('labels-src/train/signatures_472k.tsv',    False, False),
    ]
}

OUTPUT_DIR = "clean_labels"

# Copy every .tsv under labels-src into OUTPUT_DIR (mirroring the directory
# layout), applying the per-file transformations selected in file_opts.
for root, dirs, files in os.walk("labels-src"):
    for file in files:
        if not file.endswith(".tsv"):
            continue

        # file_opts keys are written with "/" separators, while os.walk builds
        # `root` with os.sep; normalise so the lookup also works on Windows.
        opts_key = f"{root}/{file}".replace(os.sep, "/")
        if opts_key not in file_opts:
            # Fail before creating any output, and say which file is missing.
            raise KeyError(f"no cleaning options configured for {opts_key!r}")
        opts = file_opts[opts_key]

        output_dir = os.path.join(OUTPUT_DIR, root)
        # exist_ok avoids the separate existence check and makes re-runs safe.
        os.makedirs(output_dir, exist_ok=True)

        with open(os.path.join(root, file), "r", encoding="utf-8") as src, \
                open(os.path.join(output_dir, file), "w", encoding="utf-8") as dst:
            text = src.read()
            # NOTE(review): the transformations run over the whole file text,
            # so any non-label column (e.g. an image path, if present) is
            # rewritten as well - confirm this is intended.
            if opts["q2b"]:
                text = stringpartQ2B(text)
            if opts["punc"]:
                text = punc_to_chinese(text)

            dst.write(text)