In [None]:
import os
from pathlib import Path
from typing import List
from PIL import Image, ImageDraw, ImageFont
import freetype

class FontGlyphExtractor:
    """Render every character mapped by a font file to its own PNG image.

    Images are written to ``<output_dir>/<font file stem>/U+XXXX/U+XXXX.png``.
    """

    # Point size handed to ImageFont.truetype, and side length (in pixels) of
    # the square greyscale canvas each glyph is drawn on.
    FONT_SIZE = 64
    IMAGE_SIZE = 80

    def __init__(self, font_path: str, output_dir: str):
        """Open the font with FreeType and collect its mapped character codes.

        :param font_path: path to the font file
        :param output_dir: root directory that receives the rendered images
        """
        self.font_path = font_path
        self.output_dir = output_dir
        self.font_name = Path(font_path).stem
        self.face = freetype.Face(font_path)
        self.glyphs = self._get_glyphs()
        # PIL font used for rendering; created on first use and then reused so
        # the font file is not re-opened for every glyph.
        self._font = None

    def _get_glyphs(self) -> List[int]:
        """Return the character codes the face's charmap yields.

        Walks ``get_first_char`` / ``get_next_char`` and stops as soon as the
        returned glyph index is 0.
        """
        char_codes = []
        char_code, glyph_index = self.face.get_first_char()
        while glyph_index != 0:
            char_codes.append(char_code)
            char_code, glyph_index = self.face.get_next_char(char_code, glyph_index)
        return char_codes

    def _load_font(self):
        """Return the cached PIL font, loading it on first use."""
        font = getattr(self, "_font", None)
        if font is None:
            font = ImageFont.truetype(self.font_path, size=self.FONT_SIZE)
            self._font = font
        return font

    @staticmethod
    def _text_extent(draw, char, font):
        """Return the ``(width, height)`` used to centre ``char`` on the canvas.

        ``ImageDraw.textsize`` no longer exists in Pillow 10, so ``textbbox`` is
        preferred when available. The right/bottom edges of the box anchored at
        (0, 0) are meant to reproduce the extents ``textsize`` reported, which
        keeps glyph placement unchanged. Older Pillow releases without
        ``textbbox`` fall back to ``textsize``.
        """
        if hasattr(draw, "textbbox"):
            _, _, right, bottom = draw.textbbox((0, 0), char, font=font)
            return right, bottom
        return draw.textsize(char, font=font)

    # The return annotation is quoted so it is not evaluated when the class is defined.
    def _render_glyph(self, char_code: int) -> "Image.Image":
        """Draw one character in black, centred on a white square image.

        :param char_code: code point of the character to render
        :return: a new 'L'-mode image of IMAGE_SIZE x IMAGE_SIZE pixels
        """
        char = chr(char_code)
        font = self._load_font()
        size = self.IMAGE_SIZE
        image = Image.new('L', (size, size), color=255)
        draw = ImageDraw.Draw(image)
        w, h = self._text_extent(draw, char, font)
        draw.text(((size - w) / 2, (size - h) / 2), char, font=font, fill=0)
        return image

    def extract_glyphs(self):
        """Render every collected character and save it as a PNG.

        Each glyph gets its own ``U+XXXX`` sub-directory below
        ``<output_dir>/<font_name>``; one progress line is printed per file.
        """
        output_path = Path(self.output_dir) / self.font_name
        output_path.mkdir(parents=True, exist_ok=True)
        for char_code in self.glyphs:
            glyph_name = f"U+{char_code:04X}"
            glyph_dir = output_path / glyph_name
            glyph_dir.mkdir(exist_ok=True)
            image = self._render_glyph(char_code)
            image_path = glyph_dir / f"{glyph_name}.png"
            image.save(image_path)
            print(f"Saved glyph {glyph_name} to {image_path}")

def main():
    """Render glyph images for each listed font file into ``output_glyphs``."""
    output_dir = 'output_glyphs'
    font_files = [
        '中华书局宋体字库/FZSONG_ZhongHuaSongPlane00_2021051420210514150927.TTF',
        '中华书局宋体字库/FZSONG_ZhongHuaSongPlane02_2021012120210122112919.TTF',
        '中华书局宋体字库/FZSONG_ZhongHuaSongPlane15_2021051420210514150947.TTF',
    ]
    # One extractor per font file; each writes below output_dir/<font stem>.
    for path in font_files:
        FontGlyphExtractor(path, output_dir).extract_glyphs()


if __name__ == "__main__":
    main()


In [None]:
import os
from pathlib import Path
from PIL import Image, ImageDraw, ImageFont
import freetype

class FontProcessor:
    """Render every character mapped by a font file to its own PNG image.

    Images are written to ``<output_dir>/<font file stem>/U+XXXX/U+XXXX.png``.
    """

    # Point size handed to ImageFont.truetype, and side length (in pixels) of
    # the square greyscale canvas each glyph is drawn on.
    FONT_SIZE = 64
    IMAGE_SIZE = 80

    def __init__(self, font_path: str, output_dir: str):
        """Open the font with FreeType.

        :param font_path: path to the font file
        :param output_dir: root directory that receives the rendered images
        """
        self.font_path = font_path
        self.output_dir = output_dir
        self.font_name = Path(font_path).stem
        self.face = freetype.Face(font_path)
        # PIL font used for rendering; created on first use and then reused so
        # the font file is not re-opened for every glyph.
        self._font = None

    def get_glyphs(self):
        """Return the character codes the face's charmap yields.

        Walks ``get_first_char`` / ``get_next_char`` and stops as soon as the
        returned glyph index is 0.
        """
        char_codes = []
        char_code, glyph_index = self.face.get_first_char()
        while glyph_index != 0:
            char_codes.append(char_code)
            char_code, glyph_index = self.face.get_next_char(char_code, glyph_index)
        return char_codes

    def _load_font(self):
        """Return the cached PIL font, loading it on first use."""
        font = getattr(self, "_font", None)
        if font is None:
            font = ImageFont.truetype(self.font_path, size=self.FONT_SIZE)
            self._font = font
        return font

    @staticmethod
    def _text_extent(draw, char, font):
        """Return the ``(width, height)`` used to centre ``char`` on the canvas.

        ``ImageDraw.textsize`` no longer exists in Pillow 10, so ``textbbox`` is
        preferred when available. The right/bottom edges of the box anchored at
        (0, 0) are meant to reproduce the extents ``textsize`` reported, which
        keeps glyph placement unchanged. Older Pillow releases without
        ``textbbox`` fall back to ``textsize``.
        """
        if hasattr(draw, "textbbox"):
            _, _, right, bottom = draw.textbbox((0, 0), char, font=font)
            return right, bottom
        return draw.textsize(char, font=font)

    # The annotation names the image class (the original pointed at the PIL
    # module) and is quoted so it is not evaluated when the class is defined.
    def render_glyph(self, char_code: int) -> "Image.Image":
        """Draw one character in black, centred on a white square image.

        :param char_code: code point of the character to render
        :return: a new 'L'-mode image of IMAGE_SIZE x IMAGE_SIZE pixels
        """
        char = chr(char_code)
        font = self._load_font()
        size = self.IMAGE_SIZE
        image = Image.new('L', (size, size), color=255)  # white background
        draw = ImageDraw.Draw(image)
        w, h = self._text_extent(draw, char, font)
        draw.text(((size - w) / 2, (size - h) / 2), char, font=font, fill=0)  # black text
        return image

    def save_glyph_images(self):
        """Render every mapped character and save it as a PNG.

        Each glyph gets its own ``U+XXXX`` sub-directory below
        ``<output_dir>/<font_name>``; one progress line is printed per file.
        """
        output_path = Path(self.output_dir) / self.font_name
        output_path.mkdir(parents=True, exist_ok=True)

        for char_code in self.get_glyphs():
            glyph_name = f"U+{char_code:04X}"
            glyph_dir = output_path / glyph_name
            glyph_dir.mkdir(parents=True, exist_ok=True)

            image = self.render_glyph(char_code)
            image_path = glyph_dir / f"{glyph_name}.png"
            image.save(image_path)
            print(f"Saved glyph {glyph_name} at {image_path}")

class FontDatasetBuilder:
    """Runs a FontProcessor over several font files that share one output directory."""

    def __init__(self, font_files: list, output_dir: str):
        """Remember which font files to process and where the images go."""
        self.output_dir = output_dir
        self.font_files = font_files

    def process_fonts(self):
        """Extract and save the glyph images of every configured font file, in order."""
        for path in self.font_files:
            FontProcessor(path, self.output_dir).save_glyph_images()

def main():
    """Build the glyph image dataset for the listed fonts under ``glyph_output``."""
    output_dir = 'glyph_output'
    font_files = [
        '中华书局宋体字库 2024/中华书局宋体02平面_20231010.TTF',
        '中华书局宋体字库 2024/中华书局宋体15平面_20240514.TTF',
        '中华书局宋体字库 2024/中华书局宋体16平面_20240906.TTF',
    ]

    FontDatasetBuilder(font_files, output_dir).process_fonts()


if __name__ == "__main__":
    main()

In [None]:
import os
from pathlib import Path
from typing import List
from PIL import Image, ImageDraw, ImageFont
import freetype

class FontGlyphExtractor:
    """Render every character mapped by a font file to its own PNG image.

    Images are written flat to ``<output_dir>/<font file stem>/U+XXXX.png``.
    """

    # Point size handed to ImageFont.truetype, and side length (in pixels) of
    # the square greyscale canvas each glyph is drawn on.
    FONT_SIZE = 62
    IMAGE_SIZE = 64

    def __init__(self, font_path: str, output_dir: str):
        """Open the font with FreeType and collect its mapped character codes.

        :param font_path: path to the font file
        :param output_dir: root directory that receives the rendered images
        """
        self.font_path = font_path
        self.output_dir = output_dir
        self.font_name = Path(font_path).stem
        self.face = freetype.Face(font_path)
        self.glyphs = self._get_glyphs()
        # PIL font used for rendering; created on first use and then reused so
        # the font file is not re-opened for every glyph.
        self._font = None

    def _get_glyphs(self) -> List[int]:
        """Return the character codes the face's charmap yields.

        Walks ``get_first_char`` / ``get_next_char`` and stops as soon as the
        returned glyph index is 0.
        """
        char_codes = []
        char_code, glyph_index = self.face.get_first_char()
        while glyph_index != 0:
            char_codes.append(char_code)
            char_code, glyph_index = self.face.get_next_char(char_code, glyph_index)
        return char_codes

    def _load_font(self):
        """Return the cached PIL font, loading it on first use."""
        font = getattr(self, "_font", None)
        if font is None:
            font = ImageFont.truetype(self.font_path, size=self.FONT_SIZE)
            self._font = font
        return font

    @staticmethod
    def _text_extent(draw, char, font):
        """Return the ``(width, height)`` used to centre ``char`` on the canvas.

        ``ImageDraw.textsize`` no longer exists in Pillow 10, so ``textbbox`` is
        preferred when available. The right/bottom edges of the box anchored at
        (0, 0) are meant to reproduce the extents ``textsize`` reported, which
        keeps glyph placement unchanged. Older Pillow releases without
        ``textbbox`` fall back to ``textsize``.
        """
        if hasattr(draw, "textbbox"):
            _, _, right, bottom = draw.textbbox((0, 0), char, font=font)
            return right, bottom
        return draw.textsize(char, font=font)

    # The return annotation is quoted so it is not evaluated when the class is defined.
    def _render_glyph(self, char_code: int) -> "Image.Image":
        """Draw one character in black, centred on a white square image.

        :param char_code: code point of the character to render
        :return: a new 'L'-mode image of IMAGE_SIZE x IMAGE_SIZE pixels
        """
        char = chr(char_code)
        font = self._load_font()
        size = self.IMAGE_SIZE
        image = Image.new('L', (size, size), color=255)
        draw = ImageDraw.Draw(image)
        w, h = self._text_extent(draw, char, font)
        draw.text(((size - w) / 2, (size - h) / 2), char, font=font, fill=0)
        return image

    def extract_glyphs(self):
        """Render every collected character and save it as ``U+XXXX.png``.

        All images go directly into ``<output_dir>/<font_name>``; one progress
        line is printed per file.
        """
        output_path = Path(self.output_dir) / self.font_name
        output_path.mkdir(parents=True, exist_ok=True)
        for char_code in self.glyphs:
            glyph_name = f"U+{char_code:04X}"
            image = self._render_glyph(char_code)
            image_path = output_path / f"{glyph_name}.png"
            image.save(image_path)
            print(f"Saved glyph {glyph_name} to {image_path}")

def main():
    """Render glyph images for each listed font file into ``output_glyphs``."""
    output_dir = 'output_glyphs'
    font_files = [
        '中华书局宋体字库 2024/中华书局宋体02平面_20231010.TTF',
        '中华书局宋体字库 2024/中华书局宋体15平面_20240514.TTF',
        '中华书局宋体字库 2024/中华书局宋体16平面_20240906.TTF',
    ]
    # One extractor per font file; each writes below output_dir/<font stem>.
    for path in font_files:
        FontGlyphExtractor(path, output_dir).extract_glyphs()


if __name__ == "__main__":
    main()


In [None]:
from fontTools.ttLib import TTFont
from tkinter import Tk, Label, Entry, Button

def char_code_to_text(char_code, font_path):
    """
    Convert a character code to text using a font file.

    If any Unicode cmap subtable of the font maps ``char_code``, the character
    ``chr(char_code)`` is returned. Otherwise ``char_code`` is compared against
    the glyph IDs of the ``glyf`` table and the matching glyph name is returned.

    :param char_code: character code (int)
    :param font_path: path to the font file (str)
    :return: the character, or a glyph name from the fallback, or '' when
             nothing matches or any error occurs (the error is printed)
    """
    font = None
    try:
        # Open the font file
        font = TTFont(font_path)
        # cmap maps character codes to glyph names; only Unicode subtables count
        for table in font['cmap'].tables:
            if table.isUnicode() and char_code in table.cmap:
                return chr(char_code)
        # Fallback: interpret char_code as a glyph ID and return that glyph's name.
        # NOTE(review): a code point and a glyph ID are different number spaces;
        # confirm that callers really want this fallback.
        glyf = font['glyf']
        for glyph_name in glyf.glyphs:
            if font.getGlyphID(glyph_name) == char_code:
                return glyph_name
        return ''
    except Exception as e:
        # Best-effort lookup: report the problem and signal "not found".
        print(f"Error: {e}")
        return ''
    finally:
        # Release the file handle held by the font object on every exit path.
        if font is not None:
            font.close()

def display_and_edit_result(char_code, font_path):
    """
    Open a small Tk window that shows the recognised text and lets the user edit it.

    The entry field is pre-filled with ``char_code_to_text(char_code, font_path)``.
    Pressing the save button prints the edited text and closes the window.
    The call blocks until the window is closed.

    :param char_code: character code (int)
    :param font_path: path to the font file (str)
    """
    initial_text = char_code_to_text(char_code, font_path)

    window = Tk()
    window.title("识别结果编辑")

    caption = Label(window, text="识别结果：")
    caption.grid(row=0, column=0)

    text_field = Entry(window)
    text_field.grid(row=0, column=1)
    text_field.insert(0, initial_text)

    def on_save():
        # Report whatever is currently in the entry, then tear the window down.
        edited = text_field.get()
        print(f"Edited Text: {edited}")
        window.destroy()

    save_button = Button(window, text="保存", command=on_save)
    save_button.grid(row=1, column=0, columnspan=2)

    window.mainloop()

# Example usage
if __name__ == "__main__":
    # Character code to look up.
    # NOTE(review): the earlier comment here described 0x4E00 ('一'), but the
    # value actually assigned is 0x2FB6 — confirm which one is intended.
    char_code = 0x2FB6
    # NOTE(review): this path has no font-file extension and sits under
    # output_glyphs/, so it looks like a glyph output directory rather than a
    # font file. If so, char_code_to_text will print an error and return '' —
    # confirm the intended font path.
    font_path = '/Volumes/北海王/GitHub/ZhongHuaSongFont/output_glyphs/中华书局宋体00平面_20231107'

    # Look up the text for the character code
    char_text = char_code_to_text(char_code, font_path)
    print(f"识别出的字符文本：{char_text}")

    # Show the result in the editing window (runs the lookup again internally)
    display_and_edit_result(char_code, font_path)


In [None]:
import tkinter as tk
from tkinter import font as tkFont
from PIL import ImageFont

# Create the main window
root = tk.Tk()
root.title("字体显示器")

# Editable text box that will show every character judged displayable
text_widget = tk.Text(root, wrap=tk.WORD, width=100, height=40)
text_widget.pack(fill=tk.BOTH, expand=True)

# Font file to inspect
font_path = "/Volumes/北海王/GitHub/ZhongHuaSongFont/font/康熙字典体/KX_47043_15.ttf"  # replace with your local font path
font_size = 20  # font size

# Load the font with PIL so glyph metrics can be queried
try:
    pil_font = ImageFont.truetype(font_path, font_size)
    print("字体加载成功！")
except IOError:
    print(f"无法加载字体：{font_path}")
    exit(1)

# NOTE(review): this only asks Tk for a family called "CustomFont"; nothing in
# this cell registers font_path with Tk, so the widget presumably falls back to
# a default family unless a font of that name is installed — verify.
tk_font = tkFont.Font(family="CustomFont", size=font_size)

# Apply the font to the text box
text_widget.configure(font=tk_font)


def _rendered_width(font, text):
    """Return the rendered width of ``text``, as ``font.getsize(text)[0]`` used to.

    ``ImageFont.getsize`` no longer exists in Pillow 10; the right edge of
    ``getbbox`` is meant to be the same quantity. Older Pillow releases without
    ``getbbox`` still use ``getsize``.
    """
    if hasattr(font, "getbbox"):
        return font.getbbox(text)[2]
    return font.getsize(text)[0]


# Collected in lists and joined once, instead of growing strings in the loop
displayable = []
undisplayable = []

# Text file that records undisplayable characters and their code points
with open("undisplayable_characters.txt", "w", encoding="utf-8") as f:
    # Walk U+0000..U+FFFF inclusive (the end of range() is exclusive)
    for code_point in range(0x0000, 0x10000):
        # Surrogate code points are not characters on their own; skip them
        if 0xD800 <= code_point <= 0xDFFF:
            continue
        character = chr(code_point)
        try:
            # A positive width is taken as "the font can show this character".
            # NOTE(review): a font may also draw a placeholder glyph with a
            # positive width for characters it does not cover — confirm this
            # heuristic is good enough.
            if _rendered_width(pil_font, character) > 0:
                displayable.append(character)
            else:
                # Zero width: record the character and its code point
                undisplayable.append(character)
                f.write(f"无法显示的字符: {character} - 编码: U+{code_point:04X}\n")
        except Exception as e:
            # Measuring failed: record the error with the character
            f.write(f"错误: {e}, 字符: {character} - 编码: U+{code_point:04X}\n")

all_characters = "".join(displayable)
undisplayable_characters = "".join(undisplayable)  # characters that could not be displayed

# Insert all displayable characters into the text box
text_widget.insert(tk.END, all_characters)

# Run the application
root.mainloop()

In [2]:
from fontTools.ttLib import TTFont
import os

# Font files to process
font_files = [
    '/Volumes/北海王/GitHub/ZhongHuaSongFont/font/中华书局宋体字库/中华书局宋体00平面_20231107.TTF',
    '/Volumes/北海王/GitHub/ZhongHuaSongFont/font/中华书局宋体字库/中华书局宋体02平面_20231010.TTF',
    '/Volumes/北海王/GitHub/ZhongHuaSongFont/font/中华书局宋体字库/中华书局宋体15平面_20240514.TTF',
    '/Volumes/北海王/GitHub/ZhongHuaSongFont/font/中华书局宋体字库/中华书局宋体16平面_20240906.TTF'
]

# For each font, write its glyph order and glyph count to "<font name>_glyph_codes.txt"
for font_path in font_files:
    font = None
    try:
        # Open the font file
        font = TTFont(font_path)

        # Glyph order of the font (the entries are joined as strings below)
        glyphs = font.getGlyphOrder()

        # Font file name without directory or extension
        font_name = os.path.splitext(os.path.basename(font_path))[0]

        # One output file per font, created in the current working directory
        output_file = f'{font_name}_glyph_codes.txt'

        # Write the glyph list and the total count
        with open(output_file, 'w', encoding='utf-8') as out_file:
            out_file.write('Glyph Codes:\n')
            out_file.write(f'{", ".join(glyphs)}\n')
            out_file.write(f'Total Glyphs: {len(glyphs)}\n')

        print(f"Glyph codes and total count have been written to {output_file}")

    except Exception as e:
        # Keep going with the remaining fonts; just report the failure
        print(f'Error reading {font_path}: {e}')

    finally:
        # Release the file handle held by the font object before the next font
        if font is not None:
            font.close()


Glyph codes and total count have been written to 中华书局宋体00平面_20231107_glyph_codes.txt
Glyph codes and total count have been written to 中华书局宋体02平面_20231010_glyph_codes.txt
Glyph codes and total count have been written to 中华书局宋体15平面_20240514_glyph_codes.txt
Glyph codes and total count have been written to 中华书局宋体16平面_20240906_glyph_codes.txt


In [23]:
import json
import re

# File paths: tab-separated IDS input, and the text file that receives
# "<character>\t<cleaned first sequence>" lines (written relative to the CWD).
file_path = '/Volumes/北海王/GitHub/ZhongHuaSongFont/text_recognition_project/assets/ids-main/ids_lv0.txt'
output_txt_path = 'char_to_first_sequence.txt'

# Mapping dictionaries, filled while the input is processed:
#   sequence_to_id      - one symbol of a cleaned sequence -> integer ID
#   char_to_sequence_id - character -> list of symbol IDs of its cleaned sequence
sequence_to_id = {}
char_to_sequence_id = {}

# Next integer ID to hand out to a symbol not seen before (IDs start at 1)
current_id = 1

# Removes ASCII letters that are not inside #(...):
# a run of letters that is not immediately preceded by '#', '(' or '[' and is
# not followed by optional word characters and then ')' or ']'.
# NOTE(review): the look-behind is only checked at the position where a match
# starts, so the tail of a letter run right after '(' can still match when no
# closing bracket follows — confirm this is acceptable for the input data.
delete_letters_pattern = re.compile(r'(?<![#\(\[])[a-zA-Z]+(?![\w]*[\)\]])')

# Removes the literal texts "(.)" and ".(.)", and any remaining "."
delete_dots_pattern = re.compile(r'\(\.\)|\.\(\.\)|\.')

# Matches "#(" + one or more ASCII letters (either case) + ")" at the start of
# the string; the processing loop applies it with fullmatch().
hash_letter_pattern = re.compile(r'^#\([a-zA-Z]+\)')



def get_encoded_sequence(sequence):
    """Translate each symbol of ``sequence`` into its integer ID.

    Symbols not yet present in the module-level ``sequence_to_id`` table are
    registered with the next free ID, taken from the module-level counter
    ``current_id``, which is then advanced.

    :param sequence: iterable of symbols (e.g. a string)
    :return: list of integer IDs, one per symbol, in input order
    """
    global current_id
    ids = []
    for symbol in sequence:
        symbol_id = sequence_to_id.get(symbol)
        if symbol_id is None:
            # First occurrence: assign the next ID and advance the counter.
            symbol_id = current_id
            sequence_to_id[symbol] = symbol_id
            current_id += 1
        ids.append(symbol_id)
    return ids

# Read the IDS file line by line, keep only the first sequence of each
# character, clean it, write it out, and record its encoded form.
with open(file_path, 'r', encoding='utf-8') as f, \
        open(output_txt_path, 'w', encoding='utf-8') as out_txt:
    for line in f:
        parts = line.strip().split('\t')
        # Lines without at least "<character>\t<sequences>" are skipped
        if len(parts) < 2:
            continue

        character, ids_field = parts[0], parts[1]
        # Alternative sequences are separated by ';' — only the first is used
        first_ids = ids_field.split(';', 1)[0]

        if hash_letter_pattern.fullmatch(first_ids) is None:
            # Strip letters outside #(...), then the "(.)", ".(.)" and "." pieces
            without_letters = delete_letters_pattern.sub('', first_ids)
            cleaned_ids = delete_dots_pattern.sub('', without_letters)
        else:
            # The whole sequence is just #(letters): use the character itself
            cleaned_ids = character

        # Character and cleaned first sequence go to the text output
        out_txt.write(f"{character}\t{cleaned_ids}\n")

        # The encoded sequence is remembered per character
        char_to_sequence_id[character] = get_encoded_sequence(cleaned_ids)

# Dump both mappings as JSON (symbol table first, then the per-character sequences)
for json_path, mapping in (
    ('sequence_to_id.json', sequence_to_id),
    ('char_to_sequence_id.json', char_to_sequence_id),
):
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(mapping, f, ensure_ascii=False, indent=2)

import re

# Input: the "<character>\t<sequence>" text file; output: a second, further cleaned copy.
# NOTE(review): the earlier step in this cell writes 'char_to_first_sequence.txt'
# relative to the working directory, while this step reads it from the ids-main
# folder — confirm both refer to the same file.
file_path = '/Volumes/北海王/GitHub/ZhongHuaSongFont/text_recognition_project/assets/ids-main/char_to_first_sequence.txt'  # replace with the path of your txt file
output_txt_path = '/Volumes/北海王/GitHub/ZhongHuaSongFont/text_recognition_project/assets/ids-main/char_to_first_sequence_2.txt'

# "#(" + one or more ASCII letters + ")", anywhere in the sequence
hash_letter_pattern = re.compile(r'#\([a-zA-Z]+\)')

# A single ASCII letter wrapped in parentheses, e.g. "(A)"; these are dropped
single_letter_pattern = re.compile(r'\([a-zA-Z]\)')

with open(file_path, 'r', encoding='utf-8') as f, \
        open(output_txt_path, 'w', encoding='utf-8') as out_txt:
    for line in f:
        parts = line.strip().split('\t')
        # Lines without both columns are skipped
        if len(parts) < 2:
            continue

        character = parts[0]
        ids_sequence = parts[1]

        # Every #(letters) placeholder becomes the character itself,
        # then parenthesised single letters are removed
        with_character = hash_letter_pattern.sub(character, ids_sequence)
        cleaned_ids = single_letter_pattern.sub('', with_character)

        # Write the processed line to the new file
        out_txt.write(f"{character}\t{cleaned_ids}\n")


In [10]:
import json

# Load the symbol -> ID table from JSON
with open('/Volumes/北海王/GitHub/ZhongHuaSongFont/text_recognition_project/assets/ids-main/sequence_to_id.json', 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

# Write every key on its own line of output.txt
with open('output.txt', 'w', encoding='utf-8') as txt_file:
    txt_file.writelines(key + '\n' for key in data.keys())

print("所有键已成功写入 output.txt 文件中。")


所有键已成功写入 output.txt 文件中。


In [25]:
import json

# Load the character -> encoded sequence mapping from JSON
with open('/Volumes/北海王/GitHub/ZhongHuaSongFont/text_recognition_project/assets/ids-main/char_to_sequence_id.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Length of the longest encoded sequence
max_length = max(map(len, data.values()))

# Show the result
print(max_length)


44


In [3]:
import os
import shutil

# Paths used by this cell
txt_file_path = '/Volumes/北海王/GitHub/ZhongHuaSongFont/text_recognition_project/assets/ids-main/ids_lv0.txt'  # path of your txt file
source_dir = '/Volumes/北海王/GitHub/ZhongHuaSongFont/output_glyphs'
destination_dir = '/Volumes/北海王/GitHub/ZhongHuaSongFont/train'  # path of the new folder

# Create the destination folder if it does not exist yet.
# exist_ok avoids the separate "exists?" check and the race between check and creation.
os.makedirs(destination_dir, exist_ok=True)

# Convert a character to its Unicode notation, e.g. U+5C10
def char_to_unicode(char):
    """Return ``char`` as ``U+XXXX``: upper-case hex, zero-padded to at least 4 digits.

    The padding matches the ``U+{code:04X}`` names that the glyph extractors in
    this notebook give their PNG files, so code points below U+1000 line up too.

    :param char: a single character
    :return: e.g. ``'U+5C10'`` or ``'U+0041'``
    :raises TypeError: if ``char`` is not exactly one character (raised by ``ord``)
    """
    return f'U+{ord(char):04X}'

# Index every file below source_dir once (file name -> list of full paths),
# instead of walking the whole tree again for every line of the text file.
# This assumes destination_dir is not located inside source_dir, which holds
# for the paths configured in this cell.
files_by_name = {}
for root, dirs, files in os.walk(source_dir):
    for file_name in files:
        files_by_name.setdefault(file_name, []).append(os.path.join(root, file_name))

# Read the characters from the txt file and move their images
with open(txt_file_path, 'r', encoding='utf-8') as txt_file:
    for line in txt_file:
        fields = line.split()
        # Blank lines have no first field
        if not fields:
            continue
        # The first field of each line is the target character
        char = fields[0]
        # Anything longer than one character cannot be turned into a code point
        if len(char) != 1:
            continue
        unicode_name = char_to_unicode(char)
        file_name = f'{unicode_name}.png'

        # pop() so a character listed twice does not try to move files that are already gone
        for source_file_path in files_by_name.pop(file_name, ()):
            # Same file name, directly inside destination_dir
            destination_file_path = os.path.join(destination_dir, file_name)
            # Move the file
            shutil.move(source_file_path, destination_file_path)
            print(f'Moved: {file_name}')


Moved: U+4E00.png
Moved: U+4E01.png
Moved: U+4E02.png
Moved: U+4E03.png
Moved: U+4E04.png
Moved: U+4E05.png
Moved: U+4E06.png
Moved: U+4E07.png
Moved: U+4E08.png
Moved: U+4E09.png
Moved: U+4E0A.png
Moved: U+4E0B.png
Moved: U+4E0C.png
Moved: U+4E0D.png
Moved: U+4E0E.png
Moved: U+4E0F.png
Moved: U+4E10.png
Moved: U+4E11.png
Moved: U+4E12.png
Moved: U+4E13.png
Moved: U+4E14.png
Moved: U+4E15.png
Moved: U+4E16.png
Moved: U+4E17.png
Moved: U+4E18.png
Moved: U+4E19.png
Moved: U+4E1A.png
Moved: U+4E1B.png
Moved: U+4E1C.png
Moved: U+4E1D.png
Moved: U+4E1E.png
Moved: U+4E1F.png
Moved: U+4E20.png
Moved: U+4E21.png
Moved: U+4E22.png
Moved: U+4E23.png
Moved: U+4E24.png
Moved: U+4E25.png
Moved: U+4E26.png
Moved: U+4E27.png
Moved: U+4E28.png
Moved: U+4E29.png
Moved: U+4E2A.png
Moved: U+4E2B.png
Moved: U+4E2C.png
Moved: U+4E2D.png
Moved: U+4E2E.png
Moved: U+4E2F.png
Moved: U+4E30.png
Moved: U+4E31.png
Moved: U+4E32.png
Moved: U+4E33.png
Moved: U+4E34.png
Moved: U+4E35.png
Moved: U+4E36.png
Moved: U+4