In [1]:
import random

def split_text(text):
    text_length = len(text)
    split_lengths = []

    while text_length > 0:
        split_length = random.randint(1, 20)
        split_lengths.append(split_length)
        text_length -= split_length

    start = 0
    split_texts = []

    for length in split_lengths:
        split_texts.append(text[start:start + length])
        start += length
    
    return split_texts

Process and save *苦难的年代* into textlines.

In [2]:
import os
import fitz


with open("textlines/niandai-0.txt", "w", encoding="utf-8") as output:
    for file in os.listdir("corpus"):
        if not file.endswith(".pdf"):
            continue

        with fitz.open(f"corpus/{file}") as doc:
            for page in doc:
                text = page.get_text("text")
                text = text.strip().replace("\n", "").replace(" ", "")
                if text:
                    for seg in split_text(text):
                        if random.random() > 0.8:
                            seg = "  " + seg

                        output.write(seg + "\n")


In [1]:
import re
import os
import csv

import fitz

LABELS_DIR = os.path.join("dataset_syn", "labels", "niandai")
INDEX_PATH = os.path.join("dataset_syn", "indexes", "niandai.tsv")

if not os.path.exists(LABELS_DIR):
    os.makedirs(LABELS_DIR)

with open(INDEX_PATH, "w", encoding="utf-8") as index_file:
    writer = csv.writer(index_file, delimiter='\t', lineterminator='\n')

    file_idx = 0

    for file in os.listdir("corpus"):
        print(file)

        if not file.endswith(".pdf"):
            continue

        lines = []

        with fitz.open(f"corpus/{file}") as doc:
            for page in doc:
                blocks = page.get_text("blocks")
                for block in blocks:
                    text = block[4]
                    text = text.strip().replace("\n", "")
                    text = re.sub(r"\s+", " ", text)
                    if text:
                        lines.append(text)

        if lines:
            with open(os.path.join(LABELS_DIR, f"niandai-{file_idx}.txt"), "w", encoding="utf-8") as f:
                f.write("\n".join(lines))

            relative_label_file_path = os.path.join(
                "niandai", f"niandai-{file_idx}.txt")
            writer.writerow([relative_label_file_path, len(lines)])

            file_idx += 1

《苦难的年代》（第二十一章至第三十章）.pdf
《苦难的年代》（第六十一章至第七十一章）.pdf
《苦难的年代》（第三十一章至第四十章）.pdf
《苦难的年代》（第四十一章至第五十章）.pdf
《苦难的年代》（第五十一章至第六十章）.pdf
《苦难的年代》（第十一章至第二十章）.pdf
《苦难的年代》（第一章至第十章）.pdf


Process poetry dataset

In [3]:
import os

POETRY_DATA = "../poetry/data/"


def output_handle(num, output, current_output_index):
    new_index = num // 50000

    if output and current_output_index == new_index:
        return current_output_index, output

    if output:
        output.close()

    return new_index, open(f"textlines/poetry-{new_index}.txt",
                           "w",
                           encoding="utf-8")


def process_poetry():
    num_lines = 0
    output_index = 0
    _, output = output_handle(num_lines, None, output_index)

    for root, _dirs, files in os.walk(POETRY_DATA):
        for file in files:
            if not file.endswith(".pt"):
                continue

            with open(os.path.join(root, file), "r", encoding="utf-8") as doc:
                text = doc.read()
                text = "".join(text.split("\n")[2:])
                if text:
                    for seg in split_text(text):
                        if random.random() > 0.8:
                            seg = "  " + seg

                        output_index, output = output_handle(
                            num_lines, output, output_index)
                        output.write(seg + "\n")
                        num_lines += 1


process_poetry()

In [4]:
import os
import random

from tqdm import tqdm
import ipyplot
from PIL import Image, ImageFont
from handright import Template, handwrite
from itertools import product
import concurrent.futures

available_fonts = [
    ImageFont.truetype(f"fonts/{font}", size=160)
    for font in os.listdir("fonts") if font.endswith((".ttf", ".otf"))
]
fonts = lambda: random.choice(available_fonts)
fills = lambda: tuple(random.choices(range(50), k=3))
graph_outlines = lambda: tuple(random.choices(range(128), k=3))
graph_widths = lambda: random.randint(2, 8)
graph_papers = lambda min_chars: (random.randint(min_chars, 25), 1)
margins = lambda: random.randint(0, 150)


def construct_template(text):
    min_chars = len(text)
    args = {
        "font": fonts(),
        "fill": fills(),
        "graph_outline": graph_outlines(),
        "graph_width": graph_widths(),
        "graph_paper": graph_papers(min_chars),
        "left_margin": margins(),
        "top_margin": margins(),
        "right_margin": margins(),
        "bottom_margin": margins(),
    }

    return Template(
        **args,
        background=Image.new(mode="RGB", size=(200, 200), color="#fff"),
        line_spacing_sigma=4,  # 行间距随机扰动
        font_size_sigma=2,  # 字体大小随机扰动
        word_spacing_sigma=4,  # 字间距随机扰动
        # start_chars="“（[<",  # 特定字符提前换行，防止出现在行尾
        # end_chars="，。",  # 防止特定字符因排版算法的自动换行而出现在行首
        perturb_x_sigma=4,  # 笔画横向偏移随机扰动
        perturb_y_sigma=6,  # 笔画纵向偏移随机扰动
        perturb_theta_sigma=0.05,  # 笔画旋转偏移随机扰动
        single_line=True,
        features=set([1]))


def draw_line(data):
    file, line, i = data
    if not line:
        return (file, [], i)

    images = list(handwrite(line, construct_template(line)))
    return (file, images, i)


def textlines(file):
    for i, line in enumerate(open(file, "r", encoding="utf-8")):
        if not line:
            continue

        yield (file, line, i)


def generate_data():
    with concurrent.futures.ProcessPoolExecutor() as executor:
        for file in os.listdir(TEXTLINES_DIR):
            print(f"Processing {file}")
            output_dir = f"output/{file[:-4]}"
            input_file = os.path.join(TEXTLINES_DIR, file)

            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            if not file.endswith(".txt"):
                continue

            for res in tqdm(executor.map(draw_line, textlines(input_file), chunksize=3)):
                _file, images, i = res
                for im in images:
                    im.save(f"{output_dir}/{i}.jpg")


TEXTLINES_DIR = "textlines"
generate_data()


Processing poetry-0.txt


50000it [09:26, 88.27it/s] 


Processing poetry-1.txt


50000it [09:25, 88.35it/s] 


Processing poetry-2.txt


50000it [09:18, 89.47it/s] 


Processing poetry-3.txt


50000it [09:21, 89.00it/s] 


Processing poetry-4.txt


50000it [09:26, 88.24it/s] 


Processing poetry-5.txt


50000it [09:21, 89.00it/s] 


Processing poetry-6.txt


50000it [09:17, 89.73it/s] 


Processing poetry-7.txt


50000it [09:21, 89.09it/s] 


Processing poetry-8.txt


50000it [09:01, 92.39it/s] 


Processing poetry-9.txt


50000it [09:14, 90.20it/s] 


Processing poetry-10.txt


50000it [09:14, 90.13it/s] 


Processing poetry-11.txt


50000it [09:27, 88.12it/s] 


Processing poetry-12.txt


50000it [09:22, 88.83it/s] 


Processing poetry-13.txt


50000it [09:24, 88.60it/s] 


Processing poetry-14.txt


50000it [09:21, 88.97it/s] 


Processing poetry-15.txt


50000it [09:20, 89.13it/s] 


Processing poetry-16.txt


50000it [09:22, 88.87it/s] 


Processing poetry-17.txt


50000it [09:20, 89.13it/s] 


Processing poetry-18.txt


50000it [09:21, 89.09it/s] 


Processing poetry-19.txt


50000it [09:05, 91.69it/s] 


Processing poetry-20.txt


50000it [09:23, 88.78it/s] 


Processing poetry-21.txt


50000it [09:26, 88.18it/s] 


Processing poetry-22.txt


50000it [09:28, 87.94it/s] 


Processing poetry-23.txt


50000it [09:27, 88.17it/s] 


Processing poetry-24.txt


50000it [09:18, 89.48it/s] 


Processing poetry-25.txt


50000it [09:17, 89.68it/s] 


Processing poetry-26.txt


50000it [09:17, 89.70it/s] 


Processing poetry-27.txt


50000it [09:25, 88.44it/s] 


Processing poetry-28.txt


50000it [09:34, 87.00it/s] 


Processing poetry-29.txt


50000it [09:21, 89.11it/s] 


Processing poetry-30.txt


20660it [03:54, 88.00it/s] 


Processing niandai-0.txt


57014it [10:05, 94.10it/s] 


In [8]:
import os
import csv

from tqdm import tqdm


TEXTLINES_DIR = "textlines"
OUTPUT_DIR = "output"

with open(f'tang-syn-labels.tsv', 'w', newline='') as tsvfile:
    writer = csv.writer(tsvfile, delimiter='\t', lineterminator='\n')

    for file in os.listdir(TEXTLINES_DIR):
        image_dir = file[:-4]

        with open(os.path.join(TEXTLINES_DIR, file), "r", encoding="utf-8") as f:
            for i, line in enumerate(tqdm(f)):

                if not os.path.exists(os.path.join(OUTPUT_DIR, image_dir, f"{i}.jpg")):
                    print(f"{image_dir}/{i}.jpg not found")
                    continue

                writer.writerow([f"{image_dir}/{i}.jpg", line.replace("\n", "")])
            

50000it [00:00, 320336.78it/s]
50000it [00:00, 334265.55it/s]
50000it [00:00, 325044.60it/s]
50000it [00:00, 327158.59it/s]
50000it [00:00, 324450.20it/s]
50000it [00:00, 333017.12it/s]
50000it [00:00, 334381.74it/s]
50000it [00:00, 339411.44it/s]
50000it [00:00, 339329.61it/s]
50000it [00:00, 339814.57it/s]
50000it [00:00, 342950.96it/s]
50000it [00:00, 341700.95it/s]
50000it [00:00, 333677.86it/s]
50000it [00:00, 341433.36it/s]
50000it [00:00, 333471.99it/s]
50000it [00:00, 335712.44it/s]
50000it [00:00, 335487.96it/s]
50000it [00:00, 341806.21it/s]
50000it [00:00, 341016.96it/s]
50000it [00:00, 337007.74it/s]
50000it [00:00, 331812.62it/s]
50000it [00:00, 342474.36it/s]
50000it [00:00, 339945.11it/s]
50000it [00:00, 343602.20it/s]
50000it [00:00, 342594.08it/s]
50000it [00:00, 345254.53it/s]
50000it [00:00, 345018.24it/s]
50000it [00:00, 338480.71it/s]
50000it [00:00, 334829.66it/s]
50000it [00:00, 327575.59it/s]
20660it [00:00, 329578.10it/s]
57014it [00:00, 335353.27it/s]
