In [3]:
import random

def split_text(text, min_length=1, max_length=20):
    """Cut ``text`` into consecutive segments of random length.

    Each segment length is drawn with ``random.randint(min_length, max_length)``.
    The last segment is whatever text remains, so it may be shorter than the
    drawn length. Joining the returned segments reproduces ``text`` exactly.

    Args:
        text: The string to split.
        min_length: Smallest length to draw (inclusive). Must be at least 1,
            otherwise a zero-length draw could keep the loop from advancing.
        max_length: Largest length to draw (inclusive). Must not be smaller
            than ``min_length``.

    Returns:
        A list of non-empty substrings in their original order, or ``[]`` when
        ``text`` is empty.

    Raises:
        ValueError: If ``min_length < 1`` or ``max_length < min_length``.
    """
    if min_length < 1 or max_length < min_length:
        raise ValueError(
            f"invalid segment length bounds: min_length={min_length}, max_length={max_length}"
        )

    segments = []
    start = 0
    total = len(text)

    while start < total:
        length = random.randint(min_length, max_length)
        # Slicing past the end is safe: the final segment is simply truncated.
        segments.append(text[start:start + length])
        start += length

    return segments

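Extract the text of *苦难的年代* from the PDF files under `corpus/`, split it into short random-length segments, and save them one per line to `textlines/niandai.txt`.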

In [15]:
import os
import fitz


# Create the output folder up front so the cell also runs on a fresh checkout.
os.makedirs("textlines", exist_ok=True)

with open("textlines/niandai.txt", "w", encoding="utf-8") as output:
    # os.listdir returns entries in arbitrary order; sorting keeps the order of
    # the PDFs in the output file independent of the filesystem.
    for file in sorted(os.listdir("corpus")):
        if not file.endswith(".pdf"):
            continue

        with fitz.open(f"corpus/{file}") as doc:
            for page in doc:
                # Collapse the page into one unbroken string by dropping every
                # line break and space from the extracted text.
                text = page.get_text("text")
                text = text.strip().replace("\n", "").replace(" ", "")
                if not text:
                    continue  # nothing left on this page after cleaning

                for seg in split_text(text):
                    # About one segment in five gets a two-space prefix
                    # (presumably to imitate an indented line - confirm).
                    if random.random() > 0.8:
                        seg = "  " + seg

                    output.write(seg + "\n")


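Process the poetry dataset: read every `.pt` file under `../poetry/data/`, split its text into short random-length segments, and save them one per line to `textlines/poetry.txt`.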

In [16]:
import os

POETRY_DATA = "../poetry/data/"

# Create the output folder up front so the cell also runs on a fresh checkout.
os.makedirs("textlines", exist_ok=True)

with open("textlines/poetry.txt", "w", encoding="utf-8") as output:
    for root, _dirs, files in os.walk(POETRY_DATA):
        # Sorting the directory list in place fixes the order in which os.walk
        # descends; together with sorted(files) the output order no longer
        # depends on the filesystem.
        _dirs.sort()
        for file in sorted(files):
            if not file.endswith(".pt"):
                continue

            with open(os.path.join(root, file), "r", encoding="utf-8") as doc:
                text = doc.read()

            # Drop the first two lines of the file and join the rest without
            # line breaks (presumably the first two lines are a header such as
            # title/author - confirm against the dataset format).
            text = "".join(text.split("\n")[2:])
            if not text:
                continue

            for seg in split_text(text):
                # About one segment in five gets a two-space prefix.
                if random.random() > 0.8:
                    seg = "  " + seg

                output.write(seg + "\n")

In [None]:
import os
import random

from tqdm import tqdm
import ipyplot
from PIL import Image, ImageFont
from handright import Template, handwrite
from itertools import product
import concurrent.futures


def fonts():
    """Return one randomly chosen font from ``fonts/``, loaded at size 160.

    Only the chosen ``.ttf``/``.otf`` file is opened, instead of loading every
    font in the folder on each call.

    Raises:
        IndexError: If ``fonts/`` contains no ``.ttf`` or ``.otf`` file.
    """
    candidates = [name for name in os.listdir("fonts") if name.endswith((".ttf", ".otf"))]
    return ImageFont.truetype(f"fonts/{random.choice(candidates)}", size=160)


def fills():
    """Return a random 3-tuple of ints in 0..49 (presumably a dark RGB ink colour)."""
    return tuple(random.choices(range(50), k=3))


def graph_outlines():
    """Return a random 3-tuple of ints in 0..127 (presumably an RGB outline colour)."""
    return tuple(random.choices(range(128), k=3))


def graph_widths():
    """Return a random integer between 2 and 8 inclusive."""
    return random.randint(2, 8)


def graph_papers(min_chars):
    """Return ``(n, 1)`` where ``n`` is drawn from ``min_chars`` up to 25.

    When ``min_chars`` exceeds 25 the range would be empty, so the upper bound
    is raised to ``min_chars`` and the result is ``(min_chars, 1)`` rather than
    a ``ValueError`` from ``random.randint``.
    """
    return (random.randint(min_chars, max(min_chars, 25)), 1)


def construct_template(text):
    """Build a freshly randomised ``Template`` for rendering ``text``.

    The font, fill, graph outline, graph width and graph paper values are
    drawn anew on every call; the background, margins and jitter settings are
    fixed. ``len(text)`` is passed to ``graph_papers`` as its lower bound.
    """
    return Template(
        font=fonts(),
        fill=fills(),
        graph_outline=graph_outlines(),
        graph_width=graph_widths(),
        graph_paper=graph_papers(len(text)),
        background=Image.new(mode="RGB", size=(200, 200), color="#fff"),
        left_margin=20,
        top_margin=20,
        right_margin=20,
        bottom_margin=20,
        line_spacing_sigma=4,  # random jitter of the line spacing
        font_size_sigma=2,  # random jitter of the font size
        word_spacing_sigma=4,  # random jitter of the spacing between characters
        # start_chars="“（[<",  # disabled: wrap early so these characters never end a line
        # end_chars="，。",  # disabled: keep these characters from starting a line after auto-wrap
        perturb_x_sigma=4,  # random horizontal offset of each stroke
        perturb_y_sigma=6,  # random vertical offset of each stroke
        perturb_theta_sigma=0.05,  # random rotation of each stroke
        single_line=True,
        features={1})

def draw_line(data):
    """Render one text line into handwriting images.

    Args:
        data: A ``(file, line, i)`` tuple as produced by ``textlines``.

    Returns:
        ``(file, images, i)`` where ``images`` is the list produced by
        ``handwrite`` for ``line``, or ``[]`` when ``line`` is empty.
    """
    file, line, i = data
    # An empty string is never rendered; note that a line read from a file
    # still carries its trailing newline and therefore is not empty.
    images = list(handwrite(line, construct_template(line))) if line else []
    return (file, images, i)


def textlines(file):
    """Yield ``(file, line, index)`` for every line of a UTF-8 text file.

    Lines are yielded exactly as read, including their trailing newline, and
    ``index`` counts from 0. The file is opened in a ``with`` block so the
    handle is closed once the generator is exhausted or closed, instead of
    staying open until garbage collection.
    """
    with open(file, "r", encoding="utf-8") as handle:
        for i, line in enumerate(handle):
            yield (file, line, i)

TEXTLINES_DIR = "textlines"

for file in os.listdir(TEXTLINES_DIR):
    # Filter first, so no empty output folder is created for entries that are
    # not text files.
    if not file.endswith(".txt"):
        continue

    os.makedirs(f"output/{file}", exist_ok=True)

    # Render the lines of this file in parallel worker processes; executor.map
    # yields the results in input order.
    with concurrent.futures.ProcessPoolExecutor() as executor:
        results = tqdm(executor.map(draw_line, textlines(os.path.join(TEXTLINES_DIR, file))))
        for _file, images, i in results:
            for im in images:
                # NOTE(review): every image of a line is saved to the same
                # path, so if a line ever yields more than one image only the
                # last one is kept - confirm a line always yields one image.
                im.save(f"output/{file}/{i}.jpg")