In [None]:
!pip install -U wxPython

In [None]:
!pip install -U Faker

In [None]:
import string

import pandas as pd
import wx
from faker import Faker

In [None]:
# Font face names handed to wx when measuring text.
FONT_FACES = [
    # --- main set ---
    "Courier", "Geneva", "Georgia", "Helvetica",
    "Lucida Grande", "Times New Roman", "Verdana",
    # --- additional set ---
    "Arial",            # pretty standard
    "Brush Script MT",  # very curved
    "Lucida Console",   # monospaced
    "Wingdings",        # just symbols
]

In [None]:
# Font sizes that are passed to wx.FontInfo when measuring text.
FONT_SIZES = [9, 11, 12, 14, 17, 20]

In [None]:
# Every bold/italic combination. Bold toggles fastest, which yields the order
# regular, bold, italic, bold+italic.
FONT_VERSIONS = [
    {'bold': is_bold, 'italic': is_italic}
    for is_italic in (False, True)
    for is_bold in (False, True)
]

In [None]:
def _collect_locale_symbols(locale, expected_count, random_state):
    """Return the distinct characters of a seeded Faker paragraph for ``locale``.

    Faker is seeded globally with ``random_state``, a 1000-sentence paragraph
    is generated for ``locale`` and its distinct characters are collected.

    Args:
        locale: Faker locale name, e.g. ``"ja_JP"``.
        expected_count: number of distinct characters the paragraph must contain.
        random_state: seed passed to ``Faker.seed``.

    Returns:
        A list of ``(code point, character)`` tuples sorted by code point.

    Raises:
        AssertionError: if the number of distinct characters differs from
            ``expected_count``.
    """
    # Function-scope import, exactly as the two public helpers did originally.
    from faker import Faker

    Faker.seed(random_state)
    fake = Faker(locale=locale)
    text = fake.paragraph(nb_sentences=1_000)

    # Sorting single characters orders them by code point, so no sort key is needed.
    symbols = [(ord(c), c) for c in sorted(set(text))]

    # NOTE(review): the expected counts were presumably recorded with the default
    # seed and one specific Faker release; another seed or release may trip this.
    assert len(symbols) == expected_count, (
        f"expected {expected_count} distinct characters for locale {locale!r}, "
        f"got {len(symbols)}"
    )
    return symbols


def get_japanese(random_state=42):
    """Return the sorted ``(code point, character)`` pairs found in seeded ``ja_JP`` Faker text.

    Raises ``AssertionError`` unless exactly 316 distinct characters are found.
    """
    return _collect_locale_symbols("ja_JP", 316, random_state)


def get_greek(random_state=42):
    """Return the sorted ``(code point, character)`` pairs found in seeded ``el_GR`` Faker text.

    Raises ``AssertionError`` unless exactly 65 distinct characters are found.
    """
    return _collect_locale_symbols("el_GR", 65, random_state)

# Digits, ASCII punctuation and the space character; appended to both the
# Latin and the Cyrillic sets below.
SYMBOLS = [(ord(ch), ch) for ch in string.digits + string.punctuation + " "]

# A-Z, then a-z, then the shared symbols.
BASIC_LATIN_CHARS = [
    (ord(ch), ch)
    for ch in string.ascii_uppercase + string.ascii_lowercase
    if ch.isprintable()
] + SYMBOLS

# Code points U+0410..U+044F, then the shared symbols.
# NOTE(review): U+0401 (Ё) and U+0451 (ё) lie outside this range and are
# therefore not included - confirm that this is intended.
RUSSIAN_CHARS = [
    (code, chr(code))
    for code in range(0x0410, 0x044F + 1)
    if chr(code).isprintable()
] + SYMBOLS

# Greek and Japanese sets are sampled from Faker-generated text.
GREEK_CHARS = get_greek()
JAPANESE_CHARS = get_japanese()

# Flat list of (code point, character, alphabet label) triples.
CHARS = [
    (code, ch, alphabet)
    for alphabet, pairs in (
        ("basic_latin", BASIC_LATIN_CHARS),
        ("russian", RUSSIAN_CHARS),
        ("greek", GREEK_CHARS),
        ("japanese", JAPANESE_CHARS),
    )
    for code, ch in pairs
]

In [None]:
def size_in_px(text, font_face="Arial", font_size=11, bold=False, italic=False):
    """Measure ``text`` with a wx screen device context.

    Args:
        text: string to measure.
        font_face: face name given to ``wx.FontInfo.FaceName``.
        font_size: size given to ``wx.FontInfo``.
        bold: apply ``FontInfo.Bold()`` when truthy.
        italic: apply ``FontInfo.Italic()`` when truthy.

    Returns:
        Whatever ``wx.ScreenDC.GetTextExtent(text)`` returns; callers in this
        notebook read element ``[0]`` as the width.
    """
    # A wx.App must exist before fonts and DCs are used. Creating a fresh one
    # on every call is wasteful because this function runs once per measured
    # string, so reuse the live app and keep a reference to the one created
    # here so it is not garbage-collected between calls.
    if wx.GetApp() is None:
        size_in_px._app = wx.App()

    font_info = wx.FontInfo(font_size).FaceName(font_face)
    if bold:
        font_info = font_info.Bold()
    if italic:
        font_info = font_info.Italic()
    font = wx.Font(font_info)

    screen_dc = wx.ScreenDC()
    screen_dc.SetFont(font)
    return screen_dc.GetTextExtent(text)

In [None]:
def get_char_widths_df():
    """Build a table with the measured width of every character in every font setup.

    Iterates over the product of ``FONT_FACES``, ``FONT_SIZES``,
    ``FONT_VERSIONS`` and ``CHARS`` (characters vary fastest) and measures each
    single character with ``size_in_px``.

    Returns:
        A DataFrame with the columns ``char_id``, ``char``, ``alphabet``,
        ``font_face``, ``font_size``, ``font_version`` and ``width``, where
        ``font_version`` is one of ``""``, ``"b"``, ``"i"``, ``"bi"``.
    """
    from itertools import product

    # (bold, italic) -> short label stored in the font_version column
    version_labels = {
        (False, False): "",
        (True, False): "b",
        (False, True): "i",
        (True, True): "bi",
    }
    column_names = ("char_id", "char", "alphabet", "font_face", "font_size", "font_version", "width")
    columns = {name: [] for name in column_names}

    combos = product(FONT_FACES, FONT_SIZES, FONT_VERSIONS, CHARS)
    for face, size, version, (char_id, char, alphabet) in combos:
        bold, italic = version['bold'], version['italic']

        columns["font_face"].append(face)
        columns["font_size"].append(size)
        columns["font_version"].append(version_labels[(bool(bold), bool(italic))])
        columns["char_id"].append(char_id)
        columns["char"].append(char)
        columns["alphabet"].append(alphabet)
        # element 0 of the measured extent is taken as the width
        columns["width"].append(size_in_px(char, face, size, bold, italic)[0])

    return pd.DataFrame(columns)

In [None]:
def get_local_control_df(alphabet, font_face="Arial", font_size=11, font_version="", size=1000, max_words_count=100, random_state=42):
    """Generate Faker sentences for one alphabet/font setup and measure their widths.

    Args:
        alphabet: one of ``"basic_latin"``, ``"russian"``, ``"greek"``,
            ``"japanese"``; selects the Faker locale.
        font_face: face name forwarded to ``size_in_px``.
        font_size: font size forwarded to ``size_in_px``.
        font_version: label string; bold is used if it contains ``"b"``,
            italic if it contains ``"i"``.
        size: number of sentences to generate.
        max_words_count: scales the ``nb_words`` argument, which grows from 1
            for the first sentence towards this value for the last one.
        random_state: seed passed to ``Faker.seed``.

    Returns:
        A DataFrame sorted by ``symbols_count`` (original row index kept) with
        the columns ``text``, ``symbols_count``, ``width``, ``alphabet``,
        ``font_face``, ``font_size`` and ``font_version``.

    Raises:
        KeyError: if ``alphabet`` is not one of the supported names.
    """
    from faker import Faker

    locale_by_alphabet = {
        "basic_latin": "en_US",
        "russian": "ru_RU",
        "greek": "el_GR",
        "japanese": "ja_JP",
    }
    Faker.seed(random_state)
    generator = Faker(locale=locale_by_alphabet[alphabet])

    sentences = []
    for n in range(size):
        requested_words = int(max_words_count * n / size) + 1
        sentences.append(generator.sentence(nb_words=requested_words))

    frame = pd.DataFrame({"text": sentences})
    frame = frame.assign(symbols_count=frame.text.str.len()).sort_values("symbols_count")

    is_bold = "b" in font_version
    is_italic = "i" in font_version

    def measure(sentence):
        # element 0 of the measured extent is taken as the width
        return size_in_px(sentence, font_face, font_size, is_bold, is_italic)[0]

    frame["width"] = frame.text.apply(measure)

    # Constant columns describing the setup that produced these rows.
    return frame.assign(
        alphabet=alphabet,
        font_face=font_face,
        font_size=font_size,
        font_version=font_version,
    )

In [None]:
def get_control_df(bunch_size=1000, random_state=42):
    """Build the control set: measured sentences for every alphabet/font setup.

    Calls ``get_local_control_df`` once for each combination of alphabet
    (the distinct labels in ``CHARS``), ``FONT_FACES``, ``FONT_SIZES`` and
    ``FONT_VERSIONS`` and stacks the results.

    Args:
        bunch_size: number of sentences generated per combination.
        random_state: seed forwarded to every ``get_local_control_df`` call.

    Returns:
        A DataFrame whose columns start with ``text``, ``width``, ``alphabet``,
        ``font_face``, ``font_size``, ``font_version``, followed by any other
        column the per-combination frames provide. The per-combination row
        index is kept, so index values repeat. With no combinations an empty
        frame with the six leading columns is returned.
    """
    from itertools import product

    base_columns = ["text", "width", "alphabet", "font_face", "font_size", "font_version"]

    # dict.fromkeys de-duplicates while keeping first-seen order. Iterating a
    # set of strings may give a different order on every interpreter run,
    # which made the row order of the result non-reproducible.
    alphabets = list(dict.fromkeys(c[2] for c in CHARS))

    frames = []
    for alphabet, font_face, font_size, version in product(alphabets, FONT_FACES, FONT_SIZES, FONT_VERSIONS):
        # "", "b", "i" or "bi"
        font_version = ("b" if version['bold'] else "") + ("i" if version['italic'] else "")
        frames.append(
            get_local_control_df(
                alphabet,
                font_face=font_face,
                font_size=font_size,
                font_version=font_version,
                size=bunch_size,
                random_state=random_state,
            )
        )

    # pd.concat raises on an empty list, so handle "nothing to do" explicitly.
    if not frames:
        return pd.DataFrame(columns=base_columns)

    # Concatenate once instead of growing a frame inside the loop.
    combined = pd.concat(frames)
    extra_columns = [name for name in combined.columns if name not in base_columns]
    return combined.reindex(columns=base_columns + extra_columns)

In [None]:
# Measure every character in CHARS for every font face/size/version combination.
char_widths_df = get_char_widths_df()
# Show the first rows as the cell output.
char_widths_df.head()

In [None]:
# Save the per-character widths as CSV without the DataFrame index.
# NOTE(review): the path is relative; assumes ../data/full already exists - TODO confirm.
char_widths_df.to_csv("../data/full/char_widths.csv", index=False)

In [None]:
# Generate and measure the control sentences for every alphabet/font combination.
control_df = get_control_df()
# Show the first rows as the cell output.
control_df.head()

In [None]:
# Save the control set as CSV without the DataFrame index.
# NOTE(review): the path is relative; assumes ../data/full already exists - TODO confirm.
control_df.to_csv("../data/full/control.csv", index=False)