In [1]:
!pip install -U wxPython

Collecting wxPython
  Using cached wxPython-4.2.0-cp38-cp38-win_amd64.whl (18.0 MB)
Collecting pillow
  Using cached Pillow-9.2.0-cp38-cp38-win_amd64.whl (3.3 MB)
Installing collected packages: pillow, wxPython
Successfully installed pillow-9.2.0 wxPython-4.2.0


In [2]:
!pip install -U Faker

Collecting Faker
  Using cached Faker-14.2.0-py3-none-any.whl (1.6 MB)
Installing collected packages: Faker
Successfully installed Faker-14.2.0


In [3]:
import os
import string

import pandas as pd
import wx

import os; import sys; sys.path.append(os.path.join(sys.path[0], ".."))
from utils import font as ufont

In [4]:
# Directory (relative path) into which the generated CSV files are written.
DATA_DIR = "../data"

In [5]:
def prepare_chars(alphabet, *char_lists):
    """Build ``Char`` records for all printable characters of the given lists.

    Args:
        alphabet: Label stored in the ``alphabet`` field of every record.
        *char_lists: Lists of ``(char_id, char, subset)`` tuples; they are
            concatenated in the order given.

    Returns:
        A list of ``Char(id, char, alphabet, subset)`` namedtuples, skipping
        every entry whose ``char`` is not printable.
    """
    from collections import namedtuple

    Char = namedtuple("Char", ["id", "char", "alphabet", "subset"])

    records = []
    for char_id, char, subset in sum(char_lists, start=[]):
        if not char.isprintable():
            continue
        records.append(Char(char_id, char, alphabet, subset))
    return records

def drop_duplicates_from_chars(chars):
    """Keep one record per ``id``.

    When several records share an id, the last one in ``chars`` is kept, at
    the position where that id first appeared.
    """
    latest_by_id = {}
    for record in chars:
        latest_by_id[record.id] = record
    return list(latest_by_id.values())

# Digits, ASCII punctuation and the space character.
SYMBOLS = [(ord(symbol), symbol, "symbols") for symbol in string.digits + string.punctuation + " "]

# Latin code-point ranges (the upper bound of range() is exclusive).
BASIC_LATIN = [(code, chr(code), "basic") for code in range(0x0020, 0x0080)]
LATIN_SUPPLEMENT = [(code, chr(code), "supplement") for code in range(0x00A0, 0x0100)]
LATIN_EXTENDED_A = [(code, chr(code), "extended") for code in range(0x0100, 0x0180)]
LATIN_EXTENDED_B = [(code, chr(code), "extended") for code in range(0x0180, 0x0250)]

# Cyrillic code-point ranges; U+0483..U+0487 (1155..1159) are left out of FULL_CYRILLIC.
BASIC_CYRILLIC = [(code, chr(code), "basic") for code in range(0x0410, 0x0450)]
CYRILLIC_SUPPLEMENTARY = [(code, chr(code), "supplement") for code in range(0x0500, 0x0530)]
FULL_CYRILLIC = [(code, chr(code), "extended") for code in range(0x0400, 0x0500) if not 0x0483 <= code <= 0x0487]

# drop_duplicates_from_chars keeps the LAST record per id, so the lists passed
# later (e.g. SYMBOLS, BASIC_*) decide the subset label of overlapping code points.
LATIN = prepare_chars("latin", LATIN_EXTENDED_A, LATIN_EXTENDED_B, LATIN_SUPPLEMENT, BASIC_LATIN, SYMBOLS)
CYRILLIC = prepare_chars("cyrillic", FULL_CYRILLIC, CYRILLIC_SUPPLEMENTARY, BASIC_CYRILLIC)
CHARS = sorted(drop_duplicates_from_chars(CYRILLIC + LATIN), key=lambda entry: entry.id)
print(len(CHARS))
CHARS[:5]

824


[Char(id=32, char=' ', alphabet='latin', subset='symbols'),
 Char(id=33, char='!', alphabet='latin', subset='symbols'),
 Char(id=34, char='"', alphabet='latin', subset='symbols'),
 Char(id=35, char='#', alphabet='latin', subset='symbols'),
 Char(id=36, char='$', alphabet='latin', subset='symbols')]

In [6]:
# Maps each locale passed to Faker for text generation to the alphabet label
# that get_texts_df stores in the "alphabet" column.
LOCALES = {
    "en_US": "latin",
    "es_ES": "latin",
    "fr_FR": "latin",
    "pt_PT": "latin",
    "ru_RU": "cyrillic",
}

In [7]:
def width_in_px(text, font=ufont.BASIC_FONT):
    """Measure the rendered width of ``text`` using wxPython.

    Args:
        text: String to measure.
        font: Font description exposing ``size``, ``family``, ``face.bold``
            and ``face.italic``.

    Returns:
        The first component of ``wx.ScreenDC.GetTextExtent(text)``.
    """
    # A wx.App instance is created on every call and kept referenced while measuring.
    _app = wx.App()

    info = wx.FontInfo(font.size).FaceName(font.family)
    info = info.Bold() if font.face.bold else info
    info = info.Italic() if font.face.italic else info

    dc = wx.ScreenDC()
    dc.SetFont(wx.Font(info))
    extent = dc.GetTextExtent(text)
    return extent[0]

In [8]:
def get_char_widths_df(
        font_families=ufont.FONT_FAMILIES,
        font_sizes=ufont.FONT_SIZES,
        font_faces=ufont.FONT_FACES,
        chars=CHARS
    ):
    """Measure every character in every font family/size/face combination.

    Args:
        font_families: Font family names to iterate over.
        font_sizes: Font sizes to iterate over.
        font_faces: Font faces to iterate over (stored as ``str(face)``).
        chars: Records exposing ``id``, ``char``, ``alphabet`` and ``subset``.

    Returns:
        A DataFrame with one row per (family, size, face, char) combination and
        the columns ``char_id``, ``char``, ``alphabet``, ``subset``,
        ``font_family``, ``font_size``, ``font_face``, ``is_monospaced``, ``width``.
    """
    from itertools import product

    column_names = (
        "char_id", "char", "alphabet", "subset",
        "font_family", "font_size", "font_face",
        "is_monospaced", "width",
    )
    data = {column: [] for column in column_names}

    combinations = product(font_families, font_sizes, font_faces, chars)
    for family, size, face, entry in combinations:
        font = ufont.Font(family, size, face)
        data["font_family"].append(family)
        data["font_size"].append(size)
        data["font_face"].append(str(face))
        data["is_monospaced"].append(ufont.is_monospaced(font))
        data["char_id"].append(entry.id)
        data["char"].append(entry.char)
        data["alphabet"].append(entry.alphabet)
        data["subset"].append(entry.subset)
        data["width"].append(width_in_px(entry.char, font))

    return pd.DataFrame(data)

In [9]:
def get_aes_texts_s(locale):
    """Return the fixed list of aesthetic names as a Series named ``text``.

    The list is only provided for the ``"en_US"`` locale; any other locale
    yields an empty string Series.
    """
    if locale != "en_US":
        return pd.Series([], dtype=str, name="text")

    aesthetics = [
        "x", "y", "z",
        "color", "fill", "alpha", "shape", "linetype",
        "size", "stacksize", "width", "height",
        "binwidth", "violinwidth", "weight",
        "intercept", "slope", "xintercept", "yintercept",
        "lower", "middle", "upper", "sample",
        "xmin", "xmax", "ymin", "ymax",
        "xend", "yend",
    ]
    return pd.Series(aesthetics, name="text")

def get_pure_texts_s(size, locale, *, max_words_count=5, size_reserve_coeff=2, random_state=42):
    """Generate ``size`` distinct Faker sentences for ``locale``.

    ``size * size_reserve_coeff`` sentences are generated (the word count of the
    n-th one is ``int(max_words_count * n / size) + 1``), duplicates are dropped
    and ``size`` of the remaining ones are sampled.

    Args:
        size: Number of texts to return.
        locale: Faker locale.
        max_words_count: Scales the number of words per sentence.
        size_reserve_coeff: How many times the ``range(size)`` sequence is repeated
            when generating candidates.
        random_state: Seed for both Faker and the pandas sampling.

    Returns:
        A Series of the sampled sentences with the last character of each one
        removed (presumably the trailing period - TODO confirm).
    """
    from faker import Faker

    Faker.seed(random_state)
    fake = Faker(locale=locale)

    sentences = []
    for n in list(range(size)) * size_reserve_coeff:
        words_count = int(max_words_count * n / size) + 1
        sentences.append(fake.sentence(nb_words=words_count, variable_nb_words=False))

    candidates_df = pd.DataFrame({"text": sentences}).drop_duplicates(subset="text")
    result_df = candidates_df.sample(size, random_state=random_state).reset_index(drop=True)

    return result_df.text.str[:-1]

def get_name_texts_s(size, locale, *, random_state=42):
    """Return a Series named ``text`` with ``size`` values of ``Faker.name()`` for ``locale``.

    Faker is seeded with ``random_state`` before generation.
    """
    from faker import Faker

    Faker.seed(random_state)
    fake = Faker(locale=locale)

    names = []
    for _ in range(size):
        names.append(fake.name())
    return pd.Series(names, dtype=str, name="text")

def get_date_texts_s(size, locale, *, us_only=True, random_state=42):
    """Return a Series named ``text`` with ``size`` values of ``Faker.date()``.

    The Faker instance is created without a locale. When ``us_only`` is true and
    ``locale`` is not ``"en_US"`` the result is empty.
    """
    from faker import Faker

    Faker.seed(random_state)
    fake = Faker()
    count = 0 if (us_only and locale != "en_US") else size

    dates = [fake.date() for _ in range(count)]
    return pd.Series(dates, dtype=str, name="text")

def get_latlon_texts_s(size, locale, *, us_only=True, random_state=42):
    """Return ``size`` latitudes followed by ``size`` longitudes as strings.

    The Faker instance is created without a locale. When ``us_only`` is true and
    ``locale`` is not ``"en_US"`` the result is empty. The two parts are
    concatenated without resetting the index.
    """
    from faker import Faker

    Faker.seed(random_state)
    fake = Faker()
    count = 0 if (us_only and locale != "en_US") else size

    # All latitudes are drawn before any longitude.
    latitudes = pd.Series([fake.latitude() for _ in range(count)], dtype=str, name="text").astype(str)
    longitudes = pd.Series([fake.longitude() for _ in range(count)], dtype=str, name="text").astype(str)

    return pd.concat([latitudes, longitudes])

def get_fake_texts_s(size, locale, *, random_state=42):
    """Concatenate name, date and latitude/longitude texts into one Series.

    ``size`` names and ``size`` dates are requested; the lat/lon generator is
    called with ``round(size / 2)``. The resulting index is renumbered.
    """
    names = get_name_texts_s(size, locale, random_state=random_state)
    dates = get_date_texts_s(size, locale, random_state=random_state)
    latlons = get_latlon_texts_s(round(size / 2), locale, random_state=random_state)

    return pd.concat([names, dates, latlons], ignore_index=True)

def get_small_int_texts_s(limit, *, step=1):
    """Return the integers ``-limit, -limit + step, ..., <= limit`` as a string Series named ``text``."""
    values = range(-limit, limit + 1, step)
    return pd.Series(list(map(str, values)), dtype=str, name="text")

def get_random_int_texts_s(size, *, limit=1_000_000, step=1, random_state=42):
    return pd.DataFrame({"text": list(range(-limit, limit + 1, step))}).sample(size, random_state=random_state).text.astype(str)

def get_small_float_texts_s(size=201, *, limit=1):
    """Return ``size`` evenly spaced values of ``[-limit, limit]`` as a string Series named ``text``."""
    import numpy as np

    grid = np.linspace(-limit, limit, size)
    texts = list(map(str, grid))
    return pd.Series(texts, dtype=str, name="text")

def get_random_float_texts_s(size, *, limit=1_000, random_state=42):
    import numpy as np

    np.random.seed(random_state)

    return pd.DataFrame({"text": 2 * limit * np.random.random_sample(size) - limit}).text.astype(str)

def get_number_texts_s(small_size, big_size, *, random_state=42):
    """Concatenate small and random integer/float texts into one renumbered Series.

    Args:
        small_size: Limit for the small integers; the small floats use
            ``small_size * 2 + 1`` grid points.
        big_size: Number of random integers and of random floats.
        random_state: Seed forwarded to both random generators.
    """
    parts = [
        get_small_int_texts_s(small_size),
        get_small_float_texts_s(small_size * 2 + 1),
        get_random_int_texts_s(big_size, random_state=random_state),
        get_random_float_texts_s(big_size, random_state=random_state),
    ]
    return pd.concat(parts, ignore_index=True)

In [10]:
def get_texts_df(
        font_families=ufont.FONT_FAMILIES,
        font_sizes=ufont.FONT_SIZES,
        font_faces=ufont.FONT_FACES,
        locales=LOCALES,
        texts_bunch_size=100,
        fake_bunch_size=10,
        numbers_small_bunch_size=10,
        numbers_big_bunch_size=20,
        pure_texts_only=False,
        aes_texts=True,
        max_words_count=5,
        size_reserve_coeff=2,
        random_state=42
    ):
    """Generate texts for every locale/font combination and measure their widths.

    Args:
        font_families: Font family names to iterate over.
        font_sizes: Font sizes to iterate over.
        font_faces: Font faces to iterate over (stored as ``str(face)``).
        locales: Mapping of locale -> alphabet label.
        texts_bunch_size: Number of sentences requested from ``get_pure_texts_s``.
        fake_bunch_size: Size passed to ``get_fake_texts_s``.
        numbers_small_bunch_size: ``small_size`` passed to ``get_number_texts_s``.
        numbers_big_bunch_size: ``big_size`` passed to ``get_number_texts_s``.
        pure_texts_only: When true, only the sentences are used (fake, number
            and aesthetic texts are all skipped).
        aes_texts: When true (and ``pure_texts_only`` is false), the texts of
            ``get_aes_texts_s`` are appended.
        max_words_count: Forwarded to ``get_pure_texts_s``.
        size_reserve_coeff: Forwarded to ``get_pure_texts_s``.
        random_state: Seed for sentences and fake texts; number texts use
            ``random_state + i`` where ``i`` is the position of the locale.

    Returns:
        A DataFrame with the columns ``text``, ``width``, ``alphabet``,
        ``locale``, ``font_family``, ``font_size``, ``font_face``,
        ``symbols_count`` and ``is_monospaced``. If there is no locale/font
        combination at all, an empty frame with only the first seven columns
        is returned.
    """
    from itertools import product

    def get_local_texts_df(locale, font_family, font_size, font_face, number_random_state):
        font = ufont.Font(font_family, font_size, font_face)
        result_s = get_pure_texts_s(
            texts_bunch_size,
            locale,
            max_words_count=max_words_count,
            # Previously this argument was accepted by get_texts_df but never forwarded.
            size_reserve_coeff=size_reserve_coeff,
            random_state=random_state,
        )
        if not pure_texts_only:
            result_s = pd.concat([
                result_s,
                get_fake_texts_s(fake_bunch_size, locale, random_state=random_state),
                get_number_texts_s(numbers_small_bunch_size, numbers_big_bunch_size, random_state=number_random_state),
            ], ignore_index=True)
            if aes_texts:
                result_s = pd.concat([result_s, get_aes_texts_s(locale)], ignore_index=True)
        return result_s.to_frame().assign(
            symbols_count=result_s.str.len(),
            width=result_s.apply(lambda s: width_in_px(s, font))
        ).sort_values(["symbols_count", "width", "text"]).assign(
            alphabet=locales[locale],
            locale=locale,
            font_family=font_family,
            font_size=font_size,
            font_face=str(font_face),
            # NOTE(review): get_char_widths_df calls ufont.is_monospaced with a Font
            # object, here it receives the family name - confirm both are supported.
            is_monospaced=ufont.is_monospaced(font_family)
        )

    random_states = {
        locale: random_state + i
        for i, locale in enumerate(locales.keys())
    }

    # The empty seed frame fixes the order of the leading columns. All frames are
    # collected first and concatenated once instead of re-copying the growing
    # result on every iteration.
    frames = [pd.DataFrame(columns=["text", "width", "alphabet", "locale", "font_family", "font_size", "font_face"])]
    for (locale, font_family, font_size, font_face) in product(locales.keys(), font_families, font_sizes, font_faces):
        frames.append(
            get_local_texts_df(locale, font_family, font_size, font_face, random_states[locale])
        )
    df = pd.concat(frames, ignore_index=True)

    # The column is absent when no combination was processed.
    if "symbols_count" in df.columns:
        df["symbols_count"] = df["symbols_count"].astype(int)
    return df.reset_index(drop=True)

In [11]:
# Measure every character of CHARS in all default font family/size/face combinations.
char_widths_df = get_char_widths_df()
print(char_widths_df.shape)
char_widths_df

(237312, 9)


Unnamed: 0,char_id,char,alphabet,subset,font_family,font_size,font_face,is_monospaced,width
0,32,,latin,symbols,Courier,9,normal,True,8
1,33,!,latin,symbols,Courier,9,normal,True,8
2,34,"""",latin,symbols,Courier,9,normal,True,8
3,35,#,latin,symbols,Courier,9,normal,True,8
4,36,$,latin,symbols,Courier,9,normal,True,8
...,...,...,...,...,...,...,...,...,...
237307,1323,ԫ,cyrillic,supplement,Rockwell,20,bold+italic,False,27
237308,1324,Ԭ,cyrillic,supplement,Rockwell,20,bold+italic,False,27
237309,1325,ԭ,cyrillic,supplement,Rockwell,20,bold+italic,False,27
237310,1326,Ԯ,cyrillic,supplement,Rockwell,20,bold+italic,False,27


In [12]:
# Persist the per-character widths (without the index column) to DATA_DIR.
char_widths_df.to_csv("{0}/char_widths.csv".format(DATA_DIR), index=False)

In [13]:
# Generate the dataset that is saved as control.csv: sentences scaled by
# max_words_count=4 plus fake, number and aesthetic texts, using the default
# fonts, locales and random_state.
control_df = get_texts_df(
    texts_bunch_size=100,
    fake_bunch_size=10,
    numbers_small_bunch_size=10,
    numbers_big_bunch_size=20,
    pure_texts_only=False,
    aes_texts=True,
    max_words_count=4
)
print(control_df.shape)
control_df

(290592, 9)


Unnamed: 0,text,width,alphabet,locale,font_family,font_size,font_face,symbols_count,is_monospaced
0,0,8,latin,en_US,Courier,9,normal,1,True
1,1,8,latin,en_US,Courier,9,normal,1,True
2,2,8,latin,en_US,Courier,9,normal,1,True
3,3,8,latin,en_US,Courier,9,normal,1,True
4,4,8,latin,en_US,Courier,9,normal,1,True
...,...,...,...,...,...,...,...,...,...
290587,Дыхание означать горький выбирать,831,cyrillic,ru_RU,Rockwell,20,bold+italic,33,False
290588,Правление рот человечек мелькнуть,831,cyrillic,ru_RU,Rockwell,20,bold+italic,33,False
290589,Металл назначить ложиться подземный,885,cyrillic,ru_RU,Rockwell,20,bold+italic,35,False
290590,Металл поставить провинция разводить,912,cyrillic,ru_RU,Rockwell,20,bold+italic,36,False


In [14]:
# Persist the control dataset (without the index column) to DATA_DIR.
control_df.to_csv("{0}/control.csv".format(DATA_DIR), index=False)

In [15]:
# Generate the dataset that is saved as texts.csv: fewer sentences per locale
# than the control set but scaled by max_words_count=10, and larger fake/number
# bunches; default fonts, locales and random_state.
texts_df = get_texts_df(
    texts_bunch_size=50,
    fake_bunch_size=20,
    numbers_small_bunch_size=20,
    numbers_big_bunch_size=50,
    pure_texts_only=False,
    aes_texts=True,
    max_words_count=10
)
print(texts_df.shape)
texts_df

(382752, 9)


Unnamed: 0,text,width,alphabet,locale,font_family,font_size,font_face,symbols_count,is_monospaced
0,0,8,latin,en_US,Courier,9,normal,1,True
1,1,8,latin,en_US,Courier,9,normal,1,True
2,2,8,latin,en_US,Courier,9,normal,1,True
3,3,8,latin,en_US,Courier,9,normal,1,True
4,4,8,latin,en_US,Courier,9,normal,1,True
...,...,...,...,...,...,...,...,...,...
382747,Спалить хотеть устройство магазин приятель про...,1892,cyrillic,ru_RU,Rockwell,20,bold+italic,76,False
382748,Интернет вперед зима мелькнуть стакан крыса ср...,1980,cyrillic,ru_RU,Rockwell,20,bold+italic,80,False
382749,Сбросить витрина холодно настать тюрьма металл...,2000,cyrillic,ru_RU,Rockwell,20,bold+italic,80,False
382750,Рот человечек мелькнуть единый набор вариант в...,2007,cyrillic,ru_RU,Rockwell,20,bold+italic,81,False


In [16]:
# Persist the texts dataset (without the index column) to DATA_DIR.
texts_df.to_csv("{0}/texts.csv".format(DATA_DIR), index=False)