In [1]:
!pip install -U wxPython



In [2]:
!pip install -U Faker



In [3]:
import os
import string

import pandas as pd
import wx

import os; import sys; sys.path.append(os.path.join(sys.path[0], ".."))
from util import font as ufont

In [4]:
DATA_DIR = "../data"
FULL_DIR = "{0}/full".format(DATA_DIR)
CROPPED_DIR = "{0}/cropped".format(DATA_DIR)
SHORT_DIR = "{0}/short".format(DATA_DIR)

In [5]:
def prepare_chars(alphabet, *char_lists):
    """Build a sorted list of printable ``Char`` records for one alphabet.

    Parameters
    ----------
    alphabet : str
        Label stored in the ``alphabet`` field of every record.
    *char_lists : iterables of ``(char_id, char)`` pairs
        Any number of collections; they are concatenated in the given order.

    Returns
    -------
    list of Char
        ``Char(id, char, alphabet)`` namedtuples for every pair whose ``char``
        passes ``str.isprintable()``, sorted by ``id``. Duplicate ids are kept.
    """
    from collections import namedtuple
    from itertools import chain

    Char = namedtuple("Char", ["id", "char", "alphabet"])

    # chain.from_iterable concatenates in linear time and accepts any iterable;
    # sum(lists, start=[]) re-copies the accumulated list at every step.
    printable = (
        Char(char_id, char, alphabet)
        for (char_id, char) in chain.from_iterable(char_lists)
        if char.isprintable()
    )
    # sorted() is stable, so records sharing an id keep their input order.
    return sorted(printable, key=lambda record: record.id)

def drop_duplicates_from_chars(chars):
    """Collapse records that share an ``id`` into a single entry.

    Records are keyed by their ``id`` attribute in a dict, so every id keeps the
    position of its first occurrence while the stored record is the last one seen.
    Returns the surviving records as a list.
    """
    record_by_id = {}
    for record in chars:
        record_by_id[record.id] = record
    return list(record_by_id.values())

def _code_points(first, last, excluded=()):
    """Return ``(code_point, character)`` pairs for ``first..last`` inclusive, skipping ``excluded``."""
    return [(i, chr(i)) for i in range(first, last + 1) if i not in excluded]

# ASCII digits, ASCII punctuation and the space character.
SYMBOLS = [(ord(c), c) for c in string.digits + string.punctuation + " "]

# Latin code point ranges.
BASIC_LATIN = _code_points(0x0020, 0x007F)
LATIN_SUPPLEMENT = _code_points(0x00A0, 0x00FF)
LATIN_EXTENDED_A = _code_points(0x0100, 0x017F)
LATIN_EXTENDED_B = _code_points(0x0180, 0x024F)

# Cyrillic code point ranges.
BASIC_CYRILLIC = _code_points(0x0410, 0x044F)
# 1155-1159 are U+0483..U+0487 (combining marks in the Unicode charts);
# presumably skipped because they have no standalone width - TODO confirm.
FULL_CYRILLIC = _code_points(0x0400, 0x04FF, excluded=(1155, 1156, 1157, 1158, 1159))
CYRILLIC_SUPPLEMENTARY = _code_points(0x0500, 0x052F)

# The ranges overlap (e.g. SYMBOLS and BASIC_LATIN), so duplicates are removed after merging.
LATIN = prepare_chars("latin", SYMBOLS, BASIC_LATIN, LATIN_SUPPLEMENT, LATIN_EXTENDED_A, LATIN_EXTENDED_B)
CYRILLIC = prepare_chars("cyrillic", BASIC_CYRILLIC, FULL_CYRILLIC, CYRILLIC_SUPPLEMENTARY)
CHARS = drop_duplicates_from_chars(LATIN + CYRILLIC)
print(len(CHARS))
CHARS[:5]

824


[Char(id=32, char=' ', alphabet='latin'),
 Char(id=33, char='!', alphabet='latin'),
 Char(id=34, char='"', alphabet='latin'),
 Char(id=35, char='#', alphabet='latin'),
 Char(id=36, char='$', alphabet='latin')]

In [6]:
# Locale code -> alphabet label. get_texts_df passes each key to Faker as the locale
# and writes the mapped value into the "alphabet" column.
LOCALES = dict(
    en_US="latin",
    es_ES="latin",
    fr_FR="latin",
    pt_PT="latin",
    ru_RU="cyrillic",
)

In [7]:
def _get_wx_app():
    """Return the current ``wx.App``, creating one on first use."""
    app = wx.GetApp()
    if app is None:
        app = wx.App()
        # Keep a strong reference so the application object outlives this call.
        _get_wx_app.app = app
    return app


def width_in_px(text, font=ufont.BASIC_FONT):
    """Measure the rendered width of ``text`` in pixels.

    Parameters
    ----------
    text : str
        String to measure.
    font : font description, default ``ufont.BASIC_FONT``
        Object exposing ``size``, ``family``, ``face.bold`` and ``face.italic``.

    Returns
    -------
    The first component (width) of ``wx.ScreenDC.GetTextExtent(text)``.
    """
    # wx needs an application object before fonts and device contexts can be used.
    # It is created once and reused: this function is called for every text/font pair.
    _get_wx_app()

    font_info = wx.FontInfo(font.size).FaceName(font.family)
    if font.face.bold:
        font_info = font_info.Bold()
    if font.face.italic:
        font_info = font_info.Italic()
    wx_font = wx.Font(font_info)

    screen_dc = wx.ScreenDC()
    screen_dc.SetFont(wx_font)
    size = screen_dc.GetTextExtent(text)

    return size[0]

In [8]:
def get_char_widths_df(
        font_families=ufont.FONT_FAMILIES,
        font_sizes=ufont.FONT_SIZES,
        font_faces=ufont.FONT_FACES,
        chars=CHARS
    ):
    """Measure every character for every font family/size/face combination.

    Parameters
    ----------
    font_families, font_sizes, font_faces : iterables
        Values combined with ``itertools.product`` into ``ufont.Font`` objects.
    chars : iterable of records with ``id``, ``char`` and ``alphabet`` attributes
        Characters to measure.

    Returns
    -------
    pandas.DataFrame
        One row per (font, character) with columns ``char_id``, ``char``,
        ``alphabet``, ``font_family``, ``font_size``, ``font_face`` (as ``str``),
        ``is_monospaced`` and ``width``. Rows are ordered font by font, with the
        characters in their given order inside each font.
    """
    from itertools import product

    columns = [
        "char_id", "char", "alphabet",
        "font_family", "font_size", "font_face",
        "is_monospaced", "width",
    ]
    data = {column: [] for column in columns}

    # Materialize once: the characters are walked again for every font.
    chars = tuple(chars)
    for (font_family, font_size, font_face) in product(font_families, font_sizes, font_faces):
        # Everything below depends on the font only, so compute it once per font
        # instead of once per character.
        font = ufont.Font(font_family, font_size, font_face)
        face_label = str(font_face)
        monospaced = ufont.is_monospaced(font)
        for char in chars:
            data["font_family"].append(font_family)
            data["font_size"].append(font_size)
            data["font_face"].append(face_label)
            data["is_monospaced"].append(monospaced)
            data["char_id"].append(char.id)
            data["char"].append(char.char)
            data["alphabet"].append(char.alphabet)
            data["width"].append(width_in_px(char.char, font))

    return pd.DataFrame(data)

In [9]:
def get_aes_texts_s():
    """Return the fixed list of aesthetic names as a Series named ``"text"``."""
    aes_names = (
        # positional
        "x", "y", "z",
        # appearance
        "color", "fill", "alpha", "shape", "linetype", "size", "stacksize",
        # dimensions and weights
        "width", "height", "binwidth", "violinwidth", "weight",
        # lines
        "intercept", "slope", "xintercept", "yintercept",
        # statistics
        "lower", "middle", "upper", "sample",
        # ranges and segment ends
        "xmin", "xmax", "ymin", "ymax", "xend", "yend",
    )
    return pd.Series(list(aes_names), name="text")

def get_pure_texts_s(size, locale, *, max_words_count=5, size_reserve_coeff=2, random_state=42):
    """Generate ``size`` distinct Faker sentences for ``locale``.

    ``size * size_reserve_coeff`` sentences are requested, with the word count
    growing from 1 to ``max_words_count`` across each pass. Duplicates are dropped
    and ``size`` of the remaining sentences are sampled with ``random_state``
    (``DataFrame.sample`` raises if fewer than ``size`` distinct sentences remain).

    Returns a Series named ``"text"`` with a fresh 0..size-1 index; the last
    character of every sentence is removed.
    """
    from faker import Faker

    Faker.seed(random_state)
    fake = Faker(locale=locale)

    sentences = []
    for _ in range(size_reserve_coeff):
        for n in range(size):
            words_count = int(max_words_count * n / size) + 1
            sentences.append(fake.sentence(nb_words=words_count, variable_nb_words=False))

    unique_df = pd.DataFrame({"text": sentences}).drop_duplicates(subset="text")
    sampled_df = unique_df.sample(size, random_state=random_state).reset_index(drop=True)

    # Drops the final character of each sentence - presumably Faker's trailing period; confirm.
    return sampled_df.text.str[:-1]

def get_name_texts_s(size, *, random_state=42):
    """Return ``size`` Faker names (default Faker locale) as a Series named ``"text"``."""
    from faker import Faker

    Faker.seed(random_state)
    generator = Faker()

    names = []
    for _ in range(size):
        names.append(generator.name())
    return pd.Series(names, name="text")

def get_date_texts_s(size, *, random_state=42):
    """Return ``size`` Faker dates (default Faker locale) as a Series named ``"text"``."""
    from faker import Faker

    Faker.seed(random_state)
    generator = Faker()

    dates = []
    for _ in range(size):
        dates.append(generator.date())
    return pd.Series(dates, name="text")

def get_latlon_texts_s(size, *, random_state=42):
    """Return ``size`` latitudes followed by ``size`` longitudes, converted to ``str``.

    The two halves are concatenated without re-indexing, so the index runs
    0..size-1 twice. The Series is named ``"text"``.
    """
    from faker import Faker

    Faker.seed(random_state)
    generator = Faker()

    # All latitudes are drawn before any longitude, which fixes the seeded sequence.
    latitudes = pd.Series([generator.latitude() for _ in range(size)], name="text")
    latitude_texts = latitudes.astype(str)
    longitudes = pd.Series([generator.longitude() for _ in range(size)], name="text")
    longitude_texts = longitudes.astype(str)

    return pd.concat([latitude_texts, longitude_texts])

def get_fake_texts_s(size, *, random_state=42):
    """Concatenate fake names, dates and coordinates into one re-indexed Series.

    ``size`` names and ``size`` dates are generated; the coordinate helper is asked
    for ``round(size / 2)`` items and returns that many latitudes plus as many longitudes.
    """
    names = get_name_texts_s(size, random_state=random_state)
    dates = get_date_texts_s(size, random_state=random_state)
    half_size = round(size / 2)
    coordinates = get_latlon_texts_s(half_size, random_state=random_state)

    return pd.concat([names, dates, coordinates], ignore_index=True)

def get_small_int_texts_s(limit, *, step=1):
    """Return the integers ``-limit..limit`` (inclusive, stride ``step``) as strings in a Series named ``"text"``."""
    values = range(-limit, limit + 1, step)
    return pd.Series(list(map(str, values)), name="text")

def get_random_int_texts_s(size, *, limit=1_000_000, step=1, random_state=42):
    """Sample ``size`` distinct integers from ``-limit..limit`` (stride ``step``) as strings.

    The returned Series is named ``"text"`` and keeps, as its index, each value's
    position in the candidate range (the index is not reset).
    """
    candidates_df = pd.DataFrame({"text": list(range(-limit, limit + 1, step))})
    picked_df = candidates_df.sample(size, random_state=random_state)
    return picked_df.text.astype(str)

def get_small_float_texts_s(size=201, *, limit=1):
    """Return ``size`` evenly spaced floats from ``-limit`` to ``limit`` as strings in a Series named ``"text"``."""
    import numpy as np

    grid = np.linspace(-limit, limit, size)
    texts = list(map(str, grid))
    return pd.Series(texts, name="text")

def get_random_float_texts_s(size, *, limit=1_000, random_state=42):
    """Draw ``size`` uniform floats from ``[-limit, limit)`` and return them as strings.

    Parameters
    ----------
    size : int
        Number of values to draw.
    limit : number, default 1000
        Half-width of the sampling interval.
    random_state : int, default 42
        Seed for the draw.

    Returns
    -------
    pandas.Series
        Series named ``"text"`` with a 0..size-1 index.
    """
    import numpy as np

    # A private RandomState seeded with random_state yields the same numbers as
    # np.random.seed(random_state) + np.random.random_sample, without clobbering
    # the global NumPy RNG that other code may rely on.
    rng = np.random.RandomState(random_state)
    values = 2 * limit * rng.random_sample(size) - limit

    return pd.DataFrame({"text": values}).text.astype(str)

def get_number_texts_s(small_size, big_size, *, random_state=42):
    """Concatenate small and random integer/float texts into one re-indexed Series.

    Parts, in order: integers ``-small_size..small_size``, ``2 * small_size + 1``
    evenly spaced small floats, ``big_size`` random integers and ``big_size``
    random floats.
    """
    small_ints = get_small_int_texts_s(small_size)
    small_floats = get_small_float_texts_s(small_size * 2 + 1)
    random_ints = get_random_int_texts_s(big_size, random_state=random_state)
    random_floats = get_random_float_texts_s(big_size, random_state=random_state)

    return pd.concat([small_ints, small_floats, random_ints, random_floats], ignore_index=True)

In [10]:
def get_texts_df(
        font_families=ufont.FONT_FAMILIES,
        font_sizes=ufont.FONT_SIZES,
        font_faces=ufont.FONT_FACES,
        locales=LOCALES,
        texts_bunch_size=100,
        fake_bunch_size=10,
        numbers_small_bunch_size=10,
        numbers_big_bunch_size=20,
        pure_texts_only=False,
        aes_texts=True,
        max_words_count=5,
        size_reserve_coeff=2,
        random_state=42
    ):
    """Generate texts per locale and measure their width for every font combination.

    Parameters
    ----------
    font_families, font_sizes, font_faces : iterables
        Combined with ``itertools.product`` into ``ufont.Font`` objects.
    locales : mapping
        Locale code -> alphabet label; keys are passed to ``get_pure_texts_s``.
    texts_bunch_size : int
        Number of sentences per locale.
    fake_bunch_size, numbers_small_bunch_size, numbers_big_bunch_size : int
        Sizes passed to ``get_fake_texts_s`` / ``get_number_texts_s``
        (ignored when ``pure_texts_only`` is true).
    pure_texts_only : bool
        When true, only the sentences are used.
    aes_texts : bool
        When true (and ``pure_texts_only`` is false), ``get_aes_texts_s()`` is appended.
    max_words_count, size_reserve_coeff, random_state
        Forwarded to ``get_pure_texts_s``; ``random_state`` also seeds the other generators.

    Returns
    -------
    pandas.DataFrame
        Columns ``text``, ``width``, ``alphabet``, ``font_family``, ``font_size``,
        ``font_face``, ``symbols_count`` (cast to ``int``) and ``is_monospaced``.
        Within each (locale, font) group rows are sorted by ``symbols_count``,
        ``width``, ``text``.
    """
    from itertools import product

    def get_locale_texts_s(locale):
        # The texts depend only on the locale and the seed, never on the font,
        # so they are generated once per locale and reused for every font.
        result_s = get_pure_texts_s(
            texts_bunch_size,
            locale,
            max_words_count=max_words_count,
            # Previously accepted by get_texts_df but never forwarded.
            size_reserve_coeff=size_reserve_coeff,
            random_state=random_state,
        )
        if not pure_texts_only:
            result_s = pd.concat([
                result_s,
                get_fake_texts_s(fake_bunch_size, random_state=random_state),
                get_number_texts_s(numbers_small_bunch_size, numbers_big_bunch_size, random_state=random_state),
            ], ignore_index=True)
            if aes_texts:
                result_s = pd.concat([result_s, get_aes_texts_s()], ignore_index=True)
        return result_s

    def get_local_texts_df(texts_s, locale, font_family, font_size, font_face):
        font = ufont.Font(font_family, font_size, font_face)
        return texts_s.to_frame().assign(
            symbols_count=texts_s.str.len(),
            width=texts_s.apply(lambda s: width_in_px(s, font))
        ).sort_values(["symbols_count", "width", "text"]).assign(
            alphabet=locales[locale],
            font_family=font_family,
            font_size=font_size,
            font_face=str(font_face),
            # NOTE(review): get_char_widths_df passes the Font object to
            # ufont.is_monospaced, this call passes the family name. Left as
            # written - confirm which form ufont.is_monospaced expects.
            is_monospaced=ufont.is_monospaced(font_family)
        )

    # Materialize once: the font combinations are walked again for every locale.
    font_combos = list(product(font_families, font_sizes, font_faces))

    # The empty template fixes the column order and keeps the result well-formed
    # when there is nothing to measure.
    frames = [pd.DataFrame(columns=[
        "text", "width", "alphabet", "font_family", "font_size", "font_face",
        "symbols_count", "is_monospaced",
    ])]
    for locale in locales:
        texts_s = get_locale_texts_s(locale)
        for (font_family, font_size, font_face) in font_combos:
            frames.append(get_local_texts_df(texts_s, locale, font_family=font_family,
                                             font_size=font_size, font_face=font_face))

    # One concat at the end instead of re-copying the growing frame on every iteration.
    df = pd.concat(frames, ignore_index=True)
    df["symbols_count"] = df["symbols_count"].astype(int)
    return df

In [11]:
def crop_df(df, proportion=.1, random_state=42):
    """Sample the same number of rows from every font/alphabet group.

    Rows are grouped by ``alphabet``, ``font_family``, ``font_size`` and
    ``font_face``. The per-group sample size is ``proportion`` times the smallest
    group count (non-null count of the first remaining column), rounded with
    ``numpy.round`` and never below 1. Returns the sampled rows with a fresh index.
    """
    import numpy as np

    group_keys = ["alphabet", "font_family", "font_size", "font_face"]
    grouped = df.groupby(group_keys)

    smallest_group = grouped.count().iloc[:, 0].min()
    per_group = int(np.round(smallest_group * proportion))
    if per_group < 1:
        per_group = 1

    sampled = grouped.sample(n=per_group, random_state=random_state)
    return sampled.reset_index(drop=True)

In [12]:
# exist_ok=True makes the creation idempotent and removes the check-then-create race.
for output_dir in (FULL_DIR, CROPPED_DIR, SHORT_DIR):
    os.makedirs(output_dir, exist_ok=True)

In [13]:
# Measure every character in CHARS for every default font family/size/face combination.
char_widths_df = get_char_widths_df()
print(char_widths_df.shape)
char_widths_df

(237312, 8)


Unnamed: 0,char_id,char,alphabet,font_family,font_size,font_face,is_monospaced,width
0,32,,latin,Courier,9,normal,True,8
1,33,!,latin,Courier,9,normal,True,8
2,34,"""",latin,Courier,9,normal,True,8
3,35,#,latin,Courier,9,normal,True,8
4,36,$,latin,Courier,9,normal,True,8
...,...,...,...,...,...,...,...,...
237307,1323,ԫ,cyrillic,Rockwell,20,bold+italic,False,27
237308,1324,Ԭ,cyrillic,Rockwell,20,bold+italic,False,27
237309,1325,ԭ,cyrillic,Rockwell,20,bold+italic,False,27
237310,1326,Ԯ,cyrillic,Rockwell,20,bold+italic,False,27


In [14]:
# Full character-width table.
char_widths_df.to_csv(f"{FULL_DIR}/char_widths.csv", index=False)

In [15]:
# The same, uncropped character-width table is also written to the cropped directory.
char_widths_df.to_csv(f"{CROPPED_DIR}/char_widths.csv", index=False)

In [16]:
# The same, uncropped character-width table is also written to the short directory.
char_widths_df.to_csv(f"{SHORT_DIR}/char_widths.csv", index=False)

In [17]:
# Control set: sentences of 1 to 4 words (100 per locale) plus fake names/dates/coordinates,
# numbers and aesthetic names, measured for every locale and font combination.
control_df = get_texts_df(
    texts_bunch_size=100,
    fake_bunch_size=10,
    numbers_small_bunch_size=10,
    numbers_big_bunch_size=20,
    pure_texts_only=False,
    aes_texts=True,
    max_words_count=4
)
print(control_df.shape)
control_df

(347040, 8)


Unnamed: 0,text,width,alphabet,font_family,font_size,font_face,symbols_count,is_monospaced
0,0,8,latin,Courier,9,normal,1,True
1,1,8,latin,Courier,9,normal,1,True
2,2,8,latin,Courier,9,normal,1,True
3,3,8,latin,Courier,9,normal,1,True
4,4,8,latin,Courier,9,normal,1,True
...,...,...,...,...,...,...,...,...
347035,Дыхание означать горький выбирать,831,cyrillic,Rockwell,20,bold+italic,33,False
347036,Правление рот человечек мелькнуть,831,cyrillic,Rockwell,20,bold+italic,33,False
347037,Металл назначить ложиться подземный,885,cyrillic,Rockwell,20,bold+italic,35,False
347038,Металл поставить провинция разводить,912,cyrillic,Rockwell,20,bold+italic,36,False


In [18]:
# Full control set.
control_df.to_csv(f"{FULL_DIR}/control.csv", index=False)

In [19]:
# Keep about 1% of the smallest group size (at least one row) from every
# (alphabet, font_family, font_size, font_face) group of the control set.
cropped_control_df = crop_df(control_df, proportion=.01)
print(cropped_control_df.shape)
cropped_control_df

(1152, 8)


Unnamed: 0,text,width,alphabet,font_family,font_size,font_face,symbols_count,is_monospaced
0,0.5,17,cyrillic,Arial,9,bold,3,False
1,4,7,cyrillic,Arial,9,bold,1,False
2,-125059,46,cyrillic,Arial,9,bold+italic,7,False
3,weight,39,cyrillic,Arial,9,bold+italic,6,False
4,Мягкий развитый,105,cyrillic,Arial,9,italic,15,False
...,...,...,...,...,...,...,...,...
1147,112.791430,181,latin,Verdana,20,bold+italic,10,False
1148,379901,102,latin,Verdana,20,italic,6,False
1149,-3,29,latin,Verdana,20,italic,2,False
1150,xend,66,latin,Verdana,20,normal,4,False


In [20]:
# Cropped control set.
cropped_control_df.to_csv(f"{CROPPED_DIR}/control.csv", index=False)

In [21]:
# Short control set: sentences only (10 per locale, 1 to 4 words), no fake/number/aesthetic texts.
short_control_df = get_texts_df(
    texts_bunch_size=10,
    pure_texts_only=True,
    max_words_count=4
)
print(short_control_df.shape)
short_control_df

(14400, 8)


Unnamed: 0,text,width,alphabet,font_family,font_size,font_face,symbols_count,is_monospaced
0,Brother,56,latin,Courier,9,normal,7,True
1,Clearly,56,latin,Courier,9,normal,7,True
2,Purpose,56,latin,Courier,9,normal,7,True
3,Score each,80,latin,Courier,9,normal,10,True
4,Her need stop,104,latin,Courier,9,normal,13,True
...,...,...,...,...,...,...,...,...
14395,Уронить полоска цвет,500,cyrillic,Rockwell,20,bold+italic,20,False
14396,Сутки головной висеть труп,642,cyrillic,Rockwell,20,bold+italic,26,False
14397,Стакан механический командующий,797,cyrillic,Rockwell,20,bold+italic,31,False
14398,Наткнуться подробность демократия,851,cyrillic,Rockwell,20,bold+italic,33,False


In [22]:
# Short control set.
short_control_df.to_csv(f"{SHORT_DIR}/control.csv", index=False)

In [23]:
# Main text set: sentences of 1 to 10 words (50 per locale) plus larger bunches of
# fake names/dates/coordinates, numbers and aesthetic names.
texts_df = get_texts_df(
    texts_bunch_size=50,
    fake_bunch_size=20,
    numbers_small_bunch_size=20,
    numbers_big_bunch_size=50,
    pure_texts_only=False,
    aes_texts=True,
    max_words_count=10
)
print(texts_df.shape)
texts_df

(462240, 8)


Unnamed: 0,text,width,alphabet,font_family,font_size,font_face,symbols_count,is_monospaced
0,0,8,latin,Courier,9,normal,1,True
1,1,8,latin,Courier,9,normal,1,True
2,2,8,latin,Courier,9,normal,1,True
3,3,8,latin,Courier,9,normal,1,True
4,4,8,latin,Courier,9,normal,1,True
...,...,...,...,...,...,...,...,...
462235,Спалить хотеть устройство магазин приятель про...,1892,cyrillic,Rockwell,20,bold+italic,76,False
462236,Интернет вперед зима мелькнуть стакан крыса ср...,1980,cyrillic,Rockwell,20,bold+italic,80,False
462237,Сбросить витрина холодно настать тюрьма металл...,2000,cyrillic,Rockwell,20,bold+italic,80,False
462238,Рот человечек мелькнуть единый набор вариант в...,2007,cyrillic,Rockwell,20,bold+italic,81,False


In [24]:
# Full text set.
texts_df.to_csv(f"{FULL_DIR}/texts.csv", index=False)

In [25]:
# Keep about 5% of the smallest group size (at least one row) from every
# (alphabet, font_family, font_size, font_face) group of the text set.
cropped_texts_df = crop_df(texts_df, proportion=.05)
print(cropped_texts_df.shape)
cropped_texts_df

(9216, 8)


Unnamed: 0,text,width,alphabet,font_family,font_size,font_face,symbols_count,is_monospaced
0,137.018253,66,cyrillic,Arial,9,bold,10,False
1,-852445,46,cyrillic,Arial,9,bold,7,False
2,Abigail Shaffer,83,cyrillic,Arial,9,bold,15,False
3,7,7,cyrillic,Arial,9,bold,1,False
4,821417,42,cyrillic,Arial,9,bold,6,False
...,...,...,...,...,...,...,...,...
9211,Praesentium fugit aliquam iusto dicta,506,latin,Verdana,20,normal,37,False
9212,Mouth call process water close month parent who,676,latin,Verdana,20,normal,47,False
9213,7,17,latin,Verdana,20,normal,1,False
9214,alpha,73,latin,Verdana,20,normal,5,False


In [26]:
# Cropped text set.
cropped_texts_df.to_csv(f"{CROPPED_DIR}/texts.csv", index=False)

In [27]:
# Short text set: sentences only (20 per locale, 1 to 10 words), no fake/number/aesthetic texts.
short_texts_df = get_texts_df(
    texts_bunch_size=20,
    pure_texts_only=True,
    max_words_count=10
)
print(short_texts_df.shape)
short_texts_df

(28800, 8)


Unnamed: 0,text,width,alphabet,font_family,font_size,font_face,symbols_count,is_monospaced
0,Purpose,56,latin,Courier,9,normal,7,True
1,Out major born guy,144,latin,Courier,9,normal,18,True
2,Claim well two truth,160,latin,Courier,9,normal,20,True
3,Medical current hear,160,latin,Courier,9,normal,20,True
4,Blood personal success,176,latin,Courier,9,normal,22,True
...,...,...,...,...,...,...,...,...
28795,Академик вскакивать потянуться наслаждение кру...,1770,cyrillic,Rockwell,20,bold+italic,70,False
28796,Хлеб налоговый вообще разнообразный чем крутой...,1899,cyrillic,Rockwell,20,bold+italic,77,False
28797,Ход заведение легко торопливый премьера видимо...,1946,cyrillic,Rockwell,20,bold+italic,78,False
28798,Тесно степь ребятишки сбросить потрясти поезд ...,2054,cyrillic,Rockwell,20,bold+italic,82,False


In [28]:
# Short text set.
short_texts_df.to_csv(f"{SHORT_DIR}/texts.csv", index=False)