In [1]:
!pip install -U wxPython



In [2]:
!pip install -U Faker



In [3]:
import os
import string

import pandas as pd
import wx

import os; import sys; sys.path.append(os.path.join(sys.path[0], ".."))
from util import font as ufont

In [4]:
DATA_DIR = "../data"
FULL_DIR = "{0}/full".format(DATA_DIR)
CROPPED_DIR = "{0}/cropped".format(DATA_DIR)

In [5]:
def prepare_chars(alphabet, *char_lists):
    from collections import namedtuple

    Char = namedtuple("Char", ["id", "char", "alphabet"])

    return sorted([
        Char(char_id, char, alphabet)
        for (char_id, char) in sum(char_lists, start=[]) if char.isprintable()
    ], key=lambda t: t[0])

def drop_duplicates_from_chars(chars):
    return [char for char_id, char in {c.id: c for c in chars}.items()]

SYMBOLS = [(ord(c), c) for c in string.digits + string.punctuation + " "]

BASIC_LATIN = [(i, chr(i)) for i in range(ord(u'\u0020'), ord(u'\u007f') + 1)]
LATIN_SUPPLEMENT = [(i, chr(i)) for i in range(ord(u'\u00a0'), ord(u'\u00ff') + 1)]
LATIN_EXTENDED_A = [(i, chr(i)) for i in range(ord(u'\u0100'), ord(u'\u017f') + 1)]
LATIN_EXTENDED_B = [(i, chr(i)) for i in range(ord(u'\u0180'), ord(u'\u024f') + 1)]

BASIC_CYRILLIC = [(i, chr(i)) for i in range(ord(u'\u0410'), ord(u'\u044f') + 1)]
FULL_CYRILLIC = [(i, chr(i)) for i in range(ord(u'\u0400'), ord(u'\u04ff') + 1) if i not in [1155, 1156, 1157, 1158, 1159]]
CYRILLIC_SUPPLEMENTARY = [(i, chr(i)) for i in range(ord(u'\u0500'), ord(u'\u052f') + 1)]

LATIN = prepare_chars("latin", SYMBOLS, BASIC_LATIN, LATIN_SUPPLEMENT, LATIN_EXTENDED_A, LATIN_EXTENDED_B)
CYRILLIC = prepare_chars("cyrillic", BASIC_CYRILLIC, FULL_CYRILLIC, CYRILLIC_SUPPLEMENTARY)
CHARS = drop_duplicates_from_chars(LATIN + CYRILLIC)
print(len(CHARS))
CHARS[:5]

824


[Char(id=32, char=' ', alphabet='latin'),
 Char(id=33, char='!', alphabet='latin'),
 Char(id=34, char='"', alphabet='latin'),
 Char(id=35, char='#', alphabet='latin'),
 Char(id=36, char='$', alphabet='latin')]

In [6]:
LOCALES = {
    "en_US": "latin",
    "es_ES": "latin",
    "fr_FR": "latin",
    "pt_PT": "latin",
    "ru_RU": "cyrillic",
}

In [7]:
def width_in_px(text, font=ufont.BASIC_FONT):
    _ = wx.App()

    font_info = wx.FontInfo(font.size).FaceName(font.family)
    if font.face.bold:
        font_info = font_info.Bold()
    if font.face.italic:
        font_info = font_info.Italic()
    wx_font = wx.Font(font_info)

    screen_dc = wx.ScreenDC()
    screen_dc.SetFont(wx_font)
    size = screen_dc.GetTextExtent(text)

    return size[0]

In [8]:
def get_char_widths_df(
        font_families=ufont.FONT_FAMILIES,
        font_sizes=ufont.FONT_SIZES,
        font_faces=ufont.FONT_FACES,
        chars=CHARS
    ):
    from itertools import product

    data = {
        "char_id": [],
        "char": [],
        "alphabet": [],
        "font_family": [],
        "font_size": [],
        "font_face": [],
        "is_monospaced": [],
        "width": [],
    }
    for (font_family, font_size, font_face, char) in product(font_families, font_sizes, font_faces, chars):
        font = ufont.Font(font_family, font_size, font_face)
        data["font_family"] += [font_family]
        data["font_size"] += [font_size]
        data["font_face"] += [str(font_face)]
        data["is_monospaced"] += [ufont.is_monospaced(font)]
        data["char_id"] += [char.id]
        data["char"] += [char.char]
        data["alphabet"] += [char.alphabet]
        data["width"] += [width_in_px(char.char, font)]

    return pd.DataFrame(data)

In [9]:
def get_texts_df(
        font_families=ufont.FONT_FAMILIES,
        font_sizes=ufont.FONT_SIZES,
        font_faces=ufont.FONT_FACES,
        locales=LOCALES,
        bunch_size=1000,
        max_words_count=5,
        size_reserve_coeff=2,
        random_state=42
    ):
    from itertools import product
    from faker import Faker

    def get_local_texts_df(locale, font_family, font_size, font_face):
        Faker.seed(random_state)
        fake = Faker(locale=locale)
        font = ufont.Font(font_family, font_size, font_face)
        result_df = pd.DataFrame({
            "text": [
                fake.sentence(nb_words=(int(max_words_count * n / bunch_size) + 1), variable_nb_words=False)
                for n in list(range(bunch_size)) * size_reserve_coeff
            ]
        }).drop_duplicates(subset="text").sample(bunch_size, random_state=random_state)
        result_df.text = result_df.text.str[:-1]
        return result_df.assign(
            symbols_count=result_df.text.str.len(),
            width=result_df.text.apply(lambda s: width_in_px(s, font))
        ).sort_values("symbols_count").assign(
            alphabet=lambda r: locales[locale],
            font_family=lambda r: font_family,
            font_size=lambda r: font_size,
            font_face=lambda r: str(font_face),
            is_monospaced=lambda r: ufont.is_monospaced(font_family)
        )

    df = pd.DataFrame(columns=["text", "width", "alphabet", "font_family", "font_size", "font_face"])
    for (locale, font_family, font_size, font_face) in product(locales, font_families, font_sizes, font_faces):
        df = pd.concat([df, get_local_texts_df(locale, font_family=font_family, \
                                                 font_size=font_size, font_face=font_face)])

    return df.reset_index(drop=True)

In [10]:
def crop_df(df, proportion=.1, random_state=42):
    import numpy as np

    features = ["alphabet", "font_family", "font_size", "font_face"]
    n = df.groupby(features).count().iloc[:, 0].min()
    result_size = max(1, int(np.round(n * proportion)))

    return df.groupby(features).sample(n=result_size, random_state=random_state).reset_index(drop=True)

In [11]:
if not os.path.exists(FULL_DIR):
    os.makedirs(FULL_DIR)
if not os.path.exists(CROPPED_DIR):
    os.makedirs(CROPPED_DIR)

In [12]:
char_widths_df = get_char_widths_df()
print(char_widths_df.shape)
char_widths_df.head()

(237312, 8)


Unnamed: 0,char_id,char,alphabet,font_family,font_size,font_face,is_monospaced,width
0,32,,latin,Courier,9,normal,True,8
1,33,!,latin,Courier,9,normal,True,8
2,34,"""",latin,Courier,9,normal,True,8
3,35,#,latin,Courier,9,normal,True,8
4,36,$,latin,Courier,9,normal,True,8


In [13]:
char_widths_df.to_csv("{0}/char_widths.csv".format(FULL_DIR), index=False)

In [14]:
char_widths_df.to_csv("{0}/char_widths.csv".format(CROPPED_DIR), index=False)

In [15]:
control_df = get_texts_df(max_words_count=5)
print(control_df.shape)
control_df.head()

(1440000, 8)


Unnamed: 0,text,width,alphabet,font_family,font_size,font_face,symbols_count,is_monospaced
0,I,8,latin,Courier,9,normal,1.0,True
1,A,8,latin,Courier,9,normal,1.0,True
2,On,16,latin,Courier,9,normal,2.0,True
3,Or,16,latin,Courier,9,normal,2.0,True
4,He,16,latin,Courier,9,normal,2.0,True


In [16]:
control_df.to_csv("{0}/control.csv".format(FULL_DIR), index=False)

In [17]:
crop_df(control_df, proportion=.01).to_csv("{0}/control.csv".format(CROPPED_DIR), index=False)

In [18]:
texts_df = get_texts_df(bunch_size=500, max_words_count=50)
print(texts_df.shape)
texts_df.head()

(720000, 8)


Unnamed: 0,text,width,alphabet,font_family,font_size,font_face,symbols_count,is_monospaced
0,Dog,24,latin,Courier,9,normal,3.0,True
1,Ago,24,latin,Courier,9,normal,3.0,True
2,Move,32,latin,Courier,9,normal,4.0,True
3,Blue,32,latin,Courier,9,normal,4.0,True
4,Push,32,latin,Courier,9,normal,4.0,True


In [19]:
texts_df.to_csv("{0}/texts.csv".format(FULL_DIR), index=False)

In [20]:
crop_df(texts_df, proportion=.05).to_csv("{0}/texts.csv".format(CROPPED_DIR), index=False)