In [1]:
!pip install -U wxPython



In [2]:
!pip install -U Faker



In [3]:
import os
import string

import pandas as pd
import wx

In [4]:
DATA_DIR = "../data"
FULL_DIR = "{0}/full".format(DATA_DIR)
CROPPED_DIR = "{0}/cropped".format(DATA_DIR)

In [5]:
FONT_FAMILIES = [
    # main
    "Courier",
    "Geneva",
    "Georgia",
    "Helvetica",
    "Lucida Grande",
    "Times New Roman",
    "Verdana",
    # additional
    "Arial", # pretty standard
    "Lucida Console", # monospaced
]

In [6]:
FONT_SIZES = [
    9, 11, 12, 14, 16, 20
]

In [7]:
FONT_FACES = [
    {'bold': False, 'italic': False},
    {'bold': True, 'italic': False},
    {'bold': False, 'italic': True},
    {'bold': True, 'italic': True},
]

In [8]:
SYMBOLS = [(ord(c), c) for c in string.digits + string.punctuation + " "]
BASIC_LATIN_CHARS = \
    [(i, chr(i)) for i in list(range(ord(u'\u0041'), ord(u'\u005a') + 1)) + \
                          list(range(ord(u'\u0061'), ord(u'\u007a') + 1)) \
     if chr(i).isprintable()] + SYMBOLS
RUSSIAN_CHARS = \
    [(i, chr(i)) for i in range(ord(u'\u0410'), ord(u'\u044f') + 1) \
     if chr(i).isprintable()] + SYMBOLS
CHARS = \
    [(p[0], p[1], "basic_latin") for p in BASIC_LATIN_CHARS] + \
    [(p[0], p[1], "russian") for p in RUSSIAN_CHARS]

In [9]:
def size_in_px(text, font_family="Arial", font_size=11, bold=False, italic=False):
    _ = wx.App()

    font_info = wx.FontInfo(font_size).FaceName(font_family)
    if bold:
        font_info = font_info.Bold()
    if italic:
        font_info = font_info.Italic()
    font = wx.Font(font_info)

    screen_dc = wx.ScreenDC()
    screen_dc.SetFont(font)
    size = screen_dc.GetTextExtent(text)

    return size

In [10]:
def get_char_widths_df():
    from itertools import product

    data = {
        "char_id": [],
        "char": [],
        "alphabet": [],
        "font_family": [],
        "font_size": [],
        "font_face": [],
        "width": [],
    }
    for t in product(FONT_FAMILIES, FONT_SIZES, FONT_FACES, CHARS):
        data["font_family"] += [t[0]]
        data["font_size"] += [t[1]]
        data["font_face"] += [
            "bold+italic" if t[2]['bold'] and t[2]['italic'] else
            "bold" if t[2]['bold'] and not t[2]['italic'] else
            "italic" if not t[2]['bold'] and t[2]['italic'] else "normal"
        ]
        data["char_id"] += [t[3][0]]
        data["char"] += [t[3][1]]
        data["alphabet"] += [t[3][2]]
        data["width"] += [size_in_px(t[3][1], t[0], t[1], t[2]['bold'], t[2]['italic'])[0]]

    return pd.DataFrame(data)

In [11]:
def get_local_control_df(alphabet, font_family="Arial", font_size=11, font_face="", size=100, max_words_count=10, random_state=42):
    from faker import Faker

    LOCALES = {
        "basic_latin": "en_US",
        "russian": "ru_RU",
    }
    Faker.seed(random_state)
    fake = Faker(locale=LOCALES[alphabet])
    result_df = pd.DataFrame({"text": [fake.sentence(nb_words=(int(max_words_count * n / size) + 1)) for n in range(size)]})
    result_df = result_df.assign(symbols_count=result_df.text.str.len()).sort_values("symbols_count")
    bold = "bold" in font_face
    italic = "italic" in font_face
    result_df["width"] = result_df.text.apply(lambda s: size_in_px(s, font_family, font_size, bold, italic)[0])

    return result_df.assign(
        alphabet=lambda x: alphabet,
        font_family=lambda x: font_family,
        font_size=lambda x: font_size,
        font_face=lambda x: font_face,
    )

In [12]:
def get_control_df(bunch_size=1000, max_words_count=10, random_state=42):
    from itertools import product

    df = pd.DataFrame(columns=["text", "width", "alphabet", "font_family", "font_size", "font_face"])
    for t in product(list(set([c[2] for c in CHARS])), FONT_FAMILIES, FONT_SIZES, FONT_FACES):
        alphabet = t[0]
        font_family = t[1]
        font_size = t[2]
        font_face = "bold+italic" if t[3]['bold'] and t[3]['italic'] else \
                    "bold" if t[3]['bold'] and not t[3]['italic'] else \
                    "italic" if not t[3]['bold'] and t[3]['italic'] else "normal"

        df = pd.concat([df, get_local_control_df(alphabet, font_family=font_family, \
                                                 font_size=font_size, font_face=font_face, \
                                                 size=bunch_size, max_words_count=max_words_count, \
                                                 random_state=random_state)])

    return df

In [13]:
def crop_df(df, proportion=.1, random_state=42):
    import numpy as np

    features = ["alphabet", "font_family", "font_size", "font_face"]
    n = df.groupby(features).count().iloc[:, 0].min()
    result_size = max(1, int(np.round(n * proportion)))

    return df.groupby(features).sample(n=result_size, random_state=random_state).reset_index(drop=True)

In [14]:
if not os.path.exists(FULL_DIR):
    os.makedirs(FULL_DIR)
if not os.path.exists(CROPPED_DIR):
    os.makedirs(CROPPED_DIR)

In [15]:
char_widths_df = get_char_widths_df()
char_widths_df.head()

Unnamed: 0,char_id,char,alphabet,font_family,font_size,font_face,width
0,65,A,basic_latin,Courier,9,normal,8
1,66,B,basic_latin,Courier,9,normal,8
2,67,C,basic_latin,Courier,9,normal,8
3,68,D,basic_latin,Courier,9,normal,8
4,69,E,basic_latin,Courier,9,normal,8


In [16]:
char_widths_df.to_csv("{0}/char_widths.csv".format(FULL_DIR), index=False)

In [17]:
char_widths_df.to_csv("{0}/char_widths.csv".format(CROPPED_DIR), index=False)

In [18]:
control_df = get_control_df(bunch_size=500, max_words_count=20)
control_df.head()

Unnamed: 0,text,width,alphabet,font_family,font_size,font_face,symbols_count
24,Да.,24,russian,Courier,9,normal,3.0
21,Мимо.,40,russian,Courier,9,normal,5.0
18,Один.,40,russian,Courier,9,normal,5.0
17,Идея.,40,russian,Courier,9,normal,5.0
9,Изба.,40,russian,Courier,9,normal,5.0


In [19]:
control_df.to_csv("{0}/control.csv".format(FULL_DIR), index=False)

In [20]:
crop_df(control_df, proportion=.05).to_csv("{0}/control.csv".format(CROPPED_DIR), index=False)