# Оценка ширины текста

## 0. Подготовка

In [1]:
import numpy as np
import pandas as pd

from lets_plot import *
from lets_plot.mapping import as_discrete
LetsPlot.setup_html()

In [2]:
# Number of width clusters per font configuration; passed as `n_clusters`
# when the SimpleMainModel is constructed further below.
N_CLUSTERS = 3

In [3]:
# Font faces grouped by dataset role. The keys ("train" / "test") are the
# accepted values of the `df_type` argument of get_df(), which keeps only the
# rows whose `font_face` is listed under that key.
FONT_FACES = {
    "train": [
        "Courier",
        "Geneva",
        "Georgia",
        "Helvetica",
        "Lucida Grande",
        "Times New Roman",
        "Verdana",
    ],
    "test": [
        "Arial",
        "Brush Script MT",
        "Lucida Console",
        "Wingdings",
    ],
}
# Fonts offered by DemoPanel: key -> font face name as it appears in the data.
# The key is also used to build the font file name ("../data/fonts/<key>.ttf").
DEMO_FONT_FACES = {
    "courier": "Courier",
    "geneva": "Geneva",
    "georgia": "Georgia",
    "helvetica": "Helvetica",
    "lucida_grande": "Lucida Grande",
    "times": "Times New Roman",
    "verdana": "Verdana",
}
# Integer codes for the `font_version` column; get_df() maps the raw string
# values through this table (an empty string is what a missing value becomes
# after fillna("")).
# NOTE(review): presumably "b" = bold, "i" = italic, "bi" = bold italic -- confirm.
FONT_VERSIONS = {
    "": 1,
    "b": 2,
    "i": 3,
    "bi": 4,
}

In [4]:
def get_df(path, df_type=None, modify=True):
    """Read a CSV dataset and normalise it for the models in this notebook.

    Parameters
    ----------
    path : anything accepted by ``pd.read_csv``
        Location of the CSV file.
    df_type : str or None
        Key of ``FONT_FACES``; when given, only rows whose ``font_face`` is
        listed under that key are kept and the index is renumbered.
    modify : bool
        When true, ``font_version`` is translated through ``FONT_VERSIONS``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with every missing value replaced by ``""``.
    """
    frame = pd.read_csv(path)

    if df_type is not None:
        allowed_faces = FONT_FACES[df_type]
        frame = frame[frame.font_face.isin(allowed_faces)].reset_index(drop=True)

    # Missing values become empty strings so that the regular (unstyled)
    # font version matches the "" key of FONT_VERSIONS.
    frame = frame.fillna("")

    if modify:
        frame["font_version"] = frame.font_version.map(FONT_VERSIONS)

    return frame

def get_cropped_df(path, df_type=None, modify=True, proportion=.1, random_state=42):
    """Load a dataset via ``get_df`` and keep an equal-sized sample per font group.

    Rows are grouped by (alphabet, font_face, font_size, font_version). The
    per-group sample size is ``proportion`` of the smallest group's count
    (taken from the first non-key column), rounded, and never less than one.

    Returns
    -------
    pandas.DataFrame
        The sampled rows with a fresh 0..n-1 index.
    """
    group_cols = ["alphabet", "font_face", "font_size", "font_version"]
    full_df = get_df(path, df_type, modify)

    smallest_group = full_df.groupby(group_cols).count().iloc[:, 0].min()
    rows_per_group = max(1, int(np.round(smallest_group * proportion)))

    sampled_df = full_df.groupby(group_cols).sample(n=rows_per_group, random_state=random_state)
    return sampled_df.reset_index(drop=True)

def train_test_split(df, train_size=.75, random_state=42):
    """Split ``df`` into train/test parts, stratified by font configuration.

    Rows are grouped by (alphabet, font_face, font_size, font_version). From
    every group the same number of rows goes to the train part:
    ``train_size`` times the size of the smallest group, rounded, and at
    least one. All remaining rows form the test part.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the four grouping columns.
    train_size : float
        Fraction of the smallest group that is sampled from every group.
    random_state : int
        Seed forwarded to ``GroupBy.sample``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Train and test frames, each with a fresh 0..n-1 index.
    """
    features = ["alphabet", "font_face", "font_size", "font_version"]

    # The complement below is taken by index label. Renumbering first makes
    # the labels unique, so frames with a duplicated index (e.g. produced by
    # pd.concat) do not lose test rows that share a label with a train row.
    df = df.reset_index(drop=True)

    # size() counts rows per group regardless of missing values and does not
    # require the frame to have any non-key column.
    smallest_group = df.groupby(features).size().min()
    rows_per_group = max(1, int(np.round(smallest_group * train_size)))

    train_df = df.groupby(features).sample(n=rows_per_group, random_state=random_state)
    test_df = df.drop(train_df.index, axis=0)
    return train_df.reset_index(drop=True), test_df.reset_index(drop=True)

def draw_relative_error(model_df, target):
    """Plot the per-character prediction error for every font configuration.

    Side effect: three columns are written into ``model_df`` itself --
    ``target_size`` (length of the ``target`` string), ``absolute_error``
    (``prediction - width``) and ``relative_error`` (absolute error divided
    by ``target_size``).

    Returns
    -------
    A lets-plot figure: for each (alphabet, font_face, font_size,
    font_version) group a point range showing the min / mean / max relative
    error, faceted by alphabet (columns) and font size (rows).
    """
    # Written in place on the caller's frame.
    model_df["target_size"] = model_df[target].str.len()
    model_df["absolute_error"] = model_df.prediction - model_df.width
    model_df["relative_error"] = model_df.absolute_error / model_df.target_size

    font_keys = ["alphabet", "font_face", "font_size", "font_version"]
    summary_df = (
        model_df.groupby(font_keys)
        .relative_error.agg(["min", "max", "mean"])
        .reset_index()
    )

    range_mapping = aes(
        x=as_discrete("font_face", order_by="mean", order=1),
        y="mean",
        ymin="min",
        ymax="max",
        color=as_discrete("font_version"),
    )
    range_layer = geom_pointrange(range_mapping, position="dodge")

    return ggplot(summary_df) + range_layer + facet_grid(x="alphabet", y="font_size")

In [5]:
# Per-character width table, restricted to the "train" font faces.
# It is the input of SimpleMainModel.fit() below.
train_cw_df = get_df("../data/cropped/char_widths.csv", "train")
print(train_cw_df.shape)
train_cw_df.head()

(97944, 7)


Unnamed: 0,char_id,char,alphabet,font_face,font_size,font_version,width
0,65,A,basic_latin,Courier,9,1,9
1,66,B,basic_latin,Courier,9,1,9
2,67,C,basic_latin,Courier,9,1,9
3,68,D,basic_latin,Courier,9,1,9
4,69,E,basic_latin,Courier,9,1,9


In [6]:
# Control texts with their measured widths for the "train" font faces;
# later split into the two parts used to fit and to evaluate the models.
train_control_df = get_df("../data/cropped/control.csv", "train")
print(train_control_df.shape)
train_control_df.head()

(6720, 7)


Unnamed: 0,text,width,alphabet,font_face,font_size,font_version,symbols_count
0,It friend positive wall travel affect for stor...,4860,basic_latin,Courier,9,1,540.0
1,Treatment across federal state situation ident...,2808,basic_latin,Courier,9,1,312.0
2,Lay though reason event against none as realit...,2601,basic_latin,Courier,9,1,289.0
3,Party soon since senior certainly own internat...,6129,basic_latin,Courier,9,1,681.0
4,Activity president realize artist brother fill...,1548,basic_latin,Courier,9,1,172.0


In [7]:
# Control texts for the "test" font faces (fonts absent from the width table above).
test_control_df = get_df("../data/cropped/control.csv", "test")
print(test_control_df.shape)
test_control_df.head()

(3840, 7)


Unnamed: 0,text,width,alphabet,font_face,font_size,font_version,symbols_count
0,Range amount large collection us shake tough c...,2694,basic_latin,Arial,9,1,329.0
1,Mean order public reality man fight key import...,3811,basic_latin,Arial,9,1,481.0
2,Fight sense similar newspaper number real prof...,3864,basic_latin,Arial,9,1,482.0
3,Knowledge party deep summer sea peace tough ag...,3392,basic_latin,Arial,9,1,421.0
4,Improve close debate and ago police put left g...,2084,basic_latin,Arial,9,1,260.0


In [8]:
# Distinct values of every font attribute present in the loaded data.
# `font_sizes` is also read by DemoPanel as the options of its size dropdown.
alphabets = list(train_cw_df.alphabet.unique())
print("Alphabets: {0}".format(alphabets))
main_font_faces = list(train_cw_df.font_face.unique())
print("Main font faces: {0}".format(main_font_faces))
# Font faces that occur only in the "test" control set.
additional_font_faces = list(test_control_df.font_face.unique())
print("Additional font faces: {0}".format(additional_font_faces))
font_sizes = list(train_cw_df.font_size.unique())
print("Font sizes: {0}".format(font_sizes))
font_versions = list(train_cw_df.font_version.unique())
print("Font versions: {0}".format(font_versions))

Alphabets: ['basic_latin', 'russian', 'greek', 'japanese']
Main font faces: ['Courier', 'Geneva', 'Georgia', 'Helvetica', 'Lucida Grande', 'Times New Roman', 'Verdana']
Additional font faces: ['Arial', 'Brush Script MT', 'Lucida Console', 'Wingdings']
Font sizes: [9, 11, 12, 14, 17, 20]
Font versions: [1, 2, 3, 4]


In [9]:
class DemoPanel:
    """Interactive demo comparing a predicted text width with the rendered text.

    The panel shows four widgets (font face, font size, correction
    coefficient, input text). On every change it renders an image of
    ``WIDTH`` x ``HEIGHT`` pixels: a rectangle whose width is the model's
    prediction multiplied by the coefficient, with the input text drawn on top
    of it using the selected font.

    Relies on the notebook-level names ``DEMO_FONT_FACES`` and ``font_sizes``.
    """
    import ipywidgets as widgets

    WIDTH, HEIGHT = 300, 30
    BACKGROUND_COLOR = "#808080"
    TEXT_BACKGROUND_COLOR = "#000000"
    TEXT_COLOR = (255, 255, 255)

    def __init__(self, model, coeff=1.0):
        """Create the widgets.

        Parameters
        ----------
        model : object
            Must provide ``predict(df, target)`` returning a frame with a
            ``prediction`` column.
        coeff : float
            Initial correction coefficient; also the fallback used when the
            coefficient text box does not contain a valid number.
        """
        self.model = model
        self.coeff = coeff
        self.font_face_widget = self.widgets.Dropdown(
            options=[(v, k) for k, v in DEMO_FONT_FACES.items()],
            value="times",
            description='Font face:',
            style=dict(description_width="initial")
        )
        self.font_size_widget = self.widgets.Dropdown(
            options=font_sizes,
            value=20,
            description='Font size:',
            style=dict(description_width="initial")
        )
        self.coeff_widget = self.widgets.Text(
            value=str(self.coeff),
            description="Correction coefficient:",
            style=dict(description_width="initial")
        )
        self.input_widget = self.widgets.Text(
            placeholder="Sample text",
            description="Input:",
            style=dict(description_width="initial")
        )

    def _predict(self, font_face, font_size, text):
        """Return the model's integer width prediction for ``text``.

        The alphabet is always "basic_latin" and the font version is always 1.
        An empty text yields 0 without calling the model.
        """
        if len(text) == 0:
            return 0
        df = pd.DataFrame({
            "text": [text],
            "alphabet": ["basic_latin"],
            "font_face": [font_face],
            "font_size": [font_size],
            "font_version": [1],
        })
        return int(self.model.predict(df, "text").loc[0, "prediction"])

    def _parse_coeff(self, raw):
        """Convert the coefficient widget's text to a float.

        The text box fires while the user is still typing, so intermediate
        values such as "" or "-" are expected; they fall back to the
        coefficient given to the constructor instead of raising.
        """
        try:
            return float(raw)
        except (TypeError, ValueError):
            return self.coeff

    def _output(self, font_face, font_size, coeff, text):
        """Render the comparison image for the current widget values.

        ``font_face`` is a key of ``DEMO_FONT_FACES``; it selects both the
        font face name passed to the model and the font file
        "../data/fonts/<key>.ttf" used for drawing.
        """
        from PIL import Image, ImageFont, ImageDraw
        img = Image.new('RGB', (self.WIDTH, self.HEIGHT), color=self.BACKGROUND_COLOR)
        draw = ImageDraw.Draw(img)
        text_width = self._predict(DEMO_FONT_FACES[font_face], font_size, text) * self._parse_coeff(coeff)
        # Rectangle of the predicted width; the text is drawn over it.
        draw.rectangle([(0, 0), (text_width, self.HEIGHT)], fill=self.TEXT_BACKGROUND_COLOR)
        font = ImageFont.truetype("../data/fonts/{0}.ttf".format(font_face), size=font_size)
        draw.text((0, 0), text, self.TEXT_COLOR, font=font)
        return img

    def draw(self):
        """Display the widgets and re-render the image whenever one of them changes."""
        self.widgets.interact(self._output,
                         text=self.input_widget,
                         font_face=self.font_face_widget,
                         font_size=self.font_size_widget,
                         coeff=self.coeff_widget)

## 1. Простая модель

### 1.1. Подготовка модели

In [10]:
class SimpleMainModel:
    """Estimate text width as a sum of clustered per-character widths.

    For every font configuration (alphabet, font_face, font_size,
    font_version) the character widths are clustered with KMeans and each
    character is assigned the *maximum* width found in its cluster. The width
    of a text is the sum of these values over its characters; a character
    missing from the table counts as the widest cluster of that font
    configuration.
    """

    _FONT_KEYS = ["alphabet", "font_face", "font_size", "font_version"]

    def __init__(self, **parameters):
        """Store keyword arguments that are forwarded to ``sklearn.cluster.KMeans``."""
        self.parameters = parameters
        self.df = None

    def fit(self, df):
        """Build the character table from a per-character width frame.

        ``df`` must contain the four font columns plus ``char_id``, ``char``
        and ``width``. After fitting, ``self.df`` is indexed by the four font
        columns and ``char`` and holds ``char_id``, ``width`` and ``cluster``
        (the maximum width within the character's cluster).

        Returns ``self``.
        """
        # groupby().apply() yields the four keys plus the original row label
        # as index levels; the row label (level 4) is replaced by `char`.
        self.df = df.groupby(self._FONT_KEYS)\
            .apply(self._add_clusters_to_font).reset_index(level=4, drop=True).set_index("char", append=True)

        return self

    def predict(self, df, target):
        """Return a copy of ``df`` with an integer ``prediction`` column.

        ``target`` names the string column to measure. Rows are processed per
        font configuration and concatenated in group order, keeping their
        original index labels. A font configuration that was not seen by
        ``fit`` raises ``KeyError``.
        """
        # One pass over the groups instead of re-filtering the whole frame
        # with four boolean masks for every font configuration.
        return pd.concat([
            self._font_predict(font_df, target, self.df.loc[font_key])
            for font_key, font_df in df.groupby(self._FONT_KEYS)
        ])

    def _font_predict(self, df, target, abvalues):
        """Predict widths for rows that share one font configuration.

        ``abvalues`` is the slice of the fitted table for that configuration,
        indexed by character and containing a ``cluster`` column.
        """
        char_widths = abvalues.cluster.to_dict()
        # Width charged for a character that is absent from the table.
        fallback_width = abvalues.cluster.max()

        def text_width(text):
            # Direct lookup per character: a missing character always gets
            # the fallback, including characters such as "5" that would
            # otherwise be parsed as a number and counted as that many pixels.
            return sum(char_widths.get(ch, fallback_width) for ch in text)

        return df.assign(prediction=df[target].map(text_width).astype(int))

    def _add_clusters_to_font(self, df):
        """Return ``char_id``, ``char``, ``width`` plus ``cluster`` for one font configuration.

        ``cluster`` is first the KMeans label and is then replaced by the
        largest width among the characters carrying the same label.
        """
        result_df = df[["char_id", "char", "width"]].assign(cluster=self._get_clusters(df))
        result_df["cluster"] = result_df.groupby("cluster").width.transform("max")

        return result_df

    def _get_clusters(self, df):
        """Cluster the ``width`` column with KMeans and return the label of every row."""
        # Imported here so that defining the class does not require
        # scikit-learn; only fitting does.
        from sklearn.cluster import KMeans

        return KMeans(**self.parameters).fit(df[["width"]]).labels_

In [11]:
%%capture --no-display

# Fit the simple model on the per-character width table. The cell magic above
# must stay on the first line of the cell.
sm_model = SimpleMainModel(n_clusters=N_CLUSTERS, random_state=42)
sm_model.fit(train_cw_df)

<__main__.SimpleMainModel at 0x7fdea0ab1090>

In [12]:
# Split the train-font control texts: `trtr_control_df` is used to fit the
# corrected model, `tstr_control_df` to evaluate both models.
trtr_control_df, tstr_control_df = train_test_split(train_control_df, train_size=2.0/3.0)

In [13]:
# First rows of the part sampled by train_test_split.
trtr_control_df.head()

Unnamed: 0,text,width,alphabet,font_face,font_size,font_version,symbols_count
0,Attorney he school article floor themselves ha...,3960,basic_latin,Courier,9,1,440.0
1,Treatment across federal state situation ident...,2808,basic_latin,Courier,9,1,312.0
2,Image store character image sister sometimes r...,6966,basic_latin,Courier,9,1,774.0
3,It friend positive wall travel affect for stor...,4860,basic_latin,Courier,9,1,540.0
4,Help able group rich purpose pass memory stuff...,3510,basic_latin,Courier,9,1,390.0


In [14]:
# First rows of the remaining (held-out) part.
tstr_control_df.head()

Unnamed: 0,text,width,alphabet,font_face,font_size,font_version,symbols_count
0,Party soon since senior certainly own internat...,6129,basic_latin,Courier,9,1,681.0
1,Activity president realize artist brother fill...,1548,basic_latin,Courier,9,1,172.0
2,Experience television answer pretty event son ...,846,basic_latin,Courier,9,1,94.0
3,Technology none take along give morning day pa...,4330,basic_latin,Courier,9,2,433.0
4,National management glass red machine future m...,7500,basic_latin,Courier,9,2,750.0


In [15]:
# Predictions of the simple model on the held-out control texts
# (adds the `prediction` column).
sm_model_df = sm_model.predict(tstr_control_df, "text")
sm_model_df.head()

Unnamed: 0,text,width,alphabet,font_face,font_size,font_version,symbols_count,prediction
0,Party soon since senior certainly own internat...,6129,basic_latin,Courier,9,1,681.0,6129
1,Activity president realize artist brother fill...,1548,basic_latin,Courier,9,1,172.0,1548
2,Experience television answer pretty event son ...,846,basic_latin,Courier,9,1,94.0,846
3,Technology none take along give morning day pa...,4330,basic_latin,Courier,9,2,433.0,4330
4,National management glass red machine future m...,7500,basic_latin,Courier,9,2,750.0,7500


### 1.2. Демонстрация качества модели

In [16]:
# Plot the per-character error of the simple model.
# Note: this call also adds `target_size`, `absolute_error` and
# `relative_error` columns to `sm_model_df` in place.
draw_relative_error(sm_model_df, "text")

In [17]:
# Per-text report of the simple model for a single font configuration.
alphabet, font_face, font_size, font_version = "basic_latin", "Lucida Grande", 14, 1
selected_rows = sm_model_df[(sm_model_df.alphabet == alphabet)&(sm_model_df.font_face == font_face)&\
                            (sm_model_df.font_size == font_size)&(sm_model_df.font_version == font_version)]
for i, r in selected_rows.iterrows():
    # Computed here rather than read from an `absolute_error` column, which
    # exists only if draw_relative_error() has already been run on this frame.
    error = r.prediction - r.width
    print("""Text:
  {text}

Number of symbols: {length}
Actual width in pixels: {width}
Predicted width in pixels: {predicted}
Estimation error in pixels: {error} ({percent_error}%)

================================================================================
""".format(text=r.text, length=int(r.symbols_count), width=r.width, predicted=r.prediction, \
           error=error, percent_error=int(np.round(error * 100 / r.width))))

Text:
  Star grow technology tend dog reduce hour fish him high nation production little could his under style child young prove care author administration return main join wrong be politics indeed lay smile page international conference meeting need last green control painting.

Number of symbols: 271
Actual width in pixels: 3311
Predicted width in pixels: 4160
Estimation error in pixels: 849 (26%)


Text:
  Court professor here security community notice.

Number of symbols: 47
Actual width in pixels: 590
Predicted width in pixels: 743
Estimation error in pixels: 153 (26%)


Text:
  Mean really song how start story allow social write prepare represent today notice exactly send where check source bed production maintain threat ask enjoy year few success around direction soldier strong water clear spring participant exactly hospital Congress seem role senior serve current card yes perhaps do carry care husband yard democratic there discover detail effect life attack realize prove into l

In [18]:
# Interactive demo of the simple model; the predicted width is scaled by the
# correction coefficient before it is drawn.
demo_panel = DemoPanel(sm_model, coeff=.4)
demo_panel.draw()

interactive(children=(Dropdown(description='Font face:', index=5, options=(('Courier', 'courier'), ('Geneva', …

## 2. Скорректированная простая модель

### 2.1. Подготовка модели

In [19]:
class CorrectedMainModel:
    """Base width model plus a learned per-font-face error correction.

    For every (alphabet, font_face) pair a linear regression is fitted that
    predicts the base model's error *per character* from ``features``. At
    prediction time that estimate, multiplied by the text length and
    truncated to an integer, is subtracted from the base prediction.
    """

    # Regressors of the corrector; `prediction` comes from the base model and
    # `target_size` is the length of the target string.
    features = ["prediction", "target_size", "font_size", "font_version"]

    _GROUP_KEYS = ["alphabet", "font_face"]

    def __init__(self, sm_model, **parameters):
        """
        Parameters
        ----------
        sm_model : object
            Fitted base model providing ``predict(df, target)`` that returns
            a frame with a ``prediction`` column.
        **parameters
            Forwarded to ``sklearn.linear_model.LinearRegression``.
        """
        self.sm_model = sm_model
        self.parameters = parameters
        self.corrector_models = {}

    def fit(self, df, target):
        """Fit one corrector per (alphabet, font_face) pair found in ``df``.

        ``df`` must contain the columns required by the base model, the true
        ``width`` and the string column named by ``target``.

        Returns ``self``.
        """
        sm_control_df = self.sm_model.predict(df, target)
        # Use the column named by `target` (not a fixed `text` column), in
        # line with predict().
        sm_control_df["target_size"] = sm_control_df[target].str.len()
        # Base-model error per character.
        sm_control_df["error"] = (sm_control_df.prediction - sm_control_df.width) / sm_control_df.target_size
        self.corrector_models = {
            group_key: self._fit_corrector_model(group_df)
            for group_key, group_df in sm_control_df.groupby(self._GROUP_KEYS)
        }

        return self

    def predict(self, df, target):
        """Return the base model's output with ``prediction`` corrected.

        Rows are processed per (alphabet, font_face) pair and concatenated in
        group order. A pair that was not seen by ``fit`` raises ``KeyError``.
        """
        sm_control_df = self.sm_model.predict(df, target)
        sm_control_df["target_size"] = sm_control_df[target].str.len()
        return pd.concat([
            self._corrector_model_predict(group_df, target, self.corrector_models[group_key])
            for group_key, group_df in sm_control_df.groupby(self._GROUP_KEYS)
        ]).drop(columns=["target_size"])  # helper column is not part of the result

    def _corrector_model_predict(self, df, target, cm):
        """Apply corrector ``cm`` to rows that share one (alphabet, font_face) pair."""
        # Estimated total error = per-character estimate * text length,
        # truncated to an integer by astype(int).
        prediction = df.prediction - (df.target_size * cm.predict(df[self.features])).astype(int)
        return df.assign(prediction=prediction)

    def _fit_corrector_model(self, df):
        """Fit a linear regression of ``error`` on ``features`` and return it."""
        # Imported here so that defining the class does not require
        # scikit-learn; only fitting does.
        from sklearn.linear_model import LinearRegression

        X = df[self.features]
        y = df.error
        return LinearRegression(**self.parameters).fit(X, y)

In [20]:
# Fit the error correctors on the training part of the control texts,
# on top of the already fitted simple model.
cm_model = CorrectedMainModel(sm_model)
cm_model.fit(trtr_control_df, "text")

<__main__.CorrectedMainModel at 0x7fdeb0e5bf50>

In [21]:
# Corrected predictions on the same held-out control texts used for the simple model.
cm_model_df = cm_model.predict(tstr_control_df, "text")
cm_model_df.head()

Unnamed: 0,text,width,alphabet,font_face,font_size,font_version,symbols_count,prediction
0,Party soon since senior certainly own internat...,6129,basic_latin,Courier,9,1,681.0,6129
1,Activity president realize artist brother fill...,1548,basic_latin,Courier,9,1,172.0,1548
2,Experience television answer pretty event son ...,846,basic_latin,Courier,9,1,94.0,846
3,Technology none take along give morning day pa...,4330,basic_latin,Courier,9,2,433.0,4330
4,National management glass red machine future m...,7500,basic_latin,Courier,9,2,750.0,7500


### 2.2. Демонстрация качества модели

In [22]:
# Plot the per-character error of the corrected model.
# Note: this call also adds `target_size`, `absolute_error` and
# `relative_error` columns to `cm_model_df` in place.
draw_relative_error(cm_model_df, "text")

In [23]:
# Per-text report of the corrected model for a single font configuration.
alphabet, font_face, font_size, font_version = "basic_latin", "Lucida Grande", 14, 1
selected_rows = cm_model_df[(cm_model_df.alphabet == alphabet)&(cm_model_df.font_face == font_face)&\
                            (cm_model_df.font_size == font_size)&(cm_model_df.font_version == font_version)]
for i, r in selected_rows.iterrows():
    # Computed here rather than read from an `absolute_error` column, which
    # exists only if draw_relative_error() has already been run on this frame.
    error = r.prediction - r.width
    print("""Text:
  {text}

Number of symbols: {length}
Actual width in pixels: {width}
Predicted width in pixels: {predicted}
Estimation error in pixels: {error} ({percent_error}%)

================================================================================
""".format(text=r.text, length=int(r.symbols_count), width=r.width, predicted=r.prediction, \
           error=error, percent_error=int(np.round(error * 100 / r.width))))

Text:
  Star grow technology tend dog reduce hour fish him high nation production little could his under style child young prove care author administration return main join wrong be politics indeed lay smile page international conference meeting need last green control painting.

Number of symbols: 271
Actual width in pixels: 3311
Predicted width in pixels: 3412
Estimation error in pixels: 101 (3%)


Text:
  Court professor here security community notice.

Number of symbols: 47
Actual width in pixels: 590
Predicted width in pixels: 610
Estimation error in pixels: 20 (3%)


Text:
  Mean really song how start story allow social write prepare represent today notice exactly send where check source bed production maintain threat ask enjoy year few success around direction soldier strong water clear spring participant exactly hospital Congress seem role senior serve current card yes perhaps do carry care husband yard democratic there discover detail effect life attack realize prove into lead

In [24]:
# Interactive demo of the corrected model; the predicted width is scaled by the
# correction coefficient before it is drawn.
demo_panel = DemoPanel(cm_model, coeff=.5)
demo_panel.draw()

interactive(children=(Dropdown(description='Font face:', index=5, options=(('Courier', 'courier'), ('Geneva', …