# Оценка ширины текста

## 0. Подготовка

In [1]:
import numpy as np
import pandas as pd

from lets_plot import *
from lets_plot.mapping import as_discrete
LetsPlot.setup_html()

In [2]:
# Number of k-means clusters built per font; passed as n_clusters to
# SimpleMainModel further down in the notebook.
N_CLUSTERS = 3

In [3]:
# Font faces per data-set role. get_df() uses the role name ("train" or
# "test") to keep only the rows whose font_face is listed here.
FONT_FACES = {
    "train": [
        "Courier",
        "Geneva",
        "Georgia",
        "Helvetica",
        "Lucida Grande",
        "Times New Roman",
        "Verdana",
    ],
    "test": [
        "Arial",
        "Brush Script MT",
        "Lucida Console",
        "Wingdings",
    ],
}
# Integer codes for the raw font_version strings found in the CSV files.
# The empty string is what a missing font_version becomes after get_df()
# fills NaN values. Presumably "b"/"i"/"bi" stand for bold / italic /
# bold-italic -- TODO confirm against the data generator.
FONT_VERSIONS = {
    "": 1,
    "b": 2,
    "i": 3,
    "bi": 4,
}

In [4]:
def get_df(path, df_type):
    """Load a CSV file and keep only the rows of one font-face role.

    Parameters
    ----------
    path : path of the CSV file; it must have ``font_face`` and
        ``font_version`` columns.
    df_type : key of ``FONT_FACES`` ("train" or "test") selecting which
        font faces are kept.

    Returns
    -------
    DataFrame with a fresh 0..n-1 index in which every NaN has been
    replaced by an empty string and ``font_version`` has been translated
    to the integer codes of ``FONT_VERSIONS`` (values missing from that
    mapping become NaN).
    """
    raw_df = pd.read_csv(path)
    wanted_faces = FONT_FACES[df_type]
    face_mask = raw_df.font_face.isin(wanted_faces)
    result_df = raw_df.loc[face_mask].reset_index(drop=True)
    # NaN -> "" first, so that a missing font_version hits the "" key below.
    result_df = result_df.fillna("")
    result_df["font_version"] = result_df["font_version"].map(FONT_VERSIONS)
    return result_df

def get_cropped_df(path, df_type, proportion=.1, random_state=42):
    """Load a data set with get_df() and keep an equal-sized sample per font.

    A "font" is one combination of alphabet, font_face, font_size and
    font_version. The number of rows kept per font is ``proportion`` times
    the size of the smallest font group, rounded, but never less than one.

    Parameters
    ----------
    path, df_type : forwarded to ``get_df``.
    proportion : fraction of the smallest group size to keep per font.
    random_state : seed forwarded to the group-wise sampling.

    Returns
    -------
    The sampled rows as a DataFrame with a fresh 0..n-1 index.
    """
    group_cols = ["alphabet", "font_face", "font_size", "font_version"]
    per_font = get_df(path, df_type).groupby(group_cols)
    # Group size is measured on the first non-key column, as a non-null count.
    smallest_group = per_font.count().iloc[:, 0].min()
    rows_per_font = max(1, int(np.round(smallest_group * proportion)))
    sampled_df = per_font.sample(n=rows_per_font, random_state=random_state)
    return sampled_df.reset_index(drop=True)

def train_test_split(df, train_size=.75, random_state=42):
    """Split a frame into train and test parts, stratified by font.

    From every (alphabet, font_face, font_size, font_version) group the
    same number of rows is sampled into the train part: ``train_size``
    times the size of the smallest group, rounded, but at least one. All
    remaining rows form the test part.

    Parameters
    ----------
    df : DataFrame with the four font columns and at least one more column.
    train_size : fraction of the smallest group size that goes to train.
    random_state : seed forwarded to the group-wise sampling.

    Returns
    -------
    (train_df, test_df), each with a fresh 0..n-1 index.
    """
    features = ["alphabet", "font_face", "font_size", "font_version"]
    # drop() below removes rows by index label. Work on a positional index so
    # that an input with repeated labels does not lose extra rows from the
    # test part; both outputs get a fresh index anyway.
    df = df.reset_index(drop=True)
    groups = df.groupby(features)
    # Group size is measured on the first non-key column, as a non-null count.
    smallest_group = groups.count().iloc[:, 0].min()
    rows_per_group = max(1, int(np.round(smallest_group * train_size)))
    train_df = groups.sample(n=rows_per_group, random_state=random_state)
    test_df = df.drop(train_df.index, axis=0)
    return train_df.reset_index(drop=True), test_df.reset_index(drop=True)

def draw_relative_error(model_df, target):
    """Plot the per-character prediction error of a model, font by font.

    Three columns are written onto ``model_df`` itself: ``target_size``
    (string length of the ``target`` column), ``absolute_error``
    (prediction minus width) and ``relative_error`` (absolute error divided
    by the string length). Later notebook cells read ``absolute_error``
    from the same frame, so the in-place update is part of the contract.

    Parameters
    ----------
    model_df : DataFrame with ``prediction``, ``width``, the ``target``
        column and the four font columns.
    target : name of the text column.

    Returns
    -------
    A lets-plot figure: for every font the min/mean/max of
    ``relative_error`` as a point range, coloured by font_version and
    faceted by alphabet (x) and font_size (y).
    """
    group_cols = ["alphabet", "font_face", "font_size", "font_version"]

    model_df["target_size"] = model_df[target].str.len()
    model_df["absolute_error"] = model_df.prediction - model_df.width
    model_df["relative_error"] = model_df.absolute_error / model_df.target_size

    error_summary = (
        model_df.groupby(group_cols)
        .relative_error.agg(["min", "max", "mean"])
        .reset_index()
    )
    point_mapping = aes(
        x=as_discrete("font_face", order_by="mean", order=1),
        y="mean",
        ymin="min",
        ymax="max",
        color=as_discrete("font_version"),
    )
    return (
        ggplot(error_summary)
        + geom_pointrange(point_mapping, position="dodge")
        + facet_grid(x="alphabet", y="font_size")
    )

In [5]:
# Per-character widths, restricted to the "train" font faces.
train_cw_df = get_df("../data/char_widths.csv", "train")
print(train_cw_df.shape)
train_cw_df.head()

(97944, 7)


Unnamed: 0,char_id,char,alphabet,font_face,font_size,font_version,width
0,65,A,basic_latin,Courier,9,1,9
1,66,B,basic_latin,Courier,9,1,9
2,67,C,basic_latin,Courier,9,1,9
3,68,D,basic_latin,Courier,9,1,9
4,69,E,basic_latin,Courier,9,1,9


In [6]:
# Control texts with measured widths for the "train" font faces. Only a
# sample is loaded: per font, .01 of the smallest font-group size (see
# get_cropped_df). The trailing TODO names the uncropped call.
train_control_df = get_cropped_df("../data/control.csv", "train", .01, random_state=42) # TODO: get_df("../data/control.csv", "train")
print(train_control_df.shape)
train_control_df.head()

(6720, 7)


Unnamed: 0,text,width,alphabet,font_face,font_size,font_version,symbols_count
0,Range amount large collection us shake tough c...,2961,basic_latin,Courier,9,1,329.0
1,Mean order public reality man fight key import...,4329,basic_latin,Courier,9,1,481.0
2,Fight sense similar newspaper number real prof...,4338,basic_latin,Courier,9,1,482.0
3,Knowledge party deep summer sea peace tough ag...,3789,basic_latin,Courier,9,1,421.0
4,Improve close debate and ago police put left g...,2340,basic_latin,Courier,9,1,260.0


In [7]:
# Same sampling as for the train control texts, but for the "test" font
# faces. The trailing TODO names the uncropped call.
test_control_df = get_cropped_df("../data/control.csv", "test", .01, random_state=42) # TODO: get_df("../data/control.csv", "test")
print(test_control_df.shape)
test_control_df.head()

(3840, 7)


Unnamed: 0,text,width,alphabet,font_face,font_size,font_version,symbols_count
0,Range amount large collection us shake tough c...,2694,basic_latin,Arial,9,1,329.0
1,Mean order public reality man fight key import...,3811,basic_latin,Arial,9,1,481.0
2,Fight sense similar newspaper number real prof...,3864,basic_latin,Arial,9,1,482.0
3,Knowledge party deep summer sea peace tough ag...,3392,basic_latin,Arial,9,1,421.0
4,Improve close debate and ago police put left g...,2084,basic_latin,Arial,9,1,260.0


In [8]:
# Inventory of the distinct values found in the loaded frames. The lists are
# module-level names on purpose: DemoPanel fills its dropdowns from
# main_font_faces and font_sizes.
alphabets = list(train_cw_df["alphabet"].unique())
main_font_faces = list(train_cw_df["font_face"].unique())
additional_font_faces = list(test_control_df["font_face"].unique())
font_sizes = list(train_cw_df["font_size"].unique())
font_versions = list(train_cw_df["font_version"].unique())

print("Alphabets: {0}".format(alphabets))
print("Main font faces: {0}".format(main_font_faces))
print("Additional font faces: {0}".format(additional_font_faces))
print("Font sizes: {0}".format(font_sizes))
print("Font versions: {0}".format(font_versions))

Alphabets: ['basic_latin', 'russian', 'greek', 'japanese']
Main font faces: ['Courier', 'Geneva', 'Georgia', 'Helvetica', 'Lucida Grande', 'Times New Roman', 'Verdana']
Additional font faces: ['Arial', 'Brush Script MT', 'Lucida Console', 'Wingdings']
Font sizes: [9, 11, 12, 14, 17, 20]
Font versions: [1, 2, 3, 4]


In [9]:
class DemoPanel:
    """Interactive panel comparing a predicted text width with rendered text.

    The panel shows four controls (font face, font size, correction
    coefficient, input text). On every change it renders an image in which
    a rectangle as wide as ``model prediction * coefficient`` is drawn
    behind the input text rendered with the chosen font.

    The dropdown options come from the module-level lists
    ``main_font_faces`` and ``font_sizes``.
    """
    import ipywidgets as widgets

    # Image size in pixels and colours used by _output().
    WIDTH, HEIGHT = 300, 30
    BACKGROUND_COLOR = "#808080"
    TEXT_BACKGROUND_COLOR = "#000000"
    TEXT_COLOR = (255, 255, 255)

    def __init__(self, model, coeff=1.0):
        """Create the widgets.

        Parameters
        ----------
        model : object with ``predict(df, target)`` returning a frame that
            has a ``prediction`` column.
        coeff : initial value of the correction coefficient field; also the
            value used whenever the field does not hold a valid number.
        """
        self.model = model
        self.coeff = coeff
        self.font_face_widget = self.widgets.Dropdown(
            options=main_font_faces,
            value="Times New Roman",
            description='Font face:',
            style=dict(description_width="initial")
        )
        self.font_size_widget = self.widgets.Dropdown(
            options=font_sizes,
            value=20,
            description='Font size:',
            style=dict(description_width="initial")
        )
        self.coeff_widget = self.widgets.Text(
            value=str(self.coeff),
            description="Correction coefficient:",
            style=dict(description_width="initial")
        )
        self.input_widget = self.widgets.Text(
            placeholder="Sample text",
            description="Input:",
            style=dict(description_width="initial")
        )

    def _predict(self, font_face, font_size, text):
        """Return the model's integer width prediction for one text.

        The alphabet is fixed to "basic_latin" and the font version to 1;
        only face and size are taken from the panel.
        """
        df = pd.DataFrame({
            "text": [text],
            "alphabet": ["basic_latin"],
            "font_face": [font_face],
            "font_size": [font_size],
            "font_version": [1],
        })
        return int(self.model.predict(df, "text").loc[0, "prediction"])

    def _parse_coeff(self, coeff):
        """Turn the coefficient field into a float.

        The field is free text, so it can hold something that is not a
        number (for example while it is being edited). In that case, and
        for non-finite values, the coefficient given to the constructor is
        used instead of raising.
        """
        try:
            value = float(coeff)
        except (TypeError, ValueError):
            return float(self.coeff)
        return value if np.isfinite(value) else float(self.coeff)

    def _output(self, font_face, font_size, coeff, text):
        """Render the comparison image for the current widget values.

        Returns a PIL image of WIDTH x HEIGHT pixels: background, then a
        rectangle from the left edge to the corrected predicted width, then
        the text itself drawn from the top-left corner.
        """
        from PIL import Image, ImageFont, ImageDraw
        img = Image.new('RGB', (self.WIDTH, self.HEIGHT), color=self.BACKGROUND_COLOR)
        draw = ImageDraw.Draw(img)
        text_width = self._predict(font_face, font_size, text) * self._parse_coeff(coeff)
        draw.rectangle([(0, 0), (text_width, self.HEIGHT)], fill=self.TEXT_BACKGROUND_COLOR)
        # The face name is handed to PIL as the font to load.
        # NOTE(review): this presumably requires that PIL can resolve the
        # name to an installed font file -- confirm on the target machine.
        font = ImageFont.truetype(font_face, size=font_size)
        draw.text((0, 0), text, self.TEXT_COLOR, font=font)
        return img

    def draw(self):
        """Display the widgets and re-render the image on every change."""
        self.widgets.interact(
            self._output,
            text=self.input_widget,
            font_face=self.font_face_widget,
            font_size=self.font_size_widget,
            coeff=self.coeff_widget,
        )

## 1. Простая модель

### 1.1. Подготовка модели

In [10]:
class SimpleMainModel:
    """Text-width model based on clustered per-character widths.

    For every font (one combination of alphabet, font_face, font_size and
    font_version) the character widths are clustered with KMeans, and each
    character is assigned the largest width found in its cluster. The
    predicted width of a text is the sum of these values over its
    characters; a character unknown to the font counts as the largest
    cluster value of that font.
    """
    from sklearn.cluster import KMeans

    _FONT_COLUMNS = ["alphabet", "font_face", "font_size", "font_version"]

    def __init__(self, **parameters):
        """Store the keyword arguments that are later passed to KMeans."""
        self.parameters = parameters
        self.df = None

    def fit(self, df):
        """Build the per-font character table from a character-width frame.

        ``df`` needs the four font columns plus ``char_id``, ``char`` and
        ``width``. The result is stored in ``self.df``, indexed by the four
        font columns and ``char``, with the assigned value in ``cluster``.
        Returns ``self``.
        """
        self.df = (
            df.groupby(self._FONT_COLUMNS)
            .apply(self._add_clusters_to_font)
            # Level 4 is the original row index kept by apply(); it is
            # replaced by the character itself.
            .reset_index(level=4, drop=True)
            .set_index("char", append=True)
        )

        return self

    def predict(self, df, target):
        """Return a copy of ``df`` with an integer ``prediction`` column.

        ``df`` needs the four font columns and the text column ``target``.
        A font that was not seen by fit() raises KeyError.
        """
        lookups = {}
        predictions = []
        font_keys = zip(df.alphabet, df.font_face, df.font_size, df.font_version)
        for font_key, text in zip(font_keys, df[target]):
            # One character->width dict per font, built on first use, instead
            # of a DataFrame lookup for every single character.
            if font_key not in lookups:
                lookups[font_key] = self._font_lookup(font_key)
            widths, fallback = lookups[font_key]
            predictions.append(sum(widths.get(c, fallback) for c in text))

        # Positional assignment: works for any index of df.
        return df.assign(prediction=np.array(predictions, dtype=int))

    def _font_lookup(self, font_key):
        """Return (character -> value dict, value for unknown characters)."""
        abvalues = self.df.loc[font_key]
        return abvalues.cluster.to_dict(), abvalues.cluster.max()

    def _add_clusters_to_font(self, df):
        """Return char_id/char/width of one font plus its ``cluster`` value.

        ``cluster`` holds the largest width of the cluster the character
        was put into, not the cluster label.
        """
        result_df = df[["char_id", "char", "width"]].assign(cluster=self._get_clusters(df))
        cluster_widths = result_df.groupby("cluster").width.max()
        # Label -> width in one lookup pass over the label column.
        result_df["cluster"] = result_df["cluster"].map(cluster_widths)

        return result_df

    def _get_clusters(self, df):
        """Cluster the ``width`` column and return one label per row."""
        return self.KMeans(**self.parameters).fit(df[["width"]]).labels_

class SimpleMainModelNew:
    """Variant of SimpleMainModel whose predict() works font by font.

    fit(), _add_clusters_to_font() and _get_clusters() repeat the code of
    SimpleMainModel; only predict() and its helper _font_predict() differ.
    """
    from sklearn.cluster import KMeans

    def __init__(self, **parameters):
        """Store the keyword arguments that are later passed to KMeans."""
        self.parameters = parameters
        self.df = None

    def fit(self, df):
        """Build the per-font character table in ``self.df``; returns self.

        The table is indexed by alphabet, font_face, font_size,
        font_version and char, with the assigned value in ``cluster``.
        """
        self.df = df.groupby(["alphabet", "font_face", "font_size", "font_version"])\
            .apply(self._add_clusters_to_font).reset_index(level=4, drop=True).set_index("char", append=True)

        return self

    def predict(self, df, target):
        """Return the rows of ``df`` with a ``prediction`` column added.

        The frame is processed one font at a time and the pieces are
        concatenated, so the rows come back grouped by font rather than in
        their input order (index labels are kept).
        """
        return pd.concat([
            self._font_predict(
                df[(df.alphabet == i[0])&(df.font_face == i[1])&\
                   (df.font_size == i[2])&(df.font_version == i[3])],
                target,
                self.df.loc[i]
            )
            for i in df.groupby(["alphabet", "font_face", "font_size", "font_version"]).count().index
        ])

    def _font_predict(self, df, target, abvalues):
        """Predict the widths of all texts of one font.

        ``abvalues`` is the slice of ``self.df`` for that font (indexed by
        character, with a ``cluster`` column).
        """
        # One column per character position; shorter texts are padded with NaN.
        def split_string(s):
            return pd.Series([s[i:i+1] for i in range(len(s))])
        max_cluster = abvalues.cluster.max()
        # Characters are swapped for their cluster value, then the padding
        # NaN is set to 0 so that it adds nothing to the sum.
        # NOTE(review): presumably replace() treats the Series as a
        # character -> value mapping applied to every column -- confirm.
        splitted_df = df[target].apply(split_string).replace(abvalues.cluster).fillna(0)
        cols = splitted_df.columns
        # Cells that are still text (characters missing from abvalues) are
        # coerced to NaN and then counted as max_cluster.
        # NOTE(review): a missing character that is itself a digit would
        # presumably be parsed as that number instead -- confirm.
        return df.assign(
            prediction=splitted_df[cols].apply(pd.to_numeric, errors='coerce', axis=1)\
                                        .fillna(max_cluster).sum(axis=1).astype(int)
        )

    def _add_clusters_to_font(self, df):
        """Return char_id/char/width of one font plus its ``cluster`` value.

        The cluster labels are replaced by the largest width of the
        respective cluster.
        """
        result_df = df[["char_id", "char", "width"]].assign(cluster=self._get_clusters(df))
        result_df.replace({"cluster": result_df.groupby("cluster").width.max().to_dict()}, inplace=True)

        return result_df

    def _get_clusters(self, df):
        """Cluster the ``width`` column and return one label per row."""
        return self.KMeans(**self.parameters).fit(df[["width"]]).labels_

In [11]:
%%capture --no-display

# Fit the cluster-based character model on the train character widths.
sm_model = SimpleMainModel(n_clusters=N_CLUSTERS, random_state=42)
sm_model.fit(train_cw_df)

<__main__.SimpleMainModel at 0x7fa02a62f910>

In [12]:
# Split the train control texts per font: about two thirds of the smallest
# font-group size go to trtr_control_df, the remaining rows to tstr_control_df.
trtr_control_df, tstr_control_df = train_test_split(train_control_df, train_size=2.0/3.0)

In [13]:
# First rows of the train part of the split.
trtr_control_df.head()

Unnamed: 0,text,width,alphabet,font_face,font_size,font_version,symbols_count
0,Glass floor future among require material exis...,5382,basic_latin,Courier,9,1,598.0
1,Mean order public reality man fight key import...,4329,basic_latin,Courier,9,1,481.0
2,Yard campaign themselves common other walk kno...,3924,basic_latin,Courier,9,1,436.0
3,Range amount large collection us shake tough c...,2961,basic_latin,Courier,9,1,329.0
4,Member carry effort imagine fact down forward ...,2889,basic_latin,Courier,9,1,321.0


In [14]:
# First rows of the held-out part of the split.
tstr_control_df.head()

Unnamed: 0,text,width,alphabet,font_face,font_size,font_version,symbols_count
0,Knowledge party deep summer sea peace tough ag...,3789,basic_latin,Courier,9,1,421.0
1,Improve close debate and ago police put left g...,2340,basic_latin,Courier,9,1,260.0
2,I movement account policy sound which yet scho...,3627,basic_latin,Courier,9,1,403.0
3,Per question return process stuff pick tough p...,640,basic_latin,Courier,9,2,64.0
4,Tv lose others some project visit back blue a ...,2480,basic_latin,Courier,9,2,248.0


In [15]:
# Predict the widths of the held-out control texts with the simple model.
sm_model_df = sm_model.predict(tstr_control_df, "text")
sm_model_df.head()

Unnamed: 0,text,width,alphabet,font_face,font_size,font_version,symbols_count,prediction
0,Knowledge party deep summer sea peace tough ag...,3789,basic_latin,Courier,9,1,421.0,3789
1,Improve close debate and ago police put left g...,2340,basic_latin,Courier,9,1,260.0,2340
2,I movement account policy sound which yet scho...,3627,basic_latin,Courier,9,1,403.0,3627
3,Per question return process stuff pick tough p...,640,basic_latin,Courier,9,2,64.0,640
4,Tv lose others some project visit back blue a ...,2480,basic_latin,Courier,9,2,248.0,2480


### 1.2. Демонстрация качества модели

In [16]:
# Plot the per-character error of the simple model. The call also adds the
# target_size, absolute_error and relative_error columns to sm_model_df.
draw_relative_error(sm_model_df, "text")

In [17]:
# Print every held-out text of one font together with the simple model's
# prediction and its error.
alphabet, font_face, font_size, font_version = "basic_latin", "Lucida Grande", 14, 1
selected_rows = sm_model_df[
    (sm_model_df.alphabet == alphabet) & (sm_model_df.font_face == font_face)
    & (sm_model_df.font_size == font_size) & (sm_model_df.font_version == font_version)
]
for row_id, r in selected_rows.iterrows():
    # The error is computed here instead of being read from an
    # "absolute_error" column: that column only exists once
    # draw_relative_error() has been called on this frame.
    error = r.prediction - r.width
    print("""Text:
  {text}

Number of symbols: {length}
Actual width in pixels: {width}
Predicted width in pixels: {predicted}
Estimation error in pixels: {error} ({percent_error}%)

================================================================================
""".format(text=r.text, length=int(r.symbols_count), width=r.width, predicted=r.prediction,
           error=error, percent_error=int(np.round(error * 100 / r.width))))

Text:
  Fall analysis current among value middle writer myself education send.

Number of symbols: 70
Actual width in pixels: 871
Predicted width in pixels: 1104
Estimation error in pixels: 233 (27%)


Text:
  Congress land station team keep cost company attorney area east too life money marriage both make time force often next trip drug chance speech sell better soldier however so television nature community third enter never paper vote those million media notice.

Number of symbols: 259
Actual width in pixels: 3220
Predicted width in pixels: 4047
Estimation error in pixels: 827 (26%)


Text:
  Teach develop staff least figure.

Number of symbols: 33
Actual width in pixels: 390
Predicted width in pixels: 496
Estimation error in pixels: 106 (27%)




In [18]:
# Interactive demo of the simple model; .4 is the initial value of the
# panel's correction coefficient.
demo_panel = DemoPanel(sm_model, coeff=.4)
demo_panel.draw()

interactive(children=(Dropdown(description='Font face:', index=5, options=('Courier', 'Geneva', 'Georgia', 'He…

## 2. Скорректированная простая модель

### 2.1. Подготовка модели

In [19]:
class CorrectedMainModel:
    """Base width model plus a per-face linear correction of its error.

    The base model's per-character error, ``(prediction - width) / text
    length``, is regressed on ``features`` with one LinearRegression per
    (alphabet, font_face) pair. At prediction time the estimated error,
    scaled back by the text length, is subtracted from the base prediction.
    """
    from sklearn.linear_model import LinearRegression

    # Regression inputs: base prediction, text length, font size and version.
    features = ["prediction", "target_size", "font_size", "font_version"]

    def __init__(self, sm_model, **parameters):
        """Keep the base model and the keyword arguments for LinearRegression.

        ``sm_model`` must offer ``predict(df, target)`` returning a frame
        with a ``prediction`` column.
        """
        self.sm_model = sm_model
        self.parameters = parameters
        self.corrector_models = {}

    def fit(self, df, target):
        """Fit one corrector per (alphabet, font_face) pair; returns self.

        ``df`` needs the font columns, the measured ``width`` and the text
        column named by ``target``.
        """
        sm_control_df = self.sm_model.predict(df, target)
        # Text length is taken from the column named by `target`, the same
        # column predict() measures.
        sm_control_df["target_size"] = sm_control_df[target].str.len()
        sm_control_df["error"] = (sm_control_df.prediction - sm_control_df.width) / sm_control_df.target_size
        self.corrector_models = {
            face_key: self._fit_corrector_model(face_df)
            for face_key, face_df in sm_control_df.groupby(["alphabet", "font_face"])
        }

        return self

    def predict(self, df, target):
        """Return the base model's output frame with corrected predictions.

        Rows come back grouped by (alphabet, font_face) rather than in
        input order; index labels are kept. A pair that was not present in
        fit() raises KeyError.
        """
        sm_control_df = self.sm_model.predict(df, target)
        sm_control_df["target_size"] = sm_control_df[target].str.len()
        return pd.concat([
            self._corrector_model_predict(face_df, target, self.corrector_models[face_key])
            for face_key, face_df in sm_control_df.groupby(["alphabet", "font_face"])
        ]).drop(columns=["target_size"])

    def _corrector_model_predict(self, df, target, cm):
        """Apply corrector ``cm`` to the rows of one (alphabet, font_face)."""
        # cm estimates the per-character error; multiplied by the text length
        # it becomes a pixel correction, which is truncated to an integer.
        correction = (df.target_size * cm.predict(df[self.features])).astype(int)
        return df.assign(prediction=df.prediction - correction)

    def _fit_corrector_model(self, df):
        """Fit a LinearRegression of ``error`` on ``features``."""
        X = df[self.features]
        y = df.error
        return self.LinearRegression(**self.parameters).fit(X, y)

In [20]:
# Fit the correction on the train part of the control texts, on top of the
# already fitted simple model.
cm_model = CorrectedMainModel(sm_model)
cm_model.fit(trtr_control_df, "text")

<__main__.CorrectedMainModel at 0x7f9ff86b3810>

In [21]:
# Predict the widths of the held-out control texts with the corrected model.
cm_model_df = cm_model.predict(tstr_control_df, "text")
cm_model_df.head()

Unnamed: 0,text,width,alphabet,font_face,font_size,font_version,symbols_count,prediction
0,Knowledge party deep summer sea peace tough ag...,3789,basic_latin,Courier,9,1,421.0,3789
1,Improve close debate and ago police put left g...,2340,basic_latin,Courier,9,1,260.0,2340
2,I movement account policy sound which yet scho...,3627,basic_latin,Courier,9,1,403.0,3627
3,Per question return process stuff pick tough p...,640,basic_latin,Courier,9,2,64.0,640
4,Tv lose others some project visit back blue a ...,2480,basic_latin,Courier,9,2,248.0,2480


### 2.2. Демонстрация качества модели

In [22]:
# Plot the per-character error of the corrected model. The call also adds the
# target_size, absolute_error and relative_error columns to cm_model_df.
draw_relative_error(cm_model_df, "text")

In [23]:
# Print every held-out text of one font together with the corrected model's
# prediction and its error.
alphabet, font_face, font_size, font_version = "basic_latin", "Lucida Grande", 14, 1
selected_rows = cm_model_df[
    (cm_model_df.alphabet == alphabet) & (cm_model_df.font_face == font_face)
    & (cm_model_df.font_size == font_size) & (cm_model_df.font_version == font_version)
]
for row_id, r in selected_rows.iterrows():
    # The error is computed here instead of being read from an
    # "absolute_error" column: that column only exists once
    # draw_relative_error() has been called on this frame.
    error = r.prediction - r.width
    print("""Text:
  {text}

Number of symbols: {length}
Actual width in pixels: {width}
Predicted width in pixels: {predicted}
Estimation error in pixels: {error} ({percent_error}%)

================================================================================
""".format(text=r.text, length=int(r.symbols_count), width=r.width, predicted=r.prediction,
           error=error, percent_error=int(np.round(error * 100 / r.width))))

Text:
  Fall analysis current among value middle writer myself education send.

Number of symbols: 70
Actual width in pixels: 871
Predicted width in pixels: 913
Estimation error in pixels: 42 (5%)


Text:
  Congress land station team keep cost company attorney area east too life money marriage both make time force often next trip drug chance speech sell better soldier however so television nature community third enter never paper vote those million media notice.

Number of symbols: 259
Actual width in pixels: 3220
Predicted width in pixels: 3350
Estimation error in pixels: 130 (4%)


Text:
  Teach develop staff least figure.

Number of symbols: 33
Actual width in pixels: 390
Predicted width in pixels: 406
Estimation error in pixels: 16 (4%)




In [24]:
# Interactive demo of the corrected model; .5 is the initial value of the
# panel's correction coefficient.
demo_panel = DemoPanel(cm_model, coeff=.5)
demo_panel.draw()

interactive(children=(Dropdown(description='Font face:', index=5, options=('Courier', 'Geneva', 'Georgia', 'He…