# Оценка ширины текста

## 0. Подготовка

In [1]:
import numpy as np
import pandas as pd

from lets_plot import *
from lets_plot.mapping import as_discrete
LetsPlot.setup_html()

In [2]:
N_CLUSTERS = 3

In [3]:
FONT_FACES = {
    "train": [
        "Courier",
        "Geneva",
        "Georgia",
        "Helvetica",
        "Lucida Grande",
        "Times New Roman",
        "Verdana",
    ],
    "test": [
        "Arial",
        "Brush Script MT",
        "Lucida Console",
        "Wingdings",
    ],
}
FONT_VERSIONS = {
    "": 1,
    "b": 2,
    "i": 3,
    "bi": 4,
}

In [4]:
def get_df(path, df_type):
    result_df = pd.read_csv(path)
    result_df = result_df[result_df.font_face.isin(FONT_FACES[df_type])].reset_index(drop=True)
    result_df.fillna("", inplace=True)
    result_df.font_version = result_df.font_version.map(FONT_VERSIONS)

    return result_df

def train_test_split(df, train_size=.75, random_state=42):
    features = ["alphabet", "font_face", "font_size", "font_version"]
    n = df.groupby(features).count().iloc[0][0]
    train_size = int(np.round(n * train_size))
    train_df = df.groupby(features).sample(n=train_size, random_state=random_state)
    return train_df.reset_index(drop=True), df.drop(train_df.index, axis=0).reset_index(drop=True)

def draw_relative_error(model_df, target):
    model_df["text_size"] = model_df[target].str.len()
    model_df["absolute_error"] = model_df.prediction - model_df.width
    model_df["relative_error"] = model_df.absolute_error / model_df.text_size

    return ggplot(model_df.groupby(["alphabet", "font_face", "font_size", "font_version"])\
                          .relative_error.agg(["min", "max", "mean"]).reset_index()) + \
        geom_pointrange(aes(x=as_discrete("font_face", order_by="mean", order=1), \
                            y="mean", ymin="min", ymax="max", color=as_discrete("font_version")), \
                        position="dodge") + \
        facet_grid(x="alphabet", y="font_size")

In [5]:
train_cw_df = get_df("../data/char_widths.csv", "train")
train_cw_df.head()

Unnamed: 0,char_id,char,alphabet,font_face,font_size,font_version,width
0,65,A,basic_latin,Courier,9,1,9
1,66,B,basic_latin,Courier,9,1,9
2,67,C,basic_latin,Courier,9,1,9
3,68,D,basic_latin,Courier,9,1,9
4,69,E,basic_latin,Courier,9,1,9


In [6]:
train_control_df = get_df("../data/control.csv", "train")
train_control_df.head()

Unnamed: 0,text,width,alphabet,font_face,font_size,font_version,symbols_count
0,溝。,25,japanese,Courier,9,1,2.0
1,高い。,37,japanese,Courier,9,1,3.0
2,今日。,40,japanese,Courier,9,1,3.0
3,リニア。,41,japanese,Courier,9,1,4.0
4,リニア。,41,japanese,Courier,9,1,4.0


In [7]:
test_control_df = get_df("../data/control.csv", "test")
test_control_df.head()

Unnamed: 0,text,width,alphabet,font_face,font_size,font_version,symbols_count
0,溝。,28,japanese,Arial,9,1,2.0
1,高い。,42,japanese,Arial,9,1,3.0
2,今日。,45,japanese,Arial,9,1,3.0
3,リニア。,47,japanese,Arial,9,1,4.0
4,リニア。,47,japanese,Arial,9,1,4.0


In [8]:
alphabets = list(train_cw_df.alphabet.unique())
print("Alphabets: {0}".format(alphabets))
main_font_faces = list(train_cw_df.font_face.unique())
print("Main font faces: {0}".format(main_font_faces))
additional_font_faces = list(test_control_df.font_face.unique())
print("Additional font faces: {0}".format(additional_font_faces))
font_sizes = list(train_cw_df.font_size.unique())
print("Font sizes: {0}".format(font_sizes))
font_versions = list(train_cw_df.font_version.unique())
print("Font versions: {0}".format(font_versions))

Alphabets: ['basic_latin', 'russian', 'greek', 'japanese']
Main font faces: ['Courier', 'Geneva', 'Georgia', 'Helvetica', 'Lucida Grande', 'Times New Roman', 'Verdana']
Additional font faces: ['Arial', 'Brush Script MT', 'Lucida Console', 'Wingdings']
Font sizes: [9, 11, 12, 14, 17, 20]
Font versions: [1, 2, 3, 4]


## 1. Оценка ширины текста для известных шрифтов

### 1.1. Простая модель

In [9]:
def add_clusters(df, **parameters):
    from sklearn.cluster import KMeans

    def get_clusters(group_df):
        return KMeans(**parameters).fit(group_df[["width"]]).labels_

    def add_clusters_to_font(group_df):
        result_df = group_df[["char_id", "char", "width"]].assign(cluster=get_clusters(group_df))
        result_df.replace({"cluster": result_df.groupby("cluster").width.max().to_dict()}, inplace=True)
        return result_df

    return df.groupby(["alphabet", "font_face", "font_size", "font_version"])\
        .apply(add_clusters_to_font).reset_index()

In [10]:
class SimpleMainModel:
    from sklearn.cluster import KMeans

    def __init__(self, **parameters):
        self.parameters = parameters
        self.df = None

    def fit(self, df):
        self.df = df.groupby(["alphabet", "font_face", "font_size", "font_version"])\
            .apply(self._add_clusters_to_font).reset_index(level=4, drop=True).set_index("char", append=True)

        return self

    def predict(self, df, target):
        df = df.assign(prediction=lambda x: np.nan)
        for (i, d) in df.iterrows():
            abvalues = self.df.loc[(d.alphabet, d.font_face, d.font_size, d.font_version)]
            max_cluster = abvalues.cluster.max()
            df.loc[i, "prediction"] = np.array([
                (abvalues.loc[c].cluster if c in abvalues.index else max_cluster)
                for c in d[target]
            ]).sum()
        df.prediction = df.prediction.astype(int)

        return df

    def _add_clusters_to_font(self, df):
        result_df = df[["char_id", "char", "width"]].assign(cluster=self._get_clusters(df))
        result_df.replace({"cluster": result_df.groupby("cluster").width.max().to_dict()}, inplace=True)

        return result_df

    def _get_clusters(self, df):
        return self.KMeans(**self.parameters).fit(df[["width"]]).labels_

In [11]:
%%capture --no-display

sm_model = SimpleMainModel(n_clusters=N_CLUSTERS, random_state=42)
sm_model.fit(train_cw_df)

IndexError: Too many levels: Index has only 1 level, not 5

In [12]:
trtr_control_df, tstr_control_df = train_test_split(train_control_df, train_size=2.0/3.0)

In [13]:
trtr_control_df.head()

Unnamed: 0,text,width,alphabet,font_face,font_size,font_version
0,Skill city myself idea.,207,basic_latin,Courier,9,1
1,Dog reduce hour.,144,basic_latin,Courier,9,1
2,Window career case south team throw.,324,basic_latin,Courier,9,1
3,Pretty show college glass start sort perhaps key.,441,basic_latin,Courier,9,1
4,Region goal nothing.,180,basic_latin,Courier,9,1


In [14]:
tstr_control_df.head()

Unnamed: 0,text,width,alphabet,font_face,font_size,font_version
0,教授午前ダッシュストレージスキーム。,216,japanese,Courier,9,1
1,衝突叔父楽しんでジャム持つ電池コミュニケーションアクセルペダル。,386,japanese,Courier,9,1
2,サンプル月意図人形主婦持っていました中世画面。,303,japanese,Courier,9,1
3,試してみる呼ぶクロスコピーは楽しんで評議会。,272,japanese,Courier,9,1
4,サラダヒット拡張バスケット。,159,japanese,Courier,9,1


In [15]:
sm_model_df = sm_model.predict(tstr_control_df, "text")
sm_model_df.head()

Unnamed: 0,text,width,alphabet,font_face,font_size,font_version,prediction
0,教授午前ダッシュストレージスキーム。,216,japanese,Courier,9,1,233
1,衝突叔父楽しんでジャム持つ電池コミュニケーションアクセルペダル。,386,japanese,Courier,9,1,411
2,サンプル月意図人形主婦持っていました中世画面。,303,japanese,Courier,9,1,317
3,試してみる呼ぶクロスコピーは楽しんで評議会。,272,japanese,Courier,9,1,289
4,サラダヒット拡張バスケット。,159,japanese,Courier,9,1,171


In [16]:
draw_relative_error(sm_model_df, "text")

### 1.2 Скорректированная модель

In [17]:
class CorrectedMainModel:
    from sklearn.linear_model import LinearRegression

    features = ["prediction", "text_size", "font_size", "font_version"]

    def __init__(self, sm_model, **parameters):
        self.sm_model = sm_model
        self.parameters = parameters
        self.corrector_models = {}

    def fit(self, df, target):
        sm_control_df = self.sm_model.predict(df, target)
        sm_control_df["text_size"] = sm_control_df.text.str.len()
        sm_control_df["error"] = (sm_control_df.prediction - sm_control_df.width) / sm_control_df.text_size
        self.corrector_models = {
            p: self._fit_corrector_model(
                sm_control_df[(sm_control_df.alphabet == p[0])&(sm_control_df.font_face == p[1])]
            )
            for p in sm_control_df.groupby(["alphabet", "font_face"]).count().index
        }

        return self

    def predict(self, df, target):
        sm_control_df = self.sm_model.predict(df, target)
        sm_control_df["text_size"] = sm_control_df.text.str.len()
        for (i, d) in sm_control_df.iterrows():
            corrector_model = self.corrector_models[(d.alphabet, d.font_face)]
            prediction = corrector_model.predict(sm_control_df[self.features].loc[i:i+1])[0]
            sm_control_df.loc[i, "prediction"] = sm_control_df.loc[i, "prediction"] - int(
                sm_control_df.loc[i, "text_size"] * prediction
            )
        sm_control_df.drop(columns=["text_size"], inplace=True)

        return sm_control_df

    def _fit_corrector_model(self, df):
        X = df[self.features]
        y = df.error
        return self.LinearRegression(**self.parameters).fit(X, y)

In [18]:
cm_model = CorrectedMainModel(sm_model)
cm_model.fit(trtr_control_df, "text")

<__main__.CorrectedMainModel at 0x21141388>

In [19]:
cm_model_df = cm_model.predict(tstr_control_df, "text")
cm_model_df.head()

Unnamed: 0,text,width,alphabet,font_face,font_size,font_version,prediction
0,教授午前ダッシュストレージスキーム。,216,japanese,Courier,9,1,220
1,衝突叔父楽しんでジャム持つ電池コミュニケーションアクセルペダル。,386,japanese,Courier,9,1,376
2,サンプル月意図人形主婦持っていました中世画面。,303,japanese,Courier,9,1,301
3,試してみる呼ぶクロスコピーは楽しんで評議会。,272,japanese,Courier,9,1,272
4,サラダヒット拡張バスケット。,159,japanese,Courier,9,1,162


In [20]:
draw_relative_error(cm_model_df, "text")

## 2. Оценка ширины текста для неизвестных шрифтов, но для известных языков

In [21]:
class SimpleOpenModel:
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.linear_model import LinearRegression

    features = ["font_size", "font_version"]

    def __init__(self, degree=3, **parameters):
        self.degree = degree
        self.parameters = parameters
        self.char_models = {}

    def fit(self, df):
        self.char_models = {
            p: self._fit_char_model(
                df[(df.alphabet == p[0])&(df.char == p[1])]
            )
            for p in df.groupby(["alphabet", "char"]).count().index
        }

        return self

    def _fit_char_model(self, df):
        X = self.PolynomialFeatures(degree=self.degree).fit_transform(df[self.features])
        y = df.width
        return self.LinearRegression(**self.parameters).fit(X, y)

    def predict_char(self, char, alphabet, features):
        X = self.PolynomialFeatures(degree=self.degree).fit_transform([features])
        return self.char_models[(alphabet, char)].predict(X)[0]

    def predict_string(self, string, alphabet, features):
        return int(np.round(np.array([self.predict_char(c, alphabet, features) for c in string]).sum()))

    def predict(self, df, target, alphabet):
        df = df.assign(prediction=lambda x: np.nan)
        for (i, d) in df.iterrows():
            df.loc[i, "prediction"] = self.predict_string(d[target], d[alphabet], list(d[self.features]))

        return df

In [22]:
so_model = SimpleOpenModel()
so_model.fit(train_cw_df)

<__main__.SimpleOpenModel at 0x212e4a88>

In [23]:
so_model_df = so_model.predict(test_control_df, "text", "alphabet")
so_model_df.head()

Unnamed: 0,text,width,alphabet,font_face,font_size,font_version,prediction
0,偏差意図トス。,104,japanese,Arial,9,1,101.0
1,野球文言リニア雪出演者バーゲン狐。,255,japanese,Arial,9,1,247.0
2,画面溝学生隠す連続ノート。,198,japanese,Arial,9,1,193.0
3,教授午前ダッシュストレージスキーム。,247,japanese,Arial,9,1,239.0
4,衝突叔父楽しんでジャム持つ電池コミュニケーションアクセルペダル。,441,japanese,Arial,9,1,427.0


In [24]:
draw_relative_error(so_model_df, "text")