# Модель предсказаний для базового шрифта

In [1]:
import numpy as np
import pandas as pd

from lets_plot import *
LetsPlot.setup_html()

import os; import sys; sys.path.append(os.path.join(sys.path[0], ".."))
from util import util

In [2]:
N_CLUSTERS = 3
BASIC_FONT = util.Font("Lucida Grande", 14, "normal")

In [3]:
def get_char_widths_series(df, font):
    return util.filter_by_font(df, font).groupby("char").median().width.astype(int)

In [4]:
def show_pair_of_plots(p1, p2, w=480, h=360):
    bunch = GGBunch()
    bunch.add_plot(p1, 0, 0, w, h)
    bunch.add_plot(p2, w, 0, w, h)
    bunch.show()

In [5]:
char_widths_df = util.get_df("../data/full/char_widths.csv", "all")
char_widths_df.head()

Unnamed: 0,char_id,char,alphabet,font_family,font_size,font_face,width
0,65,A,basic_latin,Courier,9,normal,8
1,66,B,basic_latin,Courier,9,normal,8
2,67,C,basic_latin,Courier,9,normal,8
3,68,D,basic_latin,Courier,9,normal,8
4,69,E,basic_latin,Courier,9,normal,8


In [6]:
control_df = util.get_df("../data/full/control.csv", "all")
control_df.head()

Unnamed: 0,text,width,alphabet,font_family,font_size,font_face,symbols_count
0,Да.,24,russian,Courier,9,normal,3.0
1,Мимо.,40,russian,Courier,9,normal,5.0
2,Один.,40,russian,Courier,9,normal,5.0
3,Идея.,40,russian,Courier,9,normal,5.0
4,Изба.,40,russian,Courier,9,normal,5.0


In [7]:
basic_char_widths_s = get_char_widths_series(char_widths_df, BASIC_FONT)
basic_char_widths_s.head()

char
      5
!     6
"     7
#    11
$    11
Name: width, dtype: int32

In [8]:
def occurrences_number(series, symbol):
    try:
        s = series.str.count(symbol).sum()
        return s - series.size if symbol in ['$', '^'] else s
    except:
        return 0

texts_s = control_df.text.drop_duplicates().str[0:-1]
basic_char_weights_s = pd.Series({c: occurrences_number(texts_s, c)
                                  for c in basic_char_widths_s.index}, name="weight") + 1
basic_char_weights_s.head()

     9045
!       1
"       1
#       1
$       1
Name: weight, dtype: int64

In [9]:
basic_char_data_df = pd.concat([basic_char_widths_s, basic_char_weights_s], axis="columns")
basic_char_data_df.head()

Unnamed: 0,width,weight
,5,9045
!,6,1
"""",7,1
#,11,1
$,11,1


In [10]:
class ClusteringModel:
    from sklearn.cluster import KMeans

    INDEX_NAME = "char"
    WIDTH_NAME = "width"
    WEIGHT_NAME = "weight"
    CLUSTER_NAME = "cluster"
    CLUSTER_WIDTH_NAME = "cluster_width"

    predictor = None
    extra_symbol_width = None

    def __init__(self,
                 calc_cluster_width=None,
                 allow_extra_symbols=True,
                 index_name=INDEX_NAME,
                 width_name=WIDTH_NAME,
                 weight_name=WEIGHT_NAME,
                 cluster_name=CLUSTER_NAME,
                 cluster_width_name=CLUSTER_WIDTH_NAME,
                 **kmean_parameters):
        self.calc_cluster_width = calc_cluster_width
        self.allow_extra_symbols = allow_extra_symbols
        self.kmean_parameters = kmean_parameters
        self.index_name = index_name
        self.width_name = width_name
        self.weight_name = weight_name
        self.cluster_name = cluster_name
        self.cluster_width_name = cluster_width_name

    def fit(self, char_data):
        predictor_df = char_data[[self.width_name, self.weight_name]]
        predictor_df.index.name = self.index_name
        # Set clusters
        predictor_df[self.cluster_name] = self.KMeans(**self.kmean_parameters).fit(predictor_df[[self.width_name]]).labels_
        # Set cluster widths
        cluster_widths = predictor_df.groupby(self.cluster_name).apply(self.calc_cluster_width or self._calc_cluster_width)
        predictor_df[self.cluster_width_name] = predictor_df[self.cluster_name].replace(cluster_widths)
        # Sort clusters
        predictor_df.sort_values(by=self.cluster_width_name, inplace=True)
        predictor_df.cluster.replace(
            {cluster_id: i for i, cluster_id in enumerate(predictor_df[self.cluster_name].unique())},
            inplace=True
        )
        self.predictor = predictor_df
        self.extra_symbol_width = (self.calc_cluster_width or self._calc_cluster_width)(self.predictor)

        return self

    def predict(self, text, name=None):
        if isinstance(text, str):
            return round(np.sum([self._predict_char_width(c) for c in text]))
        elif isinstance(text, pd.core.series.Series):
            return self._predict_for_series(text, name if (isinstance(name, str)) else None)
        elif isinstance(text, pd.core.frame.DataFrame):
            return pd.concat([
                self._predict_for_series(text[column], name[column] if (isinstance(name, dict)) else None)
                for column in text.columns
            ], axis="columns")
        else:
            raise Exception("Bad type of input: {0}".format(type(text)))

    def _calc_cluster_width(self, r):
        return (r.width * r.weight).sum() / r.weight.sum()

    def _predict_char_width(self, c):
        try:
            return self.predictor.loc[c].cluster_width
        except KeyError as e:
            if self.allow_extra_symbols:
                return self.extra_symbol_width
            else:
                raise e

    def _predict_for_series(self, text_s, name=None):
        def split_string(s):
            return pd.Series([s[i:i+1] for i in range(len(s))])
        df = text_s.apply(split_string)
        splitted_df = df.replace(self.predictor.cluster_width).fillna(0)
        cols = splitted_df.columns
        result = splitted_df[cols].apply(pd.to_numeric, errors='coerce', axis=1)\
                                  .fillna(self.extra_symbol_width).sum(axis=1).round().astype(int)
        result.name = name or "predict_{0}".format(text_s.name)
        return result

In [11]:
clustering_model = ClusteringModel(
    calc_cluster_width=lambda r: r.width.mean(),
    n_clusters=N_CLUSTERS,
    random_state=42
)
clustering_model.fit(basic_char_data_df)
print("Простая модель")
print("Кластеры: {0}".format(clustering_model.predictor.cluster.to_dict()))
print("Веса кластеров: {0}".format(
    clustering_model.predictor[["cluster", "cluster_width"]]\
                    .drop_duplicates().set_index("cluster").cluster_width.to_dict())
)

Простая модель
Кластеры: {' ': 0, ';': 0, 'г': 0, 'I': 0, '}': 0, '|': 0, '{': 0, 't': 0, ':': 0, 'r': 0, '\\': 0, ']': 0, '^': 0, '`': 0, 'l': 0, 'j': 0, 'i': 0, '[': 0, 'к': 0, 'f': 0, '/': 0, '.': 0, '*': 0, ')': 0, ',': 0, '(': 0, "'": 0, '-': 0, '"': 0, 'т': 0, '!': 0, 'ц': 1, 'z': 1, 'y': 1, 'x': 1, 'v': 1, 'u': 1, 'ъ': 1, 's': 1, 'k': 1, 'q': 1, 'э': 1, 'p': 1, 'ь': 1, 'ч': 1, 'n': 1, 'р': 1, '~': 1, 'Б': 1, 'п': 1, 'о': 1, 'н': 1, 'л': 1, 'у': 1, 'х': 1, 'й': 1, 'и': 1, 'з': 1, 'с': 1, 'е': 1, 'в': 1, 'h': 1, 'а': 1, 'У': 1, 'Т': 1, 'Л': 1, 'К': 1, 'З': 1, 'Г': 1, 'д': 1, 'б': 1, 'o': 1, 'e': 1, 'F': 1, 'g': 1, '?': 1, '>': 1, '=': 1, '<': 1, '9': 1, '8': 1, '7': 1, '6': 1, '5': 1, '4': 1, '3': 1, '2': 1, '1': 1, '0': 1, '+': 1, '$': 1, '#': 1, 'J': 1, 'L': 1, 'я': 1, 'Y': 1, 'Z': 1, 'T': 1, '_': 1, 'a': 1, 'b': 1, 'c': 1, 'd': 1, 'R': 2, 'ж': 2, 'V': 2, 'A': 2, 'S': 2, 'w': 2, 'B': 2, 'U': 2, '@': 2, 'м': 2, 'А': 2, 'X': 2, 'C': 2, 'ю': 2, 'm': 2, 'ф': 2, '&': 2, 'ш': 2, 'щ': 

In [12]:
weighted_clustering_model = ClusteringModel(
    n_clusters=N_CLUSTERS,
    random_state=42
)
weighted_clustering_model.fit(basic_char_data_df)
print("Модель с весами")
print("Кластеры: {0}".format(weighted_clustering_model.predictor.cluster.to_dict()))
print("Веса кластеров: {0}".format(
    weighted_clustering_model.predictor[["cluster", "cluster_width"]]\
                             .drop_duplicates().set_index("cluster").cluster_width.to_dict())
)

Модель с весами
Кластеры: {' ': 0, ';': 0, 'г': 0, 'I': 0, '}': 0, '|': 0, '{': 0, 't': 0, ':': 0, 'r': 0, '\\': 0, ']': 0, '^': 0, '`': 0, 'l': 0, 'j': 0, 'i': 0, '[': 0, 'к': 0, 'f': 0, '/': 0, '.': 0, '*': 0, ')': 0, ',': 0, '(': 0, "'": 0, '-': 0, '"': 0, 'т': 0, '!': 0, 'ц': 1, 'z': 1, 'y': 1, 'x': 1, 'v': 1, 'u': 1, 'ъ': 1, 's': 1, 'k': 1, 'q': 1, 'э': 1, 'p': 1, 'ь': 1, 'ч': 1, 'n': 1, 'р': 1, '~': 1, 'Б': 1, 'п': 1, 'о': 1, 'н': 1, 'л': 1, 'у': 1, 'х': 1, 'й': 1, 'и': 1, 'з': 1, 'с': 1, 'е': 1, 'в': 1, 'h': 1, 'а': 1, 'У': 1, 'Т': 1, 'Л': 1, 'К': 1, 'З': 1, 'Г': 1, 'д': 1, 'б': 1, 'o': 1, 'e': 1, 'F': 1, 'g': 1, '?': 1, '>': 1, '=': 1, '<': 1, '9': 1, '8': 1, '7': 1, '6': 1, '5': 1, '4': 1, '3': 1, '2': 1, '1': 1, '0': 1, '+': 1, '$': 1, '#': 1, 'J': 1, 'L': 1, 'я': 1, 'Y': 1, 'Z': 1, 'T': 1, '_': 1, 'a': 1, 'b': 1, 'c': 1, 'd': 1, 'R': 2, 'ж': 2, 'V': 2, 'A': 2, 'S': 2, 'w': 2, 'B': 2, 'U': 2, '@': 2, 'м': 2, 'А': 2, 'X': 2, 'C': 2, 'ю': 2, 'm': 2, 'ф': 2, '&': 2, 'ш': 2, 'щ':

In [13]:
test_df = util.filter_by_font(control_df, BASIC_FONT).copy()
test_df["width_prediction"] = clustering_model.predict(test_df.text)
test_df["width_prediction_error"] = test_df.width_prediction - test_df.width
test_df["width_prediction_mean_error"] = test_df.width_prediction_error / test_df.symbols_count
test_df["width_weighted_prediction"] = weighted_clustering_model.predict(test_df.text)
test_df["width_weighted_prediction_error"] = test_df.width_weighted_prediction - test_df.width
test_df["width_weighted_prediction_mean_error"] = test_df.width_weighted_prediction_error / test_df.symbols_count
test_df.head()

Unnamed: 0,text,width,alphabet,symbols_count,width_prediction,width_prediction_error,width_prediction_mean_error,width_weighted_prediction,width_weighted_prediction_error,width_weighted_prediction_mean_error
0,Да.,28,russian,3.0,31,3,1.0,30,2,0.666667
1,Мимо.,54,russian,5.0,56,2,0.4,54,0,0.0
2,Один.,51,russian,5.0,52,1,0.2,51,0,0.0
3,Идея.,51,russian,5.0,52,1,0.2,51,0,0.0
4,Изба.,49,russian,5.0,52,3,0.6,51,2,0.4


In [14]:
p1 = ggplot(test_df) + \
    geom_point(aes("width", "width_prediction")) + \
    geom_abline(slope=1, color="magenta", size=1) + \
    scale_x_log10() + scale_y_log10() + \
    ggtitle("Предсказания простой модели")
p2 = ggplot(test_df) + \
    geom_point(aes("width", "width_weighted_prediction")) + \
    geom_abline(slope=1, color="magenta", size=1) + \
    scale_x_log10() + scale_y_log10() + \
    ggtitle("Предсказания модели с весами")

show_pair_of_plots(p1, p2)

In [15]:
p1 = ggplot(test_df) + \
    geom_point(aes("symbols_count", "width_prediction_error", color="alphabet")) + \
    ylim(-80, 80) + \
    ggtitle("Ошибки простой модели")
p2 = ggplot(test_df) + \
    ggplot(test_df) + \
    geom_point(aes("symbols_count", "width_weighted_prediction_error", color="alphabet")) + \
    ylim(-80, 80) + \
    ggtitle("Ошибки модели с весами")

show_pair_of_plots(p1, p2)

In [16]:
p1 = ggplot(test_df) + \
    geom_linerange(aes("symbols_count", "width_prediction_mean_error", color="alphabet"), stat="boxplot") + \
    ylim(-1.5, 1.5) + ylab("width_prediction_mean_error") + \
    ggtitle("Вариация относительной ошибки простой модели")
p2 = ggplot(test_df) + \
    geom_linerange(aes("symbols_count", "width_weighted_prediction_mean_error", color="alphabet"), stat="boxplot") + \
    ylim(-1.5, 1.5) + ylab("width_weighted_prediction_mean_error") + \
    ggtitle("Вариация относительной ошибки модели с весами")

show_pair_of_plots(p1, p2)

In [17]:
p1 = ggplot(test_df) + \
    geom_smooth(aes("symbols_count", "width_prediction_mean_error", color="alphabet"), method='loess') + \
    ylim(-.2, 1) + \
    ggtitle("Тренд относительной ошибки простой модели")
p2 = ggplot(test_df) + \
    geom_smooth(aes("symbols_count", "width_weighted_prediction_mean_error", color="alphabet"), method='loess') + \
    ylim(-.2, 1) + \
    ggtitle("Тренд относительной ошибки модели с весами")

show_pair_of_plots(p1, p2)