# Тренировка модели и вспомогательных коэффициентов

## Подготовка

In [1]:
import pandas as pd
from sklearn.metrics import mean_squared_error as error_metric

from lets_plot import *
from lets_plot.mapping import as_discrete
LetsPlot.setup_html()

import os; import sys; sys.path.append(os.path.join(sys.path[0], ".."))
from utils import misc as um
from utils import font as ufont
from utils import clustering_model as ucm
from utils import transform_data as utd

In [2]:
# Number of clusters used when fitting the final clustering model.
N_CLUSTERS = 4
# Inclusive bounds of the cluster counts scored in the cluster-count sweep.
N_CLUSTERS_MIN, N_CLUSTERS_MAX = 1, 10

In [3]:
def get_coefficients_s(df, target_col, *,
                       additive=True,
                       predicted_col="predicted_width", coeff_col="coeff", width_col="width", size_col="symbols_count"):
    """Compute one correction coefficient per distinct value of ``target_col``.

    The input frame is never modified; all intermediate columns live on a copy.

    Args:
        df: Frame holding ``target_col``, ``width_col``, ``predicted_col`` and
            ``size_col``.
        target_col: Column whose distinct values become the index of the result.
        additive: If true, the coefficient is the group mean of
            ``(width - predicted) / size``. Otherwise it is the ratio
            ``sum(width * predicted / size**2) / sum(predicted**2 / size**2)``
            computed per group.
        predicted_col: Column with the predicted widths.
        coeff_col: Name given to the returned Series.
        width_col: Column with the actual widths.
        size_col: Column with the number of symbols in each row.

    Returns:
        A Series named ``coeff_col`` indexed by the values of ``target_col``.
    """
    if additive:
        per_symbol_residual = (df[width_col] - df[predicted_col]) / df[size_col]
        # ``assign`` works on a copy, so the caller's frame does not gain a column.
        return df.assign(**{coeff_col: per_symbol_residual}).groupby(target_col)[coeff_col].mean()

    terms_df = df.assign(
        numerator=df[width_col] * df[predicted_col] / df[size_col].pow(2),
        denominator=df[predicted_col].pow(2) / df[size_col].pow(2),
    )
    sums_df = terms_df.groupby(target_col)[["numerator", "denominator"]].sum()
    return (sums_df["numerator"] / sums_df["denominator"]).rename(coeff_col)

In [4]:
# Load the per-character width table and pass it straight through the font filter.
# NOTE(review): presumably the "size"/"face" filters pin a single font size and face — confirm in utils.transform_data.
char_widths_df = utd.filter_by_font(
    um.read_data("../data/char_widths.csv", monospaced=False),
    filters=["size", "face"],
)

print(char_widths_df.shape)
char_widths_df

(8240, 6)


Unnamed: 0,char_id,char,alphabet,subset,font_family,width
0,32,,latin,symbols,Geneva,5
1,33,!,latin,symbols,Geneva,6
2,34,"""",latin,symbols,Geneva,7
3,35,#,latin,symbols,Geneva,11
4,36,$,latin,symbols,Geneva,11
...,...,...,...,...,...,...
8235,1323,ԫ,cyrillic,supplement,Rockwell,14
8236,1324,Ԭ,cyrillic,supplement,Rockwell,14
8237,1325,ԭ,cyrillic,supplement,Rockwell,14
8238,1326,Ԯ,cyrillic,supplement,Rockwell,14


In [5]:
# Load the control data set (texts together with their width and font columns).
control_df = um.read_data("../data/control.csv", monospaced=False)

# Split into train and test parts using the "text" column; the split policy is defined in utils.transform_data.
train_control_df, test_control_df = utd.train_test_split_by_column(control_df, "text")

# The shape and the preview below describe the full, unsplit frame.
print(control_df.shape)
control_df

(242160, 8)


Unnamed: 0,text,width,alphabet,locale,font_family,font_size,font_face,symbols_count
0,x,5,latin,en_US,Geneva,9,normal,1
1,y,5,latin,en_US,Geneva,9,normal,1
2,z,5,latin,en_US,Geneva,9,normal,1
3,0,7,latin,en_US,Geneva,9,normal,1
4,1,7,latin,en_US,Geneva,9,normal,1
...,...,...,...,...,...,...,...,...
242155,Дыхание означать горький выбирать,831,cyrillic,ru_RU,Rockwell,20,bold+italic,33
242156,Правление рот человечек мелькнуть,831,cyrillic,ru_RU,Rockwell,20,bold+italic,33
242157,Металл назначить ложиться подземный,885,cyrillic,ru_RU,Rockwell,20,bold+italic,35
242158,Металл поставить провинция разводить,912,cyrillic,ru_RU,Rockwell,20,bold+italic,36


In [6]:
# Keep only the distinct (text, alphabet) pairs from the texts data set.
texts_df = um.read_data("../data/texts.csv", monospaced=False)
texts_df = texts_df[["text", "alphabet"]].drop_duplicates()
print(texts_df.shape)
texts_df

(1033, 2)


Unnamed: 0,text,alphabet
0,x,latin
1,y,latin
2,z,latin
3,0,latin
4,1,latin
...,...,...
258727,Столетие наступать порода пропасть ярко совеща...,cyrillic
258728,Интернет вперед зима мелькнуть стакан крыса ср...,cyrillic
258729,Сбросить витрина холодно настать тюрьма металл...,cyrillic
258730,Рот человечек мелькнуть единый набор вариант в...,cyrillic


In [7]:
# Prepare character data separately for every alphabet (each with its own widths
# and texts), tag the rows with the alphabet, and stack the results.
char_df = pd.concat([
    ucm.prepare_char_data(
        char_widths_df.loc[char_widths_df.alphabet == alphabet],
        texts_df.loc[texts_df.alphabet == alphabet, "text"],
    ).assign(alphabet=alphabet)
    for alphabet in char_widths_df.alphabet.unique()
])
print(char_df.shape)
char_df.head()

(824, 4)


Unnamed: 0_level_0,width,weight,order,alphabet
char,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
,5.1,813,4,latin
!,5.9,1,21,latin
"""",7.7,1,67,latin
#,11.7,1,228,latin
$,10.6,1,174,latin


## Вычисление оптимального числа кластеров

In [8]:
# Score every candidate cluster count on the filtered training data.
clusters_train_df = utd.filter_by_font(train_control_df, filters=["size", "face"])
cluster_scores = []
for n_clusters in range(N_CLUSTERS_MIN, N_CLUSTERS_MAX + 1):
    model = ucm.ClusteringModel(clustering_col="width", n_clusters=n_clusters, random_state=42).fit(char_df)
    cluster_scores.append({
        "n_clusters": n_clusters,
        "score": error_metric(clusters_train_df.width, model.predict(clusters_train_df.text)),
    })
# Build the frame once instead of concatenating onto an empty DataFrame in the loop:
# this keeps numeric dtypes and avoids repeated copying.
clusters_df = pd.DataFrame(cluster_scores, columns=["n_clusters", "score"])
print(clusters_df.shape)
clusters_df

(10, 2)


Unnamed: 0,n_clusters,score
0,1,984.64603
1,2,895.471332
2,3,265.220188
3,4,186.213997
4,5,185.337012
5,6,185.732167
6,7,187.630148
7,8,181.318439
8,9,180.030417
9,10,180.464334


In [9]:
# Bar chart of the score obtained for each candidate cluster count.
(
    ggplot(clusters_df)
    + geom_bar(aes(x="n_clusters", y="score"), stat='identity')
    + scale_x_continuous(breaks=list(range(N_CLUSTERS_MIN, N_CLUSTERS_MAX + 1)))
)

## Тренировка модели

In [10]:
# Fit the final model on char_df with N_CLUSTERS clusters; random_state is fixed at 42, the same seed the sweep uses.
model = ucm.ClusteringModel(clustering_col="width", n_clusters=N_CLUSTERS, random_state=42).fit(char_df)

In [11]:
# Collect the characters assigned to each cluster and print them as a Kotlin map of char lists.
# NOTE(review): the recorded output contains ''' and '\' — the quote and backslash
# characters look unescaped; confirm um.to_kotlin_list handles them before reusing the output.
clustering_s = model.predictor.cluster
clustering_d = (
    clustering_s.to_frame()
    .reset_index()
    .groupby("cluster")
    .char
    .agg(list)
    .to_dict()
)
kotlin_lists = {
    k: um.to_kotlin_list(v, new_lines=False, quotation_mark='\'')
    for k, v in clustering_d.items()
}
print(um.to_kotlin_map(kotlin_lists, replace_key_str=False, replace_value_str=False))

mapOf(
0 to listOf(
' ', 'ґ', '҉', '҈', 'º', '¹', '¸', '·', '´', '²', '°', 'ª', '¨', '¦', 'ј', 'ї', '¡', 'і', '}', '|', '{', 'ѓ', 't', 'r', 'l', 'j', 'i', 'f', 'Ì', '`', 'Í', 'Ï', 'Ĩ', 'ĩ', 'Ī', 'ī', 'Ĭ', 'ĭ', 'Į', 'į', 'İ', 'ı', 'ĵ', 'ĺ', 'ļ', 'ľ', 'ŀ', 'ł', 'ŕ', 'ŗ', 'ř', 'ţ', 'ť', 'ï', 'î', 'í', 'ì', 'ŧ', 'ſ', 'Î', ']', '³', '/', 'ț', ':', 'I', '.', '-', ',', 'Ї', 'ȷ', 'І', '*', 'J', 'г', '(', ''', '!', ';', '[', '"', ')', '\'
),
1 to listOf(
'ū', 'ũ', 'Ť', 'Ţ', 'Ј', 'Ѕ', 'Ŧ', 'Ӏ', 'Ў', 'Ќ', 'л', 'ů', 'ҳ', 'ž', 'Ž', 'ż', 'Ż', 'ź', 'ŭ', 'Ź', 'ŷ', 'Ŷ', 'ҹ', 'ų', 'һ', 'ű', 'Ÿ', 'š', 'ś', 'ş', 'ɉ', 'ɂ', 'ȿ', 'ɍ', 'ĳ', 'Ĵ', 'ķ', 'ĸ', 'Ĺ', 'Ļ', 'Ľ', 'Ŀ', 'Ł', 'ȶ', 'Ɉ', 'ӏ', 'ņ', 'Ş', 'ŝ', 'Ŝ', 'ұ', 'Ś', 'ȳ', 'Š', 'ȴ', 'ő', 'ŏ', 'ō', 'ŋ', 'ŉ', 'ň', 'Ѓ', 'ү', 'ƒ', 'ҩ', 'е', 'я', 'ё', 'ђ', 'ǿ', 'є', 'ѕ', 'д', 'в', 'б', 'а', 'Ȉ', 'ȉ', 'ћ', 'ќ', 'э', 'з', 'ь', 'ъ', 'н', 'о', 'п', 'р', 'с', 'ǰ', 'к', 'Ȋ', 'т', 'й', 'и', 'х', 'ǻ', 'ц', 'ч', 'ǩ', 'у', 'ѝ', 'ȋ', 'ў', 'ƨ', 'Ґ', 'ȝ', 'Ғ', 'ғ', 'ҙ', 

In [12]:
# One width per cluster, ordered from the narrowest to the widest, printed as a Kotlin map.
cluster_widths_s = (
    model.predictor[["cluster", "cluster_width"]]
    .drop_duplicates()
    .set_index("cluster")
    .cluster_width
    .sort_values()
)
print(um.to_kotlin_map(cluster_widths_s.to_dict(), replace_key_str=False))

mapOf(
0 to 5.267064129494624,
1 to 10.306123828026017,
2 to 13.44090338770389,
3 to 16.320833333333333
)


In [13]:
# Show the fitted model's extra_symbol_width value.
# NOTE(review): presumably the width used for characters outside the clustered set — confirm in utils.clustering_model.
model.extra_symbol_width

7.663869264009877

## Вычисление вспомогательных коэффициентов шрифтов

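Формула для одного символа (ширина текста — сумма по всем его символам):

```
(ширина кластера символа + поправка на шрифт + поправка на стиль) * поправка на размер * (размер шрифта / базовый размер шрифта)
```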

In [14]:
# Font-family correction, estimated on the training data after the "size" and "face" filters.
# NOTE(review): the next line writes a column into the frame returned by filter_by_font;
# if that helper returns a view rather than a copy, pandas may warn — confirm.
f1_family_train_df = utd.filter_by_font(train_control_df, filters=["size", "face"])
f1_family_train_df["predicted_width"] = model.predict(f1_family_train_df.text)
# Default additive mode: mean of (width - predicted_width) / symbols_count per font_family.
f1_family = get_coefficients_s(f1_family_train_df, "font_family")
print(um.to_kotlin_map(f1_family.to_dict()))

mapOf(
"Arial" to 0.2669650431210687,
"Calibri" to -0.5777025194123451,
"Garamond" to -1.0549392628444865,
"Geneva" to 0.2669650431210687,
"Georgia" to 0.245478718732333,
"Helvetica" to 0.2669650431210687,
"Lucida Grande" to 0.2669650431210687,
"Rockwell" to 0.5962965544448325,
"Times New Roman" to -1.0215670253115303,
"Verdana" to 1.4833983517230755
)


In [15]:
# Font-face correction: what remains after the model prediction plus the per-family
# correction, estimated on training data filtered by "size" only.
f1_face_train_df = utd.filter_by_font(train_control_df, filters=["size"])
f1_face_train_df["predicted_width"] = (
    model.predict(f1_face_train_df.text)
    # Explicit lookup of each row's family coefficient. Unlike Series.replace, map
    # always yields a float column and does not silently keep unmapped names as strings.
    + f1_face_train_df.symbols_count * f1_face_train_df.font_family.map(f1_family)
).round().astype(int)
f1_face = get_coefficients_s(f1_face_train_df, "font_face")
print(um.to_kotlin_map(f1_face.to_dict()))

mapOf(
"bold" to 0.6938603241272686,
"bold+italic" to 0.9202485346162085,
"italic" to 0.18135529718278232,
"normal" to 0.003036435109108489
)


In [16]:
# Font-size correction, estimated on the whole training set after applying the
# family and face corrections.
f1_size_train_df = train_control_df.copy()
f1_size_train_df["predicted_width"] = (
    model.predict(f1_size_train_df.text)
    + f1_size_train_df.symbols_count * (
        # Explicit coefficient lookups. Unlike Series.replace, map always yields
        # float columns and does not silently keep unmapped names as strings.
        f1_size_train_df.font_family.map(f1_family)
        + f1_size_train_df.font_face.map(f1_face)
    )
).round().astype(int)
# Multiplicative (ratio) coefficient per font size.
f1_sizes = get_coefficients_s(f1_size_train_df, "font_size", additive=False)
# Rescale each per-size ratio by BASIC_FONT_SIZE / font_size and average into a single factor.
f1_size = (f1_sizes * ufont.BASIC_FONT_SIZE / pd.Series(ufont.FONT_SIZES, index=ufont.FONT_SIZES)).mean()
f1_size

0.9830243718466446

## Оценка предсказаний

In [17]:
# Apply the complete formula to the held-out data and compute absolute and per-symbol errors.
f1_test_df = test_control_df.copy()
# Explicit coefficient lookups. Unlike Series.replace, map always yields float
# columns and does not silently keep unmapped names as strings.
family_correction = f1_test_df.font_family.map(f1_family)
face_correction = f1_test_df.font_face.map(f1_face)
f1_test_df["predicted_width"] = (
    (
        model.predict(f1_test_df.text)
        + f1_test_df.symbols_count * (family_correction + face_correction)
    ) * f1_size * f1_test_df.font_size / ufont.BASIC_FONT_SIZE
).round().astype(int)
f1_test_df["error"] = f1_test_df.width - f1_test_df.predicted_width
f1_test_df["mean_error"] = f1_test_df.error / f1_test_df.symbols_count
print(f1_test_df.shape)
f1_test_df

(63840, 11)


Unnamed: 0,text,width,alphabet,locale,font_family,font_size,font_face,symbols_count,predicted_width,error,mean_error
0,y,5,latin,en_US,Geneva,9,normal,1,6,-1,-1.000000
1,1,7,latin,en_US,Geneva,9,normal,1,6,1,1.000000
2,-1,11,latin,en_US,Geneva,9,normal,2,10,1,0.500000
3,-2,11,latin,en_US,Geneva,9,normal,2,10,1,0.500000
4,-8,11,latin,en_US,Geneva,9,normal,2,10,1,0.500000
...,...,...,...,...,...,...,...,...,...,...,...
63835,Витрина холодно настать тюрьма,750,cyrillic,ru_RU,Rockwell,20,bold+italic,30,489,261,8.700000
63836,Самостоятельно спорт результат,770,cyrillic,ru_RU,Rockwell,20,bold+italic,30,492,278,9.266667
63837,Потрясти упор угроза прошептать,777,cyrillic,ru_RU,Rockwell,20,bold+italic,31,496,281,9.064516
63838,Зиновьев Варфоломей Афанасьевич,797,cyrillic,ru_RU,Rockwell,20,bold+italic,31,522,275,8.870968


In [18]:
# Actual vs. predicted width per font family; the magenta line has slope 1.
(
    ggplot(f1_test_df)
    + geom_point(aes("width", "predicted_width", color="alphabet"), alpha=.3)
    + geom_abline(slope=1, color="magenta")
    + facet_wrap(facets="font_family", ncol=3)
)

In [19]:
# Density of the per-symbol error, one facet per font family.
(
    ggplot(f1_test_df)
    + geom_density(aes("mean_error"))
    + facet_wrap(facets="font_family", ncol=3)
)

In [20]:
# Q-Q plot of the per-symbol error with a magenta reference line, one facet per font family.
(
    ggplot(f1_test_df)
    + geom_qq(aes(sample="mean_error"))
    + geom_qq_line(aes(sample="mean_error"), color="magenta")
    + facet_wrap(facets="font_family", ncol=3)
)