# Тренировка моделей

## Подготовка

In [1]:
import pandas as pd
from sklearn.metrics import mean_squared_error as error_metric

from lets_plot import *
LetsPlot.setup_html()

import os; import sys; sys.path.append(os.path.join(sys.path[0], ".."))
from utils import misc as um
from utils import models as ums
from utils import transform_data as utd

In [2]:
# Directory that holds the input CSV files and receives the saved model files.
DATA_DIR = "../data"

# Number of clusters used when the final model is trained.
N_CLUSTERS = 5

# Inclusive bounds of the cluster-count sweep used to pick N_CLUSTERS.
N_CLUSTERS_MIN = 1
N_CLUSTERS_MAX = 10

In [3]:
# Read the per-character width table and pass it through the "size"/"face"
# font filter in one step (the filtering rules themselves live in
# utils.transform_data).
char_widths_df = utd.filter_by_font(
    um.read_data(f"{DATA_DIR}/char_widths.csv", monospaced=False),
    filters=["size", "face"],
)

print(char_widths_df.shape)
char_widths_df

(8240, 6)


Unnamed: 0,char_id,char,alphabet,subset,font_family,width
0,32,,latin,symbols,Arial,5
1,32,,latin,symbols,Calibri,6
2,32,,latin,symbols,Garamond,6
3,32,,latin,symbols,Geneva,6
4,32,,latin,symbols,Georgia,6
...,...,...,...,...,...,...
8235,1327,ԯ,cyrillic,supplement,Helvetica,11
8236,1327,ԯ,cyrillic,supplement,Lucida Grande,11
8237,1327,ԯ,cyrillic,supplement,Rockwell,14
8238,1327,ԯ,cyrillic,supplement,Times New Roman,11


In [4]:
# Control measurements: the measured width of whole texts per font setup.
control_path = f"{DATA_DIR}/control.csv"
control_df = um.read_data(control_path, monospaced=False)

# The split is driven by the "text" column (see utils.transform_data for the
# exact splitting rule).
train_control_df, test_control_df = utd.train_test_split_by_column(control_df, "text")

print(control_df.shape)
control_df

(242160, 8)


Unnamed: 0,text,alphabet,locale,font_family,font_size,font_face,symbols_count,width
0,-0.09999999999999998,cyrillic,ru_RU,Arial,9,bold,20,133
1,-0.09999999999999998,cyrillic,ru_RU,Arial,9,bold+italic,20,133
2,-0.09999999999999998,cyrillic,ru_RU,Arial,9,italic,20,134
3,-0.09999999999999998,cyrillic,ru_RU,Arial,9,normal,20,133
4,-0.09999999999999998,cyrillic,ru_RU,Arial,11,bold,20,153
...,...,...,...,...,...,...,...,...
242155,Ягода,cyrillic,ru_RU,Verdana,16,normal,5,67
242156,Ягода,cyrillic,ru_RU,Verdana,20,bold,5,93
242157,Ягода,cyrillic,ru_RU,Verdana,20,bold+italic,5,93
242158,Ягода,cyrillic,ru_RU,Verdana,20,italic,5,83


In [5]:
# Keep one row per distinct (text, alphabet) pair from the texts table.
texts_raw_df = um.read_data(f"{DATA_DIR}/texts.csv", monospaced=False)
texts_df = texts_raw_df[["text", "alphabet"]].drop_duplicates()
print(texts_df.shape)
texts_df

(1033, 2)


Unnamed: 0,text,alphabet
0,-0.04999999999999993,cyrillic
240,-0.04999999999999993,latin
1200,-0.09999999999999998,cyrillic
1440,-0.09999999999999998,latin
2400,-0.1499999999999999,cyrillic
...,...,...
317760,Ученый обида коричневый другой устройство изме...,cyrillic
318000,Чем остановить освободить мотоцикл помолчать а...,cyrillic
318240,Число ярко возможно прошептать,cyrillic
318480,Чувство блин карандаш легко самостоятельно куз...,cyrillic


In [6]:
# Prepare the character data separately for each alphabet, tag every piece
# with the alphabet it came from, and stack the pieces into one frame.
per_alphabet_frames = []
for alphabet_name in char_widths_df["alphabet"].unique():
    alphabet_widths = char_widths_df[char_widths_df["alphabet"] == alphabet_name]
    alphabet_texts = texts_df[texts_df["alphabet"] == alphabet_name]["text"]
    per_alphabet_frames.append(
        ums.prepare_char_data(alphabet_widths, alphabet_texts).assign(alphabet=alphabet_name)
    )
char_df = pd.concat(per_alphabet_frames)
print(char_df.shape)
char_df.head()

(824, 4)


Unnamed: 0_level_0,width,weight,order,alphabet
char,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
,5.8,813,4,latin
!,6.9,1,32,latin
"""",8.5,1,62,latin
#,14.4,1,404,latin
$,11.5,1,176,latin


## Вычисление оптимального числа кластеров

In [7]:
# Score a clustering model for every candidate cluster count in
# [N_CLUSTERS_MIN, N_CLUSTERS_MAX]. The training control rows go through the
# same "size"/"face" font filter that was applied to char_widths_df.
clusters_train_df = utd.filter_by_font(train_control_df, filters=["size", "face"])

# Collect one record per cluster count and build the DataFrame once at the end.
# Growing a frame with pd.concat inside the loop copies it on every iteration,
# and starting from an empty frame can leave n_clusters with object dtype
# (and raises a FutureWarning about empty entries on recent pandas).
cluster_scores = []
for n_clusters in range(N_CLUSTERS_MIN, N_CLUSTERS_MAX + 1):
    model = ums.ClusteringModel(
        calc_cluster_size=lambda r: r.width.mean(),
        clustering_col="width",
        n_clusters=n_clusters, random_state=42
    ).fit(char_df)
    cluster_scores.append({
        "n_clusters": n_clusters,
        # error_metric is sklearn's mean_squared_error: lower is better.
        "score": error_metric(clusters_train_df.width, model.predict(clusters_train_df.text)),
    })
# Explicit columns keep the schema stable even if the sweep range is empty.
clusters_df = pd.DataFrame(cluster_scores, columns=["n_clusters", "score"])
print(clusters_df.shape)
clusters_df



(10, 2)


Unnamed: 0,n_clusters,score
0,1,2514.423735
1,2,431.701297
2,3,411.835409
3,4,250.158366
4,5,202.029961
5,6,209.39131
6,7,206.768482
7,8,191.731647
8,9,184.27847
9,10,179.416472


In [8]:
# Bar chart of the score for each cluster count, with one x-axis tick per
# candidate value.
cluster_count_breaks = list(range(N_CLUSTERS_MIN, N_CLUSTERS_MAX + 1))
(
    ggplot(clusters_df)
    + geom_bar(aes(x="n_clusters", y="score"), stat='identity')
    + scale_x_continuous(breaks=cluster_count_breaks)
)

## Тренировка и сохранение модели

In [9]:
# Locations of the two files the full model is saved to and reloaded from.
model_csv_path = f"{DATA_DIR}/model.csv"
model_json_path = f"{DATA_DIR}/model.json"

# Cluster the characters by width using the chosen number of clusters.
clustering_model = ums.ClusteringModel(
    calc_cluster_size=lambda r: r.width.mean(),
    clustering_col="width",
    n_clusters=N_CLUSTERS,
    random_state=42,
).fit(char_df)

# Fit the full model on the training control data and save it, then reload it
# from disk so the rest of the notebook works with the saved artefact.
ums.FullModel(clustering_model=clustering_model).fit(train_control_df).save(model_csv_path, model_json_path)
model = ums.FullModel.load(model_csv_path, model_json_path)
model



Clusters:

mapOf(
0 to listOf(
' ', 'Î', 'Ï', 't', 'ŕ', 'r', 'l', 'j', 'i', 'f', 'ł', 'ŀ', '`', 'ľ', ']', 'º', '[', 'ļ', 'ĺ', 'ĵ', 'Í', 'ı', 'Ì', 'ř', '¹', '¸', '·', '´', '³', '²', 'ŧ', '°', 'ť', 'ţ', 'ª', '¨', '¦', 'ſ', '¡', 'ј', 'ї', '|', 'і', 'ŗ', 'ì', '\', 'î', 'І', 'ĩ', 'Ĩ', 'í', ':', 'ȷ', 'ț', 'Ї', '/', '-', ',', ')', '(', ''', '"', '!', '.', 'Ī', ';', 'I', 'J', 'İ', 'ĭ', 'Į', 'ī', 'ï', 'Ĭ', 'į'
),
1 to listOf(
'ş', 'ĸ', 'ŝ', 'ĕ', 'š', 'ķ', 'Ţ', 'ė', 'Ł', 'ҙ', 'ś', 'Ĵ', 'ҭ', 'ē', 'đ', 'ũ', 'ѯ', 'ү', 'ĳ', 'Ť', 'Ŧ', 'Ļ', 'ę', 'Ŀ', 'ń', 'ņ', 'ň', 'ғ', 'ŋ', 'ħ', 'ґ', 'ō', 'Ґ', 'ŏ', 'ő', 'Ȉ', 'Ľ', 'ĥ', 'ģ', 'ҝ', 'ġ', 'ğ', 'ĝ', 'ě', 'Ĺ', 'ū', 'ż', 'ů', 'ǃ', 'ǁ', 'ǀ', 'ƾ', 'Г', 'З', 'Т', 'ƭ', 'ƫ', 'ƪ', 'а', 'б', 'в', 'г', 'д', 'Ј', 'ǉ', 'Ѓ', 'Ǐ', 'Ȋ', 'ȋ', 'ȑ', 'ȓ', 'ǿ', 'ș', 'ǻ', 'е', 'ȝ', 'ȴ', 'ȶ', 'ɂ', 'Ɉ', 'ɉ', 'ɍ', 'ǐ', 'ǰ', 'з', 'и', 'й', 'ђ', 'ѓ', 'є', 'ѕ', 'ž', 'ћ', 'ұ', 'ё', 'ќ', 'Ÿ', 'ŷ', 'Ŷ', 'ў', 'џ', 'ų', 'ű', 'ź', 'ŭ', 'я', 'ь', 'к', 'л', 'ƚ', 'н', 'о', 'Ɨ', 'Ɩ', 'э', 'п',

## Оценка предсказаний

In [10]:
# Work on a copy so test_control_df itself stays untouched.
test_df = test_control_df.copy()

# For each prediction variant add three columns, in this order:
#   <prefix>_prediction            - rounded integer prediction
#   <prefix>_prediction_error      - prediction minus the measured width
#   <prefix>_prediction_mean_error - that error divided by symbols_count
for prefix, use_exagg in (("non_exaggerated", False), ("exaggerated", True)):
    prediction_col = f"{prefix}_prediction"
    error_col = f"{prefix}_prediction_error"
    test_df[prediction_col] = model.predict(test_df, use_exagg=use_exagg).round().astype(int)
    test_df[error_col] = test_df[prediction_col] - test_df["width"]
    test_df[f"{prefix}_prediction_mean_error"] = test_df[error_col] / test_df["symbols_count"]
print(test_df.shape)
test_df

(57120, 14)


Unnamed: 0,text,alphabet,locale,font_family,font_size,font_face,symbols_count,width,non_exaggerated_prediction,non_exaggerated_prediction_error,non_exaggerated_prediction_mean_error,exaggerated_prediction,exaggerated_prediction_error,exaggerated_prediction_mean_error
0,-0.19999999999999996,cyrillic,ru_RU,Arial,9,bold,20,133,140,7,0.350000,154,21,1.050000
1,-0.19999999999999996,cyrillic,ru_RU,Arial,9,bold+italic,20,133,142,9,0.450000,156,23,1.150000
2,-0.19999999999999996,cyrillic,ru_RU,Arial,9,italic,20,134,129,-5,-0.250000,142,8,0.400000
3,-0.19999999999999996,cyrillic,ru_RU,Arial,9,normal,20,133,127,-6,-0.300000,140,7,0.350000
4,-0.19999999999999996,cyrillic,ru_RU,Arial,11,bold,20,153,171,18,0.900000,189,36,1.800000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57115,Чувство дыхание,cyrillic,ru_RU,Verdana,16,normal,15,197,201,4,0.266667,222,25,1.666667
57116,Чувство дыхание,cyrillic,ru_RU,Verdana,20,bold,15,269,273,4,0.266667,301,32,2.133333
57117,Чувство дыхание,cyrillic,ru_RU,Verdana,20,bold+italic,15,270,276,6,0.400000,303,33,2.200000
57118,Чувство дыхание,cyrillic,ru_RU,Verdana,20,italic,15,242,254,12,0.800000,280,38,2.533333


In [11]:
# Measured width vs. the non-exaggerated prediction, one panel per font family.
# The magenta slope-1 line is the reference for comparing prediction to width.
non_exaggerated_points = geom_point(
    aes("width", "non_exaggerated_prediction", color="alphabet"),
    alpha=.3,
)
(
    ggplot(test_df)
    + non_exaggerated_points
    + geom_abline(slope=1, color="magenta")
    + facet_wrap(facets="font_family", ncol=3)
    + ggtitle("Соотношение ширины и предсказания для скорректированной по шрифту модели")
)

In [12]:
# Measured width vs. the exaggerated prediction, one panel per font family.
# The magenta slope-1 line is the reference for comparing prediction to width.
exaggerated_points = geom_point(
    aes("width", "exaggerated_prediction", color="alphabet"),
    alpha=.3,
)
(
    ggplot(test_df)
    + exaggerated_points
    + geom_abline(slope=1, color="magenta")
    + facet_wrap(facets="font_family", ncol=3)
    + ggtitle("Соотношение ширины и предсказания для преувеличивающей модели")
)

In [13]:
# Density of the per-symbol error of the non-exaggerated prediction,
# one panel per font family.
(
    ggplot(test_df)
    + geom_density(aes("non_exaggerated_prediction_mean_error"))
    + facet_wrap(facets="font_family", ncol=3)
    + ggtitle("Распределение средней ошибки для скорректированной по шрифту модели")
)

In [14]:
# Density of the per-symbol error of the exaggerated prediction,
# one panel per font family.
(
    ggplot(test_df)
    + geom_density(aes("exaggerated_prediction_mean_error"))
    + facet_wrap(facets="font_family", ncol=3)
    + ggtitle("Распределение средней ошибки для преувеличивающей модели")
)