In [17]:
from numpy import zeros, array
from functools import lru_cache
from math import log
from tqdm import tqdm

from random import randint as rint
from random import seed
seed(42)

import pandas as pd
import pymorphy2
import time
import re


morph = pymorphy2.MorphAnalyzer()

In [2]:
# Load the dictionary of known lemmas; the printed frame has a single "lemma" column.
# NOTE(review): absolute Windows path — the cell only runs on a machine with this exact layout.
lemma_dict = pd.read_csv('C:\\Python programs\\ru_dict_lemma.csv')
print(lemma_dict)

         lemma
0            а
1           аа
2          а-а
3          ааа
4        а-а-а
...        ...
51728  ящерица
51729   ящерка
51730     ящик
51731   ящичек
51732     ящур

[51733 rows x 1 columns]


In [3]:
# Load the annotated corpus; the printed frame has columns "form", "lemma" and "id_sen".
# NOTE(review): absolute Windows path — the cell only runs on a machine with this exact layout.
df_train = pd.read_csv('C:\\Python programs\\dataset_ru_word_lemma2.csv')
print(df_train)

               form          lemma  id_sen
0                 «              «     0.0
1              Если           если     0.0
2          передача       передача     0.0
3          цифровых       цифровой     0.0
4        технологий     технология     0.0
...             ...            ...     ...
19350  провозглашал  провозглашать   999.0
19351          себя           себя   999.0
19352        другом           друг   999.0
19353          мира            мир   999.0
19354             .              .   999.0

[19355 rows x 3 columns]


In [4]:
# Load the stop-word list; the printed frame has a single "stopword" column
# that also contains punctuation entries.
# NOTE(review): absolute Windows path — the cell only runs on a machine with this exact layout.
df_stopwords = pd.read_csv('C:\\Python programs\\stopwords_ru.csv')
print(df_stopwords)

            stopword
0                  а
1          абсолютно
2     авторизоваться
3           активный
4               алло
...              ...
1367               „
1368               “
1369               …
1370               /
1371             ...

[1372 rows x 1 columns]


In [5]:
# Load the labelled documents; the printed frame has columns "text" and "class".
# NOTE(review): absolute Windows path — the cell only runs on a machine with this exact layout.
df_test_docs = pd.read_csv('C:\\Python programs\\test_docs.csv')
print(df_test_docs)

                                                   text     class
0     Ваши дети совсем не хотят вылезать из реки. С ...     семья
1     Одна из многих, очень многих вещей, о которых ...     семья
2     Если вы дожили до этого этапа в воспитании дет...     семья
3     Только представьте: ванная наполняется паром, ...     семья
4     Когда дети были дома, вам приходилось быть обр...     семья
...                                                 ...       ...
3395  Православие.Ру В сентябре 2010 года Святейший ...    страна
3396  Интерфакс Религия (interfax-religion.ru) В Мин...    страна
3397  Окно возможностей В Эвенкинском муниципальном ...     наука
3398  ИТАР-ТАСС. Новости из властных структур. Совет...  политика
3399  Вечерний Магадан (Магадан) Охотское территориа...    страна

[3400 rows x 2 columns]


$$\large\textbf{Расстояние Левенштейна}$$

$$
D(i, j) = 
\begin{cases} 
\max(i, j), \quad \quad \quad \quad \quad \quad  \text{if} \; \min(i, j) = 0  \\
\min( \quad \quad \quad  \quad  \quad \quad \quad \quad \text{otherwise}\\
\quad D(i, j - 1) + 1, \\
\quad D(i - 1, j) + 1, \\
\quad D(i - 1, j - 1) + (W_1[i] \: \neq \: W_2 [j]) \\
)
\end{cases}
$$


In [6]:
@lru_cache
def lev_dist(w1, w2):
    """Return the Levenshtein (edit) distance between the sequences w1 and w2.

    A full (len(w1) + 1) x (len(w2) + 1) dynamic-programming table is filled;
    the answer is its bottom-right cell. Results are memoised by lru_cache,
    so both arguments must be hashable.
    """
    rows, cols = len(w1) + 1, len(w2) + 1
    table = zeros([rows, cols], int)
    # Border cells: turning a prefix into the empty string costs its length.
    table[0, :] = array(range(cols))
    table[:, 0] = array(range(rows))

    for r, left_char in enumerate(w1, start=1):
        for c, right_char in enumerate(w2, start=1):
            deletion = table[r - 1, c] + 1
            insertion = table[r, c - 1] + 1
            substitution = table[r - 1, c - 1] + (left_char != right_char)
            table[r, c] = min(deletion, insertion, substitution)
    return table[rows - 1, cols - 1]

In [7]:
def lemma_lev(w):
    """Return the entry of lemma_dict["lemma"] closest to w by edit distance.

    Only entries whose distance is strictly smaller than len(w) are accepted;
    among equally close entries the first one in the column wins. If nothing
    qualifies, w itself is returned.
    """
    target_len = len(w)
    best_word, best_dist = "", target_len
    for candidate in lemma_dict["lemma"]:
        # The length difference is a lower bound on the edit distance, so a
        # candidate that cannot beat the current best is skipped unmeasured.
        if abs(target_len - len(candidate)) < best_dist:
            current = lev_dist(candidate, w)
            if current == 0:
                return candidate
            if current < best_dist:
                best_word, best_dist = candidate, current

    return best_word or w

In [6]:
print(lemma_lev("гнездился"))

гнездиться


$$ \large\textbf{Поиск по словарю с удалением суффикса}$$

In [8]:
@lru_cache
def lemma_dbsra(w):
    """Return the longest substring of w that is a key of the global `trie`.

    Among substrings of equal maximal length the one starting earliest wins.
    If no substring is a key, w is returned unchanged.

    NOTE: results are memoised by lru_cache, so they reflect the contents of
    `trie` at the time of the first call for a given word.
    """
    lemma = ""
    word_len = len(w)
    for start in range(word_len):
        # Only substrings strictly longer than the current best can replace it,
        # so the scan stops at start + len(lemma) instead of probing the trie
        # for candidates that would be rejected anyway.
        for end in range(word_len, start + len(lemma), -1):
            if trie.hasKey(w[start:end]):
                lemma = w[start:end]
                # `end` decreases, so the first hit is the longest for this start.
                break
    return lemma if lemma != "" else w

In [16]:
print(lemma_dbsra("+"))

+


$$ \large\textbf{Префиксное дерево}$$

In [9]:
class Node(object):
    """One node of a prefix tree.

    Holds an arbitrary value, a mapping from edge key to child node, and a
    flag telling whether the node terminates a stored key.
    """

    def __init__(self, value, child=None, key=None, end=False):
        # A single initial child is attached only when both the child and its
        # edge key are supplied.
        self.children = {}
        if child is not None and key is not None:
            self.children[key] = child
        self.value = value
        self._isEnd = end

    def __str__(self):
        return "keys = {}, value = {}, end = {}".format(
            list(self.children), self.value, self._isEnd)

    def addChild(self, value, key):
        """Attach a fresh child holding `value` under `key` (replaces an existing one)."""
        self.children[key] = Node(value)

    def hasChild(self, key):
        """Tell whether a child is registered under `key`."""
        return key in self.children

    def getChild(self, key):
        """Return the child stored under `key`; raises KeyError when absent."""
        return self.children[key]

    def setValue(self, value):
        self.value = value

    def getValue(self):
        return self.value

    def isEnd(self):
        """Tell whether this node was marked as the end of a key."""
        return self._isEnd

    def getKeys(self):
        """Return the child keys as a new list, in insertion order."""
        return list(self.children)

    def setEnd(self):
        """Mark this node as the end of a key."""
        self._isEnd = True


class PrefixTrie(object):
    """Character trie mapping keys to stored values (e.g. word form -> lemma).

    Inner nodes created while inserting a key hold the prefix that leads to
    them; the node that terminates a key holds the value inserted for it.
    """

    def __init__(self, db=None):
        """Create an empty trie; if `db` is given, insert every item as key and value."""
        self.root = Node('')
        # Per-instance memo for getLemma. A class-level @lru_cache on the method
        # would keep every instance alive through `self` in the cache key and
        # would keep returning stale answers after the trie is modified.
        # The memo is unbounded: it grows with the number of distinct words queried.
        self._lemma_cache = {}

        if db is not None:
            for word in db:
                self.insert(word, word)

    def insertDict(self, db_form, db_lemma):
        """Insert the pairs (db_form[k], db_lemma[k]) for two parallel sequences.

        Items are paired by position, so a pandas Series with a non-default
        index is handled correctly; pairing stops at the shorter sequence.
        """
        for form, lemma in zip(db_form, db_lemma):
            self.insert(form, lemma)

    def insert(self, key, value):
        """Store `value` under `key`; an already stored key keeps its first value."""
        node = self.root
        way = ""

        for char in key:
            way += char
            if not node.hasChild(char):
                node.addChild(way, char)
            node = node.getChild(char)
        if not node.isEnd():
            node.setValue(value)
            node.setEnd()

        # The trie changed, so previously memoised lemmas may be outdated.
        self._lemma_cache.clear()

    def hasKey(self, key):
        """Tell whether exactly `key` has been inserted."""
        node = self.root

        for char in key:
            if not node.hasChild(char):
                return False
            node = node.getChild(char)

        return node.isEnd()

    def largestPrefix(self, key):
        """Return a stored value related to `key` by its longest matching path.

        The trie is walked along `key` as far as characters match, remembering
        the value of the last key-terminating node passed. If that value has
        the same length as `key` it is returned. Otherwise the descendants of
        the last reached node are searched level by level, at most
        max(len(key) - len(value), len(key) // 2) levels deep, and the value
        of the first key-terminating node found is returned. If that search
        finds nothing, the remembered value is returned, or `key` itself when
        no value was remembered.
        """
        node = self.root
        lemma = ""
        key_len = len(key)

        for char in key:
            if not node.hasChild(char):
                break
            node = node.getChild(char)
            if node.isEnd():
                lemma = node.getValue()

        dif = key_len - len(lemma)
        if dif != 0:
            max_depth = max(dif, key_len // 2)
            level = [node.getChild(k) for k in node.getKeys()]
            depth = 1

            # Breadth-first search; an empty level means the subtree is exhausted.
            while depth <= max_depth and level:
                next_level = []
                for elem in level:
                    if elem.isEnd():
                        return elem.getValue()
                    next_level += [elem.getChild(k) for k in elem.getKeys()]

                level = next_level
                depth += 1

        return lemma if lemma != "" else key

    def getLemma(self, word):
        """Return the best lemma candidate for `word` (memoised per instance).

        Every prefix of `word` is passed to largestPrefix; the candidate with
        the smallest edit distance to `word` wins, provided that distance is
        below len(word) // 2. If no candidate qualifies, `word` is returned.
        """
        if word in self._lemma_cache:
            return self._lemma_cache[word]

        lemma = self._findLemma(word)
        self._lemma_cache[word] = lemma
        return lemma

    def _findLemma(self, word):
        """Uncached implementation of getLemma."""
        prefix = ""
        lemma = word
        len_word = len(word)
        min_dist = len_word

        for char in word:
            prefix += char
            new_lemma = self.largestPrefix(prefix)
            dist = lev_dist(new_lemma, word)

            if dist < min_dist and dist < len_word // 2:
                if dist == 0:
                    return new_lemma
                lemma = new_lemma
                min_dist = dist

        return lemma

In [10]:
trie = PrefixTrie(lemma_dict["lemma"].apply(str.lower))

In [11]:
print(trie.getLemma("сашин"))

сашин


In [19]:
def _score_lemmatizer(lemmatize, forms, lemmas):
    """Run `lemmatize` over `forms` and compare with the expected `lemmas`.

    Returns (number of exact matches, elapsed seconds rounded to 3 digits).
    Only the lemmatizer calls are timed.
    """
    started = time.perf_counter()
    hits = 0
    for form, lemma in zip(forms, lemmas):
        if lemmatize(form) == lemma:
            hits += 1
    return hits, round(time.perf_counter() - started, 3)


len_df = len(df_train.index)

# Lower-case the corpus once instead of inside each of the three timed loops.
forms = [form.lower() for form in df_train['form']]
lemmas = [lemma.lower() for lemma in df_train['lemma']]

# NOTE: trie.getLemma and lemma_dbsra are memoised, so re-running this cell
# in the same kernel can report shorter times for them.
cnt_PT, time_PT = _score_lemmatizer(trie.getLemma, forms, lemmas)
cnt_PM2, time_PM2 = _score_lemmatizer(
    lambda w: morph.parse(w)[0].normal_form, forms, lemmas)
cnt_DBSRA, time_DBSRA = _score_lemmatizer(lemma_dbsra, forms, lemmas)

# Reference results of an earlier run (the timings included the per-word
# DataFrame lookups and lower-casing):
# PrefixTrie : 0.804, time : 6.253
# PrefixTrie (words whose lemma is not in the dictionary excluded) : 0.833
# Pymorphy2 : 0.943, time : 6.152
# Pymorphy2 (words whose lemma is not in the dictionary excluded) : 0.99
# DBSRA : 0.608, time : 1.6
print(f"PrefixTrie : {round(cnt_PT / len_df, 3)}, time : {time_PT}")
print(f"Pymorphy2 : {round(cnt_PM2 / len_df, 3)}, time : {time_PM2}")
print(f"DBSRA : {round(cnt_DBSRA / len_df, 3)}, time : {time_DBSRA}")

PrefixTrie : 0.804, time : 6.318
Pymorphy2 : 0.943, time : 6.096
DBSRA : 0.608, time : 1.653


$$\large \textbf{Расстояние между документами}$$

$$\textbf{Векторизация документов}$$

В пакете stopwordsiso всего 559 русских стоп-слов, поэтому используется найденный более полный словарь стоп-слов из 1352 слов.

In [12]:
# Trie over the stop-word column; TFIDFn queries it with hasKey for exact-match lookups.
trie_stopwords = PrefixTrie(df_stopwords['stopword'])


def TFIDFn(list_of_docs):
    """Build L2-normalised sparse TF-IDF vectors for tokenised documents.

    list_of_docs = [doc1, doc2, ...]
    doc1 = [word1, word2, ...]

    Each word is lemmatised with the global `morph` analyser. A word is dropped
    when its lemma or the word itself is a key of the global `trie_stopwords`,
    or when the word is a number literal. Lemmas get consecutive integer ids in
    order of first appearance.

    Returns (vectors, dimension): `vectors[k]` is a dict {lemma id: weight}
    for document k, and `dimension` is the number of distinct lemmas kept.
    The weight is count / distinct_lemmas_in_doc * log(N / document_frequency),
    after which every non-zero vector is scaled to unit length. A lemma present
    in every document therefore gets an explicit weight of 0.0.
    """

    count_of_docs = len(list_of_docs)
    num_reg_exp = re.compile(r'[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?')

    lemma_ids = {}        # lemma -> integer id
    doc_freq = []         # doc_freq[id] = number of documents containing the lemma
    lemma_of_word = {}    # memo: raw word -> lemma, or None when the word is filtered out
    list_docs_dict = []

    with tqdm(total=count_of_docs*2, position=0, leave=True) as pbar:

        # Pass 1: raw lemma counts per document and document frequencies.
        for id_doc, doc in enumerate(list_of_docs):
            doc_dict = {}

            pbar.set_description(f"Cycle: 1/2, Doc: {id_doc+1}/{count_of_docs}")
            pbar.update()

            for word in doc:
                # Morphological analysis and the stop-word / number checks
                # depend only on the word, so they run once per distinct word.
                if word in lemma_of_word:
                    lemma = lemma_of_word[word]
                else:
                    lemma = morph.parse(word)[0].normal_form
                    if trie_stopwords.hasKey(lemma) or trie_stopwords.hasKey(word) or \
                            num_reg_exp.fullmatch(word) is not None:
                        lemma = None
                    lemma_of_word[word] = lemma

                if lemma is None:
                    continue

                id_this_lemma = lemma_ids.get(lemma)
                if id_this_lemma is None:
                    id_this_lemma = len(doc_freq)
                    lemma_ids[lemma] = id_this_lemma
                    doc_freq.append(0)

                if id_this_lemma in doc_dict:
                    doc_dict[id_this_lemma] += 1
                else:
                    # First occurrence of the lemma in this document.
                    doc_dict[id_this_lemma] = 1
                    doc_freq[id_this_lemma] += 1

            list_docs_dict.append(doc_dict)

        # Pass 2: turn counts into TF-IDF weights and normalise each vector.
        for id_doc, doc_dict in enumerate(list_docs_dict):

            pbar.set_description(f"Cycle: 2/2, Doc: {id_doc+1}/{count_of_docs}")
            pbar.update()

            # Number of distinct kept lemmas; being constant within a document,
            # this factor cancels out in the normalisation below.
            count_words = len(doc_dict)
            norm_this_docs = 0

            for key, count in doc_dict.items():
                res = count / count_words * log(count_of_docs / doc_freq[key])

                doc_dict[key] = res
                norm_this_docs += res ** 2

            if norm_this_docs != 0:
                norm_this_docs = norm_this_docs ** 0.5
                for key in doc_dict:
                    doc_dict[key] /= norm_this_docs

    return list_docs_dict, len(doc_freq)  # list of document vectors and the dimension of the space

In [133]:
print(TFIDFn([['приниматься', 'ись', 'ись', 'глядел'], ['ты', 'сидел', 'и', 'глядел']]))

Cycle: 2/2, Doc: 2/2: 100%|█████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 372.74it/s]

([{0: 0.4472135954999579, 1: 0.8944271909999159, 2: 0.0}, {2: 0.0}], 3)





In [22]:
# One "document" per sentence: the word forms of the rows whose id_sen equals 0, 1, ..., 999.
list_docs = [
    list(df_train.loc[df_train['id_sen'] == id_sen, 'form'])
    for id_sen in range(1000)
]

In [134]:
vect_list, dim = TFIDFn(list_docs)

Cycle: 2/2, Doc: 1000/1000: 100%|█████████████████████████████████████████████████| 2000/2000 [00:13<00:00, 145.46it/s]


In [136]:
print(dim)

4467


In [13]:
def prod_vect(vect1, vect2):
    """Return 1 minus the dot product of two sparse vectors ({key: weight} dicts).

    Keys missing from a vector count as zero, so only shared keys contribute.
    """
    dot = 0
    shared_keys = vect1.keys() & vect2.keys()
    for key in shared_keys:
        dot = dot + vect1[key] * vect2[key]
    return 1 - dot


def euclide_dist(vect1, vect2):
    """Return the Euclidean distance between two sparse vectors ({key: weight} dicts).

    Keys missing from a vector count as zero.
    """
    shared_keys = vect1.keys() & vect2.keys()
    total = 0

    # Keys present in only one vector contribute their own squared weight.
    for vect in (vect1, vect2):
        for key, weight in vect.items():
            if key not in shared_keys:
                total += weight ** 2

    # Shared keys contribute the squared difference.
    for key in shared_keys:
        total += (vect1[key] - vect2[key]) ** 2

    return total ** 0.5


def dist_matrix(vect_list, metric=prod_vect):
    """Compute `metric` for every unordered pair of vectors in `vect_list`.

    Returns a dict {(i, j): distance} with i < j; a tqdm bar reports progress.
    """
    size = len(vect_list)
    total_pairs = size*(size-1)//2
    pairwise = {}

    with tqdm(total=total_pairs, position=0, leave=True) as pbar:
        for i in range(size):
            row_vect = vect_list[i]
            for j in range(i + 1, size):
                pairwise[(i, j)] = metric(row_vect, vect_list[j])
                pbar.update()

    return pairwise


def save_dist_matrix(dist_matrix, path="C:\\Users\\user\\Desktop\\", name="dist_matrix"):
    """Write a pairwise distance dict to the text file `path + name + ".txt"`.

    Each entry {(i, j): dist} becomes one line "i:j:dist". `path` is
    concatenated as is, so it must end with a path separator.
    """
    # The context manager closes the file even if a write fails.
    with open(path + name + ".txt", "w") as file:
        for key, dist in dist_matrix.items():
            file.write(f"{key[0]}:{key[1]}:{dist}\n")


def open_dist_matrix(path="C:\\Users\\user\\Desktop\\dist_matrix.txt"):
    """Read a distance dict from a text file with one "i:j:dist" entry per line.

    Returns {(int(i), int(j)): float(dist)}. Blank lines are skipped.
    """
    result = {}

    # The context manager closes the file even if a line fails to parse.
    with open(path, "r") as file:
        for line in file:
            if not line.strip():
                continue
            id_res = line.split(":")
            result[(int(id_res[0]), int(id_res[1]))] = float(id_res[2])

    return result

In [None]:
d = dist_matrix(vect_list)
# Print at most `cnt` sentence pairs whose distance exceeds 0.5; without the
# limit the loop would print every matching pair of the whole matrix.
cnt = 5
for key in d.keys():
    if d[key] > 0.5:
        print(list_docs[key[0]])
        print(list_docs[key[1]])
        print(d[key])
        print()

        cnt -= 1
        if cnt == 0:
            break

In [103]:
save_dist_matrix(d)

In [113]:
r = open_dist_matrix()
print(len(r))

5778300


Размерность векторов слишком велика, а сами векторы сильно разрежены. Для экономии памяти при больших объёмах данных можно хранить только ненулевые значения в разреженных таблицах.

Используем алгоритм TF-IDF:
$$\text{TF-IDF}(w, d) = \text{TF}(w, d) \cdot \text{IDF}(w),$$
где $\text{TF}(w, d)$ - частота слова w в документе d, $\text{IDF}(w)$ - логарифм отношения общего количества документов к количеству документов со словом w.

$$\large \textbf{Кластеризация документов}$$ 

Токенизация - Лемматизация - Векторизация - TF-IDF - 
(Метрики: косинусное расстояние, евклидово расстояние) - (Кластеризация: K-средних, DBSCAN, нейронные сети)

In [14]:
def tokenize(doc):
    """Lower-case `doc` and return its maximal runs of word characters.

    Word characters are those matched by the regex class \\w (letters, digits
    and the underscore); everything else acts as a separator.
    """
    # One linear pass; splitting on \W and then removing the empty strings one
    # by one with list.remove() rescans the list for every removed item.
    return re.findall(r'\w+', doc.lower())

In [69]:
print(tokenize(df_test_docs['text'][0]))

['ваши', 'дети', 'совсем', 'не', 'хотят', 'вылезать', 'из', 'реки', 'с', 'каждым', 'днем', 'допустимое', 'время', 'просмотра', 'телевизора', 'становится', 'все', 'больше', 'и', 'больше', 'вы', 'не', 'уверены', 'что', 'способны', 'вынести', 'хотя', 'бы', 'еще', 'один', 'крик', 'его', 'половина', 'больше', 'моей']


In [15]:
# Token list for every document text, in row order.
list_docs_test = [
    tokenize(df_test_docs['text'][row])
    for row in range(len(df_test_docs.index))
]

In [16]:
vect_docs_test, dim = TFIDFn(list_docs_test)

Cycle: 2/2, Doc: 3400/3400: 100%|██████████████████████████████████████████████████| 6800/6800 [05:23<00:00, 21.01it/s]


In [18]:
print(dim, vect_docs_test[:5])

36217 [{0: 0.1586698379117062, 1: 0.4482251888500325, 2: 0.30363640681942283, 3: 0.34565175219632255, 4: 0.28156737056940456, 5: 0.3340319509527152, 6: 0.2552100268440207, 7: 0.23361914356988459, 8: 0.29736257886067563, 9: 0.4040492142022535}, {10: 0.27264399391688626, 11: 0.2108448556020082, 12: 0.12719230255212993, 13: 0.14528390387922988, 14: 0.22660783880457305, 15: 0.1382759911284418, 16: 0.19098584537932095, 17: 0.2083358366894893, 18: 0.2535548365747409, 19: 0.14250584375420736, 20: 0.21951985125709236, 21: 0.1420615383461895, 22: 0.3416341244050081, 23: 0.24646684902726027, 24: 0.2622298322298251, 25: 0.13482657028317926, 26: 0.17369810596403937, 27: 0.09260870094538967, 28: 0.14932663918429048, 29: 0.31612382777016096, 30: 0.13519506877725196, 31: 0.21352704317523685, 32: 0.22660783880457305}, {33: 0.22109273147285796, 34: 0.14490090686727164, 35: 0.2010914295890518, 0: 0.19496169212073097, 36: 0.23235688054832207, 37: 0.16077683331519121, 38: 0.15940283786861995, 39: 0.232356

In [140]:
t = dist_matrix(vect_docs_test)
save_dist_matrix(t)

100%|████████████████████████████████████████████████████████████████████| 5778300/5778300 [00:31<00:00, 185836.29it/s]


In [None]:
# Show at most five document pairs whose distance is below 0.1,
# together with the class labels of both documents.
cnt = 5
for key, dist in t.items():
    if not dist < 0.1:
        continue

    first_id, second_id = key[0], key[1]
    print(df_test_docs['text'][first_id])
    print()
    print(df_test_docs['text'][second_id])
    print(dist)
    print(df_test_docs['class'][first_id], df_test_docs['class'][second_id])
    print("\n---------------------------------------------------------------\n")

    cnt -= 1
    if cnt == 0:
        break

$$\large \textbf{Кластеризация}$$

In [99]:
# Random split: each document goes to the test set with probability 1/5.
# Re-seeding here makes the cell idempotent — without it the split depends on
# how many random numbers were drawn since the seed was set in the imports cell.
seed(42)
train_id, test_id = [], []
for i in range(len(vect_docs_test)):
    if rint(1, 5) == 1:
        test_id.append(i)
    else:
        train_id.append(i)

In [129]:
def sum_vect(vect1, vect2, metric):
    """Return the key-wise sum of two sparse vectors as a new dict.

    Unless `metric` is euclide_dist, the sum is additionally scaled to unit
    length. A sum whose length is zero (empty or all-zero vector) is returned
    unscaled instead of raising ZeroDivisionError. Neither input is modified.
    """
    res = dict(vect1)
    for key, value in vect2.items():
        if key in res:
            res[key] += value
        else:
            res[key] = value

    if metric != euclide_dist:
        length = sum(value ** 2 for value in res.values()) ** 0.5
        # TF-IDF vectors may consist of explicit zeros only, so guard the division.
        if length != 0:
            for key in res:
                res[key] /= length

    return res


def norm(vect):
    """Return the Euclidean length of a sparse vector ({key: weight} dict)."""
    squares = 0
    for weight in vect.values():
        squares += weight ** 2
    return squares ** 0.5


def dev_vect_on_num(vect, num):
    """Return a new sparse vector with every weight of `vect` divided by `num`."""
    return {key: weight / num for key, weight in vect.items()}


def _cluster_center(vect_docs, doc_ids, metric):
    """Return the center of the documents of `vect_docs` listed in `doc_ids`.

    The sparse vectors are summed key by key. For euclide_dist the sum is
    divided by the number of documents (arithmetic mean); for any other metric
    it is scaled to unit length once, after all documents have been added.
    An empty or zero-length sum is returned unscaled.
    """
    total = {}
    for id_doc in doc_ids:
        for key, value in vect_docs[id_doc].items():
            total[key] = total.get(key, 0) + value

    scale = len(doc_ids) if metric == euclide_dist else norm(total)
    if scale == 0:
        return total
    return {key: value / scale for key, value in total.items()}


def mKavg(vect_docs, train_id, test_id, name_classes, metric=prod_vect, max_iter=20):
    """K-means-style clustering seeded with the labelled training documents.

    The initial center of cluster i is built from the documents of `train_id`
    whose label in the global df_test_docs['class'] is name_classes[i]. Then,
    up to `max_iter` times, every document of `vect_docs` is assigned to the
    nearest center and the centers are rebuilt from the assignment; the loop
    stops early once the centers no longer change. The iteration number is
    printed at the start of every pass.

    `test_id` is accepted for signature parity with method_min_dist and is
    not used. Returns {cluster index: [document ids]} of the last assignment.

    For non-Euclidean metrics a center is the sum of its documents normalised
    once. Normalising after every added document would give the documents
    added last far more weight than the earlier ones.
    """
    cnt_classters = len(name_classes)
    class_index = {name: i for i, name in enumerate(name_classes)}

    train_by_class = [[] for _ in range(cnt_classters)]
    for id_doc in train_id:
        train_by_class[class_index[df_test_docs['class'][id_doc]]].append(id_doc)

    classter_centers = [
        _cluster_center(vect_docs, doc_ids, metric) for doc_ids in train_by_class
    ]

    cnt_doc = len(vect_docs)
    classter_id_vect = {i: [] for i in range(cnt_classters)}

    for cnt_cycle in range(1, max_iter + 1):
        print(cnt_cycle)

        classter_id_vect = {i: [] for i in range(cnt_classters)}
        for id_doc in range(cnt_doc):
            doc = vect_docs[id_doc]
            # min() keeps the first of equally near centers.
            nearest = min(range(cnt_classters),
                          key=lambda i: metric(doc, classter_centers[i]))
            classter_id_vect[nearest].append(id_doc)

        new_centers = [
            _cluster_center(vect_docs, classter_id_vect[i], metric)
            for i in range(cnt_classters)
        ]
        if new_centers == classter_centers:
            break
        classter_centers = new_centers

    return classter_id_vect

In [117]:
def method_min_dist(vect_docs, train_id, test_id, name_classes, metric=prod_vect):
    """Nearest-center classification of the test documents.

    The center of class i is built from the documents of `train_id` whose
    label in the global df_test_docs['class'] is name_classes[i]: the sparse
    vectors are summed, then divided by the document count for euclide_dist
    or scaled to unit length for any other metric. Every document of
    `test_id` is assigned to the center with the smallest `metric` value
    (the first one on ties).

    Returns {class index: [test document ids]}.

    For non-Euclidean metrics the sum is normalised once at the end.
    Normalising after every added document would give the documents added
    last far more weight than the earlier ones.
    """
    cnt_classters = len(name_classes)
    class_index = {name: i for i, name in enumerate(name_classes)}

    class_sums = [{} for _ in range(cnt_classters)]
    cnt_train_doc_in_class = [0] * cnt_classters

    for id_doc in train_id:
        classter_id = class_index[df_test_docs['class'][id_doc]]
        class_sum = class_sums[classter_id]
        for key, value in vect_docs[id_doc].items():
            class_sum[key] = class_sum.get(key, 0) + value
        cnt_train_doc_in_class[classter_id] += 1

    classter_centers = []
    for classter_id in range(cnt_classters):
        class_sum = class_sums[classter_id]
        if metric == euclide_dist:
            scale = cnt_train_doc_in_class[classter_id]
        else:
            scale = norm(class_sum)
        # A class without training documents (or with a zero-length sum)
        # keeps its unscaled sum instead of dividing by zero.
        if scale == 0:
            classter_centers.append(class_sum)
        else:
            classter_centers.append(
                {key: value / scale for key, value in class_sum.items()})

    classter_id_vect = {i: [] for i in range(cnt_classters)}

    for id_doc in test_id:
        doc = vect_docs[id_doc]
        nearest = min(range(cnt_classters),
                      key=lambda i: metric(doc, classter_centers[i]))
        classter_id_vect[nearest].append(id_doc)

    return classter_id_vect

In [34]:
name_classes = list(df_test_docs['class'].unique())
print(name_classes)

['семья', 'реклама', 'недвижимость', 'здоровье', 'политика', 'культура', 'спорт', 'техника', 'экономика', 'происшествия', 'автомобили', 'страна', 'наука']


In [127]:
classters_vect = mKavg(vect_docs_test, train_id, test_id, name_classes)

1
0.9999999999999993 0
1.0000000000000007 1
1.0000000000000013 2
1.0000000000000002 3
0.9999999999999994 4
1.0 5
1.0000000000000002 6
0.9999999999999999 7
1.0000000000000004 8
1.0000000000000002 9
1.0000000000000004 10
0.9999999999999983 11
1.0000000000000007 12
2
0.9999999999999999 0
0.9999999999999994 1
0.999999999999999 2
1.0 3
0.999999999999998 4
1.0000000000000024 5
1.0 6
0.9999999999999997 7
0.9999999999999998 8


KeyboardInterrupt: 

In [118]:
classters_vect = method_min_dist(vect_docs_test, train_id, test_id, name_classes, euclide_dist)

In [None]:
id1, id2 = classters_vect[1][17], classters_vect[1][20]
print(df_test_docs['text'][id1])
print()
print(df_test_docs['text'][id2])
print(df_test_docs['class'][id1], df_test_docs['class'][id2])

In [123]:
# Accuracy on the test documents: cluster i counts as class name_classes[i].
acc = 0
den = 0
test_id_set = set(test_id)  # constant-time membership instead of scanning the list per document
for i in range(len(name_classes)):
    local_acc = 0
    cnt_vect = 0
    dct = {}  # true class -> number of test documents of that class in cluster i
    for id_doc in classters_vect[i]:
        if id_doc not in test_id_set:
            continue
        den += 1
        cnt_vect += 1
        target = df_test_docs['class'][id_doc]
        if target == name_classes[i]:
            acc += 1
            local_acc += 1

        dct[target] = dct.get(target, 0) + 1
    print(dct)
    print(cnt_vect, name_classes[i])
    # A cluster without test documents has no defined accuracy.
    print(local_acc / cnt_vect if cnt_vect else float("nan"))
    print()
print(acc / den if den else float("nan"))
# Results of earlier runs:
# mKavg(cos): train = 100% 0.19117647058823528
# mKavg(cos): train = 80% 0.18668596237337193
# mKavg(euc): train = 80% 0.6295224312590448
# min_dist(cos): train = 80% 0.3140376266280753
# min_dist(euc): train = 80% 0.8422575976845152

{'семья': 10, 'реклама': 1, 'страна': 2, 'экономика': 1, 'спорт': 1}
15 семья
0.6666666666666666

{'реклама': 18, 'недвижимость': 2, 'происшествия': 2, 'страна': 3, 'техника': 1, 'экономика': 3}
29 реклама
0.6206896551724138

{'недвижимость': 12, 'происшествия': 3, 'страна': 4, 'культура': 1}
20 недвижимость
0.6

{'семья': 4, 'реклама': 4, 'недвижимость': 2, 'здоровье': 20, 'культура': 42, 'политика': 46, 'спорт': 17, 'техника': 32, 'экономика': 12, 'происшествия': 16, 'страна': 13, 'автомобили': 5}
213 здоровье
0.09389671361502347

{'политика': 70}
70 политика
1.0

{'культура': 31}
31 культура
1.0

{'спорт': 60, 'происшествия': 1}
61 спорт
0.9836065573770492

{'автомобили': 1, 'техника': 28, 'политика': 1, 'происшествия': 5}
35 техника
0.8

{'экономика': 29}
29 экономика
1.0

{'происшествия': 74, 'автомобили': 8}
82 происшествия
0.9024390243902439

{'автомобили': 35}
35 автомобили
1.0

{'политика': 9, 'автомобили': 2, 'происшествия': 6, 'экономика': 2, 'страна': 8, 'культура': 3}
30 с