# Классификация текстовых запросов на основе tf-idf преобразования в связке с градиентным бустингом.

## Импорт необходимых модулей

In [1]:
import os
import re
import numpy as np

import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score
from sklearn.utils.class_weight import compute_class_weight

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from pymorphy2 import MorphAnalyzer

# Fetch the NLTK resources this notebook relies on:
# 'punkt' backs word_tokenize, 'stopwords' backs nltk.corpus.stopwords.
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Описание необходимых классов и функций

In [2]:
class Processor():
    """Text cleaner that prepares a raw request description for vectorization.

    Calling an instance runs the full ``preprocess`` pipeline: lowercasing,
    HTML-tag removal, 'ё' -> 'е' normalization, alphabet filtering,
    stop-word removal and lemmatization.
    """

    # Lazily filled cache of Russian stop words. Kept as a class-level default
    # so the corpus is read once per instance, on first use, not once per token.
    _stopwords = None

    # A tag is '<', then one or more characters that are neither whitespace
    # nor angle brackets, then '>'. Excluding '<' and '>' stops a match from
    # running from an opening tag to the closing one and swallowing the text
    # in between (e.g. "<b>слово</b>").
    _HTML_TAG_RE = re.compile(r"<[^<>\s]+>")

    def __init__(self, alphabet = "абвгдежзийклмнопрстуфхцчшщъыьэюя "):
        """
            Args:
                alphabet (str): characters that ``filter_symbols`` keeps;
                    every other character is dropped from the text.
        """
        self.alphabet = alphabet
        self.morph = MorphAnalyzer()

    def __call__(self, text: str) -> str:
        """Shortcut for ``preprocess(text)``."""
        return self.preprocess(text)

    def __len__(self,):
        # NOTE(review): ``self.texts`` is never assigned anywhere in this
        # class, so len(processor) -- and therefore bool(processor) -- raises
        # AttributeError unless a caller sets ``texts`` from outside.
        # Left untouched to keep the interface; confirm whether it can go.
        return len(self.texts)

    def preprocess(self, text: str) -> str:
        """
            Run the whole cleaning pipeline on one text.

            Args:
                text (str): request text

            Returns:
                text (str): cleaned, lemmatized text
        """
        text = self.lowercase(text)
        text = self.remove_html(text)
        text = text.replace('ё', 'е')
        text = self.filter_symbols(text)
        text = self.remove_stopwords(text)
        text = self.lemmatize(text)
        return text

    def remove_html(self, text: str) -> str:
        """
            Remove whitespace-free html tags such as "</p>" from the text.

            Only the tags themselves are removed; the text between an opening
            and a closing tag is preserved.

            Args:
                text (str): request text

            Returns:
                text (str): text without the matched tags
        """
        return self._HTML_TAG_RE.sub('', text)

    def filter_symbols(self, text: str) -> str:
        """
            Drop every character that is not part of ``self.alphabet``.

            Args:
                text (str): request text

            Returns:
                new_text (str): filtered text
        """
        # join() builds the result in one pass instead of repeated "+=".
        return ''.join(char for char in text if char in self.alphabet)

    def lemmatize(self, text: str) -> str:
        """
            Replace every word with its normal form.

            The first parse returned by the morphological analyzer is used
            for each token.

            Args:
                text (str): request text

            Returns:
                new_text (str): space-joined lemmas
        """
        words = word_tokenize(text)
        lemmas = [self.morph.parse(word)[0].normal_form for word in words]
        return ' '.join(lemmas)

    def remove_stopwords(self, text: str) -> str:
        """
            Remove Russian stop words from the text.

            Args:
                text (str): request text

            Returns:
                new_text (str): space-joined tokens that are not stop words
        """
        if self._stopwords is None:
            # Load the stop-word list once and keep it as a set: the original
            # re-read the corpus and scanned a list for every single token.
            self._stopwords = frozenset(stopwords.words('russian'))
        stop_words = self._stopwords
        words = word_tokenize(text)
        return ' '.join(w for w in words if w not in stop_words)

    def lowercase(self, text: str) -> str:
        """Return the text converted to lower case."""
        return text.lower()

## Подготовка данных

In [3]:
# Load the spreadsheet, then discard every row that has a missing value
data_path = 'data.xlsx'
df = pd.read_excel(data_path)
df = df.dropna()

In [4]:
# Number of rows per class (the 'name' column) in the dataset
class_counts = df['name'].value_counts()
# Same counts as a plain lookup table: class name -> number of rows
name2count = dict(zip(class_counts.index, class_counts.values))

In [5]:
# Attach to every row the number of rows that share its class.
# Every name is a key of name2count (it was built from this same column),
# so the dictionary lookup done by map() never misses.
df['class_count'] = df['name'].map(name2count)

In [6]:
# Keep only the rows of classes that occur more than once: the stratified
# train/test split performed later cannot handle single-sample classes.
# One .loc selection followed by .copy() makes `data` an independent frame,
# so the column edits below never act on a chained-indexing result
# (which is what triggers pandas' SettingWithCopyWarning).
data = df.loc[df['class_count'] > 1, ['description', 'name']].copy()
# Class name -> integer id, numbered in order of first appearance
name2class_id = {name: idx for idx, name in enumerate(data['name'].unique())}
# Add the integer label column and drop the textual class name
data['class_id'] = data['name'].map(name2class_id)
data = data.drop(columns=['name'])

In [7]:
# Clean every description with the Processor pipeline;
# the instance is callable, so it is handed to apply() directly.
data_processor = Processor()
data['description'] = data['description'].apply(data_processor)

In [8]:
# Display the frame to eyeball the cleaned descriptions and class ids
data

Unnamed: 0,description,class_id
0,первый подъезд убирать слово последний убирать...,0
1,дорожный покрытие внутридворовый дорога адрес ...,1
2,добрый день п батрацкий дача очень редко чисти...,2
3,добрый день дом адрес садовый протекать труба ...,3
4,хотеться обратить внимание транспортный отдел ...,4
...,...,...
9976,необходимо представить информация овыполнить в...,1
9977,необходимо восстановить выровнять плитка плита...,28
9978,необходимо закрасить надпись столб фасад корпу...,28
9979,необходимо очередной зачистить мусор столб ряд...,28


In [9]:
# Texts are the features, integer class ids are the targets
X, y = data['description'].values, data['class_id'].values
# 70/30 train/test split; stratify=y keeps the class distribution
# the same in both parts, random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

In [10]:
# The tf-idf vocabulary and idf weights are learned from the training texts only
corpus = list(X_train)
vectorizer = TfidfVectorizer()
vectorizer.fit(corpus)

TfidfVectorizer()

In [11]:
# Turn both text sets into dense tf-idf matrices using the fitted vectorizer
X_train_tfidf, X_test_tfidf = (
    vectorizer.transform(texts).toarray() for texts in (X_train, X_test)
)

In [12]:
# Shapes of the train and test tf-idf matrices, one per line
print(X_train_tfidf.shape, X_test_tfidf.shape, sep='\n')

(6601, 12508)
(2830, 12508)


## Обучение модели

In [13]:
# The dataset is heavily imbalanced, so the loss is rescaled per class:
# each class gets a weight derived from how often it occurs in the
# training labels ('balanced' mode of compute_class_weight).

classes = np.unique(y_train)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
# class id -> weight
class_weights = {label: weight for label, weight in zip(classes, weights)}

In [14]:
# Build the classifier (1000 boosting iterations on GPU device 0,
# with the per-class weights computed above) and train it
model = CatBoostClassifier(
    iterations=1000,
    task_type="GPU",
    devices='0',
    class_weights=class_weights,
)
model.fit(X_train_tfidf, y_train, verbose=True)

Learning rate set to 0.095386
0:	learn: 4.7918687	total: 169ms	remaining: 2m 48s
1:	learn: 4.7263734	total: 316ms	remaining: 2m 37s
2:	learn: 4.6772494	total: 462ms	remaining: 2m 33s
3:	learn: 4.6236105	total: 607ms	remaining: 2m 31s
4:	learn: 4.5716700	total: 753ms	remaining: 2m 29s
5:	learn: 4.5196943	total: 898ms	remaining: 2m 28s
6:	learn: 4.4872090	total: 1.04s	remaining: 2m 28s
7:	learn: 4.4047670	total: 1.19s	remaining: 2m 27s
8:	learn: 4.3639751	total: 1.34s	remaining: 2m 27s
9:	learn: 4.2884956	total: 1.5s	remaining: 2m 28s
10:	learn: 4.2574468	total: 1.65s	remaining: 2m 27s
11:	learn: 4.1942813	total: 1.79s	remaining: 2m 27s
12:	learn: 4.1636104	total: 1.94s	remaining: 2m 27s
13:	learn: 4.1359468	total: 2.08s	remaining: 2m 26s
14:	learn: 4.1131088	total: 2.23s	remaining: 2m 26s
15:	learn: 4.0939728	total: 2.38s	remaining: 2m 26s
16:	learn: 4.0724659	total: 2.52s	remaining: 2m 25s
17:	learn: 4.0435833	total: 2.67s	remaining: 2m 25s
18:	learn: 4.0123309	total: 2.82s	remaining: 

158:	learn: 2.0277164	total: 24.4s	remaining: 2m 8s
159:	learn: 2.0232327	total: 24.5s	remaining: 2m 8s
160:	learn: 2.0198953	total: 24.6s	remaining: 2m 8s
161:	learn: 2.0171470	total: 24.8s	remaining: 2m 8s
162:	learn: 2.0132806	total: 24.9s	remaining: 2m 8s
163:	learn: 2.0042825	total: 25.1s	remaining: 2m 7s
164:	learn: 2.0005308	total: 25.2s	remaining: 2m 7s
165:	learn: 1.9967684	total: 25.4s	remaining: 2m 7s
166:	learn: 1.9927452	total: 25.5s	remaining: 2m 7s
167:	learn: 1.9887959	total: 25.7s	remaining: 2m 7s
168:	learn: 1.9853248	total: 25.8s	remaining: 2m 6s
169:	learn: 1.9802566	total: 26s	remaining: 2m 6s
170:	learn: 1.9765097	total: 26.1s	remaining: 2m 6s
171:	learn: 1.9736127	total: 26.3s	remaining: 2m 6s
172:	learn: 1.9702131	total: 26.4s	remaining: 2m 6s
173:	learn: 1.9598257	total: 26.6s	remaining: 2m 6s
174:	learn: 1.9554620	total: 26.8s	remaining: 2m 6s
175:	learn: 1.9483392	total: 26.9s	remaining: 2m 6s
176:	learn: 1.9451329	total: 27.1s	remaining: 2m 5s
177:	learn: 1.

315:	learn: 1.3487420	total: 48.6s	remaining: 1m 45s
316:	learn: 1.3470256	total: 48.7s	remaining: 1m 44s
317:	learn: 1.3400756	total: 48.9s	remaining: 1m 44s
318:	learn: 1.3235848	total: 49.1s	remaining: 1m 44s
319:	learn: 1.3217740	total: 49.3s	remaining: 1m 44s
320:	learn: 1.3209007	total: 49.5s	remaining: 1m 44s
321:	learn: 1.3201548	total: 49.6s	remaining: 1m 44s
322:	learn: 1.3190165	total: 49.7s	remaining: 1m 44s
323:	learn: 1.3175967	total: 49.9s	remaining: 1m 44s
324:	learn: 1.3085455	total: 50.1s	remaining: 1m 43s
325:	learn: 1.3032619	total: 50.2s	remaining: 1m 43s
326:	learn: 1.3010903	total: 50.4s	remaining: 1m 43s
327:	learn: 1.2998642	total: 50.5s	remaining: 1m 43s
328:	learn: 1.2987545	total: 50.7s	remaining: 1m 43s
329:	learn: 1.2968771	total: 50.8s	remaining: 1m 43s
330:	learn: 1.2960825	total: 51s	remaining: 1m 42s
331:	learn: 1.2943757	total: 51.1s	remaining: 1m 42s
332:	learn: 1.2923705	total: 51.3s	remaining: 1m 42s
333:	learn: 1.2905912	total: 51.4s	remaining: 1m

470:	learn: 1.0219769	total: 1m 12s	remaining: 1m 21s
471:	learn: 1.0213408	total: 1m 12s	remaining: 1m 21s
472:	learn: 1.0201405	total: 1m 12s	remaining: 1m 20s
473:	learn: 1.0184815	total: 1m 12s	remaining: 1m 20s
474:	learn: 1.0177488	total: 1m 12s	remaining: 1m 20s
475:	learn: 1.0170496	total: 1m 13s	remaining: 1m 20s
476:	learn: 1.0162899	total: 1m 13s	remaining: 1m 20s
477:	learn: 1.0156217	total: 1m 13s	remaining: 1m 20s
478:	learn: 1.0152953	total: 1m 13s	remaining: 1m 20s
479:	learn: 1.0103177	total: 1m 13s	remaining: 1m 19s
480:	learn: 1.0078815	total: 1m 13s	remaining: 1m 19s
481:	learn: 1.0071315	total: 1m 14s	remaining: 1m 19s
482:	learn: 1.0064255	total: 1m 14s	remaining: 1m 19s
483:	learn: 1.0056879	total: 1m 14s	remaining: 1m 19s
484:	learn: 1.0044914	total: 1m 14s	remaining: 1m 19s
485:	learn: 1.0033453	total: 1m 14s	remaining: 1m 18s
486:	learn: 1.0022041	total: 1m 14s	remaining: 1m 18s
487:	learn: 1.0015869	total: 1m 14s	remaining: 1m 18s
488:	learn: 1.0011313	total:

624:	learn: 0.8473820	total: 1m 35s	remaining: 57.3s
625:	learn: 0.8469055	total: 1m 35s	remaining: 57.1s
626:	learn: 0.8463686	total: 1m 35s	remaining: 57s
627:	learn: 0.8461210	total: 1m 35s	remaining: 56.8s
628:	learn: 0.8457548	total: 1m 36s	remaining: 56.7s
629:	learn: 0.8447022	total: 1m 36s	remaining: 56.5s
630:	learn: 0.8440775	total: 1m 36s	remaining: 56.4s
631:	learn: 0.8437914	total: 1m 36s	remaining: 56.2s
632:	learn: 0.8410235	total: 1m 36s	remaining: 56.1s
633:	learn: 0.8406810	total: 1m 36s	remaining: 55.9s
634:	learn: 0.8402331	total: 1m 37s	remaining: 55.8s
635:	learn: 0.8398469	total: 1m 37s	remaining: 55.6s
636:	learn: 0.8394817	total: 1m 37s	remaining: 55.4s
637:	learn: 0.8390071	total: 1m 37s	remaining: 55.3s
638:	learn: 0.8383349	total: 1m 37s	remaining: 55.1s
639:	learn: 0.8377846	total: 1m 37s	remaining: 55s
640:	learn: 0.8359180	total: 1m 37s	remaining: 54.8s
641:	learn: 0.8354524	total: 1m 38s	remaining: 54.7s
642:	learn: 0.8351564	total: 1m 38s	remaining: 54.

780:	learn: 0.7364937	total: 1m 59s	remaining: 33.4s
781:	learn: 0.7362885	total: 1m 59s	remaining: 33.2s
782:	learn: 0.7359244	total: 1m 59s	remaining: 33.1s
783:	learn: 0.7355430	total: 1m 59s	remaining: 32.9s
784:	learn: 0.7350774	total: 1m 59s	remaining: 32.8s
785:	learn: 0.7348546	total: 1m 59s	remaining: 32.6s
786:	learn: 0.7345204	total: 1m 59s	remaining: 32.5s
787:	learn: 0.7340480	total: 2m	remaining: 32.3s
788:	learn: 0.7334857	total: 2m	remaining: 32.2s
789:	learn: 0.7324342	total: 2m	remaining: 32s
790:	learn: 0.7320585	total: 2m	remaining: 31.8s
791:	learn: 0.7318612	total: 2m	remaining: 31.7s
792:	learn: 0.7315586	total: 2m	remaining: 31.5s
793:	learn: 0.7312505	total: 2m	remaining: 31.4s
794:	learn: 0.7307688	total: 2m 1s	remaining: 31.2s
795:	learn: 0.7302826	total: 2m 1s	remaining: 31.1s
796:	learn: 0.7299798	total: 2m 1s	remaining: 30.9s
797:	learn: 0.7297137	total: 2m 1s	remaining: 30.8s
798:	learn: 0.7294977	total: 2m 1s	remaining: 30.6s
799:	learn: 0.7289813	total:

938:	learn: 0.6593798	total: 2m 22s	remaining: 9.27s
939:	learn: 0.6592128	total: 2m 22s	remaining: 9.12s
940:	learn: 0.6589218	total: 2m 22s	remaining: 8.96s
941:	learn: 0.6587431	total: 2m 23s	remaining: 8.81s
942:	learn: 0.6585320	total: 2m 23s	remaining: 8.66s
943:	learn: 0.6582045	total: 2m 23s	remaining: 8.51s
944:	learn: 0.6581127	total: 2m 23s	remaining: 8.35s
945:	learn: 0.6576970	total: 2m 23s	remaining: 8.2s
946:	learn: 0.6571173	total: 2m 23s	remaining: 8.05s
947:	learn: 0.6568634	total: 2m 23s	remaining: 7.9s
948:	learn: 0.6565204	total: 2m 24s	remaining: 7.75s
949:	learn: 0.6562016	total: 2m 24s	remaining: 7.59s
950:	learn: 0.6558317	total: 2m 24s	remaining: 7.44s
951:	learn: 0.6553240	total: 2m 24s	remaining: 7.29s
952:	learn: 0.6548833	total: 2m 24s	remaining: 7.14s
953:	learn: 0.6546638	total: 2m 24s	remaining: 6.99s
954:	learn: 0.6544453	total: 2m 25s	remaining: 6.83s
955:	learn: 0.6541361	total: 2m 25s	remaining: 6.68s
956:	learn: 0.6540156	total: 2m 25s	remaining: 6

<catboost.core.CatBoostClassifier at 0x7efe040ae8b0>

In [15]:
%%time
# Predict class ids for the test set with the trained model
preds = model.predict(X_test_tfidf)

CPU times: user 19.4 s, sys: 237 ms, total: 19.6 s
Wall time: 18.1 s


In [16]:
# Evaluate the predictions: plain accuracy, class-balanced accuracy
# and the support-weighted F1 score
accuracy = accuracy_score(y_true=y_test, y_pred=preds)
balanced_accuracy = balanced_accuracy_score(y_true=y_test, y_pred=preds)
f1 = f1_score(y_true=y_test, y_pred=preds, average='weighted')



In [17]:
# Report the three metrics, one per line
print(f"Accuracy: {accuracy}")
print(f"Balanced accuracy: {balanced_accuracy}")
print(f"F1-score: {f1}")

Accuracy: 0.6293286219081272
Balanced accuracy: 0.3042064305276657
F1-score: 0.6654665639260515
