# <center> **`Hard задача`. Классифицируем тексты** (задание степа 6.2.13)

У вас есть датасет текстов постов из социальной сети. Вам нужно классифицировать их по 13 темам, к которым они относятся.

**Достаем эмбеддинги при помощи языковых моделей**

Очень часто в задачах, связанными с текстами, выстреливает подход с извлечением из текста эмбеддингов и последующего их использования в бустингах или линейных моделях для решения своей задачи.

В рамках данного степа уже достали для каждого текста из датасета эмбеддинги при помощи больших языковых моделей из `HuggingFace`, а именно использовали `'sberbank-ai/ruBert-base'`, `'cointegrated/rubert-tiny2'`, `'DeepPavlov/rubert-base-cased-conversational'` и `'sentence-transformers/LaBSE'`.

Так что теперь вы можете использовать их, чтобы составить свой невероятный ансамбль и порвать лидерборд.

<center> <img src='../images/text_classes.png' width="900" /> </center>

**Задача:** Получите максимальное качество классификации. Метрика - balanced_accuracy_score, классы в тестовой выборке сбалансированны так же, как и в обучающей. За каждый процент точности, начиная от 70%, вы будете получать +1 балл (максимум 10).

**Данные:**
- Train dataset - [ссылка](https://drive.google.com/file/d/1KRtMVLLp_gV9V56YwU8LG2i19tvpw_6K/view?usp=share_link).
- Test dataset - [ссылка](https://drive.google.com/file/d/18ulfeupdqO3Siwu7WqWWP_OY14sdlTiy/view?usp=sharing).

In [19]:
import os
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

from transformers import AutoModel, AutoTokenizer
import torch

In [20]:
class TextEmbeddings:
    def __init__(self, add_cls_embeddings=True, add_mean_embeddings=False):
        self.add_mean_embeddings = add_mean_embeddings
        self.add_cls_embeddings = add_cls_embeddings
        if add_cls_embeddings is False and add_mean_embeddings is False:
            raise 'Error: you should select at least one type of embeddings to be computed'

    def mean_pooling(self, hidden_state, attention_mask):
        """
        Возвращает усредненный с учетом attention_mask hidden_state.
        """
        token_embeddings = hidden_state.detach().cpu() 
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
        return sum_embeddings / attention_mask.sum()

    def extract_embeddings(self, texts, model_name, max_len):
        """
        Возвращает значения, посчитанные данной моделью - эмбеддинги для всех текстов из texts.
        """
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModel.from_pretrained(model_name).cuda()
        text_features = []
        for sentence in tqdm(texts):
            encoded_input = tokenizer([sentence],
                                      padding='max_length',
                                      truncation=True,
                                      max_length=max_len,
                                      return_tensors='pt')
            with torch.no_grad():
                hidden_state, cls_head = model(input_ids=encoded_input['input_ids'].cuda(), return_dict=False)
                sentence_embeddings = self.mean_pooling(hidden_state, encoded_input['attention_mask'])
            
            now_emb = []
            if self.add_cls_embeddings:
                now_emb.append(cls_head.detach().cpu().numpy().flatten())
            
            if self.add_mean_embeddings:
                now_emb.append(sentence_embeddings.detach().cpu().numpy().flatten())
            
            text_features.append(np.concatenate(now_emb, axis=0))
        return text_features

    def add_many_embeddings(self, df, text_col, models):
        """"
        Добавляет в качестве признаков эмбеддинги для колонки text_col.
        В качестве моделей и максимальных длин используются models.
        """
        for model_name, max_len in models:
            print(model_name)
            text_features = self.extract_embeddings(df[text_col], model_name, max_len)
            text_features_df = pd.DataFrame(text_features, columns = [f'{model_name}_{text_col}_feature_{i}' for i in range(len(text_features[0]))])
            df = df.join(text_features_df)
            df.to_csv('transformers_text_features.csv', index=False)
            os.system('cp /content/transformers_text_features.csv /content/drive/MyDrive/datasets/transformers_text_features.csv')
        return df

In [118]:
train = pd.read_csv('../data/text_classification_train.csv')
test = pd.read_csv('../data/text_classification_test.csv')

ruBert-base_text_feature - 768
rubert-tiny2_text_feature - 312
rubert-base-cased-conversational_text_feature - 768
labse_text_feature - 768

In [92]:
import json
import os

import numpy as np
import pandas as pd

from catboost import CatBoostClassifier, Pool

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, train_test_split, cross_val_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.svm import SVC, LinearSVC

import optuna
import torch
import seaborn as sns
import shap

In [93]:
# Полный список поддерживаемых моделей можно найти на https://huggingface.co/models
models = [
          #('sberbank-ai/ruBert-base', 512),
          #('cointegrated/rubert-tiny2', 2048),
          #('DeepPavlov/rubert-base-cased-conversational', 512)
          ('sentence-transformers/LaBSE', 512),
          
          #  # ('cointegrated/rubert-tiny2', 2048),

          #
          #('cointegrated/LaBSE-en-ru', 512),
        #   ('sberbank-ai/ruRoberta-large', 512),
        #   ('sberbank-ai/sbert_large_nlu_ru', 512),
        #   ('sberbank-ai/sbert_large_mt_nlu_ru', 512),
        #   ('sberbank-ai/ruBert-large', 512),
          

          
        #   ('microsoft/mdeberta-v3-base', 512),
        #   ('vicgalle/xlm-roberta-large-xnli-anli', 512),
        #   ('MoritzLaurer/mDeBERTa-v3-base-mnli-xnli', 512),
        #   ('facebook/bart-large-mnli', 1024)
]

text_embeddings = TextEmbeddings(True, True)
tmp = text_embeddings.add_many_embeddings(train.iloc[[0]], 'text', models)

sentence-transformers/LaBSE


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
tmp

Unnamed: 0,category,text,DeepPavlov/rubert-base-cased-conversational_text_feature_0,DeepPavlov/rubert-base-cased-conversational_text_feature_1,DeepPavlov/rubert-base-cased-conversational_text_feature_2,DeepPavlov/rubert-base-cased-conversational_text_feature_3,DeepPavlov/rubert-base-cased-conversational_text_feature_4,DeepPavlov/rubert-base-cased-conversational_text_feature_5,DeepPavlov/rubert-base-cased-conversational_text_feature_6,DeepPavlov/rubert-base-cased-conversational_text_feature_7,...,DeepPavlov/rubert-base-cased-conversational_text_feature_1526,DeepPavlov/rubert-base-cased-conversational_text_feature_1527,DeepPavlov/rubert-base-cased-conversational_text_feature_1528,DeepPavlov/rubert-base-cased-conversational_text_feature_1529,DeepPavlov/rubert-base-cased-conversational_text_feature_1530,DeepPavlov/rubert-base-cased-conversational_text_feature_1531,DeepPavlov/rubert-base-cased-conversational_text_feature_1532,DeepPavlov/rubert-base-cased-conversational_text_feature_1533,DeepPavlov/rubert-base-cased-conversational_text_feature_1534,DeepPavlov/rubert-base-cased-conversational_text_feature_1535
0,extreme,Ледник Пасторури это цирковой ледник расположе...,0.234252,0.129809,0.03384,0.688277,0.270071,0.097495,-0.149023,-0.032282,...,-0.783512,0.589173,-0.022017,-0.231996,0.875345,-0.112136,-0.048333,0.161958,0.981491,-0.298607


In [54]:
train_full.iloc[:1, 2:]

Unnamed: 0,ruBert-base_text_feature_0,ruBert-base_text_feature_1,ruBert-base_text_feature_2,ruBert-base_text_feature_3,ruBert-base_text_feature_4,ruBert-base_text_feature_5,ruBert-base_text_feature_6,ruBert-base_text_feature_7,ruBert-base_text_feature_8,ruBert-base_text_feature_9,...,labse_text_feature_758,labse_text_feature_759,labse_text_feature_760,labse_text_feature_761,labse_text_feature_762,labse_text_feature_763,labse_text_feature_764,labse_text_feature_765,labse_text_feature_766,labse_text_feature_767
0,0.272156,0.155383,0.060285,0.363159,-0.140391,0.507753,-0.226326,0.431878,0.063127,-0.159407,...,-0.045795,-0.027475,0.030528,-0.052218,0.042459,-0.012714,-0.05537,-0.012433,-0.016283,-0.006994


In [60]:
train_full.iloc[:1, 770:]

Unnamed: 0,rubert-tiny2_text_feature_0,rubert-tiny2_text_feature_1,rubert-tiny2_text_feature_2,rubert-tiny2_text_feature_3,rubert-tiny2_text_feature_4,rubert-tiny2_text_feature_5,rubert-tiny2_text_feature_6,rubert-tiny2_text_feature_7,rubert-tiny2_text_feature_8,rubert-tiny2_text_feature_9,...,labse_text_feature_758,labse_text_feature_759,labse_text_feature_760,labse_text_feature_761,labse_text_feature_762,labse_text_feature_763,labse_text_feature_764,labse_text_feature_765,labse_text_feature_766,labse_text_feature_767
0,-0.27044,0.125625,0.274958,-1.458633,-0.325684,0.209481,-0.538193,-0.294977,-0.113118,-0.137835,...,-0.045795,-0.027475,0.030528,-0.052218,0.042459,-0.012714,-0.05537,-0.012433,-0.016283,-0.006994


In [63]:
train_full.iloc[:1, 1082:]

Unnamed: 0,rubert-base-cased-conversational_text_feature_0,rubert-base-cased-conversational_text_feature_1,rubert-base-cased-conversational_text_feature_2,rubert-base-cased-conversational_text_feature_3,rubert-base-cased-conversational_text_feature_4,rubert-base-cased-conversational_text_feature_5,rubert-base-cased-conversational_text_feature_6,rubert-base-cased-conversational_text_feature_7,rubert-base-cased-conversational_text_feature_8,rubert-base-cased-conversational_text_feature_9,...,labse_text_feature_758,labse_text_feature_759,labse_text_feature_760,labse_text_feature_761,labse_text_feature_762,labse_text_feature_763,labse_text_feature_764,labse_text_feature_765,labse_text_feature_766,labse_text_feature_767
0,-0.618955,-0.219039,-0.295544,-0.043835,0.187447,0.339212,0.297977,0.204835,0.019111,0.424508,...,-0.045795,-0.027475,0.030528,-0.052218,0.042459,-0.012714,-0.05537,-0.012433,-0.016283,-0.006994


In [71]:
train_full.iloc[:1, 1850:]

Unnamed: 0,labse_text_feature_0,labse_text_feature_1,labse_text_feature_2,labse_text_feature_3,labse_text_feature_4,labse_text_feature_5,labse_text_feature_6,labse_text_feature_7,labse_text_feature_8,labse_text_feature_9,...,labse_text_feature_758,labse_text_feature_759,labse_text_feature_760,labse_text_feature_761,labse_text_feature_762,labse_text_feature_763,labse_text_feature_764,labse_text_feature_765,labse_text_feature_766,labse_text_feature_767
0,-0.033953,-0.057514,-0.032344,0.007962,-0.064609,-0.05585,-0.016916,-0.001691,-0.010337,-0.018989,...,-0.045795,-0.027475,0.030528,-0.052218,0.042459,-0.012714,-0.05537,-0.012433,-0.016283,-0.006994


In [81]:
# Полный список поддерживаемых моделей можно найти на https://huggingface.co/models

models = [
          ('sberbank-ai/ruBert-base', 512),
          ('cointegrated/rubert-tiny2', 2048),
          ('DeepPavlov/rubert-base-cased-conversational', 512),
          ('sentence-transformers/LaBSE', 512),
          ('cointegrated/LaBSE-en-ru', 512),
           ('sberbank-ai/ruRoberta-large', 512),
           ('sberbank-ai/sbert_large_nlu_ru', 512),
           ('sberbank-ai/sbert_large_mt_nlu_ru', 512),
           ('sberbank-ai/ruBert-large', 512),
          

          
           #('microsoft/mdeberta-v3-base', 512),
           ('vicgalle/xlm-roberta-large-xnli-anli', 512),
           #('MoritzLaurer/mDeBERTa-v3-base-mnli-xnli', 512),
           #('facebook/bart-large-mnli', 1024)
]
for model in models:
    text_embeddings = TextEmbeddings(True, False)
    tmp = text_embeddings.add_many_embeddings(train.iloc[[0]], 'text', [model])
    display(tmp)

sberbank-ai/ruBert-base


  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,category,text,sberbank-ai/ruBert-base_text_feature_0,sberbank-ai/ruBert-base_text_feature_1,sberbank-ai/ruBert-base_text_feature_2,sberbank-ai/ruBert-base_text_feature_3,sberbank-ai/ruBert-base_text_feature_4,sberbank-ai/ruBert-base_text_feature_5,sberbank-ai/ruBert-base_text_feature_6,sberbank-ai/ruBert-base_text_feature_7,...,sberbank-ai/ruBert-base_text_feature_758,sberbank-ai/ruBert-base_text_feature_759,sberbank-ai/ruBert-base_text_feature_760,sberbank-ai/ruBert-base_text_feature_761,sberbank-ai/ruBert-base_text_feature_762,sberbank-ai/ruBert-base_text_feature_763,sberbank-ai/ruBert-base_text_feature_764,sberbank-ai/ruBert-base_text_feature_765,sberbank-ai/ruBert-base_text_feature_766,sberbank-ai/ruBert-base_text_feature_767
0,extreme,Ледник Пасторури это цирковой ледник расположе...,-0.1246,-0.349294,-0.085619,-0.112011,-0.022692,0.124647,0.615842,0.553348,...,0.098652,-0.00936,0.92347,-0.016722,0.058011,-0.920569,-0.18203,0.067272,-0.091236,0.269201


cointegrated/rubert-tiny2


  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,category,text,cointegrated/rubert-tiny2_text_feature_0,cointegrated/rubert-tiny2_text_feature_1,cointegrated/rubert-tiny2_text_feature_2,cointegrated/rubert-tiny2_text_feature_3,cointegrated/rubert-tiny2_text_feature_4,cointegrated/rubert-tiny2_text_feature_5,cointegrated/rubert-tiny2_text_feature_6,cointegrated/rubert-tiny2_text_feature_7,...,cointegrated/rubert-tiny2_text_feature_302,cointegrated/rubert-tiny2_text_feature_303,cointegrated/rubert-tiny2_text_feature_304,cointegrated/rubert-tiny2_text_feature_305,cointegrated/rubert-tiny2_text_feature_306,cointegrated/rubert-tiny2_text_feature_307,cointegrated/rubert-tiny2_text_feature_308,cointegrated/rubert-tiny2_text_feature_309,cointegrated/rubert-tiny2_text_feature_310,cointegrated/rubert-tiny2_text_feature_311
0,extreme,Ледник Пасторури это цирковой ледник расположе...,0.073752,-0.031242,-0.076214,0.193213,0.118162,-0.099399,0.19756,-0.134615,...,-0.204796,-0.055983,0.008363,-0.125359,0.05548,0.064453,-0.170632,0.098098,0.028671,-0.057577


DeepPavlov/rubert-base-cased-conversational


Some weights of the model checkpoint at DeepPavlov/rubert-base-cased-conversational were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,category,text,DeepPavlov/rubert-base-cased-conversational_text_feature_0,DeepPavlov/rubert-base-cased-conversational_text_feature_1,DeepPavlov/rubert-base-cased-conversational_text_feature_2,DeepPavlov/rubert-base-cased-conversational_text_feature_3,DeepPavlov/rubert-base-cased-conversational_text_feature_4,DeepPavlov/rubert-base-cased-conversational_text_feature_5,DeepPavlov/rubert-base-cased-conversational_text_feature_6,DeepPavlov/rubert-base-cased-conversational_text_feature_7,...,DeepPavlov/rubert-base-cased-conversational_text_feature_758,DeepPavlov/rubert-base-cased-conversational_text_feature_759,DeepPavlov/rubert-base-cased-conversational_text_feature_760,DeepPavlov/rubert-base-cased-conversational_text_feature_761,DeepPavlov/rubert-base-cased-conversational_text_feature_762,DeepPavlov/rubert-base-cased-conversational_text_feature_763,DeepPavlov/rubert-base-cased-conversational_text_feature_764,DeepPavlov/rubert-base-cased-conversational_text_feature_765,DeepPavlov/rubert-base-cased-conversational_text_feature_766,DeepPavlov/rubert-base-cased-conversational_text_feature_767
0,extreme,Ледник Пасторури это цирковой ледник расположе...,0.234252,0.129809,0.03384,0.688277,0.270071,0.097495,-0.149023,-0.032282,...,0.027908,-0.364947,-0.353177,0.429614,-0.067803,0.022929,-0.16436,0.154825,-0.150798,0.081176


sentence-transformers/LaBSE


  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,category,text,sentence-transformers/LaBSE_text_feature_0,sentence-transformers/LaBSE_text_feature_1,sentence-transformers/LaBSE_text_feature_2,sentence-transformers/LaBSE_text_feature_3,sentence-transformers/LaBSE_text_feature_4,sentence-transformers/LaBSE_text_feature_5,sentence-transformers/LaBSE_text_feature_6,sentence-transformers/LaBSE_text_feature_7,...,sentence-transformers/LaBSE_text_feature_758,sentence-transformers/LaBSE_text_feature_759,sentence-transformers/LaBSE_text_feature_760,sentence-transformers/LaBSE_text_feature_761,sentence-transformers/LaBSE_text_feature_762,sentence-transformers/LaBSE_text_feature_763,sentence-transformers/LaBSE_text_feature_764,sentence-transformers/LaBSE_text_feature_765,sentence-transformers/LaBSE_text_feature_766,sentence-transformers/LaBSE_text_feature_767
0,extreme,Ледник Пасторури это цирковой ледник расположе...,-0.25085,-0.427436,-0.222047,0.010641,-0.693528,-0.527626,-0.357342,-0.049272,...,-0.711123,0.076657,0.450998,-0.527786,0.639146,0.006443,-0.644647,0.324797,0.006354,-0.044956


cointegrated/LaBSE-en-ru


  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,category,text,cointegrated/LaBSE-en-ru_text_feature_0,cointegrated/LaBSE-en-ru_text_feature_1,cointegrated/LaBSE-en-ru_text_feature_2,cointegrated/LaBSE-en-ru_text_feature_3,cointegrated/LaBSE-en-ru_text_feature_4,cointegrated/LaBSE-en-ru_text_feature_5,cointegrated/LaBSE-en-ru_text_feature_6,cointegrated/LaBSE-en-ru_text_feature_7,...,cointegrated/LaBSE-en-ru_text_feature_758,cointegrated/LaBSE-en-ru_text_feature_759,cointegrated/LaBSE-en-ru_text_feature_760,cointegrated/LaBSE-en-ru_text_feature_761,cointegrated/LaBSE-en-ru_text_feature_762,cointegrated/LaBSE-en-ru_text_feature_763,cointegrated/LaBSE-en-ru_text_feature_764,cointegrated/LaBSE-en-ru_text_feature_765,cointegrated/LaBSE-en-ru_text_feature_766,cointegrated/LaBSE-en-ru_text_feature_767
0,extreme,Ледник Пасторури это цирковой ледник расположе...,-0.076881,-0.412761,-0.056001,0.046433,-0.449511,-0.253623,-0.301378,-0.103132,...,-0.299683,0.028655,0.4017,-0.196149,0.495118,0.019013,-0.362841,0.161874,-0.045738,-0.059286


sberbank-ai/ruRoberta-large


Some weights of RobertaModel were not initialized from the model checkpoint at sberbank-ai/ruRoberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,category,text,sberbank-ai/ruRoberta-large_text_feature_0,sberbank-ai/ruRoberta-large_text_feature_1,sberbank-ai/ruRoberta-large_text_feature_2,sberbank-ai/ruRoberta-large_text_feature_3,sberbank-ai/ruRoberta-large_text_feature_4,sberbank-ai/ruRoberta-large_text_feature_5,sberbank-ai/ruRoberta-large_text_feature_6,sberbank-ai/ruRoberta-large_text_feature_7,...,sberbank-ai/ruRoberta-large_text_feature_1014,sberbank-ai/ruRoberta-large_text_feature_1015,sberbank-ai/ruRoberta-large_text_feature_1016,sberbank-ai/ruRoberta-large_text_feature_1017,sberbank-ai/ruRoberta-large_text_feature_1018,sberbank-ai/ruRoberta-large_text_feature_1019,sberbank-ai/ruRoberta-large_text_feature_1020,sberbank-ai/ruRoberta-large_text_feature_1021,sberbank-ai/ruRoberta-large_text_feature_1022,sberbank-ai/ruRoberta-large_text_feature_1023
0,extreme,Ледник Пасторури это цирковой ледник расположе...,-0.297709,0.235417,-0.506058,0.004069,0.579705,0.126498,-0.385859,-0.193576,...,0.177381,-0.361036,0.180794,0.098787,0.166444,0.211842,-0.378211,-0.228081,0.112738,0.847715


sberbank-ai/sbert_large_nlu_ru


  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,category,text,sberbank-ai/sbert_large_nlu_ru_text_feature_0,sberbank-ai/sbert_large_nlu_ru_text_feature_1,sberbank-ai/sbert_large_nlu_ru_text_feature_2,sberbank-ai/sbert_large_nlu_ru_text_feature_3,sberbank-ai/sbert_large_nlu_ru_text_feature_4,sberbank-ai/sbert_large_nlu_ru_text_feature_5,sberbank-ai/sbert_large_nlu_ru_text_feature_6,sberbank-ai/sbert_large_nlu_ru_text_feature_7,...,sberbank-ai/sbert_large_nlu_ru_text_feature_1014,sberbank-ai/sbert_large_nlu_ru_text_feature_1015,sberbank-ai/sbert_large_nlu_ru_text_feature_1016,sberbank-ai/sbert_large_nlu_ru_text_feature_1017,sberbank-ai/sbert_large_nlu_ru_text_feature_1018,sberbank-ai/sbert_large_nlu_ru_text_feature_1019,sberbank-ai/sbert_large_nlu_ru_text_feature_1020,sberbank-ai/sbert_large_nlu_ru_text_feature_1021,sberbank-ai/sbert_large_nlu_ru_text_feature_1022,sberbank-ai/sbert_large_nlu_ru_text_feature_1023
0,extreme,Ледник Пасторури это цирковой ледник расположе...,0.347409,-0.048205,-0.669208,-0.158887,-0.083748,0.589775,0.246358,0.156766,...,0.145878,-0.083728,0.158489,-0.231062,-0.583284,-0.200041,-0.385793,0.126939,-0.550977,-0.233419


sberbank-ai/sbert_large_mt_nlu_ru


  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,category,text,sberbank-ai/sbert_large_mt_nlu_ru_text_feature_0,sberbank-ai/sbert_large_mt_nlu_ru_text_feature_1,sberbank-ai/sbert_large_mt_nlu_ru_text_feature_2,sberbank-ai/sbert_large_mt_nlu_ru_text_feature_3,sberbank-ai/sbert_large_mt_nlu_ru_text_feature_4,sberbank-ai/sbert_large_mt_nlu_ru_text_feature_5,sberbank-ai/sbert_large_mt_nlu_ru_text_feature_6,sberbank-ai/sbert_large_mt_nlu_ru_text_feature_7,...,sberbank-ai/sbert_large_mt_nlu_ru_text_feature_1014,sberbank-ai/sbert_large_mt_nlu_ru_text_feature_1015,sberbank-ai/sbert_large_mt_nlu_ru_text_feature_1016,sberbank-ai/sbert_large_mt_nlu_ru_text_feature_1017,sberbank-ai/sbert_large_mt_nlu_ru_text_feature_1018,sberbank-ai/sbert_large_mt_nlu_ru_text_feature_1019,sberbank-ai/sbert_large_mt_nlu_ru_text_feature_1020,sberbank-ai/sbert_large_mt_nlu_ru_text_feature_1021,sberbank-ai/sbert_large_mt_nlu_ru_text_feature_1022,sberbank-ai/sbert_large_mt_nlu_ru_text_feature_1023
0,extreme,Ледник Пасторури это цирковой ледник расположе...,0.461964,-0.206542,-0.68977,0.025292,-0.086468,0.754107,-0.039861,0.319692,...,0.04264,-0.14949,0.23969,-0.22654,-0.338808,-0.105256,-0.368561,0.092168,0.29101,-0.214342


sberbank-ai/ruBert-large


  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,category,text,sberbank-ai/ruBert-large_text_feature_0,sberbank-ai/ruBert-large_text_feature_1,sberbank-ai/ruBert-large_text_feature_2,sberbank-ai/ruBert-large_text_feature_3,sberbank-ai/ruBert-large_text_feature_4,sberbank-ai/ruBert-large_text_feature_5,sberbank-ai/ruBert-large_text_feature_6,sberbank-ai/ruBert-large_text_feature_7,...,sberbank-ai/ruBert-large_text_feature_1014,sberbank-ai/ruBert-large_text_feature_1015,sberbank-ai/ruBert-large_text_feature_1016,sberbank-ai/ruBert-large_text_feature_1017,sberbank-ai/ruBert-large_text_feature_1018,sberbank-ai/ruBert-large_text_feature_1019,sberbank-ai/ruBert-large_text_feature_1020,sberbank-ai/ruBert-large_text_feature_1021,sberbank-ai/ruBert-large_text_feature_1022,sberbank-ai/ruBert-large_text_feature_1023
0,extreme,Ледник Пасторури это цирковой ледник расположе...,0.685256,0.138065,-0.816772,-0.024534,0.168912,0.046213,0.124235,-0.072823,...,0.012156,0.042902,-0.017649,-0.268881,-0.795592,-0.189305,-0.248218,-0.117341,0.310468,-0.009446


vicgalle/xlm-roberta-large-xnli-anli


Some weights of XLMRobertaModel were not initialized from the model checkpoint at vicgalle/xlm-roberta-large-xnli-anli and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,category,text,vicgalle/xlm-roberta-large-xnli-anli_text_feature_0,vicgalle/xlm-roberta-large-xnli-anli_text_feature_1,vicgalle/xlm-roberta-large-xnli-anli_text_feature_2,vicgalle/xlm-roberta-large-xnli-anli_text_feature_3,vicgalle/xlm-roberta-large-xnli-anli_text_feature_4,vicgalle/xlm-roberta-large-xnli-anli_text_feature_5,vicgalle/xlm-roberta-large-xnli-anli_text_feature_6,vicgalle/xlm-roberta-large-xnli-anli_text_feature_7,...,vicgalle/xlm-roberta-large-xnli-anli_text_feature_1014,vicgalle/xlm-roberta-large-xnli-anli_text_feature_1015,vicgalle/xlm-roberta-large-xnli-anli_text_feature_1016,vicgalle/xlm-roberta-large-xnli-anli_text_feature_1017,vicgalle/xlm-roberta-large-xnli-anli_text_feature_1018,vicgalle/xlm-roberta-large-xnli-anli_text_feature_1019,vicgalle/xlm-roberta-large-xnli-anli_text_feature_1020,vicgalle/xlm-roberta-large-xnli-anli_text_feature_1021,vicgalle/xlm-roberta-large-xnli-anli_text_feature_1022,vicgalle/xlm-roberta-large-xnli-anli_text_feature_1023
0,extreme,Ледник Пасторури это цирковой ледник расположе...,0.159746,0.261897,-0.229278,-0.155079,-0.71902,-0.50586,-0.525603,0.298841,...,0.536427,-0.121498,0.482937,-0.906698,-0.673889,0.835474,-0.929037,0.425161,0.244952,0.666885


In [119]:
# Полный список поддерживаемых моделей можно найти на https://huggingface.co/models

models = [
          ('sberbank-ai/ruBert-base', 512),
          ('cointegrated/rubert-tiny2', 2048),
          ('DeepPavlov/rubert-base-cased-conversational', 512),
          ('sentence-transformers/LaBSE', 512),
          ('cointegrated/LaBSE-en-ru', 512),
           ('sberbank-ai/ruRoberta-large', 512),
           ('sberbank-ai/sbert_large_nlu_ru', 512),
           ('sberbank-ai/sbert_large_mt_nlu_ru', 512),
           ('sberbank-ai/ruBert-large', 512),
          

          
           #('microsoft/mdeberta-v3-base', 512),
           ('vicgalle/xlm-roberta-large-xnli-anli', 512),
           #('MoritzLaurer/mDeBERTa-v3-base-mnli-xnli', 512),
           #('facebook/bart-large-mnli', 1024)
]
for model in models:
    text_embeddings = TextEmbeddings(False, True)
    tmp = text_embeddings.add_many_embeddings(train.iloc[[0]], 'text', [model])
    display(tmp)

sberbank-ai/ruBert-base


  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,category,text,sberbank-ai/ruBert-base_text_feature_0,sberbank-ai/ruBert-base_text_feature_1,sberbank-ai/ruBert-base_text_feature_2,sberbank-ai/ruBert-base_text_feature_3,sberbank-ai/ruBert-base_text_feature_4,sberbank-ai/ruBert-base_text_feature_5,sberbank-ai/ruBert-base_text_feature_6,sberbank-ai/ruBert-base_text_feature_7,...,sberbank-ai/ruBert-base_text_feature_758,sberbank-ai/ruBert-base_text_feature_759,sberbank-ai/ruBert-base_text_feature_760,sberbank-ai/ruBert-base_text_feature_761,sberbank-ai/ruBert-base_text_feature_762,sberbank-ai/ruBert-base_text_feature_763,sberbank-ai/ruBert-base_text_feature_764,sberbank-ai/ruBert-base_text_feature_765,sberbank-ai/ruBert-base_text_feature_766,sberbank-ai/ruBert-base_text_feature_767
0,extreme,Ледник Пасторури это цирковой ледник расположе...,0.272157,0.155383,0.060286,0.363159,-0.140392,0.507753,-0.226326,0.431879,...,-0.092746,-0.320734,-0.527221,0.346372,-0.371591,0.084271,0.122088,0.002221,-0.000357,-0.124441


cointegrated/rubert-tiny2


  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,category,text,cointegrated/rubert-tiny2_text_feature_0,cointegrated/rubert-tiny2_text_feature_1,cointegrated/rubert-tiny2_text_feature_2,cointegrated/rubert-tiny2_text_feature_3,cointegrated/rubert-tiny2_text_feature_4,cointegrated/rubert-tiny2_text_feature_5,cointegrated/rubert-tiny2_text_feature_6,cointegrated/rubert-tiny2_text_feature_7,...,cointegrated/rubert-tiny2_text_feature_302,cointegrated/rubert-tiny2_text_feature_303,cointegrated/rubert-tiny2_text_feature_304,cointegrated/rubert-tiny2_text_feature_305,cointegrated/rubert-tiny2_text_feature_306,cointegrated/rubert-tiny2_text_feature_307,cointegrated/rubert-tiny2_text_feature_308,cointegrated/rubert-tiny2_text_feature_309,cointegrated/rubert-tiny2_text_feature_310,cointegrated/rubert-tiny2_text_feature_311
0,extreme,Ледник Пасторури это цирковой ледник расположе...,-0.27044,0.125625,0.274958,-1.458632,-0.325684,0.209481,-0.538193,-0.294977,...,-0.217944,0.007775,0.097263,-0.46204,0.001138,-0.718393,0.212273,-0.482992,0.359072,-1.07429


DeepPavlov/rubert-base-cased-conversational


Some weights of the model checkpoint at DeepPavlov/rubert-base-cased-conversational were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,category,text,DeepPavlov/rubert-base-cased-conversational_text_feature_0,DeepPavlov/rubert-base-cased-conversational_text_feature_1,DeepPavlov/rubert-base-cased-conversational_text_feature_2,DeepPavlov/rubert-base-cased-conversational_text_feature_3,DeepPavlov/rubert-base-cased-conversational_text_feature_4,DeepPavlov/rubert-base-cased-conversational_text_feature_5,DeepPavlov/rubert-base-cased-conversational_text_feature_6,DeepPavlov/rubert-base-cased-conversational_text_feature_7,...,DeepPavlov/rubert-base-cased-conversational_text_feature_758,DeepPavlov/rubert-base-cased-conversational_text_feature_759,DeepPavlov/rubert-base-cased-conversational_text_feature_760,DeepPavlov/rubert-base-cased-conversational_text_feature_761,DeepPavlov/rubert-base-cased-conversational_text_feature_762,DeepPavlov/rubert-base-cased-conversational_text_feature_763,DeepPavlov/rubert-base-cased-conversational_text_feature_764,DeepPavlov/rubert-base-cased-conversational_text_feature_765,DeepPavlov/rubert-base-cased-conversational_text_feature_766,DeepPavlov/rubert-base-cased-conversational_text_feature_767
0,extreme,Ледник Пасторури это цирковой ледник расположе...,-0.618956,-0.219038,-0.295542,-0.043835,0.187446,0.339212,0.297976,0.204837,...,-0.783512,0.589173,-0.022017,-0.231996,0.875345,-0.112136,-0.048333,0.161958,0.981491,-0.298607


sentence-transformers/LaBSE


  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,category,text,sentence-transformers/LaBSE_text_feature_0,sentence-transformers/LaBSE_text_feature_1,sentence-transformers/LaBSE_text_feature_2,sentence-transformers/LaBSE_text_feature_3,sentence-transformers/LaBSE_text_feature_4,sentence-transformers/LaBSE_text_feature_5,sentence-transformers/LaBSE_text_feature_6,sentence-transformers/LaBSE_text_feature_7,...,sentence-transformers/LaBSE_text_feature_758,sentence-transformers/LaBSE_text_feature_759,sentence-transformers/LaBSE_text_feature_760,sentence-transformers/LaBSE_text_feature_761,sentence-transformers/LaBSE_text_feature_762,sentence-transformers/LaBSE_text_feature_763,sentence-transformers/LaBSE_text_feature_764,sentence-transformers/LaBSE_text_feature_765,sentence-transformers/LaBSE_text_feature_766,sentence-transformers/LaBSE_text_feature_767
0,extreme,Ледник Пасторури это цирковой ледник расположе...,0.857106,-0.215657,1.067143,0.311904,-0.330101,0.720545,-0.263803,0.03688,...,0.297108,-0.040586,0.602947,-0.2219,1.02662,0.107726,-0.359772,0.015022,-0.514056,-0.497934


cointegrated/LaBSE-en-ru


  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,category,text,cointegrated/LaBSE-en-ru_text_feature_0,cointegrated/LaBSE-en-ru_text_feature_1,cointegrated/LaBSE-en-ru_text_feature_2,cointegrated/LaBSE-en-ru_text_feature_3,cointegrated/LaBSE-en-ru_text_feature_4,cointegrated/LaBSE-en-ru_text_feature_5,cointegrated/LaBSE-en-ru_text_feature_6,cointegrated/LaBSE-en-ru_text_feature_7,...,cointegrated/LaBSE-en-ru_text_feature_758,cointegrated/LaBSE-en-ru_text_feature_759,cointegrated/LaBSE-en-ru_text_feature_760,cointegrated/LaBSE-en-ru_text_feature_761,cointegrated/LaBSE-en-ru_text_feature_762,cointegrated/LaBSE-en-ru_text_feature_763,cointegrated/LaBSE-en-ru_text_feature_764,cointegrated/LaBSE-en-ru_text_feature_765,cointegrated/LaBSE-en-ru_text_feature_766,cointegrated/LaBSE-en-ru_text_feature_767
0,extreme,Ледник Пасторури это цирковой ледник расположе...,0.399257,-0.014758,0.616264,0.149836,-0.242052,0.639204,-0.380299,-0.161804,...,0.268026,0.064704,0.130655,0.02184,0.568038,0.156017,-0.069051,0.134307,-0.321157,-0.437823


sberbank-ai/ruRoberta-large


Some weights of RobertaModel were not initialized from the model checkpoint at sberbank-ai/ruRoberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,category,text,sberbank-ai/ruRoberta-large_text_feature_0,sberbank-ai/ruRoberta-large_text_feature_1,sberbank-ai/ruRoberta-large_text_feature_2,sberbank-ai/ruRoberta-large_text_feature_3,sberbank-ai/ruRoberta-large_text_feature_4,sberbank-ai/ruRoberta-large_text_feature_5,sberbank-ai/ruRoberta-large_text_feature_6,sberbank-ai/ruRoberta-large_text_feature_7,...,sberbank-ai/ruRoberta-large_text_feature_1014,sberbank-ai/ruRoberta-large_text_feature_1015,sberbank-ai/ruRoberta-large_text_feature_1016,sberbank-ai/ruRoberta-large_text_feature_1017,sberbank-ai/ruRoberta-large_text_feature_1018,sberbank-ai/ruRoberta-large_text_feature_1019,sberbank-ai/ruRoberta-large_text_feature_1020,sberbank-ai/ruRoberta-large_text_feature_1021,sberbank-ai/ruRoberta-large_text_feature_1022,sberbank-ai/ruRoberta-large_text_feature_1023
0,extreme,Ледник Пасторури это цирковой ледник расположе...,0.386927,-0.017254,0.128882,-0.046298,0.755182,0.117258,-0.260361,0.367241,...,-0.334861,0.101159,0.211693,0.298024,-0.111337,0.068003,0.376486,0.13787,0.187138,0.039322


sberbank-ai/sbert_large_nlu_ru


  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,category,text,sberbank-ai/sbert_large_nlu_ru_text_feature_0,sberbank-ai/sbert_large_nlu_ru_text_feature_1,sberbank-ai/sbert_large_nlu_ru_text_feature_2,sberbank-ai/sbert_large_nlu_ru_text_feature_3,sberbank-ai/sbert_large_nlu_ru_text_feature_4,sberbank-ai/sbert_large_nlu_ru_text_feature_5,sberbank-ai/sbert_large_nlu_ru_text_feature_6,sberbank-ai/sbert_large_nlu_ru_text_feature_7,...,sberbank-ai/sbert_large_nlu_ru_text_feature_1014,sberbank-ai/sbert_large_nlu_ru_text_feature_1015,sberbank-ai/sbert_large_nlu_ru_text_feature_1016,sberbank-ai/sbert_large_nlu_ru_text_feature_1017,sberbank-ai/sbert_large_nlu_ru_text_feature_1018,sberbank-ai/sbert_large_nlu_ru_text_feature_1019,sberbank-ai/sbert_large_nlu_ru_text_feature_1020,sberbank-ai/sbert_large_nlu_ru_text_feature_1021,sberbank-ai/sbert_large_nlu_ru_text_feature_1022,sberbank-ai/sbert_large_nlu_ru_text_feature_1023
0,extreme,Ледник Пасторури это цирковой ледник расположе...,0.415026,0.092475,0.492794,0.48916,-0.173276,-0.992799,0.658386,0.733077,...,-0.554615,-0.074569,-0.685439,0.697551,-0.6611,0.057302,-0.192679,-0.104465,0.302485,0.08279


sberbank-ai/sbert_large_mt_nlu_ru


  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,category,text,sberbank-ai/sbert_large_mt_nlu_ru_text_feature_0,sberbank-ai/sbert_large_mt_nlu_ru_text_feature_1,sberbank-ai/sbert_large_mt_nlu_ru_text_feature_2,sberbank-ai/sbert_large_mt_nlu_ru_text_feature_3,sberbank-ai/sbert_large_mt_nlu_ru_text_feature_4,sberbank-ai/sbert_large_mt_nlu_ru_text_feature_5,sberbank-ai/sbert_large_mt_nlu_ru_text_feature_6,sberbank-ai/sbert_large_mt_nlu_ru_text_feature_7,...,sberbank-ai/sbert_large_mt_nlu_ru_text_feature_1014,sberbank-ai/sbert_large_mt_nlu_ru_text_feature_1015,sberbank-ai/sbert_large_mt_nlu_ru_text_feature_1016,sberbank-ai/sbert_large_mt_nlu_ru_text_feature_1017,sberbank-ai/sbert_large_mt_nlu_ru_text_feature_1018,sberbank-ai/sbert_large_mt_nlu_ru_text_feature_1019,sberbank-ai/sbert_large_mt_nlu_ru_text_feature_1020,sberbank-ai/sbert_large_mt_nlu_ru_text_feature_1021,sberbank-ai/sbert_large_mt_nlu_ru_text_feature_1022,sberbank-ai/sbert_large_mt_nlu_ru_text_feature_1023
0,extreme,Ледник Пасторури это цирковой ледник расположе...,1.021607,-0.240039,-0.333009,0.627153,-0.15489,-0.767992,0.281795,0.527032,...,-0.170824,-0.72699,0.120144,1.997994,-1.256079,0.18146,-0.2253,-0.606861,0.566627,-0.121111


sberbank-ai/ruBert-large


  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,category,text,sberbank-ai/ruBert-large_text_feature_0,sberbank-ai/ruBert-large_text_feature_1,sberbank-ai/ruBert-large_text_feature_2,sberbank-ai/ruBert-large_text_feature_3,sberbank-ai/ruBert-large_text_feature_4,sberbank-ai/ruBert-large_text_feature_5,sberbank-ai/ruBert-large_text_feature_6,sberbank-ai/ruBert-large_text_feature_7,...,sberbank-ai/ruBert-large_text_feature_1014,sberbank-ai/ruBert-large_text_feature_1015,sberbank-ai/ruBert-large_text_feature_1016,sberbank-ai/ruBert-large_text_feature_1017,sberbank-ai/ruBert-large_text_feature_1018,sberbank-ai/ruBert-large_text_feature_1019,sberbank-ai/ruBert-large_text_feature_1020,sberbank-ai/ruBert-large_text_feature_1021,sberbank-ai/ruBert-large_text_feature_1022,sberbank-ai/ruBert-large_text_feature_1023
0,extreme,Ледник Пасторури это цирковой ледник расположе...,0.20393,0.305737,-0.096678,0.565531,-0.101286,-0.408934,0.197335,0.50366,...,0.073689,-0.142128,-0.191717,0.452236,-0.540893,0.20142,-0.010704,-0.111506,-0.302127,0.123216


vicgalle/xlm-roberta-large-xnli-anli


Some weights of XLMRobertaModel were not initialized from the model checkpoint at vicgalle/xlm-roberta-large-xnli-anli and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,category,text,vicgalle/xlm-roberta-large-xnli-anli_text_feature_0,vicgalle/xlm-roberta-large-xnli-anli_text_feature_1,vicgalle/xlm-roberta-large-xnli-anli_text_feature_2,vicgalle/xlm-roberta-large-xnli-anli_text_feature_3,vicgalle/xlm-roberta-large-xnli-anli_text_feature_4,vicgalle/xlm-roberta-large-xnli-anli_text_feature_5,vicgalle/xlm-roberta-large-xnli-anli_text_feature_6,vicgalle/xlm-roberta-large-xnli-anli_text_feature_7,...,vicgalle/xlm-roberta-large-xnli-anli_text_feature_1014,vicgalle/xlm-roberta-large-xnli-anli_text_feature_1015,vicgalle/xlm-roberta-large-xnli-anli_text_feature_1016,vicgalle/xlm-roberta-large-xnli-anli_text_feature_1017,vicgalle/xlm-roberta-large-xnli-anli_text_feature_1018,vicgalle/xlm-roberta-large-xnli-anli_text_feature_1019,vicgalle/xlm-roberta-large-xnli-anli_text_feature_1020,vicgalle/xlm-roberta-large-xnli-anli_text_feature_1021,vicgalle/xlm-roberta-large-xnli-anli_text_feature_1022,vicgalle/xlm-roberta-large-xnli-anli_text_feature_1023
0,extreme,Ледник Пасторури это цирковой ледник расположе...,0.274763,0.264808,0.000878,-0.204066,-0.199178,-0.035506,-0.380218,-0.127966,...,-0.4328,-0.784799,0.157693,-0.356171,-0.409211,-0.004737,-0.404739,0.394192,0.213243,0.166621


In [83]:

RANDOM_STATE = 42
cb_init_params_cust = {
        'loss_function': 'MultiClass',
        
        # Ограничим глубину деревьев для ускорения
        'depth': 4,
        'iterations': 3500,

        # Регуляризация и ускорение
        'max_bin': 187,
        'task_type': 'GPU' if torch.cuda.is_available() else 'CPU',
        'thread_count': -1,
        'bootstrap_type': 'Bernoulli', 
            
        # Важное!
        'random_seed': RANDOM_STATE,
        'auto_class_weights': 'SqrtBalanced',
        'early_stopping_rounds': 30
    }

In [86]:
def train_model(algorithm,
                X,
                y,
                early_stopping_rounds,
                init_params=None,
                cat_features=None,
                text_features=None,
                random_seed=2024
    ):
    scores = []
    models = []

    kf = KFold(n_splits=3, shuffle=True, random_state=random_seed)

    print(f"========= TRAINING {algorithm.__name__} =========")

    for num_fold, (train_index, val_index) in enumerate(kf.split(X)):
        X_train, X_eval = X.iloc[train_index], X.iloc[val_index]
        y_train, y_eval = y.iloc[train_index], y.iloc[val_index]

        if init_params is not None:
            model = algorithm(**init_params)
        else:
            model = algorithm()

        if 'CatBoost' in algorithm.__name__:
            # Специальный класс для ускорения обучения 
            train_dataset = Pool(data=X_train, label=y_train, cat_features=cat_features, text_features=text_features)
            eval_dataset  = Pool(data=X_eval, label=y_eval, cat_features=cat_features, text_features=text_features)

            model.fit(train_dataset,
                      eval_set=eval_dataset,
                      verbose=0,
                      early_stopping_rounds=early_stopping_rounds)

        elif 'LGBM' in algorithm.__name__:
            # Специальный класс для ускорения обучения 
            train_dataset = Dataset(X_train, y_train, categorical_feature=cat_features, free_raw_data=False,)
            eval_dataset  = Dataset(X_eval, y_eval, categorical_feature=cat_features, free_raw_data=False,)

            model = lgb.train(params=init_params,
                              train_set=train_dataset,
                              valid_sets=(eval_dataset),
                              #callbacks=[lgb.log_evaluation(10)],
                              #           lgb.early_stopping(stopping_rounds=5)],
                              categorical_feature=cat_features,
                              #verbose_eval=False                   # в новой версии LightGBM по логи по умолчанию отключены
                              )

        elif 'XGB' in algorithm.__name__:
            # Специальный класс для ускорения обучения
            train_dataset = xgb.DMatrix(X_train, label=y_train, nthread=-1, enable_categorical=True,)
            eval_dataset  = xgb.DMatrix(X_eval,  label=y_eval,  nthread=-1, enable_categorical=True,)

            model = xgb.train(params=init_params,
                              dtrain=train_dataset,
                              evals=[(train_dataset, 'dtrain'), (eval_dataset, 'dtest')],
                              verbose_eval=False,
                              early_stopping_rounds=early_stopping_rounds)

            X_eval = eval_dataset

        # Предсказание на X_eval и расчет RMSE
        y_pred = model.predict(X_eval)
        score = balanced_accuracy_score(y_eval, y_pred)

        models.append(model)
        scores.append(score)

        print(f'FOLD {num_fold}: SCORE {score}')

    mean_kfold_score = np.mean(scores, dtype="float16") - np.std(scores, dtype="float16")
    print("\nMEAN BALANCED ACCURACY SCORE", mean_kfold_score)

    # Модель с наименьшим значением скора
    best_model = models[scores.index(min(scores))]

    return mean_kfold_score, best_model

In [87]:
from catboost import CatBoostClassifier, Pool

In [89]:
models = [
          ('sberbank-ai/ruBert-base', 512),
          ('cointegrated/rubert-tiny2', 2048),
          ('DeepPavlov/rubert-base-cased-conversational', 512),
          ('sentence-transformers/LaBSE', 512),
          ('cointegrated/LaBSE-en-ru', 512),
           ('sberbank-ai/ruRoberta-large', 512),
           ('sberbank-ai/sbert_large_nlu_ru', 512),
           ('sberbank-ai/sbert_large_mt_nlu_ru', 512),
           ('sberbank-ai/ruBert-large', 512),
          

          
           #('microsoft/mdeberta-v3-base', 512),
           ('vicgalle/xlm-roberta-large-xnli-anli', 512),
           #('MoritzLaurer/mDeBERTa-v3-base-mnli-xnli', 512),
           #('facebook/bart-large-mnli', 1024)
]
for model in models:
    print(model[0])

sberbank-ai/ruBert-base
cointegrated/rubert-tiny2
DeepPavlov/rubert-base-cased-conversational
sentence-transformers/LaBSE
cointegrated/LaBSE-en-ru
sberbank-ai/ruRoberta-large
sberbank-ai/sbert_large_nlu_ru
sberbank-ai/sbert_large_mt_nlu_ru
sberbank-ai/ruBert-large
vicgalle/xlm-roberta-large-xnli-anli


In [114]:
# Полный список поддерживаемых моделей можно найти на https://huggingface.co/models

models = [
          ('sberbank-ai/ruBert-base', 512),
          ('cointegrated/rubert-tiny2', 2048),
          ('DeepPavlov/rubert-base-cased-conversational', 512),
          ('sentence-transformers/LaBSE', 512),
          ('cointegrated/LaBSE-en-ru', 512),
           ('sberbank-ai/ruRoberta-large', 512),
           ('sberbank-ai/sbert_large_nlu_ru', 512),
           ('sberbank-ai/sbert_large_mt_nlu_ru', 512),
           ('sberbank-ai/ruBert-large', 512),
          

          
           #('microsoft/mdeberta-v3-base', 512),
           ('vicgalle/xlm-roberta-large-xnli-anli', 512),
           #('MoritzLaurer/mDeBERTa-v3-base-mnli-xnli', 512),
           #('facebook/bart-large-mnli', 1024)
]
for model in models:
    train = pd.read_csv('../data/text_classification_train.csv')
    test = pd.read_csv('../data/text_classification_test.csv')
    
    
    
    text_embeddings = TextEmbeddings(False, True)
    train = text_embeddings.add_many_embeddings(train, 'text', [model])
    test = text_embeddings.add_many_embeddings(test, 'text', [model])


    print(model)

    cb_score, cb_model = train_model(
        algorithm=CatBoostClassifier,
        X=train.drop(columns=['category', 'text']), y=train['category'],
        init_params=cb_init_params_cust,
        early_stopping_rounds=30,
        #text_features=['text'],
        random_seed=RANDOM_STATE
        )
    
    model_name = '_'.join(model[0].split('/'))
    pd.DataFrame(cb_model.predict(test.drop(columns='text')), columns=['category']).to_csv(f'../subs/cb_model_False_True_{model_name}_{model[1]}.csv', index=False)

sberbank-ai/ruBert-base


  0%|          | 0/7500 [00:00<?, ?it/s]

sberbank-ai/ruBert-base


  0%|          | 0/2500 [00:00<?, ?it/s]

('sberbank-ai/ruBert-base', 512)




FOLD 0: SCORE 0.6835966450669996




FOLD 1: SCORE 0.6744327824988847




FOLD 2: SCORE 0.6721239699033392

MEAN BALANCED ACCURACY SCORE 0.6724
cointegrated/rubert-tiny2


  0%|          | 0/7500 [00:00<?, ?it/s]

cointegrated/rubert-tiny2


  0%|          | 0/2500 [00:00<?, ?it/s]

('cointegrated/rubert-tiny2', 2048)




FOLD 0: SCORE 0.5513404975979704




FOLD 1: SCORE 0.5539672910697848




FOLD 2: SCORE 0.5579395193076688

MEAN BALANCED ACCURACY SCORE 0.552
DeepPavlov/rubert-base-cased-conversational


Some weights of the model checkpoint at DeepPavlov/rubert-base-cased-conversational were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/7500 [00:00<?, ?it/s]

DeepPavlov/rubert-base-cased-conversational


Some weights of the model checkpoint at DeepPavlov/rubert-base-cased-conversational were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/2500 [00:00<?, ?it/s]

('DeepPavlov/rubert-base-cased-conversational', 512)




FOLD 0: SCORE 0.5792180734484684




FOLD 1: SCORE 0.5648614108484784




FOLD 2: SCORE 0.5674280475634462

MEAN BALANCED ACCURACY SCORE 0.564
sentence-transformers/LaBSE


  0%|          | 0/7500 [00:00<?, ?it/s]

sentence-transformers/LaBSE


  0%|          | 0/2500 [00:00<?, ?it/s]

('sentence-transformers/LaBSE', 512)




FOLD 0: SCORE 0.6651459841796643




FOLD 1: SCORE 0.6506902245482551




FOLD 2: SCORE 0.6668804418784802

MEAN BALANCED ACCURACY SCORE 0.6533
cointegrated/LaBSE-en-ru


  0%|          | 0/7500 [00:00<?, ?it/s]

cointegrated/LaBSE-en-ru


  0%|          | 0/2500 [00:00<?, ?it/s]

('cointegrated/LaBSE-en-ru', 512)




FOLD 0: SCORE 0.6871740314029218




FOLD 1: SCORE 0.6748784011173234




FOLD 2: SCORE 0.687415197552776

MEAN BALANCED ACCURACY SCORE 0.6772
sberbank-ai/ruRoberta-large


Some weights of RobertaModel were not initialized from the model checkpoint at sberbank-ai/ruRoberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/7500 [00:00<?, ?it/s]

sberbank-ai/ruRoberta-large


Some weights of RobertaModel were not initialized from the model checkpoint at sberbank-ai/ruRoberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/2500 [00:00<?, ?it/s]

('sberbank-ai/ruRoberta-large', 512)




FOLD 0: SCORE 0.6092802541689742




FOLD 1: SCORE 0.6038754376020863




FOLD 2: SCORE 0.6014725737036308

MEAN BALANCED ACCURACY SCORE 0.6016
sberbank-ai/sbert_large_nlu_ru


  0%|          | 0/7500 [00:00<?, ?it/s]

sberbank-ai/sbert_large_nlu_ru


  0%|          | 0/2500 [00:00<?, ?it/s]

('sberbank-ai/sbert_large_nlu_ru', 512)




FOLD 0: SCORE 0.6349966701110472




FOLD 1: SCORE 0.6186789941122197




FOLD 2: SCORE 0.6195090361991299

MEAN BALANCED ACCURACY SCORE 0.617
sberbank-ai/sbert_large_mt_nlu_ru


  0%|          | 0/7500 [00:00<?, ?it/s]

sberbank-ai/sbert_large_mt_nlu_ru


  0%|          | 0/2500 [00:00<?, ?it/s]

('sberbank-ai/sbert_large_mt_nlu_ru', 512)




FOLD 0: SCORE 0.6297258605007889




FOLD 1: SCORE 0.6153935812920673




FOLD 2: SCORE 0.6003947386827996

MEAN BALANCED ACCURACY SCORE 0.603
sberbank-ai/ruBert-large


  0%|          | 0/7500 [00:00<?, ?it/s]

sberbank-ai/ruBert-large


  0%|          | 0/2500 [00:00<?, ?it/s]

('sberbank-ai/ruBert-large', 512)




FOLD 0: SCORE 0.6446203032049126




FOLD 1: SCORE 0.6363942700420918




FOLD 2: SCORE 0.6330525288661664

MEAN BALANCED ACCURACY SCORE 0.6333
vicgalle/xlm-roberta-large-xnli-anli


Some weights of XLMRobertaModel were not initialized from the model checkpoint at vicgalle/xlm-roberta-large-xnli-anli and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/7500 [00:00<?, ?it/s]

vicgalle/xlm-roberta-large-xnli-anli


Some weights of XLMRobertaModel were not initialized from the model checkpoint at vicgalle/xlm-roberta-large-xnli-anli and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/2500 [00:00<?, ?it/s]

('vicgalle/xlm-roberta-large-xnli-anli', 512)




FOLD 0: SCORE 0.3586445986694841




FOLD 1: SCORE 0.3457664991658055




FOLD 2: SCORE 0.3443730580518405

MEAN BALANCED ACCURACY SCORE 0.3433


In [115]:
# Полный список поддерживаемых моделей можно найти на https://huggingface.co/models

models = [
          ('sberbank-ai/ruBert-base', 512),
          ('cointegrated/rubert-tiny2', 2048),
          ('DeepPavlov/rubert-base-cased-conversational', 512),
          ('sentence-transformers/LaBSE', 512),
          ('cointegrated/LaBSE-en-ru', 512),
           ('sberbank-ai/ruRoberta-large', 512),
           ('sberbank-ai/sbert_large_nlu_ru', 512),
           ('sberbank-ai/sbert_large_mt_nlu_ru', 512),
           ('sberbank-ai/ruBert-large', 512),
          

          
           #('microsoft/mdeberta-v3-base', 512),
           ('vicgalle/xlm-roberta-large-xnli-anli', 512),
           #('MoritzLaurer/mDeBERTa-v3-base-mnli-xnli', 512),
           #('facebook/bart-large-mnli', 1024)
]
for model in models:
    train = pd.read_csv('../data/text_classification_train.csv')
    test = pd.read_csv('../data/text_classification_test.csv')
    
    
    
    text_embeddings = TextEmbeddings(True, False)
    train = text_embeddings.add_many_embeddings(train, 'text', [model])
    test = text_embeddings.add_many_embeddings(test, 'text', [model])


    print(model)

    cb_score, cb_model = train_model(
        algorithm=CatBoostClassifier,
        X=train.drop(columns=['category', 'text']), y=train['category'],
        init_params=cb_init_params_cust,
        early_stopping_rounds=30,
        #text_features=['text'],
        random_seed=RANDOM_STATE
        )
    
    model_name = '_'.join(model[0].split('/'))
    pd.DataFrame(cb_model.predict(test.drop(columns='text')), columns=['category']).to_csv(f'../subs/cb_model_False_True_{model_name}_{model[1]}.csv', index=False)


sberbank-ai/ruBert-base


  0%|          | 0/7500 [00:00<?, ?it/s]

sberbank-ai/ruBert-base


  0%|          | 0/2500 [00:00<?, ?it/s]

('sberbank-ai/ruBert-base', 512)




FOLD 0: SCORE 0.36831821157856015




FOLD 1: SCORE 0.3462740827750925




FOLD 2: SCORE 0.34659578221442355

MEAN BALANCED ACCURACY SCORE 0.3435
cointegrated/rubert-tiny2


  0%|          | 0/7500 [00:00<?, ?it/s]

cointegrated/rubert-tiny2


  0%|          | 0/2500 [00:00<?, ?it/s]

('cointegrated/rubert-tiny2', 2048)




FOLD 0: SCORE 0.4832962800322447




FOLD 1: SCORE 0.48333113580294434




FOLD 2: SCORE 0.46669721171835243

MEAN BALANCED ACCURACY SCORE 0.47
DeepPavlov/rubert-base-cased-conversational


Some weights of the model checkpoint at DeepPavlov/rubert-base-cased-conversational were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/7500 [00:00<?, ?it/s]

DeepPavlov/rubert-base-cased-conversational


Some weights of the model checkpoint at DeepPavlov/rubert-base-cased-conversational were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/2500 [00:00<?, ?it/s]

('DeepPavlov/rubert-base-cased-conversational', 512)




FOLD 0: SCORE 0.4205267712225738




FOLD 1: SCORE 0.4072665910313014




FOLD 2: SCORE 0.40556859896068514

MEAN BALANCED ACCURACY SCORE 0.4045
sentence-transformers/LaBSE


  0%|          | 0/7500 [00:00<?, ?it/s]

sentence-transformers/LaBSE


  0%|          | 0/2500 [00:00<?, ?it/s]

('sentence-transformers/LaBSE', 512)




FOLD 0: SCORE 0.6512730598165231




FOLD 1: SCORE 0.6379298629308399




FOLD 2: SCORE 0.662574744507883

MEAN BALANCED ACCURACY SCORE 0.64
cointegrated/LaBSE-en-ru


  0%|          | 0/7500 [00:00<?, ?it/s]

cointegrated/LaBSE-en-ru


  0%|          | 0/2500 [00:00<?, ?it/s]

('cointegrated/LaBSE-en-ru', 512)




FOLD 0: SCORE 0.6857827873903606




FOLD 1: SCORE 0.6723472040777009




FOLD 2: SCORE 0.6633622575801229

MEAN BALANCED ACCURACY SCORE 0.6646
sberbank-ai/ruRoberta-large


Some weights of RobertaModel were not initialized from the model checkpoint at sberbank-ai/ruRoberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/7500 [00:00<?, ?it/s]

sberbank-ai/ruRoberta-large


Some weights of RobertaModel were not initialized from the model checkpoint at sberbank-ai/ruRoberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/2500 [00:00<?, ?it/s]

('sberbank-ai/ruRoberta-large', 512)




FOLD 0: SCORE 0.43770608065393823




FOLD 1: SCORE 0.4485206341783569




FOLD 2: SCORE 0.4258154579283962

MEAN BALANCED ACCURACY SCORE 0.4282
sberbank-ai/sbert_large_nlu_ru


  0%|          | 0/7500 [00:00<?, ?it/s]

sberbank-ai/sbert_large_nlu_ru


  0%|          | 0/2500 [00:00<?, ?it/s]

('sberbank-ai/sbert_large_nlu_ru', 512)




FOLD 0: SCORE 0.5851200741860374




FOLD 1: SCORE 0.5818091511345933




FOLD 2: SCORE 0.5673972939676779

MEAN BALANCED ACCURACY SCORE 0.5703
sberbank-ai/sbert_large_mt_nlu_ru


  0%|          | 0/7500 [00:00<?, ?it/s]

sberbank-ai/sbert_large_mt_nlu_ru


  0%|          | 0/2500 [00:00<?, ?it/s]

('sberbank-ai/sbert_large_mt_nlu_ru', 512)




FOLD 0: SCORE 0.5861520418898778




FOLD 1: SCORE 0.5766903620575154




FOLD 2: SCORE 0.5773471350884901

MEAN BALANCED ACCURACY SCORE 0.5757
sberbank-ai/ruBert-large


  0%|          | 0/7500 [00:00<?, ?it/s]

sberbank-ai/ruBert-large


  0%|          | 0/2500 [00:00<?, ?it/s]

('sberbank-ai/ruBert-large', 512)




FOLD 0: SCORE 0.5340571961565064




FOLD 1: SCORE 0.5258323982559555




FOLD 2: SCORE 0.5309348965250986

MEAN BALANCED ACCURACY SCORE 0.527
vicgalle/xlm-roberta-large-xnli-anli


Some weights of XLMRobertaModel were not initialized from the model checkpoint at vicgalle/xlm-roberta-large-xnli-anli and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/7500 [00:00<?, ?it/s]

vicgalle/xlm-roberta-large-xnli-anli


Some weights of XLMRobertaModel were not initialized from the model checkpoint at vicgalle/xlm-roberta-large-xnli-anli and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/2500 [00:00<?, ?it/s]

('vicgalle/xlm-roberta-large-xnli-anli', 512)




FOLD 0: SCORE 0.19122396263141012




FOLD 1: SCORE 0.18563895776980271




FOLD 2: SCORE 0.16905418363313393

MEAN BALANCED ACCURACY SCORE 0.1726


In [None]:
'sberbank-ai/ruBert-base', 'cointegrated/rubert-tiny2', 'DeepPavlov/rubert-base-cased-conversational' и 'sentence-transformers/LaBSE'.

In [3]:
train_full = pd.read_csv('https://www.dropbox.com/scl/fi/9hb4r3uce0mqz8fkpja17/text_classification_train.csv?rlkey=w42y98wa401gelzou08pp582k&dl=1')
test_full = pd.read_csv('https://www.dropbox.com/scl/fi/7z7rsy14amjeugf166i1t/text_classification_test.csv?rlkey=z53jgwhijd6bpvk7n8n2munwb&dl=1')

In [5]:
train[['category', 'text']].to_csv('../data/text_classification_train.csv', index=False)
test[['text']].to_csv('../data/text_classification_test.csv', index=False)

# Часть 2

In [None]:
!pip install optuna optuna-integration catboost -q

import json
import os

import numpy as np
import pandas as pd

from catboost import CatBoostClassifier, Pool

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, train_test_split, cross_val_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.svm import SVC, LinearSVC

import optuna
import torch
import seaborn as sns
import shap

In [None]:
train = pd.read_csv('https://www.dropbox.com/scl/fi/9hb4r3uce0mqz8fkpja17/text_classification_train.csv?rlkey=w42y98wa401gelzou08pp582k&dl=1')
test = pd.read_csv('https://www.dropbox.com/scl/fi/7z7rsy14amjeugf166i1t/text_classification_test.csv?rlkey=z53jgwhijd6bpvk7n8n2munwb&dl=1')

print(train.shape, test.shape)
train.head(3)

In [None]:
RANDOM_STATE = 42

## Вспомогательные функции

In [None]:
def train_model(algorithm,
                X,
                y,
                early_stopping_rounds,
                init_params=None,
                cat_features=None,
                text_features=None,
                random_seed=2024
    ):
    scores = []
    models = []

    kf = KFold(n_splits=3, shuffle=True, random_state=random_seed)

    print(f"========= TRAINING {algorithm.__name__} =========")

    for num_fold, (train_index, val_index) in enumerate(kf.split(X)):
        X_train, X_eval = X.iloc[train_index], X.iloc[val_index]
        y_train, y_eval = y.iloc[train_index], y.iloc[val_index]

        if init_params is not None:
            model = algorithm(**init_params)
        else:
            model = algorithm()

        if 'CatBoost' in algorithm.__name__:
            # Специальный класс для ускорения обучения 
            train_dataset = Pool(data=X_train, label=y_train, cat_features=cat_features, text_features=text_features)
            eval_dataset  = Pool(data=X_eval, label=y_eval, cat_features=cat_features, text_features=text_features)

            model.fit(train_dataset,
                      eval_set=eval_dataset,
                      verbose=0,
                      early_stopping_rounds=early_stopping_rounds)

        elif 'LGBM' in algorithm.__name__:
            # Специальный класс для ускорения обучения 
            train_dataset = Dataset(X_train, y_train, categorical_feature=cat_features, free_raw_data=False,)
            eval_dataset  = Dataset(X_eval, y_eval, categorical_feature=cat_features, free_raw_data=False,)

            model = lgb.train(params=init_params,
                              train_set=train_dataset,
                              valid_sets=(eval_dataset),
                              #callbacks=[lgb.log_evaluation(10)],
                              #           lgb.early_stopping(stopping_rounds=5)],
                              categorical_feature=cat_features,
                              #verbose_eval=False                   # в новой версии LightGBM по логи по умолчанию отключены
                              )

        elif 'XGB' in algorithm.__name__:
            # Специальный класс для ускорения обучения
            train_dataset = xgb.DMatrix(X_train, label=y_train, nthread=-1, enable_categorical=True,)
            eval_dataset  = xgb.DMatrix(X_eval,  label=y_eval,  nthread=-1, enable_categorical=True,)

            model = xgb.train(params=init_params,
                              dtrain=train_dataset,
                              evals=[(train_dataset, 'dtrain'), (eval_dataset, 'dtest')],
                              verbose_eval=False,
                              early_stopping_rounds=early_stopping_rounds)

            X_eval = eval_dataset

        # Предсказание на X_eval и расчет RMSE
        y_pred = model.predict(X_eval)
        score = balanced_accuracy_score(y_eval, y_pred)

        models.append(model)
        scores.append(score)

        print(f'FOLD {num_fold}: SCORE {score}')

    mean_kfold_score = np.mean(scores, dtype="float16") - np.std(scores, dtype="float16")
    print("\nMEAN BALANCED ACCURACY SCORE", mean_kfold_score)

    # Модель с наименьшим значением скора
    best_model = models[scores.index(min(scores))]

    return mean_kfold_score, best_model

## CatBoost

### Обучение CatBoost модели с текстовыми признаками и кастомными параметрами

In [None]:
cb_init_params_cust = {
        'loss_function': 'MultiClass',
        
        # Ограничим глубину деревьев для ускорения
        'depth': 4,
        'iterations': 3500,

        # Регуляризация и ускорение
        'max_bin': 187,
        'task_type': 'GPU' if torch.cuda.is_available() else 'CPU',
        'thread_count': -1,
        'bootstrap_type': 'Bernoulli', 
            
        # Важное!
        'random_seed': RANDOM_STATE,
        'auto_class_weights': 'SqrtBalanced',
        'early_stopping_rounds': 30
    }

In [None]:
cb_score, cb_model = train_model(
    algorithm=CatBoostClassifier,
    X=train.drop(columns=['category']), y=train['category'],
    init_params=cb_init_params_cust,
    early_stopping_rounds=30,
    text_features=['text'],
    random_seed=RANDOM_STATE
)

pd.DataFrame(cb_model.predict(test), columns=['category']).to_csv('../subs/cb_model_preds.csv', index=False)

accuracy на лидерборде 0.7568

### Feature Selection (Shap)

In [None]:
if 'shap_result.json' in os.listdir('../src'):
    # загрузка параметров из файла
    with open('../src/shap_result.json', 'r') as read_file:
        shap_result = json.load(read_file)
    row = shap_result['shap_result']

else:
    X_train, X_eval, y_train, y_eval = train_test_split(train.drop(columns=['category']), train['category'], test_size=0.2, random_state=42)

    model = CatBoostClassifier(**cb_init_params_cust)

    train_dataset = Pool(data=X_train, label=y_train, text_features=['text'])
    eval_dataset  = Pool(data=X_eval, label=y_eval, text_features=['text'])

    model.fit(train_dataset, 
            eval_set=eval_dataset,
            verbose=0, plot=False,
            early_stopping_rounds=30)

    explainer = shap.TreeExplainer(model)

    cat_features = None

    #train_dataset = Pool(data=X_train, label=y_train, cat_features=cat_features, text_features=text_cols)
    shap_values = explainer.shap_values(train_dataset)

    row = [shap_values[:, feature_ind, :] for feature_ind in range(shap_values.shape[1])]
    row = [np.abs(i).mean(0).mean() for i in row]

    shap_result={
        'shap_result': row
        }

    # сохранение результатов в файл
    with open('../src/shap_result.json', 'w') as f:
        json.dump(shap_result, f)

top_shap_idx = sorted(range(len(row)), key=lambda k: row[k], reverse=True)

df_plot = []
cum_shap_value = 0
for shap_value in sorted(row, reverse=True):
    cum_shap_value += shap_value
    df_plot.append(cum_shap_value)
df_plot = pd.DataFrame(df_plot, columns=['shap_value'])

sns.relplot(
    data=df_plot,
    kind="line",
    height=4, 
    aspect=2,
);

In [None]:
# Подбор количества фичей
for num_col in range(200, 900, 50):
    X_train, X_eval, y_train, y_eval = train_test_split(train[train.drop(columns=['category']).columns[top_shap_idx[:num_col]]], train['category'], test_size=0.2, random_state=42)
    train_dataset = Pool(data=X_train, label=y_train, text_features=['text'])
    eval_dataset  = Pool(data=X_eval, label=y_eval, text_features=['text'])

    cb_model = CatBoostClassifier(**cb_init_params_cust)
    cb_model.fit(train_dataset, 
            eval_set=eval_dataset,
            verbose=0, plot=False, 
            early_stopping_rounds=30)

    y_pred = cb_model.predict(X_eval)
    score = balanced_accuracy_score(y_eval, y_pred)
    print(num_col, score)

In [None]:
cb_score, cb_model = train_model(
    algorithm=CatBoostClassifier,
    X=train[train.drop(columns=['category']).columns[top_shap_idx[:500]]], y=train['category'],
    init_params=cb_init_params_cust,
    early_stopping_rounds=30,
    text_features=['text'],
    random_seed=RANDOM_STATE
)

pd.DataFrame(cb_model.predict(test[test.columns[top_shap_idx[:500]]]),
             columns=['category']).to_csv('../subs/cb_model_preds.csv', index=False)

accuracy на лидерборде 0.7604

## TFIDF + классификатор

### TF-IDF

In [None]:
vectorizer = TfidfVectorizer()
X_TFIDF = vectorizer.fit_transform(train['text'])
X_TFIDF_train, X_TFIDF_eval, y_TFIDF_train, y_TFIDF_eval = train_test_split(X_TFIDF, train['category'], test_size=0.2, random_state=42)
X_TFIDF_test  = vectorizer.transform(test['text'])

print(X_TFIDF_train.shape)
print(X_TFIDF_eval.shape)
print(X_TFIDF_test.shape)

### LogisticRegression

In [None]:
# Обечение с валидацией
logreg = LogisticRegression(random_state=RANDOM_STATE)
logreg.fit(X_TFIDF_train, y_TFIDF_train)
print(balanced_accuracy_score(y_TFIDF_eval, logreg.predict(X_TFIDF_eval)))

# Обучение на всех данных
logreg.fit(X_TFIDF, train['category'])
pd.DataFrame(logreg.predict(X_TFIDF_test), columns=['category']).to_csv('../subs/tfids_logreg.csv', index=False)

accuracy на лидерборде 0.75

### SVC

In [None]:
# Подбор гиперпараметров
def objective_svc(trial):

    C = trial.suggest_float("C",0.1,1000)
    gamma = trial.suggest_float("gamma",0.0001,1)
    kernel = trial.suggest_categorical("kernel",['rbf','poly']) 
    model = SVC(
        C=C,
        gamma=gamma,
        kernel=kernel,
        random_state=RANDOM_STATE
    )  
    score = cross_val_score(model, X_TFIDF_train, y, cv=3)
    accuracy = score.mean()
    return accuracy

if 'params_svc.json' in os.listdir('../src'):
    # загрузка параметров из файла
    with open('../src/params_svc.json', 'r') as read_file:
        params_svc = json.load(read_file)
        
else:
    X_TFIDF_train = vectorizer.fit_transform(train['text'])
    
    study = optuna.create_study(direction="maximize")
    study.optimize(objective_svc,
                n_trials=100,
                n_jobs = -1)

    print("Number of finished trials: {}".format(len(study.trials)))
    print("Best trial:")
    trial = study.best_trial
    print("  Value: {}".format(trial.value))
    params_svc = trial.params

    # сохранение результатов в файл
    with open('../src/params_svc.json', 'w') as f:
        json.dump(params_svc, f)

print("  Params: ")
for key, value in params_svc.items():
    print("    {}: {}".format(key, value))

In [None]:
# Обечение с валидацией
#svc_clf = SVC(C=2.5, gamma=0.7, random_state=RANDOM_STATE)
svc_clf = SVC(**params_svc)
svc_clf.fit(X_TFIDF_train, y_TFIDF_train)
print(balanced_accuracy_score(y_TFIDF_eval, svc_clf.predict(X_TFIDF_eval)))

# Обучение на всех данных
svc_clf.fit(X_TFIDF, train['category'])
pd.DataFrame(svc_clf.predict(X_TFIDF_test), columns=['category']).to_csv('../subs/tfids_svc.csv', index=False)

accuracy на лидерборде  0.7716

### LinearSVC

In [None]:
# Обечение с валидацией
linear_svc = LinearSVC(tol=0.01, dual=True, C=0.6, random_state=RANDOM_STATE)
linear_svc.fit(X_TFIDF_train, y_TFIDF_train)
print(balanced_accuracy_score(y_TFIDF_eval, linear_svc.predict(X_TFIDF_eval)))

# Обучение на всех данных
linear_svc.fit(X_TFIDF, train['category'])
pd.DataFrame(linear_svc.predict(X_TFIDF_test), columns=['category']).to_csv('../subs/tfids_lin_svc.csv', index=False)

accuracy на лидерборде 0.8

### ExtraTreesClassifier

In [None]:
# Обечение с валидацией
et_clf = ExtraTreesClassifier(n_estimators = 6_000, max_depth = 8, min_samples_leaf = 2, bootstrap = True,
                              class_weight = 'balanced',random_state = RANDOM_STATE, verbose=False, n_jobs=-1)
et_clf.fit(X_TFIDF_train, y_TFIDF_train)
print(balanced_accuracy_score(y_TFIDF_eval, et_clf.predict(X_TFIDF_eval)))

# Обучение на всех данных
et_clf.fit(X_TFIDF, train['category'])
pd.DataFrame(et_clf.predict(X_TFIDF_test), columns=['category']).to_csv('../subs/tfids_et.csv', index=False)

accuracy на лидерборде 0.6836 (можно подбором гиперпараметров увеличить до 0.7432, но метрика при стекинге падает)

### RandomForestClassifier

In [None]:
# Обечение с валидацией
rf = RandomForestClassifier(n_estimators=10_000, max_depth=200, n_jobs=-1, random_state=RANDOM_STATE)
rf.fit(X_TFIDF_train, y_TFIDF_train)
print(balanced_accuracy_score(y_TFIDF_eval, rf.predict(X_TFIDF_eval)))

# Обучение на всех данных
rf.fit(X_TFIDF, train['category'])
pd.DataFrame(rf.predict(X_TFIDF_test), columns=['category']).to_csv('../subs/tfids_rf.csv', index=False)

accuracy на лидерборде 0.7068

## Стекинг

In [None]:
class Stacker:
    def __init__(self, base_models, meta_model, preprocessing=None, metafeatures_mode=None):
        """
        base_models - список базовых моделей, которые нужно обучать на изначальных данных
        meta_model - мета модель, которая обучается на предсказаниях базовых моделей
        metafeatures_mode - режим формирования фичей ('pred' - предикт, 'proba' - вероятность, 'log_proba' - логарифм вероятности
        preprocessing - список словарей операций над датасетом:
            col_select - выбор столбцов
            col_drop - удаление столбцов
            tfidf - преобразование tfidf по указанному столбцу
            text_features - список текстовых сболбцов для подачи в CatBoost
        """
        self.base_models = base_models
        self.meta_model = meta_model
        self.preprocessing = preprocessing
        self.metafeatures_mode = metafeatures_mode if metafeatures_mode else [['pred'] for m in base_models]
        self.vectorizer = None
        self.additional_meta_features = False # дополнительные метафичи (разность, деление существующих)
        self.meta_cat_features = None
        self.meta_le = {}

    def X_preprocessing(self, X, num_model):
        if 'col_drop' in self.preprocessing[num_model]:
            X.drop(columns=self.preprocessing[num_model]['col_drop'], inplace=True)
        if 'col_select' in self.preprocessing[num_model]:
            X = X[self.preprocessing[num_model]['col_select']] 
        if 'tfidf' in self.preprocessing[num_model]:
            X = self.vectorizer.transform(X[self.preprocessing[num_model]['tfidf']])      
        return X
    
    def base_model_one_pred(self, model, mode, X):
        if mode == 'pred':
            preds = model.predict(X)
            preds = preds.reshape(len(preds), 1)
        elif mode == 'proba':
            if model.__class__.__name__ == 'LinearSVC':
                 preds = model._predict_proba_lr(X)
            else:
                preds = model.predict_proba(X)
        elif mode == 'log_proba':
            preds = model.predict_log_proba(X)
        
        # костыль для избавления от -inf в RF
        if model.__class__.__name__ == 'RandomForestClassifier':
            preds[np.where(preds == float('-inf'))] = -100
            preds[np.where(preds == float('inf'))] = 100

        return preds
    
    def base_model_all_preds(self, model, num_model, X):
        #preds_all = None
        preds_all = np.empty((X.shape[0], 0))
        for mode in self.metafeatures_mode[num_model]:
            preds = self.base_model_one_pred(model, mode, X)
            #preds_all = preds if preds_all is None else np.concatenate([preds_all, preds], axis=1) 
            preds_all = np.concatenate([preds_all, preds], axis=1)
        return preds_all
    
    def base_model_fit(self, model, num_model, X_train, y_train, X_val=None, y_val=None):
                
        if model.__class__.__name__ == 'CatBoostClassifier':
            #best_iter = 0
            text_features = []
            if 'text_features' in self.preprocessing[num_model]:            # возможно вынести наверх (в основную функцию)
                for i in preprocessing[num_model]['text_features']:
                    text_features.append(X_train.columns.get_loc(i))
            cat_features = []
            for i in range(X_train.shape[1]):
                if type(X_train.iloc[0, i]) == str:
                    if i not in text_features:
                        cat_features.append(i)

            if X_val is not None and y_val is not None:
                model.fit(X_train, y_train, eval_set=(X_val, y_val), cat_features=cat_features, text_features=text_features,
                          verbose=False, early_stopping_rounds=30)
                #if model.best_iteration_ > best_iter:
                    #best_iter = model.best_iteration_
            else:
                model.fit(X_train, y_train, cat_features=cat_features, text_features=text_features, verbose=False)
        else:
            model.fit(X_train, y_train)

    def fit_base(self, X, y, n_fold=5):
        # если есть tfidf - обучаем vectorizer
        for params in preprocessing:
            if 'tfidf' in params.keys():
                self.vectorizer = TfidfVectorizer()
                self.vectorizer.fit(X[params['tfidf']])
                print('Сформирован vectorizer:', self.vectorizer.get_feature_names_out().shape[0], 'токенов.')
                break
        
        folds = KFold(n_splits=n_fold)
        final_features = np.empty((len(X), 0))

        for num_model, model in enumerate(self.base_models):
            
            preds_model = None
        
            for train_indices, val_indices in folds.split(X, y):
 
                X_train, X_val = X.loc[train_indices], X.loc[val_indices]
                X_train, X_val = self.X_preprocessing(X_train, num_model), self.X_preprocessing(X_val, num_model)
                y_train, y_val = y[train_indices], y[val_indices]        
            
                self.base_model_fit(model, num_model, X_train, y_train, X_val, y_val)
                preds_fold = self.base_model_all_preds(model, num_model, X_val)
                preds_model = preds_fold if preds_model is None else np.concatenate([preds_model, preds_fold], axis=0)
            
            final_features = np.concatenate([final_features, preds_model], axis=1)

            self.base_model_fit(model, num_model, self.X_preprocessing(X, num_model), y)
            print('Обучена базовая модель №', num_model+1, model.__class__.__name__)
            
        return final_features
    
    def add_meta_features(self, X):
        
        meta_num_features = []
        for i in range(X.shape[1]):
            if type(X[0][i]) == float:
                meta_num_features.append(i)
        
        new_features = []
        for source in meta_num_features:
            for destination in meta_num_features:
                row = X[:, source] / X[:, destination]
                new_features.append(row.reshape(len(row),1))
                row = X[:, source] - X[:, destination]
                new_features.append(row.reshape(len(row),1))
        new_features = np.concatenate(new_features, axis=1)
        new_features = np.concatenate([X, new_features], axis=1)

        return new_features

    def fit_meta(self, meta_features, y):
        
        self.meta_cat_features = []
        for i in range(meta_features.shape[1]):
            if type(meta_features[0][i]) == str:
                self.meta_cat_features.append(i)
        
        if self.meta_model.__class__.__name__ == 'CatBoostClassifier':
            self.meta_model.fit(pd.DataFrame(meta_features), y, cat_features=self.meta_cat_features, verbose=False)
        else:
            for col in self.meta_cat_features:
                self.meta_le[col] = LabelEncoder()
                meta_features[:, col] = self.meta_le[col].fit_transform(meta_features[:, col])

            self.meta_model.fit(pd.DataFrame(meta_features), y)
        print('cat_features', self.meta_cat_features)
        print('Обучена метамодель', self.meta_model.__class__.__name__)
        return None
    
    def fit(self, X, y):
        meta_features = self.fit_base(X, y)
        if self.additional_meta_features == True:
            meta_features=self.add_meta_features(meta_features)
            print('Сформированы дополнительные фичи')
        self.fit_meta(meta_features, y)
    
    def predict(self, X):

        final_features = np.empty((len(X), 0))
        
        for num_model, model in enumerate(self.base_models):
            
            # X preprocessing
            X_test = self.X_preprocessing(X, num_model)
            preds_model = self.base_model_all_preds(model, num_model, X_test)
            final_features = np.concatenate([final_features, preds_model], axis=1)
        
        
        if self.meta_model.__class__.__name__ == 'CatBoostClassifier':
            final_preds = self.meta_model.predict(final_features)
        else:
            for col in self.meta_cat_features:
                final_features[:, col] = self.meta_le[col].transform(final_features[:, col])

            final_preds = self.meta_model.predict(final_features)
            final_preds = final_preds.reshape(len(final_preds), 1)

        return final_preds

In [None]:
# Создаем экземпляр Stacker
base_models = [CatBoostClassifier(**cb_init_params_cust),
               LogisticRegression(random_state=RANDOM_STATE),
               SVC(**params_svc, probability=True, decision_function_shape='ovr', random_state=RANDOM_STATE),
               LinearSVC(tol=0.01, dual=True, C=0.6, random_state=RANDOM_STATE),
               ExtraTreesClassifier(n_estimators = 6_000, max_depth = 8, min_samples_leaf = 2, bootstrap = True,
                                    class_weight = 'balanced',random_state = RANDOM_STATE, verbose=False, n_jobs=-1,),
               #RandomForestClassifier(n_estimators = 10_000, max_depth = 200, n_jobs=-1, random_state = RANDOM_STATE)
               ]

meta_model = LogisticRegression(random_state=RANDOM_STATE)

preprocessing = [{'col_select':train.drop(columns=['category']).columns[top_shap_idx[:500]], 'text_features':['text']},
                 {'col_select':['text'], 'tfidf':'text'},
                 {'col_select':['text'], 'tfidf':'text'},
                 {'col_select':['text'], 'tfidf':'text'},
                 {'col_select':['text'], 'tfidf':'text'},
                 #{'col_select':['text'], 'tfidf':'text'}
                 ]

metafeatures_mode = [['pred', 'proba', 'log_proba'],
                     ['pred', 'proba', 'log_proba'],
                     ['pred', 'proba', 'log_proba'],
                     ['pred', 'proba'],
                     ['pred', 'proba', 'log_proba'],
                     #['pred', 'proba', 'log_proba']
                     ]

stacker = Stacker(base_models, meta_model, preprocessing, metafeatures_mode)
stacker.fit(train.drop(columns=['category']), train['category'])
res = stacker.predict(test)
pd.DataFrame(res, columns=['category']).to_csv('../subs/tfids_stacking.csv', index=False)
res